I advise you to write iterator classes. They are easy to implement (basically they are state machines), they have a low memory footprint, they can be combined to build up increasingly complex expressions (please scroll down to see the final result), and the resulting iterator can be wrapped in an enumerator.
Each iterator class has the following methods:
- first: initializes the state machine (first match)
- next: proceeds to the next state (next match)
- ok: initially true, but becomes false once 'next' proceeds beyond the last match
- get: returns the current match (as a string)
- clone: clones the object; essential for repetition, because each instance needs its own state
Start off with the most trivial case: a sequence of one or more characters that should be matched literally (e.g. /foo/). Needless to say this has only one match, so 'ok' will become false upon the first call of 'next'.
function Literal(literal) { this.literal = literal; }
Literal.prototype.first = function() { this.i = 0; };
Literal.prototype.next = function() { this.i++; };
Literal.prototype.ok = function() { return this.i == 0; };
Literal.prototype.get = function() { return this.literal; };
Literal.prototype.clone = function() { return new Literal(this.literal); };
Character classes ([abc]) are trivial too. The constructor accepts a string of characters; if you prefer arrays, that's easy to fix.
function CharacterClass(chars) { this.chars = chars; }
CharacterClass.prototype.first = function() { this.i = 0; };
CharacterClass.prototype.next = function() { this.i++; };
CharacterClass.prototype.ok = function() { return this.i < this.chars.length; };
CharacterClass.prototype.get = function() { return this.chars.charAt(this.i); };
CharacterClass.prototype.clone = function() { return new CharacterClass(this.chars); };
Now we need iterators that combine other iterators to form more complex regular expressions. A sequence is just two or more patterns in a row (like foo[abc]).
function Sequence(iterators) {
if (arguments.length > 0) {
this.iterators = iterators.length ? iterators : [new Literal('')];
}
}
Sequence.prototype.first = function() {
for (var i in this.iterators) this.iterators[i].first();
};
Sequence.prototype.next = function() {
if (this.ok()) {
var i = this.iterators.length;
while (this.iterators[--i].next(), i > 0 && !this.iterators[i].ok()) {
this.iterators[i].first();
}
}
};
Sequence.prototype.ok = function() {
return this.iterators[0].ok();
};
Sequence.prototype.get = function() {
var retval = '';
for (var i in this.iterators) {
retval += this.iterators[i].get();
}
return retval;
};
Sequence.prototype.clone = function() {
return new Sequence(this.iterators.map(function(it) { return it.clone(); }));
};
Another way to combine iterators is the choice (a.k.a. alternatives), e.g. foo|bar.
function Choice(iterators) { this.iterators = iterators; }
Choice.prototype.first = function() {
this.count = 0;
for (var i in this.iterators) this.iterators[i].first();
};
Choice.prototype.next = function() {
if (this.ok()) {
this.iterators[this.count].next();
while (this.ok() && !this.iterators[this.count].ok()) this.count++;
}
};
Choice.prototype.ok = function() {
return this.count < this.iterators.length;
};
Choice.prototype.get = function() {
return this.iterators[this.count].get();
};
Choice.prototype.clone = function() {
return new Choice(this.iterators.map(function(it) { return it.clone(); }));
};
Other regex features can be implemented by combining the existing classes. Class inheritance is a great way to do this. For example, an optional pattern (x?) is just a choice between the empty string and x.
function Optional(iterator) {
if (arguments.length > 0) {
Choice.call(this, [new Literal(''), iterator]);
}
}
Optional.prototype = new Choice();
Repetition (x{n,m}) is a combination of Sequence and Optional. Because I have to inherit one or the other, my implementation consists of two mutually dependent classes.
function RepeatFromZero(maxTimes, iterator) {
if (arguments.length > 0) {
Optional.call(this, new Repeat(1, maxTimes, iterator));
}
}
RepeatFromZero.prototype = new Optional();
function Repeat(minTimes, maxTimes, iterator) {
if (arguments.length > 0) {
var sequence = [];
for (var i = 0; i < minTimes; i++) {
sequence.push(iterator.clone()); // need to clone the iterator
}
if (minTimes < maxTimes) {
sequence.push(new RepeatFromZero(maxTimes - minTimes, iterator));
}
Sequence.call(this, sequence);
}
}
Repeat.prototype = new Sequence();
As I said earlier, an iterator can be wrapped into an enumerator. This is simply a loop that you can break whenever you want.
function Enumerator(iterator) {
this.iterator = iterator;
this.each = function(callback) {
for (this.iterator.first(); this.iterator.ok(); this.iterator.next()) {
callback(this.iterator.get());
}
};
}
Time to put it all together. Let's take some silly regular expression:
([ab]{2}){1,2}|[cd](f|ef{0,2}e)
Composing the iterator object is really straightforward:
function GetIterationsAsHtml() {
var iterator = new Choice([
new Repeat(1, 2,
new Repeat(2, 2, new CharacterClass('ab'))),
new Sequence([
new CharacterClass('cd'),
new Choice([
new Literal('f'),
new Sequence([
new Literal('e'),
new RepeatFromZero(2, new Literal('f')),
new Literal('e')
])
])
])
]);
var iterations = '<ol>
';
var enumerator = new Enumerator(iterator);
enumerator.each(function(iteration) { iterations += '<li>' + iteration + '</li>
'; });
return iterations + '</ol>';
}
This yields 28 matches, but I will spare you the output.
My apologies if my code is not compliant to software patterns, is not browser-compatible (works OK on Chrome and Firefox) or suffers from poor OOP. I just hope it makes the concept clear.
EDIT: for completeness, and following OP's initiative, I implemented one more iterator class: the reference.
A reference (1 2 etc) picks up the current match of an earlier capturing group (i.e. anything in parentheses). Its implementation is very similar to Literal, in that it has exactly one match.
function Reference(iterator) { this.iterator = iterator; }
Reference.prototype.first = function() { this.i = 0; };
Reference.prototype.next = function() { this.i++; };
Reference.prototype.ok = function() { return this.i == 0; };
Reference.prototype.get = function() { return this.iterator.get(); };
Reference.prototype.clone = function() { return new Reference(this.iterator); };
The constructor is given an iterator that represents the referenced subpattern. Taking (foo|bar)([xy])21
as an example (yields fooxxfoo, fooyyfoo, barxxbar, baryybar):
var groups = new Array();
var iterator = new Sequence([
groups[1] = new Choice([new Literal('foo'), new Literal('bar')]),
groups[2] = new CharacterClass('xy'),
new Reference(groups[2]),
new Reference(groups[1])
]);
Capturing groups are specified as you build up the tree of iterator classes. I am still doing that manually here, but eventually you want this to be automated. That is just a matter of mapping your parse tree to a similar tree of iterator classes.
EDIT 2: here's a relatively simple recursive function that will convert a parse tree produced by ret.js into an iterator.
function ParseTreeMapper() {
this.capturingGroups = [];
}
ParseTreeMapper.prototype.mapToIterator = function(parseTree) {
switch (parseTree.type) {
case ret.types.ROOT:
case ret.types.GROUP:
var me = this;
var mapToSequence = function(parseTrees) {
return new Sequence(parseTrees.map(function(t) {
return me.mapToIterator(t);
}));
};
var group = parseTree.options ?
new Choice(parseTree.options.map(mapToSequence)) :
mapToSequence(parseTree.stack);
if (parseTree.remember) {
this.capturingGroups.push(group);
}
return group;
case ret.types.SET:
return new CharacterClass(this.mapToCharacterClass(parseTree.set));
case ret.types.REPETITION:
return new Repeat(parseInt(parseTree.min), parseInt(parseTree.max), this.mapToIterator(parseTree.value));
case ret.types.REFERENCE:
var ref = parseInt(parseTree.value) - 1;
return ref in this.capturingGroups ?
new Reference(this.capturingGroups[ref]) :
new Literal('<ReferenceOutOfRange>');
case ret.types.CHAR:
return new Literal(String.fromCharCode(parseTree.value));
default:
return new Literal('<UnsupportedType>');
}
};
ParseTreeMapper.prototype.mapToCharacterClass = function(parseTrees) {
var chars = '';
for (var i in parseTrees) {
var tree = parseTrees[i];
switch (tree.type) {
case ret.types.CHAR:
chars += String.fromCharCode(tree.value);
break;
case ret.types.RANGE:
for (var code = tree.from; code <= tree.to; code++) {
chars += String.fromCharCode(code);
}
break;
}
}
return chars;
};
Usage:
var regex = 'b[a-n]{3}';
var parseTree = ret(regex); // requires ret.js
var iterator = new ParseTreeMapper().mapToIterator(parseTree);
I put all components together in this demo: http://jsfiddle.net/Pmnwk/3/
Note: many regex syntax constructs are not supported (anchors, look-ahead, look-behind, recursion), but I guess it is already pretty much up to par with invRegex.py.