Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(7)

Unified Diff: re2/compile.cc

Issue 1740045: code review 1740045: factor common prefixes from alternations (Closed)
Patch Set: code review 1740045: factor common prefixes from alternations Created 14 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | re2/mimics_pcre.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: re2/compile.cc
===================================================================
--- a/re2/compile.cc
+++ b/re2/compile.cc
@@ -93,7 +93,7 @@
}
Prog::Inst* ip = &inst0[l.p>>1];
- if(l.p&1)
+ if (l.p&1)
ip->out1_ = l2.p;
else
ip->set_out(l2.p);
@@ -132,7 +132,7 @@
// Compiles alternation of all the re to a new Prog.
// Each re has a match with an id equal to its index in the vector.
static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
- const vector<Regexp*>& re);
+ Regexp* re);
// Interface for Regexp::Walker, which helps traverse the Regexp.
// The walk is purely post-recursive: given the machines for the
@@ -207,7 +207,7 @@
// Single rune.
Frag Literal(Rune r, bool foldcase);
- void Setup(Regexp::ParseFlags, int64);
+ void Setup(Regexp::ParseFlags, int64, RE2::Anchor);
Prog* Finish();
// Returns .* where dot = any byte
@@ -230,6 +230,8 @@
map<uint64, int> rune_cache_;
Frag rune_range_;
+ RE2::Anchor anchor_; // anchor mode for RE2::Set
+
DISALLOW_EVIL_CONSTRUCTORS(Compiler);
};
@@ -478,7 +480,8 @@
rune_range_.end = nullPatchList;
}
-int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {
+int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase,
+ int next) {
Frag f = ByteRange(lo, hi, foldcase);
if (next != 0) {
PatchList::Patch(inst_, f.end, next);
@@ -492,7 +495,9 @@
// In Latin1 mode, there's no point in caching.
// In forward UTF-8 mode, only need to cache continuation bytes.
if (encoding_ == kEncodingLatin1 ||
- (encoding_ == kEncodingUTF8 && !reversed_ && !(0x80 <= lo && hi <= 0xbf))) {
+ (encoding_ == kEncodingUTF8 &&
+ !reversed_ &&
+ !(0x80 <= lo && hi <= 0xbf))) {
return UncachedRuneByteSuffix(lo, hi, foldcase, next);
}
@@ -723,6 +728,14 @@
case kRegexpEmptyMatch:
return Nop();
+ case kRegexpHaveMatch: {
+ Frag f = Match(re->match_id());
+ // Remember unanchored match to end of string.
+ if (anchor_ != RE2::ANCHOR_BOTH)
+ f = Cat(DotStar(), f);
+ return f;
+ }
+
case kRegexpConcat: {
Frag f = child_frags[0];
for (int i = 1; i < nchild_frags; i++)
@@ -896,7 +909,8 @@
}
}
-void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem) {
+void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
+ RE2::Anchor anchor) {
prog_->set_flags(flags);
if (flags & Regexp::Latin1)
@@ -926,6 +940,8 @@
max_inst_ = m;
}
+
+ anchor_ = anchor;
}
// Compiles re, returning program.
@@ -936,7 +952,7 @@
Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {
Compiler c;
- c.Setup(re->parse_flags(), max_mem);
+ c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */);
c.reversed_ = reversed;
// Simplify to remove things like counted repetitions
@@ -1035,30 +1051,22 @@
// Compiles RE set to Prog.
Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
- const vector<Regexp*>& re) {
+ Regexp* re) {
Compiler c;
- c.Setup(static_cast<Regexp::ParseFlags>(options.ParseFlags()),
- options.max_mem());
+ Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags());
+ c.Setup(pf, options.max_mem(), anchor);
- // Accumulate alternation of fragments.
- Frag all = kNullFrag;
- for (int i = 0; i < re.size(); i++) {
- Frag f = c.WalkExponential(re[i], kNullFrag, 2*c.max_inst_);
- if (c.failed_)
- return NULL;
- switch (anchor) {
- case RE2::UNANCHORED:
- f = c.Cat(c.Cat(c.DotStar(), f), c.DotStar());
- break;
- case RE2::ANCHOR_START:
- f = c.Cat(f, c.DotStar());
- break;
- case RE2::ANCHOR_BOTH:
- break;
- }
- f = c.Cat(f, c.Match(i));
- all = c.Alt(all, f);
+ // Compile alternation of fragments.
+ Frag all = c.WalkExponential(re, kNullFrag, 2*c.max_inst_);
+ re->Decref();
+ if (c.failed_)
+ return NULL;
+
+ if (anchor == RE2::UNANCHORED) {
+ // The trailing .* was added while handling kRegexpHaveMatch.
+ // We just have to add the leading one.
+ all = c.Cat(c.DotStar(), all);
}
c.prog_->set_start(all.begin);
@@ -1066,14 +1074,6 @@
c.prog_->set_anchor_start(true);
c.prog_->set_anchor_end(true);
- // Also create unanchored version, which starts with a .*? loop.
- if (c.prog_->anchor_start()) {
- c.prog_->set_start_unanchored(c.prog_->start());
- } else {
- Frag unanchored = c.Cat(c.DotStar(), all);
- c.prog_->set_start_unanchored(unanchored.begin);
- }
-
Prog* prog = c.Finish();
if (prog == NULL)
return NULL;
@@ -1093,7 +1093,7 @@
}
Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
- const vector<Regexp*>& re) {
+ Regexp* re) {
return Compiler::CompileSet(options, anchor, re);
}
« no previous file with comments | « no previous file | re2/mimics_pcre.cc » ('j') | no next file with comments »

Powered by Google App Engine
RSS Feeds Recent Issues | This issue
This is Rietveld f62528b