Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(67)

Side by Side Diff: re2/compile.cc

Issue 1740045: code review 1740045: factor common prefixes from alternations (Closed)
Patch Set: code review 1740045: factor common prefixes from alternations Created 14 years, 8 months ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | re2/mimics_pcre.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2007 The RE2 Authors. All Rights Reserved. 1 // Copyright 2007 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style 2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file. 3 // license that can be found in the LICENSE file.
4 4
5 // Compile regular expression to Prog. 5 // Compile regular expression to Prog.
6 // 6 //
7 // Prog and Inst are defined in prog.h. 7 // Prog and Inst are defined in prog.h.
8 // This file's external interface is just Regexp::CompileToProg. 8 // This file's external interface is just Regexp::CompileToProg.
9 // The Compiler class defined in this file is private. 9 // The Compiler class defined in this file is private.
10 10
(...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after
86 86
87 PatchList l = l1; 87 PatchList l = l1;
88 for (;;) { 88 for (;;) {
89 PatchList next = PatchList::Deref(inst0, l); 89 PatchList next = PatchList::Deref(inst0, l);
90 if (next.p == 0) 90 if (next.p == 0)
91 break; 91 break;
92 l = next; 92 l = next;
93 } 93 }
94 94
95 Prog::Inst* ip = &inst0[l.p>>1]; 95 Prog::Inst* ip = &inst0[l.p>>1];
96 if(l.p&1) 96 if (l.p&1)
97 ip->out1_ = l2.p; 97 ip->out1_ = l2.p;
98 else 98 else
99 ip->set_out(l2.p); 99 ip->set_out(l2.p);
100 100
101 return l1; 101 return l1;
102 } 102 }
103 103
104 // Compiled program fragment. 104 // Compiled program fragment.
105 struct Frag { 105 struct Frag {
106 uint32 begin; 106 uint32 begin;
(...skipping 18 matching lines...) Expand all
125 125
126 // Compiles Regexp to a new Prog. 126 // Compiles Regexp to a new Prog.
127 // Caller is responsible for deleting Prog when finished with it. 127 // Caller is responsible for deleting Prog when finished with it.
128 // If reversed is true, compiles for walking over the input 128 // If reversed is true, compiles for walking over the input
129 // string backward (reverses all concatenations). 129 // string backward (reverses all concatenations).
130 static Prog *Compile(Regexp* re, bool reversed, int64 max_mem); 130 static Prog *Compile(Regexp* re, bool reversed, int64 max_mem);
131 131
132 // Compiles alternation of all the re to a new Prog. 132 // Compiles alternation of all the re to a new Prog.
133 // Each re has a match with an id equal to its index in the vector. 133 // Each re has a match with an id equal to its index in the vector.
134 static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor, 134 static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
135 const vector<Regexp*>& re); 135 Regexp* re);
136 136
137 // Interface for Regexp::Walker, which helps traverse the Regexp. 137 // Interface for Regexp::Walker, which helps traverse the Regexp.
138 // The walk is purely post-recursive: given the machines for the 138 // The walk is purely post-recursive: given the machines for the
139 // children, PostVisit combines them to create the machine for 139 // children, PostVisit combines them to create the machine for
140 // the current node. The child_args are Frags. 140 // the current node. The child_args are Frags.
141 // The Compiler traverses the Regexp parse tree, visiting 141 // The Compiler traverses the Regexp parse tree, visiting
142 // each node in depth-first order. It invokes PreVisit before 142 // each node in depth-first order. It invokes PreVisit before
143 // visiting the node's children and PostVisit after visiting 143 // visiting the node's children and PostVisit after visiting
144 // the children. 144 // the children.
145 Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop); 145 Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after
200 200
201 // Adds a suffix to alternation. 201 // Adds a suffix to alternation.
202 void AddSuffix(int id); 202 void AddSuffix(int id);
203 203
204 // Returns the alternation of all the added suffixes. 204 // Returns the alternation of all the added suffixes.
205 Frag EndRange(); 205 Frag EndRange();
206 206
207 // Single rune. 207 // Single rune.
208 Frag Literal(Rune r, bool foldcase); 208 Frag Literal(Rune r, bool foldcase);
209 209
210 void Setup(Regexp::ParseFlags, int64); 210 void Setup(Regexp::ParseFlags, int64, RE2::Anchor);
211 Prog* Finish(); 211 Prog* Finish();
212 212
213 // Returns .* where dot = any byte 213 // Returns .* where dot = any byte
214 Frag DotStar(); 214 Frag DotStar();
215 215
216 private: 216 private:
217 Prog* prog_; // Program being built. 217 Prog* prog_; // Program being built.
218 bool failed_; // Did we give up compiling? 218 bool failed_; // Did we give up compiling?
219 Encoding encoding_; // Input encoding 219 Encoding encoding_; // Input encoding
220 bool reversed_; // Should program run backward over text? 220 bool reversed_; // Should program run backward over text?
221 221
222 int max_inst_; // Maximum number of instructions. 222 int max_inst_; // Maximum number of instructions.
223 223
224 Prog::Inst* inst_; // Pointer to first instruction. 224 Prog::Inst* inst_; // Pointer to first instruction.
225 int inst_len_; // Number of instructions used. 225 int inst_len_; // Number of instructions used.
226 int inst_cap_; // Number of instructions allocated. 226 int inst_cap_; // Number of instructions allocated.
227 227
228 int64 max_mem_; // Total memory budget. 228 int64 max_mem_; // Total memory budget.
229 229
230 map<uint64, int> rune_cache_; 230 map<uint64, int> rune_cache_;
231 Frag rune_range_; 231 Frag rune_range_;
232 232
233 RE2::Anchor anchor_; // anchor mode for RE2::Set
234
233 DISALLOW_EVIL_CONSTRUCTORS(Compiler); 235 DISALLOW_EVIL_CONSTRUCTORS(Compiler);
234 }; 236 };
235 237
236 Compiler::Compiler() { 238 Compiler::Compiler() {
237 prog_ = new Prog(); 239 prog_ = new Prog();
238 failed_ = false; 240 failed_ = false;
239 encoding_ = kEncodingUTF8; 241 encoding_ = kEncodingUTF8;
240 reversed_ = false; 242 reversed_ = false;
241 inst_ = NULL; 243 inst_ = NULL;
242 inst_len_ = 0; 244 inst_len_ = 0;
(...skipping 228 matching lines...) Expand 10 before | Expand all | Expand 10 after
471 // The Frag accumulates in rune_range_. Caching common 473 // The Frag accumulates in rune_range_. Caching common
472 // suffixes reduces the UTF-8 "." from 32 to 24 instructions, 474 // suffixes reduces the UTF-8 "." from 32 to 24 instructions,
473 // and it reduces the corresponding one-pass NFA from 16 nodes to 8. 475 // and it reduces the corresponding one-pass NFA from 16 nodes to 8.
474 476
475 void Compiler::BeginRange() { 477 void Compiler::BeginRange() {
476 rune_cache_.clear(); 478 rune_cache_.clear();
477 rune_range_.begin = NULL; 479 rune_range_.begin = NULL;
478 rune_range_.end = nullPatchList; 480 rune_range_.end = nullPatchList;
479 } 481 }
480 482
481 int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next ) { 483 int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase,
484 int next) {
482 Frag f = ByteRange(lo, hi, foldcase); 485 Frag f = ByteRange(lo, hi, foldcase);
483 if (next != 0) { 486 if (next != 0) {
484 PatchList::Patch(inst_, f.end, next); 487 PatchList::Patch(inst_, f.end, next);
485 } else { 488 } else {
486 rune_range_.end = PatchList::Append(inst_, rune_range_.end, f.end); 489 rune_range_.end = PatchList::Append(inst_, rune_range_.end, f.end);
487 } 490 }
488 return f.begin; 491 return f.begin;
489 } 492 }
490 493
491 int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) { 494 int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {
492 // In Latin1 mode, there's no point in caching. 495 // In Latin1 mode, there's no point in caching.
493 // In forward UTF-8 mode, only need to cache continuation bytes. 496 // In forward UTF-8 mode, only need to cache continuation bytes.
494 if (encoding_ == kEncodingLatin1 || 497 if (encoding_ == kEncodingLatin1 ||
495 (encoding_ == kEncodingUTF8 && !reversed_ && !(0x80 <= lo && hi <= 0xbf))) { 498 (encoding_ == kEncodingUTF8 &&
499 !reversed_ &&
500 !(0x80 <= lo && hi <= 0xbf))) {
496 return UncachedRuneByteSuffix(lo, hi, foldcase, next); 501 return UncachedRuneByteSuffix(lo, hi, foldcase, next);
497 } 502 }
498 503
499 uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | foldcase; 504 uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | foldcase;
500 map<uint64, int>::iterator it = rune_cache_.find(key); 505 map<uint64, int>::iterator it = rune_cache_.find(key);
501 if (it != rune_cache_.end()) 506 if (it != rune_cache_.end())
502 return it->second; 507 return it->second;
503 int id = UncachedRuneByteSuffix(lo, hi, foldcase, next); 508 int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
504 rune_cache_[key] = id; 509 rune_cache_[key] = id;
505 return id; 510 return id;
(...skipping 210 matching lines...) Expand 10 before | Expand all | Expand 10 after
716 case kRegexpRepeat: 721 case kRegexpRepeat:
717 // Should not see; code at bottom of function will print error 722 // Should not see; code at bottom of function will print error
718 break; 723 break;
719 724
720 case kRegexpNoMatch: 725 case kRegexpNoMatch:
721 return NoMatch(); 726 return NoMatch();
722 727
723 case kRegexpEmptyMatch: 728 case kRegexpEmptyMatch:
724 return Nop(); 729 return Nop();
725 730
731 case kRegexpHaveMatch: {
732 Frag f = Match(re->match_id());
733 // Remember unanchored match to end of string.
734 if (anchor_ != RE2::ANCHOR_BOTH)
735 f = Cat(DotStar(), f);
736 return f;
737 }
738
726 case kRegexpConcat: { 739 case kRegexpConcat: {
727 Frag f = child_frags[0]; 740 Frag f = child_frags[0];
728 for (int i = 1; i < nchild_frags; i++) 741 for (int i = 1; i < nchild_frags; i++)
729 f = Cat(f, child_frags[i]); 742 f = Cat(f, child_frags[i]);
730 return f; 743 return f;
731 } 744 }
732 745
733 case kRegexpAlternate: { 746 case kRegexpAlternate: {
734 Frag f = child_frags[0]; 747 Frag f = child_frags[0];
735 for (int i = 1; i < nchild_frags; i++) 748 for (int i = 1; i < nchild_frags; i++)
(...skipping 153 matching lines...) Expand 10 before | Expand all | Expand 10 after
889 continue; 902 continue;
890 case kRegexpEndText: 903 case kRegexpEndText:
891 *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); 904 *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
892 re->Decref(); 905 re->Decref();
893 return true; 906 return true;
894 } 907 }
895 return false; 908 return false;
896 } 909 }
897 } 910 }
898 911
899 void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem) { 912 void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
913 RE2::Anchor anchor) {
900 prog_->set_flags(flags); 914 prog_->set_flags(flags);
901 915
902 if (flags & Regexp::Latin1) 916 if (flags & Regexp::Latin1)
903 encoding_ = kEncodingLatin1; 917 encoding_ = kEncodingLatin1;
904 max_mem_ = max_mem; 918 max_mem_ = max_mem;
905 if (max_mem <= 0) { 919 if (max_mem <= 0) {
906 max_inst_ = 100000; // more than enough 920 max_inst_ = 100000; // more than enough
907 } else if (max_mem <= sizeof(Prog)) { 921 } else if (max_mem <= sizeof(Prog)) {
908 // No room for anything. 922 // No room for anything.
909 max_inst_ = 0; 923 max_inst_ = 0;
910 } else { 924 } else {
911 int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst); 925 int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
912 // Limit instruction count so that inst->id() fits nicely in an int. 926 // Limit instruction count so that inst->id() fits nicely in an int.
913 // SparseArray also assumes that the indices (inst->id()) are ints. 927 // SparseArray also assumes that the indices (inst->id()) are ints.
914 // The call to WalkExponential uses 2*max_inst_ below, 928 // The call to WalkExponential uses 2*max_inst_ below,
915 // and other places in the code use 2 or 3 * prog->size(). 929 // and other places in the code use 2 or 3 * prog->size().
916 // Limiting to 2^24 should avoid overflow in those places. 930 // Limiting to 2^24 should avoid overflow in those places.
917 // (The point of allowing more than 32 bits of memory is to 931 // (The point of allowing more than 32 bits of memory is to
918 // have plenty of room for the DFA states, not to use it up 932 // have plenty of room for the DFA states, not to use it up
919 // on the program.) 933 // on the program.)
920 if (m >= 1<<24) 934 if (m >= 1<<24)
921 m = 1<<24; 935 m = 1<<24;
922 936
923 // Inst imposes its own limit (currently bigger than 2^24 but be safe). 937 // Inst imposes its own limit (currently bigger than 2^24 but be safe).
924 if (m > Prog::Inst::kMaxInst) 938 if (m > Prog::Inst::kMaxInst)
925 m = Prog::Inst::kMaxInst; 939 m = Prog::Inst::kMaxInst;
926 940
927 max_inst_ = m; 941 max_inst_ = m;
928 } 942 }
943
944 anchor_ = anchor;
929 } 945 }
930 946
931 // Compiles re, returning program. 947 // Compiles re, returning program.
932 // Caller is responsible for deleting prog_. 948 // Caller is responsible for deleting prog_.
933 // If reversed is true, compiles a program that expects 949 // If reversed is true, compiles a program that expects
934 // to run over the input string backward (reverses all concatenations). 950 // to run over the input string backward (reverses all concatenations).
935 // The reversed flag is also recorded in the returned program. 951 // The reversed flag is also recorded in the returned program.
936 Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) { 952 Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {
937 Compiler c; 953 Compiler c;
938 954
939 c.Setup(re->parse_flags(), max_mem); 955 c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */);
940 c.reversed_ = reversed; 956 c.reversed_ = reversed;
941 957
942 // Simplify to remove things like counted repetitions 958 // Simplify to remove things like counted repetitions
943 // and character classes like \d. 959 // and character classes like \d.
944 Regexp* sre = re->Simplify(); 960 Regexp* sre = re->Simplify();
945 if (sre == NULL) 961 if (sre == NULL)
946 return NULL; 962 return NULL;
947 963
948 // Record whether prog is anchored, removing the anchors. 964 // Record whether prog is anchored, removing the anchors.
949 // (They get in the way of other optimizations.) 965 // (They get in the way of other optimizations.)
(...skipping 78 matching lines...) Expand 10 before | Expand all | Expand 10 after
1028 Prog* Regexp::CompileToReverseProg(int64 max_mem) { 1044 Prog* Regexp::CompileToReverseProg(int64 max_mem) {
1029 return Compiler::Compile(this, true, max_mem); 1045 return Compiler::Compile(this, true, max_mem);
1030 } 1046 }
1031 1047
1032 Frag Compiler::DotStar() { 1048 Frag Compiler::DotStar() {
1033 return Star(ByteRange(0x00, 0xff, false), true); 1049 return Star(ByteRange(0x00, 0xff, false), true);
1034 } 1050 }
1035 1051
1036 // Compiles RE set to Prog. 1052 // Compiles RE set to Prog.
1037 Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor, 1053 Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
1038 const vector<Regexp*>& re) { 1054 Regexp* re) {
1039 Compiler c; 1055 Compiler c;
1040 1056
1041 c.Setup(static_cast<Regexp::ParseFlags>(options.ParseFlags()), 1057 Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags());
1042 options.max_mem()); 1058 c.Setup(pf, options.max_mem(), anchor);
1043 1059
1044 // Accumulate alternation of fragments. 1060 // Compile alternation of fragments.
1045 Frag all = kNullFrag; 1061 Frag all = c.WalkExponential(re, kNullFrag, 2*c.max_inst_);
1046 for (int i = 0; i < re.size(); i++) { 1062 re->Decref();
1047 Frag f = c.WalkExponential(re[i], kNullFrag, 2*c.max_inst_); 1063 if (c.failed_)
1048 if (c.failed_) 1064 return NULL;
1049 return NULL; 1065
1050 switch (anchor) { 1066 if (anchor == RE2::UNANCHORED) {
1051 case RE2::UNANCHORED: 1067 // The trailing .* was added while handling kRegexpHaveMatch.
1052 f = c.Cat(c.Cat(c.DotStar(), f), c.DotStar()); 1068 // We just have to add the leading one.
1053 break; 1069 all = c.Cat(c.DotStar(), all);
1054 case RE2::ANCHOR_START:
1055 f = c.Cat(f, c.DotStar());
1056 break;
1057 case RE2::ANCHOR_BOTH:
1058 break;
1059 }
1060 f = c.Cat(f, c.Match(i));
1061 all = c.Alt(all, f);
1062 } 1070 }
1063 1071
1064 c.prog_->set_start(all.begin); 1072 c.prog_->set_start(all.begin);
1065 c.prog_->set_start_unanchored(all.begin); 1073 c.prog_->set_start_unanchored(all.begin);
1066 c.prog_->set_anchor_start(true); 1074 c.prog_->set_anchor_start(true);
1067 c.prog_->set_anchor_end(true); 1075 c.prog_->set_anchor_end(true);
1068 1076
1069 // Also create unanchored version, which starts with a .*? loop.
1070 if (c.prog_->anchor_start()) {
1071 c.prog_->set_start_unanchored(c.prog_->start());
1072 } else {
1073 Frag unanchored = c.Cat(c.DotStar(), all);
1074 c.prog_->set_start_unanchored(unanchored.begin);
1075 }
1076
1077 Prog* prog = c.Finish(); 1077 Prog* prog = c.Finish();
1078 if (prog == NULL) 1078 if (prog == NULL)
1079 return NULL; 1079 return NULL;
1080 1080
1081 // Make sure DFA has enough memory to operate, 1081 // Make sure DFA has enough memory to operate,
1082 // since we're not going to fall back to the NFA. 1082 // since we're not going to fall back to the NFA.
1083 bool failed; 1083 bool failed;
1084 StringPiece sp = "hello, world"; 1084 StringPiece sp = "hello, world";
1085 prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch, 1085 prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
1086 NULL, &failed, NULL); 1086 NULL, &failed, NULL);
1087 if (failed) { 1087 if (failed) {
1088 delete prog; 1088 delete prog;
1089 return NULL; 1089 return NULL;
1090 } 1090 }
1091 1091
1092 return prog; 1092 return prog;
1093 } 1093 }
1094 1094
1095 Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor, 1095 Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
1096 const vector<Regexp*>& re) { 1096 Regexp* re) {
1097 return Compiler::CompileSet(options, anchor, re); 1097 return Compiler::CompileSet(options, anchor, re);
1098 } 1098 }
1099 1099
1100 } // namespace re2 1100 } // namespace re2
OLDNEW
« no previous file with comments | « no previous file | re2/mimics_pcre.cc » ('j') | no next file with comments »

Powered by Google App Engine
RSS Feeds Recent Issues | This issue
This is Rietveld f62528b