OLD | NEW |
1 // Copyright 2007 The RE2 Authors. All Rights Reserved. | 1 // Copyright 2007 The RE2 Authors. All Rights Reserved. |
2 // Use of this source code is governed by a BSD-style | 2 // Use of this source code is governed by a BSD-style |
3 // license that can be found in the LICENSE file. | 3 // license that can be found in the LICENSE file. |
4 | 4 |
5 // Compile regular expression to Prog. | 5 // Compile regular expression to Prog. |
6 // | 6 // |
7 // Prog and Inst are defined in prog.h. | 7 // Prog and Inst are defined in prog.h. |
8 // This file's external interface is just Regexp::CompileToProg. | 8 // This file's external interface is just Regexp::CompileToProg. |
9 // The Compiler class defined in this file is private. | 9 // The Compiler class defined in this file is private. |
10 | 10 |
(...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
86 | 86 |
87 PatchList l = l1; | 87 PatchList l = l1; |
88 for (;;) { | 88 for (;;) { |
89 PatchList next = PatchList::Deref(inst0, l); | 89 PatchList next = PatchList::Deref(inst0, l); |
90 if (next.p == 0) | 90 if (next.p == 0) |
91 break; | 91 break; |
92 l = next; | 92 l = next; |
93 } | 93 } |
94 | 94 |
95 Prog::Inst* ip = &inst0[l.p>>1]; | 95 Prog::Inst* ip = &inst0[l.p>>1]; |
96 if(l.p&1) | 96 if (l.p&1) |
97 ip->out1_ = l2.p; | 97 ip->out1_ = l2.p; |
98 else | 98 else |
99 ip->set_out(l2.p); | 99 ip->set_out(l2.p); |
100 | 100 |
101 return l1; | 101 return l1; |
102 } | 102 } |
103 | 103 |
104 // Compiled program fragment. | 104 // Compiled program fragment. |
105 struct Frag { | 105 struct Frag { |
106 uint32 begin; | 106 uint32 begin; |
(...skipping 18 matching lines...) Expand all Loading... |
125 | 125 |
126 // Compiles Regexp to a new Prog. | 126 // Compiles Regexp to a new Prog. |
127 // Caller is responsible for deleting Prog when finished with it. | 127 // Caller is responsible for deleting Prog when finished with it. |
128 // If reversed is true, compiles for walking over the input | 128 // If reversed is true, compiles for walking over the input |
129 // string backward (reverses all concatenations). | 129 // string backward (reverses all concatenations). |
130 static Prog *Compile(Regexp* re, bool reversed, int64 max_mem); | 130 static Prog *Compile(Regexp* re, bool reversed, int64 max_mem); |
131 | 131 |
132 // Compiles alternation of all the re to a new Prog. | 132 // Compiles alternation of all the re to a new Prog. |
133 // Each re has a match with an id equal to its index in the vector. | 133 // Each re has a match with an id equal to its index in the vector. |
134 static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor, | 134 static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor, |
135 const vector<Regexp*>& re); | 135 Regexp* re); |
136 | 136 |
137 // Interface for Regexp::Walker, which helps traverse the Regexp. | 137 // Interface for Regexp::Walker, which helps traverse the Regexp. |
138 // The walk is purely post-recursive: given the machines for the | 138 // The walk is purely post-recursive: given the machines for the |
139 // children, PostVisit combines them to create the machine for | 139 // children, PostVisit combines them to create the machine for |
140 // the current node. The child_args are Frags. | 140 // the current node. The child_args are Frags. |
141 // The Compiler traverses the Regexp parse tree, visiting | 141 // The Compiler traverses the Regexp parse tree, visiting |
142 // each node in depth-first order. It invokes PreVisit before | 142 // each node in depth-first order. It invokes PreVisit before |
143 // visiting the node's children and PostVisit after visiting | 143 // visiting the node's children and PostVisit after visiting |
144 // the children. | 144 // the children. |
145 Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop); | 145 Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop); |
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
200 | 200 |
201 // Adds a suffix to alternation. | 201 // Adds a suffix to alternation. |
202 void AddSuffix(int id); | 202 void AddSuffix(int id); |
203 | 203 |
204 // Returns the alternation of all the added suffixes. | 204 // Returns the alternation of all the added suffixes. |
205 Frag EndRange(); | 205 Frag EndRange(); |
206 | 206 |
207 // Single rune. | 207 // Single rune. |
208 Frag Literal(Rune r, bool foldcase); | 208 Frag Literal(Rune r, bool foldcase); |
209 | 209 |
210 void Setup(Regexp::ParseFlags, int64); | 210 void Setup(Regexp::ParseFlags, int64, RE2::Anchor); |
211 Prog* Finish(); | 211 Prog* Finish(); |
212 | 212 |
213 // Returns .* where dot = any byte | 213 // Returns .* where dot = any byte |
214 Frag DotStar(); | 214 Frag DotStar(); |
215 | 215 |
216 private: | 216 private: |
217 Prog* prog_; // Program being built. | 217 Prog* prog_; // Program being built. |
218 bool failed_; // Did we give up compiling? | 218 bool failed_; // Did we give up compiling? |
219 Encoding encoding_; // Input encoding | 219 Encoding encoding_; // Input encoding |
220 bool reversed_; // Should program run backward over text? | 220 bool reversed_; // Should program run backward over text? |
221 | 221 |
222 int max_inst_; // Maximum number of instructions. | 222 int max_inst_; // Maximum number of instructions. |
223 | 223 |
224 Prog::Inst* inst_; // Pointer to first instruction. | 224 Prog::Inst* inst_; // Pointer to first instruction. |
225 int inst_len_; // Number of instructions used. | 225 int inst_len_; // Number of instructions used. |
226 int inst_cap_; // Number of instructions allocated. | 226 int inst_cap_; // Number of instructions allocated. |
227 | 227 |
228 int64 max_mem_; // Total memory budget. | 228 int64 max_mem_; // Total memory budget. |
229 | 229 |
230 map<uint64, int> rune_cache_; | 230 map<uint64, int> rune_cache_; |
231 Frag rune_range_; | 231 Frag rune_range_; |
232 | 232 |
| 233 RE2::Anchor anchor_; // anchor mode for RE2::Set |
| 234 |
233 DISALLOW_EVIL_CONSTRUCTORS(Compiler); | 235 DISALLOW_EVIL_CONSTRUCTORS(Compiler); |
234 }; | 236 }; |
235 | 237 |
236 Compiler::Compiler() { | 238 Compiler::Compiler() { |
237 prog_ = new Prog(); | 239 prog_ = new Prog(); |
238 failed_ = false; | 240 failed_ = false; |
239 encoding_ = kEncodingUTF8; | 241 encoding_ = kEncodingUTF8; |
240 reversed_ = false; | 242 reversed_ = false; |
241 inst_ = NULL; | 243 inst_ = NULL; |
242 inst_len_ = 0; | 244 inst_len_ = 0; |
(...skipping 228 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
471 // The Frag accumulates in rune_range_. Caching common | 473 // The Frag accumulates in rune_range_. Caching common |
472 // suffixes reduces the UTF-8 "." from 32 to 24 instructions, | 474 // suffixes reduces the UTF-8 "." from 32 to 24 instructions, |
473 // and it reduces the corresponding one-pass NFA from 16 nodes to 8. | 475 // and it reduces the corresponding one-pass NFA from 16 nodes to 8. |
474 | 476 |
475 void Compiler::BeginRange() { | 477 void Compiler::BeginRange() { |
476 rune_cache_.clear(); | 478 rune_cache_.clear(); |
477 rune_range_.begin = NULL; | 479 rune_range_.begin = NULL; |
478 rune_range_.end = nullPatchList; | 480 rune_range_.end = nullPatchList; |
479 } | 481 } |
480 | 482 |
481 int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next
) { | 483 int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, |
| 484 int next) { |
482 Frag f = ByteRange(lo, hi, foldcase); | 485 Frag f = ByteRange(lo, hi, foldcase); |
483 if (next != 0) { | 486 if (next != 0) { |
484 PatchList::Patch(inst_, f.end, next); | 487 PatchList::Patch(inst_, f.end, next); |
485 } else { | 488 } else { |
486 rune_range_.end = PatchList::Append(inst_, rune_range_.end, f.end); | 489 rune_range_.end = PatchList::Append(inst_, rune_range_.end, f.end); |
487 } | 490 } |
488 return f.begin; | 491 return f.begin; |
489 } | 492 } |
490 | 493 |
491 int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) { | 494 int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) { |
492 // In Latin1 mode, there's no point in caching. | 495 // In Latin1 mode, there's no point in caching. |
493 // In forward UTF-8 mode, only need to cache continuation bytes. | 496 // In forward UTF-8 mode, only need to cache continuation bytes. |
494 if (encoding_ == kEncodingLatin1 || | 497 if (encoding_ == kEncodingLatin1 || |
495 (encoding_ == kEncodingUTF8 && !reversed_ && !(0x80 <= lo && hi <= 0xbf)))
{ | 498 (encoding_ == kEncodingUTF8 && |
| 499 !reversed_ && |
| 500 !(0x80 <= lo && hi <= 0xbf))) { |
496 return UncachedRuneByteSuffix(lo, hi, foldcase, next); | 501 return UncachedRuneByteSuffix(lo, hi, foldcase, next); |
497 } | 502 } |
498 | 503 |
499 uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | foldcase; | 504 uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | foldcase; |
500 map<uint64, int>::iterator it = rune_cache_.find(key); | 505 map<uint64, int>::iterator it = rune_cache_.find(key); |
501 if (it != rune_cache_.end()) | 506 if (it != rune_cache_.end()) |
502 return it->second; | 507 return it->second; |
503 int id = UncachedRuneByteSuffix(lo, hi, foldcase, next); | 508 int id = UncachedRuneByteSuffix(lo, hi, foldcase, next); |
504 rune_cache_[key] = id; | 509 rune_cache_[key] = id; |
505 return id; | 510 return id; |
(...skipping 210 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
716 case kRegexpRepeat: | 721 case kRegexpRepeat: |
717 // Should not see; code at bottom of function will print error | 722 // Should not see; code at bottom of function will print error |
718 break; | 723 break; |
719 | 724 |
720 case kRegexpNoMatch: | 725 case kRegexpNoMatch: |
721 return NoMatch(); | 726 return NoMatch(); |
722 | 727 |
723 case kRegexpEmptyMatch: | 728 case kRegexpEmptyMatch: |
724 return Nop(); | 729 return Nop(); |
725 | 730 |
| 731 case kRegexpHaveMatch: { |
| 732 Frag f = Match(re->match_id()); |
| 733 // Remember unanchored match to end of string. |
| 734 if (anchor_ != RE2::ANCHOR_BOTH) |
| 735 f = Cat(DotStar(), f); |
| 736 return f; |
| 737 } |
| 738 |
726 case kRegexpConcat: { | 739 case kRegexpConcat: { |
727 Frag f = child_frags[0]; | 740 Frag f = child_frags[0]; |
728 for (int i = 1; i < nchild_frags; i++) | 741 for (int i = 1; i < nchild_frags; i++) |
729 f = Cat(f, child_frags[i]); | 742 f = Cat(f, child_frags[i]); |
730 return f; | 743 return f; |
731 } | 744 } |
732 | 745 |
733 case kRegexpAlternate: { | 746 case kRegexpAlternate: { |
734 Frag f = child_frags[0]; | 747 Frag f = child_frags[0]; |
735 for (int i = 1; i < nchild_frags; i++) | 748 for (int i = 1; i < nchild_frags; i++) |
(...skipping 153 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
889 continue; | 902 continue; |
890 case kRegexpEndText: | 903 case kRegexpEndText: |
891 *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); | 904 *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); |
892 re->Decref(); | 905 re->Decref(); |
893 return true; | 906 return true; |
894 } | 907 } |
895 return false; | 908 return false; |
896 } | 909 } |
897 } | 910 } |
898 | 911 |
899 void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem) { | 912 void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem, |
| 913 RE2::Anchor anchor) { |
900 prog_->set_flags(flags); | 914 prog_->set_flags(flags); |
901 | 915 |
902 if (flags & Regexp::Latin1) | 916 if (flags & Regexp::Latin1) |
903 encoding_ = kEncodingLatin1; | 917 encoding_ = kEncodingLatin1; |
904 max_mem_ = max_mem; | 918 max_mem_ = max_mem; |
905 if (max_mem <= 0) { | 919 if (max_mem <= 0) { |
906 max_inst_ = 100000; // more than enough | 920 max_inst_ = 100000; // more than enough |
907 } else if (max_mem <= sizeof(Prog)) { | 921 } else if (max_mem <= sizeof(Prog)) { |
908 // No room for anything. | 922 // No room for anything. |
909 max_inst_ = 0; | 923 max_inst_ = 0; |
910 } else { | 924 } else { |
911 int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst); | 925 int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst); |
912 // Limit instruction count so that inst->id() fits nicely in an int. | 926 // Limit instruction count so that inst->id() fits nicely in an int. |
913 // SparseArray also assumes that the indices (inst->id()) are ints. | 927 // SparseArray also assumes that the indices (inst->id()) are ints. |
914 // The call to WalkExponential uses 2*max_inst_ below, | 928 // The call to WalkExponential uses 2*max_inst_ below, |
915 // and other places in the code use 2 or 3 * prog->size(). | 929 // and other places in the code use 2 or 3 * prog->size(). |
916 // Limiting to 2^24 should avoid overflow in those places. | 930 // Limiting to 2^24 should avoid overflow in those places. |
917 // (The point of allowing more than 32 bits of memory is to | 931 // (The point of allowing more than 32 bits of memory is to |
918 // have plenty of room for the DFA states, not to use it up | 932 // have plenty of room for the DFA states, not to use it up |
919 // on the program.) | 933 // on the program.) |
920 if (m >= 1<<24) | 934 if (m >= 1<<24) |
921 m = 1<<24; | 935 m = 1<<24; |
922 | 936 |
923 // Inst imposes its own limit (currently bigger than 2^24 but be safe). | 937 // Inst imposes its own limit (currently bigger than 2^24 but be safe). |
924 if (m > Prog::Inst::kMaxInst) | 938 if (m > Prog::Inst::kMaxInst) |
925 m = Prog::Inst::kMaxInst; | 939 m = Prog::Inst::kMaxInst; |
926 | 940 |
927 max_inst_ = m; | 941 max_inst_ = m; |
928 } | 942 } |
| 943 |
| 944 anchor_ = anchor; |
929 } | 945 } |
930 | 946 |
931 // Compiles re, returning program. | 947 // Compiles re, returning program. |
932 // Caller is responsible for deleting prog_. | 948 // Caller is responsible for deleting prog_. |
933 // If reversed is true, compiles a program that expects | 949 // If reversed is true, compiles a program that expects |
934 // to run over the input string backward (reverses all concatenations). | 950 // to run over the input string backward (reverses all concatenations). |
935 // The reversed flag is also recorded in the returned program. | 951 // The reversed flag is also recorded in the returned program. |
936 Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) { | 952 Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) { |
937 Compiler c; | 953 Compiler c; |
938 | 954 |
939 c.Setup(re->parse_flags(), max_mem); | 955 c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */); |
940 c.reversed_ = reversed; | 956 c.reversed_ = reversed; |
941 | 957 |
942 // Simplify to remove things like counted repetitions | 958 // Simplify to remove things like counted repetitions |
943 // and character classes like \d. | 959 // and character classes like \d. |
944 Regexp* sre = re->Simplify(); | 960 Regexp* sre = re->Simplify(); |
945 if (sre == NULL) | 961 if (sre == NULL) |
946 return NULL; | 962 return NULL; |
947 | 963 |
948 // Record whether prog is anchored, removing the anchors. | 964 // Record whether prog is anchored, removing the anchors. |
949 // (They get in the way of other optimizations.) | 965 // (They get in the way of other optimizations.) |
(...skipping 78 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1028 Prog* Regexp::CompileToReverseProg(int64 max_mem) { | 1044 Prog* Regexp::CompileToReverseProg(int64 max_mem) { |
1029 return Compiler::Compile(this, true, max_mem); | 1045 return Compiler::Compile(this, true, max_mem); |
1030 } | 1046 } |
1031 | 1047 |
1032 Frag Compiler::DotStar() { | 1048 Frag Compiler::DotStar() { |
1033 return Star(ByteRange(0x00, 0xff, false), true); | 1049 return Star(ByteRange(0x00, 0xff, false), true); |
1034 } | 1050 } |
1035 | 1051 |
1036 // Compiles RE set to Prog. | 1052 // Compiles RE set to Prog. |
1037 Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor, | 1053 Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor, |
1038 const vector<Regexp*>& re) { | 1054 Regexp* re) { |
1039 Compiler c; | 1055 Compiler c; |
1040 | 1056 |
1041 c.Setup(static_cast<Regexp::ParseFlags>(options.ParseFlags()), | 1057 Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags()); |
1042 options.max_mem()); | 1058 c.Setup(pf, options.max_mem(), anchor); |
1043 | 1059 |
1044 // Accumulate alternation of fragments. | 1060 // Compile alternation of fragments. |
1045 Frag all = kNullFrag; | 1061 Frag all = c.WalkExponential(re, kNullFrag, 2*c.max_inst_); |
1046 for (int i = 0; i < re.size(); i++) { | 1062 re->Decref(); |
1047 Frag f = c.WalkExponential(re[i], kNullFrag, 2*c.max_inst_); | 1063 if (c.failed_) |
1048 if (c.failed_) | 1064 return NULL; |
1049 return NULL; | 1065 |
1050 switch (anchor) { | 1066 if (anchor == RE2::UNANCHORED) { |
1051 case RE2::UNANCHORED: | 1067 // The trailing .* was added while handling kRegexpHaveMatch. |
1052 f = c.Cat(c.Cat(c.DotStar(), f), c.DotStar()); | 1068 // We just have to add the leading one. |
1053 break; | 1069 all = c.Cat(c.DotStar(), all); |
1054 case RE2::ANCHOR_START: | |
1055 f = c.Cat(f, c.DotStar()); | |
1056 break; | |
1057 case RE2::ANCHOR_BOTH: | |
1058 break; | |
1059 } | |
1060 f = c.Cat(f, c.Match(i)); | |
1061 all = c.Alt(all, f); | |
1062 } | 1070 } |
1063 | 1071 |
1064 c.prog_->set_start(all.begin); | 1072 c.prog_->set_start(all.begin); |
1065 c.prog_->set_start_unanchored(all.begin); | 1073 c.prog_->set_start_unanchored(all.begin); |
1066 c.prog_->set_anchor_start(true); | 1074 c.prog_->set_anchor_start(true); |
1067 c.prog_->set_anchor_end(true); | 1075 c.prog_->set_anchor_end(true); |
1068 | 1076 |
1069 // Also create unanchored version, which starts with a .*? loop. | |
1070 if (c.prog_->anchor_start()) { | |
1071 c.prog_->set_start_unanchored(c.prog_->start()); | |
1072 } else { | |
1073 Frag unanchored = c.Cat(c.DotStar(), all); | |
1074 c.prog_->set_start_unanchored(unanchored.begin); | |
1075 } | |
1076 | |
1077 Prog* prog = c.Finish(); | 1077 Prog* prog = c.Finish(); |
1078 if (prog == NULL) | 1078 if (prog == NULL) |
1079 return NULL; | 1079 return NULL; |
1080 | 1080 |
1081 // Make sure DFA has enough memory to operate, | 1081 // Make sure DFA has enough memory to operate, |
1082 // since we're not going to fall back to the NFA. | 1082 // since we're not going to fall back to the NFA. |
1083 bool failed; | 1083 bool failed; |
1084 StringPiece sp = "hello, world"; | 1084 StringPiece sp = "hello, world"; |
1085 prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch, | 1085 prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch, |
1086 NULL, &failed, NULL); | 1086 NULL, &failed, NULL); |
1087 if (failed) { | 1087 if (failed) { |
1088 delete prog; | 1088 delete prog; |
1089 return NULL; | 1089 return NULL; |
1090 } | 1090 } |
1091 | 1091 |
1092 return prog; | 1092 return prog; |
1093 } | 1093 } |
1094 | 1094 |
1095 Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor, | 1095 Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor, |
1096 const vector<Regexp*>& re) { | 1096 Regexp* re) { |
1097 return Compiler::CompileSet(options, anchor, re); | 1097 return Compiler::CompileSet(options, anchor, re); |
1098 } | 1098 } |
1099 | 1099 |
1100 } // namespace re2 | 1100 } // namespace re2 |
OLD | NEW |