Платформа ЦРНП "Мирокод" для разработки проектов
https://git.mirocod.ru
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
896 lines
19 KiB
896 lines
19 KiB
package syntax |
|
|
|
import ( |
|
"bytes" |
|
"fmt" |
|
"strconv" |
|
"unicode" |
|
"unicode/utf8" |
|
) |
|
|
|
type Prefix struct { |
|
PrefixStr []rune |
|
PrefixSet CharSet |
|
CaseInsensitive bool |
|
} |
|
|
|
// It takes a RegexTree and computes the set of chars that can start it. |
|
func getFirstCharsPrefix(tree *RegexTree) *Prefix { |
|
s := regexFcd{ |
|
fcStack: make([]regexFc, 32), |
|
intStack: make([]int, 32), |
|
} |
|
fc := s.regexFCFromRegexTree(tree) |
|
|
|
if fc == nil || fc.nullable || fc.cc.IsEmpty() { |
|
return nil |
|
} |
|
fcSet := fc.getFirstChars() |
|
return &Prefix{PrefixSet: fcSet, CaseInsensitive: fc.caseInsensitive} |
|
} |
|
|
|
type regexFcd struct { |
|
intStack []int |
|
intDepth int |
|
fcStack []regexFc |
|
fcDepth int |
|
skipAllChildren bool // don't process any more children at the current level |
|
skipchild bool // don't process the current child. |
|
failed bool |
|
} |
|
|
|
/* |
|
* The main FC computation. It does a shortcutted depth-first walk |
|
* through the tree and calls CalculateFC to emits code before |
|
* and after each child of an interior node, and at each leaf. |
|
*/ |
|
func (s *regexFcd) regexFCFromRegexTree(tree *RegexTree) *regexFc { |
|
curNode := tree.root |
|
curChild := 0 |
|
|
|
for { |
|
if len(curNode.children) == 0 { |
|
// This is a leaf node |
|
s.calculateFC(curNode.t, curNode, 0) |
|
} else if curChild < len(curNode.children) && !s.skipAllChildren { |
|
// This is an interior node, and we have more children to analyze |
|
s.calculateFC(curNode.t|beforeChild, curNode, curChild) |
|
|
|
if !s.skipchild { |
|
curNode = curNode.children[curChild] |
|
// this stack is how we get a depth first walk of the tree. |
|
s.pushInt(curChild) |
|
curChild = 0 |
|
} else { |
|
curChild++ |
|
s.skipchild = false |
|
} |
|
continue |
|
} |
|
|
|
// This is an interior node where we've finished analyzing all the children, or |
|
// the end of a leaf node. |
|
s.skipAllChildren = false |
|
|
|
if s.intIsEmpty() { |
|
break |
|
} |
|
|
|
curChild = s.popInt() |
|
curNode = curNode.next |
|
|
|
s.calculateFC(curNode.t|afterChild, curNode, curChild) |
|
if s.failed { |
|
return nil |
|
} |
|
|
|
curChild++ |
|
} |
|
|
|
if s.fcIsEmpty() { |
|
return nil |
|
} |
|
|
|
return s.popFC() |
|
} |
|
|
|
// To avoid recursion, we use a simple integer stack. |
|
// This is the push. |
|
func (s *regexFcd) pushInt(I int) { |
|
if s.intDepth >= len(s.intStack) { |
|
expanded := make([]int, s.intDepth*2) |
|
copy(expanded, s.intStack) |
|
s.intStack = expanded |
|
} |
|
|
|
s.intStack[s.intDepth] = I |
|
s.intDepth++ |
|
} |
|
|
|
// True if the stack is empty. |
|
func (s *regexFcd) intIsEmpty() bool { |
|
return s.intDepth == 0 |
|
} |
|
|
|
// This is the pop. |
|
func (s *regexFcd) popInt() int { |
|
s.intDepth-- |
|
return s.intStack[s.intDepth] |
|
} |
|
|
|
// We also use a stack of RegexFC objects. |
|
// This is the push. |
|
func (s *regexFcd) pushFC(fc regexFc) { |
|
if s.fcDepth >= len(s.fcStack) { |
|
expanded := make([]regexFc, s.fcDepth*2) |
|
copy(expanded, s.fcStack) |
|
s.fcStack = expanded |
|
} |
|
|
|
s.fcStack[s.fcDepth] = fc |
|
s.fcDepth++ |
|
} |
|
|
|
// True if the stack is empty. |
|
func (s *regexFcd) fcIsEmpty() bool { |
|
return s.fcDepth == 0 |
|
} |
|
|
|
// This is the pop. |
|
func (s *regexFcd) popFC() *regexFc { |
|
s.fcDepth-- |
|
return &s.fcStack[s.fcDepth] |
|
} |
|
|
|
// This is the top. |
|
func (s *regexFcd) topFC() *regexFc { |
|
return &s.fcStack[s.fcDepth-1] |
|
} |
|
|
|
// Called in Beforechild to prevent further processing of the current child |
|
func (s *regexFcd) skipChild() { |
|
s.skipchild = true |
|
} |
|
|
|
// FC computation and shortcut cases for each node type |
|
func (s *regexFcd) calculateFC(nt nodeType, node *regexNode, CurIndex int) { |
|
//fmt.Printf("NodeType: %v, CurIndex: %v, Desc: %v\n", nt, CurIndex, node.description()) |
|
ci := false |
|
rtl := false |
|
|
|
if nt <= ntRef { |
|
if (node.options & IgnoreCase) != 0 { |
|
ci = true |
|
} |
|
if (node.options & RightToLeft) != 0 { |
|
rtl = true |
|
} |
|
} |
|
|
|
switch nt { |
|
case ntConcatenate | beforeChild, ntAlternate | beforeChild, ntTestref | beforeChild, ntLoop | beforeChild, ntLazyloop | beforeChild: |
|
break |
|
|
|
case ntTestgroup | beforeChild: |
|
if CurIndex == 0 { |
|
s.skipChild() |
|
} |
|
break |
|
|
|
case ntEmpty: |
|
s.pushFC(regexFc{nullable: true}) |
|
break |
|
|
|
case ntConcatenate | afterChild: |
|
if CurIndex != 0 { |
|
child := s.popFC() |
|
cumul := s.topFC() |
|
|
|
s.failed = !cumul.addFC(*child, true) |
|
} |
|
|
|
fc := s.topFC() |
|
if !fc.nullable { |
|
s.skipAllChildren = true |
|
} |
|
break |
|
|
|
case ntTestgroup | afterChild: |
|
if CurIndex > 1 { |
|
child := s.popFC() |
|
cumul := s.topFC() |
|
|
|
s.failed = !cumul.addFC(*child, false) |
|
} |
|
break |
|
|
|
case ntAlternate | afterChild, ntTestref | afterChild: |
|
if CurIndex != 0 { |
|
child := s.popFC() |
|
cumul := s.topFC() |
|
|
|
s.failed = !cumul.addFC(*child, false) |
|
} |
|
break |
|
|
|
case ntLoop | afterChild, ntLazyloop | afterChild: |
|
if node.m == 0 { |
|
fc := s.topFC() |
|
fc.nullable = true |
|
} |
|
break |
|
|
|
case ntGroup | beforeChild, ntGroup | afterChild, ntCapture | beforeChild, ntCapture | afterChild, ntGreedy | beforeChild, ntGreedy | afterChild: |
|
break |
|
|
|
case ntRequire | beforeChild, ntPrevent | beforeChild: |
|
s.skipChild() |
|
s.pushFC(regexFc{nullable: true}) |
|
break |
|
|
|
case ntRequire | afterChild, ntPrevent | afterChild: |
|
break |
|
|
|
case ntOne, ntNotone: |
|
s.pushFC(newRegexFc(node.ch, nt == ntNotone, false, ci)) |
|
break |
|
|
|
case ntOneloop, ntOnelazy: |
|
s.pushFC(newRegexFc(node.ch, false, node.m == 0, ci)) |
|
break |
|
|
|
case ntNotoneloop, ntNotonelazy: |
|
s.pushFC(newRegexFc(node.ch, true, node.m == 0, ci)) |
|
break |
|
|
|
case ntMulti: |
|
if len(node.str) == 0 { |
|
s.pushFC(regexFc{nullable: true}) |
|
} else if !rtl { |
|
s.pushFC(newRegexFc(node.str[0], false, false, ci)) |
|
} else { |
|
s.pushFC(newRegexFc(node.str[len(node.str)-1], false, false, ci)) |
|
} |
|
break |
|
|
|
case ntSet: |
|
s.pushFC(regexFc{cc: node.set.Copy(), nullable: false, caseInsensitive: ci}) |
|
break |
|
|
|
case ntSetloop, ntSetlazy: |
|
s.pushFC(regexFc{cc: node.set.Copy(), nullable: node.m == 0, caseInsensitive: ci}) |
|
break |
|
|
|
case ntRef: |
|
s.pushFC(regexFc{cc: *AnyClass(), nullable: true, caseInsensitive: false}) |
|
break |
|
|
|
case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd: |
|
s.pushFC(regexFc{nullable: true}) |
|
break |
|
|
|
default: |
|
panic(fmt.Sprintf("unexpected op code: %v", nt)) |
|
} |
|
} |
|
|
|
type regexFc struct { |
|
cc CharSet |
|
nullable bool |
|
caseInsensitive bool |
|
} |
|
|
|
func newRegexFc(ch rune, not, nullable, caseInsensitive bool) regexFc { |
|
r := regexFc{ |
|
caseInsensitive: caseInsensitive, |
|
nullable: nullable, |
|
} |
|
if not { |
|
if ch > 0 { |
|
r.cc.addRange('\x00', ch-1) |
|
} |
|
if ch < 0xFFFF { |
|
r.cc.addRange(ch+1, utf8.MaxRune) |
|
} |
|
} else { |
|
r.cc.addRange(ch, ch) |
|
} |
|
return r |
|
} |
|
|
|
func (r *regexFc) getFirstChars() CharSet { |
|
if r.caseInsensitive { |
|
r.cc.addLowercase() |
|
} |
|
|
|
return r.cc |
|
} |
|
|
|
func (r *regexFc) addFC(fc regexFc, concatenate bool) bool { |
|
if !r.cc.IsMergeable() || !fc.cc.IsMergeable() { |
|
return false |
|
} |
|
|
|
if concatenate { |
|
if !r.nullable { |
|
return true |
|
} |
|
|
|
if !fc.nullable { |
|
r.nullable = false |
|
} |
|
} else { |
|
if fc.nullable { |
|
r.nullable = true |
|
} |
|
} |
|
|
|
r.caseInsensitive = r.caseInsensitive || fc.caseInsensitive |
|
r.cc.addSet(fc.cc) |
|
|
|
return true |
|
} |
|
|
|
// This is a related computation: it takes a RegexTree and computes the |
|
// leading substring if it sees one. It's quite trivial and gives up easily. |
|
func getPrefix(tree *RegexTree) *Prefix { |
|
var concatNode *regexNode |
|
nextChild := 0 |
|
|
|
curNode := tree.root |
|
|
|
for { |
|
switch curNode.t { |
|
case ntConcatenate: |
|
if len(curNode.children) > 0 { |
|
concatNode = curNode |
|
nextChild = 0 |
|
} |
|
|
|
case ntGreedy, ntCapture: |
|
curNode = curNode.children[0] |
|
concatNode = nil |
|
continue |
|
|
|
case ntOneloop, ntOnelazy: |
|
if curNode.m > 0 { |
|
return &Prefix{ |
|
PrefixStr: repeat(curNode.ch, curNode.m), |
|
CaseInsensitive: (curNode.options & IgnoreCase) != 0, |
|
} |
|
} |
|
return nil |
|
|
|
case ntOne: |
|
return &Prefix{ |
|
PrefixStr: []rune{curNode.ch}, |
|
CaseInsensitive: (curNode.options & IgnoreCase) != 0, |
|
} |
|
|
|
case ntMulti: |
|
return &Prefix{ |
|
PrefixStr: curNode.str, |
|
CaseInsensitive: (curNode.options & IgnoreCase) != 0, |
|
} |
|
|
|
case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning, ntStart, |
|
ntEndZ, ntEnd, ntEmpty, ntRequire, ntPrevent: |
|
|
|
default: |
|
return nil |
|
} |
|
|
|
if concatNode == nil || nextChild >= len(concatNode.children) { |
|
return nil |
|
} |
|
|
|
curNode = concatNode.children[nextChild] |
|
nextChild++ |
|
} |
|
} |
|
|
|
// repeat the rune r, c times... up to the max of MaxPrefixSize |
|
func repeat(r rune, c int) []rune { |
|
if c > MaxPrefixSize { |
|
c = MaxPrefixSize |
|
} |
|
|
|
ret := make([]rune, c) |
|
|
|
// binary growth using copy for speed |
|
ret[0] = r |
|
bp := 1 |
|
for bp < len(ret) { |
|
copy(ret[bp:], ret[:bp]) |
|
bp *= 2 |
|
} |
|
|
|
return ret |
|
} |
|
|
|
// BmPrefix holds precomputed Boyer-Moore tables for fast literal scanning.
// The tables make it possible to find the first occurrence of a string in a
// large text without examining every character. How much is skipped depends
// on the pattern and the text, but in general a longer pattern means fewer
// characters have to be looked at.
type BmPrefix struct {
	// positive is the good-suffix shift table, indexed by pattern position.
	positive []int
	// negativeASCII is the bad-character shift table for runes below 128.
	negativeASCII []int
	// negativeUnicode is the bad-character shift table for runes up to
	// 0xFFFF, indexed by high byte and then low byte; rows are allocated
	// only for high bytes that occur in the pattern.
	negativeUnicode [][]int
	// pattern is the literal being searched for (lowercased when
	// caseInsensitive is set).
	pattern []rune
	// lowASCII and highASCII are the smallest and largest ASCII runes
	// found in the pattern.
	lowASCII  rune
	highASCII rune

	rightToLeft     bool
	caseInsensitive bool
}
|
|
|
func newBmPrefix(pattern []rune, caseInsensitive, rightToLeft bool) *BmPrefix { |
|
|
|
b := &BmPrefix{ |
|
rightToLeft: rightToLeft, |
|
caseInsensitive: caseInsensitive, |
|
pattern: pattern, |
|
} |
|
|
|
if caseInsensitive { |
|
for i := 0; i < len(b.pattern); i++ { |
|
// We do the ToLower character by character for consistency. With surrogate chars, doing |
|
// a ToLower on the entire string could actually change the surrogate pair. This is more correct |
|
// linguistically, but since Regex doesn't support surrogates, it's more important to be |
|
// consistent. |
|
|
|
b.pattern[i] = unicode.ToLower(b.pattern[i]) |
|
} |
|
} |
|
|
|
var beforefirst, last, bump int |
|
var scan, match int |
|
|
|
if !rightToLeft { |
|
beforefirst = -1 |
|
last = len(b.pattern) - 1 |
|
bump = 1 |
|
} else { |
|
beforefirst = len(b.pattern) |
|
last = 0 |
|
bump = -1 |
|
} |
|
|
|
// PART I - the good-suffix shift table |
|
// |
|
// compute the positive requirement: |
|
// if char "i" is the first one from the right that doesn't match, |
|
// then we know the matcher can advance by _positive[i]. |
|
// |
|
// This algorithm is a simplified variant of the standard |
|
// Boyer-Moore good suffix calculation. |
|
|
|
b.positive = make([]int, len(b.pattern)) |
|
|
|
examine := last |
|
ch := b.pattern[examine] |
|
b.positive[examine] = bump |
|
examine -= bump |
|
|
|
Outerloop: |
|
for { |
|
// find an internal char (examine) that matches the tail |
|
|
|
for { |
|
if examine == beforefirst { |
|
break Outerloop |
|
} |
|
if b.pattern[examine] == ch { |
|
break |
|
} |
|
examine -= bump |
|
} |
|
|
|
match = last |
|
scan = examine |
|
|
|
// find the length of the match |
|
for { |
|
if scan == beforefirst || b.pattern[match] != b.pattern[scan] { |
|
// at the end of the match, note the difference in _positive |
|
// this is not the length of the match, but the distance from the internal match |
|
// to the tail suffix. |
|
if b.positive[match] == 0 { |
|
b.positive[match] = match - scan |
|
} |
|
|
|
// System.Diagnostics.Debug.WriteLine("Set positive[" + match + "] to " + (match - scan)); |
|
|
|
break |
|
} |
|
|
|
scan -= bump |
|
match -= bump |
|
} |
|
|
|
examine -= bump |
|
} |
|
|
|
match = last - bump |
|
|
|
// scan for the chars for which there are no shifts that yield a different candidate |
|
|
|
// The inside of the if statement used to say |
|
// "_positive[match] = last - beforefirst;" |
|
// This is slightly less aggressive in how much we skip, but at worst it |
|
// should mean a little more work rather than skipping a potential match. |
|
for match != beforefirst { |
|
if b.positive[match] == 0 { |
|
b.positive[match] = bump |
|
} |
|
|
|
match -= bump |
|
} |
|
|
|
// PART II - the bad-character shift table |
|
// |
|
// compute the negative requirement: |
|
// if char "ch" is the reject character when testing position "i", |
|
// we can slide up by _negative[ch]; |
|
// (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch)) |
|
// |
|
// the lookup table is divided into ASCII and Unicode portions; |
|
// only those parts of the Unicode 16-bit code set that actually |
|
// appear in the string are in the table. (Maximum size with |
|
// Unicode is 65K; ASCII only case is 512 bytes.) |
|
|
|
b.negativeASCII = make([]int, 128) |
|
|
|
for i := 0; i < len(b.negativeASCII); i++ { |
|
b.negativeASCII[i] = last - beforefirst |
|
} |
|
|
|
b.lowASCII = 127 |
|
b.highASCII = 0 |
|
|
|
for examine = last; examine != beforefirst; examine -= bump { |
|
ch = b.pattern[examine] |
|
|
|
switch { |
|
case ch < 128: |
|
if b.lowASCII > ch { |
|
b.lowASCII = ch |
|
} |
|
|
|
if b.highASCII < ch { |
|
b.highASCII = ch |
|
} |
|
|
|
if b.negativeASCII[ch] == last-beforefirst { |
|
b.negativeASCII[ch] = last - examine |
|
} |
|
case ch <= 0xffff: |
|
i, j := ch>>8, ch&0xFF |
|
|
|
if b.negativeUnicode == nil { |
|
b.negativeUnicode = make([][]int, 256) |
|
} |
|
|
|
if b.negativeUnicode[i] == nil { |
|
newarray := make([]int, 256) |
|
|
|
for k := 0; k < len(newarray); k++ { |
|
newarray[k] = last - beforefirst |
|
} |
|
|
|
if i == 0 { |
|
copy(newarray, b.negativeASCII) |
|
//TODO: this line needed? |
|
b.negativeASCII = newarray |
|
} |
|
|
|
b.negativeUnicode[i] = newarray |
|
} |
|
|
|
if b.negativeUnicode[i][j] == last-beforefirst { |
|
b.negativeUnicode[i][j] = last - examine |
|
} |
|
default: |
|
// we can't do the filter because this algo doesn't support |
|
// unicode chars >0xffff |
|
return nil |
|
} |
|
} |
|
|
|
return b |
|
} |
|
|
|
func (b *BmPrefix) String() string { |
|
return string(b.pattern) |
|
} |
|
|
|
// Dump returns the contents of the filter as a human readable string |
|
func (b *BmPrefix) Dump(indent string) string { |
|
buf := &bytes.Buffer{} |
|
|
|
fmt.Fprintf(buf, "%sBM Pattern: %s\n%sPositive: ", indent, string(b.pattern), indent) |
|
for i := 0; i < len(b.positive); i++ { |
|
buf.WriteString(strconv.Itoa(b.positive[i])) |
|
buf.WriteRune(' ') |
|
} |
|
buf.WriteRune('\n') |
|
|
|
if b.negativeASCII != nil { |
|
buf.WriteString(indent) |
|
buf.WriteString("Negative table\n") |
|
for i := 0; i < len(b.negativeASCII); i++ { |
|
if b.negativeASCII[i] != len(b.pattern) { |
|
fmt.Fprintf(buf, "%s %s %s\n", indent, Escape(string(rune(i))), strconv.Itoa(b.negativeASCII[i])) |
|
} |
|
} |
|
} |
|
|
|
return buf.String() |
|
} |
|
|
|
// Scan uses the Boyer-Moore algorithm to find the first occurrence |
|
// of the specified string within text, beginning at index, and |
|
// constrained within beglimit and endlimit. |
|
// |
|
// The direction and case-sensitivity of the match is determined |
|
// by the arguments to the RegexBoyerMoore constructor. |
|
func (b *BmPrefix) Scan(text []rune, index, beglimit, endlimit int) int { |
|
var ( |
|
defadv, test, test2 int |
|
match, startmatch, endmatch int |
|
bump, advance int |
|
chTest rune |
|
unicodeLookup []int |
|
) |
|
|
|
if !b.rightToLeft { |
|
defadv = len(b.pattern) |
|
startmatch = len(b.pattern) - 1 |
|
endmatch = 0 |
|
test = index + defadv - 1 |
|
bump = 1 |
|
} else { |
|
defadv = -len(b.pattern) |
|
startmatch = 0 |
|
endmatch = -defadv - 1 |
|
test = index + defadv |
|
bump = -1 |
|
} |
|
|
|
chMatch := b.pattern[startmatch] |
|
|
|
for { |
|
if test >= endlimit || test < beglimit { |
|
return -1 |
|
} |
|
|
|
chTest = text[test] |
|
|
|
if b.caseInsensitive { |
|
chTest = unicode.ToLower(chTest) |
|
} |
|
|
|
if chTest != chMatch { |
|
if chTest < 128 { |
|
advance = b.negativeASCII[chTest] |
|
} else if chTest < 0xffff && len(b.negativeUnicode) > 0 { |
|
unicodeLookup = b.negativeUnicode[chTest>>8] |
|
if len(unicodeLookup) > 0 { |
|
advance = unicodeLookup[chTest&0xFF] |
|
} else { |
|
advance = defadv |
|
} |
|
} else { |
|
advance = defadv |
|
} |
|
|
|
test += advance |
|
} else { // if (chTest == chMatch) |
|
test2 = test |
|
match = startmatch |
|
|
|
for { |
|
if match == endmatch { |
|
if b.rightToLeft { |
|
return test2 + 1 |
|
} else { |
|
return test2 |
|
} |
|
} |
|
|
|
match -= bump |
|
test2 -= bump |
|
|
|
chTest = text[test2] |
|
|
|
if b.caseInsensitive { |
|
chTest = unicode.ToLower(chTest) |
|
} |
|
|
|
if chTest != b.pattern[match] { |
|
advance = b.positive[match] |
|
if (chTest & 0xFF80) == 0 { |
|
test2 = (match - startmatch) + b.negativeASCII[chTest] |
|
} else if chTest < 0xffff && len(b.negativeUnicode) > 0 { |
|
unicodeLookup = b.negativeUnicode[chTest>>8] |
|
if len(unicodeLookup) > 0 { |
|
test2 = (match - startmatch) + unicodeLookup[chTest&0xFF] |
|
} else { |
|
test += advance |
|
break |
|
} |
|
} else { |
|
test += advance |
|
break |
|
} |
|
|
|
if b.rightToLeft { |
|
if test2 < advance { |
|
advance = test2 |
|
} |
|
} else if test2 > advance { |
|
advance = test2 |
|
} |
|
|
|
test += advance |
|
break |
|
} |
|
} |
|
} |
|
} |
|
} |
|
|
|
// When a regex is anchored, we can do a quick IsMatch test instead of a Scan |
|
func (b *BmPrefix) IsMatch(text []rune, index, beglimit, endlimit int) bool { |
|
if !b.rightToLeft { |
|
if index < beglimit || endlimit-index < len(b.pattern) { |
|
return false |
|
} |
|
|
|
return b.matchPattern(text, index) |
|
} else { |
|
if index > endlimit || index-beglimit < len(b.pattern) { |
|
return false |
|
} |
|
|
|
return b.matchPattern(text, index-len(b.pattern)) |
|
} |
|
} |
|
|
|
func (b *BmPrefix) matchPattern(text []rune, index int) bool { |
|
if len(text)-index < len(b.pattern) { |
|
return false |
|
} |
|
|
|
if b.caseInsensitive { |
|
for i := 0; i < len(b.pattern); i++ { |
|
//Debug.Assert(textinfo.ToLower(_pattern[i]) == _pattern[i], "pattern should be converted to lower case in constructor!"); |
|
if unicode.ToLower(text[index+i]) != b.pattern[i] { |
|
return false |
|
} |
|
} |
|
return true |
|
} else { |
|
for i := 0; i < len(b.pattern); i++ { |
|
if text[index+i] != b.pattern[i] { |
|
return false |
|
} |
|
} |
|
return true |
|
} |
|
} |
|
|
|
// AnchorLoc is a bit set naming the positions a regex can be pegged to.
type AnchorLoc int16

// Anchor flags, one bit each.
//
// NOTE(review): only AnchorBeginning is declared with type AnchorLoc; the
// remaining names are untyped integer constants. That is kept as is here,
// since giving them a type would change the exported API.
const (
	AnchorBeginning AnchorLoc = 1 << 0
	AnchorBol                 = 1 << 1
	AnchorStart               = 1 << 2
	AnchorEol                 = 1 << 3
	AnchorEndZ                = 1 << 4
	AnchorEnd                 = 1 << 5
	AnchorBoundary            = 1 << 6
	AnchorECMABoundary        = 1 << 7
)
|
|
|
func getAnchors(tree *RegexTree) AnchorLoc { |
|
|
|
var concatNode *regexNode |
|
nextChild, result := 0, AnchorLoc(0) |
|
|
|
curNode := tree.root |
|
|
|
for { |
|
switch curNode.t { |
|
case ntConcatenate: |
|
if len(curNode.children) > 0 { |
|
concatNode = curNode |
|
nextChild = 0 |
|
} |
|
|
|
case ntGreedy, ntCapture: |
|
curNode = curNode.children[0] |
|
concatNode = nil |
|
continue |
|
|
|
case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning, |
|
ntStart, ntEndZ, ntEnd: |
|
return result | anchorFromType(curNode.t) |
|
|
|
case ntEmpty, ntRequire, ntPrevent: |
|
|
|
default: |
|
return result |
|
} |
|
|
|
if concatNode == nil || nextChild >= len(concatNode.children) { |
|
return result |
|
} |
|
|
|
curNode = concatNode.children[nextChild] |
|
nextChild++ |
|
} |
|
} |
|
|
|
func anchorFromType(t nodeType) AnchorLoc { |
|
switch t { |
|
case ntBol: |
|
return AnchorBol |
|
case ntEol: |
|
return AnchorEol |
|
case ntBoundary: |
|
return AnchorBoundary |
|
case ntECMABoundary: |
|
return AnchorECMABoundary |
|
case ntBeginning: |
|
return AnchorBeginning |
|
case ntStart: |
|
return AnchorStart |
|
case ntEndZ: |
|
return AnchorEndZ |
|
case ntEnd: |
|
return AnchorEnd |
|
default: |
|
return 0 |
|
} |
|
} |
|
|
|
// anchorDescription returns a human-readable description of the anchors |
|
func (anchors AnchorLoc) String() string { |
|
buf := &bytes.Buffer{} |
|
|
|
if 0 != (anchors & AnchorBeginning) { |
|
buf.WriteString(", Beginning") |
|
} |
|
if 0 != (anchors & AnchorStart) { |
|
buf.WriteString(", Start") |
|
} |
|
if 0 != (anchors & AnchorBol) { |
|
buf.WriteString(", Bol") |
|
} |
|
if 0 != (anchors & AnchorBoundary) { |
|
buf.WriteString(", Boundary") |
|
} |
|
if 0 != (anchors & AnchorECMABoundary) { |
|
buf.WriteString(", ECMABoundary") |
|
} |
|
if 0 != (anchors & AnchorEol) { |
|
buf.WriteString(", Eol") |
|
} |
|
if 0 != (anchors & AnchorEnd) { |
|
buf.WriteString(", End") |
|
} |
|
if 0 != (anchors & AnchorEndZ) { |
|
buf.WriteString(", EndZ") |
|
} |
|
|
|
// trim off comma |
|
if buf.Len() >= 2 { |
|
return buf.String()[2:] |
|
} |
|
return "None" |
|
}
|
|
|