Платформа ЦРНП "Мирокод" для разработки проектов
https://git.mirocod.ru
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
2202 lines
49 KiB
2202 lines
49 KiB
package syntax |
|
|
|
import ( |
|
"fmt" |
|
"math" |
|
"os" |
|
"sort" |
|
"strconv" |
|
"unicode" |
|
) |
|
|
|
// RegexOptions is a bit set of flags that control how a pattern is parsed.
type RegexOptions int32

// Option flags. Each flag occupies its own bit so flags can be OR-ed together.
// The quoted letter in each trailing comment is the option's short code.
//
// Only IgnoreCase carries the RegexOptions type explicitly; the remaining
// names are untyped integer constants that convert implicitly wherever a
// RegexOptions value is expected.
const (
	IgnoreCase              RegexOptions = 1 << 0 // "i"
	Multiline                            = 1 << 1 // "m"
	ExplicitCapture                      = 1 << 2 // "n"
	Compiled                             = 1 << 3 // "c"
	Singleline                           = 1 << 4 // "s"
	IgnorePatternWhitespace              = 1 << 5 // "x"
	RightToLeft                          = 1 << 6 // "r"
	Debug                                = 1 << 7 // "d"
	ECMAScript                           = 1 << 8 // "e"
	RE2                                  = 1 << 9 // RE2 compat mode
)
|
|
|
func optionFromCode(ch rune) RegexOptions { |
|
// case-insensitive |
|
switch ch { |
|
case 'i', 'I': |
|
return IgnoreCase |
|
case 'r', 'R': |
|
return RightToLeft |
|
case 'm', 'M': |
|
return Multiline |
|
case 'n', 'N': |
|
return ExplicitCapture |
|
case 's', 'S': |
|
return Singleline |
|
case 'x', 'X': |
|
return IgnorePatternWhitespace |
|
case 'd', 'D': |
|
return Debug |
|
case 'e', 'E': |
|
return ECMAScript |
|
default: |
|
return 0 |
|
} |
|
} |
|
|
|
// An Error describes a failure to parse a regular expression |
|
// and gives the offending expression. |
|
type Error struct { |
|
Code ErrorCode |
|
Expr string |
|
Args []interface{} |
|
} |
|
|
|
func (e *Error) Error() string { |
|
if len(e.Args) == 0 { |
|
return "error parsing regexp: " + e.Code.String() + " in `" + e.Expr + "`" |
|
} |
|
return "error parsing regexp: " + fmt.Sprintf(e.Code.String(), e.Args...) + " in `" + e.Expr + "`" |
|
} |
|
|
|
// ErrorCode is the message text of a parse failure. Codes containing a %v
// verb are expanded with the Error's Args when the error is rendered.
type ErrorCode string

// Parse failure codes. Only ErrInternalError is declared with the ErrorCode
// type; the rest are untyped string constants that convert implicitly where
// an ErrorCode is expected.
const (
	// Internal issue.
	ErrInternalError ErrorCode = "regexp/syntax: internal error"

	// Parser errors: general structure, repetition and grouping.
	ErrUnterminatedComment    = "unterminated comment"
	ErrInvalidCharRange       = "invalid character class range"
	ErrInvalidRepeatSize      = "invalid repeat count"
	ErrInvalidUTF8            = "invalid UTF-8"
	ErrCaptureGroupOutOfRange = "capture group number out of range"
	ErrUnexpectedParen        = "unexpected )"
	ErrMissingParen           = "missing closing )"
	ErrMissingBrace           = "missing closing }"
	ErrInvalidRepeatOp        = "invalid nested repetition operator"
	ErrMissingRepeatArgument  = "missing argument to repetition operator"
	ErrConditionalExpression  = "illegal conditional (?(...)) expression"
	ErrTooManyAlternates      = "too many | in (?()|)"
	ErrUnrecognizedGrouping   = "unrecognized grouping construct: (%v"
	ErrInvalidGroupName       = "invalid group name: group names must begin with a word character and have a matching terminator"
	ErrCapNumNotZero          = "capture number cannot be zero"

	// Parser errors: references and conditionals.
	ErrUndefinedBackRef           = "reference to undefined group number %v"
	ErrUndefinedNameRef           = "reference to undefined group name %v"
	ErrAlternationCantCapture     = "alternation conditions do not capture and cannot be named"
	ErrAlternationCantHaveComment = "alternation conditions cannot be comments"
	ErrMalformedReference         = "(?(%v) ) malformed"
	ErrUndefinedReference         = "(?(%v) ) reference to undefined group"

	// Parser errors: escapes.
	ErrIllegalEndEscape    = "illegal \\ at end of pattern"
	ErrMalformedSlashP     = "malformed \\p{X} character escape"
	ErrIncompleteSlashP    = "incomplete \\p{X} character escape"
	ErrUnknownSlashP       = "unknown unicode category, script, or property '%v'"
	ErrUnrecognizedEscape  = "unrecognized escape sequence \\%v"
	ErrMissingControl      = "missing control character"
	ErrUnrecognizedControl = "unrecognized control character"
	ErrTooFewHex           = "insufficient hexadecimal digits"
	ErrInvalidHex          = "hex values may not be larger than 0x10FFFF"
	ErrMalformedNameRef    = "malformed \\k<...> named back reference"

	// Parser errors: character classes.
	ErrBadClassInCharRange   = "cannot include class \\%v in character range"
	ErrUnterminatedBracket   = "unterminated [] set"
	ErrSubtractionMustBeLast = "a subtraction must be the last element in a character class"
	ErrReversedCharRange     = "[x-y] range in reverse order"
)
|
|
|
func (e ErrorCode) String() string { |
|
return string(e) |
|
} |
|
|
|
type parser struct { |
|
stack *regexNode |
|
group *regexNode |
|
alternation *regexNode |
|
concatenation *regexNode |
|
unit *regexNode |
|
|
|
patternRaw string |
|
pattern []rune |
|
|
|
currentPos int |
|
specialCase *unicode.SpecialCase |
|
|
|
autocap int |
|
capcount int |
|
captop int |
|
capsize int |
|
|
|
caps map[int]int |
|
capnames map[string]int |
|
|
|
capnumlist []int |
|
capnamelist []string |
|
|
|
options RegexOptions |
|
optionsStack []RegexOptions |
|
ignoreNextParen bool |
|
} |
|
|
|
// Overflow guards for accumulating a decimal number digit by digit: a value
// may take another digit d without exceeding math.MaxInt32 only if it is
// below maxValueDiv10, or equal to it with d <= maxValueMod10.
const maxValueDiv10 int = math.MaxInt32 / 10

const maxValueMod10 = math.MaxInt32 % 10
|
|
|
// Parse converts a regex string into a parse tree |
|
func Parse(re string, op RegexOptions) (*RegexTree, error) { |
|
p := parser{ |
|
options: op, |
|
caps: make(map[int]int), |
|
} |
|
p.setPattern(re) |
|
|
|
if err := p.countCaptures(); err != nil { |
|
return nil, err |
|
} |
|
|
|
p.reset(op) |
|
root, err := p.scanRegex() |
|
|
|
if err != nil { |
|
return nil, err |
|
} |
|
tree := &RegexTree{ |
|
root: root, |
|
caps: p.caps, |
|
capnumlist: p.capnumlist, |
|
captop: p.captop, |
|
Capnames: p.capnames, |
|
Caplist: p.capnamelist, |
|
options: op, |
|
} |
|
|
|
if tree.options&Debug > 0 { |
|
os.Stdout.WriteString(tree.Dump()) |
|
} |
|
|
|
return tree, nil |
|
} |
|
|
|
func (p *parser) setPattern(pattern string) { |
|
p.patternRaw = pattern |
|
p.pattern = make([]rune, 0, len(pattern)) |
|
|
|
//populate our rune array to handle utf8 encoding |
|
for _, r := range pattern { |
|
p.pattern = append(p.pattern, r) |
|
} |
|
} |
|
func (p *parser) getErr(code ErrorCode, args ...interface{}) error { |
|
return &Error{Code: code, Expr: p.patternRaw, Args: args} |
|
} |
|
|
|
func (p *parser) noteCaptureSlot(i, pos int) { |
|
if _, ok := p.caps[i]; !ok { |
|
// the rhs of the hashtable isn't used in the parser |
|
p.caps[i] = pos |
|
p.capcount++ |
|
|
|
if p.captop <= i { |
|
if i == math.MaxInt32 { |
|
p.captop = i |
|
} else { |
|
p.captop = i + 1 |
|
} |
|
} |
|
} |
|
} |
|
|
|
func (p *parser) noteCaptureName(name string, pos int) { |
|
if p.capnames == nil { |
|
p.capnames = make(map[string]int) |
|
} |
|
|
|
if _, ok := p.capnames[name]; !ok { |
|
p.capnames[name] = pos |
|
p.capnamelist = append(p.capnamelist, name) |
|
} |
|
} |
|
|
|
func (p *parser) assignNameSlots() { |
|
if p.capnames != nil { |
|
for _, name := range p.capnamelist { |
|
for p.isCaptureSlot(p.autocap) { |
|
p.autocap++ |
|
} |
|
pos := p.capnames[name] |
|
p.capnames[name] = p.autocap |
|
p.noteCaptureSlot(p.autocap, pos) |
|
|
|
p.autocap++ |
|
} |
|
} |
|
|
|
// if the caps array has at least one gap, construct the list of used slots |
|
if p.capcount < p.captop { |
|
p.capnumlist = make([]int, p.capcount) |
|
i := 0 |
|
|
|
for k := range p.caps { |
|
p.capnumlist[i] = k |
|
i++ |
|
} |
|
|
|
sort.Ints(p.capnumlist) |
|
} |
|
|
|
// merge capsnumlist into capnamelist |
|
if p.capnames != nil || p.capnumlist != nil { |
|
var oldcapnamelist []string |
|
var next int |
|
var k int |
|
|
|
if p.capnames == nil { |
|
oldcapnamelist = nil |
|
p.capnames = make(map[string]int) |
|
p.capnamelist = []string{} |
|
next = -1 |
|
} else { |
|
oldcapnamelist = p.capnamelist |
|
p.capnamelist = []string{} |
|
next = p.capnames[oldcapnamelist[0]] |
|
} |
|
|
|
for i := 0; i < p.capcount; i++ { |
|
j := i |
|
if p.capnumlist != nil { |
|
j = p.capnumlist[i] |
|
} |
|
|
|
if next == j { |
|
p.capnamelist = append(p.capnamelist, oldcapnamelist[k]) |
|
k++ |
|
|
|
if k == len(oldcapnamelist) { |
|
next = -1 |
|
} else { |
|
next = p.capnames[oldcapnamelist[k]] |
|
} |
|
|
|
} else { |
|
//feature: culture? |
|
str := strconv.Itoa(j) |
|
p.capnamelist = append(p.capnamelist, str) |
|
p.capnames[str] = j |
|
} |
|
} |
|
} |
|
} |
|
|
|
func (p *parser) consumeAutocap() int { |
|
r := p.autocap |
|
p.autocap++ |
|
return r |
|
} |
|
|
|
// CountCaptures is a prescanner for deducing the slots used for |
|
// captures by doing a partial tokenization of the pattern. |
|
func (p *parser) countCaptures() error { |
|
var ch rune |
|
|
|
p.noteCaptureSlot(0, 0) |
|
|
|
p.autocap = 1 |
|
|
|
for p.charsRight() > 0 { |
|
pos := p.textpos() |
|
ch = p.moveRightGetChar() |
|
switch ch { |
|
case '\\': |
|
if p.charsRight() > 0 { |
|
p.scanBackslash(true) |
|
} |
|
|
|
case '#': |
|
if p.useOptionX() { |
|
p.moveLeft() |
|
p.scanBlank() |
|
} |
|
|
|
case '[': |
|
p.scanCharSet(false, true) |
|
|
|
case ')': |
|
if !p.emptyOptionsStack() { |
|
p.popOptions() |
|
} |
|
|
|
case '(': |
|
if p.charsRight() >= 2 && p.rightChar(1) == '#' && p.rightChar(0) == '?' { |
|
p.moveLeft() |
|
p.scanBlank() |
|
} else { |
|
p.pushOptions() |
|
if p.charsRight() > 0 && p.rightChar(0) == '?' { |
|
// we have (?... |
|
p.moveRight(1) |
|
|
|
if p.charsRight() > 1 && (p.rightChar(0) == '<' || p.rightChar(0) == '\'') { |
|
// named group: (?<... or (?'... |
|
|
|
p.moveRight(1) |
|
ch = p.rightChar(0) |
|
|
|
if ch != '0' && IsWordChar(ch) { |
|
if ch >= '1' && ch <= '9' { |
|
dec, err := p.scanDecimal() |
|
if err != nil { |
|
return err |
|
} |
|
p.noteCaptureSlot(dec, pos) |
|
} else { |
|
p.noteCaptureName(p.scanCapname(), pos) |
|
} |
|
} |
|
} else if p.useRE2() && p.charsRight() > 2 && (p.rightChar(0) == 'P' && p.rightChar(1) == '<') { |
|
// RE2-compat (?P<) |
|
p.moveRight(2) |
|
ch = p.rightChar(0) |
|
if IsWordChar(ch) { |
|
p.noteCaptureName(p.scanCapname(), pos) |
|
} |
|
|
|
} else { |
|
// (?... |
|
|
|
// get the options if it's an option construct (?cimsx-cimsx...) |
|
p.scanOptions() |
|
|
|
if p.charsRight() > 0 { |
|
if p.rightChar(0) == ')' { |
|
// (?cimsx-cimsx) |
|
p.moveRight(1) |
|
p.popKeepOptions() |
|
} else if p.rightChar(0) == '(' { |
|
// alternation construct: (?(foo)yes|no) |
|
// ignore the next paren so we don't capture the condition |
|
p.ignoreNextParen = true |
|
|
|
// break from here so we don't reset ignoreNextParen |
|
continue |
|
} |
|
} |
|
} |
|
} else { |
|
if !p.useOptionN() && !p.ignoreNextParen { |
|
p.noteCaptureSlot(p.consumeAutocap(), pos) |
|
} |
|
} |
|
} |
|
|
|
p.ignoreNextParen = false |
|
|
|
} |
|
} |
|
|
|
p.assignNameSlots() |
|
return nil |
|
} |
|
|
|
func (p *parser) reset(topopts RegexOptions) { |
|
p.currentPos = 0 |
|
p.autocap = 1 |
|
p.ignoreNextParen = false |
|
|
|
if len(p.optionsStack) > 0 { |
|
p.optionsStack = p.optionsStack[:0] |
|
} |
|
|
|
p.options = topopts |
|
p.stack = nil |
|
} |
|
|
|
func (p *parser) scanRegex() (*regexNode, error) { |
|
ch := '@' // nonspecial ch, means at beginning |
|
isQuant := false |
|
|
|
p.startGroup(newRegexNodeMN(ntCapture, p.options, 0, -1)) |
|
|
|
for p.charsRight() > 0 { |
|
wasPrevQuantifier := isQuant |
|
isQuant = false |
|
|
|
if err := p.scanBlank(); err != nil { |
|
return nil, err |
|
} |
|
|
|
startpos := p.textpos() |
|
|
|
// move past all of the normal characters. We'll stop when we hit some kind of control character, |
|
// or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace. |
|
if p.useOptionX() { |
|
for p.charsRight() > 0 { |
|
ch = p.rightChar(0) |
|
//UGLY: clean up, this is ugly |
|
if !(!isStopperX(ch) || (ch == '{' && !p.isTrueQuantifier())) { |
|
break |
|
} |
|
p.moveRight(1) |
|
} |
|
} else { |
|
for p.charsRight() > 0 { |
|
ch = p.rightChar(0) |
|
if !(!isSpecial(ch) || ch == '{' && !p.isTrueQuantifier()) { |
|
break |
|
} |
|
p.moveRight(1) |
|
} |
|
} |
|
|
|
endpos := p.textpos() |
|
|
|
p.scanBlank() |
|
|
|
if p.charsRight() == 0 { |
|
ch = '!' // nonspecial, means at end |
|
} else if ch = p.rightChar(0); isSpecial(ch) { |
|
isQuant = isQuantifier(ch) |
|
p.moveRight(1) |
|
} else { |
|
ch = ' ' // nonspecial, means at ordinary char |
|
} |
|
|
|
if startpos < endpos { |
|
cchUnquantified := endpos - startpos |
|
if isQuant { |
|
cchUnquantified-- |
|
} |
|
wasPrevQuantifier = false |
|
|
|
if cchUnquantified > 0 { |
|
p.addToConcatenate(startpos, cchUnquantified, false) |
|
} |
|
|
|
if isQuant { |
|
p.addUnitOne(p.charAt(endpos - 1)) |
|
} |
|
} |
|
|
|
switch ch { |
|
case '!': |
|
goto BreakOuterScan |
|
|
|
case ' ': |
|
goto ContinueOuterScan |
|
|
|
case '[': |
|
cc, err := p.scanCharSet(p.useOptionI(), false) |
|
if err != nil { |
|
return nil, err |
|
} |
|
p.addUnitSet(cc) |
|
|
|
case '(': |
|
p.pushOptions() |
|
|
|
if grouper, err := p.scanGroupOpen(); err != nil { |
|
return nil, err |
|
} else if grouper == nil { |
|
p.popKeepOptions() |
|
} else { |
|
p.pushGroup() |
|
p.startGroup(grouper) |
|
} |
|
|
|
continue |
|
|
|
case '|': |
|
p.addAlternate() |
|
goto ContinueOuterScan |
|
|
|
case ')': |
|
if p.emptyStack() { |
|
return nil, p.getErr(ErrUnexpectedParen) |
|
} |
|
|
|
if err := p.addGroup(); err != nil { |
|
return nil, err |
|
} |
|
if err := p.popGroup(); err != nil { |
|
return nil, err |
|
} |
|
p.popOptions() |
|
|
|
if p.unit == nil { |
|
goto ContinueOuterScan |
|
} |
|
|
|
case '\\': |
|
n, err := p.scanBackslash(false) |
|
if err != nil { |
|
return nil, err |
|
} |
|
p.addUnitNode(n) |
|
|
|
case '^': |
|
if p.useOptionM() { |
|
p.addUnitType(ntBol) |
|
} else { |
|
p.addUnitType(ntBeginning) |
|
} |
|
|
|
case '$': |
|
if p.useOptionM() { |
|
p.addUnitType(ntEol) |
|
} else { |
|
p.addUnitType(ntEndZ) |
|
} |
|
|
|
case '.': |
|
if p.useOptionE() { |
|
p.addUnitSet(ECMAAnyClass()) |
|
} else if p.useOptionS() { |
|
p.addUnitSet(AnyClass()) |
|
} else { |
|
p.addUnitNotone('\n') |
|
} |
|
|
|
case '{', '*', '+', '?': |
|
if p.unit == nil { |
|
if wasPrevQuantifier { |
|
return nil, p.getErr(ErrInvalidRepeatOp) |
|
} else { |
|
return nil, p.getErr(ErrMissingRepeatArgument) |
|
} |
|
} |
|
p.moveLeft() |
|
|
|
default: |
|
return nil, p.getErr(ErrInternalError) |
|
} |
|
|
|
if err := p.scanBlank(); err != nil { |
|
return nil, err |
|
} |
|
|
|
if p.charsRight() > 0 { |
|
isQuant = p.isTrueQuantifier() |
|
} |
|
if p.charsRight() == 0 || !isQuant { |
|
//maintain odd C# assignment order -- not sure if required, could clean up? |
|
p.addConcatenate() |
|
goto ContinueOuterScan |
|
} |
|
|
|
ch = p.moveRightGetChar() |
|
|
|
// Handle quantifiers |
|
for p.unit != nil { |
|
var min, max int |
|
var lazy bool |
|
|
|
switch ch { |
|
case '*': |
|
min = 0 |
|
max = math.MaxInt32 |
|
|
|
case '?': |
|
min = 0 |
|
max = 1 |
|
|
|
case '+': |
|
min = 1 |
|
max = math.MaxInt32 |
|
|
|
case '{': |
|
{ |
|
var err error |
|
startpos = p.textpos() |
|
if min, err = p.scanDecimal(); err != nil { |
|
return nil, err |
|
} |
|
max = min |
|
if startpos < p.textpos() { |
|
if p.charsRight() > 0 && p.rightChar(0) == ',' { |
|
p.moveRight(1) |
|
if p.charsRight() == 0 || p.rightChar(0) == '}' { |
|
max = math.MaxInt32 |
|
} else { |
|
if max, err = p.scanDecimal(); err != nil { |
|
return nil, err |
|
} |
|
} |
|
} |
|
} |
|
|
|
if startpos == p.textpos() || p.charsRight() == 0 || p.moveRightGetChar() != '}' { |
|
p.addConcatenate() |
|
p.textto(startpos - 1) |
|
goto ContinueOuterScan |
|
} |
|
} |
|
|
|
default: |
|
return nil, p.getErr(ErrInternalError) |
|
} |
|
|
|
if err := p.scanBlank(); err != nil { |
|
return nil, err |
|
} |
|
|
|
if p.charsRight() == 0 || p.rightChar(0) != '?' { |
|
lazy = false |
|
} else { |
|
p.moveRight(1) |
|
lazy = true |
|
} |
|
|
|
if min > max { |
|
return nil, p.getErr(ErrInvalidRepeatSize) |
|
} |
|
|
|
p.addConcatenate3(lazy, min, max) |
|
} |
|
|
|
ContinueOuterScan: |
|
} |
|
|
|
BreakOuterScan: |
|
; |
|
|
|
if !p.emptyStack() { |
|
return nil, p.getErr(ErrMissingParen) |
|
} |
|
|
|
if err := p.addGroup(); err != nil { |
|
return nil, err |
|
} |
|
|
|
return p.unit, nil |
|
|
|
} |
|
|
|
/* |
|
* Simple parsing for replacement patterns |
|
*/ |
|
func (p *parser) scanReplacement() (*regexNode, error) { |
|
var c, startpos int |
|
|
|
p.concatenation = newRegexNode(ntConcatenate, p.options) |
|
|
|
for { |
|
c = p.charsRight() |
|
if c == 0 { |
|
break |
|
} |
|
|
|
startpos = p.textpos() |
|
|
|
for c > 0 && p.rightChar(0) != '$' { |
|
p.moveRight(1) |
|
c-- |
|
} |
|
|
|
p.addToConcatenate(startpos, p.textpos()-startpos, true) |
|
|
|
if c > 0 { |
|
if p.moveRightGetChar() == '$' { |
|
n, err := p.scanDollar() |
|
if err != nil { |
|
return nil, err |
|
} |
|
p.addUnitNode(n) |
|
} |
|
p.addConcatenate() |
|
} |
|
} |
|
|
|
return p.concatenation, nil |
|
} |
|
|
|
/* |
|
* Scans $ patterns recognized within replacement patterns |
|
*/ |
|
func (p *parser) scanDollar() (*regexNode, error) { |
|
if p.charsRight() == 0 { |
|
return newRegexNodeCh(ntOne, p.options, '$'), nil |
|
} |
|
|
|
ch := p.rightChar(0) |
|
angled := false |
|
backpos := p.textpos() |
|
lastEndPos := backpos |
|
|
|
// Note angle |
|
|
|
if ch == '{' && p.charsRight() > 1 { |
|
angled = true |
|
p.moveRight(1) |
|
ch = p.rightChar(0) |
|
} |
|
|
|
// Try to parse backreference: \1 or \{1} or \{cap} |
|
|
|
if ch >= '0' && ch <= '9' { |
|
if !angled && p.useOptionE() { |
|
capnum := -1 |
|
newcapnum := int(ch - '0') |
|
p.moveRight(1) |
|
if p.isCaptureSlot(newcapnum) { |
|
capnum = newcapnum |
|
lastEndPos = p.textpos() |
|
} |
|
|
|
for p.charsRight() > 0 { |
|
ch = p.rightChar(0) |
|
if ch < '0' || ch > '9' { |
|
break |
|
} |
|
digit := int(ch - '0') |
|
if newcapnum > maxValueDiv10 || (newcapnum == maxValueDiv10 && digit > maxValueMod10) { |
|
return nil, p.getErr(ErrCaptureGroupOutOfRange) |
|
} |
|
|
|
newcapnum = newcapnum*10 + digit |
|
|
|
p.moveRight(1) |
|
if p.isCaptureSlot(newcapnum) { |
|
capnum = newcapnum |
|
lastEndPos = p.textpos() |
|
} |
|
} |
|
p.textto(lastEndPos) |
|
if capnum >= 0 { |
|
return newRegexNodeM(ntRef, p.options, capnum), nil |
|
} |
|
} else { |
|
capnum, err := p.scanDecimal() |
|
if err != nil { |
|
return nil, err |
|
} |
|
if !angled || p.charsRight() > 0 && p.moveRightGetChar() == '}' { |
|
if p.isCaptureSlot(capnum) { |
|
return newRegexNodeM(ntRef, p.options, capnum), nil |
|
} |
|
} |
|
} |
|
} else if angled && IsWordChar(ch) { |
|
capname := p.scanCapname() |
|
|
|
if p.charsRight() > 0 && p.moveRightGetChar() == '}' { |
|
if p.isCaptureName(capname) { |
|
return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil |
|
} |
|
} |
|
} else if !angled { |
|
capnum := 1 |
|
|
|
switch ch { |
|
case '$': |
|
p.moveRight(1) |
|
return newRegexNodeCh(ntOne, p.options, '$'), nil |
|
case '&': |
|
capnum = 0 |
|
case '`': |
|
capnum = replaceLeftPortion |
|
case '\'': |
|
capnum = replaceRightPortion |
|
case '+': |
|
capnum = replaceLastGroup |
|
case '_': |
|
capnum = replaceWholeString |
|
} |
|
|
|
if capnum != 1 { |
|
p.moveRight(1) |
|
return newRegexNodeM(ntRef, p.options, capnum), nil |
|
} |
|
} |
|
|
|
// unrecognized $: literalize |
|
|
|
p.textto(backpos) |
|
return newRegexNodeCh(ntOne, p.options, '$'), nil |
|
} |
|
|
|
// scanGroupOpen scans chars following a '(' (not counting the '('), and returns |
|
// a RegexNode for the type of group scanned, or nil if the group |
|
// simply changed options (?cimsx-cimsx) or was a comment (#...). |
|
func (p *parser) scanGroupOpen() (*regexNode, error) { |
|
var ch rune |
|
var nt nodeType |
|
var err error |
|
close := '>' |
|
start := p.textpos() |
|
|
|
// just return a RegexNode if we have: |
|
// 1. "(" followed by nothing |
|
// 2. "(x" where x != ? |
|
// 3. "(?)" |
|
if p.charsRight() == 0 || p.rightChar(0) != '?' || (p.rightChar(0) == '?' && (p.charsRight() > 1 && p.rightChar(1) == ')')) { |
|
if p.useOptionN() || p.ignoreNextParen { |
|
p.ignoreNextParen = false |
|
return newRegexNode(ntGroup, p.options), nil |
|
} |
|
return newRegexNodeMN(ntCapture, p.options, p.consumeAutocap(), -1), nil |
|
} |
|
|
|
p.moveRight(1) |
|
|
|
for { |
|
if p.charsRight() == 0 { |
|
break |
|
} |
|
|
|
switch ch = p.moveRightGetChar(); ch { |
|
case ':': |
|
nt = ntGroup |
|
|
|
case '=': |
|
p.options &= ^RightToLeft |
|
nt = ntRequire |
|
|
|
case '!': |
|
p.options &= ^RightToLeft |
|
nt = ntPrevent |
|
|
|
case '>': |
|
nt = ntGreedy |
|
|
|
case '\'': |
|
close = '\'' |
|
fallthrough |
|
|
|
case '<': |
|
if p.charsRight() == 0 { |
|
goto BreakRecognize |
|
} |
|
|
|
switch ch = p.moveRightGetChar(); ch { |
|
case '=': |
|
if close == '\'' { |
|
goto BreakRecognize |
|
} |
|
|
|
p.options |= RightToLeft |
|
nt = ntRequire |
|
|
|
case '!': |
|
if close == '\'' { |
|
goto BreakRecognize |
|
} |
|
|
|
p.options |= RightToLeft |
|
nt = ntPrevent |
|
|
|
default: |
|
p.moveLeft() |
|
capnum := -1 |
|
uncapnum := -1 |
|
proceed := false |
|
|
|
// grab part before - |
|
|
|
if ch >= '0' && ch <= '9' { |
|
if capnum, err = p.scanDecimal(); err != nil { |
|
return nil, err |
|
} |
|
|
|
if !p.isCaptureSlot(capnum) { |
|
capnum = -1 |
|
} |
|
|
|
// check if we have bogus characters after the number |
|
if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') { |
|
return nil, p.getErr(ErrInvalidGroupName) |
|
} |
|
if capnum == 0 { |
|
return nil, p.getErr(ErrCapNumNotZero) |
|
} |
|
} else if IsWordChar(ch) { |
|
capname := p.scanCapname() |
|
|
|
if p.isCaptureName(capname) { |
|
capnum = p.captureSlotFromName(capname) |
|
} |
|
|
|
// check if we have bogus character after the name |
|
if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') { |
|
return nil, p.getErr(ErrInvalidGroupName) |
|
} |
|
} else if ch == '-' { |
|
proceed = true |
|
} else { |
|
// bad group name - starts with something other than a word character and isn't a number |
|
return nil, p.getErr(ErrInvalidGroupName) |
|
} |
|
|
|
// grab part after - if any |
|
|
|
if (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' { |
|
p.moveRight(1) |
|
|
|
//no more chars left, no closing char, etc |
|
if p.charsRight() == 0 { |
|
return nil, p.getErr(ErrInvalidGroupName) |
|
} |
|
|
|
ch = p.rightChar(0) |
|
if ch >= '0' && ch <= '9' { |
|
if uncapnum, err = p.scanDecimal(); err != nil { |
|
return nil, err |
|
} |
|
|
|
if !p.isCaptureSlot(uncapnum) { |
|
return nil, p.getErr(ErrUndefinedBackRef, uncapnum) |
|
} |
|
|
|
// check if we have bogus characters after the number |
|
if p.charsRight() > 0 && p.rightChar(0) != close { |
|
return nil, p.getErr(ErrInvalidGroupName) |
|
} |
|
} else if IsWordChar(ch) { |
|
uncapname := p.scanCapname() |
|
|
|
if !p.isCaptureName(uncapname) { |
|
return nil, p.getErr(ErrUndefinedNameRef, uncapname) |
|
} |
|
uncapnum = p.captureSlotFromName(uncapname) |
|
|
|
// check if we have bogus character after the name |
|
if p.charsRight() > 0 && p.rightChar(0) != close { |
|
return nil, p.getErr(ErrInvalidGroupName) |
|
} |
|
} else { |
|
// bad group name - starts with something other than a word character and isn't a number |
|
return nil, p.getErr(ErrInvalidGroupName) |
|
} |
|
} |
|
|
|
// actually make the node |
|
|
|
if (capnum != -1 || uncapnum != -1) && p.charsRight() > 0 && p.moveRightGetChar() == close { |
|
return newRegexNodeMN(ntCapture, p.options, capnum, uncapnum), nil |
|
} |
|
goto BreakRecognize |
|
} |
|
|
|
case '(': |
|
// alternation construct (?(...) | ) |
|
|
|
parenPos := p.textpos() |
|
if p.charsRight() > 0 { |
|
ch = p.rightChar(0) |
|
|
|
// check if the alternation condition is a backref |
|
if ch >= '0' && ch <= '9' { |
|
var capnum int |
|
if capnum, err = p.scanDecimal(); err != nil { |
|
return nil, err |
|
} |
|
if p.charsRight() > 0 && p.moveRightGetChar() == ')' { |
|
if p.isCaptureSlot(capnum) { |
|
return newRegexNodeM(ntTestref, p.options, capnum), nil |
|
} |
|
return nil, p.getErr(ErrUndefinedReference, capnum) |
|
} |
|
|
|
return nil, p.getErr(ErrMalformedReference, capnum) |
|
|
|
} else if IsWordChar(ch) { |
|
capname := p.scanCapname() |
|
|
|
if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' { |
|
return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil |
|
} |
|
} |
|
} |
|
// not a backref |
|
nt = ntTestgroup |
|
p.textto(parenPos - 1) // jump to the start of the parentheses |
|
p.ignoreNextParen = true // but make sure we don't try to capture the insides |
|
|
|
charsRight := p.charsRight() |
|
if charsRight >= 3 && p.rightChar(1) == '?' { |
|
rightchar2 := p.rightChar(2) |
|
// disallow comments in the condition |
|
if rightchar2 == '#' { |
|
return nil, p.getErr(ErrAlternationCantHaveComment) |
|
} |
|
|
|
// disallow named capture group (?<..>..) in the condition |
|
if rightchar2 == '\'' { |
|
return nil, p.getErr(ErrAlternationCantCapture) |
|
} |
|
|
|
if charsRight >= 4 && (rightchar2 == '<' && p.rightChar(3) != '!' && p.rightChar(3) != '=') { |
|
return nil, p.getErr(ErrAlternationCantCapture) |
|
} |
|
} |
|
|
|
case 'P': |
|
if p.useRE2() { |
|
// support for P<name> syntax |
|
if p.charsRight() < 3 { |
|
goto BreakRecognize |
|
} |
|
|
|
ch = p.moveRightGetChar() |
|
if ch != '<' { |
|
goto BreakRecognize |
|
} |
|
|
|
ch = p.moveRightGetChar() |
|
p.moveLeft() |
|
|
|
if IsWordChar(ch) { |
|
capnum := -1 |
|
capname := p.scanCapname() |
|
|
|
if p.isCaptureName(capname) { |
|
capnum = p.captureSlotFromName(capname) |
|
} |
|
|
|
// check if we have bogus character after the name |
|
if p.charsRight() > 0 && p.rightChar(0) != '>' { |
|
return nil, p.getErr(ErrInvalidGroupName) |
|
} |
|
|
|
// actually make the node |
|
|
|
if capnum != -1 && p.charsRight() > 0 && p.moveRightGetChar() == '>' { |
|
return newRegexNodeMN(ntCapture, p.options, capnum, -1), nil |
|
} |
|
goto BreakRecognize |
|
|
|
} else { |
|
// bad group name - starts with something other than a word character and isn't a number |
|
return nil, p.getErr(ErrInvalidGroupName) |
|
} |
|
} |
|
// if we're not using RE2 compat mode then |
|
// we just behave like normal |
|
fallthrough |
|
|
|
default: |
|
p.moveLeft() |
|
|
|
nt = ntGroup |
|
// disallow options in the children of a testgroup node |
|
if p.group.t != ntTestgroup { |
|
p.scanOptions() |
|
} |
|
if p.charsRight() == 0 { |
|
goto BreakRecognize |
|
} |
|
|
|
if ch = p.moveRightGetChar(); ch == ')' { |
|
return nil, nil |
|
} |
|
|
|
if ch != ':' { |
|
goto BreakRecognize |
|
} |
|
|
|
} |
|
|
|
return newRegexNode(nt, p.options), nil |
|
} |
|
|
|
BreakRecognize: |
|
|
|
// break Recognize comes here |
|
|
|
return nil, p.getErr(ErrUnrecognizedGrouping, string(p.pattern[start:p.textpos()])) |
|
} |
|
|
|
// scans backslash specials and basics |
|
func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) { |
|
|
|
if p.charsRight() == 0 { |
|
return nil, p.getErr(ErrIllegalEndEscape) |
|
} |
|
|
|
switch ch := p.rightChar(0); ch { |
|
case 'b', 'B', 'A', 'G', 'Z', 'z': |
|
p.moveRight(1) |
|
return newRegexNode(p.typeFromCode(ch), p.options), nil |
|
|
|
case 'w': |
|
p.moveRight(1) |
|
if p.useOptionE() { |
|
return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil |
|
} |
|
return newRegexNodeSet(ntSet, p.options, WordClass()), nil |
|
|
|
case 'W': |
|
p.moveRight(1) |
|
if p.useOptionE() { |
|
return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil |
|
} |
|
return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil |
|
|
|
case 's': |
|
p.moveRight(1) |
|
if p.useOptionE() { |
|
return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil |
|
} |
|
return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil |
|
|
|
case 'S': |
|
p.moveRight(1) |
|
if p.useOptionE() { |
|
return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil |
|
} |
|
return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil |
|
|
|
case 'd': |
|
p.moveRight(1) |
|
if p.useOptionE() { |
|
return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil |
|
} |
|
return newRegexNodeSet(ntSet, p.options, DigitClass()), nil |
|
|
|
case 'D': |
|
p.moveRight(1) |
|
if p.useOptionE() { |
|
return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil |
|
} |
|
return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil |
|
|
|
case 'p', 'P': |
|
p.moveRight(1) |
|
prop, err := p.parseProperty() |
|
if err != nil { |
|
return nil, err |
|
} |
|
cc := &CharSet{} |
|
cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw) |
|
if p.useOptionI() { |
|
cc.addLowercase() |
|
} |
|
|
|
return newRegexNodeSet(ntSet, p.options, cc), nil |
|
|
|
default: |
|
return p.scanBasicBackslash(scanOnly) |
|
} |
|
} |
|
|
|
// Scans \-style backreferences and character escapes |
|
func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) { |
|
if p.charsRight() == 0 { |
|
return nil, p.getErr(ErrIllegalEndEscape) |
|
} |
|
angled := false |
|
close := '\x00' |
|
|
|
backpos := p.textpos() |
|
ch := p.rightChar(0) |
|
|
|
// allow \k<foo> instead of \<foo>, which is now deprecated |
|
|
|
if ch == 'k' { |
|
if p.charsRight() >= 2 { |
|
p.moveRight(1) |
|
ch = p.moveRightGetChar() |
|
|
|
if ch == '<' || ch == '\'' { |
|
angled = true |
|
if ch == '\'' { |
|
close = '\'' |
|
} else { |
|
close = '>' |
|
} |
|
} |
|
} |
|
|
|
if !angled || p.charsRight() <= 0 { |
|
return nil, p.getErr(ErrMalformedNameRef) |
|
} |
|
|
|
ch = p.rightChar(0) |
|
|
|
} else if (ch == '<' || ch == '\'') && p.charsRight() > 1 { // Note angle without \g |
|
angled = true |
|
if ch == '\'' { |
|
close = '\'' |
|
} else { |
|
close = '>' |
|
} |
|
|
|
p.moveRight(1) |
|
ch = p.rightChar(0) |
|
} |
|
|
|
// Try to parse backreference: \<1> or \<cap> |
|
|
|
if angled && ch >= '0' && ch <= '9' { |
|
capnum, err := p.scanDecimal() |
|
if err != nil { |
|
return nil, err |
|
} |
|
|
|
if p.charsRight() > 0 && p.moveRightGetChar() == close { |
|
if p.isCaptureSlot(capnum) { |
|
return newRegexNodeM(ntRef, p.options, capnum), nil |
|
} |
|
return nil, p.getErr(ErrUndefinedBackRef, capnum) |
|
} |
|
} else if !angled && ch >= '1' && ch <= '9' { // Try to parse backreference or octal: \1 |
|
capnum, err := p.scanDecimal() |
|
if err != nil { |
|
return nil, err |
|
} |
|
|
|
if scanOnly { |
|
return nil, nil |
|
} |
|
|
|
if p.useOptionE() || p.isCaptureSlot(capnum) { |
|
return newRegexNodeM(ntRef, p.options, capnum), nil |
|
} |
|
if capnum <= 9 { |
|
return nil, p.getErr(ErrUndefinedBackRef, capnum) |
|
} |
|
|
|
} else if angled && IsWordChar(ch) { |
|
capname := p.scanCapname() |
|
|
|
if p.charsRight() > 0 && p.moveRightGetChar() == close { |
|
if p.isCaptureName(capname) { |
|
return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil |
|
} |
|
return nil, p.getErr(ErrUndefinedNameRef, capname) |
|
} |
|
} |
|
|
|
// Not backreference: must be char code |
|
|
|
p.textto(backpos) |
|
ch, err := p.scanCharEscape() |
|
if err != nil { |
|
return nil, err |
|
} |
|
|
|
if p.useOptionI() { |
|
ch = unicode.ToLower(ch) |
|
} |
|
|
|
return newRegexNodeCh(ntOne, p.options, ch), nil |
|
} |
|
|
|
// Scans X for \p{X} or \P{X} |
|
func (p *parser) parseProperty() (string, error) { |
|
if p.charsRight() < 3 { |
|
return "", p.getErr(ErrIncompleteSlashP) |
|
} |
|
ch := p.moveRightGetChar() |
|
if ch != '{' { |
|
return "", p.getErr(ErrMalformedSlashP) |
|
} |
|
|
|
startpos := p.textpos() |
|
for p.charsRight() > 0 { |
|
ch = p.moveRightGetChar() |
|
if !(IsWordChar(ch) || ch == '-') { |
|
p.moveLeft() |
|
break |
|
} |
|
} |
|
capname := string(p.pattern[startpos:p.textpos()]) |
|
|
|
if p.charsRight() == 0 || p.moveRightGetChar() != '}' { |
|
return "", p.getErr(ErrIncompleteSlashP) |
|
} |
|
|
|
if !isValidUnicodeCat(capname) { |
|
return "", p.getErr(ErrUnknownSlashP, capname) |
|
} |
|
|
|
return capname, nil |
|
} |
|
|
|
// Returns ReNode type for zero-length assertions with a \ code. |
|
func (p *parser) typeFromCode(ch rune) nodeType { |
|
switch ch { |
|
case 'b': |
|
if p.useOptionE() { |
|
return ntECMABoundary |
|
} |
|
return ntBoundary |
|
case 'B': |
|
if p.useOptionE() { |
|
return ntNonECMABoundary |
|
} |
|
return ntNonboundary |
|
case 'A': |
|
return ntBeginning |
|
case 'G': |
|
return ntStart |
|
case 'Z': |
|
return ntEndZ |
|
case 'z': |
|
return ntEnd |
|
default: |
|
return ntNothing |
|
} |
|
} |
|
|
|
// Scans whitespace or x-mode comments. |
|
func (p *parser) scanBlank() error { |
|
if p.useOptionX() { |
|
for { |
|
for p.charsRight() > 0 && isSpace(p.rightChar(0)) { |
|
p.moveRight(1) |
|
} |
|
|
|
if p.charsRight() == 0 { |
|
break |
|
} |
|
|
|
if p.rightChar(0) == '#' { |
|
for p.charsRight() > 0 && p.rightChar(0) != '\n' { |
|
p.moveRight(1) |
|
} |
|
} else if p.charsRight() >= 3 && p.rightChar(2) == '#' && |
|
p.rightChar(1) == '?' && p.rightChar(0) == '(' { |
|
for p.charsRight() > 0 && p.rightChar(0) != ')' { |
|
p.moveRight(1) |
|
} |
|
if p.charsRight() == 0 { |
|
return p.getErr(ErrUnterminatedComment) |
|
} |
|
p.moveRight(1) |
|
} else { |
|
break |
|
} |
|
} |
|
} else { |
|
for { |
|
if p.charsRight() < 3 || p.rightChar(2) != '#' || |
|
p.rightChar(1) != '?' || p.rightChar(0) != '(' { |
|
return nil |
|
} |
|
|
|
for p.charsRight() > 0 && p.rightChar(0) != ')' { |
|
p.moveRight(1) |
|
} |
|
if p.charsRight() == 0 { |
|
return p.getErr(ErrUnterminatedComment) |
|
} |
|
p.moveRight(1) |
|
} |
|
} |
|
return nil |
|
} |
|
|
|
func (p *parser) scanCapname() string { |
|
startpos := p.textpos() |
|
|
|
for p.charsRight() > 0 { |
|
if !IsWordChar(p.moveRightGetChar()) { |
|
p.moveLeft() |
|
break |
|
} |
|
} |
|
|
|
return string(p.pattern[startpos:p.textpos()]) |
|
} |
|
|
|
// scanCharSet scans the contents of a [...] character class, starting just
// after the opening '[' and consuming the closing ']', and converts them to
// a CharSet.
//
// caseInsensitive is forwarded to category handling and to nested
// subtraction sets, and makes the finished set get its lowercase ranges added.
// When scanOnly is true no set is built: the text is only skipped over and
// the returned *CharSet is nil.
//
// Running out of pattern before the closing ']' yields ErrUnterminatedBracket.
func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
	ch := '\x00'
	chPrev := '\x00' // left end of a pending range while inRange is set
	inRange := false
	firstChar := true
	closed := false

	var cc *CharSet
	if !scanOnly {
		cc = &CharSet{}
	}

	// A leading '^' negates the class.
	if p.charsRight() > 0 && p.rightChar(0) == '^' {
		p.moveRight(1)
		if !scanOnly {
			cc.negate = true
		}
	}

	for ; p.charsRight() > 0; firstChar = false {
		// fTranslatedChar is set when ch came from an escape sequence, so that
		// an escaped '[' or '-' is not mistaken for subtraction syntax.
		fTranslatedChar := false
		ch = p.moveRightGetChar()
		if ch == ']' {
			if !firstChar {
				closed = true
				break
			} else if p.useOptionE() {
				// ECMAScript: "[]" is an empty class.
				if !scanOnly {
					cc.addRanges(NoneClass().ranges)
				}
				closed = true
				break
			}
			// Otherwise a ']' in first position falls through and is handled
			// as an ordinary character below.

		} else if ch == '\\' && p.charsRight() > 0 {
			switch ch = p.moveRightGetChar(); ch {
			case 'D', 'd':
				if !scanOnly {
					if inRange {
						return nil, p.getErr(ErrBadClassInCharRange, ch)
					}
					cc.addDigit(p.useOptionE(), ch == 'D', p.patternRaw)
				}
				continue

			case 'S', 's':
				if !scanOnly {
					if inRange {
						return nil, p.getErr(ErrBadClassInCharRange, ch)
					}
					cc.addSpace(p.useOptionE(), ch == 'S')
				}
				continue

			case 'W', 'w':
				if !scanOnly {
					if inRange {
						return nil, p.getErr(ErrBadClassInCharRange, ch)
					}

					cc.addWord(p.useOptionE(), ch == 'W')
				}
				continue

			case 'p', 'P':
				if !scanOnly {
					if inRange {
						return nil, p.getErr(ErrBadClassInCharRange, ch)
					}
					prop, err := p.parseProperty()
					if err != nil {
						return nil, err
					}
					cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw)
				} else {
					// Scan-only: just advance past the property; the result
					// and any error are discarded here.
					p.parseProperty()
				}

				continue

			case '-':
				// An escaped '-' is always a literal dash.
				if !scanOnly {
					cc.addRange(ch, ch)
				}
				continue

			default:
				// Back up onto the escape letter and let scanCharEscape
				// translate it to a single rune.
				p.moveLeft()
				var err error
				ch, err = p.scanCharEscape() // non-literal character
				if err != nil {
					return nil, err
				}
				fTranslatedChar = true
				break // this break will only break out of the switch
			}
		} else if ch == '[' {
			// This is code for Posix style properties - [:Ll:] or [:IsTibetan:].
			// Outside RE2 mode the name is scanned but never added to the set;
			// in RE2 mode it is looked up as a named ASCII class.
			if p.charsRight() > 0 && p.rightChar(0) == ':' && !inRange {
				savePos := p.textpos()

				p.moveRight(1)
				negate := false
				if p.charsRight() > 1 && p.rightChar(0) == '^' {
					negate = true
					p.moveRight(1)
				}

				nm := p.scanCapname() // snag the name
				if !scanOnly && p.useRE2() {
					// look up the name since these are valid for RE2
					// add the group based on the name
					if ok := cc.addNamedASCII(nm, negate); !ok {
						return nil, p.getErr(ErrInvalidCharRange)
					}
				}
				if p.charsRight() < 2 || p.moveRightGetChar() != ':' || p.moveRightGetChar() != ']' {
					// Not terminated by ":]": rewind so the text after the
					// '[' is rescanned as ordinary class content.
					p.textto(savePos)
				} else if p.useRE2() {
					// move on
					continue
				}
			}
		}

		if inRange {
			inRange = false
			if !scanOnly {
				if ch == '[' && !fTranslatedChar && !firstChar {
					// We thought we were in a range, but we're actually starting a subtraction.
					// In that case, we'll add chPrev to our char class, skip the opening [, and
					// scan the new character class recursively.
					cc.addChar(chPrev)
					sub, err := p.scanCharSet(caseInsensitive, false)
					if err != nil {
						return nil, err
					}
					cc.addSubtraction(sub)

					if p.charsRight() > 0 && p.rightChar(0) != ']' {
						return nil, p.getErr(ErrSubtractionMustBeLast)
					}
				} else {
					// a regular range, like a-z
					if chPrev > ch {
						return nil, p.getErr(ErrReversedCharRange)
					}
					cc.addRange(chPrev, ch)
				}
			}
		} else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' {
			// this could be the start of a range
			chPrev = ch
			inRange = true
			p.moveRight(1)
		} else if p.charsRight() >= 1 && ch == '-' && !fTranslatedChar && p.rightChar(0) == '[' && !firstChar {
			// we aren't in a range, and now there is a subtraction. Usually this happens
			// only when a subtraction follows a range, like [a-z-[b]]
			if !scanOnly {
				p.moveRight(1)
				sub, err := p.scanCharSet(caseInsensitive, false)
				if err != nil {
					return nil, err
				}
				cc.addSubtraction(sub)

				if p.charsRight() > 0 && p.rightChar(0) != ']' {
					return nil, p.getErr(ErrSubtractionMustBeLast)
				}
			} else {
				// Scan-only: skip the nested class; its result and any error
				// are discarded here.
				p.moveRight(1)
				p.scanCharSet(caseInsensitive, true)
			}
		} else {
			// A plain single character.
			if !scanOnly {
				cc.addRange(ch, ch)
			}
		}
	}

	if !closed {
		return nil, p.getErr(ErrUnterminatedBracket)
	}

	if !scanOnly && caseInsensitive {
		cc.addLowercase()
	}

	return cc, nil
}
|
|
|
// Scans any number of decimal digits (pegs value at 2^31-1 if too large) |
|
func (p *parser) scanDecimal() (int, error) { |
|
i := 0 |
|
var d int |
|
|
|
for p.charsRight() > 0 { |
|
d = int(p.rightChar(0) - '0') |
|
if d < 0 || d > 9 { |
|
break |
|
} |
|
p.moveRight(1) |
|
|
|
if i > maxValueDiv10 || (i == maxValueDiv10 && d > maxValueMod10) { |
|
return 0, p.getErr(ErrCaptureGroupOutOfRange) |
|
} |
|
|
|
i *= 10 |
|
i += d |
|
} |
|
|
|
return int(i), nil |
|
} |
|
|
|
// Returns true for options allowed only at the top level |
|
func isOnlyTopOption(option RegexOptions) bool { |
|
return option == RightToLeft || option == ECMAScript || option == RE2 |
|
} |
|
|
|
// Scans cimsx-cimsx option string, stops at the first unrecognized char. |
|
func (p *parser) scanOptions() { |
|
|
|
for off := false; p.charsRight() > 0; p.moveRight(1) { |
|
ch := p.rightChar(0) |
|
|
|
if ch == '-' { |
|
off = true |
|
} else if ch == '+' { |
|
off = false |
|
} else { |
|
option := optionFromCode(ch) |
|
if option == 0 || isOnlyTopOption(option) { |
|
return |
|
} |
|
|
|
if off { |
|
p.options &= ^option |
|
} else { |
|
p.options |= option |
|
} |
|
} |
|
} |
|
} |
|
|
|
// Scans \ code for escape codes that map to single unicode chars. |
|
func (p *parser) scanCharEscape() (rune, error) { |
|
|
|
ch := p.moveRightGetChar() |
|
|
|
if ch >= '0' && ch <= '7' { |
|
p.moveLeft() |
|
return p.scanOctal(), nil |
|
} |
|
|
|
switch ch { |
|
case 'x': |
|
// support for \x{HEX} syntax from Perl and PCRE |
|
if p.charsRight() > 0 && p.rightChar(0) == '{' { |
|
p.moveRight(1) |
|
return p.scanHexUntilBrace() |
|
} |
|
return p.scanHex(2) |
|
case 'u': |
|
return p.scanHex(4) |
|
case 'a': |
|
return '\u0007', nil |
|
case 'b': |
|
return '\b', nil |
|
case 'e': |
|
return '\u001B', nil |
|
case 'f': |
|
return '\f', nil |
|
case 'n': |
|
return '\n', nil |
|
case 'r': |
|
return '\r', nil |
|
case 't': |
|
return '\t', nil |
|
case 'v': |
|
return '\u000B', nil |
|
case 'c': |
|
return p.scanControl() |
|
default: |
|
if !p.useOptionE() && IsWordChar(ch) { |
|
return 0, p.getErr(ErrUnrecognizedEscape, string(ch)) |
|
} |
|
return ch, nil |
|
} |
|
} |
|
|
|
// Grabs and converts an ascii control character |
|
func (p *parser) scanControl() (rune, error) { |
|
if p.charsRight() <= 0 { |
|
return 0, p.getErr(ErrMissingControl) |
|
} |
|
|
|
ch := p.moveRightGetChar() |
|
|
|
// \ca interpreted as \cA |
|
|
|
if ch >= 'a' && ch <= 'z' { |
|
ch = (ch - ('a' - 'A')) |
|
} |
|
ch = (ch - '@') |
|
if ch >= 0 && ch < ' ' { |
|
return ch, nil |
|
} |
|
|
|
return 0, p.getErr(ErrUnrecognizedControl) |
|
|
|
} |
|
|
|
// Scan hex digits until we hit a closing brace. |
|
// Non-hex digits, hex value too large for UTF-8, or running out of chars are errors |
|
func (p *parser) scanHexUntilBrace() (rune, error) { |
|
// PCRE spec reads like unlimited hex digits are allowed, but unicode has a limit |
|
// so we can enforce that |
|
i := 0 |
|
hasContent := false |
|
|
|
for p.charsRight() > 0 { |
|
ch := p.moveRightGetChar() |
|
if ch == '}' { |
|
// hit our close brace, we're done here |
|
// prevent \x{} |
|
if !hasContent { |
|
return 0, p.getErr(ErrTooFewHex) |
|
} |
|
return rune(i), nil |
|
} |
|
hasContent = true |
|
// no brace needs to be hex digit |
|
d := hexDigit(ch) |
|
if d < 0 { |
|
return 0, p.getErr(ErrMissingBrace) |
|
} |
|
|
|
i *= 0x10 |
|
i += d |
|
|
|
if i > unicode.MaxRune { |
|
return 0, p.getErr(ErrInvalidHex) |
|
} |
|
} |
|
|
|
// we only make it here if we run out of digits without finding the brace |
|
return 0, p.getErr(ErrMissingBrace) |
|
} |
|
|
|
// Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF) |
|
func (p *parser) scanHex(c int) (rune, error) { |
|
|
|
i := 0 |
|
|
|
if p.charsRight() >= c { |
|
for c > 0 { |
|
d := hexDigit(p.moveRightGetChar()) |
|
if d < 0 { |
|
break |
|
} |
|
i *= 0x10 |
|
i += d |
|
c-- |
|
} |
|
} |
|
|
|
if c > 0 { |
|
return 0, p.getErr(ErrTooFewHex) |
|
} |
|
|
|
return rune(i), nil |
|
} |
|
|
|
// hexDigit returns the value (0-15) of the hex digit ch, accepting both
// lowercase and uppercase letters, or -1 if ch is not a hex digit.
func hexDigit(ch rune) int {
	switch {
	case '0' <= ch && ch <= '9':
		return int(ch - '0')
	case 'a' <= ch && ch <= 'f':
		return int(ch-'a') + 0xa
	case 'A' <= ch && ch <= 'F':
		return int(ch-'A') + 0xa
	}
	return -1
}
|
|
|
// Scans up to three octal digits (stops before exceeding 0377). |
|
func (p *parser) scanOctal() rune { |
|
// Consume octal chars only up to 3 digits and value 0377 |
|
|
|
c := 3 |
|
|
|
if c > p.charsRight() { |
|
c = p.charsRight() |
|
} |
|
|
|
//we know the first char is good because the caller had to check |
|
i := 0 |
|
d := int(p.rightChar(0) - '0') |
|
for c > 0 && d <= 7 { |
|
i *= 8 |
|
i += d |
|
if p.useOptionE() && i >= 0x20 { |
|
break |
|
} |
|
c-- |
|
|
|
p.moveRight(1) |
|
if !p.rightMost() { |
|
d = int(p.rightChar(0) - '0') |
|
} |
|
} |
|
|
|
// Octal codes only go up to 255. Any larger and the behavior that Perl follows |
|
// is simply to truncate the high bits. |
|
i &= 0xFF |
|
|
|
return rune(i) |
|
} |
|
|
|
// textpos returns the current parsing position, an index into p.pattern.
func (p *parser) textpos() int {
	return p.currentPos
}
|
|
|
// textto moves the parser to the absolute position pos. No bounds check is
// performed here.
func (p *parser) textto(pos int) {
	p.currentPos = pos
}
|
|
|
// moveRightGetChar returns the char at the current parsing position and then
// advances one position to the right. Callers must make sure at least one
// char remains; indexing past the end of the pattern panics.
func (p *parser) moveRightGetChar() rune {
	ch := p.pattern[p.currentPos]
	p.currentPos++
	return ch
}
|
|
|
// moveRight advances the current parsing position by i chars. No bounds
// check is performed here.
func (p *parser) moveRight(i int) {
	// default would be 1
	p.currentPos += i
}
|
|
|
// moveLeft moves the current parsing position one char to the left.
func (p *parser) moveLeft() {
	p.currentPos--
}
|
|
|
// charAt returns the char at absolute index i of the pattern, independent of
// the current parsing position.
func (p *parser) charAt(i int) rune {
	return p.pattern[i]
}
|
|
|
// rightChar returns the char i positions to the right of the current parsing
// position without moving; rightChar(0) is the char about to be read.
func (p *parser) rightChar(i int) rune {
	// default would be 0
	return p.pattern[p.currentPos+i]
}
|
|
|
// charsRight returns the number of chars from the current parsing position
// to the end of the pattern.
func (p *parser) charsRight() int {
	return len(p.pattern) - p.currentPos
}
|
|
|
// rightMost reports whether the parsing position is at the end of the pattern.
func (p *parser) rightMost() bool {
	return p.currentPos == len(p.pattern)
}
|
|
|
// captureSlotFromName looks up the slot number recorded for capname in
// p.capnames. An unknown name yields the map's zero value, 0; use
// isCaptureName to tell the two apart.
func (p *parser) captureSlotFromName(capname string) int {
	return p.capnames[capname]
}
|
|
|
// True if the capture slot was noted |
|
func (p *parser) isCaptureSlot(i int) bool { |
|
if p.caps != nil { |
|
_, ok := p.caps[i] |
|
return ok |
|
} |
|
|
|
return (i >= 0 && i < p.capsize) |
|
} |
|
|
|
// Looks up the slot number for a given name |
|
func (p *parser) isCaptureName(capname string) bool { |
|
if p.capnames == nil { |
|
return false |
|
} |
|
|
|
_, ok := p.capnames[capname] |
|
return ok |
|
} |
|
|
|
// option shortcuts |
|
|
|
// useOptionN reports whether the ExplicitCapture ("n") option, which disables
// '(' autocapture, is on.
func (p *parser) useOptionN() bool {
	return (p.options & ExplicitCapture) != 0
}
|
|
|
// useOptionI reports whether the IgnoreCase ("i") option enabling
// case-insensitivity is on.
func (p *parser) useOptionI() bool {
	return (p.options & IgnoreCase) != 0
}
|
|
|
// useOptionM reports whether the Multiline ("m") option, which alters the
// meaning of $ and ^, is on.
func (p *parser) useOptionM() bool {
	return (p.options & Multiline) != 0
}
|
|
|
// useOptionS reports whether the Singleline ("s") option, which alters the
// meaning of '.', is on.
func (p *parser) useOptionS() bool {
	return (p.options & Singleline) != 0
}
|
|
|
// useOptionX reports whether the IgnorePatternWhitespace ("x") option
// enabling whitespace/comment mode is on.
func (p *parser) useOptionX() bool {
	return (p.options & IgnorePatternWhitespace) != 0
}
|
|
|
// useOptionE reports whether the ECMAScript ("e") option enabling ECMAScript
// behavior is on.
func (p *parser) useOptionE() bool {
	return (p.options & ECMAScript) != 0
}
|
|
|
// useRE2 reports whether the RE2 compatibility option is on.
func (p *parser) useRE2() bool {
	return (p.options & RE2) != 0
}
|
|
|
// emptyOptionsStack reports whether the stack of saved options is empty.
func (p *parser) emptyOptionsStack() bool {
	return len(p.optionsStack) == 0
}
|
|
|
// addConcatenate finishes the current quantifiable unit when no quantifier
// follows it (or none is possible): the unit is appended unchanged to the
// current concatenation and then cleared.
func (p *parser) addConcatenate() {
	// The first (| inside a Testgroup group goes directly to the group
	// NOTE(review): nothing in this function inspects the group type, so the
	// remark above looks stale — confirm against the callers.
	p.concatenation.addChild(p.unit)
	p.unit = nil
}
|
|
|
// addConcatenate3 finishes the current quantifiable unit when a quantifier
// was found: the unit is wrapped via makeQuantifier(lazy, min, max), the
// result is appended to the current concatenation, and the unit is cleared.
func (p *parser) addConcatenate3(lazy bool, min, max int) {
	p.concatenation.addChild(p.unit.makeQuantifier(lazy, min, max))
	p.unit = nil
}
|
|
|
// Sets the current unit to a single char node |
|
func (p *parser) addUnitOne(ch rune) { |
|
if p.useOptionI() { |
|
ch = unicode.ToLower(ch) |
|
} |
|
|
|
p.unit = newRegexNodeCh(ntOne, p.options, ch) |
|
} |
|
|
|
// Sets the current unit to a single inverse-char node |
|
func (p *parser) addUnitNotone(ch rune) { |
|
if p.useOptionI() { |
|
ch = unicode.ToLower(ch) |
|
} |
|
|
|
p.unit = newRegexNodeCh(ntNotone, p.options, ch) |
|
} |
|
|
|
// addUnitSet sets the current unit to an ntSet node built from set and the
// current options.
func (p *parser) addUnitSet(set *CharSet) {
	p.unit = newRegexNodeSet(ntSet, p.options, set)
}
|
|
|
// addUnitNode sets the current unit to the given, already built subtree.
func (p *parser) addUnitNode(node *regexNode) {
	p.unit = node
}
|
|
|
// addUnitType sets the current unit to a new node of type t carrying the
// current options; the visible use is for assertion nodes.
func (p *parser) addUnitType(t nodeType) {
	p.unit = newRegexNode(t, p.options)
}
|
|
|
// Finish the current group (in response to a ')' or end) |
|
func (p *parser) addGroup() error { |
|
if p.group.t == ntTestgroup || p.group.t == ntTestref { |
|
p.group.addChild(p.concatenation.reverseLeft()) |
|
if (p.group.t == ntTestref && len(p.group.children) > 2) || len(p.group.children) > 3 { |
|
return p.getErr(ErrTooManyAlternates) |
|
} |
|
} else { |
|
p.alternation.addChild(p.concatenation.reverseLeft()) |
|
p.group.addChild(p.alternation) |
|
} |
|
|
|
p.unit = p.group |
|
return nil |
|
} |
|
|
|
// Pops the option stack, but keeps the current options unchanged. |
|
func (p *parser) popKeepOptions() { |
|
lastIdx := len(p.optionsStack) - 1 |
|
p.optionsStack = p.optionsStack[:lastIdx] |
|
} |
|
|
|
// Recalls options from the stack. |
|
func (p *parser) popOptions() { |
|
lastIdx := len(p.optionsStack) - 1 |
|
// get the last item on the stack and then remove it by reslicing |
|
p.options = p.optionsStack[lastIdx] |
|
p.optionsStack = p.optionsStack[:lastIdx] |
|
} |
|
|
|
// pushOptions saves the current options on top of the options stack.
func (p *parser) pushOptions() {
	p.optionsStack = append(p.optionsStack, p.options)
}
|
|
|
// Add a string to the last concatenate. |
|
func (p *parser) addToConcatenate(pos, cch int, isReplacement bool) { |
|
var node *regexNode |
|
|
|
if cch == 0 { |
|
return |
|
} |
|
|
|
if cch > 1 { |
|
str := p.pattern[pos : pos+cch] |
|
|
|
if p.useOptionI() && !isReplacement { |
|
// We do the ToLower character by character for consistency. With surrogate chars, doing |
|
// a ToLower on the entire string could actually change the surrogate pair. This is more correct |
|
// linguistically, but since Regex doesn't support surrogates, it's more important to be |
|
// consistent. |
|
for i := 0; i < len(str); i++ { |
|
str[i] = unicode.ToLower(str[i]) |
|
} |
|
} |
|
|
|
node = newRegexNodeStr(ntMulti, p.options, str) |
|
} else { |
|
ch := p.charAt(pos) |
|
|
|
if p.useOptionI() && !isReplacement { |
|
ch = unicode.ToLower(ch) |
|
} |
|
|
|
node = newRegexNodeCh(ntOne, p.options, ch) |
|
} |
|
|
|
p.concatenation.addChild(node) |
|
} |
|
|
|
// pushGroup saves the parser state (in response to an open paren) by chaining
// the current group, alternation and concatenation nodes onto the stack
// through their next pointers. The concatenation becomes the new stack top.
// popGroup unwinds the links in the reverse order.
func (p *parser) pushGroup() {
	p.group.next = p.stack
	p.alternation.next = p.group
	p.concatenation.next = p.alternation
	p.stack = p.concatenation
}
|
|
|
// Remember the pushed state (in response to a ')') |
|
func (p *parser) popGroup() error { |
|
p.concatenation = p.stack |
|
p.alternation = p.concatenation.next |
|
p.group = p.alternation.next |
|
p.stack = p.group.next |
|
|
|
// The first () inside a Testgroup group goes directly to the group |
|
if p.group.t == ntTestgroup && len(p.group.children) == 0 { |
|
if p.unit == nil { |
|
return p.getErr(ErrConditionalExpression) |
|
} |
|
|
|
p.group.addChild(p.unit) |
|
p.unit = nil |
|
} |
|
return nil |
|
} |
|
|
|
// emptyStack reports whether the group stack is empty.
func (p *parser) emptyStack() bool {
	return p.stack == nil
}
|
|
|
// startGroup begins a new round of parser state (in response to an open
// paren or the start of the pattern): openGroup becomes the current group and
// fresh, empty alternation and concatenation nodes are created with the
// current options.
func (p *parser) startGroup(openGroup *regexNode) {
	p.group = openGroup
	p.alternation = newRegexNode(ntAlternate, p.options)
	p.concatenation = newRegexNode(ntConcatenate, p.options)
}
|
|
|
// Finish the current concatenation (in response to a |) |
|
func (p *parser) addAlternate() { |
|
// The | parts inside a Testgroup group go directly to the group |
|
|
|
if p.group.t == ntTestgroup || p.group.t == ntTestref { |
|
p.group.addChild(p.concatenation.reverseLeft()) |
|
} else { |
|
p.alternation.addChild(p.concatenation.reverseLeft()) |
|
} |
|
|
|
p.concatenation = newRegexNode(ntConcatenate, p.options) |
|
} |
|
|
|
// Categories for classifying ASCII characters; these are the values stored in
// the _category table. The helpers below compare with >=, so the numeric
// order matters: Q > S > Z > X > E.
//
// Only Q is declared with the byte type; the others are untyped integer
// constants.
const (
	Q byte = 5 // quantifier
	S      = 4 // ordinary stopper
	Z      = 3 // ScanBlank stopper
	X      = 2 // whitespace
	E      = 1 // should be escaped
)
|
|
|
// _category maps each ASCII code point (0x00-0x7F) to one of the category
// constants above, or 0 for an ordinary character. Each row below covers 32
// consecutive code points.
var _category = []byte{
	// 0x00-0x1F: control characters; 0x09-0x0D (tab, LF, VT, FF, CR) are whitespace.
	0, 0, 0, 0, 0, 0, 0, 0, 0, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	// 0x20-0x3F: space ! " # $ % & ' ( ) * + , - . / 0-9 : ; < = > ?
	X, 0, 0, Z, S, 0, 0, 0, S, S, Q, Q, 0, 0, S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q,
	// 0x40-0x5F: @ A-Z [ \ ] ^ _
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, S, 0,
	// 0x60-0x7F: ` a-z { | } ~ DEL
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, S, 0, 0, 0,
}
|
|
|
func isSpace(ch rune) bool { |
|
return (ch <= ' ' && _category[ch] == X) |
|
} |
|
|
|
// Returns true for those characters that terminate a string of ordinary chars. |
|
func isSpecial(ch rune) bool { |
|
return (ch <= '|' && _category[ch] >= S) |
|
} |
|
|
|
// Returns true for those characters that terminate a string of ordinary chars. |
|
func isStopperX(ch rune) bool { |
|
return (ch <= '|' && _category[ch] >= X) |
|
} |
|
|
|
// Returns true for those characters that begin a quantifier. |
|
func isQuantifier(ch rune) bool { |
|
return (ch <= '{' && _category[ch] >= Q) |
|
} |
|
|
|
func (p *parser) isTrueQuantifier() bool { |
|
nChars := p.charsRight() |
|
if nChars == 0 { |
|
return false |
|
} |
|
|
|
startpos := p.textpos() |
|
ch := p.charAt(startpos) |
|
if ch != '{' { |
|
return ch <= '{' && _category[ch] >= Q |
|
} |
|
|
|
//UGLY: this is ugly -- the original code was ugly too |
|
pos := startpos |
|
for { |
|
nChars-- |
|
if nChars <= 0 { |
|
break |
|
} |
|
pos++ |
|
ch = p.charAt(pos) |
|
if ch < '0' || ch > '9' { |
|
break |
|
} |
|
} |
|
|
|
if nChars == 0 || pos-startpos == 1 { |
|
return false |
|
} |
|
if ch == '}' { |
|
return true |
|
} |
|
if ch != ',' { |
|
return false |
|
} |
|
for { |
|
nChars-- |
|
if nChars <= 0 { |
|
break |
|
} |
|
pos++ |
|
ch = p.charAt(pos) |
|
if ch < '0' || ch > '9' { |
|
break |
|
} |
|
} |
|
|
|
return nChars > 0 && ch == '}' |
|
}
|
|
|