|
- package syntax
-
- import (
- "fmt"
- "math"
- "os"
- "sort"
- "strconv"
- "unicode"
- )
-
// RegexOptions is a bit set of flags that control how a pattern is parsed.
// The quoted letter beside each flag is the inline option code that
// optionFromCode maps to it (Compiled and RE2 have no mapping there).
type RegexOptions int32

// NOTE(review): only IgnoreCase carries the explicit RegexOptions type; the
// remaining entries are untyped integer constants that convert implicitly
// wherever a RegexOptions value is expected.
const (
	IgnoreCase              RegexOptions = 0x0001 // "i"
	Multiline                            = 0x0002 // "m"
	ExplicitCapture                      = 0x0004 // "n"
	Compiled                             = 0x0008 // "c"
	Singleline                           = 0x0010 // "s"
	IgnorePatternWhitespace              = 0x0020 // "x"
	RightToLeft                          = 0x0040 // "r"
	Debug                                = 0x0080 // "d"
	ECMAScript                           = 0x0100 // "e"
	RE2                                  = 0x0200 // RE2 compat mode
)
-
- func optionFromCode(ch rune) RegexOptions {
- // case-insensitive
- switch ch {
- case 'i', 'I':
- return IgnoreCase
- case 'r', 'R':
- return RightToLeft
- case 'm', 'M':
- return Multiline
- case 'n', 'N':
- return ExplicitCapture
- case 's', 'S':
- return Singleline
- case 'x', 'X':
- return IgnorePatternWhitespace
- case 'd', 'D':
- return Debug
- case 'e', 'E':
- return ECMAScript
- default:
- return 0
- }
- }
-
// An Error describes a failure to parse a regular expression
// and gives the offending expression.
type Error struct {
	// Code identifies the failure; its text is used as the message, or as a
	// fmt format string when Args is non-empty.
	Code ErrorCode
	// Expr is the pattern that failed to parse.
	Expr string
	// Args are the values substituted into Code's format verbs.
	Args []interface{}
}
-
- func (e *Error) Error() string {
- if len(e.Args) == 0 {
- return "error parsing regexp: " + e.Code.String() + " in `" + e.Expr + "`"
- }
- return "error parsing regexp: " + fmt.Sprintf(e.Code.String(), e.Args...) + " in `" + e.Expr + "`"
- }
-
// An ErrorCode describes a failure to parse a regular expression.
// The string value is the human-readable message; messages containing a %v
// verb are formatted with Error.Args by (*Error).Error.
type ErrorCode string

// NOTE(review): only ErrInternalError carries the explicit ErrorCode type;
// the remaining entries are untyped string constants that convert implicitly
// wherever an ErrorCode is expected.
const (
	// internal issue
	ErrInternalError ErrorCode = "regexp/syntax: internal error"
	// Parser errors
	ErrUnterminatedComment        = "unterminated comment"
	ErrInvalidCharRange           = "invalid character class range"
	ErrInvalidRepeatSize          = "invalid repeat count"
	ErrInvalidUTF8                = "invalid UTF-8"
	ErrCaptureGroupOutOfRange     = "capture group number out of range"
	ErrUnexpectedParen            = "unexpected )"
	ErrMissingParen               = "missing closing )"
	ErrMissingBrace               = "missing closing }"
	ErrInvalidRepeatOp            = "invalid nested repetition operator"
	ErrMissingRepeatArgument      = "missing argument to repetition operator"
	ErrConditionalExpression      = "illegal conditional (?(...)) expression"
	ErrTooManyAlternates          = "too many | in (?()|)"
	ErrUnrecognizedGrouping       = "unrecognized grouping construct: (%v"
	ErrInvalidGroupName           = "invalid group name: group names must begin with a word character and have a matching terminator"
	ErrCapNumNotZero              = "capture number cannot be zero"
	ErrUndefinedBackRef           = "reference to undefined group number %v"
	ErrUndefinedNameRef           = "reference to undefined group name %v"
	ErrAlternationCantCapture     = "alternation conditions do not capture and cannot be named"
	ErrAlternationCantHaveComment = "alternation conditions cannot be comments"
	ErrMalformedReference         = "(?(%v) ) malformed"
	ErrUndefinedReference         = "(?(%v) ) reference to undefined group"
	ErrIllegalEndEscape           = "illegal \\ at end of pattern"
	ErrMalformedSlashP            = "malformed \\p{X} character escape"
	ErrIncompleteSlashP           = "incomplete \\p{X} character escape"
	ErrUnknownSlashP              = "unknown unicode category, script, or property '%v'"
	ErrUnrecognizedEscape         = "unrecognized escape sequence \\%v"
	ErrMissingControl             = "missing control character"
	ErrUnrecognizedControl        = "unrecognized control character"
	ErrTooFewHex                  = "insufficient hexadecimal digits"
	ErrInvalidHex                 = "hex values may not be larger than 0x10FFFF"
	ErrMalformedNameRef           = "malformed \\k<...> named back reference"
	ErrBadClassInCharRange        = "cannot include class \\%v in character range"
	ErrUnterminatedBracket        = "unterminated [] set"
	ErrSubtractionMustBeLast      = "a subtraction must be the last element in a character class"
	ErrReversedCharRange          = "[x-y] range in reverse order"
)
-
- func (e ErrorCode) String() string {
- return string(e)
- }
-
// parser holds the state used while turning a pattern into a tree of
// regexNode values. The same value is used for the capture-counting prescan
// and, after reset, for the real scan.
type parser struct {
	// Nodes under construction. scanRegex returns unit as its result and
	// scanReplacement returns concatenation; reset clears stack.
	// NOTE(review): the exact role of each pointer is defined by helpers
	// outside this section (startGroup, pushGroup, addUnit*, ...).
	stack         *regexNode
	group         *regexNode
	alternation   *regexNode
	concatenation *regexNode
	unit          *regexNode

	patternRaw string // the pattern as given; reported in errors by getErr
	pattern    []rune // the pattern decoded into runes by setPattern

	currentPos  int // reset to 0 by reset; presumably the scan offset into pattern — confirm in textpos/moveRight
	specialCase *unicode.SpecialCase

	autocap  int // next automatically numbered capture slot (see consumeAutocap)
	capcount int // number of distinct slots recorded by noteCaptureSlot
	captop   int // one past the highest recorded slot, capped at math.MaxInt32
	capsize  int

	caps     map[int]int    // capture slot -> pattern position where it was noted
	capnames map[string]int // group name -> pattern position, rewritten to name -> slot by assignNameSlots

	capnumlist  []int    // sorted list of used slots; only built when the numbering has gaps
	capnamelist []string // group names, in order of appearance until assignNameSlots rebuilds it in slot order

	options         RegexOptions   // options currently in effect
	optionsStack    []RegexOptions // saved options; truncated by reset
	ignoreNextParen bool           // set when the next "(" is an alternation condition and must not capture
}
-
// Overflow guards for accumulating a decimal number digit by digit into an
// int32-sized value: before computing n*10+digit, scanDollar rejects the
// digit when n > maxValueDiv10, or n == maxValueDiv10 and
// digit > maxValueMod10.
const (
	maxValueDiv10 int = math.MaxInt32 / 10
	maxValueMod10     = math.MaxInt32 % 10
)
-
- // Parse converts a regex string into a parse tree
- func Parse(re string, op RegexOptions) (*RegexTree, error) {
- p := parser{
- options: op,
- caps: make(map[int]int),
- }
- p.setPattern(re)
-
- if err := p.countCaptures(); err != nil {
- return nil, err
- }
-
- p.reset(op)
- root, err := p.scanRegex()
-
- if err != nil {
- return nil, err
- }
- tree := &RegexTree{
- root: root,
- caps: p.caps,
- capnumlist: p.capnumlist,
- captop: p.captop,
- Capnames: p.capnames,
- Caplist: p.capnamelist,
- options: op,
- }
-
- if tree.options&Debug > 0 {
- os.Stdout.WriteString(tree.Dump())
- }
-
- return tree, nil
- }
-
- func (p *parser) setPattern(pattern string) {
- p.patternRaw = pattern
- p.pattern = make([]rune, 0, len(pattern))
-
- //populate our rune array to handle utf8 encoding
- for _, r := range pattern {
- p.pattern = append(p.pattern, r)
- }
- }
- func (p *parser) getErr(code ErrorCode, args ...interface{}) error {
- return &Error{Code: code, Expr: p.patternRaw, Args: args}
- }
-
- func (p *parser) noteCaptureSlot(i, pos int) {
- if _, ok := p.caps[i]; !ok {
- // the rhs of the hashtable isn't used in the parser
- p.caps[i] = pos
- p.capcount++
-
- if p.captop <= i {
- if i == math.MaxInt32 {
- p.captop = i
- } else {
- p.captop = i + 1
- }
- }
- }
- }
-
- func (p *parser) noteCaptureName(name string, pos int) {
- if p.capnames == nil {
- p.capnames = make(map[string]int)
- }
-
- if _, ok := p.capnames[name]; !ok {
- p.capnames[name] = pos
- p.capnamelist = append(p.capnamelist, name)
- }
- }
-
- func (p *parser) assignNameSlots() {
- if p.capnames != nil {
- for _, name := range p.capnamelist {
- for p.isCaptureSlot(p.autocap) {
- p.autocap++
- }
- pos := p.capnames[name]
- p.capnames[name] = p.autocap
- p.noteCaptureSlot(p.autocap, pos)
-
- p.autocap++
- }
- }
-
- // if the caps array has at least one gap, construct the list of used slots
- if p.capcount < p.captop {
- p.capnumlist = make([]int, p.capcount)
- i := 0
-
- for k := range p.caps {
- p.capnumlist[i] = k
- i++
- }
-
- sort.Ints(p.capnumlist)
- }
-
- // merge capsnumlist into capnamelist
- if p.capnames != nil || p.capnumlist != nil {
- var oldcapnamelist []string
- var next int
- var k int
-
- if p.capnames == nil {
- oldcapnamelist = nil
- p.capnames = make(map[string]int)
- p.capnamelist = []string{}
- next = -1
- } else {
- oldcapnamelist = p.capnamelist
- p.capnamelist = []string{}
- next = p.capnames[oldcapnamelist[0]]
- }
-
- for i := 0; i < p.capcount; i++ {
- j := i
- if p.capnumlist != nil {
- j = p.capnumlist[i]
- }
-
- if next == j {
- p.capnamelist = append(p.capnamelist, oldcapnamelist[k])
- k++
-
- if k == len(oldcapnamelist) {
- next = -1
- } else {
- next = p.capnames[oldcapnamelist[k]]
- }
-
- } else {
- //feature: culture?
- str := strconv.Itoa(j)
- p.capnamelist = append(p.capnamelist, str)
- p.capnames[str] = j
- }
- }
- }
- }
-
- func (p *parser) consumeAutocap() int {
- r := p.autocap
- p.autocap++
- return r
- }
-
// countCaptures is a prescanner for deducing the slots used for
// captures by doing a partial tokenization of the pattern.
//
// Slot 0 is always noted, and automatic numbering starts at 1. Explicitly
// numbered and named groups are recorded as they are met; unnamed groups take
// the next automatic number unless ExplicitCapture is on or the paren is an
// alternation condition. Once the whole pattern has been walked,
// assignNameSlots gives the named groups their slot numbers.
//
// Only a scanDecimal failure is returned as an error. The results of
// scanBackslash, scanBlank, scanCharSet and scanCapname-related scanning are
// discarded here.
func (p *parser) countCaptures() error {
	var ch rune

	// slot 0 is recorded unconditionally
	p.noteCaptureSlot(0, 0)

	p.autocap = 1

	for p.charsRight() > 0 {
		pos := p.textpos()
		ch = p.moveRightGetChar()
		switch ch {
		case '\\':
			// skip over the escape so its contents aren't tokenized
			if p.charsRight() > 0 {
				p.scanBackslash(true)
			}

		case '#':
			// in x-mode, back up and let scanBlank consume from the '#'
			if p.useOptionX() {
				p.moveLeft()
				p.scanBlank()
			}

		case '[':
			// scan-only pass over the character class
			p.scanCharSet(false, true)

		case ')':
			if !p.emptyOptionsStack() {
				p.popOptions()
			}

		case '(':
			if p.charsRight() >= 2 && p.rightChar(1) == '#' && p.rightChar(0) == '?' {
				// (?# ... ) comment: back up and let scanBlank consume it
				p.moveLeft()
				p.scanBlank()
			} else {
				p.pushOptions()
				if p.charsRight() > 0 && p.rightChar(0) == '?' {
					// we have (?...
					p.moveRight(1)

					if p.charsRight() > 1 && (p.rightChar(0) == '<' || p.rightChar(0) == '\'') {
						// named group: (?<... or (?'...

						p.moveRight(1)
						ch = p.rightChar(0)

						// a leading '0' or a non-word character records nothing here
						if ch != '0' && IsWordChar(ch) {
							if ch >= '1' && ch <= '9' {
								// explicitly numbered group
								dec, err := p.scanDecimal()
								if err != nil {
									return err
								}
								p.noteCaptureSlot(dec, pos)
							} else {
								p.noteCaptureName(p.scanCapname(), pos)
							}
						}
					} else if p.useRE2() && p.charsRight() > 2 && (p.rightChar(0) == 'P' && p.rightChar(1) == '<') {
						// RE2-compat (?P<)
						p.moveRight(2)
						ch = p.rightChar(0)
						if IsWordChar(ch) {
							p.noteCaptureName(p.scanCapname(), pos)
						}

					} else {
						// (?...

						// get the options if it's an option construct (?cimsx-cimsx...)
						p.scanOptions()

						if p.charsRight() > 0 {
							if p.rightChar(0) == ')' {
								// (?cimsx-cimsx)
								p.moveRight(1)
								p.popKeepOptions()
							} else if p.rightChar(0) == '(' {
								// alternation construct: (?(foo)yes|no)
								// ignore the next paren so we don't capture the condition
								p.ignoreNextParen = true

								// continue from here so we don't reset ignoreNextParen
								continue
							}
						}
					}
				} else {
					// plain "(": takes the next automatic slot unless captures
					// are explicit-only or this paren is an alternation condition
					if !p.useOptionN() && !p.ignoreNextParen {
						p.noteCaptureSlot(p.consumeAutocap(), pos)
					}
				}
			}

			// the flag only ever applies to the paren immediately following
			p.ignoreNextParen = false

		}
	}

	p.assignNameSlots()
	return nil
}
-
- func (p *parser) reset(topopts RegexOptions) {
- p.currentPos = 0
- p.autocap = 1
- p.ignoreNextParen = false
-
- if len(p.optionsStack) > 0 {
- p.optionsStack = p.optionsStack[:0]
- }
-
- p.options = topopts
- p.stack = nil
- }
-
- func (p *parser) scanRegex() (*regexNode, error) {
- ch := '@' // nonspecial ch, means at beginning
- isQuant := false
-
- p.startGroup(newRegexNodeMN(ntCapture, p.options, 0, -1))
-
- for p.charsRight() > 0 {
- wasPrevQuantifier := isQuant
- isQuant = false
-
- if err := p.scanBlank(); err != nil {
- return nil, err
- }
-
- startpos := p.textpos()
-
- // move past all of the normal characters. We'll stop when we hit some kind of control character,
- // or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace.
- if p.useOptionX() {
- for p.charsRight() > 0 {
- ch = p.rightChar(0)
- //UGLY: clean up, this is ugly
- if !(!isStopperX(ch) || (ch == '{' && !p.isTrueQuantifier())) {
- break
- }
- p.moveRight(1)
- }
- } else {
- for p.charsRight() > 0 {
- ch = p.rightChar(0)
- if !(!isSpecial(ch) || ch == '{' && !p.isTrueQuantifier()) {
- break
- }
- p.moveRight(1)
- }
- }
-
- endpos := p.textpos()
-
- p.scanBlank()
-
- if p.charsRight() == 0 {
- ch = '!' // nonspecial, means at end
- } else if ch = p.rightChar(0); isSpecial(ch) {
- isQuant = isQuantifier(ch)
- p.moveRight(1)
- } else {
- ch = ' ' // nonspecial, means at ordinary char
- }
-
- if startpos < endpos {
- cchUnquantified := endpos - startpos
- if isQuant {
- cchUnquantified--
- }
- wasPrevQuantifier = false
-
- if cchUnquantified > 0 {
- p.addToConcatenate(startpos, cchUnquantified, false)
- }
-
- if isQuant {
- p.addUnitOne(p.charAt(endpos - 1))
- }
- }
-
- switch ch {
- case '!':
- goto BreakOuterScan
-
- case ' ':
- goto ContinueOuterScan
-
- case '[':
- cc, err := p.scanCharSet(p.useOptionI(), false)
- if err != nil {
- return nil, err
- }
- p.addUnitSet(cc)
-
- case '(':
- p.pushOptions()
-
- if grouper, err := p.scanGroupOpen(); err != nil {
- return nil, err
- } else if grouper == nil {
- p.popKeepOptions()
- } else {
- p.pushGroup()
- p.startGroup(grouper)
- }
-
- continue
-
- case '|':
- p.addAlternate()
- goto ContinueOuterScan
-
- case ')':
- if p.emptyStack() {
- return nil, p.getErr(ErrUnexpectedParen)
- }
-
- if err := p.addGroup(); err != nil {
- return nil, err
- }
- if err := p.popGroup(); err != nil {
- return nil, err
- }
- p.popOptions()
-
- if p.unit == nil {
- goto ContinueOuterScan
- }
-
- case '\\':
- n, err := p.scanBackslash(false)
- if err != nil {
- return nil, err
- }
- p.addUnitNode(n)
-
- case '^':
- if p.useOptionM() {
- p.addUnitType(ntBol)
- } else {
- p.addUnitType(ntBeginning)
- }
-
- case '$':
- if p.useOptionM() {
- p.addUnitType(ntEol)
- } else {
- p.addUnitType(ntEndZ)
- }
-
- case '.':
- if p.useOptionE() {
- p.addUnitSet(ECMAAnyClass())
- } else if p.useOptionS() {
- p.addUnitSet(AnyClass())
- } else {
- p.addUnitNotone('\n')
- }
-
- case '{', '*', '+', '?':
- if p.unit == nil {
- if wasPrevQuantifier {
- return nil, p.getErr(ErrInvalidRepeatOp)
- } else {
- return nil, p.getErr(ErrMissingRepeatArgument)
- }
- }
- p.moveLeft()
-
- default:
- return nil, p.getErr(ErrInternalError)
- }
-
- if err := p.scanBlank(); err != nil {
- return nil, err
- }
-
- if p.charsRight() > 0 {
- isQuant = p.isTrueQuantifier()
- }
- if p.charsRight() == 0 || !isQuant {
- //maintain odd C# assignment order -- not sure if required, could clean up?
- p.addConcatenate()
- goto ContinueOuterScan
- }
-
- ch = p.moveRightGetChar()
-
- // Handle quantifiers
- for p.unit != nil {
- var min, max int
- var lazy bool
-
- switch ch {
- case '*':
- min = 0
- max = math.MaxInt32
-
- case '?':
- min = 0
- max = 1
-
- case '+':
- min = 1
- max = math.MaxInt32
-
- case '{':
- {
- var err error
- startpos = p.textpos()
- if min, err = p.scanDecimal(); err != nil {
- return nil, err
- }
- max = min
- if startpos < p.textpos() {
- if p.charsRight() > 0 && p.rightChar(0) == ',' {
- p.moveRight(1)
- if p.charsRight() == 0 || p.rightChar(0) == '}' {
- max = math.MaxInt32
- } else {
- if max, err = p.scanDecimal(); err != nil {
- return nil, err
- }
- }
- }
- }
-
- if startpos == p.textpos() || p.charsRight() == 0 || p.moveRightGetChar() != '}' {
- p.addConcatenate()
- p.textto(startpos - 1)
- goto ContinueOuterScan
- }
- }
-
- default:
- return nil, p.getErr(ErrInternalError)
- }
-
- if err := p.scanBlank(); err != nil {
- return nil, err
- }
-
- if p.charsRight() == 0 || p.rightChar(0) != '?' {
- lazy = false
- } else {
- p.moveRight(1)
- lazy = true
- }
-
- if min > max {
- return nil, p.getErr(ErrInvalidRepeatSize)
- }
-
- p.addConcatenate3(lazy, min, max)
- }
-
- ContinueOuterScan:
- }
-
- BreakOuterScan:
- ;
-
- if !p.emptyStack() {
- return nil, p.getErr(ErrMissingParen)
- }
-
- if err := p.addGroup(); err != nil {
- return nil, err
- }
-
- return p.unit, nil
-
- }
-
- /*
- * Simple parsing for replacement patterns
- */
- func (p *parser) scanReplacement() (*regexNode, error) {
- var c, startpos int
-
- p.concatenation = newRegexNode(ntConcatenate, p.options)
-
- for {
- c = p.charsRight()
- if c == 0 {
- break
- }
-
- startpos = p.textpos()
-
- for c > 0 && p.rightChar(0) != '$' {
- p.moveRight(1)
- c--
- }
-
- p.addToConcatenate(startpos, p.textpos()-startpos, true)
-
- if c > 0 {
- if p.moveRightGetChar() == '$' {
- n, err := p.scanDollar()
- if err != nil {
- return nil, err
- }
- p.addUnitNode(n)
- }
- p.addConcatenate()
- }
- }
-
- return p.concatenation, nil
- }
-
// scanDollar scans the $ patterns recognized within replacement patterns.
// It is called with the position just after a '$' and returns either an
// ntRef node for the substitution that was recognized or, when what follows
// is not a valid substitution, an ntOne node for a literal '$' with the
// position rewound to just after that '$'.
//
// An error is returned only when a number fails to scan (scanDecimal) or,
// in the ECMAScript digit path, when the number would overflow.
func (p *parser) scanDollar() (*regexNode, error) {
	// a '$' at the very end is a literal '$'
	if p.charsRight() == 0 {
		return newRegexNodeCh(ntOne, p.options, '$'), nil
	}

	ch := p.rightChar(0)
	angled := false
	backpos := p.textpos()
	lastEndPos := backpos

	// Note the opening brace of ${...}

	if ch == '{' && p.charsRight() > 1 {
		angled = true
		p.moveRight(1)
		ch = p.rightChar(0)
	}

	// Try to parse backreference: $1 or ${1} or ${cap}

	if ch >= '0' && ch <= '9' {
		if !angled && p.useOptionE() {
			// ECMAScript mode, unbraced: take the longest run of digits that
			// still names an existing capture slot
			capnum := -1
			newcapnum := int(ch - '0')
			p.moveRight(1)
			if p.isCaptureSlot(newcapnum) {
				capnum = newcapnum
				lastEndPos = p.textpos()
			}

			for p.charsRight() > 0 {
				ch = p.rightChar(0)
				if ch < '0' || ch > '9' {
					break
				}
				digit := int(ch - '0')
				// reject the digit before newcapnum*10+digit can exceed math.MaxInt32
				if newcapnum > maxValueDiv10 || (newcapnum == maxValueDiv10 && digit > maxValueMod10) {
					return nil, p.getErr(ErrCaptureGroupOutOfRange)
				}

				newcapnum = newcapnum*10 + digit

				p.moveRight(1)
				if p.isCaptureSlot(newcapnum) {
					capnum = newcapnum
					lastEndPos = p.textpos()
				}
			}
			// rewind to the end of the longest valid slot number (or to the
			// start when none matched)
			p.textto(lastEndPos)
			if capnum >= 0 {
				return newRegexNodeM(ntRef, p.options, capnum), nil
			}
		} else {
			capnum, err := p.scanDecimal()
			if err != nil {
				return nil, err
			}
			// the braced form must be closed by '}'
			if !angled || p.charsRight() > 0 && p.moveRightGetChar() == '}' {
				if p.isCaptureSlot(capnum) {
					return newRegexNodeM(ntRef, p.options, capnum), nil
				}
			}
		}
	} else if angled && IsWordChar(ch) {
		// ${name}
		capname := p.scanCapname()

		if p.charsRight() > 0 && p.moveRightGetChar() == '}' {
			if p.isCaptureName(capname) {
				return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
			}
		}
	} else if !angled {
		// single-character substitutions; capnum stays 1 when ch is none of them
		capnum := 1

		switch ch {
		case '$':
			// "$$" is an escaped literal '$'
			p.moveRight(1)
			return newRegexNodeCh(ntOne, p.options, '$'), nil
		case '&':
			capnum = 0
		case '`':
			capnum = replaceLeftPortion
		case '\'':
			capnum = replaceRightPortion
		case '+':
			capnum = replaceLastGroup
		case '_':
			capnum = replaceWholeString
		}

		if capnum != 1 {
			p.moveRight(1)
			return newRegexNodeM(ntRef, p.options, capnum), nil
		}
	}

	// unrecognized $: literalize

	p.textto(backpos)
	return newRegexNodeCh(ntOne, p.options, '$'), nil
}
-
// scanGroupOpen scans chars following a '(' (not counting the '('), and returns
// a RegexNode for the type of group scanned, or nil if the group
// simply changed options (?cimsx-cimsx) or was a comment (#...).
//
// Constructs handled after "(?":
//
//	:            non-capturing group
//	= !          lookahead (RightToLeft is cleared)
//	>            ntGreedy group
//	<= <!        lookbehind (RightToLeft is set)
//	<n> 'n'      named/numbered capture, optionally with a "-other" part
//	(            alternation condition: (?(cond)yes|no)
//	P<n>         named capture, only in RE2 mode
//	otherwise    inline options, optionally followed by ':'
//
// Anything else yields ErrUnrecognizedGrouping, with the text scanned since
// the opening paren as its argument.
func (p *parser) scanGroupOpen() (*regexNode, error) {
	var ch rune
	var nt nodeType
	var err error
	close := '>' // terminator for a group name; switched to '\'' for (?'name'...)
	start := p.textpos()

	// just return a RegexNode if we have:
	// 1. "(" followed by nothing
	// 2. "(x" where x != ?
	// 3. "(?)"
	if p.charsRight() == 0 || p.rightChar(0) != '?' || (p.rightChar(0) == '?' && (p.charsRight() > 1 && p.rightChar(1) == ')')) {
		if p.useOptionN() || p.ignoreNextParen {
			p.ignoreNextParen = false
			return newRegexNode(ntGroup, p.options), nil
		}
		return newRegexNodeMN(ntCapture, p.options, p.consumeAutocap(), -1), nil
	}

	// step over the '?'
	p.moveRight(1)

	// The loop body runs at most once: every path through the switch either
	// returns, jumps to BreakRecognize, or reaches the return after the switch.
	for {
		if p.charsRight() == 0 {
			break
		}

		switch ch = p.moveRightGetChar(); ch {
		case ':':
			nt = ntGroup

		case '=':
			p.options &= ^RightToLeft
			nt = ntRequire

		case '!':
			p.options &= ^RightToLeft
			nt = ntPrevent

		case '>':
			nt = ntGreedy

		case '\'':
			close = '\''
			fallthrough

		case '<':
			if p.charsRight() == 0 {
				goto BreakRecognize
			}

			switch ch = p.moveRightGetChar(); ch {
			case '=':
				// (?'= is not a lookbehind
				if close == '\'' {
					goto BreakRecognize
				}

				p.options |= RightToLeft
				nt = ntRequire

			case '!':
				// (?'! is not a lookbehind
				if close == '\'' {
					goto BreakRecognize
				}

				p.options |= RightToLeft
				nt = ntPrevent

			default:
				// a group name or number: put the character back and scan it
				p.moveLeft()
				capnum := -1
				uncapnum := -1
				proceed := false

				// grab part before -

				if ch >= '0' && ch <= '9' {
					if capnum, err = p.scanDecimal(); err != nil {
						return nil, err
					}

					if !p.isCaptureSlot(capnum) {
						capnum = -1
					}

					// check if we have bogus characters after the number
					if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
						return nil, p.getErr(ErrInvalidGroupName)
					}
					if capnum == 0 {
						return nil, p.getErr(ErrCapNumNotZero)
					}
				} else if IsWordChar(ch) {
					capname := p.scanCapname()

					if p.isCaptureName(capname) {
						capnum = p.captureSlotFromName(capname)
					}

					// check if we have bogus character after the name
					if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
						return nil, p.getErr(ErrInvalidGroupName)
					}
				} else if ch == '-' {
					// no part before the '-': only the part after it is given
					proceed = true
				} else {
					// bad group name - starts with something other than a word character and isn't a number
					return nil, p.getErr(ErrInvalidGroupName)
				}

				// grab part after - if any

				if (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' {
					p.moveRight(1)

					//no more chars left, no closing char, etc
					if p.charsRight() == 0 {
						return nil, p.getErr(ErrInvalidGroupName)
					}

					ch = p.rightChar(0)
					if ch >= '0' && ch <= '9' {
						if uncapnum, err = p.scanDecimal(); err != nil {
							return nil, err
						}

						// the group after '-' must already exist
						if !p.isCaptureSlot(uncapnum) {
							return nil, p.getErr(ErrUndefinedBackRef, uncapnum)
						}

						// check if we have bogus characters after the number
						if p.charsRight() > 0 && p.rightChar(0) != close {
							return nil, p.getErr(ErrInvalidGroupName)
						}
					} else if IsWordChar(ch) {
						uncapname := p.scanCapname()

						// the group after '-' must already exist
						if !p.isCaptureName(uncapname) {
							return nil, p.getErr(ErrUndefinedNameRef, uncapname)
						}
						uncapnum = p.captureSlotFromName(uncapname)

						// check if we have bogus character after the name
						if p.charsRight() > 0 && p.rightChar(0) != close {
							return nil, p.getErr(ErrInvalidGroupName)
						}
					} else {
						// bad group name - starts with something other than a word character and isn't a number
						return nil, p.getErr(ErrInvalidGroupName)
					}
				}

				// actually make the node

				if (capnum != -1 || uncapnum != -1) && p.charsRight() > 0 && p.moveRightGetChar() == close {
					return newRegexNodeMN(ntCapture, p.options, capnum, uncapnum), nil
				}
				goto BreakRecognize
			}

		case '(':
			// alternation construct (?(...) | )

			parenPos := p.textpos()
			if p.charsRight() > 0 {
				ch = p.rightChar(0)

				// check if the alternation condition is a backref
				if ch >= '0' && ch <= '9' {
					var capnum int
					if capnum, err = p.scanDecimal(); err != nil {
						return nil, err
					}
					if p.charsRight() > 0 && p.moveRightGetChar() == ')' {
						if p.isCaptureSlot(capnum) {
							return newRegexNodeM(ntTestref, p.options, capnum), nil
						}
						return nil, p.getErr(ErrUndefinedReference, capnum)
					}

					return nil, p.getErr(ErrMalformedReference, capnum)

				} else if IsWordChar(ch) {
					capname := p.scanCapname()

					// a word that isn't a known group name followed by ')'
					// falls through and is treated as an expression condition
					if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' {
						return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil
					}
				}
			}
			// not a backref
			nt = ntTestgroup
			p.textto(parenPos - 1)   // jump to the start of the parentheses
			p.ignoreNextParen = true // but make sure we don't try to capture the insides

			charsRight := p.charsRight()
			if charsRight >= 3 && p.rightChar(1) == '?' {
				rightchar2 := p.rightChar(2)
				// disallow comments in the condition
				if rightchar2 == '#' {
					return nil, p.getErr(ErrAlternationCantHaveComment)
				}

				// disallow named capture group (?'..'..) in the condition
				if rightchar2 == '\'' {
					return nil, p.getErr(ErrAlternationCantCapture)
				}

				// disallow named capture group (?<..>..) in the condition,
				// while still allowing the lookbehinds (?<! and (?<=
				if charsRight >= 4 && (rightchar2 == '<' && p.rightChar(3) != '!' && p.rightChar(3) != '=') {
					return nil, p.getErr(ErrAlternationCantCapture)
				}
			}

		case 'P':
			if p.useRE2() {
				// support for P<name> syntax
				if p.charsRight() < 3 {
					goto BreakRecognize
				}

				ch = p.moveRightGetChar()
				if ch != '<' {
					goto BreakRecognize
				}

				// peek at the first character of the name
				ch = p.moveRightGetChar()
				p.moveLeft()

				if IsWordChar(ch) {
					capnum := -1
					capname := p.scanCapname()

					if p.isCaptureName(capname) {
						capnum = p.captureSlotFromName(capname)
					}

					// check if we have bogus character after the name
					if p.charsRight() > 0 && p.rightChar(0) != '>' {
						return nil, p.getErr(ErrInvalidGroupName)
					}

					// actually make the node

					if capnum != -1 && p.charsRight() > 0 && p.moveRightGetChar() == '>' {
						return newRegexNodeMN(ntCapture, p.options, capnum, -1), nil
					}
					goto BreakRecognize

				} else {
					// bad group name - starts with something other than a word character and isn't a number
					return nil, p.getErr(ErrInvalidGroupName)
				}
			}
			// if we're not using RE2 compat mode then
			// we just behave like normal
			fallthrough

		default:
			// put the character back: it may be the first option letter
			p.moveLeft()

			nt = ntGroup
			// disallow options in the children of a testgroup node
			if p.group.t != ntTestgroup {
				p.scanOptions()
			}
			if p.charsRight() == 0 {
				goto BreakRecognize
			}

			// (?opts) only changes options: report that with a nil node
			if ch = p.moveRightGetChar(); ch == ')' {
				return nil, nil
			}

			if ch != ':' {
				goto BreakRecognize
			}

		}

		return newRegexNode(nt, p.options), nil
	}

BreakRecognize:

	// break Recognize comes here

	return nil, p.getErr(ErrUnrecognizedGrouping, string(p.pattern[start:p.textpos()]))
}
-
- // scans backslash specials and basics
- func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
-
- if p.charsRight() == 0 {
- return nil, p.getErr(ErrIllegalEndEscape)
- }
-
- switch ch := p.rightChar(0); ch {
- case 'b', 'B', 'A', 'G', 'Z', 'z':
- p.moveRight(1)
- return newRegexNode(p.typeFromCode(ch), p.options), nil
-
- case 'w':
- p.moveRight(1)
- if p.useOptionE() {
- return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil
- }
- return newRegexNodeSet(ntSet, p.options, WordClass()), nil
-
- case 'W':
- p.moveRight(1)
- if p.useOptionE() {
- return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil
- }
- return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil
-
- case 's':
- p.moveRight(1)
- if p.useOptionE() {
- return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil
- }
- return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil
-
- case 'S':
- p.moveRight(1)
- if p.useOptionE() {
- return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil
- }
- return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil
-
- case 'd':
- p.moveRight(1)
- if p.useOptionE() {
- return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil
- }
- return newRegexNodeSet(ntSet, p.options, DigitClass()), nil
-
- case 'D':
- p.moveRight(1)
- if p.useOptionE() {
- return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil
- }
- return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil
-
- case 'p', 'P':
- p.moveRight(1)
- prop, err := p.parseProperty()
- if err != nil {
- return nil, err
- }
- cc := &CharSet{}
- cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw)
- if p.useOptionI() {
- cc.addLowercase()
- }
-
- return newRegexNodeSet(ntSet, p.options, cc), nil
-
- default:
- return p.scanBasicBackslash(scanOnly)
- }
- }
-
// scanBasicBackslash scans \-style backreferences and character escapes.
//
// Recognized backreference forms are \k<name>, \k'name', \<name>, \'name'
// (each also accepting a number in place of the name) and \N for a decimal
// N starting with 1-9. When the text is not a valid backreference the
// position is rewound to just after the backslash and the text is scanned as
// a character escape instead, which is lower-cased when IgnoreCase is on.
//
// When scanOnly is true, a \N form returns (nil, nil) as soon as the number
// has been scanned, without checking that the group exists.
func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) {
	if p.charsRight() == 0 {
		return nil, p.getErr(ErrIllegalEndEscape)
	}
	angled := false
	close := '\x00' // terminator expected after a delimited name: '>' or '\''

	backpos := p.textpos()
	ch := p.rightChar(0)

	// allow \k<foo> instead of \<foo>, which is now deprecated

	if ch == 'k' {
		if p.charsRight() >= 2 {
			p.moveRight(1)
			ch = p.moveRightGetChar()

			if ch == '<' || ch == '\'' {
				angled = true
				if ch == '\'' {
					close = '\''
				} else {
					close = '>'
				}
			}
		}

		// \k must be followed by an opening delimiter and at least one more char
		if !angled || p.charsRight() <= 0 {
			return nil, p.getErr(ErrMalformedNameRef)
		}

		ch = p.rightChar(0)

	} else if (ch == '<' || ch == '\'') && p.charsRight() > 1 { // Note angle without \k
		angled = true
		if ch == '\'' {
			close = '\''
		} else {
			close = '>'
		}

		p.moveRight(1)
		ch = p.rightChar(0)
	}

	// Try to parse backreference: \<1> or \<cap>

	if angled && ch >= '0' && ch <= '9' {
		capnum, err := p.scanDecimal()
		if err != nil {
			return nil, err
		}

		if p.charsRight() > 0 && p.moveRightGetChar() == close {
			if p.isCaptureSlot(capnum) {
				return newRegexNodeM(ntRef, p.options, capnum), nil
			}
			return nil, p.getErr(ErrUndefinedBackRef, capnum)
		}
	} else if !angled && ch >= '1' && ch <= '9' { // Try to parse backreference or octal: \1
		capnum, err := p.scanDecimal()
		if err != nil {
			return nil, err
		}

		if scanOnly {
			return nil, nil
		}

		if p.isCaptureSlot(capnum) {
			return newRegexNodeM(ntRef, p.options, capnum), nil
		}
		// a single-digit reference to a missing group is an error outside
		// ECMAScript mode; other cases fall through to the char-escape scan
		if capnum <= 9 && !p.useOptionE() {
			return nil, p.getErr(ErrUndefinedBackRef, capnum)
		}

	} else if angled && IsWordChar(ch) {
		capname := p.scanCapname()

		if p.charsRight() > 0 && p.moveRightGetChar() == close {
			if p.isCaptureName(capname) {
				return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
			}
			return nil, p.getErr(ErrUndefinedNameRef, capname)
		}
	}

	// Not backreference: must be char code

	p.textto(backpos)
	ch, err := p.scanCharEscape()
	if err != nil {
		return nil, err
	}

	if p.useOptionI() {
		ch = unicode.ToLower(ch)
	}

	return newRegexNodeCh(ntOne, p.options, ch), nil
}
-
- // Scans X for \p{X} or \P{X}
- func (p *parser) parseProperty() (string, error) {
- if p.charsRight() < 3 {
- return "", p.getErr(ErrIncompleteSlashP)
- }
- ch := p.moveRightGetChar()
- if ch != '{' {
- return "", p.getErr(ErrMalformedSlashP)
- }
-
- startpos := p.textpos()
- for p.charsRight() > 0 {
- ch = p.moveRightGetChar()
- if !(IsWordChar(ch) || ch == '-') {
- p.moveLeft()
- break
- }
- }
- capname := string(p.pattern[startpos:p.textpos()])
-
- if p.charsRight() == 0 || p.moveRightGetChar() != '}' {
- return "", p.getErr(ErrIncompleteSlashP)
- }
-
- if !isValidUnicodeCat(capname) {
- return "", p.getErr(ErrUnknownSlashP, capname)
- }
-
- return capname, nil
- }
-
- // Returns ReNode type for zero-length assertions with a \ code.
- func (p *parser) typeFromCode(ch rune) nodeType {
- switch ch {
- case 'b':
- if p.useOptionE() {
- return ntECMABoundary
- }
- return ntBoundary
- case 'B':
- if p.useOptionE() {
- return ntNonECMABoundary
- }
- return ntNonboundary
- case 'A':
- return ntBeginning
- case 'G':
- return ntStart
- case 'Z':
- return ntEndZ
- case 'z':
- return ntEnd
- default:
- return ntNothing
- }
- }
-
- // Scans whitespace or x-mode comments.
- func (p *parser) scanBlank() error {
- if p.useOptionX() {
- for {
- for p.charsRight() > 0 && isSpace(p.rightChar(0)) {
- p.moveRight(1)
- }
-
- if p.charsRight() == 0 {
- break
- }
-
- if p.rightChar(0) == '#' {
- for p.charsRight() > 0 && p.rightChar(0) != '\n' {
- p.moveRight(1)
- }
- } else if p.charsRight() >= 3 && p.rightChar(2) == '#' &&
- p.rightChar(1) == '?' && p.rightChar(0) == '(' {
- for p.charsRight() > 0 && p.rightChar(0) != ')' {
- p.moveRight(1)
- }
- if p.charsRight() == 0 {
- return p.getErr(ErrUnterminatedComment)
- }
- p.moveRight(1)
- } else {
- break
- }
- }
- } else {
- for {
- if p.charsRight() < 3 || p.rightChar(2) != '#' ||
- p.rightChar(1) != '?' || p.rightChar(0) != '(' {
- return nil
- }
-
- for p.charsRight() > 0 && p.rightChar(0) != ')' {
- p.moveRight(1)
- }
- if p.charsRight() == 0 {
- return p.getErr(ErrUnterminatedComment)
- }
- p.moveRight(1)
- }
- }
- return nil
- }
-
- func (p *parser) scanCapname() string {
- startpos := p.textpos()
-
- for p.charsRight() > 0 {
- if !IsWordChar(p.moveRightGetChar()) {
- p.moveLeft()
- break
- }
- }
-
- return string(p.pattern[startpos:p.textpos()])
- }
-
- //Scans contents of [] (not including []'s), and converts to a set.
- func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
- ch := '\x00'
- chPrev := '\x00'
- inRange := false
- firstChar := true
- closed := false
-
- var cc *CharSet
- if !scanOnly {
- cc = &CharSet{}
- }
-
- if p.charsRight() > 0 && p.rightChar(0) == '^' {
- p.moveRight(1)
- if !scanOnly {
- cc.negate = true
- }
- }
-
- for ; p.charsRight() > 0; firstChar = false {
- fTranslatedChar := false
- ch = p.moveRightGetChar()
- if ch == ']' {
- if !firstChar {
- closed = true
- break
- } else if p.useOptionE() {
- if !scanOnly {
- cc.addRanges(NoneClass().ranges)
- }
- closed = true
- break
- }
-
- } else if ch == '\\' && p.charsRight() > 0 {
- switch ch = p.moveRightGetChar(); ch {
- case 'D', 'd':
- if !scanOnly {
- if inRange {
- return nil, p.getErr(ErrBadClassInCharRange, ch)
- }
- cc.addDigit(p.useOptionE(), ch == 'D', p.patternRaw)
- }
- continue
-
- case 'S', 's':
- if !scanOnly {
- if inRange {
- return nil, p.getErr(ErrBadClassInCharRange, ch)
- }
- cc.addSpace(p.useOptionE(), ch == 'S')
- }
- continue
-
- case 'W', 'w':
- if !scanOnly {
- if inRange {
- return nil, p.getErr(ErrBadClassInCharRange, ch)
- }
-
- cc.addWord(p.useOptionE(), ch == 'W')
- }
- continue
-
- case 'p', 'P':
- if !scanOnly {
- if inRange {
- return nil, p.getErr(ErrBadClassInCharRange, ch)
- }
- prop, err := p.parseProperty()
- if err != nil {
- return nil, err
- }
- cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw)
- } else {
- p.parseProperty()
- }
-
- continue
-
- case '-':
- if !scanOnly {
- cc.addRange(ch, ch)
- }
- continue
-
- default:
- p.moveLeft()
- var err error
- ch, err = p.scanCharEscape() // non-literal character
- if err != nil {
- return nil, err
- }
- fTranslatedChar = true
- break // this break will only break out of the switch
- }
- } else if ch == '[' {
- // This is code for Posix style properties - [:Ll:] or [:IsTibetan:].
- // It currently doesn't do anything other than skip the whole thing!
- if p.charsRight() > 0 && p.rightChar(0) == ':' && !inRange {
- savePos := p.textpos()
-
- p.moveRight(1)
- negate := false
- if p.charsRight() > 1 && p.rightChar(0) == '^' {
- negate = true
- p.moveRight(1)
- }
-
- nm := p.scanCapname() // snag the name
- if !scanOnly && p.useRE2() {
- // look up the name since these are valid for RE2
- // add the group based on the name
- if ok := cc.addNamedASCII(nm, negate); !ok {
- return nil, p.getErr(ErrInvalidCharRange)
- }
- }
- if p.charsRight() < 2 || p.moveRightGetChar() != ':' || p.moveRightGetChar() != ']' {
- p.textto(savePos)
- } else if p.useRE2() {
- // move on
- continue
- }
- }
- }
-
- if inRange {
- inRange = false
- if !scanOnly {
- if ch == '[' && !fTranslatedChar && !firstChar {
- // We thought we were in a range, but we're actually starting a subtraction.
- // In that case, we'll add chPrev to our char class, skip the opening [, and
- // scan the new character class recursively.
- cc.addChar(chPrev)
- sub, err := p.scanCharSet(caseInsensitive, false)
- if err != nil {
- return nil, err
- }
- cc.addSubtraction(sub)
-
- if p.charsRight() > 0 && p.rightChar(0) != ']' {
- return nil, p.getErr(ErrSubtractionMustBeLast)
- }
- } else {
- // a regular range, like a-z
- if chPrev > ch {
- return nil, p.getErr(ErrReversedCharRange)
- }
- cc.addRange(chPrev, ch)
- }
- }
- } else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' {
- // this could be the start of a range
- chPrev = ch
- inRange = true
- p.moveRight(1)
- } else if p.charsRight() >= 1 && ch == '-' && !fTranslatedChar && p.rightChar(0) == '[' && !firstChar {
- // we aren't in a range, and now there is a subtraction. Usually this happens
- // only when a subtraction follows a range, like [a-z-[b]]
- if !scanOnly {
- p.moveRight(1)
- sub, err := p.scanCharSet(caseInsensitive, false)
- if err != nil {
- return nil, err
- }
- cc.addSubtraction(sub)
-
- if p.charsRight() > 0 && p.rightChar(0) != ']' {
- return nil, p.getErr(ErrSubtractionMustBeLast)
- }
- } else {
- p.moveRight(1)
- p.scanCharSet(caseInsensitive, true)
- }
- } else {
- if !scanOnly {
- cc.addRange(ch, ch)
- }
- }
- }
-
- if !closed {
- return nil, p.getErr(ErrUnterminatedBracket)
- }
-
- if !scanOnly && caseInsensitive {
- cc.addLowercase()
- }
-
- return cc, nil
- }
-
- // Scans any number of decimal digits (pegs value at 2^31-1 if too large)
- func (p *parser) scanDecimal() (int, error) {
- i := 0
- var d int
-
- for p.charsRight() > 0 {
- d = int(p.rightChar(0) - '0')
- if d < 0 || d > 9 {
- break
- }
- p.moveRight(1)
-
- if i > maxValueDiv10 || (i == maxValueDiv10 && d > maxValueMod10) {
- return 0, p.getErr(ErrCaptureGroupOutOfRange)
- }
-
- i *= 10
- i += d
- }
-
- return int(i), nil
- }
-
- // Returns true for options allowed only at the top level
- func isOnlyTopOption(option RegexOptions) bool {
- return option == RightToLeft || option == ECMAScript || option == RE2
- }
-
- // Scans cimsx-cimsx option string, stops at the first unrecognized char.
- func (p *parser) scanOptions() {
-
- for off := false; p.charsRight() > 0; p.moveRight(1) {
- ch := p.rightChar(0)
-
- if ch == '-' {
- off = true
- } else if ch == '+' {
- off = false
- } else {
- option := optionFromCode(ch)
- if option == 0 || isOnlyTopOption(option) {
- return
- }
-
- if off {
- p.options &= ^option
- } else {
- p.options |= option
- }
- }
- }
- }
-
- // Scans \ code for escape codes that map to single unicode chars.
- func (p *parser) scanCharEscape() (r rune, err error) {
-
- ch := p.moveRightGetChar()
-
- if ch >= '0' && ch <= '7' {
- p.moveLeft()
- return p.scanOctal(), nil
- }
-
- pos := p.textpos()
-
- switch ch {
- case 'x':
- // support for \x{HEX} syntax from Perl and PCRE
- if p.charsRight() > 0 && p.rightChar(0) == '{' {
- if p.useOptionE() {
- return ch, nil
- }
- p.moveRight(1)
- return p.scanHexUntilBrace()
- } else {
- r, err = p.scanHex(2)
- }
- case 'u':
- r, err = p.scanHex(4)
- case 'a':
- return '\u0007', nil
- case 'b':
- return '\b', nil
- case 'e':
- return '\u001B', nil
- case 'f':
- return '\f', nil
- case 'n':
- return '\n', nil
- case 'r':
- return '\r', nil
- case 't':
- return '\t', nil
- case 'v':
- return '\u000B', nil
- case 'c':
- r, err = p.scanControl()
- default:
- if !p.useOptionE() && IsWordChar(ch) {
- return 0, p.getErr(ErrUnrecognizedEscape, string(ch))
- }
- return ch, nil
- }
- if err != nil && p.useOptionE() {
- p.textto(pos)
- return ch, nil
- }
- return
- }
-
- // Grabs and converts an ascii control character
- func (p *parser) scanControl() (rune, error) {
- if p.charsRight() <= 0 {
- return 0, p.getErr(ErrMissingControl)
- }
-
- ch := p.moveRightGetChar()
-
- // \ca interpreted as \cA
-
- if ch >= 'a' && ch <= 'z' {
- ch = (ch - ('a' - 'A'))
- }
- ch = (ch - '@')
- if ch >= 0 && ch < ' ' {
- return ch, nil
- }
-
- return 0, p.getErr(ErrUnrecognizedControl)
-
- }
-
- // Scan hex digits until we hit a closing brace.
- // Non-hex digits, hex value too large for UTF-8, or running out of chars are errors
- func (p *parser) scanHexUntilBrace() (rune, error) {
- // PCRE spec reads like unlimited hex digits are allowed, but unicode has a limit
- // so we can enforce that
- i := 0
- hasContent := false
-
- for p.charsRight() > 0 {
- ch := p.moveRightGetChar()
- if ch == '}' {
- // hit our close brace, we're done here
- // prevent \x{}
- if !hasContent {
- return 0, p.getErr(ErrTooFewHex)
- }
- return rune(i), nil
- }
- hasContent = true
- // no brace needs to be hex digit
- d := hexDigit(ch)
- if d < 0 {
- return 0, p.getErr(ErrMissingBrace)
- }
-
- i *= 0x10
- i += d
-
- if i > unicode.MaxRune {
- return 0, p.getErr(ErrInvalidHex)
- }
- }
-
- // we only make it here if we run out of digits without finding the brace
- return 0, p.getErr(ErrMissingBrace)
- }
-
- // Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF)
- func (p *parser) scanHex(c int) (rune, error) {
-
- i := 0
-
- if p.charsRight() >= c {
- for c > 0 {
- d := hexDigit(p.moveRightGetChar())
- if d < 0 {
- break
- }
- i *= 0x10
- i += d
- c--
- }
- }
-
- if c > 0 {
- return 0, p.getErr(ErrTooFewHex)
- }
-
- return rune(i), nil
- }
-
// hexDigit returns the value (0 through 0xF) of the hex digit ch, accepting
// both upper- and lower-case letters, or -1 when ch is not a hex digit.
func hexDigit(ch rune) int {
	switch {
	case '0' <= ch && ch <= '9':
		return int(ch - '0')
	case 'a' <= ch && ch <= 'f':
		return int(ch-'a') + 0xa
	case 'A' <= ch && ch <= 'F':
		return int(ch-'A') + 0xa
	}
	return -1
}
-
- // Scans up to three octal digits (stops before exceeding 0377).
- func (p *parser) scanOctal() rune {
- // Consume octal chars only up to 3 digits and value 0377
-
- c := 3
-
- if c > p.charsRight() {
- c = p.charsRight()
- }
-
- //we know the first char is good because the caller had to check
- i := 0
- d := int(p.rightChar(0) - '0')
- for c > 0 && d <= 7 && d >= 0 {
- if i >= 0x20 && p.useOptionE() {
- break
- }
- i *= 8
- i += d
- c--
-
- p.moveRight(1)
- if !p.rightMost() {
- d = int(p.rightChar(0) - '0')
- }
- }
-
- // Octal codes only go up to 255. Any larger and the behavior that Perl follows
- // is simply to truncate the high bits.
- i &= 0xFF
-
- return rune(i)
- }
-
- // Returns the current parsing position.
- func (p *parser) textpos() int {
- return p.currentPos
- }
-
- // Zaps to a specific parsing position.
- func (p *parser) textto(pos int) {
- p.currentPos = pos
- }
-
- // Returns the char at the right of the current parsing position and advances to the right.
- func (p *parser) moveRightGetChar() rune {
- ch := p.pattern[p.currentPos]
- p.currentPos++
- return ch
- }
-
- // Moves the current position to the right.
- func (p *parser) moveRight(i int) {
- // default would be 1
- p.currentPos += i
- }
-
- // Moves the current parsing position one to the left.
- func (p *parser) moveLeft() {
- p.currentPos--
- }
-
- // Returns the char left of the current parsing position.
- func (p *parser) charAt(i int) rune {
- return p.pattern[i]
- }
-
- // Returns the char i chars right of the current parsing position.
- func (p *parser) rightChar(i int) rune {
- // default would be 0
- return p.pattern[p.currentPos+i]
- }
-
- // Number of characters to the right of the current parsing position.
- func (p *parser) charsRight() int {
- return len(p.pattern) - p.currentPos
- }
-
- func (p *parser) rightMost() bool {
- return p.currentPos == len(p.pattern)
- }
-
- // Looks up the slot number for a given name
- func (p *parser) captureSlotFromName(capname string) int {
- return p.capnames[capname]
- }
-
- // True if the capture slot was noted
- func (p *parser) isCaptureSlot(i int) bool {
- if p.caps != nil {
- _, ok := p.caps[i]
- return ok
- }
-
- return (i >= 0 && i < p.capsize)
- }
-
- // Looks up the slot number for a given name
- func (p *parser) isCaptureName(capname string) bool {
- if p.capnames == nil {
- return false
- }
-
- _, ok := p.capnames[capname]
- return ok
- }
-
- // option shortcuts
-
- // True if N option disabling '(' autocapture is on.
- func (p *parser) useOptionN() bool {
- return (p.options & ExplicitCapture) != 0
- }
-
- // True if I option enabling case-insensitivity is on.
- func (p *parser) useOptionI() bool {
- return (p.options & IgnoreCase) != 0
- }
-
- // True if M option altering meaning of $ and ^ is on.
- func (p *parser) useOptionM() bool {
- return (p.options & Multiline) != 0
- }
-
- // True if S option altering meaning of . is on.
- func (p *parser) useOptionS() bool {
- return (p.options & Singleline) != 0
- }
-
- // True if X option enabling whitespace/comment mode is on.
- func (p *parser) useOptionX() bool {
- return (p.options & IgnorePatternWhitespace) != 0
- }
-
- // True if E option enabling ECMAScript behavior on.
- func (p *parser) useOptionE() bool {
- return (p.options & ECMAScript) != 0
- }
-
- // true to use RE2 compatibility parsing behavior.
- func (p *parser) useRE2() bool {
- return (p.options & RE2) != 0
- }
-
- // True if options stack is empty.
- func (p *parser) emptyOptionsStack() bool {
- return len(p.optionsStack) == 0
- }
-
- // Finish the current quantifiable (when a quantifier is not found or is not possible)
- func (p *parser) addConcatenate() {
- // The first (| inside a Testgroup group goes directly to the group
- p.concatenation.addChild(p.unit)
- p.unit = nil
- }
-
- // Finish the current quantifiable (when a quantifier is found)
- func (p *parser) addConcatenate3(lazy bool, min, max int) {
- p.concatenation.addChild(p.unit.makeQuantifier(lazy, min, max))
- p.unit = nil
- }
-
- // Sets the current unit to a single char node
- func (p *parser) addUnitOne(ch rune) {
- if p.useOptionI() {
- ch = unicode.ToLower(ch)
- }
-
- p.unit = newRegexNodeCh(ntOne, p.options, ch)
- }
-
- // Sets the current unit to a single inverse-char node
- func (p *parser) addUnitNotone(ch rune) {
- if p.useOptionI() {
- ch = unicode.ToLower(ch)
- }
-
- p.unit = newRegexNodeCh(ntNotone, p.options, ch)
- }
-
- // Sets the current unit to a single set node
- func (p *parser) addUnitSet(set *CharSet) {
- p.unit = newRegexNodeSet(ntSet, p.options, set)
- }
-
- // Sets the current unit to a subtree
- func (p *parser) addUnitNode(node *regexNode) {
- p.unit = node
- }
-
- // Sets the current unit to an assertion of the specified type
- func (p *parser) addUnitType(t nodeType) {
- p.unit = newRegexNode(t, p.options)
- }
-
- // Finish the current group (in response to a ')' or end)
- func (p *parser) addGroup() error {
- if p.group.t == ntTestgroup || p.group.t == ntTestref {
- p.group.addChild(p.concatenation.reverseLeft())
- if (p.group.t == ntTestref && len(p.group.children) > 2) || len(p.group.children) > 3 {
- return p.getErr(ErrTooManyAlternates)
- }
- } else {
- p.alternation.addChild(p.concatenation.reverseLeft())
- p.group.addChild(p.alternation)
- }
-
- p.unit = p.group
- return nil
- }
-
- // Pops the option stack, but keeps the current options unchanged.
- func (p *parser) popKeepOptions() {
- lastIdx := len(p.optionsStack) - 1
- p.optionsStack = p.optionsStack[:lastIdx]
- }
-
- // Recalls options from the stack.
- func (p *parser) popOptions() {
- lastIdx := len(p.optionsStack) - 1
- // get the last item on the stack and then remove it by reslicing
- p.options = p.optionsStack[lastIdx]
- p.optionsStack = p.optionsStack[:lastIdx]
- }
-
- // Saves options on a stack.
- func (p *parser) pushOptions() {
- p.optionsStack = append(p.optionsStack, p.options)
- }
-
- // Add a string to the last concatenate.
- func (p *parser) addToConcatenate(pos, cch int, isReplacement bool) {
- var node *regexNode
-
- if cch == 0 {
- return
- }
-
- if cch > 1 {
- str := p.pattern[pos : pos+cch]
-
- if p.useOptionI() && !isReplacement {
- // We do the ToLower character by character for consistency. With surrogate chars, doing
- // a ToLower on the entire string could actually change the surrogate pair. This is more correct
- // linguistically, but since Regex doesn't support surrogates, it's more important to be
- // consistent.
- for i := 0; i < len(str); i++ {
- str[i] = unicode.ToLower(str[i])
- }
- }
-
- node = newRegexNodeStr(ntMulti, p.options, str)
- } else {
- ch := p.charAt(pos)
-
- if p.useOptionI() && !isReplacement {
- ch = unicode.ToLower(ch)
- }
-
- node = newRegexNodeCh(ntOne, p.options, ch)
- }
-
- p.concatenation.addChild(node)
- }
-
- // Push the parser state (in response to an open paren)
- func (p *parser) pushGroup() {
- p.group.next = p.stack
- p.alternation.next = p.group
- p.concatenation.next = p.alternation
- p.stack = p.concatenation
- }
-
- // Remember the pushed state (in response to a ')')
- func (p *parser) popGroup() error {
- p.concatenation = p.stack
- p.alternation = p.concatenation.next
- p.group = p.alternation.next
- p.stack = p.group.next
-
- // The first () inside a Testgroup group goes directly to the group
- if p.group.t == ntTestgroup && len(p.group.children) == 0 {
- if p.unit == nil {
- return p.getErr(ErrConditionalExpression)
- }
-
- p.group.addChild(p.unit)
- p.unit = nil
- }
- return nil
- }
-
- // True if the group stack is empty.
- func (p *parser) emptyStack() bool {
- return p.stack == nil
- }
-
- // Start a new round for the parser state (in response to an open paren or string start)
- func (p *parser) startGroup(openGroup *regexNode) {
- p.group = openGroup
- p.alternation = newRegexNode(ntAlternate, p.options)
- p.concatenation = newRegexNode(ntConcatenate, p.options)
- }
-
- // Finish the current concatenation (in response to a |)
- func (p *parser) addAlternate() {
- // The | parts inside a Testgroup group go directly to the group
-
- if p.group.t == ntTestgroup || p.group.t == ntTestref {
- p.group.addChild(p.concatenation.reverseLeft())
- } else {
- p.alternation.addChild(p.concatenation.reverseLeft())
- }
-
- p.concatenation = newRegexNode(ntConcatenate, p.options)
- }
-
// Categories used to classify ASCII characters, listed from lowest to
// highest rank. The predicates over the category table compare against
// these with == and >=, so the numeric order matters. Only Q carries the
// explicit byte type; the others are untyped constants.
const (
	E      = 1 // should be escaped
	X      = 2 // whitespace
	Z      = 3 // ScanBlank stopper
	S      = 4 // ordinary stopper
	Q byte = 5 // quantifier
)
-
- var _category = []byte{
- //01 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F
- 0, 0, 0, 0, 0, 0, 0, 0, 0, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- // ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
- X, 0, 0, Z, S, 0, 0, 0, S, S, Q, Q, 0, 0, S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q,
- //@A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, S, 0,
- //'a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, S, 0, 0, 0,
- }
-
- func isSpace(ch rune) bool {
- return (ch <= ' ' && _category[ch] == X)
- }
-
- // Returns true for those characters that terminate a string of ordinary chars.
- func isSpecial(ch rune) bool {
- return (ch <= '|' && _category[ch] >= S)
- }
-
- // Returns true for those characters that terminate a string of ordinary chars.
- func isStopperX(ch rune) bool {
- return (ch <= '|' && _category[ch] >= X)
- }
-
- // Returns true for those characters that begin a quantifier.
- func isQuantifier(ch rune) bool {
- return (ch <= '{' && _category[ch] >= Q)
- }
-
- func (p *parser) isTrueQuantifier() bool {
- nChars := p.charsRight()
- if nChars == 0 {
- return false
- }
-
- startpos := p.textpos()
- ch := p.charAt(startpos)
- if ch != '{' {
- return ch <= '{' && _category[ch] >= Q
- }
-
- //UGLY: this is ugly -- the original code was ugly too
- pos := startpos
- for {
- nChars--
- if nChars <= 0 {
- break
- }
- pos++
- ch = p.charAt(pos)
- if ch < '0' || ch > '9' {
- break
- }
- }
-
- if nChars == 0 || pos-startpos == 1 {
- return false
- }
- if ch == '}' {
- return true
- }
- if ch != ',' {
- return false
- }
- for {
- nChars--
- if nChars <= 0 {
- break
- }
- pos++
- ch = p.charAt(pos)
- if ch < '0' || ch > '9' {
- break
- }
- }
-
- return nChars > 0 && ch == '}'
- }
|