|
- package syntax
-
- import (
- "bytes"
- "fmt"
- "strconv"
- "unicode"
- "unicode/utf8"
- )
-
- type Prefix struct {
- PrefixStr []rune
- PrefixSet CharSet
- CaseInsensitive bool
- }
-
- // It takes a RegexTree and computes the set of chars that can start it.
- func getFirstCharsPrefix(tree *RegexTree) *Prefix {
- s := regexFcd{
- fcStack: make([]regexFc, 32),
- intStack: make([]int, 32),
- }
- fc := s.regexFCFromRegexTree(tree)
-
- if fc == nil || fc.nullable || fc.cc.IsEmpty() {
- return nil
- }
- fcSet := fc.getFirstChars()
- return &Prefix{PrefixSet: fcSet, CaseInsensitive: fc.caseInsensitive}
- }
-
- type regexFcd struct {
- intStack []int
- intDepth int
- fcStack []regexFc
- fcDepth int
- skipAllChildren bool // don't process any more children at the current level
- skipchild bool // don't process the current child.
- failed bool
- }
-
- /*
- * The main FC computation. It does a shortcutted depth-first walk
- * through the tree and calls CalculateFC to emits code before
- * and after each child of an interior node, and at each leaf.
- */
- func (s *regexFcd) regexFCFromRegexTree(tree *RegexTree) *regexFc {
- curNode := tree.root
- curChild := 0
-
- for {
- if len(curNode.children) == 0 {
- // This is a leaf node
- s.calculateFC(curNode.t, curNode, 0)
- } else if curChild < len(curNode.children) && !s.skipAllChildren {
- // This is an interior node, and we have more children to analyze
- s.calculateFC(curNode.t|beforeChild, curNode, curChild)
-
- if !s.skipchild {
- curNode = curNode.children[curChild]
- // this stack is how we get a depth first walk of the tree.
- s.pushInt(curChild)
- curChild = 0
- } else {
- curChild++
- s.skipchild = false
- }
- continue
- }
-
- // This is an interior node where we've finished analyzing all the children, or
- // the end of a leaf node.
- s.skipAllChildren = false
-
- if s.intIsEmpty() {
- break
- }
-
- curChild = s.popInt()
- curNode = curNode.next
-
- s.calculateFC(curNode.t|afterChild, curNode, curChild)
- if s.failed {
- return nil
- }
-
- curChild++
- }
-
- if s.fcIsEmpty() {
- return nil
- }
-
- return s.popFC()
- }
-
- // To avoid recursion, we use a simple integer stack.
- // This is the push.
- func (s *regexFcd) pushInt(I int) {
- if s.intDepth >= len(s.intStack) {
- expanded := make([]int, s.intDepth*2)
- copy(expanded, s.intStack)
- s.intStack = expanded
- }
-
- s.intStack[s.intDepth] = I
- s.intDepth++
- }
-
- // True if the stack is empty.
- func (s *regexFcd) intIsEmpty() bool {
- return s.intDepth == 0
- }
-
- // This is the pop.
- func (s *regexFcd) popInt() int {
- s.intDepth--
- return s.intStack[s.intDepth]
- }
-
- // We also use a stack of RegexFC objects.
- // This is the push.
- func (s *regexFcd) pushFC(fc regexFc) {
- if s.fcDepth >= len(s.fcStack) {
- expanded := make([]regexFc, s.fcDepth*2)
- copy(expanded, s.fcStack)
- s.fcStack = expanded
- }
-
- s.fcStack[s.fcDepth] = fc
- s.fcDepth++
- }
-
- // True if the stack is empty.
- func (s *regexFcd) fcIsEmpty() bool {
- return s.fcDepth == 0
- }
-
- // This is the pop.
- func (s *regexFcd) popFC() *regexFc {
- s.fcDepth--
- return &s.fcStack[s.fcDepth]
- }
-
- // This is the top.
- func (s *regexFcd) topFC() *regexFc {
- return &s.fcStack[s.fcDepth-1]
- }
-
- // Called in Beforechild to prevent further processing of the current child
- func (s *regexFcd) skipChild() {
- s.skipchild = true
- }
-
- // FC computation and shortcut cases for each node type
- func (s *regexFcd) calculateFC(nt nodeType, node *regexNode, CurIndex int) {
- //fmt.Printf("NodeType: %v, CurIndex: %v, Desc: %v\n", nt, CurIndex, node.description())
- ci := false
- rtl := false
-
- if nt <= ntRef {
- if (node.options & IgnoreCase) != 0 {
- ci = true
- }
- if (node.options & RightToLeft) != 0 {
- rtl = true
- }
- }
-
- switch nt {
- case ntConcatenate | beforeChild, ntAlternate | beforeChild, ntTestref | beforeChild, ntLoop | beforeChild, ntLazyloop | beforeChild:
- break
-
- case ntTestgroup | beforeChild:
- if CurIndex == 0 {
- s.skipChild()
- }
- break
-
- case ntEmpty:
- s.pushFC(regexFc{nullable: true})
- break
-
- case ntConcatenate | afterChild:
- if CurIndex != 0 {
- child := s.popFC()
- cumul := s.topFC()
-
- s.failed = !cumul.addFC(*child, true)
- }
-
- fc := s.topFC()
- if !fc.nullable {
- s.skipAllChildren = true
- }
- break
-
- case ntTestgroup | afterChild:
- if CurIndex > 1 {
- child := s.popFC()
- cumul := s.topFC()
-
- s.failed = !cumul.addFC(*child, false)
- }
- break
-
- case ntAlternate | afterChild, ntTestref | afterChild:
- if CurIndex != 0 {
- child := s.popFC()
- cumul := s.topFC()
-
- s.failed = !cumul.addFC(*child, false)
- }
- break
-
- case ntLoop | afterChild, ntLazyloop | afterChild:
- if node.m == 0 {
- fc := s.topFC()
- fc.nullable = true
- }
- break
-
- case ntGroup | beforeChild, ntGroup | afterChild, ntCapture | beforeChild, ntCapture | afterChild, ntGreedy | beforeChild, ntGreedy | afterChild:
- break
-
- case ntRequire | beforeChild, ntPrevent | beforeChild:
- s.skipChild()
- s.pushFC(regexFc{nullable: true})
- break
-
- case ntRequire | afterChild, ntPrevent | afterChild:
- break
-
- case ntOne, ntNotone:
- s.pushFC(newRegexFc(node.ch, nt == ntNotone, false, ci))
- break
-
- case ntOneloop, ntOnelazy:
- s.pushFC(newRegexFc(node.ch, false, node.m == 0, ci))
- break
-
- case ntNotoneloop, ntNotonelazy:
- s.pushFC(newRegexFc(node.ch, true, node.m == 0, ci))
- break
-
- case ntMulti:
- if len(node.str) == 0 {
- s.pushFC(regexFc{nullable: true})
- } else if !rtl {
- s.pushFC(newRegexFc(node.str[0], false, false, ci))
- } else {
- s.pushFC(newRegexFc(node.str[len(node.str)-1], false, false, ci))
- }
- break
-
- case ntSet:
- s.pushFC(regexFc{cc: node.set.Copy(), nullable: false, caseInsensitive: ci})
- break
-
- case ntSetloop, ntSetlazy:
- s.pushFC(regexFc{cc: node.set.Copy(), nullable: node.m == 0, caseInsensitive: ci})
- break
-
- case ntRef:
- s.pushFC(regexFc{cc: *AnyClass(), nullable: true, caseInsensitive: false})
- break
-
- case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
- s.pushFC(regexFc{nullable: true})
- break
-
- default:
- panic(fmt.Sprintf("unexpected op code: %v", nt))
- }
- }
-
- type regexFc struct {
- cc CharSet
- nullable bool
- caseInsensitive bool
- }
-
- func newRegexFc(ch rune, not, nullable, caseInsensitive bool) regexFc {
- r := regexFc{
- caseInsensitive: caseInsensitive,
- nullable: nullable,
- }
- if not {
- if ch > 0 {
- r.cc.addRange('\x00', ch-1)
- }
- if ch < 0xFFFF {
- r.cc.addRange(ch+1, utf8.MaxRune)
- }
- } else {
- r.cc.addRange(ch, ch)
- }
- return r
- }
-
- func (r *regexFc) getFirstChars() CharSet {
- if r.caseInsensitive {
- r.cc.addLowercase()
- }
-
- return r.cc
- }
-
- func (r *regexFc) addFC(fc regexFc, concatenate bool) bool {
- if !r.cc.IsMergeable() || !fc.cc.IsMergeable() {
- return false
- }
-
- if concatenate {
- if !r.nullable {
- return true
- }
-
- if !fc.nullable {
- r.nullable = false
- }
- } else {
- if fc.nullable {
- r.nullable = true
- }
- }
-
- r.caseInsensitive = r.caseInsensitive || fc.caseInsensitive
- r.cc.addSet(fc.cc)
-
- return true
- }
-
- // This is a related computation: it takes a RegexTree and computes the
- // leading substring if it sees one. It's quite trivial and gives up easily.
- func getPrefix(tree *RegexTree) *Prefix {
- var concatNode *regexNode
- nextChild := 0
-
- curNode := tree.root
-
- for {
- switch curNode.t {
- case ntConcatenate:
- if len(curNode.children) > 0 {
- concatNode = curNode
- nextChild = 0
- }
-
- case ntGreedy, ntCapture:
- curNode = curNode.children[0]
- concatNode = nil
- continue
-
- case ntOneloop, ntOnelazy:
- if curNode.m > 0 {
- return &Prefix{
- PrefixStr: repeat(curNode.ch, curNode.m),
- CaseInsensitive: (curNode.options & IgnoreCase) != 0,
- }
- }
- return nil
-
- case ntOne:
- return &Prefix{
- PrefixStr: []rune{curNode.ch},
- CaseInsensitive: (curNode.options & IgnoreCase) != 0,
- }
-
- case ntMulti:
- return &Prefix{
- PrefixStr: curNode.str,
- CaseInsensitive: (curNode.options & IgnoreCase) != 0,
- }
-
- case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning, ntStart,
- ntEndZ, ntEnd, ntEmpty, ntRequire, ntPrevent:
-
- default:
- return nil
- }
-
- if concatNode == nil || nextChild >= len(concatNode.children) {
- return nil
- }
-
- curNode = concatNode.children[nextChild]
- nextChild++
- }
- }
-
- // repeat the rune r, c times... up to the max of MaxPrefixSize
- func repeat(r rune, c int) []rune {
- if c > MaxPrefixSize {
- c = MaxPrefixSize
- }
-
- ret := make([]rune, c)
-
- // binary growth using copy for speed
- ret[0] = r
- bp := 1
- for bp < len(ret) {
- copy(ret[bp:], ret[:bp])
- bp *= 2
- }
-
- return ret
- }
-
- // BmPrefix precomputes the Boyer-Moore
- // tables for fast string scanning. These tables allow
- // you to scan for the first occurrence of a string within
- // a large body of text without examining every character.
- // The performance of the heuristic depends on the actual
- // string and the text being searched, but usually, the longer
- // the string that is being searched for, the fewer characters
- // need to be examined.
- type BmPrefix struct {
- positive []int
- negativeASCII []int
- negativeUnicode [][]int
- pattern []rune
- lowASCII rune
- highASCII rune
- rightToLeft bool
- caseInsensitive bool
- }
-
- func newBmPrefix(pattern []rune, caseInsensitive, rightToLeft bool) *BmPrefix {
-
- b := &BmPrefix{
- rightToLeft: rightToLeft,
- caseInsensitive: caseInsensitive,
- pattern: pattern,
- }
-
- if caseInsensitive {
- for i := 0; i < len(b.pattern); i++ {
- // We do the ToLower character by character for consistency. With surrogate chars, doing
- // a ToLower on the entire string could actually change the surrogate pair. This is more correct
- // linguistically, but since Regex doesn't support surrogates, it's more important to be
- // consistent.
-
- b.pattern[i] = unicode.ToLower(b.pattern[i])
- }
- }
-
- var beforefirst, last, bump int
- var scan, match int
-
- if !rightToLeft {
- beforefirst = -1
- last = len(b.pattern) - 1
- bump = 1
- } else {
- beforefirst = len(b.pattern)
- last = 0
- bump = -1
- }
-
- // PART I - the good-suffix shift table
- //
- // compute the positive requirement:
- // if char "i" is the first one from the right that doesn't match,
- // then we know the matcher can advance by _positive[i].
- //
- // This algorithm is a simplified variant of the standard
- // Boyer-Moore good suffix calculation.
-
- b.positive = make([]int, len(b.pattern))
-
- examine := last
- ch := b.pattern[examine]
- b.positive[examine] = bump
- examine -= bump
-
- Outerloop:
- for {
- // find an internal char (examine) that matches the tail
-
- for {
- if examine == beforefirst {
- break Outerloop
- }
- if b.pattern[examine] == ch {
- break
- }
- examine -= bump
- }
-
- match = last
- scan = examine
-
- // find the length of the match
- for {
- if scan == beforefirst || b.pattern[match] != b.pattern[scan] {
- // at the end of the match, note the difference in _positive
- // this is not the length of the match, but the distance from the internal match
- // to the tail suffix.
- if b.positive[match] == 0 {
- b.positive[match] = match - scan
- }
-
- // System.Diagnostics.Debug.WriteLine("Set positive[" + match + "] to " + (match - scan));
-
- break
- }
-
- scan -= bump
- match -= bump
- }
-
- examine -= bump
- }
-
- match = last - bump
-
- // scan for the chars for which there are no shifts that yield a different candidate
-
- // The inside of the if statement used to say
- // "_positive[match] = last - beforefirst;"
- // This is slightly less aggressive in how much we skip, but at worst it
- // should mean a little more work rather than skipping a potential match.
- for match != beforefirst {
- if b.positive[match] == 0 {
- b.positive[match] = bump
- }
-
- match -= bump
- }
-
- // PART II - the bad-character shift table
- //
- // compute the negative requirement:
- // if char "ch" is the reject character when testing position "i",
- // we can slide up by _negative[ch];
- // (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch))
- //
- // the lookup table is divided into ASCII and Unicode portions;
- // only those parts of the Unicode 16-bit code set that actually
- // appear in the string are in the table. (Maximum size with
- // Unicode is 65K; ASCII only case is 512 bytes.)
-
- b.negativeASCII = make([]int, 128)
-
- for i := 0; i < len(b.negativeASCII); i++ {
- b.negativeASCII[i] = last - beforefirst
- }
-
- b.lowASCII = 127
- b.highASCII = 0
-
- for examine = last; examine != beforefirst; examine -= bump {
- ch = b.pattern[examine]
-
- switch {
- case ch < 128:
- if b.lowASCII > ch {
- b.lowASCII = ch
- }
-
- if b.highASCII < ch {
- b.highASCII = ch
- }
-
- if b.negativeASCII[ch] == last-beforefirst {
- b.negativeASCII[ch] = last - examine
- }
- case ch <= 0xffff:
- i, j := ch>>8, ch&0xFF
-
- if b.negativeUnicode == nil {
- b.negativeUnicode = make([][]int, 256)
- }
-
- if b.negativeUnicode[i] == nil {
- newarray := make([]int, 256)
-
- for k := 0; k < len(newarray); k++ {
- newarray[k] = last - beforefirst
- }
-
- if i == 0 {
- copy(newarray, b.negativeASCII)
- //TODO: this line needed?
- b.negativeASCII = newarray
- }
-
- b.negativeUnicode[i] = newarray
- }
-
- if b.negativeUnicode[i][j] == last-beforefirst {
- b.negativeUnicode[i][j] = last - examine
- }
- default:
- // we can't do the filter because this algo doesn't support
- // unicode chars >0xffff
- return nil
- }
- }
-
- return b
- }
-
- func (b *BmPrefix) String() string {
- return string(b.pattern)
- }
-
- // Dump returns the contents of the filter as a human readable string
- func (b *BmPrefix) Dump(indent string) string {
- buf := &bytes.Buffer{}
-
- fmt.Fprintf(buf, "%sBM Pattern: %s\n%sPositive: ", indent, string(b.pattern), indent)
- for i := 0; i < len(b.positive); i++ {
- buf.WriteString(strconv.Itoa(b.positive[i]))
- buf.WriteRune(' ')
- }
- buf.WriteRune('\n')
-
- if b.negativeASCII != nil {
- buf.WriteString(indent)
- buf.WriteString("Negative table\n")
- for i := 0; i < len(b.negativeASCII); i++ {
- if b.negativeASCII[i] != len(b.pattern) {
- fmt.Fprintf(buf, "%s %s %s\n", indent, Escape(string(rune(i))), strconv.Itoa(b.negativeASCII[i]))
- }
- }
- }
-
- return buf.String()
- }
-
- // Scan uses the Boyer-Moore algorithm to find the first occurrence
- // of the specified string within text, beginning at index, and
- // constrained within beglimit and endlimit.
- //
- // The direction and case-sensitivity of the match is determined
- // by the arguments to the RegexBoyerMoore constructor.
- func (b *BmPrefix) Scan(text []rune, index, beglimit, endlimit int) int {
- var (
- defadv, test, test2 int
- match, startmatch, endmatch int
- bump, advance int
- chTest rune
- unicodeLookup []int
- )
-
- if !b.rightToLeft {
- defadv = len(b.pattern)
- startmatch = len(b.pattern) - 1
- endmatch = 0
- test = index + defadv - 1
- bump = 1
- } else {
- defadv = -len(b.pattern)
- startmatch = 0
- endmatch = -defadv - 1
- test = index + defadv
- bump = -1
- }
-
- chMatch := b.pattern[startmatch]
-
- for {
- if test >= endlimit || test < beglimit {
- return -1
- }
-
- chTest = text[test]
-
- if b.caseInsensitive {
- chTest = unicode.ToLower(chTest)
- }
-
- if chTest != chMatch {
- if chTest < 128 {
- advance = b.negativeASCII[chTest]
- } else if chTest < 0xffff && len(b.negativeUnicode) > 0 {
- unicodeLookup = b.negativeUnicode[chTest>>8]
- if len(unicodeLookup) > 0 {
- advance = unicodeLookup[chTest&0xFF]
- } else {
- advance = defadv
- }
- } else {
- advance = defadv
- }
-
- test += advance
- } else { // if (chTest == chMatch)
- test2 = test
- match = startmatch
-
- for {
- if match == endmatch {
- if b.rightToLeft {
- return test2 + 1
- } else {
- return test2
- }
- }
-
- match -= bump
- test2 -= bump
-
- chTest = text[test2]
-
- if b.caseInsensitive {
- chTest = unicode.ToLower(chTest)
- }
-
- if chTest != b.pattern[match] {
- advance = b.positive[match]
- if (chTest & 0xFF80) == 0 {
- test2 = (match - startmatch) + b.negativeASCII[chTest]
- } else if chTest < 0xffff && len(b.negativeUnicode) > 0 {
- unicodeLookup = b.negativeUnicode[chTest>>8]
- if len(unicodeLookup) > 0 {
- test2 = (match - startmatch) + unicodeLookup[chTest&0xFF]
- } else {
- test += advance
- break
- }
- } else {
- test += advance
- break
- }
-
- if b.rightToLeft {
- if test2 < advance {
- advance = test2
- }
- } else if test2 > advance {
- advance = test2
- }
-
- test += advance
- break
- }
- }
- }
- }
- }
-
- // When a regex is anchored, we can do a quick IsMatch test instead of a Scan
- func (b *BmPrefix) IsMatch(text []rune, index, beglimit, endlimit int) bool {
- if !b.rightToLeft {
- if index < beglimit || endlimit-index < len(b.pattern) {
- return false
- }
-
- return b.matchPattern(text, index)
- } else {
- if index > endlimit || index-beglimit < len(b.pattern) {
- return false
- }
-
- return b.matchPattern(text, index-len(b.pattern))
- }
- }
-
- func (b *BmPrefix) matchPattern(text []rune, index int) bool {
- if len(text)-index < len(b.pattern) {
- return false
- }
-
- if b.caseInsensitive {
- for i := 0; i < len(b.pattern); i++ {
- //Debug.Assert(textinfo.ToLower(_pattern[i]) == _pattern[i], "pattern should be converted to lower case in constructor!");
- if unicode.ToLower(text[index+i]) != b.pattern[i] {
- return false
- }
- }
- return true
- } else {
- for i := 0; i < len(b.pattern); i++ {
- if text[index+i] != b.pattern[i] {
- return false
- }
- }
- return true
- }
- }
-
- type AnchorLoc int16
-
- // where the regex can be pegged
- const (
- AnchorBeginning AnchorLoc = 0x0001
- AnchorBol = 0x0002
- AnchorStart = 0x0004
- AnchorEol = 0x0008
- AnchorEndZ = 0x0010
- AnchorEnd = 0x0020
- AnchorBoundary = 0x0040
- AnchorECMABoundary = 0x0080
- )
-
- func getAnchors(tree *RegexTree) AnchorLoc {
-
- var concatNode *regexNode
- nextChild, result := 0, AnchorLoc(0)
-
- curNode := tree.root
-
- for {
- switch curNode.t {
- case ntConcatenate:
- if len(curNode.children) > 0 {
- concatNode = curNode
- nextChild = 0
- }
-
- case ntGreedy, ntCapture:
- curNode = curNode.children[0]
- concatNode = nil
- continue
-
- case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning,
- ntStart, ntEndZ, ntEnd:
- return result | anchorFromType(curNode.t)
-
- case ntEmpty, ntRequire, ntPrevent:
-
- default:
- return result
- }
-
- if concatNode == nil || nextChild >= len(concatNode.children) {
- return result
- }
-
- curNode = concatNode.children[nextChild]
- nextChild++
- }
- }
-
- func anchorFromType(t nodeType) AnchorLoc {
- switch t {
- case ntBol:
- return AnchorBol
- case ntEol:
- return AnchorEol
- case ntBoundary:
- return AnchorBoundary
- case ntECMABoundary:
- return AnchorECMABoundary
- case ntBeginning:
- return AnchorBeginning
- case ntStart:
- return AnchorStart
- case ntEndZ:
- return AnchorEndZ
- case ntEnd:
- return AnchorEnd
- default:
- return 0
- }
- }
-
- // anchorDescription returns a human-readable description of the anchors
- func (anchors AnchorLoc) String() string {
- buf := &bytes.Buffer{}
-
- if 0 != (anchors & AnchorBeginning) {
- buf.WriteString(", Beginning")
- }
- if 0 != (anchors & AnchorStart) {
- buf.WriteString(", Start")
- }
- if 0 != (anchors & AnchorBol) {
- buf.WriteString(", Bol")
- }
- if 0 != (anchors & AnchorBoundary) {
- buf.WriteString(", Boundary")
- }
- if 0 != (anchors & AnchorECMABoundary) {
- buf.WriteString(", ECMABoundary")
- }
- if 0 != (anchors & AnchorEol) {
- buf.WriteString(", Eol")
- }
- if 0 != (anchors & AnchorEnd) {
- buf.WriteString(", End")
- }
- if 0 != (anchors & AnchorEndZ) {
- buf.WriteString(", EndZ")
- }
-
- // trim off comma
- if buf.Len() >= 2 {
- return buf.String()[2:]
- }
- return "None"
- }
|