|
- package syntax
-
- import (
- "bytes"
- "fmt"
- "math"
- "strconv"
- )
-
- type RegexTree struct {
- root *regexNode
- caps map[int]int
- capnumlist []int
- captop int
- Capnames map[string]int
- Caplist []string
- options RegexOptions
- }
-
- // It is built into a parsed tree for a regular expression.
-
- // Implementation notes:
- //
- // Since the node tree is a temporary data structure only used
- // during compilation of the regexp to integer codes, it's
- // designed for clarity and convenience rather than
- // space efficiency.
- //
- // RegexNodes are built into a tree, linked by the n.children list.
- // Each node also has a n.parent and n.ichild member indicating
- // its parent and which child # it is in its parent's list.
- //
- // RegexNodes come in as many types as there are constructs in
- // a regular expression, for example, "concatenate", "alternate",
- // "one", "rept", "group". There are also node types for basic
- // peephole optimizations, e.g., "onerep", "notsetrep", etc.
- //
- // Because perl 5 allows "lookback" groups that scan backwards,
- // each node also gets a "direction". Normally the value of
- // boolean n.backward = false.
- //
- // During parsing, top-level nodes are also stacked onto a parse
- // stack (a stack of trees). For this purpose we have a n.next
- // pointer. [Note that to save a few bytes, we could overload the
- // n.parent pointer instead.]
- //
- // On the parse stack, each tree has a "role" - basically, the
- // nonterminal in the grammar that the parser has currently
- // assigned to the tree. That code is stored in n.role.
- //
- // Finally, some of the different kinds of nodes have data.
- // Two integers (for the looping constructs) are stored in
- // n.operands, an an object (either a string or a set)
- // is stored in n.data
- type regexNode struct {
- t nodeType
- children []*regexNode
- str []rune
- set *CharSet
- ch rune
- m int
- n int
- options RegexOptions
- next *regexNode
- }
-
- type nodeType int32
-
- const (
- // The following are leaves, and correspond to primitive operations
-
- ntOnerep nodeType = 0 // lef,back char,min,max a {n}
- ntNotonerep = 1 // lef,back char,min,max .{n}
- ntSetrep = 2 // lef,back set,min,max [\d]{n}
- ntOneloop = 3 // lef,back char,min,max a {,n}
- ntNotoneloop = 4 // lef,back char,min,max .{,n}
- ntSetloop = 5 // lef,back set,min,max [\d]{,n}
- ntOnelazy = 6 // lef,back char,min,max a {,n}?
- ntNotonelazy = 7 // lef,back char,min,max .{,n}?
- ntSetlazy = 8 // lef,back set,min,max [\d]{,n}?
- ntOne = 9 // lef char a
- ntNotone = 10 // lef char [^a]
- ntSet = 11 // lef set [a-z\s] \w \s \d
- ntMulti = 12 // lef string abcd
- ntRef = 13 // lef group \#
- ntBol = 14 // ^
- ntEol = 15 // $
- ntBoundary = 16 // \b
- ntNonboundary = 17 // \B
- ntBeginning = 18 // \A
- ntStart = 19 // \G
- ntEndZ = 20 // \Z
- ntEnd = 21 // \Z
-
- // Interior nodes do not correspond to primitive operations, but
- // control structures compositing other operations
-
- // Concat and alternate take n children, and can run forward or backwards
-
- ntNothing = 22 // []
- ntEmpty = 23 // ()
- ntAlternate = 24 // a|b
- ntConcatenate = 25 // ab
- ntLoop = 26 // m,x * + ? {,}
- ntLazyloop = 27 // m,x *? +? ?? {,}?
- ntCapture = 28 // n ()
- ntGroup = 29 // (?:)
- ntRequire = 30 // (?=) (?<=)
- ntPrevent = 31 // (?!) (?<!)
- ntGreedy = 32 // (?>) (?<)
- ntTestref = 33 // (?(n) | )
- ntTestgroup = 34 // (?(...) | )
-
- ntECMABoundary = 41 // \b
- ntNonECMABoundary = 42 // \B
- )
-
- func newRegexNode(t nodeType, opt RegexOptions) *regexNode {
- return ®exNode{
- t: t,
- options: opt,
- }
- }
-
- func newRegexNodeCh(t nodeType, opt RegexOptions, ch rune) *regexNode {
- return ®exNode{
- t: t,
- options: opt,
- ch: ch,
- }
- }
-
- func newRegexNodeStr(t nodeType, opt RegexOptions, str []rune) *regexNode {
- return ®exNode{
- t: t,
- options: opt,
- str: str,
- }
- }
-
- func newRegexNodeSet(t nodeType, opt RegexOptions, set *CharSet) *regexNode {
- return ®exNode{
- t: t,
- options: opt,
- set: set,
- }
- }
-
- func newRegexNodeM(t nodeType, opt RegexOptions, m int) *regexNode {
- return ®exNode{
- t: t,
- options: opt,
- m: m,
- }
- }
- func newRegexNodeMN(t nodeType, opt RegexOptions, m, n int) *regexNode {
- return ®exNode{
- t: t,
- options: opt,
- m: m,
- n: n,
- }
- }
-
- func (n *regexNode) writeStrToBuf(buf *bytes.Buffer) {
- for i := 0; i < len(n.str); i++ {
- buf.WriteRune(n.str[i])
- }
- }
-
- func (n *regexNode) addChild(child *regexNode) {
- reduced := child.reduce()
- n.children = append(n.children, reduced)
- reduced.next = n
- }
-
- func (n *regexNode) insertChildren(afterIndex int, nodes []*regexNode) {
- newChildren := make([]*regexNode, 0, len(n.children)+len(nodes))
- n.children = append(append(append(newChildren, n.children[:afterIndex]...), nodes...), n.children[afterIndex:]...)
- }
-
- // removes children including the start but not the end index
- func (n *regexNode) removeChildren(startIndex, endIndex int) {
- n.children = append(n.children[:startIndex], n.children[endIndex:]...)
- }
-
- // Pass type as OneLazy or OneLoop
- func (n *regexNode) makeRep(t nodeType, min, max int) {
- n.t += (t - ntOne)
- n.m = min
- n.n = max
- }
-
- func (n *regexNode) reduce() *regexNode {
- switch n.t {
- case ntAlternate:
- return n.reduceAlternation()
-
- case ntConcatenate:
- return n.reduceConcatenation()
-
- case ntLoop, ntLazyloop:
- return n.reduceRep()
-
- case ntGroup:
- return n.reduceGroup()
-
- case ntSet, ntSetloop:
- return n.reduceSet()
-
- default:
- return n
- }
- }
-
- // Basic optimization. Single-letter alternations can be replaced
- // by faster set specifications, and nested alternations with no
- // intervening operators can be flattened:
- //
- // a|b|c|def|g|h -> [a-c]|def|[gh]
- // apple|(?:orange|pear)|grape -> apple|orange|pear|grape
- func (n *regexNode) reduceAlternation() *regexNode {
- if len(n.children) == 0 {
- return newRegexNode(ntNothing, n.options)
- }
-
- wasLastSet := false
- lastNodeCannotMerge := false
- var optionsLast RegexOptions
- var i, j int
-
- for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
- at := n.children[i]
-
- if j < i {
- n.children[j] = at
- }
-
- for {
- if at.t == ntAlternate {
- for k := 0; k < len(at.children); k++ {
- at.children[k].next = n
- }
- n.insertChildren(i+1, at.children)
-
- j--
- } else if at.t == ntSet || at.t == ntOne {
- // Cannot merge sets if L or I options differ, or if either are negated.
- optionsAt := at.options & (RightToLeft | IgnoreCase)
-
- if at.t == ntSet {
- if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !at.set.IsMergeable() {
- wasLastSet = true
- lastNodeCannotMerge = !at.set.IsMergeable()
- optionsLast = optionsAt
- break
- }
- } else if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge {
- wasLastSet = true
- lastNodeCannotMerge = false
- optionsLast = optionsAt
- break
- }
-
- // The last node was a Set or a One, we're a Set or One and our options are the same.
- // Merge the two nodes.
- j--
- prev := n.children[j]
-
- var prevCharClass *CharSet
- if prev.t == ntOne {
- prevCharClass = &CharSet{}
- prevCharClass.addChar(prev.ch)
- } else {
- prevCharClass = prev.set
- }
-
- if at.t == ntOne {
- prevCharClass.addChar(at.ch)
- } else {
- prevCharClass.addSet(*at.set)
- }
-
- prev.t = ntSet
- prev.set = prevCharClass
- } else if at.t == ntNothing {
- j--
- } else {
- wasLastSet = false
- lastNodeCannotMerge = false
- }
- break
- }
- }
-
- if j < i {
- n.removeChildren(j, i)
- }
-
- return n.stripEnation(ntNothing)
- }
-
- // Basic optimization. Adjacent strings can be concatenated.
- //
- // (?:abc)(?:def) -> abcdef
- func (n *regexNode) reduceConcatenation() *regexNode {
- // Eliminate empties and concat adjacent strings/chars
-
- var optionsLast RegexOptions
- var optionsAt RegexOptions
- var i, j int
-
- if len(n.children) == 0 {
- return newRegexNode(ntEmpty, n.options)
- }
-
- wasLastString := false
-
- for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
- var at, prev *regexNode
-
- at = n.children[i]
-
- if j < i {
- n.children[j] = at
- }
-
- if at.t == ntConcatenate &&
- ((at.options & RightToLeft) == (n.options & RightToLeft)) {
- for k := 0; k < len(at.children); k++ {
- at.children[k].next = n
- }
-
- //insert at.children at i+1 index in n.children
- n.insertChildren(i+1, at.children)
-
- j--
- } else if at.t == ntMulti || at.t == ntOne {
- // Cannot merge strings if L or I options differ
- optionsAt = at.options & (RightToLeft | IgnoreCase)
-
- if !wasLastString || optionsLast != optionsAt {
- wasLastString = true
- optionsLast = optionsAt
- continue
- }
-
- j--
- prev = n.children[j]
-
- if prev.t == ntOne {
- prev.t = ntMulti
- prev.str = []rune{prev.ch}
- }
-
- if (optionsAt & RightToLeft) == 0 {
- if at.t == ntOne {
- prev.str = append(prev.str, at.ch)
- } else {
- prev.str = append(prev.str, at.str...)
- }
- } else {
- if at.t == ntOne {
- // insert at the front by expanding our slice, copying the data over, and then setting the value
- prev.str = append(prev.str, 0)
- copy(prev.str[1:], prev.str)
- prev.str[0] = at.ch
- } else {
- //insert at the front...this one we'll make a new slice and copy both into it
- merge := make([]rune, len(prev.str)+len(at.str))
- copy(merge, at.str)
- copy(merge[len(at.str):], prev.str)
- prev.str = merge
- }
- }
- } else if at.t == ntEmpty {
- j--
- } else {
- wasLastString = false
- }
- }
-
- if j < i {
- // remove indices j through i from the children
- n.removeChildren(j, i)
- }
-
- return n.stripEnation(ntEmpty)
- }
-
- // Nested repeaters just get multiplied with each other if they're not
- // too lumpy
- func (n *regexNode) reduceRep() *regexNode {
-
- u := n
- t := n.t
- min := n.m
- max := n.n
-
- for {
- if len(u.children) == 0 {
- break
- }
-
- child := u.children[0]
-
- // multiply reps of the same type only
- if child.t != t {
- childType := child.t
-
- if !(childType >= ntOneloop && childType <= ntSetloop && t == ntLoop ||
- childType >= ntOnelazy && childType <= ntSetlazy && t == ntLazyloop) {
- break
- }
- }
-
- // child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})?
- // [but things like (a {2,})+ are not too lumpy...]
- if u.m == 0 && child.m > 1 || child.n < child.m*2 {
- break
- }
-
- u = child
- if u.m > 0 {
- if (math.MaxInt32-1)/u.m < min {
- u.m = math.MaxInt32
- } else {
- u.m = u.m * min
- }
- }
- if u.n > 0 {
- if (math.MaxInt32-1)/u.n < max {
- u.n = math.MaxInt32
- } else {
- u.n = u.n * max
- }
- }
- }
-
- if math.MaxInt32 == min {
- return newRegexNode(ntNothing, n.options)
- }
- return u
-
- }
-
- // Simple optimization. If a concatenation or alternation has only
- // one child strip out the intermediate node. If it has zero children,
- // turn it into an empty.
- func (n *regexNode) stripEnation(emptyType nodeType) *regexNode {
- switch len(n.children) {
- case 0:
- return newRegexNode(emptyType, n.options)
- case 1:
- return n.children[0]
- default:
- return n
- }
- }
-
- func (n *regexNode) reduceGroup() *regexNode {
- u := n
-
- for u.t == ntGroup {
- u = u.children[0]
- }
-
- return u
- }
-
- // Simple optimization. If a set is a singleton, an inverse singleton,
- // or empty, it's transformed accordingly.
- func (n *regexNode) reduceSet() *regexNode {
- // Extract empty-set, one and not-one case as special
-
- if n.set == nil {
- n.t = ntNothing
- } else if n.set.IsSingleton() {
- n.ch = n.set.SingletonChar()
- n.set = nil
- n.t += (ntOne - ntSet)
- } else if n.set.IsSingletonInverse() {
- n.ch = n.set.SingletonChar()
- n.set = nil
- n.t += (ntNotone - ntSet)
- }
-
- return n
- }
-
- func (n *regexNode) reverseLeft() *regexNode {
- if n.options&RightToLeft != 0 && n.t == ntConcatenate && len(n.children) > 0 {
- //reverse children order
- for left, right := 0, len(n.children)-1; left < right; left, right = left+1, right-1 {
- n.children[left], n.children[right] = n.children[right], n.children[left]
- }
- }
-
- return n
- }
-
- func (n *regexNode) makeQuantifier(lazy bool, min, max int) *regexNode {
- if min == 0 && max == 0 {
- return newRegexNode(ntEmpty, n.options)
- }
-
- if min == 1 && max == 1 {
- return n
- }
-
- switch n.t {
- case ntOne, ntNotone, ntSet:
- if lazy {
- n.makeRep(Onelazy, min, max)
- } else {
- n.makeRep(Oneloop, min, max)
- }
- return n
-
- default:
- var t nodeType
- if lazy {
- t = ntLazyloop
- } else {
- t = ntLoop
- }
- result := newRegexNodeMN(t, n.options, min, max)
- result.addChild(n)
- return result
- }
- }
-
- // debug functions
-
- var typeStr = []string{
- "Onerep", "Notonerep", "Setrep",
- "Oneloop", "Notoneloop", "Setloop",
- "Onelazy", "Notonelazy", "Setlazy",
- "One", "Notone", "Set",
- "Multi", "Ref",
- "Bol", "Eol", "Boundary", "Nonboundary",
- "Beginning", "Start", "EndZ", "End",
- "Nothing", "Empty",
- "Alternate", "Concatenate",
- "Loop", "Lazyloop",
- "Capture", "Group", "Require", "Prevent", "Greedy",
- "Testref", "Testgroup",
- "Unknown", "Unknown", "Unknown",
- "Unknown", "Unknown", "Unknown",
- "ECMABoundary", "NonECMABoundary",
- }
-
- func (n *regexNode) description() string {
- buf := &bytes.Buffer{}
-
- buf.WriteString(typeStr[n.t])
-
- if (n.options & ExplicitCapture) != 0 {
- buf.WriteString("-C")
- }
- if (n.options & IgnoreCase) != 0 {
- buf.WriteString("-I")
- }
- if (n.options & RightToLeft) != 0 {
- buf.WriteString("-L")
- }
- if (n.options & Multiline) != 0 {
- buf.WriteString("-M")
- }
- if (n.options & Singleline) != 0 {
- buf.WriteString("-S")
- }
- if (n.options & IgnorePatternWhitespace) != 0 {
- buf.WriteString("-X")
- }
- if (n.options & ECMAScript) != 0 {
- buf.WriteString("-E")
- }
-
- switch n.t {
- case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntOne, ntNotone:
- buf.WriteString("(Ch = " + CharDescription(n.ch) + ")")
- break
- case ntCapture:
- buf.WriteString("(index = " + strconv.Itoa(n.m) + ", unindex = " + strconv.Itoa(n.n) + ")")
- break
- case ntRef, ntTestref:
- buf.WriteString("(index = " + strconv.Itoa(n.m) + ")")
- break
- case ntMulti:
- fmt.Fprintf(buf, "(String = %s)", string(n.str))
- break
- case ntSet, ntSetloop, ntSetlazy:
- buf.WriteString("(Set = " + n.set.String() + ")")
- break
- }
-
- switch n.t {
- case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntSetloop, ntSetlazy, ntLoop, ntLazyloop:
- buf.WriteString("(Min = ")
- buf.WriteString(strconv.Itoa(n.m))
- buf.WriteString(", Max = ")
- if n.n == math.MaxInt32 {
- buf.WriteString("inf")
- } else {
- buf.WriteString(strconv.Itoa(n.n))
- }
- buf.WriteString(")")
-
- break
- }
-
- return buf.String()
- }
-
- var padSpace = []byte(" ")
-
- func (t *RegexTree) Dump() string {
- return t.root.dump()
- }
-
- func (n *regexNode) dump() string {
- var stack []int
- CurNode := n
- CurChild := 0
-
- buf := bytes.NewBufferString(CurNode.description())
- buf.WriteRune('\n')
-
- for {
- if CurNode.children != nil && CurChild < len(CurNode.children) {
- stack = append(stack, CurChild+1)
- CurNode = CurNode.children[CurChild]
- CurChild = 0
-
- Depth := len(stack)
- if Depth > 32 {
- Depth = 32
- }
- buf.Write(padSpace[:Depth])
- buf.WriteString(CurNode.description())
- buf.WriteRune('\n')
- } else {
- if len(stack) == 0 {
- break
- }
-
- CurChild = stack[len(stack)-1]
- stack = stack[:len(stack)-1]
- CurNode = CurNode.next
- }
- }
- return buf.String()
- }
|