wangwei
/
aiforge

 
			
			   
				 
					
						
						
							
							package syntax

import (
	"bytes"
	"fmt"
	"math"
	"strconv"
)

// RegexTree bundles the root of a regular expression's node tree with
// capture-group tables and the options value that accompany it.
//
// NOTE(review): nothing in this file fills in or reads the capture fields;
// the per-field notes below are inferred from names and types only and
// should be confirmed against the code that builds the tree.
type RegexTree struct {
	// root is the top node of the tree; Dump starts its walk here.
	root       *regexNode
	// presumably capture number -> slot bookkeeping — TODO confirm.
	caps       map[int]int
	// presumably the list of capture numbers in use — TODO confirm.
	capnumlist []int
	// presumably one past the highest capture number — TODO confirm.
	captop     int
	// presumably capture name -> capture number — TODO confirm.
	Capnames   map[string]int
	// presumably the capture names in order — TODO confirm.
	Caplist    []string
	// options value stored alongside the tree.
	options    RegexOptions
}

// A regexNode is one node of the parsed tree that is built for a regular expression.

// Implementation notes:
//
// Since the node tree is a temporary data structure only used
// during compilation of the regexp to integer codes, it's
// designed for clarity and convenience rather than
// space efficiency.
//
// RegexNodes are built into a tree, linked by the n.children list.
// Each node also has a n.parent and n.ichild member indicating
// its parent and which child # it is in its parent's list.
//
// RegexNodes come in as many types as there are constructs in
// a regular expression, for example, "concatenate", "alternate",
// "one", "rept", "group". There are also node types for basic
// peephole optimizations, e.g., "onerep", "notsetrep", etc.
//
// Because perl 5 allows "lookback" groups that scan backwards,
// each node also gets a "direction". Normally the value of
// boolean n.backward = false.
//
// During parsing, top-level nodes are also stacked onto a parse
// stack (a stack of trees). For this purpose we have a n.next
// pointer. [Note that to save a few bytes, we could overload the
// n.parent pointer instead.]
//
// On the parse stack, each tree has a "role" - basically, the
// nonterminal in the grammar that the parser has currently
// assigned to the tree. That code is stored in n.role.
//
// Finally, some of the different kinds of nodes have data.
// Two integers (for the looping constructs) are stored in
// n.m and n.n, and an object (either a string or a set)
// is stored in n.str or n.set.
type regexNode struct {
	// t is the node's kind (one of the nt* constants).
	t        nodeType
	// children holds the sub-nodes in order; addChild appends to it.
	children []*regexNode
	// str is the rune payload of string nodes (ntMulti); reduceConcatenation
	// builds it up when it merges adjacent characters and strings.
	str      []rune
	// set is the character class of set nodes; reduceSet clears it when the
	// set collapses to a single character.
	set      *CharSet
	// ch is the character of the One/Notone families of nodes.
	ch       rune
	// m and n are the two integer operands: min and max for repeaters
	// (see makeRep); description prints them as index/unindex for ntCapture
	// and m as the index for ntRef and ntTestref.
	m        int
	n        int
	// options are the RegexOptions attached to this node.
	options  RegexOptions
	// next is set by addChild to the node's parent; dump follows it to climb
	// back up the tree.
	next     *regexNode
}

// nodeType identifies what kind of construct a regexNode represents.
type nodeType int32

// Node type values. Only ntOnerep is declared with the explicit nodeType
// type; the remaining entries are untyped integer constants that convert to
// nodeType wherever one is expected. Values 35 through 40 are not assigned
// in this block.
const (
	// The following are leaves, and correspond to primitive operations

	ntOnerep      nodeType = 0  // lef,back char,min,max    a {n}
	ntNotonerep            = 1  // lef,back char,min,max    .{n}
	ntSetrep               = 2  // lef,back set,min,max     [\d]{n}
	ntOneloop              = 3  // lef,back char,min,max    a {,n}
	ntNotoneloop           = 4  // lef,back char,min,max    .{,n}
	ntSetloop              = 5  // lef,back set,min,max     [\d]{,n}
	ntOnelazy              = 6  // lef,back char,min,max    a {,n}?
	ntNotonelazy           = 7  // lef,back char,min,max    .{,n}?
	ntSetlazy              = 8  // lef,back set,min,max     [\d]{,n}?
	ntOne                  = 9  // lef      char            a
	ntNotone               = 10 // lef      char            [^a]
	ntSet                  = 11 // lef      set             [a-z\s]  \w \s \d
	ntMulti                = 12 // lef      string          abcd
	ntRef                  = 13 // lef      group           \#
	ntBol                  = 14 //                          ^
	ntEol                  = 15 //                          $
	ntBoundary             = 16 //                          \b
	ntNonboundary          = 17 //                          \B
	ntBeginning            = 18 //                          \A
	ntStart                = 19 //                          \G
	ntEndZ                 = 20 //                          \Z
	ntEnd                  = 21 //                          \z

	// Interior nodes do not correspond to primitive operations, but
	// control structures compositing other operations

	// Concat and alternate take n children, and can run forward or backwards

	ntNothing     = 22 //          []
	ntEmpty       = 23 //          ()
	ntAlternate   = 24 //          a|b
	ntConcatenate = 25 //          ab
	ntLoop        = 26 // m,x      * + ? {,}
	ntLazyloop    = 27 // m,x      *? +? ?? {,}?
	ntCapture     = 28 // n        ()
	ntGroup       = 29 //          (?:)
	ntRequire     = 30 //          (?=) (?<=)
	ntPrevent     = 31 //          (?!) (?<!)
	ntGreedy      = 32 //          (?>) (?<)
	ntTestref     = 33 //          (?(n) | )
	ntTestgroup   = 34 //          (?(...) | )

	// ECMAScript-flavoured word boundaries.
	ntECMABoundary    = 41 //                          \b
	ntNonECMABoundary = 42 //                          \B
)

// newRegexNode allocates a node of type t carrying the options opt.
// Every other field is left at its zero value.
func newRegexNode(t nodeType, opt RegexOptions) *regexNode {
	node := new(regexNode)
	node.t = t
	node.options = opt
	return node
}

// newRegexNodeCh allocates a node of type t with options opt whose
// character payload is ch.
func newRegexNodeCh(t nodeType, opt RegexOptions, ch rune) *regexNode {
	node := new(regexNode)
	node.t = t
	node.options = opt
	node.ch = ch
	return node
}

// newRegexNodeStr allocates a node of type t with options opt whose string
// payload is str. The slice is stored as given, not copied.
func newRegexNodeStr(t nodeType, opt RegexOptions, str []rune) *regexNode {
	node := new(regexNode)
	node.t = t
	node.options = opt
	node.str = str
	return node
}

// newRegexNodeSet allocates a node of type t with options opt whose
// character-class payload is set. The pointer is stored as given.
func newRegexNodeSet(t nodeType, opt RegexOptions, set *CharSet) *regexNode {
	node := new(regexNode)
	node.t = t
	node.options = opt
	node.set = set
	return node
}

// newRegexNodeM allocates a node of type t with options opt and stores m as
// its first integer operand.
func newRegexNodeM(t nodeType, opt RegexOptions, m int) *regexNode {
	node := new(regexNode)
	node.t = t
	node.options = opt
	node.m = m
	return node
}
// newRegexNodeMN allocates a node of type t with options opt and stores m
// and n as its two integer operands.
func newRegexNodeMN(t nodeType, opt RegexOptions, m, n int) *regexNode {
	node := new(regexNode)
	node.t = t
	node.options = opt
	node.m, node.n = m, n
	return node
}

// writeStrToBuf appends every rune of n.str, in order, to buf.
func (n *regexNode) writeStrToBuf(buf *bytes.Buffer) {
	for _, r := range n.str {
		buf.WriteRune(r)
	}
}

// addChild reduces child, appends the reduced node to n's children and
// points that node's next field back at n.
func (n *regexNode) addChild(child *regexNode) {
	child = child.reduce()
	child.next = n
	n.children = append(n.children, child)
}

// insertChildren splices nodes into n.children so that the first inserted
// node ends up at position afterIndex. The result is a freshly allocated
// slice; the previous backing array is not written to.
func (n *regexNode) insertChildren(afterIndex int, nodes []*regexNode) {
	merged := make([]*regexNode, len(n.children)+len(nodes))
	head, tail := n.children[:afterIndex], n.children[afterIndex:]

	pos := copy(merged, head)
	pos += copy(merged[pos:], nodes)
	copy(merged[pos:], tail)

	n.children = merged
}

// removeChildren drops the children in the half-open range
// [startIndex, endIndex): the child at startIndex goes, the one at endIndex
// stays. The surviving tail is shifted down over the removed range.
func (n *regexNode) removeChildren(startIndex, endIndex int) {
	kept := n.children[:startIndex]
	rest := n.children[endIndex:]
	n.children = append(kept, rest...)
}

// makeRep turns a single-item node into the corresponding repeater by
// shifting its type by the distance between t and ntOne, then records the
// repeat bounds. Pass t as the "One" flavour of the target family
// (ntOneloop or ntOnelazy); the same shift maps Notone and Set nodes onto
// their loop/lazy counterparts.
func (n *regexNode) makeRep(t nodeType, min, max int) {
	shift := t - ntOne
	n.t, n.m, n.n = n.t+shift, min, max
}

// reduce dispatches to the simplification routine that matches the node's
// type and returns whatever that routine returns. Node types without a
// dedicated routine are returned unchanged.
func (n *regexNode) reduce() *regexNode {
	switch {
	case n.t == ntSet || n.t == ntSetloop:
		return n.reduceSet()
	case n.t == ntGroup:
		return n.reduceGroup()
	case n.t == ntLoop || n.t == ntLazyloop:
		return n.reduceRep()
	case n.t == ntConcatenate:
		return n.reduceConcatenation()
	case n.t == ntAlternate:
		return n.reduceAlternation()
	}
	return n
}

// Basic optimization. Single-letter alternations can be replaced
// by faster set specifications, and nested alternations with no
// intervening operators can be flattened:
//
// a|b|c|def|g|h -> [a-c]|def|[gh]
// apple|(?:orange|pear)|grape -> apple|orange|pear|grape
//
// The children slice is compacted in place with two cursors: i reads every
// child in turn and j is the slot the next surviving child is written to.
// A child that is flattened, merged into its predecessor, or dropped is
// "un-kept" by decrementing j. The result goes through stripEnation, so a
// node left with zero or one child is replaced accordingly.
func (n *regexNode) reduceAlternation() *regexNode {
	// An alternation with no branches can never match.
	if len(n.children) == 0 {
		return newRegexNode(ntNothing, n.options)
	}

	// wasLastSet: the most recently kept child is a Set or One that a
	// following Set/One may be merged into.
	// lastNodeCannotMerge: that child is a set reporting !IsMergeable().
	// optionsLast: the RightToLeft/IgnoreCase bits of that child.
	wasLastSet := false
	lastNodeCannotMerge := false
	var optionsLast RegexOptions
	var i, j int

	for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
		at := n.children[i]

		// Slide the child down over any slots freed earlier.
		if j < i {
			n.children[j] = at
		}

		// Single-pass block: the loop body runs once and `break` is used as
		// an early exit from it.
		for {
			if at.t == ntAlternate {
				// Flatten a nested alternation: re-parent its branches and
				// splice them in right after the current position so the
				// outer loop visits them next; j-- discards the nested node.
				for k := 0; k < len(at.children); k++ {
					at.children[k].next = n
				}
				n.insertChildren(i+1, at.children)

				j--
			} else if at.t == ntSet || at.t == ntOne {
				// Cannot merge sets if L or I options differ, or if either are negated.
				optionsAt := at.options & (RightToLeft | IgnoreCase)

				// If this child cannot be merged into its predecessor, keep
				// it and remember it as the new merge candidate.
				if at.t == ntSet {
					if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !at.set.IsMergeable() {
						wasLastSet = true
						lastNodeCannotMerge = !at.set.IsMergeable()
						optionsLast = optionsAt
						break
					}
				} else if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge {
					wasLastSet = true
					lastNodeCannotMerge = false
					optionsLast = optionsAt
					break
				}

				// The last node was a Set or a One, we're a Set or One and our options are the same.
				// Merge the two nodes.
				j--
				prev := n.children[j]

				// A One predecessor gets a fresh set seeded with its
				// character; a Set predecessor's existing set is extended in
				// place.
				// NOTE(review): extending prev.set in place assumes that set
				// is not shared with other nodes — confirm against CharSet
				// ownership.
				var prevCharClass *CharSet
				if prev.t == ntOne {
					prevCharClass = &CharSet{}
					prevCharClass.addChar(prev.ch)
				} else {
					prevCharClass = prev.set
				}

				if at.t == ntOne {
					prevCharClass.addChar(at.ch)
				} else {
					prevCharClass.addSet(*at.set)
				}

				prev.t = ntSet
				prev.set = prevCharClass
			} else if at.t == ntNothing {
				// A branch that can never match contributes nothing; drop it.
				j--
			} else {
				// Any other node is kept and ends the current merge run.
				wasLastSet = false
				lastNodeCannotMerge = false
			}
			break
		}
	}

	// Trim the slots left over after compaction.
	if j < i {
		n.removeChildren(j, i)
	}

	return n.stripEnation(ntNothing)
}

// Basic optimization. Adjacent strings can be concatenated.
//
// (?:abc)(?:def) -> abcdef
//
// The children slice is compacted in place with two cursors: i reads every
// child in turn and j is the slot the next surviving child is written to.
// Nested concatenations running in the same direction are flattened,
// adjacent One/Multi children with matching RightToLeft/IgnoreCase bits are
// merged into a single Multi, and Empty children are dropped. The result
// goes through stripEnation, so a node left with zero or one child is
// replaced accordingly.
func (n *regexNode) reduceConcatenation() *regexNode {
	// Eliminate empties and concat adjacent strings/chars

	var optionsLast RegexOptions
	var optionsAt RegexOptions
	var i, j int

	if len(n.children) == 0 {
		return newRegexNode(ntEmpty, n.options)
	}

	// wasLastString: the most recently kept child is a One or Multi that a
	// following One/Multi may be merged into.
	wasLastString := false

	for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
		var at, prev *regexNode

		at = n.children[i]

		// Slide the child down over any slots freed earlier.
		if j < i {
			n.children[j] = at
		}

		if at.t == ntConcatenate &&
			((at.options & RightToLeft) == (n.options & RightToLeft)) {
			// Flatten a nested concatenation with the same direction:
			// re-parent its children and splice them in right after the
			// current position; j-- discards the nested node itself.
			for k := 0; k < len(at.children); k++ {
				at.children[k].next = n
			}

			//insert at.children at i+1 index in n.children
			n.insertChildren(i+1, at.children)

			j--
		} else if at.t == ntMulti || at.t == ntOne {
			// Cannot merge strings if L or I options differ
			optionsAt = at.options & (RightToLeft | IgnoreCase)

			// Not mergeable with the predecessor: keep this child and
			// remember it as the new merge candidate.
			if !wasLastString || optionsLast != optionsAt {
				wasLastString = true
				optionsLast = optionsAt
				continue
			}

			// Merge into the previously kept child; j-- discards this one.
			j--
			prev = n.children[j]

			// Promote a single character to a one-rune string first.
			if prev.t == ntOne {
				prev.t = ntMulti
				prev.str = []rune{prev.ch}
			}

			if (optionsAt & RightToLeft) == 0 {
				// Left-to-right: the new text goes after the existing text.
				if at.t == ntOne {
					prev.str = append(prev.str, at.ch)
				} else {
					prev.str = append(prev.str, at.str...)
				}
			} else {
				// Right-to-left: the new text goes in front of the existing text.
				if at.t == ntOne {
					// insert at the front by expanding our slice, copying the data over, and then setting the value
					prev.str = append(prev.str, 0)
					copy(prev.str[1:], prev.str)
					prev.str[0] = at.ch
				} else {
					//insert at the front...this one we'll make a new slice and copy both into it
					merge := make([]rune, len(prev.str)+len(at.str))
					copy(merge, at.str)
					copy(merge[len(at.str):], prev.str)
					prev.str = merge
				}
			}
		} else if at.t == ntEmpty {
			// An empty child contributes nothing; drop it.
			j--
		} else {
			// Any other node is kept and ends the current merge run.
			wasLastString = false
		}
	}

	if j < i {
		// remove indices j through i from the children
		n.removeChildren(j, i)
	}

	return n.stripEnation(ntEmpty)
}

// Nested repeaters just get multiplied with each other if they're not
// too lumpy.
//
// Starting at n, reduceRep walks down through first children for as long as
// the child is a repeater of the same flavour (Loop over Loop/Oneloop..Setloop,
// Lazyloop over Lazyloop/Onelazy..Setlazy) and the pair is not too lumpy.
// Each node it steps onto has its bounds multiplied by the bounds
// accumulated so far, saturating at math.MaxInt32. The innermost node
// reached is returned in place of the whole chain.
//
// The accumulated bounds are carried forward after every step, so a chain
// of more than two repeaters multiplies through every level, and a level
// with a zero bound resets the accumulator to zero. If the accumulated
// minimum saturates at math.MaxInt32, a Nothing node is returned because
// that many repetitions can never be satisfied.
//
// Note that nodes on the collapsed path are modified in place.
func (n *regexNode) reduceRep() *regexNode {
	u := n
	t := n.t
	minReps := n.m
	maxReps := n.n

	for len(u.children) > 0 {
		child := u.children[0]

		// multiply reps of the same type only
		if child.t != t {
			greedyLeaf := t == ntLoop && child.t >= ntOneloop && child.t <= ntSetloop
			lazyLeaf := t == ntLazyloop && child.t >= ntOnelazy && child.t <= ntSetlazy
			if !greedyLeaf && !lazyLeaf {
				break
			}
		}

		// child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})?
		// [but things like (a {2,})+ are not too lumpy...]
		if u.m == 0 && child.m > 1 || child.n < child.m*2 {
			break
		}

		u = child

		// The u.m > 0 / u.n > 0 guards avoid dividing by zero; a zero bound
		// stays zero and becomes the new accumulated value.
		if u.m > 0 {
			if (math.MaxInt32-1)/u.m < minReps {
				u.m = math.MaxInt32
			} else {
				u.m *= minReps
			}
		}
		minReps = u.m

		if u.n > 0 {
			if (math.MaxInt32-1)/u.n < maxReps {
				u.n = math.MaxInt32
			} else {
				u.n *= maxReps
			}
		}
		maxReps = u.n
	}

	if minReps == math.MaxInt32 {
		return newRegexNode(ntNothing, n.options)
	}
	return u
}

// stripEnation removes a pointless concatenation/alternation wrapper.
// With no children it returns a fresh node of emptyType carrying n's
// options; with exactly one child it returns that child; otherwise it
// returns n itself.
func (n *regexNode) stripEnation(emptyType nodeType) *regexNode {
	count := len(n.children)
	if count == 0 {
		return newRegexNode(emptyType, n.options)
	}
	if count == 1 {
		return n.children[0]
	}
	return n
}

// reduceGroup unwraps non-capturing groups: it descends through first
// children for as long as the current node is an ntGroup and returns the
// first node that is not one.
func (n *regexNode) reduceGroup() *regexNode {
	node := n
	for {
		if node.t != ntGroup {
			return node
		}
		node = node.children[0]
	}
}

// reduceSet simplifies degenerate character classes in place and returns n.
//
//   - no set at all: the node becomes ntNothing;
//   - a set holding exactly one character: the node moves to the matching
//     "One" flavour and keeps that character in n.ch;
//   - a set excluding exactly one character: the node moves to the matching
//     "Notone" flavour.
//
// The type is shifted rather than assigned so that a repeater such as
// ntSetloop lands on its One/Notone counterpart.
func (n *regexNode) reduceSet() *regexNode {
	switch set := n.set; {
	case set == nil:
		n.t = ntNothing
	case set.IsSingleton():
		n.ch, n.set = set.SingletonChar(), nil
		n.t += ntOne - ntSet
	case set.IsSingletonInverse():
		n.ch, n.set = set.SingletonChar(), nil
		n.t += ntNotone - ntSet
	}
	return n
}

// reverseLeft reverses the order of n's children in place when n is a
// concatenation carrying the RightToLeft option; any other node is left
// untouched. It always returns n.
func (n *regexNode) reverseLeft() *regexNode {
	rightToLeft := n.options&RightToLeft != 0
	if !rightToLeft || n.t != ntConcatenate || len(n.children) == 0 {
		return n
	}

	kids := n.children
	for lo := 0; lo < len(kids)/2; lo++ {
		hi := len(kids) - 1 - lo
		kids[lo], kids[hi] = kids[hi], kids[lo]
	}
	return n
}

// makeQuantifier applies the quantifier {min,max} (lazy or greedy) to n and
// returns the node that represents the quantified expression.
//
//   - {0,0} yields a fresh Empty node carrying n's options.
//   - {1,1} yields n unchanged.
//   - One, Notone and Set nodes are converted in place into the matching
//     loop/lazy repeater via makeRep and returned.
//   - Anything else is wrapped in a new Loop (or Lazyloop) node that takes n
//     as its child through addChild.
//
// The repeater flavour handed to makeRep is named with this file's own
// ntOneloop/ntOnelazy constants, because makeRep computes the target type
// relative to ntOne in the nodeType numbering.
func (n *regexNode) makeQuantifier(lazy bool, min, max int) *regexNode {
	if min == 0 && max == 0 {
		return newRegexNode(ntEmpty, n.options)
	}

	if min == 1 && max == 1 {
		return n
	}

	switch n.t {
	case ntOne, ntNotone, ntSet:
		rep := nodeType(ntOneloop)
		if lazy {
			rep = ntOnelazy
		}
		n.makeRep(rep, min, max)
		return n
	}

	loop := nodeType(ntLoop)
	if lazy {
		loop = ntLazyloop
	}
	result := newRegexNodeMN(loop, n.options, min, max)
	result.addChild(n)
	return result
}

// debug functions

// typeStr maps a nodeType value to its display name; description indexes it
// directly with n.t, so the order here must follow the numeric values of the
// nt* constants. The six "Unknown" entries are placeholders for the
// unassigned values 35-40 so that ECMABoundary and NonECMABoundary land on
// indices 41 and 42.
var typeStr = []string{
	"Onerep", "Notonerep", "Setrep",
	"Oneloop", "Notoneloop", "Setloop",
	"Onelazy", "Notonelazy", "Setlazy",
	"One", "Notone", "Set",
	"Multi", "Ref",
	"Bol", "Eol", "Boundary", "Nonboundary",
	"Beginning", "Start", "EndZ", "End",
	"Nothing", "Empty",
	"Alternate", "Concatenate",
	"Loop", "Lazyloop",
	"Capture", "Group", "Require", "Prevent", "Greedy",
	"Testref", "Testgroup",
	"Unknown", "Unknown", "Unknown",
	"Unknown", "Unknown", "Unknown",
	"ECMABoundary", "NonECMABoundary",
}

// description renders a one-line, human-readable summary of the node for
// debug dumps. The line is built from up to four parts, in this order:
//
//  1. the type name looked up in typeStr;
//  2. one "-X" tag per option bit set on the node;
//  3. the payload that belongs to the type (character, capture indices,
//     reference index, string, or set);
//  4. for repeaters, the (Min, Max) bounds, with math.MaxInt32 shown as "inf".
func (n *regexNode) description() string {
	var out bytes.Buffer
	out.WriteString(typeStr[n.t])

	// Tags are emitted in this fixed order.
	optionTags := [...]struct {
		flag RegexOptions
		tag  string
	}{
		{ExplicitCapture, "-C"},
		{IgnoreCase, "-I"},
		{RightToLeft, "-L"},
		{Multiline, "-M"},
		{Singleline, "-S"},
		{IgnorePatternWhitespace, "-X"},
		{ECMAScript, "-E"},
	}
	for _, ot := range optionTags {
		if n.options&ot.flag != 0 {
			out.WriteString(ot.tag)
		}
	}

	// Payload.
	switch n.t {
	case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntOne, ntNotone:
		out.WriteString("(Ch = " + CharDescription(n.ch) + ")")
	case ntCapture:
		out.WriteString("(index = " + strconv.Itoa(n.m) + ", unindex = " + strconv.Itoa(n.n) + ")")
	case ntRef, ntTestref:
		out.WriteString("(index = " + strconv.Itoa(n.m) + ")")
	case ntMulti:
		fmt.Fprintf(&out, "(String = %s)", string(n.str))
	case ntSet, ntSetloop, ntSetlazy:
		out.WriteString("(Set = " + n.set.String() + ")")
	}

	// Repeat bounds.
	switch n.t {
	case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntSetloop, ntSetlazy, ntLoop, ntLazyloop:
		upper := "inf"
		if n.n != math.MaxInt32 {
			upper = strconv.Itoa(n.n)
		}
		out.WriteString("(Min = ")
		out.WriteString(strconv.Itoa(n.m))
		out.WriteString(", Max = ")
		out.WriteString(upper)
		out.WriteString(")")
	}

	return out.String()
}

// padSpace supplies the leading spaces used to indent each line of dump
// output; dump never takes more than the first 32 bytes of it.
var padSpace = []byte("                                ")

// Dump returns a multi-line debug rendering of the whole tree, starting at
// its root node.
func (t *RegexTree) Dump() string {
	root := t.root
	return root.dump()
}

// dump renders the subtree rooted at n, one node description per line, in
// depth-first order. Each line is indented by one space per level of depth,
// capped at 32 spaces.
//
// The walk is iterative: resume records, for every level currently
// descended into, the index of the next child to visit there, and the
// node's next pointer is followed to climb back to the parent once a
// node's children are exhausted.
func (n *regexNode) dump() string {
	var out bytes.Buffer
	out.WriteString(n.description())
	out.WriteByte('\n')

	var resume []int
	node, idx := n, 0

	for {
		if idx < len(node.children) {
			// Descend into the next unvisited child.
			resume = append(resume, idx+1)
			node, idx = node.children[idx], 0

			indent := len(resume)
			if indent > 32 {
				indent = 32
			}
			out.Write(padSpace[:indent])
			out.WriteString(node.description())
			out.WriteByte('\n')
			continue
		}

		// Children exhausted: finished if we are back at the start,
		// otherwise climb one level and pick up where we left off.
		if len(resume) == 0 {
			break
		}
		top := len(resume) - 1
		idx, resume = resume[top], resume[:top]
		node = node.next
	}

	return out.String()
}