|
- package syntax
-
- import (
- "bytes"
- "fmt"
- "math"
- "os"
- )
-
- func Write(tree *RegexTree) (*Code, error) {
- w := writer{
- intStack: make([]int, 0, 32),
- emitted: make([]int, 2),
- stringhash: make(map[string]int),
- sethash: make(map[string]int),
- }
-
- code, err := w.codeFromTree(tree)
-
- if tree.options&Debug > 0 && code != nil {
- os.Stdout.WriteString(code.Dump())
- os.Stdout.WriteString("\n")
- }
-
- return code, err
- }
-
- type writer struct {
- emitted []int
-
- intStack []int
- curpos int
- stringhash map[string]int
- stringtable [][]rune
- sethash map[string]int
- settable []*CharSet
- counting bool
- count int
- trackcount int
- caps map[int]int
- }
-
- const (
- beforeChild nodeType = 64
- afterChild = 128
- //MaxPrefixSize is the largest number of runes we'll use for a BoyerMoyer prefix
- MaxPrefixSize = 50
- )
-
- // The top level RegexCode generator. It does a depth-first walk
- // through the tree and calls EmitFragment to emits code before
- // and after each child of an interior node, and at each leaf.
- //
- // It runs two passes, first to count the size of the generated
- // code, and second to generate the code.
- //
- // We should time it against the alternative, which is
- // to just generate the code and grow the array as we go.
- func (w *writer) codeFromTree(tree *RegexTree) (*Code, error) {
- var (
- curNode *regexNode
- curChild int
- capsize int
- )
- // construct sparse capnum mapping if some numbers are unused
-
- if tree.capnumlist == nil || tree.captop == len(tree.capnumlist) {
- capsize = tree.captop
- w.caps = nil
- } else {
- capsize = len(tree.capnumlist)
- w.caps = tree.caps
- for i := 0; i < len(tree.capnumlist); i++ {
- w.caps[tree.capnumlist[i]] = i
- }
- }
-
- w.counting = true
-
- for {
- if !w.counting {
- w.emitted = make([]int, w.count)
- }
-
- curNode = tree.root
- curChild = 0
-
- w.emit1(Lazybranch, 0)
-
- for {
- if len(curNode.children) == 0 {
- w.emitFragment(curNode.t, curNode, 0)
- } else if curChild < len(curNode.children) {
- w.emitFragment(curNode.t|beforeChild, curNode, curChild)
-
- curNode = curNode.children[curChild]
-
- w.pushInt(curChild)
- curChild = 0
- continue
- }
-
- if w.emptyStack() {
- break
- }
-
- curChild = w.popInt()
- curNode = curNode.next
-
- w.emitFragment(curNode.t|afterChild, curNode, curChild)
- curChild++
- }
-
- w.patchJump(0, w.curPos())
- w.emit(Stop)
-
- if !w.counting {
- break
- }
-
- w.counting = false
- }
-
- fcPrefix := getFirstCharsPrefix(tree)
- prefix := getPrefix(tree)
- rtl := (tree.options & RightToLeft) != 0
-
- var bmPrefix *BmPrefix
- //TODO: benchmark string prefixes
- if prefix != nil && len(prefix.PrefixStr) > 0 && MaxPrefixSize > 0 {
- if len(prefix.PrefixStr) > MaxPrefixSize {
- // limit prefix changes to 10k
- prefix.PrefixStr = prefix.PrefixStr[:MaxPrefixSize]
- }
- bmPrefix = newBmPrefix(prefix.PrefixStr, prefix.CaseInsensitive, rtl)
- } else {
- bmPrefix = nil
- }
-
- return &Code{
- Codes: w.emitted,
- Strings: w.stringtable,
- Sets: w.settable,
- TrackCount: w.trackcount,
- Caps: w.caps,
- Capsize: capsize,
- FcPrefix: fcPrefix,
- BmPrefix: bmPrefix,
- Anchors: getAnchors(tree),
- RightToLeft: rtl,
- }, nil
- }
-
- // The main RegexCode generator. It does a depth-first walk
- // through the tree and calls EmitFragment to emits code before
- // and after each child of an interior node, and at each leaf.
- func (w *writer) emitFragment(nodetype nodeType, node *regexNode, curIndex int) error {
- bits := InstOp(0)
-
- if nodetype <= ntRef {
- if (node.options & RightToLeft) != 0 {
- bits |= Rtl
- }
- if (node.options & IgnoreCase) != 0 {
- bits |= Ci
- }
- }
- ntBits := nodeType(bits)
-
- switch nodetype {
- case ntConcatenate | beforeChild, ntConcatenate | afterChild, ntEmpty:
- break
-
- case ntAlternate | beforeChild:
- if curIndex < len(node.children)-1 {
- w.pushInt(w.curPos())
- w.emit1(Lazybranch, 0)
- }
-
- case ntAlternate | afterChild:
- if curIndex < len(node.children)-1 {
- lbPos := w.popInt()
- w.pushInt(w.curPos())
- w.emit1(Goto, 0)
- w.patchJump(lbPos, w.curPos())
- } else {
- for i := 0; i < curIndex; i++ {
- w.patchJump(w.popInt(), w.curPos())
- }
- }
- break
-
- case ntTestref | beforeChild:
- if curIndex == 0 {
- w.emit(Setjump)
- w.pushInt(w.curPos())
- w.emit1(Lazybranch, 0)
- w.emit1(Testref, w.mapCapnum(node.m))
- w.emit(Forejump)
- }
-
- case ntTestref | afterChild:
- if curIndex == 0 {
- branchpos := w.popInt()
- w.pushInt(w.curPos())
- w.emit1(Goto, 0)
- w.patchJump(branchpos, w.curPos())
- w.emit(Forejump)
- if len(node.children) <= 1 {
- w.patchJump(w.popInt(), w.curPos())
- }
- } else if curIndex == 1 {
- w.patchJump(w.popInt(), w.curPos())
- }
-
- case ntTestgroup | beforeChild:
- if curIndex == 0 {
- w.emit(Setjump)
- w.emit(Setmark)
- w.pushInt(w.curPos())
- w.emit1(Lazybranch, 0)
- }
-
- case ntTestgroup | afterChild:
- if curIndex == 0 {
- w.emit(Getmark)
- w.emit(Forejump)
- } else if curIndex == 1 {
- Branchpos := w.popInt()
- w.pushInt(w.curPos())
- w.emit1(Goto, 0)
- w.patchJump(Branchpos, w.curPos())
- w.emit(Getmark)
- w.emit(Forejump)
- if len(node.children) <= 2 {
- w.patchJump(w.popInt(), w.curPos())
- }
- } else if curIndex == 2 {
- w.patchJump(w.popInt(), w.curPos())
- }
-
- case ntLoop | beforeChild, ntLazyloop | beforeChild:
-
- if node.n < math.MaxInt32 || node.m > 1 {
- if node.m == 0 {
- w.emit1(Nullcount, 0)
- } else {
- w.emit1(Setcount, 1-node.m)
- }
- } else if node.m == 0 {
- w.emit(Nullmark)
- } else {
- w.emit(Setmark)
- }
-
- if node.m == 0 {
- w.pushInt(w.curPos())
- w.emit1(Goto, 0)
- }
- w.pushInt(w.curPos())
-
- case ntLoop | afterChild, ntLazyloop | afterChild:
-
- startJumpPos := w.curPos()
- lazy := (nodetype - (ntLoop | afterChild))
-
- if node.n < math.MaxInt32 || node.m > 1 {
- if node.n == math.MaxInt32 {
- w.emit2(InstOp(Branchcount+lazy), w.popInt(), math.MaxInt32)
- } else {
- w.emit2(InstOp(Branchcount+lazy), w.popInt(), node.n-node.m)
- }
- } else {
- w.emit1(InstOp(Branchmark+lazy), w.popInt())
- }
-
- if node.m == 0 {
- w.patchJump(w.popInt(), startJumpPos)
- }
-
- case ntGroup | beforeChild, ntGroup | afterChild:
-
- case ntCapture | beforeChild:
- w.emit(Setmark)
-
- case ntCapture | afterChild:
- w.emit2(Capturemark, w.mapCapnum(node.m), w.mapCapnum(node.n))
-
- case ntRequire | beforeChild:
- // NOTE: the following line causes lookahead/lookbehind to be
- // NON-BACKTRACKING. It can be commented out with (*)
- w.emit(Setjump)
-
- w.emit(Setmark)
-
- case ntRequire | afterChild:
- w.emit(Getmark)
-
- // NOTE: the following line causes lookahead/lookbehind to be
- // NON-BACKTRACKING. It can be commented out with (*)
- w.emit(Forejump)
-
- case ntPrevent | beforeChild:
- w.emit(Setjump)
- w.pushInt(w.curPos())
- w.emit1(Lazybranch, 0)
-
- case ntPrevent | afterChild:
- w.emit(Backjump)
- w.patchJump(w.popInt(), w.curPos())
- w.emit(Forejump)
-
- case ntGreedy | beforeChild:
- w.emit(Setjump)
-
- case ntGreedy | afterChild:
- w.emit(Forejump)
-
- case ntOne, ntNotone:
- w.emit1(InstOp(node.t|ntBits), int(node.ch))
-
- case ntNotoneloop, ntNotonelazy, ntOneloop, ntOnelazy:
- if node.m > 0 {
- if node.t == ntOneloop || node.t == ntOnelazy {
- w.emit2(Onerep|bits, int(node.ch), node.m)
- } else {
- w.emit2(Notonerep|bits, int(node.ch), node.m)
- }
- }
- if node.n > node.m {
- if node.n == math.MaxInt32 {
- w.emit2(InstOp(node.t|ntBits), int(node.ch), math.MaxInt32)
- } else {
- w.emit2(InstOp(node.t|ntBits), int(node.ch), node.n-node.m)
- }
- }
-
- case ntSetloop, ntSetlazy:
- if node.m > 0 {
- w.emit2(Setrep|bits, w.setCode(node.set), node.m)
- }
- if node.n > node.m {
- if node.n == math.MaxInt32 {
- w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), math.MaxInt32)
- } else {
- w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), node.n-node.m)
- }
- }
-
- case ntMulti:
- w.emit1(InstOp(node.t|ntBits), w.stringCode(node.str))
-
- case ntSet:
- w.emit1(InstOp(node.t|ntBits), w.setCode(node.set))
-
- case ntRef:
- w.emit1(InstOp(node.t|ntBits), w.mapCapnum(node.m))
-
- case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
- w.emit(InstOp(node.t))
-
- default:
- return fmt.Errorf("unexpected opcode in regular expression generation: %v", nodetype)
- }
-
- return nil
- }
-
- // To avoid recursion, we use a simple integer stack.
- // This is the push.
- func (w *writer) pushInt(i int) {
- w.intStack = append(w.intStack, i)
- }
-
- // Returns true if the stack is empty.
- func (w *writer) emptyStack() bool {
- return len(w.intStack) == 0
- }
-
- // This is the pop.
- func (w *writer) popInt() int {
- //get our item
- idx := len(w.intStack) - 1
- i := w.intStack[idx]
- //trim our slice
- w.intStack = w.intStack[:idx]
- return i
- }
-
- // Returns the current position in the emitted code.
- func (w *writer) curPos() int {
- return w.curpos
- }
-
- // Fixes up a jump instruction at the specified offset
- // so that it jumps to the specified jumpDest.
- func (w *writer) patchJump(offset, jumpDest int) {
- w.emitted[offset+1] = jumpDest
- }
-
- // Returns an index in the set table for a charset
- // uses a map to eliminate duplicates.
- func (w *writer) setCode(set *CharSet) int {
- if w.counting {
- return 0
- }
-
- buf := &bytes.Buffer{}
-
- set.mapHashFill(buf)
- hash := buf.String()
- i, ok := w.sethash[hash]
- if !ok {
- i = len(w.sethash)
- w.sethash[hash] = i
- w.settable = append(w.settable, set)
- }
- return i
- }
-
- // Returns an index in the string table for a string.
- // uses a map to eliminate duplicates.
- func (w *writer) stringCode(str []rune) int {
- if w.counting {
- return 0
- }
-
- hash := string(str)
- i, ok := w.stringhash[hash]
- if !ok {
- i = len(w.stringhash)
- w.stringhash[hash] = i
- w.stringtable = append(w.stringtable, str)
- }
-
- return i
- }
-
- // When generating code on a regex that uses a sparse set
- // of capture slots, we hash them to a dense set of indices
- // for an array of capture slots. Instead of doing the hash
- // at match time, it's done at compile time, here.
- func (w *writer) mapCapnum(capnum int) int {
- if capnum == -1 {
- return -1
- }
-
- if w.caps != nil {
- return w.caps[capnum]
- }
-
- return capnum
- }
-
- // Emits a zero-argument operation. Note that the emit
- // functions all run in two modes: they can emit code, or
- // they can just count the size of the code.
- func (w *writer) emit(op InstOp) {
- if w.counting {
- w.count++
- if opcodeBacktracks(op) {
- w.trackcount++
- }
- return
- }
- w.emitted[w.curpos] = int(op)
- w.curpos++
- }
-
- // Emits a one-argument operation.
- func (w *writer) emit1(op InstOp, opd1 int) {
- if w.counting {
- w.count += 2
- if opcodeBacktracks(op) {
- w.trackcount++
- }
- return
- }
- w.emitted[w.curpos] = int(op)
- w.curpos++
- w.emitted[w.curpos] = opd1
- w.curpos++
- }
-
- // Emits a two-argument operation.
- func (w *writer) emit2(op InstOp, opd1, opd2 int) {
- if w.counting {
- w.count += 3
- if opcodeBacktracks(op) {
- w.trackcount++
- }
- return
- }
- w.emitted[w.curpos] = int(op)
- w.curpos++
- w.emitted[w.curpos] = opd1
- w.curpos++
- w.emitted[w.curpos] = opd2
- w.curpos++
- }
|