v0.0.1

分支 (1)

标签 (1)

管理

管理

master

v0.0.1

rsc
/
regexp
/
regmerge
/
match.go

// Copyright 2011 The Go Authors.  All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Copied from code.google.com/p/codesearch/regexp/copy.go
// and adapted for the problem of finding a matching string, not
// testing whether a particular string matches.

package main

import (
	"encoding/binary"
	"errors"
	"fmt"
	"hash/fnv"
	"hash"
	"log"
	"regexp/syntax"
)

// A matcher holds the state for running regular expression search.
type matcher struct {
	buf []byte
	prog      *syntax.Prog       // compiled program
	dstate    map[uint32]*dstate // dstate cache
	start     *dstate            // start state
	startLine *dstate            // start state for beginning of line
	z1, z2, z3    nstate             // three temporary nstates
	ids []int
	numState int
	maxState int
	numByte int
	numMatch int
	undo [256]byte
	all *dstate
	allTail **dstate
	h hash.Hash32
}

// An nstate corresponds to an NFA state.
type nstate struct {
	q       Set // queue of program instructions
	flag    flags      // flags (TODO)
	needFlag syntax.EmptyOp
}

// The flags record state about a position between bytes in the text.
type flags uint32

const (
	flagBOL  flags = 1 << iota // beginning of line
	flagEOL                    // end of line
	flagBOT                    // beginning of text
	flagEOT                    // end of text
	flagWord                   // last byte was word byte
)

// A dstate corresponds to a DFA state.
type dstate struct {
	enc      string       // encoded nstate
	nextAll *dstate
	nextHash *dstate
	prev *dstate
	prevByte int
	done bool
}

func (z *nstate) String() string {
	return fmt.Sprintf("%v/%#x+%#x", z.q.Dense(), z.flag, z.needFlag)
}

// enc encodes z as a string.
func (m *matcher) enc(z *nstate) []byte {
	buf := m.buf[:0]
	buf = append(buf, byte(z.needFlag), byte(z.flag))
	ids := m.ids[:0]
	for _, id := range z.q.Dense() {
		ids = append(ids, int(id))
	}
	sortInts(ids)
	last := ^uint32(0)
	for _, id := range ids {
		x := uint32(id)-last
		last = uint32(id)
		for x >= 0x80 {
			buf = append(buf, byte(x)|0x80)
			x >>= 7
		}
		buf = append(buf, byte(x))
	}
	m.buf = buf
	return buf
}

// dec decodes the encoding s into z.
func (m *matcher) dec(z *nstate, s string) {
	b := append(m.buf[:0], s...)
	m.buf = b
	z.needFlag = syntax.EmptyOp(b[0])
	b = b[1:]
	i, n := binary.Uvarint(b)
	if n <= 0 {
		bug()
	}
	b = b[n:]
	z.flag = flags(i)
	z.q.Reset()
	last := ^uint32(0)
	for len(b) > 0 {
		i, n = binary.Uvarint(b)
		if n <= 0 {
			bug()
		}
		b = b[n:]
		last += uint32(i)
		z.q.Add(last, 0, 0)
	}
}

// init initializes the matcher.
func (m *matcher) init(prog *syntax.Prog, n int) error {
	m.prog = prog
	m.dstate = make(map[uint32]*dstate)
	m.numMatch = n
	m.maxState = 10
	m.allTail = &m.all
	m.numByte = 256
	for i := range m.undo {
		m.undo[i] = byte(i)
	}
	m.h = fnv.New32()

	m.z1.q.Init(uint32(len(prog.Inst)))
	m.z2.q.Init(uint32(len(prog.Inst)))
	m.z3.q.Init(uint32(len(prog.Inst)))
	m.ids = make([]int, 0, len(prog.Inst))

	m.addq(&m.z1.q, uint32(prog.Start), syntax.EmptyBeginLine|syntax.EmptyBeginText)
	m.z1.flag = flagBOL | flagBOT
	m.start = m.cache(&m.z1, nil, 0)

	m.z1.q.Reset()
	m.addq(&m.z1.q, uint32(prog.Start), syntax.EmptyBeginLine)
	m.z1.flag = flagBOL
	m.startLine = m.cache(&m.z1, nil, 0)

	m.crunchProg()

	return nil
}

// stepEmpty steps runq to nextq expanding according to flag.
func (m *matcher) stepEmpty(runq, nextq *Set, flag syntax.EmptyOp) {
	nextq.Reset()
	for _, id := range runq.Dense() {
		m.addq(nextq, id, flag)
	}
}

// stepByte steps runq to nextq consuming c and then expanding according to flag.
// It returns true if a match ends immediately before c.
// c is either an input byte or endText.
func (m *matcher) stepByte(runq, nextq *Set, c int, flag syntax.EmptyOp) (match bool) {
	nextq.Reset()
	m.addq(nextq, uint32(m.prog.Start), flag)

	nmatch := 0
	for _, id := range runq.Dense() {
		i := &m.prog.Inst[id]
		switch i.Op {
		default:
			continue
		case syntax.InstMatch:
			nmatch++
			continue
		case instByteRange:
			if c == endText {
				break
			}
			lo := int((i.Arg >> 8) & 0xFF)
			hi := int(i.Arg & 0xFF)
			if i.Arg&argFold != 0 && 'a' <= c && c <= 'z' {
				c += 'A' - 'a'
			}
			if lo <= c && c <= hi {
				m.addq(nextq, i.Out, flag)
			}
		}
	}
	return nmatch == m.numMatch
}

// addq adds id to the queue, expanding according to flag.
func (m *matcher) addq(q *Set, id uint32, flag syntax.EmptyOp) {
	if q.Has(id, 0) {
		return
	}
	q.MustAdd(id)
	i := &m.prog.Inst[id]
	switch i.Op {
	case syntax.InstCapture, syntax.InstNop:
		m.addq(q, i.Out, flag)
	case syntax.InstAlt, syntax.InstAltMatch:
		m.addq(q, i.Out, flag)
		m.addq(q, i.Arg, flag)
	case syntax.InstEmptyWidth:
		if syntax.EmptyOp(i.Arg)&^flag == 0 {
			m.addq(q, i.Out, flag)
		}
	}
}

const endText = -1

// computeNext computes the next DFA state if we're in d reading c (an input byte or endText).
func (m *matcher) computeNext(this, next *nstate, d *dstate, c int) bool {
	// compute flags in effect before c
	flag := syntax.EmptyOp(0)
	if this.flag&flagBOL != 0 {
		flag |= syntax.EmptyBeginLine
	}
	if this.flag&flagBOT != 0 {
		flag |= syntax.EmptyBeginText
	}
	if this.flag&flagWord != 0 {
		if !isWordByte(c) {
			flag |= syntax.EmptyWordBoundary
		} else {
			flag |= syntax.EmptyNoWordBoundary
		}
	} else {
		if isWordByte(c) {
			flag |= syntax.EmptyWordBoundary
		} else {
			flag |= syntax.EmptyNoWordBoundary
		}
	}
	if c == '\n' {
		flag |= syntax.EmptyEndLine
	}
	if c == endText {
		flag |= syntax.EmptyEndLine | syntax.EmptyEndText
	}

	if flag &= this.needFlag; flag != 0 {
		// re-expand queue using new flags.
		// TODO: only do this when it matters
		// (something is gating on word boundaries).
		m.stepEmpty(&this.q, &next.q, flag)
		this, next = next, &m.z3
	}

	// now compute flags after c.
	flag = 0
	next.flag = 0
	if c == '\n' {
		flag |= syntax.EmptyBeginLine
		next.flag |= flagBOL
	}
	if isWordByte(c) {
		next.flag |= flagWord
	}

	// re-add start, process rune + expand according to flags.
	if m.stepByte(&this.q, &next.q, c, flag) {
		return true
	}
	next.needFlag = m.queueFlag(&next.q)
	if next.needFlag&syntax.EmptyBeginLine == 0 {
		next.flag &^= flagBOL
	}
	if next.needFlag&(syntax.EmptyWordBoundary|syntax.EmptyNoWordBoundary) == 0{
		next.flag &^= flagWord
	}

	m.cache(next, d, c)
	return false
}

func (m *matcher) queueFlag(runq *Set) syntax.EmptyOp {
	var e uint32
	for _, id := range runq.Dense() {
		i := &m.prog.Inst[id]
		if i.Op == syntax.InstEmptyWidth {
			e |= i.Arg
		}
	}
	return syntax.EmptyOp(e)
}

func (m *matcher) hash(enc []byte) uint32 {
	m.h.Reset()
	m.h.Write(enc)
	return m.h.Sum32()
}

func (m *matcher) find(h uint32, enc []byte) *dstate {
Search:
	for d := m.dstate[h]; d!=nil; d=d.nextHash {
		s := d.enc
		if len(s) != len(enc) {
			continue Search
		}
		for i, b := range enc {
			if s[i] != b {
				continue Search
			}
		}
		return d
	}
	return nil
}

func (m *matcher) cache(z *nstate, prev *dstate, prevByte int) *dstate {
	enc := m.enc(z)
	h := m.hash(enc)
	d := m.find(h, enc)
	if d != nil {
		return d
	}

	if m.numState >= m.maxState {
		panic(ErrMemory)
	}
	m.numState++
	d = &dstate{
		enc: string(enc),
		prev: prev,
		prevByte: prevByte,
		nextHash: m.dstate[h],
	}
	m.dstate[h] = d
	*m.allTail = d
	m.allTail = &d.nextAll

	return d
}

// isWordByte reports whether the byte c is a word character: ASCII only.
// This is used to implement \b and \B.  This is not right for Unicode, but:
//	- it's hard to get right in a byte-at-a-time matching world
//	  (the DFA has only one-byte lookahead)
//	- this crude approximation is the same one PCRE uses
func isWordByte(c int) bool {
	return 'A' <= c && c <= 'Z' ||
		'a' <= c && c <= 'z' ||
		'0' <= c && c <= '9' ||
		c == '_'
}

var ErrNoMatch = errors.New("no matching strings")
var ErrMemory = errors.New("exhausted memory")

func (m *matcher) findMatch(maxState int) (s string, err error) {
	defer func() {
		switch r := recover().(type) {
		case nil:
			return
		case error:
			err = r
			return
		default:
			panic(r)
		}
	}()

	m.maxState = maxState
	numState := 0
	var d *dstate
	var c int
	for d = m.all; d != nil; d = d.nextAll {
		numState++
		if d.done {
			continue
		}
		this, next := &m.z1, &m.z2
		m.dec(this, d.enc)
		if m.computeNext(this, next, d, endText) {
			c = endText
			goto Found
		}
		for _, cb := range m.undo[:m.numByte] {
			if m.computeNext(this, next, d, int(cb)) {
				c = int(cb)
				goto Found
			}
		}
		d.done = true
	}
	log.Printf("searched %d states; queued %d states", numState, m.numState)
	return "", ErrNoMatch

Found:
	var buf []byte
	if c >= 0 {
		buf = append(buf, byte(c))
	}
	for d1 := d; d1.prev != nil; d1= d1.prev {
		buf = append(buf, byte(d1.prevByte))
	}
	for i, j := 0, len(buf)-1; i < j; i, j = i+1, j-1 {
		buf[i], buf[j] = buf[j], buf[i]
	}
	log.Printf("searched %d states; queued %d states", numState, m.numState)
	return string(buf), nil
}