feat: rewrote glob to have a full doublestar impl
All checks were successful
default / ensure tests work (push) Successful in 35s
All checks were successful
default / ensure tests work (push) Successful in 35s
This implements a full double star glob implementation with its own filesystem implementation.
This commit is contained in:
parent
3daeb3fc05
commit
24ad00274d
7 changed files with 1354 additions and 333 deletions
485
glob/glob.go
Normal file
485
glob/glob.go
Normal file
|
|
@ -0,0 +1,485 @@
|
|||
// Package glob implements a language for specifying glob patterns for path
|
||||
// names starting at some root. The language does not follow the specs from
|
||||
// filepath.Match but provides a superset which allows for directory
|
||||
// wildcards.
|
||||
//
|
||||
// Patterns consist of normal characters, non-separator wildcards '*' and '?',
|
||||
// separators '/' and directory wildcards '**'.
|
||||
//
|
||||
// A somewhat formal grammar can be given as:
|
||||
//
|
||||
// pattern = term, { '/', term };
|
||||
// term = '**' | name;
|
||||
// name = { charSpecial | group | escapedChar | '*' | '?' };
|
||||
// charSpecial = (* any unicode rune except '/', '*', '?', '[' and '\' *);
|
||||
// char = (* any unicode rune *);
|
||||
// escapedChar = '\\', char;
|
||||
// group = '[', [ '^' ] { escapedChar | groupChar | range } ']';
|
||||
// groupChar = (* any unicode rune except '-' and ']' *);
|
||||
// range = ( groupChar | escapedChar ), '-', (groupChar | escapedChar);
|
||||
//
|
||||
// The format operators have the following meaning:
|
||||
//
|
||||
// - any character (rune) matches exactly this rune - with the following
|
||||
// exceptions
|
||||
// - `/` works as a directory separator. It matches directory boundaries of the
|
||||
// underlying system independently of the separator char used by the OS.
|
||||
// - `?` matches exactly one non-separator char
|
||||
// - `*` matches any number of non-separator chars - including zero
|
||||
// - `\` escapes a character's special meaning allowing `*` and `?` to be used
|
||||
// as regular characters.
|
||||
// - `**` matches any number of nested directories. If anything is matched it
|
||||
// always extends until a separator or the end of the name.
|
||||
// - Groups can be defined using the `[` and `]` characters. Inside a group the
|
||||
// special meaning of the characters mentioned before is disabled but the
|
||||
// following rules apply
|
||||
// - any character used as part of the group acts as a choice to pick from
|
||||
// - if the group's first character is a `^` the whole group is negated
|
||||
// - a range can be defined using `-` matching any rune between low and high
|
||||
// inclusive
|
||||
// - Multiple ranges can be given. Ranges can be combined with choices.
|
||||
// - The meaning of `-` and `]` can be escaped using `\`
|
||||
package glob
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// Runes that carry a special meaning in the pattern language.
const (
	// Separator is the path separator used in patterns. Patterns always use
	// a forward slash, no matter which separator the underlying OS uses.
	Separator = '/'
	// SingleWildcard is the wildcard operator standing for a single
	// non-separator character.
	SingleWildcard = '?'
	// AnyWildcard is the wildcard operator standing for any number of
	// non-separator characters.
	AnyWildcard = '*'
	// Backslash removes the special meaning of the character following it.
	Backslash = '\\'
	// GroupStart opens a character group.
	GroupStart = '['
	// GroupEnd closes a character group.
	GroupEnd = ']'
	// GroupNegate negates a group when it is the group's first character.
	GroupNegate = '^'
	// Range is the operator placed between the two bounds of a character
	// range inside a group.
	Range = '-'
)
|
||||
|
||||
// ErrBadPattern is the sentinel reported when a pattern is invalid. Errors
// wrap this value, so test for it with errors.Is rather than comparing
// directly.
var ErrBadPattern = errors.New("bad pattern")
|
||||
|
||||
// Pattern defines a glob pattern prepared ahead of time which can be used to
|
||||
// match filenames. Pattern is safe to use concurrently.
|
||||
type Pattern struct {
|
||||
tokens []token
|
||||
}
|
||||
|
||||
// New creates a new pattern from pat and returns it. It returns an error
|
||||
// indicating any invalid pattern.
|
||||
func New(pat string) (*Pattern, error) {
|
||||
var tokens []token
|
||||
|
||||
p := pat
|
||||
for {
|
||||
if len(p) == 0 {
|
||||
return &Pattern{tokens: tokens}, nil
|
||||
}
|
||||
|
||||
r, l := utf8.DecodeRuneInString(p)
|
||||
|
||||
var t token
|
||||
switch r {
|
||||
case Separator:
|
||||
if len(tokens) > 0 && tokens[len(tokens)-1].r == Separator {
|
||||
return nil, fmt.Errorf("%w: unexpected //", ErrBadPattern)
|
||||
}
|
||||
t = token{tokenTypeLiteral, Separator, runeGroup{}}
|
||||
|
||||
case SingleWildcard:
|
||||
if len(tokens) > 0 && (tokens[len(tokens)-1].t == tokenTypeAnyRunes || tokens[len(tokens)-1].t == tokenTypeAnyDirectories) {
|
||||
return nil, fmt.Errorf("%w: unexpected ?", ErrBadPattern)
|
||||
}
|
||||
t = token{tokenTypeSingleRune, 0, runeGroup{}}
|
||||
|
||||
case AnyWildcard:
|
||||
if len(tokens) > 0 && (tokens[len(tokens)-1].t == tokenTypeSingleRune || tokens[len(tokens)-1].t == tokenTypeAnyDirectories) {
|
||||
return nil, fmt.Errorf("%w: unexpected ?", ErrBadPattern)
|
||||
}
|
||||
|
||||
t = token{tokenTypeAnyRunes, 0, runeGroup{}}
|
||||
|
||||
if len(p[l:]) > 0 {
|
||||
n, nl := utf8.DecodeRuneInString(p[l:])
|
||||
if n == AnyWildcard {
|
||||
d, _ := utf8.DecodeRuneInString(p[l+nl:])
|
||||
if d == utf8.RuneError {
|
||||
return nil, fmt.Errorf("%w: unexpected end of patterm after **", ErrBadPattern)
|
||||
}
|
||||
if d != Separator {
|
||||
return nil, fmt.Errorf("%w: unexpected %c after **", ErrBadPattern, d)
|
||||
}
|
||||
|
||||
t.t = tokenTypeAnyDirectories
|
||||
l += nl
|
||||
}
|
||||
}
|
||||
|
||||
case Backslash:
|
||||
if len(p[l:]) == 0 {
|
||||
return nil, fmt.Errorf("%w: no character given after \\", ErrBadPattern)
|
||||
}
|
||||
|
||||
p = p[l:]
|
||||
r, l = utf8.DecodeRuneInString(p)
|
||||
|
||||
t = token{tokenTypeLiteral, r, runeGroup{}}
|
||||
|
||||
case GroupStart:
|
||||
var err error
|
||||
t, l, err = parseGroup(p)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
case GroupEnd:
|
||||
return nil, fmt.Errorf("%w: using ] w/o [", ErrBadPattern)
|
||||
|
||||
default:
|
||||
t = token{tokenTypeLiteral, r, runeGroup{}}
|
||||
}
|
||||
|
||||
tokens = append(tokens, t)
|
||||
p = p[l:]
|
||||
}
|
||||
}
|
||||
|
||||
// String reconstructs the glob pattern from the tokens.
|
||||
func (pat *Pattern) String() string {
|
||||
var b strings.Builder
|
||||
for _, t := range pat.tokens {
|
||||
switch t.t {
|
||||
case tokenTypeLiteral:
|
||||
switch t.r {
|
||||
case GroupStart, GroupEnd, GroupNegate, AnyWildcard, SingleWildcard, Range:
|
||||
b.WriteRune(Backslash)
|
||||
fallthrough
|
||||
default:
|
||||
b.WriteRune(t.r)
|
||||
}
|
||||
case tokenTypeSingleRune:
|
||||
b.WriteRune(SingleWildcard)
|
||||
case tokenTypeAnyRunes:
|
||||
b.WriteRune(AnyWildcard)
|
||||
case tokenTypeAnyDirectories:
|
||||
b.WriteString("**")
|
||||
case tokenTypeGroup:
|
||||
b.WriteRune(GroupStart)
|
||||
if t.g.neg {
|
||||
b.WriteRune(GroupNegate)
|
||||
}
|
||||
for _, r := range t.g.runes {
|
||||
b.WriteRune(r)
|
||||
}
|
||||
for _, rg := range t.g.ranges {
|
||||
b.WriteRune(rg.lo)
|
||||
b.WriteRune(Range)
|
||||
b.WriteRune(rg.hi)
|
||||
}
|
||||
b.WriteRune(GroupEnd)
|
||||
}
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func (pat *Pattern) GoString() string {
|
||||
return pat.String()
|
||||
}
|
||||
|
||||
// Match matches a file's path name f to the compiled pattern and returns
|
||||
// whether the path matches the pattern or not.
|
||||
func (pat *Pattern) Match(f string) bool {
|
||||
return match(f, pat.tokens, false)
|
||||
}
|
||||
|
||||
func (pat *Pattern) MatchPrefix(f string) bool {
|
||||
return match(f, pat.tokens, true)
|
||||
}
|
||||
|
||||
// GlobFS applies pat to all files found in fsys under root and returns the
|
||||
// matching path names as a string slice. It uses fs.WalkDir internally and all
|
||||
// constraints given for that function apply to GlobFS.
|
||||
func (pat *Pattern) GlobFS(fsys fs.FS, root string) ([]string, error) {
|
||||
results := make([]string, 0)
|
||||
err := fs.WalkDir(fsys, root, func(p string, d fs.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if p == "." {
|
||||
return nil
|
||||
}
|
||||
|
||||
if root != "." && root != "" {
|
||||
p = strings.Replace(p, root, "", 1)
|
||||
}
|
||||
|
||||
if d.IsDir() {
|
||||
if !pat.MatchPrefix(p) {
|
||||
return fs.SkipDir
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
if pat.Match(p) {
|
||||
results = append(results, p)
|
||||
}
|
||||
|
||||
return nil
|
||||
})
|
||||
|
||||
return results, err
|
||||
}
|
||||
|
||||
func parseGroup(p string) (token, int, error) {
|
||||
// re-read the [. No need to assert the rune here as it has been
|
||||
// done in the main parsing loop.
|
||||
_, le := utf8.DecodeRuneInString(p)
|
||||
t := token{
|
||||
t: tokenTypeGroup,
|
||||
g: runeGroup{},
|
||||
}
|
||||
|
||||
initialLen := le
|
||||
var start rune
|
||||
|
||||
for {
|
||||
if len(p[le:]) == 0 {
|
||||
return t, le, fmt.Errorf("%w: missing %c", ErrBadPattern, GroupEnd)
|
||||
}
|
||||
|
||||
r, l := utf8.DecodeRuneInString(p[le:])
|
||||
le += l
|
||||
|
||||
if initialLen == le-l && r == GroupNegate {
|
||||
t.g.neg = true
|
||||
continue
|
||||
}
|
||||
|
||||
switch r {
|
||||
case GroupEnd:
|
||||
if start != 0 {
|
||||
t.g.runes = append(t.g.runes, start)
|
||||
}
|
||||
|
||||
return t, le, nil
|
||||
|
||||
case Range:
|
||||
if start == 0 {
|
||||
return t, le, fmt.Errorf("%w: missing start for character range", ErrBadPattern)
|
||||
}
|
||||
|
||||
if len(p[le:]) == 0 {
|
||||
return t, le, fmt.Errorf("%w: missing range end", ErrBadPattern)
|
||||
}
|
||||
|
||||
r, l = utf8.DecodeRuneInString(p[le:])
|
||||
le += l
|
||||
|
||||
switch r {
|
||||
case GroupEnd:
|
||||
return t, le, fmt.Errorf("%w: unterminated range", ErrBadPattern)
|
||||
|
||||
case Backslash:
|
||||
if len(p[le:]) == 0 {
|
||||
return t, le, fmt.Errorf("%w: missing character after \\", ErrBadPattern)
|
||||
}
|
||||
r, l = utf8.DecodeRuneInString(p[le:])
|
||||
le += l
|
||||
fallthrough
|
||||
|
||||
default:
|
||||
t.g.ranges = append(t.g.ranges, runeRange{start, r})
|
||||
start = 0
|
||||
}
|
||||
|
||||
case Backslash:
|
||||
if len(p[le:]) == 0 {
|
||||
return t, le, fmt.Errorf("%w: missing character after \\", ErrBadPattern)
|
||||
}
|
||||
|
||||
r, l = utf8.DecodeRuneInString(p[le:])
|
||||
le += l
|
||||
fallthrough
|
||||
|
||||
default:
|
||||
if start != 0 {
|
||||
t.g.runes = append(t.g.runes, start)
|
||||
}
|
||||
start = r
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// match is used internally to implement a simple recursive backtracking
|
||||
// algorithmn using the token list t to match against file path f. If matchPrefix
|
||||
// is set to true, match returns true if f is completely matched by any prefix
|
||||
// of t. Otherwise, match returns true if f is matched by _all_ tokens in t.
|
||||
func match(f string, t []token, matchPrefix bool) bool {
|
||||
for {
|
||||
if len(f) == 0 {
|
||||
if matchPrefix {
|
||||
return true
|
||||
}
|
||||
|
||||
if len(t) == 0 {
|
||||
return true
|
||||
}
|
||||
|
||||
if len(t) == 1 && t[0].t == tokenTypeAnyRunes {
|
||||
return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
if len(t) == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
r, le := utf8.DecodeRuneInString(f)
|
||||
|
||||
switch t[0].t {
|
||||
case tokenTypeLiteral:
|
||||
if t[0].r != r {
|
||||
return false
|
||||
}
|
||||
|
||||
case tokenTypeGroup:
|
||||
if !t[0].g.match(r) {
|
||||
return false
|
||||
}
|
||||
|
||||
case tokenTypeSingleRune:
|
||||
if r == Separator {
|
||||
return false
|
||||
}
|
||||
|
||||
case tokenTypeAnyRunes:
|
||||
if r == Separator {
|
||||
return match(f, t[1:], matchPrefix)
|
||||
}
|
||||
|
||||
if match(f[le:], t, matchPrefix) {
|
||||
return true
|
||||
}
|
||||
|
||||
if match(f, t[1:], matchPrefix) {
|
||||
return true
|
||||
}
|
||||
|
||||
case tokenTypeAnyDirectories:
|
||||
if match(f, t[2:], matchPrefix) {
|
||||
return true
|
||||
}
|
||||
|
||||
var l2 int
|
||||
for {
|
||||
if len(f[le+l2:]) == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
n, nl := utf8.DecodeRuneInString(f[le+l2:])
|
||||
l2 += nl
|
||||
|
||||
if n == Separator {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if match(f[le+l2:], t[2:], matchPrefix) {
|
||||
return true
|
||||
}
|
||||
|
||||
return match(f[le+l2:], t, matchPrefix)
|
||||
}
|
||||
|
||||
t = t[1:]
|
||||
f = f[le:]
|
||||
}
|
||||
}
|
||||
|
||||
// tokenType identifies the kind of a token.
type tokenType int

// The token kinds. Numbering starts at one, so the zero value of tokenType
// does not name any kind.
const (
	// tokenTypeLiteral is a literal rune.
	tokenTypeLiteral tokenType = 1 + iota
	// tokenTypeSingleRune is any single non-separator rune.
	tokenTypeSingleRune
	// tokenTypeAnyRunes is any number of non-separator runes, including none.
	tokenTypeAnyRunes
	// tokenTypeAnyDirectories is any number of runes including separators. It
	// matches whole directories.
	tokenTypeAnyDirectories
	// tokenTypeGroup is a group of runes made up of named runes and/or
	// ranges. The group might be negated.
	tokenTypeGroup
)
|
||||
|
||||
// token implements a single token in the pattern.
|
||||
type token struct {
|
||||
// the token's type
|
||||
t tokenType
|
||||
// a literal rune to matche. Literal runes are stored separate from groups
|
||||
// to improve matching performance.
|
||||
r rune
|
||||
// A rune group to match.
|
||||
g runeGroup
|
||||
}
|
||||
|
||||
// runeGroup is a set of runes given as any number of enumerated runes and
// rune ranges. The whole set can be negated.
type runeGroup struct {
	// neg inverts the result of match when set.
	neg bool
	// runes lists the individually enumerated members.
	runes []rune
	// ranges lists the member ranges.
	ranges []runeRange
}

// match reports whether r is matched by g: r must be enumerated or lie in one
// of the ranges, and the outcome is inverted for a negated group.
func (g runeGroup) match(r rune) bool {
	listed := false

	for i := 0; i < len(g.runes) && !listed; i++ {
		listed = g.runes[i] == r
	}

	for i := 0; i < len(g.ranges) && !listed; i++ {
		listed = g.ranges[i].match(r)
	}

	// Membership and negation cancel each other out.
	return listed != g.neg
}

// runeRange is a closed range of runes: it contains every rune from lo to hi,
// both bounds included.
type runeRange struct {
	lo rune
	hi rune
}

// match reports whether r lies within rg.
func (rg runeRange) match(r rune) bool {
	return r >= rg.lo && r <= rg.hi
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue