feat: rewrote glob to have a full doublestar impl
All checks were successful
default / ensure tests work (push) Successful in 35s

This implements a full double-star glob implementation with its own filesystem implementation.
This commit is contained in:
Louis Seubert 2026-01-25 17:22:16 +01:00
commit 24ad00274d
Signed by: louis9902
GPG key ID: 4B9DB28F826553BD
7 changed files with 1354 additions and 333 deletions

485
glob/glob.go Normal file
View file

@ -0,0 +1,485 @@
// Package glob implements a language for specifying glob patterns for path
// names starting at some root. The language does not follow the specs from
// filepath.Match but provides a superset which allows for directory
// wildcards.
//
// Patterns consist of normal characters, non-separator wildcards '*' and '?',
// separators '/' and directory wildcards '**'.
//
// A somewhat formal grammar can be given as:
//
// pattern = term, { '/', term };
// term = '**' | name;
// name = { charSpecial | group | escapedChar | '*' | '?' };
// charSpecial = (* any unicode rune except '/', '*', '?', '[' and '\' *);
// char = (* any unicode rune *);
// escapedChar = '\\', char;
// group = '[', [ '^' ] { escapedChar | groupChar | range } ']';
// groupChar = (* any unicode rune except '-' and ']' *);
// range = ( groupChar | escapedChar ), '-', (groupChar | escapedChar);
//
// The format operators have the following meaning:
//
// - any character (rune) matches exactly this rune - with the following
// exceptions
// - `/` works as a directory separator. It matches directory boundaries of the
// underlying system independently of the separator char used by the OS.
// - `?` matches exactly one non-separator char
// - `*` matches any number of non-separator chars - including zero
// - `\` escapes a character's special meaning allowing `*` and `?` to be used
// as regular characters.
// - `**` matches any number of nested directories. If anything is matched it
// always extends until a separator or the end of the name.
// - Groups can be defined using the `[` and `]` characters. Inside a group the
// special meaning of the characters mentioned before is disabled but the
// following rules apply
// - any character used as part of the group acts as a choice to pick from
// - if the group's first character is a `^` the whole group is negated
// - a range can be defined using `-` matching any rune between low and high
// inclusive
// - Multiple ranges can be given. Ranges can be combined with choices.
// - The meaning of `-` and `]` can be escaped using `\`
package glob
import (
"errors"
"fmt"
"io/fs"
"strings"
"unicode/utf8"
)
const (
// Separator defines the path separator to use in patterns. This is always
// a forward slash, independent of the underlying OS's separator.
Separator = '/'
// SingleWildcard defines the wildcard operator matching exactly one
// non-separator character.
SingleWildcard = '?'
// AnyWildcard defines the wildcard operator matching any number of
// non-separator characters.
AnyWildcard = '*'
// Backslash escapes the next character's special meaning.
Backslash = '\\'
// GroupStart starts a group.
GroupStart = '['
// GroupEnd ends a group.
GroupEnd = ']'
// GroupNegate when used as the first character of a group negates the group.
GroupNegate = '^'
// Range defines the range operator used between two runes inside a group.
Range = '-'
)
var (
// ErrBadPattern is the sentinel wrapped by every error New returns for an
// invalid pattern. The returned errors carry additional detail, so compare
// with errors.Is rather than ==.
ErrBadPattern = errors.New("bad pattern")
)
// Pattern defines a glob pattern prepared ahead of time which can be used to
// match filenames. Pattern is safe to use concurrently.
type Pattern struct {
// tokens is the compiled form of the pattern, in pattern order, as
// produced by New.
tokens []token
}
// New compiles pat into a Pattern.
//
// The pattern is scanned rune by rune and translated into tokens:
//   - '/' becomes a separator literal; two consecutive separators are rejected.
//   - '?' becomes a single-rune wildcard; it must not directly follow '*'.
//   - '*' becomes an any-runes wildcard; it must not directly follow '?'.
//   - "**" becomes a directory wildcard and must be directly followed by '/'.
//   - '\' turns the rune that follows it into a literal.
//   - '[' starts a group, which is parsed by parseGroup; a ']' outside of a
//     group is rejected.
//
// Every other rune becomes a literal. An empty pat yields a Pattern without
// tokens. All errors returned wrap ErrBadPattern.
func New(pat string) (*Pattern, error) {
	var tokens []token

	p := pat
	for len(p) > 0 {
		// prev is the previously emitted token. Its zero value (no token yet)
		// compares unequal to every real token type and to Separator.
		var prev token
		if len(tokens) > 0 {
			prev = tokens[len(tokens)-1]
		}

		r, l := utf8.DecodeRuneInString(p)
		var t token
		switch r {
		case Separator:
			if prev.r == Separator {
				return nil, fmt.Errorf("%w: unexpected //", ErrBadPattern)
			}
			t = token{t: tokenTypeLiteral, r: Separator}

		case SingleWildcard:
			if prev.t == tokenTypeAnyRunes || prev.t == tokenTypeAnyDirectories {
				return nil, fmt.Errorf("%w: unexpected ?", ErrBadPattern)
			}
			t = token{t: tokenTypeSingleRune}

		case AnyWildcard:
			if prev.t == tokenTypeSingleRune || prev.t == tokenTypeAnyDirectories {
				return nil, fmt.Errorf("%w: unexpected *", ErrBadPattern)
			}
			t = token{t: tokenTypeAnyRunes}

			// A second '*' upgrades the token to a directory wildcard, which
			// is only valid when a separator follows.
			rest := p[l:]
			if n, nl := utf8.DecodeRuneInString(rest); n == AnyWildcard {
				after := rest[nl:]
				// Test the length instead of comparing against
				// utf8.RuneError so that an invalid byte after "**" is not
				// misreported as the end of the pattern.
				if len(after) == 0 {
					return nil, fmt.Errorf("%w: unexpected end of pattern after **", ErrBadPattern)
				}
				if d, _ := utf8.DecodeRuneInString(after); d != Separator {
					return nil, fmt.Errorf("%w: unexpected %c after **", ErrBadPattern, d)
				}
				t.t = tokenTypeAnyDirectories
				l += nl
			}

		case Backslash:
			rest := p[l:]
			if len(rest) == 0 {
				return nil, fmt.Errorf("%w: no character given after \\", ErrBadPattern)
			}
			esc, el := utf8.DecodeRuneInString(rest)
			t = token{t: tokenTypeLiteral, r: esc}
			// Consume the backslash and the escaped rune together.
			l += el

		case GroupStart:
			var err error
			t, l, err = parseGroup(p)
			if err != nil {
				return nil, err
			}

		case GroupEnd:
			return nil, fmt.Errorf("%w: using ] w/o [", ErrBadPattern)

		default:
			t = token{t: tokenTypeLiteral, r: r}
		}

		tokens = append(tokens, t)
		p = p[l:]
	}

	return &Pattern{tokens: tokens}, nil
}
// String reconstructs a glob pattern from the compiled tokens.
//
// The text is not necessarily identical to what was passed to New: inside a
// group the individually listed runes are written first, followed by the
// ranges. Runes that would otherwise be read as operators are escaped with a
// backslash, including the backslash itself, so the result is again a valid
// pattern for New.
func (pat *Pattern) String() string {
	var b strings.Builder

	// writeMember emits one rune of a group, escaping the runes that carry a
	// meaning inside of groups. GroupNegate is escaped unconditionally so it
	// can never be mistaken for the negation marker when it is written first.
	writeMember := func(r rune) {
		switch r {
		case GroupEnd, GroupNegate, Range, Backslash:
			b.WriteRune(Backslash)
		}
		b.WriteRune(r)
	}

	for _, t := range pat.tokens {
		switch t.t {
		case tokenTypeLiteral:
			switch t.r {
			case GroupStart, GroupEnd, GroupNegate, AnyWildcard, SingleWildcard, Range, Backslash:
				b.WriteRune(Backslash)
			}
			b.WriteRune(t.r)
		case tokenTypeSingleRune:
			b.WriteRune(SingleWildcard)
		case tokenTypeAnyRunes:
			b.WriteRune(AnyWildcard)
		case tokenTypeAnyDirectories:
			// The separator following "**" is a token of its own.
			b.WriteString("**")
		case tokenTypeGroup:
			b.WriteRune(GroupStart)
			if t.g.neg {
				b.WriteRune(GroupNegate)
			}
			for _, r := range t.g.runes {
				writeMember(r)
			}
			for _, rg := range t.g.ranges {
				writeMember(rg.lo)
				b.WriteRune(Range)
				writeMember(rg.hi)
			}
			b.WriteRune(GroupEnd)
		}
	}
	return b.String()
}
// GoString returns the same text as String. Its signature matches
// fmt.GoStringer, so formatting a *Pattern with %#v prints the pattern text.
func (pat *Pattern) GoString() string {
return pat.String()
}
// Match reports whether the path name f is matched by the whole compiled
// pattern. It runs match with prefix matching disabled.
func (pat *Pattern) Match(f string) bool {
return match(f, pat.tokens, false)
}
// MatchPrefix reports whether the path name f is completely matched by some
// prefix of the compiled pattern. It runs match with prefix matching enabled.
// GlobFS calls it on directories to decide whether to descend into them.
func (pat *Pattern) MatchPrefix(f string) bool {
return match(f, pat.tokens, true)
}
// GlobFS walks the file tree of fsys rooted at root and returns the path names
// of all non-directory entries matched by pat.
//
// Path names are matched and returned relative to root: they use '/' as the
// separator and carry no leading separator, regardless of whether root is "."
// or a subdirectory. Directories for which MatchPrefix reports false are not
// descended into.
//
// GlobFS uses fs.WalkDir internally and all constraints given for that
// function apply. An error reported during the walk aborts it and is returned
// together with the results collected so far. The returned slice is never nil.
func (pat *Pattern) GlobFS(fsys fs.FS, root string) ([]string, error) {
	results := make([]string, 0)
	err := fs.WalkDir(fsys, root, func(p string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		if p == "." {
			return nil
		}
		if root != "." && root != "" {
			// fs.WalkDir reports paths as root or root + "/" + rest. Strip
			// root together with the joining separator; leaving the separator
			// in place would make every path start with '/' and never match
			// a pattern that matches the same tree walked from ".".
			p = strings.TrimPrefix(strings.TrimPrefix(p, root), "/")
		}
		if d.IsDir() {
			if !pat.MatchPrefix(p) {
				return fs.SkipDir
			}
			return nil
		}
		if pat.Match(p) {
			results = append(results, p)
		}
		return nil
	})
	return results, err
}
// parseGroup parses the rune group at the start of p. The caller has already
// verified that p begins with GroupStart.
//
// It returns the group token, the number of bytes of p read so far (on
// success this includes the closing GroupEnd) and an error wrapping
// ErrBadPattern when the group is malformed.
func parseGroup(p string) (token, int, error) {
// re-read the [. No need to assert the rune here as it has been
// done in the main parsing loop.
_, le := utf8.DecodeRuneInString(p)
t := token{
t: tokenTypeGroup,
g: runeGroup{},
}
// initialLen is the offset right after GroupStart; it is used to recognise
// GroupNegate in the first position only.
initialLen := le
// start holds the most recently read member that has not been stored yet.
// It is held back because a following Range turns it into the low end of a
// range. The zero rune doubles as "nothing pending".
var start rune
for {
if len(p[le:]) == 0 {
return t, le, fmt.Errorf("%w: missing %c", ErrBadPattern, GroupEnd)
}
r, l := utf8.DecodeRuneInString(p[le:])
le += l
// le-l is the offset r was read from, so this only triggers for the
// rune directly following GroupStart.
if initialLen == le-l && r == GroupNegate {
t.g.neg = true
continue
}
switch r {
case GroupEnd:
// Flush the pending member before closing the group.
if start != 0 {
t.g.runes = append(t.g.runes, start)
}
return t, le, nil
case Range:
// A range needs a pending member as its low end.
if start == 0 {
return t, le, fmt.Errorf("%w: missing start for character range", ErrBadPattern)
}
if len(p[le:]) == 0 {
return t, le, fmt.Errorf("%w: missing range end", ErrBadPattern)
}
// Read the high end of the range.
r, l = utf8.DecodeRuneInString(p[le:])
le += l
switch r {
case GroupEnd:
return t, le, fmt.Errorf("%w: unterminated range", ErrBadPattern)
case Backslash:
// An escaped high end: take the rune after the backslash verbatim.
if len(p[le:]) == 0 {
return t, le, fmt.Errorf("%w: missing character after \\", ErrBadPattern)
}
r, l = utf8.DecodeRuneInString(p[le:])
le += l
fallthrough
default:
// The pending member has been consumed as the low end.
t.g.ranges = append(t.g.ranges, runeRange{start, r})
start = 0
}
case Backslash:
// An escaped member: take the rune after the backslash verbatim and
// handle it like any other member below.
if len(p[le:]) == 0 {
return t, le, fmt.Errorf("%w: missing character after \\", ErrBadPattern)
}
r, l = utf8.DecodeRuneInString(p[le:])
le += l
fallthrough
default:
// Store the previously pending member and keep r pending instead.
if start != 0 {
t.g.runes = append(t.g.runes, start)
}
start = r
}
}
}
// match implements a simple recursive backtracking algorithm that matches the
// file path f against the token list t.
//
// If matchPrefix is false, match returns true only if f is matched by all
// tokens in t. If matchPrefix is true, match returns true if f is completely
// matched by some prefix of t; this is used to decide whether a directory f
// can still contain matching paths. In particular, once a directory wildcard
// is reached and f holds no further separator, prefix matching succeeds,
// because the wildcard may stand for any number of directories below f.
//
// A tokenTypeAnyDirectories token is expected to be directly followed by a
// separator token (New guarantees this), which is why both are skipped
// together using t[2:].
func match(f string, t []token, matchPrefix bool) bool {
	for {
		if len(f) == 0 {
			if matchPrefix {
				return true
			}
			// All tokens are used up, or only a trailing '*' is left which
			// may match zero runes.
			return len(t) == 0 || (len(t) == 1 && t[0].t == tokenTypeAnyRunes)
		}
		if len(t) == 0 {
			return false
		}

		r, le := utf8.DecodeRuneInString(f)
		switch t[0].t {
		case tokenTypeLiteral:
			if t[0].r != r {
				return false
			}
		case tokenTypeGroup:
			if !t[0].g.match(r) {
				return false
			}
		case tokenTypeSingleRune:
			if r == Separator {
				return false
			}
		case tokenTypeAnyRunes:
			// '*' never crosses a separator: it has to end here.
			if r == Separator {
				return match(f, t[1:], matchPrefix)
			}
			// Let '*' take r and stay active ...
			if match(f[le:], t, matchPrefix) {
				return true
			}
			// ... or let '*' match nothing.
			if match(f, t[1:], matchPrefix) {
				return true
			}
		case tokenTypeAnyDirectories:
			// "**/" matching zero directories.
			if match(f, t[2:], matchPrefix) {
				return true
			}
			// Skip one path element: look for the next separator behind the
			// first rune of f.
			i := strings.IndexByte(f[le:], Separator)
			if i < 0 {
				// f is a single path element that the remaining pattern does
				// not match. As a complete path that is a mismatch, but as a
				// directory it may still contain matches below it.
				return matchPrefix
			}
			next := f[le+i+1:]
			if match(next, t[2:], matchPrefix) {
				return true
			}
			return match(next, t, matchPrefix)
		}
		t = t[1:]
		f = f[le:]
	}
}
// tokenType enumerates the different types of tokens. The values start at 1,
// so the zero value does not denote any token type.
type tokenType int
const (
// a rune literal
tokenTypeLiteral tokenType = iota + 1
// any single non-separator rune
tokenTypeSingleRune
// any number of non-separator runes (incl. zero)
tokenTypeAnyRunes
// any number of runes including separators. Matches whole directories.
tokenTypeAnyDirectories
// a group of runes consisting of named runes and/or ranges. Might be negated.
tokenTypeGroup
)
// token implements a single token in the pattern.
type token struct {
// the token's type
t tokenType
// a literal rune to match. Literal runes are stored separate from groups
// to improve matching performance.
r rune
// A rune group to match.
g runeGroup
}
// runeGroup is a set of runes described by individually listed runes and by
// rune ranges. The whole set can be inverted.
type runeGroup struct {
	// neg inverts the outcome of match when set.
	neg bool
	// runes holds the individually listed members.
	runes []rune
	// ranges holds the ranges whose runes are members.
	ranges []runeRange
}

// match reports whether g accepts r: r has to be a listed rune or fall into
// one of the ranges, and the outcome is flipped for a negated group.
func (g runeGroup) match(r rune) bool {
	member := false
	for i := 0; i < len(g.runes) && !member; i++ {
		member = g.runes[i] == r
	}
	for i := 0; i < len(g.ranges) && !member; i++ {
		member = g.ranges[i].match(r)
	}
	// Membership and negation cancel each other out.
	return member != g.neg
}
// runeRange is a closed interval of runes: both lo and hi belong to it.
type runeRange struct {
	lo, hi rune
}

// match reports whether r lies within rg, bounds included.
func (rg runeRange) match(r rune) bool {
	if r < rg.lo {
		return false
	}
	return r <= rg.hi
}