feat: rewrote glob to have a full doublestar impl

This implements a full double star glob implementation with it's own filesystem implementation.
2026-01-25 17:22:16 +01:00 · 2026-01-25 17:22:16 +01:00 · 24ad00274d
commit 24ad00274d
parent 3daeb3fc05
7 changed files with 1354 additions and 333 deletions
--- a/glob/glob.go
+++ b/glob/glob.go
@ -0,0 +1,485 @@
+// Package glob implements a language for specifying glob patterns for path
+// names starting at some root. The language does not follow the specs from
+// filepath.Match but provides a superset which allows for directory
+// wildcards.
+//
+// Patterns consist of normal characters, non-separator wildcards '*' and '?',
+// separators '/' and directory wildcards '**'.
+//
+// A somewhat formal grammer can be given as:
+//
+//	pattern = term, { '/', term };
+//	term        = '**' | name;
+//	name        = { charSpecial | group | escapedChar | '*' | '?' };
+//	charSpecial = (* any unicode rune except '/', '*', '?', '[' and '\' *);
+//	char        = (* any unicode rune *);
+//	escapedChar = '\\', char;
+//	group       = '[', [ '^' ] { escapedChar | groupChar | range } ']';
+//	groupChar   = (* any unicode rune except '-' and ']' *);
+//	range       = ( groupChar | escapedChar ), '-', (groupChar | escapedChar);
+//
+// The format operators have the following meaning:
+//
+//   - any character (rune) matches the exactly this rune - with the following
+//     exceptions
+//   - `/` works as a directory separator. It matches directory boundarys of the
+//     underlying system independently of the separator char used by the OS.
+//   - `?` matches exactly one non-separator char
+//   - `*` matches any number of non-separator chars - including zero
+//   - `\` escapes a character's special meaning allowing `*` and `?` to be used
+//     as regular characters.
+//   - `**` matches any number of nested directories. If anything is matched it
+//     always extends until a separator or the end of the name.
+//   - Groups can be defined using the `[` and `]` characters. Inside a group the
+//     special meaning of the characters mentioned before is disabled but the
+//     following rules apply
+//   - any character used as part of the group acts as a choice to pick from
+//   - if the group's first character is a `^` the whole group is negated
+//   - a range can be defined using `-` matching any rune between low and high
+//     inclusive
+//   - Multiple ranges can be given. Ranges can be combined with choices.
+//   - The meaning of `-` and `]` can be escacped using `\`
+package glob
+
+import (
+	"errors"
+	"fmt"
+	"io/fs"
+	"strings"
+	"unicode/utf8"
+)
+
+const (
+	// Separator defines the path separator to use in patterns. This is always
+	// a forward slash independently of the underlying's OS separator
+	Separator = '/'
+	// SingleWildcard defines the the single non-separator character wildcard
+	// operator.
+	SingleWildcard = '?'
+	// AnyWildcard defines the the any number of non-separator characters
+	// wildcard operator.
+	AnyWildcard = '*'
+	// Backslash escapes the next character's special meaning
+	Backslash = '\\'
+	// GroupStart starts a range
+	GroupStart = '['
+	// GroupEnd starts a range
+	GroupEnd = ']'
+	// GroupNegate when used as the first character of a group negates the group.
+	GroupNegate = '^'
+	// Range defines the range operator
+	Range = '-'
+)
+
+var (
+	// ErrBadPattern is returned when an invalid pattern is found. Make
+	// sure you use errors.Is to compare errors to this sentinel value.
+	ErrBadPattern = errors.New("bad pattern")
+)
+
+// Pattern defines a glob pattern prepared ahead of time which can be used to
+// match filenames. Pattern is safe to use concurrently.
+type Pattern struct {
+	tokens []token
+}
+
+// New creates a new pattern from pat and returns it. It returns an error
+// indicating any invalid pattern.
+func New(pat string) (*Pattern, error) {
+	var tokens []token
+
+	p := pat
+	for {
+		if len(p) == 0 {
+			return &Pattern{tokens: tokens}, nil
+		}
+
+		r, l := utf8.DecodeRuneInString(p)
+
+		var t token
+		switch r {
+		case Separator:
+			if len(tokens) > 0 && tokens[len(tokens)-1].r == Separator {
+				return nil, fmt.Errorf("%w: unexpected //", ErrBadPattern)
+			}
+			t = token{tokenTypeLiteral, Separator, runeGroup{}}
+
+		case SingleWildcard:
+			if len(tokens) > 0 && (tokens[len(tokens)-1].t == tokenTypeAnyRunes || tokens[len(tokens)-1].t == tokenTypeAnyDirectories) {
+				return nil, fmt.Errorf("%w: unexpected ?", ErrBadPattern)
+			}
+			t = token{tokenTypeSingleRune, 0, runeGroup{}}
+
+		case AnyWildcard:
+			if len(tokens) > 0 && (tokens[len(tokens)-1].t == tokenTypeSingleRune || tokens[len(tokens)-1].t == tokenTypeAnyDirectories) {
+				return nil, fmt.Errorf("%w: unexpected ?", ErrBadPattern)
+			}
+
+			t = token{tokenTypeAnyRunes, 0, runeGroup{}}
+
+			if len(p[l:]) > 0 {
+				n, nl := utf8.DecodeRuneInString(p[l:])
+				if n == AnyWildcard {
+					d, _ := utf8.DecodeRuneInString(p[l+nl:])
+					if d == utf8.RuneError {
+						return nil, fmt.Errorf("%w: unexpected end of patterm after **", ErrBadPattern)
+					}
+					if d != Separator {
+						return nil, fmt.Errorf("%w: unexpected %c after **", ErrBadPattern, d)
+					}
+
+					t.t = tokenTypeAnyDirectories
+					l += nl
+				}
+			}
+
+		case Backslash:
+			if len(p[l:]) == 0 {
+				return nil, fmt.Errorf("%w: no character given after \\", ErrBadPattern)
+			}
+
+			p = p[l:]
+			r, l = utf8.DecodeRuneInString(p)
+
+			t = token{tokenTypeLiteral, r, runeGroup{}}
+
+		case GroupStart:
+			var err error
+			t, l, err = parseGroup(p)
+			if err != nil {
+				return nil, err
+			}
+
+		case GroupEnd:
+			return nil, fmt.Errorf("%w: using ] w/o [", ErrBadPattern)
+
+		default:
+			t = token{tokenTypeLiteral, r, runeGroup{}}
+		}
+
+		tokens = append(tokens, t)
+		p = p[l:]
+	}
+}
+
+// String reconstructs the glob pattern from the tokens.
+func (pat *Pattern) String() string {
+	var b strings.Builder
+	for _, t := range pat.tokens {
+		switch t.t {
+		case tokenTypeLiteral:
+			switch t.r {
+			case GroupStart, GroupEnd, GroupNegate, AnyWildcard, SingleWildcard, Range:
+				b.WriteRune(Backslash)
+				fallthrough
+			default:
+				b.WriteRune(t.r)
+			}
+		case tokenTypeSingleRune:
+			b.WriteRune(SingleWildcard)
+		case tokenTypeAnyRunes:
+			b.WriteRune(AnyWildcard)
+		case tokenTypeAnyDirectories:
+			b.WriteString("**")
+		case tokenTypeGroup:
+			b.WriteRune(GroupStart)
+			if t.g.neg {
+				b.WriteRune(GroupNegate)
+			}
+			for _, r := range t.g.runes {
+				b.WriteRune(r)
+			}
+			for _, rg := range t.g.ranges {
+				b.WriteRune(rg.lo)
+				b.WriteRune(Range)
+				b.WriteRune(rg.hi)
+			}
+			b.WriteRune(GroupEnd)
+		}
+	}
+	return b.String()
+}
+
+func (pat *Pattern) GoString() string {
+	return pat.String()
+}
+
+// Match matches a file's path name f to the compiled pattern and returns
+// whether the path matches the pattern or not.
+func (pat *Pattern) Match(f string) bool {
+	return match(f, pat.tokens, false)
+}
+
+func (pat *Pattern) MatchPrefix(f string) bool {
+	return match(f, pat.tokens, true)
+}
+
+// GlobFS applies pat to all files found in fsys under root and returns the
+// matching path names as a string slice. It uses fs.WalkDir internally and all
+// constraints given for that function apply to GlobFS.
+func (pat *Pattern) GlobFS(fsys fs.FS, root string) ([]string, error) {
+	results := make([]string, 0)
+	err := fs.WalkDir(fsys, root, func(p string, d fs.DirEntry, err error) error {
+		if err != nil {
+			return err
+		}
+
+		if p == "." {
+			return nil
+		}
+
+		if root != "." && root != "" {
+			p = strings.Replace(p, root, "", 1)
+		}
+
+		if d.IsDir() {
+			if !pat.MatchPrefix(p) {
+				return fs.SkipDir
+			}
+
+			return nil
+		}
+
+		if pat.Match(p) {
+			results = append(results, p)
+		}
+
+		return nil
+	})
+
+	return results, err
+}
+
+func parseGroup(p string) (token, int, error) {
+	// re-read the [. No need to assert the rune here as it has been
+	// done in the main parsing loop.
+	_, le := utf8.DecodeRuneInString(p)
+	t := token{
+		t: tokenTypeGroup,
+		g: runeGroup{},
+	}
+
+	initialLen := le
+	var start rune
+
+	for {
+		if len(p[le:]) == 0 {
+			return t, le, fmt.Errorf("%w: missing %c", ErrBadPattern, GroupEnd)
+		}
+
+		r, l := utf8.DecodeRuneInString(p[le:])
+		le += l
+
+		if initialLen == le-l && r == GroupNegate {
+			t.g.neg = true
+			continue
+		}
+
+		switch r {
+		case GroupEnd:
+			if start != 0 {
+				t.g.runes = append(t.g.runes, start)
+			}
+
+			return t, le, nil
+
+		case Range:
+			if start == 0 {
+				return t, le, fmt.Errorf("%w: missing start for character range", ErrBadPattern)
+			}
+
+			if len(p[le:]) == 0 {
+				return t, le, fmt.Errorf("%w: missing range end", ErrBadPattern)
+			}
+
+			r, l = utf8.DecodeRuneInString(p[le:])
+			le += l
+
+			switch r {
+			case GroupEnd:
+				return t, le, fmt.Errorf("%w: unterminated range", ErrBadPattern)
+
+			case Backslash:
+				if len(p[le:]) == 0 {
+					return t, le, fmt.Errorf("%w: missing character after \\", ErrBadPattern)
+				}
+				r, l = utf8.DecodeRuneInString(p[le:])
+				le += l
+				fallthrough
+
+			default:
+				t.g.ranges = append(t.g.ranges, runeRange{start, r})
+				start = 0
+			}
+
+		case Backslash:
+			if len(p[le:]) == 0 {
+				return t, le, fmt.Errorf("%w: missing character after \\", ErrBadPattern)
+			}
+
+			r, l = utf8.DecodeRuneInString(p[le:])
+			le += l
+			fallthrough
+
+		default:
+			if start != 0 {
+				t.g.runes = append(t.g.runes, start)
+			}
+			start = r
+		}
+	}
+}
+
+// match is used internally to implement a simple recursive backtracking
+// algorithmn using the token list t to match against file path f. If matchPrefix
+// is set to true, match returns true if f is completely matched by any prefix
+// of t. Otherwise, match returns true if f is matched by _all_ tokens in t.
+func match(f string, t []token, matchPrefix bool) bool {
+	for {
+		if len(f) == 0 {
+			if matchPrefix {
+				return true
+			}
+
+			if len(t) == 0 {
+				return true
+			}
+
+			if len(t) == 1 && t[0].t == tokenTypeAnyRunes {
+				return true
+			}
+
+			return false
+		}
+
+		if len(t) == 0 {
+			return false
+		}
+
+		r, le := utf8.DecodeRuneInString(f)
+
+		switch t[0].t {
+		case tokenTypeLiteral:
+			if t[0].r != r {
+				return false
+			}
+
+		case tokenTypeGroup:
+			if !t[0].g.match(r) {
+				return false
+			}
+
+		case tokenTypeSingleRune:
+			if r == Separator {
+				return false
+			}
+
+		case tokenTypeAnyRunes:
+			if r == Separator {
+				return match(f, t[1:], matchPrefix)
+			}
+
+			if match(f[le:], t, matchPrefix) {
+				return true
+			}
+
+			if match(f, t[1:], matchPrefix) {
+				return true
+			}
+
+		case tokenTypeAnyDirectories:
+			if match(f, t[2:], matchPrefix) {
+				return true
+			}
+
+			var l2 int
+			for {
+				if len(f[le+l2:]) == 0 {
+					return false
+				}
+
+				n, nl := utf8.DecodeRuneInString(f[le+l2:])
+				l2 += nl
+
+				if n == Separator {
+					break
+				}
+			}
+
+			if match(f[le+l2:], t[2:], matchPrefix) {
+				return true
+			}
+
+			return match(f[le+l2:], t, matchPrefix)
+		}
+
+		t = t[1:]
+		f = f[le:]
+	}
+}
+
+// tokenType enumerates the different types of tokens.
+type tokenType int
+
+const (
+	// a rune literal
+	tokenTypeLiteral tokenType = iota + 1
+	// any single non-separator rune
+	tokenTypeSingleRune
+	// any number of non-separator runes (incl. zero)
+	tokenTypeAnyRunes
+	// any number runes including separators. Matches whole directories.
+	tokenTypeAnyDirectories
+	// a group of rune consisting of named runes and/or ranges. Might be negated.
+	tokenTypeGroup
+)
+
+// token implements a single token in the pattern.
+type token struct {
+	// the token's type
+	t tokenType
+	// a literal rune to matche. Literal runes are stored separate from groups
+	// to improve matching performance.
+	r rune
+	// A rune group to match.
+	g runeGroup
+}
+
+// A group of runes. Groups can contain any number of enumerated runes and rune
+// ranges. In addition a whole group can be negated.
+type runeGroup struct {
+	// Whether the group is negated
+	neg bool
+	// Enumerated runes contained in this group
+	runes []rune
+	// All ranges contained in this group
+	ranges []runeRange
+}
+
+// match matches r with g. It returns true if r is matched.
+func (g runeGroup) match(r rune) bool {
+	for _, ru := range g.runes {
+		if ru == r {
+			return !g.neg
+		}
+	}
+
+	for _, rang := range g.ranges {
+		if rang.match(r) {
+			return !g.neg
+		}
+	}
+
+	return g.neg
+}
+
+// A closed range of runes consisting of all runes between lo and hi both
+// inclusive.
+type runeRange struct {
+	lo, hi rune
+}
+
+// match returns whether r is in rg.
+func (rg runeRange) match(r rune) bool {
+	return rg.lo <= r && r <= rg.hi
+}