// Package glob implements a language for specifying glob patterns for path // names starting at some root. The language does not follow the specs from // filepath.Match but provides a superset which allows for directory // wildcards. // // Patterns consist of normal characters, non-separator wildcards '*' and '?', // separators '/' and directory wildcards '**'. // // A somewhat formal grammer can be given as: // // pattern = term, { '/', term }; // term = '**' | name; // name = { charSpecial | group | escapedChar | '*' | '?' }; // charSpecial = (* any unicode rune except '/', '*', '?', '[' and '\' *); // char = (* any unicode rune *); // escapedChar = '\\', char; // group = '[', [ '^' ] { escapedChar | groupChar | range } ']'; // groupChar = (* any unicode rune except '-' and ']' *); // range = ( groupChar | escapedChar ), '-', (groupChar | escapedChar); // // The format operators have the following meaning: // // - any character (rune) matches the exactly this rune - with the following // exceptions // - `/` works as a directory separator. It matches directory boundarys of the // underlying system independently of the separator char used by the OS. // - `?` matches exactly one non-separator char // - `*` matches any number of non-separator chars - including zero // - `\` escapes a character's special meaning allowing `*` and `?` to be used // as regular characters. // - `**` matches any number of nested directories. If anything is matched it // always extends until a separator or the end of the name. // - Groups can be defined using the `[` and `]` characters. Inside a group the // special meaning of the characters mentioned before is disabled but the // following rules apply // - any character used as part of the group acts as a choice to pick from // - if the group's first character is a `^` the whole group is negated // - a range can be defined using `-` matching any rune between low and high // inclusive // - Multiple ranges can be given. Ranges can be combined with choices. // - The meaning of `-` and `]` can be escacped using `\` package glob import ( "errors" "fmt" "io/fs" "strings" "unicode/utf8" ) const ( // Separator defines the path separator to use in patterns. This is always // a forward slash independently of the underlying's OS separator Separator = '/' // SingleWildcard defines the the single non-separator character wildcard // operator. SingleWildcard = '?' // AnyWildcard defines the the any number of non-separator characters // wildcard operator. AnyWildcard = '*' // Backslash escapes the next character's special meaning Backslash = '\\' // GroupStart starts a range GroupStart = '[' // GroupEnd starts a range GroupEnd = ']' // GroupNegate when used as the first character of a group negates the group. GroupNegate = '^' // Range defines the range operator Range = '-' ) var ( // ErrBadPattern is returned when an invalid pattern is found. Make // sure you use errors.Is to compare errors to this sentinel value. ErrBadPattern = errors.New("bad pattern") ) // Pattern defines a glob pattern prepared ahead of time which can be used to // match filenames. Pattern is safe to use concurrently. type Pattern struct { tokens []token } // New creates a new pattern from pat and returns it. It returns an error // indicating any invalid pattern. func New(pat string) (*Pattern, error) { var tokens []token p := pat for { if len(p) == 0 { return &Pattern{tokens: tokens}, nil } r, l := utf8.DecodeRuneInString(p) var t token switch r { case Separator: if len(tokens) > 0 && tokens[len(tokens)-1].r == Separator { return nil, fmt.Errorf("%w: unexpected //", ErrBadPattern) } t = token{tokenTypeLiteral, Separator, runeGroup{}} case SingleWildcard: if len(tokens) > 0 && (tokens[len(tokens)-1].t == tokenTypeAnyRunes || tokens[len(tokens)-1].t == tokenTypeAnyDirectories) { return nil, fmt.Errorf("%w: unexpected ?", ErrBadPattern) } t = token{tokenTypeSingleRune, 0, runeGroup{}} case AnyWildcard: if len(tokens) > 0 && (tokens[len(tokens)-1].t == tokenTypeSingleRune || tokens[len(tokens)-1].t == tokenTypeAnyDirectories) { return nil, fmt.Errorf("%w: unexpected ?", ErrBadPattern) } t = token{tokenTypeAnyRunes, 0, runeGroup{}} if len(p[l:]) > 0 { n, nl := utf8.DecodeRuneInString(p[l:]) if n == AnyWildcard { d, _ := utf8.DecodeRuneInString(p[l+nl:]) if d == utf8.RuneError { return nil, fmt.Errorf("%w: unexpected end of patterm after **", ErrBadPattern) } if d != Separator { return nil, fmt.Errorf("%w: unexpected %c after **", ErrBadPattern, d) } t.t = tokenTypeAnyDirectories l += nl } } case Backslash: if len(p[l:]) == 0 { return nil, fmt.Errorf("%w: no character given after \\", ErrBadPattern) } p = p[l:] r, l = utf8.DecodeRuneInString(p) t = token{tokenTypeLiteral, r, runeGroup{}} case GroupStart: var err error t, l, err = parseGroup(p) if err != nil { return nil, err } case GroupEnd: return nil, fmt.Errorf("%w: using ] w/o [", ErrBadPattern) default: t = token{tokenTypeLiteral, r, runeGroup{}} } tokens = append(tokens, t) p = p[l:] } } // String reconstructs the glob pattern from the tokens. func (pat *Pattern) String() string { var b strings.Builder for _, t := range pat.tokens { switch t.t { case tokenTypeLiteral: switch t.r { case GroupStart, GroupEnd, GroupNegate, AnyWildcard, SingleWildcard, Range: b.WriteRune(Backslash) fallthrough default: b.WriteRune(t.r) } case tokenTypeSingleRune: b.WriteRune(SingleWildcard) case tokenTypeAnyRunes: b.WriteRune(AnyWildcard) case tokenTypeAnyDirectories: b.WriteString("**") case tokenTypeGroup: b.WriteRune(GroupStart) if t.g.neg { b.WriteRune(GroupNegate) } for _, r := range t.g.runes { b.WriteRune(r) } for _, rg := range t.g.ranges { b.WriteRune(rg.lo) b.WriteRune(Range) b.WriteRune(rg.hi) } b.WriteRune(GroupEnd) } } return b.String() } func (pat *Pattern) GoString() string { return pat.String() } // Match matches a file's path name f to the compiled pattern and returns // whether the path matches the pattern or not. func (pat *Pattern) Match(f string) bool { return match(f, pat.tokens, false) } func (pat *Pattern) MatchPrefix(f string) bool { return match(f, pat.tokens, true) } // GlobFS applies pat to all files found in fsys under root and returns the // matching path names as a string slice. It uses fs.WalkDir internally and all // constraints given for that function apply to GlobFS. func (pat *Pattern) GlobFS(fsys fs.FS, root string) ([]string, error) { results := make([]string, 0) err := fs.WalkDir(fsys, root, func(p string, d fs.DirEntry, err error) error { if err != nil { return err } if p == "." { return nil } if root != "." && root != "" { p = strings.Replace(p, root, "", 1) } if d.IsDir() { if !pat.MatchPrefix(p) { return fs.SkipDir } return nil } if pat.Match(p) { results = append(results, p) } return nil }) return results, err } func parseGroup(p string) (token, int, error) { // re-read the [. No need to assert the rune here as it has been // done in the main parsing loop. _, le := utf8.DecodeRuneInString(p) t := token{ t: tokenTypeGroup, g: runeGroup{}, } initialLen := le var start rune for { if len(p[le:]) == 0 { return t, le, fmt.Errorf("%w: missing %c", ErrBadPattern, GroupEnd) } r, l := utf8.DecodeRuneInString(p[le:]) le += l if initialLen == le-l && r == GroupNegate { t.g.neg = true continue } switch r { case GroupEnd: if start != 0 { t.g.runes = append(t.g.runes, start) } return t, le, nil case Range: if start == 0 { return t, le, fmt.Errorf("%w: missing start for character range", ErrBadPattern) } if len(p[le:]) == 0 { return t, le, fmt.Errorf("%w: missing range end", ErrBadPattern) } r, l = utf8.DecodeRuneInString(p[le:]) le += l switch r { case GroupEnd: return t, le, fmt.Errorf("%w: unterminated range", ErrBadPattern) case Backslash: if len(p[le:]) == 0 { return t, le, fmt.Errorf("%w: missing character after \\", ErrBadPattern) } r, l = utf8.DecodeRuneInString(p[le:]) le += l fallthrough default: t.g.ranges = append(t.g.ranges, runeRange{start, r}) start = 0 } case Backslash: if len(p[le:]) == 0 { return t, le, fmt.Errorf("%w: missing character after \\", ErrBadPattern) } r, l = utf8.DecodeRuneInString(p[le:]) le += l fallthrough default: if start != 0 { t.g.runes = append(t.g.runes, start) } start = r } } } // match is used internally to implement a simple recursive backtracking // algorithmn using the token list t to match against file path f. If matchPrefix // is set to true, match returns true if f is completely matched by any prefix // of t. Otherwise, match returns true if f is matched by _all_ tokens in t. func match(f string, t []token, matchPrefix bool) bool { for { if len(f) == 0 { if matchPrefix { return true } if len(t) == 0 { return true } if len(t) == 1 && t[0].t == tokenTypeAnyRunes { return true } return false } if len(t) == 0 { return false } r, le := utf8.DecodeRuneInString(f) switch t[0].t { case tokenTypeLiteral: if t[0].r != r { return false } case tokenTypeGroup: if !t[0].g.match(r) { return false } case tokenTypeSingleRune: if r == Separator { return false } case tokenTypeAnyRunes: if r == Separator { return match(f, t[1:], matchPrefix) } if match(f[le:], t, matchPrefix) { return true } if match(f, t[1:], matchPrefix) { return true } case tokenTypeAnyDirectories: if match(f, t[2:], matchPrefix) { return true } var l2 int for { if len(f[le+l2:]) == 0 { return false } n, nl := utf8.DecodeRuneInString(f[le+l2:]) l2 += nl if n == Separator { break } } if match(f[le+l2:], t[2:], matchPrefix) { return true } return match(f[le+l2:], t, matchPrefix) } t = t[1:] f = f[le:] } } // tokenType enumerates the different types of tokens. type tokenType int const ( // a rune literal tokenTypeLiteral tokenType = iota + 1 // any single non-separator rune tokenTypeSingleRune // any number of non-separator runes (incl. zero) tokenTypeAnyRunes // any number runes including separators. Matches whole directories. tokenTypeAnyDirectories // a group of rune consisting of named runes and/or ranges. Might be negated. tokenTypeGroup ) // token implements a single token in the pattern. type token struct { // the token's type t tokenType // a literal rune to matche. Literal runes are stored separate from groups // to improve matching performance. r rune // A rune group to match. g runeGroup } // A group of runes. Groups can contain any number of enumerated runes and rune // ranges. In addition a whole group can be negated. type runeGroup struct { // Whether the group is negated neg bool // Enumerated runes contained in this group runes []rune // All ranges contained in this group ranges []runeRange } // match matches r with g. It returns true if r is matched. func (g runeGroup) match(r rune) bool { for _, ru := range g.runes { if ru == r { return !g.neg } } for _, rang := range g.ranges { if rang.match(r) { return !g.neg } } return g.neg } // A closed range of runes consisting of all runes between lo and hi both // inclusive. type runeRange struct { lo, hi rune } // match returns whether r is in rg. func (rg runeRange) match(r rune) bool { return rg.lo <= r && r <= rg.hi }