diff --git a/remap/funcs_remap.go b/remap/funcs_remap.go index 8d8533e..690a44a 100644 --- a/remap/funcs_remap.go +++ b/remap/funcs_remap.go @@ -2,19 +2,141 @@ package remap /* Map returns a map[string] for regexes with named capture groups matched in bytes b. + Note that this supports non-unique group names; regexp.Regexp allows for patterns with multiple groups + using the same group name. Each match for each group is in a slice keyed under that group name, with + that slice ordered by the indexing done by the regex match itself. + matches and/or its values may be nil or empty under the following condition tree: - matches will be nil if no named capture group matches were found. + IF b is nil: + THEN matches will always be nil + ELSE: + IF all of b does not match pattern + IF mustMatch is true + THEN matches == nil + ELSE + THEN matches == map[string][][]byte{} (non-nil but empty) + ELSE IF pattern has no named capture groups + IF inclNoMatch is true + THEN matches == map[string][][]byte{} (non-nil but empty) + ELSE + THEN matches == nil + ELSE + IF there are no named group matches + IF inclNoMatch is true + THEN matches is non-nil; matches[<group name>, ...] 
is/are defined but nil (_, ok = matches[<group name>]; ok == true) + ELSE + THEN matches == nil + ELSE + IF <group name> does not have a match + IF inclNoMatch is true + IF inclNoMatchStrict is true + THEN matches[<group name>] is defined and non-nil, but populated with placeholder nils + (matches[<group name>] == [][]byte{nil[, nil...]}) + ELSE + THEN matches[<group name>] is guaranteed defined but may be nil (_, ok = matches[<group name>]; ok == true) + ELSE + THEN matches[<group name>] is not defined (_, ok = matches[<group name>]; ok == false) + ELSE + matches[<group name>] == [][]byte{<match>[, <match>...]} */ -func (r *ReMap) Map(b []byte) (matches map[string][]byte) { +func (r *ReMap) Map(b []byte, inclNoMatch, inclNoMatchStrict, mustMatch bool) (matches map[string][][]byte) { - var m [][]byte - var tmpMap map[string][]byte = make(map[string][]byte) + var ok bool + var mIdx int + var match []byte + var grpNm string + var names []string + var matchBytes [][]byte + var tmpMap map[string][][]byte = make(map[string][][]byte) - m = r.Regexp.FindSubmatch(b) + if b == nil { + return + } - for idx, grpNm := range r.Regexp.SubexpNames() { - if idx != 0 && grpNm != "" { - tmpMap[grpNm] = m[idx] + names = r.Regexp.SubexpNames() + matchBytes = r.Regexp.FindSubmatch(b) + + if matchBytes == nil { + // b does not match pattern + if !mustMatch { + matches = make(map[string][][]byte) + } + return + } + + if names == nil || len(names) == 0 || len(names) == 1 { + /* + no named capture groups; + technically only the last condition would be the case. + */ + if inclNoMatch { + matches = make(map[string][][]byte) + } + return + } + names = names[1:] + + if len(matchBytes) == 0 || len(matchBytes) == 1 { + /* + no submatches whatsoever. + *technically* I don't think this condition can actually be reached. + This is more of a safe-return before we re-slice. 
+ */ + matches = make(map[string][][]byte) + if inclNoMatch { + if len(names) >= 1 { + for _, grpNm = range names { + matches[grpNm] = nil + } + } + } + return + } + matchBytes = matchBytes[1:] + + for mIdx, match = range matchBytes { + grpNm = names[mIdx] + /* + Thankfully, it's actually a build error if a pattern specifies a named + capture group with an empty name. + So we don't need to worry about accounting for that, + and can just skip over grpNm == "" (which is an *unnamed* capture group). + */ + if grpNm == "" { + continue + } + + if match == nil { + // group did not match + if !inclNoMatch { + continue + } + if _, ok = tmpMap[grpNm]; !ok { + if !inclNoMatchStrict { + tmpMap[grpNm] = nil + } else { + tmpMap[grpNm] = [][]byte{nil} + } + } else { + if inclNoMatchStrict { + tmpMap[grpNm] = append(tmpMap[grpNm], nil) + } + } + continue + } + + if _, ok = tmpMap[grpNm]; !ok { + tmpMap[grpNm] = make([][]byte, 0) + } + tmpMap[grpNm] = append(tmpMap[grpNm], match) + } + + // This *technically* should be completely handled above. + if inclNoMatch { + for _, grpNm = range names { + if _, ok = tmpMap[grpNm]; !ok { + tmpMap[grpNm] = nil + } } } @@ -26,20 +148,137 @@ func (r *ReMap) Map(b []byte) (matches map[string][]byte) { } /* - MapString returns a map[string] for regexes with named capture groups matched in string s. + MapString is exactly like ReMap.Map(), but operates on (and returns) strings instead. (matches will always be nil if s == ``.) - matches will be nil if no named capture group matches were found. + A small deviation, though; empty strings instead of nils (because duh) will occupy placeholders (if inclNoMatchStrict is specified). 
*/ -func (r *ReMap) MapString(s string) (matches map[string]string) { +func (r *ReMap) MapString(s string, inclNoMatch, inclNoMatchStrict, mustMatch bool) (matches map[string][]string) { - var m []string - var tmpMap map[string]string = make(map[string]string) + var ok bool + var endIdx int + var startIdx int + var chunkIdx int + var grpNm string + var names []string + var matchStr string + var idxChunks [][]int + var matchIndices []int + var chunkIndices []int // always 2 elements; start pos and end pos + var tmpMap map[string][]string = make(map[string][]string) - m = r.Regexp.FindStringSubmatch(s) + /* + OK so this is a bit of a deviation. - for idx, grpNm := range r.Regexp.SubexpNames() { - if idx != 0 && grpNm != "" { - tmpMap[grpNm] = m[idx] + + It's not as straightforward as above, because there isn't an explicit way + like above to determine if a pattern was *matched as an empty string* vs. + *not matched*. + + So instead do roundabout index-y things. + */ + + if s == "" { + return + } + names = r.Regexp.SubexpNames() + matchIndices = r.Regexp.FindStringSubmatchIndex(s) + + if matchIndices == nil { + // s does not match pattern + if !mustMatch { + matches = make(map[string][]string) + } + return + } + + if names == nil || len(names) == 0 || len(names) == 1 { + /* + no named capture groups; + technically only the last condition would be the case. + */ + if inclNoMatch { + matches = make(map[string][]string) + } + return + } + names = names[1:] + + if len(matchIndices) == 0 || len(matchIndices) == 1 { + /* + no submatches whatsoever. + *technically* I don't think this condition can actually be reached. + This is more of a safe-return before we chunk the indices. + */ + matches = make(map[string][]string) + if inclNoMatch { + if len(names) >= 1 { + for _, grpNm = range names { + matches[grpNm] = nil + } + } + } + return + } + /* + The reslice starts at 2 because they're in pairs: []int{<start>, <end>, <start>, <end>, ...} + and the first *pair* is the entire pattern match. 
+ Thus the len(matchIndices) == 2*len(names). + Keep in mind that since the first element of names is removed, + the first pair here is also removed. + */ + matchIndices = matchIndices[2:] + + idxChunks = make([][]int, len(names)) + for startIdx = 0; startIdx < len(idxChunks); startIdx += 2 { + endIdx = startIdx + 2 + grpNm = names[chunkIdx] + /* + Thankfully, it's actually a build error if a pattern specifies a named + capture group with an empty name. + So we don't need to worry about accounting for that, + and can just skip over grpNm == "" (which is an *unnamed* capture group). + */ + if grpNm == "" { + continue + } + // This technically should never happen. + if endIdx > len(matchIndices) { + endIdx = len(matchIndices) + } + chunkIndices = matchIndices[startIdx:endIdx] + if chunkIndices[0] == -1 || chunkIndices[1] == -1 { + // group did not match + if !inclNoMatch { + continue + } + if _, ok = tmpMap[grpNm]; !ok { + if !inclNoMatchStrict { + tmpMap[grpNm] = nil + } else { + tmpMap[grpNm] = []string{""} + } + } else { + if inclNoMatchStrict { + tmpMap[grpNm] = append(tmpMap[grpNm], "") + } + } + continue + } + + matchStr = s[chunkIndices[0]:chunkIndices[1]] + if _, ok = tmpMap[grpNm]; !ok { + tmpMap[grpNm] = make([]string, 0) + } + tmpMap[grpNm] = append(tmpMap[grpNm], matchStr) + + chunkIdx++ + } + + // This *technically* should be completely handled above. + if inclNoMatch { + for _, grpNm = range names { + if _, ok = tmpMap[grpNm]; !ok { + tmpMap[grpNm] = nil + } } } diff --git a/remap/types.go b/remap/types.go index cbeea18..bf304db 100644 --- a/remap/types.go +++ b/remap/types.go @@ -4,7 +4,20 @@ import ( `regexp` ) -// ReMap provides some map-related functions around a regexp.Regexp. -type ReMap struct { - *regexp.Regexp -} +type ( + // ReMap provides some map-related functions around a regexp.Regexp. 
+ ReMap struct { + *regexp.Regexp + } + + /* + ExplicitStringMatch is used with ReMap.MapStringExplicit to indicate if a + capture group result is a hit (a group matched, but e.g. the match value is empty string) + or not (a group did not match) + */ + ExplicitStringMatch struct { + Group string + IsMatch bool + Value string + } +)