diff --git a/remap/funcs_remap.go b/remap/funcs_remap.go index ac21e0c..51fddea 100644 --- a/remap/funcs_remap.go +++ b/remap/funcs_remap.go @@ -135,7 +135,7 @@ func (r *ReMap) Map(b []byte, inclNoMatch, inclNoMatchStrict, mustMatch bool) (m if len(matchBytes) == 0 || len(matchBytes) == 1 { /* no submatches whatsoever. - *technically* I don't think this condition can actually be reached. + *Technically* I don't think this condition can actually be reached. This is more of a safe-return before we re-slice. */ matches = make(map[string][][]byte) @@ -308,6 +308,13 @@ func (r *ReMap) MapString(s string, inclNoMatch, inclNoMatchStrict, mustMatch bo var grpNm string var names []string var matchStr string + /* + A slice of indices or index pairs. + For each element `e` in idxChunks, + * if `e` is nil, no group match. + * if len(e) == 1, only a single character was matched. + * otherwise len(e) == 2, the start and end of the match. + */ var idxChunks [][]int var matchIndices []int var chunkIndices []int // always 2 elements; start pos and end pos @@ -317,7 +324,7 @@ func (r *ReMap) MapString(s string, inclNoMatch, inclNoMatchStrict, mustMatch bo OK so this is a bit of a deviation. It's not as straightforward as above, because there isn't an explicit way - like above to determine if a patterb was *matched as an empty string* vs. + like above to determine if a pattern was *matched as an empty string* vs. *not matched*. So instead do roundabout index-y things. @@ -326,73 +333,111 @@ func (r *ReMap) MapString(s string, inclNoMatch, inclNoMatchStrict, mustMatch bo if s == "" { return } - names = r.Regexp.SubexpNames() + /* + I'm not entirely sure how serious they are about "the slice should not be modified"... + + DO NOT sort or dedupe `names`! If the same name for groups is duplicated, + it will be duplicated here in proper order and the ordering is tied to + the ordering of matchIndices. + */ + names = r.Regexp.SubexpNames()[:] matchIndices = r.Regexp.FindStringSubmatchIndex(s) if matchIndices == nil { - // s does not match pattern + // s does not match pattern at all. if !mustMatch { matches = make(map[string][]string) } return } - if names == nil || len(names) == 0 || len(names) == 1 { + if names == nil || len(names) <= 1 { /* - no named capture groups; - technically only the last condition would be the case. + No named capture groups; + technically only the last condition would be the case, + as (regexp.Regexp).SubexpNames() will ALWAYS at the LEAST + return a `[]string{""}`. */ if inclNoMatch { matches = make(map[string][]string) } return } - names = names[1:] if len(matchIndices) == 0 || len(matchIndices) == 1 { /* - no submatches whatsoever. - *technically* I don't think this condition can actually be reached. + No (sub)matches whatsoever. + *technically* I don't think this condition can actually be reached; + matchIndices should ALWAYS either be `nil` or len will be at LEAST 2, + and modulo 2 thereafter since they're PAIRS of indices... + Why they didn't just return a [][]int or [][2]int or something + instead of an []int, who knows. + But we're correcting that poor design. This is more of a safe-return before we chunk the indices. */ matches = make(map[string][]string) if inclNoMatch { - if len(names) >= 1 { - for _, grpNm = range names { + for _, grpNm = range names { + if grpNm != "" { matches[grpNm] = nil } } } return } - /* - The reslice starts at 2 because they're in pairs: []int{, , , , ...} - and the first *pair* is the entire pattern match. - Thus the len(matchIndices) == 2*len(names). - Keep in mind that since the first element of names is removed, - the first pair here is also removed. - */ - matchIndices = matchIndices[2:] + /* + A reslice of `matchIndices` could technically start at 2 (as long as `names` is sliced [1:]) + because they're in pairs: []int{, , , , ...} + and the first pair is the entire pattern match (un-resliced names[0]). + Thus the len(matchIndices) == 2*len(names), *even* if you + Keep in mind that since the first element of names is removed, + the first pair here is skipped. + This provides a bit more consistent readability, though. + */ idxChunks = make([][]int, len(names)) - for startIdx = 0; startIdx < len(idxChunks); startIdx += 2 { + chunkIdx = 0 + endIdx = 0 + for startIdx = 0; endIdx < len(matchIndices); startIdx += 2 { endIdx = startIdx + 2 + // This technically should never happen. + if endIdx > len(matchIndices) { + endIdx = len(matchIndices) + } + + chunkIndices = matchIndices[startIdx:endIdx] + + if chunkIndices[0] == -1 || chunkIndices[1] == -1 { + // group did not match + chunkIndices = nil + } else { + if chunkIndices[0] == chunkIndices[1] { + chunkIndices = []int{chunkIndices[0]} + } else { + chunkIndices = matchIndices[startIdx:endIdx] + } + } + idxChunks[chunkIdx] = chunkIndices + chunkIdx++ + } + + // Now associate with names and pull the string sequence. + for chunkIdx, chunkIndices = range idxChunks { grpNm = names[chunkIdx] /* Thankfully, it's actually a build error if a pattern specifies a named capture group with an empty name. So we don't need to worry about accounting for that, - and can just skip over grpNm == "" (which is an *unnamed* capture group). + and can just skip over grpNm == "" + (which is either an *unnamed* capture group + OR the first element in `names`, which is always + the entire match). */ if grpNm == "" { continue } - // This technically should never happen. - if endIdx > len(matchIndices) { - endIdx = len(matchIndices) - } - chunkIndices = matchIndices[startIdx:endIdx] - if chunkIndices[0] == -1 || chunkIndices[1] == -1 { + + if chunkIndices == nil || len(chunkIndices) == 0 { // group did not match if !inclNoMatch { continue @@ -411,13 +456,19 @@ func (r *ReMap) MapString(s string, inclNoMatch, inclNoMatchStrict, mustMatch bo continue } - matchStr = s[chunkIndices[0]:chunkIndices[1]] + switch len(chunkIndices) { + case 1: + // Single character + matchStr = string(s[chunkIndices[0]]) + case 2: + // Multiple characters + matchStr = s[chunkIndices[0]:chunkIndices[1]] + } + if _, ok = tmpMap[grpNm]; !ok { tmpMap[grpNm] = make([]string, 0) } tmpMap[grpNm] = append(tmpMap[grpNm], matchStr) - - chunkIdx++ } // This *technically* should be completely handled above.