123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311 |
- package closestmatch
- import (
- "encoding/gob"
- "math/rand"
- "os"
- "sort"
- "strings"
- )
- // ClosestMatch is the structure that contains the
- // substring sizes and carrys a map of the substrings for
- // easy lookup
- type ClosestMatch struct {
- SubstringSizes []int
- SubstringToID map[string]map[uint32]struct{}
- ID map[uint32]IDInfo
- }
- // IDInfo carries the information about the keys
- type IDInfo struct {
- Key string
- NumSubstrings int
- }
- // New returns a new structure for performing closest matches
- func New(possible []string, subsetSize []int) *ClosestMatch {
- cm := new(ClosestMatch)
- cm.SubstringSizes = subsetSize
- cm.SubstringToID = make(map[string]map[uint32]struct{})
- cm.ID = make(map[uint32]IDInfo)
- for i, s := range possible {
- substrings := cm.splitWord(strings.ToLower(s))
- cm.ID[uint32(i)] = IDInfo{Key: s, NumSubstrings: len(substrings)}
- for substring := range substrings {
- if _, ok := cm.SubstringToID[substring]; !ok {
- cm.SubstringToID[substring] = make(map[uint32]struct{})
- }
- cm.SubstringToID[substring][uint32(i)] = struct{}{}
- }
- }
- return cm
- }
- // Load can load a previously saved ClosestMatch object from disk
- func Load(filename string) (*ClosestMatch, error) {
- cm := new(ClosestMatch)
- f, err := os.Open(filename)
- defer f.Close()
- if err != nil {
- return cm, err
- }
- err = gob.NewDecoder(f).Decode(&cm)
- return cm, err
- }
- // Save writes the current ClosestSave object as a gzipped JSON file
- func (cm *ClosestMatch) Save(filename string) error {
- f, err := os.Create(filename)
- if err != nil {
- return err
- }
- defer f.Close()
- enc := gob.NewEncoder(f)
- return enc.Encode(cm)
- }
- func (cm *ClosestMatch) worker(id int, jobs <-chan job, results chan<- result) {
- for j := range jobs {
- m := make(map[string]int)
- if ids, ok := cm.SubstringToID[j.substring]; ok {
- weight := 200000 / len(ids)
- for id := range ids {
- if _, ok2 := m[cm.ID[id].Key]; !ok2 {
- m[cm.ID[id].Key] = 0
- }
- m[cm.ID[id].Key] += 1 + 0*weight
- }
- }
- results <- result{m: m}
- }
- }
- type job struct {
- substring string
- }
- type result struct {
- m map[string]int
- }
- func (cm *ClosestMatch) match(searchWord string) map[string]int {
- searchSubstrings := cm.splitWord(searchWord)
- searchSubstringsLen := len(searchSubstrings)
- jobs := make(chan job, searchSubstringsLen)
- results := make(chan result, searchSubstringsLen)
- workers := 8
- for w := 1; w <= workers; w++ {
- go cm.worker(w, jobs, results)
- }
- for substring := range searchSubstrings {
- jobs <- job{substring: substring}
- }
- close(jobs)
- m := make(map[string]int)
- for a := 1; a <= searchSubstringsLen; a++ {
- r := <-results
- for key := range r.m {
- if _, ok := m[key]; ok {
- m[key] += r.m[key]
- } else {
- m[key] = r.m[key]
- }
- }
- }
- return m
- }
- // Closest searches for the `searchWord` and returns the closest match
- func (cm *ClosestMatch) Closest(searchWord string) string {
- for _, pair := range rankByWordCount(cm.match(searchWord)) {
- return pair.Key
- }
- return ""
- }
- // ClosestN searches for the `searchWord` and returns the n closests matches
- func (cm *ClosestMatch) ClosestN(searchWord string, n int) []string {
- matches := make([]string, n)
- j := 0
- for i, pair := range rankByWordCount(cm.match(searchWord)) {
- if i == n {
- break
- }
- matches[i] = pair.Key
- j = i
- }
- return matches[:j+1]
- }
- func rankByWordCount(wordFrequencies map[string]int) PairList {
- pl := make(PairList, len(wordFrequencies))
- i := 0
- for k, v := range wordFrequencies {
- pl[i] = Pair{k, v}
- i++
- }
- sort.Sort(sort.Reverse(pl))
- return pl
- }
- type Pair struct {
- Key string
- Value int
- }
- type PairList []Pair
- func (p PairList) Len() int { return len(p) }
- func (p PairList) Less(i, j int) bool { return p[i].Value < p[j].Value }
- func (p PairList) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
- func (cm *ClosestMatch) splitWord(word string) map[string]struct{} {
- wordHash := make(map[string]struct{})
- for _, j := range cm.SubstringSizes {
- for i := 0; i < len(word)-j; i++ {
- substring := string(word[i : i+j])
- if len(strings.TrimSpace(substring)) > 0 {
- wordHash[string(word[i:i+j])] = struct{}{}
- }
- }
- }
- return wordHash
- }
- // AccuracyMutatingWords runs some basic tests against the wordlist to
- // see how accurate this bag-of-characters method is against
- // the target dataset
- func (cm *ClosestMatch) AccuracyMutatingWords() float64 {
- rand.Seed(1)
- percentCorrect := 0.0
- numTrials := 0.0
- for wordTrials := 0; wordTrials < 200; wordTrials++ {
- var testString, originalTestString string
- testStringNum := rand.Intn(len(cm.ID))
- i := 0
- for id := range cm.ID {
- i++
- if i != testStringNum {
- continue
- }
- originalTestString = cm.ID[id].Key
- break
- }
- var words []string
- choice := rand.Intn(3)
- if choice == 0 {
- // remove a random word
- words = strings.Split(originalTestString, " ")
- if len(words) < 3 {
- continue
- }
- deleteWordI := rand.Intn(len(words))
- words = append(words[:deleteWordI], words[deleteWordI+1:]...)
- testString = strings.Join(words, " ")
- } else if choice == 1 {
- // remove a random word and reverse
- words = strings.Split(originalTestString, " ")
- if len(words) > 1 {
- deleteWordI := rand.Intn(len(words))
- words = append(words[:deleteWordI], words[deleteWordI+1:]...)
- for left, right := 0, len(words)-1; left < right; left, right = left+1, right-1 {
- words[left], words[right] = words[right], words[left]
- }
- } else {
- continue
- }
- testString = strings.Join(words, " ")
- } else {
- // remove a random word and shuffle and replace 2 random letters
- words = strings.Split(originalTestString, " ")
- if len(words) > 1 {
- deleteWordI := rand.Intn(len(words))
- words = append(words[:deleteWordI], words[deleteWordI+1:]...)
- for i := range words {
- j := rand.Intn(i + 1)
- words[i], words[j] = words[j], words[i]
- }
- }
- testString = strings.Join(words, " ")
- letters := "abcdefghijklmnopqrstuvwxyz"
- if len(testString) == 0 {
- continue
- }
- ii := rand.Intn(len(testString))
- testString = testString[:ii] + string(letters[rand.Intn(len(letters))]) + testString[ii+1:]
- ii = rand.Intn(len(testString))
- testString = testString[:ii] + string(letters[rand.Intn(len(letters))]) + testString[ii+1:]
- }
- closest := cm.Closest(testString)
- if closest == originalTestString {
- percentCorrect += 1.0
- } else {
- //fmt.Printf("Original: %s, Mutilated: %s, Match: %s\n", originalTestString, testString, closest)
- }
- numTrials += 1.0
- }
- return 100.0 * percentCorrect / numTrials
- }
- // AccuracyMutatingLetters runs some basic tests against the wordlist to
- // see how accurate this bag-of-characters method is against
- // the target dataset when mutating individual letters (adding, removing, changing)
- func (cm *ClosestMatch) AccuracyMutatingLetters() float64 {
- rand.Seed(1)
- percentCorrect := 0.0
- numTrials := 0.0
- for wordTrials := 0; wordTrials < 200; wordTrials++ {
- var testString, originalTestString string
- testStringNum := rand.Intn(len(cm.ID))
- i := 0
- for id := range cm.ID {
- i++
- if i != testStringNum {
- continue
- }
- originalTestString = cm.ID[id].Key
- break
- }
- testString = originalTestString
- // letters to replace with
- letters := "abcdefghijklmnopqrstuvwxyz"
- choice := rand.Intn(3)
- if choice == 0 {
- // replace random letter
- ii := rand.Intn(len(testString))
- testString = testString[:ii] + string(letters[rand.Intn(len(letters))]) + testString[ii+1:]
- } else if choice == 1 {
- // delete random letter
- ii := rand.Intn(len(testString))
- testString = testString[:ii] + testString[ii+1:]
- } else {
- // add random letter
- ii := rand.Intn(len(testString))
- testString = testString[:ii] + string(letters[rand.Intn(len(letters))]) + testString[ii:]
- }
- closest := cm.Closest(testString)
- if closest == originalTestString {
- percentCorrect += 1.0
- } else {
- //fmt.Printf("Original: %s, Mutilated: %s, Match: %s\n", originalTestString, testString, closest)
- }
- numTrials += 1.0
- }
- return 100.0 * percentCorrect / numTrials
- }
|