closestmatch.go 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311
  1. package closestmatch
  2. import (
  3. "encoding/gob"
  4. "math/rand"
  5. "os"
  6. "sort"
  7. "strings"
  8. )
  9. // ClosestMatch is the structure that contains the
  10. // substring sizes and carrys a map of the substrings for
  11. // easy lookup
  12. type ClosestMatch struct {
  13. SubstringSizes []int
  14. SubstringToID map[string]map[uint32]struct{}
  15. ID map[uint32]IDInfo
  16. }
  17. // IDInfo carries the information about the keys
  18. type IDInfo struct {
  19. Key string
  20. NumSubstrings int
  21. }
  22. // New returns a new structure for performing closest matches
  23. func New(possible []string, subsetSize []int) *ClosestMatch {
  24. cm := new(ClosestMatch)
  25. cm.SubstringSizes = subsetSize
  26. cm.SubstringToID = make(map[string]map[uint32]struct{})
  27. cm.ID = make(map[uint32]IDInfo)
  28. for i, s := range possible {
  29. substrings := cm.splitWord(strings.ToLower(s))
  30. cm.ID[uint32(i)] = IDInfo{Key: s, NumSubstrings: len(substrings)}
  31. for substring := range substrings {
  32. if _, ok := cm.SubstringToID[substring]; !ok {
  33. cm.SubstringToID[substring] = make(map[uint32]struct{})
  34. }
  35. cm.SubstringToID[substring][uint32(i)] = struct{}{}
  36. }
  37. }
  38. return cm
  39. }
  40. // Load can load a previously saved ClosestMatch object from disk
  41. func Load(filename string) (*ClosestMatch, error) {
  42. cm := new(ClosestMatch)
  43. f, err := os.Open(filename)
  44. defer f.Close()
  45. if err != nil {
  46. return cm, err
  47. }
  48. err = gob.NewDecoder(f).Decode(&cm)
  49. return cm, err
  50. }
  51. // Save writes the current ClosestSave object as a gzipped JSON file
  52. func (cm *ClosestMatch) Save(filename string) error {
  53. f, err := os.Create(filename)
  54. if err != nil {
  55. return err
  56. }
  57. defer f.Close()
  58. enc := gob.NewEncoder(f)
  59. return enc.Encode(cm)
  60. }
  61. func (cm *ClosestMatch) worker(id int, jobs <-chan job, results chan<- result) {
  62. for j := range jobs {
  63. m := make(map[string]int)
  64. if ids, ok := cm.SubstringToID[j.substring]; ok {
  65. weight := 200000 / len(ids)
  66. for id := range ids {
  67. if _, ok2 := m[cm.ID[id].Key]; !ok2 {
  68. m[cm.ID[id].Key] = 0
  69. }
  70. m[cm.ID[id].Key] += 1 + 0*weight
  71. }
  72. }
  73. results <- result{m: m}
  74. }
  75. }
  76. type job struct {
  77. substring string
  78. }
  79. type result struct {
  80. m map[string]int
  81. }
  82. func (cm *ClosestMatch) match(searchWord string) map[string]int {
  83. searchSubstrings := cm.splitWord(searchWord)
  84. searchSubstringsLen := len(searchSubstrings)
  85. jobs := make(chan job, searchSubstringsLen)
  86. results := make(chan result, searchSubstringsLen)
  87. workers := 8
  88. for w := 1; w <= workers; w++ {
  89. go cm.worker(w, jobs, results)
  90. }
  91. for substring := range searchSubstrings {
  92. jobs <- job{substring: substring}
  93. }
  94. close(jobs)
  95. m := make(map[string]int)
  96. for a := 1; a <= searchSubstringsLen; a++ {
  97. r := <-results
  98. for key := range r.m {
  99. if _, ok := m[key]; ok {
  100. m[key] += r.m[key]
  101. } else {
  102. m[key] = r.m[key]
  103. }
  104. }
  105. }
  106. return m
  107. }
  108. // Closest searches for the `searchWord` and returns the closest match
  109. func (cm *ClosestMatch) Closest(searchWord string) string {
  110. for _, pair := range rankByWordCount(cm.match(searchWord)) {
  111. return pair.Key
  112. }
  113. return ""
  114. }
  115. // ClosestN searches for the `searchWord` and returns the n closests matches
  116. func (cm *ClosestMatch) ClosestN(searchWord string, n int) []string {
  117. matches := make([]string, n)
  118. j := 0
  119. for i, pair := range rankByWordCount(cm.match(searchWord)) {
  120. if i == n {
  121. break
  122. }
  123. matches[i] = pair.Key
  124. j = i
  125. }
  126. return matches[:j+1]
  127. }
  128. func rankByWordCount(wordFrequencies map[string]int) PairList {
  129. pl := make(PairList, len(wordFrequencies))
  130. i := 0
  131. for k, v := range wordFrequencies {
  132. pl[i] = Pair{k, v}
  133. i++
  134. }
  135. sort.Sort(sort.Reverse(pl))
  136. return pl
  137. }
  138. type Pair struct {
  139. Key string
  140. Value int
  141. }
  142. type PairList []Pair
  143. func (p PairList) Len() int { return len(p) }
  144. func (p PairList) Less(i, j int) bool { return p[i].Value < p[j].Value }
  145. func (p PairList) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
  146. func (cm *ClosestMatch) splitWord(word string) map[string]struct{} {
  147. wordHash := make(map[string]struct{})
  148. for _, j := range cm.SubstringSizes {
  149. for i := 0; i < len(word)-j; i++ {
  150. substring := string(word[i : i+j])
  151. if len(strings.TrimSpace(substring)) > 0 {
  152. wordHash[string(word[i:i+j])] = struct{}{}
  153. }
  154. }
  155. }
  156. return wordHash
  157. }
  158. // AccuracyMutatingWords runs some basic tests against the wordlist to
  159. // see how accurate this bag-of-characters method is against
  160. // the target dataset
  161. func (cm *ClosestMatch) AccuracyMutatingWords() float64 {
  162. rand.Seed(1)
  163. percentCorrect := 0.0
  164. numTrials := 0.0
  165. for wordTrials := 0; wordTrials < 200; wordTrials++ {
  166. var testString, originalTestString string
  167. testStringNum := rand.Intn(len(cm.ID))
  168. i := 0
  169. for id := range cm.ID {
  170. i++
  171. if i != testStringNum {
  172. continue
  173. }
  174. originalTestString = cm.ID[id].Key
  175. break
  176. }
  177. var words []string
  178. choice := rand.Intn(3)
  179. if choice == 0 {
  180. // remove a random word
  181. words = strings.Split(originalTestString, " ")
  182. if len(words) < 3 {
  183. continue
  184. }
  185. deleteWordI := rand.Intn(len(words))
  186. words = append(words[:deleteWordI], words[deleteWordI+1:]...)
  187. testString = strings.Join(words, " ")
  188. } else if choice == 1 {
  189. // remove a random word and reverse
  190. words = strings.Split(originalTestString, " ")
  191. if len(words) > 1 {
  192. deleteWordI := rand.Intn(len(words))
  193. words = append(words[:deleteWordI], words[deleteWordI+1:]...)
  194. for left, right := 0, len(words)-1; left < right; left, right = left+1, right-1 {
  195. words[left], words[right] = words[right], words[left]
  196. }
  197. } else {
  198. continue
  199. }
  200. testString = strings.Join(words, " ")
  201. } else {
  202. // remove a random word and shuffle and replace 2 random letters
  203. words = strings.Split(originalTestString, " ")
  204. if len(words) > 1 {
  205. deleteWordI := rand.Intn(len(words))
  206. words = append(words[:deleteWordI], words[deleteWordI+1:]...)
  207. for i := range words {
  208. j := rand.Intn(i + 1)
  209. words[i], words[j] = words[j], words[i]
  210. }
  211. }
  212. testString = strings.Join(words, " ")
  213. letters := "abcdefghijklmnopqrstuvwxyz"
  214. if len(testString) == 0 {
  215. continue
  216. }
  217. ii := rand.Intn(len(testString))
  218. testString = testString[:ii] + string(letters[rand.Intn(len(letters))]) + testString[ii+1:]
  219. ii = rand.Intn(len(testString))
  220. testString = testString[:ii] + string(letters[rand.Intn(len(letters))]) + testString[ii+1:]
  221. }
  222. closest := cm.Closest(testString)
  223. if closest == originalTestString {
  224. percentCorrect += 1.0
  225. } else {
  226. //fmt.Printf("Original: %s, Mutilated: %s, Match: %s\n", originalTestString, testString, closest)
  227. }
  228. numTrials += 1.0
  229. }
  230. return 100.0 * percentCorrect / numTrials
  231. }
  232. // AccuracyMutatingLetters runs some basic tests against the wordlist to
  233. // see how accurate this bag-of-characters method is against
  234. // the target dataset when mutating individual letters (adding, removing, changing)
  235. func (cm *ClosestMatch) AccuracyMutatingLetters() float64 {
  236. rand.Seed(1)
  237. percentCorrect := 0.0
  238. numTrials := 0.0
  239. for wordTrials := 0; wordTrials < 200; wordTrials++ {
  240. var testString, originalTestString string
  241. testStringNum := rand.Intn(len(cm.ID))
  242. i := 0
  243. for id := range cm.ID {
  244. i++
  245. if i != testStringNum {
  246. continue
  247. }
  248. originalTestString = cm.ID[id].Key
  249. break
  250. }
  251. testString = originalTestString
  252. // letters to replace with
  253. letters := "abcdefghijklmnopqrstuvwxyz"
  254. choice := rand.Intn(3)
  255. if choice == 0 {
  256. // replace random letter
  257. ii := rand.Intn(len(testString))
  258. testString = testString[:ii] + string(letters[rand.Intn(len(letters))]) + testString[ii+1:]
  259. } else if choice == 1 {
  260. // delete random letter
  261. ii := rand.Intn(len(testString))
  262. testString = testString[:ii] + testString[ii+1:]
  263. } else {
  264. // add random letter
  265. ii := rand.Intn(len(testString))
  266. testString = testString[:ii] + string(letters[rand.Intn(len(letters))]) + testString[ii:]
  267. }
  268. closest := cm.Closest(testString)
  269. if closest == originalTestString {
  270. percentCorrect += 1.0
  271. } else {
  272. //fmt.Printf("Original: %s, Mutilated: %s, Match: %s\n", originalTestString, testString, closest)
  273. }
  274. numTrials += 1.0
  275. }
  276. return 100.0 * percentCorrect / numTrials
  277. }