1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
|
(def alphabets "abcdefghijklmnopqrstuvwxyz")
(defn words [text] (re-seq #"[a-zA-Z]+" text))
(defn train [features]
(reduce (fn [model word]
(let [lword (clojure.string/lower-case word)
value (get model lword 1)]
(assoc model lword (+ 1 value)))) {} features))
(def nwords (train (words (slurp "data.txt"))))
;; splits
(defn get-splits [word] (map #(conj [] (subs word 0 %) (subs word %)) (range (inc (count word)))))
;; deletes
(defn get-deletes [coll]
(map (fn [[p1 p2]] (apply str (concat p1 (rest p2)))) coll))
;; transposes
(defn get-transposes [coll]
(map (fn [[p1 p2]] (apply str (concat p1 (str (second p2)) (str (first p2)) (drop 2 p2)))) coll))
;; replaces
(defn get-replaces [coll]
(flatten (map (fn [[p1 p2]] (map #(apply str (concat p1 (str %) (rest p2))) alphabets)) coll)))
;; inserts
(defn get-inserts [coll]
(flatten (map (fn [[p1 p2]] (map #(apply str (concat p1 (str %) p2)) alphabets)) coll)))
(defn edits1 [word]
(let [coll (get-splits word)]
(distinct (concat (get-deletes coll)
(get-transposes coll)
(get-replaces coll)
(get-inserts coll)))))
(defn known-edits2 [word]
(distinct
(flatten
(map (fn [e1w] (reduce #(if (contains? nwords %2) (conj % %2) %) [] (edits1 e1w)))
(edits1 word)))))
(defn known [words]
(reduce #(if (contains? nwords %2) (conj % %2) %) [] words))
(defn correct [word]
(let [candidates (or (seq (known [word]))
(seq (edits1 word))
(seq (known-edits2 word))
[word])]
(apply (partial max-key #(get nwords % 1)) candidates)))
;; (correct "speling") ; => spelling
|