Kiran Gangadharan

Implementing Norvig's Spelling Corrector in Clojure

I wanted to try something small in Clojure as an exercise, and so I decided to implement Norvig’s spelling corrector (which is an elegant Python implementation btw). Being a clojure newbie, it was a bit frustating to figure out an idiomatic solution, but I think I’ve managed a decent implementation. It was fun! :)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57

(def alphabets "abcdefghijklmnopqrstuvwxyz")

(defn words [text] (re-seq #"[a-zA-Z]+" text))

(defn train [features]
  (reduce (fn [model word]
            (let [lword (clojure.string/lower-case word)
                  value (get model lword 1)]
              (assoc model lword (+ 1 value)))) {} features))

(def nwords (train (words (slurp "data.txt"))))

;; splits
(defn get-splits [word] (map #(conj [] (subs word 0 %) (subs word %)) (range (inc (count word)))))

;; deletes
(defn get-deletes [coll]
  (map (fn [[p1 p2]] (apply str (concat p1 (rest p2)))) coll))

;; transposes
(defn get-transposes [coll]
  (map (fn [[p1 p2]] (apply str (concat p1 (str (second p2)) (str (first p2)) (drop 2 p2)))) coll))

;; replaces
(defn get-replaces [coll]
  (flatten (map (fn [[p1 p2]] (map #(apply str (concat p1 (str %) (rest p2))) alphabets)) coll)))

;; inserts
(defn get-inserts [coll]
  (flatten (map (fn [[p1 p2]] (map #(apply str (concat p1 (str %) p2)) alphabets)) coll)))

(defn edits1 [word]
  (let [coll (get-splits word)]
    (distinct (concat (get-deletes coll)
                      (get-transposes coll)
                      (get-replaces coll)
                      (get-inserts coll)))))

(defn known-edits2 [word]
  (distinct
   (flatten
    (map (fn [e1w] (reduce #(if (contains? nwords %2) (conj % %2) %) [] (edits1 e1w)))
       (edits1 word)))))

(defn known [words]
  (reduce #(if (contains? nwords %2) (conj % %2) %) [] words))

(defn correct [word]
  (let [candidates (or (seq (known [word]))
                       (seq (edits1 word))
                       (seq (known-edits2 word))
                       [word])]
    (apply (partial max-key #(get nwords % 1)) candidates)))


;; (correct "speling") ; => spelling

You can find the source code with tests here.

Feedback/Comments are always welcome!