Browse Source

update diff_by_ scripts and descriptions in readme

Charles Reid 5 months ago
parent
commit
56a4d4dc6d
3 changed files with 94 additions and 32 deletions
  1. 17
    3
      README.md
  2. 69
    20
      diff_by_n.py
  3. 8
    9
      diff_by_one_fixed.py

+ 17
- 3
README.md View File

@@ -5,6 +5,8 @@ as well as scripts to run various combinatoric experiments,
5 5
 graph algorithms, and other algorithms to explore the 
6 6
 relationships among these words.
7 7
 
8
+
9
+
8 10
 The list of words comes from [[1]](http://www-cs-faculty.stanford.edu/~knuth/sgb.html) and is in the public domain.
9 11
 
10 12
 ## Get Words
@@ -19,9 +21,21 @@ the SGB five letter word list. Solutions to these exercises are listed below.
19 21
 
20 22
 ```distinct.py```- computes the number of SGB words containing exactly k distinct letters.
21 23
 
22
-```diff_by_one.py``` - computes the number of words in the SGB that are off by a single letter,
23
-shifted a single place. For example, "might" and "night" or "large" and "marge". There is a 
24
-surprisingly large number of such pairs.
24
+```diff_by_one_fixed.py``` - (fixed 2019-03-09.) computes the number of pairs of words in the SGB
25
+that differ by exactly one alphabet step (+/-1) in each letter position. An example is `rover` and `spuds`.
26
+Each pair of corresponding letters differs by exactly one: `r -> s`, `o -> p`, and so on.
27
+This uses recursive backtracking to generate possible matches for each word, and 
28
+uses a hash table to check for their existence in the original word set.
29
+
30
+There are 38 such pairs in the SGB.
31
+
32
+Also see [https://charlesreid1.com/wiki/Letter_Coverage](https://charlesreid1.com/wiki/Letter_Coverage).
33
+
34
+```diff_by_n.py``` - (added 2019-03-10.) using the corrected approach (above) to
35
+computing differences by 1, this generalizes the calculation to words that are different
36
+by a distance `d` for each letter position.
37
+
38
+
25 39
 
26 40
 ```euclidean_distance.py``` - computes the euclidean distance between two words. This uses
27 41
 the traditional Euclidean distance definition but reinterprets distance to mean edit distance.

+ 69
- 20
diff_by_n.py View File

@@ -7,30 +7,79 @@ Variation on Exercise #28
7 7
 Find pairs of SGB word vectors that differ by +/-n.
8 8
 """
9 9
 from get_words import get_words
10
-from euclidean_distance import euclidean_distance
11 10
 
12
-def diff_by_n(n):
13
-    k = 0
14
-    off_by_one = []
15
-    for i in range(len(words)):
16
-        for j in range(i,len(words)):
17
-            d = euclidean_distance(words[i],words[j])
18
-            if(abs(d)==n):
19
-                k += 1
20
-                off_by_one.append((words[i],words[j]))
21
-                print("{0:s}, {1:s}".format(words[i],words[j]))
22
-        if k>5:
23
-            break
11
+def gen_variations(word,fragment,distance,depth,variations):
12
+    """
13
+    Recursive backtracking method to assemble strings
14
+    differing by +/-distance at each position
15
+    """
16
+    if depth==5:
17
+        variations.add(fragment)
18
+    else:
19
+        for d in range(1,distance+1):
20
+            fragment_pd = fragment + chr(ord(word[depth])+d)
21
+            fragment_md = fragment + chr(ord(word[depth])-d)
22
+            for new_fragment in [fragment_pd,fragment_md]:
23
+                gen_variations(word,new_fragment,distance,depth+1,variations)
24 24
 
25
-    print("{0:d} words have a Euclidean distance of +/-{0:d}.".format(k,n))
26 25
 
26
+def get_all_variations(word,d):
27
+    """
28
+    Return all possible words that differ
29
+    from `word` by +/-d in each index.
30
+    This does not include `word` in the 
31
+    variations.
32
+    """
33
+    word_variations = set()
34
+    gen_variations(word,'',d,0,word_variations)
27 35
 
28
-if __name__=="__main__":
29
-    words = get_words()
36
+    word_variations = list(word_variations)
37
+    return word_variations
38
+
39
+
40
+def main():
41
+    """
42
+    Find pairs of SGB word vectors that differ by 
43
+    +/-d in each component.
44
+    
45
+    To do this, iterate through each word,
46
+    generate the possible candidate matchings,
47
+    and if they exist, add the pair to a set.
48
+    """
49
+    words = set(get_words())
30 50
     #words = words[:1000]
51
+    words = set(get_words())
52
+
53
+    for d in [2,3,4,5]:
54
+
55
+        # List of string tuples
56
+        off_by_n = set()
31 57
 
32
-    for n in [2,3,4,5]:
33
-        print("-"*40)
34
-        print("Distance of {0:d}".format(n))
35
-        diff_by_n(n)
58
+        # Iterate over every word
59
+        for iw,word in enumerate(words):
60
+            # Generate all possible candidate matches
61
+            # distance +/-d from this word at each
62
+            # position
63
+            all_vars = get_all_variations(word,d)
64
+            for word_var in all_vars:
65
+                if word_var in words:
66
+                    # Found a new (unordered) pair
67
+                    if word<word_var:
68
+                        left=word
69
+                        right=word_var
70
+                    else:
71
+                        left=word_var
72
+                        right=word
73
+                    off_by_n.add((left,right))
74
+        
75
+        off_by_n = list(off_by_n)
76
+        off_by_n.sort()
77
+
78
+        for o in off_by_n[:10]:
79
+            print("{:s} {:s}".format(o[0],o[1]))
80
+
81
+        print("Found {0:d} pairs of words that differ by +/-{1:d} in each component.".format(len(off_by_n),d))
82
+
83
+if __name__=="__main__":
84
+    main()
36 85
 

+ 8
- 9
diff_by_one_fixed.py View File

@@ -10,6 +10,10 @@ from get_words import get_words
10 10
 
11 11
 
12 12
 def gen_variations(word,fragment,depth,variations):
13
+    """
14
+    Recursive backtracking method to assemble strings
15
+    differing by +/-1 at each position
16
+    """
13 17
     if depth==5:
14 18
         variations.add(fragment)
15 19
     else:
@@ -20,7 +24,6 @@ def gen_variations(word,fragment,depth,variations):
20 24
             gen_variations(word,new_fragment,depth+1,variations)
21 25
 
22 26
 
23
-
24 27
 def get_all_variations(word):
25 28
     """
26 29
     Return all possible words that differ
@@ -34,6 +37,7 @@ def get_all_variations(word):
34 37
     word_variations = list(word_variations)
35 38
     return word_variations
36 39
 
40
+
37 41
 def main():
38 42
     """
39 43
     Find pairs of SGB word vectors that differ by +/-1 in each component.
@@ -42,17 +46,12 @@ def main():
42 46
     generate the 32 possible candidate matchings,
43 47
     and if they exist, add the pair to a set.
44 48
     """
45
-
46
-
47 49
     # words is a hash table (unsorted)
48
-    words = set(get_words())
49
-
50
-    ## To limit the output:
50
+    words = get_words()
51 51
     #words = words[:1000]
52
+    words = set(get_words())
52 53
 
53
-    k = 0
54
-
55
-    # List of string tuples ('this','that')
54
+    # List of string tuples
56 55
     off_by_one = set()
57 56
 
58 57
     # Iterate over every word