Browse Source

update diff_by_ scripts and descriptions in readme

master
Charles Reid 9 months ago
parent
commit
56a4d4dc6d
3 changed files with 94 additions and 32 deletions
  1. +17
    -3
      README.md
  2. +69
    -20
      diff_by_n.py
  3. +8
    -9
      diff_by_one_fixed.py

+ 17
- 3
README.md View File

@@ -5,6 +5,8 @@ as well as scripts to run various combinatoric experiments,
graph algorithms, and other algorithms to explore the
relationships among these words.



The list of words comes from [[1]](http://www-cs-faculty.stanford.edu/~knuth/sgb.html) and is in the public domain.

## Get Words
@@ -19,9 +21,21 @@ the SGB five letter word list. Solutions to these exercises are listed below.

```distinct.py```- computes the number of SGB words containing exactly k distinct letters.

```diff_by_one.py``` - computes the number of words in the SGB that are off by a single letter,
shifted a single place. For example, "might" and "night" or "large" and "marge". There is a
surprisingly large number of such pairs.
```diff_by_one_fixed.py``` - (fixed 2019-03-09.) computes the number of pairs of words in the SGB
whose letters differ by exactly one alphabet step in every position. An example is `rover` and `spuds`:
each corresponding letter is shifted by one: `r -> s`, `o -> p`, `v -> u`, and so on.
This uses recursive backtracking to generate possible matches for each word, and
uses a hash table to check for their existence in the original word set.

There are 38 such pairs in the SGB.

Also see [https://charlesreid1.com/wiki/F](https://charlesreid1.com/wiki/Letter_Coverage).

```diff_by_n.py``` - (added 2019-03-10.) using the corrected approach (above) to
computing differences by 1, this generalizes the calculation to words that are different
by a distance `d` for each letter position. The script reports results for `d` = 2, 3, 4, and 5.



```euclidean_distance.py``` - computes the Euclidean distance between two words. This uses
the traditional Euclidean distance definition but reinterprets distance to mean edit distance.

+ 69
- 20
diff_by_n.py View File

@@ -7,30 +7,79 @@ Variation on Exercise #28
Find pairs of SGB word vectors that differ by +/-n.
"""
from get_words import get_words
from euclidean_distance import euclidean_distance

def diff_by_n(n):
k = 0
off_by_one = []
for i in range(len(words)):
for j in range(i,len(words)):
d = euclidean_distance(words[i],words[j])
if(abs(d)==n):
k += 1
off_by_one.append((words[i],words[j]))
print("{0:s}, {1:s}".format(words[i],words[j]))
if k>5:
break
def gen_variations(word,fragment,distance,depth,variations):
    """
    Recursive backtracking method to assemble strings whose character
    at each position is shifted from the corresponding character of
    `word` by a non-zero offset of magnitude 1..distance (both signs).

    Parameters:
        word:       the reference string.
        fragment:   the candidate assembled so far (pass '' to start).
        distance:   largest per-position offset (in code points) to try.
        depth:      index of the next position to fill (pass 0 to start).
        variations: set that completed candidates are added to; it is
                    mutated in place and nothing is returned.

    The candidates are raw code-point shifts, so they may contain
    non-letter characters (e.g. 'a' - 1 is '`'); callers are expected
    to filter them against a real word list.
    """
    # Stop at the end of the word rather than at a hard-coded length of 5,
    # so words of any length are handled; 5-letter words behave as before.
    if depth >= len(word):
        variations.add(fragment)
        return

    base = ord(word[depth])
    for d in range(1, distance + 1):
        # Same order as before: +d branch first, then -d.
        for code in (base + d, base - d):
            gen_variations(word, fragment + chr(code), distance, depth + 1, variations)

print("{0:d} words have a Euclidean distance of +/-{0:d}.".format(k,n))

def get_all_variations(word,d):
"""
Return all possible words that differ
from `word` by +/-d in each index.
This does not include `word` in the
variations.
"""
word_variations = set()
gen_variations(word,'',d,0,word_variations)

if __name__=="__main__":
words = get_words()
word_variations = list(word_variations)
return word_variations


def main():
"""
Find pairs of SGB word vectors that differ by
+/-d in each component.
To do this, iterate through each word,
generate the possible candidate matchings,
and if they exist, add the pair to a set.
"""
words = set(get_words())
#words = words[:1000]
words = set(get_words())

for d in [2,3,4,5]:

# List of string tuples
off_by_n = set()

for n in [2,3,4,5]:
print("-"*40)
print("Distance of {0:d}".format(n))
diff_by_n(n)
# Iterate over every word
for iw,word in enumerate(words):
# Generate all possible candidate matches
# distance +/-d from this word at each
# position
all_vars = get_all_variations(word,d)
for word_var in all_vars:
if word_var in words:
# Found a new (unordered) pair
if word<word_var:
left=word
right=word_var
else:
left=word_var
right=word
off_by_n.add((left,right))
off_by_n = list(off_by_n)
off_by_n.sort()

for o in off_by_n[:10]:
print("{:s} {:s}".format(o[0],o[1]))

print("Found {0:d} pairs of words that differ by +/-{1:d} in each component.".format(len(off_by_n),d))

if __name__=="__main__":
main()


+ 8
- 9
diff_by_one_fixed.py View File

@@ -10,6 +10,10 @@ from get_words import get_words


def gen_variations(word,fragment,depth,variations):
"""
Recursive backtracking method to assemble strings
differing by +/-1 at each position
"""
if depth==5:
variations.add(fragment)
else:
@@ -20,7 +24,6 @@ def gen_variations(word,fragment,depth,variations):
gen_variations(word,new_fragment,depth+1,variations)



def get_all_variations(word):
"""
Return all possible words that differ
@@ -34,6 +37,7 @@ def get_all_variations(word):
word_variations = list(word_variations)
return word_variations


def main():
"""
Find pairs of SGB word vectors that differ by +/-1 in each component.
@@ -42,17 +46,12 @@ def main():
generate the 32 possible candidate matchings,
and if they exist, add the pair to a set.
"""


# words is a hash table (unsorted)
words = set(get_words())

## To limit the output:
words = get_words()
#words = words[:1000]
words = set(get_words())

k = 0

# List of string tuples ('this','that')
# List of string tuples
off_by_one = set()

# Iterate over every word

Loading…
Cancel
Save