Experiments with Knuth's 5,757 five letter words. https://charlesreid1.com/wiki/Five_Letter_Words
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

273 lines
8.5KB

  1. """
  2. word_coverage.py
  3. Compute the minimum number of words from five-letter-words
  4. needed to cover N letters from the alphabet.
  5. This can be done in O(N^2) time with a dynamic program.
  6. For each word that we choose, we have to look at all other words
  7. to see how many letters those two cover, combined.
  8. https://charlesreid1.com/wiki/Five_Letter_Words
  9. https://charlesreid1.com/wiki/Letter_Coverage
  10. """
  11. from get_words import get_words
  12. import numpy as np
  13. from pprint import pprint
  14. def word2bitvector(word,N):
  15. """
  16. Turns a five-letter word into a bit vector representing character coverage.
  17. Uses 26 letters by default.
  18. """
  19. bit_vector = [False,]*N
  20. for c in word:
  21. i = ord(c)-ord('a')
  22. try:
  23. bit_vector[i] = True
  24. except IndexError:
  25. pass
  26. return np.array(bit_vector)
  27. def printbv(bv):
  28. """
  29. Pretty printing for boolean bit vector
  30. """
  31. result = ""
  32. for bit in bv:
  33. if bit:
  34. result += "1"
  35. else:
  36. result += "0"
  37. return result
  38. def btsolution(min_key, min_val, words, bt):
  39. """
  40. Reconstruct the sequence of words that gives maximum coverage and minimum word count.
  41. Input: minimum word key (last word), minimum value (number of words), backtrack (prior word)
  42. Output: list of words
  43. """
  44. solution = []
  45. solution.append(words[min_key])
  46. prior_key = bt[min_key]
  47. while prior_key != -1:
  48. solution.append(words[prior_key])
  49. prior_key = bt[prior_key]
  50. return reversed(solution)
  51. def get_dummy_words():
  52. return ["aa","ab","bc","aa","dd","de","bb"]
  53. if __name__=="__main__":
  54. # Searching for words covering first N letters
  55. N = 15
  56. words = get_words()
  57. words = words[:1000]
  58. # Initialization:
  59. # ----------------
  60. # Store best coverage bitvectors for each word
  61. bestcoverage_bv = [np.array([False]*N) for k in range(len(words))]
  62. # Store number of 1s for best coverage vector for each word
  63. ones_bv = [-1]*len(words)
  64. # Store number of words in best solution for each word
  65. ws = [0]*len(words)
  66. # Store prior word for backtracking
  67. bt = [-1]*len(words)
  68. # Fencepost: Initial Step
  69. # Word 0
  70. # ----------------
  71. i = 0
  72. # Start with word 0
  73. wi = words[i]
  74. # Best letter coverage bit vector
  75. bestcoverage_bv[i] = word2bitvector(words[i],N)
  76. # Length of 1s
  77. ones_bv[i] = sum(bestcoverage_bv[i])
  78. # Number of words in best solution:
  79. ws[i] = 1
  80. # Backtracking: first word has no prior word
  81. bt[i] = -1
  82. # -----------------------------------------------------------------
  83. print("-"*20)
  84. print("Analyzing word "+str(i)+" "+words[i])
  85. print("----")
  86. print("Prior word: there is none")
  87. print("Best coverage bit vector at index 0: "+printbv(bestcoverage_bv[i]))
  88. print("Best coverage bit vector sum: "+str(ones_bv[i]))
  89. print("(This needs to > beat: nobody. there is none.")
  90. # -----------------------------------------------------------------
  91. #import pdb; pdb.set_trace();
  92. # Start by assuming the word by itself,
  93. # and then examine each possible pairing
  94. for i in range(1,len(words)):
  95. wi = words[i]
  96. # Start with bitvector of word i's coverage
  97. wi_bv = word2bitvector(wi,N)
  98. # Fencepost: initial step
  99. # Word i by itself
  100. # Assume word i is the first word in the solution,
  101. # and if we find a better combination with prior word,
  102. # overwrite this solution.
  103. # ------------------------
  104. # Best coverage so far (first guess) is word i by itself
  105. bestcoverage_bv[i] = wi_bv
  106. # Count ones in (first guess) best bitvector
  107. ones_bv[i] = sum(bestcoverage_bv[i])
  108. # Number of words in new best solution:
  109. ws[i] = 1
  110. # Backtracking
  111. bt[i] = -1
  112. # Boolean: is this the first word in the sequence of solutions?
  113. first = True
  114. # -----------------------------------------------------------------
  115. print("-"*20)
  116. print("For word "+str(i)+" "+words[i])
  117. print("Bitvector coverage at index "+str(i)+" is: "+printbv(bestcoverage_bv[i]))
  118. print("about to loop over remaining words...")
  119. print("----")
  120. print("Prior word: there is none")
  121. print("Best coverage bit vector for first guess solution: "+printbv(bestcoverage_bv[i]))
  122. print("Best coverage bit vector sum for first guess solution: "+str(ones_bv[i]))
  123. print("Number of words in first guess solution: "+str(ws[i]))
  124. print("(first guess solution is now current solution)")
  125. # -----------------------------------------------------------------
  126. #import pdb; pdb.set_trace();
  127. # Now loop over the rest of the words,
  128. # and look for a better solution.
  129. for j in reversed(range(0,i)):
  130. ##########################
  131. # Note: we need to consider the word j
  132. # by itself, as the first word of our solution.
  133. #
  134. # A bit lost here.
  135. # Something's not quite right.
  136. # While some of the intermediate calcs are ok,
  137. # we are *never* finding better solutions...???
  138. ##########################
  139. # Get the prior word
  140. wj = words[j]
  141. # Get best coverage bitvector
  142. wj_bv = bestcoverage_bv[j]
  143. # (potential) new combined coverage vector
  144. bestcoverage_bv_i = np.logical_or(wi_bv, wj_bv)
  145. # Number of ones in (potential) new combined coverage vector
  146. ones_bv_i = sum(bestcoverage_bv_i)
  147. # Number of words in (potential) new best solution
  148. ws_i = ws[j]+1
  149. # -----------------------------------------------------------------
  150. print("----")
  151. print("Prior word: "+wj)
  152. print("Best coverage bit vector at index "+str(i))
  153. pprint(bestcoverage_bv_i)
  154. print("")
  155. print("Best coverage bit vector for this solution: "+printbv(bestcoverage_bv_i))
  156. print("Best coverage bit vector sum for this solution: "+str(ones_bv_i))
  157. print("Number of words in this solution: "+str(ws_i))
  158. print("")
  159. print("Best coverage bit vector for current solution: "+printbv(bestcoverage_bv[i]))
  160. print("Best coverage bit vector sum for current solution: "+str(ones_bv[i]))
  161. print("Number of words in current solution: "+str(ws[i]))
  162. print("")
  163. print( "Better ones? " + str(ones_bv_i > ones_bv[i]) )
  164. print( "Same ones but better words? " + str((ones_bv_i==ones_bv[i]) and ws_i < ws[i]) )
  165. if( (ones_bv_i > ones_bv[i]) or (ones_bv_i==ones_bv[i] and ws_i < ws[i]) ):
  166. print("Replacing current solution with this solution.")
  167. else:
  168. print("Keeping current solution and tossing out this solution.")
  169. # -----------------------------------------------------------------
  170. # If this solution is better than our current one,
  171. # overwrite the current solution.
  172. # (Better means, "more ones", or "same ones and fewer words".)
  173. #import pdb; pdb.set_trace();
  174. if( (ones_bv_i > ones_bv[i]) or (ones_bv_i==ones_bv[i] and ws_i < ws[i]) ):
  175. bestcoverage_bv[i] = bestcoverage_bv_i
  176. ones_bv[i] = ones_bv_i
  177. ws[i] = ws_i
  178. bt[i] = j
  179. # This word now follows another word in the sequence of solutions
  180. first = False
  181. # It's tempting to stop early,
  182. # but what if we find the perfect
  183. # solution right at the end?!?
  184. # -----------------------------------------------------------------
  185. print("----")
  186. print("Final:")
  187. pprint(bestcoverage_bv[i])
  188. print("-"*20)
  189. # -----------------------------------------------------------------
  190. # Okay, now actually get the solution.
  191. # The solution is the maximum of ones_bv and the minimum of ws
  192. #
  193. # Start by finding the maximum(s) of ones_bv
  194. # Then check each corresponding index of ws
  195. ones_bv_indices = [k for k,v in enumerate(ones_bv) if v==max(ones_bv)]
  196. min_key = ones_bv_indices[0]
  197. min_val = ones_bv[ones_bv_indices[0]]
  198. for ix in reversed(ones_bv_indices[1:]):
  199. if(ones_bv[ix] < min_key):
  200. min_key = ix
  201. min_val = ones_bv[ix]
  202. solution = list(btsolution(min_key, min_val, words, bt))
  203. print("Takes "+str(len(solution))+" words to cover "+str(N)+" letters")
  204. pprint(solution)