8 Commits
v0.2 ... master

5 changed files with 62 additions and 18 deletions

View File

@@ -58,6 +58,9 @@ last schema thing to change:
- list of fields needs to be updated - list of fields needs to be updated
- don't exactly understand that if block but okkkkk.... - don't exactly understand that if block but okkkkk....
## todo
see [Todo.md](Todo.md)
## creating apps ## creating apps

38
Todo.md
View File

@@ -4,7 +4,7 @@ recap of round 1:
- able to grab a google doc, add metadata, index that metadata with search - able to grab a google doc, add metadata, index that metadata with search
- no content, which is the big next step - no content, which is the big next step
## Round 2 ## v0.2 (done)
add content: add content:
- create temp dir - create temp dir
@@ -14,4 +14,40 @@ add content:
- ??? - ???
- profit - profit
## v0.3 (done)
~what is up with html formatting?~
- markdown with html tables is all messed up
- what's up with it? well, we have a bunch of shite Word tables.
- those are rendered as markdown files full of html.
- the html is rendered directly by the page.
- fixed by using pandoc to convert to plain text, not markdown.
- docx -> text, not docx -> markdown
## v0.4
(later can add a step where we do convert to markdown, extract headers, etc.)
indexing: hashing content
delta/main index
## Learnings for Centillion
whoosh:
- convert documents to text, not markdown
- schema for different documents will present the biggest integration challenge
- integration tests?
- None values for fields that do not apply to a record?
- conditional jinja templating?
licensing:
- need to improve readme
- need to unpack the markdown functionality and replace it
flask routes:
- need to think through routes (separate heroku app, maintenance dashboard,
diff/main index)

View File

@@ -345,27 +345,34 @@ class Search:
# This re could probably be improved # This re could probably be improved
name = re.sub('/','_',item['name']) name = re.sub('/','_',item['name'])
# Now make the pandoc input/output filenames # Now make the pandoc input/output filenames
out_ext = 'txt'
pandoc_fmt = 'plain'
if name.endswith(file_ext): if name.endswith(file_ext):
infile_name = name infile_name = name
outfile_name = re.sub(file_ext,'md',infile_name) outfile_name = re.sub(file_ext,out_ext,infile_name)
else: else:
infile_name = name+'.'+file_ext infile_name = name+'.'+file_ext
outfile_name = name+'.md' outfile_name = name+'.'+out_ext
# assemble input/output file paths
fullpath_input = os.path.join(temp_dir,infile_name)
fullpath_output = os.path.join(temp_dir,outfile_name)
# Use requests.get to download url to file # Use requests.get to download url to file
r = requests.get(file_url, allow_redirects=True) r = requests.get(file_url, allow_redirects=True)
fullpath_input = os.path.join(temp_dir,infile_name)
with open(fullpath_input, 'wb') as f: with open(fullpath_input, 'wb') as f:
f.write(r.content) f.write(r.content)
# Try to convert docx file to markdown # Try to convert docx file to plain text
fullpath_output = os.path.join(temp_dir,outfile_name)
try: try:
output = pypandoc.convert_file(fullpath_input,'gfm',format='docx',outputfile=fullpath_output) output = pypandoc.convert_file(fullpath_input,
pandoc_fmt,
format='docx',
outputfile=fullpath_output
)
assert output == "" assert output == ""
except RuntimeError: except RuntimeError:
print("XXXXXX Failed to index document %s"%(item['name'])) print("XXXXXX Failed to index document %s"%(item['name']))
@@ -383,11 +390,11 @@ class Search:
# No matter what happens, clean up. # No matter what happens, clean up.
print("Cleaning up %s"%item['name']) print("Cleaning up %s"%item['name'])
#subprocess.call(['rm','-fr',fullpath_output]) subprocess.call(['rm','-fr',fullpath_output])
print(" ".join(['rm','-fr',fullpath_output])) #print(" ".join(['rm','-fr',fullpath_output]))
#subprocess.call(['rm','-fr',fullpath_input]) subprocess.call(['rm','-fr',fullpath_input])
print(" ".join(['rm','-fr',fullpath_input])) #print(" ".join(['rm','-fr',fullpath_input]))
mimetype = re.split('[/\.]', item['mimeType'])[-1] mimetype = re.split('[/\.]', item['mimeType'])[-1]
@@ -403,7 +410,6 @@ class Search:
) )
def create_search_result(self, results): def create_search_result(self, results):
# Allow larger fragments # Allow larger fragments
results.fragmenter.maxchars = 300 results.fragmenter.maxchars = 300

View File

@@ -3,6 +3,8 @@ apiclient>=1.0.3
oauth2client>=3.0.0 oauth2client>=3.0.0
httplib2>=0.10.3 httplib2>=0.10.3
google-api-python-client google-api-python-client
mistune>=0.8.3 mistune>=0.8
whoosh>=2.7.4 whoosh>=2.7.4
pypandoc>=1.4 pypandoc>=1.4
requests>=2.19
pandoc>=1.0

View File

@@ -30,14 +30,11 @@
{% for e in entries %} {% for e in entries %}
<tr> <tr>
<td class="search-result"> <td class="search-result">
<!--
<div class="path"><a href='{{ url_for("open_file")}}?path={{e.path|urlencode}}&query={{query}}&fields={{fields}}'>{{e.path}}</a>score: {{'%d' % e.score}}</div>
-->
<div class="url"> <div class="url">
<a href='{{e.url}}'>{{e.title}} ({{e.mimetype}})</a><br /> <a href='{{e.url}}'>{{e.title}} ({{e.mimetype}})</a><br />
score: {{'%d' % e.score}} score: {{'%d' % e.score}}
</div> </div>
<div class="markdown-body">{{ e.content_highlight|safe}}</div> <div class="markdown-body">{{e.content_highlight|safe}}</div>
</td> </td>
</tr> </tr>
{% endfor %} {% endfor %}