Compare commits
6 Commits
Author | SHA1 | Date | |
---|---|---|---|
19f3053875 | |||
2a8ab4b1e2 | |||
58c4ec4b32 | |||
2978efce63 | |||
b871d417a0 | |||
b5755c656b |
@@ -58,6 +58,9 @@ last schema thing to change:
|
||||
- list of fields needs to be updated
|
||||
- don't exactly understand that if block but okkkkk....
|
||||
|
||||
## todo
|
||||
|
||||
see [Todo.md](Todo.md)
|
||||
|
||||
## creating apps
|
||||
|
||||
|
38
Todo.md
38
Todo.md
@@ -4,7 +4,7 @@ recap of round 1:
|
||||
- able to grab a google doc, add metadata, index that metadata with search
|
||||
- no content, which is the big next step
|
||||
|
||||
## Round 2
|
||||
## v0.2 (done)
|
||||
|
||||
add content:
|
||||
- create temp dir
|
||||
@@ -14,4 +14,40 @@ add content:
|
||||
- ???
|
||||
- profit
|
||||
|
||||
## v0.3 (done)
|
||||
|
||||
~what is up with html formatting?~
|
||||
- markdown with html tables is all messed up
|
||||
- what's up with it? well, we have a bunch of shite word tables.
|
||||
- those are rendered as markdown files full of html.
|
||||
- the html is rendered directly by the page.
|
||||
- fixed by using pandoc to convert to plain text, not markdown.
|
||||
- docx -> text, not docx -> markdown
|
||||
|
||||
## v0.4
|
||||
|
||||
(later we can add a step that converts to markdown, extracts headers, etc.)
|
||||
|
||||
indexing: hashing content
|
||||
|
||||
delta/main index
|
||||
|
||||
## Learnings for Centillion
|
||||
|
||||
whoosh:
|
||||
- convert documents to text, not markdown
|
||||
- schema for different documents will present the biggest integration challenge
|
||||
- integration tests?
|
||||
- None values for fields that do not apply to a record?
|
||||
- conditional jinja templating?
|
||||
|
||||
licensing:
|
||||
- need to improve readme
|
||||
- need to unpack the markdown functionality and replace it
|
||||
|
||||
flask routes:
|
||||
- need to think through routes (separate heroku app, maintenance dashboard,
|
||||
diff/main index)
|
||||
|
||||
|
||||
|
||||
|
@@ -345,27 +345,34 @@ class Search:
|
||||
# This regex could probably be improved
|
||||
name = re.sub('/','_',item['name'])
|
||||
|
||||
|
||||
# Now make the pandoc input/output filenames
|
||||
out_ext = 'txt'
|
||||
pandoc_fmt = 'plain'
|
||||
if name.endswith(file_ext):
|
||||
infile_name = name
|
||||
outfile_name = re.sub(file_ext,'md',infile_name)
|
||||
outfile_name = re.sub(file_ext,out_ext,infile_name)
|
||||
else:
|
||||
infile_name = name+'.'+file_ext
|
||||
outfile_name = name+'.md'
|
||||
outfile_name = name+'.'+out_ext
|
||||
|
||||
|
||||
# assemble input/output file paths
|
||||
fullpath_input = os.path.join(temp_dir,infile_name)
|
||||
fullpath_output = os.path.join(temp_dir,outfile_name)
|
||||
|
||||
# Use requests.get to download url to file
|
||||
r = requests.get(file_url, allow_redirects=True)
|
||||
fullpath_input = os.path.join(temp_dir,infile_name)
|
||||
with open(fullpath_input, 'wb') as f:
|
||||
f.write(r.content)
|
||||
|
||||
|
||||
# Try to convert docx file to markdown
|
||||
fullpath_output = os.path.join(temp_dir,outfile_name)
|
||||
# Try to convert docx file to plain text
|
||||
try:
|
||||
output = pypandoc.convert_file(fullpath_input,'gfm',format='docx',outputfile=fullpath_output)
|
||||
output = pypandoc.convert_file(fullpath_input,
|
||||
pandoc_fmt,
|
||||
format='docx',
|
||||
outputfile=fullpath_output
|
||||
)
|
||||
assert output == ""
|
||||
except RuntimeError:
|
||||
print("XXXXXX Failed to index document %s"%(item['name']))
|
||||
@@ -383,11 +390,11 @@ class Search:
|
||||
# No matter what happens, clean up.
|
||||
print("Cleaning up %s"%item['name'])
|
||||
|
||||
#subprocess.call(['rm','-fr',fullpath_output])
|
||||
print(" ".join(['rm','-fr',fullpath_output]))
|
||||
subprocess.call(['rm','-fr',fullpath_output])
|
||||
#print(" ".join(['rm','-fr',fullpath_output]))
|
||||
|
||||
#subprocess.call(['rm','-fr',fullpath_input])
|
||||
print(" ".join(['rm','-fr',fullpath_input]))
|
||||
subprocess.call(['rm','-fr',fullpath_input])
|
||||
#print(" ".join(['rm','-fr',fullpath_input]))
|
||||
|
||||
|
||||
mimetype = re.split('[/\.]', item['mimeType'])[-1]
|
||||
@@ -403,7 +410,6 @@ class Search:
|
||||
)
|
||||
|
||||
|
||||
|
||||
def create_search_result(self, results):
|
||||
# Allow larger fragments
|
||||
results.fragmenter.maxchars = 300
|
||||
|
@@ -3,6 +3,8 @@ apiclient>=1.0.3
|
||||
oauth2client>=3.0.0
|
||||
httplib2>=0.10.3
|
||||
google-api-python-client
|
||||
mistune>=0.8.3
|
||||
mistune>=0.8
|
||||
whoosh>=2.7.4
|
||||
pypandoc>=1.4
|
||||
requests>=2.19
|
||||
pandoc>=1.0
|
||||
|
@@ -30,14 +30,11 @@
|
||||
{% for e in entries %}
|
||||
<tr>
|
||||
<td class="search-result">
|
||||
<!--
|
||||
<div class="path"><a href='{{ url_for("open_file")}}?path={{e.path|urlencode}}&query={{query}}&fields={{fields}}'>{{e.path}}</a>score: {{'%d' % e.score}}</div>
|
||||
-->
|
||||
<div class="url">
|
||||
<a href='{{e.url}}'>{{e.title}} ({{e.mimetype}})</a><br />
|
||||
score: {{'%d' % e.score}}
|
||||
</div>
|
||||
<div class="markdown-body">{{ e.content_highlight|safe}}</div>
|
||||
<div class="markdown-body">{{e.content_highlight|safe}}</div>
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
|
Reference in New Issue
Block a user