Compare commits

12 commits:

a40243c3b8
19f3053875
4a2910771d
2a8ab4b1e2
58c4ec4b32
2978efce63
b871d417a0
b5755c656b
1ff71ad459
948126a4dc
783cd967b0
2bbc1378c0
@@ -58,7 +58,11 @@ last schema thing to change:
 - list of fields needs to be updated
 - don't exactly understand that if block but okkkkk....
 
+## todo
 
+see [Todo.md](Todo.md)
 
+## creating apps
 
+[link to google apps docs](https://developers.google.com/api-client-library/python/samples/samples)
Todo.md (38 changed lines)
@@ -4,7 +4,7 @@ recap of round 1:
 - able to grab a google doc, add metadata, index that metadata with search
 - no content, which is the big next step
 
-## Round 2
+## v0.2 (done)
 
 add content:
 - create temp dir
@@ -14,4 +14,40 @@ add content:
 - ???
 - profit
 
+## v0.3 (done)
+
+~what is up with html formatting?~
+- markdown with html tables is all messed up
+- what's up with it? well, we have a bunch of shite word tables.
+- those are rendered as markdown files full of html.
+- the html is rendered directly by the page.
+- fixed by using pandoc to convert to plain text, not markdown.
+- docx -> text, not docx -> markdown
+
+## v0.4
+
+(later can add a step where we do convert to markdown, extract headers, etc.)
+
+indexing: hashing content
+
+delta/main index
+
+## Learnings for Centillion
+
+whoosh:
+- convert documents to text, not markdown
+- schema for different documents will present the biggest integration challenge
+- integration tests?
+- None values for fields that do not apply to a record?
+- conditional jinja templating?
+
+licensing:
+- need to improve readme
+- need to unpack the markdown functionality and replace it
+
+flask routes:
+- need to think through routes (separate heroku app, maintenance dashboard,
+  diff/main index)
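The v0.4 notes above sketch the hashing/delta-index idea without code. A minimal sketch of what that could look like, assuming a `previous_hashes` dict persisted between runs (the names here are hypothetical, not from this changeset):

```python
import hashlib

def content_digest(text):
    """Stable fingerprint of a document's extracted text."""
    return hashlib.sha256(text.encode('utf-8')).hexdigest()

def needs_reindex(item_id, text, previous_hashes):
    """Return True if a document is new or its content has changed.

    previous_hashes maps document id -> digest from the last run;
    persisting it (file, small table, etc.) is left to the caller.
    """
    digest = content_digest(text)
    if previous_hashes.get(item_id) == digest:
        return False  # unchanged: stays in the main index
    previous_hashes[item_id] = digest
    return True       # new or changed: candidate for the delta index
```

Documents that fail the check would be re-indexed into a small delta index that is periodically merged back into the main one.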
@@ -2,12 +2,15 @@ import shutil
 import html.parser
 
 from gdrive_util import GDrive
+from apiclient.http import MediaIoBaseDownload
 
 from markdown_parser import MarkdownParser
 import mistune
 from whoosh.fields import *
 import whoosh.index as index
-import os
+import os, re, io, requests
+import tempfile, subprocess
+import pypandoc
 import os.path
 import codecs
 from whoosh.qparser import MultifieldParser, QueryParser
@@ -149,6 +152,12 @@ class Search:
         - github org/user owning these repos
         - location of the whoosh config file for configuring the search engine
         """
+        # Steps to add all documents to index:
+        #
+        # Step 1: walk each doc in google drive.
+        # Step 2: index it.
+        # Step 3: grab a beer.
+
 
         if create_new_index:
             self.open_index(self.index_folder, create_new=True)
@@ -156,13 +165,6 @@ class Search:
         gd = GDrive()
         service = gd.get_service()
 
-        # Steps to add all documents to index:
-        #
-        # Step 1: walk each doc in google drive.
-        # Step 2: index it.
-        # Step 3: grab a beer.
-        #
-
         # -----
         # Set of all documents on Google Drive:
 
@@ -182,10 +184,15 @@ class Search:
 
         writer = self.ix.writer()
 
+        temp_dir = tempfile.mkdtemp(dir=os.getcwd())
+        print("Temporary directory: %s"%(temp_dir))
+        if not os.path.exists(temp_dir):
+            os.mkdir(temp_dir)
+
         count = 0
         for item in items:
 
-            self.add_item(writer, item, indexed_ids, config)
+            self.add_item(writer, item, indexed_ids, temp_dir, config)
             count += 1
 
         writer.commit()
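A side note on the scratch directory added here: `tempfile.mkdtemp()` already creates (and returns) a uniquely named directory, so the `os.path.exists`/`os.mkdir` guard after it can never trigger. A sketch of a tighter equivalent, assuming cleanup can move to a context manager (a suggestion, not what the commit does):

```python
import os
import tempfile

# mkdtemp() creates the directory before returning its path,
# so no exists()/mkdir() guard is needed afterwards.
temp_dir = tempfile.mkdtemp(dir=os.getcwd())

# Alternative: TemporaryDirectory removes itself on exit, which would
# also replace the manual `rm -fr` cleanup calls in add_item() below.
with tempfile.TemporaryDirectory(dir=os.getcwd()) as scratch:
    print("Working in %s" % scratch)
# `scratch` and its contents are gone here, even after an exception.
```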
@@ -206,12 +213,6 @@ class Search:
         - location of the whoosh config file for configuring the search engine
         """
 
-        if create_new_index:
-            self.open_index(self.index_folder, create_new=True)
-
-        gd = GDrive()
-        service = gd.get_service()
-
         # PoC||GTFO
 
         # Steps to rebuild all documents in index:
@@ -229,6 +230,13 @@ class Search:
         # if a document already exists in the index,
         # it gets removed and re-added.
 
+
+        if create_new_index:
+            self.open_index(self.index_folder, create_new=True)
+
+        gd = GDrive()
+        service = gd.get_service()
+
         # -----
         # Set of all documents on Google Drive:
 
@@ -248,17 +256,22 @@ class Search:
 
         writer = self.ix.writer()
 
+        temp_dir = tempfile.mkdtemp(dir=os.getcwd())
+        print("Temporary directory: %s"%(temp_dir))
+        if not os.path.exists(temp_dir):
+            os.mkdir(temp_dir)
+
         count = 0
         for item in items:
 
-            self.add_item(writer, item, indexed_ids, config)
+            self.add_item(writer, item, indexed_ids, temp_dir, config)
             count += 1
 
         writer.commit()
         print("Done, updated %d documents in the index" % count)
 
 
-    def add_item(self, writer, item, indexed_ids, config):
+    def add_item(self, writer, item, indexed_ids, temp_dir, config):
         """
         Add an item to the index.
         item is a google drive api document item.
@@ -266,18 +279,123 @@ class Search:
         """
         # If we have already indexed this document,
         # drop the old record first
 
         if item['id'] in indexed_ids:
             writer.delete_by_term('id',item['id'])
 
+        gd = GDrive()
+        service = gd.get_service()
+
         # IMPORTANT:
         # This is where the search documents are actually created.
 
-        # TODO:
-        # Major todo item:
-        # - download items using id and get endpoint: https://developers.google.com/drive/api/v3/reference/files/get
-        # - convert to markdown
-        # - add content to content field
+        ##########################################
+        # Two kinds of documents:
+        # - documents with text that can be extracted and indexed
+        # - every other kind
+        #
+        # In Google Drive land, that's (docx) and (everybody else).
+        #
+        # For each document living in the Google Drive folder,
+        # - If mimeType is document:
+        #     - Download it
+        #     - Convert it to markdown
+        #     - Extract and index the content
+        #     - Index everything else
+        # - Else:
+        #     - Just index everything else
+
+        mimetype = re.split('[/\.]',item['mimeType'])[-1]
+        mimemap = {
+            'document' : 'docx',
+        }
+
+        content = ""
+
+        if(mimetype not in mimemap.keys()):
+
+            # ----------
+            # Not a document
+            #
+            # No text to extract
+            #
+            # Technically, there probably is,
+            # but I'm not about to parse powerpoint
+            # or mystery PDF files in python.
+
+            print("Indexing document %s of type %s"%(item['name'], mimetype))
+
+        else:
+
+            # ----------
+            # docx Content Extraction:
+            #
+            # We can only do this with .docx files
+            # This is a file type we know how to convert
+            # Construct the URL and download it
+
+            print("Extracting content from %s of type %s"%(item['name'], mimetype))
+
+            # Create a URL and a destination filename
+            file_ext = mimemap[mimetype]
+            file_url = "https://docs.google.com/document/d/%s/export?format=%s"%(item['id'], file_ext)
+
+            # This re could probably be improved
+            name = re.sub('/','_',item['name'])
+
+            # Now make the pandoc input/output filenames
+            out_ext = 'txt'
+            pandoc_fmt = 'plain'
+            if name.endswith(file_ext):
+                infile_name = name
+                outfile_name = re.sub(file_ext,out_ext,infile_name)
+            else:
+                infile_name = name+'.'+file_ext
+                outfile_name = name+'.'+out_ext
+
+            # assemble input/output file paths
+            fullpath_input = os.path.join(temp_dir,infile_name)
+            fullpath_output = os.path.join(temp_dir,outfile_name)
+
+            # Use requests.get to download url to file
+            r = requests.get(file_url, allow_redirects=True)
+            with open(fullpath_input, 'wb') as f:
+                f.write(r.content)
+
+            # Try to convert docx file to plain text
+            try:
+                output = pypandoc.convert_file(fullpath_input,
+                                               pandoc_fmt,
+                                               format='docx',
+                                               outputfile=fullpath_output
+                )
+                assert output == ""
+            except RuntimeError:
+                print("XXXXXX Failed to index document %s"%(item['name']))
+
+            # If export was successful, read contents of markdown
+            # into the content variable.
+            if os.path.isfile(fullpath_output):
+                # Export was successful
+                with codecs.open(fullpath_output, encoding='utf-8') as f:
+                    content = f.read()
+
+            # No matter what happens, clean up.
+            print("Cleaning up %s"%item['name'])
+
+            subprocess.call(['rm','-fr',fullpath_output])
+            #print(" ".join(['rm','-fr',fullpath_output]))
+
+            subprocess.call(['rm','-fr',fullpath_input])
+            #print(" ".join(['rm','-fr',fullpath_input]))
 
         mimetype = re.split('[/\.]', item['mimeType'])[-1]
         writer.add_document(
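The docx branch above is the heart of this change: build a Google Docs export URL, download the file with `requests`, convert it with pandoc, and read the plain text back. The same pipeline condensed into a standalone helper, a sketch under the diff's own assumptions (a Drive `item` dict with `id`/`name`, pandoc installed, network access); the function name is mine:

```python
import os
import re
import codecs
import requests
import pypandoc

def extract_docx_text(item, temp_dir):
    """Download a Google Doc as .docx and return its plain-text content.

    item: Google Drive API file resource (needs 'id' and 'name').
    Returns "" if the download or the pandoc conversion fails.
    """
    # Google Docs export endpoint; format=docx requests a Word file.
    url = "https://docs.google.com/document/d/%s/export?format=docx" % item['id']
    safe_name = re.sub('/', '_', item['name'])
    docx_path = os.path.join(temp_dir, safe_name + '.docx')
    txt_path = os.path.join(temp_dir, safe_name + '.txt')

    r = requests.get(url, allow_redirects=True)
    with open(docx_path, 'wb') as f:
        f.write(r.content)

    content = ""
    try:
        # 'plain' output sidesteps the html-tables-in-markdown mess
        # described in Todo.md.
        pypandoc.convert_file(docx_path, 'plain', format='docx',
                              outputfile=txt_path)
        with codecs.open(txt_path, encoding='utf-8') as f:
            content = f.read()
    except RuntimeError:
        print("Failed to convert %s" % item['name'])
    finally:
        # Clean up both temp files no matter what happened.
        for path in (docx_path, txt_path):
            if os.path.exists(path):
                os.remove(path)
    return content
```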
@@ -287,12 +405,11 @@ class Search:
             timestamp = item['createdTime'],
             owner_email = item['owners'][0]['emailAddress'],
             owner_name = item['owners'][0]['displayName'],
-            title = item['name']
+            title = item['name'],
+            content = content
         )
 
 
     def create_search_result(self, results):
         # Allow larger fragments
         results.fragmenter.maxchars = 300
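Passing `content=content` to `add_document()` only works if the whoosh schema declares a matching field, and the highlighting code below also needs it stored. The schema itself is outside this diff, so the following is a guess at its shape, with field names copied from the `add_document()` call:

```python
from whoosh.fields import Schema, ID, TEXT

# Hypothetical reconstruction -- the real schema lives outside this diff.
# stored=True on content is what makes r['content'] and
# r.highlights('content') possible in create_search_result().
schema = Schema(
    id=ID(stored=True, unique=True),
    url=ID(stored=True),
    mimetype=TEXT(stored=True),
    timestamp=TEXT(stored=True),
    owner_email=TEXT(stored=True),
    owner_name=TEXT(stored=True),
    title=TEXT(stored=True),
    content=TEXT(stored=True),
)
```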
@@ -329,17 +446,16 @@ class Search:
             sr.owner_email = r['owner_email']
             sr.owner_name = r['owner_name']
 
-            #sr.content = r['content']
+            sr.content = r['content']
 
-            #highlights = r.highlights('content')
-            #if not highlights:
-            #    # just use the first 1,000 words of the document
-            #    highlights = self.cap(r['content'], 1000)
+            highlights = r.highlights('content')
+            if not highlights:
+                # just use the first 1,000 words of the document
+                highlights = self.cap(r['content'], 1000)
 
-            #highlights = self.html_parser.unescape(highlights)
-            #html = self.markdown(highlights)
-            #sr.content_highlight = html
-            sr.content_highlight = '<p>Hello world</p>'
+            highlights = self.html_parser.unescape(highlights)
+            html = self.markdown(highlights)
+            sr.content_highlight = html
 
             search_results.append(sr)
 
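With `content` stored, the previously commented-out highlight path can run end to end. A minimal usage sketch of the whoosh side (the index folder and query string below are made up for illustration):

```python
import whoosh.index as index
from whoosh.qparser import MultifieldParser

ix = index.open_dir("search_index")  # hypothetical index folder
with ix.searcher() as searcher:
    parser = MultifieldParser(["title", "content"], schema=ix.schema)
    results = searcher.search(parser.parse("pandoc"), limit=10)
    results.fragmenter.maxchars = 300  # "allow larger fragments", as above
    for r in results:
        # highlights() returns "" when the match was in another field,
        # hence the fallback to a capped slice of the stored content.
        snippet = r.highlights("content") or r["content"][:1000]
        print(r["title"], "->", snippet)
```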
@@ -3,5 +3,8 @@ apiclient>=1.0.3
 oauth2client>=3.0.0
 httplib2>=0.10.3
 google-api-python-client
-mistune>=0.8.3
+mistune>=0.8
 whoosh>=2.7.4
+pypandoc>=1.4
+requests>=2.19
+pandoc>=1.0
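One deployment note on these additions: `pypandoc` is a thin wrapper, so the pandoc binary must also exist on the host; the `pandoc>=1.0` pip package does not provide it. A quick sanity check at startup might look like this (a sketch, not part of the changeset):

```python
import pypandoc

try:
    # Raises OSError when no pandoc binary is on the PATH.
    print("pandoc version:", pypandoc.get_pandoc_version())
except OSError:
    print("pandoc binary not found; install it system-wide "
          "(e.g. apt-get install pandoc) before running the indexer.")
```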
@@ -30,14 +30,11 @@
 {% for e in entries %}
 <tr>
     <td class="search-result">
-        <!--
-        <div class="path"><a href='{{ url_for("open_file")}}?path={{e.path|urlencode}}&query={{query}}&fields={{fields}}'>{{e.path}}</a>score: {{'%d' % e.score}}</div>
-        -->
         <div class="url">
             <a href='{{e.url}}'>{{e.title}} ({{e.mimetype}})</a><br />
             score: {{'%d' % e.score}}
         </div>
-        <div class="markdown-body">{{ e.content_highlight|safe}}</div>
+        <div class="markdown-body">{{e.content_highlight|safe}}</div>
     </td>
 </tr>
 {% endfor %}