14 Commits
v0.1 ... v0.3

6 changed files with 277 additions and 125 deletions


@@ -4,6 +4,16 @@ use whoosh to search documents in a google drive folder.
Implemented in **Python** using **Flask**, **Whoosh** and **Mistune**.
## virtualenv
```
virtualenv vp
source vp/bin/activate
pip install -r requirements.txt
```
## notes
in addition to the schema changes listed in issues-search:
@@ -48,7 +58,11 @@ last schema thing to change:
- list of fields needs to be updated
- don't exactly understand that if block yet, but okay for now
## todo
see [Todo.md](Todo.md)
## creating apps
[link to google apps docs](https://developers.google.com/api-client-library/python/samples/samples)
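For reference, a minimal sketch of the standard oauth2client flow for building the Drive service, along the lines of Google's Python quickstart (the file names are assumptions; this is presumably what `gdrive_util.GDrive` wraps):
```
from httplib2 import Http
from oauth2client import file, client, tools
from apiclient.discovery import build

SCOPES = 'https://www.googleapis.com/auth/drive.readonly'

# credentials.json caches the token; client_secret.json is downloaded
# from the app created in the google api console
store = file.Storage('credentials.json')
creds = store.get()
if not creds or creds.invalid:
    flow = client.flow_from_clientsecrets('client_secret.json', SCOPES)
    creds = tools.run_flow(flow, store)

service = build('drive', 'v3', http=creds.authorize(Http()))
```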

Todo.md (new file)

@@ -0,0 +1,53 @@
# TODO
recap of round 1:
- able to grab a google doc, add metadata, index that metadata with search
- no content, which is the big next step
## v0.2 (done)
add content:
- create temp dir
- download content using document id and the get api endpoint (see the sketch after this list)
- convert document to markdown using pandoc
- index the markdown
- ???
- profit
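A minimal sketch of the download step, using the export URL pattern that the indexing code constructs (document id and filename are placeholders):
```
import requests

doc_id = 'DOCUMENT_ID'
url = 'https://docs.google.com/document/d/%s/export?format=docx' % doc_id

# download the exported docx into the temp dir
r = requests.get(url, allow_redirects=True)
with open('doc.docx', 'wb') as f:
    f.write(r.content)
```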
## v0.3 (done)
~~what is up with html formatting?~~
- markdown with html tables is all messed up
- what's up with it? well, we have a bunch of shite word tables.
- those are rendered as markdown files full of html.
- the html is rendered directly by the page.
- fixed by using pandoc to convert to plain text, not markdown.
- docx -> text, not docx -> markdown (see the sketch below)
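A sketch of the fix with pypandoc (the input filename is a placeholder): converting to `plain` sidesteps the raw html that word tables turn into when converting to markdown.
```
import pypandoc

# docx -> plain text; docx -> markdown leaves word tables as raw html
text = pypandoc.convert_file('doc.docx', 'plain', format='docx')
```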
## v0.4
(later we can add a step where we do convert to markdown, extract headers, etc.)
- indexing: hash the content so unchanged documents can be skipped (sketch below)
- delta/main index
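A sketch of the hashing idea, assuming a hash of the extracted text is stored with each record so unchanged documents can be skipped on re-index (the stored field name is hypothetical):
```
import hashlib

def content_hash(text):
    # fingerprint the extracted text
    return hashlib.sha256(text.encode('utf-8')).hexdigest()

# during an incremental update (sketch):
# if content_hash(new_text) == stored_fields['content_hash']:
#     skip this document
```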
## Learnings for Centillion
whoosh:
- convert documents to text, not markdown
- schema for different documents will present the biggest integration challenge
- integration tests?
- None values for fields that do not apply to a record? (see the schema sketch below)
- conditional jinja templating?
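A sketch of a unified whoosh schema using the fields this index already stores; whoosh lets `add_document()` simply omit fields that don't apply to a record, which may be enough instead of explicit None values:
```
from whoosh.fields import Schema, ID, TEXT

schema = Schema(
    id=ID(stored=True, unique=True),
    url=ID(stored=True),
    mimetype=TEXT(stored=True),
    timestamp=TEXT(stored=True),
    owner_email=TEXT(stored=True),
    owner_name=TEXT(stored=True),
    title=TEXT(stored=True),
    content=TEXT(stored=True),
)
```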
licensing:
- need to improve readme
- need to unpack the markdown functionality and replace it
flask routes:
- need to think through routes (separate heroku app, maintenance dashboard, diff/main index)


@@ -30,14 +30,14 @@ class UpdateIndexTask(object):
thread.daemon = True
thread.start()
def run(self):
search = Search(app.config["INDEX_DIR"])
credentials_file = 'credentials.json'
collection = 'charlesreid1dib test collection'
search.update_index_incremental(credentials_file,
                                app.config,
                                create_new_index=self.rebuild_index)
app = Flask(__name__)
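# Hypothetical usage sketch (not part of this diff): a route could kick off
# a background reindex; the thread is started in the constructor, so the
# request returns immediately.
#
# @app.route('/update_index')
# def update_index():
#     UpdateIndexTask()
#     return 'reindex started'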


@@ -2,12 +2,15 @@ import shutil
import html.parser
from gdrive_util import GDrive
from apiclient.http import MediaIoBaseDownload
from markdown_parser import MarkdownParser
import mistune
from whoosh.fields import *
import whoosh.index as index
import os, re, io, requests
import tempfile, subprocess
import pypandoc
import os.path
import codecs
from whoosh.qparser import MultifieldParser, QueryParser
@@ -137,7 +140,7 @@ class Search:
self.ix = index.open_dir(index_folder)
def add_all_documents(self, credentials_file, config, create_new_index=False):
"""
Add all documents in a Google Drive folder to the search index.
@@ -149,6 +152,12 @@ class Search:
- credentials file for the google drive api
- location of the whoosh config file for configuring the search engine
"""
# Steps to add all documents to index:
#
# Step 1: walk each doc in google drive.
# Step 2: index it.
# Step 3: grab a beer.
if create_new_index:
self.open_index(self.index_folder, create_new=True)
@@ -156,13 +165,6 @@ class Search:
gd = GDrive()
service = gd.get_service()
# -----
# Set of all documents on Google Drive:
@@ -173,50 +175,34 @@ class Search:
items = results.get('files', [])
# To use the next token,
# just say results['nextPageToken']
# otherwise use items for the files
#
indexed_ids = set()
for item in items:
indexed_ids.add(item['id'])
# TODO:
# Tapping out at 100, use nextPageToken to get all later
for item in items:
drive_ids.add(item['id'])
writer = self.ix.writer()
temp_dir = tempfile.mkdtemp(dir=os.getcwd())
print("Temporary directory: %s"%(temp_dir))
if not os.path.exists(temp_dir):
os.mkdir(temp_dir)
count = 0
for item in items:
self.add_item(writer, item, indexed_ids, temp_dir, config)
count += 1
writer.commit()
print("Done, created %d documents in the index" % count)
def update_index_incremental(self,
                             credentials_file,
                             config,
                             create_new_index=False):
"""
Update the index of documents in a given Google Drive folder.
@@ -227,101 +213,201 @@ class Search:
- location of the whoosh config file for configuring the search engine
"""
# PoC||GTFO
# Steps to rebuild all documents in index:
#
# Step 1: walk each doc in google drive.
# Step 2: index it.
# Step 2.5: deal with documents removed from google drive.
# Step 3: grab a beer.
# TODO:
# Can make Step 2/2.5 shorter by storing hash of contents.
# for now, just... uh... i dunno.
# figure it out later. don't remove.
# update works exactly like add:
# if a document already exists in the index,
# it gets removed and re-added.
if create_new_index:
self.open_index(self.index_folder, create_new=True)
gd = GDrive()
service = gd.get_service()
# -----
# Set of all indexed documents:
indexed_ids = set()
with self.ix.searcher() as searcher:
writer = self.ix.writer()
# Loop over the stored fields in the index
# (i.e., each record)
for fields in searcher.all_stored_fields():
indexed_id = fields['id']
indexed_ids.add(indexed_id)
# -----
# Set of all documents on Google Drive:
drive_ids = set()
# Call the Drive v3 API
## short record
#results = service.files().list(
# pageSize=100, fields="nextPageToken, files(id, name)").execute()
# long record
results = service.files().list(
pageSize=100, fields="nextPageToken, files(id, kind, mimeType, name, owners, webViewLink, createdTime)").execute()
items = results.get('files', [])
# To use the next token,
# just say results['nextPageToken']
# otherwise use items for the files
#
# TODO:
# Tapping out at 100, use nextPageToken to get all later
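# Pagination sketch (hypothetical, not yet implemented here): keep calling
# files().list() with pageToken until nextPageToken disappears.
#
# while 'nextPageToken' in results:
#     results = service.files().list(
#         pageSize=100,
#         fields="nextPageToken, files(id, kind, mimeType, name, owners, webViewLink, createdTime)",
#         pageToken=results['nextPageToken']).execute()
#     items += results.get('files', [])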
for item in items:
drive_ids.add(item['id'])
# Remove documents in the index that are not in this drive
for indexed_id in indexed_ids:
if indexed_id not in drive_ids:
writer.delete_by_term('id',indexed_id)
temp_dir = tempfile.mkdtemp(dir=os.getcwd())
print("Temporary directory: %s"%(temp_dir))
if not os.path.exists(temp_dir):
os.mkdir(temp_dir)
count = 0
for item in items:
self.add_item(writer, item, indexed_ids, temp_dir, config)
count += 1
writer.commit()
print("Done, updated %d documents in the index" % count)
def add_item(self, writer, item, indexed_ids, temp_dir, config):
"""
Add an item to the index.
item is a google drive api document item.
works like a dictionary.
"""
# If we have already indexed this document,
# drop the old record first
if item['id'] in indexed_ids:
writer.delete_by_term('id',item['id'])
gd = GDrive()
service = gd.get_service()
# IMPORTANT:
# This is where the search documents are actually created.
##########################################
# Two kinds of documents:
# - documents with text that can be extracted and indexed
# - every other kind
#
# In Google Drive land, that's (docx) and (everybody else).
#
# For each document living in the Google Drive folder,
# - If mimeType is document:
# - Download it
# - Convert it to markdown
# - Extract and index the content
# - Index everything else
# - Else:
# - Just index everything else
mimetype = re.split('[/\.]',item['mimeType'])[-1]
mimemap = {
'document' : 'docx',
}
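# Google Docs carry mimeType 'application/vnd.google-apps.document',
# so the split above yields 'document'; those are exported as docx.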
content = ""
if mimetype not in mimemap:
# ----------
# Not a document
#
# No text to extract
#
# Technically, there probably is,
# but I'm not about to parse powerpoint
# or mystery PDF files in python.
print("Indexing document %s of type %s"%(item['name'], mimetype))
else:
# ----------
# docx Content Extraction:
#
# We can only do this with .docx files
# This is a file type we know how to convert
# Construct the URL and download it
print("Extracting content from %s of type %s"%(item['name'], mimetype))
# Create a URL and a destination filename
file_ext = mimemap[mimetype]
file_url = "https://docs.google.com/document/d/%s/export?format=%s"%(item['id'], file_ext)
# This regex could probably be improved
name = re.sub('/','_',item['name'])
# Now make the pandoc input/output filenames
out_ext = 'txt'
pandoc_fmt = 'plain'
if name.endswith(file_ext):
infile_name = name
outfile_name = re.sub(file_ext,out_ext,infile_name)
else:
infile_name = name+'.'+file_ext
outfile_name = name+'.'+out_ext
# assemble input/output file paths
fullpath_input = os.path.join(temp_dir,infile_name)
fullpath_output = os.path.join(temp_dir,outfile_name)
# Use requests.get to download url to file
r = requests.get(file_url, allow_redirects=True)
with open(fullpath_input, 'wb') as f:
f.write(r.content)
# Try to convert docx file to plain text
try:
output = pypandoc.convert_file(fullpath_input,
pandoc_fmt,
format='docx',
outputfile=fullpath_output
)
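# pypandoc returns an empty string when an outputfile is given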
assert output == ""
except RuntimeError:
print("XXXXXX Failed to index document %s"%(item['name']))
# If export was successful, read the plain text contents
# into the content variable.
if os.path.isfile(fullpath_output):
# Export was successful
with codecs.open(fullpath_output, encoding='utf-8') as f:
content = f.read()
# No matter what happens, clean up.
print("Cleaning up %s"%item['name'])
subprocess.call(['rm','-fr',fullpath_output])
#print(" ".join(['rm','-fr',fullpath_output]))
subprocess.call(['rm','-fr',fullpath_input])
#print(" ".join(['rm','-fr',fullpath_input]))
writer.add_document(
    id = item['id'],
    url = item['webViewLink'],
    mimetype = mimetype,
    timestamp = item['createdTime'],
    owner_email = item['owners'][0]['emailAddress'],
    owner_name = item['owners'][0]['displayName'],
    title = item['name'],
    content = content
)
def create_search_result(self, results):
@@ -360,17 +446,16 @@ class Search:
sr.owner_email = r['owner_email']
sr.owner_name = r['owner_name']
sr.content = r['content']
highlights = r.highlights('content')
if not highlights:
    # just use the first 1,000 words of the document
    highlights = self.cap(r['content'], 1000)
highlights = self.html_parser.unescape(highlights)
html = self.markdown(highlights)
sr.content_highlight = html
search_results.append(sr)


@@ -3,5 +3,8 @@ apiclient>=1.0.3
oauth2client>=3.0.0
httplib2>=0.10.3
google-api-python-client
mistune>=0.8
whoosh>=2.7.4
pypandoc>=1.4
requests>=2.19
pandoc>=1.0


@@ -30,14 +30,11 @@
{% for e in entries %}
<tr>
<td class="search-result">
<div class="url">
<a href='{{e.url}}'>{{e.title}} ({{e.mimetype}})</a><br />
score: {{'%d' % e.score}}
</div>
<div class="markdown-body">{{e.content_highlight|safe}}</div>
</td>
</tr>
{% endfor %}