16 Commits
v0.1 ... master

SHA1 Message Date
a40243c3b8 Merge branch 'master' of https://git.charlesreid1.com/charlesreid1/cheeseburger-search into pandoc 2018-07-29 17:16:06 -07:00
19f3053875 mark v0.3 as in the bag 2018-07-29 17:14:50 -07:00
4a2910771d Merge branch 'pandoc' of charlesreid1/cheeseburger-search into master 2018-07-30 00:14:30 +00:00
2a8ab4b1e2 update todo 2018-07-29 17:14:17 -07:00
58c4ec4b32 add pandoc mistune requests to requirements.txt 2018-07-29 17:12:27 -07:00
2978efce63 minor cleanup of search template 2018-07-29 17:11:41 -07:00
b871d417a0 actually clean up 2018-07-29 17:11:02 -07:00
b5755c656b update readme and todo 2018-07-29 17:10:44 -07:00
1ff71ad459 clean up cheeseburger search function, and add documentation! 2018-07-29 14:08:11 -07:00
948126a4dc success - using requests to download the files 2018-07-29 13:01:40 -07:00
783cd967b0 add failed attempts to download files from google drive. URGH 2018-07-29 02:28:57 -07:00
2bbc1378c0 add pypandoc to requirements 2018-07-29 01:27:20 -07:00
33a2d5d2fc add better id checking in cheeseburger_search 2018-07-29 01:26:27 -07:00
ad50e85e38 fix mysterious bug in app 2018-07-29 01:25:49 -07:00
1dfe9adaab add virtualenv instructions to readme 2018-07-29 00:55:45 -07:00
5d41699aa6 update todo 2018-07-29 00:54:22 -07:00
6 changed files with 277 additions and 125 deletions

README.md

@@ -4,6 +4,16 @@ use whoosh to search documents in a google drive folder.
 
 Implemented in **Python** using **Flask**, **Whoosh** and **Mistune**.
 
+## virtualenv
+
+```
+virtualenv vp
+source vp/bin/activate
+pip install -r requirements.txt
+```
+
 ## notes
 
 in addition to the schema changes listed in issues-search:
@@ -48,7 +58,11 @@ last schema thing to change:
 - list of fields needs to be updated
 - don't exactly understand that if block but okkkkk....
 
+## todo
+
+see [Todo.md](Todo.md)
+
 ## creating apps
 
 [link to google apps docs](https://developers.google.com/api-client-library/python/samples/samples)

Todo.md (new file, 53 lines)

@@ -0,0 +1,53 @@
# TODO

recap of round 1:
- able to grab a google doc, add metadata, index that metadata with search
- no content, which is the big next step

## v0.2 (done)

add content (pipeline sketched below):
- create temp dir
- download content using the document id and the `get` API endpoint
- convert document to markdown using pandoc
- index the markdown
- ???
- profit
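
A minimal sketch of this v0.2 pipeline, assuming the same Docs export URL the search module constructs below; `fetch_and_convert` and the placeholder ids are illustrative, not repo code:

```
import os, tempfile
import requests, pypandoc

def fetch_and_convert(doc_id, name, temp_dir):
    # download the google doc as .docx via the export endpoint
    url = "https://docs.google.com/document/d/%s/export?format=docx"%(doc_id)
    docx_path = os.path.join(temp_dir, name + ".docx")
    r = requests.get(url, allow_redirects=True)
    with open(docx_path, 'wb') as f:
        f.write(r.content)
    # convert docx -> markdown with pandoc, then read it back for indexing
    md_path = os.path.join(temp_dir, name + ".md")
    pypandoc.convert_file(docx_path, 'markdown', format='docx', outputfile=md_path)
    with open(md_path, encoding='utf-8') as f:
        return f.read()

temp_dir = tempfile.mkdtemp()                               # create temp dir
content = fetch_and_convert("<doc id>", "mydoc", temp_dir)  # ready to index
```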

## v0.3 (done)

~~what is up with html formatting?~~
- markdown with html tables is all messed up
- what's up with it? well, we have a bunch of shite word tables.
- those are rendered as markdown files full of html.
- the html is rendered directly by the page.
- fixed by using pandoc to convert to plain text, not markdown (example below):
  docx -> text, not docx -> markdown
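
The fix, as a one-call hedged example (the file name is made up): pandoc's `plain` writer drops the html table markup that the markdown writer keeps.

```
import pypandoc

# docx -> text, not docx -> markdown
text = pypandoc.convert_file('report.docx', 'plain', format='docx')
```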

## v0.4

(later we can add a step where we do convert to markdown, extract headers, etc.)

indexing:
- hash content (sketch below)
- delta index / main index
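
One way the hashing idea could look, as a sketch (the `content_hash` field is hypothetical, not in the current schema):

```
import hashlib

def content_hash(content):
    # stable fingerprint of a document's extracted text
    return hashlib.sha256(content.encode('utf-8')).hexdigest()

# index time:  store content_hash(content) alongside each record
# update time: if the stored hash matches the fresh one, skip the re-index
#              and only touch the delta index
```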

## Learnings for Centillion

whoosh:
- convert documents to text, not markdown
- schema for different documents will present the biggest integration challenge
- integration tests?
- None values for fields that do not apply to a record? (see the schema sketch below)
- conditional jinja templating?

licensing:
- need to improve readme
- need to unpack the markdown functionality and replace it

flask routes:
- need to think through routes (separate heroku app, maintenance dashboard,
  diff/main index)
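
On the None-values question: whoosh already lets a record omit fields that don't apply, so one schema can cover heterogeneous documents. A sketch with illustrative field names:

```
from whoosh.fields import Schema, ID, TEXT

schema = Schema(id=ID(stored=True, unique=True),
                title=TEXT(stored=True),
                content=TEXT(stored=True),
                issue_url=ID(stored=True))  # only set on github issue records

# writer.add_document(id=u'abc', title=u'a drive doc', content=u'...')
# (no issue_url -- whoosh just leaves the field empty for this record)
```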

Flask app module

@@ -30,14 +30,14 @@ class UpdateIndexTask(object):
         thread.daemon = True
         thread.start()
 
-    def run(self, rebuild_index=False):
+    def run(self):
         search = Search(app.config["INDEX_DIR"])
         credentials_file = 'credentials.json'
         collection = 'charlesreid1dib test collection'
         search.update_index_incremental(credentials_file,
-                                        collection,
                                         app.config,
                                         create_new_index=self.rebuild_index)
 
 app = Flask(__name__)
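
For context, the class this hunk edits kicks off `run()` on a daemon thread so a Flask route can trigger a re-index without blocking the request; a sketch reconstructed from the lines above (treat it as approximate, not the file's exact contents):

```
import threading

class UpdateIndexTask(object):
    def __init__(self, rebuild_index=False):
        self.rebuild_index = rebuild_index
        thread = threading.Thread(target=self.run, args=())
        thread.daemon = True   # don't keep the process alive just for indexing
        thread.start()
```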

cheeseburger_search.py

@@ -2,12 +2,15 @@ import shutil
 import html.parser
 from gdrive_util import GDrive
+from apiclient.http import MediaIoBaseDownload
 from markdown_parser import MarkdownParser
 import mistune
 from whoosh.fields import *
 import whoosh.index as index
-import os
+import os, re, io, requests
+import tempfile, subprocess
+import pypandoc
 import os.path
 import codecs
 from whoosh.qparser import MultifieldParser, QueryParser
@@ -137,7 +140,7 @@ class Search:
         self.ix = index.open_dir(index_folder)
 
-    def add_all_documents(self, credentials_file, collection, config, create_new_index=False):
+    def add_all_documents(self, credentials_file, config, create_new_index=False):
         """
         Add all issues in a given github repo to the search index.
@@ -149,6 +152,12 @@ class Search:
         - github org/user owning these repos
         - location of the whoosh config file for configuring the search engine
         """
+        # Steps to add all documents to index:
+        #
+        # Step 1: walk each doc in google drive.
+        # Step 2: index it.
+        # Step 3: grab a beer.
+
         if create_new_index:
             self.open_index(self.index_folder, create_new=True)
@@ -156,13 +165,6 @@ class Search:
         gd = GDrive()
         service = gd.get_service()
 
-        # Iindex each document:
-        #
-        # Step 1: walk each doc in google drive.
-        # Step 2: index it.
-        # Step 3: grab a beer.
-        #
-
         # -----
         # Set of all documents on Google Drive:
@@ -173,50 +175,34 @@ class Search:
         items = results.get('files', [])
 
-        # To use the next token,
-        # just say results['nextPageToken']
-        # otherwise use items for the files
-        #
+        indexed_ids = set()
+        for item in items:
+            indexed_ids.add(item['id'])
+
         # TODO:
         # Tapping out at 100, use nextPageToken to get all later
 
-        for item in items:
-            drive_ids.add(item['id'])
+        writer = self.ix.writer()
+
+        temp_dir = tempfile.mkdtemp(dir=os.getcwd())
+        print("Temporary directory: %s"%(temp_dir))
+        if not os.path.exists(temp_dir):
+            os.mkdir(temp_dir)
 
         count = 0
         for item in items:
-            # If we have already indexed this document,
-            # drop the old record first
-            if item['id'] in indexed_ids:
-                writer.delete_by_term('id',item['id'])
-
-            # IMPORTANT:
-            # This is where the search documents are actually created.
-            mimetype = re.split('[/\.]', item['mimeType'])[-1]
-            writer.add_document(
-                    id = item['id'],
-                    url = item['webViewLink'],
-                    mimetype = mimetype,
-                    timestamp = item['createdTime'],
-                    owner_email = item['owners'][0]['emailAddress'],
-                    owner_name = item['owners'][0]['displayName'],
-                    title = item['name']
-            )
+            self.add_item(writer, item, indexed_ids, temp_dir, config)
             count += 1
 
-        # TODO:
-        # Major todo item:
-        # - download items using id and get endpoint: https://developers.google.com/drive/api/v3/reference/files/get
-        # - convert to markdown
-        # - add content to content field
-
         writer.commit()
         print("Done, created %d documents in the index" % count)
 
-    def update_index_incremental(self, credentials_file, collection, config, create_new_index=False):
+    def update_index_incremental(self,
+                                 credentials_file,
+                                 config,
+                                 create_new_index=False):
         """
         Update the index of issues of a given github repo.
@@ -227,101 +213,201 @@ class Search:
         - location of the whoosh config file for configuring the search engine
         """
+        # PoC||GTFO
+        # Steps to rebuild all documents in index:
+        #
+        # Step 1: walk each doc in google drive.
+        # Step 2: index it.
+        # Step 2.5: deal with documents removed from google drive.
+        # Step 3: grab a beer.
+        #
+        # TODO:
+        # Can make Step 2/2.5 shorter by storing hash of contents.
+        # for now, just... uh... i dunno.
+        # figure it out later. don't remove.
+        #
+        # update works exactly like add:
+        # if a document already exists in the index,
+        # it gets removed and re-added.
+
         if create_new_index:
             self.open_index(self.index_folder, create_new=True)
 
         gd = GDrive()
         service = gd.get_service()
 
-        # PoC||GTFO
-        # Re-index each document:
-        #
-        # Step 1: build list of indexed documents
-        # Step 2: build list of documents on google drive
-        # Step 3: if indexed documents not on google drive, delete
-        # Step 4: if indexed documents on google drive, delete and reindex
-        # Step 5: if non-indexed documents on google drive, index
-        #
-        # TODO:
-        # Can make Step 4 shorter by storing hash of contents.
-
-        # -----
-        # Set of all indexed documents:
-        indexed_ids = set()
-        with self.ix.searcher() as searcher:
-            writer = self.ix.writer()
-            # Loop over the stored fields in the index
-            # (i.e., each record)
-            for fields in searcher.all_stored_fields():
-                indexed_id = fields['id']
-                indexed_ids.add(indexed_id)
-
         # -----
         # Set of all documents on Google Drive:
-        drive_ids = set()
 
         # Call the Drive v3 API
-        ## short record
-        #results = service.files().list(
-        #        pageSize=100, fields="nextPageToken, files(id, name)").execute()
-
-        # long record
         results = service.files().list(
                 pageSize=100, fields="nextPageToken, files(id, kind, mimeType, name, owners, webViewLink, createdTime)").execute()
         items = results.get('files', [])
 
-        # To use the next token,
-        # just say results['nextPageToken']
-        # otherwise use items for the files
-        #
+        indexed_ids = set()
+        for item in items:
+            indexed_ids.add(item['id'])
+
         # TODO:
         # Tapping out at 100, use nextPageToken to get all later
 
-        for item in items:
-            drive_ids.add(item['id'])
-
-        # Remove documents in the index that are not in this drive
-        for indexed_id in indexed_ids:
-            if indexed_id not in drive_ids:
-                writer.delete_by_term('id',indexed_id)
+        writer = self.ix.writer()
+
+        temp_dir = tempfile.mkdtemp(dir=os.getcwd())
+        print("Temporary directory: %s"%(temp_dir))
+        if not os.path.exists(temp_dir):
+            os.mkdir(temp_dir)
 
         count = 0
         for item in items:
-            # If we have already indexed this document,
-            # drop the old record first
-            if item['id'] in indexed_ids:
-                writer.delete_by_term('id',item['id'])
-
-            # IMPORTANT:
-            # This is where the search documents are actually created.
-            mimetype = re.split('[/\.]', item['mimeType'])[-1]
-            writer.add_document(
-                    id = item['id'],
-                    url = item['webViewLink'],
-                    timestamp = item['createdTime'],
-                    mimetype = mimetype,
-                    owner_email = item['owners'][0]['emailAddress'],
-                    owner_name = item['owners'][0]['displayName'],
-                    title = item['name']
-            )
-            count += 1
-
-        # TODO:
-        # Major todo item:
-        # - download items using id and get endpoint: https://developers.google.com/drive/api/v3/reference/files/get
-        # - convert to markdown
-        # - add content to content field
-
-        writer.commit()
-        print("Done, updated %d documents in the index" % count)
+            self.add_item(writer, item, indexed_ids, temp_dir, config)
+            count += 1
+
+        writer.commit()
+        print("Done, updated %d documents in the index" % count)
+
+    def add_item(self, writer, item, indexed_ids, temp_dir, config):
+        """
+        Add an item to the index.
+        item is a google drive api document item.
+        works like a dictionary.
+        """
+        # If we have already indexed this document,
+        # drop the old record first
+        if item['id'] in indexed_ids:
+            writer.delete_by_term('id',item['id'])
+
+        gd = GDrive()
+        service = gd.get_service()
+
+        # IMPORTANT:
+        # This is where the search documents are actually created.
+
+        ##########################################
+        # Two kinds of documents:
+        # - documents with text that can be extracted and indexed
+        # - every other kind
+        #
+        # In Google Drive land, that's (docx) and (everybody else).
+        #
+        # For each document living in the Google Drive folder,
+        # - If mimeType is document:
+        #     - Download it
+        #     - Convert it to markdown
+        #     - Extract and index the content
+        #     - Index everything else
+        # - Else:
+        #     - Just index everything else
+
+        mimetype = re.split('[/\.]',item['mimeType'])[-1]
+        mimemap = {
+                'document' : 'docx',
+        }
+
+        content = ""
+        if(mimetype not in mimemap.keys()):
+            # ----------
+            # Not a document
+            #
+            # No text to extract
+            #
+            # Technically, there probably is,
+            # but I'm not about to parse powerpoint
+            # or mystery PDF files in python.
+            print("Indexing document %s of type %s"%(item['name'], mimetype))
+
+        else:
+            # ----------
+            # docx Content Extraction:
+            #
+            # We can only do this with .docx files.
+            # This is a file type we know how to convert.
+            # Construct the URL and download it.
+            print("Extracting content from %s of type %s"%(item['name'], mimetype))
+
+            # Create a URL and a destination filename
+            file_ext = mimemap[mimetype]
+            file_url = "https://docs.google.com/document/d/%s/export?format=%s"%(item['id'], file_ext)
+
+            # This re could probably be improved
+            name = re.sub('/','_',item['name'])
+
+            # Now make the pandoc input/output filenames
+            out_ext = 'txt'
+            pandoc_fmt = 'plain'
+            if name.endswith(file_ext):
+                infile_name = name
+                outfile_name = re.sub(file_ext,out_ext,infile_name)
+            else:
+                infile_name = name+'.'+file_ext
+                outfile_name = name+'.'+out_ext
+
+            # Assemble input/output file paths
+            fullpath_input = os.path.join(temp_dir,infile_name)
+            fullpath_output = os.path.join(temp_dir,outfile_name)
+
+            # Use requests.get to download url to file
+            r = requests.get(file_url, allow_redirects=True)
+            with open(fullpath_input, 'wb') as f:
+                f.write(r.content)
+
+            # Try to convert docx file to plain text
+            try:
+                output = pypandoc.convert_file(fullpath_input,
+                                               pandoc_fmt,
+                                               format='docx',
+                                               outputfile=fullpath_output)
+                assert output == ""
+            except RuntimeError:
+                print("XXXXXX Failed to index document %s"%(item['name']))
+
+            # If export was successful, read contents of markdown
+            # into the content variable.
+            if os.path.isfile(fullpath_output):
+                # Export was successful
+                with codecs.open(fullpath_output, encoding='utf-8') as f:
+                    content = f.read()
+
+            # No matter what happens, clean up.
+            print("Cleaning up %s"%item['name'])
+            subprocess.call(['rm','-fr',fullpath_output])
+            #print(" ".join(['rm','-fr',fullpath_output]))
+            subprocess.call(['rm','-fr',fullpath_input])
+            #print(" ".join(['rm','-fr',fullpath_input]))
+
+        mimetype = re.split('[/\.]', item['mimeType'])[-1]
+        writer.add_document(
+                id = item['id'],
+                url = item['webViewLink'],
+                mimetype = mimetype,
+                timestamp = item['createdTime'],
+                owner_email = item['owners'][0]['emailAddress'],
+                owner_name = item['owners'][0]['displayName'],
+                title = item['name'],
+                content = content
+        )
@@ -360,17 +446,16 @@ def create_search_result(self, results):
             sr.owner_email = r['owner_email']
             sr.owner_name = r['owner_name']
 
-            #sr.content = r['content']
-            #highlights = r.highlights('content')
-            #if not highlights:
-            #    # just use the first 1,000 words of the document
-            #    highlights = self.cap(r['content'], 1000)
-            #highlights = self.html_parser.unescape(highlights)
-            #html = self.markdown(highlights)
-            #sr.content_highlight = html
-            sr.content_highlight = '<p>Hello world</p>'
+            sr.content = r['content']
+            highlights = r.highlights('content')
+            if not highlights:
+                # just use the first 1,000 words of the document
+                highlights = self.cap(r['content'], 1000)
+            highlights = self.html_parser.unescape(highlights)
+            html = self.markdown(highlights)
+            sr.content_highlight = html
 
             search_results.append(sr)
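
Note on the un-commented block: `r.highlights('content')` needs the `content` field stored in the index and returns an empty string when no query terms match, hence the cap-the-raw-content fallback. A minimal illustration, with `ix` and `query` assumed:

```
with ix.searcher() as searcher:
    for r in searcher.search(query):
        fragments = r.highlights('content')        # '' if nothing matched
        snippet = fragments if fragments else r['content'][:1000]
```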

requirements.txt

@@ -3,5 +3,8 @@ apiclient>=1.0.3
 oauth2client>=3.0.0
 httplib2>=0.10.3
 google-api-python-client
-mistune>=0.8.3
+mistune>=0.8
 whoosh>=2.7.4
+pypandoc>=1.4
+requests>=2.19
+pandoc>=1.0

search results template

@@ -30,14 +30,11 @@
 {% for e in entries %}
     <tr>
         <td class="search-result">
-            <!--
-            <div class="path"><a href='{{ url_for("open_file")}}?path={{e.path|urlencode}}&query={{query}}&fields={{fields}}'>{{e.path}}</a>score: {{'%d' % e.score}}</div>
-            -->
             <div class="url">
                 <a href='{{e.url}}'>{{e.title}} ({{e.mimetype}})</a><br />
                 score: {{'%d' % e.score}}
             </div>
-            <div class="markdown-body">{{ e.content_highlight|safe}}</div>
+            <div class="markdown-body">{{e.content_highlight|safe}}</div>
         </td>
     </tr>
 {% endfor %}