10 Commits
v0.1.1 ... v0.3

5 changed files with 197 additions and 41 deletions


@@ -58,7 +58,11 @@ last schema thing to change:
 - list of fields needs to be updated
 - don't exactly understand that if block but okkkkk....
+## todo
+see [Todo.md](Todo.md)
+## creating apps
+[link to google apps docs](https://developers.google.com/api-client-library/python/samples/samples)

Todo.md

@@ -4,7 +4,7 @@ recap of round 1:
 - able to grab a google doc, add metadata, index that metadata with search
 - no content, which is the big next step
-## Round 2
+## v0.2 (done)
 add content:
 - create temp dir
@@ -14,4 +14,40 @@ add content:
 - ???
 - profit
+
+## v0.3 (done)
+~~what is up with html formatting?~~
+- markdown with html tables is all messed up
+  - what's up with it? well, we have a bunch of shite word tables.
+  - those are rendered as markdown files full of html.
+  - the html is rendered directly by the page.
+- fixed by using pandoc to convert to plain text, not markdown.
+  - docx -> text, not docx -> markdown
+
+## v0.4
+(later can add a step where we do convert to markdown, extract headers, etc.)
+indexing: hashing content
+delta/main index
+
+## Learnings for Centillion
+whoosh:
+- convert documents to text, not markdown
+- schema for different documents will present the biggest integration challenge
+- integration tests?
+- None values for fields that do not apply to a record? (see the schema sketch below)
+- conditional jinja templating?
+
+licensing:
+- need to improve readme
+- need to unpack the markdown functionality and replace it
+
+flask routes:
+- need to think through routes (separate heroku app, maintenance dashboard,
+  diff/main index)
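On the "None values for fields that do not apply" question above: whoosh lets a document omit any schema field, so a single schema can cover heterogeneous record types without storing None placeholders. A minimal sketch, with illustrative field and directory names:

    import os
    from whoosh.fields import Schema, TEXT, ID
    from whoosh import index

    # One schema for every document type; a field that doesn't apply
    # to a record is simply left out when the document is added.
    schema = Schema(
        id=ID(stored=True, unique=True),
        title=TEXT(stored=True),
        content=TEXT(stored=True),  # stored, so highlights can be built from it
    )

    os.makedirs("index_dir", exist_ok=True)
    ix = index.create_in("index_dir", schema)
    writer = ix.writer()
    # A record with no extractable content: just leave the field out.
    writer.add_document(id=u"abc123", title=u"quarterly_deck.pptx")
    writer.commit()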


@@ -2,12 +2,15 @@ import shutil
 import html.parser
 from gdrive_util import GDrive
+from apiclient.http import MediaIoBaseDownload
 from markdown_parser import MarkdownParser
 import mistune
 from whoosh.fields import *
 import whoosh.index as index
-import os
+import os, re, io, requests
+import tempfile, subprocess
+import pypandoc
 import os.path
 import codecs
 from whoosh.qparser import MultifieldParser, QueryParser
@@ -149,6 +152,12 @@ class Search:
         - github org/user owning these repos
         - location of the whoosh config file for configuring the search engine
         """
+        # Steps to add all documents to index:
+        #
+        # Step 1: walk each doc in google drive.
+        # Step 2: index it.
+        # Step 3: grab a beer.

         if create_new_index:
             self.open_index(self.index_folder, create_new=True)
@@ -156,13 +165,6 @@ class Search:
         gd = GDrive()
         service = gd.get_service()

-        # Steps to add all documents to index:
-        #
-        # Step 1: walk each doc in google drive.
-        # Step 2: index it.
-        # Step 3: grab a beer.
-        #
         # -----
         # Set of all documents on Google Drive:
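The listing call that fills items isn't part of this hunk; for orientation, a minimal sketch of what it looks like with this client (the exact fields string is an assumption):

    results = service.files().list(
        pageSize=100,
        fields="nextPageToken, files(id, name, mimeType, createdTime, owners)"
    ).execute()
    items = results.get('files', [])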
@@ -182,10 +184,15 @@ class Search:
         writer = self.ix.writer()

+        temp_dir = tempfile.mkdtemp(dir=os.getcwd())
+        print("Temporary directory: %s"%(temp_dir))
+        if not os.path.exists(temp_dir):
+            os.mkdir(temp_dir)

         count = 0
         for item in items:
-            self.add_item(writer, item, indexed_ids, config)
+            self.add_item(writer, item, indexed_ids, temp_dir, config)
             count += 1

         writer.commit()
@@ -206,12 +213,6 @@ class Search:
         - location of the whoosh config file for configuring the search engine
         """
-        if create_new_index:
-            self.open_index(self.index_folder, create_new=True)
-
-        gd = GDrive()
-        service = gd.get_service()

         # PoC||GTFO
         # Steps to rebuild all documents in index:
@@ -229,6 +230,13 @@ class Search:
         # if a document already exists in the index,
         # it gets removed and re-added.

+        if create_new_index:
+            self.open_index(self.index_folder, create_new=True)
+
+        gd = GDrive()
+        service = gd.get_service()

         # -----
         # Set of all documents on Google Drive:
@@ -248,17 +256,22 @@ class Search:
         writer = self.ix.writer()

+        temp_dir = tempfile.mkdtemp(dir=os.getcwd())
+        print("Temporary directory: %s"%(temp_dir))
+        if not os.path.exists(temp_dir):
+            os.mkdir(temp_dir)

         count = 0
         for item in items:
-            self.add_item(writer, item, indexed_ids, config)
+            self.add_item(writer, item, indexed_ids, temp_dir, config)
             count += 1

         writer.commit()
         print("Done, updated %d documents in the index" % count)

-    def add_item(self, writer, item, indexed_ids, config):
+    def add_item(self, writer, item, indexed_ids, temp_dir, config):
         """
         Add an item to the index.
         item is a google drive api document item.
@@ -266,18 +279,123 @@ class Search:
""" """
# If we have already indexed this document, # If we have already indexed this document,
# drop the old record first # drop the old record first
if item['id'] in indexed_ids: if item['id'] in indexed_ids:
writer.delete_by_term('id',item['id']) writer.delete_by_term('id',item['id'])
gd = GDrive()
service = gd.get_service()
# IMPORTANT: # IMPORTANT:
# This is where the search documents are actually created. # This is where the search documents are actually created.
# TODO: ##########################################
# Major todo item: # Two kinds of documents:
# - download items using id and get endpoint: https://developers.google.com/drive/api/v3/reference/files/get # - documents with text that can be extracted and indexed
# - convert to markdown # - every other kind
# - add content to content field #
# In Google Drive land, that's (docx) and (everybody else).
#
# For each document living in the Google Drive folder,
# - If mimeType is document:
# - Download it
# - Convert it to markdown
# - Extract and index the content
# - Index everything else
# - Else:
# - Just index everything else
mimetype = re.split('[/\.]',item['mimeType'])[-1]
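+        # (mimeType is e.g. 'application/vnd.google-apps.document';
+        # the split above keeps the final token, 'document')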
+        mimemap = {
+            'document' : 'docx',
+        }
+
+        content = ""
+        if(mimetype not in mimemap.keys()):
+            # ----------
+            # Not a document
+            #
+            # No text to extract
+            #
+            # Technically, there probably is,
+            # but I'm not about to parse powerpoint
+            # or mystery PDF files in python.
+            print("Indexing document %s of type %s"%(item['name'], mimetype))
+
+        else:
+            # ----------
+            # docx Content Extraction:
+            #
+            # We can only do this with .docx files.
+            # This is a file type we know how to convert.
+            # Construct the URL and download it.
+            print("Extracting content from %s of type %s"%(item['name'], mimetype))
+
+            # Create a URL and a destination filename
+            file_ext = mimemap[mimetype]
+            file_url = "https://docs.google.com/document/d/%s/export?format=%s"%(item['id'], file_ext)
+
+            # This re could probably be improved
+            name = re.sub('/','_',item['name'])
+
+            # Now make the pandoc input/output filenames
+            out_ext = 'txt'
+            pandoc_fmt = 'plain'
+            if name.endswith(file_ext):
+                infile_name = name
+                outfile_name = re.sub(file_ext,out_ext,infile_name)
+            else:
+                infile_name = name+'.'+file_ext
+                outfile_name = name+'.'+out_ext
+
+            # Assemble input/output file paths
+            fullpath_input = os.path.join(temp_dir,infile_name)
+            fullpath_output = os.path.join(temp_dir,outfile_name)
+
+            # Use requests.get to download url to file
+            r = requests.get(file_url, allow_redirects=True)
+            with open(fullpath_input, 'wb') as f:
+                f.write(r.content)
+
+            # Try to convert docx file to plain text
+            try:
+                output = pypandoc.convert_file(fullpath_input,
+                                               pandoc_fmt,
+                                               format='docx',
+                                               outputfile=fullpath_output
+                )
+                assert output == ""
+            except RuntimeError:
+                print("XXXXXX Failed to index document %s"%(item['name']))
+
+            # If the export was successful, read the contents of the
+            # plain-text file into the content variable.
+            if os.path.isfile(fullpath_output):
+                # Export was successful
+                with codecs.open(fullpath_output, encoding='utf-8') as f:
+                    content = f.read()
+
+            # No matter what happens, clean up.
+            print("Cleaning up %s"%item['name'])
+            subprocess.call(['rm','-fr',fullpath_output])
+            #print(" ".join(['rm','-fr',fullpath_output]))
+            subprocess.call(['rm','-fr',fullpath_input])
+            #print(" ".join(['rm','-fr',fullpath_input]))

         mimetype = re.split('[/\.]', item['mimeType'])[-1]
         writer.add_document(
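Review note: shelling out to rm -fr to remove two regular files is Unix-only; the same cleanup with the standard library alone would be (same fullpath_* names as above):

    # Equivalent cleanup without subprocess; a missing output file
    # (pandoc failed before writing it) is fine.
    for path in (fullpath_input, fullpath_output):
        try:
            os.remove(path)
        except FileNotFoundError:
            pass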
@@ -287,12 +405,11 @@ class Search:
             timestamp = item['createdTime'],
             owner_email = item['owners'][0]['emailAddress'],
             owner_name = item['owners'][0]['displayName'],
-            title = item['name']
+            title = item['name'],
+            content = content
         )

     def create_search_result(self, results):
         # Allow larger fragments
         results.fragmenter.maxchars = 300
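With content now stored on each record, queries can hit document bodies as well as metadata. A usage sketch against an index like this one (the query text is illustrative; ix is an open whoosh index):

    from whoosh.qparser import MultifieldParser

    # Search titles and bodies together
    parser = MultifieldParser(["title", "content"], schema=ix.schema)
    query = parser.parse(u"pandoc tables")
    with ix.searcher() as searcher:
        for hit in searcher.search(query, limit=10):
            print(hit['title'], hit.score)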
@@ -329,17 +446,16 @@ class Search:
             sr.owner_email = r['owner_email']
             sr.owner_name = r['owner_name']

-            #sr.content = r['content']
+            sr.content = r['content']

-            #highlights = r.highlights('content')
-            #if not highlights:
-            #    # just use the first 1,000 words of the document
-            #    highlights = self.cap(r['content'], 1000)
+            highlights = r.highlights('content')
+            if not highlights:
+                # just use the first 1,000 words of the document
+                highlights = self.cap(r['content'], 1000)

-            #highlights = self.html_parser.unescape(highlights)
-            #html = self.markdown(highlights)
-            #sr.content_highlight = html
-            sr.content_highlight = '<p>Hello world</p>'
+            highlights = self.html_parser.unescape(highlights)
+            html = self.markdown(highlights)
+            sr.content_highlight = html

             search_results.append(sr)


@@ -3,5 +3,8 @@ apiclient>=1.0.3
 oauth2client>=3.0.0
 httplib2>=0.10.3
 google-api-python-client
-mistune>=0.8.3
+mistune>=0.8
 whoosh>=2.7.4
+pypandoc>=1.4
+requests>=2.19
+pandoc>=1.0
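Packaging caveat: pypandoc wraps a pandoc binary it expects to find at runtime, and the pandoc entry above is a separate pure-Python wrapper that does not ship the binary either. A sketch of a runtime guard (assumes pypandoc's bundled downloader):

    import pypandoc

    try:
        # Raises OSError if no pandoc binary is on the PATH
        pypandoc.get_pandoc_version()
    except OSError:
        # Fetch a pandoc binary into a user-writable location
        pypandoc.download_pandoc()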


@@ -30,9 +30,6 @@
 {% for e in entries %}
     <tr>
         <td class="search-result">
-            <!--
-            <div class="path"><a href='{{ url_for("open_file")}}?path={{e.path|urlencode}}&query={{query}}&fields={{fields}}'>{{e.path}}</a>score: {{'%d' % e.score}}</div>
-            -->
             <div class="url">
                 <a href='{{e.url}}'>{{e.title}} ({{e.mimetype}})</a><br />
                 score: {{'%d' % e.score}}