Compare commits
8 commits:

- 1ff71ad459
- 948126a4dc
- 783cd967b0
- 2bbc1378c0
- 33a2d5d2fc
- ad50e85e38
- 1dfe9adaab
- 5d41699aa6
Readme.md (13 changes)
````diff
@@ -4,6 +4,16 @@ use whoosh to search documents in a google drive folder.
 
 Implemented in **Python** using **Flask**, **Whoosh** and **Mistune**.
 
+## virtualenv
+
+```
+virtualenv vp
+source vp/bin/activate
+pip install -r requirements.txt
+```
+
+
 ## notes
 
 in addition to the schema changes listed in issues-search:
@@ -49,6 +59,7 @@ last schema thing to change:
 - don't exactly understand that if block but okkkkk....
 
 ## creating apps
 
+[link to google apps docs](https://developers.google.com/api-client-library/python/samples/samples)
````
Todo.md (new file, 17 lines)
```diff
@@ -0,0 +1,17 @@
+# TODO
+
+recap of round 1:
+- able to grab a google doc, add metadata, index that metadata with search
+- no content, which is the big next step
+
+## Round 2
+
+add content:
+- create temp dir
+- download content using document id and get api endpoint
+- convert document to markdown using pandoc
+- index the markdown
+- ???
+- profit
```
```diff
@@ -30,14 +30,14 @@ class UpdateIndexTask(object):
         thread.daemon = True
         thread.start()
 
-    def run(self, rebuild_index=False):
+    def run(self):
         search = Search(app.config["INDEX_DIR"])
 
         credentials_file = 'credentials.json'
         collection = 'charlesreid1dib test collection'
         search.update_index_incremental(credentials_file,
-                collection,
-                app.config, create_new_index=self.rebuild_index)
+                app.config,
+                create_new_index=self.rebuild_index)
 
 app = Flask(__name__)
```
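For context: the constructor above starts the daemon thread, so callers fire and forget. A minimal sketch of triggering it from the Flask app in the same hunk (the `/update_index` route name is hypothetical, not shown in this diff):

```python
@app.route('/update_index')
def update_index():
    # the constructor spawns the daemon thread, so this returns at once
    # while run() rebuilds the whoosh index in the background
    UpdateIndexTask()
    return "Updating index, watch the console for progress"
```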
```diff
@@ -2,12 +2,15 @@ import shutil
 import html.parser
 
 from gdrive_util import GDrive
 from apiclient.http import MediaIoBaseDownload
 
 from markdown_parser import MarkdownParser
 import mistune
 from whoosh.fields import *
 import whoosh.index as index
-import os
+import os, re, io, requests
+import tempfile, subprocess
+import pypandoc
 import os.path
 import codecs
 from whoosh.qparser import MultifieldParser, QueryParser
```
```diff
@@ -137,7 +140,7 @@ class Search:
         self.ix = index.open_dir(index_folder)
 
 
-    def add_all_documents(self, credentials_file, collection, config, create_new_index=False):
+    def add_all_documents(self, credentials_file, config, create_new_index=False):
         """
         Add all issues in a given github repo to the search index.
 
```
```diff
@@ -149,6 +152,12 @@ class Search:
         - github org/user owning these repos
         - location of the whoosh config file for configuring the search engine
         """
+        # Steps to add all documents to index:
+        #
+        # Step 1: walk each doc in google drive.
+        # Step 2: index it.
+        # Step 3: grab a beer.
+
         if create_new_index:
             self.open_index(self.index_folder, create_new=True)
```
```diff
@@ -156,13 +165,6 @@ class Search:
         gd = GDrive()
         service = gd.get_service()
 
-        # Iindex each document:
-        #
-        # Step 1: walk each doc in google drive.
-        # Step 2: index it.
-        # Step 3: grab a beer.
-        #
-
         # -----
         # Set of all documents on Google Drive:
```
```diff
@@ -173,50 +175,34 @@ class Search:
 
         items = results.get('files', [])
 
-        # To use the next token,
-        # just say results['nextPageToken']
-        # otherwise use items for the files
-        #
-        indexed_ids = set()
-        for item in items:
-            indexed_ids.add(item['id'])
-
-        # TODO:
-        # Tapping out at 100, use nextPageToken to get all later
-        for item in items:
-            drive_ids.add(item['id'])
-
         writer = self.ix.writer()
 
+        temp_dir = tempfile.mkdtemp(dir=os.getcwd())
+        print("Temporary directory: %s"%(temp_dir))
+        if not os.path.exists(temp_dir):
+            os.mkdir(temp_dir)
+
         count = 0
         for item in items:
 
-            # If we have already indexed this document,
-            # drop the old record first
-            if item['id'] in indexed_ids:
-                writer.delete_by_term('id',item['id'])
-
-            # IMPORTANT:
-            # This is where the search documents are actually created.
-
-            mimetype = re.split('[/\.]', item['mimeType'])[-1]
-            writer.add_document(
-                    id = item['id'],
-                    url = item['webViewLink'],
-                    mimetype = mimetype,
-                    timestamp = item['createdTime'],
-                    owner_email = item['owners'][0]['emailAddress'],
-                    owner_name = item['owners'][0]['displayName'],
-                    title = item['name']
-            )
+            self.add_item(writer, item, indexed_ids, temp_dir, config)
             count += 1
 
-        # TODO:
-        # Major todo item:
-        # - download items using id and get endpoint: https://developers.google.com/drive/api/v3/reference/files/get
-        # - convert to markdown
-        # - add content to content field
-
         writer.commit()
         print("Done, created %d documents in the index" % count)
 
 
-    def update_index_incremental(self, credentials_file, collection, config, create_new_index=False):
+    def update_index_incremental(self,
+                credentials_file,
+                config,
+                create_new_index=False):
         """
         Update the index of issues of a given github repo.
```
```diff
@@ -227,100 +213,194 @@ class Search:
         - location of the whoosh config file for configuring the search engine
         """
-        # PoC||GTFO
+        # Steps to rebuild all documents in index:
+        #
+        # Step 1: walk each doc in google drive.
+        # Step 2: index it.
+        # Step 2.5: deal with documents removed from google drive.
+        # Step 3: grab a beer.
+
+        # TODO:
+        # Can make Step 2/2.5 shorter by storing hash of contents.
+        # for now, just... uh... i dunno.
+        # figure it out later. don't remove.
+        # update works exactly like add:
+        # if a document already exists in the index,
+        # it gets removed and re-added.
 
         if create_new_index:
             self.open_index(self.index_folder, create_new=True)
 
         gd = GDrive()
         service = gd.get_service()
 
-        # PoC||GTFO
+        # Re-index each document:
+        #
+        # Step 1: build list of indexed documents
+        # Step 2: build list of documents on google drive
+        # Step 3: if indexed documents not on google drive, delete
+        # Step 4: if indexed documents on google drive, delete and reindex
+        # Step 5: if non-indexed documents on google drive, index
+        #
+        # TODO:
+        # Can make Step 4 shorter by storing hash of contents.
 
         # -----
         # Set of all indexed documents:
         indexed_ids = set()
 
         with self.ix.searcher() as searcher:
+            writer = self.ix.writer()
+
             # Loop over the stored fields in the index
             # (i.e., each record)
             for fields in searcher.all_stored_fields():
                 indexed_id = fields['id']
                 indexed_ids.add(indexed_id)
 
         # -----
         # Set of all documents on Google Drive:
         drive_ids = set()
 
         # Call the Drive v3 API
 
         ## short record
         #results = service.files().list(
         #        pageSize=100, fields="nextPageToken, files(id, name)").execute()
 
         # long record
         results = service.files().list(
                 pageSize=100, fields="nextPageToken, files(id, kind, mimeType, name, owners, webViewLink, createdTime)").execute()
 
         items = results.get('files', [])
 
-        # To use the next token,
-        # just say results['nextPageToken']
-        # otherwise use items for the files
-        #
-        indexed_ids = set()
-        for item in items:
-            indexed_ids.add(item['id'])
-
         # TODO:
         # Tapping out at 100, use nextPageToken to get all later
         for item in items:
             drive_ids.add(item['id'])
 
         # Remove documents in the index that are not in this drive
         for indexed_id in indexed_ids:
             if indexed_id not in drive_ids:
                 writer.delete_by_term('id',indexed_id)
-        writer = self.ix.writer()
 
+        temp_dir = tempfile.mkdtemp(dir=os.getcwd())
+        print("Temporary directory: %s"%(temp_dir))
+        if not os.path.exists(temp_dir):
+            os.mkdir(temp_dir)
 
         count = 0
 
         for item in items:
 
+            self.add_item(writer, item, indexed_ids, temp_dir, config)
             count += 1
 
         writer.commit()
         print("Done, updated %d documents in the index" % count)
 
 
+    def add_item(self, writer, item, indexed_ids, temp_dir, config):
+        """
+        Add an item to the index.
+        item is a google drive api document item.
+        works like a dictionary.
+        """
+        # If we have already indexed this document,
+        # drop the old record first
+        if item['id'] in indexed_ids:
+            writer.delete_by_term('id',item['id'])
+
+        gd = GDrive()
+        service = gd.get_service()
+
+        # IMPORTANT:
+        # This is where the search documents are actually created.
+
+        ##########################################
+        # Two kinds of documents:
+        # - documents with text that can be extracted and indexed
+        # - every other kind
+        #
+        # In Google Drive land, that's (docx) and (everybody else).
+        #
+        # For each document living in the Google Drive folder,
+        # - If mimeType is document:
+        #     - Download it
+        #     - Convert it to markdown
+        #     - Extract and index the content
+        #     - Index everything else
+        # - Else:
+        #     - Just index everything else
+
+        mimetype = re.split('[/\.]',item['mimeType'])[-1]
+        mimemap = {
+            'document' : 'docx',
+        }
+
+        content = ""
+
+        if(mimetype not in mimemap.keys()):
+
+            # ----------
+            # Not a document
+            #
+            # No text to extract
+            #
+            # Technically, there probably is,
+            # but I'm not about to parse powerpoint
+            # or mystery PDF files in python.
+
+            print("Indexing document %s of type %s"%(item['name'], mimetype))
+
+        else:
+
+            # ----------
+            # docx Content Extraction:
+            #
+            # We can only do this with .docx files.
+            # This is a file type we know how to convert.
+            # Construct the URL and download it.
+
+            print("Extracting content from %s of type %s"%(item['name'], mimetype))
+
+            # Create a URL and a destination filename
+            file_ext = mimemap[mimetype]
+            file_url = "https://docs.google.com/document/d/%s/export?format=%s"%(item['id'], file_ext)
+
+            # This re could probably be improved
+            name = re.sub('/','_',item['name'])
+
+            # Now make the pandoc input/output filenames
+            if name.endswith(file_ext):
+                infile_name = name
+                outfile_name = re.sub(file_ext,'md',infile_name)
+            else:
+                infile_name = name+'.'+file_ext
+                outfile_name = name+'.md'
+
+            # Use requests.get to download url to file
+            r = requests.get(file_url, allow_redirects=True)
+            fullpath_input = os.path.join(temp_dir,infile_name)
+            with open(fullpath_input, 'wb') as f:
+                f.write(r.content)
+
+            # Try to convert docx file to markdown
+            fullpath_output = os.path.join(temp_dir,outfile_name)
+            try:
+                output = pypandoc.convert_file(fullpath_input,'gfm',format='docx',outputfile=fullpath_output)
+                assert output == ""
+            except RuntimeError:
+                print("XXXXXX Failed to index document %s"%(item['name']))
+
+            # If export was successful, read contents of markdown
+            # into the content variable.
+            if os.path.isfile(fullpath_output):
+                # Export was successful
+                with codecs.open(fullpath_output, encoding='utf-8') as f:
+                    content = f.read()
+
+            # No matter what happens, clean up.
+            print("Cleaning up %s"%item['name'])
+
+            #subprocess.call(['rm','-fr',fullpath_output])
+            print(" ".join(['rm','-fr',fullpath_output]))
+
+            #subprocess.call(['rm','-fr',fullpath_input])
+            print(" ".join(['rm','-fr',fullpath_input]))
+
+        mimetype = re.split('[/\.]', item['mimeType'])[-1]
+        writer.add_document(
+                id = item['id'],
+                url = item['webViewLink'],
+                timestamp = item['createdTime'],
+                mimetype = mimetype,
+                owner_email = item['owners'][0]['emailAddress'],
+                owner_name = item['owners'][0]['displayName'],
+                title = item['name'],
+                content = content
+        )
-            count += 1
-
-        # TODO:
-        # Major todo item:
-        # - download items using id and get endpoint: https://developers.google.com/drive/api/v3/reference/files/get
-        # - convert to markdown
-        # - add content to content field
-
-        writer.commit()
-        print("Done, updated %d documents in the index" % count)
```
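The "tapping out at 100" TODO in the hunk above is a pagination gap: a single `files().list` call returns at most one page. A hedged sketch of walking every page with `nextPageToken`, per the Drive v3 API (the `get_all_drive_files` helper name is ours):

```python
def get_all_drive_files(service):
    """Walk every page of the Drive v3 file list instead of stopping at 100."""
    items = []
    page_token = None
    while True:
        # pass the previous page's nextPageToken back in to get the next page
        results = service.files().list(
                pageSize=100,
                fields="nextPageToken, files(id, kind, mimeType, name, owners, webViewLink, createdTime)",
                pageToken=page_token).execute()
        items.extend(results.get('files', []))
        page_token = results.get('nextPageToken')
        if page_token is None:
            return items
```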
```diff
@@ -360,17 +440,16 @@ class Search:
             sr.owner_email = r['owner_email']
             sr.owner_name = r['owner_name']
 
-            #sr.content = r['content']
+            sr.content = r['content']
 
-            #highlights = r.highlights('content')
-            #if not highlights:
-            #    # just use the first 1,000 words of the document
-            #    highlights = self.cap(r['content'], 1000)
+            highlights = r.highlights('content')
+            if not highlights:
+                # just use the first 1,000 words of the document
+                highlights = self.cap(r['content'], 1000)
 
-            #highlights = self.html_parser.unescape(highlights)
-            #html = self.markdown(highlights)
-            #sr.content_highlight = html
-            sr.content_highlight = '<p>Hello world</p>'
+            highlights = self.html_parser.unescape(highlights)
+            html = self.markdown(highlights)
+            sr.content_highlight = html
 
             search_results.append(sr)
```
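The newly uncommented lines lean on whoosh's built-in highlighter, which returns an empty string when the match falls outside the stored text, hence the fallback. A small sketch of that pattern in isolation, assuming stored `title` and `content` fields in the schema:

```python
from whoosh.qparser import MultifieldParser

def search_content(ix, query_string):
    """Query the index and return (title, snippet) pairs."""
    parser = MultifieldParser(['title', 'content'], schema=ix.schema)
    query = parser.parse(query_string)
    with ix.searcher() as searcher:
        results = searcher.search(query, terms=True)
        pairs = []
        for r in results:
            # highlights() returns "" if no term hits land in the stored text;
            # fall back to a prefix (the diff caps at 1,000 words via cap())
            snippet = r.highlights('content') or r['content'][:1000]
            pairs.append((r['title'], snippet))
        return pairs
```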
```diff
@@ -5,3 +5,4 @@ httplib2>=0.10.3
 google-api-python-client
 mistune>=0.8.3
 whoosh>=2.7.4
+pypandoc>=1.4
```