4 Commits
v0.1.1 ... v0.2

3 changed files with 147 additions and 35 deletions

View File

@@ -59,6 +59,7 @@ last schema thing to change:
- don't exactly understand that if block but okkkkk.... - don't exactly understand that if block but okkkkk....
## creating apps
[link to google apps docs](https://developers.google.com/api-client-library/python/samples/samples)

View File

@@ -2,12 +2,15 @@ import shutil
import html.parser import html.parser
from gdrive_util import GDrive from gdrive_util import GDrive
from apiclient.http import MediaIoBaseDownload
from markdown_parser import MarkdownParser from markdown_parser import MarkdownParser
import mistune import mistune
from whoosh.fields import * from whoosh.fields import *
import whoosh.index as index import whoosh.index as index
import os import os, re, io, requests
import tempfile, subprocess
import pypandoc
import os.path import os.path
import codecs import codecs
from whoosh.qparser import MultifieldParser, QueryParser from whoosh.qparser import MultifieldParser, QueryParser
@@ -149,6 +152,12 @@ class Search:
- github org/user owning these repos - github org/user owning these repos
- location of the whoosh config file for configuring the search engine - location of the whoosh config file for configuring the search engine
""" """
# Steps to add all documents to index:
#
# Step 1: walk each doc in google drive.
# Step 2: index it.
# Step 3: grab a beer.
if create_new_index: if create_new_index:
self.open_index(self.index_folder, create_new=True) self.open_index(self.index_folder, create_new=True)
@@ -156,13 +165,6 @@ class Search:
gd = GDrive() gd = GDrive()
service = gd.get_service() service = gd.get_service()
# Steps to add all documents to index:
#
# Step 1: walk each doc in google drive.
# Step 2: index it.
# Step 3: grab a beer.
#
# ----- # -----
# Set of all documents on Google Drive: # Set of all documents on Google Drive:
@@ -182,10 +184,15 @@ class Search:
writer = self.ix.writer() writer = self.ix.writer()
temp_dir = tempfile.mkdtemp(dir=os.getcwd())
print("Temporary directory: %s"%(temp_dir))
if not os.path.exists(temp_dir):
os.mkdir(temp_dir)
count = 0 count = 0
for item in items: for item in items:
self.add_item(writer, item, indexed_ids, config) self.add_item(writer, item, indexed_ids, temp_dir, config)
count += 1 count += 1
writer.commit() writer.commit()
@@ -206,12 +213,6 @@ class Search:
- location of the whoosh config file for configuring the search engine - location of the whoosh config file for configuring the search engine
""" """
if create_new_index:
self.open_index(self.index_folder, create_new=True)
gd = GDrive()
service = gd.get_service()
# PoC||GTFO # PoC||GTFO
# Steps to rebuild all documents in index: # Steps to rebuild all documents in index:
@@ -229,6 +230,13 @@ class Search:
# if a document already exists in the index, # if a document already exists in the index,
# it gets removed and re-added. # it gets removed and re-added.
if create_new_index:
self.open_index(self.index_folder, create_new=True)
gd = GDrive()
service = gd.get_service()
# ----- # -----
# Set of all documents on Google Drive: # Set of all documents on Google Drive:
@@ -248,17 +256,22 @@ class Search:
writer = self.ix.writer() writer = self.ix.writer()
temp_dir = tempfile.mkdtemp(dir=os.getcwd())
print("Temporary directory: %s"%(temp_dir))
if not os.path.exists(temp_dir):
os.mkdir(temp_dir)
count = 0 count = 0
for item in items: for item in items:
self.add_item(writer, item, indexed_ids, config) self.add_item(writer, item, indexed_ids, temp_dir, config)
count += 1 count += 1
writer.commit() writer.commit()
print("Done, updated %d documents in the index" % count) print("Done, updated %d documents in the index" % count)
def add_item(self, writer, item, indexed_ids, config): def add_item(self, writer, item, indexed_ids, temp_dir, config):
""" """
Add an item to the index. Add an item to the index.
item is a google drive api document item. item is a google drive api document item.
@@ -266,18 +279,116 @@ class Search:
""" """
# If we have already indexed this document, # If we have already indexed this document,
# drop the old record first # drop the old record first
if item['id'] in indexed_ids: if item['id'] in indexed_ids:
writer.delete_by_term('id',item['id']) writer.delete_by_term('id',item['id'])
gd = GDrive()
service = gd.get_service()
# IMPORTANT: # IMPORTANT:
# This is where the search documents are actually created. # This is where the search documents are actually created.
# TODO: ##########################################
# Major todo item: # Two kinds of documents:
# - download items using id and get endpoint: https://developers.google.com/drive/api/v3/reference/files/get # - documents with text that can be extracted and indexed
# - convert to markdown # - every other kind
# - add content to content field #
# In Google Drive land, that's (docx) and (everybody else).
#
# For each document living in the Google Drive folder,
# - If mimeType is document:
# - Download it
# - Convert it to markdown
# - Extract and index the content
# - Index everything else
# - Else:
# - Just index everything else
mimetype = re.split('[/\.]',item['mimeType'])[-1]
mimemap = {
'document' : 'docx',
}
content = ""
if(mimetype not in mimemap.keys()):
# ----------
# Not a document
#
# No text to extract
#
# Technically, there probably is,
# but I'm not about to parse powerpoint
# or mystery PDF files in python.
print("Indexing document %s of type %s"%(item['name'], mimetype))
else:
# ----------
# docx Content Extraction:
#
# We can only do this with .docx files
# This is a file type we know how to convert
# Construct the URL and download it
print("Extracting content from %s of type %s"%(item['name'], mimetype))
# Create a URL and a destination filename
file_ext = mimemap[mimetype]
file_url = "https://docs.google.com/document/d/%s/export?format=%s"%(item['id'], file_ext)
# This regex could probably be improved
name = re.sub('/','_',item['name'])
# Now make the pandoc input/output filenames
if name.endswith(file_ext):
infile_name = name
outfile_name = re.sub(file_ext,'md',infile_name)
else:
infile_name = name+'.'+file_ext
outfile_name = name+'.md'
# Use requests.get to download url to file
r = requests.get(file_url, allow_redirects=True)
fullpath_input = os.path.join(temp_dir,infile_name)
with open(fullpath_input, 'wb') as f:
f.write(r.content)
# Try to convert docx file to markdown
fullpath_output = os.path.join(temp_dir,outfile_name)
try:
output = pypandoc.convert_file(fullpath_input,'gfm',format='docx',outputfile=fullpath_output)
assert output == ""
except RuntimeError:
print("XXXXXX Failed to index document %s"%(item['name']))
# If export was successful, read contents of markdown
# into the content variable.
if os.path.isfile(fullpath_output):
# Export was successful
with codecs.open(fullpath_output, encoding='utf-8') as f:
content = f.read()
# No matter what happens, clean up.
print("Cleaning up %s"%item['name'])
#subprocess.call(['rm','-fr',fullpath_output])
print(" ".join(['rm','-fr',fullpath_output]))
#subprocess.call(['rm','-fr',fullpath_input])
print(" ".join(['rm','-fr',fullpath_input]))
mimetype = re.split('[/\.]', item['mimeType'])[-1] mimetype = re.split('[/\.]', item['mimeType'])[-1]
writer.add_document( writer.add_document(
@@ -287,12 +398,12 @@ class Search:
timestamp = item['createdTime'], timestamp = item['createdTime'],
owner_email = item['owners'][0]['emailAddress'], owner_email = item['owners'][0]['emailAddress'],
owner_name = item['owners'][0]['displayName'], owner_name = item['owners'][0]['displayName'],
title = item['name'] title = item['name'],
content = content
) )
def create_search_result(self, results): def create_search_result(self, results):
# Allow larger fragments # Allow larger fragments
results.fragmenter.maxchars = 300 results.fragmenter.maxchars = 300
@@ -329,17 +440,16 @@ class Search:
sr.owner_email = r['owner_email'] sr.owner_email = r['owner_email']
sr.owner_name = r['owner_name'] sr.owner_name = r['owner_name']
#sr.content = r['content'] sr.content = r['content']
#highlights = r.highlights('content') highlights = r.highlights('content')
#if not highlights: if not highlights:
# # just use the first 1,000 words of the document # just use the first 1,000 words of the document
# highlights = self.cap(r['content'], 1000) highlights = self.cap(r['content'], 1000)
#highlights = self.html_parser.unescape(highlights) highlights = self.html_parser.unescape(highlights)
#html = self.markdown(highlights) html = self.markdown(highlights)
#sr.content_highlight = html sr.content_highlight = html
sr.content_highlight = '<p>Hello world</p>'
search_results.append(sr) search_results.append(sr)

View File

@@ -5,3 +5,4 @@ httplib2>=0.10.3
google-api-python-client google-api-python-client
mistune>=0.8.3 mistune>=0.8.3
whoosh>=2.7.4 whoosh>=2.7.4
pypandoc>=1.4