4 Commits
v0.1 ... v0.1.1

Author SHA1 Message Date
33a2d5d2fc add better id checking in cheeseburger_search 2018-07-29 01:26:27 -07:00
ad50e85e38 fix mysterious bug in app 2018-07-29 01:25:49 -07:00
1dfe9adaab add virtualenv instructions to readme 2018-07-29 00:55:45 -07:00
5d41699aa6 update todo 2018-07-29 00:54:22 -07:00
4 changed files with 91 additions and 95 deletions

View File

@@ -4,6 +4,16 @@ use whoosh to search documents in a google drive folder.
Implemented in **Python** using **Flask**, **Whoosh** and **Mistune**. Implemented in **Python** using **Flask**, **Whoosh** and **Mistune**.
## virtualenv
```
virtualenv vp
source vp/bin/activate
pip install -r requirements.txt
```
## notes ## notes
in addition to the schema changes listed in issues-search: in addition to the schema changes listed in issues-search:

17
Todo.md Normal file
View File

@@ -0,0 +1,17 @@
# TODO
recap of round 1:
- able to grab a google doc, add metadata, index that metadata with search
- no document content is indexed yet; adding it is the big next step
## Round 2
add content:
- create temp dir
- download content using the document id and the Drive API `files.get` endpoint
- convert document to markdown using pandoc
- index the markdown
- ???
- profit

View File

@@ -30,14 +30,14 @@ class UpdateIndexTask(object):
thread.daemon = True thread.daemon = True
thread.start() thread.start()
def run(self, rebuild_index=False): def run(self):
search = Search(app.config["INDEX_DIR"]) search = Search(app.config["INDEX_DIR"])
credentials_file = 'credentials.json' credentials_file = 'credentials.json'
collection = 'charlesreid1dib test collection' collection = 'charlesreid1dib test collection'
search.update_index_incremental(credentials_file, search.update_index_incremental(credentials_file,
collection, app.config,
app.config, create_new_index=self.rebuild_index) create_new_index=self.rebuild_index)
app = Flask(__name__) app = Flask(__name__)

View File

@@ -137,7 +137,7 @@ class Search:
self.ix = index.open_dir(index_folder) self.ix = index.open_dir(index_folder)
def add_all_documents(self, credentials_file, collection, config, create_new_index=False): def add_all_documents(self, credentials_file, config, create_new_index=False):
""" """
Add all issues in a given github repo to the search index. Add all issues in a given github repo to the search index.
@@ -156,7 +156,7 @@ class Search:
gd = GDrive() gd = GDrive()
service = gd.get_service() service = gd.get_service()
# Iindex each document: # Steps to add all documents to index:
# #
# Step 1: walk each doc in google drive. # Step 1: walk each doc in google drive.
# Step 2: index it. # Step 2: index it.
@@ -173,50 +173,29 @@ class Search:
items = results.get('files', []) items = results.get('files', [])
# To use the next token, indexed_ids = set()
# just say results['nextPageToken'] for item in items:
# otherwise use items for the files indexed_ids.add(item['id'])
#
# TODO: # TODO:
# Tapping out at 100, use nextPageToken to get all later # Tapping out at 100, use nextPageToken to get all later
for item in items:
drive_ids.add(item['id']) writer = self.ix.writer()
count = 0 count = 0
for item in items: for item in items:
# If we have already indexed this document, self.add_item(writer, item, indexed_ids, config)
# drop the old record first
if item['id'] in indexed_ids:
writer.delete_by_term('id',item['id'])
# IMPORTANT:
# This is where the search documents are actually created.
mimetype = re.split('[/\.]', item['mimeType'])[-1]
writer.add_document(
id = item['id'],
url = item['webViewLink'],
mimetype = mimetype,
timestamp = item['createdTime'],
owner_email = item['owners'][0]['emailAddress'],
owner_name = item['owners'][0]['displayName'],
title = item['name']
)
count += 1 count += 1
# TODO:
# Major todo item:
# - download items using id and get endpoint: https://developers.google.com/drive/api/v3/reference/files/get
# - convert to markdown
# - add content to content field
writer.commit() writer.commit()
print("Done, created %d documents in the index" % count) print("Done, created %d documents in the index" % count)
def update_index_incremental(self,
def update_index_incremental(self, credentials_file, collection, config, create_new_index=False): credentials_file,
config,
create_new_index=False):
""" """
Update the index of issues of a given github repo. Update the index of issues of a given github repo.
@@ -235,92 +214,82 @@ class Search:
# PoC||GTFO # PoC||GTFO
# Re-index each document: # Steps to rebuild all documents in index:
#
# Step 1: build list of indexed documents
# Step 2: build list of documents on google drive
# Step 3: if indexed documents not on google drive, delete
# Step 4: if indexed documents on google drive, delete and reindex
# Step 5: if non-indexed documents on google drive, index
# #
# Step 1: walk each doc in google drive.
# Step 2: index it.
# Step 2.5: deal with documents removed from google drive.
# Step 3: grab a beer.
# TODO: # TODO:
# Can make Step 4 shorter by storing hash of contents. # Can make Step 2/2.5 shorter by storing hash of contents.
# for now, just... uh... i dunno.
# ----- # figure it out later. don't remove.
# Set of all indexed documents: # update works exactly like add:
indexed_ids = set() # if a document already exists in the index,
# it gets removed and re-added.
with self.ix.searcher() as searcher:
writer = self.ix.writer()
# Loop over the stored fields in the index
# (i.e., each record)
for fields in searcher.all_stored_fields():
indexed_id = fields['id']
indexed_ids.add(indexed_id)
# ----- # -----
# Set of all documents on Google Drive: # Set of all documents on Google Drive:
drive_ids = set()
# Call the Drive v3 API # Call the Drive v3 API
## short record
#results = service.files().list(
# pageSize=100, fields="nextPageToken, files(id, name)").execute()
# long record
results = service.files().list( results = service.files().list(
pageSize=100, fields="nextPageToken, files(id, kind, mimeType, name, owners, webViewLink, createdTime)").execute() pageSize=100, fields="nextPageToken, files(id, kind, mimeType, name, owners, webViewLink, createdTime)").execute()
items = results.get('files', []) items = results.get('files', [])
# To use the next token, indexed_ids = set()
# just say results['nextPageToken'] for item in items:
# otherwise use items for the files indexed_ids.add(item['id'])
#
# TODO: # TODO:
# Tapping out at 100, use nextPageToken to get all later # Tapping out at 100, use nextPageToken to get all later
for item in items:
drive_ids.add(item['id'])
# Remove documents in the index that are not in this drive writer = self.ix.writer()
for indexed_id in indexed_ids:
if indexed_id not in drive_ids:
writer.delete_by_term('id',indexed_id)
count = 0 count = 0
for item in items: for item in items:
self.add_item(writer, item, indexed_ids, config)
count += 1
writer.commit()
print("Done, updated %d documents in the index" % count)
def add_item(self, writer, item, indexed_ids, config):
"""
Add an item to the index.
item is a file entry returned by the Google Drive API;
its fields are accessed like dictionary keys (e.g. item['id']).
"""
# If we have already indexed this document, # If we have already indexed this document,
# drop the old record first # drop the old record first
if item['id'] in indexed_ids: if item['id'] in indexed_ids:
writer.delete_by_term('id',item['id']) writer.delete_by_term('id',item['id'])
# IMPORTANT: # IMPORTANT:
# This is where the search documents are actually created. # This is where the search documents are actually created.
mimetype = re.split('[/\.]', item['mimeType'])[-1]
writer.add_document(
id = item['id'],
url = item['webViewLink'],
timestamp = item['createdTime'],
mimetype = mimetype,
owner_email = item['owners'][0]['emailAddress'],
owner_name = item['owners'][0]['displayName'],
title = item['name']
)
count += 1
# TODO: # TODO:
# Major todo item: # Major todo item:
# - download items using id and get endpoint: https://developers.google.com/drive/api/v3/reference/files/get # - download items using id and get endpoint: https://developers.google.com/drive/api/v3/reference/files/get
# - convert to markdown # - convert to markdown
# - add content to content field # - add content to content field
writer.commit() mimetype = re.split('[/\.]', item['mimeType'])[-1]
print("Done, updated %d documents in the index" % count) writer.add_document(
id = item['id'],
url = item['webViewLink'],
mimetype = mimetype,
timestamp = item['createdTime'],
owner_email = item['owners'][0]['emailAddress'],
owner_name = item['owners'][0]['displayName'],
title = item['name']
)