Compare commits

4 Commits

| Author | SHA1 | Date |
|---|---|---|
| | 33a2d5d2fc | |
| | ad50e85e38 | |
| | 1dfe9adaab | |
| | 5d41699aa6 | |
Readme.md (10 additions)

````diff
@@ -4,6 +4,16 @@ use whoosh to search documents in a google drive folder.
 
 Implemented in **Python** using **Flask**, **Whoosh** and **Mistune**.
 
+## virtualenv
+
+```
+virtualenv vp
+source vp/bin/activate
+pip install -r requirements.txt
+```
+
+
+
 ## notes
 
 in addition to the schema changes listed in issues-search:
````
Todo.md (new file, 17 additions)

```diff
@@ -0,0 +1,17 @@
+# TODO
+
+recap of round 1:
+- able to grab a google doc, add metadata, index that metadata with search
+- no content, which is the big next step
+
+## Round 2
+
+add content:
+- create temp dir
+- download content using document id and get api endpoint
+- convert document to markdown using pandoc
+- index the markdown
+- ???
+- profit
+
+
```
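The Round 2 checklist above describes the content pipeline in prose: make a temp directory, pull the document body down from Drive, convert it to markdown with pandoc, and index the result. A minimal sketch of what that might look like, assuming native Google Docs exported as `.docx` via the Drive v3 `files().export` endpoint and the `pypandoc` wrapper around pandoc; the `fetch_markdown` helper and its parameters are illustrative only and are not part of this changeset:

```python
import os
import tempfile

import pypandoc  # thin wrapper around the pandoc binary

DOCX_MIMETYPE = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'

def fetch_markdown(service, file_id):
    """Hypothetical Round 2 helper: export one Google Doc and return it as markdown."""
    # "create temp dir"
    tmpdir = tempfile.mkdtemp()
    docx_path = os.path.join(tmpdir, file_id + '.docx')

    # "download content using document id": export the doc as .docx bytes
    content = service.files().export(fileId=file_id, mimeType=DOCX_MIMETYPE).execute()
    with open(docx_path, 'wb') as f:
        f.write(content)

    # "convert document to markdown using pandoc"
    return pypandoc.convert_file(docx_path, 'markdown', format='docx')
```

The returned markdown would then presumably feed a `content` field on the Whoosh writer once that field is added to the schema ("index the markdown").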
```diff
@@ -30,14 +30,14 @@ class UpdateIndexTask(object):
         thread.daemon = True
         thread.start()
 
-    def run(self, rebuild_index=False):
+    def run(self):
         search = Search(app.config["INDEX_DIR"])
 
         credentials_file = 'credentials.json'
         collection = 'charlesreid1dib test collection'
         search.update_index_incremental(credentials_file,
-                                        collection,
-                                        app.config, create_new_index=self.rebuild_index)
+                                        app.config,
+                                        create_new_index=self.rebuild_index)
 
 app = Flask(__name__)
 
```
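The `run()` signature change means the rebuild flag is no longer passed at call time; `run()` now reads `self.rebuild_index`, so the flag has to be captured when the task object is constructed. The constructor is not part of this hunk; a minimal sketch of what it presumably looks like (the `__init__` body here is an assumption, consistent with the `thread.daemon`/`thread.start()` context lines above):

```python
import threading

class UpdateIndexTask(object):
    """Kick off an index update on a background daemon thread."""

    # Assumed constructor (not shown in this diff): stores the rebuild flag
    # on the instance so run() can pick it up as self.rebuild_index.
    def __init__(self, rebuild_index=False):
        self.rebuild_index = rebuild_index
        thread = threading.Thread(target=self.run, args=())
        thread.daemon = True
        thread.start()
```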
```diff
@@ -137,7 +137,7 @@ class Search:
         self.ix = index.open_dir(index_folder)
 
 
-    def add_all_documents(self, credentials_file, collection, config, create_new_index=False):
+    def add_all_documents(self, credentials_file, config, create_new_index=False):
         """
         Add all issues in a given github repo to the search index.
 
```
```diff
@@ -156,7 +156,7 @@ class Search:
         gd = GDrive()
         service = gd.get_service()
 
-        # Iindex each document:
+        # Steps to add all documents to index:
         #
         # Step 1: walk each doc in google drive.
         # Step 2: index it.
```
```diff
@@ -173,50 +173,29 @@ class Search:
 
         items = results.get('files', [])
 
-        # To use the next token,
-        # just say results['nextPageToken']
-        # otherwise use items for the files
-        #
+        indexed_ids = set()
+        for item in items:
+            indexed_ids.add(item['id'])
+
         # TODO:
         # Tapping out at 100, use nextPageToken to get all later
-        for item in items:
-            drive_ids.add(item['id'])
+
+        writer = self.ix.writer()
 
         count = 0
         for item in items:
 
-            # If we have already indexed this document,
-            # drop the old record first
-            if item['id'] in indexed_ids:
-                writer.delete_by_term('id',item['id'])
-
-            # IMPORTANT:
-            # This is where the search documents are actually created.
-
-            mimetype = re.split('[/\.]', item['mimeType'])[-1]
-            writer.add_document(
-                    id = item['id'],
-                    url = item['webViewLink'],
-                    mimetype = mimetype,
-                    timestamp = item['createdTime'],
-                    owner_email = item['owners'][0]['emailAddress'],
-                    owner_name = item['owners'][0]['displayName'],
-                    title = item['name']
-            )
+            self.add_item(writer, item, indexed_ids, config)
+
             count += 1
 
-            # TODO:
-            # Major todo item:
-            # - download items using id and get endpoint: https://developers.google.com/drive/api/v3/reference/files/get
-            # - convert to markdown
-            # - add content to content field
-
         writer.commit()
         print("Done, created %d documents in the index" % count)
 
 
-    def update_index_incremental(self, credentials_file, collection, config, create_new_index=False):
+    def update_index_incremental(self,
+                                 credentials_file,
+                                 config,
+                                 create_new_index=False):
         """
         Update the index of issues of a given github repo.
 
```
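The per-item logic that used to live inline here (drop any existing record with `delete_by_term`, then `add_document`) moves into the new `add_item()` helper shown in the next hunk. As a side note, Whoosh also offers `update_document()` for exactly this delete-then-re-add pattern when the lookup field is declared `unique` in the schema; a small sketch under that assumption, with the field list trimmed to what this PR indexes:

```python
import os

from whoosh import index
from whoosh.fields import Schema, ID, TEXT

# Sketch of a schema where `id` is unique, so update_document can replace
# an existing record in one call (equivalent to delete_by_term + add_document).
schema = Schema(
    id=ID(stored=True, unique=True),
    url=ID(stored=True),
    mimetype=TEXT(stored=True),
    timestamp=TEXT(stored=True),
    owner_email=TEXT(stored=True),
    owner_name=TEXT(stored=True),
    title=TEXT(stored=True),
)

os.makedirs("search_index", exist_ok=True)
ix = index.create_in("search_index", schema)

writer = ix.writer()
writer.update_document(id="abc123", title="example doc")  # replaces any doc whose id == "abc123"
writer.commit()
```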
```diff
@@ -235,92 +214,82 @@ class Search:
 
         # PoC||GTFO
 
-        # Re-index each document:
-        #
-        # Step 1: build list of indexed documents
-        # Step 2: build list of documents on google drive
-        # Step 3: if indexed documents not on google drive, delete
-        # Step 4: if indexed documents on google drive, delete and reindex
-        # Step 5: if non-indexed documents on google drive, index
+        # Steps to rebuild all documents in index:
         #
+        # Step 1: walk each doc in google drive.
+        # Step 2: index it.
+        # Step 2.5: deal with documents removed from google drive.
+        # Step 3: grab a beer.
 
         # TODO:
-        # Can make Step 4 shorter by storing hash of contents.
+        # Can make Step 2/2.5 shorter by storing hash of contents.
+        # for now, just... uh... i dunno.
+        # figure it out later. don't remove.
 
-        # -----
-        # Set of all indexed documents:
-        indexed_ids = set()
-
-        with self.ix.searcher() as searcher:
-            writer = self.ix.writer()
-
-            # Loop over the stored fields in the index
-            # (i.e., each record)
-            for fields in searcher.all_stored_fields():
-                indexed_id = fields['id']
-                indexed_ids.add(indexed_id)
+        # update works exactly like add:
+        # if a document already exists in the index,
+        # it gets removed and re-added.
 
         # -----
         # Set of all documents on Google Drive:
-        drive_ids = set()
 
         # Call the Drive v3 API
-
-        ## short record
-        #results = service.files().list(
-        #        pageSize=100, fields="nextPageToken, files(id, name)").execute()
-
-        # long record
         results = service.files().list(
                 pageSize=100, fields="nextPageToken, files(id, kind, mimeType, name, owners, webViewLink, createdTime)").execute()
 
         items = results.get('files', [])
 
-        # To use the next token,
-        # just say results['nextPageToken']
-        # otherwise use items for the files
-        #
+        indexed_ids = set()
+        for item in items:
+            indexed_ids.add(item['id'])
+
         # TODO:
         # Tapping out at 100, use nextPageToken to get all later
-        for item in items:
-            drive_ids.add(item['id'])
 
-        # Remove documents in the index that are not in this drive
-        for indexed_id in indexed_ids:
-            if indexed_id not in drive_ids:
-                writer.delete_by_term('id',indexed_id)
+        writer = self.ix.writer()
 
         count = 0
 
         for item in items:
 
+            self.add_item(writer, item, indexed_ids, config)
+            count += 1
+
+        writer.commit()
+        print("Done, updated %d documents in the index" % count)
+
+
+    def add_item(self, writer, item, indexed_ids, config):
+        """
+        Add an item to the index.
+        item is a google drive api document item.
+        works like a dictionary.
+        """
         # If we have already indexed this document,
         # drop the old record first
 
         if item['id'] in indexed_ids:
             writer.delete_by_term('id',item['id'])
 
         # IMPORTANT:
         # This is where the search documents are actually created.
 
-            mimetype = re.split('[/\.]', item['mimeType'])[-1]
-            writer.add_document(
-                    id = item['id'],
-                    url = item['webViewLink'],
-                    timestamp = item['createdTime'],
-                    mimetype = mimetype,
-                    owner_email = item['owners'][0]['emailAddress'],
-                    owner_name = item['owners'][0]['displayName'],
-                    title = item['name']
-            )
-            count += 1
-
         # TODO:
         # Major todo item:
         # - download items using id and get endpoint: https://developers.google.com/drive/api/v3/reference/files/get
         # - convert to markdown
         # - add content to content field
 
-        writer.commit()
-        print("Done, updated %d documents in the index" % count)
+        mimetype = re.split('[/\.]', item['mimeType'])[-1]
+        writer.add_document(
+                id = item['id'],
+                url = item['webViewLink'],
+                mimetype = mimetype,
+                timestamp = item['createdTime'],
+                owner_email = item['owners'][0]['emailAddress'],
+                owner_name = item['owners'][0]['displayName'],
+                title = item['name']
+        )
 
```
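Both the add and update paths still stop at the first page of results, as the "Tapping out at 100" TODO notes in each hunk. The usual Drive v3 fix is to loop on `nextPageToken`; a short sketch, assuming the same authenticated `service` object used above (the helper name is illustrative):

```python
def list_all_files(service):
    """Page through files().list() until the nextPageToken runs out."""
    items = []
    page_token = None
    while True:
        results = service.files().list(
            pageSize=100,
            fields="nextPageToken, files(id, kind, mimeType, name, owners, webViewLink, createdTime)",
            pageToken=page_token).execute()
        items.extend(results.get('files', []))
        page_token = results.get('nextPageToken')
        if page_token is None:
            break
    return items
```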