add better id checking in cheeseburger_search

fix mysterious bug in app
add virtualenv instructions to readme
update todo
2018-07-29 01:26:27 -07:00 · 2018-07-29 01:25:49 -07:00 · 2018-07-29 00:55:45 -07:00 · 2018-07-29 00:54:22 -07:00
4 changed files with 91 additions and 95 deletions
--- a/Readme.md
+++ b/Readme.md
@@ -4,6 +4,16 @@ use whoosh to search documents in a google drive folder.
 Implemented in **Python** using **Flask**, **Whoosh** and **Mistune**.
 ## virtualenv
 ```
 virtualenv vp
 source vp/bin/activate
 pip install -r requirements.txt
 ```
 ## notes
 in addition to the schema changes listed in issues-search:
--- a/Todo.md
+++ b/Todo.md
@@ -0,0 +1,17 @@
 # TODO
 recap of round 1:
 - able to grab a google doc, add metadata, index that metadata with search
 - no content, which is the big next step
 ## Round 2
 add content:
 - create temp dir
 - download content using document id and get api endpoint
 - convert document to markdown using pandoc
 - index the markdown
 - ???
 - profit
--- a/cheeseburger_app.py
+++ b/cheeseburger_app.py
@@ -30,14 +30,14 @@ class UpdateIndexTask(object):
        thread.daemon = True
        thread.start()
-    def run(self, rebuild_index=False):
+    def run(self):
        search = Search(app.config["INDEX_DIR"])
        credentials_file = 'credentials.json'
        collection = 'charlesreid1dib test collection'
        search.update_index_incremental(credentials_file,
-                                        collection,
+                                        app.config, 
-                                        app.config, create_new_index=self.rebuild_index)
+                                        create_new_index=self.rebuild_index)
 app = Flask(__name__)
--- a/cheeseburger_search.py
+++ b/cheeseburger_search.py
@@ -137,7 +137,7 @@ class Search:
            self.ix = index.open_dir(index_folder)
-    def add_all_documents(self, credentials_file, collection, config, create_new_index=False):
+    def add_all_documents(self, credentials_file, config, create_new_index=False):
        """
        Add all issues in a given github repo to the search index.
@@ -156,7 +156,7 @@ class Search:
        gd = GDrive()
        service = gd.get_service()
-        # Iindex each document:
+        # Steps to add all documents to index:
        # 
        # Step 1: walk each doc in google drive. 
        # Step 2: index it.
@@ -173,50 +173,29 @@ class Search:
        items = results.get('files', [])
-        # To use the next token, 
+        indexed_ids = set()
-        # just say results['nextPageToken']
+        for item in items:
-        # otherwise use items for the files
+            indexed_ids.add(item['id'])
-        # 
+
        # TODO:
        # Tapping out at 100, use nextPageToken to get all later
-        for item in items:
+
-            drive_ids.add(item['id'])
+        writer = self.ix.writer()
        count = 0
        for item in items:
-            # If we have already indexed this document, 
+            self.add_item(writer, item, indexed_ids, config)
            # drop the old record first
            if item['id'] in indexed_ids:
                writer.delete_by_term('id',item['id'])
            # IMPORTANT:
            # This is where the search documents are actually created.
            mimetype = re.split('[/\.]', item['mimeType'])[-1]
            writer.add_document(
                    id = item['id'],
                    url = item['webViewLink'],
                    mimetype = mimetype,
                    timestamp = item['createdTime'],
                    owner_email = item['owners'][0]['emailAddress'],
                    owner_name = item['owners'][0]['displayName'],
                    title = item['name']
            )
            count += 1
        # TODO:
        # Major todo item:
        # - download items using id and get endpoint: https://developers.google.com/drive/api/v3/reference/files/get
        # - convert to markdown
        # - add content to content field
        writer.commit()
        print("Done, created %d documents in the index" % count)
-
+    def update_index_incremental(self, 
-    def update_index_incremental(self, credentials_file, collection, config, create_new_index=False):
+                                 credentials_file, 
                                 config, 
                                 create_new_index=False):
        """
        Update the index of issues of a given github repo.
@@ -235,92 +214,82 @@ class Search:
        # PoC||GTFO
-        # Re-index each document:
+        # Steps to rebuild all documents in index:
        # 
        # Step 1: build list of indexed documents
        # Step 2: build list of documents on google drive
        # Step 3: if indexed documents not on google drive, delete
        # Step 4: if indexed documents on google drive, delete and reindex
        # Step 5: if non-indexed documents on google drive, index
        # 
        # Step 1: walk each doc in google drive. 
        # Step 2: index it.
        # Step 2.5: deal with documents removed from google drive.
        # Step 3: grab a beer.
        # TODO:
-        # Can make Step 4 shorter by storing hash of contents.
+        # Can make Step 2/2.5 shorter by storing hash of contents.
-
+        # for now, just... uh... i dunno. 
-        # -----
+        # figure it out later. don't remove.
-        # Set of all indexed documents:
+        # update works exactly like add:
-        indexed_ids = set()
+        # if a document already exists in the index,
-
+        # it gets removed and re-added.
        with self.ix.searcher() as searcher:
            writer = self.ix.writer()
            # Loop over the stored fields in the index
            # (i.e., each record)
            for fields in searcher.all_stored_fields():
                indexed_id = fields['id']
                indexed_ids.add(indexed_id)
        # -----
        # Set of all documents on Google Drive:
        drive_ids = set()
        # Call the Drive v3 API
        ## short record
        #results = service.files().list(
        #    pageSize=100, fields="nextPageToken, files(id, name)").execute()
        # long record
        results = service.files().list(
            pageSize=100, fields="nextPageToken, files(id, kind, mimeType, name, owners, webViewLink, createdTime)").execute()
        items = results.get('files', [])
-        # To use the next token, 
+        indexed_ids = set()
-        # just say results['nextPageToken']
+        for item in items:
-        # otherwise use items for the files
+            indexed_ids.add(item['id'])
-        # 
+
        # TODO:
        # Tapping out at 100, use nextPageToken to get all later
        for item in items:
            drive_ids.add(item['id'])
-        # Remove documents in the index that are not in this drive
+        writer = self.ix.writer()
        for indexed_id in indexed_ids:
            if indexed_id not in drive_ids:
                writer.delete_by_term('id',indexed_id)
        count = 0
        for item in items:
            self.add_item(writer, item, indexed_ids, config)
            count += 1
        writer.commit()
        print("Done, updated %d documents in the index" % count)
    def add_item(self, writer, item, indexed_ids, config):
        """
        Add an item to the index.
        item is a google drive api document item.
        works like a dictionary.
        """
        # If we have already indexed this document, 
        # drop the old record first
        if item['id'] in indexed_ids:
            writer.delete_by_term('id',item['id'])
        # IMPORTANT:
        # This is where the search documents are actually created.
            mimetype = re.split('[/\.]', item['mimeType'])[-1]
            writer.add_document(
                    id = item['id'],
                    url = item['webViewLink'],
                    timestamp = item['createdTime'],
                    mimetype = mimetype,
                    owner_email = item['owners'][0]['emailAddress'],
                    owner_name = item['owners'][0]['displayName'],
                    title = item['name']
            )
            count += 1
        # TODO:
        # Major todo item:
        # - download items using id and get endpoint: https://developers.google.com/drive/api/v3/reference/files/get
        # - convert to markdown
        # - add content to content field
-        writer.commit()
+        mimetype = re.split('[/\.]', item['mimeType'])[-1]
-        print("Done, updated %d documents in the index" % count)
+        writer.add_document(
                id = item['id'],
                url = item['webViewLink'],
                mimetype = mimetype,
                timestamp = item['createdTime'],
                owner_email = item['owners'][0]['emailAddress'],
                owner_name = item['owners'][0]['displayName'],
                title = item['name']
        )
Author	SHA1	Message	Date
Charles Reid	33a2d5d2fc	add better id checking in cheeseburger_search	2018-07-29 01:26:27 -07:00
Charles Reid	ad50e85e38	fix mysterious bug in app	2018-07-29 01:25:49 -07:00
Charles Reid	1dfe9adaab	add virtualenv instructions to readme	2018-07-29 00:55:45 -07:00
Charles Reid	5d41699aa6	update todo	2018-07-29 00:54:22 -07:00