add better id checking in cheeseburger_search

fix mysterious bug in app
add virtualenv instructions to readme
2018-07-29 01:26:27 -07:00 · 2018-07-29 01:25:49 -07:00 · 2018-07-29 00:55:45 -07:00 · 2018-07-29 00:54:22 -07:00
4 changed files with 91 additions and 95 deletions
--- a/Readme.md
+++ b/Readme.md
@@ -4,6 +4,16 @@ use whoosh to search documents in a google drive folder.

 Implemented in **Python** using **Flask**, **Whoosh** and **Mistune**.

+
+## virtualenv
+
+```
+virtualenv vp
+source vp/bin/activate
+pip install -r requirements.txt
+```
+
+
 ## notes

 in addition to the schema changes listed in issues-search:
--- a/Todo.md
+++ b/Todo.md
@@ -0,0 +1,17 @@
+# TODO
+
+recap of round 1:
+- able to grab a google doc, add metadata, index that metadata with search
+- no content, which is the big next step
+
+## Round 2
+
+add content:
+- create temp dir
+- download content using document id and get api endpoint
+- convert document to markdown using pandoc
+- index the markdown
+- ???
+- profit
+
+
--- a/cheeseburger_app.py
+++ b/cheeseburger_app.py
@@ -30,14 +30,14 @@ class UpdateIndexTask(object):
        thread.daemon = True
        thread.start()

-    def run(self, rebuild_index=False):
+    def run(self):
        search = Search(app.config["INDEX_DIR"])

        credentials_file = 'credentials.json'
        collection = 'charlesreid1dib test collection'
        search.update_index_incremental(credentials_file,
-                                        collection,
-                                        app.config, create_new_index=self.rebuild_index)
+                                        app.config, 
+                                        create_new_index=self.rebuild_index)

 app = Flask(__name__)

--- a/cheeseburger_search.py
+++ b/cheeseburger_search.py
@@ -137,7 +137,7 @@ class Search:
            self.ix = index.open_dir(index_folder)


-    def add_all_documents(self, credentials_file, collection, config, create_new_index=False):
+    def add_all_documents(self, credentials_file, config, create_new_index=False):
        """
        Add all issues in a given github repo to the search index.

@@ -156,7 +156,7 @@ class Search:
        gd = GDrive()
        service = gd.get_service()

-        # Iindex each document:
+        # Steps to add all documents to index:
        # 
        # Step 1: walk each doc in google drive. 
        # Step 2: index it.
@@ -173,50 +173,29 @@ class Search:

        items = results.get('files', [])

-        # To use the next token, 
-        # just say results['nextPageToken']
-        # otherwise use items for the files
-        # 
+        indexed_ids = set()
+        for item in items:
+            indexed_ids.add(item['id'])
+
        # TODO:
        # Tapping out at 100, use nextPageToken to get all later
-        for item in items:
-            drive_ids.add(item['id'])
+
+        writer = self.ix.writer()

        count = 0
        for item in items:

-            # If we have already indexed this document, 
-            # drop the old record first
-            if item['id'] in indexed_ids:
-                writer.delete_by_term('id',item['id'])
-
-            # IMPORTANT:
-            # This is where the search documents are actually created.
-
-            mimetype = re.split('[/\.]', item['mimeType'])[-1]
-            writer.add_document(
-                    id = item['id'],
-                    url = item['webViewLink'],
-                    mimetype = mimetype,
-                    timestamp = item['createdTime'],
-                    owner_email = item['owners'][0]['emailAddress'],
-                    owner_name = item['owners'][0]['displayName'],
-                    title = item['name']
-            )
+            self.add_item(writer, item, indexed_ids, config)
            count += 1

-        # TODO:
-        # Major todo item:
-        # - download items using id and get endpoint: https://developers.google.com/drive/api/v3/reference/files/get
-        # - convert to markdown
-        # - add content to content field
-
        writer.commit()
        print("Done, created %d documents in the index" % count)


-
-    def update_index_incremental(self, credentials_file, collection, config, create_new_index=False):
+    def update_index_incremental(self, 
+                                 credentials_file, 
+                                 config, 
+                                 create_new_index=False):
        """
        Update the index of issues of a given github repo.

@@ -235,92 +214,82 @@ class Search:

        # PoC||GTFO

-        # Re-index each document:
+        # Steps to rebuild all documents in index:
        # 
-        # Step 1: build list of indexed documents
-        # Step 2: build list of documents on google drive
-        # Step 3: if indexed documents not on google drive, delete
-        # Step 4: if indexed documents on google drive, delete and reindex
-        # Step 5: if non-indexed documents on google drive, index
-        #
+        # Step 1: walk each doc in google drive. 
+        # Step 2: index it.
+        # Step 2.5: deal with documents removed from google drive.
+        # Step 3: grab a beer.
+
        # TODO:
-        # Can make Step 4 shorter by storing hash of contents.
-
-        # -----
-        # Set of all indexed documents:
-        indexed_ids = set()
-
-        with self.ix.searcher() as searcher:
-            writer = self.ix.writer()
-
-            # Loop over the stored fields in the index
-            # (i.e., each record)
-            for fields in searcher.all_stored_fields():
-                indexed_id = fields['id']
-                indexed_ids.add(indexed_id)
+        # Can make Step 2/2.5 shorter by storing hash of contents.
+        # For now, Step 2.5 is not handled: documents deleted from
+        # Google Drive are not removed from the index. Figure it out later.
+        # update works exactly like add:
+        # if a document already exists in the index,
+        # it gets removed and re-added.

        # -----
        # Set of all documents on Google Drive:
-        drive_ids = set()

        # Call the Drive v3 API

-        ## short record
-        #results = service.files().list(
-        #    pageSize=100, fields="nextPageToken, files(id, name)").execute()
-
-        # long record
        results = service.files().list(
            pageSize=100, fields="nextPageToken, files(id, kind, mimeType, name, owners, webViewLink, createdTime)").execute()

        items = results.get('files', [])

-        # To use the next token, 
-        # just say results['nextPageToken']
-        # otherwise use items for the files
-        # 
+        indexed_ids = set()
+        for item in items:
+            indexed_ids.add(item['id'])
+
        # TODO:
        # Tapping out at 100, use nextPageToken to get all later
-        for item in items:
-            drive_ids.add(item['id'])

-        # Remove documents in the index that are not in this drive
-        for indexed_id in indexed_ids:
-            if indexed_id not in drive_ids:
-                writer.delete_by_term('id',indexed_id)
+        writer = self.ix.writer()

        count = 0
-
        for item in items:

-            # If we have already indexed this document, 
-            # drop the old record first
-            if item['id'] in indexed_ids:
-                writer.delete_by_term('id',item['id'])
-
-            # IMPORTANT:
-            # This is where the search documents are actually created.
-            
-            mimetype = re.split('[/\.]', item['mimeType'])[-1]
-            writer.add_document(
-                    id = item['id'],
-                    url = item['webViewLink'],
-                    timestamp = item['createdTime'],
-                    mimetype = mimetype,
-                    owner_email = item['owners'][0]['emailAddress'],
-                    owner_name = item['owners'][0]['displayName'],
-                    title = item['name']
-            )
+            self.add_item(writer, item, indexed_ids, config)
            count += 1

+        writer.commit()
+        print("Done, updated %d documents in the index" % count)
+
+
+    def add_item(self, writer, item, indexed_ids, config):
+        """
+        Add an item to the index.
+        `item` is a Google Drive API document item;
+        it works like a dictionary.
+        """
+        # If we have already indexed this document, 
+        # drop the old record first
+
+        if item['id'] in indexed_ids:
+            writer.delete_by_term('id',item['id'])
+
+        # IMPORTANT:
+        # This is where the search documents are actually created.
+
        # TODO:
        # Major todo item:
        # - download items using id and get endpoint: https://developers.google.com/drive/api/v3/reference/files/get
        # - convert to markdown
        # - add content to content field

-        writer.commit()
-        print("Done, updated %d documents in the index" % count)
+        mimetype = re.split('[/\.]', item['mimeType'])[-1]
+        writer.add_document(
+                id = item['id'],
+                url = item['webViewLink'],
+                mimetype = mimetype,
+                timestamp = item['createdTime'],
+                owner_email = item['owners'][0]['emailAddress'],
+                owner_name = item['owners'][0]['displayName'],
+                title = item['name']
+        )
+
Author	SHA1	Message	Date
Charles Reid	33a2d5d2fc	add better id checking in cheeseburger_search	2018-07-29 01:26:27 -07:00
Charles Reid	ad50e85e38	fix mysterious bug in app	2018-07-29 01:25:49 -07:00
Charles Reid	1dfe9adaab	add virtualenv instructions to readme	2018-07-29 00:55:45 -07:00
Charles Reid	5d41699aa6	update todo	2018-07-29 00:54:22 -07:00