4 Commits
v0.1.1 ... v0.2

3 changed files with 147 additions and 35 deletions

View File

@@ -59,6 +59,7 @@ last schema thing to change:
- don't exactly understand that if block but okkkkk.... - don't exactly understand that if block but okkkkk....
## creating apps
[link to google apps docs](https://developers.google.com/api-client-library/python/samples/samples)

View File

@@ -2,12 +2,15 @@ import shutil
import html.parser import html.parser
from gdrive_util import GDrive from gdrive_util import GDrive
from apiclient.http import MediaIoBaseDownload
from markdown_parser import MarkdownParser from markdown_parser import MarkdownParser
import mistune import mistune
from whoosh.fields import * from whoosh.fields import *
import whoosh.index as index import whoosh.index as index
import os import os, re, io, requests
import tempfile, subprocess
import pypandoc
import os.path import os.path
import codecs import codecs
from whoosh.qparser import MultifieldParser, QueryParser from whoosh.qparser import MultifieldParser, QueryParser
@@ -149,6 +152,12 @@ class Search:
- github org/user owning these repos - github org/user owning these repos
- location of the whoosh config file for configuring the search engine - location of the whoosh config file for configuring the search engine
""" """
# Steps to add all documents to index:
#
# Step 1: walk each doc in google drive.
# Step 2: index it.
# Step 3: grab a beer.
if create_new_index: if create_new_index:
self.open_index(self.index_folder, create_new=True) self.open_index(self.index_folder, create_new=True)
@@ -156,13 +165,6 @@ class Search:
gd = GDrive() gd = GDrive()
service = gd.get_service() service = gd.get_service()
# Steps to add all documents to index:
#
# Step 1: walk each doc in google drive.
# Step 2: index it.
# Step 3: grab a beer.
#
# ----- # -----
# Set of all documents on Google Drive: # Set of all documents on Google Drive:
@@ -182,10 +184,15 @@ class Search:
writer = self.ix.writer() writer = self.ix.writer()
temp_dir = tempfile.mkdtemp(dir=os.getcwd())
print("Temporary directory: %s"%(temp_dir))
if not os.path.exists(temp_dir):
os.mkdir(temp_dir)
count = 0 count = 0
for item in items: for item in items:
self.add_item(writer, item, indexed_ids, config) self.add_item(writer, item, indexed_ids, temp_dir, config)
count += 1 count += 1
writer.commit() writer.commit()
@@ -206,12 +213,6 @@ class Search:
- location of the whoosh config file for configuring the search engine - location of the whoosh config file for configuring the search engine
""" """
if create_new_index:
self.open_index(self.index_folder, create_new=True)
gd = GDrive()
service = gd.get_service()
# PoC||GTFO # PoC||GTFO
# Steps to rebuild all documents in index: # Steps to rebuild all documents in index:
@@ -229,6 +230,13 @@ class Search:
# if a document already exists in the index, # if a document already exists in the index,
# it gets removed and re-added. # it gets removed and re-added.
if create_new_index:
self.open_index(self.index_folder, create_new=True)
gd = GDrive()
service = gd.get_service()
# ----- # -----
# Set of all documents on Google Drive: # Set of all documents on Google Drive:
@@ -248,17 +256,22 @@ class Search:
writer = self.ix.writer() writer = self.ix.writer()
temp_dir = tempfile.mkdtemp(dir=os.getcwd())
print("Temporary directory: %s"%(temp_dir))
if not os.path.exists(temp_dir):
os.mkdir(temp_dir)
count = 0 count = 0
for item in items: for item in items:
self.add_item(writer, item, indexed_ids, config) self.add_item(writer, item, indexed_ids, temp_dir, config)
count += 1 count += 1
writer.commit() writer.commit()
print("Done, updated %d documents in the index" % count) print("Done, updated %d documents in the index" % count)
def add_item(self, writer, item, indexed_ids, config): def add_item(self, writer, item, indexed_ids, temp_dir, config):
""" """
Add an item to the index. Add an item to the index.
item is a google drive api document item. item is a google drive api document item.
@@ -266,18 +279,116 @@ class Search:
""" """
# If we have already indexed this document, # If we have already indexed this document,
# drop the old record first # drop the old record first
if item['id'] in indexed_ids: if item['id'] in indexed_ids:
writer.delete_by_term('id',item['id']) writer.delete_by_term('id',item['id'])
gd = GDrive()
service = gd.get_service()
# IMPORTANT: # IMPORTANT:
# This is where the search documents are actually created. # This is where the search documents are actually created.
# TODO: ##########################################
# Major todo item: # Two kinds of documents:
# - download items using id and get endpoint: https://developers.google.com/drive/api/v3/reference/files/get # - documents with text that can be extracted and indexed
# - convert to markdown # - every other kind
# - add content to content field #
# In Google Drive land, that's (docx) and (everybody else).
#
# For each document living in the Google Drive folder,
# - If mimeType is document:
# - Download it
# - Convert it to markdown
# - Extract and index the content
# - Index everything else
# - Else:
# - Just index everything else
mimetype = re.split('[/\.]',item['mimeType'])[-1]
mimemap = {
'document' : 'docx',
}
content = ""
if(mimetype not in mimemap.keys()):
# ----------
# Not a document
#
# No text to extract
#
# Technically, there probably is,
# but I'm not about to parse powerpoint
# or mystery PDF files in python.
print("Indexing document %s of type %s"%(item['name'], mimetype))
else:
# ----------
# docx Content Extraction:
#
# We can only do this with .docx files
# This is a file type we know how to convert
# Construct the URL and download it
print("Extracting content from %s of type %s"%(item['name'], mimetype))
# Create a URL and a destination filename
file_ext = mimemap[mimetype]
file_url = "https://docs.google.com/document/d/%s/export?format=%s"%(item['id'], file_ext)
# This regex could probably be improved
name = re.sub('/','_',item['name'])
# Now make the pandoc input/output filenames
if name.endswith(file_ext):
infile_name = name
outfile_name = re.sub(file_ext,'md',infile_name)
else:
infile_name = name+'.'+file_ext
outfile_name = name+'.md'
# Use requests.get to download url to file
r = requests.get(file_url, allow_redirects=True)
fullpath_input = os.path.join(temp_dir,infile_name)
with open(fullpath_input, 'wb') as f:
f.write(r.content)
# Try to convert docx file to markdown
fullpath_output = os.path.join(temp_dir,outfile_name)
try:
output = pypandoc.convert_file(fullpath_input,'gfm',format='docx',outputfile=fullpath_output)
assert output == ""
except RuntimeError:
print("XXXXXX Failed to index document %s"%(item['name']))
# If export was successful, read contents of markdown
# into the content variable.
if os.path.isfile(fullpath_output):
# Export was successful
with codecs.open(fullpath_output, encoding='utf-8') as f:
content = f.read()
# No matter what happens, clean up.
print("Cleaning up %s"%item['name'])
#subprocess.call(['rm','-fr',fullpath_output])
print(" ".join(['rm','-fr',fullpath_output]))
#subprocess.call(['rm','-fr',fullpath_input])
print(" ".join(['rm','-fr',fullpath_input]))
mimetype = re.split('[/\.]', item['mimeType'])[-1] mimetype = re.split('[/\.]', item['mimeType'])[-1]
writer.add_document( writer.add_document(
@@ -287,12 +398,12 @@ class Search:
timestamp = item['createdTime'], timestamp = item['createdTime'],
owner_email = item['owners'][0]['emailAddress'], owner_email = item['owners'][0]['emailAddress'],
owner_name = item['owners'][0]['displayName'], owner_name = item['owners'][0]['displayName'],
title = item['name'] title = item['name'],
content = content
) )
def create_search_result(self, results): def create_search_result(self, results):
# Allow larger fragments # Allow larger fragments
results.fragmenter.maxchars = 300 results.fragmenter.maxchars = 300
@@ -329,17 +440,16 @@ class Search:
sr.owner_email = r['owner_email'] sr.owner_email = r['owner_email']
sr.owner_name = r['owner_name'] sr.owner_name = r['owner_name']
#sr.content = r['content'] sr.content = r['content']
#highlights = r.highlights('content') highlights = r.highlights('content')
#if not highlights: if not highlights:
# # just use the first 1,000 words of the document # just use the first 1,000 words of the document
# highlights = self.cap(r['content'], 1000) highlights = self.cap(r['content'], 1000)
#highlights = self.html_parser.unescape(highlights) highlights = self.html_parser.unescape(highlights)
#html = self.markdown(highlights) html = self.markdown(highlights)
#sr.content_highlight = html sr.content_highlight = html
sr.content_highlight = '<p>Hello world</p>'
search_results.append(sr) search_results.append(sr)

View File

@@ -5,3 +5,4 @@ httplib2>=0.10.3
google-api-python-client google-api-python-client
mistune>=0.8.3 mistune>=0.8.3
whoosh>=2.7.4 whoosh>=2.7.4
pypandoc>=1.4