mark v0.3 as in the bag

update todo
add pandoc mistune requests to requirements.txt
2018-07-29 17:14:50 -07:00 · 2018-07-29 17:14:17 -07:00 · 2018-07-29 17:12:27 -07:00 · 2018-07-29 17:11:41 -07:00 · 2018-07-29 17:11:02 -07:00 · 2018-07-29 17:10:44 -07:00
5 changed files with 62 additions and 18 deletions
--- a/Readme.md
+++ b/Readme.md
@@ -58,6 +58,9 @@ last schema thing to change:
 - list of fields needs to be updated
 - don't exactly understand that if block but okkkkk....

+## todo
+
+see [Todo.md](Todo.md)

 ## creating apps

--- a/Todo.md
+++ b/Todo.md
@@ -4,7 +4,7 @@ recap of round 1:
 - able to grab a google doc, add metadata, index that metadata with search
 - no content, which is the big next step

-## Round 2
+## v0.2 (done)

 add content:
 - create temp dir
@@ -14,4 +14,40 @@ add content:
 - ???
 - profit

+## v0.3 (done)
+
+~what is up with html formatting?~
+- markdown with html tables is all messed up
+- what's up with it? well, we have a bunch of shite word tables.
+- those are rendered as markdown files full of html.
+- the html is rendered directly by the page.
+- fixed by using pandoc to convert to plain text, not markdown.
+- docx -> text, not docx -> markdown
+
+## v0.4
+
+(later can add a step where we do convert to markdown, extract headers, etc.)
+
+indexing: hashing content
+
+delta/main index
+
+## Learnings for Centillion
+
+whoosh:
+- convert documents to text, not markdown
+- schema for different documents will present the biggest integration challenge
+    - integration tests? 
+    - None values for fields that do not apply to a record?
+    - conditional jinja templating?
+
+licensing:
+- need to improve readme
+- need to unpack the markdown functionality and replace it
+
+flask routes:
+- need to think through routes (separate heroku app, maintenance dashboard,
+  diff/main index)
+
+

--- a/cheeseburger_search.py
+++ b/cheeseburger_search.py
@@ -345,27 +345,34 @@ class Search:
            # This re could probably be improved
            name = re.sub('/','_',item['name'])

-
            # Now make the pandoc input/output filenames
+            out_ext = 'txt'
+            pandoc_fmt = 'plain'
            if name.endswith(file_ext):
                infile_name = name
-                outfile_name = re.sub(file_ext,'md',infile_name)
+                outfile_name = re.sub(file_ext,out_ext,infile_name)
            else:
                infile_name  = name+'.'+file_ext
-                outfile_name = name+'.md'
+                outfile_name = name+'.'+out_ext


+            # assemble input/output file paths
+            fullpath_input = os.path.join(temp_dir,infile_name)
+            fullpath_output = os.path.join(temp_dir,outfile_name)
+
            # Use requests.get to download url to file
            r = requests.get(file_url, allow_redirects=True)
-            fullpath_input = os.path.join(temp_dir,infile_name)
            with open(fullpath_input, 'wb') as f:
                f.write(r.content)


-            # Try to convert docx file to markdown
-            fullpath_output = os.path.join(temp_dir,outfile_name)
+            # Try to convert docx file to plain text
            try:
-                output = pypandoc.convert_file(fullpath_input,'gfm',format='docx',outputfile=fullpath_output)
+                output = pypandoc.convert_file(fullpath_input,
+                                               pandoc_fmt,
+                                               format='docx',
+                                               outputfile=fullpath_output
+                )
                assert output == ""
            except RuntimeError:
                print("XXXXXX Failed to index document %s"%(item['name']))
@@ -383,11 +390,11 @@ class Search:
            # No matter what happens, clean up.
            print("Cleaning up %s"%item['name'])

-            #subprocess.call(['rm','-fr',fullpath_output])
-            print(" ".join(['rm','-fr',fullpath_output]))
+            subprocess.call(['rm','-fr',fullpath_output])
+            #print(" ".join(['rm','-fr',fullpath_output]))

-            #subprocess.call(['rm','-fr',fullpath_input])
-            print(" ".join(['rm','-fr',fullpath_input]))
+            subprocess.call(['rm','-fr',fullpath_input])
+            #print(" ".join(['rm','-fr',fullpath_input]))


        mimetype = re.split('[/\.]', item['mimeType'])[-1]
@@ -403,7 +410,6 @@ class Search:
        )


-
    def create_search_result(self, results):
        # Allow larger fragments
        results.fragmenter.maxchars = 300
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,6 +3,8 @@ apiclient>=1.0.3
 oauth2client>=3.0.0
 httplib2>=0.10.3
 google-api-python-client
-mistune>=0.8.3
+mistune>=0.8
 whoosh>=2.7.4
 pypandoc>=1.4
+requests>=2.19
+pandoc>=1.0
--- a/templates/search.html
+++ b/templates/search.html
@@ -30,14 +30,11 @@
    {% for e in entries %}
    <tr>
        <td class="search-result">
-            <!--
-                <div class="path"><a href='{{ url_for("open_file")}}?path={{e.path|urlencode}}&query={{query}}&fields={{fields}}'>{{e.path}}</a>score: {{'%d'  % e.score}}</div>
-            -->
            <div class="url">
                <a href='{{e.url}}'>{{e.title}} ({{e.mimetype}})</a><br />
                score: {{'%d'  % e.score}}
            </div>
-            <div class="markdown-body">{{ e.content_highlight|safe}}</div>
+            <div class="markdown-body">{{e.content_highlight|safe}}</div>
        </td>
    </tr>
    {% endfor %}
Author	SHA1	Message	Date
Charles Reid	19f3053875	mark v0.3 as in the bag	2018-07-29 17:14:50 -07:00
Charles Reid	2a8ab4b1e2	update todo	2018-07-29 17:14:17 -07:00
Charles Reid	58c4ec4b32	add pandoc mistune requests to requirements.txt	2018-07-29 17:12:27 -07:00
Charles Reid	2978efce63	minor cleanup of search template	2018-07-29 17:11:41 -07:00
Charles Reid	b871d417a0	actually clean up	2018-07-29 17:11:02 -07:00
Charles Reid	b5755c656b	update readme and todo	2018-07-29 17:10:44 -07:00