2 Commits

Author SHA1 Message Date
4b90057664 add advanced search button to search template 2018-08-21 10:04:48 -07:00
90f49e7626 starting point: merge 'feedback-floater' branch into 'advanced-search' branch
* feedback-floater:
  add dismissable "thanks for your feedback" message to top
  improve message formatting
  add dummy function as placeholder for where we add info messages
  return better messages
  add successful post call and export to JSON db
  update todo
  move modal into its own .html file
  update todo with tasks
  fix button and smiley styles
  add /feedback post route
  feedback button successfully triggers a modal
  add page self-identifiers. add "send feedback" button. fix layouts.
2018-08-20 21:04:54 -07:00
6 changed files with 184 additions and 311 deletions

.gitignore
View File

@@ -1,4 +1,5 @@
+feedback_database.json
 config_centillion.py
 config_flask.py
 vp
 credentials.json

View File

@@ -342,10 +342,5 @@ def store_search(query, fields):
 if __name__ == '__main__':
     # if running local instance, set to true
     os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = 'true'
-    port = os.environ.get('CENTILLION_PORT','')
-    if port=='':
-        port = 5000
-    else:
-        port = int(port)
-    app.run(host="0.0.0.0",port=port)
+    app.run(host="0.0.0.0",port=5000)
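
Note: the removed block read the listen port from a CENTILLION_PORT environment variable; the merge pins it back to 5000. A minimal sketch of the removed pattern (standalone, not the exact original):

    import os

    # Read the port from CENTILLION_PORT, falling back to 5000 when the
    # variable is unset or empty; int() raises ValueError on a malformed value.
    port = int(os.environ.get('CENTILLION_PORT') or 5000)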

View File

@@ -21,8 +21,6 @@ import dateutil.parser
 from whoosh.qparser import MultifieldParser, QueryParser
 from whoosh.analysis import StemmingAnalyzer
-from whoosh.qparser.dateparse import DateParserPlugin
-from whoosh import fields, index
 """
@@ -182,38 +180,30 @@ class Search:
         # is defined.
         schema = Schema(
-                id = fields.ID(stored=True, unique=True),
-                kind = fields.ID(stored=True),
-                created_time = fields.DATETIME(stored=True),
-                modified_time = fields.DATETIME(stored=True),
-                indexed_time = fields.DATETIME(stored=True),
-                title = fields.TEXT(stored=True, field_boost=100.0),
-                url = fields.ID(stored=True),
-                mimetype = fields.TEXT(stored=True),
-                owner_email = fields.ID(stored=True),
-                owner_name = fields.TEXT(stored=True),
-                # mainly for email threads, groups.io, hypothesis
-                group = fields.ID(stored=True),
-                repo_name = fields.TEXT(stored=True),
-                repo_url = fields.ID(stored=True),
-                github_user = fields.TEXT(stored=True),
-                tags = fields.KEYWORD(commas=True,
-                                      stored=True,
-                                      lowercase=True),
+                id = ID(stored=True, unique=True),
+                kind = ID(stored=True),
+                created_time = ID(stored=True),
+                modified_time = ID(stored=True),
+                indexed_time = ID(stored=True),
+                title = TEXT(stored=True, field_boost=100.0),
+                url = ID(stored=True, unique=True),
+                mimetype=ID(stored=True),
+                owner_email=ID(stored=True),
+                owner_name=TEXT(stored=True),
+                repo_name=TEXT(stored=True),
+                repo_url=ID(stored=True),
+                github_user=TEXT(stored=True),
                 # comments only
-                issue_title = fields.TEXT(stored=True, field_boost=100.0),
-                issue_url = fields.ID(stored=True),
-                content = fields.TEXT(stored=True, analyzer=stemming_analyzer)
+                issue_title=TEXT(stored=True, field_boost=100.0),
+                issue_url=ID(stored=True),
+                content=TEXT(stored=True, analyzer=stemming_analyzer)
         )
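
Note: dropping the fields. prefix implies the module now imports the field types directly from whoosh.fields. A minimal sketch of a Whoosh schema in that style (illustrative field names, not the full centillion schema):

    from whoosh.fields import Schema, ID, TEXT
    from whoosh.index import create_in

    schema = Schema(
        id=ID(stored=True, unique=True),             # unique document key
        title=TEXT(stored=True, field_boost=100.0),  # boost title matches in ranking
        content=TEXT(stored=True),                   # main searchable body
    )
    ix = create_in("indexdir", schema)  # directory must already exist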
@@ -253,22 +243,17 @@ class Search:
             writer.delete_by_term('id',item['id'])
             # Index a plain google drive file
-            created_time = dateutil.parser.parse(item['createdTime'])
-            modified_time = dateutil.parser.parse(item['modifiedTime'])
-            indexed_time = datetime.now().replace(microsecond=0)
-            try:
             writer.add_document(
                     id = item['id'],
                     kind = 'gdoc',
-                    created_time = created_time,
-                    modified_time = modified_time,
-                    indexed_time = indexed_time,
+                    created_time = item['createdTime'],
+                    modified_time = item['modifiedTime'],
+                    indexed_time = datetime.now().replace(microsecond=0).isoformat(),
                     title = item['name'],
                     url = item['webViewLink'],
                     mimetype = mimetype,
                     owner_email = item['owners'][0]['emailAddress'],
                     owner_name = item['owners'][0]['displayName'],
-                    group='',
                     repo_name='',
                     repo_url='',
                     github_user='',
@@ -276,9 +261,6 @@ class Search:
                     issue_url='',
                     content = content
             )
-            except ValueError as e:
-                print(repr(e))
-                print(" > XXXXXX Failed to index Google Drive file \"%s\""%(item['name']))
else:
@@ -332,7 +314,7 @@ class Search:
             )
             assert output == ""
         except RuntimeError:
-            print(" > XXXXXX Failed to index Google Drive document \"%s\""%(item['name']))
+            print(" > XXXXXX Failed to index document \"%s\""%(item['name']))
         # If export was successful, read contents of markdown
@@ -360,22 +342,17 @@ class Search:
         else:
             print(" > Creating a new record")
-            try:
-                created_time = dateutil.parser.parse(item['createdTime'])
-                modified_time = dateutil.parser.parse(item['modifiedTime'])
-                indexed_time = datetime.now()
             writer.add_document(
                     id = item['id'],
                     kind = 'gdoc',
-                    created_time = created_time,
-                    modified_time = modified_time,
-                    indexed_time = indexed_time,
+                    created_time = item['createdTime'],
+                    modified_time = item['modifiedTime'],
+                    indexed_time = datetime.now().replace(microsecond=0).isoformat(),
                     title = item['name'],
                     url = item['webViewLink'],
                     mimetype = mimetype,
                     owner_email = item['owners'][0]['emailAddress'],
                     owner_name = item['owners'][0]['displayName'],
-                    group='',
                     repo_name='',
                     repo_url='',
                     github_user='',
@@ -383,10 +360,6 @@ class Search:
                     issue_url='',
                     content = content
             )
-            except ValueError as e:
-                print(repr(e))
-                print(" > XXXXXX Failed to index Google Drive file \"%s\""%(item['name']))
@@ -420,14 +393,13 @@ class Search:
                 issue_comment_content += comment.body.rstrip()
                 issue_comment_content += "\n"
-            # Now create the actual search index record.
-            # Add one document per issue thread,
-            # containing entire text of thread.
-            created_time = issue.created_at
-            modified_time = issue.updated_at
-            indexed_time = datetime.now()
-            try:
+            # Now create the actual search index record
+            created_time = clean_timestamp(issue.created_at)
+            modified_time = clean_timestamp(issue.updated_at)
+            indexed_time = clean_timestamp(datetime.now())
+            # Add one document per issue thread,
+            # containing entire text of thread.
             writer.add_document(
                     id = issue.html_url,
                     kind = 'issue',
@@ -439,7 +411,6 @@ class Search:
                     mimetype='',
                     owner_email='',
                     owner_name='',
-                    group='',
                     repo_name = repo_name,
                     repo_url = repo_url,
                     github_user = issue.user.login,
@@ -447,9 +418,6 @@ class Search:
                     issue_url = issue.html_url,
                     content = issue_comment_content
             )
-            except ValueError as e:
-                print(repr(e))
-                print(" > XXXXXX Failed to index Github issue \"%s\""%(issue.title))
@@ -479,8 +447,7 @@ class Search:
             print(" > XXXXXXXX Failed to find file info.")
             return
-        indexed_time = datetime.now()
+        indexed_time = clean_timestamp(datetime.now())
         if fext in MARKDOWN_EXTS:
             print("Indexing markdown doc %s from repo %s"%(fname,repo_name))
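
Note: clean_timestamp is not shown in this diff; judging from the inline datetime.now().replace(microsecond=0).isoformat() calls it replaces elsewhere, it is presumably a small helper along these lines (an assumption, not the actual definition):

    from datetime import datetime

    def clean_timestamp(dt):
        # Render a datetime as a trimmed ISO 8601 string,
        # e.g. datetime(2018, 8, 21, 10, 4, 48, 500) -> "2018-08-21T10:04:48"
        return dt.replace(microsecond=0).isoformat()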
@@ -509,19 +476,17 @@ class Search:
             usable_url = "https://github.com/%s/blob/master/%s"%(repo_name, fpath)
             # Now create the actual search index record
-            try:
             writer.add_document(
                     id = fsha,
                     kind = 'markdown',
-                    created_time = None,
-                    modified_time = None,
+                    created_time = '',
+                    modified_time = '',
                     indexed_time = indexed_time,
                     title = fname,
                     url = usable_url,
                     mimetype='',
                     owner_email='',
                     owner_name='',
-                    group='',
                     repo_name = repo_name,
                     repo_url = repo_url,
                     github_user = '',
@@ -529,11 +494,6 @@ class Search:
                     issue_url = '',
                     content = content
             )
-            except ValueError as e:
-                print(repr(e))
-                print(" > XXXXXX Failed to index Github markdown file \"%s\""%(fname))
         else:
             print("Indexing github file %s from repo %s"%(fname,repo_name))
@@ -541,19 +501,17 @@ class Search:
             key = fname+"_"+fsha
             # Now create the actual search index record
-            try:
             writer.add_document(
                     id = key,
                     kind = 'ghfile',
-                    created_time = None,
-                    modified_time = None,
+                    created_time = '',
+                    modified_time = '',
                     indexed_time = indexed_time,
                     title = fname,
                     url = repo_url,
                     mimetype='',
                     owner_email='',
                     owner_name='',
-                    group='',
                     repo_name = repo_name,
                     repo_url = repo_url,
                     github_user = '',
@@ -561,9 +519,6 @@ class Search:
                     issue_url = '',
                     content = ''
             )
-            except ValueError as e:
-                print(repr(e))
-                print(" > XXXXXX Failed to index Github file \"%s\""%(fname))
@@ -577,32 +532,20 @@ class Search:
         Use a Github file API record to add a filename
         to the search index.
         """
-        if 'created_time' in d.keys() and d['created_time'] is not None:
-            created_time = d['created_time']
-        else:
-            created_time = None
-        if 'modified_time' in d.keys() and d['modified_time'] is not None:
-            modified_time = d['modified_time']
-        else:
-            modified_time = None
-        indexed_time = datetime.now()
+        indexed_time = clean_timestamp(datetime.now())
         # Now create the actual search index record
-        try:
         writer.add_document(
                 id = d['permalink'],
                 kind = 'emailthread',
-                created_time = created_time,
-                modified_time = modified_time,
+                created_time = '',
+                modified_time = '',
                 indexed_time = indexed_time,
                 title = d['subject'],
                 url = d['permalink'],
                 mimetype='',
                 owner_email='',
                 owner_name=d['original_sender'],
-                group=d['subgroup'],
                 repo_name = '',
                 repo_url = '',
                 github_user = '',
@@ -610,9 +553,7 @@ class Search:
                 issue_url = '',
                 content = d['content']
         )
-        except ValueError as e:
-            print(repr(e))
-            print(" > XXXXXX Failed to index Groups.io thread \"%s\""%(d['subject']))
@@ -690,10 +631,10 @@ class Search:
                 full_items[f['id']] = f
             ## Shorter:
             #break
-            # Longer:
-            if nextPageToken is None:
-                break
+            ## Longer:
+            #if nextPageToken is None:
+            #    break
         writer = self.ix.writer()
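
Note: the commented-out lines are the tail of the standard Google Drive API paging loop. A sketch of that pattern, assuming service is an authorized Drive v3 client from google-api-python-client:

    # Page through all files, collecting them by id.
    full_items = {}
    nextPageToken = None
    while True:
        response = service.files().list(
            pageSize=100,
            pageToken=nextPageToken,
            fields="nextPageToken, files(id, name, mimeType)",
        ).execute()
        for f in response.get('files', []):
            full_items[f['id']] = f
        nextPageToken = response.get('nextPageToken')
        if nextPageToken is None:
            break  # no more pages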
@@ -701,7 +642,7 @@ class Search:
         temp_dir = tempfile.mkdtemp(dir=os.getcwd())
         print("Temporary directory: %s"%(temp_dir))
-        try:
         # Drop any id in indexed_ids
         # not in remote_ids
@@ -729,13 +670,6 @@ class Search:
                 self.add_drive_file(writer, item, temp_dir, config, update=False)
                 count += 1
-        except Exception as e:
-            print("ERROR: While adding Google Drive files to search index")
-            print("-"*40)
-            print(repr(e))
-            print("-"*40)
-            print("Continuing...")
-            pass
         print("Cleaning temporary directory: %s"%(temp_dir))
         subprocess.call(['rm','-fr',temp_dir])
@@ -1140,7 +1074,7 @@ class Search:
         elif doctype=='issue':
             item_keys = ['title','repo_name','repo_url','url','created_time','modified_time']
         elif doctype=='emailthread':
-            item_keys = ['title','owner_name','url','created_time','modified_time']
+            item_keys = ['title','owner_name','url']
         elif doctype=='ghfile':
             item_keys = ['title','repo_name','repo_url','url']
         elif doctype=='markdown':
@@ -1157,6 +1091,10 @@ class Search:
         for r in results:
             d = {}
             for k in item_keys:
+                if k=='created_time' or k=='modified_time':
+                    #d[k] = r[k]
+                    d[k] = dateutil.parser.parse(r[k]).strftime("%Y-%m-%d")
+                else:
                     d[k] = r[k]
             json_results.append(d)
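
Note: with timestamps now stored as plain strings, the added branch re-parses them at display time with python-dateutil, for example:

    import dateutil.parser

    ts = "2018-08-21T10:04:48"  # ISO string as stored in the index
    print(dateutil.parser.parse(ts).strftime("%Y-%m-%d"))  # prints: 2018-08-21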
@@ -1170,9 +1108,7 @@ class Search:
         query_string = " ".join(query_list)
         query = None
         if ":" in query_string:
-            query = QueryParser("content", self.schema)
-            query.add_plugin(DateParserPlugin(free=True))
-            query = query.parse(query_string)
+            query = QueryParser("content", self.schema).parse(query_string)
         elif len(fields) == 1 and fields[0] == "filename":
             pass
         elif len(fields) == 2:
@@ -1180,12 +1116,9 @@ class Search:
         else:
             # If the user does not specify a field,
             # these are the fields that are actually searched
-            fields = ['title', 'content','owner_name','owner_email','url','created_date','modified_date']
+            fields = ['title', 'content','owner_name','owner_email','url']
         if not query:
-            query = MultifieldParser(fields, schema=self.ix.schema)
-            query.add_plugin(DateParserPlugin(free=True))
-            query = query.parse(query_string)
-            #query = MultifieldParser(fields, schema=self.ix.schema).parse(query_string)
+            query = MultifieldParser(fields, schema=self.ix.schema).parse(query_string)
         parsed_query = "%s" % query
         print("query: %s" % parsed_query)
         results = searcher.search(query, terms=False, scored=True, groupedby="kind")
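
Note: removing DateParserPlugin reduces both branches to a plain parse. A minimal sketch of the two Whoosh parsers used above, against a toy schema:

    from whoosh.fields import Schema, ID, TEXT
    from whoosh.qparser import MultifieldParser, QueryParser

    schema = Schema(url=ID(stored=True),
                    title=TEXT(stored=True),
                    content=TEXT(stored=True))

    # Field-qualified queries like "title:whoosh" go through QueryParser.
    q1 = QueryParser("content", schema).parse("title:whoosh")

    # Unqualified queries are matched against several fields at once.
    q2 = MultifieldParser(["title", "content"], schema=schema).parse("search engine")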

View File

@@ -1,28 +0,0 @@
-config = {
-    "repositories" : [
-        "dcppc/project-management",
-        "dcppc/nih-demo-meetings",
-        "dcppc/internal",
-        "dcppc/organize",
-        "dcppc/dcppc-bot",
-        "dcppc/full-stacks",
-        "dcppc/design-guidelines-discuss",
-        "dcppc/dcppc-deliverables",
-        "dcppc/dcppc-milestones",
-        "dcppc/crosscut-metadata",
-        "dcppc/lucky-penny",
-        "dcppc/dcppc-workshops",
-        "dcppc/metadata-matrix",
-        "dcppc/data-stewards",
-        "dcppc/dcppc-phase1-demos",
-        "dcppc/apis",
-        "dcppc/2018-june-workshop",
-        "dcppc/2018-july-workshop",
-        "dcppc/2018-august-workshop",
-        "dcppc/2018-september-workshop",
-        "dcppc/design-guidelines",
-        "dcppc/2018-may-workshop",
-        "dcppc/centillion"
-    ]
-}

View File

@@ -1,7 +1,5 @@
 import requests, os, re
 from bs4 import BeautifulSoup
-import dateutil.parser
-import datetime
 
 class GroupsIOException(Exception):
     pass
@@ -66,7 +64,7 @@ class GroupsIOArchivesCrawler(object):
             ## Short circuit
             ## for debugging purposes
-            break
+            #break
 
         return subgroups
@@ -253,7 +251,7 @@ class GroupsIOArchivesCrawler(object):
         subject = soup.find('title').text
         # Extract information for the schema:
-        # - permalink for thread (done above)
+        # - permalink for thread (done)
         # - subject/title (done)
         # - original sender email/name (done)
         # - content (done)
@@ -268,35 +266,11 @@ class GroupsIOArchivesCrawler(object):
                 pass
             else:
                 # found an email!
-                # this is a maze, not amazing.
-                # thanks groups.io!
+                # this is a maze, thanks groups.io
                 td = tr.find('td')
-                sender_divrow = td.find('div',{'class':'row'})
-                sender_divrow = sender_divrow.find('div',{'class':'pull-left'})
+                divrow = td.find('div',{'class':'row'}).find('div',{'class':'pull-left'})
                 if (i+1)==1:
-                    original_sender = sender_divrow.text.strip()
-                date_divrow = td.find('div',{'class':'row'})
-                date_divrow = date_divrow.find('div',{'class':'pull-right'})
-                date_divrow = date_divrow.find('font',{'class':'text-muted'})
-                date_divrow = date_divrow.find('script').text
-                try:
-                    time_seconds = re.search(' [0-9]{1,} ',date_divrow).group(0)
-                    time_seconds = time_seconds.strip()
-                    # Thanks groups.io for the weird date formatting
-                    time_seconds = time_seconds[:10]
-                    mmicro_seconds = time_seconds[10:]
-                    if (i+1)==1:
-                        created_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
-                        modified_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
-                    else:
-                        modified_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
-                except AttributeError:
-                    created_time = None
-                    modified_time = None
+                    original_sender = divrow.text.strip()
                 for div in td.find_all('div'):
                     if div.has_attr('id'):
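
Note: the deleted block recovered post dates by regex-matching an epoch timestamp embedded in an inline <script> tag. Stripped to its essence, the pattern was roughly the following (the script text is a hypothetical stand-in for the markup groups.io emits):

    import datetime
    import re

    script_text = " 1534867200123 "  # hypothetical: epoch milliseconds scraped from a <script>
    match = re.search(' [0-9]{1,} ', script_text)
    if match is not None:
        digits = match.group(0).strip()
        # the first ten digits are whole seconds; the rest are milliseconds
        created_time = datetime.datetime.utcfromtimestamp(int(digits[:10]))
        print(created_time)  # 2018-08-21 16:00:00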
@@ -325,10 +299,7 @@ class GroupsIOArchivesCrawler(object):
             thread = {
                 'permalink' : permalink,
-                'created_time' : created_time,
-                'modified_time' : modified_time,
                 'subject' : subject,
                 'subgroup' : subgroup_name,
                 'original_sender' : original_sender,
                 'content' : full_content
             }
@@ -353,13 +324,11 @@ class GroupsIOArchivesCrawler(object):
         results = []
         for row in rows:
-            # This is where we extract
-            # a list of thread titles
-            # and corresponding links.
+            # We don't care about anything except title and ugly link
             subject = row.find('span',{'class':'subject'})
             title = subject.get_text()
             link = row.find('a')['href']
-            #print(title)
             results.append((title,link))
         return results
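
Note: each listing row yields just a title and a link. A self-contained BeautifulSoup sketch of that extraction (the HTML fragment is made up):

    from bs4 import BeautifulSoup

    html = ('<tr><td><span class="subject">Weekly update</span>'
            '<a href="/g/example/topic/123">view</a></td></tr>')
    row = BeautifulSoup(html, 'html.parser')

    title = row.find('span', {'class': 'subject'}).get_text()
    link = row.find('a')['href']
    print((title, link))  # ('Weekly update', '/g/example/topic/123')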

View File

@@ -10,12 +10,15 @@
 <form action="{{ url_for('search') }}" name="search">
     <p><input type="text" name="query" value="{{ query }}">
     </p>
-    <p><button id="the-big-one" type="submit" style="font-size: 20px; padding: 10px; padding-left: 50px; padding-right: 50px;"
+    <p><button id="the-big-one" type="submit"
+               style="font-size: 20px; padding: 10px; padding-left: 50px; padding-right: 50px;"
        value="search" class="btn btn-primary">Search</button>
     </p>
+    <p><a href="#" onClick="advanced_search()">[Advanced Search]</a>
 {% if parsed_query %}
-    <p><a href="{{ url_for('search')}}?query=&fields=">[clear all results]</a>
+    <p><a href="{{ url_for('search')}}?query=&fields=">[Clear All Results]</a>
 {% endif %}
     </p>