2 Commits

SHA1 Message Date
4b90057664 add advanced search button to search template 2018-08-21 10:04:48 -07:00
90f49e7626 starting point: merge 'feedback-floater' branch into 'advanced-search' branch
* feedback-floater:
  add dismissable "thanks for your feedback" message to top
  improve message formatting
  add dummy function as placeholder for where we add info messages
  return better messages
  add successful post call and export to JSON db
  update todo
  move modal into its own .html file
  update todo with tasks
  fix button and smiley styles
  add /feedback post route
  feedback button successfully triggers a modal
  add page self-identifiers. add "send feedback" button. fix layouts.
2018-08-20 21:04:54 -07:00
6 changed files with 184 additions and 311 deletions

.gitignore vendored
View File

@@ -1,4 +1,5 @@
 feedback_database.json
+config_centillion.py
 config_flask.py
 vp
 credentials.json

View File

@@ -342,10 +342,5 @@ def store_search(query, fields):
 if __name__ == '__main__':
     # if running local instance, set to true
     os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = 'true'
-    port = os.environ.get('CENTILLION_PORT','')
-    if port=='':
-        port = 5000
-    else:
-        port = int(port)
-    app.run(host="0.0.0.0",port=port)
+    app.run(host="0.0.0.0",port=5000)
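
This hunk drops the CENTILLION_PORT environment override and hardcodes port 5000. For reference, a minimal sketch of the pattern the removed lines implemented, condensed to one line; the Flask import is an assumption, since this view does not name the application module:

    import os

    from flask import Flask  # assumed: the `app` in this diff behaves like a Flask app

    app = Flask(__name__)

    if __name__ == '__main__':
        # Same behavior as the removed if/else: fall back to 5000 when
        # CENTILLION_PORT is unset; int() rejects non-numeric values.
        port = int(os.environ.get('CENTILLION_PORT', 5000))
        app.run(host="0.0.0.0", port=port)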

View File

@@ -21,8 +21,6 @@ import dateutil.parser
 from whoosh.qparser import MultifieldParser, QueryParser
 from whoosh.analysis import StemmingAnalyzer
-from whoosh.qparser.dateparse import DateParserPlugin
-from whoosh import fields, index

 """
@@ -182,38 +180,30 @@ class Search:
         # is defined.
         schema = Schema(
-                id = fields.ID(stored=True, unique=True),
-                kind = fields.ID(stored=True),
-                created_time = fields.DATETIME(stored=True),
-                modified_time = fields.DATETIME(stored=True),
-                indexed_time = fields.DATETIME(stored=True),
-                title = fields.TEXT(stored=True, field_boost=100.0),
-                url = fields.ID(stored=True),
-                mimetype = fields.TEXT(stored=True),
-                owner_email = fields.ID(stored=True),
-                owner_name = fields.TEXT(stored=True),
-                # mainly for email threads, groups.io, hypothesis
-                group = fields.ID(stored=True),
-                repo_name = fields.TEXT(stored=True),
-                repo_url = fields.ID(stored=True),
-                github_user = fields.TEXT(stored=True),
-                tags = fields.KEYWORD(commas=True,
-                                      stored=True,
-                                      lowercase=True),
+                id = ID(stored=True, unique=True),
+                kind = ID(stored=True),
+                created_time = ID(stored=True),
+                modified_time = ID(stored=True),
+                indexed_time = ID(stored=True),
+                title = TEXT(stored=True, field_boost=100.0),
+                url = ID(stored=True, unique=True),
+                mimetype=ID(stored=True),
+                owner_email=ID(stored=True),
+                owner_name=TEXT(stored=True),
+                repo_name=TEXT(stored=True),
+                repo_url=ID(stored=True),
+                github_user=TEXT(stored=True),
                 # comments only
-                issue_title = fields.TEXT(stored=True, field_boost=100.0),
-                issue_url = fields.ID(stored=True),
-                content = fields.TEXT(stored=True, analyzer=stemming_analyzer)
+                issue_title=TEXT(stored=True, field_boost=100.0),
+                issue_url=ID(stored=True),
+                content=TEXT(stored=True, analyzer=stemming_analyzer)
         )
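
The schema rewrite swaps Whoosh DATETIME fields for plain ID (string) fields and drops the group and tags fields. The practical trade-off: DATETIME fields index real datetime objects and support date-range queries, while ID fields store a single exact term. A minimal sketch of the two field types, assuming the standard whoosh API and a hypothetical indexdir directory:

    import os
    from datetime import datetime

    from whoosh import index
    from whoosh.fields import Schema, ID, TEXT, DATETIME

    # DATETIME accepts datetime objects and enables range queries;
    # ID stores one exact, unanalyzed string term.
    schema = Schema(id=ID(stored=True, unique=True),
                    created_time=DATETIME(stored=True),
                    title=TEXT(stored=True))

    os.makedirs("indexdir", exist_ok=True)
    ix = index.create_in("indexdir", schema)
    writer = ix.writer()
    writer.add_document(id=u"doc1",
                        created_time=datetime(2018, 8, 20),
                        title=u"example document")
    writer.commit()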
@@ -253,32 +243,24 @@ class Search:
             writer.delete_by_term('id',item['id'])

             # Index a plain google drive file
-            created_time = dateutil.parser.parse(item['createdTime'])
-            modified_time = dateutil.parser.parse(item['modifiedTime'])
-            indexed_time = datetime.now().replace(microsecond=0)
-            try:
-                writer.add_document(
-                        id = item['id'],
-                        kind = 'gdoc',
-                        created_time = created_time,
-                        modified_time = modified_time,
-                        indexed_time = indexed_time,
-                        title = item['name'],
-                        url = item['webViewLink'],
-                        mimetype = mimetype,
-                        owner_email = item['owners'][0]['emailAddress'],
-                        owner_name = item['owners'][0]['displayName'],
-                        group='',
-                        repo_name='',
-                        repo_url='',
-                        github_user='',
-                        issue_title='',
-                        issue_url='',
-                        content = content
-                )
-            except ValueError as e:
-                print(repr(e))
-                print(" > XXXXXX Failed to index Google Drive file \"%s\""%(item['name']))
+            writer.add_document(
+                    id = item['id'],
+                    kind = 'gdoc',
+                    created_time = item['createdTime'],
+                    modified_time = item['modifiedTime'],
+                    indexed_time = datetime.now().replace(microsecond=0).isoformat(),
+                    title = item['name'],
+                    url = item['webViewLink'],
+                    mimetype = mimetype,
+                    owner_email = item['owners'][0]['emailAddress'],
+                    owner_name = item['owners'][0]['displayName'],
+                    repo_name='',
+                    repo_url='',
+                    github_user='',
+                    issue_title='',
+                    issue_url='',
+                    content = content
+            )
         else:
@@ -332,7 +314,7 @@ class Search:
             )
             assert output == ""
         except RuntimeError:
-            print(" > XXXXXX Failed to index Google Drive document \"%s\""%(item['name']))
+            print(" > XXXXXX Failed to index document \"%s\""%(item['name']))

         # If export was successful, read contents of markdown
@@ -360,33 +342,24 @@ class Search:
         else:
             print(" > Creating a new record")
-            try:
-                created_time = dateutil.parser.parse(item['createdTime'])
-                modified_time = dateutil.parser.parse(item['modifiedTime'])
-                indexed_time = datetime.now()
-                writer.add_document(
-                        id = item['id'],
-                        kind = 'gdoc',
-                        created_time = created_time,
-                        modified_time = modified_time,
-                        indexed_time = indexed_time,
-                        title = item['name'],
-                        url = item['webViewLink'],
-                        mimetype = mimetype,
-                        owner_email = item['owners'][0]['emailAddress'],
-                        owner_name = item['owners'][0]['displayName'],
-                        group='',
-                        repo_name='',
-                        repo_url='',
-                        github_user='',
-                        issue_title='',
-                        issue_url='',
-                        content = content
-                )
-            except ValueError as e:
-                print(repr(e))
-                print(" > XXXXXX Failed to index Google Drive file \"%s\""%(item['name']))
+            writer.add_document(
+                    id = item['id'],
+                    kind = 'gdoc',
+                    created_time = item['createdTime'],
+                    modified_time = item['modifiedTime'],
+                    indexed_time = datetime.now().replace(microsecond=0).isoformat(),
+                    title = item['name'],
+                    url = item['webViewLink'],
+                    mimetype = mimetype,
+                    owner_email = item['owners'][0]['emailAddress'],
+                    owner_name = item['owners'][0]['displayName'],
+                    repo_name='',
+                    repo_url='',
+                    github_user='',
+                    issue_title='',
+                    issue_url='',
+                    content = content
+            )
@@ -420,36 +393,31 @@ class Search:
                 issue_comment_content += comment.body.rstrip()
                 issue_comment_content += "\n"

-            # Now create the actual search index record.
+            # Now create the actual search index record
+            created_time = clean_timestamp(issue.created_at)
+            modified_time = clean_timestamp(issue.updated_at)
+            indexed_time = clean_timestamp(datetime.now())

             # Add one document per issue thread,
             # containing entire text of thread.
-            created_time = issue.created_at
-            modified_time = issue.updated_at
-            indexed_time = datetime.now()
-            try:
-                writer.add_document(
-                        id = issue.html_url,
-                        kind = 'issue',
-                        created_time = created_time,
-                        modified_time = modified_time,
-                        indexed_time = indexed_time,
-                        title = issue.title,
-                        url = issue.html_url,
-                        mimetype='',
-                        owner_email='',
-                        owner_name='',
-                        group='',
-                        repo_name = repo_name,
-                        repo_url = repo_url,
-                        github_user = issue.user.login,
-                        issue_title = issue.title,
-                        issue_url = issue.html_url,
-                        content = issue_comment_content
-                )
-            except ValueError as e:
-                print(repr(e))
-                print(" > XXXXXX Failed to index Github issue \"%s\""%(issue.title))
+            writer.add_document(
+                    id = issue.html_url,
+                    kind = 'issue',
+                    created_time = created_time,
+                    modified_time = modified_time,
+                    indexed_time = indexed_time,
+                    title = issue.title,
+                    url = issue.html_url,
+                    mimetype='',
+                    owner_email='',
+                    owner_name='',
+                    repo_name = repo_name,
+                    repo_url = repo_url,
+                    github_user = issue.user.login,
+                    issue_title = issue.title,
+                    issue_url = issue.html_url,
+                    content = issue_comment_content
+            )
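
The issue-indexing path now routes every timestamp through clean_timestamp(), which is referenced but not defined anywhere in this diff. Assuming it mirrors the .isoformat() calls in the gdoc hunks above, a hypothetical reconstruction:

    from datetime import datetime

    def clean_timestamp(dt):
        # Hypothetical: serialize a datetime into a string a Whoosh ID
        # field can store, dropping sub-second noise.
        return dt.replace(microsecond=0).isoformat()

    print(clean_timestamp(datetime(2018, 8, 20, 21, 4, 54)))  # 2018-08-20T21:04:54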
@@ -479,8 +447,7 @@ class Search:
             print(" > XXXXXXXX Failed to find file info.")
             return

-        indexed_time = datetime.now()
+        indexed_time = clean_timestamp(datetime.now())

         if fext in MARKDOWN_EXTS:
             print("Indexing markdown doc %s from repo %s"%(fname,repo_name))
@@ -509,31 +476,24 @@ class Search:
             usable_url = "https://github.com/%s/blob/master/%s"%(repo_name, fpath)

             # Now create the actual search index record
-            try:
-                writer.add_document(
-                        id = fsha,
-                        kind = 'markdown',
-                        created_time = None,
-                        modified_time = None,
-                        indexed_time = indexed_time,
-                        title = fname,
-                        url = usable_url,
-                        mimetype='',
-                        owner_email='',
-                        owner_name='',
-                        group='',
-                        repo_name = repo_name,
-                        repo_url = repo_url,
-                        github_user = '',
-                        issue_title = '',
-                        issue_url = '',
-                        content = content
-                )
-            except ValueError as e:
-                print(repr(e))
-                print(" > XXXXXX Failed to index Github markdown file \"%s\""%(fname))
+            writer.add_document(
+                    id = fsha,
+                    kind = 'markdown',
+                    created_time = '',
+                    modified_time = '',
+                    indexed_time = indexed_time,
+                    title = fname,
+                    url = usable_url,
+                    mimetype='',
+                    owner_email='',
+                    owner_name='',
+                    repo_name = repo_name,
+                    repo_url = repo_url,
+                    github_user = '',
+                    issue_title = '',
+                    issue_url = '',
+                    content = content
+            )

         else:
             print("Indexing github file %s from repo %s"%(fname,repo_name))
@@ -541,29 +501,24 @@ class Search:
             key = fname+"_"+fsha

             # Now create the actual search index record
-            try:
-                writer.add_document(
-                        id = key,
-                        kind = 'ghfile',
-                        created_time = None,
-                        modified_time = None,
-                        indexed_time = indexed_time,
-                        title = fname,
-                        url = repo_url,
-                        mimetype='',
-                        owner_email='',
-                        owner_name='',
-                        group='',
-                        repo_name = repo_name,
-                        repo_url = repo_url,
-                        github_user = '',
-                        issue_title = '',
-                        issue_url = '',
-                        content = ''
-                )
-            except ValueError as e:
-                print(repr(e))
-                print(" > XXXXXX Failed to index Github file \"%s\""%(fname))
+            writer.add_document(
+                    id = key,
+                    kind = 'ghfile',
+                    created_time = '',
+                    modified_time = '',
+                    indexed_time = indexed_time,
+                    title = fname,
+                    url = repo_url,
+                    mimetype='',
+                    owner_email='',
+                    owner_name='',
+                    repo_name = repo_name,
+                    repo_url = repo_url,
+                    github_user = '',
+                    issue_title = '',
+                    issue_url = '',
+                    content = ''
+            )
@@ -577,42 +532,28 @@ class Search:
         Use a Github file API record to add a filename
         to the search index.
         """
-        if 'created_time' in d.keys() and d['created_time'] is not None:
-            created_time = d['created_time']
-        else:
-            created_time = None
-        if 'modified_time' in d.keys() and d['modified_time'] is not None:
-            modified_time = d['modified_time']
-        else:
-            modified_time = None
-        indexed_time = datetime.now()
+        indexed_time = clean_timestamp(datetime.now())

         # Now create the actual search index record
-        try:
-            writer.add_document(
-                    id = d['permalink'],
-                    kind = 'emailthread',
-                    created_time = created_time,
-                    modified_time = modified_time,
-                    indexed_time = indexed_time,
-                    title = d['subject'],
-                    url = d['permalink'],
-                    mimetype='',
-                    owner_email='',
-                    owner_name=d['original_sender'],
-                    group=d['subgroup'],
-                    repo_name = '',
-                    repo_url = '',
-                    github_user = '',
-                    issue_title = '',
-                    issue_url = '',
-                    content = d['content']
-            )
-        except ValueError as e:
-            print(repr(e))
-            print(" > XXXXXX Failed to index Groups.io thread \"%s\""%(d['subject']))
+        writer.add_document(
+                id = d['permalink'],
+                kind = 'emailthread',
+                created_time = '',
+                modified_time = '',
+                indexed_time = indexed_time,
+                title = d['subject'],
+                url = d['permalink'],
+                mimetype='',
+                owner_email='',
+                owner_name=d['original_sender'],
+                repo_name = '',
+                repo_url = '',
+                github_user = '',
+                issue_title = '',
+                issue_url = '',
+                content = d['content']
+        )
@@ -690,10 +631,10 @@ class Search:
                 full_items[f['id']] = f

             ## Shorter:
-            break
-            ## Longer:
-            #if nextPageToken is None:
-            #    break
+            #break
+            # Longer:
+            if nextPageToken is None:
+                break

         writer = self.ix.writer()
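
This hunk re-enables the full pagination loop (the "Longer" path): keep fetching pages until nextPageToken comes back empty, instead of breaking after the first page. A sketch of the usual Google Drive v3 loop, assuming the google-api-python-client files().list call the surrounding code appears to use:

    def list_all_drive_files(service):
        """Page through Drive results, keyed by file id."""
        full_items = {}
        nextPageToken = None
        while True:
            response = service.files().list(
                    pageSize=100,
                    pageToken=nextPageToken,
                    fields="nextPageToken, files(id, name, mimeType)"
            ).execute()
            for f in response.get('files', []):
                full_items[f['id']] = f
            nextPageToken = response.get('nextPageToken')
            if nextPageToken is None:
                break  # final page reached
        return full_items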
@@ -701,41 +642,34 @@ class Search:
         temp_dir = tempfile.mkdtemp(dir=os.getcwd())
         print("Temporary directory: %s"%(temp_dir))

-        try:
-            # Drop any id in indexed_ids
-            # not in remote_ids
-            drop_ids = indexed_ids - remote_ids
-            for drop_id in drop_ids:
-                writer.delete_by_term('id',drop_id)
-
-            # Update any id in indexed_ids
-            # and in remote_ids
-            update_ids = indexed_ids & remote_ids
-            for update_id in update_ids:
-                # cop out
-                writer.delete_by_term('id',update_id)
-                item = full_items[update_id]
-                self.add_drive_file(writer, item, temp_dir, config, update=True)
-                count += 1
-
-            # Add any id not in indexed_ids
-            # and in remote_ids
-            add_ids = remote_ids - indexed_ids
-            for add_id in add_ids:
-                item = full_items[add_id]
-                self.add_drive_file(writer, item, temp_dir, config, update=False)
-                count += 1
-        except Exception as e:
-            print("ERROR: While adding Google Drive files to search index")
-            print("-"*40)
-            print(repr(e))
-            print("-"*40)
-            print("Continuing...")
-            pass
+        # Drop any id in indexed_ids
+        # not in remote_ids
+        drop_ids = indexed_ids - remote_ids
+        for drop_id in drop_ids:
+            writer.delete_by_term('id',drop_id)
+
+        # Update any id in indexed_ids
+        # and in remote_ids
+        update_ids = indexed_ids & remote_ids
+        for update_id in update_ids:
+            # cop out
+            writer.delete_by_term('id',update_id)
+            item = full_items[update_id]
+            self.add_drive_file(writer, item, temp_dir, config, update=True)
+            count += 1
+
+        # Add any id not in indexed_ids
+        # and in remote_ids
+        add_ids = remote_ids - indexed_ids
+        for add_id in add_ids:
+            item = full_items[add_id]
+            self.add_drive_file(writer, item, temp_dir, config, update=False)
+            count += 1

         print("Cleaning temporary directory: %s"%(temp_dir))
         subprocess.call(['rm','-fr',temp_dir])
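
Dropping the try/except leaves the core sync logic exposed: three disjoint id sets computed with set arithmetic, where "update" is implemented as delete-then-re-add (the "cop out" the comment refers to). The same partition in isolation, with made-up ids:

    indexed_ids = {'a', 'b', 'c'}   # ids already in the search index
    remote_ids  = {'b', 'c', 'd'}   # ids currently on Google Drive

    drop_ids   = indexed_ids - remote_ids   # stale: delete from index
    update_ids = indexed_ids & remote_ids   # in both: delete, then re-add
    add_ids    = remote_ids - indexed_ids   # new: add to index

    assert drop_ids == {'a'}
    assert update_ids == {'b', 'c'}
    assert add_ids == {'d'}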
@@ -1140,7 +1074,7 @@ class Search:
         elif doctype=='issue':
             item_keys = ['title','repo_name','repo_url','url','created_time','modified_time']
         elif doctype=='emailthread':
-            item_keys = ['title','owner_name','url','created_time','modified_time']
+            item_keys = ['title','owner_name','url']
         elif doctype=='ghfile':
             item_keys = ['title','repo_name','repo_url','url']
         elif doctype=='markdown':
@@ -1157,7 +1091,11 @@ class Search:
         for r in results:
             d = {}
             for k in item_keys:
-                d[k] = r[k]
+                if k=='created_time' or k=='modified_time':
+                    #d[k] = r[k]
+                    d[k] = dateutil.parser.parse(r[k]).strftime("%Y-%m-%d")
+                else:
+                    d[k] = r[k]
             json_results.append(d)
         return json_results
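
Since timestamps are now stored as strings, the results path re-parses them for display. Assuming the stored values are the ISO-8601 strings the indexing hunks above produce, dateutil handles them directly:

    import dateutil.parser

    stored = "2018-08-20T21:04:54"  # hypothetical stored value
    print(dateutil.parser.parse(stored).strftime("%Y-%m-%d"))  # 2018-08-20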
@@ -1170,9 +1108,7 @@ class Search:
         query_string = " ".join(query_list)
         query = None
         if ":" in query_string:
-            query = QueryParser("content", self.schema)
-            query.add_plugin(DateParserPlugin(free=True))
-            query = query.parse(query_string)
+            query = QueryParser("content", self.schema).parse(query_string)
         elif len(fields) == 1 and fields[0] == "filename":
             pass
         elif len(fields) == 2:
@@ -1180,12 +1116,9 @@ class Search:
         else:
             # If the user does not specify a field,
             # these are the fields that are actually searched
-            fields = ['title', 'content','owner_name','owner_email','url','created_date','modified_date']
+            fields = ['title', 'content','owner_name','owner_email','url']
         if not query:
-            query = MultifieldParser(fields, schema=self.ix.schema)
-            query.add_plugin(DateParserPlugin(free=True))
-            query = query.parse(query_string)
-            #query = MultifieldParser(fields, schema=self.ix.schema).parse(query_string)
+            query = MultifieldParser(fields, schema=self.ix.schema).parse(query_string)
         parsed_query = "%s" % query
         print("query: %s" % parsed_query)
         results = searcher.search(query, terms=False, scored=True, groupedby="kind")
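
With DateParserPlugin gone, both branches collapse to a single parse call. A minimal sketch of the two whoosh parsers involved, assuming the standard whoosh.qparser API and a toy schema:

    from whoosh.fields import Schema, ID, TEXT
    from whoosh.qparser import MultifieldParser, QueryParser

    schema = Schema(id=ID(stored=True),
                    title=TEXT(stored=True),
                    content=TEXT(stored=True))

    # Fielded strings like "title:centillion" parse against a default field:
    q1 = QueryParser("content", schema).parse(u"title:centillion")

    # Unfielded strings search several fields at once:
    q2 = MultifieldParser(["title", "content"], schema=schema).parse(u"advanced search")

    print(q1)  # title:centillion
    print(q2)  # each term expands to (title:term OR content:term)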

View File

@@ -1,28 +0,0 @@
-config = {
-    "repositories" : [
-        "dcppc/project-management",
-        "dcppc/nih-demo-meetings",
-        "dcppc/internal",
-        "dcppc/organize",
-        "dcppc/dcppc-bot",
-        "dcppc/full-stacks",
-        "dcppc/design-guidelines-discuss",
-        "dcppc/dcppc-deliverables",
-        "dcppc/dcppc-milestones",
-        "dcppc/crosscut-metadata",
-        "dcppc/lucky-penny",
-        "dcppc/dcppc-workshops",
-        "dcppc/metadata-matrix",
-        "dcppc/data-stewards",
-        "dcppc/dcppc-phase1-demos",
-        "dcppc/apis",
-        "dcppc/2018-june-workshop",
-        "dcppc/2018-july-workshop",
-        "dcppc/2018-august-workshop",
-        "dcppc/2018-september-workshop",
-        "dcppc/design-guidelines",
-        "dcppc/2018-may-workshop",
-        "dcppc/centillion"
-    ]
-}

View File

@@ -1,7 +1,5 @@
 import requests, os, re
 from bs4 import BeautifulSoup
-import dateutil.parser
-import datetime

 class GroupsIOException(Exception):
     pass
@@ -66,7 +64,7 @@ class GroupsIOArchivesCrawler(object):
             ## Short circuit
             ## for debugging purposes
-            break
+            #break

         return subgroups
@@ -253,7 +251,7 @@ class GroupsIOArchivesCrawler(object):
         subject = soup.find('title').text

         # Extract information for the schema:
-        # - permalink for thread (done above)
+        # - permalink for thread (done)
         # - subject/title (done)
         # - original sender email/name (done)
         # - content (done)
@@ -268,35 +266,11 @@ class GroupsIOArchivesCrawler(object):
                 pass
             else:
                 # found an email!
-                # this is a maze, not amazing.
-                # thanks groups.io!
+                # this is a maze, thanks groups.io
                 td = tr.find('td')

-                sender_divrow = td.find('div',{'class':'row'})
-                sender_divrow = sender_divrow.find('div',{'class':'pull-left'})
+                divrow = td.find('div',{'class':'row'}).find('div',{'class':'pull-left'})
                 if (i+1)==1:
-                    original_sender = sender_divrow.text.strip()
-
-                date_divrow = td.find('div',{'class':'row'})
-                date_divrow = date_divrow.find('div',{'class':'pull-right'})
-                date_divrow = date_divrow.find('font',{'class':'text-muted'})
-                date_divrow = date_divrow.find('script').text
-                try:
-                    time_seconds = re.search(' [0-9]{1,} ',date_divrow).group(0)
-                    time_seconds = time_seconds.strip()
-                    # Thanks groups.io for the weird date formatting
-                    time_seconds = time_seconds[:10]
-                    mmicro_seconds = time_seconds[10:]
-                    if (i+1)==1:
-                        created_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
-                        modified_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
-                    else:
-                        modified_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
-                except AttributeError:
-                    created_time = None
-                    modified_time = None
+                    original_sender = divrow.text.strip()

                 for div in td.find_all('div'):
                     if div.has_attr('id'):
@@ -325,10 +299,7 @@ class GroupsIOArchivesCrawler(object):
                 thread = {
                     'permalink' : permalink,
-                    'created_time' : created_time,
-                    'modified_time' : modified_time,
                     'subject' : subject,
-                    'subgroup' : subgroup_name,
                     'original_sender' : original_sender,
                     'content' : full_content
                 }
@@ -353,13 +324,11 @@ class GroupsIOArchivesCrawler(object):
         results = []
         for row in rows:
-            # This is where we extract
-            # a list of thread titles
-            # and corresponding links.
+            # We don't care about anything except title and ugly link
             subject = row.find('span',{'class':'subject'})
             title = subject.get_text()
             link = row.find('a')['href']
-            #print(title)
             results.append((title,link))

         return results
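
The simplified listing pass keeps only the thread title and permalink from each table row. A self-contained sketch of that extraction, using the same BeautifulSoup selectors on a made-up snippet of groups.io markup:

    from bs4 import BeautifulSoup

    html = """
    <table>
      <tr><td><span class="subject">Example thread</span>
              <a href="/g/example/topic/1">view</a></td></tr>
    </table>
    """

    soup = BeautifulSoup(html, 'html.parser')
    results = []
    for row in soup.find_all('tr'):
        subject = row.find('span', {'class': 'subject'})
        title = subject.get_text()
        link = row.find('a')['href']
        results.append((title, link))

    print(results)  # [('Example thread', '/g/example/topic/1')]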

View File

@@ -10,12 +10,15 @@
 <form action="{{ url_for('search') }}" name="search">
     <p><input type="text" name="query" value="{{ query }}">
     </p>
-    <p><button id="the-big-one" type="submit" style="font-size: 20px; padding: 10px; padding-left: 50px; padding-right: 50px;"
+    <p><button id="the-big-one" type="submit"
+       style="font-size: 20px; padding: 10px; padding-left: 50px; padding-right: 50px;"
        value="search" class="btn btn-primary">Search</button>
     </p>
+    <p><a href="#" onClick="advanced_search()">[Advanced Search]</a>
     {% if parsed_query %}
-    <p><a href="{{ url_for('search')}}?query=&fields=">[clear all results]</a>
+    <p><a href="{{ url_for('search')}}?query=&fields=">[Clear All Results]</a>
     {% endif %}
     </p>