Compare commits

5 commits: testing ... use-dateti

Commits (SHA1):
- ab76226b0c
- a4ebef6e6f
- bad50efa9b
- 629fc063db
- 3b0baa21de
@@ -21,6 +21,8 @@ import dateutil.parser

 from whoosh.qparser import MultifieldParser, QueryParser
 from whoosh.analysis import StemmingAnalyzer
+from whoosh.qparser.dateparse import DateParserPlugin
+from whoosh import fields, index


 """
@@ -180,30 +182,38 @@ class Search:
         # is defined.

         schema = Schema(
-                id = ID(stored=True, unique=True),
-                kind = ID(stored=True),
-
-                created_time = ID(stored=True),
-                modified_time = ID(stored=True),
-                indexed_time = ID(stored=True),
-
-                title = TEXT(stored=True, field_boost=100.0),
-                url = ID(stored=True, unique=True),
-
-                mimetype=ID(stored=True),
-                owner_email=ID(stored=True),
-                owner_name=TEXT(stored=True),
-
-                repo_name=TEXT(stored=True),
-                repo_url=ID(stored=True),
-
-                github_user=TEXT(stored=True),
+                id = fields.ID(stored=True, unique=True),
+                kind = fields.ID(stored=True),
+
+                created_time = fields.DATETIME(stored=True),
+                modified_time = fields.DATETIME(stored=True),
+                indexed_time = fields.DATETIME(stored=True),
+
+                title = fields.TEXT(stored=True, field_boost=100.0),
+                url = fields.ID(stored=True),
+
+                mimetype = fields.TEXT(stored=True),
+                owner_email = fields.ID(stored=True),
+                owner_name = fields.TEXT(stored=True),
+
+                # mainly for email threads, groups.io, hypothesis
+                group = fields.ID(stored=True),
+
+                repo_name = fields.TEXT(stored=True),
+                repo_url = fields.ID(stored=True),
+                github_user = fields.TEXT(stored=True),
+
+                tags = fields.KEYWORD(commas=True,
+                                      stored=True,
+                                      lowercase=True),

                 # comments only
-                issue_title=TEXT(stored=True, field_boost=100.0),
-                issue_url=ID(stored=True),
+                issue_title = fields.TEXT(stored=True, field_boost=100.0),
+                issue_url = fields.ID(stored=True),

-                content=TEXT(stored=True, analyzer=stemming_analyzer)
+                content = fields.TEXT(stored=True, analyzer=stemming_analyzer)
         )

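The schema rewrite above is the core of this branch: the timestamp columns move from `ID` (opaque strings) to `fields.DATETIME`, which indexes actual `datetime` objects and therefore supports date-range queries. A minimal, self-contained sketch of that behavior follows; it is not centillion code, and the field set is trimmed down for illustration.

```
# Minimal sketch (not the project's code): Whoosh DATETIME fields take
# Python datetime objects and support range queries.
import datetime, tempfile

from whoosh import fields, index
from whoosh.query import DateRange

schema = fields.Schema(
    id=fields.ID(stored=True, unique=True),
    created_time=fields.DATETIME(stored=True),
    content=fields.TEXT(stored=True),
)

ix = index.create_in(tempfile.mkdtemp(), schema)
writer = ix.writer()
writer.add_document(id=u"doc1",
                    created_time=datetime.datetime(2018, 8, 1, 12, 0, 0),
                    content=u"hello centillion")
writer.commit()

with ix.searcher() as searcher:
    q = DateRange("created_time",
                  datetime.datetime(2018, 7, 1),
                  datetime.datetime(2018, 9, 1))
    print([hit["id"] for hit in searcher.search(q)])   # ['doc1']
```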
@@ -243,24 +253,32 @@ class Search:
             writer.delete_by_term('id',item['id'])

             # Index a plain google drive file
-            writer.add_document(
-                    id = item['id'],
-                    kind = 'gdoc',
-                    created_time = item['createdTime'],
-                    modified_time = item['modifiedTime'],
-                    indexed_time = datetime.now().replace(microsecond=0).isoformat(),
-                    title = item['name'],
-                    url = item['webViewLink'],
-                    mimetype = mimetype,
-                    owner_email = item['owners'][0]['emailAddress'],
-                    owner_name = item['owners'][0]['displayName'],
-                    repo_name='',
-                    repo_url='',
-                    github_user='',
-                    issue_title='',
-                    issue_url='',
-                    content = content
-            )
+            created_time = dateutil.parser.parse(item['createdTime'])
+            modified_time = dateutil.parser.parse(item['modifiedTime'])
+            indexed_time = datetime.now().replace(microsecond=0)
+            try:
+                writer.add_document(
+                        id = item['id'],
+                        kind = 'gdoc',
+                        created_time = created_time,
+                        modified_time = modified_time,
+                        indexed_time = indexed_time,
+                        title = item['name'],
+                        url = item['webViewLink'],
+                        mimetype = mimetype,
+                        owner_email = item['owners'][0]['emailAddress'],
+                        owner_name = item['owners'][0]['displayName'],
+                        group='',
+                        repo_name='',
+                        repo_url='',
+                        github_user='',
+                        issue_title='',
+                        issue_url='',
+                        content = content
+                )
+            except ValueError as e:
+                print(repr(e))
+                print(" > XXXXXX Failed to index Google Drive file \"%s\""%(item['name']))

         else:
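The Google Drive records report `createdTime`/`modifiedTime` as RFC 3339 strings, so the new code converts them with `dateutil.parser.parse` before handing them to the `DATETIME` fields, and wraps `add_document` in `try`/`except` because a malformed value raises `ValueError`. A small sketch of just that conversion step, using a made-up record:

```
# Sketch of the conversion step, assuming a Drive-style RFC 3339 timestamp.
# dateutil turns the string into a datetime that a DATETIME field accepts;
# a malformed string raises ValueError, which the new try/except absorbs.
import dateutil.parser

item = {"createdTime": "2018-08-01T12:34:56.000Z"}   # illustrative record

try:
    created_time = dateutil.parser.parse(item["createdTime"])
    print(created_time)        # 2018-08-01 12:34:56+00:00
except ValueError as e:
    print("could not parse timestamp: %r" % e)
```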
@@ -314,7 +332,7 @@ class Search:
                 )
                 assert output == ""
             except RuntimeError:
-                print(" > XXXXXX Failed to index document \"%s\""%(item['name']))
+                print(" > XXXXXX Failed to index Google Drive document \"%s\""%(item['name']))


             # If export was successful, read contents of markdown
@@ -342,24 +360,33 @@ class Search:
             else:
                 print(" > Creating a new record")

-                writer.add_document(
-                        id = item['id'],
-                        kind = 'gdoc',
-                        created_time = item['createdTime'],
-                        modified_time = item['modifiedTime'],
-                        indexed_time = datetime.now().replace(microsecond=0).isoformat(),
-                        title = item['name'],
-                        url = item['webViewLink'],
-                        mimetype = mimetype,
-                        owner_email = item['owners'][0]['emailAddress'],
-                        owner_name = item['owners'][0]['displayName'],
-                        repo_name='',
-                        repo_url='',
-                        github_user='',
-                        issue_title='',
-                        issue_url='',
-                        content = content
-                )
+                try:
+                    created_time = dateutil.parser.parse(item['createdTime'])
+                    modified_time = dateutil.parser.parse(item['modifiedTime'])
+                    indexed_time = datetime.now()
+                    writer.add_document(
+                            id = item['id'],
+                            kind = 'gdoc',
+                            created_time = created_time,
+                            modified_time = modified_time,
+                            indexed_time = indexed_time,
+                            title = item['name'],
+                            url = item['webViewLink'],
+                            mimetype = mimetype,
+                            owner_email = item['owners'][0]['emailAddress'],
+                            owner_name = item['owners'][0]['displayName'],
+                            group='',
+                            repo_name='',
+                            repo_url='',
+                            github_user='',
+                            issue_title='',
+                            issue_url='',
+                            content = content
+                    )
+                except ValueError as e:
+                    print(repr(e))
+                    print(" > XXXXXX Failed to index Google Drive file \"%s\""%(item['name']))

@@ -393,31 +420,36 @@ class Search:
                     issue_comment_content += comment.body.rstrip()
                     issue_comment_content += "\n"

-                # Now create the actual search index record
-                created_time = clean_timestamp(issue.created_at)
-                modified_time = clean_timestamp(issue.updated_at)
-                indexed_time = clean_timestamp(datetime.now())
+                # Now create the actual search index record.

                 # Add one document per issue thread,
                 # containing entire text of thread.
-                writer.add_document(
-                        id = issue.html_url,
-                        kind = 'issue',
-                        created_time = created_time,
-                        modified_time = modified_time,
-                        indexed_time = indexed_time,
-                        title = issue.title,
-                        url = issue.html_url,
-                        mimetype='',
-                        owner_email='',
-                        owner_name='',
-                        repo_name = repo_name,
-                        repo_url = repo_url,
-                        github_user = issue.user.login,
-                        issue_title = issue.title,
-                        issue_url = issue.html_url,
-                        content = issue_comment_content
-                )
+                created_time = issue.created_at
+                modified_time = issue.updated_at
+                indexed_time = datetime.now()
+                try:
+                    writer.add_document(
+                            id = issue.html_url,
+                            kind = 'issue',
+                            created_time = created_time,
+                            modified_time = modified_time,
+                            indexed_time = indexed_time,
+                            title = issue.title,
+                            url = issue.html_url,
+                            mimetype='',
+                            owner_email='',
+                            owner_name='',
+                            group='',
+                            repo_name = repo_name,
+                            repo_url = repo_url,
+                            github_user = issue.user.login,
+                            issue_title = issue.title,
+                            issue_url = issue.html_url,
+                            content = issue_comment_content
+                    )
+                except ValueError as e:
+                    print(repr(e))
+                    print(" > XXXXXX Failed to index Github issue \"%s\""%(issue.title))

@@ -447,7 +479,8 @@ class Search:
             print(" > XXXXXXXX Failed to find file info.")
             return

-        indexed_time = clean_timestamp(datetime.now())
+
+        indexed_time = datetime.now()

         if fext in MARKDOWN_EXTS:
             print("Indexing markdown doc %s from repo %s"%(fname,repo_name))
@@ -476,24 +509,31 @@ class Search:
             usable_url = "https://github.com/%s/blob/master/%s"%(repo_name, fpath)

             # Now create the actual search index record
-            writer.add_document(
-                    id = fsha,
-                    kind = 'markdown',
-                    created_time = '',
-                    modified_time = '',
-                    indexed_time = indexed_time,
-                    title = fname,
-                    url = usable_url,
-                    mimetype='',
-                    owner_email='',
-                    owner_name='',
-                    repo_name = repo_name,
-                    repo_url = repo_url,
-                    github_user = '',
-                    issue_title = '',
-                    issue_url = '',
-                    content = content
-            )
+            try:
+                writer.add_document(
+                        id = fsha,
+                        kind = 'markdown',
+                        created_time = None,
+                        modified_time = None,
+                        indexed_time = indexed_time,
+                        title = fname,
+                        url = usable_url,
+                        mimetype='',
+                        owner_email='',
+                        owner_name='',
+                        group='',
+                        repo_name = repo_name,
+                        repo_url = repo_url,
+                        github_user = '',
+                        issue_title = '',
+                        issue_url = '',
+                        content = content
+                )
+            except ValueError as e:
+                print(repr(e))
+                print(" > XXXXXX Failed to index Github markdown file \"%s\""%(fname))

         else:
             print("Indexing github file %s from repo %s"%(fname,repo_name))
@@ -501,24 +541,29 @@ class Search:
             key = fname+"_"+fsha

             # Now create the actual search index record
-            writer.add_document(
-                    id = key,
-                    kind = 'ghfile',
-                    created_time = '',
-                    modified_time = '',
-                    indexed_time = indexed_time,
-                    title = fname,
-                    url = repo_url,
-                    mimetype='',
-                    owner_email='',
-                    owner_name='',
-                    repo_name = repo_name,
-                    repo_url = repo_url,
-                    github_user = '',
-                    issue_title = '',
-                    issue_url = '',
-                    content = ''
-            )
+            try:
+                writer.add_document(
+                        id = key,
+                        kind = 'ghfile',
+                        created_time = None,
+                        modified_time = None,
+                        indexed_time = indexed_time,
+                        title = fname,
+                        url = repo_url,
+                        mimetype='',
+                        owner_email='',
+                        owner_name='',
+                        group='',
+                        repo_name = repo_name,
+                        repo_url = repo_url,
+                        github_user = '',
+                        issue_title = '',
+                        issue_url = '',
+                        content = ''
+                )
+            except ValueError as e:
+                print(repr(e))
+                print(" > XXXXXX Failed to index Github file \"%s\""%(fname))

@@ -532,28 +577,42 @@ class Search:
         Use a Github file API record to add a filename
         to the search index.
         """
-        indexed_time = clean_timestamp(datetime.now())
+        if 'created_time' in d.keys() and d['created_time'] is not None:
+            created_time = d['created_time']
+        else:
+            created_time = None
+
+        if 'modified_time' in d.keys() and d['modified_time'] is not None:
+            modified_time = d['modified_time']
+        else:
+            modified_time = None
+
+        indexed_time = datetime.now()

         # Now create the actual search index record
-        writer.add_document(
-                id = d['permalink'],
-                kind = 'emailthread',
-                created_time = '',
-                modified_time = '',
-                indexed_time = indexed_time,
-                title = d['subject'],
-                url = d['permalink'],
-                mimetype='',
-                owner_email='',
-                owner_name=d['original_sender'],
-                repo_name = '',
-                repo_url = '',
-                github_user = '',
-                issue_title = '',
-                issue_url = '',
-                content = d['content']
-        )
+        try:
+            writer.add_document(
+                    id = d['permalink'],
+                    kind = 'emailthread',
+                    created_time = created_time,
+                    modified_time = modified_time,
+                    indexed_time = indexed_time,
+                    title = d['subject'],
+                    url = d['permalink'],
+                    mimetype='',
+                    owner_email='',
+                    owner_name=d['original_sender'],
+                    group=d['subgroup'],
+                    repo_name = '',
+                    repo_url = '',
+                    github_user = '',
+                    issue_title = '',
+                    issue_url = '',
+                    content = d['content']
+            )
+        except ValueError as e:
+            print(repr(e))
+            print(" > XXXXXX Failed to index Groups.io thread \"%s\""%(d['subject']))

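An aside on the guard introduced for email threads above (check that the key exists and its value is not `None` before using it): the same result can be written with `dict.get`. This is only a sketch of the equivalence, not something the diff does.

```
# Aside (not in the diff): dict.get returns None both when the key is
# missing and when it is present with a None value, which matches the
# "in d.keys() and is not None" guard above.
d = {"permalink": "thread-1", "created_time": None}   # illustrative record

created_time = d.get("created_time")    # None
modified_time = d.get("modified_time")  # None (key absent)
print(created_time, modified_time)
```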
@@ -631,10 +690,10 @@ class Search:
                 full_items[f['id']] = f

             ## Shorter:
-            #break
-            # Longer:
-            if nextPageToken is None:
-                break
+            break
+            ## Longer:
+            #if nextPageToken is None:
+            #    break


         writer = self.ix.writer()
@@ -642,34 +701,41 @@ class Search:
         temp_dir = tempfile.mkdtemp(dir=os.getcwd())
         print("Temporary directory: %s"%(temp_dir))

-        # Drop any id in indexed_ids
-        # not in remote_ids
-        drop_ids = indexed_ids - remote_ids
-        for drop_id in drop_ids:
-            writer.delete_by_term('id',drop_id)
-
-        # Update any id in indexed_ids
-        # and in remote_ids
-        update_ids = indexed_ids & remote_ids
-        for update_id in update_ids:
-            # cop out
-            writer.delete_by_term('id',update_id)
-            item = full_items[update_id]
-            self.add_drive_file(writer, item, temp_dir, config, update=True)
-            count += 1
-
-        # Add any id not in indexed_ids
-        # and in remote_ids
-        add_ids = remote_ids - indexed_ids
-        for add_id in add_ids:
-            item = full_items[add_id]
-            self.add_drive_file(writer, item, temp_dir, config, update=False)
-            count += 1
+        try:
+
+            # Drop any id in indexed_ids
+            # not in remote_ids
+            drop_ids = indexed_ids - remote_ids
+            for drop_id in drop_ids:
+                writer.delete_by_term('id',drop_id)
+
+            # Update any id in indexed_ids
+            # and in remote_ids
+            update_ids = indexed_ids & remote_ids
+            for update_id in update_ids:
+                # cop out
+                writer.delete_by_term('id',update_id)
+                item = full_items[update_id]
+                self.add_drive_file(writer, item, temp_dir, config, update=True)
+                count += 1
+
+            # Add any id not in indexed_ids
+            # and in remote_ids
+            add_ids = remote_ids - indexed_ids
+            for add_id in add_ids:
+                item = full_items[add_id]
+                self.add_drive_file(writer, item, temp_dir, config, update=False)
+                count += 1
+
+        except Exception as e:
+            print("ERROR: While adding Google Drive files to search index")
+            print("-"*40)
+            print(repr(e))
+            print("-"*40)
+            print("Continuing...")
+            pass

         print("Cleaning temporary directory: %s"%(temp_dir))
         subprocess.call(['rm','-fr',temp_dir])
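The synchronization loop that the hunk above wraps in `try`/`except` relies on plain set algebra over document ids: anything indexed but no longer remote is dropped, anything in both is re-indexed, and anything only remote is added. A tiny sketch with made-up ids:

```
# Sketch of the three-way partition used above (ids are illustrative).
indexed_ids = {"a", "b", "c"}          # ids already in the search index
remote_ids  = {"b", "c", "d"}          # ids currently in Google Drive

drop_ids   = indexed_ids - remote_ids  # deleted remotely -> remove ("a")
update_ids = indexed_ids & remote_ids  # in both -> delete + re-add ("b", "c")
add_ids    = remote_ids - indexed_ids  # new remotely -> add ("d")

print(sorted(drop_ids), sorted(update_ids), sorted(add_ids))
# ['a'] ['b', 'c'] ['d']
```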
@@ -1074,7 +1140,7 @@ class Search:
         elif doctype=='issue':
             item_keys = ['title','repo_name','repo_url','url','created_time','modified_time']
         elif doctype=='emailthread':
-            item_keys = ['title','owner_name','url']
+            item_keys = ['title','owner_name','url','created_time','modified_time']
         elif doctype=='ghfile':
             item_keys = ['title','repo_name','repo_url','url']
         elif doctype=='markdown':
@@ -1091,11 +1157,7 @@ class Search:
         for r in results:
             d = {}
             for k in item_keys:
-                if k=='created_time' or k=='modified_time':
-                    #d[k] = r[k]
-                    d[k] = dateutil.parser.parse(r[k]).strftime("%Y-%m-%d")
-                else:
-                    d[k] = r[k]
+                d[k] = r[k]
             json_results.append(d)

         return json_results
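The formatting branch removed above existed because timestamps used to be stored as strings and had to be re-parsed for display. Assuming the stored fields are now `DATETIME` (as in the schema hunk), Whoosh hands back `datetime` objects, so the plain `d[k] = r[k]` is enough and formatting, if wanted, is a single `strftime` call. A sketch under that assumption:

```
# Sketch, assuming DATETIME-backed hits: the stored value is already a
# datetime object, so no dateutil re-parse is needed before display.
import datetime

r = {"created_time": datetime.datetime(2018, 8, 1, 12, 0)}  # stands in for a Whoosh hit

d = {"created_time": r["created_time"]}                     # what the new loop does
print(d["created_time"].strftime("%Y-%m-%d"))               # 2018-08-01
```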
@@ -1108,7 +1170,9 @@ class Search:
         query_string = " ".join(query_list)
         query = None
         if ":" in query_string:
-            query = QueryParser("content", self.schema).parse(query_string)
+            query = QueryParser("content", self.schema)
+            query.add_plugin(DateParserPlugin(free=True))
+            query = query.parse(query_string)
         elif len(fields) == 1 and fields[0] == "filename":
             pass
         elif len(fields) == 2:
@@ -1116,9 +1180,12 @@ class Search:
         else:
             # If the user does not specify a field,
             # these are the fields that are actually searched
-            fields = ['title', 'content','owner_name','owner_email','url']
+            fields = ['title', 'content','owner_name','owner_email','url','created_date','modified_date']
         if not query:
-            query = MultifieldParser(fields, schema=self.ix.schema).parse(query_string)
+            query = MultifieldParser(fields, schema=self.ix.schema)
+            query.add_plugin(DateParserPlugin(free=True))
+            query = query.parse(query_string)
+            #query = MultifieldParser(fields, schema=self.ix.schema).parse(query_string)
         parsed_query = "%s" % query
         print("query: %s" % parsed_query)
         results = searcher.search(query, terms=False, scored=True, groupedby="kind")
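Both query paths now attach `DateParserPlugin(free=True)` before parsing, which is what lets a query string contain date terms against the new `DATETIME` fields. A standalone sketch with an illustrative schema, not centillion's:

```
# Sketch of what the DateParserPlugin changes enable: a date term in the
# query string is parsed into a date-range query on a DATETIME field.
from whoosh import fields
from whoosh.qparser import QueryParser
from whoosh.qparser.dateparse import DateParserPlugin

schema = fields.Schema(content=fields.TEXT,
                       created_time=fields.DATETIME(stored=True))

parser = QueryParser("content", schema)
parser.add_plugin(DateParserPlugin(free=True))

# The date term becomes a range over that day on created_time,
# combined with a normal content term.
print(parser.parse(u"created_time:20180801 whoosh"))
```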
@@ -1,5 +1,7 @@
 import requests, os, re
 from bs4 import BeautifulSoup
+import dateutil.parser
+import datetime

 class GroupsIOException(Exception):
     pass
@@ -64,7 +66,7 @@ class GroupsIOArchivesCrawler(object):

             ## Short circuit
             ## for debugging purposes
-            #break
+            break

         return subgroups

@@ -251,7 +253,7 @@ class GroupsIOArchivesCrawler(object):
         subject = soup.find('title').text

         # Extract information for the schema:
-        # - permalink for thread (done)
+        # - permalink for thread (done above)
         # - subject/title (done)
         # - original sender email/name (done)
         # - content (done)
@@ -266,11 +268,35 @@ class GroupsIOArchivesCrawler(object):
                 pass
             else:
                 # found an email!
-                # this is a maze, thanks groups.io
+                # this is a maze, not amazing.
+                # thanks groups.io!
                 td = tr.find('td')
-                divrow = td.find('div',{'class':'row'}).find('div',{'class':'pull-left'})
+
+                sender_divrow = td.find('div',{'class':'row'})
+                sender_divrow = sender_divrow.find('div',{'class':'pull-left'})
                 if (i+1)==1:
-                    original_sender = divrow.text.strip()
+                    original_sender = sender_divrow.text.strip()
+
+                date_divrow = td.find('div',{'class':'row'})
+                date_divrow = date_divrow.find('div',{'class':'pull-right'})
+                date_divrow = date_divrow.find('font',{'class':'text-muted'})
+                date_divrow = date_divrow.find('script').text
+                try:
+                    time_seconds = re.search(' [0-9]{1,} ',date_divrow).group(0)
+                    time_seconds = time_seconds.strip()
+                    # Thanks groups.io for the weird date formatting
+                    time_seconds = time_seconds[:10]
+                    mmicro_seconds = time_seconds[10:]
+                    if (i+1)==1:
+                        created_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
+                        modified_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
+                    else:
+                        modified_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
+
+                except AttributeError:
+                    created_time = None
+                    modified_time = None

                 for div in td.find_all('div'):
                     if div.has_attr('id'):
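The new crawler code digs the thread timestamp out of an inline `<script>` on the groups.io page: it greps for a run of digits, keeps the leading ten digits (epoch seconds; the embedded value includes milliseconds), and converts with `utcfromtimestamp`. A sketch with a fabricated script snippet (the HTML below is not real groups.io markup):

```
# Sketch of the timestamp recovery; the script text is fabricated for
# illustration and simply embeds an epoch value in milliseconds.
import re
import datetime

script_text = "document.write(formatDate( 1533081600000 ));"   # hypothetical

match = re.search(' [0-9]{1,} ', script_text)
epoch = match.group(0).strip()[:10]          # keep seconds, drop milliseconds
print(datetime.datetime.utcfromtimestamp(int(epoch)))
# 2018-08-01 00:00:00
```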
@@ -299,7 +325,10 @@ class GroupsIOArchivesCrawler(object):

         thread = {
                 'permalink' : permalink,
+                'created_time' : created_time,
+                'modified_time' : modified_time,
                 'subject' : subject,
+                'subgroup' : subgroup_name,
                 'original_sender' : original_sender,
                 'content' : full_content
         }
@@ -324,11 +353,13 @@ class GroupsIOArchivesCrawler(object):

         results = []
         for row in rows:
-            # We don't care about anything except title and ugly link
+            # This is where we extract
+            # a list of thread titles
+            # and corresponding links.
             subject = row.find('span',{'class':'subject'})
             title = subject.get_text()
             link = row.find('a')['href']
-            #print(title)
             results.append((title,link))

         return results
@@ -1,181 +0,0 @@
# Centillion Quality Engineering Plan

Table of Contents
-------

* [Centillion Quality Engineering Plan](#centillion-quality-engineering-plan)
* [Summary](#summary)
* [Tracking Bugs and Issues](#tracking-bugs-and-issues)
* [Branches, Versioning, and Git Workflow](#branches-versioning-and-git-workflow)
* [Communication and Mailing Lists](#communication-and-mailing-lists)
* [Checklists](#checklists)
* [Documentation](#documentation)
* [Configuration Management Tools](#configuration-management-tools)
* [Tests](#tests)
* [Code Reviews](#code-reviews)
* [Formal Release Process](#formal-release-process)
* [Continual Process Improvement](#continual-process-improvement)

Summary
-------

This document contains a quality engineering plan for centillion, the Data Commons search engine.

Tracking Bugs and Issues
------------------------

We utilize the [issues section](https://github.com/dcppc/centillion/issues) of the centillion repository to keep track of bugs and feature requests.

Branches, Versioning, and Git Workflow
--------------------------------------

All code is kept under version control in the [dcppc/centillion](https://github.com/dcppc/centillion) Github repository.

**Primary Git Branches:**

We utilize a git branch pattern that has two primary branches: a development branch and a stable branch.

- The primary **development branch** is `dcppc` and is actively developed and deployed to <https://betasearch.nihdatacommons.us>.

- The primary **stable branch** is `releases/v1` and is stable and deployed to <https://search.nihdatacommons.us>.

All tagged versions of Centillion exist on the stable branch. Only tagged versions of centillion are run on <https://search.nihdatacommons.us>.

**Other Branches:**

Features are developed by creating a new branch from `dcppc`, working on the feature, and opening a pull request. When the pull request is approved, it can be merged into the `dcppc` branch.

When features have accumulated and a new version is ready, a pre-release branch is made to prepare for the release. When the pre-release branch is ready, it is merged into the stable branch in a single merge commit and a new version of centillion is tagged. The new version is deployed on <https://search.nihdatacommons.us>.

Commits to fix bugs (hotfixes) may need to be applied to both the stable and development branches. In this case, a hotfix branch should be created from the head commit of the stable branch, and the appropriate changes should be made on that branch. One pull request should be opened to merge the hotfix into the release branch, and a second pull request should be opened to merge the hotfix into the development branch. Once the hotfix is merged into the stable branch, a new version should be tagged.

Communication and Mailing Lists
-------------------------------

- No mailing list currently exists for centillion.

- Github issues are the primary form of communication about development of centillion. This is the best method for communicating bug reports or detailed information.

- The Send Feedback button on the centillion page is the primary way of getting quick feedback from users about the search engine.

- The [\#centillion](https://nih-dcppc.slack.com/messages/CCD64QD6G) Slack channel in the DCPPC slack workspace is the best place for conversations about centillion (providing feedback, answering quick questions, etc.).

Checklists
----------

We plan to utilize the Wiki feature of the Github repository to develop checklists:

- Checklist for releases
- Checklist for deployment of https://search.nihdatacommons.us (nginx, etc.)

Documentation
-------------

The documentation is a pile of markdown documents, turned into a static site using mkdocs.

Configuration Management Tools
------------------------------

We do not currently utilize any configuration management software, because centillion is not packaged as an importable Python module.

Packaging centillion is a future goal that is closely related to the need to improve and modularize the internal search schema/document type abstraction. These improvements would allow the types of collections being indexed to be separated from "core centillion", and core centillion would then be packaged.

Tests
-----

See (ref) for a full test plan with more detail.

Summary of test plan:

- Implement tests for the four major pages/components:
    - Login/authentication
    - Search
    - Master List
    - Control Panel
- Test authentication with two bot accounts (yammasnake and florence python)
- Separate frontend and backend tests
- Add a test flag in the flask config file to change the backend behavior of the server

Code Reviews
------------

CI tests will be implemented for all pull requests.

Pull requests to the **stable branch** have the following checks in place:

- PRs to the stable branch require at least 1 PR review
- PRs to the stable branch must pass CI tests

Pull requests to the **development branch** have the following checks in place:

- PRs to the development branch must pass CI tests

Formal Release Process
----------------------

In order to ensure a stable, consistent product, we utilize the branching pattern described above to implement new features in the development branch and test them out on <https://betasearch.nihdatacommons.us>.

Once features and bug fixes have been tested and reviewed internally, they are ready to be deployed. A new pre-release branch is created from the development branch. The pre-release branch has a feature freeze in place. Changes are made to the pre-release branch to prepare it for the next major version release.

When the pre-release branch is finished, it is merged into the stable branch. The head commit of the stable branch is tagged with the latest release number.

Finally, the new version is deployed on <https://search.nihdatacommons.us>.

Continual Process Improvement
-----------------------------

We will utilize the centillion wiki on Github to keep track of repeated processes and opportunities for improvement. Feedback and ideas for process improvement can also be submitted via Github issues.
tests/Readme.md (196 lines removed)
@@ -1,196 +0,0 @@
Centillion Tests
================

Table of Contents
------------------

* [Centillion Tests](#centillion-tests)
* [Test Plan](#test-plan)
    * [Local Tests](#local-tests)
    * [Short Tests](#short-tests)
    * [Long Tests](#long-tests)
* [Credentials](#credentials)
* [Detailed Description of Tests](#detailed-description-of-tests)
    * [Authentication Layer Tests](#authentication-layer-tests)
    * [Search Function Tests](#search-function-tests)
    * [Master List Endpoint Tests](#master-list-endpoint-tests)
    * [Control Panel Endpoint Tests](#control-panel-endpoint-tests)
    * [Continuous Integration Plan](#continuous-integration-plan)
* [Procedure/Checklist](#procedurechecklist)

Test Plan
---------

Related: <https://github.com/dcppc/centillion/issues/82>

The test suite for centillion needs to check each of the major components of centillion, as well as check the authentication mechanism using multiple login credentials.

We implement the following checks:

1. Check authentication mechanism(s) (yammasnake and florence python)
2. Check search function
3. Check master list endpoint
4. Check control panel endpoint
5. Check update search index endpoints

The tests are written such that the back end and front end are tested separately.

We also need different tiers of tests, so we don't max out API calls by making lots of commits to multiple PRs. We have three tiers of tests:

* Local tests - quick tests for CI, no API calls
* Short tests - tests using dummy API accounts
* Long tests - tests using DCPPC API accounts

### Local Tests

Local tests can be run locally without any interaction with APIs. These will still utilize centillion's search schema, but will load the search index with fake documents rather than fetching them from an API.

Uncle Archie, which runs CI tests, runs local tests only (unless you request it to run a short test or long test).

### Short Tests

Short tests utilize credentials for bot accounts that have intentionally been set up to have a "known" corpus of test documents. These provide unit-style tests for centillion - are the mechanics of indexing a particular type of document from a particular API working?

### Long Tests

Long tests index the real deal, utilizing the credentials used in the final production centillion. This test takes longer but is more likely to catch corner cases specific to the DCPPC documents.

Credentials
-----------

Running tests on centillion requires multiple sets of credentials. Let's lay out what is needed:

- The Flask app requires a token/secret token API key pair to allow users to authenticate through Github and confirm they are members of the DCPPC organization. This OAuth application is owned by Charles Reid (@charlesreid1).

- The search index needs a Github access token so that it can interface with the Github API to index files and issues. This access token is specified (along with other secrets) in the Flask configuration file. The access key comes from Florence Python (@fp9695253).

- The search index also requires a Google Drive API access token. This must be an access token for a user who has authenticated with the Centillion Google Drive OAuth application. This access token comes from <mailroom@nihdatacommons.com>.

- The search index requires API credentials for any other APIs associated with other document collections (Groups.io, Hypothesis, Disqus).

- The backend test requires the credentials provided to Flask.

- The frontend test (Selenium) needs two Github username/passwords: one for Florence Python (@fp9695253) and one for Yamma Snake (@yammasnake). These are required to simulate the user authenticating with Github through the browser.

- The frontend test credentials are a special case.
    - The frontend tests expect credentials to come from environment variables.
    - These environment variables get passed in at test time.
    - Tests are all run on [Uncle Archie](https://github.com/dcppc/uncle-archie).
    - Uncle Archie already has to protect a confidential config file containing Github credentials, so add additional credentials for frontend tests there.
    - Logical separation: these credentials are not needed to *operate* centillion, these credentials are needed to *test* centillion.
    - Uncle Archie already requires github credentials and already protects sensitive info.
    - Google Drive requiring its own credentials file on disk is a pain.

In summary: tests use the `config_flask.py` and `config_centillion.py` files to provide centillion with the API keys it needs and to instruct it on what to index. The credentials and config files control what the search index will actually index. The Uncle Archie CI tester config file contains the credentials needed to run frontend tests (which check the login/authentication layer).
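As a sketch of the "credentials from environment variables" point above (the variable names here are hypothetical, not centillion's):

```
# Sketch only: hypothetical environment variable names, illustrating how
# frontend (Selenium) test credentials can arrive via the environment at
# test time rather than living in a config file.
import os

github_user = os.environ["TEST_GITHUB_USER"]          # hypothetical name
github_password = os.environ["TEST_GITHUB_PASSWORD"]  # hypothetical name

print("running frontend tests as %s" % github_user)
```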
Detailed Description of Tests
-----------------------------

### Authentication Layer Tests

Frontend tests run as Florence Python:

- Can we log in via github and reach centillion
- Can we reach the control panel

Frontend tests run as Yamma Snake (DCPPC member):

- Can we log in via github and reach centillion
- Can we reach the control panel

### Search Function Tests

Frontend tests:

- Can we enter something into the search box and submit
- Can we sort the results
- Do the results look okay

Backend tests:

- Load the search index and run a query using the whoosh API

### Master List Endpoint Tests

Frontend tests:

- Can we get to the master list page
- Can we sort the results
- Do the results look okay

Backend tests:

- Check the output of the `/list` API endpoint

### Control Panel Endpoint Tests

Frontend tests:

- Can we get to the control panel page
- Can we click the button to trigger an indexing event

Backend tests:

- Trigger a re-index of the search index from the backend.

### Continuous Integration Plan

Tests are automatically run using Uncle Archie for continuous integration and deployment.

Procedure/Checklist
-------------------

Pre-release procedure:

- prepare to run all tests
- run short tests
- deploy to beta
- run long tests
- test out