Compare commits
5 Commits
testing
...
use-dateti
Author | SHA1 | Date | |
---|---|---|---|
ab76226b0c | |||
a4ebef6e6f | |||
bad50efa9b | |||
629fc063db | |||
3b0baa21de |
@@ -21,6 +21,8 @@ import dateutil.parser
|
||||
|
||||
from whoosh.qparser import MultifieldParser, QueryParser
|
||||
from whoosh.analysis import StemmingAnalyzer
|
||||
from whoosh.qparser.dateparse import DateParserPlugin
|
||||
from whoosh import fields, index
|
||||
|
||||
|
||||
"""
|
||||
@@ -180,30 +182,38 @@ class Search:
|
||||
# is defined.
|
||||
|
||||
schema = Schema(
|
||||
id = ID(stored=True, unique=True),
|
||||
kind = ID(stored=True),
|
||||
id = fields.ID(stored=True, unique=True),
|
||||
kind = fields.ID(stored=True),
|
||||
|
||||
created_time = ID(stored=True),
|
||||
modified_time = ID(stored=True),
|
||||
indexed_time = ID(stored=True),
|
||||
created_time = fields.DATETIME(stored=True),
|
||||
modified_time = fields.DATETIME(stored=True),
|
||||
indexed_time = fields.DATETIME(stored=True),
|
||||
|
||||
title = TEXT(stored=True, field_boost=100.0),
|
||||
url = ID(stored=True, unique=True),
|
||||
title = fields.TEXT(stored=True, field_boost=100.0),
|
||||
|
||||
mimetype=ID(stored=True),
|
||||
owner_email=ID(stored=True),
|
||||
owner_name=TEXT(stored=True),
|
||||
url = fields.ID(stored=True),
|
||||
|
||||
repo_name=TEXT(stored=True),
|
||||
repo_url=ID(stored=True),
|
||||
mimetype = fields.TEXT(stored=True),
|
||||
|
||||
github_user=TEXT(stored=True),
|
||||
owner_email = fields.ID(stored=True),
|
||||
owner_name = fields.TEXT(stored=True),
|
||||
|
||||
# mainly for email threads, groups.io, hypothesis
|
||||
group = fields.ID(stored=True),
|
||||
|
||||
repo_name = fields.TEXT(stored=True),
|
||||
repo_url = fields.ID(stored=True),
|
||||
github_user = fields.TEXT(stored=True),
|
||||
|
||||
tags = fields.KEYWORD(commas=True,
|
||||
stored=True,
|
||||
lowercase=True),
|
||||
|
||||
# comments only
|
||||
issue_title=TEXT(stored=True, field_boost=100.0),
|
||||
issue_url=ID(stored=True),
|
||||
issue_title = fields.TEXT(stored=True, field_boost=100.0),
|
||||
issue_url = fields.ID(stored=True),
|
||||
|
||||
content=TEXT(stored=True, analyzer=stemming_analyzer)
|
||||
content = fields.TEXT(stored=True, analyzer=stemming_analyzer)
|
||||
)
|
||||
|
||||
|
||||
@@ -243,17 +253,22 @@ class Search:
|
||||
writer.delete_by_term('id',item['id'])
|
||||
|
||||
# Index a plain google drive file
|
||||
created_time = dateutil.parser.parse(item['createdTime'])
|
||||
modified_time = dateutil.parser.parse(item['modifiedTime'])
|
||||
indexed_time = datetime.now().replace(microsecond=0)
|
||||
try:
|
||||
writer.add_document(
|
||||
id = item['id'],
|
||||
kind = 'gdoc',
|
||||
created_time = item['createdTime'],
|
||||
modified_time = item['modifiedTime'],
|
||||
indexed_time = datetime.now().replace(microsecond=0).isoformat(),
|
||||
created_time = created_time,
|
||||
modified_time = modified_time,
|
||||
indexed_time = indexed_time,
|
||||
title = item['name'],
|
||||
url = item['webViewLink'],
|
||||
mimetype = mimetype,
|
||||
owner_email = item['owners'][0]['emailAddress'],
|
||||
owner_name = item['owners'][0]['displayName'],
|
||||
group='',
|
||||
repo_name='',
|
||||
repo_url='',
|
||||
github_user='',
|
||||
@@ -261,6 +276,9 @@ class Search:
|
||||
issue_url='',
|
||||
content = content
|
||||
)
|
||||
except ValueError as e:
|
||||
print(repr(e))
|
||||
print(" > XXXXXX Failed to index Google Drive file \"%s\""%(item['name']))
|
||||
|
||||
|
||||
else:
|
||||
@@ -314,7 +332,7 @@ class Search:
|
||||
)
|
||||
assert output == ""
|
||||
except RuntimeError:
|
||||
print(" > XXXXXX Failed to index document \"%s\""%(item['name']))
|
||||
print(" > XXXXXX Failed to index Google Drive document \"%s\""%(item['name']))
|
||||
|
||||
|
||||
# If export was successful, read contents of markdown
|
||||
@@ -342,17 +360,22 @@ class Search:
|
||||
else:
|
||||
print(" > Creating a new record")
|
||||
|
||||
try:
|
||||
created_time = dateutil.parser.parse(item['createdTime'])
|
||||
modified_time = dateutil.parser.parse(item['modifiedTime'])
|
||||
indexed_time = datetime.now()
|
||||
writer.add_document(
|
||||
id = item['id'],
|
||||
kind = 'gdoc',
|
||||
created_time = item['createdTime'],
|
||||
modified_time = item['modifiedTime'],
|
||||
indexed_time = datetime.now().replace(microsecond=0).isoformat(),
|
||||
created_time = created_time,
|
||||
modified_time = modified_time,
|
||||
indexed_time = indexed_time,
|
||||
title = item['name'],
|
||||
url = item['webViewLink'],
|
||||
mimetype = mimetype,
|
||||
owner_email = item['owners'][0]['emailAddress'],
|
||||
owner_name = item['owners'][0]['displayName'],
|
||||
group='',
|
||||
repo_name='',
|
||||
repo_url='',
|
||||
github_user='',
|
||||
@@ -360,6 +383,10 @@ class Search:
|
||||
issue_url='',
|
||||
content = content
|
||||
)
|
||||
except ValueError as e:
|
||||
print(repr(e))
|
||||
print(" > XXXXXX Failed to index Google Drive file \"%s\""%(item['name']))
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -393,13 +420,14 @@ class Search:
|
||||
issue_comment_content += comment.body.rstrip()
|
||||
issue_comment_content += "\n"
|
||||
|
||||
# Now create the actual search index record
|
||||
created_time = clean_timestamp(issue.created_at)
|
||||
modified_time = clean_timestamp(issue.updated_at)
|
||||
indexed_time = clean_timestamp(datetime.now())
|
||||
|
||||
# Now create the actual search index record.
|
||||
# Add one document per issue thread,
|
||||
# containing entire text of thread.
|
||||
|
||||
created_time = issue.created_at
|
||||
modified_time = issue.updated_at
|
||||
indexed_time = datetime.now()
|
||||
try:
|
||||
writer.add_document(
|
||||
id = issue.html_url,
|
||||
kind = 'issue',
|
||||
@@ -411,6 +439,7 @@ class Search:
|
||||
mimetype='',
|
||||
owner_email='',
|
||||
owner_name='',
|
||||
group='',
|
||||
repo_name = repo_name,
|
||||
repo_url = repo_url,
|
||||
github_user = issue.user.login,
|
||||
@@ -418,6 +447,9 @@ class Search:
|
||||
issue_url = issue.html_url,
|
||||
content = issue_comment_content
|
||||
)
|
||||
except ValueError as e:
|
||||
print(repr(e))
|
||||
print(" > XXXXXX Failed to index Github issue \"%s\""%(issue.title))
|
||||
|
||||
|
||||
|
||||
@@ -447,7 +479,8 @@ class Search:
|
||||
print(" > XXXXXXXX Failed to find file info.")
|
||||
return
|
||||
|
||||
indexed_time = clean_timestamp(datetime.now())
|
||||
|
||||
indexed_time = datetime.now()
|
||||
|
||||
if fext in MARKDOWN_EXTS:
|
||||
print("Indexing markdown doc %s from repo %s"%(fname,repo_name))
|
||||
@@ -476,17 +509,19 @@ class Search:
|
||||
usable_url = "https://github.com/%s/blob/master/%s"%(repo_name, fpath)
|
||||
|
||||
# Now create the actual search index record
|
||||
try:
|
||||
writer.add_document(
|
||||
id = fsha,
|
||||
kind = 'markdown',
|
||||
created_time = '',
|
||||
modified_time = '',
|
||||
created_time = None,
|
||||
modified_time = None,
|
||||
indexed_time = indexed_time,
|
||||
title = fname,
|
||||
url = usable_url,
|
||||
mimetype='',
|
||||
owner_email='',
|
||||
owner_name='',
|
||||
group='',
|
||||
repo_name = repo_name,
|
||||
repo_url = repo_url,
|
||||
github_user = '',
|
||||
@@ -494,6 +529,11 @@ class Search:
|
||||
issue_url = '',
|
||||
content = content
|
||||
)
|
||||
except ValueError as e:
|
||||
print(repr(e))
|
||||
print(" > XXXXXX Failed to index Github markdown file \"%s\""%(fname))
|
||||
|
||||
|
||||
|
||||
else:
|
||||
print("Indexing github file %s from repo %s"%(fname,repo_name))
|
||||
@@ -501,17 +541,19 @@ class Search:
|
||||
key = fname+"_"+fsha
|
||||
|
||||
# Now create the actual search index record
|
||||
try:
|
||||
writer.add_document(
|
||||
id = key,
|
||||
kind = 'ghfile',
|
||||
created_time = '',
|
||||
modified_time = '',
|
||||
created_time = None,
|
||||
modified_time = None,
|
||||
indexed_time = indexed_time,
|
||||
title = fname,
|
||||
url = repo_url,
|
||||
mimetype='',
|
||||
owner_email='',
|
||||
owner_name='',
|
||||
group='',
|
||||
repo_name = repo_name,
|
||||
repo_url = repo_url,
|
||||
github_user = '',
|
||||
@@ -519,6 +561,9 @@ class Search:
|
||||
issue_url = '',
|
||||
content = ''
|
||||
)
|
||||
except ValueError as e:
|
||||
print(repr(e))
|
||||
print(" > XXXXXX Failed to index Github file \"%s\""%(fname))
|
||||
|
||||
|
||||
|
||||
@@ -532,20 +577,32 @@ class Search:
|
||||
Use a Github file API record to add a filename
|
||||
to the search index.
|
||||
"""
|
||||
indexed_time = clean_timestamp(datetime.now())
|
||||
if 'created_time' in d.keys() and d['created_time'] is not None:
|
||||
created_time = d['created_time']
|
||||
else:
|
||||
created_time = None
|
||||
|
||||
if 'modified_time' in d.keys() and d['modified_time'] is not None:
|
||||
modified_time = d['modified_time']
|
||||
else:
|
||||
modified_time = None
|
||||
|
||||
indexed_time = datetime.now()
|
||||
|
||||
# Now create the actual search index record
|
||||
try:
|
||||
writer.add_document(
|
||||
id = d['permalink'],
|
||||
kind = 'emailthread',
|
||||
created_time = '',
|
||||
modified_time = '',
|
||||
created_time = created_time,
|
||||
modified_time = modified_time,
|
||||
indexed_time = indexed_time,
|
||||
title = d['subject'],
|
||||
url = d['permalink'],
|
||||
mimetype='',
|
||||
owner_email='',
|
||||
owner_name=d['original_sender'],
|
||||
group=d['subgroup'],
|
||||
repo_name = '',
|
||||
repo_url = '',
|
||||
github_user = '',
|
||||
@@ -553,7 +610,9 @@ class Search:
|
||||
issue_url = '',
|
||||
content = d['content']
|
||||
)
|
||||
|
||||
except ValueError as e:
|
||||
print(repr(e))
|
||||
print(" > XXXXXX Failed to index Groups.io thread \"%s\""%(d['subject']))
|
||||
|
||||
|
||||
|
||||
@@ -631,10 +690,10 @@ class Search:
|
||||
full_items[f['id']] = f
|
||||
|
||||
## Shorter:
|
||||
#break
|
||||
# Longer:
|
||||
if nextPageToken is None:
|
||||
break
|
||||
## Longer:
|
||||
#if nextPageToken is None:
|
||||
# break
|
||||
|
||||
|
||||
writer = self.ix.writer()
|
||||
@@ -642,7 +701,7 @@ class Search:
|
||||
temp_dir = tempfile.mkdtemp(dir=os.getcwd())
|
||||
print("Temporary directory: %s"%(temp_dir))
|
||||
|
||||
|
||||
try:
|
||||
|
||||
# Drop any id in indexed_ids
|
||||
# not in remote_ids
|
||||
@@ -670,6 +729,13 @@ class Search:
|
||||
self.add_drive_file(writer, item, temp_dir, config, update=False)
|
||||
count += 1
|
||||
|
||||
except Exception as e:
|
||||
print("ERROR: While adding Google Drive files to search index")
|
||||
print("-"*40)
|
||||
print(repr(e))
|
||||
print("-"*40)
|
||||
print("Continuing...")
|
||||
pass
|
||||
|
||||
print("Cleaning temporary directory: %s"%(temp_dir))
|
||||
subprocess.call(['rm','-fr',temp_dir])
|
||||
@@ -1074,7 +1140,7 @@ class Search:
|
||||
elif doctype=='issue':
|
||||
item_keys = ['title','repo_name','repo_url','url','created_time','modified_time']
|
||||
elif doctype=='emailthread':
|
||||
item_keys = ['title','owner_name','url']
|
||||
item_keys = ['title','owner_name','url','created_time','modified_time']
|
||||
elif doctype=='ghfile':
|
||||
item_keys = ['title','repo_name','repo_url','url']
|
||||
elif doctype=='markdown':
|
||||
@@ -1091,10 +1157,6 @@ class Search:
|
||||
for r in results:
|
||||
d = {}
|
||||
for k in item_keys:
|
||||
if k=='created_time' or k=='modified_time':
|
||||
#d[k] = r[k]
|
||||
d[k] = dateutil.parser.parse(r[k]).strftime("%Y-%m-%d")
|
||||
else:
|
||||
d[k] = r[k]
|
||||
json_results.append(d)
|
||||
|
||||
@@ -1108,7 +1170,9 @@ class Search:
|
||||
query_string = " ".join(query_list)
|
||||
query = None
|
||||
if ":" in query_string:
|
||||
query = QueryParser("content", self.schema).parse(query_string)
|
||||
query = QueryParser("content", self.schema)
|
||||
query.add_plugin(DateParserPlugin(free=True))
|
||||
query = query.parse(query_string)
|
||||
elif len(fields) == 1 and fields[0] == "filename":
|
||||
pass
|
||||
elif len(fields) == 2:
|
||||
@@ -1116,9 +1180,12 @@ class Search:
|
||||
else:
|
||||
# If the user does not specify a field,
|
||||
# these are the fields that are actually searched
|
||||
fields = ['title', 'content','owner_name','owner_email','url']
|
||||
fields = ['title', 'content','owner_name','owner_email','url','created_date','modified_date']
|
||||
if not query:
|
||||
query = MultifieldParser(fields, schema=self.ix.schema).parse(query_string)
|
||||
query = MultifieldParser(fields, schema=self.ix.schema)
|
||||
query.add_plugin(DateParserPlugin(free=True))
|
||||
query = query.parse(query_string)
|
||||
#query = MultifieldParser(fields, schema=self.ix.schema).parse(query_string)
|
||||
parsed_query = "%s" % query
|
||||
print("query: %s" % parsed_query)
|
||||
results = searcher.search(query, terms=False, scored=True, groupedby="kind")
|
||||
|
@@ -1,5 +1,7 @@
|
||||
import requests, os, re
|
||||
from bs4 import BeautifulSoup
|
||||
import dateutil.parser
|
||||
import datetime
|
||||
|
||||
class GroupsIOException(Exception):
|
||||
pass
|
||||
@@ -64,7 +66,7 @@ class GroupsIOArchivesCrawler(object):
|
||||
|
||||
## Short circuit
|
||||
## for debugging purposes
|
||||
#break
|
||||
break
|
||||
|
||||
return subgroups
|
||||
|
||||
@@ -251,7 +253,7 @@ class GroupsIOArchivesCrawler(object):
|
||||
subject = soup.find('title').text
|
||||
|
||||
# Extract information for the schema:
|
||||
# - permalink for thread (done)
|
||||
# - permalink for thread (done above)
|
||||
# - subject/title (done)
|
||||
# - original sender email/name (done)
|
||||
# - content (done)
|
||||
@@ -266,11 +268,35 @@ class GroupsIOArchivesCrawler(object):
|
||||
pass
|
||||
else:
|
||||
# found an email!
|
||||
# this is a maze, thanks groups.io
|
||||
# this is a maze, not amazing.
|
||||
# thanks groups.io!
|
||||
td = tr.find('td')
|
||||
divrow = td.find('div',{'class':'row'}).find('div',{'class':'pull-left'})
|
||||
|
||||
sender_divrow = td.find('div',{'class':'row'})
|
||||
sender_divrow = sender_divrow.find('div',{'class':'pull-left'})
|
||||
if (i+1)==1:
|
||||
original_sender = divrow.text.strip()
|
||||
original_sender = sender_divrow.text.strip()
|
||||
|
||||
date_divrow = td.find('div',{'class':'row'})
|
||||
date_divrow = date_divrow.find('div',{'class':'pull-right'})
|
||||
date_divrow = date_divrow.find('font',{'class':'text-muted'})
|
||||
date_divrow = date_divrow.find('script').text
|
||||
try:
|
||||
time_seconds = re.search(' [0-9]{1,} ',date_divrow).group(0)
|
||||
time_seconds = time_seconds.strip()
|
||||
# Thanks groups.io for the weird date formatting
|
||||
time_seconds = time_seconds[:10]
|
||||
mmicro_seconds = time_seconds[10:]
|
||||
if (i+1)==1:
|
||||
created_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
|
||||
modified_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
|
||||
else:
|
||||
modified_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
|
||||
|
||||
except AttributeError:
|
||||
created_time = None
|
||||
modified_time = None
|
||||
|
||||
for div in td.find_all('div'):
|
||||
if div.has_attr('id'):
|
||||
|
||||
@@ -299,7 +325,10 @@ class GroupsIOArchivesCrawler(object):
|
||||
|
||||
thread = {
|
||||
'permalink' : permalink,
|
||||
'created_time' : created_time,
|
||||
'modified_time' : modified_time,
|
||||
'subject' : subject,
|
||||
'subgroup' : subgroup_name,
|
||||
'original_sender' : original_sender,
|
||||
'content' : full_content
|
||||
}
|
||||
@@ -324,11 +353,13 @@ class GroupsIOArchivesCrawler(object):
|
||||
|
||||
results = []
|
||||
for row in rows:
|
||||
# We don't care about anything except title and ugly link
|
||||
# This is where we extract
|
||||
# a list of thread titles
|
||||
# and corresponding links.
|
||||
subject = row.find('span',{'class':'subject'})
|
||||
title = subject.get_text()
|
||||
link = row.find('a')['href']
|
||||
#print(title)
|
||||
|
||||
results.append((title,link))
|
||||
|
||||
return results
|
||||
|
@@ -1,181 +0,0 @@
|
||||
# Centillion Quality Engineering Plan
|
||||
|
||||
Table of Contents
|
||||
-------
|
||||
|
||||
* [Centillion Quality Engineering Plan](#centillion-quality-engineering-plan)
|
||||
* [Summary](#summary)
|
||||
* [Tracking Bugs and Issues](#tracking-bugs-and-issues)
|
||||
* [Branches, Versioning, and Git Workflow](#branches-versioning-and-git-workflow)
|
||||
* [Communication and Mailing Lists](#communication-and-mailing-lists)
|
||||
* [Checklists](#checklists)
|
||||
* [Documentation](#documentation)
|
||||
* [Configuration Management Tools](#configuration-management-tools)
|
||||
* [Tests](#tests)
|
||||
* [Code Reviews](#code-reviews)
|
||||
* [Formal Release Process](#formal-release-process)
|
||||
* [Continual Process Improvement](#continual-process-improvement)
|
||||
|
||||
Summary
|
||||
-------
|
||||
|
||||
This document contains a quality engineering plan for centillion, the
|
||||
Data Commons search engine.
|
||||
|
||||
Tracking Bugs and Issues
|
||||
------------------------
|
||||
|
||||
We utilize the [issues
|
||||
section](https://github.com/dcppc/centillion/issues) of the centillion
|
||||
repository to keep track of bugs and feature requests.
|
||||
|
||||
Branches, Versioning, and Git Workflow
|
||||
--------------------------------------
|
||||
|
||||
All code is kept under version control in the
|
||||
[dcppc/centillion](https://github.com/dcppc/centillion) Github
|
||||
repository.
|
||||
|
||||
**Primary Git Branches:**
|
||||
|
||||
We utilize a git branch pattern that has two primary branches: a
|
||||
development branch and a stable branch.
|
||||
|
||||
- The primary **development branch** is `dcppc` and is actively
|
||||
developed and deployed to <https://betasearch.nihdatacommons.us>.
|
||||
|
||||
- The primary **stable branch** is `releases/v1` and is stable and
|
||||
deployed to <https://search.nihdatacommons.us>.
|
||||
|
||||
All tagged versions of Centillion exist on the stable branch. Only
|
||||
tagged versions of centillion are run on
|
||||
<https://search.nihdatacommons.us>.
|
||||
|
||||
**Other Branches:**
|
||||
|
||||
Features are developed by creating a new branch from `dcppc`, working on
|
||||
the feature, and opening a pull request. When the pull request is
|
||||
approved, it can be merged into the `dcppc` branch.
|
||||
|
||||
When features have accumulated and a new version is ready, a new
|
||||
pre-release branch will be made to prepare for a new release. When the
|
||||
pre-release branch is ready, it is merged into the stable branch in a
|
||||
single merge commit and a new version of centillion is tagged. The new
|
||||
version is deployed on <https://search.nihdatacommons.us>.
|
||||
|
||||
Commits to fix bugs (hotfixes) may need to be applied to both the stable
|
||||
and development branches. In this case, a hotfix branch should be
|
||||
created from the head commit of the stable branch, and the appropriate
|
||||
changes should be made on the branch. A pull request should be opened to
|
||||
merge the hotfix into the release branch. A second pull request should
|
||||
be opened to merge the hotfix into the development branch. Once the
|
||||
hotfix is merged into the stable branch, a new version should be tagged.
|
||||
|
||||
Communication and Mailing Lists
|
||||
-------------------------------
|
||||
|
||||
- No mailing list currently exists for centillion.
|
||||
|
||||
- Github issues are the primary form of communication about
|
||||
development of centillion. This is the best method for communicating
|
||||
bug reports or detailed information.
|
||||
|
||||
- The Send Feedback button on the centillion page is the primary way
|
||||
of getting quick feedback from users about the search engine.
|
||||
|
||||
- The [\#centillion](https://nih-dcppc.slack.com/messages/CCD64QD6G)
|
||||
Slack channel in the DCPPC slack workspace is the best place for
|
||||
conversations about centillion (providing feedback, answering quick
|
||||
questions, etc.)
|
||||
|
||||
Checklists
|
||||
----------
|
||||
|
||||
We plan to utilize the Wiki feature of the Github repository to develop
|
||||
checklists:
|
||||
|
||||
- Checklist for releases
|
||||
- Checklist for deployment of https://search.nihdatacommons.us nginx
|
||||
etc.
|
||||
|
||||
Documentation
|
||||
-------------
|
||||
|
||||
The documentation is a pile of markdown documents, turned into a static
|
||||
site using mkdocs.
|
||||
|
||||
Configuration Management Tools
|
||||
------------------------------
|
||||
|
||||
We do not currently utilize any configuration management software,
|
||||
because centillion is not packaged as an importable Python module.
|
||||
|
||||
Packaging centillion is a future goal that is closely related to the
|
||||
need to improve and modularize the internal search schema/document type
|
||||
abstraction. These improvements would allow the types of collections
|
||||
being indexed to be separated from "core centillion", and core
|
||||
centillion would be packaged.
|
||||
|
||||
Tests
|
||||
-----
|
||||
|
||||
See (ref) for a full test plan with more detail.
|
||||
|
||||
Summary of test plan:
|
||||
|
||||
- Implement tests for the four major pages/components
|
||||
- Login/authentication
|
||||
- Search
|
||||
- Master List
|
||||
- Control Panel
|
||||
- Test authentication with two bot accounts (yammasnake and florence
|
||||
python)
|
||||
|
||||
- Separate frontend and backend tests
|
||||
|
||||
- Add a test flag in the flask config file to change the backend
|
||||
behavior of the server
|
||||
|
||||
Code Reviews
|
||||
------------
|
||||
|
||||
CI tests will be implemented for all pull requests.
|
||||
|
||||
Pull requests to the **stable branch** have the following checks in
|
||||
place:
|
||||
|
||||
- PRs to the stable branch require at least 1 PR review
|
||||
- PRs to the stable branch must pass CI tests
|
||||
|
||||
Pull requests to the **development branch** have the following checks in
|
||||
place:
|
||||
|
||||
- PRs to the development branch must pass CI tests
|
||||
|
||||
Formal Release Process
|
||||
----------------------
|
||||
|
||||
In order to ensure a stable, consistent product, we utilize the
|
||||
branching pattern described above to implement new features in the
|
||||
development branch and test them out on
|
||||
<https://betasearch.nihdatacommons.us>.
|
||||
|
||||
Once features and bug fixes have been tested and reviewed internally,
|
||||
they are ready to be deployed. A new pre-release branch is created from
|
||||
the development branch. The pre-release branch has a feature freeze in
|
||||
place. Changes are made to the pre-release branch to prepare it for the
|
||||
next major version release.
|
||||
|
||||
When the pre-release branch is finished, it is merged into the stable
|
||||
branch. The head commit of the stable version is tagged with the latest
|
||||
release number.
|
||||
|
||||
Finally, the new version is deployed on
|
||||
<https://search.nihdatacommons.us>.
|
||||
|
||||
Continual Process Improvement
|
||||
-----------------------------
|
||||
|
||||
We will utilize the centillion wiki on Github to keep track of repeated
|
||||
processes and opportunities for improvement. Feedback and ideas for
|
||||
process improvement can also be submitted via Github issues.
|
196
tests/Readme.md
196
tests/Readme.md
@@ -1,196 +0,0 @@
|
||||
Centillion Tests
|
||||
================
|
||||
|
||||
Table of Contents
|
||||
------------------
|
||||
|
||||
* [Centillion Tests](#centillion-tests)
|
||||
* [Test Plan](#test-plan)
|
||||
* [Local Tests](#local-tests)
|
||||
* [Short Tests](#short-tests)
|
||||
* [Long Tests](#long-tests)
|
||||
* [Credentials](#credentials)
|
||||
* [Detailed Description of Tests](#detailed-description-of-tests)
|
||||
* [Authentication Layer Tests](#authentication-layer-tests)
|
||||
* [Search Function Tests](#search-function-tests)
|
||||
* [Master List Endpoint Tests](#master-list-endpoint-tests)
|
||||
* [Control Panel Endpoint Tests](#control-panel-endpoint-tests)
|
||||
* [Continuous Integration Plan](#continuous-integration-plan)
|
||||
* [Procedure/Checklist](#procedurechecklist)
|
||||
|
||||
|
||||
Test Plan
|
||||
---------
|
||||
|
||||
Related: <https://github.com/dcppc/centillion/issues/82>
|
||||
|
||||
The test suite for centillion needs to check each of the major
|
||||
components of centillion, as well as check the authentication mechanism
|
||||
using multiple login credentials.
|
||||
|
||||
We implement the following checks:
|
||||
|
||||
1. Check authentication mechanism(s) (yammasnake and florence python)
|
||||
|
||||
2. Check search function
|
||||
|
||||
3. Check master list endpoint
|
||||
|
||||
4. Check control panel endpoint
|
||||
|
||||
5. Check update search index endpoints
|
||||
|
||||
The tests are written such that the back end and front end are tested
|
||||
separately.
|
||||
|
||||
We also need different tiers of tests, so we don't max out API
|
||||
calls by making lots of commits to multiple PRs.
|
||||
|
||||
We have three tiers of tests:
|
||||
|
||||
* Local tests - quick tests for CI, no API calls
|
||||
* Short tests - tests using dummy API accounts
|
||||
* Long tests - tests using DCPPC API accounts
|
||||
|
||||
### Local Tests
|
||||
|
||||
Local tests can be run locally without any interaction with APIs. These
|
||||
will still utilize centillion's search schema, but will load the search
|
||||
index with fake documents rather than fetching them from an API.
|
||||
|
||||
Uncle Archie, which runs CI tests, runs local tests only (unless you
|
||||
request it to run short tests or long tests.)
|
||||
|
||||
### Short Tests
|
||||
|
||||
Short tests utilize credentials for bot accounts that have intentionally
|
||||
been set up to have a "known" corpus of test documents. These would
|
||||
provide unit-style tests for centillion - are the mechanics of indexing
|
||||
a particular type of document from a particular API working?
|
||||
|
||||
### Long Tests
|
||||
|
||||
Long tests are indexing the real deal, utilizing the credentials used in
|
||||
the final production centillion. This test takes longer but is more
|
||||
likely to catch corner cases specific to the DCPPC documents.
|
||||
|
||||
Credentials
|
||||
-----------
|
||||
|
||||
Running tests on centillion requires multiple sets of credentials. Let's
|
||||
lay out what is needed:
|
||||
|
||||
- The Flask app requires a token/secret token API key pair to allow
|
||||
users to authenticate through Github and confirm they are members of
|
||||
the DCPPC organization. This OAuth application is owned by Charles
|
||||
Reid (@charlesreid1).
|
||||
|
||||
- The search index needs a Github access token so that it can
|
||||
interface with the Github API to index files and issues. This access
|
||||
token is specified (along with other secrets) in the Flask
|
||||
configuration file. The access key comes from Florence Python
|
||||
(@fp9695253).
|
||||
|
||||
- The search index also requires a Google Drive API access token. This
|
||||
must be an access token for a user who has authenticated with the
|
||||
Centillion Google Drive OAuth application. This access token comes
|
||||
from <mailroom@nihdatacommons.com>.
|
||||
|
||||
- The search index requires API credentials for any other APIs
|
||||
associated with other document collections (Groups.io, Hypothesis,
|
||||
Disqus).
|
||||
|
||||
- The backend test requires the credentials provided to Flask.
|
||||
|
||||
- The frontend test (Selenium) needs two Github username/passwords:
|
||||
one for Florence Python (@fp9695253) and one for Yamma Snake
|
||||
(@yammasnake). These are required to simulate the user
|
||||
authenticating with Github through the browser.
|
||||
- The frontend test credentials are a special case.
|
||||
- The frontend tests expect credentials to come from environment
|
||||
variables.
|
||||
- These environment variables get passed in at test time.
|
||||
- Tests are all run on [Uncle
|
||||
Archie](https://github.com/dcppc/uncle-archie).
|
||||
- Uncle Archie already has to protect a confidential config file
|
||||
containing Github credentials, so add additional credentials for
|
||||
frontend tests there.
|
||||
- Logical separation: these credentials are not needed to
|
||||
*operate* centillion, these credentials are needed to *test*
|
||||
centillion
|
||||
- Uncle Archie already requires github credentials, already
|
||||
protects sensitive info.
|
||||
- Google Drive requiring its own credentials file on disk is a
|
||||
pain.
|
||||
|
||||
In summary: tests use the `config_flask.py` and `config_centillion.py`
|
||||
files to provide it with the API keys it needs and to instruct it on
|
||||
what to index. The credentials and config files will control what the
|
||||
search index will actually index. The Uncle Archie CI tester config file
|
||||
contains the credentials needed to run frontend tests (check the
|
||||
login/authentication layer).
|
||||
|
||||
Detailed Description of Tests
|
||||
-----------------------------
|
||||
|
||||
### Authentication Layer Tests
|
||||
|
||||
Frontend tests run as Florence Python:
|
||||
|
||||
- Can we log in via github and reach centillion
|
||||
- Can we reach the control panel
|
||||
|
||||
Frontend tests run as Yamma Snake (DCPPC member):
|
||||
|
||||
- Can we log in via github and reach centillion
|
||||
- Can we reach the control panel
|
||||
|
||||
### Search Function Tests
|
||||
|
||||
Frontend tests:
|
||||
|
||||
- Can we enter something into search box and submit
|
||||
- Can we sort the results
|
||||
- Do the results look okay
|
||||
|
||||
Backend tests:
|
||||
|
||||
- Load the search index and run a query using whoosh API
|
||||
|
||||
### Master List Endpoint Tests
|
||||
|
||||
Frontend tests:
|
||||
|
||||
- Can we get to the master list page
|
||||
- Can we sort the results
|
||||
- Do the results look okay
|
||||
|
||||
Backend tests:
|
||||
|
||||
- Check the output of the `/list` API endpoint
|
||||
|
||||
### Control Panel Endpoint Tests
|
||||
|
||||
Frontend tests:
|
||||
|
||||
- Can we get to the control panel page
|
||||
- Can we click the button to trigger an indexing event
|
||||
|
||||
Backend tests:
|
||||
|
||||
- Trigger a re-index of the search index from the backend.
|
||||
|
||||
### Continuous Integration Plan
|
||||
|
||||
Tests are automatically run using Uncle Archie for continuous
|
||||
integration and deployment.
|
||||
|
||||
Procedure/Checklist
|
||||
-------------------
|
||||
|
||||
Pre-release procedure:
|
||||
|
||||
- prepare to run all tests
|
||||
|
||||
- run short tests
|
||||
- deploy to beta
|
||||
- run long tests
|
||||
- test out
|
Reference in New Issue
Block a user