Compare commits

5 commits: testing ... use-dateti

Commits (SHA1):
- ab76226b0c
- a4ebef6e6f
- bad50efa9b
- 629fc063db
- 3b0baa21de
@@ -21,6 +21,8 @@ import dateutil.parser

 from whoosh.qparser import MultifieldParser, QueryParser
 from whoosh.analysis import StemmingAnalyzer
+from whoosh.qparser.dateparse import DateParserPlugin
+from whoosh import fields, index


 """
@@ -180,30 +182,38 @@ class Search:
         # is defined.

         schema = Schema(
-                id = ID(stored=True, unique=True),
-                kind = ID(stored=True),
-
-                created_time = ID(stored=True),
-                modified_time = ID(stored=True),
-                indexed_time = ID(stored=True),
-
-                title = TEXT(stored=True, field_boost=100.0),
-                url = ID(stored=True, unique=True),
-
-                mimetype=ID(stored=True),
-                owner_email=ID(stored=True),
-                owner_name=TEXT(stored=True),
-
-                repo_name=TEXT(stored=True),
-                repo_url=ID(stored=True),
-
-                github_user=TEXT(stored=True),
+                id = fields.ID(stored=True, unique=True),
+                kind = fields.ID(stored=True),
+
+                created_time = fields.DATETIME(stored=True),
+                modified_time = fields.DATETIME(stored=True),
+                indexed_time = fields.DATETIME(stored=True),
+
+                title = fields.TEXT(stored=True, field_boost=100.0),
+                url = fields.ID(stored=True),
+
+                mimetype = fields.TEXT(stored=True),
+                owner_email = fields.ID(stored=True),
+                owner_name = fields.TEXT(stored=True),
+
+                # mainly for email threads, groups.io, hypothesis
+                group = fields.ID(stored=True),
+
+                repo_name = fields.TEXT(stored=True),
+                repo_url = fields.ID(stored=True),
+                github_user = fields.TEXT(stored=True),
+
+                tags = fields.KEYWORD(commas=True,
+                                      stored=True,
+                                      lowercase=True),

                 # comments only
-                issue_title=TEXT(stored=True, field_boost=100.0),
-                issue_url=ID(stored=True),
+                issue_title = fields.TEXT(stored=True, field_boost=100.0),
+                issue_url = fields.ID(stored=True),

-                content=TEXT(stored=True, analyzer=stemming_analyzer)
+                content = fields.TEXT(stored=True, analyzer=stemming_analyzer)
         )

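The schema rewrite above is the core of this branch: the timestamp columns move from `ID` (opaque strings) to `fields.DATETIME`, which indexes actual `datetime` objects and therefore supports date-range queries. A minimal, self-contained sketch of that behavior follows; it is not centillion code, and the field set is trimmed down for illustration.

```
# Minimal sketch (not the project's code): Whoosh DATETIME fields take
# Python datetime objects and support range queries.
import datetime, tempfile

from whoosh import fields, index
from whoosh.query import DateRange

schema = fields.Schema(
    id=fields.ID(stored=True, unique=True),
    created_time=fields.DATETIME(stored=True),
    content=fields.TEXT(stored=True),
)

ix = index.create_in(tempfile.mkdtemp(), schema)
writer = ix.writer()
writer.add_document(id=u"doc1",
                    created_time=datetime.datetime(2018, 8, 1, 12, 0, 0),
                    content=u"hello centillion")
writer.commit()

with ix.searcher() as searcher:
    q = DateRange("created_time",
                  datetime.datetime(2018, 7, 1),
                  datetime.datetime(2018, 9, 1))
    print([hit["id"] for hit in searcher.search(q)])   # ['doc1']
```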
@@ -243,24 +253,32 @@ class Search:
             writer.delete_by_term('id',item['id'])

             # Index a plain google drive file
-            writer.add_document(
-                    id = item['id'],
-                    kind = 'gdoc',
-                    created_time = item['createdTime'],
-                    modified_time = item['modifiedTime'],
-                    indexed_time = datetime.now().replace(microsecond=0).isoformat(),
-                    title = item['name'],
-                    url = item['webViewLink'],
-                    mimetype = mimetype,
-                    owner_email = item['owners'][0]['emailAddress'],
-                    owner_name = item['owners'][0]['displayName'],
-                    repo_name='',
-                    repo_url='',
-                    github_user='',
-                    issue_title='',
-                    issue_url='',
-                    content = content
-            )
+            created_time = dateutil.parser.parse(item['createdTime'])
+            modified_time = dateutil.parser.parse(item['modifiedTime'])
+            indexed_time = datetime.now().replace(microsecond=0)
+            try:
+                writer.add_document(
+                        id = item['id'],
+                        kind = 'gdoc',
+                        created_time = created_time,
+                        modified_time = modified_time,
+                        indexed_time = indexed_time,
+                        title = item['name'],
+                        url = item['webViewLink'],
+                        mimetype = mimetype,
+                        owner_email = item['owners'][0]['emailAddress'],
+                        owner_name = item['owners'][0]['displayName'],
+                        group='',
+                        repo_name='',
+                        repo_url='',
+                        github_user='',
+                        issue_title='',
+                        issue_url='',
+                        content = content
+                )
+            except ValueError as e:
+                print(repr(e))
+                print(" > XXXXXX Failed to index Google Drive file \"%s\""%(item['name']))

         else:
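The Google Drive records report `createdTime`/`modifiedTime` as RFC 3339 strings, so the new code converts them with `dateutil.parser.parse` before handing them to the `DATETIME` fields, and wraps `add_document` in `try`/`except` because a malformed value raises `ValueError`. A small sketch of just that conversion step, using a made-up record:

```
# Sketch of the conversion step, assuming a Drive-style RFC 3339 timestamp.
# dateutil turns the string into a datetime that a DATETIME field accepts;
# a malformed string raises ValueError, which the new try/except absorbs.
import dateutil.parser

item = {"createdTime": "2018-08-01T12:34:56.000Z"}   # illustrative record

try:
    created_time = dateutil.parser.parse(item["createdTime"])
    print(created_time)        # 2018-08-01 12:34:56+00:00
except ValueError as e:
    print("could not parse timestamp: %r" % e)
```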
@@ -314,7 +332,7 @@ class Search:
                 )
                 assert output == ""
             except RuntimeError:
-                print(" > XXXXXX Failed to index document \"%s\""%(item['name']))
+                print(" > XXXXXX Failed to index Google Drive document \"%s\""%(item['name']))


             # If export was successful, read contents of markdown
@@ -342,24 +360,33 @@ class Search:
             else:
                 print(" > Creating a new record")

-                writer.add_document(
-                        id = item['id'],
-                        kind = 'gdoc',
-                        created_time = item['createdTime'],
-                        modified_time = item['modifiedTime'],
-                        indexed_time = datetime.now().replace(microsecond=0).isoformat(),
-                        title = item['name'],
-                        url = item['webViewLink'],
-                        mimetype = mimetype,
-                        owner_email = item['owners'][0]['emailAddress'],
-                        owner_name = item['owners'][0]['displayName'],
-                        repo_name='',
-                        repo_url='',
-                        github_user='',
-                        issue_title='',
-                        issue_url='',
-                        content = content
-                )
+                try:
+                    created_time = dateutil.parser.parse(item['createdTime'])
+                    modified_time = dateutil.parser.parse(item['modifiedTime'])
+                    indexed_time = datetime.now()
+                    writer.add_document(
+                            id = item['id'],
+                            kind = 'gdoc',
+                            created_time = created_time,
+                            modified_time = modified_time,
+                            indexed_time = indexed_time,
+                            title = item['name'],
+                            url = item['webViewLink'],
+                            mimetype = mimetype,
+                            owner_email = item['owners'][0]['emailAddress'],
+                            owner_name = item['owners'][0]['displayName'],
+                            group='',
+                            repo_name='',
+                            repo_url='',
+                            github_user='',
+                            issue_title='',
+                            issue_url='',
+                            content = content
+                    )
+                except ValueError as e:
+                    print(repr(e))
+                    print(" > XXXXXX Failed to index Google Drive file \"%s\""%(item['name']))

@@ -393,31 +420,36 @@ class Search:
                     issue_comment_content += comment.body.rstrip()
                     issue_comment_content += "\n"

-                # Now create the actual search index record
-                created_time = clean_timestamp(issue.created_at)
-                modified_time = clean_timestamp(issue.updated_at)
-                indexed_time = clean_timestamp(datetime.now())
+                # Now create the actual search index record.

                 # Add one document per issue thread,
                 # containing entire text of thread.
-                writer.add_document(
-                        id = issue.html_url,
-                        kind = 'issue',
-                        created_time = created_time,
-                        modified_time = modified_time,
-                        indexed_time = indexed_time,
-                        title = issue.title,
-                        url = issue.html_url,
-                        mimetype='',
-                        owner_email='',
-                        owner_name='',
-                        repo_name = repo_name,
-                        repo_url = repo_url,
-                        github_user = issue.user.login,
-                        issue_title = issue.title,
-                        issue_url = issue.html_url,
-                        content = issue_comment_content
-                )
+                created_time = issue.created_at
+                modified_time = issue.updated_at
+                indexed_time = datetime.now()
+                try:
+                    writer.add_document(
+                            id = issue.html_url,
+                            kind = 'issue',
+                            created_time = created_time,
+                            modified_time = modified_time,
+                            indexed_time = indexed_time,
+                            title = issue.title,
+                            url = issue.html_url,
+                            mimetype='',
+                            owner_email='',
+                            owner_name='',
+                            group='',
+                            repo_name = repo_name,
+                            repo_url = repo_url,
+                            github_user = issue.user.login,
+                            issue_title = issue.title,
+                            issue_url = issue.html_url,
+                            content = issue_comment_content
+                    )
+                except ValueError as e:
+                    print(repr(e))
+                    print(" > XXXXXX Failed to index Github issue \"%s\""%(issue.title))

@@ -447,7 +479,8 @@ class Search:
             print(" > XXXXXXXX Failed to find file info.")
             return

-        indexed_time = clean_timestamp(datetime.now())
+
+        indexed_time = datetime.now()

         if fext in MARKDOWN_EXTS:
             print("Indexing markdown doc %s from repo %s"%(fname,repo_name))
@@ -476,24 +509,31 @@ class Search:
             usable_url = "https://github.com/%s/blob/master/%s"%(repo_name, fpath)

             # Now create the actual search index record
-            writer.add_document(
-                    id = fsha,
-                    kind = 'markdown',
-                    created_time = '',
-                    modified_time = '',
-                    indexed_time = indexed_time,
-                    title = fname,
-                    url = usable_url,
-                    mimetype='',
-                    owner_email='',
-                    owner_name='',
-                    repo_name = repo_name,
-                    repo_url = repo_url,
-                    github_user = '',
-                    issue_title = '',
-                    issue_url = '',
-                    content = content
-            )
+            try:
+                writer.add_document(
+                        id = fsha,
+                        kind = 'markdown',
+                        created_time = None,
+                        modified_time = None,
+                        indexed_time = indexed_time,
+                        title = fname,
+                        url = usable_url,
+                        mimetype='',
+                        owner_email='',
+                        owner_name='',
+                        group='',
+                        repo_name = repo_name,
+                        repo_url = repo_url,
+                        github_user = '',
+                        issue_title = '',
+                        issue_url = '',
+                        content = content
+                )
+            except ValueError as e:
+                print(repr(e))
+                print(" > XXXXXX Failed to index Github markdown file \"%s\""%(fname))

         else:
             print("Indexing github file %s from repo %s"%(fname,repo_name))
@@ -501,24 +541,29 @@ class Search:
             key = fname+"_"+fsha

             # Now create the actual search index record
-            writer.add_document(
-                    id = key,
-                    kind = 'ghfile',
-                    created_time = '',
-                    modified_time = '',
-                    indexed_time = indexed_time,
-                    title = fname,
-                    url = repo_url,
-                    mimetype='',
-                    owner_email='',
-                    owner_name='',
-                    repo_name = repo_name,
-                    repo_url = repo_url,
-                    github_user = '',
-                    issue_title = '',
-                    issue_url = '',
-                    content = ''
-            )
+            try:
+                writer.add_document(
+                        id = key,
+                        kind = 'ghfile',
+                        created_time = None,
+                        modified_time = None,
+                        indexed_time = indexed_time,
+                        title = fname,
+                        url = repo_url,
+                        mimetype='',
+                        owner_email='',
+                        owner_name='',
+                        group='',
+                        repo_name = repo_name,
+                        repo_url = repo_url,
+                        github_user = '',
+                        issue_title = '',
+                        issue_url = '',
+                        content = ''
+                )
+            except ValueError as e:
+                print(repr(e))
+                print(" > XXXXXX Failed to index Github file \"%s\""%(fname))

@@ -532,28 +577,42 @@ class Search:
         Use a Github file API record to add a filename
         to the search index.
         """
-        indexed_time = clean_timestamp(datetime.now())
+        if 'created_time' in d.keys() and d['created_time'] is not None:
+            created_time = d['created_time']
+        else:
+            created_time = None
+
+        if 'modified_time' in d.keys() and d['modified_time'] is not None:
+            modified_time = d['modified_time']
+        else:
+            modified_time = None
+
+        indexed_time = datetime.now()

         # Now create the actual search index record
-        writer.add_document(
-                id = d['permalink'],
-                kind = 'emailthread',
-                created_time = '',
-                modified_time = '',
-                indexed_time = indexed_time,
-                title = d['subject'],
-                url = d['permalink'],
-                mimetype='',
-                owner_email='',
-                owner_name=d['original_sender'],
-                repo_name = '',
-                repo_url = '',
-                github_user = '',
-                issue_title = '',
-                issue_url = '',
-                content = d['content']
-        )
+        try:
+            writer.add_document(
+                    id = d['permalink'],
+                    kind = 'emailthread',
+                    created_time = created_time,
+                    modified_time = modified_time,
+                    indexed_time = indexed_time,
+                    title = d['subject'],
+                    url = d['permalink'],
+                    mimetype='',
+                    owner_email='',
+                    owner_name=d['original_sender'],
+                    group=d['subgroup'],
+                    repo_name = '',
+                    repo_url = '',
+                    github_user = '',
+                    issue_title = '',
+                    issue_url = '',
+                    content = d['content']
+            )
+        except ValueError as e:
+            print(repr(e))
+            print(" > XXXXXX Failed to index Groups.io thread \"%s\""%(d['subject']))

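An aside on the guard introduced for email threads above (check that the key exists and its value is not `None` before using it): the same result can be written with `dict.get`. This is only a sketch of the equivalence, not something the diff does.

```
# Aside (not in the diff): dict.get returns None both when the key is
# missing and when it is present with a None value, which matches the
# "in d.keys() and is not None" guard above.
d = {"permalink": "thread-1", "created_time": None}   # illustrative record

created_time = d.get("created_time")    # None
modified_time = d.get("modified_time")  # None (key absent)
print(created_time, modified_time)
```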
@@ -631,10 +690,10 @@ class Search:
                 full_items[f['id']] = f

             ## Shorter:
-            #break
-            # Longer:
-            if nextPageToken is None:
-                break
+            break
+            ## Longer:
+            #if nextPageToken is None:
+            #    break


         writer = self.ix.writer()
@@ -642,34 +701,41 @@ class Search:
         temp_dir = tempfile.mkdtemp(dir=os.getcwd())
         print("Temporary directory: %s"%(temp_dir))

-        # Drop any id in indexed_ids
-        # not in remote_ids
-        drop_ids = indexed_ids - remote_ids
-        for drop_id in drop_ids:
-            writer.delete_by_term('id',drop_id)
-
-        # Update any id in indexed_ids
-        # and in remote_ids
-        update_ids = indexed_ids & remote_ids
-        for update_id in update_ids:
-            # cop out
-            writer.delete_by_term('id',update_id)
-            item = full_items[update_id]
-            self.add_drive_file(writer, item, temp_dir, config, update=True)
-            count += 1
-
-        # Add any id not in indexed_ids
-        # and in remote_ids
-        add_ids = remote_ids - indexed_ids
-        for add_id in add_ids:
-            item = full_items[add_id]
-            self.add_drive_file(writer, item, temp_dir, config, update=False)
-            count += 1
+        try:
+
+            # Drop any id in indexed_ids
+            # not in remote_ids
+            drop_ids = indexed_ids - remote_ids
+            for drop_id in drop_ids:
+                writer.delete_by_term('id',drop_id)
+
+            # Update any id in indexed_ids
+            # and in remote_ids
+            update_ids = indexed_ids & remote_ids
+            for update_id in update_ids:
+                # cop out
+                writer.delete_by_term('id',update_id)
+                item = full_items[update_id]
+                self.add_drive_file(writer, item, temp_dir, config, update=True)
+                count += 1
+
+            # Add any id not in indexed_ids
+            # and in remote_ids
+            add_ids = remote_ids - indexed_ids
+            for add_id in add_ids:
+                item = full_items[add_id]
+                self.add_drive_file(writer, item, temp_dir, config, update=False)
+                count += 1
+
+        except Exception as e:
+            print("ERROR: While adding Google Drive files to search index")
+            print("-"*40)
+            print(repr(e))
+            print("-"*40)
+            print("Continuing...")
+            pass

         print("Cleaning temporary directory: %s"%(temp_dir))
         subprocess.call(['rm','-fr',temp_dir])
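The synchronization loop that the hunk above wraps in `try`/`except` relies on plain set algebra over document ids: anything indexed but no longer remote is dropped, anything in both is re-indexed, and anything only remote is added. A tiny sketch with made-up ids:

```
# Sketch of the three-way partition used above (ids are illustrative).
indexed_ids = {"a", "b", "c"}          # ids already in the search index
remote_ids  = {"b", "c", "d"}          # ids currently in Google Drive

drop_ids   = indexed_ids - remote_ids  # deleted remotely -> remove ("a")
update_ids = indexed_ids & remote_ids  # in both -> delete + re-add ("b", "c")
add_ids    = remote_ids - indexed_ids  # new remotely -> add ("d")

print(sorted(drop_ids), sorted(update_ids), sorted(add_ids))
# ['a'] ['b', 'c'] ['d']
```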
@@ -1074,7 +1140,7 @@ class Search:
         elif doctype=='issue':
             item_keys = ['title','repo_name','repo_url','url','created_time','modified_time']
         elif doctype=='emailthread':
-            item_keys = ['title','owner_name','url']
+            item_keys = ['title','owner_name','url','created_time','modified_time']
         elif doctype=='ghfile':
             item_keys = ['title','repo_name','repo_url','url']
         elif doctype=='markdown':
@@ -1091,11 +1157,7 @@ class Search:
         for r in results:
             d = {}
             for k in item_keys:
-                if k=='created_time' or k=='modified_time':
-                    #d[k] = r[k]
-                    d[k] = dateutil.parser.parse(r[k]).strftime("%Y-%m-%d")
-                else:
-                    d[k] = r[k]
+                d[k] = r[k]
             json_results.append(d)

         return json_results
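The formatting branch removed above existed because timestamps used to be stored as strings and had to be re-parsed for display. Assuming the stored fields are now `DATETIME` (as in the schema hunk), Whoosh hands back `datetime` objects, so the plain `d[k] = r[k]` is enough and formatting, if wanted, is a single `strftime` call. A sketch under that assumption:

```
# Sketch, assuming DATETIME-backed hits: the stored value is already a
# datetime object, so no dateutil re-parse is needed before display.
import datetime

r = {"created_time": datetime.datetime(2018, 8, 1, 12, 0)}  # stands in for a Whoosh hit

d = {"created_time": r["created_time"]}                     # what the new loop does
print(d["created_time"].strftime("%Y-%m-%d"))               # 2018-08-01
```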
@@ -1108,7 +1170,9 @@ class Search:
         query_string = " ".join(query_list)
         query = None
         if ":" in query_string:
-            query = QueryParser("content", self.schema).parse(query_string)
+            query = QueryParser("content", self.schema)
+            query.add_plugin(DateParserPlugin(free=True))
+            query = query.parse(query_string)
         elif len(fields) == 1 and fields[0] == "filename":
             pass
         elif len(fields) == 2:
@@ -1116,9 +1180,12 @@ class Search:
         else:
             # If the user does not specify a field,
             # these are the fields that are actually searched
-            fields = ['title', 'content','owner_name','owner_email','url']
+            fields = ['title', 'content','owner_name','owner_email','url','created_date','modified_date']
         if not query:
-            query = MultifieldParser(fields, schema=self.ix.schema).parse(query_string)
+            query = MultifieldParser(fields, schema=self.ix.schema)
+            query.add_plugin(DateParserPlugin(free=True))
+            query = query.parse(query_string)
+            #query = MultifieldParser(fields, schema=self.ix.schema).parse(query_string)
         parsed_query = "%s" % query
         print("query: %s" % parsed_query)
         results = searcher.search(query, terms=False, scored=True, groupedby="kind")
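Both query paths now attach `DateParserPlugin(free=True)` before parsing, which is what lets a query string contain date terms against the new `DATETIME` fields. A standalone sketch with an illustrative schema, not centillion's:

```
# Sketch of what the DateParserPlugin changes enable: a date term in the
# query string is parsed into a date-range query on a DATETIME field.
from whoosh import fields
from whoosh.qparser import QueryParser
from whoosh.qparser.dateparse import DateParserPlugin

schema = fields.Schema(content=fields.TEXT,
                       created_time=fields.DATETIME(stored=True))

parser = QueryParser("content", schema)
parser.add_plugin(DateParserPlugin(free=True))

# The date term becomes a range over that day on created_time,
# combined with a normal content term.
print(parser.parse(u"created_time:20180801 whoosh"))
```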
@@ -1,5 +1,7 @@
 import requests, os, re
 from bs4 import BeautifulSoup
+import dateutil.parser
+import datetime

 class GroupsIOException(Exception):
     pass
@@ -64,7 +66,7 @@ class GroupsIOArchivesCrawler(object):

             ## Short circuit
             ## for debugging purposes
-            #break
+            break

         return subgroups

@@ -251,7 +253,7 @@ class GroupsIOArchivesCrawler(object):
         subject = soup.find('title').text

         # Extract information for the schema:
-        # - permalink for thread (done)
+        # - permalink for thread (done above)
         # - subject/title (done)
         # - original sender email/name (done)
         # - content (done)
@@ -266,11 +268,35 @@ class GroupsIOArchivesCrawler(object):
                 pass
             else:
                 # found an email!
-                # this is a maze, thanks groups.io
+                # this is a maze, not amazing.
+                # thanks groups.io!
                 td = tr.find('td')
-                divrow = td.find('div',{'class':'row'}).find('div',{'class':'pull-left'})
+
+                sender_divrow = td.find('div',{'class':'row'})
+                sender_divrow = sender_divrow.find('div',{'class':'pull-left'})
                 if (i+1)==1:
-                    original_sender = divrow.text.strip()
+                    original_sender = sender_divrow.text.strip()
+
+                date_divrow = td.find('div',{'class':'row'})
+                date_divrow = date_divrow.find('div',{'class':'pull-right'})
+                date_divrow = date_divrow.find('font',{'class':'text-muted'})
+                date_divrow = date_divrow.find('script').text
+                try:
+                    time_seconds = re.search(' [0-9]{1,} ',date_divrow).group(0)
+                    time_seconds = time_seconds.strip()
+                    # Thanks groups.io for the weird date formatting
+                    time_seconds = time_seconds[:10]
+                    mmicro_seconds = time_seconds[10:]
+                    if (i+1)==1:
+                        created_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
+                        modified_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
+                    else:
+                        modified_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
+
+                except AttributeError:
+                    created_time = None
+                    modified_time = None

                 for div in td.find_all('div'):
                     if div.has_attr('id'):
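The new crawler code digs the thread timestamp out of an inline `<script>` on the groups.io page: it greps for a run of digits, keeps the leading ten digits (epoch seconds; the embedded value includes milliseconds), and converts with `utcfromtimestamp`. A sketch with a fabricated script snippet (the HTML below is not real groups.io markup):

```
# Sketch of the timestamp recovery; the script text is fabricated for
# illustration and simply embeds an epoch value in milliseconds.
import re
import datetime

script_text = "document.write(formatDate( 1533081600000 ));"   # hypothetical

match = re.search(' [0-9]{1,} ', script_text)
epoch = match.group(0).strip()[:10]          # keep seconds, drop milliseconds
print(datetime.datetime.utcfromtimestamp(int(epoch)))
# 2018-08-01 00:00:00
```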
@@ -299,7 +325,10 @@ class GroupsIOArchivesCrawler(object):

         thread = {
                 'permalink' : permalink,
+                'created_time' : created_time,
+                'modified_time' : modified_time,
                 'subject' : subject,
+                'subgroup' : subgroup_name,
                 'original_sender' : original_sender,
                 'content' : full_content
         }
@@ -324,11 +353,13 @@ class GroupsIOArchivesCrawler(object):

         results = []
         for row in rows:
-            # We don't care about anything except title and ugly link
+            # This is where we extract
+            # a list of thread titles
+            # and corresponding links.
             subject = row.find('span',{'class':'subject'})
             title = subject.get_text()
             link = row.find('a')['href']
-            #print(title)
             results.append((title,link))

         return results
@@ -1,181 +0,0 @@
# Centillion Quality Engineering Plan

Table of Contents
-------

* [Centillion Quality Engineering Plan](#centillion-quality-engineering-plan)
* [Summary](#summary)
* [Tracking Bugs and Issues](#tracking-bugs-and-issues)
* [Branches, Versioning, and Git Workflow](#branches-versioning-and-git-workflow)
* [Communication and Mailing Lists](#communication-and-mailing-lists)
* [Checklists](#checklists)
* [Documentation](#documentation)
* [Configuration Management Tools](#configuration-management-tools)
* [Tests](#tests)
* [Code Reviews](#code-reviews)
* [Formal Release Process](#formal-release-process)
* [Continual Process Improvement](#continual-process-improvement)

Summary
-------

This document contains a quality engineering plan for centillion, the Data Commons search engine.

Tracking Bugs and Issues
------------------------

We utilize the [issues section](https://github.com/dcppc/centillion/issues) of the centillion repository to keep track of bugs and feature requests.

Branches, Versioning, and Git Workflow
--------------------------------------

All code is kept under version control in the [dcppc/centillion](https://github.com/dcppc/centillion) Github repository.

**Primary Git Branches:**

We utilize a git branch pattern that has two primary branches: a development branch and a stable branch.

- The primary **development branch** is `dcppc` and is actively developed and deployed to <https://betasearch.nihdatacommons.us>.

- The primary **stable branch** is `releases/v1` and is stable and deployed to <https://search.nihdatacommons.us>.

All tagged versions of Centillion exist on the stable branch. Only tagged versions of centillion are run on <https://search.nihdatacommons.us>.

**Other Branches:**

Features are developed by creating a new branch from `dcppc`, working on the feature, and opening a pull request. When the pull request is approved, it can be merged into the `dcppc` branch.

When features have accumulated and a new version is ready, a pre-release branch is made to prepare for the release. When the pre-release branch is ready, it is merged into the stable branch in a single merge commit and a new version of centillion is tagged. The new version is deployed on <https://search.nihdatacommons.us>.

Commits to fix bugs (hotfixes) may need to be applied to both the stable and development branches. In this case, a hotfix branch should be created from the head commit of the stable branch, and the appropriate changes should be made on that branch. One pull request should be opened to merge the hotfix into the release branch, and a second pull request should be opened to merge the hotfix into the development branch. Once the hotfix is merged into the stable branch, a new version should be tagged.

Communication and Mailing Lists
-------------------------------

- No mailing list currently exists for centillion.

- Github issues are the primary form of communication about development of centillion. This is the best method for communicating bug reports or detailed information.

- The Send Feedback button on the centillion page is the primary way of getting quick feedback from users about the search engine.

- The [\#centillion](https://nih-dcppc.slack.com/messages/CCD64QD6G) Slack channel in the DCPPC slack workspace is the best place for conversations about centillion (providing feedback, answering quick questions, etc.).

Checklists
----------

We plan to utilize the Wiki feature of the Github repository to develop checklists:

- Checklist for releases
- Checklist for deployment of https://search.nihdatacommons.us (nginx, etc.)

Documentation
-------------

The documentation is a pile of markdown documents, turned into a static site using mkdocs.

Configuration Management Tools
------------------------------

We do not currently utilize any configuration management software, because centillion is not packaged as an importable Python module.

Packaging centillion is a future goal that is closely related to the need to improve and modularize the internal search schema/document type abstraction. These improvements would allow the types of collections being indexed to be separated from "core centillion", and core centillion would then be packaged.

Tests
-----

See (ref) for a full test plan with more detail.

Summary of test plan:

- Implement tests for the four major pages/components:
    - Login/authentication
    - Search
    - Master List
    - Control Panel
- Test authentication with two bot accounts (yammasnake and florence python)
- Separate frontend and backend tests
- Add a test flag in the flask config file to change the backend behavior of the server

Code Reviews
------------

CI tests will be implemented for all pull requests.

Pull requests to the **stable branch** have the following checks in place:

- PRs to the stable branch require at least 1 PR review
- PRs to the stable branch must pass CI tests

Pull requests to the **development branch** have the following checks in place:

- PRs to the development branch must pass CI tests

Formal Release Process
----------------------

In order to ensure a stable, consistent product, we utilize the branching pattern described above to implement new features in the development branch and test them out on <https://betasearch.nihdatacommons.us>.

Once features and bug fixes have been tested and reviewed internally, they are ready to be deployed. A new pre-release branch is created from the development branch. The pre-release branch has a feature freeze in place. Changes are made to the pre-release branch to prepare it for the next major version release.

When the pre-release branch is finished, it is merged into the stable branch. The head commit of the stable branch is tagged with the latest release number.

Finally, the new version is deployed on <https://search.nihdatacommons.us>.

Continual Process Improvement
-----------------------------

We will utilize the centillion wiki on Github to keep track of repeated processes and opportunities for improvement. Feedback and ideas for process improvement can also be submitted via Github issues.
tests/Readme.md (196 lines removed)
@@ -1,196 +0,0 @@
Centillion Tests
================

Table of Contents
------------------

* [Centillion Tests](#centillion-tests)
* [Test Plan](#test-plan)
    * [Local Tests](#local-tests)
    * [Short Tests](#short-tests)
    * [Long Tests](#long-tests)
* [Credentials](#credentials)
* [Detailed Description of Tests](#detailed-description-of-tests)
    * [Authentication Layer Tests](#authentication-layer-tests)
    * [Search Function Tests](#search-function-tests)
    * [Master List Endpoint Tests](#master-list-endpoint-tests)
    * [Control Panel Endpoint Tests](#control-panel-endpoint-tests)
    * [Continuous Integration Plan](#continuous-integration-plan)
* [Procedure/Checklist](#procedurechecklist)

Test Plan
---------

Related: <https://github.com/dcppc/centillion/issues/82>

The test suite for centillion needs to check each of the major components of centillion, as well as check the authentication mechanism using multiple login credentials.

We implement the following checks:

1. Check authentication mechanism(s) (yammasnake and florence python)
2. Check search function
3. Check master list endpoint
4. Check control panel endpoint
5. Check update search index endpoints

The tests are written such that the back end and front end are tested separately.

We also need different tiers of tests, so we don't max out API calls by making lots of commits to multiple PRs. We have three tiers of tests:

* Local tests - quick tests for CI, no API calls
* Short tests - tests using dummy API accounts
* Long tests - tests using DCPPC API accounts

### Local Tests

Local tests can be run locally without any interaction with APIs. These will still utilize centillion's search schema, but will load the search index with fake documents rather than fetching them from an API.

Uncle Archie, which runs CI tests, runs local tests only (unless you request it to run a short test or long test).

### Short Tests

Short tests utilize credentials for bot accounts that have intentionally been set up to have a "known" corpus of test documents. These provide unit-style tests for centillion - are the mechanics of indexing a particular type of document from a particular API working?

### Long Tests

Long tests index the real deal, utilizing the credentials used in the final production centillion. This test takes longer but is more likely to catch corner cases specific to the DCPPC documents.

Credentials
-----------

Running tests on centillion requires multiple sets of credentials. Let's lay out what is needed:

- The Flask app requires a token/secret token API key pair to allow users to authenticate through Github and confirm they are members of the DCPPC organization. This OAuth application is owned by Charles Reid (@charlesreid1).

- The search index needs a Github access token so that it can interface with the Github API to index files and issues. This access token is specified (along with other secrets) in the Flask configuration file. The access key comes from Florence Python (@fp9695253).

- The search index also requires a Google Drive API access token. This must be an access token for a user who has authenticated with the Centillion Google Drive OAuth application. This access token comes from <mailroom@nihdatacommons.com>.

- The search index requires API credentials for any other APIs associated with other document collections (Groups.io, Hypothesis, Disqus).

- The backend test requires the credentials provided to Flask.

- The frontend test (Selenium) needs two Github username/passwords: one for Florence Python (@fp9695253) and one for Yamma Snake (@yammasnake). These are required to simulate the user authenticating with Github through the browser.

- The frontend test credentials are a special case.
    - The frontend tests expect credentials to come from environment variables.
    - These environment variables get passed in at test time.
    - Tests are all run on [Uncle Archie](https://github.com/dcppc/uncle-archie).
    - Uncle Archie already has to protect a confidential config file containing Github credentials, so add additional credentials for frontend tests there.
    - Logical separation: these credentials are not needed to *operate* centillion, these credentials are needed to *test* centillion.
    - Uncle Archie already requires github credentials and already protects sensitive info.
    - Google Drive requiring its own credentials file on disk is a pain.

In summary: tests use the `config_flask.py` and `config_centillion.py` files to provide centillion with the API keys it needs and to instruct it on what to index. The credentials and config files control what the search index will actually index. The Uncle Archie CI tester config file contains the credentials needed to run frontend tests (which check the login/authentication layer).
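As a sketch of the "credentials from environment variables" point above (the variable names here are hypothetical, not centillion's):

```
# Sketch only: hypothetical environment variable names, illustrating how
# frontend (Selenium) test credentials can arrive via the environment at
# test time rather than living in a config file.
import os

github_user = os.environ["TEST_GITHUB_USER"]          # hypothetical name
github_password = os.environ["TEST_GITHUB_PASSWORD"]  # hypothetical name

print("running frontend tests as %s" % github_user)
```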
Detailed Description of Tests
-----------------------------

### Authentication Layer Tests

Frontend tests run as Florence Python:

- Can we log in via github and reach centillion
- Can we reach the control panel

Frontend tests run as Yamma Snake (DCPPC member):

- Can we log in via github and reach centillion
- Can we reach the control panel

### Search Function Tests

Frontend tests:

- Can we enter something into the search box and submit
- Can we sort the results
- Do the results look okay

Backend tests:

- Load the search index and run a query using the whoosh API

### Master List Endpoint Tests

Frontend tests:

- Can we get to the master list page
- Can we sort the results
- Do the results look okay

Backend tests:

- Check the output of the `/list` API endpoint

### Control Panel Endpoint Tests

Frontend tests:

- Can we get to the control panel page
- Can we click the button to trigger an indexing event

Backend tests:

- Trigger a re-index of the search index from the backend.

### Continuous Integration Plan

Tests are automatically run using Uncle Archie for continuous integration and deployment.

Procedure/Checklist
-------------------

Pre-release procedure:

- prepare to run all tests
- run short tests
- deploy to beta
- run long tests
- test out