13 Commits

SHA1 Message Date
ab76226b0c Merge pull request #90 from dcppc/add-dates-and-subgroups-to-emails
Add dates and subgroups to emails
2018-08-24 00:07:40 -07:00
a4ebef6e6f extract date and time from email threads pages 2018-08-24 00:04:35 -07:00
bad50efa9b add groups and tags to schema; update how we determine timestamps; handle exceptions when we add the document to the writer, rather than elsewhere 2018-08-24 00:03:23 -07:00
629fc063db move where exception is caught (exception was also incorrect.) 2018-08-24 00:01:26 -07:00
3b0baa21de switched created_time, modified_time, indexed_time over to DATETIME. added DateParserPlugin to query QueryParser. added time fields to those being searched by default. tests do not seem to be working. 2018-08-23 19:01:40 -07:00
6bfadef829 Merge pull request #73 from dcppc/feedback-floater
Add a feedback mechanism
2018-08-21 11:33:34 -07:00
c38683ae9f (resolve conflict) Merge branch 'dcppc' into feedback-floater
* dcppc:
  add centillion config back. no sensitive info.
  add option to set port at runtime with CENTILLION_PORT environment variable
  add a bit o whitespace
2018-08-21 11:32:59 -07:00
3f5349a5a6 Merge pull request #80 from dcppc/add-centillion-config-back
add centillion config back. no sensitive info.
2018-08-21 11:16:21 -07:00
f88cf6ecad add centillion config back. no sensitive info. 2018-08-21 11:15:29 -07:00
ec54292a4b Merge pull request #79 from dcppc/add-port-env-var
add option to set port at runtime
2018-08-21 11:12:17 -07:00
296132d356 add option to set port at runtime with CENTILLION_PORT environment variable 2018-08-21 11:09:46 -07:00
0bc40ba323 Merge pull request #76 from dcppc/add-whitespace
add a bit o whitespace
2018-08-21 10:33:20 -07:00
8143e214c2 add a bit o whitespace 2018-08-21 10:06:16 -07:00
6 changed files with 312 additions and 180 deletions

.gitignore (1 change)

@@ -1,5 +1,4 @@
 feedback_database.json
-config_centillion.py
 config_flask.py
 vp
 credentials.json


@@ -342,5 +342,10 @@ def store_search(query, fields):
 if __name__ == '__main__':
     # if running local instance, set to true
     os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = 'true'
-    app.run(host="0.0.0.0",port=5000)
+    port = os.environ.get('CENTILLION_PORT','')
+    if port=='':
+        port = 5000
+    else:
+        port = int(port)
+    app.run(host="0.0.0.0",port=port)


@@ -21,6 +21,8 @@ import dateutil.parser
 from whoosh.qparser import MultifieldParser, QueryParser
 from whoosh.analysis import StemmingAnalyzer
+from whoosh.qparser.dateparse import DateParserPlugin
+from whoosh import fields, index
 """
@@ -180,30 +182,38 @@ class Search:
         # is defined.
         schema = Schema(
-            id = ID(stored=True, unique=True),
-            kind = ID(stored=True),
+            id = fields.ID(stored=True, unique=True),
+            kind = fields.ID(stored=True),
 
-            created_time = ID(stored=True),
-            modified_time = ID(stored=True),
-            indexed_time = ID(stored=True),
+            created_time = fields.DATETIME(stored=True),
+            modified_time = fields.DATETIME(stored=True),
+            indexed_time = fields.DATETIME(stored=True),
 
-            title = TEXT(stored=True, field_boost=100.0),
-            url = ID(stored=True, unique=True),
+            title = fields.TEXT(stored=True, field_boost=100.0),
+            url = fields.ID(stored=True),
 
-            mimetype=ID(stored=True),
-            owner_email=ID(stored=True),
-            owner_name=TEXT(stored=True),
+            mimetype = fields.TEXT(stored=True),
+            owner_email = fields.ID(stored=True),
+            owner_name = fields.TEXT(stored=True),
 
-            repo_name=TEXT(stored=True),
-            repo_url=ID(stored=True),
-            github_user=TEXT(stored=True),
+            # mainly for email threads, groups.io, hypothesis
+            group = fields.ID(stored=True),
+            repo_name = fields.TEXT(stored=True),
+            repo_url = fields.ID(stored=True),
+            github_user = fields.TEXT(stored=True),
+            tags = fields.KEYWORD(commas=True,
+                                  stored=True,
+                                  lowercase=True),
 
             # comments only
-            issue_title=TEXT(stored=True, field_boost=100.0),
-            issue_url=ID(stored=True),
+            issue_title = fields.TEXT(stored=True, field_boost=100.0),
+            issue_url = fields.ID(stored=True),
 
-            content=TEXT(stored=True, analyzer=stemming_analyzer)
+            content = fields.TEXT(stored=True, analyzer=stemming_analyzer)
         )
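The move from bare ID fields to fields.DATETIME matters at index time: Whoosh DATETIME fields expect Python datetime objects, not ISO strings. A minimal, self-contained sketch of indexing against a schema shaped like the one above (the directory name and values are illustrative):

    import os
    from datetime import datetime
    from whoosh import fields, index

    schema = fields.Schema(
        id = fields.ID(stored=True, unique=True),
        created_time = fields.DATETIME(stored=True),
        tags = fields.KEYWORD(commas=True, stored=True, lowercase=True),
    )
    os.makedirs("index_dir", exist_ok=True)    # hypothetical index location
    ix = index.create_in("index_dir", schema)
    writer = ix.writer()
    writer.add_document(
        id = u"abc123",
        created_time = datetime(2018, 8, 23, 19, 1, 40),   # datetime object, not a string
        tags = u"email,groupsio",
    )
    writer.commit()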
@@ -243,17 +253,22 @@ class Search:
     writer.delete_by_term('id',item['id'])
 
 # Index a plain google drive file
+created_time = dateutil.parser.parse(item['createdTime'])
+modified_time = dateutil.parser.parse(item['modifiedTime'])
+indexed_time = datetime.now().replace(microsecond=0)
+try:
     writer.add_document(
         id = item['id'],
         kind = 'gdoc',
-        created_time = item['createdTime'],
-        modified_time = item['modifiedTime'],
-        indexed_time = datetime.now().replace(microsecond=0).isoformat(),
+        created_time = created_time,
+        modified_time = modified_time,
+        indexed_time = indexed_time,
         title = item['name'],
         url = item['webViewLink'],
         mimetype = mimetype,
         owner_email = item['owners'][0]['emailAddress'],
         owner_name = item['owners'][0]['displayName'],
+        group='',
         repo_name='',
         repo_url='',
         github_user='',
@@ -261,6 +276,9 @@ class Search:
         issue_url='',
         content = content
     )
+except ValueError as e:
+    print(repr(e))
+    print(" > XXXXXX Failed to index Google Drive file \"%s\""%(item['name']))
 else:
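Wrapping each writer.add_document call in try/except is what the commit message means by handling exceptions "when we add the document to the writer, rather than elsewhere": one malformed record (for example, a string handed to a DATETIME field) now skips a single document instead of aborting the whole indexing run. The pattern in miniature, as a sketch rather than the committed code:

    try:
        writer.add_document(id=doc_id, created_time=created_time)
    except ValueError as e:
        print(repr(e))
        print(" > Skipping malformed document %s"%(doc_id))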
@@ -314,7 +332,7 @@ class Search:
     )
     assert output == ""
 except RuntimeError:
-    print(" > XXXXXX Failed to index document \"%s\""%(item['name']))
+    print(" > XXXXXX Failed to index Google Drive document \"%s\""%(item['name']))
 
 # If export was successful, read contents of markdown
@@ -342,17 +360,22 @@ class Search:
 else:
     print(" > Creating a new record")
 
+    try:
+        created_time = dateutil.parser.parse(item['createdTime'])
+        modified_time = dateutil.parser.parse(item['modifiedTime'])
+        indexed_time = datetime.now()
         writer.add_document(
             id = item['id'],
             kind = 'gdoc',
-            created_time = item['createdTime'],
-            modified_time = item['modifiedTime'],
-            indexed_time = datetime.now().replace(microsecond=0).isoformat(),
+            created_time = created_time,
+            modified_time = modified_time,
+            indexed_time = indexed_time,
             title = item['name'],
             url = item['webViewLink'],
             mimetype = mimetype,
             owner_email = item['owners'][0]['emailAddress'],
             owner_name = item['owners'][0]['displayName'],
+            group='',
             repo_name='',
             repo_url='',
             github_user='',
@@ -360,6 +383,10 @@ class Search:
             issue_url='',
             content = content
         )
+    except ValueError as e:
+        print(repr(e))
+        print(" > XXXXXX Failed to index Google Drive file \"%s\""%(item['name']))
@@ -393,13 +420,14 @@ class Search:
     issue_comment_content += comment.body.rstrip()
     issue_comment_content += "\n"
 
-# Now create the actual search index record
-created_time = clean_timestamp(issue.created_at)
-modified_time = clean_timestamp(issue.updated_at)
-indexed_time = clean_timestamp(datetime.now())
+# Now create the actual search index record.
+# Add one document per issue thread,
+# containing entire text of thread.
+created_time = issue.created_at
+modified_time = issue.updated_at
+indexed_time = datetime.now()
+try:
     writer.add_document(
         id = issue.html_url,
         kind = 'issue',
@@ -411,6 +439,7 @@ class Search:
         mimetype='',
         owner_email='',
         owner_name='',
+        group='',
         repo_name = repo_name,
         repo_url = repo_url,
         github_user = issue.user.login,
@@ -418,6 +447,9 @@ class Search:
         issue_url = issue.html_url,
         content = issue_comment_content
     )
+except ValueError as e:
+    print(repr(e))
+    print(" > XXXXXX Failed to index Github issue \"%s\""%(issue.title))
@@ -447,7 +479,8 @@ class Search:
     print(" > XXXXXXXX Failed to find file info.")
     return
 
-indexed_time = clean_timestamp(datetime.now())
+indexed_time = datetime.now()
 
 if fext in MARKDOWN_EXTS:
     print("Indexing markdown doc %s from repo %s"%(fname,repo_name))
@@ -476,17 +509,19 @@ class Search:
     usable_url = "https://github.com/%s/blob/master/%s"%(repo_name, fpath)
 
     # Now create the actual search index record
+    try:
         writer.add_document(
             id = fsha,
             kind = 'markdown',
-            created_time = '',
-            modified_time = '',
+            created_time = None,
+            modified_time = None,
             indexed_time = indexed_time,
             title = fname,
             url = usable_url,
             mimetype='',
             owner_email='',
             owner_name='',
+            group='',
             repo_name = repo_name,
             repo_url = repo_url,
             github_user = '',
@@ -494,6 +529,11 @@ class Search:
             issue_url = '',
             content = content
         )
+    except ValueError as e:
+        print(repr(e))
+        print(" > XXXXXX Failed to index Github markdown file \"%s\""%(fname))
 else:
     print("Indexing github file %s from repo %s"%(fname,repo_name))
@@ -501,17 +541,19 @@ class Search:
     key = fname+"_"+fsha
 
     # Now create the actual search index record
+    try:
         writer.add_document(
             id = key,
             kind = 'ghfile',
-            created_time = '',
-            modified_time = '',
+            created_time = None,
+            modified_time = None,
             indexed_time = indexed_time,
             title = fname,
             url = repo_url,
             mimetype='',
             owner_email='',
             owner_name='',
+            group='',
             repo_name = repo_name,
             repo_url = repo_url,
             github_user = '',
@@ -519,6 +561,9 @@ class Search:
             issue_url = '',
             content = ''
         )
+    except ValueError as e:
+        print(repr(e))
+        print(" > XXXXXX Failed to index Github file \"%s\""%(fname))
@@ -532,20 +577,32 @@ class Search:
     Use a Github file API record to add a filename
     to the search index.
     """
-    indexed_time = clean_timestamp(datetime.now())
+    if 'created_time' in d.keys() and d['created_time'] is not None:
+        created_time = d['created_time']
+    else:
+        created_time = None
+
+    if 'modified_time' in d.keys() and d['modified_time'] is not None:
+        modified_time = d['modified_time']
+    else:
+        modified_time = None
+
+    indexed_time = datetime.now()
 
     # Now create the actual search index record
+    try:
         writer.add_document(
             id = d['permalink'],
             kind = 'emailthread',
-            created_time = '',
-            modified_time = '',
+            created_time = created_time,
+            modified_time = modified_time,
             indexed_time = indexed_time,
             title = d['subject'],
             url = d['permalink'],
             mimetype='',
             owner_email='',
             owner_name=d['original_sender'],
+            group=d['subgroup'],
             repo_name = '',
             repo_url = '',
             github_user = '',
@@ -553,7 +610,9 @@ class Search:
             issue_url = '',
             content = d['content']
         )
+    except ValueError as e:
+        print(repr(e))
+        print(" > XXXXXX Failed to index Groups.io thread \"%s\""%(d['subject']))
@@ -631,10 +690,10 @@ class Search:
     full_items[f['id']] = f
 
 ## Shorter:
 #break
 
-# Longer:
-if nextPageToken is None:
-    break
+## Longer:
+#if nextPageToken is None:
+#    break
 
 writer = self.ix.writer()
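For reference, the usual shape of a Google Drive v3 listing loop, where nextPageToken pages through results until the API stops returning one; a sketch, with 'service' assumed to be an authenticated googleapiclient object:

    full_items = {}
    nextPageToken = None
    while True:
        resp = service.files().list(
                pageSize=100,
                fields="nextPageToken, files(id, name, mimeType)",
                pageToken=nextPageToken,
        ).execute()
        for f in resp.get('files', []):
            full_items[f['id']] = f
        nextPageToken = resp.get('nextPageToken')
        if nextPageToken is None:
            break

With the break commented out as in this commit, loop termination is presumably left to whatever surrounds the fragment shown.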
@@ -642,7 +701,7 @@ class Search:
 temp_dir = tempfile.mkdtemp(dir=os.getcwd())
 print("Temporary directory: %s"%(temp_dir))
 
 try:
     # Drop any id in indexed_ids
     # not in remote_ids
@@ -670,6 +729,13 @@ class Search:
     self.add_drive_file(writer, item, temp_dir, config, update=False)
     count += 1
 except Exception as e:
+    print("ERROR: While adding Google Drive files to search index")
+    print("-"*40)
+    print(repr(e))
+    print("-"*40)
+    print("Continuing...")
     pass
 
 print("Cleaning temporary directory: %s"%(temp_dir))
 subprocess.call(['rm','-fr',temp_dir])
@@ -1074,7 +1140,7 @@ class Search:
 elif doctype=='issue':
     item_keys = ['title','repo_name','repo_url','url','created_time','modified_time']
 elif doctype=='emailthread':
-    item_keys = ['title','owner_name','url']
+    item_keys = ['title','owner_name','url','created_time','modified_time']
 elif doctype=='ghfile':
     item_keys = ['title','repo_name','repo_url','url']
 elif doctype=='markdown':
@@ -1091,10 +1157,6 @@ class Search:
 for r in results:
     d = {}
     for k in item_keys:
-        if k=='created_time' or k=='modified_time':
-            #d[k] = r[k]
-            d[k] = dateutil.parser.parse(r[k]).strftime("%Y-%m-%d")
-        else:
-            d[k] = r[k]
+        d[k] = r[k]
     json_results.append(d)
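With DATETIME fields, stored values come back from Whoosh hits as datetime objects rather than strings, which is why the dateutil round-trip above could be dropped. If the JSON feed needs plain strings, one option (a sketch, not the committed code) is to let the encoder coerce them:

    import json
    json.dumps(json_results, default=str)    # datetime values become ISO-like strings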
@@ -1108,7 +1170,9 @@ class Search:
 query_string = " ".join(query_list)
 query = None
 if ":" in query_string:
-    query = QueryParser("content", self.schema).parse(query_string)
+    query = QueryParser("content", self.schema)
+    query.add_plugin(DateParserPlugin(free=True))
+    query = query.parse(query_string)
 elif len(fields) == 1 and fields[0] == "filename":
     pass
 elif len(fields) == 2:
@@ -1116,9 +1180,12 @@ class Search:
 else:
     # If the user does not specify a field,
     # these are the fields that are actually searched
-    fields = ['title', 'content','owner_name','owner_email','url']
+    fields = ['title', 'content','owner_name','owner_email','url','created_date','modified_date']
 if not query:
-    query = MultifieldParser(fields, schema=self.ix.schema).parse(query_string)
+    query = MultifieldParser(fields, schema=self.ix.schema)
+    query.add_plugin(DateParserPlugin(free=True))
+    query = query.parse(query_string)
+    #query = MultifieldParser(fields, schema=self.ix.schema).parse(query_string)
 parsed_query = "%s" % query
 print("query: %s" % parsed_query)
 results = searcher.search(query, terms=False, scored=True, groupedby="kind")
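With DateParserPlugin(free=True) attached, the parser accepts natural-language dates in the query string. A sketch of a dated search against an index like the one above, where ix is an open Whoosh index:

    from whoosh.qparser import MultifieldParser
    from whoosh.qparser.dateparse import DateParserPlugin

    parser = MultifieldParser(["title","content","created_time"], schema=ix.schema)
    parser.add_plugin(DateParserPlugin(free=True))
    query = parser.parse(u"modified_time:yesterday centillion")
    with ix.searcher() as searcher:
        results = searcher.search(query, terms=False, scored=True, groupedby="kind")

Note that the default field list added above names 'created_date' and 'modified_date', while the schema defines 'created_time' and 'modified_time'; the mismatch may be related to the commit message's note that tests do not seem to be working.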

config_centillion.py (new file, 28 additions)

@@ -0,0 +1,28 @@
+config = {
+    "repositories" : [
+        "dcppc/project-management",
+        "dcppc/nih-demo-meetings",
+        "dcppc/internal",
+        "dcppc/organize",
+        "dcppc/dcppc-bot",
+        "dcppc/full-stacks",
+        "dcppc/design-guidelines-discuss",
+        "dcppc/dcppc-deliverables",
+        "dcppc/dcppc-milestones",
+        "dcppc/crosscut-metadata",
+        "dcppc/lucky-penny",
+        "dcppc/dcppc-workshops",
+        "dcppc/metadata-matrix",
+        "dcppc/data-stewards",
+        "dcppc/dcppc-phase1-demos",
+        "dcppc/apis",
+        "dcppc/2018-june-workshop",
+        "dcppc/2018-july-workshop",
+        "dcppc/2018-august-workshop",
+        "dcppc/2018-september-workshop",
+        "dcppc/design-guidelines",
+        "dcppc/2018-may-workshop",
+        "dcppc/centillion"
+    ]
+}
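Presumably this config is consumed elsewhere in centillion along these lines (a sketch):

    from config_centillion import config

    for repo in config["repositories"]:
        owner, name = repo.split("/")    # e.g. ("dcppc", "centillion")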


@@ -1,5 +1,7 @@
 import requests, os, re
 from bs4 import BeautifulSoup
+import dateutil.parser
+import datetime
 
 class GroupsIOException(Exception):
     pass
@@ -64,7 +66,7 @@ class GroupsIOArchivesCrawler(object):
         ## Short circuit
         ## for debugging purposes
-        #break
+        break
 
     return subgroups
@@ -251,7 +253,7 @@ class GroupsIOArchivesCrawler(object):
 subject = soup.find('title').text
 
 # Extract information for the schema:
-# - permalink for thread (done)
+# - permalink for thread (done above)
 # - subject/title (done)
 # - original sender email/name (done)
 # - content (done)
@@ -266,11 +268,35 @@ class GroupsIOArchivesCrawler(object):
     pass
 else:
     # found an email!
-    # this is a maze, thanks groups.io
+    # this is a maze, not amazing.
+    # thanks groups.io!
     td = tr.find('td')
 
-    divrow = td.find('div',{'class':'row'}).find('div',{'class':'pull-left'})
+    sender_divrow = td.find('div',{'class':'row'})
+    sender_divrow = sender_divrow.find('div',{'class':'pull-left'})
     if (i+1)==1:
-        original_sender = divrow.text.strip()
+        original_sender = sender_divrow.text.strip()
 
+    date_divrow = td.find('div',{'class':'row'})
+    date_divrow = date_divrow.find('div',{'class':'pull-right'})
+    date_divrow = date_divrow.find('font',{'class':'text-muted'})
+    date_divrow = date_divrow.find('script').text
+    try:
+        time_seconds = re.search(' [0-9]{1,} ',date_divrow).group(0)
+        time_seconds = time_seconds.strip()
+        # Thanks groups.io for the weird date formatting
+        time_seconds = time_seconds[:10]
+        mmicro_seconds = time_seconds[10:]
+        if (i+1)==1:
+            created_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
+            modified_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
+        else:
+            modified_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
+    except AttributeError:
+        created_time = None
+        modified_time = None
 
     for div in td.find_all('div'):
         if div.has_attr('id'):
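Groups.io renders thread dates client-side from what appears to be a millisecond Unix epoch embedded in a script tag, which is why the code greps for a run of digits and keeps the first ten (the seconds). Note that mmicro_seconds is sliced after time_seconds has already been truncated to ten characters, so it is always empty. A standalone sketch of the extraction, with script_text standing in for the scraped script contents:

    import re, datetime

    script_text = "... 1535158900123 ..."    # illustrative payload
    m = re.search(' [0-9]{1,} ', script_text)
    if m:
        digits = m.group(0).strip()
        seconds = int(digits[:10])           # first 10 digits: Unix seconds
        ts = datetime.datetime.utcfromtimestamp(seconds)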
@@ -299,7 +325,10 @@ class GroupsIOArchivesCrawler(object):
 thread = {
     'permalink' : permalink,
+    'created_time' : created_time,
+    'modified_time' : modified_time,
     'subject' : subject,
+    'subgroup' : subgroup_name,
     'original_sender' : original_sender,
     'content' : full_content
 }
@@ -324,11 +353,13 @@ class GroupsIOArchivesCrawler(object):
 results = []
 for row in rows:
-    # We don't care about anything except title and ugly link
+    # This is where we extract
+    # a list of thread titles
+    # and corresponding links.
     subject = row.find('span',{'class':'subject'})
     title = subject.get_text()
     link = row.find('a')['href']
     #print(title)
     results.append((title,link))
 
 return results


@@ -25,6 +25,8 @@
             </div>
         </div>
         <div style="height: 20px;"><p>&nbsp;</p></div>
+        <div id="info-bars-container" class="container">
+            <div class="row">