2 Commits

Author SHA1 Message Date
4b90057664 add advanced search button to search template 2018-08-21 10:04:48 -07:00
90f49e7626 starting point: merge 'feedback-floater' branch into 'advanced-search' branch
* feedback-floater:
  add dismissable "thanks for your feedback" message to top
  improve message formatting
  add dummy function as placeholder for where we add info messages
  return better messages
  add successful post call and export to JSON db
  update todo
  move modal into its own .html file
  update todo with tasks
  fix button and smiley styles
  add /feedback post route
  feedback button successfully triggers a modal
  add page self-identifiers. add "send feedback" button. fix layouts.
2018-08-20 21:04:54 -07:00
6 changed files with 184 additions and 311 deletions

.gitignore
View File

@@ -1,4 +1,5 @@
+feedback_database.json
 config_centillion.py
 config_flask.py
 vp
 credentials.json

View File

@@ -342,10 +342,5 @@ def store_search(query, fields):
 if __name__ == '__main__':
     # if running local instance, set to true
     os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = 'true'
-    port = os.environ.get('CENTILLION_PORT','')
-    if port=='':
-        port = 5000
-    else:
-        port = int(port)
-    app.run(host="0.0.0.0",port=port)
+    app.run(host="0.0.0.0",port=5000)
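
Note: the removed block read the listen port from a CENTILLION_PORT environment variable; the merge pins it back to 5000. A minimal sketch of the removed pattern (standalone, not the exact original):

    import os

    # Read the port from CENTILLION_PORT, falling back to 5000 when the
    # variable is unset or empty; int() raises ValueError on a malformed value.
    port = int(os.environ.get('CENTILLION_PORT') or 5000)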

View File

@@ -21,8 +21,6 @@ import dateutil.parser
 from whoosh.qparser import MultifieldParser, QueryParser
 from whoosh.analysis import StemmingAnalyzer
-from whoosh.qparser.dateparse import DateParserPlugin
-from whoosh import fields, index
 """
@@ -182,38 +180,30 @@ class Search:
         # is defined.
         schema = Schema(
-                id = fields.ID(stored=True, unique=True),
-                kind = fields.ID(stored=True),
-                created_time = fields.DATETIME(stored=True),
-                modified_time = fields.DATETIME(stored=True),
-                indexed_time = fields.DATETIME(stored=True),
-                title = fields.TEXT(stored=True, field_boost=100.0),
-                url = fields.ID(stored=True),
-                mimetype = fields.TEXT(stored=True),
-                owner_email = fields.ID(stored=True),
-                owner_name = fields.TEXT(stored=True),
-                # mainly for email threads, groups.io, hypothesis
-                group = fields.ID(stored=True),
-                repo_name = fields.TEXT(stored=True),
-                repo_url = fields.ID(stored=True),
-                github_user = fields.TEXT(stored=True),
-                tags = fields.KEYWORD(commas=True,
-                                      stored=True,
-                                      lowercase=True),
+                id = ID(stored=True, unique=True),
+                kind = ID(stored=True),
+                created_time = ID(stored=True),
+                modified_time = ID(stored=True),
+                indexed_time = ID(stored=True),
+                title = TEXT(stored=True, field_boost=100.0),
+                url = ID(stored=True, unique=True),
+                mimetype=ID(stored=True),
+                owner_email=ID(stored=True),
+                owner_name=TEXT(stored=True),
+                repo_name=TEXT(stored=True),
+                repo_url=ID(stored=True),
+                github_user=TEXT(stored=True),
                 # comments only
-                issue_title = fields.TEXT(stored=True, field_boost=100.0),
-                issue_url = fields.ID(stored=True),
-                content = fields.TEXT(stored=True, analyzer=stemming_analyzer)
+                issue_title=TEXT(stored=True, field_boost=100.0),
+                issue_url=ID(stored=True),
+                content=TEXT(stored=True, analyzer=stemming_analyzer)
         )
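
Note: dropping the fields. prefix implies the module now imports the field types directly from whoosh.fields. A minimal sketch of a Whoosh schema in that style (illustrative field names, not the full centillion schema):

    from whoosh.fields import Schema, ID, TEXT
    from whoosh.index import create_in

    schema = Schema(
        id=ID(stored=True, unique=True),             # unique document key
        title=TEXT(stored=True, field_boost=100.0),  # boost title matches in ranking
        content=TEXT(stored=True),                   # main searchable body
    )
    ix = create_in("indexdir", schema)  # directory must already exist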
@@ -253,22 +243,17 @@ class Search:
             writer.delete_by_term('id',item['id'])
             # Index a plain google drive file
-            created_time = dateutil.parser.parse(item['createdTime'])
-            modified_time = dateutil.parser.parse(item['modifiedTime'])
-            indexed_time = datetime.now().replace(microsecond=0)
-            try:
             writer.add_document(
                     id = item['id'],
                     kind = 'gdoc',
-                    created_time = created_time,
-                    modified_time = modified_time,
-                    indexed_time = indexed_time,
+                    created_time = item['createdTime'],
+                    modified_time = item['modifiedTime'],
+                    indexed_time = datetime.now().replace(microsecond=0).isoformat(),
                     title = item['name'],
                     url = item['webViewLink'],
                     mimetype = mimetype,
                     owner_email = item['owners'][0]['emailAddress'],
                     owner_name = item['owners'][0]['displayName'],
-                    group='',
                     repo_name='',
                     repo_url='',
                     github_user='',
@@ -276,9 +261,6 @@ class Search:
                     issue_url='',
                     content = content
             )
-            except ValueError as e:
-                print(repr(e))
-                print(" > XXXXXX Failed to index Google Drive file \"%s\""%(item['name']))
else:
@@ -332,7 +314,7 @@ class Search:
             )
             assert output == ""
         except RuntimeError:
-            print(" > XXXXXX Failed to index Google Drive document \"%s\""%(item['name']))
+            print(" > XXXXXX Failed to index document \"%s\""%(item['name']))
         # If export was successful, read contents of markdown
@@ -360,22 +342,17 @@ class Search:
         else:
             print(" > Creating a new record")
-            try:
-                created_time = dateutil.parser.parse(item['createdTime'])
-                modified_time = dateutil.parser.parse(item['modifiedTime'])
-                indexed_time = datetime.now()
             writer.add_document(
                     id = item['id'],
                     kind = 'gdoc',
-                    created_time = created_time,
-                    modified_time = modified_time,
-                    indexed_time = indexed_time,
+                    created_time = item['createdTime'],
+                    modified_time = item['modifiedTime'],
+                    indexed_time = datetime.now().replace(microsecond=0).isoformat(),
                     title = item['name'],
                     url = item['webViewLink'],
                     mimetype = mimetype,
                     owner_email = item['owners'][0]['emailAddress'],
                     owner_name = item['owners'][0]['displayName'],
-                    group='',
                     repo_name='',
                     repo_url='',
                     github_user='',
@@ -383,10 +360,6 @@ class Search:
                     issue_url='',
                     content = content
             )
-            except ValueError as e:
-                print(repr(e))
-                print(" > XXXXXX Failed to index Google Drive file \"%s\""%(item['name']))
@@ -420,14 +393,13 @@ class Search:
                 issue_comment_content += comment.body.rstrip()
                 issue_comment_content += "\n"
-            # Now create the actual search index record.
-            # Add one document per issue thread,
-            # containing entire text of thread.
-            created_time = issue.created_at
-            modified_time = issue.updated_at
-            indexed_time = datetime.now()
-            try:
+            # Now create the actual search index record
+            created_time = clean_timestamp(issue.created_at)
+            modified_time = clean_timestamp(issue.updated_at)
+            indexed_time = clean_timestamp(datetime.now())
+            # Add one document per issue thread,
+            # containing entire text of thread.
             writer.add_document(
                     id = issue.html_url,
                     kind = 'issue',
@@ -439,7 +411,6 @@ class Search:
                     mimetype='',
                     owner_email='',
                     owner_name='',
-                    group='',
                     repo_name = repo_name,
                     repo_url = repo_url,
                     github_user = issue.user.login,
@@ -447,9 +418,6 @@ class Search:
                     issue_url = issue.html_url,
                     content = issue_comment_content
             )
-            except ValueError as e:
-                print(repr(e))
-                print(" > XXXXXX Failed to index Github issue \"%s\""%(issue.title))
@@ -479,8 +447,7 @@ class Search:
             print(" > XXXXXXXX Failed to find file info.")
             return
-        indexed_time = datetime.now()
+        indexed_time = clean_timestamp(datetime.now())
         if fext in MARKDOWN_EXTS:
             print("Indexing markdown doc %s from repo %s"%(fname,repo_name))
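
Note: clean_timestamp is not shown in this diff; judging from the inline datetime.now().replace(microsecond=0).isoformat() calls it replaces elsewhere, it is presumably a small helper along these lines (an assumption, not the actual definition):

    from datetime import datetime

    def clean_timestamp(dt):
        # Render a datetime as a trimmed ISO 8601 string,
        # e.g. datetime(2018, 8, 21, 10, 4, 48, 500) -> "2018-08-21T10:04:48"
        return dt.replace(microsecond=0).isoformat()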
@@ -509,19 +476,17 @@ class Search:
             usable_url = "https://github.com/%s/blob/master/%s"%(repo_name, fpath)
             # Now create the actual search index record
-            try:
             writer.add_document(
                     id = fsha,
                     kind = 'markdown',
-                    created_time = None,
-                    modified_time = None,
+                    created_time = '',
+                    modified_time = '',
                     indexed_time = indexed_time,
                     title = fname,
                     url = usable_url,
                     mimetype='',
                     owner_email='',
                     owner_name='',
-                    group='',
                     repo_name = repo_name,
                     repo_url = repo_url,
                     github_user = '',
@@ -529,11 +494,6 @@ class Search:
                     issue_url = '',
                     content = content
             )
-            except ValueError as e:
-                print(repr(e))
-                print(" > XXXXXX Failed to index Github markdown file \"%s\""%(fname))
         else:
             print("Indexing github file %s from repo %s"%(fname,repo_name))
@@ -541,19 +501,17 @@ class Search:
             key = fname+"_"+fsha
             # Now create the actual search index record
-            try:
             writer.add_document(
                     id = key,
                     kind = 'ghfile',
-                    created_time = None,
-                    modified_time = None,
+                    created_time = '',
+                    modified_time = '',
                     indexed_time = indexed_time,
                     title = fname,
                     url = repo_url,
                     mimetype='',
                     owner_email='',
                     owner_name='',
-                    group='',
                     repo_name = repo_name,
                     repo_url = repo_url,
                     github_user = '',
@@ -561,9 +519,6 @@ class Search:
                     issue_url = '',
                     content = ''
             )
-            except ValueError as e:
-                print(repr(e))
-                print(" > XXXXXX Failed to index Github file \"%s\""%(fname))
@@ -577,32 +532,20 @@ class Search:
         Use a Github file API record to add a filename
         to the search index.
         """
-        if 'created_time' in d.keys() and d['created_time'] is not None:
-            created_time = d['created_time']
-        else:
-            created_time = None
-        if 'modified_time' in d.keys() and d['modified_time'] is not None:
-            modified_time = d['modified_time']
-        else:
-            modified_time = None
-        indexed_time = datetime.now()
+        indexed_time = clean_timestamp(datetime.now())
         # Now create the actual search index record
-        try:
         writer.add_document(
                 id = d['permalink'],
                 kind = 'emailthread',
-                created_time = created_time,
-                modified_time = modified_time,
+                created_time = '',
+                modified_time = '',
                 indexed_time = indexed_time,
                 title = d['subject'],
                 url = d['permalink'],
                 mimetype='',
                 owner_email='',
                 owner_name=d['original_sender'],
-                group=d['subgroup'],
                 repo_name = '',
                 repo_url = '',
                 github_user = '',
@@ -610,9 +553,7 @@ class Search:
                 issue_url = '',
                 content = d['content']
         )
-        except ValueError as e:
-            print(repr(e))
-            print(" > XXXXXX Failed to index Groups.io thread \"%s\""%(d['subject']))
@@ -690,10 +631,10 @@ class Search:
                 full_items[f['id']] = f
             ## Shorter:
             #break
-            # Longer:
-            if nextPageToken is None:
-                break
+            ## Longer:
+            #if nextPageToken is None:
+            #    break
         writer = self.ix.writer()
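
Note: the commented-out lines are the tail of the standard Google Drive API paging loop. A sketch of that pattern, assuming service is an authorized Drive v3 client from google-api-python-client:

    # Page through all files, collecting them by id.
    full_items = {}
    nextPageToken = None
    while True:
        response = service.files().list(
            pageSize=100,
            pageToken=nextPageToken,
            fields="nextPageToken, files(id, name, mimeType)",
        ).execute()
        for f in response.get('files', []):
            full_items[f['id']] = f
        nextPageToken = response.get('nextPageToken')
        if nextPageToken is None:
            break  # no more pages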
@@ -701,7 +642,7 @@ class Search:
         temp_dir = tempfile.mkdtemp(dir=os.getcwd())
         print("Temporary directory: %s"%(temp_dir))
-        try:
         # Drop any id in indexed_ids
         # not in remote_ids
@@ -729,13 +670,6 @@ class Search:
                 self.add_drive_file(writer, item, temp_dir, config, update=False)
                 count += 1
-        except Exception as e:
-            print("ERROR: While adding Google Drive files to search index")
-            print("-"*40)
-            print(repr(e))
-            print("-"*40)
-            print("Continuing...")
-            pass
         print("Cleaning temporary directory: %s"%(temp_dir))
         subprocess.call(['rm','-fr',temp_dir])
@@ -1140,7 +1074,7 @@ class Search:
         elif doctype=='issue':
             item_keys = ['title','repo_name','repo_url','url','created_time','modified_time']
         elif doctype=='emailthread':
-            item_keys = ['title','owner_name','url','created_time','modified_time']
+            item_keys = ['title','owner_name','url']
         elif doctype=='ghfile':
             item_keys = ['title','repo_name','repo_url','url']
         elif doctype=='markdown':
@@ -1157,6 +1091,10 @@ class Search:
         for r in results:
             d = {}
             for k in item_keys:
+                if k=='created_time' or k=='modified_time':
+                    #d[k] = r[k]
+                    d[k] = dateutil.parser.parse(r[k]).strftime("%Y-%m-%d")
+                else:
                     d[k] = r[k]
             json_results.append(d)
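
Note: with timestamps now stored as plain strings, the added branch re-parses them at display time with python-dateutil, for example:

    import dateutil.parser

    ts = "2018-08-21T10:04:48"  # ISO string as stored in the index
    print(dateutil.parser.parse(ts).strftime("%Y-%m-%d"))  # prints: 2018-08-21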
@@ -1170,9 +1108,7 @@ class Search:
         query_string = " ".join(query_list)
         query = None
         if ":" in query_string:
-            query = QueryParser("content", self.schema)
-            query.add_plugin(DateParserPlugin(free=True))
-            query = query.parse(query_string)
+            query = QueryParser("content", self.schema).parse(query_string)
         elif len(fields) == 1 and fields[0] == "filename":
             pass
         elif len(fields) == 2:
@@ -1180,12 +1116,9 @@ class Search:
         else:
             # If the user does not specify a field,
             # these are the fields that are actually searched
-            fields = ['title', 'content','owner_name','owner_email','url','created_date','modified_date']
+            fields = ['title', 'content','owner_name','owner_email','url']
         if not query:
-            query = MultifieldParser(fields, schema=self.ix.schema)
-            query.add_plugin(DateParserPlugin(free=True))
-            query = query.parse(query_string)
-            #query = MultifieldParser(fields, schema=self.ix.schema).parse(query_string)
+            query = MultifieldParser(fields, schema=self.ix.schema).parse(query_string)
         parsed_query = "%s" % query
         print("query: %s" % parsed_query)
         results = searcher.search(query, terms=False, scored=True, groupedby="kind")
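
Note: removing DateParserPlugin reduces both branches to a plain parse. A minimal sketch of the two Whoosh parsers used above, against a toy schema:

    from whoosh.fields import Schema, ID, TEXT
    from whoosh.qparser import MultifieldParser, QueryParser

    schema = Schema(url=ID(stored=True),
                    title=TEXT(stored=True),
                    content=TEXT(stored=True))

    # Field-qualified queries like "title:whoosh" go through QueryParser.
    q1 = QueryParser("content", schema).parse("title:whoosh")

    # Unqualified queries are matched against several fields at once.
    q2 = MultifieldParser(["title", "content"], schema=schema).parse("search engine")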

View File

@@ -1,28 +0,0 @@
-config = {
-    "repositories" : [
-        "dcppc/project-management",
-        "dcppc/nih-demo-meetings",
-        "dcppc/internal",
-        "dcppc/organize",
-        "dcppc/dcppc-bot",
-        "dcppc/full-stacks",
-        "dcppc/design-guidelines-discuss",
-        "dcppc/dcppc-deliverables",
-        "dcppc/dcppc-milestones",
-        "dcppc/crosscut-metadata",
-        "dcppc/lucky-penny",
-        "dcppc/dcppc-workshops",
-        "dcppc/metadata-matrix",
-        "dcppc/data-stewards",
-        "dcppc/dcppc-phase1-demos",
-        "dcppc/apis",
-        "dcppc/2018-june-workshop",
-        "dcppc/2018-july-workshop",
-        "dcppc/2018-august-workshop",
-        "dcppc/2018-september-workshop",
-        "dcppc/design-guidelines",
-        "dcppc/2018-may-workshop",
-        "dcppc/centillion"
-    ]
-}

View File

@@ -1,7 +1,5 @@
 import requests, os, re
 from bs4 import BeautifulSoup
-import dateutil.parser
-import datetime
 
 class GroupsIOException(Exception):
     pass
@@ -66,7 +64,7 @@ class GroupsIOArchivesCrawler(object):
             ## Short circuit
             ## for debugging purposes
-            break
+            #break
 
         return subgroups
@@ -253,7 +251,7 @@ class GroupsIOArchivesCrawler(object):
         subject = soup.find('title').text
         # Extract information for the schema:
-        # - permalink for thread (done above)
+        # - permalink for thread (done)
         # - subject/title (done)
         # - original sender email/name (done)
         # - content (done)
@@ -268,35 +266,11 @@ class GroupsIOArchivesCrawler(object):
                 pass
             else:
                 # found an email!
-                # this is a maze, not amazing.
-                # thanks groups.io!
+                # this is a maze, thanks groups.io
                 td = tr.find('td')
-                sender_divrow = td.find('div',{'class':'row'})
-                sender_divrow = sender_divrow.find('div',{'class':'pull-left'})
+                divrow = td.find('div',{'class':'row'}).find('div',{'class':'pull-left'})
                 if (i+1)==1:
-                    original_sender = sender_divrow.text.strip()
-                date_divrow = td.find('div',{'class':'row'})
-                date_divrow = date_divrow.find('div',{'class':'pull-right'})
-                date_divrow = date_divrow.find('font',{'class':'text-muted'})
-                date_divrow = date_divrow.find('script').text
-                try:
-                    time_seconds = re.search(' [0-9]{1,} ',date_divrow).group(0)
-                    time_seconds = time_seconds.strip()
-                    # Thanks groups.io for the weird date formatting
-                    time_seconds = time_seconds[:10]
-                    mmicro_seconds = time_seconds[10:]
-                    if (i+1)==1:
-                        created_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
-                        modified_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
-                    else:
-                        modified_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
-                except AttributeError:
-                    created_time = None
-                    modified_time = None
+                    original_sender = divrow.text.strip()
                 for div in td.find_all('div'):
                     if div.has_attr('id'):
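
Note: the deleted block recovered post dates by regex-matching an epoch timestamp embedded in an inline <script> tag. Stripped to its essence, the pattern was roughly the following (the script text is a hypothetical stand-in for the markup groups.io emits):

    import datetime
    import re

    script_text = " 1534867200123 "  # hypothetical: epoch milliseconds scraped from a <script>
    match = re.search(' [0-9]{1,} ', script_text)
    if match is not None:
        digits = match.group(0).strip()
        # the first ten digits are whole seconds; the rest are milliseconds
        created_time = datetime.datetime.utcfromtimestamp(int(digits[:10]))
        print(created_time)  # 2018-08-21 16:00:00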
@@ -325,10 +299,7 @@ class GroupsIOArchivesCrawler(object):
             thread = {
                 'permalink' : permalink,
-                'created_time' : created_time,
-                'modified_time' : modified_time,
                 'subject' : subject,
                 'subgroup' : subgroup_name,
                 'original_sender' : original_sender,
                 'content' : full_content
             }
@@ -353,13 +324,11 @@ class GroupsIOArchivesCrawler(object):
         results = []
         for row in rows:
-            # This is where we extract
-            # a list of thread titles
-            # and corresponding links.
+            # We don't care about anything except title and ugly link
             subject = row.find('span',{'class':'subject'})
             title = subject.get_text()
             link = row.find('a')['href']
-            #print(title)
             results.append((title,link))
         return results
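
Note: each listing row yields just a title and a link. A self-contained BeautifulSoup sketch of that extraction (the HTML fragment is made up):

    from bs4 import BeautifulSoup

    html = ('<tr><td><span class="subject">Weekly update</span>'
            '<a href="/g/example/topic/123">view</a></td></tr>')
    row = BeautifulSoup(html, 'html.parser')

    title = row.find('span', {'class': 'subject'}).get_text()
    link = row.find('a')['href']
    print((title, link))  # ('Weekly update', '/g/example/topic/123')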

View File

@@ -10,12 +10,15 @@
 <form action="{{ url_for('search') }}" name="search">
     <p><input type="text" name="query" value="{{ query }}">
     </p>
-    <p><button id="the-big-one" type="submit" style="font-size: 20px; padding: 10px; padding-left: 50px; padding-right: 50px;"
+    <p><button id="the-big-one" type="submit"
+               style="font-size: 20px; padding: 10px; padding-left: 50px; padding-right: 50px;"
        value="search" class="btn btn-primary">Search</button>
     </p>
+    <p><a href="#" onClick="advanced_search()">[Advanced Search]</a>
 {% if parsed_query %}
-    <p><a href="{{ url_for('search')}}?query=&fields=">[clear all results]</a>
+    <p><a href="{{ url_for('search')}}?query=&fields=">[Clear All Results]</a>
 {% endif %}
     </p>