6 Commits

SHA1 Message Date
55a74f7d98 Merge branch 'use-datetime' into merge-datetime-into-disqus
* use-datetime:
  extract date and time from email threads pages
  add groups and tags to schema; update how we determine timestamps; handle exceptions when we add the document to the writer, rather than elsewhere
  move where exception is caught (exception was also incorrect.)
  switched created_time, modified_time, indexed_time over to DATETIME. added DateParserPlugin to query QueryParser. added time fields to those being searched by default. tests do not seem to be working.
2018-08-24 01:13:42 -07:00
ab76226b0c Merge pull request #90 from dcppc/add-dates-and-subgroups-to-emails
Add dates and subgroups to emails
2018-08-24 00:07:40 -07:00
a4ebef6e6f extract date and time from email threads pages 2018-08-24 00:04:35 -07:00
bad50efa9b add groups and tags to schema; update how we determine timestamps; handle exceptions when we add the document to the writer, rather than elsewhere 2018-08-24 00:03:23 -07:00
629fc063db move where exception is caught (exception was also incorrect.) 2018-08-24 00:01:26 -07:00
3b0baa21de switched created_time, modified_time, indexed_time over to DATETIME. added DateParserPlugin to query QueryParser. added time fields to those being searched by default. tests do not seem to be working. 2018-08-23 19:01:40 -07:00
3 changed files with 304 additions and 199 deletions

View File

@@ -24,6 +24,8 @@ import dateutil.parser
 from whoosh import query
 from whoosh.qparser import MultifieldParser, QueryParser
 from whoosh.analysis import StemmingAnalyzer, LowercaseFilter, StopFilter
+from whoosh.qparser.dateparse import DateParserPlugin
+from whoosh import fields, index
 """
@@ -195,30 +197,38 @@ class Search:
         # is defined.
         schema = Schema(
-                id = ID(stored=True, unique=True),
-                kind = ID(stored=True),
-                created_time = ID(stored=True),
-                modified_time = ID(stored=True),
-                indexed_time = ID(stored=True),
-                title = TEXT(stored=True, field_boost=100.0),
-                url = ID(stored=True, unique=True),
-                mimetype=ID(stored=True),
-                owner_email=ID(stored=True),
-                owner_name=TEXT(stored=True),
-                repo_name=TEXT(stored=True),
-                repo_url=ID(stored=True),
-                github_user=TEXT(stored=True),
+                id = fields.ID(stored=True, unique=True),
+                kind = fields.ID(stored=True),
+                created_time = fields.DATETIME(stored=True),
+                modified_time = fields.DATETIME(stored=True),
+                indexed_time = fields.DATETIME(stored=True),
+                title = fields.TEXT(stored=True, field_boost=100.0),
+                url = fields.ID(stored=True),
+                mimetype = fields.TEXT(stored=True),
+                owner_email = fields.ID(stored=True),
+                owner_name = fields.TEXT(stored=True),
+                # mainly for email threads, groups.io, hypothesis
+                group = fields.ID(stored=True),
+                repo_name = fields.TEXT(stored=True),
+                repo_url = fields.ID(stored=True),
+                github_user = fields.TEXT(stored=True),
+                tags = fields.KEYWORD(commas=True,
+                                      stored=True,
+                                      lowercase=True),
                 # comments only
-                issue_title=TEXT(stored=True, field_boost=100.0),
-                issue_url=ID(stored=True),
-                content=TEXT(stored=True, analyzer=stemming_analyzer)
+                issue_title = fields.TEXT(stored=True, field_boost=100.0),
+                issue_url = fields.ID(stored=True),
+                content = fields.TEXT(stored=True, analyzer=stemming_analyzer)
         )
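For context: Whoosh's DATETIME field stores real datetime objects rather than the strings the old ID fields held, which is what makes the date handling in the rest of this diff possible. A minimal standalone sketch, with a made-up index directory and document (not code from this PR):

    # Sketch: a DATETIME field accepts datetime objects directly,
    # so they can be range-searched later.
    import os
    from datetime import datetime
    from whoosh import fields, index

    schema = fields.Schema(
        id=fields.ID(stored=True, unique=True),
        created_time=fields.DATETIME(stored=True),
        content=fields.TEXT(stored=True),
    )

    os.makedirs("tmp_index", exist_ok=True)   # hypothetical index location
    ix = index.create_in("tmp_index", schema)
    writer = ix.writer()
    writer.add_document(
        id=u"doc1",
        created_time=datetime(2018, 8, 23, 19, 1, 40),  # a datetime, not a string
        content=u"example document",
    )
    writer.commit()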
@@ -258,24 +268,32 @@ class Search:
         writer.delete_by_term('id',item['id'])

         # Index a plain google drive file
-        writer.add_document(
-                id = item['id'],
-                kind = 'gdoc',
-                created_time = item['createdTime'],
-                modified_time = item['modifiedTime'],
-                indexed_time = datetime.now().replace(microsecond=0).isoformat(),
-                title = item['name'],
-                url = item['webViewLink'],
-                mimetype = mimetype,
-                owner_email = item['owners'][0]['emailAddress'],
-                owner_name = item['owners'][0]['displayName'],
-                repo_name='',
-                repo_url='',
-                github_user='',
-                issue_title='',
-                issue_url='',
-                content = content
-        )
+        created_time = dateutil.parser.parse(item['createdTime'])
+        modified_time = dateutil.parser.parse(item['modifiedTime'])
+        indexed_time = datetime.now().replace(microsecond=0)
+        try:
+            writer.add_document(
+                    id = item['id'],
+                    kind = 'gdoc',
+                    created_time = created_time,
+                    modified_time = modified_time,
+                    indexed_time = indexed_time,
+                    title = item['name'],
+                    url = item['webViewLink'],
+                    mimetype = mimetype,
+                    owner_email = item['owners'][0]['emailAddress'],
+                    owner_name = item['owners'][0]['displayName'],
+                    group='',
+                    repo_name='',
+                    repo_url='',
+                    github_user='',
+                    issue_title='',
+                    issue_url='',
+                    content = content
+            )
+        except ValueError as e:
+            print(repr(e))
+            print(" > XXXXXX Failed to index Google Drive file \"%s\""%(item['name']))
 else:
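The conversion above matters because the Drive API returns createdTime and modifiedTime as RFC 3339 strings, while the new DATETIME schema fields want datetime objects. A small sketch of what dateutil.parser does with such a value (the timestamp is illustrative):

    import dateutil.parser

    created_raw = "2018-08-23T19:01:40.000Z"   # shape of item['createdTime']
    created_time = dateutil.parser.parse(created_raw)
    print(type(created_time))   # <class 'datetime.datetime'>
    print(created_time)         # 2018-08-23 19:01:40+00:00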
@@ -329,7 +347,7 @@ class Search:
         )
         assert output == ""
     except RuntimeError:
-        print(" > XXXXXX Failed to index document \"%s\""%(item['name']))
+        print(" > XXXXXX Failed to index Google Drive document \"%s\""%(item['name']))

     # If export was successful, read contents of markdown
@@ -357,24 +375,33 @@ class Search:
 else:
     print(" > Creating a new record")

-    writer.add_document(
-            id = item['id'],
-            kind = 'gdoc',
-            created_time = item['createdTime'],
-            modified_time = item['modifiedTime'],
-            indexed_time = datetime.now().replace(microsecond=0).isoformat(),
-            title = item['name'],
-            url = item['webViewLink'],
-            mimetype = mimetype,
-            owner_email = item['owners'][0]['emailAddress'],
-            owner_name = item['owners'][0]['displayName'],
-            repo_name='',
-            repo_url='',
-            github_user='',
-            issue_title='',
-            issue_url='',
-            content = content
-    )
+    try:
+        created_time = dateutil.parser.parse(item['createdTime'])
+        modified_time = dateutil.parser.parse(item['modifiedTime'])
+        indexed_time = datetime.now()
+        writer.add_document(
+                id = item['id'],
+                kind = 'gdoc',
+                created_time = created_time,
+                modified_time = modified_time,
+                indexed_time = indexed_time,
+                title = item['name'],
+                url = item['webViewLink'],
+                mimetype = mimetype,
+                owner_email = item['owners'][0]['emailAddress'],
+                owner_name = item['owners'][0]['displayName'],
+                group='',
+                repo_name='',
+                repo_url='',
+                github_user='',
+                issue_title='',
+                issue_url='',
+                content = content
+        )
+    except ValueError as e:
+        print(repr(e))
+        print(" > XXXXXX Failed to index Google Drive file \"%s\""%(item['name']))
@@ -408,31 +435,36 @@ class Search:
             issue_comment_content += comment.body.rstrip()
             issue_comment_content += "\n"

-        # Now create the actual search index record
-        created_time = clean_timestamp(issue.created_at)
-        modified_time = clean_timestamp(issue.updated_at)
-        indexed_time = clean_timestamp(datetime.now())
+        # Now create the actual search index record.

         # Add one document per issue thread,
         # containing entire text of thread.
-        writer.add_document(
-                id = issue.html_url,
-                kind = 'issue',
-                created_time = created_time,
-                modified_time = modified_time,
-                indexed_time = indexed_time,
-                title = issue.title,
-                url = issue.html_url,
-                mimetype='',
-                owner_email='',
-                owner_name='',
-                repo_name = repo_name,
-                repo_url = repo_url,
-                github_user = issue.user.login,
-                issue_title = issue.title,
-                issue_url = issue.html_url,
-                content = issue_comment_content
-        )
+        created_time = issue.created_at
+        modified_time = issue.updated_at
+        indexed_time = datetime.now()
+        try:
+            writer.add_document(
+                    id = issue.html_url,
+                    kind = 'issue',
+                    created_time = created_time,
+                    modified_time = modified_time,
+                    indexed_time = indexed_time,
+                    title = issue.title,
+                    url = issue.html_url,
+                    mimetype='',
+                    owner_email='',
+                    owner_name='',
+                    group='',
+                    repo_name = repo_name,
+                    repo_url = repo_url,
+                    github_user = issue.user.login,
+                    issue_title = issue.title,
+                    issue_url = issue.html_url,
+                    content = issue_comment_content
+            )
+        except ValueError as e:
+            print(repr(e))
+            print(" > XXXXXX Failed to index Github issue \"%s\""%(issue.title))
@@ -462,7 +494,8 @@ class Search:
print(" > XXXXXXXX Failed to find file info.") print(" > XXXXXXXX Failed to find file info.")
return return
indexed_time = clean_timestamp(datetime.now())
indexed_time = datetime.now()
if fext in MARKDOWN_EXTS: if fext in MARKDOWN_EXTS:
print("Indexing markdown doc %s from repo %s"%(fname,repo_name)) print("Indexing markdown doc %s from repo %s"%(fname,repo_name))
@@ -491,24 +524,31 @@ class Search:
             usable_url = "https://github.com/%s/blob/master/%s"%(repo_name, fpath)

             # Now create the actual search index record
-            writer.add_document(
-                    id = fsha,
-                    kind = 'markdown',
-                    created_time = '',
-                    modified_time = '',
-                    indexed_time = indexed_time,
-                    title = fname,
-                    url = usable_url,
-                    mimetype='',
-                    owner_email='',
-                    owner_name='',
-                    repo_name = repo_name,
-                    repo_url = repo_url,
-                    github_user = '',
-                    issue_title = '',
-                    issue_url = '',
-                    content = content
-            )
+            try:
+                writer.add_document(
+                        id = fsha,
+                        kind = 'markdown',
+                        created_time = None,
+                        modified_time = None,
+                        indexed_time = indexed_time,
+                        title = fname,
+                        url = usable_url,
+                        mimetype='',
+                        owner_email='',
+                        owner_name='',
+                        group='',
+                        repo_name = repo_name,
+                        repo_url = repo_url,
+                        github_user = '',
+                        issue_title = '',
+                        issue_url = '',
+                        content = content
+                )
+            except ValueError as e:
+                print(repr(e))
+                print(" > XXXXXX Failed to index Github markdown file \"%s\""%(fname))

         else:
             print("Indexing github file %s from repo %s"%(fname,repo_name))
@@ -516,24 +556,29 @@ class Search:
             key = fname+"_"+fsha

             # Now create the actual search index record
-            writer.add_document(
-                    id = key,
-                    kind = 'ghfile',
-                    created_time = '',
-                    modified_time = '',
-                    indexed_time = indexed_time,
-                    title = fname,
-                    url = repo_url,
-                    mimetype='',
-                    owner_email='',
-                    owner_name='',
-                    repo_name = repo_name,
-                    repo_url = repo_url,
-                    github_user = '',
-                    issue_title = '',
-                    issue_url = '',
-                    content = ''
-            )
+            try:
+                writer.add_document(
+                        id = key,
+                        kind = 'ghfile',
+                        created_time = None,
+                        modified_time = None,
+                        indexed_time = indexed_time,
+                        title = fname,
+                        url = repo_url,
+                        mimetype='',
+                        owner_email='',
+                        owner_name='',
+                        group='',
+                        repo_name = repo_name,
+                        repo_url = repo_url,
+                        github_user = '',
+                        issue_title = '',
+                        issue_url = '',
+                        content = ''
+                )
+            except ValueError as e:
+                print(repr(e))
+                print(" > XXXXXX Failed to index Github file \"%s\""%(fname))
@@ -547,28 +592,42 @@ class Search:
         Use a Groups.io email thread record to add
         an email thread to the search index.
         """
-        indexed_time = clean_timestamp(datetime.now())
+        if 'created_time' in d.keys() and d['created_time'] is not None:
+            created_time = d['created_time']
+        else:
+            created_time = None
+
+        if 'modified_time' in d.keys() and d['modified_time'] is not None:
+            modified_time = d['modified_time']
+        else:
+            modified_time = None
+
+        indexed_time = datetime.now()

         # Now create the actual search index record
-        writer.add_document(
-                id = d['permalink'],
-                kind = 'emailthread',
-                created_time = '',
-                modified_time = '',
-                indexed_time = indexed_time,
-                title = d['subject'],
-                url = d['permalink'],
-                mimetype='',
-                owner_email='',
-                owner_name=d['original_sender'],
-                repo_name = '',
-                repo_url = '',
-                github_user = '',
-                issue_title = '',
-                issue_url = '',
-                content = d['content']
-        )
+        try:
+            writer.add_document(
+                    id = d['permalink'],
+                    kind = 'emailthread',
+                    created_time = created_time,
+                    modified_time = modified_time,
+                    indexed_time = indexed_time,
+                    title = d['subject'],
+                    url = d['permalink'],
+                    mimetype='',
+                    owner_email='',
+                    owner_name=d['original_sender'],
+                    group=d['subgroup'],
+                    repo_name = '',
+                    repo_url = '',
+                    github_user = '',
+                    issue_title = '',
+                    issue_url = '',
+                    content = d['content']
+            )
+        except ValueError as e:
+            print(repr(e))
+            print(" > XXXXXX Failed to index Groups.io thread \"%s\""%(d['subject']))

         # ------------------------------
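The in d.keys() guards above protect against crawler records that predate the new fields, or where date extraction failed and produced None. dict.get() expresses the same fallback in one line each; a sketch with a hypothetical record:

    from datetime import datetime

    d = {"permalink": "https://example.com/thread/1", "created_time": None}

    # A missing key and a present-but-None value both come out as None,
    # matching the if/else guards in the diff.
    created_time = d.get("created_time")
    modified_time = d.get("modified_time")
    indexed_time = datetime.now()
    print(created_time, modified_time)   # None None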
@@ -581,28 +640,33 @@ class Search:
         to add a disqus comment thread to the
         search index.
         """
-        indexed_time = clean_timestamp(datetime.now())
+        indexed_time = datetime.now()
+        # created_time is already a timestamp

         # Now create the actual search index record
-        writer.add_document(
-                id = d['id'],
-                kind = 'disqus',
-                created_time = d['created_time'],
-                modified_time = '',
-                indexed_time = indexed_time,
-                title = d['title'],
-                url = d['link'],
-                mimetype='',
-                owner_email='',
-                owner_name='',
-                repo_name = '',
-                repo_url = '',
-                github_user = '',
-                issue_title = '',
-                issue_url = '',
-                content = d['content']
-        )
+        try:
+            writer.add_document(
+                    id = d['id'],
+                    kind = 'disqus',
+                    created_time = d['created_time'],
+                    modified_time = None,
+                    indexed_time = indexed_time,
+                    title = d['title'],
+                    url = d['link'],
+                    mimetype='',
+                    owner_email='',
+                    owner_name='',
+                    repo_name = '',
+                    repo_url = '',
+                    github_user = '',
+                    issue_title = '',
+                    issue_url = '',
+                    content = d['content']
+            )
+        except ValueError as e:
+            print(repr(e))
+            print(" > XXXXXX Failed to index Disqus comment thread \"%s\""%(d['title']))
@@ -680,10 +744,10 @@ class Search:
                 full_items[f['id']] = f

             ## Shorter:
-            #break
-            # Longer:
-            if nextPageToken is None:
-                break
+            break
+            ## Longer:
+            #if nextPageToken is None:
+            #    break

         writer = self.ix.writer()
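The toggle above flips the Drive listing loop between a one-page debugging run ("Shorter") and full nextPageToken pagination ("Longer"). A self-contained sketch of that loop, with fake_list() standing in for the Drive v3 files().list() call:

    def fake_list(page_token):
        # Stand-in for service.files().list(...).execute()
        pages = {
            None: {"files": [{"id": "a"}, {"id": "b"}], "nextPageToken": "p2"},
            "p2": {"files": [{"id": "c"}], "nextPageToken": None},
        }
        return pages[page_token]

    full_items = {}
    nextPageToken = None
    while True:
        response = fake_list(nextPageToken)
        for f in response["files"]:
            full_items[f["id"]] = f
        nextPageToken = response["nextPageToken"]
        # The "Shorter" debugging branch breaks unconditionally here;
        # the "Longer" branch walks every page:
        if nextPageToken is None:
            break

    print(sorted(full_items))   # ['a', 'b', 'c']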
@@ -691,34 +755,41 @@ class Search:
         temp_dir = tempfile.mkdtemp(dir=os.getcwd())
         print("Temporary directory: %s"%(temp_dir))

+        try:
             # Drop any id in indexed_ids
             # not in remote_ids
             drop_ids = indexed_ids - remote_ids
             for drop_id in drop_ids:
                 writer.delete_by_term('id',drop_id)

             # Update any id in indexed_ids
             # and in remote_ids
             update_ids = indexed_ids & remote_ids
             for update_id in update_ids:
                 # cop out
                 writer.delete_by_term('id',update_id)
                 item = full_items[update_id]
                 self.add_drive_file(writer, item, temp_dir, config, update=True)
                 count += 1

             # Add any id not in indexed_ids
             # and in remote_ids
             add_ids = remote_ids - indexed_ids
             for add_id in add_ids:
                 item = full_items[add_id]
                 self.add_drive_file(writer, item, temp_dir, config, update=False)
                 count += 1
+        except Exception as e:
+            print("ERROR: While adding Google Drive files to search index")
+            print("-"*40)
+            print(repr(e))
+            print("-"*40)
+            print("Continuing...")
+            pass

         print("Cleaning temporary directory: %s"%(temp_dir))
         subprocess.call(['rm','-fr',temp_dir])
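The drop/update/add bookkeeping that this hunk wraps in try/except is plain set arithmetic over document ids; the try block just keeps one bad file from aborting the whole sync. A self-contained illustration of the three sets:

    indexed_ids = {"a", "b", "c"}     # ids already in the search index
    remote_ids = {"b", "c", "d"}      # ids currently visible on Google Drive

    drop_ids = indexed_ids - remote_ids     # {'a'}: gone remotely, delete
    update_ids = indexed_ids & remote_ids   # {'b', 'c'}: delete and re-add
    add_ids = remote_ids - indexed_ids      # {'d'}: new, index for the first time

    assert drop_ids == {"a"}
    assert update_ids == {"b", "c"}
    assert add_ids == {"d"}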
@@ -1176,7 +1247,7 @@ class Search:
         elif doctype=='issue':
             item_keys = ['title','repo_name','repo_url','url','created_time','modified_time']
         elif doctype=='emailthread':
-            item_keys = ['title','owner_name','url']
+            item_keys = ['title','owner_name','url','created_time','modified_time']
         elif doctype=='disqus':
             item_keys = ['title','created_time','url']
         elif doctype=='ghfile':
@@ -1195,11 +1266,7 @@ class Search:
         for r in results:
             d = {}
             for k in item_keys:
-                if k=='created_time' or k=='modified_time':
-                    #d[k] = r[k]
-                    d[k] = dateutil.parser.parse(r[k]).strftime("%Y-%m-%d")
-                else:
-                    d[k] = r[k]
+                d[k] = r[k]
             json_results.append(d)
         return json_results
@@ -1212,13 +1279,16 @@ class Search:
         query_string = " ".join(query_list)
         query = None
         if ":" in query_string:
             #query = QueryParser("content",
             #                    self.schema
             #).parse(query_string)
             query = QueryParser("content",
                                 self.schema,
                                 termclass=query.Variations
-            ).parse(query_string)
+            )
+            query.add_plugin(DateParserPlugin(free=True))
+            query = query.parse(query_string)
         elif len(fields) == 1 and fields[0] == "filename":
             pass
         elif len(fields) == 2:
@@ -1226,9 +1296,12 @@ class Search:
         else:
             # If the user does not specify a field,
             # these are the fields that are actually searched
-            fields = ['title', 'content','owner_name','owner_email','url']
+            fields = ['title', 'content','owner_name','owner_email','url','created_date','modified_date']

         if not query:
-            query = MultifieldParser(fields, schema=self.ix.schema).parse(query_string)
+            query = MultifieldParser(fields, schema=self.ix.schema)
+            query.add_plugin(DateParserPlugin(free=True))
+            query = query.parse(query_string)
+            #query = MultifieldParser(fields, schema=self.ix.schema).parse(query_string)

         parsed_query = "%s" % query
         print("query: %s" % parsed_query)
         results = searcher.search(query, terms=False, scored=True, groupedby="kind")
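With DATETIME fields in the schema and DateParserPlugin(free=True) attached to the parser, query strings can now express dates and date ranges. Note that the default-field list above adds created_date/modified_date while the schema names the fields created_time/modified_time, which may be related to the "tests do not seem to be working" commit message. A standalone sketch of the plugin wiring (query strings are illustrative):

    from whoosh.fields import Schema, TEXT, DATETIME
    from whoosh.qparser import QueryParser
    from whoosh.qparser.dateparse import DateParserPlugin

    schema = Schema(content=TEXT, created_time=DATETIME)

    parser = QueryParser("content", schema)
    parser.add_plugin(DateParserPlugin(free=True))

    # Explicit date-field syntax:
    q1 = parser.parse("created_time:20180824")
    # Quoted, human-readable dates also parse:
    q2 = parser.parse("created_time:'24 aug 2018'")
    print(q1)
    print(q2)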

View File

@@ -1,6 +1,7 @@
 import os, re
 import requests
 import json
+import dateutil.parser
 from pprint import pprint
@@ -123,7 +124,7 @@ class DisqusCrawler(object):
         # We need to make this value a dictionary
         thread_info = dict(
             id = response['id'],
-            created_time = response['createdAt'],
+            created_time = dateutil.parser.parse(response['createdAt']),
             title = response['title'],
             forum = response['forum'],
             link = clean_link,

View File

@@ -1,5 +1,7 @@
 import requests, os, re
 from bs4 import BeautifulSoup
+import dateutil.parser
+import datetime

 class GroupsIOException(Exception):
     pass
@@ -64,7 +66,7 @@ class GroupsIOArchivesCrawler(object):
                 ## Short circuit
                 ## for debugging purposes
-                #break
+                break

         return subgroups
@@ -251,7 +253,7 @@ class GroupsIOArchivesCrawler(object):
         subject = soup.find('title').text

         # Extract information for the schema:
-        # - permalink for thread (done)
+        # - permalink for thread (done above)
         # - subject/title (done)
         # - original sender email/name (done)
         # - content (done)
@@ -266,11 +268,35 @@ class GroupsIOArchivesCrawler(object):
                 pass
             else:
                 # found an email!
-                # this is a maze, thanks groups.io
+                # this is a maze, not amazing.
+                # thanks groups.io!
                 td = tr.find('td')
-                divrow = td.find('div',{'class':'row'}).find('div',{'class':'pull-left'})
+
+                sender_divrow = td.find('div',{'class':'row'})
+                sender_divrow = sender_divrow.find('div',{'class':'pull-left'})
                 if (i+1)==1:
-                    original_sender = divrow.text.strip()
+                    original_sender = sender_divrow.text.strip()
+
+                date_divrow = td.find('div',{'class':'row'})
+                date_divrow = date_divrow.find('div',{'class':'pull-right'})
+                date_divrow = date_divrow.find('font',{'class':'text-muted'})
+                date_divrow = date_divrow.find('script').text
+                try:
+                    time_seconds = re.search(' [0-9]{1,} ',date_divrow).group(0)
+                    time_seconds = time_seconds.strip()
+                    # Thanks groups.io for the weird date formatting
+                    time_seconds = time_seconds[:10]
+                    mmicro_seconds = time_seconds[10:]
+                    if (i+1)==1:
+                        created_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
+                        modified_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
+                    else:
+                        modified_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
+                except AttributeError:
+                    created_time = None
+                    modified_time = None

                 for div in td.find_all('div'):
                     if div.has_attr('id'):
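The script text scraped above embeds the post time as a 13-digit epoch value in milliseconds, so the [:10] slice keeps the whole-second part before utcfromtimestamp. A self-contained sketch of that extraction (the script text is illustrative; note the diff computes mmicro_seconds from the already-truncated string, so it always ends up empty):

    import datetime
    import re

    # Illustrative shape of the <script> contents on a groups.io archive page:
    script_text = "document.write(ConvertDate( 1535068800000 ));"

    epoch_ms = re.search(' [0-9]{1,} ', script_text).group(0).strip()
    epoch_s = epoch_ms[:10]      # first 10 of 13 digits = whole seconds
    created_time = datetime.datetime.utcfromtimestamp(int(epoch_s))
    print(created_time)          # 2018-08-24 00:00:00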
@@ -299,7 +325,10 @@ class GroupsIOArchivesCrawler(object):
                 thread = {
                     'permalink' : permalink,
+                    'created_time' : created_time,
+                    'modified_time' : modified_time,
                     'subject' : subject,
+                    'subgroup' : subgroup_name,
                     'original_sender' : original_sender,
                     'content' : full_content
                 }
@@ -324,11 +353,13 @@ class GroupsIOArchivesCrawler(object):
         results = []
         for row in rows:
-            # We don't care about anything except title and ugly link
+            # This is where we extract
+            # a list of thread titles
+            # and corresponding links.
             subject = row.find('span',{'class':'subject'})
             title = subject.get_text()
             link = row.find('a')['href']
+            #print(title)
             results.append((title,link))

         return results