6 Commits

SHA1 Message Date
55a74f7d98 Merge branch 'use-datetime' into merge-datetime-into-disqus
* use-datetime:
  extract date and time from email threads pages
  add groups and tags to schema; update how we determine timestamps; handle exceptions when we add the document to the writer, rather than elsewhere
  move where exception is caught (exception was also incorrect.)
  switched created_time, modified_time, indexed_time over to DATETIME. added DateParserPlugin to query QueryParser. added time fields to those being searched by default. tests do not seem to be working.
2018-08-24 01:13:42 -07:00
ab76226b0c Merge pull request #90 from dcppc/add-dates-and-subgroups-to-emails
Add dates and subgroups to emails
2018-08-24 00:07:40 -07:00
a4ebef6e6f extract date and time from email threads pages 2018-08-24 00:04:35 -07:00
bad50efa9b add groups and tags to schema; update how we determine timestamps; handle exceptions when we add the document to the writer, rather than elsewhere 2018-08-24 00:03:23 -07:00
629fc063db move where exception is caught (exception was also incorrect.) 2018-08-24 00:01:26 -07:00
3b0baa21de switched created_time, modified_time, indexed_time over to DATETIME. added DateParserPlugin to query QueryParser. added time fields to those being searched by default. tests do not seem to be working. 2018-08-23 19:01:40 -07:00
3 changed files with 304 additions and 199 deletions

View File

@@ -24,6 +24,8 @@ import dateutil.parser
 from whoosh import query
 from whoosh.qparser import MultifieldParser, QueryParser
 from whoosh.analysis import StemmingAnalyzer, LowercaseFilter, StopFilter
+from whoosh.qparser.dateparse import DateParserPlugin
+from whoosh import fields, index
 """
@@ -195,30 +197,38 @@ class Search:
         # is defined.
         schema = Schema(
-                id = ID(stored=True, unique=True),
-                kind = ID(stored=True),
-                created_time = ID(stored=True),
-                modified_time = ID(stored=True),
-                indexed_time = ID(stored=True),
-                title = TEXT(stored=True, field_boost=100.0),
-                url = ID(stored=True, unique=True),
-                mimetype=ID(stored=True),
-                owner_email=ID(stored=True),
-                owner_name=TEXT(stored=True),
-                repo_name=TEXT(stored=True),
-                repo_url=ID(stored=True),
-                github_user=TEXT(stored=True),
+                id = fields.ID(stored=True, unique=True),
+                kind = fields.ID(stored=True),
+                created_time = fields.DATETIME(stored=True),
+                modified_time = fields.DATETIME(stored=True),
+                indexed_time = fields.DATETIME(stored=True),
+                title = fields.TEXT(stored=True, field_boost=100.0),
+                url = fields.ID(stored=True),
+                mimetype = fields.TEXT(stored=True),
+                owner_email = fields.ID(stored=True),
+                owner_name = fields.TEXT(stored=True),
+                # mainly for email threads, groups.io, hypothesis
+                group = fields.ID(stored=True),
+                repo_name = fields.TEXT(stored=True),
+                repo_url = fields.ID(stored=True),
+                github_user = fields.TEXT(stored=True),
+                tags = fields.KEYWORD(commas=True,
+                                      stored=True,
+                                      lowercase=True),
                 # comments only
-                issue_title=TEXT(stored=True, field_boost=100.0),
-                issue_url=ID(stored=True),
-                content=TEXT(stored=True, analyzer=stemming_analyzer)
+                issue_title = fields.TEXT(stored=True, field_boost=100.0),
+                issue_url = fields.ID(stored=True),
+                content = fields.TEXT(stored=True, analyzer=stemming_analyzer)
         )
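For context: Whoosh's DATETIME field stores real datetime objects rather than the strings the old ID fields held, which is what makes the date handling in the rest of this diff possible. A minimal standalone sketch, with a made-up index directory and document (not code from this PR):

    # Sketch: a DATETIME field accepts datetime objects directly,
    # so they can be range-searched later.
    import os
    from datetime import datetime
    from whoosh import fields, index

    schema = fields.Schema(
        id=fields.ID(stored=True, unique=True),
        created_time=fields.DATETIME(stored=True),
        content=fields.TEXT(stored=True),
    )

    os.makedirs("tmp_index", exist_ok=True)   # hypothetical index location
    ix = index.create_in("tmp_index", schema)
    writer = ix.writer()
    writer.add_document(
        id=u"doc1",
        created_time=datetime(2018, 8, 23, 19, 1, 40),  # a datetime, not a string
        content=u"example document",
    )
    writer.commit()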
@@ -258,24 +268,32 @@ class Search:
         writer.delete_by_term('id',item['id'])

         # Index a plain google drive file
-        writer.add_document(
-                id = item['id'],
-                kind = 'gdoc',
-                created_time = item['createdTime'],
-                modified_time = item['modifiedTime'],
-                indexed_time = datetime.now().replace(microsecond=0).isoformat(),
-                title = item['name'],
-                url = item['webViewLink'],
-                mimetype = mimetype,
-                owner_email = item['owners'][0]['emailAddress'],
-                owner_name = item['owners'][0]['displayName'],
-                repo_name='',
-                repo_url='',
-                github_user='',
-                issue_title='',
-                issue_url='',
-                content = content
-        )
+        created_time = dateutil.parser.parse(item['createdTime'])
+        modified_time = dateutil.parser.parse(item['modifiedTime'])
+        indexed_time = datetime.now().replace(microsecond=0)
+        try:
+            writer.add_document(
+                    id = item['id'],
+                    kind = 'gdoc',
+                    created_time = created_time,
+                    modified_time = modified_time,
+                    indexed_time = indexed_time,
+                    title = item['name'],
+                    url = item['webViewLink'],
+                    mimetype = mimetype,
+                    owner_email = item['owners'][0]['emailAddress'],
+                    owner_name = item['owners'][0]['displayName'],
+                    group='',
+                    repo_name='',
+                    repo_url='',
+                    github_user='',
+                    issue_title='',
+                    issue_url='',
+                    content = content
+            )
+        except ValueError as e:
+            print(repr(e))
+            print(" > XXXXXX Failed to index Google Drive file \"%s\""%(item['name']))
 else:
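The conversion above matters because the Drive API returns createdTime and modifiedTime as RFC 3339 strings, while the new DATETIME schema fields want datetime objects. A small sketch of what dateutil.parser does with such a value (the timestamp is illustrative):

    import dateutil.parser

    created_raw = "2018-08-23T19:01:40.000Z"   # shape of item['createdTime']
    created_time = dateutil.parser.parse(created_raw)
    print(type(created_time))   # <class 'datetime.datetime'>
    print(created_time)         # 2018-08-23 19:01:40+00:00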
@@ -329,7 +347,7 @@ class Search:
         )
         assert output == ""
     except RuntimeError:
-        print(" > XXXXXX Failed to index document \"%s\""%(item['name']))
+        print(" > XXXXXX Failed to index Google Drive document \"%s\""%(item['name']))

     # If export was successful, read contents of markdown
@@ -357,24 +375,33 @@ class Search:
 else:
     print(" > Creating a new record")

-    writer.add_document(
-            id = item['id'],
-            kind = 'gdoc',
-            created_time = item['createdTime'],
-            modified_time = item['modifiedTime'],
-            indexed_time = datetime.now().replace(microsecond=0).isoformat(),
-            title = item['name'],
-            url = item['webViewLink'],
-            mimetype = mimetype,
-            owner_email = item['owners'][0]['emailAddress'],
-            owner_name = item['owners'][0]['displayName'],
-            repo_name='',
-            repo_url='',
-            github_user='',
-            issue_title='',
-            issue_url='',
-            content = content
-    )
+    try:
+        created_time = dateutil.parser.parse(item['createdTime'])
+        modified_time = dateutil.parser.parse(item['modifiedTime'])
+        indexed_time = datetime.now()
+        writer.add_document(
+                id = item['id'],
+                kind = 'gdoc',
+                created_time = created_time,
+                modified_time = modified_time,
+                indexed_time = indexed_time,
+                title = item['name'],
+                url = item['webViewLink'],
+                mimetype = mimetype,
+                owner_email = item['owners'][0]['emailAddress'],
+                owner_name = item['owners'][0]['displayName'],
+                group='',
+                repo_name='',
+                repo_url='',
+                github_user='',
+                issue_title='',
+                issue_url='',
+                content = content
+        )
+    except ValueError as e:
+        print(repr(e))
+        print(" > XXXXXX Failed to index Google Drive file \"%s\""%(item['name']))
@@ -408,31 +435,36 @@ class Search:
             issue_comment_content += comment.body.rstrip()
             issue_comment_content += "\n"

-        # Now create the actual search index record
-        created_time = clean_timestamp(issue.created_at)
-        modified_time = clean_timestamp(issue.updated_at)
-        indexed_time = clean_timestamp(datetime.now())
+        # Now create the actual search index record.

         # Add one document per issue thread,
         # containing entire text of thread.
-        writer.add_document(
-                id = issue.html_url,
-                kind = 'issue',
-                created_time = created_time,
-                modified_time = modified_time,
-                indexed_time = indexed_time,
-                title = issue.title,
-                url = issue.html_url,
-                mimetype='',
-                owner_email='',
-                owner_name='',
-                repo_name = repo_name,
-                repo_url = repo_url,
-                github_user = issue.user.login,
-                issue_title = issue.title,
-                issue_url = issue.html_url,
-                content = issue_comment_content
-        )
+        created_time = issue.created_at
+        modified_time = issue.updated_at
+        indexed_time = datetime.now()
+        try:
+            writer.add_document(
+                    id = issue.html_url,
+                    kind = 'issue',
+                    created_time = created_time,
+                    modified_time = modified_time,
+                    indexed_time = indexed_time,
+                    title = issue.title,
+                    url = issue.html_url,
+                    mimetype='',
+                    owner_email='',
+                    owner_name='',
+                    group='',
+                    repo_name = repo_name,
+                    repo_url = repo_url,
+                    github_user = issue.user.login,
+                    issue_title = issue.title,
+                    issue_url = issue.html_url,
+                    content = issue_comment_content
+            )
+        except ValueError as e:
+            print(repr(e))
+            print(" > XXXXXX Failed to index Github issue \"%s\""%(issue.title))
@@ -462,7 +494,8 @@ class Search:
print(" > XXXXXXXX Failed to find file info.") print(" > XXXXXXXX Failed to find file info.")
return return
indexed_time = clean_timestamp(datetime.now())
indexed_time = datetime.now()
if fext in MARKDOWN_EXTS: if fext in MARKDOWN_EXTS:
print("Indexing markdown doc %s from repo %s"%(fname,repo_name)) print("Indexing markdown doc %s from repo %s"%(fname,repo_name))
@@ -491,24 +524,31 @@ class Search:
             usable_url = "https://github.com/%s/blob/master/%s"%(repo_name, fpath)

             # Now create the actual search index record
-            writer.add_document(
-                    id = fsha,
-                    kind = 'markdown',
-                    created_time = '',
-                    modified_time = '',
-                    indexed_time = indexed_time,
-                    title = fname,
-                    url = usable_url,
-                    mimetype='',
-                    owner_email='',
-                    owner_name='',
-                    repo_name = repo_name,
-                    repo_url = repo_url,
-                    github_user = '',
-                    issue_title = '',
-                    issue_url = '',
-                    content = content
-            )
+            try:
+                writer.add_document(
+                        id = fsha,
+                        kind = 'markdown',
+                        created_time = None,
+                        modified_time = None,
+                        indexed_time = indexed_time,
+                        title = fname,
+                        url = usable_url,
+                        mimetype='',
+                        owner_email='',
+                        owner_name='',
+                        group='',
+                        repo_name = repo_name,
+                        repo_url = repo_url,
+                        github_user = '',
+                        issue_title = '',
+                        issue_url = '',
+                        content = content
+                )
+            except ValueError as e:
+                print(repr(e))
+                print(" > XXXXXX Failed to index Github markdown file \"%s\""%(fname))

         else:
             print("Indexing github file %s from repo %s"%(fname,repo_name))
@@ -516,24 +556,29 @@ class Search:
             key = fname+"_"+fsha

             # Now create the actual search index record
-            writer.add_document(
-                    id = key,
-                    kind = 'ghfile',
-                    created_time = '',
-                    modified_time = '',
-                    indexed_time = indexed_time,
-                    title = fname,
-                    url = repo_url,
-                    mimetype='',
-                    owner_email='',
-                    owner_name='',
-                    repo_name = repo_name,
-                    repo_url = repo_url,
-                    github_user = '',
-                    issue_title = '',
-                    issue_url = '',
-                    content = ''
-            )
+            try:
+                writer.add_document(
+                        id = key,
+                        kind = 'ghfile',
+                        created_time = None,
+                        modified_time = None,
+                        indexed_time = indexed_time,
+                        title = fname,
+                        url = repo_url,
+                        mimetype='',
+                        owner_email='',
+                        owner_name='',
+                        group='',
+                        repo_name = repo_name,
+                        repo_url = repo_url,
+                        github_user = '',
+                        issue_title = '',
+                        issue_url = '',
+                        content = ''
+                )
+            except ValueError as e:
+                print(repr(e))
+                print(" > XXXXXX Failed to index Github file \"%s\""%(fname))
@@ -547,28 +592,42 @@ class Search:
         Use a Groups.io email thread record to add
         an email thread to the search index.
         """
-        indexed_time = clean_timestamp(datetime.now())
+        if 'created_time' in d.keys() and d['created_time'] is not None:
+            created_time = d['created_time']
+        else:
+            created_time = None
+
+        if 'modified_time' in d.keys() and d['modified_time'] is not None:
+            modified_time = d['modified_time']
+        else:
+            modified_time = None
+
+        indexed_time = datetime.now()

         # Now create the actual search index record
-        writer.add_document(
-                id = d['permalink'],
-                kind = 'emailthread',
-                created_time = '',
-                modified_time = '',
-                indexed_time = indexed_time,
-                title = d['subject'],
-                url = d['permalink'],
-                mimetype='',
-                owner_email='',
-                owner_name=d['original_sender'],
-                repo_name = '',
-                repo_url = '',
-                github_user = '',
-                issue_title = '',
-                issue_url = '',
-                content = d['content']
-        )
+        try:
+            writer.add_document(
+                    id = d['permalink'],
+                    kind = 'emailthread',
+                    created_time = created_time,
+                    modified_time = modified_time,
+                    indexed_time = indexed_time,
+                    title = d['subject'],
+                    url = d['permalink'],
+                    mimetype='',
+                    owner_email='',
+                    owner_name=d['original_sender'],
+                    group=d['subgroup'],
+                    repo_name = '',
+                    repo_url = '',
+                    github_user = '',
+                    issue_title = '',
+                    issue_url = '',
+                    content = d['content']
+            )
+        except ValueError as e:
+            print(repr(e))
+            print(" > XXXXXX Failed to index Groups.io thread \"%s\""%(d['subject']))

         # ------------------------------
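The in d.keys() guards above protect against crawler records that predate the new fields, or where date extraction failed and produced None. dict.get() expresses the same fallback in one line each; a sketch with a hypothetical record:

    from datetime import datetime

    d = {"permalink": "https://example.com/thread/1", "created_time": None}

    # A missing key and a present-but-None value both come out as None,
    # matching the if/else guards in the diff.
    created_time = d.get("created_time")
    modified_time = d.get("modified_time")
    indexed_time = datetime.now()
    print(created_time, modified_time)   # None None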
@@ -581,28 +640,33 @@ class Search:
         to add a disqus comment thread to the
         search index.
         """
-        indexed_time = clean_timestamp(datetime.now())
+        indexed_time = datetime.now()
+        # created_time is already a timestamp

         # Now create the actual search index record
-        writer.add_document(
-                id = d['id'],
-                kind = 'disqus',
-                created_time = d['created_time'],
-                modified_time = '',
-                indexed_time = indexed_time,
-                title = d['title'],
-                url = d['link'],
-                mimetype='',
-                owner_email='',
-                owner_name='',
-                repo_name = '',
-                repo_url = '',
-                github_user = '',
-                issue_title = '',
-                issue_url = '',
-                content = d['content']
-        )
+        try:
+            writer.add_document(
+                    id = d['id'],
+                    kind = 'disqus',
+                    created_time = d['created_time'],
+                    modified_time = None,
+                    indexed_time = indexed_time,
+                    title = d['title'],
+                    url = d['link'],
+                    mimetype='',
+                    owner_email='',
+                    owner_name='',
+                    repo_name = '',
+                    repo_url = '',
+                    github_user = '',
+                    issue_title = '',
+                    issue_url = '',
+                    content = d['content']
+            )
+        except ValueError as e:
+            print(repr(e))
+            print(" > XXXXXX Failed to index Disqus comment thread \"%s\""%(d['title']))
@@ -680,10 +744,10 @@ class Search:
                 full_items[f['id']] = f

             ## Shorter:
-            #break
-            # Longer:
-            if nextPageToken is None:
-                break
+            break
+            ## Longer:
+            #if nextPageToken is None:
+            #    break

         writer = self.ix.writer()
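The toggle above flips the Drive listing loop between a one-page debugging run ("Shorter") and full nextPageToken pagination ("Longer"). A self-contained sketch of that loop, with fake_list() standing in for the Drive v3 files().list() call:

    def fake_list(page_token):
        # Stand-in for service.files().list(...).execute()
        pages = {
            None: {"files": [{"id": "a"}, {"id": "b"}], "nextPageToken": "p2"},
            "p2": {"files": [{"id": "c"}], "nextPageToken": None},
        }
        return pages[page_token]

    full_items = {}
    nextPageToken = None
    while True:
        response = fake_list(nextPageToken)
        for f in response["files"]:
            full_items[f["id"]] = f
        nextPageToken = response["nextPageToken"]
        # The "Shorter" debugging branch breaks unconditionally here;
        # the "Longer" branch walks every page:
        if nextPageToken is None:
            break

    print(sorted(full_items))   # ['a', 'b', 'c']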
@@ -691,34 +755,41 @@ class Search:
         temp_dir = tempfile.mkdtemp(dir=os.getcwd())
         print("Temporary directory: %s"%(temp_dir))

+        try:
             # Drop any id in indexed_ids
             # not in remote_ids
             drop_ids = indexed_ids - remote_ids
             for drop_id in drop_ids:
                 writer.delete_by_term('id',drop_id)

             # Update any id in indexed_ids
             # and in remote_ids
             update_ids = indexed_ids & remote_ids
             for update_id in update_ids:
                 # cop out
                 writer.delete_by_term('id',update_id)
                 item = full_items[update_id]
                 self.add_drive_file(writer, item, temp_dir, config, update=True)
                 count += 1

             # Add any id not in indexed_ids
             # and in remote_ids
             add_ids = remote_ids - indexed_ids
             for add_id in add_ids:
                 item = full_items[add_id]
                 self.add_drive_file(writer, item, temp_dir, config, update=False)
                 count += 1
+        except Exception as e:
+            print("ERROR: While adding Google Drive files to search index")
+            print("-"*40)
+            print(repr(e))
+            print("-"*40)
+            print("Continuing...")
+            pass

         print("Cleaning temporary directory: %s"%(temp_dir))
         subprocess.call(['rm','-fr',temp_dir])
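The drop/update/add bookkeeping that this hunk wraps in try/except is plain set arithmetic over document ids; the try block just keeps one bad file from aborting the whole sync. A self-contained illustration of the three sets:

    indexed_ids = {"a", "b", "c"}     # ids already in the search index
    remote_ids = {"b", "c", "d"}      # ids currently visible on Google Drive

    drop_ids = indexed_ids - remote_ids     # {'a'}: gone remotely, delete
    update_ids = indexed_ids & remote_ids   # {'b', 'c'}: delete and re-add
    add_ids = remote_ids - indexed_ids      # {'d'}: new, index for the first time

    assert drop_ids == {"a"}
    assert update_ids == {"b", "c"}
    assert add_ids == {"d"}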
@@ -1176,7 +1247,7 @@ class Search:
         elif doctype=='issue':
             item_keys = ['title','repo_name','repo_url','url','created_time','modified_time']
         elif doctype=='emailthread':
-            item_keys = ['title','owner_name','url']
+            item_keys = ['title','owner_name','url','created_time','modified_time']
         elif doctype=='disqus':
             item_keys = ['title','created_time','url']
         elif doctype=='ghfile':
@@ -1195,11 +1266,7 @@ class Search:
         for r in results:
             d = {}
             for k in item_keys:
-                if k=='created_time' or k=='modified_time':
-                    #d[k] = r[k]
-                    d[k] = dateutil.parser.parse(r[k]).strftime("%Y-%m-%d")
-                else:
-                    d[k] = r[k]
+                d[k] = r[k]
             json_results.append(d)
         return json_results
@@ -1212,13 +1279,16 @@ class Search:
         query_string = " ".join(query_list)
         query = None
         if ":" in query_string:
             #query = QueryParser("content",
             #                    self.schema
             #).parse(query_string)
             query = QueryParser("content",
                                 self.schema,
                                 termclass=query.Variations
-            ).parse(query_string)
+            )
+            query.add_plugin(DateParserPlugin(free=True))
+            query = query.parse(query_string)
         elif len(fields) == 1 and fields[0] == "filename":
             pass
         elif len(fields) == 2:
@@ -1226,9 +1296,12 @@ class Search:
         else:
             # If the user does not specify a field,
             # these are the fields that are actually searched
-            fields = ['title', 'content','owner_name','owner_email','url']
+            fields = ['title', 'content','owner_name','owner_email','url','created_date','modified_date']

         if not query:
-            query = MultifieldParser(fields, schema=self.ix.schema).parse(query_string)
+            query = MultifieldParser(fields, schema=self.ix.schema)
+            query.add_plugin(DateParserPlugin(free=True))
+            query = query.parse(query_string)
+            #query = MultifieldParser(fields, schema=self.ix.schema).parse(query_string)

         parsed_query = "%s" % query
         print("query: %s" % parsed_query)
         results = searcher.search(query, terms=False, scored=True, groupedby="kind")
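With DATETIME fields in the schema and DateParserPlugin(free=True) attached to the parser, query strings can now express dates and date ranges. Note that the default-field list above adds created_date/modified_date while the schema names the fields created_time/modified_time, which may be related to the "tests do not seem to be working" commit message. A standalone sketch of the plugin wiring (query strings are illustrative):

    from whoosh.fields import Schema, TEXT, DATETIME
    from whoosh.qparser import QueryParser
    from whoosh.qparser.dateparse import DateParserPlugin

    schema = Schema(content=TEXT, created_time=DATETIME)

    parser = QueryParser("content", schema)
    parser.add_plugin(DateParserPlugin(free=True))

    # Explicit date-field syntax:
    q1 = parser.parse("created_time:20180824")
    # Quoted, human-readable dates also parse:
    q2 = parser.parse("created_time:'24 aug 2018'")
    print(q1)
    print(q2)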

View File

@@ -1,6 +1,7 @@
 import os, re
 import requests
 import json
+import dateutil.parser
 from pprint import pprint
@@ -123,7 +124,7 @@ class DisqusCrawler(object):
         # We need to make this value a dictionary
         thread_info = dict(
             id = response['id'],
-            created_time = response['createdAt'],
+            created_time = dateutil.parser.parse(response['createdAt']),
             title = response['title'],
             forum = response['forum'],
             link = clean_link,

View File

@@ -1,5 +1,7 @@
 import requests, os, re
 from bs4 import BeautifulSoup
+import dateutil.parser
+import datetime

 class GroupsIOException(Exception):
     pass
@@ -64,7 +66,7 @@ class GroupsIOArchivesCrawler(object):
                 ## Short circuit
                 ## for debugging purposes
-                #break
+                break

         return subgroups
@@ -251,7 +253,7 @@ class GroupsIOArchivesCrawler(object):
         subject = soup.find('title').text

         # Extract information for the schema:
-        # - permalink for thread (done)
+        # - permalink for thread (done above)
         # - subject/title (done)
         # - original sender email/name (done)
         # - content (done)
@@ -266,11 +268,35 @@ class GroupsIOArchivesCrawler(object):
                 pass
             else:
                 # found an email!
-                # this is a maze, thanks groups.io
+                # this is a maze, not amazing.
+                # thanks groups.io!
                 td = tr.find('td')
-                divrow = td.find('div',{'class':'row'}).find('div',{'class':'pull-left'})
+
+                sender_divrow = td.find('div',{'class':'row'})
+                sender_divrow = sender_divrow.find('div',{'class':'pull-left'})
                 if (i+1)==1:
-                    original_sender = divrow.text.strip()
+                    original_sender = sender_divrow.text.strip()
+
+                date_divrow = td.find('div',{'class':'row'})
+                date_divrow = date_divrow.find('div',{'class':'pull-right'})
+                date_divrow = date_divrow.find('font',{'class':'text-muted'})
+                date_divrow = date_divrow.find('script').text
+                try:
+                    time_seconds = re.search(' [0-9]{1,} ',date_divrow).group(0)
+                    time_seconds = time_seconds.strip()
+                    # Thanks groups.io for the weird date formatting
+                    time_seconds = time_seconds[:10]
+                    mmicro_seconds = time_seconds[10:]
+                    if (i+1)==1:
+                        created_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
+                        modified_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
+                    else:
+                        modified_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
+                except AttributeError:
+                    created_time = None
+                    modified_time = None

                 for div in td.find_all('div'):
                     if div.has_attr('id'):
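The script text scraped above embeds the post time as a 13-digit epoch value in milliseconds, so the [:10] slice keeps the whole-second part before utcfromtimestamp. A self-contained sketch of that extraction (the script text is illustrative; note the diff computes mmicro_seconds from the already-truncated string, so it always ends up empty):

    import datetime
    import re

    # Illustrative shape of the <script> contents on a groups.io archive page:
    script_text = "document.write(ConvertDate( 1535068800000 ));"

    epoch_ms = re.search(' [0-9]{1,} ', script_text).group(0).strip()
    epoch_s = epoch_ms[:10]      # first 10 of 13 digits = whole seconds
    created_time = datetime.datetime.utcfromtimestamp(int(epoch_s))
    print(created_time)          # 2018-08-24 00:00:00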
@@ -299,7 +325,10 @@ class GroupsIOArchivesCrawler(object):
                 thread = {
                     'permalink' : permalink,
+                    'created_time' : created_time,
+                    'modified_time' : modified_time,
                     'subject' : subject,
+                    'subgroup' : subgroup_name,
                     'original_sender' : original_sender,
                     'content' : full_content
                 }
@@ -324,11 +353,13 @@ class GroupsIOArchivesCrawler(object):
         results = []
         for row in rows:
-            # We don't care about anything except title and ugly link
+            # This is where we extract
+            # a list of thread titles
+            # and corresponding links.
             subject = row.find('span',{'class':'subject'})
             title = subject.get_text()
             link = row.find('a')['href']
+            #print(title)
             results.append((title,link))

         return results