32 Commits
1.0 ... master

Author SHA1 Message Date
de796880c5 Merge branch 'master' of github.com:charlesreid1/centillion
* 'master' of github.com:charlesreid1/centillion:
  update config_flask.example.py to strip dc info
2018-08-13 19:14:54 -07:00
f79f711a38 Merge branch 'master' of github.com:dcppc/centillion
* 'master' of github.com:dcppc/centillion:
  Update Readme.md
2018-08-13 19:14:07 -07:00
00b862b83e Merge branch 'master' of ssh://git.charlesreid1.com:222/charlesreid1/centillion
* 'master' of ssh://git.charlesreid1.com:222/charlesreid1/centillion:
2018-08-13 19:13:53 -07:00
a06c3b645a Update Readme.md 2018-08-13 12:42:18 -07:00
878ff011fb locked out by rate limit, but otherwise successful in indexing so far. 2018-08-13 00:54:12 -07:00
33cf78a524 successfully grabbing threads from 1st page of every subgroup 2018-08-13 00:27:45 -07:00
c1bcd8dc22 add import pdb where things are currently stuck 2018-08-12 20:25:29 -07:00
757e9d79a1 keep going with spider idea 2018-08-12 20:24:29 -07:00
c47682adb4 fix typo with groupsio key 2018-08-12 20:13:45 -07:00
f2662c3849 adding calls to index groupsio emails
this is currently work in progress.
we have a debug statement in place as a bookmark.

we are currently:
- creating a login session
- getting all the subgroups
- going to first subgroup
- getting list of titles and links
- getting emails for each title and link

still need to:
- figure out how to assemble email {}
- assemble content/etc and how to parse text of emails
2018-08-12 18:00:33 -07:00
2478a3f857 Merge branch 'dcppc' of github.com:dcppc/centillion into dcppc
* 'dcppc' of github.com:dcppc/centillion:
  fix how search results are bundled
  fix search template
2018-08-10 06:05:44 -07:00
f174080dfd catch exception when file info not found 2018-08-10 06:05:33 -07:00
ca8b12db06 Merge pull request #2 from charlesreid1/dcppc-merge-master
Merge dcppc changes into master
2018-08-10 05:49:29 -07:00
a1ffdad292 Merge branch 'master' into dcppc-merge-master 2018-08-10 05:49:19 -07:00
ce76396096 update config_flask.example.py to strip dc info 2018-08-10 05:46:07 -07:00
175ff4f71d Merge pull request #17 from dcppc/github-files
fix search template
2018-08-09 18:57:30 -07:00
94f956e2d0 fix how search results are bundled 2018-08-09 18:56:56 -07:00
dc015671fc fix search template 2018-08-09 18:55:49 -07:00
1e9eec81d7 make it valid json 2018-08-09 18:15:14 -07:00
31e12476af Merge pull request #16 from dcppc/inception
add inception
2018-08-09 18:08:11 -07:00
bbe4e32f63 Merge pull request #15 from dcppc/github-files
index all github filenames, not just markdown
2018-08-09 18:07:56 -07:00
5013741958 while we're at it 2018-08-09 17:40:56 -07:00
1ce80a5da0 closes #11 2018-08-09 17:38:20 -07:00
3ed967bd8b remove unused function 2018-08-09 17:28:22 -07:00
1eaaa32007 index all github filenames, not just markdown 2018-08-09 17:25:09 -07:00
9c7e696b6a Merge branch 'master' of ssh://git.charlesreid1.com:222/charlesreid1/centillion
* 'master' of ssh://git.charlesreid1.com:222/charlesreid1/centillion:
  Move images, resize images, update image markdown in readme
  update readme to use <img> tags
  merge image files in from master
  fix <title>
  fix the readme to reflect current state of things/links/descriptions
  fix typos/wording in readme
  adding changes to enable https, update callback to http, and everything still passes through https (proxy)
  update footer repo info
  update screen shots
  add mkdocs-material-dib submodule
  remove mkdocs material submodule
  update tagline
  update tagline
  add _example_ config file for flask
2018-08-09 16:39:18 -07:00
262a0c19e7 Merge pull request #14 from dcppc/local-fixes
Fix centillion to work for local instances
2018-08-09 16:37:37 -07:00
bd2714cc0b Merge branch 'dcppc' into local-fixes 2018-08-09 16:36:34 -07:00
899d6fed53 comment out localhost only env var 2018-08-09 16:25:37 -07:00
a7756049e5 revert changes 2018-08-09 16:23:42 -07:00
3df427a8f8 fix how existing issues in search index are collected. closes #10 2018-08-09 16:17:17 -07:00
0dd06748de fix centillion to work for local instance 2018-08-09 16:16:30 -07:00
8 changed files with 559 additions and 125 deletions

View File

@@ -1,4 +1,4 @@
# The Centillion
# Centillion
**centillion**: a pan-github-markdown-issues-google-docs search engine.

View File

@@ -27,10 +27,16 @@ You provide:
class UpdateIndexTask(object):
def __init__(self, gh_access_token, diff_index=False):
def __init__(self, app_config, diff_index=False):
self.diff_index = diff_index
thread = threading.Thread(target=self.run, args=())
self.gh_access_token = gh_access_token
self.gh_token = app_config['GITHUB_TOKEN']
self.groupsio_credentials = {
'groupsio_token' : app_config['GROUPSIO_TOKEN'],
'groupsio_username' : app_config['GROUPSIO_USERNAME'],
'groupsio_password' : app_config['GROUPSIO_PASSWORD']
}
thread.daemon = True
thread.start()
@@ -43,9 +49,10 @@ class UpdateIndexTask(object):
from get_centillion_config import get_centillion_config
config = get_centillion_config('config_centillion.json')
search.update_index_issues(self.gh_access_token,config)
search.update_index_markdown(self.gh_access_token,config)
search.update_index_gdocs(config)
search.update_index_groupsioemails(self.groupsio_credentials,config)
###search.update_index_ghfiles(self.gh_token,config)
###search.update_index_issues(self.gh_token,config)
###search.update_index_gdocs(config)
@@ -172,12 +179,9 @@ def update_index():
mresp = github.get('/teams/%s/members/%s'%(copper_team_id,username))
if mresp.status_code==204:
#gh_oauth_token = github.token['access_token']
gh_access_token = app.config['GITHUB_TOKEN']
# --------------------
# Business as usual
UpdateIndexTask(gh_access_token,
UpdateIndexTask(app.config,
diff_index=False)
flash("Rebuilding index, check console output")
return render_template("controlpanel.html",
@@ -218,6 +222,7 @@ def oops(e):
return contents404
if __name__ == '__main__':
# if running local instance, set to true
os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = 'true'
app.run(host="0.0.0.0",port=5000)
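A minimal sketch of what the refactor in the hunks above changes for callers: UpdateIndexTask now takes the whole Flask config instead of a bare Github token, and pulls the Github and Groups.io credentials out of it itself (key names are taken from the diff; the dict below is a hypothetical stand-in for app.config and the values are placeholders):

# Hypothetical stand-in for app.config; values are placeholders.
app_config = {
    'GITHUB_TOKEN'      : 'XXX',
    'GROUPSIO_TOKEN'    : 'XXX',
    'GROUPSIO_USERNAME' : 'user@example.com',
    'GROUPSIO_PASSWORD' : 'XXX',
}

# Old call: UpdateIndexTask(gh_access_token, diff_index=False)
# New call: hand over the config and let the task unpack it.
UpdateIndexTask(app_config, diff_index=False)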

View File

@@ -5,6 +5,7 @@ from github import Github, GithubException
import base64
from gdrive_util import GDrive
from groupsio_util import GroupsIOArchivesCrawler
from apiclient.http import MediaIoBaseDownload
import mistune
@@ -128,7 +129,6 @@ class Search:
schema = Schema(
id = ID(stored=True, unique=True),
kind = ID(stored=True),
#fingerprint = ID(stored=True),
created_time = ID(stored=True),
modified_time = ID(stored=True),
@@ -266,7 +266,6 @@ class Search:
# If export was successful, read contents of markdown
# into the content variable.
# into the content variable.
if os.path.isfile(fullpath_output):
# Export was successful
with codecs.open(fullpath_output, encoding='utf-8') as f:
@@ -276,12 +275,14 @@ class Search:
# No matter what happens, clean up.
print(" > Cleaning up \"%s\""%item['name'])
subprocess.call(['rm','-fr',fullpath_output])
## test
#print(" ".join(['rm','-fr',fullpath_output]))
subprocess.call(['rm','-fr',fullpath_input])
#print(" ".join(['rm','-fr',fullpath_input]))
# do it
subprocess.call(['rm','-fr',fullpath_output])
subprocess.call(['rm','-fr',fullpath_input])
if update:
print(" > Removing old record")
writer.delete_by_term('id',item['id'])
@@ -315,7 +316,7 @@ class Search:
# to a search index.
def add_issue(self, writer, issue, gh_access_token, config, update=True):
def add_issue(self, writer, issue, gh_token, config, update=True):
"""
Add a Github issue/comment to a search index.
"""
@@ -367,71 +368,101 @@ class Search:
def add_markdown(self, writer, d, gh_access_token, config, update=True):
def add_ghfile(self, writer, d, gh_token, config, update=True):
"""
Use a Github markdown document API record
to add a markdown document's contents to
the search index.
Use a Github file API record to add a filename
to the search index.
"""
MARKDOWN_EXTS = ['.md','.markdown']
repo = d['repo']
org = d['org']
repo_name = org + "/" + repo
repo_url = "https://github.com/" + repo_name
fpath = d['path']
furl = d['url']
fsha = d['sha']
_, fname = os.path.split(fpath)
_, fext = os.path.splitext(fpath)
print("Indexing markdown doc %s from repo %s"%(fname,repo_name))
# Unpack the requests response and decode the content
#
# don't forget the headers for private repos!
# useful: https://bit.ly/2LSAflS
headers = {'Authorization' : 'token %s'%(gh_access_token)}
response = requests.get(furl, headers=headers)
if response.status_code==200:
jresponse = response.json()
content = ""
try:
binary_content = re.sub('\n','',jresponse['content'])
content = base64.b64decode(binary_content).decode('utf-8')
except KeyError:
print(" > XXXXXXXX Failed to extract 'content' field. You probably hit the rate limit.")
else:
print(" > XXXXXXXX Failed to reach file URL. There may be a problem with authentication/headers.")
try:
fpath = d['path']
furl = d['url']
fsha = d['sha']
_, fname = os.path.split(fpath)
_, fext = os.path.splitext(fpath)
except:
print(" > XXXXXXXX Failed to find file info.")
return
# Now create the actual search index record
indexed_time = clean_timestamp(datetime.now())
usable_url = "https://github.com/%s/blob/master/%s"%(repo_name, fpath)
if fext in MARKDOWN_EXTS:
print("Indexing markdown doc %s from repo %s"%(fname,repo_name))
# Add one document per issue thread,
# containing entire text of thread.
writer.add_document(
id = fsha,
kind = 'markdown',
created_time = '',
modified_time = '',
indexed_time = indexed_time,
title = fname,
url = usable_url,
mimetype='',
owner_email='',
owner_name='',
repo_name = repo_name,
repo_url = repo_url,
github_user = '',
issue_title = '',
issue_url = '',
content = content
)
# Unpack the requests response and decode the content
#
# don't forget the headers for private repos!
# useful: https://bit.ly/2LSAflS
headers = {'Authorization' : 'token %s'%(gh_token)}
response = requests.get(furl, headers=headers)
if response.status_code==200:
jresponse = response.json()
content = ""
try:
binary_content = re.sub('\n','',jresponse['content'])
content = base64.b64decode(binary_content).decode('utf-8')
except KeyError:
print(" > XXXXXXXX Failed to extract 'content' field. You probably hit the rate limit.")
else:
print(" > XXXXXXXX Failed to reach file URL. There may be a problem with authentication/headers.")
return
usable_url = "https://github.com/%s/blob/master/%s"%(repo_name, fpath)
# Now create the actual search index record
writer.add_document(
id = fsha,
kind = 'markdown',
created_time = '',
modified_time = '',
indexed_time = indexed_time,
title = fname,
url = usable_url,
mimetype='',
owner_email='',
owner_name='',
repo_name = repo_name,
repo_url = repo_url,
github_user = '',
issue_title = '',
issue_url = '',
content = content
)
else:
print("Indexing github file %s from repo %s"%(fname,repo_name))
key = fname+"_"+fsha
# Now create the actual search index record
writer.add_document(
id = key,
kind = 'ghfile',
created_time = '',
modified_time = '',
indexed_time = indexed_time,
title = fname,
url = repo_url,
mimetype='',
owner_email='',
owner_name='',
repo_name = repo_name,
repo_url = repo_url,
github_user = '',
issue_title = '',
issue_url = '',
content = ''
)
@@ -559,7 +590,7 @@ class Search:
# ------------------------------
# Github Issues/Comments
def update_index_issues(self, gh_access_token, config):
def update_index_issues(self, gh_token, config):
"""
Update the search index using a collection of
Github repo issues and comments.
@@ -574,7 +605,7 @@ class Search:
# ------
indexed_issues = set()
p = QueryParser("kind", schema=self.ix.schema)
q = p.parse("gdoc")
q = p.parse("issue")
with self.ix.searcher() as s:
results = s.search(q,limit=None)
for result in results:
@@ -584,7 +615,7 @@ class Search:
# Get the set of remote ids:
# ------
# Start with api object
g = Github(gh_access_token)
g = Github(gh_token)
# Now index all issue threads in the user-specified repos
@@ -638,7 +669,7 @@ class Search:
# cop out
writer.delete_by_term('id',update_issue)
item = full_items[update_issue]
self.add_issue(writer, item, gh_access_token, config, update=True)
self.add_issue(writer, item, gh_token, config, update=True)
count += 1
@@ -647,7 +678,7 @@ class Search:
add_issues = remote_issues - indexed_issues
for add_issue in add_issues:
item = full_items[add_issue]
self.add_issue(writer, item, gh_access_token, config, update=False)
self.add_issue(writer, item, gh_token, config, update=False)
count += 1
@@ -657,15 +688,14 @@ class Search:
# ------------------------------
# Github Markdown Files
# Github Files
def update_index_markdown(self, gh_access_token, config):
def update_index_ghfiles(self, gh_token, config):
"""
Update the search index using a collection of
Markdown files from a Github repo.
files (and, separately, Markdown files) from
a Github repo.
"""
EXT = '.md'
# Updated algorithm:
# - get set of indexed ids
# - get set of remote ids
@@ -676,6 +706,12 @@ class Search:
# ------
indexed_ids = set()
p = QueryParser("kind", schema=self.ix.schema)
q = p.parse("ghfiles")
with self.ix.searcher() as s:
results = s.search(q,limit=None)
for result in results:
indexed_ids.add(result['id'])
q = p.parse("markdown")
with self.ix.searcher() as s:
results = s.search(q,limit=None)
@@ -685,10 +721,9 @@ class Search:
# Get the set of remote ids:
# ------
# Start with api object
g = Github(gh_access_token)
g = Github(gh_token)
# Now index all markdown files
# in the user-specified repos
# Now index all the files.
# Start by collecting all the things
remote_ids = set()
@@ -711,9 +746,6 @@ class Search:
continue
# ---------
# begin markdown-specific code
# Get head commit
commits = repo.get_commits()
try:
@@ -726,31 +758,29 @@ class Search:
# Get all the docs
tree = repo.get_git_tree(sha=sha, recursive=True)
docs = tree.raw_data['tree']
print("Parsing doc ids from repository %s"%(r))
print("Parsing file ids from repository %s"%(r))
for d in docs:
# For each doc, get the file extension
# If it matches EXT, download the file
# and decide what to do with it.
fpath = d['path']
_, fname = os.path.split(fpath)
_, fext = os.path.splitext(fpath)
if fext==EXT:
key = d['sha']
key = d['sha']
d['org'] = this_org
d['repo'] = this_repo
value = d
d['org'] = this_org
d['repo'] = this_repo
value = d
# Stash the doc for later
remote_ids.add(key)
full_items[key] = value
remote_ids.add(key)
full_items[key] = value
writer = self.ix.writer()
count = 0
# Drop any id in indexed_ids
# not in remote_ids
drop_ids = indexed_ids - remote_ids
@@ -765,7 +795,7 @@ class Search:
# cop out: just delete and re-add
writer.delete_by_term('id',update_id)
item = full_items[update_id]
self.add_markdown(writer, item, gh_access_token, config, update=True)
self.add_ghfile(writer, item, gh_token, config, update=True)
count += 1
@@ -774,12 +804,12 @@ class Search:
add_ids = remote_ids - indexed_ids
for add_id in add_ids:
item = full_items[add_id]
self.add_markdown(writer, item, gh_access_token, config, update=False)
self.add_ghfile(writer, item, gh_token, config, update=False)
count += 1
writer.commit()
print("Done, updated %d markdown documents in the index" % count)
print("Done, updated %d Github files in the index" % count)
@@ -787,10 +817,27 @@ class Search:
# Groups.io Emails
#def update_index_markdown(self, gh_access_token, config):
def update_index_groupsioemails(self, groupsio_token, config):
"""
Update the search index using the email archives
of groups.io groups.
This requires the use of a spider.
RELEASE THE SPIDER!!!
"""
spider = GroupsIOArchivesCrawler(groupsio_token,'dcppc')
# - ask spider to crawl the archives
spider.crawl_group_archives()
# - ask spider for list of all email records
# - 1 email = 1 dictionary
# - email records compiled by the spider
archives = spider.get_archives()
# - email object is sent off to add email method
print("Finished indexing groups.io emails")
# ---------------------------------
@@ -900,31 +947,27 @@ class Search:
def get_document_total_count(self):
p = QueryParser("kind", schema=self.ix.schema)
kind_labels = {
"documents" : "gdoc",
"markdown" : "markdown",
"issues" : "issue",
}
counts = {
"documents" : None,
"gdoc" : None,
"issue" : None,
"ghfile" : None,
"markdown" : None,
"issues" : None,
"total" : None
}
for key in kind_labels:
kind = kind_labels[key]
q = p.parse(kind)
for key in counts.keys():
q = p.parse(key)
with self.ix.searcher() as s:
results = s.search(q,limit=None)
counts[key] = len(results)
## These two should NOT be different, but they are...
#counts['total'] = self.ix.searcher().doc_count_all()
counts['total'] = counts['documents'] + counts['markdown'] + counts['issues']
counts['total'] = sum(counts[k] for k in counts.keys())
return counts
if __name__ == "__main__":
raise Exception("Error: main method not implemented (fix groupsio credentials first)")
search = Search("search_index")
from get_centillion_config import get_centillion_config
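A minimal sketch of how the renamed and added indexing methods in this file fit together, following the intent of the __main__ block above (which currently raises until Groups.io credentials are wired in) and the commented-out calls in UpdateIndexTask; gh_token and groupsio_credentials are assumed to come from the Flask config as shown earlier:

search = Search("search_index")
config = get_centillion_config('config_centillion.json')

# Github issues and repository files (update_index_markdown is now update_index_ghfiles)
search.update_index_issues(gh_token, config)
search.update_index_ghfiles(gh_token, config)

# Google Drive documents
search.update_index_gdocs(config)

# Groups.io email archives, via the new spider in groupsio_util.py
search.update_index_groupsioemails(groupsio_credentials, config)

# Per-kind document counts (gdoc, issue, ghfile, markdown, total)
print(search.get_document_total_count())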

View File

@@ -6,7 +6,6 @@
"dcppc/organize",
"dcppc/dcppc-bot",
"dcppc/full-stacks",
"dcppc/markdown-issues",
"dcppc/design-guidelines-discuss",
"dcppc/dcppc-deliverables",
"dcppc/dcppc-milestones",
@@ -22,6 +21,7 @@
"dcppc/2018-august-workshop",
"dcppc/2018-september-workshop",
"dcppc/design-guidelines",
"dcppc/2018-may-workshop"
"dcppc/2018-may-workshop",
"dcppc/centillion"
]
}

View File

@@ -7,14 +7,14 @@ GITHUB_OAUTH_CLIENT_SECRET = "YYY"
GITHUB_TOKEN = "ZZZ"
# More information footer: Repository label
FOOTER_REPO_ORG = "dcppc"
FOOTER_REPO_ORG = "charlesreid1"
FOOTER_REPO_NAME = "centillion"
# Toggle to show Whoosh parsed query
SHOW_PARSED_QUERY=True
TAGLINE = "Search the Data Commons"
TAGLINE = "Search All The Things"
# Flask settings
DEBUG = True
SECRET_KEY = '42c5a8eda356ca9d9c3ab2d149541e6b91d843fa'
SECRET_KEY = 'WWWWW'
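Not shown in this example file, but the refactored UpdateIndexTask above also reads Groups.io credentials from the Flask config, so a working config_flask.py presumably needs entries along these lines (key names taken from the earlier hunk; values are placeholders):

# Groups.io API token and login credentials (placeholders)
GROUPSIO_TOKEN = "XXX"
GROUPSIO_USERNAME = "user@example.com"
GROUPSIO_PASSWORD = "XXX"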

groupsio_util.py (new file, 382 lines)
View File

@@ -0,0 +1,382 @@
import requests, os, re
from bs4 import BeautifulSoup
class GroupsIOArchivesCrawler(object):
"""
This is a Groups.io spider
designed to crawl the email
archives of a group.
credentials (dictionary):
groupsio_token : api access token
groupsio_username : username
groupsio_password : password
"""
def __init__(self,
credentials,
group_name):
# template url for archives page (list of topics)
self.url = "https://{group}.groups.io/g/{subgroup}/topics"
self.login_url = "https://groups.io/login"
self.credentials = credentials
self.group_name = group_name
self.crawled_archives = False
self.archives = None
def get_archives(self):
"""
Return a list of dictionaries containing
information about each email topic in the
groups.io email archive.
Call crawl_group_archives() first!
"""
return self.archives
def get_subgroups_list(self):
"""
Use the API to get a list of subgroups.
"""
subgroups_url = 'https://api.groups.io/v1/getsubgroups'
key = self.credentials['groupsio_token']
data = [('group_name', self.group_name),
('limit',100)
]
response = requests.post(subgroups_url,
data=data,
auth=(key,''))
response = response.json()
data = response['data']
subgroups = {}
for group in data:
k = group['id']
v = re.sub(r'dcppc\+','',group['name'])
subgroups[k] = v
return subgroups
def crawl_group_archives(self):
"""
Spider will crawl the email archives of the entire group
by crawling the email archives of each subgroup.
"""
subgroups = self.get_subgroups_list()
# ------------------------------
# Start by logging in.
# Create session object to persist session data
session = requests.Session()
# Log in to the website
data = dict(email = self.credentials['groupsio_username'],
password = self.credentials['groupsio_password'],
timezone = 'America/Los_Angeles')
r = session.post(self.login_url,
data = data)
csrf = self.get_csrf(r)
# ------------------------------
# For each subgroup, crawl the archives
# and return a list of dictionaries
# containing all the email threads.
for subgroup_id in subgroups.keys():
self.crawl_subgroup_archives(session,
csrf,
subgroup_id,
subgroups[subgroup_id])
# Done. archives are now tucked away
# in the variable self.archives
#
# self.archives is a list of dictionaries,
# with each dictionary containing info about
# a topic/email thread in a subgroup.
# ------------------------------
def crawl_subgroup_archives(self, session, csrf, subgroup_id, subgroup_name):
"""
This kicks off the process to crawl the entire
archives of a given subgroup on groups.io.
For a given subgroup the url is self.url,
https://{group}.groups.io/g/{subgroup}/topics
This is the first of a paginated list of topics.
Procedure is:
- passed a starting page (or its contents)
- iterate through all topics via the HTML page elements
- assemble a bundle of information about each topic:
- topic title, by, URL, date, content, permalink
- content filtering:
- ^From, Reply-To, Date, To, Subject
- Lines containing phone numbers
- 9 digits
- XXX-XXX-XXXX, (XXX) XXX-XXXX
- XXXXXXXXXX, XXX XXX XXXX
- ^Work: or (Work) or Work$
- Home, Cell, Mobile
- +1 XXX
- \w@\w
- while next button is not greyed out,
- click the next button
everything stored in self.archives:
list of dictionaries.
"""
self.archives = []
prefix = "https://{group}.groups.io".format(group=self.group_name)
url = self.url.format(group=self.group_name,
subgroup=subgroup_name)
# ------------------------------
# Now get the first page
r = session.get(url)
# ------------------------------
# Fencepost algorithm:
# First page:
# Extract a list of (title, link) items
items = self.extract_archive_page_items_(r)
# Get the next link
next_url = self.get_next_url_(r)
# Now add each item to the archive of threads,
# then find the next button.
self.add_items_to_archives_(session,subgroup_name,items)
if next_url is None:
return
else:
full_next_url = prefix + next_url
# Now click the next button
next_request = requests.get(full_next_url)
while next_request.status_code==200:
items = self.extract_archive_page_items_(next_request)
next_url = self.get_next_url_(next_request)
self.add_items_to_archives_(session,subgroup_name,items)
if next_url is None:
return
else:
full_next_url = prefix + next_url
next_request = requests.get(full_next_url)
def add_items_to_archives_(self,session,subgroup_name,items):
"""
Given a set of items from a list of threads,
items being title and link,
get the page and store all info
in self.archives variable
(list of dictionaries)
"""
for (title, link) in items:
# Get the thread page:
prefix = "https://{group}.groups.io".format(group=self.group_name)
full_link = prefix + link
r = session.get(full_link)
soup = BeautifulSoup(r.text,'html.parser')
# soup contains the entire thread
# What are we extracting:
# 1. thread number
# 2. permalink
# 3. content/text (filtered)
# - - - - - - - - - - - - - -
# 1. topic/thread number:
# <a rel="nofollow" href="">
# where link is:
# https://{group}.groups.io/g/{subgroup}/topic/{topic_id}
# example topic id: 24209140
#
# ugly links are in the form
# https://dcppc.groups.io/g/{subgroup}/topic/some_text_here/{thread_id}?p=,,,,,1,2,3,,,4,,5
# split at ?, 0th portion
# then split at /, last (-1th) portion
topic_id = link.split('?')[0].split('/')[-1]
# - - - - - - - - - - - - - - -
# 2. permalink:
# - current link is ugly link
# - permalink is the nice one
# - topic id is available from the ugly link
# https://{group}.groups.io/g/{subgroup}/topic/{topic_id}
permalink_template = "https://{group}.groups.io/g/{subgroup}/topic/{topic_id}"
permalink = permalink_template.format(
group = self.group_name,
subgroup = subgroup_name,
topic_id = topic_id
)
# - - - - - - - - - - - - - - -
# 3. content:
# Need to rearrange how we're assembling threads here.
# This is one thread, no?
content = []
subject = soup.find('title').text
# Extract information for the schema:
# - permalink for thread (done)
# - subject/title (done)
# - original sender email/name (done)
# - content (done)
# Groups.io pages have zero CSS classes, which makes everything
# a giant pain in the neck to interact with. Thanks Groups.io!
original_sender = ''
for i, tr in enumerate(soup.find_all('tr',{'class':'test'})):
# Every other tr row contains an email.
if (i+1)%2==0:
# nope, no email here
pass
else:
# found an email!
# this is a maze, thanks groups.io
td = tr.find('td')
divrow = td.find('div',{'class':'row'}).find('div',{'class':'pull-left'})
if (i+1)==1:
original_sender = divrow.text.strip()
for div in td.find_all('div'):
if div.has_attr('id'):
# purge any signatures
for x in div.find_all('div',{'id':'Signature'}):
x.extract()
# purge any headers
for x in div.find_all('div'):
nonos = ['From:','Sent:','To:','Cc:','CC:','Subject:']
for nono in nonos:
if nono in x.text:
x.extract()
message_text = div.get_text()
# More filtering:
# phone numbers
message_text = re.sub(r'[0-9]{3}-[0-9]{3}-[0-9]{4}','XXX-XXX-XXXX',message_text)
message_text = re.sub(r'[0-9]\{10\}','XXXXXXXXXX',message_text)
content.append(message_text)
full_content = "\n".join(content)
thread = {
'permalink' : permalink,
'subject' : subject,
'original_sender' : original_sender,
'content' : full_content
}
print('*'*40)
for k in thread.keys():
if k=='content':
pass
else:
print("%s : %s"%(k,thread[k]))
print('*'*40)
self.archives.append(thread)
def extract_archive_page_items_(self, response):
"""
(Private method)
Given a response from a GET request,
use beautifulsoup to extract all items
(thread titles and ugly thread links)
and pass them back in a list.
"""
soup = BeautifulSoup(response.content,"html.parser")
rows = soup.find_all('tr',{'class':'test'})
if 'rate limited' in soup.text:
raise Exception("Error: rate limit in place for Groups.io")
results = []
for row in rows:
# We don't care about anything except title and ugly link
subject = row.find('span',{'class':'subject'})
title = subject.get_text()
link = row.find('a')['href']
print(title)
results.append((title,link))
return results
def get_next_url_(self, response):
"""
(Private method)
Given a response (which is a list of threads),
find the next button and return the URL.
If there is no next URL, or it is disabled, return None.
"""
soup = BeautifulSoup(response.text,'html.parser')
chevron = soup.find('i',{'class':'fa-chevron-right'})
try:
if '#' in chevron.parent['href']:
# empty link, abort
return None
except AttributeError:
# I don't even know
return None
if chevron.parent.parent.has_attr('class') and 'disabled' in chevron.parent.parent['class']:
# no next link, abort
return None
return chevron.parent['href']
def get_csrf(self,resp):
"""
Find the CSRF token embedded in the subgroup page
"""
soup = BeautifulSoup(resp.text,'html.parser')
csrf = ''
for i in soup.find_all('input'):
# Note that i.name is different from i['name']
# the first is the actual tag,
# the second is the attribute name="xyz"
if i['name']=='csrf':
csrf = i['value']
if csrf=='':
err = "ERROR: Could not find csrf token on page."
raise Exception(err)
return csrf
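A minimal usage sketch of the new crawler class defined above; the credentials dictionary mirrors the one assembled in UpdateIndexTask, and the values are placeholders:

credentials = {
    'groupsio_token'    : 'XXX',
    'groupsio_username' : 'user@example.com',
    'groupsio_password' : 'XXX',
}
spider = GroupsIOArchivesCrawler(credentials, 'dcppc')

# Crawl every subgroup's archive, then collect the results:
# one dictionary per topic with permalink, subject, original_sender, content.
spider.crawl_group_archives()
for thread in spider.get_archives():
    print(thread['subject'], thread['permalink'])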

View File

@@ -10,3 +10,4 @@ pypandoc>=1.4
requests>=2.19
pandoc>=1.0
flask-dance>=1.0.0
beautifulsoup4>=4.6

View File

@@ -86,9 +86,11 @@
<div class="container-fluid">
<div class="row">
<div class="col-xs-12 info">
<b>Indexing:</b> <span class="badge">{{totals["documents"]}}</span> Google Documents,
<span class="badge">{{totals["issues"]}}</span> Github issues,
<span class="badge">{{totals["markdown"]}}</span> markdown files.
<b>Indexing:</b> <span
class="badge">{{totals["gdoc"]}}</span> Google Documents,
<span class="badge">{{totals["issue"]}}</span> Github issues,
<span class="badge">{{totals["ghfile"]}}</span> Github files,
<span class="badge">{{totals["markdown"]}}</span> Github markdown files.
</div>
</div>
</div>
@@ -107,14 +109,15 @@
<div class="url">
{% if e.kind=="gdoc" %}
{% if e.mimetype=="document" %}
{% if e.mimetype=="" %}
<b>Google Document:</b>
<a href='{{e.url}}'>{{e.title}}</a>
(Type: {{e.mimetype}}, Owner: {{e.owner_name}}, {{e.owner_email}})
(Owner: {{e.owner_name}}, {{e.owner_email}})<br />
<b>Document Type</b>: {{e.mimetype}}
{% else %}
<b>Google Drive:</b>
<a href='{{e.url}}'>{{e.title}}</a>
(Type: {{e.mimetype}}, Owner: {{e.owner_name}}, {{e.owner_email}})
(Owner: {{e.owner_name}}, {{e.owner_email}})
{% endif %}
{% elif e.kind=="issue" %}