32 Commits
1.0 ... master

Author SHA1 Message Date
de796880c5 Merge branch 'master' of github.com:charlesreid1/centillion
* 'master' of github.com:charlesreid1/centillion:
  update config_flask.example.py to strip dc info
2018-08-13 19:14:54 -07:00
f79f711a38 Merge branch 'master' of github.com:dcppc/centillion
* 'master' of github.com:dcppc/centillion:
  Update Readme.md
2018-08-13 19:14:07 -07:00
00b862b83e Merge branch 'master' of ssh://git.charlesreid1.com:222/charlesreid1/centillion
* 'master' of ssh://git.charlesreid1.com:222/charlesreid1/centillion:
2018-08-13 19:13:53 -07:00
a06c3b645a Update Readme.md 2018-08-13 12:42:18 -07:00
878ff011fb locked out by rate limit, but otherwise successful in indexing so far. 2018-08-13 00:54:12 -07:00
33cf78a524 successfully grabbing threads from 1st page of every subgroup 2018-08-13 00:27:45 -07:00
c1bcd8dc22 add import pdb where things are currently stuck 2018-08-12 20:25:29 -07:00
757e9d79a1 keep going with spider idea 2018-08-12 20:24:29 -07:00
c47682adb4 fix typo with groupsio key 2018-08-12 20:13:45 -07:00
f2662c3849 adding calls to index groupsio emails
this is currently work in progress.
we have a debug statement in place as a bookmark.

we are currently:
- creating a login session
- getting all the subgroups
- going to first subgroup
- getting list of titles and links
- getting emails for each title and link

still need to:
- figure out how to assemble email {}
- assemble content/etc and how to parse text of emails
2018-08-12 18:00:33 -07:00
2478a3f857 Merge branch 'dcppc' of github.com:dcppc/centillion into dcppc
* 'dcppc' of github.com:dcppc/centillion:
  fix how search results are bundled
  fix search template
2018-08-10 06:05:44 -07:00
f174080dfd catch exception when file info not found 2018-08-10 06:05:33 -07:00
ca8b12db06 Merge pull request #2 from charlesreid1/dcppc-merge-master
Merge dcppc changes into master
2018-08-10 05:49:29 -07:00
a1ffdad292 Merge branch 'master' into dcppc-merge-master 2018-08-10 05:49:19 -07:00
ce76396096 update config_flask.example.py to strip dc info 2018-08-10 05:46:07 -07:00
175ff4f71d Merge pull request #17 from dcppc/github-files
fix search template
2018-08-09 18:57:30 -07:00
94f956e2d0 fix how search results are bundled 2018-08-09 18:56:56 -07:00
dc015671fc fix search template 2018-08-09 18:55:49 -07:00
1e9eec81d7 make it valid json 2018-08-09 18:15:14 -07:00
31e12476af Merge pull request #16 from dcppc/inception
add inception
2018-08-09 18:08:11 -07:00
bbe4e32f63 Merge pull request #15 from dcppc/github-files
index all github filenames, not just markdown
2018-08-09 18:07:56 -07:00
5013741958 while we're at it 2018-08-09 17:40:56 -07:00
1ce80a5da0 closes #11 2018-08-09 17:38:20 -07:00
3ed967bd8b remove unused function 2018-08-09 17:28:22 -07:00
1eaaa32007 index all github filenames, not just markdown 2018-08-09 17:25:09 -07:00
9c7e696b6a Merge branch 'master' of ssh://git.charlesreid1.com:222/charlesreid1/centillion
* 'master' of ssh://git.charlesreid1.com:222/charlesreid1/centillion:
  Move images, resize images, update image markdown in readme
  update readme to use <img> tags
  merge image files in from master
  fix <title>
  fix the readme to reflect current state of things/links/descriptions
  fix typos/wording in readme
  adding changes to enable https, update callback to http, and everything still passes through https (proxy)
  update footer repo info
  update screen shots
  add mkdocs-material-dib submodule
  remove mkdocs material submodule
  update tagline
  update tagline
  add _example_ config file for flask
2018-08-09 16:39:18 -07:00
262a0c19e7 Merge pull request #14 from dcppc/local-fixes
Fix centillion to work for local instances
2018-08-09 16:37:37 -07:00
bd2714cc0b Merge branch 'dcppc' into local-fixes 2018-08-09 16:36:34 -07:00
899d6fed53 comment out localhost only env var 2018-08-09 16:25:37 -07:00
a7756049e5 revert changes 2018-08-09 16:23:42 -07:00
3df427a8f8 fix how existing issues in search index are collected. closes #10 2018-08-09 16:17:17 -07:00
0dd06748de fix centillion to work for local instance 2018-08-09 16:16:30 -07:00
8 changed files with 559 additions and 125 deletions

View File

@@ -1,4 +1,4 @@
# The Centillion
# Centillion
**centillion**: a pan-github-markdown-issues-google-docs search engine.

View File

@@ -27,10 +27,16 @@ You provide:
class UpdateIndexTask(object):
def __init__(self, gh_access_token, diff_index=False):
def __init__(self, app_config, diff_index=False):
self.diff_index = diff_index
thread = threading.Thread(target=self.run, args=())
self.gh_access_token = gh_access_token
self.gh_token = app_config['GITHUB_TOKEN']
self.groupsio_credentials = {
'groupsio_token' : app_config['GROUPSIO_TOKEN'],
'groupsio_username' : app_config['GROUPSIO_USERNAME'],
'groupsio_password' : app_config['GROUPSIO_PASSWORD']
}
thread.daemon = True
thread.start()
@@ -43,9 +49,10 @@ class UpdateIndexTask(object):
from get_centillion_config import get_centillion_config
config = get_centillion_config('config_centillion.json')
search.update_index_issues(self.gh_access_token,config)
search.update_index_markdown(self.gh_access_token,config)
search.update_index_gdocs(config)
search.update_index_groupsioemails(self.groupsio_credentials,config)
###search.update_index_ghfiles(self.gh_token,config)
###search.update_index_issues(self.gh_token,config)
###search.update_index_gdocs(config)
@@ -172,12 +179,9 @@ def update_index():
mresp = github.get('/teams/%s/members/%s'%(copper_team_id,username))
if mresp.status_code==204:
#gh_oauth_token = github.token['access_token']
gh_access_token = app.config['GITHUB_TOKEN']
# --------------------
# Business as usual
UpdateIndexTask(gh_access_token,
UpdateIndexTask(app.config,
diff_index=False)
flash("Rebuilding index, check console output")
return render_template("controlpanel.html",
@@ -218,6 +222,7 @@ def oops(e):
return contents404
if __name__ == '__main__':
# if running local instance, set to true
os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = 'true'
app.run(host="0.0.0.0",port=5000)
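A minimal sketch of what the refactor in the hunks above changes for callers: UpdateIndexTask now takes the whole Flask config instead of a bare Github token, and pulls the Github and Groups.io credentials out of it itself (key names are taken from the diff; the dict below is a hypothetical stand-in for app.config and the values are placeholders):

# Hypothetical stand-in for app.config; values are placeholders.
app_config = {
    'GITHUB_TOKEN'      : 'XXX',
    'GROUPSIO_TOKEN'    : 'XXX',
    'GROUPSIO_USERNAME' : 'user@example.com',
    'GROUPSIO_PASSWORD' : 'XXX',
}

# Old call: UpdateIndexTask(gh_access_token, diff_index=False)
# New call: hand over the config and let the task unpack it.
UpdateIndexTask(app_config, diff_index=False)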

View File

@@ -5,6 +5,7 @@ from github import Github, GithubException
import base64
from gdrive_util import GDrive
from groupsio_util import GroupsIOArchivesCrawler
from apiclient.http import MediaIoBaseDownload
import mistune
@@ -128,7 +129,6 @@ class Search:
schema = Schema(
id = ID(stored=True, unique=True),
kind = ID(stored=True),
#fingerprint = ID(stored=True),
created_time = ID(stored=True),
modified_time = ID(stored=True),
@@ -266,7 +266,6 @@ class Search:
# If export was successful, read contents of markdown
# into the content variable.
# into the content variable.
if os.path.isfile(fullpath_output):
# Export was successful
with codecs.open(fullpath_output, encoding='utf-8') as f:
@@ -276,12 +275,14 @@ class Search:
# No matter what happens, clean up.
print(" > Cleaning up \"%s\""%item['name'])
subprocess.call(['rm','-fr',fullpath_output])
## test
#print(" ".join(['rm','-fr',fullpath_output]))
subprocess.call(['rm','-fr',fullpath_input])
#print(" ".join(['rm','-fr',fullpath_input]))
# do it
subprocess.call(['rm','-fr',fullpath_output])
subprocess.call(['rm','-fr',fullpath_input])
if update:
print(" > Removing old record")
writer.delete_by_term('id',item['id'])
@@ -315,7 +316,7 @@ class Search:
# to a search index.
def add_issue(self, writer, issue, gh_access_token, config, update=True):
def add_issue(self, writer, issue, gh_token, config, update=True):
"""
Add a Github issue/comment to a search index.
"""
@@ -367,71 +368,101 @@ class Search:
def add_markdown(self, writer, d, gh_access_token, config, update=True):
def add_ghfile(self, writer, d, gh_token, config, update=True):
"""
Use a Github markdown document API record
to add a markdown document's contents to
the search index.
Use a Github file API record to add a filename
to the search index.
"""
MARKDOWN_EXTS = ['.md','.markdown']
repo = d['repo']
org = d['org']
repo_name = org + "/" + repo
repo_url = "https://github.com/" + repo_name
fpath = d['path']
furl = d['url']
fsha = d['sha']
_, fname = os.path.split(fpath)
_, fext = os.path.splitext(fpath)
print("Indexing markdown doc %s from repo %s"%(fname,repo_name))
# Unpack the requests response and decode the content
#
# don't forget the headers for private repos!
# useful: https://bit.ly/2LSAflS
headers = {'Authorization' : 'token %s'%(gh_access_token)}
response = requests.get(furl, headers=headers)
if response.status_code==200:
jresponse = response.json()
content = ""
try:
binary_content = re.sub('\n','',jresponse['content'])
content = base64.b64decode(binary_content).decode('utf-8')
except KeyError:
print(" > XXXXXXXX Failed to extract 'content' field. You probably hit the rate limit.")
else:
print(" > XXXXXXXX Failed to reach file URL. There may be a problem with authentication/headers.")
try:
fpath = d['path']
furl = d['url']
fsha = d['sha']
_, fname = os.path.split(fpath)
_, fext = os.path.splitext(fpath)
except:
print(" > XXXXXXXX Failed to find file info.")
return
# Now create the actual search index record
indexed_time = clean_timestamp(datetime.now())
usable_url = "https://github.com/%s/blob/master/%s"%(repo_name, fpath)
if fext in MARKDOWN_EXTS:
print("Indexing markdown doc %s from repo %s"%(fname,repo_name))
# Add one document per issue thread,
# containing entire text of thread.
writer.add_document(
id = fsha,
kind = 'markdown',
created_time = '',
modified_time = '',
indexed_time = indexed_time,
title = fname,
url = usable_url,
mimetype='',
owner_email='',
owner_name='',
repo_name = repo_name,
repo_url = repo_url,
github_user = '',
issue_title = '',
issue_url = '',
content = content
)
# Unpack the requests response and decode the content
#
# don't forget the headers for private repos!
# useful: https://bit.ly/2LSAflS
headers = {'Authorization' : 'token %s'%(gh_token)}
response = requests.get(furl, headers=headers)
if response.status_code==200:
jresponse = response.json()
content = ""
try:
binary_content = re.sub('\n','',jresponse['content'])
content = base64.b64decode(binary_content).decode('utf-8')
except KeyError:
print(" > XXXXXXXX Failed to extract 'content' field. You probably hit the rate limit.")
else:
print(" > XXXXXXXX Failed to reach file URL. There may be a problem with authentication/headers.")
return
usable_url = "https://github.com/%s/blob/master/%s"%(repo_name, fpath)
# Now create the actual search index record
writer.add_document(
id = fsha,
kind = 'markdown',
created_time = '',
modified_time = '',
indexed_time = indexed_time,
title = fname,
url = usable_url,
mimetype='',
owner_email='',
owner_name='',
repo_name = repo_name,
repo_url = repo_url,
github_user = '',
issue_title = '',
issue_url = '',
content = content
)
else:
print("Indexing github file %s from repo %s"%(fname,repo_name))
key = fname+"_"+fsha
# Now create the actual search index record
writer.add_document(
id = key,
kind = 'ghfile',
created_time = '',
modified_time = '',
indexed_time = indexed_time,
title = fname,
url = repo_url,
mimetype='',
owner_email='',
owner_name='',
repo_name = repo_name,
repo_url = repo_url,
github_user = '',
issue_title = '',
issue_url = '',
content = ''
)
@@ -559,7 +590,7 @@ class Search:
# ------------------------------
# Github Issues/Comments
def update_index_issues(self, gh_access_token, config):
def update_index_issues(self, gh_token, config):
"""
Update the search index using a collection of
Github repo issues and comments.
@@ -574,7 +605,7 @@ class Search:
# ------
indexed_issues = set()
p = QueryParser("kind", schema=self.ix.schema)
q = p.parse("gdoc")
q = p.parse("issue")
with self.ix.searcher() as s:
results = s.search(q,limit=None)
for result in results:
@@ -584,7 +615,7 @@ class Search:
# Get the set of remote ids:
# ------
# Start with api object
g = Github(gh_access_token)
g = Github(gh_token)
# Now index all issue threads in the user-specified repos
@@ -638,7 +669,7 @@ class Search:
# cop out
writer.delete_by_term('id',update_issue)
item = full_items[update_issue]
self.add_issue(writer, item, gh_access_token, config, update=True)
self.add_issue(writer, item, gh_token, config, update=True)
count += 1
@@ -647,7 +678,7 @@ class Search:
add_issues = remote_issues - indexed_issues
for add_issue in add_issues:
item = full_items[add_issue]
self.add_issue(writer, item, gh_access_token, config, update=False)
self.add_issue(writer, item, gh_token, config, update=False)
count += 1
@@ -657,15 +688,14 @@ class Search:
# ------------------------------
# Github Markdown Files
# Github Files
def update_index_markdown(self, gh_access_token, config):
def update_index_ghfiles(self, gh_token, config):
"""
Update the search index using a collection of
Markdown files from a Github repo.
files (and, separately, Markdown files) from
a Github repo.
"""
EXT = '.md'
# Updated algorithm:
# - get set of indexed ids
# - get set of remote ids
@@ -676,6 +706,12 @@ class Search:
# ------
indexed_ids = set()
p = QueryParser("kind", schema=self.ix.schema)
q = p.parse("ghfiles")
with self.ix.searcher() as s:
results = s.search(q,limit=None)
for result in results:
indexed_ids.add(result['id'])
q = p.parse("markdown")
with self.ix.searcher() as s:
results = s.search(q,limit=None)
@@ -685,10 +721,9 @@ class Search:
# Get the set of remote ids:
# ------
# Start with api object
g = Github(gh_access_token)
g = Github(gh_token)
# Now index all markdown files
# in the user-specified repos
# Now index all the files.
# Start by collecting all the things
remote_ids = set()
@@ -711,9 +746,6 @@ class Search:
continue
# ---------
# begin markdown-specific code
# Get head commit
commits = repo.get_commits()
try:
@@ -726,31 +758,29 @@ class Search:
# Get all the docs
tree = repo.get_git_tree(sha=sha, recursive=True)
docs = tree.raw_data['tree']
print("Parsing doc ids from repository %s"%(r))
print("Parsing file ids from repository %s"%(r))
for d in docs:
# For each doc, get the file extension
# If it matches EXT, download the file
# and decide what to do with it.
fpath = d['path']
_, fname = os.path.split(fpath)
_, fext = os.path.splitext(fpath)
if fext==EXT:
key = d['sha']
key = d['sha']
d['org'] = this_org
d['repo'] = this_repo
value = d
d['org'] = this_org
d['repo'] = this_repo
value = d
# Stash the doc for later
remote_ids.add(key)
full_items[key] = value
remote_ids.add(key)
full_items[key] = value
writer = self.ix.writer()
count = 0
# Drop any id in indexed_ids
# not in remote_ids
drop_ids = indexed_ids - remote_ids
@@ -765,7 +795,7 @@ class Search:
# cop out: just delete and re-add
writer.delete_by_term('id',update_id)
item = full_items[update_id]
self.add_markdown(writer, item, gh_access_token, config, update=True)
self.add_ghfile(writer, item, gh_token, config, update=True)
count += 1
@@ -774,12 +804,12 @@ class Search:
add_ids = remote_ids - indexed_ids
for add_id in add_ids:
item = full_items[add_id]
self.add_markdown(writer, item, gh_access_token, config, update=False)
self.add_ghfile(writer, item, gh_token, config, update=False)
count += 1
writer.commit()
print("Done, updated %d markdown documents in the index" % count)
print("Done, updated %d Github files in the index" % count)
@@ -787,10 +817,27 @@ class Search:
# Groups.io Emails
#def update_index_markdown(self, gh_access_token, config):
def update_index_groupsioemails(self, groupsio_token, config):
"""
Update the search index using the email archives
of groups.io groups.
This requires the use of a spider.
RELEASE THE SPIDER!!!
"""
spider = GroupsIOArchivesCrawler(groupsio_token,'dcppc')
# - ask spider to crawl the archives
spider.crawl_group_archives()
# - ask spider for list of all email records
# - 1 email = 1 dictionary
# - email records compiled by the spider
archives = spider.get_archives()
# - email object is sent off to add email method
print("Finished indexing groups.io emails")
# ---------------------------------
@@ -900,31 +947,27 @@ class Search:
def get_document_total_count(self):
p = QueryParser("kind", schema=self.ix.schema)
kind_labels = {
"documents" : "gdoc",
"markdown" : "markdown",
"issues" : "issue",
}
counts = {
"documents" : None,
"gdoc" : None,
"issue" : None,
"ghfile" : None,
"markdown" : None,
"issues" : None,
"total" : None
}
for key in kind_labels:
kind = kind_labels[key]
q = p.parse(kind)
for key in counts.keys():
q = p.parse(key)
with self.ix.searcher() as s:
results = s.search(q,limit=None)
counts[key] = len(results)
## These two should NOT be different, but they are...
#counts['total'] = self.ix.searcher().doc_count_all()
counts['total'] = counts['documents'] + counts['markdown'] + counts['issues']
counts['total'] = sum(counts[k] for k in counts.keys())
return counts
if __name__ == "__main__":
raise Exception("Error: main method not implemented (fix groupsio credentials first)")
search = Search("search_index")
from get_centillion_config import get_centillion_config
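A minimal sketch of how the renamed and added indexing methods in this file fit together, following the intent of the __main__ block above (which currently raises until Groups.io credentials are wired in) and the commented-out calls in UpdateIndexTask; gh_token and groupsio_credentials are assumed to come from the Flask config as shown earlier:

search = Search("search_index")
config = get_centillion_config('config_centillion.json')

# Github issues and repository files (update_index_markdown is now update_index_ghfiles)
search.update_index_issues(gh_token, config)
search.update_index_ghfiles(gh_token, config)

# Google Drive documents
search.update_index_gdocs(config)

# Groups.io email archives, via the new spider in groupsio_util.py
search.update_index_groupsioemails(groupsio_credentials, config)

# Per-kind document counts (gdoc, issue, ghfile, markdown, total)
print(search.get_document_total_count())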

View File

@@ -6,7 +6,6 @@
"dcppc/organize",
"dcppc/dcppc-bot",
"dcppc/full-stacks",
"dcppc/markdown-issues",
"dcppc/design-guidelines-discuss",
"dcppc/dcppc-deliverables",
"dcppc/dcppc-milestones",
@@ -22,6 +21,7 @@
"dcppc/2018-august-workshop",
"dcppc/2018-september-workshop",
"dcppc/design-guidelines",
"dcppc/2018-may-workshop"
"dcppc/2018-may-workshop",
"dcppc/centillion"
]
}

View File

@@ -7,14 +7,14 @@ GITHUB_OAUTH_CLIENT_SECRET = "YYY"
GITHUB_TOKEN = "ZZZ"
# More information footer: Repository label
FOOTER_REPO_ORG = "dcppc"
FOOTER_REPO_ORG = "charlesreid1"
FOOTER_REPO_NAME = "centillion"
# Toggle to show Whoosh parsed query
SHOW_PARSED_QUERY=True
TAGLINE = "Search the Data Commons"
TAGLINE = "Search All The Things"
# Flask settings
DEBUG = True
SECRET_KEY = '42c5a8eda356ca9d9c3ab2d149541e6b91d843fa'
SECRET_KEY = 'WWWWW'
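Not shown in this example file, but the refactored UpdateIndexTask above also reads Groups.io credentials from the Flask config, so a working config_flask.py presumably needs entries along these lines (key names taken from the earlier hunk; values are placeholders):

# Groups.io API token and login credentials (placeholders)
GROUPSIO_TOKEN = "XXX"
GROUPSIO_USERNAME = "user@example.com"
GROUPSIO_PASSWORD = "XXX"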

groupsio_util.py (new file, 382 lines)
View File

@@ -0,0 +1,382 @@
import requests, os, re
from bs4 import BeautifulSoup
class GroupsIOArchivesCrawler(object):
"""
This is a Groups.io spider
designed to crawl the email
archives of a group.
credentials (dictionary):
groupsio_token : api access token
groupsio_username : username
groupsio_password : password
"""
def __init__(self,
credentials,
group_name):
# template url for archives page (list of topics)
self.url = "https://{group}.groups.io/g/{subgroup}/topics"
self.login_url = "https://groups.io/login"
self.credentials = credentials
self.group_name = group_name
self.crawled_archives = False
self.archives = None
def get_archives(self):
"""
Return a list of dictionaries containing
information about each email topic in the
groups.io email archive.
Call crawl_group_archives() first!
"""
return self.archives
def get_subgroups_list(self):
"""
Use the API to get a list of subgroups.
"""
subgroups_url = 'https://api.groups.io/v1/getsubgroups'
key = self.credentials['groupsio_token']
data = [('group_name', self.group_name),
('limit',100)
]
response = requests.post(subgroups_url,
data=data,
auth=(key,''))
response = response.json()
data = response['data']
subgroups = {}
for group in data:
k = group['id']
v = re.sub(r'dcppc\+','',group['name'])
subgroups[k] = v
return subgroups
def crawl_group_archives(self):
"""
Spider will crawl the email archives of the entire group
by crawling the email archives of each subgroup.
"""
subgroups = self.get_subgroups_list()
# ------------------------------
# Start by logging in.
# Create session object to persist session data
session = requests.Session()
# Log in to the website
data = dict(email = self.credentials['groupsio_username'],
password = self.credentials['groupsio_password'],
timezone = 'America/Los_Angeles')
r = session.post(self.login_url,
data = data)
csrf = self.get_csrf(r)
# ------------------------------
# For each subgroup, crawl the archives
# and return a list of dictionaries
# containing all the email threads.
for subgroup_id in subgroups.keys():
self.crawl_subgroup_archives(session,
csrf,
subgroup_id,
subgroups[subgroup_id])
# Done. archives are now tucked away
# in the variable self.archives
#
# self.archives is a list of dictionaries,
# with each dictionary containing info about
# a topic/email thread in a subgroup.
# ------------------------------
def crawl_subgroup_archives(self, session, csrf, subgroup_id, subgroup_name):
"""
This kicks off the process to crawl the entire
archives of a given subgroup on groups.io.
For a given subgroup the url is self.url,
https://{group}.groups.io/g/{subgroup}/topics
This is the first of a paginated list of topics.
Procedure is:
- passed a starting page (or its contents)
- iterate through all topics via the HTML page elements
- assemble a bundle of information about each topic:
- topic title, by, URL, date, content, permalink
- content filtering:
- ^From, Reply-To, Date, To, Subject
- Lines containing phone numbers
- 9 digits
- XXX-XXX-XXXX, (XXX) XXX-XXXX
- XXXXXXXXXX, XXX XXX XXXX
- ^Work: or (Work) or Work$
- Home, Cell, Mobile
- +1 XXX
- \w@\w
- while next button is not greyed out,
- click the next button
everything stored in self.archives:
list of dictionaries.
"""
self.archives = []
prefix = "https://{group}.groups.io".format(group=self.group_name)
url = self.url.format(group=self.group_name,
subgroup=subgroup_name)
# ------------------------------
# Now get the first page
r = session.get(url)
# ------------------------------
# Fencepost algorithm:
# First page:
# Extract a list of (title, link) items
items = self.extract_archive_page_items_(r)
# Get the next link
next_url = self.get_next_url_(r)
# Now add each item to the archive of threads,
# then find the next button.
self.add_items_to_archives_(session,subgroup_name,items)
if next_url is None:
return
else:
full_next_url = prefix + next_url
# Now click the next button
next_request = requests.get(full_next_url)
while next_request.status_code==200:
items = self.extract_archive_page_items_(next_request)
next_url = self.get_next_url_(next_request)
self.add_items_to_archives_(session,subgroup_name,items)
if next_url is None:
return
else:
full_next_url = prefix + next_url
next_request = requests.get(full_next_url)
def add_items_to_archives_(self,session,subgroup_name,items):
"""
Given a set of items from a list of threads,
items being title and link,
get the page and store all info
in self.archives variable
(list of dictionaries)
"""
for (title, link) in items:
# Get the thread page:
prefix = "https://{group}.groups.io".format(group=self.group_name)
full_link = prefix + link
r = session.get(full_link)
soup = BeautifulSoup(r.text,'html.parser')
# soup contains the entire thread
# What are we extracting:
# 1. thread number
# 2. permalink
# 3. content/text (filtered)
# - - - - - - - - - - - - - -
# 1. topic/thread number:
# <a rel="nofollow" href="">
# where link is:
# https://{group}.groups.io/g/{subgroup}/topic/{topic_id}
# example topic id: 24209140
#
# ugly links are in the form
# https://dcppc.groups.io/g/{subgroup}/topic/some_text_here/{thread_id}?p=,,,,,1,2,3,,,4,,5
# split at ?, 0th portion
# then split at /, last (-1th) portion
topic_id = link.split('?')[0].split('/')[-1]
# - - - - - - - - - - - - - - -
# 2. permalink:
# - current link is ugly link
# - permalink is the nice one
# - topic id is available from the ugly link
# https://{group}.groups.io/g/{subgroup}/topic/{topic_id}
permalink_template = "https://{group}.groups.io/g/{subgroup}/topic/{topic_id}"
permalink = permalink_template.format(
group = self.group_name,
subgroup = subgroup_name,
topic_id = topic_id
)
# - - - - - - - - - - - - - - -
# 3. content:
# Need to rearrange how we're assembling threads here.
# This is one thread, no?
content = []
subject = soup.find('title').text
# Extract information for the schema:
# - permalink for thread (done)
# - subject/title (done)
# - original sender email/name (done)
# - content (done)
# Groups.io pages have zero CSS classes, which makes everything
# a giant pain in the neck to interact with. Thanks Groups.io!
original_sender = ''
for i, tr in enumerate(soup.find_all('tr',{'class':'test'})):
# Every other tr row contains an email.
if (i+1)%2==0:
# nope, no email here
pass
else:
# found an email!
# this is a maze, thanks groups.io
td = tr.find('td')
divrow = td.find('div',{'class':'row'}).find('div',{'class':'pull-left'})
if (i+1)==1:
original_sender = divrow.text.strip()
for div in td.find_all('div'):
if div.has_attr('id'):
# purge any signatures
for x in div.find_all('div',{'id':'Signature'}):
x.extract()
# purge any headers
for x in div.find_all('div'):
nonos = ['From:','Sent:','To:','Cc:','CC:','Subject:']
for nono in nonos:
if nono in x.text:
x.extract()
message_text = div.get_text()
# More filtering:
# phone numbers
message_text = re.sub(r'[0-9]{3}-[0-9]{3}-[0-9]{4}','XXX-XXX-XXXX',message_text)
message_text = re.sub(r'[0-9]\{10\}','XXXXXXXXXX',message_text)
content.append(message_text)
full_content = "\n".join(content)
thread = {
'permalink' : permalink,
'subject' : subject,
'original_sender' : original_sender,
'content' : full_content
}
print('*'*40)
for k in thread.keys():
if k=='content':
pass
else:
print("%s : %s"%(k,thread[k]))
print('*'*40)
self.archives.append(thread)
def extract_archive_page_items_(self, response):
"""
(Private method)
Given a response from a GET request,
use beautifulsoup to extract all items
(thread titles and ugly thread links)
and pass them back in a list.
"""
soup = BeautifulSoup(response.content,"html.parser")
rows = soup.find_all('tr',{'class':'test'})
if 'rate limited' in soup.text:
raise Exception("Error: rate limit in place for Groups.io")
results = []
for row in rows:
# We don't care about anything except title and ugly link
subject = row.find('span',{'class':'subject'})
title = subject.get_text()
link = row.find('a')['href']
print(title)
results.append((title,link))
return results
def get_next_url_(self, response):
"""
(Private method)
Given a response (which is a list of threads),
find the next button and return the URL.
If there is no next URL, or it is disabled, return None.
"""
soup = BeautifulSoup(response.text,'html.parser')
chevron = soup.find('i',{'class':'fa-chevron-right'})
try:
if '#' in chevron.parent['href']:
# empty link, abort
return None
except AttributeError:
# I don't even know
return None
if chevron.parent.parent.has_attr('class') and 'disabled' in chevron.parent.parent['class']:
# no next link, abort
return None
return chevron.parent['href']
def get_csrf(self,resp):
"""
Find the CSRF token embedded in the subgroup page
"""
soup = BeautifulSoup(resp.text,'html.parser')
csrf = ''
for i in soup.find_all('input'):
# Note that i.name is different from i['name']
# the first is the actual tag,
# the second is the attribute name="xyz"
if i['name']=='csrf':
csrf = i['value']
if csrf=='':
err = "ERROR: Could not find csrf token on page."
raise Exception(err)
return csrf
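A minimal usage sketch of the new crawler class defined above; the credentials dictionary mirrors the one assembled in UpdateIndexTask, and the values are placeholders:

credentials = {
    'groupsio_token'    : 'XXX',
    'groupsio_username' : 'user@example.com',
    'groupsio_password' : 'XXX',
}
spider = GroupsIOArchivesCrawler(credentials, 'dcppc')

# Crawl every subgroup's archive, then collect the results:
# one dictionary per topic with permalink, subject, original_sender, content.
spider.crawl_group_archives()
for thread in spider.get_archives():
    print(thread['subject'], thread['permalink'])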

View File

@@ -10,3 +10,4 @@ pypandoc>=1.4
requests>=2.19
pandoc>=1.0
flask-dance>=1.0.0
beautifulsoup4>=4.6

View File

@@ -86,9 +86,11 @@
<div class="container-fluid">
<div class="row">
<div class="col-xs-12 info">
<b>Indexing:</b> <span class="badge">{{totals["documents"]}}</span> Google Documents,
<span class="badge">{{totals["issues"]}}</span> Github issues,
<span class="badge">{{totals["markdown"]}}</span> markdown files.
<b>Indexing:</b> <span
class="badge">{{totals["gdoc"]}}</span> Google Documents,
<span class="badge">{{totals["issue"]}}</span> Github issues,
<span class="badge">{{totals["ghfile"]}}</span> Github files,
<span class="badge">{{totals["markdown"]}}</span> Github markdown files.
</div>
</div>
</div>
@@ -107,14 +109,15 @@
<div class="url">
{% if e.kind=="gdoc" %}
{% if e.mimetype=="document" %}
{% if e.mimetype=="" %}
<b>Google Document:</b>
<a href='{{e.url}}'>{{e.title}}</a>
(Type: {{e.mimetype}}, Owner: {{e.owner_name}}, {{e.owner_email}})
(Owner: {{e.owner_name}}, {{e.owner_email}})<br />
<b>Document Type</b>: {{e.mimetype}}
{% else %}
<b>Google Drive:</b>
<a href='{{e.url}}'>{{e.title}}</a>
(Type: {{e.mimetype}}, Owner: {{e.owner_name}}, {{e.owner_email}})
(Owner: {{e.owner_name}}, {{e.owner_email}})
{% endif %}
{% elif e.kind=="issue" %}