Compare commits
32 Commits
Commit SHA1s (author and date columns not preserved in this export):

- de796880c5
- f79f711a38
- 00b862b83e
- a06c3b645a
- 878ff011fb
- 33cf78a524
- c1bcd8dc22
- 757e9d79a1
- c47682adb4
- f2662c3849
- 2478a3f857
- f174080dfd
- ca8b12db06
- a1ffdad292
- ce76396096
- 175ff4f71d
- 94f956e2d0
- dc015671fc
- 1e9eec81d7
- 31e12476af
- bbe4e32f63
- 5013741958
- 1ce80a5da0
- 3ed967bd8b
- 1eaaa32007
- 9c7e696b6a
- 262a0c19e7
- bd2714cc0b
- 899d6fed53
- a7756049e5
- 3df427a8f8
- 0dd06748de
@@ -1,4 +1,4 @@
-# The Centillion
+# Centillion
 
 **centillion**: a pan-github-markdown-issues-google-docs search engine.
 
@@ -27,10 +27,16 @@ You provide:
 
 
 class UpdateIndexTask(object):
-    def __init__(self, gh_access_token, diff_index=False):
+    def __init__(self, app_config, diff_index=False):
         self.diff_index = diff_index
         thread = threading.Thread(target=self.run, args=())
-        self.gh_access_token = gh_access_token
+        self.gh_token = app_config['GITHUB_TOKEN']
+        self.groupsio_credentials = {
+                'groupsio_token' : app_config['GROUPSIO_TOKEN'],
+                'groupsio_username' : app_config['GROUPSIO_USERNAME'],
+                'groupsio_password' : app_config['GROUPSIO_PASSWORD']
+        }
         thread.daemon = True
         thread.start()
 
@@ -43,9 +49,10 @@ class UpdateIndexTask(object):
         from get_centillion_config import get_centillion_config
         config = get_centillion_config('config_centillion.json')
 
-        search.update_index_issues(self.gh_access_token,config)
-        search.update_index_markdown(self.gh_access_token,config)
-        search.update_index_gdocs(config)
+        search.update_index_groupsioemails(self.groupsio_credentials,config)
+        ###search.update_index_ghfiles(self.gh_token,config)
+        ###search.update_index_issues(self.gh_token,config)
+        ###search.update_index_gdocs(config)
 
 
 
@@ -172,12 +179,9 @@ def update_index():
         mresp = github.get('/teams/%s/members/%s'%(copper_team_id,username))
         if mresp.status_code==204:
 
-            #gh_oauth_token = github.token['access_token']
-            gh_access_token = app.config['GITHUB_TOKEN']
-
             # --------------------
             # Business as usual
-            UpdateIndexTask(gh_access_token,
+            UpdateIndexTask(app.config,
                             diff_index=False)
             flash("Rebuilding index, check console output")
             return render_template("controlpanel.html",
@@ -218,6 +222,7 @@ def oops(e):
     return contents404
 
 if __name__ == '__main__':
+    # if running local instance, set to true
     os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = 'true'
     app.run(host="0.0.0.0",port=5000)
 
@@ -5,6 +5,7 @@ from github import Github, GithubException
 import base64
 
 from gdrive_util import GDrive
+from groupsio_util import GroupsIOArchivesCrawler
 from apiclient.http import MediaIoBaseDownload
 
 import mistune
@@ -128,7 +129,6 @@ class Search:
         schema = Schema(
                 id = ID(stored=True, unique=True),
                 kind = ID(stored=True),
-                #fingerprint = ID(stored=True),
 
                 created_time = ID(stored=True),
                 modified_time = ID(stored=True),
@@ -266,7 +266,6 @@ class Search:
 
             # If export was successful, read contents of markdown
             # into the content variable.
-            # into the content variable.
             if os.path.isfile(fullpath_output):
                 # Export was successful
                 with codecs.open(fullpath_output, encoding='utf-8') as f:
@@ -276,12 +275,14 @@ class Search:
             # No matter what happens, clean up.
             print(" > Cleaning up \"%s\""%item['name'])
 
-            subprocess.call(['rm','-fr',fullpath_output])
+            ## test
             #print(" ".join(['rm','-fr',fullpath_output]))
 
-            subprocess.call(['rm','-fr',fullpath_input])
             #print(" ".join(['rm','-fr',fullpath_input]))
 
+            # do it
+            subprocess.call(['rm','-fr',fullpath_output])
+            subprocess.call(['rm','-fr',fullpath_input])
 
             if update:
                 print(" > Removing old record")
                 writer.delete_by_term('id',item['id'])
@@ -315,7 +316,7 @@ class Search:
     # to a search index.
 
 
-    def add_issue(self, writer, issue, gh_access_token, config, update=True):
+    def add_issue(self, writer, issue, gh_token, config, update=True):
         """
         Add a Github issue/comment to a search index.
         """
@@ -367,71 +368,101 @@ class Search:
 
 
 
-    def add_markdown(self, writer, d, gh_access_token, config, update=True):
+    def add_ghfile(self, writer, d, gh_token, config, update=True):
         """
-        Use a Github markdown document API record
-        to add a markdown document's contents to
-        the search index.
+        Use a Github file API record to add a filename
+        to the search index.
         """
+        MARKDOWN_EXTS = ['.md','.markdown']
 
         repo = d['repo']
         org = d['org']
         repo_name = org + "/" + repo
         repo_url = "https://github.com/" + repo_name
 
-        fpath = d['path']
-        furl = d['url']
-        fsha = d['sha']
-        _, fname = os.path.split(fpath)
-        _, fext = os.path.splitext(fpath)
-
-        print("Indexing markdown doc %s from repo %s"%(fname,repo_name))
-
-        # Unpack the requests response and decode the content
-        #
-        # don't forget the headers for private repos!
-        # useful: https://bit.ly/2LSAflS
-
-        headers = {'Authorization' : 'token %s'%(gh_access_token)}
-
-        response = requests.get(furl, headers=headers)
-        if response.status_code==200:
-            jresponse = response.json()
-            content = ""
-            try:
-                binary_content = re.sub('\n','',jresponse['content'])
-                content = base64.b64decode(binary_content).decode('utf-8')
-            except KeyError:
-                print(" > XXXXXXXX Failed to extract 'content' field. You probably hit the rate limit.")
-
-        else:
-            print(" > XXXXXXXX Failed to reach file URL. There may be a problem with authentication/headers.")
-            return
-
-        # Now create the actual search index record
+        try:
+            fpath = d['path']
+            furl = d['url']
+            fsha = d['sha']
+            _, fname = os.path.split(fpath)
+            _, fext = os.path.splitext(fpath)
+        except:
+            print(" > XXXXXXXX Failed to find file info.")
+            return
+
         indexed_time = clean_timestamp(datetime.now())
 
-        usable_url = "https://github.com/%s/blob/master/%s"%(repo_name, fpath)
-
-        # Add one document per issue thread,
-        # containing entire text of thread.
-        writer.add_document(
-                id = fsha,
-                kind = 'markdown',
-                created_time = '',
-                modified_time = '',
-                indexed_time = indexed_time,
-                title = fname,
-                url = usable_url,
-                mimetype='',
-                owner_email='',
-                owner_name='',
-                repo_name = repo_name,
-                repo_url = repo_url,
-                github_user = '',
-                issue_title = '',
-                issue_url = '',
-                content = content
-        )
+        if fext in MARKDOWN_EXTS:
+            print("Indexing markdown doc %s from repo %s"%(fname,repo_name))
+
+            # Unpack the requests response and decode the content
+            #
+            # don't forget the headers for private repos!
+            # useful: https://bit.ly/2LSAflS
+
+            headers = {'Authorization' : 'token %s'%(gh_token)}
+
+            response = requests.get(furl, headers=headers)
+            if response.status_code==200:
+                jresponse = response.json()
+                content = ""
+                try:
+                    binary_content = re.sub('\n','',jresponse['content'])
+                    content = base64.b64decode(binary_content).decode('utf-8')
+                except KeyError:
+                    print(" > XXXXXXXX Failed to extract 'content' field. You probably hit the rate limit.")
+
+            else:
+                print(" > XXXXXXXX Failed to reach file URL. There may be a problem with authentication/headers.")
+                return
+
+            usable_url = "https://github.com/%s/blob/master/%s"%(repo_name, fpath)
+
+            # Now create the actual search index record
+            writer.add_document(
+                    id = fsha,
+                    kind = 'markdown',
+                    created_time = '',
+                    modified_time = '',
+                    indexed_time = indexed_time,
+                    title = fname,
+                    url = usable_url,
+                    mimetype='',
+                    owner_email='',
+                    owner_name='',
+                    repo_name = repo_name,
+                    repo_url = repo_url,
+                    github_user = '',
+                    issue_title = '',
+                    issue_url = '',
+                    content = content
+            )
+
+        else:
+            print("Indexing github file %s from repo %s"%(fname,repo_name))
+
+            key = fname+"_"+fsha
+
+            # Now create the actual search index record
+            writer.add_document(
+                    id = key,
+                    kind = 'ghfile',
+                    created_time = '',
+                    modified_time = '',
+                    indexed_time = indexed_time,
+                    title = fname,
+                    url = repo_url,
+                    mimetype='',
+                    owner_email='',
+                    owner_name='',
+                    repo_name = repo_name,
+                    repo_url = repo_url,
+                    github_user = '',
+                    issue_title = '',
+                    issue_url = '',
+                    content = ''
+            )
 
 
 
@@ -559,7 +590,7 @@ class Search:
     # ------------------------------
     # Github Issues/Comments
 
-    def update_index_issues(self, gh_access_token, config):
+    def update_index_issues(self, gh_token, config):
         """
         Update the search index using a collection of
         Github repo issues and comments.
@@ -574,7 +605,7 @@ class Search:
         # ------
         indexed_issues = set()
         p = QueryParser("kind", schema=self.ix.schema)
-        q = p.parse("gdoc")
+        q = p.parse("issue")
         with self.ix.searcher() as s:
             results = s.search(q,limit=None)
             for result in results:
@@ -584,7 +615,7 @@ class Search:
         # Get the set of remote ids:
         # ------
         # Start with api object
-        g = Github(gh_access_token)
+        g = Github(gh_token)
 
         # Now index all issue threads in the user-specified repos
 
@@ -638,7 +669,7 @@ class Search:
             # cop out
             writer.delete_by_term('id',update_issue)
             item = full_items[update_issue]
-            self.add_issue(writer, item, gh_access_token, config, update=True)
+            self.add_issue(writer, item, gh_token, config, update=True)
             count += 1
 
 
@@ -647,7 +678,7 @@ class Search:
         add_issues = remote_issues - indexed_issues
         for add_issue in add_issues:
             item = full_items[add_issue]
-            self.add_issue(writer, item, gh_access_token, config, update=False)
+            self.add_issue(writer, item, gh_token, config, update=False)
             count += 1
 
 
@@ -657,15 +688,14 @@ class Search:
 
 
     # ------------------------------
-    # Github Markdown Files
+    # Github Files
 
-    def update_index_markdown(self, gh_access_token, config):
+    def update_index_ghfiles(self, gh_token, config):
         """
         Update the search index using a collection of
-        Markdown files from a Github repo.
+        files (and, separately, Markdown files) from
+        a Github repo.
         """
-        EXT = '.md'
 
         # Updated algorithm:
         # - get set of indexed ids
         # - get set of remote ids
@@ -676,6 +706,12 @@ class Search:
         # ------
         indexed_ids = set()
         p = QueryParser("kind", schema=self.ix.schema)
+        q = p.parse("ghfiles")
+        with self.ix.searcher() as s:
+            results = s.search(q,limit=None)
+            for result in results:
+                indexed_ids.add(result['id'])
+
         q = p.parse("markdown")
         with self.ix.searcher() as s:
             results = s.search(q,limit=None)
@@ -685,10 +721,9 @@ class Search:
         # Get the set of remote ids:
         # ------
         # Start with api object
-        g = Github(gh_access_token)
+        g = Github(gh_token)
 
-        # Now index all markdown files
-        # in the user-specified repos
+        # Now index all the files.
 
         # Start by collecting all the things
         remote_ids = set()
@@ -711,9 +746,6 @@ class Search:
                 continue
 
 
-            # ---------
-            # begin markdown-specific code
-
             # Get head commit
             commits = repo.get_commits()
             try:
@@ -726,31 +758,29 @@ class Search:
             # Get all the docs
             tree = repo.get_git_tree(sha=sha, recursive=True)
             docs = tree.raw_data['tree']
-            print("Parsing doc ids from repository %s"%(r))
+            print("Parsing file ids from repository %s"%(r))
 
             for d in docs:
 
                 # For each doc, get the file extension
-                # If it matches EXT, download the file
+                # and decide what to do with it.
 
                 fpath = d['path']
                 _, fname = os.path.split(fpath)
                 _, fext = os.path.splitext(fpath)
 
-                if fext==EXT:
-
-                    key = d['sha']
-                    d['org'] = this_org
-                    d['repo'] = this_repo
-                    value = d
-
-                    # Stash the doc for later
-                    remote_ids.add(key)
-                    full_items[key] = value
+                key = d['sha']
+                d['org'] = this_org
+                d['repo'] = this_repo
+                value = d
+
+                remote_ids.add(key)
+                full_items[key] = value
 
         writer = self.ix.writer()
         count = 0
 
 
         # Drop any id in indexed_ids
         # not in remote_ids
         drop_ids = indexed_ids - remote_ids
@@ -765,7 +795,7 @@ class Search:
             # cop out: just delete and re-add
             writer.delete_by_term('id',update_id)
             item = full_items[update_id]
-            self.add_markdown(writer, item, gh_access_token, config, update=True)
+            self.add_ghfile(writer, item, gh_token, config, update=True)
             count += 1
 
 
@@ -774,12 +804,12 @@ class Search:
         add_ids = remote_ids - indexed_ids
         for add_id in add_ids:
             item = full_items[add_id]
-            self.add_markdown(writer, item, gh_access_token, config, update=False)
+            self.add_ghfile(writer, item, gh_token, config, update=False)
             count += 1
 
 
         writer.commit()
-        print("Done, updated %d markdown documents in the index" % count)
+        print("Done, updated %d Github files in the index" % count)
 
 
 
@@ -787,10 +817,27 @@ class Search:
     # Groups.io Emails
 
 
-    #def update_index_markdown(self, gh_access_token, config):
+    def update_index_groupsioemails(self, groupsio_token, config):
+        """
+        Update the search index using the email archives
+        of groups.io groups.
+
+        This requires the use of a spider.
+        RELEASE THE SPIDER!!!
+        """
+        spider = GroupsIOArchivesCrawler(groupsio_token,'dcppc')
+
+        # - ask spider to crawl the archives
+        spider.crawl_group_archives()
+
+        # - ask spider for list of all email records
+        # - 1 email = 1 dictionary
+        # - email records compiled by the spider
+        archives = spider.get_archives()
+
+        # - email object is sent off to add email method
+
+        print("Finished indexing groups.io emails")
 
 
     # ---------------------------------
@@ -900,31 +947,27 @@ class Search:
     def get_document_total_count(self):
         p = QueryParser("kind", schema=self.ix.schema)
 
-        kind_labels = {
-                "documents" : "gdoc",
-                "markdown" : "markdown",
-                "issues" : "issue",
-        }
         counts = {
-                "documents" : None,
+                "gdoc" : None,
+                "issue" : None,
+                "ghfile" : None,
                 "markdown" : None,
-                "issues" : None,
                 "total" : None
         }
-        for key in kind_labels:
-            kind = kind_labels[key]
-            q = p.parse(kind)
+        for key in counts.keys():
+            q = p.parse(key)
             with self.ix.searcher() as s:
                 results = s.search(q,limit=None)
                 counts[key] = len(results)
 
-        ## These two should NOT be different, but they are...
-        #counts['total'] = self.ix.searcher().doc_count_all()
-        counts['total'] = counts['documents'] + counts['markdown'] + counts['issues']
+        counts['total'] = sum(counts[k] for k in counts.keys())
 
         return counts
 
 if __name__ == "__main__":
 
+    raise Exception("Error: main method not implemented (fix groupsio credentials first)")
 
     search = Search("search_index")
 
     from get_centillion_config import get_centillion_config
@@ -6,7 +6,6 @@
         "dcppc/organize",
         "dcppc/dcppc-bot",
         "dcppc/full-stacks",
-        "dcppc/markdown-issues",
         "dcppc/design-guidelines-discuss",
         "dcppc/dcppc-deliverables",
         "dcppc/dcppc-milestones",
@@ -22,6 +21,7 @@
         "dcppc/2018-august-workshop",
         "dcppc/2018-september-workshop",
         "dcppc/design-guidelines",
-        "dcppc/2018-may-workshop"
+        "dcppc/2018-may-workshop",
+        "dcppc/centillion"
     ]
 }
@@ -7,14 +7,14 @@ GITHUB_OAUTH_CLIENT_SECRET = "YYY"
 GITHUB_TOKEN = "ZZZ"
 
 # More information footer: Repository label
-FOOTER_REPO_ORG = "dcppc"
+FOOTER_REPO_ORG = "charlesreid1"
 FOOTER_REPO_NAME = "centillion"
 
 # Toggle to show Whoosh parsed query
 SHOW_PARSED_QUERY=True
 
-TAGLINE = "Search the Data Commons"
+TAGLINE = "Search All The Things"
 
 # Flask settings
 DEBUG = True
-SECRET_KEY = '42c5a8eda356ca9d9c3ab2d149541e6b91d843fa'
+SECRET_KEY = 'WWWWW'
groupsio_util.py (new file, 382 lines)
@@ -0,0 +1,382 @@
+import requests, os, re
+from bs4 import BeautifulSoup
+
+class GroupsIOArchivesCrawler(object):
+    """
+    This is a Groups.io spider
+    designed to crawl the email
+    archives of a group.
+
+    credentials (dictionary):
+        groupsio_token : api access token
+        groupsio_username : username
+        groupsio_password : password
+    """
+    def __init__(self,
+                 credentials,
+                 group_name):
+        # template url for archives page (list of topics)
+        self.url = "https://{group}.groups.io/g/{subgroup}/topics"
+        self.login_url = "https://groups.io/login"
+
+        self.credentials = credentials
+        self.group_name = group_name
+        self.crawled_archives = False
+        self.archives = None
+
+
+    def get_archives(self):
+        """
+        Return a list of dictionaries containing
+        information about each email topic in the
+        groups.io email archive.
+
+        Call crawl_group_archives() first!
+        """
+        return self.archives
+
+
+    def get_subgroups_list(self):
+        """
+        Use the API to get a list of subgroups.
+        """
+        subgroups_url = 'https://api.groups.io/v1/getsubgroups'
+
+        key = self.credentials['groupsio_token']
+
+        data = [('group_name', self.group_name),
+                ('limit',100)
+        ]
+        response = requests.post(subgroups_url,
+                                 data=data,
+                                 auth=(key,''))
+        response = response.json()
+        data = response['data']
+
+        subgroups = {}
+        for group in data:
+            k = group['id']
+            v = re.sub(r'dcppc\+','',group['name'])
+            subgroups[k] = v
+
+        return subgroups
+
+
+    def crawl_group_archives(self):
+        """
+        Spider will crawl the email archives of the entire group
+        by crawling the email archives of each subgroup.
+        """
+        subgroups = self.get_subgroups_list()
+
+        # ------------------------------
+        # Start by logging in.
+
+        # Create session object to persist session data
+        session = requests.Session()
+
+        # Log in to the website
+        data = dict(email = self.credentials['groupsio_username'],
+                    password = self.credentials['groupsio_password'],
+                    timezone = 'America/Los_Angeles')
+
+        r = session.post(self.login_url,
+                         data = data)
+
+        csrf = self.get_csrf(r)
+
+        # ------------------------------
+        # For each subgroup, crawl the archives
+        # and return a list of dictionaries
+        # containing all the email threads.
+        for subgroup_id in subgroups.keys():
+            self.crawl_subgroup_archives(session,
+                                         csrf,
+                                         subgroup_id,
+                                         subgroups[subgroup_id])
+
+        # Done. archives are now tucked away
+        # in the variable self.archives
+        #
+        # self.archives is a list of dictionaries,
+        # with each dictionary containing info about
+        # a topic/email thread in a subgroup.
+        # ------------------------------
+
+
+
+    def crawl_subgroup_archives(self, session, csrf, subgroup_id, subgroup_name):
+        """
+        This kicks off the process to crawl the entire
+        archives of a given subgroup on groups.io.
+
+        For a given subgroup the url is self.url,
+
+            https://{group}.groups.io/g/{subgroup}/topics
+
+        This is the first of a paginated list of topics.
+        Procedure is:
+        - passed a starting page (or its contents)
+        - iterate through all topics via the HTML page elements
+        - assemble a bundle of information about each topic:
+            - topic title, by, URL, date, content, permalink
+            - content filtering:
+                - ^From, Reply-To, Date, To, Subject
+                - Lines containing phone numbers
+                    - 9 digits
+                    - XXX-XXX-XXXX, (XXX) XXX-XXXX
+                    - XXXXXXXXXX, XXX XXX XXXX
+                    - ^Work: or (Work) or Work$
+                    - Home, Cell, Mobile
+                    - +1 XXX
+                    - \w@\w
+        - while next button is not greyed out,
+        - click the next button
+
+        everything stored in self.archives:
+        list of dictionaries.
+
+        """
+        self.archives = []
+
+        prefix = "https://{group}.groups.io".format(group=self.group_name)
+
+        url = self.url.format(group=self.group_name,
+                              subgroup=subgroup_name)
+
+        # ------------------------------
+
+        # Now get the first page
+        r = session.get(url)
+
+        # ------------------------------
+        # Fencepost algorithm:
+
+        # First page:
+
+        # Extract a list of (title, link) items
+        items = self.extract_archive_page_items_(r)
+
+        # Get the next link
+        next_url = self.get_next_url_(r)
+
+        # Now add each item to the archive of threads,
+        # then find the next button.
+        self.add_items_to_archives_(session,subgroup_name,items)
+
+        if next_url is None:
+            return
+        else:
+            full_next_url = prefix + next_url
+
+        # Now click the next button
+        next_request = requests.get(full_next_url)
+
+        while next_request.status_code==200:
+            items = self.extract_archive_page_items_(next_request)
+            next_url = self.get_next_url_(next_request)
+            self.add_items_to_archives_(session,subgroup_name,items)
+            if next_url is None:
+                return
+            else:
+                full_next_url = prefix + next_url
+                next_request = requests.get(full_next_url)
+
+
+
+    def add_items_to_archives_(self,session,subgroup_name,items):
+        """
+        Given a set of items from a list of threads,
+        items being title and link,
+        get the page and store all info
+        in self.archives variable
+        (list of dictionaries)
+        """
+        for (title, link) in items:
+            # Get the thread page:
+            prefix = "https://{group}.groups.io".format(group=self.group_name)
+            full_link = prefix + link
+            r = session.get(full_link)
+            soup = BeautifulSoup(r.text,'html.parser')
+
+            # soup contains the entire thread
+
+            # What are we extracting:
+            # 1. thread number
+            # 2. permalink
+            # 3. content/text (filtered)
+
+            # - - - - - - - - - - - - - -
+            # 1. topic/thread number:
+            # <a rel="nofollow" href="">
+            # where link is:
+            # https://{group}.groups.io/g/{subgroup}/topic/{topic_id}
+            # example topic id: 24209140
+            #
+            # ugly links are in the form
+            # https://dcppc.groups.io/g/{subgroup}/topic/some_text_here/{thread_id}?p=,,,,,1,2,3,,,4,,5
+            # split at ?, 0th portion
+            # then split at /, last (-1th) portion
+            topic_id = link.split('?')[0].split('/')[-1]
+
+            # - - - - - - - - - - - - - - -
+            # 2. permalink:
+            # - current link is ugly link
+            # - permalink is the nice one
+            # - topic id is available from the ugly link
+            # https://{group}.groups.io/g/{subgroup}/topic/{topic_id}
+
+            permalink_template = "https://{group}.groups.io/g/{subgroup}/topic/{topic_id}"
+            permalink = permalink_template.format(
+                    group = self.group_name,
+                    subgroup = subgroup_name,
+                    topic_id = topic_id
+            )
+
+            # - - - - - - - - - - - - - - -
+            # 3. content:
+
+            # Need to rearrange how we're assembling threads here.
+            # This is one thread, no?
+            content = []
+
+            subject = soup.find('title').text
+
+            # Extract information for the schema:
+            # - permalink for thread (done)
+            # - subject/title (done)
+            # - original sender email/name (done)
+            # - content (done)
+
+            # Groups.io pages have zero CSS classes, which makes everything
+            # a giant pain in the neck to interact with. Thanks Groups.io!
+            original_sender = ''
+            for i, tr in enumerate(soup.find_all('tr',{'class':'test'})):
+                # Every other tr row contains an email.
+                if (i+1)%2==0:
+                    # nope, no email here
+                    pass
+                else:
+                    # found an email!
+                    # this is a maze, thanks groups.io
+                    td = tr.find('td')
+                    divrow = td.find('div',{'class':'row'}).find('div',{'class':'pull-left'})
+                    if (i+1)==1:
+                        original_sender = divrow.text.strip()
+                    for div in td.find_all('div'):
+                        if div.has_attr('id'):
+
+                            # purge any signatures
+                            for x in div.find_all('div',{'id':'Signature'}):
+                                x.extract()
+
+                            # purge any headers
+                            for x in div.find_all('div'):
+                                nonos = ['From:','Sent:','To:','Cc:','CC:','Subject:']
+                                for nono in nonos:
+                                    if nono in x.text:
+                                        x.extract()
+
+                            message_text = div.get_text()
+
+                            # More filtering:
+
+                            # phone numbers
+                            message_text = re.sub(r'[0-9]{3}-[0-9]{3}-[0-9]{4}','XXX-XXX-XXXX',message_text)
+                            message_text = re.sub(r'[0-9]\{10\}','XXXXXXXXXX',message_text)
+
+                            content.append(message_text)
+
+            full_content = "\n".join(content)
+
+            thread = {
+                    'permalink' : permalink,
+                    'subject' : subject,
+                    'original_sender' : original_sender,
+                    'content' : full_content
+            }
+
+            print('*'*40)
+            for k in thread.keys():
+                if k=='content':
+                    pass
+                else:
+                    print("%s : %s"%(k,thread[k]))
+            print('*'*40)
+            self.archives.append(thread)
+
+
+    def extract_archive_page_items_(self, response):
+        """
+        (Private method)
+
+        Given a response from a GET request,
+        use beautifulsoup to extract all items
+        (thread titles and ugly thread links)
+        and pass them back in a list.
+        """
+        soup = BeautifulSoup(response.content,"html.parser")
+        rows = soup.find_all('tr',{'class':'test'})
+        if 'rate limited' in soup.text:
+            raise Exception("Error: rate limit in place for Groups.io")
+
+        results = []
+        for row in rows:
+            # We don't care about anything except title and ugly link
+            subject = row.find('span',{'class':'subject'})
+            title = subject.get_text()
+            link = row.find('a')['href']
+            print(title)
+            results.append((title,link))
+
+        return results
+
+
+    def get_next_url_(self, response):
+        """
+        (Private method)
+
+        Given a response (which is a list of threads),
+        find the next button and return the URL.
+
+        If no next URL, if is disabled, then return None.
+        """
+        soup = BeautifulSoup(response.text,'html.parser')
+        chevron = soup.find('i',{'class':'fa-chevron-right'})
+        try:
+            if '#' in chevron.parent['href']:
+                # empty link, abort
+                return None
+        except AttributeError:
+            # I don't even now
+            return None
+
+        if chevron.parent.parent.has_attr('class') and 'disabled' in chevron.parent.parent['class']:
+            # no next link, abort
+            return None
+
+        return chevron.parent['href']
+
+
+
+    def get_csrf(self,resp):
+        """
+        Find the CSRF token embedded in the subgroup page
+        """
+        soup = BeautifulSoup(resp.text,'html.parser')
+        csrf = ''
+        for i in soup.find_all('input'):
+            # Note that i.name is different from i['name']
+            # the first is the actual tag,
+            # the second is the attribute name="xyz"
+            if i['name']=='csrf':
+                csrf = i['value']
+
+        if csrf=='':
+            err = "ERROR: Could not find csrf token on page."
+            raise Exception(err)
+
+        return csrf
@@ -10,3 +10,4 @@ pypandoc>=1.4
 requests>=2.19
 pandoc>=1.0
 flask-dance>=1.0.0
+beautifulsoup4>=4.6
@@ -86,9 +86,11 @@
     <div class="container-fluid">
         <div class="row">
             <div class="col-xs-12 info">
-                <b>Indexing:</b> <span class="badge">{{totals["documents"]}}</span> Google Documents,
-                <span class="badge">{{totals["issues"]}}</span> Github issues,
-                <span class="badge">{{totals["markdown"]}}</span> markdown files.
+                <b>Indexing:</b> <span
+                class="badge">{{totals["gdoc"]}}</span> Google Documents,
+                <span class="badge">{{totals["issue"]}}</span> Github issues,
+                <span class="badge">{{totals["ghfile"]}}</span> Github files,
+                <span class="badge">{{totals["markdown"]}}</span> Github markdown files.
             </div>
         </div>
     </div>
@@ -107,14 +109,15 @@
 
             <div class="url">
                 {% if e.kind=="gdoc" %}
-                    {% if e.mimetype=="document" %}
+                    {% if e.mimetype=="" %}
                         <b>Google Document:</b>
                         <a href='{{e.url}}'>{{e.title}}</a>
-                        (Type: {{e.mimetype}}, Owner: {{e.owner_name}}, {{e.owner_email}})
+                        (Owner: {{e.owner_name}}, {{e.owner_email}})<br />
+                        <b>Document Type</b>: {{e.mimetype}}
                     {% else %}
                         <b>Google Drive:</b>
                         <a href='{{e.url}}'>{{e.title}}</a>
-                        (Type: {{e.mimetype}}, Owner: {{e.owner_name}}, {{e.owner_email}})
+                        (Owner: {{e.owner_name}}, {{e.owner_email}})
                     {% endif %}
 
                 {% elif e.kind=="issue" %}