Compare commits
5 Commits
fix-flashe...use-dateti
| Author | SHA1 | Date |
|---|---|---|
| | ab76226b0c | |
| | a4ebef6e6f | |
| | bad50efa9b | |
| | 629fc063db | |
| | 3b0baa21de | |
@@ -21,6 +21,8 @@ import dateutil.parser
 
 from whoosh.qparser import MultifieldParser, QueryParser
 from whoosh.analysis import StemmingAnalyzer
+from whoosh.qparser.dateparse import DateParserPlugin
+from whoosh import fields, index
 
 
 """
@@ -180,30 +182,38 @@ class Search:
 
         # is defined.
 
         schema = Schema(
-            id = ID(stored=True, unique=True),
-            kind = ID(stored=True),
+            id = fields.ID(stored=True, unique=True),
+            kind = fields.ID(stored=True),
 
-            created_time = ID(stored=True),
-            modified_time = ID(stored=True),
-            indexed_time = ID(stored=True),
+            created_time = fields.DATETIME(stored=True),
+            modified_time = fields.DATETIME(stored=True),
+            indexed_time = fields.DATETIME(stored=True),
 
-            title = TEXT(stored=True, field_boost=100.0),
-            url = ID(stored=True, unique=True),
-
-            mimetype=ID(stored=True),
-            owner_email=ID(stored=True),
-            owner_name=TEXT(stored=True),
-
-            repo_name=TEXT(stored=True),
-            repo_url=ID(stored=True),
-
-            github_user=TEXT(stored=True),
+            title = fields.TEXT(stored=True, field_boost=100.0),
+            url = fields.ID(stored=True),
+
+            mimetype = fields.TEXT(stored=True),
+            owner_email = fields.ID(stored=True),
+            owner_name = fields.TEXT(stored=True),
+
+            # mainly for email threads, groups.io, hypothesis
+            group = fields.ID(stored=True),
+
+            repo_name = fields.TEXT(stored=True),
+            repo_url = fields.ID(stored=True),
+            github_user = fields.TEXT(stored=True),
+
+            tags = fields.KEYWORD(commas=True,
+                                  stored=True,
+                                  lowercase=True),
 
         # comments only
-            issue_title=TEXT(stored=True, field_boost=100.0),
-            issue_url=ID(stored=True),
+            issue_title = fields.TEXT(stored=True, field_boost=100.0),
+            issue_url = fields.ID(stored=True),
 
-            content=TEXT(stored=True, analyzer=stemming_analyzer)
+            content = fields.TEXT(stored=True, analyzer=stemming_analyzer)
         )
 
 
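Note: the substantive change in this hunk is moving the three timestamp columns from ID to fields.DATETIME, which stores real datetime objects and is what later makes date-range queries possible. A minimal standalone sketch of that behavior, assuming only that Whoosh is installed (the "testindex" directory name is illustrative, not part of this changeset):

    import os, datetime
    from whoosh import fields, index

    schema = fields.Schema(
        id=fields.ID(stored=True, unique=True),
        created_time=fields.DATETIME(stored=True),
        content=fields.TEXT(stored=True),
    )

    os.makedirs("testindex", exist_ok=True)
    ix = index.create_in("testindex", schema)
    writer = ix.writer()
    # DATETIME fields expect datetime objects, not ISO strings; handing
    # them a string raises an error, which is why the hunks below wrap
    # writer.add_document() in try/except ValueError.
    writer.add_document(id=u"doc1",
                        created_time=datetime.datetime.now(),
                        content=u"hello world")
    writer.commit()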
@@ -243,17 +253,22 @@ class Search:
 
             writer.delete_by_term('id',item['id'])
 
         # Index a plain google drive file
+        created_time = dateutil.parser.parse(item['createdTime'])
+        modified_time = dateutil.parser.parse(item['modifiedTime'])
+        indexed_time = datetime.now().replace(microsecond=0)
+        try:
             writer.add_document(
                     id = item['id'],
                     kind = 'gdoc',
-                    created_time = item['createdTime'],
-                    modified_time = item['modifiedTime'],
-                    indexed_time = datetime.now().replace(microsecond=0).isoformat(),
+                    created_time = created_time,
+                    modified_time = modified_time,
+                    indexed_time = indexed_time,
                     title = item['name'],
                     url = item['webViewLink'],
                     mimetype = mimetype,
                     owner_email = item['owners'][0]['emailAddress'],
                     owner_name = item['owners'][0]['displayName'],
+                    group='',
                     repo_name='',
                     repo_url='',
                     github_user='',
@@ -261,6 +276,9 @@ class Search:
                     issue_url='',
                     content = content
             )
+        except ValueError as e:
+            print(repr(e))
+            print(" > XXXXXX Failed to index Google Drive file \"%s\""%(item['name']))
 
 
         else:
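Note: the Drive API returns createdTime/modifiedTime as RFC 3339 strings, and the DATETIME schema fields above will not accept raw strings, so the new lines parse them first. Roughly what that conversion does (the timestamp literal is illustrative):

    import dateutil.parser

    # The Drive API returns timestamps shaped like this example value:
    created_time = dateutil.parser.parse("2018-06-01T12:34:56.000Z")
    print(created_time)   # 2018-06-01 12:34:56+00:00, a timezone-aware datetime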
@@ -314,7 +332,7 @@ class Search:
             )
             assert output == ""
         except RuntimeError:
-            print(" > XXXXXX Failed to index document \"%s\""%(item['name']))
+            print(" > XXXXXX Failed to index Google Drive document \"%s\""%(item['name']))
 
 
         # If export was successful, read contents of markdown
@@ -342,17 +360,22 @@ class Search:
         else:
             print(" > Creating a new record")
 
+        try:
+            created_time = dateutil.parser.parse(item['createdTime'])
+            modified_time = dateutil.parser.parse(item['modifiedTime'])
+            indexed_time = datetime.now()
             writer.add_document(
                     id = item['id'],
                     kind = 'gdoc',
-                    created_time = item['createdTime'],
-                    modified_time = item['modifiedTime'],
-                    indexed_time = datetime.now().replace(microsecond=0).isoformat(),
+                    created_time = created_time,
+                    modified_time = modified_time,
+                    indexed_time = indexed_time,
                     title = item['name'],
                     url = item['webViewLink'],
                     mimetype = mimetype,
                     owner_email = item['owners'][0]['emailAddress'],
                     owner_name = item['owners'][0]['displayName'],
+                    group='',
                     repo_name='',
                     repo_url='',
                     github_user='',
@@ -360,6 +383,10 @@ class Search:
                     issue_url='',
                     content = content
             )
+        except ValueError as e:
+            print(repr(e))
+            print(" > XXXXXX Failed to index Google Drive file \"%s\""%(item['name']))
+
 
 
@@ -393,13 +420,14 @@ class Search:
             issue_comment_content += comment.body.rstrip()
             issue_comment_content += "\n"
 
-        # Now create the actual search index record
-        created_time = clean_timestamp(issue.created_at)
-        modified_time = clean_timestamp(issue.updated_at)
-        indexed_time = clean_timestamp(datetime.now())
+        # Now create the actual search index record.
 
         # Add one document per issue thread,
         # containing entire text of thread.
+        created_time = issue.created_at
+        modified_time = issue.updated_at
+        indexed_time = datetime.now()
+        try:
             writer.add_document(
                     id = issue.html_url,
                     kind = 'issue',
@@ -411,6 +439,7 @@ class Search:
                     mimetype='',
                     owner_email='',
                     owner_name='',
+                    group='',
                     repo_name = repo_name,
                     repo_url = repo_url,
                     github_user = issue.user.login,
@@ -418,6 +447,9 @@ class Search:
                     issue_url = issue.html_url,
                     content = issue_comment_content
             )
+        except ValueError as e:
+            print(repr(e))
+            print(" > XXXXXX Failed to index Github issue \"%s\""%(issue.title))
 
 
@@ -447,7 +479,8 @@ class Search:
                 print(" > XXXXXXXX Failed to find file info.")
                 return
 
-        indexed_time = clean_timestamp(datetime.now())
+        indexed_time = datetime.now()
+
         if fext in MARKDOWN_EXTS:
             print("Indexing markdown doc %s from repo %s"%(fname,repo_name))
@@ -476,17 +509,19 @@ class Search:
             usable_url = "https://github.com/%s/blob/master/%s"%(repo_name, fpath)
 
             # Now create the actual search index record
+            try:
                 writer.add_document(
                         id = fsha,
                         kind = 'markdown',
-                        created_time = '',
-                        modified_time = '',
+                        created_time = None,
+                        modified_time = None,
                         indexed_time = indexed_time,
                         title = fname,
                         url = usable_url,
                         mimetype='',
                         owner_email='',
                         owner_name='',
+                        group='',
                         repo_name = repo_name,
                         repo_url = repo_url,
                         github_user = '',
@@ -494,6 +529,11 @@ class Search:
                         issue_url = '',
                         content = content
                 )
+            except ValueError as e:
+                print(repr(e))
+                print(" > XXXXXX Failed to index Github markdown file \"%s\""%(fname))
 
 
         else:
             print("Indexing github file %s from repo %s"%(fname,repo_name))
@@ -501,17 +541,19 @@ class Search:
             key = fname+"_"+fsha
 
             # Now create the actual search index record
+            try:
                 writer.add_document(
                         id = key,
                         kind = 'ghfile',
-                        created_time = '',
-                        modified_time = '',
+                        created_time = None,
+                        modified_time = None,
                         indexed_time = indexed_time,
                         title = fname,
                         url = repo_url,
                         mimetype='',
                         owner_email='',
                         owner_name='',
+                        group='',
                         repo_name = repo_name,
                         repo_url = repo_url,
                         github_user = '',
@@ -519,6 +561,9 @@ class Search:
                         issue_url = '',
                         content = ''
                 )
+            except ValueError as e:
+                print(repr(e))
+                print(" > XXXXXX Failed to index Github file \"%s\""%(fname))
 
 
@@ -532,20 +577,32 @@ class Search:
         Use a Github file API record to add a filename
         to the search index.
         """
-        indexed_time = clean_timestamp(datetime.now())
+        if 'created_time' in d.keys() and d['created_time'] is not None:
+            created_time = d['created_time']
+        else:
+            created_time = None
+
+        if 'modified_time' in d.keys() and d['modified_time'] is not None:
+            modified_time = d['modified_time']
+        else:
+            modified_time = None
+
+        indexed_time = datetime.now()
 
         # Now create the actual search index record
+        try:
             writer.add_document(
                     id = d['permalink'],
                     kind = 'emailthread',
-                    created_time = '',
-                    modified_time = '',
+                    created_time = created_time,
+                    modified_time = modified_time,
                     indexed_time = indexed_time,
                     title = d['subject'],
                     url = d['permalink'],
                     mimetype='',
                     owner_email='',
                     owner_name=d['original_sender'],
+                    group=d['subgroup'],
                     repo_name = '',
                     repo_url = '',
                     github_user = '',
@@ -553,7 +610,9 @@ class Search:
                     issue_url = '',
                     content = d['content']
             )
+        except ValueError as e:
+            print(repr(e))
+            print(" > XXXXXX Failed to index Groups.io thread \"%s\""%(d['subject']))
 
 
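Note: the added guards let an email thread carry created_time/modified_time when the crawler recovered them, and fall back to None otherwise (see the except AttributeError branch in the crawler hunks further down). As an observation only, not what the commit does, the same logic collapses to dict.get:

    created_time = d.get('created_time')     # None when the key is absent or holds None
    modified_time = d.get('modified_time')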
@@ -631,10 +690,10 @@ class Search:
                 full_items[f['id']] = f
 
             ## Shorter:
-            #break
-            # Longer:
-            if nextPageToken is None:
                 break
+
+            ## Longer:
+            #if nextPageToken is None:
+            #    break
 
 
         writer = self.ix.writer()
@@ -642,7 +701,7 @@ class Search:
         temp_dir = tempfile.mkdtemp(dir=os.getcwd())
         print("Temporary directory: %s"%(temp_dir))
 
+        try:
 
             # Drop any id in indexed_ids
             # not in remote_ids
@@ -670,6 +729,13 @@ class Search:
                 self.add_drive_file(writer, item, temp_dir, config, update=False)
                 count += 1
 
+        except Exception as e:
+            print("ERROR: While adding Google Drive files to search index")
+            print("-"*40)
+            print(repr(e))
+            print("-"*40)
+            print("Continuing...")
+            pass
 
         print("Cleaning temporary directory: %s"%(temp_dir))
         subprocess.call(['rm','-fr',temp_dir])
@@ -1074,7 +1140,7 @@ class Search:
         elif doctype=='issue':
             item_keys = ['title','repo_name','repo_url','url','created_time','modified_time']
         elif doctype=='emailthread':
-            item_keys = ['title','owner_name','url']
+            item_keys = ['title','owner_name','url','created_time','modified_time']
         elif doctype=='ghfile':
             item_keys = ['title','repo_name','repo_url','url']
         elif doctype=='markdown':
@@ -1091,10 +1157,6 @@ class Search:
         for r in results:
             d = {}
             for k in item_keys:
-                if k=='created_time' or k=='modified_time':
-                    #d[k] = r[k]
-                    d[k] = dateutil.parser.parse(r[k]).strftime("%Y-%m-%d")
-                else:
                     d[k] = r[k]
             json_results.append(d)
 
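Note: the deleted branch existed because timestamps used to be stored as ISO strings; with fields.DATETIME, Whoosh hands back stored values as datetime objects, so the dateutil round-trip is no longer needed. A sketch (not part of this changeset) of how a caller could still produce the old "%Y-%m-%d" strings:

    # r[k] is already a datetime for the DATETIME fields
    if k in ('created_time', 'modified_time') and r[k] is not None:
        d[k] = r[k].strftime("%Y-%m-%d")
    else:
        d[k] = r[k]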
@@ -1108,7 +1170,9 @@ class Search:
         query_string = " ".join(query_list)
         query = None
         if ":" in query_string:
-            query = QueryParser("content", self.schema).parse(query_string)
+            query = QueryParser("content", self.schema)
+            query.add_plugin(DateParserPlugin(free=True))
+            query = query.parse(query_string)
         elif len(fields) == 1 and fields[0] == "filename":
             pass
         elif len(fields) == 2:
@@ -1116,9 +1180,12 @@ class Search:
         else:
             # If the user does not specify a field,
             # these are the fields that are actually searched
-            fields = ['title', 'content','owner_name','owner_email','url']
+            fields = ['title', 'content','owner_name','owner_email','url','created_date','modified_date']
         if not query:
-            query = MultifieldParser(fields, schema=self.ix.schema).parse(query_string)
+            query = MultifieldParser(fields, schema=self.ix.schema)
+            query.add_plugin(DateParserPlugin(free=True))
+            query = query.parse(query_string)
+            #query = MultifieldParser(fields, schema=self.ix.schema).parse(query_string)
         parsed_query = "%s" % query
         print("query: %s" % parsed_query)
         results = searcher.search(query, terms=False, scored=True, groupedby="kind")
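Note: the one-liners are split apart because DateParserPlugin must be attached to the parser before parse() runs. Once attached, date syntax in the query string resolves against the DATETIME fields; a sketch against the schema above:

    from whoosh.qparser import QueryParser
    from whoosh.qparser.dateparse import DateParserPlugin

    parser = QueryParser("content", ix.schema)
    parser.add_plugin(DateParserPlugin(free=True))
    # free=True lets date expressions appear loosely in the query text,
    # e.g. a fielded day like the one below.
    query = parser.parse(u"created_time:20180601 hello")

One thing worth double-checking in this hunk: the default field list gains 'created_date' and 'modified_date', while the schema names these fields created_time and modified_time.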
@@ -1,5 +1,7 @@
 import requests, os, re
 from bs4 import BeautifulSoup
+import dateutil.parser
+import datetime
 
 class GroupsIOException(Exception):
     pass
@@ -64,7 +66,7 @@ class GroupsIOArchivesCrawler(object):
 
             ## Short circuit
             ## for debugging purposes
-            #break
+            break
 
         return subgroups
 
@@ -251,7 +253,7 @@ class GroupsIOArchivesCrawler(object):
         subject = soup.find('title').text
 
         # Extract information for the schema:
-        # - permalink for thread (done)
+        # - permalink for thread (done above)
         # - subject/title (done)
         # - original sender email/name (done)
         # - content (done)
@@ -266,11 +268,35 @@ class GroupsIOArchivesCrawler(object):
                 pass
             else:
                 # found an email!
-                # this is a maze, thanks groups.io
+                # this is a maze, not amazing.
+                # thanks groups.io!
                 td = tr.find('td')
-                divrow = td.find('div',{'class':'row'}).find('div',{'class':'pull-left'})
+
+                sender_divrow = td.find('div',{'class':'row'})
+                sender_divrow = sender_divrow.find('div',{'class':'pull-left'})
                 if (i+1)==1:
-                    original_sender = divrow.text.strip()
+                    original_sender = sender_divrow.text.strip()
+
+                date_divrow = td.find('div',{'class':'row'})
+                date_divrow = date_divrow.find('div',{'class':'pull-right'})
+                date_divrow = date_divrow.find('font',{'class':'text-muted'})
+                date_divrow = date_divrow.find('script').text
+                try:
+                    time_seconds = re.search(' [0-9]{1,} ',date_divrow).group(0)
+                    time_seconds = time_seconds.strip()
+                    # Thanks groups.io for the weird date formatting
+                    time_seconds = time_seconds[:10]
+                    mmicro_seconds = time_seconds[10:]
+                    if (i+1)==1:
+                        created_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
+                        modified_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
+                    else:
+                        modified_time = datetime.datetime.utcfromtimestamp(int(time_seconds))
+
+                except AttributeError:
+                    created_time = None
+                    modified_time = None
 
                 for div in td.find_all('div'):
                     if div.has_attr('id'):
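Note: groups.io embeds each post's date as a millisecond epoch inside an inline <script> tag, so the new code regex-extracts the digits, keeps the first ten (the seconds), and converts with utcfromtimestamp. A standalone sketch of that recovery (the epoch literal is made up):

    import datetime, re

    script_text = " ... 1528029296000 ... "   # hypothetical script body
    ms_epoch = re.search(' [0-9]{1,} ', script_text).group(0).strip()
    seconds = ms_epoch[:10]                   # '1528029296'; millisecond digits dropped
    created_time = datetime.datetime.utcfromtimestamp(int(seconds))
    print(created_time)                       # 2018-06-03 12:34:56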
@@ -299,7 +325,10 @@ class GroupsIOArchivesCrawler(object):
 
         thread = {
             'permalink' : permalink,
+            'created_time' : created_time,
+            'modified_time' : modified_time,
             'subject' : subject,
+            'subgroup' : subgroup_name,
             'original_sender' : original_sender,
             'content' : full_content
         }
@@ -324,11 +353,13 @@ class GroupsIOArchivesCrawler(object):
 
         results = []
         for row in rows:
-            # We don't care about anything except title and ugly link
+            # This is where we extract
+            # a list of thread titles
+            # and corresponding links.
             subject = row.find('span',{'class':'subject'})
             title = subject.get_text()
             link = row.find('a')['href']
-            #print(title)
             results.append((title,link))
 
         return results