10 Commits
v0.1 ... round3

7 changed files with 91 additions and 36 deletions

View File

@@ -4,6 +4,7 @@ use whoosh to search github issues.
Implemented in **Python** using **Flask**, **Whoosh** and **Mistune**.
<img src="img/screenshot.png" width="500px" />
## notes
@@ -35,8 +36,15 @@ summary of how to change the schema:
- Search class defines Schema object, main definition
- `add_issue()` (equally important) which defines how to extract the
fields defined in the schema from the document
-
- `create_search_result()` (also important) which packages up the
search results for the template to deal with
- `search.html`: the search.html template uses a different variable
namespace than the Python file `issues_search.py` or the Flask app
- The `create_search_result()` method of `issues_search.py`
defines how search results are parsed and packaged for the
`search.html` template
- Jinja variables used in `search.html` should be defined in
`create_search_result()` method of `issues_search.py`

31
Todo.md Normal file
View File

@@ -0,0 +1,31 @@
# TODO
recap of round 1:
- issues search is working well
- indexing comments and issues
- able to easily add new fields to schema
- able to easily modify search + results template
- mapping out where everything is
## Round 2 (done)
improvements:
- storing comments and issues as separate objects?
- storing a boolean? that simple? customize the output of the search result
based on a boolean?
- if so, how do we pass off a search result to a template conditionally,
such that we can save some space (jinja question)
fix stuff that isn't mine:
- improve the readme
- fix the config.py config file options
config:
- enable user to specify list of organizations+repos
- not just one org/list of repos
## Round 3
organization:
- mapping out how to change the schema... now, how do we streamline it?
- how to organize files

BIN
img/screenshot.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 614 KiB

View File

@@ -23,10 +23,8 @@ routes:
"""
def get_items():
repo_list = ['2018-may-workshop',
'2018-june-workshop',
'2018-july-workshop']
repo_org = 'dcppc'
repo_list = app.config["REPOS"]
repo_org =app.config["REPO_ORG"]
gh_access_token = os.environ['GITHUB_TOKEN']
@@ -65,12 +63,10 @@ def search():
if fields == 'None':
fields = None
directories = []
search = Search(app.config["INDEX_DIR"])
if not query:
parsed_query = ""
result = []
directories=get_directories()
else:
parsed_query, result = search.search(query.split(), fields=[fields])
@@ -78,7 +74,7 @@ def search():
total = search.get_document_total_count()
return render_template('search.html', entries=result, query=query, parsed_query=parsed_query, fields=fields, last_searches=get_last_searches(), directories=directories, total=total)
return render_template('search.html', entries=result, query=query, parsed_query=parsed_query, fields=fields, last_searches=get_last_searches(), total=total)
@app.route('/open')
def open_file():
@@ -98,7 +94,6 @@ def update_index():
else:
UpdateIndexTask()
flash("Updating index, check console output")
store_directories()
return render_template("search.html", query="", fields="", last_searches=get_last_searches())
@@ -111,18 +106,7 @@ def get_last_searches():
return contents
def get_directories():
    """Return the lines stored in ``directories_file`` (empty list if absent).

    Marked "remove for issues" by the original author.
    """
    # Nothing has been written yet: report no directories.
    if not os.path.exists(directories_file):
        return []
    with codecs.open(directories_file, 'r', encoding='utf-8') as handle:
        # Leaving the with-block closes the handle.
        return handle.readlines()
def store_search(query, fields):
'''remove for issues'''
if os.path.exists(last_searches_file):
with codecs.open(last_searches_file, 'r', encoding='utf-8') as f:
contents = f.readlines()
@@ -136,17 +120,6 @@ def store_search(query, fields):
with codecs.open(last_searches_file, 'w', encoding='utf-8') as f:
f.writelines(contents[:30])
def store_directories():
    """Record the directory names found under ``MARKDOWN_FILES_DIR``.

    Each name is lower-cased and newline-terminated; the de-duplicated,
    sorted names are written to ``directories.txt`` inside ``INDEX_DIR``.
    Marked "remove for issues" by the original author.
    """
    markdown_root = app.config["MARKDOWN_FILES_DIR"]
    # A set drops repeated names that occur at different depths of the tree.
    names = {
        "%s\n" % sub.lower()
        for parent, subdirs, _files in os.walk(markdown_root)
        for sub in subdirs
        if os.path.isdir(os.path.join(parent, sub))
    }
    target = app.config["INDEX_DIR"] + "/directories.txt"
    with codecs.open(target, 'w', encoding='utf-8') as out:
        out.writelines(sorted(names))
if __name__ == '__main__':
app.run()

View File

@@ -17,6 +17,16 @@ from whoosh.analysis import StemmingAnalyzer
"""
issues-search.py Flow:
very high level description:
- zeroth step: create a search index
- first step: load a search index
- second step: call the search() method
- third step: update the search index
program will:
- create a Search object
- call add_all_issues
@@ -88,6 +98,7 @@ class Search:
schema = Schema(
url=ID(stored=True, unique=True),
is_comment=BOOLEAN(stored=True),
timestamp=STORED,
repo_name=TEXT(stored=True),
repo_url=ID(stored=True),
@@ -116,6 +127,7 @@ class Search:
Schema:
- url
- is_comment
- timestamp
- repo_name
- repo_url
@@ -137,6 +149,7 @@ class Search:
print("Indexing issue %s"%(issue.html_url))
writer.add_document(
url = issue.html_url,
is_comment = False,
timestamp = issue.created_at,
repo_name = repo_name,
repo_url = repo_url,
@@ -155,6 +168,7 @@ class Search:
print(" > Indexing comment %s"%(comment.html_url))
writer.add_document(
url = comment.html_url,
is_comment = True,
timestamp = comment.created_at,
repo_name = repo_name,
repo_url = repo_url,
@@ -168,6 +182,7 @@ class Search:
return count
'''
def add_all_issues(self, gh_access_token, list_of_repos, which_org, config, create_new_index=False):
"""
Add all issues in a given github repo to the search index.
@@ -214,6 +229,8 @@ class Search:
writer.commit()
print("Done, added %d documents to the index" % c)
'''
def update_index_incremental(self, gh_access_token, list_of_repos, which_org, config, create_new_index=False):
@@ -245,6 +262,12 @@ class Search:
writer = self.ix.writer()
# FIXME: the "delete all documents in the index" step
# is not happening in the right place.
# Iterate over each repo
for this_repo in list_of_repos:
@@ -307,6 +330,8 @@ class Search:
sr.issue_title = r['issue_title']
sr.issue_url = r['issue_url']
sr.is_comment = r['is_comment']
sr.content = r['content']
highlights = r.highlights('content')
if not highlights:
@@ -360,5 +385,5 @@ if __name__ == "__main__":
search.add_all_issues(gh_access_token,
repo_list,
repo_org,
"/Users/charles/codes/markdown-search/config.py")
"/Users/charles/codes/issues-search/config.py")

11
requirements.txt Normal file
View File

@@ -0,0 +1,11 @@
Flask>=0.12.1
apiclient>=1.0.3
oauth2client>=3.0.0
httplib2>=0.10.3
google-api-python-client
mistune>=0.8
whoosh>=2.7.4
PyGithub>=1.39
pypandoc>=1.4
requests>=2.19
pandoc>=1.0

View File

@@ -34,9 +34,16 @@
<div class="path"><a href='{{ url_for("open_file")}}?path={{e.path|urlencode}}&query={{query}}&fields={{fields}}'>{{e.path}}</a>score: {{'%d' % e.score}}</div>
-->
<div class="url">
<a
href='{{e.repo_url}}'>dcppc/{{e.repo_name}}</a>
- <a href='{{e.issue_url}}'>{{e.issue_title}}</a> - <a href='{{e.url}}'>link</a><br />
{% if e.is_comment %}
<b>Comment</b> <a href='{{e.url}}'>(comment link)</a>
on issue <a href='{{e.issue_url}}'>{{e.issue_title}}</a>
in repo <a href='{{e.repo_url}}'>dcppc/{{e.repo_name}}</a>
<br />
{% else %}
<b>Issue</b> <a href='{{e.issue_url}}'>{{e.issue_title}}</a>
in repo <a href='{{e.repo_url}}'>dcppc/{{e.repo_name}}</a>
<br />
{% endif %}
score: {{'%d' % e.score}}
</div>
<div class="markdown-body">{{ e.content_highlight|safe}}</div>