10 Commits
v0.1 ... round3

7 changed files with 91 additions and 36 deletions


@@ -4,6 +4,7 @@ use whoosh to search github issues.
Implemented in **Python** using **Flask**, **Whoosh** and **Mistune**.
<img src="img/screenshot.png" width="500px" />
## notes
@@ -35,8 +36,15 @@ summary of how to change the schema:
- Search class defines Schema object, main definition
- `add_issue()` (equally important) which defines how to extract the
fields defined in the schema from the document
- `create_search_result()` (also important) which packages up the
search results for the template to deal with
- `search.html`: the search.html template uses a different variable
namespace than the Python file `issues_search.py` or the flask app
- The `create_search_result()` method of `issues_search.py`
defines how search results are parsed and packaged for the
`search.html` template
- Jinja variables used in `search.html` should be defined in
the `create_search_result()` method of `issues_search.py`
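For concreteness, here is a minimal sketch of the three touch points when adding a hypothetical `labels` field. These are abbreviated standalone functions rather than the actual `Search` methods; the extraction logic and attribute names are illustrative only.

```python
from whoosh.fields import Schema, ID, TEXT

# 1. Search class: extend the Schema with the new field.
schema = Schema(
    url=ID(stored=True, unique=True),
    issue_title=TEXT(stored=True),
    labels=TEXT(stored=True),        # hypothetical new field
    content=TEXT(stored=True),
)

# 2. add_issue(): pull the field out of the GitHub issue object while indexing.
def add_issue(writer, issue):
    writer.add_document(
        url=issue.html_url,
        issue_title=issue.title,
        labels=" ".join(label.name for label in issue.labels),  # illustrative extraction
        content=issue.body or "",
    )

# 3. create_search_result(): copy the stored field onto the result object so that
#    search.html can refer to it as {{ e.labels }}.
def create_search_result(sr, r):
    sr.labels = r["labels"]
    return sr
```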

Todo.md Normal file

@@ -0,0 +1,31 @@
# TODO
recap of round 1:
- issues search is working well
- indexing comments and issues
- able to easily add new fields to schema
- able to easily modify search + results template
- mapping out where everything is
## Round 2 (done)
improvements:
- storing comments and issues as separate objects?
- storing a boolean? that simple? customize the output of the search result
based on a boolean?
- if so, how do we pass off a search result to a template conditionally,
such that we can save some space (jinja question; see the sketch after this list)
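One way to answer the boolean question (and roughly what the `search.html` change later in this diff ends up doing) is to store an `is_comment` flag on each result and branch on it in the template. A self-contained Jinja sketch, rendered from Python with hypothetical result dicts:

```python
from jinja2 import Template

# Hypothetical search results: one comment and one issue, distinguished by a boolean flag.
results = [
    {"is_comment": True, "url": "https://github.com/dcppc/repo/issues/1#issuecomment-2",
     "issue_title": "Example issue", "issue_url": "https://github.com/dcppc/repo/issues/1"},
    {"is_comment": False, "url": "https://github.com/dcppc/repo/issues/1",
     "issue_title": "Example issue", "issue_url": "https://github.com/dcppc/repo/issues/1"},
]

# One branch per result type; the surrounding layout is written only once.
template = Template("""
{% for e in entries %}
{% if e.is_comment %}Comment on <a href='{{ e.issue_url }}'>{{ e.issue_title }}</a>
{% else %}Issue <a href='{{ e.url }}'>{{ e.issue_title }}</a>
{% endif %}
{% endfor %}
""")

print(template.render(entries=results))
```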
fix stuff that isn't mine:
- improve the readme
- fix the config.py config file options
config:
- enable user to specify list of organizations+repos
- not just one org/list of repos
## Round 3
organization:
- mapping out how to change the schema... now, how do we streamline it?
- how to organize files

img/screenshot.png Normal file (binary, 614 KiB; not shown)


@@ -23,10 +23,8 @@ routes:
""" """
def get_items(): def get_items():
repo_list = ['2018-may-workshop',
'2018-june-workshop',
'2018-july-workshop']
repo_org = 'dcppc'
repo_list = app.config["REPOS"]
repo_org = app.config["REPO_ORG"]
gh_access_token = os.environ['GITHUB_TOKEN']
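With this change the repo list lives in the Flask config rather than in the route. A sketch of the corresponding `config.py` entries, where the key names come from the `app.config[...]` lookups in this file and the values are only illustrative:

```python
# config.py (illustrative values; only the key names are taken from the app.config lookups)
REPO_ORG = "dcppc"
REPOS = [
    "2018-may-workshop",
    "2018-june-workshop",
    "2018-july-workshop",
]
INDEX_DIR = "search_index"
```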
@@ -65,12 +63,10 @@ def search():
if fields == 'None':
fields = None
directories = []
search = Search(app.config["INDEX_DIR"])
if not query:
parsed_query = ""
result = []
directories=get_directories()
else:
parsed_query, result = search.search(query.split(), fields=[fields])
@@ -78,7 +74,7 @@ def search():
total = search.get_document_total_count()
return render_template('search.html', entries=result, query=query, parsed_query=parsed_query, fields=fields, last_searches=get_last_searches(), directories=directories, total=total)
return render_template('search.html', entries=result, query=query, parsed_query=parsed_query, fields=fields, last_searches=get_last_searches(), total=total)
@app.route('/open')
def open_file():
@@ -98,7 +94,6 @@ def update_index():
else:
UpdateIndexTask()
flash("Updating index, check console output")
store_directories()
return render_template("search.html", query="", fields="", last_searches=get_last_searches()) return render_template("search.html", query="", fields="", last_searches=get_last_searches())
@@ -111,18 +106,7 @@ def get_last_searches():
return contents
def get_directories():
'''remove for issues'''
if os.path.exists(directories_file):
with codecs.open(directories_file, 'r', encoding='utf-8') as f:
directories = f.readlines()
f.close()
else:
directories = []
return directories
def store_search(query, fields):
'''remove for issues'''
if os.path.exists(last_searches_file):
with codecs.open(last_searches_file, 'r', encoding='utf-8') as f:
contents = f.readlines()
@@ -136,17 +120,6 @@ def store_search(query, fields):
with codecs.open(last_searches_file, 'w', encoding='utf-8') as f:
f.writelines(contents[:30])
def store_directories():
'''remove for issues'''
directories = []
for root, dirnames, files in os.walk(app.config["MARKDOWN_FILES_DIR"]):
if dirnames:
for d in dirnames:
if os.path.isdir(os.path.join(root, d)):
directories.append("%s\n" % d.lower())
directories = sorted(set(directories))
with codecs.open(app.config["INDEX_DIR"] + "/directories.txt", 'w', encoding='utf-8') as f:
f.writelines(directories)
if __name__ == '__main__':
app.run()


@@ -17,6 +17,16 @@ from whoosh.analysis import StemmingAnalyzer
""" """
issues-search.py Flow: issues-search.py Flow:
very high level description:
- zeroth step: create a search index
- first step: load a search index
- second step: call the search() method
- third step: update the search index
program will:
- create a Search object
- call add_all_issues
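A rough usage sketch of that flow, using the class and method names that appear in this file; the module name, index directory, and argument values are assumptions:

```python
import os
from issues_search import Search   # assumes the module imports under this name

# zeroth/first step: create or load the index in a directory
search = Search("search_index")

# third step (run periodically): crawl GitHub and refresh the index
search.update_index_incremental(
    gh_access_token=os.environ["GITHUB_TOKEN"],
    list_of_repos=["2018-may-workshop"],
    which_org="dcppc",
    config="config.py",
    create_new_index=False,
)

# second step: run a query; fields=[None] searches all fields, mirroring the Flask route
parsed_query, results = search.search("whoosh index".split(), fields=[None])
```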
@@ -88,6 +98,7 @@ class Search:
schema = Schema(
url=ID(stored=True, unique=True),
is_comment=BOOLEAN(stored=True),
timestamp=STORED,
repo_name=TEXT(stored=True),
repo_url=ID(stored=True),
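For context on the new field type, a self-contained Whoosh sketch (a throwaway in-memory index, not the project's schema) showing that a `BOOLEAN` field can be stored and then filtered with `is_comment:true` / `is_comment:false` query syntax:

```python
from whoosh.fields import Schema, ID, TEXT, BOOLEAN
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import QueryParser

toy = Schema(url=ID(stored=True, unique=True),
             content=TEXT(stored=True),
             is_comment=BOOLEAN(stored=True))

ix = RamStorage().create_index(toy)
writer = ix.writer()
writer.add_document(url="issues/1", content="the original issue text", is_comment=False)
writer.add_document(url="issues/1#issuecomment-2", content="a follow-up comment", is_comment=True)
writer.commit()

with ix.searcher() as searcher:
    # restrict results to comments only
    query = QueryParser("content", ix.schema).parse("is_comment:true")
    for hit in searcher.search(query):
        print(hit["url"], hit["is_comment"])   # -> issues/1#issuecomment-2 True
```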
@@ -116,6 +127,7 @@ class Search:
Schema:
- url
- is_comment
- timestamp
- repo_name
- repo_url
@@ -137,6 +149,7 @@ class Search:
print("Indexing issue %s"%(issue.html_url)) print("Indexing issue %s"%(issue.html_url))
writer.add_document( writer.add_document(
url = issue.html_url, url = issue.html_url,
is_comment = False,
timestamp = issue.created_at,
repo_name = repo_name,
repo_url = repo_url,
@@ -155,6 +168,7 @@ class Search:
print(" > Indexing comment %s"%(comment.html_url)) print(" > Indexing comment %s"%(comment.html_url))
writer.add_document( writer.add_document(
url = comment.html_url, url = comment.html_url,
is_comment = True,
timestamp = comment.created_at,
repo_name = repo_name,
repo_url = repo_url,
@@ -168,6 +182,7 @@ class Search:
return count
'''
def add_all_issues(self, gh_access_token, list_of_repos, which_org, config, create_new_index=False):
"""
Add all issues in a given github repo to the search index.
@@ -214,6 +229,8 @@ class Search:
writer.commit()
print("Done, added %d documents to the index" % c)
'''
def update_index_incremental(self, gh_access_token, list_of_repos, which_org, config, create_new_index=False):
@@ -245,6 +262,12 @@ class Search:
writer = self.ix.writer()
# fix this. the delete-all on the index
# is not occurring in the right place.
# Iterate over each repo
for this_repo in list_of_repos:
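The `# fix this` note above is about where existing documents get cleared during an incremental update. A small self-contained sketch of the usual Whoosh pattern, deleting each stale document by its unique `url` key inside the loop rather than wiping the index up front (toy data, not the project's code):

```python
from whoosh.fields import Schema, ID, TEXT
from whoosh.filedb.filestore import RamStorage

ix = RamStorage().create_index(Schema(url=ID(stored=True, unique=True),
                                      content=TEXT(stored=True)))

def reindex(index, documents):
    """Incremental update: drop the stale copy of each document just before re-adding it."""
    writer = index.writer()
    for url, text in documents:
        writer.delete_by_term("url", url)        # delete scoped to this one document
        writer.add_document(url=url, content=text)
    writer.commit()

reindex(ix, [("issues/1", "first crawl")])
reindex(ix, [("issues/1", "second crawl")])      # replaces the old copy instead of duplicating it
with ix.searcher() as searcher:
    print(searcher.doc_count())                  # -> 1
```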
@@ -307,6 +330,8 @@ class Search:
sr.issue_title = r['issue_title']
sr.issue_url = r['issue_url']
sr.is_comment = r['is_comment']
sr.content = r['content']
highlights = r.highlights('content')
if not highlights:
@@ -360,5 +385,5 @@ if __name__ == "__main__":
search.add_all_issues(gh_access_token,
repo_list,
repo_org,
"/Users/charles/codes/markdown-search/config.py") "/Users/charles/codes/issues-search/config.py")

requirements.txt Normal file

@@ -0,0 +1,11 @@
Flask>=0.12.1
apiclient>=1.0.3
oauth2client>=3.0.0
httplib2>=0.10.3
google-api-python-client
mistune>=0.8
whoosh>=2.7.4
PyGithub>=1.39
pypandoc>=1.4
requests>=2.19
pandoc>=1.0


@@ -34,9 +34,16 @@
<div class="path"><a href='{{ url_for("open_file")}}?path={{e.path|urlencode}}&query={{query}}&fields={{fields}}'>{{e.path}}</a>score: {{'%d' % e.score}}</div> <div class="path"><a href='{{ url_for("open_file")}}?path={{e.path|urlencode}}&query={{query}}&fields={{fields}}'>{{e.path}}</a>score: {{'%d' % e.score}}</div>
--> -->
<div class="url"> <div class="url">
<a
href='{{e.repo_url}}'>dcppc/{{e.repo_name}}</a>
- <a href='{{e.issue_url}}'>{{e.issue_title}}</a> - <a href='{{e.url}}'>link</a><br />
{% if e.is_comment %}
<b>Comment</b> <a href='{{e.url}}'>(comment link)</a>
on issue <a href='{{e.issue_url}}'>{{e.issue_title}}</a>
in repo <a href='{{e.repo_url}}'>dcppc/{{e.repo_name}}</a>
<br />
{% else %}
<b>Issue</b> <a href='{{e.issue_url}}'>{{e.issue_title}}</a>
in repo <a href='{{e.repo_url}}'>dcppc/{{e.repo_name}}</a>
<br />
{% endif %}
score: {{'%d' % e.score}}
</div>
<div class="markdown-body">{{ e.content_highlight|safe}}</div>