"""
Classes to cache and read specific items from github issues in a uniform way
"""

from functools import partial as Partial
import datetime
import time
import shelve

# Requires PyGithub version >= 1.13 for access to raw_data attribute
import github

try:
    xrange
except NameError:  # Python 3 has no xrange; its range is already lazy
    xrange = range


def _truncate_to_minute(when):
    """
    Return a naive datetime equal to ``when`` with seconds/microseconds zeroed

    :param when: datetime.datetime instance
    :return: new datetime.datetime, minute resolution, no tzinfo
    """
    return datetime.datetime(year=when.year, month=when.month, day=when.day,
                             hour=when.hour, minute=when.minute,
                             second=0, microsecond=0)


# Needed to not confuse cached 'None' objects
class Nothing(object):

    """
    Stand-in stored in the cache when a fetch legitimately returned None
    """
    raw_data = None


# Needed to signal list cache, not github object
class SearchResults(object):

    """
    Container whose raw_data is the tuple of positional arguments given
    """

    def __init__(self, *stuff):
        self.raw_data = stuff


class GithubCache(object):

    """
    Auto-refreshing github.GithubObject.GithubObject from dict

    Calling an instance returns the object rebuilt from cached raw data,
    fetching (and caching) fresh data when the entry is missing or expired.
    """

    # Counters start at class level, the first increment creates an
    # instance attribute so each instance counts independently.
    cache_hits = 0
    cache_misses = 0

    # Lifetime of a cache entry, keyed by fetched class (or special string)
    cache_lifetimes = {
        'default': datetime.timedelta(hours=2),
        github.GitCommit.GitCommit: datetime.timedelta(days=30),
        github.Commit.Commit: datetime.timedelta(days=30),
        github.Issue.Issue: datetime.timedelta(minutes=30),
        github.PullRequest.PullRequest: datetime.timedelta(hours=1),
        # Special case for github.Issue.Issue
        'closed': datetime.timedelta(days=30),
        SearchResults: datetime.timedelta(minutes=10),
        # This key was previously listed twice (30 days, then 2 hours);
        # only the last value ever took effect, so only it is kept.
        github.NamedUser.NamedUser: datetime.timedelta(hours=2),
        github.GitAuthor.GitAuthor: datetime.timedelta(days=9999),
        'total_issues': datetime.timedelta(days=9999)
    }

    def __init__(self, github_obj, cache_get_partial, cache_set_partial,
                 cache_del_partial, pre_fetch_partial, fetch_partial):
        """
        Store the callables used to reach the cache and to fetch fresh data

        :param github_obj: object providing create_from_raw_data()
        :param cache_get_partial: callable returning the cached dict,
                                  raises KeyError when there is no entry
        :param cache_set_partial: callable accepting value=<dict>
        :param cache_del_partial: callable removing the cache entry
        :param pre_fetch_partial: callable run before each fetch, or a
                                  non-callable (e.g. None) to skip it
        :param fetch_partial: callable returning the freshly fetched object
        """
        self.github = github_obj
        self.cache_get = cache_get_partial  # Returns native dict
        self.cache_set = cache_set_partial  # called with value=dict
        self.cache_del = cache_del_partial
        self.pre_fetch = pre_fetch_partial  # called with nothing
        self.fetch = fetch_partial  # Returns github.GithubObject.GithubObject

    def __call__(self):
        """
        Retrieve instance from fresh or cached data

        :return: list of objects for a cached PaginatedList, None for a
                 cached Nothing, the raw tuple for SearchResults, otherwise
                 the object rebuilt by create_from_raw_data()
        :raise: KeyError from fetched_data() when github reports not-found;
                any conversion error is re-raised after purging the entry
        """
        # Sub-minute precision isn't useful for cache expiry
        now = _truncate_to_minute(datetime.datetime.utcnow())
        try:
            data = self.cached_data()
            if data['expires'] < now:
                raise KeyError  # refresh cache
            self.cache_hits += 1
        except KeyError:
            data = self.fetched_data(now)
            self.cache_set(value=data)
            self.cache_misses += 1
        # Any exceptions thrown during conversion should purge cache entry
        try:
            # Format data for consumption
            klass = data['klass']
            if klass == github.PaginatedList.PaginatedList:
                # Key is absent when the cached list was empty
                inside_klass = data.get('inside_klass')
                return [self.github.create_from_raw_data(inside_klass, item)
                        for item in data['raw_data']]
            elif klass == Nothing:
                return None  # it's a None object
            elif klass == SearchResults:
                return data['raw_data']  # just the contents
            else:
                return self.github.create_from_raw_data(klass,
                                                        data['raw_data'])
        except:  # deliberate: clean up on anything, then re-raise
            # Purging happens inside a helper call so that handling its
            # KeyError cannot replace the exception re-raised below
            # (a nested try/except here would do that on Python 2).
            self._purge_entry()
            raise  # original exception

    def _purge_entry(self):
        """
        Delete this cache entry, ignoring an already-missing entry
        """
        try:
            self.cache_del()
        except KeyError:
            pass  # doesn't exist in cache, ignore

    @staticmethod
    def format_data(klass, expires, raw_data, inside_klass=None):
        """
        Enforce uniform data format for fetched data

        :param klass: class of the fetched object
        :param expires: datetime after which the entry is stale
        :param raw_data: serializable payload
        :param inside_klass: class of list members, key omitted when None
        :return: dict with klass, fetched, expires, raw_data and optionally
                 inside_klass keys
        """
        data = {'klass': klass,
                'fetched': datetime.datetime.utcnow(),
                'expires': expires,
                'raw_data': raw_data}
        if inside_klass is not None:
            data['inside_klass'] = inside_klass
        return data

    def fetched_data(self, now):
        """
        Return dictionary containing freshly fetched values

        :param now: datetime the expiry time is calculated from
        :raise: KeyError when github answers with a 404 status, any other
                github.GithubException is passed through unchanged
        """
        try:
            if callable(self.pre_fetch):
                self.pre_fetch()
            fetched_obj = self.fetch()
        except github.GithubException as detail:
            if detail.status == 404:
                # fetch is normally a functools.partial, fall back
                # gracefully for plain callables
                raise KeyError('Github item not-found error while calling %s '
                               'with args=%s and dargs=%s'
                               % (getattr(self.fetch, 'func', self.fetch),
                                  getattr(self.fetch, 'args', ()),
                                  getattr(self.fetch, 'keywords', None)))
            else:
                raise
        if fetched_obj is None:
            fetched_obj = Nothing()
        klass = fetched_obj.__class__
        default_lifetime = self.cache_lifetimes['default']
        # github.PaginatedList.PaginatedList need special handling
        if isinstance(fetched_obj, github.PaginatedList.PaginatedList):
            members = list(fetched_obj)
            raw_data = [item.raw_data for item in members]
            # An empty list has no member class; indexing [0] would raise
            # IndexError, so fall back to the default lifetime instead.
            inside_klass = None
            if members:
                inside_klass = members[0].__class__
            expires = now + self.cache_lifetimes.get(inside_klass,
                                                     default_lifetime)
            return self.__class__.format_data(klass, expires, raw_data,
                                              inside_klass)
        expires = now + self.cache_lifetimes.get(klass, default_lifetime)
        # closed issues/pull requests don't change much
        if getattr(fetched_obj, 'closed_at', None) is not None:
            expires = now + self.cache_lifetimes['closed']
        return self.__class__.format_data(klass, expires,
                                          fetched_obj.raw_data)

    def cached_data(self):
        """
        Return dictionary containing cached values or raise KeyError

        Any other failure while reading tries to delete the entry before
        the exception is propagated.
        """
        try:
            return self.cache_get()  # maybe raise KeyError or TypeError
        except KeyError:
            raise
        except:  # deliberate: unreadable entry, remove it and re-raise
            # Try to delete the entry
            self.cache_del()
            raise


class GithubIssuesBase(list):

    """
    Base class for cached list of github issues

    Subclasses must provide get_gh_obj(cache_key, fetch_partial), it is
    called by __getitem__ but not defined here.
    """

    # Force static pickle protocol version
    protocol = 2

    # Class to use for cache management
    cache_class = GithubCache

    def __init__(self, github_obj, repo_full_name, cache_filename):
        """
        Initialize cache and reference github repository issues

        :param github_obj: object providing rate_limiting, get_repo() and
                           create_from_raw_data()
        :param repo_full_name: repository name handed to get_repo()
        :param cache_filename: file name handed to shelve.open()
        """
        self.github = github_obj
        self.repo_full_name = repo_full_name
        self.shelf = shelve.open(filename=cache_filename,
                                 protocol=self.protocol,
                                 writeback=True)

        # Avoid exceeding rate-limit per hour
        requests = self.github.rate_limiting[1]  # requests per hour
        period = 60.0 * 60.0  # one hour in seconds
        sleeptime = period / requests
        self.pre_fetch_partial = Partial(time.sleep, sleeptime)
        # self.pre_fetch_partial = None # cheat-mode enable (no delays)

        repo_cache_key = 'repo_%s' % self.repo_full_name
        # get_repo called same way throughout instance life
        cache_get_partial = Partial(self.shelf.__getitem__, repo_cache_key)
        cache_set_partial = Partial(self.shelf.__setitem__, repo_cache_key)
        cache_del_partial = Partial(self.shelf.__delitem__, repo_cache_key)
        fetch_partial = Partial(self.github.get_repo, self.repo_full_name)
        # Callable instance retrieves cached or fetched value for key
        self.get_repo = self.cache_class(self.github,
                                         cache_get_partial,
                                         cache_set_partial,
                                         cache_del_partial,
                                         self.pre_fetch_partial,
                                         fetch_partial)
        super(GithubIssuesBase, self).__init__()

    def __del__(self):
        """
        Make sure cache is saved
        """
        try:
            self.shelf.close()
        except AttributeError:
            pass  # Open must have failed

    def __len__(self):
        """
        Probe issue numbers with shrinking steps until largest identified

        :return: highest issue number that can be retrieved, 0 if issue 1
                 cannot be retrieved
        """
        increment = 1000
        last_issue = 1
        if not self.__contains__(last_issue):
            return 0  # no issues
        while increment > 0:
            while self.__contains__(last_issue):
                last_issue += increment
            # Fall back to prior one
            last_issue -= increment
            # Chop increment in half; floor division keeps it an integer
            # on every Python version so the loop ends at zero
            increment //= 2
        return last_issue

    def __contains__(self, key):
        """
        Return True if the issue numbered ``key`` can be retrieved
        """
        try:
            # Must call this classes method specifically
            GithubIssuesBase.__getitem__(self, key)
        except KeyError:
            return False
        return True

    def __iter__(self):
        for key in self.keys():
            yield self[key]

    def __setitem__(self, key, value):
        raise KeyError("Read only mapping while trying to set %s to %s"
                       % (str(key), str(value)))

    def __delitem__(self, key):
        raise KeyError(
            "Read only mapping while trying to delete %s" % str(key))

    def __getitem__(self, key):
        """
        Return the object get_gh_obj() gives for issue number ``key``

        :param key: issue number, anything int() accepts
        :raise: KeyError when the issue cannot be fetched (github 404)
        """
        repo = self.get_repo()
        # Enforce uniform key string
        cache_key = self.get_issue_cache_key(key)
        fetch_partial = Partial(repo.get_issue, int(key))
        item = self.get_gh_obj(cache_key, fetch_partial)
        # No exception raised, update cache on disk
        self.shelf.sync()
        return item

    def get_issue_cache_key(self, number):
        """
        Return the cache key string for issue ``number`` of this repo
        """
        return 'repo_%s_issue_%s' % (self.repo_full_name, str(int(number)))

    def has_key(self, key):
        return self.__contains__(key)

    def items(self):
        """
        Return iterator over issues (NOT key/value pairs), in number order
        """
        # Iterator comprehension
        return (self[key] for key in self.keys())

    def keys(self):
        """
        Return lazy sequence of issue numbers 1..len(self)
        """
        # Iterators are simply better
        return xrange(1, self.__len__() + 1)

    def values(self):
        """
        Return iterator over issues, in number order
        """
        # items() yields bare issues rather than (key, value) pairs, so
        # unpacking pairs from it could never work; index directly instead.
        return (self[key] for key in self.keys())


class GithubIssues(GithubIssuesBase, object):

    """
    Read-only List-like interface to cached github issues in standardized format
    """

    # Marshal callables for key to github.Issue.Issue value
    marshal_map = {
        'number': lambda gh_obj: getattr(gh_obj, 'number'),
        'summary': lambda gh_obj: getattr(gh_obj, 'title'),
        'description': lambda gh_obj: getattr(gh_obj, 'body'),
        'modified': lambda gh_obj: getattr(gh_obj, 'updated_at'),
        'commits': NotImplementedError,  # setup in __init__
        'opened': lambda gh_obj: getattr(gh_obj, 'created_at'),
        'closed': lambda gh_obj: getattr(gh_obj, 'closed_at'),
        'assigned': lambda gh_obj: getattr(gh_obj, 'assignee'),
        'author': lambda gh_obj: getattr(gh_obj, 'user').login,
        'commit_authors': NotImplementedError,  # setup in __init__
        'comments': lambda gh_obj: getattr(gh_obj, 'comments'),
        'comment_authors': NotImplementedError,  # setup in __init__
        'labels': lambda gh_obj: [label.name for label in gh_obj.labels],
        'url': lambda gh_obj: getattr(gh_obj, 'html_url'),
        'github_issue': lambda gh_obj: gh_obj
    }

    # Storage for property values
    _cache_hits = 0   # Tracks temporary cache instances
    _cache_misses = 0  # Tracks temporary cache instances

    def __init__(self, github_obj, repo_full_name):
        """
        Initialize cache and reference github repository issues

        The cache file is named after the class ('<ClassName>.cache').
        """
        cache_filename = self.__class__.__name__ + '.cache'
        super(GithubIssues, self).__init__(github_obj,
                                           repo_full_name,
                                           cache_filename)
        # Work on a per-instance copy: writing bound methods into the
        # class-level dict would make every instance marshal through
        # whichever instance was created last.
        self.marshal_map = dict(self.marshal_map)
        # These marshal functions require state
        self.marshal_map['commits'] = self.gh_pr_commits
        self.marshal_map['commit_authors'] = self.gh_pr_commit_authors
        self.marshal_map['comment_authors'] = self.gh_issue_comment_authors

    def __del__(self):
        try:
            self.vacuum()
        except AttributeError:
            pass  # Open must have failed, nothing to vacuum
        super(GithubIssues, self).__del__()

    def vacuum(self):
        """Vacuum up all expired entries"""
        now = datetime.datetime.utcnow()
        # Can't modify shelf while iterating, collect keys first
        keys_to_del = [key for key, value in self.shelf.items()
                       if value['expires'] <= now]  # no need to be precise
        for key in keys_to_del:
            del self.shelf[key]

    @property
    def cache_hits(self):
        return self.get_repo.cache_hits + self._cache_hits

    @property
    def cache_misses(self):
        return self.get_repo.cache_misses + self._cache_misses

    def __getitem__(self, key):
        """
        Return a standardized dict of github issue

        :param key: issue number
        :return: dict with one entry for every key of marshal_map
        """
        item = self.marshal_gh_obj(super(GithubIssues, self).__getitem__(key))
        self.shelf.sync()
        return item

    def __len__(self):
        """
        Return cached number of issues
        """
        cache_key = 'repo_%s_total_issues' % self.repo_full_name
        # seconds aren't useful when fetch takes > 1 minute
        now = _truncate_to_minute(datetime.datetime.utcnow())
        # Easier to do custom caching behavior here than fuss with GithubCache
        try:
            cache_data = self.shelf[cache_key]
            if cache_data['expires'] < now:
                raise KeyError
            # Bypass search_result caching used in self.search()
            # NOTE(review): github's 'since' filter is documented as
            # "updated since", so this may count more than newly created
            # issues - confirm against the API before relying on totals.
            searchresult = self.make_search_results(
                {'since': cache_data['since']})
            # about to change the number
            cache_data['since'] = now
            # total equal to old count plus new count since then
            cache_data['raw_data'] += len(searchresult.raw_data)
        except KeyError:
            cache_data = {}
            # doesn't expire ever
            cache_data['expires'] = now + GithubCache.cache_lifetimes[
                'total_issues']
            cache_data['since'] = now
            # This will take a while if issue cache is stale
            cache_data['raw_data'] = super(GithubIssues, self).__len__()
        self.shelf[cache_key] = cache_data
        return cache_data['raw_data']

    def get_gh_obj(self, cache_key, fetch_partial):
        """
        Helper to get object possibly from cache and update counters

        :param cache_key: shelf key the object is cached under
        :param fetch_partial: callable fetching the object when needed
        :return: whatever calling the GithubCache instance returns
        """
        cache_get_partial = Partial(self.shelf.__getitem__,
                                    cache_key)
        cache_set_partial = Partial(self.shelf.__setitem__,
                                    cache_key)
        cache_del_partial = Partial(self.shelf.__delitem__,
                                    cache_key)
        # Callable instance could change every time
        get_obj = GithubCache(self.github,
                              cache_get_partial,
                              cache_set_partial,
                              cache_del_partial,
                              self.pre_fetch_partial,
                              fetch_partial)
        result = get_obj()
        self._cache_hits += get_obj.cache_hits
        self._cache_misses += get_obj.cache_misses
        return result  # DOES NOT SYNC DATA!

    def search(self, criteria):
        """
        Return a list of issue-numbers that match a search criteria.

        :param criteria: Dictionary of search terms
            state - str - 'open', 'closed'
            assignee - str (login), "none" or "*"
            mentioned - str (login), "none" or "*"
            labels - list of str (label name)
            sort - str - 'created', 'updated', 'comments'
            direction - str - 'asc', 'desc'
            since - datetime.datetime
        :raise: ValueError for an unacceptable criteria value, unknown
                login or unknown label
        """
        valid_criteria = {}
        # use search dictionary to form hash for cached results
        search_cache_key = 'issue_search'
        # Validate & transform criteria
        if 'state' in criteria:
            state = str(criteria['state'])
            if state not in ('open', 'closed'):
                raise ValueError("'state' criteria must be 'open' or 'closed'")
            valid_criteria['state'] = state
            search_cache_key = '%s_%s' % (search_cache_key, state)

        if 'assignee' in criteria:
            assignee = str(criteria['assignee'])
            search_cache_key = '%s_%s' % (search_cache_key, assignee)
            if assignee in ('none', '*'):
                valid_criteria['assignee'] = assignee
            else:
                # returns github.NamedUser.NamedUser
                valid_criteria['assignee'] = self.get_gh_user(assignee)

        if 'mentioned' in criteria:
            # Must read the 'mentioned' term itself; reading 'assignee'
            # here raised KeyError or searched for the wrong user.
            mentioned = str(criteria['mentioned'])
            search_cache_key = '%s_%s' % (search_cache_key, mentioned)
            if mentioned in ('none', '*'):
                valid_criteria['mentioned'] = mentioned
            else:
                # returns github.NamedUser.NamedUser
                valid_criteria['mentioned'] = self.get_gh_user(mentioned)

        if 'labels' in criteria:
            labels = criteria['labels']
            if not isinstance(labels, list):
                raise ValueError("'labels' criteria must be a list")
            valid_criteria['labels'] = []
            for name in labels:
                name = str(name)
                # One key component per label name, not the whole list
                search_cache_key = '%s_%s' % (search_cache_key, name)
                valid_criteria['labels'].append(self.get_gh_label(name))

        if 'sort' in criteria:
            sort = str(criteria['sort'])
            if sort not in ('created', 'updated', 'comments'):
                raise ValueError("'sort' criteria must be 'created', 'updated'"
                                 ", 'comments'")
            valid_criteria['sort'] = sort
            search_cache_key = '%s_%s' % (search_cache_key, sort)

        if 'direction' in criteria:
            direction = str(criteria['direction'])
            if direction not in ('asc', 'desc'):
                raise ValueError("'direction' criteria must be 'asc', 'desc'")
            valid_criteria['direction'] = direction
            search_cache_key = '%s_%s' % (search_cache_key, direction)

        if 'since' in criteria:
            since = criteria['since']
            if not isinstance(since, datetime.datetime):
                raise ValueError("'since' criteria must be a "
                                 "datetime.datetime")
            # second and microsecond not useful to search or cache
            since = _truncate_to_minute(since)
            search_cache_key = '%s_%s' % (search_cache_key, since.isoformat())
            valid_criteria['since'] = since

        # Do not perform search operation unless no cached results
        # or cached results have expired
        fetch_partial = Partial(self.make_search_results, valid_criteria)
        # This could take an arbitrarily LONG time
        return self.get_gh_obj(search_cache_key, fetch_partial)

    def make_search_results(self, valid_criteria):
        """
        Return a SearchResults instance from issue numbers found by search

        :param valid_criteria: keyword arguments handed to repo.get_issues()
        """
        repo = self.get_repo()
        result = repo.get_issues(**valid_criteria)
        return SearchResults(*[issue.number for issue in result])

    def clean_cache_entry(self, key):
        """
        Remove an entry from cache, ignoring any KeyErrors
        """
        try:
            del self.shelf[key]
        except KeyError:
            pass

    def get_gh_user(self, login):
        """
        Return the (possibly cached) github user object for ``login``

        :raise: ValueError when the user lookup raises KeyError
        """
        cache_key = 'github_user_%s' % login
        fetch_partial = Partial(self.github.get_user, login)
        try:
            return self.get_gh_obj(cache_key, fetch_partial)
        except KeyError:
            raise ValueError('login %s is not a valid github user' % login)

    def get_gh_label(self, name):
        """
        Return the (possibly cached) github label object called ``name``

        :raise: ValueError when the label lookup raises KeyError
        """
        repo = self.get_repo()
        cache_key = str('repo_%s_label_%s' % (self.repo_full_name, name))
        fetch_partial = Partial(repo.get_label, name)
        try:
            return self.get_gh_obj(cache_key, fetch_partial)
        except KeyError:
            raise ValueError('label %s is not valid for repo %s' %
                             (name, self.repo_full_name))

    def marshal_gh_obj(self, gh_issue):
        """
        Translate a github issue object into dictionary w/ fixed keys
        """
        return dict((key, marshal(gh_issue))
                    for key, marshal in self.marshal_map.items())

    @staticmethod
    def gh_issue_is_pull(gh_issue):
        """
        Return True/False if gh_issue is a pull request or not

        An issue counts as a pull request when it has a pull_request
        attribute value with at least one of diff_url, html_url or
        patch_url set.
        """
        pullreq = gh_issue.pull_request
        if pullreq is None:
            return False
        no_urls = (pullreq.diff_url is None and
                   pullreq.html_url is None and
                   pullreq.patch_url is None)
        return not no_urls

    # marshal_map method
    def gh_issue_comment_authors(self, gh_issue):
        """
        Retrieve the set of comment author e-mail addresses

        :return: set of the 'email' attribute of every comment author,
                 None when the issue has no comments
        """
        if not gh_issue.comments > 0:
            return None
        num = gh_issue.number
        cache_key = ('repo_%s_issue_%s_comments'
                     % (self.repo_full_name, num))
        fetch_partial = Partial(gh_issue.get_comments)
        authors = set()
        for comment in self.get_gh_obj(cache_key, fetch_partial):
            # Referencing user attribute requires a request, so cache it
            user_cache_key = cache_key + '_%s_user' % comment.id
            user_fetch_partial = Partial(getattr, comment, 'user')
            try:
                user = self.get_gh_obj(user_cache_key, user_fetch_partial)
            except:  # deliberate: clean up on anything, then re-raise
                # Also clean up comments cache
                self.clean_cache_entry(cache_key)
                raise  # original exception
            authors.add(user.email)
        return authors

    # marshal_map method
    def gh_pr_commit_authors(self, gh_issue):
        """
        Return set of commit authors for a pull-request

        Members are the github login when the commit author is a github
        user, otherwise the e-mail recorded in the git commit.

        :return: set of str, or None when gh_issue is not a pull request
                 or the pull request has no commits
        """
        if not GithubIssues.gh_issue_is_pull(gh_issue):
            return None  # not a pull request
        num = gh_issue.number
        repo = self.get_repo()
        cache_key = 'repo_%s_pull_%s' % (self.repo_full_name, str(num))
        fetch_partial = Partial(repo.get_pull, num)
        pull = self.get_gh_obj(cache_key, fetch_partial)
        if pull.commits is None or pull.commits < 1:
            return None  # No commits == no commit authors

        cache_key = 'repo_%s_pull_%s_commits' % (self.repo_full_name,
                                                 str(num))
        fetch_partial = Partial(pull.get_commits)
        authors = set()
        for commit in self.get_gh_obj(cache_key, fetch_partial):
            # Referencing commit author requires a request, cache it.
            author_cache_key = cache_key + '_%s_author' % str(commit.sha)
            author_fetch_partial = Partial(getattr, commit, 'author')
            try:
                author_obj = self.get_gh_obj(author_cache_key,
                                             author_fetch_partial)
            except:  # deliberate: clean up on anything, then re-raise
                # clean up commit list cache entry also
                self.clean_cache_entry(cache_key)
                raise  # original exception
            if author_obj is not None:
                # Author is a github user
                authors.add(author_obj.login)
                continue
            # Retrieve e-mail from git commit object
            # Referencing git commit requires a request, cache it
            gitcommit_cache_key = (cache_key + '_%s_gitcommit'
                                   % str(commit.sha))
            gitcommit_fetch_partial = Partial(getattr, commit,
                                              'commit')  # git commit
            try:
                gitcommit = self.get_gh_obj(gitcommit_cache_key,
                                            gitcommit_fetch_partial)
            except:  # deliberate: clean up on anything, then re-raise
                # Need to clean commit and gitcommit entries
                self.clean_cache_entry(cache_key)
                self.clean_cache_entry(gitcommit_cache_key)
                raise
            authors.add(gitcommit.author.email)
        return authors

    # marshal_map method
    def gh_pr_commits(self, gh_issue):
        """
        Retrieves the number of commits on a pull-request, None if not a pull.
        """
        if not GithubIssues.gh_issue_is_pull(gh_issue):
            return None  # not a pull request
        num = gh_issue.number
        repo = self.get_repo()
        cache_key = 'repo_%s_pull_%s' % (self.repo_full_name, str(num))
        fetch_partial = Partial(repo.get_pull, num)
        pull = self.get_gh_obj(cache_key, fetch_partial)
        return pull.commits


class MutateError(KeyError):

    """
    Raised when a key of an issue cannot be set or deleted
    """

    def __init__(self, key, number):
        super(MutateError, self).__init__("Unable to modify %s on issue %d"
                                          % (str(key), number))


class MutableIssue(dict):

    """Allow modification of some issue values"""

    def __init__(self, github_issues, issue_number):
        """
        :param github_issues: GithubIssues instance the issue belongs to
        :param issue_number: number of the issue to modify
        :raise: ValueError when github_issues is not a GithubIssues,
                KeyError when the issue cannot be retrieved
        """
        if not isinstance(github_issues, GithubIssues):
            raise ValueError("github_issues %s is not a GithubIssues, it's a %s"
                             % (str(github_issues), str(type(github_issues))))
        # make sure issue_number is valid and cached
        github_issues[issue_number]
        # Private for private _github_issue property access
        self._github_issues = github_issues
        self._issue_number = issue_number
        super(MutableIssue, self).__init__()

    @property
    def _github_issue(self):
        return self._github_issues[self._issue_number]

    @property
    def _issue_cache_key(self):
        # The key helper lives on the GithubIssues instance, not on this
        # dict subclass.
        return self._github_issues.get_issue_cache_key(self._issue_number)

    def _setdelitem(self, opr, key, value):
        """
        Dispatch to the set_<key>/del_<key> method of this instance

        :param opr: 'set' or 'del'
        :param key: issue dictionary key to modify
        :param value: new value, only passed on when opr is 'set'
        :raise: MutateError when key is unknown or has no such method
        """
        if key not in self._github_issues.marshal_map:
            raise MutateError(key, self._issue_number)
        # Default of None turns a missing method into MutateError instead
        # of leaking AttributeError to the caller
        method = getattr(self, '%s_%s' % (opr, key), None)
        if not callable(method):
            raise MutateError(key, self._issue_number)
        if opr == 'set':
            method(value)
        else:
            method()

    def __getitem__(self, key):
        # Guarantees fresh/cached data for every call
        return self._github_issue[key]

    def __setitem__(self, key, value):
        self._setdelitem('set', key, value)

    def __delitem__(self, key):
        self._setdelitem('del', key, None)

    def set_labels(self, value):
        """
        Merge list of new labels into existing label set

        :param value: iterable of label names to add
        :raise: ValueError (from get_gh_label) if any label name is bad
        """
        new_labels = set(value)
        old_labels = set(self._github_issue['labels'])
        change_list = list(new_labels | old_labels)
        get_gh_label = self._github_issues.get_gh_label  # save typing
        # Raise exception if any label name is bad
        gh_labels = [get_gh_label(label) for label in change_list]
        # Access PyGithub object to change labels
        self._github_issue['github_issue'].set_labels(*gh_labels)
        # Force retrieval of changed item
        self._github_issues.clean_cache_entry(self._issue_cache_key)

    def del_labels(self):
        """
        Remove all labels from an issue
        """
        self._github_issue['github_issue'].delete_labels()
        # Force retrieval of changed item
        self._github_issues.clean_cache_entry(self._issue_cache_key)

    # TODO: Write get_*(), set_*(), del_*() for other dictionary keys