Created
October 23, 2025 16:49
-
-
Save nerewarin/0d6ab8ba01aa16845d34d0faad9b7ee3 to your computer and use it in GitHub Desktop.
Repository fetcher v3: Clean code organization, linting, and type safety
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # copy of original code from https://codeshare.io/2jk6DM + fixed and prettified (but not linted yet) | |
| # Goal: To get top-5 starred repositories from Google GitHub account (https://github.com/google/api) | |
| # | |
| # Result: Printed Python dictionary with top-5 repositories where key is a name of repository and value is count of stars. | |
| # The result should be sorted in descending order | |
| # | |
| # Sample of potential response from github: | |
| # [{ | |
| # "id": 1, | |
| # "repo_name": "google_1", | |
| # "stars": 10, | |
| # "link": "https://" | |
| # }, | |
| # { | |
| # "id": 2, | |
| # "repo_name": "google_2", | |
| # "stars": 4, | |
| # "link": "https://" | |
| # }, | |
| # { | |
| # "id": 3, | |
| # "repo_name": "google_3", | |
| # "stars": null, | |
| # "link": "https://" | |
| # }] | |
| # number of pages is known in advance - let's assume 42 | |
| # https://github.com/google/api?page=1 | |
| # https://github.com/google/api?page=2 | |
| # ... | |
| import asyncio | |
| import uvloop | |
| import traceback | |
| import logging | |
| import argparse | |
| import os | |
| from typing import Any, Protocol, List, Dict | |
# Global logger - will be reconfigured in main
logger = logging.getLogger(__name__)
# Constants
# Page numbers to fetch: 1..42 inclusive — the problem statement fixes
# the page count at 42 in advance (see header comment).
PAGES_TO_OBSERVE = list(range(1, 43)) # 42 pages
# Intended concurrency cap for page fetches.
# NOTE(review): appears unused in this file — confirm before removing.
_PARALLELISM = 10
| def setup_logging(level: str | None = None) -> logging.Logger: | |
| """Setup logging configuration with level from CLI or environment""" | |
| if level is None: | |
| level = os.getenv("LOG_LEVEL", "INFO").upper() | |
| logging.basicConfig( | |
| level=getattr(logging, level, logging.INFO), | |
| format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", | |
| datefmt="%Y-%m-%d %H:%M:%S" | |
| ) | |
| return logging.getLogger(__name__) | |
class RepositoryService(Protocol):
    """Structural (duck-typed) interface for an async repository data source.

    Any object exposing a matching ``get_repositories`` coroutine satisfies
    this protocol — no inheritance required (see MockRepositoryService).
    """
    async def get_repositories(self, page: int) -> List[Dict[str, Any]]:
        """Return the list of repository dicts for the given 1-based page.

        Implementations may raise on transient failures (e.g. timeouts);
        retrying is the caller's responsibility.
        """
        ...
class MockRepositoryService:
    """Fake RepositoryService producing deterministic GitHub-like rows.

    Behavior contract:
    - every call sleeps 0.2s to imitate a network round-trip;
    - page 1 always raises asyncio.TimeoutError (exercises the retry path);
    - any other page yields 10 repos with ids (page-1)*10+1 .. page*10,
      stars == id * 10, except the last repo whose stars is None.
    """

    async def get_repositories(self, page: int) -> List[Dict[str, Any]]:
        """Return fake repository dicts for *page* (see class docstring)."""
        # Fixed latency keeps test timing predictable.
        await asyncio.sleep(0.2)
        # Page 1 is the designated always-failing page for error-path tests.
        if page == 1:
            raise asyncio.TimeoutError(f"Timeout error for page {page}")
        first_id = (page - 1) * 10 + 1
        return [
            {
                "id": repo_id,
                "repo_name": f"google_repo_{repo_id}",
                # The last repo on each page gets None stars so the
                # downstream sorting code's None-handling is exercised.
                "stars": None if offset == 9 else repo_id * 10,
                "link": f"https://github.com/google/repo_{repo_id}",
            }
            for offset, repo_id in enumerate(range(first_id, first_id + 10))
        ]
async def _task(
    page: int,
    service: RepositoryService,
    max_tries: int = 5,
    initial_delay: float = 0.3,
    multiplier: float = 2,
) -> List[Dict[str, Any]]:
    """Fetch one page from *service* with exponential-backoff retries.

    Args:
        page: 1-based page number to request.
        service: Data source implementing RepositoryService.
        max_tries: Total number of attempts before giving up.
        initial_delay: Seconds to wait after the first failed attempt.
        multiplier: Backoff factor applied after each failure; must be > 1.

    Returns:
        The list of repository dicts for the page.

    Raises:
        ValueError: If ``multiplier`` <= 1.
        Exception: The last exception raised by the service once all
            attempts are exhausted (or RuntimeError if none was captured).
    """
    if multiplier <= 1:
        raise ValueError(f"multiplier must be greater than 1 but {multiplier=} passed!")
    last_exp: BaseException | None = None
    delay = initial_delay
    while max_tries:
        max_tries -= 1
        logger.debug(f"requesting page {page}...")
        try:
            return await service.get_repositories(page)
        except Exception as exc:  # retry boundary: any service error is retryable
            last_exp = exc
            logger.warning(f"Failed to fetch page {page}, retrying... (attempts left: {max_tries})")
            # Bug fix: only back off if another attempt remains — the old
            # code slept one extra delay after the FINAL failure too.
            if max_tries:
                await asyncio.sleep(delay)
                delay *= multiplier
    # All attempts failed: log the real traceback and re-raise.
    # Bug fix: the old code called traceback.print_exc() here, outside any
    # except block, where there is no active exception — it printed
    # "NoneType: None" instead of the failure. exc_info logs the real one.
    logger.error(f"Failed to fetch page {page} after all retries")
    if last_exp is not None:
        logger.error("Full traceback:", exc_info=last_exp)
        raise last_exp
    raise RuntimeError(f"Failed to fetch page {page} after all retries")
def _flatten(list_of_lists: List[List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
    """Concatenate the per-page result lists into one flat list, order kept."""
    return [entry for batch in list_of_lists for entry in batch]
async def _get_pages(pages_to_observe: List[int], service: RepositoryService):
    """Fetch all pages concurrently and return the merged repository list.

    Pages whose retries were exhausted are logged and dropped rather than
    failing the whole run.
    """
    coros = [_task(page, service) for page in pages_to_observe]
    # return_exceptions=True keeps one failed page from cancelling the rest;
    # failures come back as exception objects in the result list instead.
    outcomes: List[List[Dict[str, Any]] | BaseException] = await asyncio.gather(*coros, return_exceptions=True)
    collected: List[List[Dict[str, Any]]] = []
    for idx, outcome in enumerate(outcomes):
        if isinstance(outcome, BaseException):
            logger.error(f"Task {idx} failed: {outcome}")
            continue
        collected.append(outcome)
    return _flatten(collected)
async def main(pages_to_observe: List[int], service: RepositoryService):
    """Return the top-5 repositories by star count, descending.

    Steps:
    1. fetch every page concurrently;
    2. flatten the per-page batches into one list;
    3. sort by stars, descending, treating a missing key or None as 0;
    4. keep the first five entries.
    """
    repos: List[Dict[str, Any]] = await _get_pages(pages_to_observe, service)
    # "or 0" maps both an absent "stars" key and an explicit null to zero.
    by_stars_desc = sorted(
        repos,
        key=lambda repo: repo.get("stars", 0) or 0,
        reverse=True,
    )
    return by_stars_desc[:5]
def parse_args():
    """Build the CLI argument parser and return the parsed namespace."""
    cli = argparse.ArgumentParser(
        description="Get top 5 starred repositories from Google's GitHub account",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s # Run with default INFO logging
  %(prog)s --log-level DEBUG # Show debug messages including page requests
  %(prog)s --log-level ERROR # Only show errors
  LOG_LEVEL=DEBUG %(prog)s # Use environment variable for log level
Environment Variables:
  LOG_LEVEL Set default logging level (DEBUG, INFO, WARNING, ERROR)
""",
    )
    # The env var seeds the default so CLI > env > "INFO" precedence holds;
    # argparse only validates choices for values actually passed on the CLI.
    cli.add_argument(
        "--log-level",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        default=os.getenv("LOG_LEVEL", "INFO"),
        help="Set logging level (default: INFO, can also use LOG_LEVEL env var)",
    )
    return cli.parse_args()
if __name__ == '__main__':
    # Script entry point: wire up CLI args, logging, the uvloop event loop,
    # and the mock data service, then emit the top-5 result via the logger.
    args = parse_args()
    # Setup logging with CLI level
    setup_logging(args.log_level)
    logger.info("Starting repository fetcher")
    # Install uvloop's event-loop policy BEFORE asyncio.run() creates a loop.
    # NOTE(review): uvloop.install() is deprecated in recent uvloop releases
    # in favor of uvloop.run() — confirm against the pinned uvloop version.
    uvloop.install()
    service = MockRepositoryService()
    result = asyncio.run(main(PAGES_TO_OBSERVE, service))
    logger.info("Top 5 repositories:")
    # enumerate(..., 1) prints a human-friendly 1-based rank
    for i, repo in enumerate(result, 1):
        logger.info(f"{i}. {repo}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment