@nerewarin
Created October 23, 2025 16:49
Repository fetcher v3: Clean code organization, linting, and type safety
# Copy of the original code from https://codeshare.io/2jk6DM, fixed and prettified (but not linted yet)
# Goal: get the top-5 starred repositories from the Google GitHub account (https://github.com/google/api)
#
# Result: a printed Python dictionary with the top-5 repositories, where the key is the repository name
# and the value is its star count. The result should be sorted in descending order of stars.
#
# Sample of a potential response from GitHub:
# [{
# "id": 1,
# "repo_name": "google_1",
# "stars": 10,
# "link": "https://"
# },
# {
# "id": 2,
# "repo_name": "google_2",
# "stars": 4,
# "link": "https://"
# },
# {
# "id": 3,
# "repo_name": "google_3",
# "stars": null,
# "link": "https://"
# }]
# The number of pages is known in advance - let's assume 42:
# https://github.com/google/api?page=1
# https://github.com/google/api?page=2
# ...
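# Illustrative example of the final result shape described above (values taken from the
# sample response; the actual numbers depend on the live data):
#     {"google_1": 10, "google_2": 4, ...}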
import argparse
import asyncio
import logging
import os
from typing import Any, Dict, List, Protocol

import uvloop

# Global logger - will be reconfigured in main
logger = logging.getLogger(__name__)

# Constants
PAGES_TO_OBSERVE = list(range(1, 43))  # 42 pages
_PARALLELISM = 10


def setup_logging(level: str | None = None) -> logging.Logger:
    """Set up logging configuration with the level from the CLI or environment."""
    if level is None:
        level = os.getenv("LOG_LEVEL", "INFO").upper()
    logging.basicConfig(
        level=getattr(logging, level, logging.INFO),
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    return logging.getLogger(__name__)


class RepositoryService(Protocol):
    """Protocol for a repository data service"""

    async def get_repositories(self, page: int) -> List[Dict[str, Any]]:
        """Get repositories for a given page"""
        ...


class MockRepositoryService:
    """Mock implementation of RepositoryService that returns fake GitHub-like data"""

    async def get_repositories(self, page: int) -> List[Dict[str, Any]]:
        """Mock implementation that returns fake repository data"""
        # Simulate network delay
        await asyncio.sleep(0.2)  # Fixed delay for predictable tests
        # Test error handling: always raise a timeout error for page 1
        if page == 1:
            raise asyncio.TimeoutError(f"Timeout error for page {page}")
        # Generate fake repositories for this page
        repos = []
        for i in range(10):  # 10 repos per page
            repo_id = (page - 1) * 10 + i + 1
            # Guarantee at least one None stars value per page:
            # use the last repo in each page for None stars
            if i == 9:  # Last repo in page
                stars = None
            else:
                stars = repo_id * 10  # Deterministic stars based on ID
            repo = {
                "id": repo_id,
                "repo_name": f"google_repo_{repo_id}",
                "stars": stars,
                "link": f"https://github.com/google/repo_{repo_id}",
            }
            repos.append(repo)
        return repos
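

# A minimal sketch of an HTTP-backed service, assuming the endpoint from the header
# comment returns JSON in the sample shape shown above. The class name, URL, query
# parameter, and response fields are assumptions for illustration, not the real GitHub API.
class HttpRepositoryService:
    """RepositoryService backed by HTTP requests (illustrative sketch)."""

    def __init__(self, base_url: str = "https://github.com/google/api") -> None:
        self._base_url = base_url

    async def get_repositories(self, page: int) -> List[Dict[str, Any]]:
        import aiohttp  # Local import so the mock-only path does not require aiohttp

        async with aiohttp.ClientSession() as session:
            async with session.get(
                self._base_url,
                params={"page": page},
                timeout=aiohttp.ClientTimeout(total=10),
            ) as resp:
                resp.raise_for_status()
                # Expected: a list of {"id", "repo_name", "stars", "link"} dicts
                return await resp.json()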


async def _task(
    page: int,
    service: RepositoryService,
    max_tries: int = 5,
    initial_delay: float = 0.3,
    multiplier: float = 2,
) -> List[Dict[str, Any]]:
    """Fetch one page, retrying with exponential backoff on any service error."""
    if multiplier <= 1:
        raise ValueError(f"multiplier must be greater than 1 but {multiplier=} passed!")
    last_exp: BaseException | None = None
    while max_tries:
        max_tries -= 1
        logger.debug(f"requesting page {page}...")
        try:
            batch_result = await service.get_repositories(page)
        except Exception as exc:  # Catch any service errors
            last_exp = exc
            logger.warning(f"Failed to fetch page {page}, retrying... (attempts left: {max_tries})")
            await asyncio.sleep(initial_delay)
            initial_delay *= multiplier
        else:
            break
    else:
        # The while-else runs only if every attempt failed (no break happened)
        logger.error(f"Failed to fetch page {page} after all retries", exc_info=last_exp)
        if last_exp is not None:
            raise last_exp
        raise RuntimeError(f"Failed to fetch page {page} after all retries")
    return batch_result
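
# Illustrative usage of _task on its own (page 1 always times out in MockRepositoryService,
# so this exercises the retry/backoff path and ultimately re-raises the TimeoutError):
#     asyncio.run(_task(1, MockRepositoryService(), max_tries=2, initial_delay=0.01))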


def _flatten(list_of_lists: List[List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
    """Flatten a list of per-page result lists into a single list."""
    res: List[Dict[str, Any]] = []
    for lst in list_of_lists:
        res += lst
    return res


async def _get_pages(
    pages_to_observe: List[int], service: RepositoryService
) -> List[Dict[str, Any]]:
    # Create one fetch task per page
    tasks = [_task(page, service) for page in pages_to_observe]
    # NOTE: concurrency is currently unbounded; see the semaphore sketch below
    # for a way to cap it at _PARALLELISM.
    # Handle errors with return_exceptions=True to prevent one failure from stopping all tasks
    unflatten_results: List[List[Dict[str, Any]] | BaseException] = await asyncio.gather(
        *tasks, return_exceptions=True
    )
    # Filter out exceptions and log them
    valid_results: List[List[Dict[str, Any]]] = []
    for i, result in enumerate(unflatten_results):
        if isinstance(result, BaseException):
            logger.error(f"Task {i} failed: {result}")
        else:
            valid_results.append(result)
    return _flatten(valid_results)
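

# _PARALLELISM is declared above but never enforced by _get_pages. A minimal sketch of
# capping concurrency with a semaphore (a drop-in alternative to the unbounded gather;
# the helper name is hypothetical):
async def _get_pages_bounded(
    pages_to_observe: List[int], service: RepositoryService
) -> List[Dict[str, Any]]:
    semaphore = asyncio.Semaphore(_PARALLELISM)

    async def bounded_task(page: int) -> List[Dict[str, Any]]:
        async with semaphore:  # At most _PARALLELISM pages are in flight at once
            return await _task(page, service)

    results = await asyncio.gather(
        *(bounded_task(page) for page in pages_to_observe), return_exceptions=True
    )
    # Skip failed pages, as _get_pages does
    valid = [r for r in results if not isinstance(r, BaseException)]
    return _flatten(valid)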


async def main(pages_to_observe: List[int], service: RepositoryService) -> List[Dict[str, Any]]:
    """
    1. Load all pages in parallel
    2. Group the results
    3. Order the flattened list of repositories by star count, descending
    4. Take the top 5 entries
    5. Return them as a list of dicts
    """
    # Get the repositories from all parallel requests (failed pages are skipped)
    repos: List[Dict[str, Any]] = await _get_pages(pages_to_observe, service)
    ordered_repos = sorted(
        repos,
        key=lambda x: x.get("stars", 0) or 0,  # Treat missing/None stars as 0
        reverse=True,
    )
    return ordered_repos[:5]
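

# The header comment describes the result as a "repository name -> star count" dictionary.
# A minimal helper for converting the list returned by main() into that shape (the helper
# name is an illustrative addition; dicts preserve insertion order, so the descending sort
# from main() is kept):
def to_stars_dict(repos: List[Dict[str, Any]]) -> Dict[str, int]:
    return {repo["repo_name"]: repo["stars"] or 0 for repo in repos}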


def parse_args():
    """Parse command line arguments"""
    parser = argparse.ArgumentParser(
        description="Get the top 5 starred repositories from Google's GitHub account",
        epilog="""
Examples:
  %(prog)s                       # Run with default INFO logging
  %(prog)s --log-level DEBUG     # Show debug messages including page requests
  %(prog)s --log-level ERROR     # Only show errors
  LOG_LEVEL=DEBUG %(prog)s       # Use environment variable for log level

Environment Variables:
  LOG_LEVEL    Set default logging level (DEBUG, INFO, WARNING, ERROR)
""",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--log-level",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        default=os.getenv("LOG_LEVEL", "INFO"),
        help="Set logging level (default: INFO, can also use LOG_LEVEL env var)",
    )
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    # Set up logging with the CLI level
    setup_logging(args.log_level)
    logger.info("Starting repository fetcher")
    uvloop.install()
    service = MockRepositoryService()
    result = asyncio.run(main(PAGES_TO_OBSERVE, service))
    logger.info("Top 5 repositories:")
    for i, repo in enumerate(result, 1):
        logger.info(f"{i}. {repo}")