From 565464e2666578a87b72e832151797d4041f1256 Mon Sep 17 00:00:00 2001 From: Slawomir Koszewski Date: Sun, 9 Nov 2025 20:55:05 +0100 Subject: [PATCH] Added harvester package with harvester module and harvest_readmes() function prototype. --- .gitignore | 3 ++ harvester.py | 20 +++--------- harvester/__init__.py | 1 + harvester/harvester.py | 72 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 81 insertions(+), 15 deletions(-) create mode 100644 harvester/__init__.py create mode 100644 harvester/harvester.py diff --git a/.gitignore b/.gitignore index bdd3894..36efaca 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,6 @@ prototype_*.py *.pem *.key *.crt + +# Harvester output +reference \ No newline at end of file diff --git a/harvester.py b/harvester.py index 888b1e0..c702106 100755 --- a/harvester.py +++ b/harvester.py @@ -1,18 +1,8 @@ #!/usr/bin/env python3 -import requests -from devops.devops import Organization, Project, Repository, Item -from devops.azure import get_token +from harvester.harvester import harvest_readmes -org = Organization("https://dev.azure.com/mcovsandbox", token=get_token()) - -# Find all Markdown files in all projects and repositories -for project in org.projects: - for repo in project.repositories: - try: - root_item = Item(repository=repo, path="/") - md_files = root_item.get_child_items(pattern="*.md", recurse=True) - for md_file in md_files: - print(f"Project: {project.name}, Repo: {repo.name}, File: {md_file.path}") - except requests.exceptions.HTTPError as e: - print(f"Repository {repo.name} is empty.") +if __name__ == "__main__": + harvest_readmes( + organization="mcovsandbox" + ) diff --git a/harvester/__init__.py b/harvester/__init__.py new file mode 100644 index 0000000..0e57bf9 --- /dev/null +++ b/harvester/__init__.py @@ -0,0 +1 @@ +# Harvester Package diff --git a/harvester/harvester.py b/harvester/harvester.py new file mode 100644 index 0000000..24f19c7 --- /dev/null +++ b/harvester/harvester.py @@ -0,0 +1,72 @@ 
import logging
import os
from collections.abc import Sequence

import requests

from devops.azure import get_token
from devops.devops import Organization, Project, Repository, Item

# Module-level logger with its own stream handler (kept as in the original
# module so importing it still configures INFO-level console output).
fmt = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
ch = logging.StreamHandler()
ch.setFormatter(fmt)
log = logging.getLogger(__name__)
log.addHandler(ch)
log.setLevel(logging.INFO)

# Branches tried, in order, when looking for a README. ``None`` is passed
# through to ``Item`` unchanged -- presumably it selects the repository's
# default branch; TODO confirm against ``devops.devops.Item``.
DEFAULT_BRANCHES: tuple[str | None, ...] = ("main", "dev", None)


def sanitize_name(name: str) -> str:
    """Return *name* lower-cased with spaces and underscores replaced by ``-``."""
    return name.lower().replace(" ", "-").replace("_", "-")


def _save_readme(repo: Repository, branches: Sequence[str | None], project_path: str) -> bool:
    """Write the first non-empty ``/README.md`` found on *branches* to disk.

    The file is written to ``<project_path>/<sanitized repo name>.md``.
    Returns ``True`` when a README was saved, ``False`` when no branch
    yielded a non-empty README.
    """
    for branch_name in branches:
        try:
            readme = Item(repository=repo, path="/README.md", branch=branch_name)
            if not readme:
                continue
            readme_content = readme.get_content(branch=branch_name)
        except requests.exceptions.HTTPError:
            # Treated as "no README on this branch"; try the next one.
            continue
        if readme_content is None or not readme_content.strip():
            continue
        target = os.path.join(project_path, f"{sanitize_name(repo.name)}.md")
        # Explicit UTF-8 so the result does not depend on the platform's
        # default encoding; undecodable bytes are replaced rather than
        # aborting the whole harvest for one malformed README.
        with open(target, "w", encoding="utf-8") as f:
            f.write(readme_content.decode("utf-8", errors="replace"))
        return True
    return False


def harvest_readmes(
    organization: str,
    branch: Sequence[str | None] = DEFAULT_BRANCHES,
    projects: Sequence[str] = (),
    output_path: str = "reference",
) -> None:
    """Harvest README files from repositories.

    Output layout::

        <output_path>/index.md                  list of projects
        <output_path>/<project>/index.md        list of repositories
        <output_path>/<project>/<repo>.md       README content

    Args:
        organization: Organization name, appended to ``https://dev.azure.com/``.
        branch: Branch names tried in order for each repository; the first
            one yielding a non-empty ``/README.md`` wins.
        projects: Project names to process. When empty, every project in
            ``org.projects`` is processed.
        output_path: Root directory for the generated files; created if
            it does not exist.

    Raises:
        requests.exceptions.HTTPError: Only errors raised during the README
            lookup are handled here; any raised elsewhere (for example while
            listing projects or repositories) propagate to the caller.
        OSError: If an output directory or file cannot be written.
    """
    org = Organization("https://dev.azure.com/" + organization, token=get_token())

    if projects:
        # Target specific projects
        target_projects = [Project(org=org, name=project_name) for project_name in projects]
    else:
        # Target all projects; materialised once because it is iterated twice.
        target_projects = list(org.projects)

    for project in target_projects:
        project_path = os.path.join(output_path, sanitize_name(project.name))  # type: ignore
        # Create the directory up front: the project index below is written
        # even when no repository in the project has a README.
        os.makedirs(project_path, exist_ok=True)

        repositories = project.repositories
        log.info("Processing project: %s with %d repositories.", project.name, len(repositories))  # type: ignore

        repo_index = []  # (repository name, README saved?) per repository
        for repo in repositories:
            log.info("...processing repository: %s", repo.name)
            readme_found = _save_readme(repo, branch, project_path)
            repo_index.append((repo.name, readme_found))
            if not readme_found:
                log.warning("......README.md in repo %s is not found or empty.", repo.name)

        # Save the repository index for the project
        with open(os.path.join(project_path, "index.md"), "w", encoding="utf-8") as index_file:
            index_file.write(f"# Repository Index for Project: {project.name}\n\n")  # type: ignore
            for repo_name, has_readme in repo_index:
                status = "" if has_readme else " - README.md not found"
                index_file.write(f"- [{repo_name}]({sanitize_name(repo_name)}.md){status}\n")

    # Save the reference index for all projects. The root directory is
    # ensured here so the write also succeeds when there are no projects.
    os.makedirs(output_path, exist_ok=True)
    with open(os.path.join(output_path, "index.md"), "w", encoding="utf-8") as ref_index_file:
        ref_index_file.write("# Project Index\n\n")
        for project in target_projects:
            ref_index_file.write(f"- [{project.name}]({sanitize_name(project.name)}/index.md)\n")  # type: ignore