Files
docs-harvester/harvester/harvester.py
2025-11-20 09:29:20 +01:00

76 lines
3.8 KiB
Python

import os
import requests
from devops.azure import get_token
from devops.devops import Organization, Project, Repository, Item
import logging
# Module logger: records at INFO and above are emitted through a dedicated
# stream handler using a "<timestamp> - <level> - <message>" layout.
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)

fmt = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
ch = logging.StreamHandler()
ch.setFormatter(fmt)
log.addHandler(ch)
def sanitize_name(name: str) -> str:
    """Return *name* lower-cased with spaces and underscores turned into hyphens."""
    # One translation pass maps both separator characters to "-".
    hyphenate = str.maketrans({" ": "-", "_": "-"})
    lowered = name.lower()
    return lowered.translate(hyphenate)
def _find_readme_content(repo: Repository, branches: list[str | None]) -> bytes | None:
    """Return the first non-empty ``/README.md`` content found on *branches*.

    Branches are tried in the given order. An HTTP error raised while looking
    up or downloading the file is treated as "no README on this branch" and
    the next branch is tried. Returns ``None`` when no branch yields a
    non-blank README.

    NOTE(review): the content is assumed to be ``bytes`` because the caller
    decodes it as UTF-8 -- confirm against ``Item.get_content``.
    """
    for branch_name in branches:
        try:
            readme = Item(repository=repo, path="/README.md", branch=branch_name)
            content = readme.get_content(branch=branch_name) if readme else None
        except requests.exceptions.HTTPError:
            # Repository does not have a README.md file in this branch.
            continue
        # A missing or whitespace-only README counts as "not found here".
        if content is not None and content.strip():
            return content
    return None


def harvest_readmes(
    organization: str | Organization,
    branch: list[str | None] | None = None,
    projects: list[str] | None = None,
    output_path: str = "reference",
) -> None:
    """Harvest README files from repositories and write them as Markdown.

    For every targeted project a directory ``<output_path>/<project>`` is
    created containing one ``<repo>.md`` per repository whose README was
    found, plus an ``index.md`` listing all repositories. A top-level
    ``<output_path>/index.md`` links to every project index. All file and
    directory names are passed through ``sanitize_name``.

    Args:
        organization: Organization name (appended to
            ``https://dev.azure.com/``) or an existing ``Organization``.
        branch: Branch names to probe, in order, for ``/README.md``. Defaults
            to ``["main", "dev", None]``. ``None`` entries are passed through
            to ``Item`` unchanged (presumably selecting the default branch --
            TODO confirm).
        projects: Project names to harvest. Empty or ``None`` harvests every
            project of the organization.
        output_path: Root directory for the generated files; created if
            missing.
    """
    # Resolve defaults here rather than in the signature so that no list
    # object is shared between calls.
    branches = ["main", "dev", None] if branch is None else branch

    if isinstance(organization, str):
        org = Organization("https://dev.azure.com/" + organization, token=get_token())
    else:
        org = organization

    if projects:
        # Target specific projects
        target_projects = [Project(org=org, name=project_name) for project_name in projects]
    else:
        # Target all projects. Materialized because the collection is walked
        # twice (harvest loop and top-level index).
        target_projects = list(org.projects)

    # The top-level index is written even when there are no projects, so the
    # root directory must exist.
    os.makedirs(output_path, exist_ok=True)

    for project in target_projects:
        # Read once: used for both the count in the log line and the loop.
        repositories = project.repositories
        log.info(f"Processing project: {project.name} with {len(repositories)} repositories.")  # type: ignore

        # Create the project directory up front: the project index below is
        # written even if no repository yields a README.
        project_path = f"{output_path}/{sanitize_name(project.name)}"  # type: ignore
        os.makedirs(project_path, exist_ok=True)

        repo_index = []  # (repository name, README found?) for the project index.
        for repo in repositories:
            log.info(f"...processing repository: {repo.name}")
            readme_content = _find_readme_content(repo, branches)
            readme_found = readme_content is not None
            if readme_content is not None:
                # Explicit UTF-8 keeps output independent of the platform
                # locale; undecodable bytes are replaced rather than
                # aborting the whole harvest.
                with open(f"{project_path}/{sanitize_name(repo.name)}.md", "w", encoding="utf-8") as f:
                    f.write(readme_content.decode("utf-8", errors="replace"))
            else:
                log.warning(f"......README.md in repo {repo.name} is not found or empty.")
            repo_index.append((repo.name, readme_found))

        # Save the repository index for the project
        with open(f"{project_path}/index.md", "w", encoding="utf-8") as index_file:
            index_file.write(f"# Repository Index for Project: {project.name}\n\n")  # type: ignore
            for repo_name, has_readme in repo_index:
                status = "" if has_readme else " - README.md not found"
                index_file.write(f"- [{repo_name}]({sanitize_name(repo_name)}.md){status}\n")

    # Save the reference index for all projects
    with open(f"{output_path}/index.md", "w", encoding="utf-8") as ref_index_file:
        ref_index_file.write("# Project Index\n\n")
        for project in target_projects:
            ref_index_file.write(f"- [{project.name}]({sanitize_name(project.name)}/index.md)\n")  # type: ignore