# datasette.io/build_directory.py
import click
from github_to_sqlite.cli import (
    releases as github_to_sqlite_releases,
    repos as github_to_sqlite_repos,
)
import httpx
from python_graphql_client import GraphqlClient
import sqlite_utils
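

# GraphQL fragment with every repository field the directory needs;
# build_query() splices it into a single batched query per chunk of repos.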
REPO_FIELDS = """
fragment repoFields on Repository {
  id
  nameWithOwner
  createdAt
  openGraphImageUrl
  usesCustomOpenGraphImage
  defaultBranchRef {
    target {
      oid
    }
  }
  repositoryTopics(first: 100) {
    totalCount
    nodes {
      topic {
        name
      }
    }
  }
  openIssueCount: issues(states: [OPEN]) {
    totalCount
  }
  closedIssueCount: issues(states: [CLOSED]) {
    totalCount
  }
  releases(orderBy: {field: CREATED_AT, direction: DESC}, first: 1) {
    totalCount
    nodes {
      tagName
    }
  }
}
"""


def build_query(repos):
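    """Build one GraphQL query string selecting repoFields for every
    "owner/name" string in repos, aliased as repo_0, repo_1, ...

    For repos=["simonw/datasette"] the generated query looks roughly like:

        fragment repoFields on Repository { ... }
        {
          repo_0: repository(name: "datasette", owner: "simonw") {
            ...repoFields
          }
        }
    """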
    repo_fragments = []
    for i, repo in enumerate(repos):
        owner, name = repo.split("/")
        repo_fragments.append(
            """
    repo_{i}: repository(name: "{name}", owner: "{owner}") {open_curly}
        ...repoFields
    {close_curly}
    """.format(
                i=i,
                name=name,
                owner=owner,
                open_curly="{",
                close_curly="}",
            )
        )
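    # Plain string substitution rather than str.format(), because the GraphQL
    # text is full of literal curly braces. "REPOS" is replaced first; neither
    # token occurs inside the substituted text, so the order is safe.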
    return """
    REPO_FIELDS
    { REPOS }
    """.replace(
        "REPOS", "\n".join(repo_fragments)
    ).replace(
        "REPO_FIELDS", REPO_FIELDS
    )


def transform_node(node):
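    """Flatten one repository node from the GraphQL response, hoisting the
    nested totalCount / oid / topic values up to top-level keys.

    Returns (node, release_nodes) so the caller can check which releases
    may still need to be fetched in full.
    """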
    releases = node.pop("releases")
    node["releaseCount"] = releases["totalCount"]
    for key in ("openIssueCount", "closedIssueCount"):
        node[key] = node[key]["totalCount"]
    repository_topics = node.pop("repositoryTopics")
    node["topics"] = [n["topic"]["name"] for n in repository_topics["nodes"]]
    default_branch_ref = node.pop("defaultBranchRef")
    node["latest_commit"] = default_branch_ref["target"]["oid"]
    return node, releases["nodes"]
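

# Shared client for the GitHub GraphQL API; the OAuth token is supplied
# per-request via the Authorization header.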
client = GraphqlClient(endpoint="https://api.github.com/graphql")


def fetch_plugins(oauth_token, repos):
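    """Run the batched query for every repo, 20 repositories per request so
    each query stays a manageable size, and return the combined list of
    repository nodes from the responses.
    """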
    chunks = []
    repos_copy = list(repos)
    while repos_copy:
        chunk, repos_copy = repos_copy[:20], repos_copy[20:]
        chunks.append(chunk)
    all_nodes = []
    for chunk in chunks:
        query = build_query(chunk)
        print(query)
        data = client.execute(
            query=query,
            headers={"Authorization": "Bearer {}".format(oauth_token)},
        )
        assert "errors" not in data, data["errors"]
        nodes = []
        for key in data["data"]:
            if key.startswith("repo_"):
                nodes.append(data["data"][key])
        all_nodes.extend(nodes)
    return all_nodes


@click.command()
@click.argument(
    "db_filename",
    type=click.Path(file_okay=True, dir_okay=False),
)
@click.option("--github-token", envvar="GITHUB_TOKEN", required=True)
@click.option("--fetch-missing-releases", is_flag=True)
@click.option("--always-fetch-releases-for-repo", multiple=True)
def cli(
    db_filename,
    github_token,
    fetch_missing_releases,
    always_fetch_releases_for_repo,
):
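    """Populate db_filename with repo, release and README details for every
    tracked plugin and tool, then rebuild the plugins and tools SQL views.

    Example invocation (GITHUB_TOKEN is read from the environment; the
    database filename here is illustrative):

        python build_directory.py content.db --fetch-missing-releases
    """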
    db = sqlite_utils.Database(db_filename)
    repos_to_fetch_releases_for = {"simonw/datasette"}
    if not db["datasette_repos"].exists():
        previous_hashes = {}
    elif "latest_commit" not in db["datasette_repos"].columns_dict:
        previous_hashes = {
            row["nameWithOwner"]: None for row in db["datasette_repos"].rows
        }
    else:
        previous_hashes = {
            row["nameWithOwner"]: row["latest_commit"]
            for row in db["datasette_repos"].rows
        }
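    # Every repository we track, from both the tools and plugins tables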
    repos = [
        r[0]
        for r in db.execute(
            "select repo from tool_repos union select repo from plugin_repos"
        ).fetchall()
    ]
    nodes = fetch_plugins(github_token, repos)
    for node in nodes:
        plugin, releases = transform_node(node)
        db["datasette_repos"].insert(
            plugin,
            pk="id",
            column_order=("id", "nameWithOwner"),
            replace=True,
            alter=True,
        )
        full_name = plugin["nameWithOwner"]
        if fetch_missing_releases:
            for release in releases:
                tag_name = release["tagName"]
                # Does this release exist for this repo?
                if (
                    (full_name in always_fetch_releases_for_repo)
                    or not db["repos"].exists()
                    or not list(
                        db["releases"].rows_where(
                            "repo = (select id from repos where full_name = ?) and tag_name = ?",
                            [full_name, tag_name],
                        )
                    )
                ):
                    repos_to_fetch_releases_for.add(full_name)
    if repos_to_fetch_releases_for:
        github_to_sqlite_releases.callback(
            db_filename, list(repos_to_fetch_releases_for), auth="auth.json"
        )
    # Fetch details for any repos that have changed since last time
    repos_to_fetch = []
    for row in db["datasette_repos"].rows:
        if row["latest_commit"] != previous_hashes.get(row["nameWithOwner"]):
            repos_to_fetch.append(row["nameWithOwner"])
    # Also re-fetch up to 3 (arbitrary) repos whose rendered readme_html still
    # embeds camo.githubusercontent.com proxy URLs that the raw readme lacks
    # https://github.com/simonw/datasette.io/issues/156#issuecomment-1894795880
    fix_repo_names = [
        row[0]
        for row in db.execute(
            """
            select full_name from repos
            where readme_html like '%camo.githubusercontent.com%'
            and readme not like '%camo.githubusercontent.com%'
            limit 3
            """
        ).fetchall()
    ]
    print("Fixing HTML for {}".format(fix_repo_names))
    repos_to_fetch.extend(fix_repo_names)
    if repos_to_fetch:
        print("Fetching repo details for {}".format(repos_to_fetch))
        github_to_sqlite_repos.callback(
            db_filename,
            usernames=[],
            auth="auth.json",
            repo=repos_to_fetch,
            load=None,
            readme=False,
            readme_html=False,
        )
    # Separate step to fetch their READMEs
    for repo in repos_to_fetch:
        print("Fetching README for {}".format(repo))
        # curl 'https://api.github.com/repos/simonw/datasette/readme' -H 'Accept: application/vnd.github.raw'
        response = httpx.get(
            "https://api.github.com/repos/{}/readme".format(repo),
            headers={
                "Accept": "application/vnd.github.raw",
                "Authorization": "Bearer {}".format(github_token),
            },
        )
        response.raise_for_status()
        readme_md = response.text
        # Now convert that Markdown to HTML using the GitHub Markdown API
        response2 = httpx.post(
            "https://api.github.com/markdown",
            json={
                "text": readme_md,
                "mode": "markdown",
                "context": repo,
            },
            headers={
                "Accept": "application/vnd.github.v3+json",
                "Authorization": "Bearer {}".format(github_token),
            },
        )
        response2.raise_for_status()
        readme_html = response2.text
        with db.conn:
            db.execute(
                "update repos set readme = :readme, readme_html = :readme_html where full_name = :full_name",
                {
                    "readme": readme_md,
                    "readme_html": readme_html,
                    "full_name": repo,
                },
            )
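    # Rebuild the two directory views: identical SQL apart from the repo
    # table each one joins against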
    for view_name, repo_table in (("plugins", "plugin_repos"), ("tools", "tool_repos")):
        db.create_view(
            view_name,
            """
            select
                repos.name as name,
                repos.full_name as full_name,
                users.login as owner,
                repos.description as description,
                {repo_table}.extra_search as extra_search,
                {repo_table}.tags as tags,
                repos.stargazers_count,
                pypi_versions.name as tag_name,
                max(pypi_releases.upload_time) as latest_release_at,
                repos.created_at as created_at,
                datasette_repos.openGraphImageUrl,
                datasette_repos.usesCustomOpenGraphImage,
                (
                    select sum(downloads)
                    from stats
                    where stats.package = repos.name
                    and stats.date > date('now', '-7 days')
                ) as downloads_this_week,
                (
                    select count(*)
                    from plugin_repos
                    where repo = repos.full_name
                ) as is_plugin,
                (
                    select count(*)
                    from tool_repos
                    where repo = repos.full_name
                ) as is_tool
            from
                datasette_repos
                join repos on datasette_repos.id = repos.node_id
                left join pypi_releases on (
                    pypi_releases.package = repos.name
                    or pypi_releases.package = 'datasette-' || repos.name
                )
                left join pypi_versions on pypi_releases.version = pypi_versions.id
                join users on users.id = repos.owner
                join {repo_table} on {repo_table}.repo = datasette_repos.nameWithOwner
            group by
                repos.id
            order by
                latest_release_at desc;
            """.format(
                repo_table=repo_table
            ).strip(),
            replace=True,
        )


if __name__ == "__main__":
    cli()
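

# Example: after a build, the generated views can be queried directly, e.g.
# (database filename illustrative):
#
#     sqlite3 content.db "select full_name, tag_name from plugins limit 5"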