Verified Commit c4815554 authored by anarcat's avatar anarcat 💥
Browse files

git: implement a massive repository grepper

Major caveat: this needs to be deployed on the git server, as we can't
actually run it remotely because we'd need sudo -u git before.

See: base-images#19 base-images#20
parent fa36911d
Loading
Loading
Loading
Loading
+57 −0
Original line number Diff line number Diff line
@@ -151,3 +151,60 @@ def is_empty_repo(con: Connection, repo_path: Optional[str] = None, sudo: bool =
    #
    # the rev-list was considered redundant because revs *would* show up as objects
    return non_empty_stats


# original command:
#
# find /var/opt/gitlab/git-data/repositories/@hashed \
#    -type d -name \*.git -not -name \*.wiki.git \
#    -exec bash -c "test -e {}/HEAD &&
#                   test -n \$(sudo -u git git -C {} rev-list -n1 --all) &&
#                   sudo -u git git -C {} grep :bullseye HEAD -- .gitlab-ci.yml &&
#                   echo {}" \;
#
# a few optimisations:
#
# - we don't skip wikis, since we might want those anyways
#
# - we don't check for HEAD or rev-list: we assume git grep will
#   quickly discard such repositories, and we ignore the errors it
#   generates about missing HEADs
@task
def grep_all_repos(
    con,
    pattern: str,
    files: str | None = None,
    repos_path="/var/opt/gitlab/git-data/repositories/@hashed",
    list_only: bool = False,
):
    """grep for a specific pattern in a bunch of git repositories

    This will run the `git-grep` command on the HEAD of multiple
    repos, assumed to be bare repositories (directories named `*.git`)
    under the REPOS_PATH parameter. If no FILES is given, it will grep
    all files in each repo, otherwise it will restrict to the named
    FILES (a whitespace-separated list of pathspecs, parsed with shell
    quoting rules).

    By default, it will output the matches, but if LIST_ONLY is
    provided, it will only provide the matching *repositories* (as
    opposed to the matching files inside the repositories).

    The normal output replaces the normal git-grep output, which would
    show HEAD:MATCH, by PATH:MATCH, where PATH is the repository path.

    PATTERN, FILES and REPOS_PATH are shell-quoted before being sent
    to the remote shell, so they are passed literally to find(1) and
    git-grep(1): PATTERN may contain spaces or shell metacharacters,
    and may start with a dash.

    Nothing is returned: the output of the command is whatever
    `con.run` displays, and a failure of find(1) is only logged.
    """
    # imported locally so this task does not depend on the module's
    # import block
    import shlex

    # -e keeps a pattern starting with "-" from being parsed as an option
    git_args = "-e %s HEAD" % shlex.quote(pattern)
    if files:
        # FILES used to be word-split by the shell; keep accepting
        # several pathspecs, but quote each one individually
        pathspecs = " ".join(shlex.quote(spec) for spec in shlex.split(files))
        if pathspecs:
            git_args += " -- " + pathspecs

    # The repository path is handed to the inner shell as "$1" rather
    # than by letting find substitute {} inside the script text, so a
    # path is never interpreted as shell code. Errors from git (e.g.
    # repositories without a HEAD) are deliberately discarded.
    if list_only:
        script = f'git -C "$1" grep -q {git_args} 2>/dev/null && echo "$1"'
    else:
        # NOTE: the path lands in a sed replacement, so a path holding
        # "," "&" or "\" would garble the prefix; hashed storage paths
        # do not contain those characters.
        script = f'git -C "$1" grep {git_args} 2>/dev/null | sed "s,^HEAD:,$1:,"'

    # we don't reuse list_all_repos here because it would be too slow
    # to do the roundtrip for each matching repo.
    #
    # the script is quoted a second time, as a whole, because it goes
    # through the shell started by con.run before reaching `sh -c`
    cmd = r"find %s -type d -a -name '*.git' -exec sh -c %s sh {} \;" % (
        shlex.quote(repos_path),
        shlex.quote(script),
    )
    logging.info("running %s on %s", cmd, getattr(con, "host", "localhost"))
    # warn=True: a failing find must not abort the task with an exception
    ret = con.run(cmd, warn=True)
    if not ret.ok:
        logging.warning("find(1) command failed")