Commit f3579509 authored by henry's avatar henry Committed by Pier Angelo Vendrame
Browse files

BB 42305: Add script to combine translation files across versions.

parent a4bf3a2a
Loading
Loading
Loading
Loading
+404 −0
Original line number Diff line number Diff line
import argparse
import json
import logging
import os
import re
import subprocess

from combine import combine_files

# True only when the GITLAB_CI environment variable is exactly "true", i.e.
# when we are running inside the gitlab CI rather than on a developer machine.
# It enables some optimisations that suit the temporary gitlab environment but
# would cause problems if run locally for testing purposes.
IN_GITLAB_CI_ENV = os.environ.get("GITLAB_CI", "") == "true"

arg_parser = argparse.ArgumentParser(
    description="Combine a translation file across two different versions"
)

# The three positional arguments, declared in the order they must be given on
# the command line: (destination name, metavar shown in the usage, help text).
for _dest, _metavar, _help in (
    ("current_branch", "<current-branch>", "branch for the newest version"),
    ("files", "<files>", "JSON specifying the translation files"),
    ("outname", "<json>", "name of the json output"),
):
    arg_parser.add_argument(_dest, metavar=_metavar, help=_help)

args = arg_parser.parse_args()

# Default logging configuration, with this script's logger showing INFO and
# above.
logging.basicConfig()
logger = logging.getLogger("combine-translation-versions")
logger.setLevel(logging.INFO)


def in_pink(msg: str) -> str:
    """Wrap a message in the terminal escape codes for bold pink text.

    :param msg: The message to colour.
    :returns: The message preceded by the "bold, colour 212" escape sequence
      and followed by the reset sequence.
    """
    bold_pink = "\x1b[1;38;5;212m"
    reset = "\x1b[0m"
    return f"{bold_pink}{msg}{reset}"


def git_run(git_args: list[str]) -> None:
    """Execute a git command, raising if it exits with a non-zero status.

    The command's output streams are not captured.

    :param git_args: The arguments that should follow "git".
    """
    # Log the command first, so that any stderr from git that shows up in the
    # log has some context.
    shown_command = "git " + " ".join(git_args)
    logger.info(f"Running: {in_pink(shown_command)}")
    subprocess.run(["git", *git_args], check=True)


def git_text(git_args: list[str]) -> str:
    """Execute a git command and capture what it writes to stdout.

    Raises if git exits with a non-zero status.

    :param git_args: The arguments that should follow "git".
    :returns: The stdout of the command, decoded as text.
    """
    shown_command = "git " + " ".join(git_args)
    logger.info(f"Running: {in_pink(shown_command)}")
    completed = subprocess.run(
        ["git", *git_args], stdout=subprocess.PIPE, check=True, text=True
    )
    return completed.stdout


def git_lines(git_args: list[str]) -> list[str]:
    """Execute a git command and split its stdout into lines.

    :param git_args: The arguments that should follow "git".
    :returns: The lines of stdout, split on "\\n", with empty lines dropped.
    """
    output = git_text(git_args)
    # filter(None, ...) discards the empty strings produced by blank lines and
    # by the trailing newline.
    return list(filter(None, output.split("\n")))


class TranslationFile:
    """Represents a translation file: its path paired with its content.

    Both constructor arguments are stored unchanged as attributes of the same
    name.
    """

    def __init__(self, path: str, content: str) -> None:
        """Create a new instance.

        :param path: The path for the file.
        :param content: The content of the file.
        """
        self.path, self.content = path, content


class BrowserBranch:
    """Represents a browser git branch."""

    def __init__(self, branch_name: str, is_head: bool = False) -> None:
        """Create a new instance.

        :param branch_name: The branch's git name.
        :param is_head: Whether the branch matches "HEAD".
        """
        version_match = re.match(
            r"(?P<prefix>[a-z]+\-browser)\-"
            r"(?P<firefox>[0-9]+(?:\.[0-9]+){1,2})(?:esr|[ab][0-9]+)?\-"
            r"(?P<browser>[0-9]+\.[05])\-"
            r"(?P<number>[0-9]+)$",
            branch_name,
        )

        if not version_match:
            raise ValueError(f"Unable to parse the version from the ref {branch_name}")

        self.name = branch_name
        self.prefix = version_match.group("prefix")
        self.browser_version = version_match.group("browser")
        # Convert tor-browser to "Tor Browser", and similar.
        browser_name = self.prefix.replace("-", " ").title()
        self.browser_version_name = f"{browser_name} {self.browser_version}"

        self._is_head = is_head
        self._ref = "HEAD" if is_head else f"origin/{branch_name}"

        firefox_nums = [int(n) for n in version_match.group("firefox").split(".")]
        if len(firefox_nums) == 2:
            firefox_nums.append(0)
        browser_nums = [int(n) for n in self.browser_version.split(".")]
        branch_number = int(version_match.group("number"))
        # Prioritise the firefox ESR version, then the browser version then the
        # branch number.
        self._ordered = (
            firefox_nums[0],
            firefox_nums[1],
            firefox_nums[2],
            browser_nums[0],
            browser_nums[1],
            branch_number,
        )

        # Minor version for browser is only ever "0" or "5", so we can convert
        # the version to an integer.
        self._browser_int_version = int(2 * float(self.browser_version))

        self._file_paths: list[str] | None = None

    def release_below(self, other: "BrowserBranch", num: int) -> bool:
        """Determine whether another branch is within range of a previous
        browser release.

        The browser versions are expected to increment by "0.5", and a previous
        release branch's version is expected to be `num * 0.5` behind the
        current one.

        :param other: The branch to compare.
        :param num: The number of "0.5" releases behind to test with.
        """
        return other._browser_int_version == self._browser_int_version - num

    def __lt__(self, other: "BrowserBranch") -> bool:
        return self._ordered < other._ordered

    def __gt__(self, other: "BrowserBranch") -> bool:
        return self._ordered > other._ordered

    def _matching_dirs(self, path: str, dir_list: list[str]) -> bool:
        """Test that a path is contained in the list of dirs.

        :param path: The path to check.
        :param dir_list: The list of directories to check against.
        :returns: Whether the path matches.
        """
        for dir_path in dir_list:
            if os.path.commonpath([dir_path, path]) == dir_path:
                return True
        return False

    def get_file(
        self, filename: str, search_dirs: list[str] | None
    ) -> TranslationFile | None:
        """Fetch the file content for the named file in this branch.

        :param filename: The name of the file to fetch the content for.
        :param search_dirs: The directories to restrict the search to, or None
          to search for the file anywhere.
        :returns: The file, or `None` if no file could be found.
        """
        if self._file_paths is None:
            if not self._is_head:
                fetch_args = ()
                if IN_GITLAB_CI_ENV:
                    # Minimal fetch of non-HEAD branch to get the file paths.
                    # Individual file blobs will be downloaded as needed.
                    # Only do this when running in the gitlab CI since it will
                    # alter the user's .git/config and will effect future
                    # plain fetches.
                    fetch_args = ("--depth=1", "--filter=blob:none")
                git_run(["fetch", *fetch_args, "origin", self.name])
            self._file_paths = git_lines(
                ["ls-tree", "-r", "--format=%(path)", self._ref]
            )

        matching = [
            path
            for path in self._file_paths
            if os.path.basename(path) == filename
            and (search_dirs is None or self._matching_dirs(path, search_dirs))
        ]
        if not matching:
            return None
        if len(matching) > 1:
            raise Exception(f"Multiple occurrences of {filename}")

        path = matching[0]

        return TranslationFile(
            path=path, content=git_text(["cat-file", "blob", f"{self._ref}:{path}"])
        )


def get_stable_branch(
    compare_version: BrowserBranch,
) -> tuple[BrowserBranch, BrowserBranch | None]:
    """Find the most recent stable branch in the origin repository.

    The "build1" tags sharing the development branch's prefix are fetched
    from origin. A tag whose annotation contains the word "stable" or
    "legacy" is converted into its branch by dropping the "-build1" suffix.

    :param compare_version: The development branch to compare against.
    :returns: The stable and legacy branches, each the highest of its kind.
      If no legacy branch is found, `None` will be returned instead.
    :raises Exception: If no stable branch is found.
    """
    # We search for build1 tags. These are added *after* the rebase of browser
    # commits, so the corresponding branch should contain our strings.
    # Moreover, we *assume* that the branch with the most recent ESR version
    # with such a tag will be used in the *next* stable build in
    # tor-browser-build.
    prefix = compare_version.prefix
    tag_glob = f"{prefix}-*-build1"

    fetch_command = ["fetch"]
    if IN_GITLAB_CI_ENV:
        # To speed up, only fetch the tags without blobs.
        # Only do this when running in the gitlab CI since it will alter the
        # user's .git/config and will affect future plain fetches.
        fetch_command += ["--depth=1", "--filter=object:type=tag"]
    fetch_command += ["origin", "tag", tag_glob]
    git_run(fetch_command)

    stable_word = re.compile(r"\bstable\b")
    legacy_word = re.compile(r"\blegacy\b")
    # Stricter than the glob: exactly three dash-free parts between the prefix
    # and "-build1".
    full_tag = re.compile(rf"^{re.escape(prefix)}-[^-]+-[^-]+-[^-]+-build1$")

    stable_found = []
    legacy_found = []

    for line in git_lines(["tag", "-n1", "--list", tag_glob]):
        # Each line is the tag name, then the first line of its annotation.
        build_tag, annotation = line.split(" ", 1)
        if full_tag.match(build_tag) is None:
            continue
        marked_stable = stable_word.search(annotation) is not None
        marked_legacy = legacy_word.search(annotation) is not None
        if not (marked_stable or marked_legacy):
            continue
        try:
            # Branch name is the same as the tag, minus "-build1".
            candidate = BrowserBranch(re.sub(r"-build1$", "", build_tag))
        except ValueError:
            logger.warning(f"Could not read the version for {build_tag}")
            continue
        if candidate.prefix != prefix:
            continue
        if marked_stable:
            # Stable can be one release version behind.
            # NOTE: In principle, when switching between versions there may be a
            # window of time where the development branch has not yet progressed
            # to the next "0.5" release, so has the same browser version as the
            # stable branch. So we also allow for matching browser versions.
            # NOTE:
            # 1. The "Will be unused in" message will not make sense, but we do
            #    not expect string differences in this scenario.
            # 2. We do not expect this scenario to last for long.
            if any(
                compare_version.release_below(candidate, behind) for behind in (1, 0)
            ):
                stable_found.append(candidate)
        else:
            # Only reached for legacy tags. Legacy can be arbitrary release
            # versions behind.
            legacy_found.append(candidate)

    if not stable_found:
        raise Exception("No stable build1 branch found")

    # Return the branch with the highest version of each kind.
    newest_legacy = max(legacy_found) if legacy_found else None
    return (max(stable_found), newest_legacy)


# Main script: for every requested translation file, combine the content from
# the current (HEAD) branch with the strings still needed by the stable branch,
# and optionally the legacy branch, then write everything to a JSON file.
current_branch = BrowserBranch(args.current_branch, is_head=True)

stable_branch, legacy_branch = get_stable_branch(current_branch)

# The legacy branch is only used when explicitly requested by the environment.
if os.environ.get("TRANSLATION_INCLUDE_LEGACY", "") != "true":
    legacy_branch = None

files_list = []

for file_dict in json.loads(args.files):
    name = file_dict["name"]
    where_dirs = file_dict.get("where", None)
    current_file = current_branch.get_file(name, where_dirs)
    stable_file = stable_branch.get_file(name, where_dirs)

    if current_file is None and stable_file is None:
        # No file in either branch.
        logger.warning(f"{name} does not exist in either the current or stable branch")
    elif current_file is None:
        logger.warning(f"{name} deleted in the current branch")
    elif stable_file is None:
        logger.warning(f"{name} does not exist in the stable branch")
    elif current_file.path != stable_file.path:
        logger.warning(
            f"{name} has different paths in the current and stable branch. "
            f"{current_file.path} : {stable_file.path}"
        )

    content = None if current_file is None else current_file.content

    # If we have a branding file, we want to also include strings from the other
    # branding directories that differ from the stable release.
    # The strings that *differ* per release should be specified in
    # file_dict["branding"]["ids"]. These strings will be copied from the other
    # release's branding directory, with an additional suffix added to their ID,
    # as specified in the version_dict["suffix"].
    branding = file_dict.get("branding", None)
    if branding:
        include_ids = branding["ids"]
        for version_dict in branding["versions"]:
            branding_dirs = version_dict.get("where", None)
            branding_file = current_branch.get_file(name, branding_dirs)
            if branding_file is None:
                raise Exception(f"{name} does not exist in {branding_dirs}")
            content = combine_files(
                name,
                content,
                branding_file.content,
                f'{version_dict["name"]} Release.',
                include_ids,
                version_dict["suffix"],
            )

    # Append the strings that only exist in the stable branch.
    content = combine_files(
        name,
        content,
        None if stable_file is None else stable_file.content,
        f"Will be unused in {current_branch.browser_version_name}!",
    )

    if legacy_branch and not file_dict.get("exclude-legacy", False):
        legacy_file = legacy_branch.get_file(name, where_dirs)
        if legacy_file is not None and current_file is None and stable_file is None:
            logger.warning(f"{name} still exists in the legacy branch")
        elif legacy_file is None:
            logger.warning(f"{name} does not exist in the legacy branch")
        elif stable_file is not None and legacy_file.path != stable_file.path:
            logger.warning(
                f"{name} has different paths in the stable and legacy branch. "
                f"{stable_file.path} : {legacy_file.path}"
            )
        elif current_file is not None and legacy_file.path != current_file.path:
            logger.warning(
                f"{name} has different paths in the current and legacy branch. "
                f"{current_file.path} : {legacy_file.path}"
            )

        content = combine_files(
            name,
            content,
            # The file may be missing from the legacy branch, which was only
            # warned about above. Pass None in that case, the same as for a
            # missing stable file, rather than reading ".content" from None.
            None if legacy_file is None else legacy_file.content,
            f"Unused in {stable_branch.browser_version_name}!",
        )
    elif legacy_branch:
        logger.info(f"Excluding legacy branch for {name}")

    files_list.append(
        {
            "name": name,
            # If "directory" is unspecified, we place the file directly beneath
            # en-US/ in the translation repository. i.e. "".
            "directory": file_dict.get("directory", ""),
            "branch": file_dict["branch"],
            "content": content,
        }
    )


# Both values are empty strings when unset in the environment.
ci_commit = os.environ.get("CI_COMMIT_SHA", "")
ci_url_base = os.environ.get("CI_PROJECT_URL", "")

json_data = {
    "commit": ci_commit,
    # Only build the commit URL when both of its parts are known.
    "commit-url": (
        f"{ci_url_base}/-/commit/{ci_commit}" if (ci_commit and ci_url_base) else ""
    ),
    "project-path": os.environ.get("CI_PROJECT_PATH", ""),
    "current-branch": current_branch.name,
    "stable-branch": stable_branch.name,
    "files": files_list,
}

if legacy_branch:
    json_data["legacy-branch"] = legacy_branch.name

with open(args.outname, "w") as file:
    json.dump(json_data, file)
+3 −0
Original line number Diff line number Diff line
# flake8: noqa

from .combine import combine_files
+206 −0
Original line number Diff line number Diff line
import re
from typing import TYPE_CHECKING, Any

from compare_locales.parser import getParser
from compare_locales.parser.android import AndroidEntity, DocumentWrapper
from compare_locales.parser.base import Comment, Entity, Junk, Whitespace
from compare_locales.parser.dtd import DTDEntity
from compare_locales.parser.fluent import FluentComment, FluentEntity
from compare_locales.parser.properties import PropertiesEntity

if TYPE_CHECKING:
    from collections.abc import Iterable


def combine_files(
    filename: str,
    primary_content: str | None,
    alternative_content: str | None,
    comment_prefix: str,
    include_ids: list[str] | None = None,
    alternative_suffix: str = "",
) -> str | None:
    """Combine two translation files into one to include all strings from both.
    The primary content is presented first, followed by the alternative content
    at the end with an additional comment.

    :param filename: The filename for the file, determines the format.
    :param primary_content: The primary content for the file, or None if it does
      not exist.
    :param alternative_content: The alternative content for the file, or None if
      it does not exist.
    :param comment_prefix: A comment to include for any strings that are
      appended to the content. This will be placed before any other comments for
      the string.
    :param include_ids: String IDs from `alternative_content` we want to
      include. If this is `None` then we include all strings that do not already
      have a matching ID in `primary_content`.
    :param duplicate_suffix: The suffix to apply to the alternative IDs.

    :returns: The combined content, or None if both given contents are None.
    """
    if primary_content is None and alternative_content is None:
        return None

    # getParser from compare_locale returns the same instance for the same file
    # extension.
    parser = getParser(filename)

    is_android = filename.endswith(".xml")
    if primary_content is None:
        if is_android:
            # File was deleted, add some document parts.
            content_start = (
                '<?xml version="1.0" encoding="utf-8" standalone="yes"?>\n<resources>\n'
            )
            content_end = "</resources>\n"
        else:
            # Treat as an empty file.
            content_start = ""
            content_end = ""
        existing_keys = []
    else:
        parser.readUnicode(primary_content)

        # Start with the same content as the current file.
        # For android strings, we want to keep the final "</resources>" until after.
        if is_android:
            closing_match = re.match(
                r"^(.*)(</resources>\s*)$", parser.ctx.contents, re.DOTALL
            )
            if not closing_match:
                raise ValueError("Missing a final </resources>")
            content_start = closing_match.group(1)
            content_end = closing_match.group(2)
        else:
            content_start = parser.ctx.contents
            content_end = ""
        existing_keys = [entry.key for entry in parser.walk(only_localizable=True)]

    # For Fluent, we want to prefix the strings using GroupComments.
    # On weblate this will cause all the strings that fall under the GroupComment's
    # scope to have the prefix added to their "notes".
    # We set up an initial GroupComment for the first string we find. This will also
    # end the scope of the last GroupComment in the new translation file.
    # This will be replaced with a the next GroupComment when it is found.
    fluent_group_comment_prefix = f"\n## {comment_prefix}\n"
    fluent_group_comment: str | None = fluent_group_comment_prefix

    # For other formats, we want to keep all the comment lines that come directly
    # before the string.
    # In compare_locales.parser, only the comment line directly before an Entity
    # counts as the pre_comment for that Entity. I.e. only this line will be
    # included in Entity.all
    # However, in weblate every comment line that comes before the Entity is
    # included as a comment. So we also want to keep these additional comments to
    # preserve them for weblate.
    # We gather these extra comments in stacked_comments, and clear them whenever we
    # reach an Entity or a blank line (Whitespace is more than "\n").
    stacked_comments: list[str] = []

    additions: list[str] = []

    entry_iter: Iterable[Any] = ()
    # If the file does not exist in the old branch, don't make any additions.
    if alternative_content is not None:
        parser.readUnicode(alternative_content)
        entry_iter = parser.walk(only_localizable=False)
    for entry in entry_iter:
        if isinstance(entry, Junk):
            raise ValueError(f"Unexpected Junk: {entry.all}")
        if isinstance(entry, Whitespace):
            # Clear stacked comments if more than one empty line.
            if entry.all != "\n":
                stacked_comments.clear()
            continue
        if isinstance(entry, Comment):
            if isinstance(entry, FluentComment):
                # Don't stack Fluent comments.
                # Only the comments included in Entity.pre_comment count towards
                # that Entity's comment.
                if entry.all.startswith("##"):
                    # A Fluent GroupComment
                    if entry.all == "##":
                        # Empty GroupComment. Used to end the scope of a previous
                        # GroupComment.
                        # Replace this with our prefix comment.
                        fluent_group_comment = fluent_group_comment_prefix
                    else:
                        # Prefix the group comment.
                        fluent_group_comment = (
                            f"{fluent_group_comment_prefix}{entry.all}\n"
                        )
            else:
                stacked_comments.append(entry.all)
            continue
        if isinstance(entry, DocumentWrapper):
            # Not needed.
            continue

        if not isinstance(entry, Entity):
            raise ValueError(f"Unexpected type: {entry.__class__.__name__}")

        if include_ids is None:
            # We include the entry if it is not already included.
            include_entry = entry.key not in existing_keys
        else:
            # We include the entry if it is in our list.
            include_entry = entry.key in include_ids
        if not include_entry:
            # Drop the gathered comments for this Entity.
            stacked_comments.clear()
            continue

        if isinstance(entry, FluentEntity):
            id_regex = rf"^({re.escape(entry.key)})( *=)"
            if fluent_group_comment is not None:
                # We have a found GroupComment which has not been included yet.
                # All following Entity's will be under its scope, until the next
                # GroupComment.
                additions.append(fluent_group_comment)
                # Added GroupComment, so don't need to add again.
                fluent_group_comment = None
        elif isinstance(entry, DTDEntity):
            id_regex = rf"^(\s*<!ENTITY\s*{re.escape(entry.key)})(\s)"
            # Include our additional comment before we print the rest for this
            # Entity.
            additions.append(f"<!-- LOCALIZATION NOTE: {comment_prefix} -->")
        elif isinstance(entry, PropertiesEntity):
            id_regex = rf"^({re.escape(entry.key)})( *=)"
            additions.append(f"# {comment_prefix}")
        elif isinstance(entry, AndroidEntity):
            id_regex = rf'^(\s*<string\s[^>]*name="{re.escape(entry.key)})(")'
            additions.append(f"<!-- {comment_prefix} -->")
        else:
            raise ValueError(f"Unexpected Entity type: {entry.__class__.__name__}")

        # Add any other comment lines that came directly before this Entity.
        additions.extend(stacked_comments)
        stacked_comments.clear()
        entry_content = entry.all
        if alternative_suffix:
            # NOTE: compare_locales does not allow us to set the entry.key
            # value. Instead we use a regular expression to append the suffix to
            # the expected key.
            entry_content, count = re.subn(
                id_regex, rf"\1{alternative_suffix}\2", entry_content, flags=re.M
            )
            if count != 1:
                raise ValueError(f"Failed to substitute the ID for {entry.key}")
        additions.append(entry_content)

    content_middle = ""

    if additions:
        # New line before and after the additions
        additions.insert(0, "")
        additions.append("")
        if is_android:
            content_middle = "\n    ".join(additions)
        else:
            content_middle = "\n".join(additions)

        # Remove " " in otherwise blank lines.
        content_middle = re.sub("^ +$", "", content_middle, flags=re.MULTILINE)

    return content_start + content_middle + content_end
+0 −0

Empty file added.

+10 −0
Original line number Diff line number Diff line
[DEFAULT]
subsuite = "base-browser"

["test_android.py"]

["test_dtd.py"]

["test_fluent.py"]

["test_properties.py"]
Loading