Loading tools/base-browser/l10n/combine-translation-versions.py 0 → 100644 +368 −0 Original line number Diff line number Diff line import argparse import json import logging import os import re import subprocess from combine import combine_files arg_parser = argparse.ArgumentParser( description="Combine a translation file across two different versions" ) arg_parser.add_argument( "current_branch", metavar="<current-branch>", help="branch for the newest version" ) arg_parser.add_argument( "files", metavar="<files>", help="JSON specifying the translation files" ) arg_parser.add_argument("outname", metavar="<json>", help="name of the json output") args = arg_parser.parse_args() logging.basicConfig() logger = logging.getLogger("combine-translation-versions") logger.setLevel(logging.INFO) def in_pink(msg: str) -> str: """Present a message as pink in the terminal output. :param msg: The message to wrap in pink. :returns: The message to print to terminal. """ # Pink and bold. return f"\x1b[1;38;5;212m{msg}\x1b[0m" def git_run(git_args: list[str]) -> None: """Run a git command. :param git_args: The arguments that should follow "git". """ # Add some text to give context to git's stderr appearing in log. logger.info("Running: " + in_pink("git " + " ".join(git_args))) subprocess.run(["git", *git_args], check=True) def git_text(git_args: list[str]) -> str: """Get the text output for a git command. :param git_args: The arguments that should follow "git". :returns: The stdout of the command. """ logger.info("Running: " + in_pink("git " + " ".join(git_args))) return subprocess.run( ["git", *git_args], text=True, check=True, stdout=subprocess.PIPE ).stdout def git_lines(git_args: list[str]) -> list[str]: """Get the lines from a git command. :param git_args: The arguments that should follow "git". :returns: The non-empty lines from stdout of the command. """ return [line for line in git_text(git_args).split("\n") if line] class TranslationFile: """Represents a translation file.""" def __init__(self, path: str, content: str) -> None: self.path = path self.content = content class BrowserBranch: """Represents a browser git branch.""" def __init__(self, branch_name: str, is_head: bool = False) -> None: """Create a new instance. :param branch_name: The branch's git name. :param is_head: Whether the branch matches "HEAD". """ version_match = re.match( r"(?P<prefix>[a-z]+\-browser)\-" r"(?P<firefox>[0-9]+(?:\.[0-9]+){1,2})esr\-" r"(?P<browser>[0-9]+\.[05])\-" r"(?P<number>[0-9]+)$", branch_name, ) if not version_match: raise ValueError(f"Unable to parse the version from the ref {branch_name}") self.name = branch_name self.prefix = version_match.group("prefix") self.browser_version = version_match.group("browser") self._is_head = is_head self._ref = "HEAD" if is_head else f"origin/{branch_name}" firefox_nums = [int(n) for n in version_match.group("firefox").split(".")] if len(firefox_nums) == 2: firefox_nums.append(0) browser_nums = [int(n) for n in self.browser_version.split(".")] branch_number = int(version_match.group("number")) # Prioritise the firefox ESR version, then the browser version then the # branch number. self._ordered = ( firefox_nums[0], firefox_nums[1], firefox_nums[2], browser_nums[0], browser_nums[1], branch_number, ) # Minor version for browser is only ever "0" or "5", so we can convert # the version to an integer. self._browser_int_version = int(2 * float(self.browser_version)) self._file_paths: list[str] | None = None def release_below(self, other: "BrowserBranch", num: int) -> bool: """Determine whether another branch is within range of a previous browser release. The browser versions are expected to increment by "0.5", and a previous release branch's version is expected to be `num * 0.5` behind the current one. :param other: The branch to compare. :param num: The number of "0.5" releases behind to test with. """ return other._browser_int_version == self._browser_int_version - num def __lt__(self, other: "BrowserBranch") -> bool: return self._ordered < other._ordered def __gt__(self, other: "BrowserBranch") -> bool: return self._ordered > other._ordered def _matching_dirs(self, path: str, dir_list: list[str]) -> bool: """Test that a path is contained in the list of dirs. :param path: The path to check. :param dir_list: The list of directories to check against. :returns: Whether the path matches. """ for dir_path in dir_list: if os.path.commonpath([dir_path, path]) == dir_path: return True return False def get_file( self, filename: str, search_dirs: list[str] | None ) -> TranslationFile | None: """Fetch the file content for the named file in this branch. :param filename: The name of the file to fetch the content for. :param search_dirs: The directories to restrict the search to, or None to search for the file anywhere. :returns: The file, or `None` if no file could be found. """ if self._file_paths is None: if not self._is_head: # Minimal fetch of non-HEAD branch to get the file paths. # Individual file blobs will be downloaded as needed. git_run( ["fetch", "--depth=1", "--filter=blob:none", "origin", self.name] ) self._file_paths = git_lines( ["ls-tree", "-r", "--format=%(path)", self._ref] ) matching = [ path for path in self._file_paths if os.path.basename(path) == filename and (search_dirs is None or self._matching_dirs(path, search_dirs)) ] if not matching: return None if len(matching) > 1: raise Exception(f"Multiple occurrences of {filename}") path = matching[0] return TranslationFile( path=path, content=git_text(["cat-file", "blob", f"{self._ref}:{path}"]) ) def get_stable_branch( compare_version: BrowserBranch, ) -> tuple[BrowserBranch, BrowserBranch | None]: """Find the most recent stable branch in the origin repository. :param compare_version: The development branch to compare against. :returns: The stable and legacy branches. If no legacy branch is found, `None` will be returned instead. """ # We search for build1 tags. These are added *after* the rebase of browser # commits, so the corresponding branch should contain our strings. # Moreover, we *assume* that the branch with the most recent ESR version # with such a tag will be used in the *next* stable build in # tor-browser-build. tag_glob = f"{compare_version.prefix}-*-build1" # To speed up, only fetch the tags without blobs. git_run( ["fetch", "--depth=1", "--filter=object:type=tag", "origin", "tag", tag_glob] ) stable_branches = [] legacy_branches = [] stable_annotation_regex = re.compile(r"\bstable\b") legacy_annotation_regex = re.compile(r"\blegacy\b") tag_pattern = re.compile( rf"^{re.escape(compare_version.prefix)}-[^-]+esr-[^-]+-[^-]+-build1$" ) for build_tag, annotation in ( line.split(" ", 1) for line in git_lines(["tag", "-n1", "--list", tag_glob]) ): if not tag_pattern.match(build_tag): continue is_stable = bool(stable_annotation_regex.search(annotation)) is_legacy = bool(legacy_annotation_regex.search(annotation)) if not is_stable and not is_legacy: continue try: # Branch name is the same as the tag, minus "-build1". branch = BrowserBranch(re.sub(r"-build1$", "", build_tag)) except ValueError: logger.warning(f"Could not read the version for {build_tag}") continue if branch.prefix != compare_version.prefix: continue if is_stable: # Stable can be one release version behind. # NOTE: In principle, when switching between versions there may be a # window of time where the development branch has not yet progressed # to the next "0.5" release, so has the same browser version as the # stable branch. So we also allow for matching browser versions. # NOTE: # 1. The "Will be unused in" message will not make sense, but we do # not expect string differences in this scenario. # 2. We do not expect this scenario to last for long. if not ( compare_version.release_below(branch, 1) or compare_version.release_below(branch, 0) ): continue stable_branches.append(branch) elif is_legacy: # Legacy can be two release versions behind. # We also allow for being just one version behind. if not ( compare_version.release_below(branch, 2) or compare_version.release_below(branch, 1) ): continue legacy_branches.append(branch) if not stable_branches: raise Exception("No stable build1 branch found") return ( # Return the stable branch with the highest version. max(stable_branches), max(legacy_branches) if legacy_branches else None, ) current_branch = BrowserBranch(args.current_branch, is_head=True) stable_branch, legacy_branch = get_stable_branch(current_branch) if os.environ.get("TRANSLATION_INCLUDE_LEGACY", "") != "true": legacy_branch = None files_list = [] for file_dict in json.loads(args.files): name = file_dict["name"] where_dirs = file_dict.get("where", None) current_file = current_branch.get_file(name, where_dirs) stable_file = stable_branch.get_file(name, where_dirs) if current_file is None and stable_file is None: # No file in either branch. logger.warning(f"{name} does not exist in either the current or stable branch") elif current_file is None: logger.warning(f"{name} deleted in the current branch") elif stable_file is None: logger.warning(f"{name} does not exist in the stable branch") elif current_file.path != stable_file.path: logger.warning( f"{name} has different paths in the current and stable branch. " f"{current_file.path} : {stable_file.path}" ) content = combine_files( name, None if current_file is None else current_file.content, None if stable_file is None else stable_file.content, f"Will be unused in Tor Browser {current_branch.browser_version}!", ) if legacy_branch and not file_dict.get("exclude-legacy", False): legacy_file = legacy_branch.get_file(name, where_dirs) if legacy_file is not None and current_file is None and stable_file is None: logger.warning(f"{name} still exists in the legacy branch") elif legacy_file is None: logger.warning(f"{name} does not exist in the legacy branch") elif stable_file is not None and legacy_file.path != stable_file.path: logger.warning( f"{name} has different paths in the stable and legacy branch. " f"{stable_file.path} : {legacy_file.path}" ) elif current_file is not None and legacy_file.path != current_file.path: logger.warning( f"{name} has different paths in the current and legacy branch. " f"{current_file.path} : {legacy_file.path}" ) content = combine_files( name, content, legacy_file.content, f"Unused in Tor Browser {stable_branch.browser_version}!", ) elif legacy_branch: logger.info(f"Excluding legacy branch for {name}") files_list.append( { "name": name, # If "directory" is unspecified, we place the file directly beneath # en-US/ in the translation repository. i.e. "". "directory": file_dict.get("directory", ""), "branch": file_dict["branch"], "content": content, } ) ci_commit = os.environ.get("CI_COMMIT_SHA", "") ci_url_base = os.environ.get("CI_PROJECT_URL", "") json_data = { "commit": ci_commit, "commit-url": f"{ci_url_base}/-/commit/{ci_commit}" if (ci_commit and ci_url_base) else "", "project-path": os.environ.get("CI_PROJECT_PATH", ""), "current-branch": current_branch.name, "stable-branch": stable_branch.name, "files": files_list, } if legacy_branch: json_data["legacy-branch"] = legacy_branch.name with open(args.outname, "w") as file: json.dump(json_data, file) tools/base-browser/l10n/combine/__init__.py 0 → 100644 +3 −0 Original line number Diff line number Diff line # flake8: noqa from .combine import combine_files tools/base-browser/l10n/combine/combine.py 0 → 100644 +181 −0 Original line number Diff line number Diff line import re from typing import TYPE_CHECKING, Any from compare_locales.parser import getParser from compare_locales.parser.android import AndroidEntity, DocumentWrapper from compare_locales.parser.base import Comment, Entity, Junk, Whitespace from compare_locales.parser.dtd import DTDEntity from compare_locales.parser.fluent import FluentComment, FluentEntity from compare_locales.parser.properties import PropertiesEntity if TYPE_CHECKING: from collections.abc import Iterable def combine_files( filename: str, new_content: str | None, old_content: str | None, comment_prefix: str, ) -> str | None: """Combine two translation files into one to include all strings from both. The new content is presented first, and any strings only found in the old content are placed at the end with an additional comment. :param filename: The filename for the file, determines the format. :param new_content: The new content for the file, or None if it has been deleted. :param old_content: The old content for the file, or None if it did not exist before. :comment_prefix: A comment to include for any strings that are only found in the old content. This will be placed before any other comments for the string. :returns: The combined content, or None if both given contents are None. """ if new_content is None and old_content is None: return None # getParser from compare_locale returns the same instance for the same file # extension. parser = getParser(filename) is_android = filename.endswith(".xml") if new_content is None: if is_android: # File was deleted, add some document parts. content_start = ( '<?xml version="1.0" encoding="utf-8" standalone="yes"?>\n<resources>\n' ) content_end = "</resources>\n" else: # Treat as an empty file. content_start = "" content_end = "" existing_keys = [] else: parser.readUnicode(new_content) # Start with the same content as the current file. # For android strings, we want to keep the final "</resources>" until after. if is_android: closing_match = re.match( r"^(.*)(</resources>\s*)$", parser.ctx.contents, re.DOTALL ) if not closing_match: raise ValueError("Missing a final </resources>") content_start = closing_match.group(1) content_end = closing_match.group(2) else: content_start = parser.ctx.contents content_end = "" existing_keys = [entry.key for entry in parser.walk(only_localizable=True)] # For Fluent, we want to prefix the strings using GroupComments. # On weblate this will cause all the strings that fall under the GroupComment's # scope to have the prefix added to their "notes". # We set up an initial GroupComment for the first string we find. This will also # end the scope of the last GroupComment in the new translation file. # This will be replaced with a the next GroupComment when it is found. fluent_group_comment_prefix = f"\n## {comment_prefix}\n" fluent_group_comment: str | None = fluent_group_comment_prefix # For other formats, we want to keep all the comment lines that come directly # before the string. # In compare_locales.parser, only the comment line directly before an Entity # counts as the pre_comment for that Entity. I.e. only this line will be # included in Entity.all # However, in weblate every comment line that comes before the Entity is # included as a comment. So we also want to keep these additional comments to # preserve them for weblate. # We gather these extra comments in stacked_comments, and clear them whenever we # reach an Entity or a blank line (Whitespace is more than "\n"). stacked_comments: list[str] = [] additions: list[str] = [] entry_iter: Iterable[Any] = () # If the file does not exist in the old branch, don't make any additions. if old_content is not None: parser.readUnicode(old_content) entry_iter = parser.walk(only_localizable=False) for entry in entry_iter: if isinstance(entry, Junk): raise ValueError(f"Unexpected Junk: {entry.all}") if isinstance(entry, Whitespace): # Clear stacked comments if more than one empty line. if entry.all != "\n": stacked_comments.clear() continue if isinstance(entry, Comment): if isinstance(entry, FluentComment): # Don't stack Fluent comments. # Only the comments included in Entity.pre_comment count towards # that Entity's comment. if entry.all.startswith("##"): # A Fluent GroupComment if entry.all == "##": # Empty GroupComment. Used to end the scope of a previous # GroupComment. # Replace this with our prefix comment. fluent_group_comment = fluent_group_comment_prefix else: # Prefix the group comment. fluent_group_comment = ( f"{fluent_group_comment_prefix}{entry.all}\n" ) else: stacked_comments.append(entry.all) continue if isinstance(entry, DocumentWrapper): # Not needed. continue if not isinstance(entry, Entity): raise ValueError(f"Unexpected type: {entry.__class__.__name__}") if entry.key in existing_keys: # Already included this string in the new translation file. # Drop the gathered comments for this Entity. stacked_comments.clear() continue if isinstance(entry, FluentEntity): if fluent_group_comment is not None: # We have a found GroupComment which has not been included yet. # All following Entity's will be under its scope, until the next # GroupComment. additions.append(fluent_group_comment) # Added GroupComment, so don't need to add again. fluent_group_comment = None elif isinstance(entry, DTDEntity): # Include our additional comment before we print the rest for this # Entity. additions.append(f"<!-- LOCALIZATION NOTE: {comment_prefix} -->") elif isinstance(entry, PropertiesEntity): additions.append(f"# {comment_prefix}") elif isinstance(entry, AndroidEntity): additions.append(f"<!-- {comment_prefix} -->") else: raise ValueError(f"Unexpected Entity type: {entry.__class__.__name__}") # Add any other comment lines that came directly before this Entity. additions.extend(stacked_comments) stacked_comments.clear() additions.append(entry.all) content_middle = "" if additions: # New line before and after the additions additions.insert(0, "") additions.append("") if is_android: content_middle = "\n ".join(additions) else: content_middle = "\n".join(additions) # Remove " " in otherwise blank lines. content_middle = re.sub("^ +$", "", content_middle, flags=re.MULTILINE) return content_start + content_middle + content_end tools/base-browser/l10n/combine/tests/README 0 → 100644 +2 −0 Original line number Diff line number Diff line python tests to be run with pytest. Requires the compare-locales package. tools/base-browser/l10n/combine/tests/__init__.py 0 → 100644 +0 −0 Empty file added. Loading
tools/base-browser/l10n/combine-translation-versions.py 0 → 100644 +368 −0 Original line number Diff line number Diff line import argparse import json import logging import os import re import subprocess from combine import combine_files arg_parser = argparse.ArgumentParser( description="Combine a translation file across two different versions" ) arg_parser.add_argument( "current_branch", metavar="<current-branch>", help="branch for the newest version" ) arg_parser.add_argument( "files", metavar="<files>", help="JSON specifying the translation files" ) arg_parser.add_argument("outname", metavar="<json>", help="name of the json output") args = arg_parser.parse_args() logging.basicConfig() logger = logging.getLogger("combine-translation-versions") logger.setLevel(logging.INFO) def in_pink(msg: str) -> str: """Present a message as pink in the terminal output. :param msg: The message to wrap in pink. :returns: The message to print to terminal. """ # Pink and bold. return f"\x1b[1;38;5;212m{msg}\x1b[0m" def git_run(git_args: list[str]) -> None: """Run a git command. :param git_args: The arguments that should follow "git". """ # Add some text to give context to git's stderr appearing in log. logger.info("Running: " + in_pink("git " + " ".join(git_args))) subprocess.run(["git", *git_args], check=True) def git_text(git_args: list[str]) -> str: """Get the text output for a git command. :param git_args: The arguments that should follow "git". :returns: The stdout of the command. """ logger.info("Running: " + in_pink("git " + " ".join(git_args))) return subprocess.run( ["git", *git_args], text=True, check=True, stdout=subprocess.PIPE ).stdout def git_lines(git_args: list[str]) -> list[str]: """Get the lines from a git command. :param git_args: The arguments that should follow "git". :returns: The non-empty lines from stdout of the command. """ return [line for line in git_text(git_args).split("\n") if line] class TranslationFile: """Represents a translation file.""" def __init__(self, path: str, content: str) -> None: self.path = path self.content = content class BrowserBranch: """Represents a browser git branch.""" def __init__(self, branch_name: str, is_head: bool = False) -> None: """Create a new instance. :param branch_name: The branch's git name. :param is_head: Whether the branch matches "HEAD". """ version_match = re.match( r"(?P<prefix>[a-z]+\-browser)\-" r"(?P<firefox>[0-9]+(?:\.[0-9]+){1,2})esr\-" r"(?P<browser>[0-9]+\.[05])\-" r"(?P<number>[0-9]+)$", branch_name, ) if not version_match: raise ValueError(f"Unable to parse the version from the ref {branch_name}") self.name = branch_name self.prefix = version_match.group("prefix") self.browser_version = version_match.group("browser") self._is_head = is_head self._ref = "HEAD" if is_head else f"origin/{branch_name}" firefox_nums = [int(n) for n in version_match.group("firefox").split(".")] if len(firefox_nums) == 2: firefox_nums.append(0) browser_nums = [int(n) for n in self.browser_version.split(".")] branch_number = int(version_match.group("number")) # Prioritise the firefox ESR version, then the browser version then the # branch number. self._ordered = ( firefox_nums[0], firefox_nums[1], firefox_nums[2], browser_nums[0], browser_nums[1], branch_number, ) # Minor version for browser is only ever "0" or "5", so we can convert # the version to an integer. self._browser_int_version = int(2 * float(self.browser_version)) self._file_paths: list[str] | None = None def release_below(self, other: "BrowserBranch", num: int) -> bool: """Determine whether another branch is within range of a previous browser release. The browser versions are expected to increment by "0.5", and a previous release branch's version is expected to be `num * 0.5` behind the current one. :param other: The branch to compare. :param num: The number of "0.5" releases behind to test with. """ return other._browser_int_version == self._browser_int_version - num def __lt__(self, other: "BrowserBranch") -> bool: return self._ordered < other._ordered def __gt__(self, other: "BrowserBranch") -> bool: return self._ordered > other._ordered def _matching_dirs(self, path: str, dir_list: list[str]) -> bool: """Test that a path is contained in the list of dirs. :param path: The path to check. :param dir_list: The list of directories to check against. :returns: Whether the path matches. """ for dir_path in dir_list: if os.path.commonpath([dir_path, path]) == dir_path: return True return False def get_file( self, filename: str, search_dirs: list[str] | None ) -> TranslationFile | None: """Fetch the file content for the named file in this branch. :param filename: The name of the file to fetch the content for. :param search_dirs: The directories to restrict the search to, or None to search for the file anywhere. :returns: The file, or `None` if no file could be found. """ if self._file_paths is None: if not self._is_head: # Minimal fetch of non-HEAD branch to get the file paths. # Individual file blobs will be downloaded as needed. git_run( ["fetch", "--depth=1", "--filter=blob:none", "origin", self.name] ) self._file_paths = git_lines( ["ls-tree", "-r", "--format=%(path)", self._ref] ) matching = [ path for path in self._file_paths if os.path.basename(path) == filename and (search_dirs is None or self._matching_dirs(path, search_dirs)) ] if not matching: return None if len(matching) > 1: raise Exception(f"Multiple occurrences of {filename}") path = matching[0] return TranslationFile( path=path, content=git_text(["cat-file", "blob", f"{self._ref}:{path}"]) ) def get_stable_branch( compare_version: BrowserBranch, ) -> tuple[BrowserBranch, BrowserBranch | None]: """Find the most recent stable branch in the origin repository. :param compare_version: The development branch to compare against. :returns: The stable and legacy branches. If no legacy branch is found, `None` will be returned instead. """ # We search for build1 tags. These are added *after* the rebase of browser # commits, so the corresponding branch should contain our strings. # Moreover, we *assume* that the branch with the most recent ESR version # with such a tag will be used in the *next* stable build in # tor-browser-build. tag_glob = f"{compare_version.prefix}-*-build1" # To speed up, only fetch the tags without blobs. git_run( ["fetch", "--depth=1", "--filter=object:type=tag", "origin", "tag", tag_glob] ) stable_branches = [] legacy_branches = [] stable_annotation_regex = re.compile(r"\bstable\b") legacy_annotation_regex = re.compile(r"\blegacy\b") tag_pattern = re.compile( rf"^{re.escape(compare_version.prefix)}-[^-]+esr-[^-]+-[^-]+-build1$" ) for build_tag, annotation in ( line.split(" ", 1) for line in git_lines(["tag", "-n1", "--list", tag_glob]) ): if not tag_pattern.match(build_tag): continue is_stable = bool(stable_annotation_regex.search(annotation)) is_legacy = bool(legacy_annotation_regex.search(annotation)) if not is_stable and not is_legacy: continue try: # Branch name is the same as the tag, minus "-build1". branch = BrowserBranch(re.sub(r"-build1$", "", build_tag)) except ValueError: logger.warning(f"Could not read the version for {build_tag}") continue if branch.prefix != compare_version.prefix: continue if is_stable: # Stable can be one release version behind. # NOTE: In principle, when switching between versions there may be a # window of time where the development branch has not yet progressed # to the next "0.5" release, so has the same browser version as the # stable branch. So we also allow for matching browser versions. # NOTE: # 1. The "Will be unused in" message will not make sense, but we do # not expect string differences in this scenario. # 2. We do not expect this scenario to last for long. if not ( compare_version.release_below(branch, 1) or compare_version.release_below(branch, 0) ): continue stable_branches.append(branch) elif is_legacy: # Legacy can be two release versions behind. # We also allow for being just one version behind. if not ( compare_version.release_below(branch, 2) or compare_version.release_below(branch, 1) ): continue legacy_branches.append(branch) if not stable_branches: raise Exception("No stable build1 branch found") return ( # Return the stable branch with the highest version. max(stable_branches), max(legacy_branches) if legacy_branches else None, ) current_branch = BrowserBranch(args.current_branch, is_head=True) stable_branch, legacy_branch = get_stable_branch(current_branch) if os.environ.get("TRANSLATION_INCLUDE_LEGACY", "") != "true": legacy_branch = None files_list = [] for file_dict in json.loads(args.files): name = file_dict["name"] where_dirs = file_dict.get("where", None) current_file = current_branch.get_file(name, where_dirs) stable_file = stable_branch.get_file(name, where_dirs) if current_file is None and stable_file is None: # No file in either branch. logger.warning(f"{name} does not exist in either the current or stable branch") elif current_file is None: logger.warning(f"{name} deleted in the current branch") elif stable_file is None: logger.warning(f"{name} does not exist in the stable branch") elif current_file.path != stable_file.path: logger.warning( f"{name} has different paths in the current and stable branch. " f"{current_file.path} : {stable_file.path}" ) content = combine_files( name, None if current_file is None else current_file.content, None if stable_file is None else stable_file.content, f"Will be unused in Tor Browser {current_branch.browser_version}!", ) if legacy_branch and not file_dict.get("exclude-legacy", False): legacy_file = legacy_branch.get_file(name, where_dirs) if legacy_file is not None and current_file is None and stable_file is None: logger.warning(f"{name} still exists in the legacy branch") elif legacy_file is None: logger.warning(f"{name} does not exist in the legacy branch") elif stable_file is not None and legacy_file.path != stable_file.path: logger.warning( f"{name} has different paths in the stable and legacy branch. " f"{stable_file.path} : {legacy_file.path}" ) elif current_file is not None and legacy_file.path != current_file.path: logger.warning( f"{name} has different paths in the current and legacy branch. " f"{current_file.path} : {legacy_file.path}" ) content = combine_files( name, content, legacy_file.content, f"Unused in Tor Browser {stable_branch.browser_version}!", ) elif legacy_branch: logger.info(f"Excluding legacy branch for {name}") files_list.append( { "name": name, # If "directory" is unspecified, we place the file directly beneath # en-US/ in the translation repository. i.e. "". "directory": file_dict.get("directory", ""), "branch": file_dict["branch"], "content": content, } ) ci_commit = os.environ.get("CI_COMMIT_SHA", "") ci_url_base = os.environ.get("CI_PROJECT_URL", "") json_data = { "commit": ci_commit, "commit-url": f"{ci_url_base}/-/commit/{ci_commit}" if (ci_commit and ci_url_base) else "", "project-path": os.environ.get("CI_PROJECT_PATH", ""), "current-branch": current_branch.name, "stable-branch": stable_branch.name, "files": files_list, } if legacy_branch: json_data["legacy-branch"] = legacy_branch.name with open(args.outname, "w") as file: json.dump(json_data, file)
tools/base-browser/l10n/combine/__init__.py 0 → 100644 +3 −0 Original line number Diff line number Diff line # flake8: noqa from .combine import combine_files
tools/base-browser/l10n/combine/combine.py 0 → 100644 +181 −0 Original line number Diff line number Diff line import re from typing import TYPE_CHECKING, Any from compare_locales.parser import getParser from compare_locales.parser.android import AndroidEntity, DocumentWrapper from compare_locales.parser.base import Comment, Entity, Junk, Whitespace from compare_locales.parser.dtd import DTDEntity from compare_locales.parser.fluent import FluentComment, FluentEntity from compare_locales.parser.properties import PropertiesEntity if TYPE_CHECKING: from collections.abc import Iterable def combine_files( filename: str, new_content: str | None, old_content: str | None, comment_prefix: str, ) -> str | None: """Combine two translation files into one to include all strings from both. The new content is presented first, and any strings only found in the old content are placed at the end with an additional comment. :param filename: The filename for the file, determines the format. :param new_content: The new content for the file, or None if it has been deleted. :param old_content: The old content for the file, or None if it did not exist before. :comment_prefix: A comment to include for any strings that are only found in the old content. This will be placed before any other comments for the string. :returns: The combined content, or None if both given contents are None. """ if new_content is None and old_content is None: return None # getParser from compare_locale returns the same instance for the same file # extension. parser = getParser(filename) is_android = filename.endswith(".xml") if new_content is None: if is_android: # File was deleted, add some document parts. content_start = ( '<?xml version="1.0" encoding="utf-8" standalone="yes"?>\n<resources>\n' ) content_end = "</resources>\n" else: # Treat as an empty file. content_start = "" content_end = "" existing_keys = [] else: parser.readUnicode(new_content) # Start with the same content as the current file. # For android strings, we want to keep the final "</resources>" until after. if is_android: closing_match = re.match( r"^(.*)(</resources>\s*)$", parser.ctx.contents, re.DOTALL ) if not closing_match: raise ValueError("Missing a final </resources>") content_start = closing_match.group(1) content_end = closing_match.group(2) else: content_start = parser.ctx.contents content_end = "" existing_keys = [entry.key for entry in parser.walk(only_localizable=True)] # For Fluent, we want to prefix the strings using GroupComments. # On weblate this will cause all the strings that fall under the GroupComment's # scope to have the prefix added to their "notes". # We set up an initial GroupComment for the first string we find. This will also # end the scope of the last GroupComment in the new translation file. # This will be replaced with a the next GroupComment when it is found. fluent_group_comment_prefix = f"\n## {comment_prefix}\n" fluent_group_comment: str | None = fluent_group_comment_prefix # For other formats, we want to keep all the comment lines that come directly # before the string. # In compare_locales.parser, only the comment line directly before an Entity # counts as the pre_comment for that Entity. I.e. only this line will be # included in Entity.all # However, in weblate every comment line that comes before the Entity is # included as a comment. So we also want to keep these additional comments to # preserve them for weblate. # We gather these extra comments in stacked_comments, and clear them whenever we # reach an Entity or a blank line (Whitespace is more than "\n"). stacked_comments: list[str] = [] additions: list[str] = [] entry_iter: Iterable[Any] = () # If the file does not exist in the old branch, don't make any additions. if old_content is not None: parser.readUnicode(old_content) entry_iter = parser.walk(only_localizable=False) for entry in entry_iter: if isinstance(entry, Junk): raise ValueError(f"Unexpected Junk: {entry.all}") if isinstance(entry, Whitespace): # Clear stacked comments if more than one empty line. if entry.all != "\n": stacked_comments.clear() continue if isinstance(entry, Comment): if isinstance(entry, FluentComment): # Don't stack Fluent comments. # Only the comments included in Entity.pre_comment count towards # that Entity's comment. if entry.all.startswith("##"): # A Fluent GroupComment if entry.all == "##": # Empty GroupComment. Used to end the scope of a previous # GroupComment. # Replace this with our prefix comment. fluent_group_comment = fluent_group_comment_prefix else: # Prefix the group comment. fluent_group_comment = ( f"{fluent_group_comment_prefix}{entry.all}\n" ) else: stacked_comments.append(entry.all) continue if isinstance(entry, DocumentWrapper): # Not needed. continue if not isinstance(entry, Entity): raise ValueError(f"Unexpected type: {entry.__class__.__name__}") if entry.key in existing_keys: # Already included this string in the new translation file. # Drop the gathered comments for this Entity. stacked_comments.clear() continue if isinstance(entry, FluentEntity): if fluent_group_comment is not None: # We have a found GroupComment which has not been included yet. # All following Entity's will be under its scope, until the next # GroupComment. additions.append(fluent_group_comment) # Added GroupComment, so don't need to add again. fluent_group_comment = None elif isinstance(entry, DTDEntity): # Include our additional comment before we print the rest for this # Entity. additions.append(f"<!-- LOCALIZATION NOTE: {comment_prefix} -->") elif isinstance(entry, PropertiesEntity): additions.append(f"# {comment_prefix}") elif isinstance(entry, AndroidEntity): additions.append(f"<!-- {comment_prefix} -->") else: raise ValueError(f"Unexpected Entity type: {entry.__class__.__name__}") # Add any other comment lines that came directly before this Entity. additions.extend(stacked_comments) stacked_comments.clear() additions.append(entry.all) content_middle = "" if additions: # New line before and after the additions additions.insert(0, "") additions.append("") if is_android: content_middle = "\n ".join(additions) else: content_middle = "\n".join(additions) # Remove " " in otherwise blank lines. content_middle = re.sub("^ +$", "", content_middle, flags=re.MULTILINE) return content_start + content_middle + content_end
tools/base-browser/l10n/combine/tests/README 0 → 100644 +2 −0 Original line number Diff line number Diff line python tests to be run with pytest. Requires the compare-locales package.