BB 42305: Add script to combine translation files across versions. (f3579509) · Commits · The Tor Project / Applications / Tor Browser

tools/base_browser/l10n/combine-translation-versions.py

0 → 100644

+404 −0

Original line number	Diff line number	Diff line
		import argparse
		import json
		import logging
		import os
		import re
		import subprocess

		from combine import combine_files

		# Whether we are running within the gitlab CI, rather than on a developer
		# machine. This toggles some optimisations that work well in the temporary
		# gitlab environment but would cause problems if run locally for testing
		# purposes.
		IN_GITLAB_CI_ENV = os.environ.get("GITLAB_CI", "") == "true"

		arg_parser = argparse.ArgumentParser(
		description="Combine a translation file across two different versions"
		)

		arg_parser.add_argument(
		"current_branch", metavar="<current-branch>", help="branch for the newest version"
		)
		arg_parser.add_argument(
		"files", metavar="<files>", help="JSON specifying the translation files"
		)
		arg_parser.add_argument("outname", metavar="<json>", help="name of the json output")

		args = arg_parser.parse_args()

		logging.basicConfig()
		logger = logging.getLogger("combine-translation-versions")
		logger.setLevel(logging.INFO)


		def in_pink(msg: str) -> str:
		"""Present a message as pink in the terminal output.

		:param msg: The message to wrap in pink.
		:returns: The message to print to terminal.
		"""
		# Pink and bold.
		return f"\x1b[1;38;5;212m{msg}\x1b[0m"


		def git_run(git_args: list[str]) -> None:
		"""Run a git command.

		:param git_args: The arguments that should follow "git".
		"""
		# Add some text to give context to git's stderr appearing in log.
		logger.info("Running: " + in_pink("git " + " ".join(git_args)))
		subprocess.run(["git", *git_args], check=True)


		def git_text(git_args: list[str]) -> str:
		"""Get the text output for a git command.

		:param git_args: The arguments that should follow "git".
		:returns: The stdout of the command.
		"""
		logger.info("Running: " + in_pink("git " + " ".join(git_args)))
		return subprocess.run(
		["git", *git_args], text=True, check=True, stdout=subprocess.PIPE
		).stdout


		def git_lines(git_args: list[str]) -> list[str]:
		"""Get the lines from a git command.

		:param git_args: The arguments that should follow "git".
		:returns: The non-empty lines from stdout of the command.
		"""
		return [line for line in git_text(git_args).split("\n") if line]


		class TranslationFile:
		"""Represents a translation file."""

		def __init__(self, path: str, content: str) -> None:
		self.path = path
		self.content = content


		class BrowserBranch:
		"""Represents a browser git branch."""

		def __init__(self, branch_name: str, is_head: bool = False) -> None:
		"""Create a new instance.

		:param branch_name: The branch's git name.
		:param is_head: Whether the branch matches "HEAD".
		"""
		version_match = re.match(
		r"(?P<prefix>[a-z]+\-browser)\-"
		r"(?P<firefox>[0-9]+(?:\.[0-9]+){1,2})(?:esr\|[ab][0-9]+)?\-"
		r"(?P<browser>[0-9]+\.[05])\-"
		r"(?P<number>[0-9]+)$",
		branch_name,
		)

		if not version_match:
		raise ValueError(f"Unable to parse the version from the ref {branch_name}")

		self.name = branch_name
		self.prefix = version_match.group("prefix")
		self.browser_version = version_match.group("browser")
		# Convert tor-browser to "Tor Browser", and similar.
		browser_name = self.prefix.replace("-", " ").title()
		self.browser_version_name = f"{browser_name} {self.browser_version}"

		self._is_head = is_head
		self._ref = "HEAD" if is_head else f"origin/{branch_name}"

		firefox_nums = [int(n) for n in version_match.group("firefox").split(".")]
		if len(firefox_nums) == 2:
		firefox_nums.append(0)
		browser_nums = [int(n) for n in self.browser_version.split(".")]
		branch_number = int(version_match.group("number"))
		# Prioritise the firefox ESR version, then the browser version then the
		# branch number.
		self._ordered = (
		firefox_nums[0],
		firefox_nums[1],
		firefox_nums[2],
		browser_nums[0],
		browser_nums[1],
		branch_number,
		)

		# Minor version for browser is only ever "0" or "5", so we can convert
		# the version to an integer.
		self._browser_int_version = int(2 * float(self.browser_version))

		self._file_paths: list[str] \| None = None

		def release_below(self, other: "BrowserBranch", num: int) -> bool:
		"""Determine whether another branch is within range of a previous
		browser release.

		The browser versions are expected to increment by "0.5", and a previous
		release branch's version is expected to be `num * 0.5` behind the
		current one.

		:param other: The branch to compare.
		:param num: The number of "0.5" releases behind to test with.
		"""
		return other._browser_int_version == self._browser_int_version - num

		def __lt__(self, other: "BrowserBranch") -> bool:
		return self._ordered < other._ordered

		def __gt__(self, other: "BrowserBranch") -> bool:
		return self._ordered > other._ordered

		def _matching_dirs(self, path: str, dir_list: list[str]) -> bool:
		"""Test that a path is contained in the list of dirs.

		:param path: The path to check.
		:param dir_list: The list of directories to check against.
		:returns: Whether the path matches.
		"""
		for dir_path in dir_list:
		if os.path.commonpath([dir_path, path]) == dir_path:
		return True
		return False

		def get_file(
		self, filename: str, search_dirs: list[str] \| None
		) -> TranslationFile \| None:
		"""Fetch the file content for the named file in this branch.

		:param filename: The name of the file to fetch the content for.
		:param search_dirs: The directories to restrict the search to, or None
		to search for the file anywhere.
		:returns: The file, or `None` if no file could be found.
		"""
		if self._file_paths is None:
		if not self._is_head:
		fetch_args = ()
		if IN_GITLAB_CI_ENV:
		# Minimal fetch of non-HEAD branch to get the file paths.
		# Individual file blobs will be downloaded as needed.
		# Only do this when running in the gitlab CI since it will
		# alter the user's .git/config and will effect future
		# plain fetches.
		fetch_args = ("--depth=1", "--filter=blob:none")
		git_run(["fetch", *fetch_args, "origin", self.name])
		self._file_paths = git_lines(
		["ls-tree", "-r", "--format=%(path)", self._ref]
		)

		matching = [
		path
		for path in self._file_paths
		if os.path.basename(path) == filename
		and (search_dirs is None or self._matching_dirs(path, search_dirs))
		]
		if not matching:
		return None
		if len(matching) > 1:
		raise Exception(f"Multiple occurrences of {filename}")

		path = matching[0]

		return TranslationFile(
		path=path, content=git_text(["cat-file", "blob", f"{self._ref}:{path}"])
		)


		def get_stable_branch(
		compare_version: BrowserBranch,
		) -> tuple[BrowserBranch, BrowserBranch \| None]:
		"""Find the most recent stable branch in the origin repository.

		:param compare_version: The development branch to compare against.
		:returns: The stable and legacy branches. If no legacy branch is found,
		`None` will be returned instead.
		"""
		# We search for build1 tags. These are added after the rebase of browser
		# commits, so the corresponding branch should contain our strings.
		# Moreover, we assume that the branch with the most recent ESR version
		# with such a tag will be used in the next stable build in
		# tor-browser-build.
		tag_glob = f"{compare_version.prefix}-*-build1"

		fetch_args = ()
		if IN_GITLAB_CI_ENV:
		# To speed up, only fetch the tags without blobs.
		# Only do this when running in the gitlab CI since it will alter the
		# user's .git/config and will effect future plain fetches.
		fetch_args = ("--depth=1", "--filter=object:type=tag")
		git_run(["fetch", *fetch_args, "origin", "tag", tag_glob])
		stable_branches = []
		legacy_branches = []
		stable_annotation_regex = re.compile(r"\bstable\b")
		legacy_annotation_regex = re.compile(r"\blegacy\b")
		tag_pattern = re.compile(
		rf"^{re.escape(compare_version.prefix)}-[^-]+-[^-]+-[^-]+-build1$"
		)

		for build_tag, annotation in (
		line.split(" ", 1) for line in git_lines(["tag", "-n1", "--list", tag_glob])
		):
		if not tag_pattern.match(build_tag):
		continue
		is_stable = bool(stable_annotation_regex.search(annotation))
		is_legacy = bool(legacy_annotation_regex.search(annotation))
		if not is_stable and not is_legacy:
		continue
		try:
		# Branch name is the same as the tag, minus "-build1".
		branch = BrowserBranch(re.sub(r"-build1$", "", build_tag))
		except ValueError:
		logger.warning(f"Could not read the version for {build_tag}")
		continue
		if branch.prefix != compare_version.prefix:
		continue
		if is_stable:
		# Stable can be one release version behind.
		# NOTE: In principle, when switching between versions there may be a
		# window of time where the development branch has not yet progressed
		# to the next "0.5" release, so has the same browser version as the
		# stable branch. So we also allow for matching browser versions.
		# NOTE:
		# 1. The "Will be unused in" message will not make sense, but we do
		# not expect string differences in this scenario.
		# 2. We do not expect this scenario to last for long.
		if not (
		compare_version.release_below(branch, 1)
		or compare_version.release_below(branch, 0)
		):
		continue
		stable_branches.append(branch)
		elif is_legacy:
		# Legacy can be arbitrary release versions behind.
		legacy_branches.append(branch)

		if not stable_branches:
		raise Exception("No stable build1 branch found")

		return (
		# Return the stable branch with the highest version.
		max(stable_branches),
		max(legacy_branches) if legacy_branches else None,
		)


		current_branch = BrowserBranch(args.current_branch, is_head=True)

		stable_branch, legacy_branch = get_stable_branch(current_branch)

		if os.environ.get("TRANSLATION_INCLUDE_LEGACY", "") != "true":
		legacy_branch = None

		files_list = []

		for file_dict in json.loads(args.files):
		name = file_dict["name"]
		where_dirs = file_dict.get("where", None)
		current_file = current_branch.get_file(name, where_dirs)
		stable_file = stable_branch.get_file(name, where_dirs)

		if current_file is None and stable_file is None:
		# No file in either branch.
		logger.warning(f"{name} does not exist in either the current or stable branch")
		elif current_file is None:
		logger.warning(f"{name} deleted in the current branch")
		elif stable_file is None:
		logger.warning(f"{name} does not exist in the stable branch")
		elif current_file.path != stable_file.path:
		logger.warning(
		f"{name} has different paths in the current and stable branch. "
		f"{current_file.path} : {stable_file.path}"
		)

		content = None if current_file is None else current_file.content

		# If we have a branding file, we want to also include strings from the other
		# branding directories that differ from the stable release.
		# The strings that differ per release should be specified in
		# file_dict["branding"]["ids"]. These strings will be copied from the other
		# release's branding directory, with an addition suffix added to their ID,
		# as specified in the version_dict["suffix"].
		branding = file_dict.get("branding", None)
		if branding:
		include_ids = branding["ids"]
		for version_dict in branding["versions"]:
		branding_dirs = version_dict.get("where", None)
		branding_file = current_branch.get_file(name, branding_dirs)
		if branding_file is None:
		raise Exception(f"{name} does not exist in {branding_dirs}")
		content = combine_files(
		name,
		content,
		branding_file.content,
		f'{version_dict["name"]} Release.',
		include_ids,
		version_dict["suffix"],
		)

		content = combine_files(
		name,
		content,
		None if stable_file is None else stable_file.content,
		f"Will be unused in {current_branch.browser_version_name}!",
		)

		if legacy_branch and not file_dict.get("exclude-legacy", False):
		legacy_file = legacy_branch.get_file(name, where_dirs)
		if legacy_file is not None and current_file is None and stable_file is None:
		logger.warning(f"{name} still exists in the legacy branch")
		elif legacy_file is None:
		logger.warning(f"{name} does not exist in the legacy branch")
		elif stable_file is not None and legacy_file.path != stable_file.path:
		logger.warning(
		f"{name} has different paths in the stable and legacy branch. "
		f"{stable_file.path} : {legacy_file.path}"
		)
		elif current_file is not None and legacy_file.path != current_file.path:
		logger.warning(
		f"{name} has different paths in the current and legacy branch. "
		f"{current_file.path} : {legacy_file.path}"
		)

		content = combine_files(
		name,
		content,
		legacy_file.content,
		f"Unused in {stable_branch.browser_version_name}!",
		)
		elif legacy_branch:
		logger.info(f"Excluding legacy branch for {name}")

		files_list.append(
		{
		"name": name,
		# If "directory" is unspecified, we place the file directly beneath
		# en-US/ in the translation repository. i.e. "".
		"directory": file_dict.get("directory", ""),
		"branch": file_dict["branch"],
		"content": content,
		}
		)


		ci_commit = os.environ.get("CI_COMMIT_SHA", "")
		ci_url_base = os.environ.get("CI_PROJECT_URL", "")

		json_data = {
		"commit": ci_commit,
		"commit-url": (
		f"{ci_url_base}/-/commit/{ci_commit}" if (ci_commit and ci_url_base) else ""
		),
		"project-path": os.environ.get("CI_PROJECT_PATH", ""),
		"current-branch": current_branch.name,
		"stable-branch": stable_branch.name,
		"files": files_list,
		}

		if legacy_branch:
		json_data["legacy-branch"] = legacy_branch.name

		with open(args.outname, "w") as file:
		json.dump(json_data, file)

tools/base_browser/l10n/combine/init.py

0 → 100644

+3 −0

Original line number	Diff line number	Diff line
		# flake8: noqa

		from .combine import combine_files

tools/base_browser/l10n/combine/combine.py

0 → 100644

+206 −0

Original line number	Diff line number	Diff line
		import re
		from typing import TYPE_CHECKING, Any

		from compare_locales.parser import getParser
		from compare_locales.parser.android import AndroidEntity, DocumentWrapper
		from compare_locales.parser.base import Comment, Entity, Junk, Whitespace
		from compare_locales.parser.dtd import DTDEntity
		from compare_locales.parser.fluent import FluentComment, FluentEntity
		from compare_locales.parser.properties import PropertiesEntity

		if TYPE_CHECKING:
		from collections.abc import Iterable


		def combine_files(
		filename: str,
		primary_content: str \| None,
		alternative_content: str \| None,
		comment_prefix: str,
		include_ids: list[str] \| None = None,
		alternative_suffix: str = "",
		) -> str \| None:
		"""Combine two translation files into one to include all strings from both.
		The primary content is presented first, followed by the alternative content
		at the end with an additional comment.

		:param filename: The filename for the file, determines the format.
		:param primary_content: The primary content for the file, or None if it does
		not exist.
		:param alternative_content: The alternative content for the file, or None if
		it does not exist.
		:param comment_prefix: A comment to include for any strings that are
		appended to the content. This will be placed before any other comments for
		the string.
		:param include_ids: String IDs from `alternative_content` we want to
		include. If this is `None` then we include all strings that do not already
		have a matching ID in `primary_content`.
		:param duplicate_suffix: The suffix to apply to the alternative IDs.

		:returns: The combined content, or None if both given contents are None.
		"""
		if primary_content is None and alternative_content is None:
		return None

		# getParser from compare_locale returns the same instance for the same file
		# extension.
		parser = getParser(filename)

		is_android = filename.endswith(".xml")
		if primary_content is None:
		if is_android:
		# File was deleted, add some document parts.
		content_start = (
		'<?xml version="1.0" encoding="utf-8" standalone="yes"?>\n<resources>\n'
		)
		content_end = "</resources>\n"
		else:
		# Treat as an empty file.
		content_start = ""
		content_end = ""
		existing_keys = []
		else:
		parser.readUnicode(primary_content)

		# Start with the same content as the current file.
		# For android strings, we want to keep the final "</resources>" until after.
		if is_android:
		closing_match = re.match(
		r"^(.)(</resources>\s)$", parser.ctx.contents, re.DOTALL
		)
		if not closing_match:
		raise ValueError("Missing a final </resources>")
		content_start = closing_match.group(1)
		content_end = closing_match.group(2)
		else:
		content_start = parser.ctx.contents
		content_end = ""
		existing_keys = [entry.key for entry in parser.walk(only_localizable=True)]

		# For Fluent, we want to prefix the strings using GroupComments.
		# On weblate this will cause all the strings that fall under the GroupComment's
		# scope to have the prefix added to their "notes".
		# We set up an initial GroupComment for the first string we find. This will also
		# end the scope of the last GroupComment in the new translation file.
		# This will be replaced with a the next GroupComment when it is found.
		fluent_group_comment_prefix = f"\n## {comment_prefix}\n"
		fluent_group_comment: str \| None = fluent_group_comment_prefix

		# For other formats, we want to keep all the comment lines that come directly
		# before the string.
		# In compare_locales.parser, only the comment line directly before an Entity
		# counts as the pre_comment for that Entity. I.e. only this line will be
		# included in Entity.all
		# However, in weblate every comment line that comes before the Entity is
		# included as a comment. So we also want to keep these additional comments to
		# preserve them for weblate.
		# We gather these extra comments in stacked_comments, and clear them whenever we
		# reach an Entity or a blank line (Whitespace is more than "\n").
		stacked_comments: list[str] = []

		additions: list[str] = []

		entry_iter: Iterable[Any] = ()
		# If the file does not exist in the old branch, don't make any additions.
		if alternative_content is not None:
		parser.readUnicode(alternative_content)
		entry_iter = parser.walk(only_localizable=False)
		for entry in entry_iter:
		if isinstance(entry, Junk):
		raise ValueError(f"Unexpected Junk: {entry.all}")
		if isinstance(entry, Whitespace):
		# Clear stacked comments if more than one empty line.
		if entry.all != "\n":
		stacked_comments.clear()
		continue
		if isinstance(entry, Comment):
		if isinstance(entry, FluentComment):
		# Don't stack Fluent comments.
		# Only the comments included in Entity.pre_comment count towards
		# that Entity's comment.
		if entry.all.startswith("##"):
		# A Fluent GroupComment
		if entry.all == "##":
		# Empty GroupComment. Used to end the scope of a previous
		# GroupComment.
		# Replace this with our prefix comment.
		fluent_group_comment = fluent_group_comment_prefix
		else:
		# Prefix the group comment.
		fluent_group_comment = (
		f"{fluent_group_comment_prefix}{entry.all}\n"
		)
		else:
		stacked_comments.append(entry.all)
		continue
		if isinstance(entry, DocumentWrapper):
		# Not needed.
		continue

		if not isinstance(entry, Entity):
		raise ValueError(f"Unexpected type: {entry.__class__.__name__}")

		if include_ids is None:
		# We include the entry if it is not already included.
		include_entry = entry.key not in existing_keys
		else:
		# We include the entry if it is in our list.
		include_entry = entry.key in include_ids
		if not include_entry:
		# Drop the gathered comments for this Entity.
		stacked_comments.clear()
		continue

		if isinstance(entry, FluentEntity):
		id_regex = rf"^({re.escape(entry.key)})( *=)"
		if fluent_group_comment is not None:
		# We have a found GroupComment which has not been included yet.
		# All following Entity's will be under its scope, until the next
		# GroupComment.
		additions.append(fluent_group_comment)
		# Added GroupComment, so don't need to add again.
		fluent_group_comment = None
		elif isinstance(entry, DTDEntity):
		id_regex = rf"^(\s<!ENTITY\s{re.escape(entry.key)})(\s)"
		# Include our additional comment before we print the rest for this
		# Entity.
		additions.append(f"<!-- LOCALIZATION NOTE: {comment_prefix} -->")
		elif isinstance(entry, PropertiesEntity):
		id_regex = rf"^({re.escape(entry.key)})( *=)"
		additions.append(f"# {comment_prefix}")
		elif isinstance(entry, AndroidEntity):
		id_regex = rf'^(\s<string\s[^>]name="{re.escape(entry.key)})(")'
		additions.append(f"<!-- {comment_prefix} -->")
		else:
		raise ValueError(f"Unexpected Entity type: {entry.__class__.__name__}")

		# Add any other comment lines that came directly before this Entity.
		additions.extend(stacked_comments)
		stacked_comments.clear()
		entry_content = entry.all
		if alternative_suffix:
		# NOTE: compare_locales does not allow us to set the entry.key
		# value. Instead we use a regular expression to append the suffix to
		# the expected key.
		entry_content, count = re.subn(
		id_regex, rf"\1{alternative_suffix}\2", entry_content, flags=re.M
		)
		if count != 1:
		raise ValueError(f"Failed to substitute the ID for {entry.key}")
		additions.append(entry_content)

		content_middle = ""

		if additions:
		# New line before and after the additions
		additions.insert(0, "")
		additions.append("")
		if is_android:
		content_middle = "\n ".join(additions)
		else:
		content_middle = "\n".join(additions)

		# Remove " " in otherwise blank lines.
		content_middle = re.sub("^ +$", "", content_middle, flags=re.MULTILINE)

		return content_start + content_middle + content_end

tools/base_browser/l10n/combine/tests/init.py

0 → 100644

+0 −0

Empty file added.

tools/base_browser/l10n/combine/tests/python.toml

0 → 100644

+10 −0

Original line number	Diff line number	Diff line
		[DEFAULT]
		subsuite = "base-browser"

		["test_android.py"]

		["test_dtd.py"]

		["test_fluent.py"]

		["test_properties.py"]