Bug 42305: Add script to combine translation files across versions. (2a1333dd) · Commits · The Tor Project / Applications / Tor Browser

tools/torbrowser/l10n/combine-translation-versions.py

0 → 100644

+208 −0

Original line number	Diff line number	Diff line
		import argparse
		import json
		import logging
		import os
		import re
		import subprocess

		from combine import combine_files

		# Command-line interface. All three arguments are positional and required,
		# in this order: <current-branch> <filenames> <json>.
		arg_parser = argparse.ArgumentParser(
		description="Combine a translation file across two different versions"
		)

		# The branch for the newest version. Its name is parsed further down by
		# get_version_from_branch_name to obtain the branch prefix and version.
		arg_parser.add_argument(
		"current_branch", metavar="<current-branch>", help="branch for the newest version"
		)
		# A single string argument. The main loop at the bottom of this script splits
		# it on spaces, and each non-empty part on its first ":", giving a
		# (translation branch, file name) pair.
		arg_parser.add_argument(
		"filenames", metavar="<filenames>", help="name of the translation files"
		)
		# Path that the resulting JSON document is written to.
		arg_parser.add_argument("outname", metavar="<json>", help="name of the json output")

		# Arguments are parsed at module level, i.e. as soon as the script is run.
		args = arg_parser.parse_args()

		# Module-wide logger, used by the git helpers and the main loop. INFO level is
		# enabled so that the "Running: git ..." messages are emitted.
		logging.basicConfig()
		logger = logging.getLogger("combine-translation-versions")
		logger.setLevel(logging.INFO)


		def in_pink(msg: str) -> str:
		    """Wrap a message in the terminal codes that render it bold and pink.

		    :param msg: The text to colour.
		    :returns: The text preceded by the bold pink escape sequence and followed
		      by the reset sequence.
		    """
		    # SGR sequence: bold (1) with 256-colour foreground 212 (pink).
		    bold_pink = "\x1b[1;38;5;212m"
		    # SGR reset, so that later terminal output is unaffected.
		    reset = "\x1b[0m"
		    return f"{bold_pink}{msg}{reset}"


		def git_run(git_args: list[str]) -> None:
		    """Execute a git command, raising if it exits with a non-zero status.

		    The command's stdout and stderr are not captured.

		    :param git_args: The arguments that should follow "git".
		    """
		    command = ["git", *git_args]
		    # Log the command first, so that any git stderr showing up in the log has
		    # some context.
		    shown_command = "git " + " ".join(git_args)
		    logger.info("Running: " + in_pink(shown_command))
		    subprocess.run(command, check=True)


		def git_text(git_args: list[str]) -> str:
		    """Execute a git command and hand back what it wrote to stdout.

		    The command raises if git exits with a non-zero status.

		    :param git_args: The arguments that should follow "git".
		    :returns: The stdout of the command, decoded as text.
		    """
		    shown_command = "git " + " ".join(git_args)
		    logger.info("Running: " + in_pink(shown_command))
		    # Only stdout is piped; stderr is left alone.
		    completed = subprocess.run(
		        ["git", *git_args],
		        stdout=subprocess.PIPE,
		        check=True,
		        text=True,
		    )
		    return completed.stdout


		def git_lines(git_args: list[str]) -> list[str]:
		    """Execute a git command and split its stdout into lines.

		    :param git_args: The arguments that should follow "git".
		    :returns: The lines of stdout, split on "\\n", with empty lines dropped.
		    """
		    output = git_text(git_args)
		    # filter(None, ...) keeps only the non-empty strings.
		    return list(filter(None, output.split("\n")))


		def git_file_paths(git_ref: str) -> list[str]:
		    """List every file path found, recursively, under the given tree.

		    :param git_ref: The git reference for the tree to list.
		    :returns: One path per entry reported by "git ls-tree -r".
		    """
		    # The format string restricts each output line to the entry's path.
		    ls_tree_args = ["ls-tree", "-r", "--format=%(path)", git_ref]
		    return git_lines(ls_tree_args)


		def matching_path(search_paths: list[str], filename: str) -> str \| None:
		"""Get the matching file path with the given filename, if it exists.

		:param search_paths: The file paths to search through.
		:param filename: The file name to match.
		:returns: The unique file path with the matching name, or None if no such
		match was found.
		:throws Exception: If multiple paths shared the same file name.
		"""
		matching = [path for path in search_paths if os.path.basename(path) == filename]
		if not matching:
		return None
		if len(matching) > 1:
		raise Exception("Multiple occurrences of {filename}")
		return matching[0]


		def git_file_content(git_ref: str, path: str \| None) -> str \| None:
		"""Get the file content of the specified git blob object.

		:param git_ref: The reference for the tree to find the file under.
		:param path: The file path for the object, or None if there is no path.
		:returns: The file content, or None if no path was given.
		"""
		if path is None:
		return None
		return git_text(["cat-file", "blob", f"{git_ref}:{path}"])


		def get_stable_branch(branch_prefix: str) -> str:
		    """Find the most recent stable branch in the origin repository.

		    The "<prefix>-*-build1" tags are fetched from origin and listed from newest
		    to oldest tagger date. The first one whose annotation contains "stable"
		    determines the branch.

		    :param branch_prefix: The prefix that the stable branch should have.
		    :returns: The branch name.
		    :raises Exception: If none of the listed tags is annotated as stable.
		    """
		    tag_glob = f"{branch_prefix}-*-build1"
		    # To speed up, only fetch the tags without blobs.
		    fetch_args = [
		        "fetch",
		        "--depth=1",
		        "--filter=object:type=tag",
		        "origin",
		        "tag",
		        tag_glob,
		    ]
		    git_run(fetch_args)

		    # Each line holds the tag name, then the first line of its annotation.
		    tag_lines = git_lines(["tag", "-n1", "--list", tag_glob, "--sort=-taggerdate"])
		    for tag_line in tag_lines:
		        build_tag, annotation = tag_line.split(" ", 1)
		        if "stable" not in annotation:
		            continue
		        # Branch name is the same as the tag, minus "-build1".
		        return re.sub(r"-build1$", "", build_tag)
		    raise Exception("No stable build1 tag found")


		def get_version_from_branch_name(branch_name: str) -> tuple[str, float]:
		    """Split a branch name into its prefix and its version number.

		    The name is expected to start with "<prefix>-<part>-<version>-", where
		    the prefix is made of lower case letters and dashes, the middle part
		    contains no dashes, and the version is digits followed by ".0" or ".5".

		    :param branch_name: The branch to extract from.
		    :returns: The branch prefix and its version number.
		    :raises ValueError: If the branch name does not have the expected form.
		    """
		    parsed = re.match(r"([a-z-]+)-[^-]*-([0-9]+\.[05])-", branch_name)
		    if parsed is None:
		        raise ValueError(f"Unable to parse the version from the branch {branch_name}")

		    prefix, version_text = parsed.groups()
		    return (prefix, float(version_text))


		# Work out which stable branch the current branch should be combined with.
		branch_prefix, current_version = get_version_from_branch_name(args.current_branch)

		stable_branch = get_stable_branch(branch_prefix)
		_, stable_version = get_version_from_branch_name(stable_branch)

		# The stable version must be the same as the current version, or at most 0.5
		# behind it.
		if not (current_version - 0.5 <= stable_version <= current_version):
		    raise Exception(
		        f"Version of stable branch {stable_branch} is not within 0.5 of the "
		        f"current branch {args.current_branch}"
		    )

		# Minimal fetch of stable_branch.
		# Individual file blobs will be downloaded as needed.
		git_run(["fetch", "--depth=1", "--filter=blob:none", "origin", stable_branch])

		stable_ref = f"origin/{stable_branch}"
		current_file_paths = git_file_paths("HEAD")
		old_file_paths = git_file_paths(stable_ref)

		# Details about the commit are read from the environment, when available.
		ci_commit = os.environ.get("CI_COMMIT_SHA", "")
		ci_url_base = os.environ.get("CI_PROJECT_URL", "")
		if ci_commit and ci_url_base:
		    commit_url = f"{ci_url_base}/-/commit/{ci_commit}"
		else:
		    # Cannot build a link without both parts.
		    commit_url = ""

		json_data = {
		    "commit": ci_commit,
		    "commit-url": commit_url,
		    "project-path": os.environ.get("CI_PROJECT_PATH", ""),
		    "current-branch": args.current_branch,
		    "stable-branch": stable_branch,
		    "files": [],
		}

		# The filenames argument is a space separated list of
		# "<translation-branch>:<file-name>" entries.
		for part in args.filenames.split(" "):
		    file_spec = part.strip()
		    if not file_spec:
		        continue
		    translation_branch, name = file_spec.split(":", 1)

		    current_path = matching_path(current_file_paths, name)
		    old_path = matching_path(old_file_paths, name)

		    if current_path is None:
		        if old_path is None:
		            # No file in either branch.
		            logger.warning(
		                f"{name} does not exist in either the current or stable branch"
		            )
		        else:
		            logger.warning(f"{name} deleted in the current branch")
		    elif old_path is None:
		        logger.warning(f"{name} does not exist in the stable branch")

		    # A missing path yields None content, which combine_files accepts.
		    current_content = git_file_content("HEAD", current_path)
		    old_content = git_file_content(stable_ref, old_path)
		    content = combine_files(
		        name,
		        current_content,
		        old_content,
		        f"Will be unused in Tor Browser {current_version}!",
		    )
		    json_data["files"].append(
		        {"name": name, "branch": translation_branch, "content": content}
		    )

		with open(args.outname, "w") as out_file:
		    json.dump(json_data, out_file)

tools/torbrowser/l10n/combine/__init__.py

0 → 100644

+3 −0

Original line number	Diff line number	Diff line
		# flake8: noqa

		from .combine import combine_files

tools/torbrowser/l10n/combine/combine.py

0 → 100644

+181 −0

Original line number	Diff line number	Diff line
		import re
		from typing import TYPE_CHECKING, Any

		from compare_locales.parser import getParser
		from compare_locales.parser.android import AndroidEntity, DocumentWrapper
		from compare_locales.parser.base import Comment, Entity, Junk, Whitespace
		from compare_locales.parser.dtd import DTDEntity
		from compare_locales.parser.fluent import FluentComment, FluentEntity
		from compare_locales.parser.properties import PropertiesEntity

		if TYPE_CHECKING:
		from collections.abc import Iterable


		def combine_files(
		filename: str,
		new_content: str \| None,
		old_content: str \| None,
		comment_prefix: str,
		) -> str \| None:
		"""Combine two translation files into one to include all strings from both.
		The new content is presented first, and any strings only found in the old
		content are placed at the end with an additional comment.

		:param filename: The filename for the file, determines the format.
		:param new_content: The new content for the file, or None if it has been
		deleted.
		:param old_content: The old content for the file, or None if it did not
		exist before.
		:comment_prefix: A comment to include for any strings that are only found in
		the old content. This will be placed before any other comments for the
		string.

		:returns: The combined content, or None if both given contents are None.
		"""
		if new_content is None and old_content is None:
		return None

		# getParser from compare_locale returns the same instance for the same file
		# extension.
		parser = getParser(filename)

		is_android = filename.endswith(".xml")
		if new_content is None:
		if is_android:
		# File was deleted, add some document parts.
		content_start = (
		'<?xml version="1.0" encoding="utf-8" standalone="yes"?>\n<resources>\n'
		)
		content_end = "</resources>\n"
		else:
		# Treat as an empty file.
		content_start = ""
		content_end = ""
		existing_keys = []
		else:
		parser.readUnicode(new_content)

		# Start with the same content as the current file.
		# For android strings, we want to keep the final "</resources>" until after.
		if is_android:
		closing_match = re.match(
		r"^(.)(</resources>\s)$", parser.ctx.contents, re.DOTALL
		)
		if not closing_match:
		raise ValueError("Missing a final </resources>")
		content_start = closing_match.group(1)
		content_end = closing_match.group(2)
		else:
		content_start = parser.ctx.contents
		content_end = ""
		existing_keys = [entry.key for entry in parser.walk(only_localizable=True)]

		# For Fluent, we want to prefix the strings using GroupComments.
		# On weblate this will cause all the strings that fall under the GroupComment's
		# scope to have the prefix added to their "notes".
		# We set up an initial GroupComment for the first string we find. This will also
		# end the scope of the last GroupComment in the new translation file.
		# This will be replaced with a the next GroupComment when it is found.
		fluent_group_comment_prefix = f"\n## {comment_prefix}\n"
		fluent_group_comment: str \| None = fluent_group_comment_prefix

		# For other formats, we want to keep all the comment lines that come directly
		# before the string.
		# In compare_locales.parser, only the comment line directly before an Entity
		# counts as the pre_comment for that Entity. I.e. only this line will be
		# included in Entity.all
		# However, in weblate every comment line that comes before the Entity is
		# included as a comment. So we also want to keep these additional comments to
		# preserve them for weblate.
		# We gather these extra comments in stacked_comments, and clear them whenever we
		# reach an Entity or a blank line (Whitespace is more than "\n").
		stacked_comments: list[str] = []

		additions: list[str] = []

		entry_iter: Iterable[Any] = ()
		# If the file does not exist in the old branch, don't make any additions.
		if old_content is not None:
		parser.readUnicode(old_content)
		entry_iter = parser.walk(only_localizable=False)
		for entry in entry_iter:
		if isinstance(entry, Junk):
		raise ValueError(f"Unexpected Junk: {entry.all}")
		if isinstance(entry, Whitespace):
		# Clear stacked comments if more than one empty line.
		if entry.all != "\n":
		stacked_comments.clear()
		continue
		if isinstance(entry, Comment):
		if isinstance(entry, FluentComment):
		# Don't stack Fluent comments.
		# Only the comments included in Entity.pre_comment count towards
		# that Entity's comment.
		if entry.all.startswith("##"):
		# A Fluent GroupComment
		if entry.all == "##":
		# Empty GroupComment. Used to end the scope of a previous
		# GroupComment.
		# Replace this with our prefix comment.
		fluent_group_comment = fluent_group_comment_prefix
		else:
		# Prefix the group comment.
		fluent_group_comment = (
		f"{fluent_group_comment_prefix}{entry.all}\n"
		)
		else:
		stacked_comments.append(entry.all)
		continue
		if isinstance(entry, DocumentWrapper):
		# Not needed.
		continue

		if not isinstance(entry, Entity):
		raise ValueError(f"Unexpected type: {entry.__class__.__name__}")

		if entry.key in existing_keys:
		# Already included this string in the new translation file.
		# Drop the gathered comments for this Entity.
		stacked_comments.clear()
		continue

		if isinstance(entry, FluentEntity):
		if fluent_group_comment is not None:
		# We have a found GroupComment which has not been included yet.
		# All following Entity's will be under its scope, until the next
		# GroupComment.
		additions.append(fluent_group_comment)
		# Added GroupComment, so don't need to add again.
		fluent_group_comment = None
		elif isinstance(entry, DTDEntity):
		# Include our additional comment before we print the rest for this
		# Entity.
		additions.append(f"<!-- LOCALIZATION NOTE: {comment_prefix} -->")
		elif isinstance(entry, PropertiesEntity):
		additions.append(f"# {comment_prefix}")
		elif isinstance(entry, AndroidEntity):
		additions.append(f"<!-- {comment_prefix} -->")
		else:
		raise ValueError(f"Unexpected Entity type: {entry.__class__.__name__}")

		# Add any other comment lines that came directly before this Entity.
		additions.extend(stacked_comments)
		stacked_comments.clear()
		additions.append(entry.all)

		content_middle = ""

		if additions:
		# New line before and after the additions
		additions.insert(0, "")
		additions.append("")
		if is_android:
		content_middle = "\n ".join(additions)
		else:
		content_middle = "\n".join(additions)

		# Remove " " in otherwise blank lines.
		content_middle = re.sub("^ +$", "", content_middle, flags=re.MULTILINE)

		return content_start + content_middle + content_end

tools/torbrowser/l10n/combine/tests/README

0 → 100644

+2 −0

Original line number	Diff line number	Diff line
		Python tests, to be run with pytest.
		Requires the compare-locales package.

tools/torbrowser/l10n/combine/tests/__init__.py

0 → 100644

+0 −0

Empty file added.