#!/usr/bin/python3

# Copyright (C) 2021 Antoine Beaupré <anarcat@debian.org>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""this script will:

 1. periodically check a Nextcloud (WebDAV?) folder (called
    dangerzone) for new files

 2. when a file is found, move it to a dangerzone/processing folder as
    an ad-hoc locking mechanism

 3. download the file locally

 4. process the file with the dangerzone-converter

 5. on failure, delete the failed file locally, and move it to a
    dangerzone/rejected folder remotely

 6. on success, upload the sanitized file to a safe/ folder, move the
    original to dangerzone/processed
"""

import argparse
import logging
import os
from os.path import join
import re
import shutil
import subprocess
import sys
import tempfile

# webdav3 is the only third-party dependency of this script
try:
    import webdav3.client as wc
    from webdav3.exceptions import RemoteResourceNotFound, ResponseErrorCode
except ImportError:
    # NOTE(review): this only prints an installation hint and lets
    # execution continue; the module then fails later in this file, when
    # `wc.Client` is used as a base class, because `wc` is not defined.
    print(
        "cannot find webdav.client, try `apt install python3-webdavclient` or `pip install webdavclient3`",
        file=sys.stderr,
    )


class BatchArgParser(argparse.ArgumentParser):
    """argument parser with the options of the batch processor

    All options are registered in the constructor, so the parser is
    ready to use right after instantiation:

        args = BatchArgParser().parse_args()

    The WebDAV location, username and password default to the
    WEBDAV_LOCATION, WEBDAV_USERNAME and WEBDAV_PASSWORD environment
    variables, which are read when the parser is created.
    """

    def __init__(self, *args, **kwargs):
        """create the parser and register all options

        Positional and keyword arguments are passed unchanged to
        `argparse.ArgumentParser`.
        """
        super().__init__(*args, **kwargs)
        self.add_argument(
            "--image",
            default="flmcode/dangerzone",
            help="Docker image to use, default %(default)s",
        )
        self.add_argument(
            "-v",
            "--verbose",
            action=LoggingAction,
            const="INFO",
            help="enable verbose messages",
        )
        self.add_argument(
            "-d",
            "--debug",
            action=LoggingAction,
            const="DEBUG",
            help="enable debugging messages",
        )
        self.add_argument(
            "-n",
            "--dryrun",
            action="store_true",
            help="simulate",
        )
        # argparse does not consider `default` when checking a required
        # option: with required=True the environment variable would
        # never be enough. Only require the option on the command line
        # when the environment does not provide a value.
        location = os.environ.get("WEBDAV_LOCATION")
        self.add_argument(
            "-l",
            "--location",
            required=location is None,
            help="WebDAV URI, default: %(default)s, from WEBDAV_LOCATION environment",
            default=location,
        )
        username = os.environ.get("WEBDAV_USERNAME")
        self.add_argument(
            "-u",
            "--username",
            required=username is None,
            help="WebDAV username, default: %(default)s, from WEBDAV_USERNAME environment",
            default=username,
        )
        # not required: the caller is expected to prompt when this is None
        self.add_argument(
            "-p",
            "--password",
            help="WebDAV password, default: prompted or taken from WEBDAV_PASSWORD environment",
            default=os.environ.get("WEBDAV_PASSWORD"),
        )


class Sanitizer:
    """actually sanitize files with Docker

    This is designed to keep the existing folder structure: a file
    found in a sub-directory of a sanitized directory is written to the
    matching sub-directory of `safe_dir`. Sample use:

        sanitizer = Sanitizer(safe_dir, image, verbose, dryrun)
        if os.path.isdir(args.document):
            sanitizer.sanitize_dir(args.document)
        else:
            sanitizer.sanitize_file(args.document)
    """

    class ParseError(Exception):
        """the page count could not be found in the stage 1 output"""

    def __init__(self, safe_dir, image, verbose, dryrun):
        """setup the sanitizer

        :param safe_dir: local directory receiving the sanitized files
        :param image: Docker image, passed to the `DockerRunner`
        :param verbose: passed to the `DockerRunner` as `cmd_output`
        :param dryrun: when true, no Docker command is actually run
        """
        self.safe_dir = safe_dir
        self.dryrun = dryrun
        self.runner = DockerRunner(image=image, cmd_output=verbose, dryrun=dryrun)

    def sanitize_dir(self, path):
        """sanitize every file found under the directory `path`

        The directory is walked recursively. Sanitized files are
        written under `safe_dir`, in a directory named after `path`
        itself, with the same relative layout as the source.
        """
        # strip trailing slashes so that dirname/relpath work on the
        # directory itself (but keep a bare "/" usable)
        top = path.rstrip("/") or path
        parent = os.path.dirname(top) or os.curdir
        # os.walk already descends into sub-directories, no manual
        # recursion is needed
        for root, _dirs, files in os.walk(top):
            # path of this directory relative to the parent of `top`:
            # the name of `top` itself, followed by any sub-directories
            safe_dir = join(self.safe_dir, os.path.relpath(root, parent))
            logging.info(
                "processing %d files in dir %s to safe_dir: %s",
                len(files),
                root,
                safe_dir,
            )
            for file in files:
                self.sanitize_file(join(root, file), safe_dir=safe_dir)

    def sanitize_file(self, path, safe_dir=None):
        """sanitize the single file `path`

        The result is written in `safe_dir` (default: the `safe_dir`
        given to the constructor), under the same file name as `path`.

        :raises Sanitizer.ParseError: if the number of pages cannot be
            found in the output of the first stage
        """
        if safe_dir is None:
            safe_dir = self.safe_dir
        os.makedirs(safe_dir, exist_ok=True)
        logging.info("sanitizing file %s into %s", path, safe_dir)
        # stage 1: convert the document to pixels, inside the container
        container_id, output = self.runner.run(
            docker_args=["--volume", os.path.abspath(path) + ":/tmp/input_file"],
            args=["document-to-pixels-unpriv"],
        )
        if self.dryrun:
            # nothing was run: there is no output to parse and no
            # container to copy files from, so stop here instead of
            # reporting a bogus parse error
            return

        logging.info("stage 1 completed in container %s", container_id)
        m = re.search(rb"Document has (\d+) pages", output)
        if not m:
            raise Sanitizer.ParseError("failed to find page numbers in %r" % output)
        pages = int(m.group(1))
        logging.info("generated %d pages", pages)

        with tempfile.TemporaryDirectory() as pixel_dir:
            # fetch the three files of each page from the stage 1 container
            for page in range(1, pages + 1):
                for kind in ("rgb", "width", "height"):
                    self.runner.cp(f"{container_id}:/tmp/page-{page}.{kind}", pixel_dir)
            self.runner.rm(container_id)
            # make temp dir readable by all, so the container can also
            # read it XXX: this is bad, but it's necessary because
            # it's running as a different user.
            os.chmod(pixel_dir, 0o755)
            # stage 2: convert the pixels back to a PDF, in a new container
            container_id, _ = self.runner.run(
                # -e OCR="$OCR" -e OCR_LANGUAGE="$OCR_LANG"
                docker_args=["--volume", f"{pixel_dir}:/dangerzone"],
                args=["pixels-to-pdf-unpriv"],
            )

        logging.info("stage 2 completed in container %s", container_id)
        self.runner.cp(
            f"{container_id}:/tmp/safe-output-compressed.pdf",
            join(safe_dir, os.path.basename(path)),
        )
        self.runner.rm(container_id)


class LoggingAction(argparse.Action):
    """change log level on the fly

    The logging system should be initialized before this, using
    `basicConfig`.

    The selected level name is also stored in the parsed namespace,
    under the destination of the option, so callers can check whether
    the option was given (it stays at its default otherwise).

    Example usage:

    parser.add_argument(
        "-v",
        "--verbose",
        action=LoggingAction,
        const="INFO",
        help="enable verbose messages",
    )
    parser.add_argument(
        "-d",
        "--debug",
        action=LoggingAction,
        const="DEBUG",
        help="enable debugging messages",
    )
    """

    def __init__(self, *args, **kwargs):
        """setup the action parameters

        This enforces a selection of logging levels. It also checks if
        const is provided, in which case we assume it's an argument
        like `--verbose` or `--debug` without an argument.
        """
        # XXX: _nameToLevel is a private mapping of the logging module
        kwargs["choices"] = tuple(logging._nameToLevel)
        if "const" in kwargs:
            kwargs["nargs"] = 0
        super().__init__(*args, **kwargs)

    def __call__(self, parser, ns, values, option):
        """if const was specified it means argument-less parameters"""
        level = self.const if self.const else values
        logging.getLogger("").setLevel(level)
        # record the choice: without this the namespace attribute never
        # changes and the option looks unset to the rest of the program
        setattr(ns, self.dest, level)


class DockerRunner(object):
    """convenience class to call Docker always the same way

    :param image: default Docker image used by `run`
    :param cmd_output: whether the caller wants the command output shown
    :param dryrun: when true, commands are only logged, never executed
    """

    # options added to every `docker run` command line
    DOCKER_HARDENING = ("--network", "none", "--security-opt=no-new-privileges:true")

    def __init__(self, image, cmd_output, dryrun):
        self.image = image
        self.cmd_output = cmd_output
        self.dryrun = dryrun

    def run(self, docker_args=(), image=None, args=()):
        """run a command in a new container

        :param docker_args: extra options for `docker run`, placed
            before the image name
        :param image: image to run, defaults to the image given to the
            constructor
        :param args: command and arguments, placed after the image name
        :returns: a `(container_id, output)` tuple, where `output` is
            the standard output of the command, as bytes. In dry run
            mode, this is `(None, b"")`.

        A non-zero exit status of the command is logged as an error,
        it does not raise an exception.
        """
        if image is None:
            image = self.image
        with tempfile.TemporaryDirectory() as tmpdir:
            cmd = [
                "docker",
                "run",
                "-it",
                f"--cidfile={tmpdir}/cidfile",  # to get the container ID
            ]
            cmd += docker_args
            cmd += self.DOCKER_HARDENING
            cmd += [image]
            cmd += args
            output = b""
            container_id = None
            if self.dryrun:
                logging.info("dry run, not running: %s", cmd)
            else:
                logging.debug("running command: %s", cmd)
                chunks = []
                # the context manager closes the pipe once we are done
                with subprocess.Popen(cmd, stdout=subprocess.PIPE) as p:
                    for line in p.stdout:
                        # NOTE(review): the output is echoed regardless
                        # of self.cmd_output (this used to be guarded by
                        # `cmd_output or True`, which is always true).
                        # errors="replace": the output is only decoded
                        # for display, invalid UTF-8 should not abort
                        # the run.
                        print(line.decode("utf-8", errors="replace"), end="")
                        chunks.append(line)
                    returncode = p.wait()
                output = b"".join(chunks)
                if returncode != 0:
                    logging.error("failed to run docker command: %s", cmd)
                # the cidfile must be read before tmpdir is removed
                with open(f"{tmpdir}/cidfile") as fp:
                    container_id = fp.read().strip()
        return container_id, output

    def cp(self, source, target):
        """copy `source` to `target` with `docker cp`

        A failure of the command is logged as a warning, it does not
        raise an exception. Nothing is run in dry run mode.
        """
        cmd = (
            "docker",
            "cp",
            source,
            target,
        )
        if self.dryrun:
            logging.debug("would run: %s", cmd)
            return
        logging.debug("running: %s", cmd)
        try:
            subprocess.check_call(cmd)
        except subprocess.CalledProcessError as e:
            logging.warning("failed to copy file: %s", e)

    def rm(self, container_id):
        """remove the container `container_id` with `docker rm`

        :raises subprocess.CalledProcessError: if the command fails

        Nothing is run in dry run mode.
        """
        cmd = ("docker", "rm", container_id)
        if self.dryrun:
            logging.debug("would run: %s", cmd)
            return
        logging.debug("running: %s", cmd)
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL)


# NOTE: a SPECIAL_FOLDERS entry must *not* carry a leading or trailing
# slash: the `folder` argument is stripped of them before the lookup.
SPECIAL_FOLDERS = (
    # legacy names, which might eventually get removed
    "dangerzone-processed",
    "dangerzone-processing",
    "dangerzone-rejected",
    # names actually in use
    "dangerzone",
    "safe",
)


def is_special_folder(folder: str) -> bool:
    """Tell whether `folder`, surrounding slashes ignored, is in `SPECIAL_FOLDERS`"""
    normalized = folder.strip("/")
    return any(normalized == special for special in SPECIAL_FOLDERS)


def main():
    """parse the command line and process the remote folders once

    Every entry found inside each top-level remote folder is handed to
    `ProcessingClient.process_path`. Sanitized files are staged in a
    temporary local directory, which is emptied after each entry.
    """
    # only needed for the interactive prompt below
    import getpass

    logging.basicConfig(format="%(message)s")
    args = BatchArgParser().parse_args()
    if args.password is None:
        # unlike input(), getpass does not echo the password on the terminal
        args.password = getpass.getpass("Password:")
    options = {
        "webdav_hostname": args.location,
        "webdav_login": args.username,
        "webdav_password": args.password,
    }
    client = ProcessingClient(options)
    logging.info("authenticated with webdav %s", args.location)

    with tempfile.TemporaryDirectory() as tmpdir:
        safe_dir = join(tmpdir, "safe")
        # the client reads those two attributes in process_path
        client.sanitizer = Sanitizer(
            safe_dir, args.image, args.verbose or args.debug, dryrun=args.dryrun
        )
        client.dryrun = args.dryrun

        # 1. periodically check a Nextcloud (WebDAV?) folder (called
        #    dangerzone) for new files
        paths = client.list()
        logging.debug("top level paths: %s", paths)
        # the first entry of each listing is skipped, presumably
        # because it is the listed folder itself
        for folder in paths[1:]:
            logging.debug("found path %s", folder)
            if not folder.endswith("/"):
                # not a folder, nothing to look into
                continue
            subpaths = client.list(folder)
            logging.debug("subpaths: %s", subpaths)
            for path in subpaths[1:]:
                client.process_path(folder, path)
                # start the next entry with an empty safe dir
                try:
                    shutil.rmtree(safe_dir)
                except FileNotFoundError:
                    pass
                assert (
                    not os.path.exists(safe_dir) or len(os.listdir(safe_dir)) == 0
                ), "safe dir should be cleaned between invocations"


class ProcessingClient(wc.Client):
    """WebDAV client taking one remote entry through the whole workflow

    Two attributes must be assigned by the caller before
    `process_path` is used: `sanitizer` (a `Sanitizer`) and `dryrun`
    (when true, nothing is changed remotely and nothing is downloaded).
    """

    def _move_unless_exists(self, source, target):
        """move remote `source` to `target`, return False if refused with a 412

        The server answers 412 when `target` already exists, see
        https://datatracker.ietf.org/doc/html/rfc7232#section-4.2.
        Any other error code is raised again, as the move did not
        happen for a reason we do not know how to handle.
        """
        try:
            self.move(remote_path_from=source, remote_path_to=target)
        except ResponseErrorCode as e:
            if e.code != 412:
                logging.error("failed to move %s to %s: %s", source, target, e)
                raise
            return False
        return True

    def process_path(self, folder, path):
        """sanitize the remote entry `path` found in the remote `folder`

        Special folders, missing resources and empty folders are
        skipped. Otherwise the entry is moved to
        `dangerzone/processing`, downloaded and sanitized. On success
        the result is uploaded to `safe/` and the original moved to
        `dangerzone/processed`; if the sanitizer cannot parse the
        conversion output, the original is moved to
        `dangerzone/rejected`.

        :raises ResponseErrorCode: when a remote move fails with
            anything else than a 412
        """
        if is_special_folder(path):
            logging.debug("found special folder %s, skipping", path)
            return

        # 2. when a file is found, move it to a dangerzone/processing folder as
        #    an ad-hoc locking mechanism
        try:
            listing = self.list(join(folder, path))
        except RemoteResourceNotFound as e:
            # XXX: Nextcloud has this weird thing where it shows the
            # actual folder we're listing as the first element, but
            # then WebDAV raises an exception if we actually try to
            # list it. this handles the exception and skips the item,
            # assuming that is the scenario.
            logging.info("skipping non-existent resource %s/%s: %s", folder, path, e)
            return

        logging.info("sanitizing %s %s", folder, path)
        # only non-empty folders and regular files are processed
        if len(listing) <= 1 and path.endswith("/"):
            return

        remote_processing_path = join(folder, "dangerzone/processing", path)
        logging.info("moving %s to %s before dangerzone/processing", path, remote_processing_path)
        if not self.dryrun:
            self.mkdir(join(folder, "dangerzone"))
            self.mkdir(join(folder, "dangerzone/processing"))
            if not self._move_unless_exists(join(folder, path), remote_processing_path):
                logging.warning("file already being processed, skipping")
                return
        with tempfile.TemporaryDirectory() as tmpdir:
            # TODO: sanitize path for local use
            local_path = join(tmpdir, "danger", path)
            # 3. download the file locally
            logging.info("downloading %s to %s", remote_processing_path, local_path)
            if not self.dryrun:
                os.mkdir(join(tmpdir, "danger"))
                self.download_sync(
                    remote_path=remote_processing_path, local_path=local_path
                )

            # 4. process the file with the dangerzone-converter
            try:
                if os.path.isdir(local_path):
                    self.sanitizer.sanitize_dir(local_path)
                else:
                    self.sanitizer.sanitize_file(local_path)
            except Sanitizer.ParseError:
                # TODO: error handling:
                # 5. on failure, delete the failed file locally, and
                #    move it to a dangerzone/rejected folder remotely
                #
                # the target must live in the folder created below
                remote_rejected_path = join(folder, "dangerzone/rejected", path)
                logging.warning(
                    "cannot process %s (%s), moving to %s",
                    path,
                    local_path,
                    remote_rejected_path,
                )
                if not self.dryrun:
                    self.mkdir(join(folder, "dangerzone/rejected"))
                    if not self._move_unless_exists(
                        remote_processing_path, remote_rejected_path
                    ):
                        # rejected already exists, fall back to
                        # delete the processing version altogether
                        #
                        # XXX: we actually lose data here which
                        # isn't nice. maybe we should find a unique
                        # filename instead?
                        self.clean(remote_processing_path)
                return

            # 6. on success, upload the sanitized file to a safe/
            #    folder, move the original to dangerzone/processed
            remote_safe_path = join(folder, "safe", path)
            local_safe_path = join(self.sanitizer.safe_dir, path)
            logging.info(
                "uploading %s to %s", local_safe_path, remote_safe_path
            )
            if not self.dryrun:
                # create the parent of the upload target, not a
                # directory at the target path itself
                self.mkdir(join(folder, "safe"))
                self.upload_sync(
                    # does that work when safe/ already has stuff?
                    remote_path=remote_safe_path,
                    local_path=local_safe_path,
                )
            remote_processed_path = join(folder, "dangerzone/processed", path)
            logging.info(
                "renaming %s to %s", remote_processing_path, remote_processed_path,
            )
            if not self.dryrun:
                self.mkdir(join(folder, "dangerzone/processed"))
                if not self._move_unless_exists(
                    remote_processing_path, remote_processed_path
                ):
                    # processed already exists, fall back to delete
                    # the processing version altogether
                    #
                    # XXX: we actually lose data here which isn't
                    # nice. maybe we should find a unique filename
                    # instead?
                    self.clean(remote_processing_path)


if __name__ == "__main__":
    main()
