Verified Commit 444dc43e authored by anarcat

process folders on webdav

parent 042cacc6
@@ -23,6 +23,7 @@ import argparse
 import logging
 import os
 import re
+import shutil
 import subprocess
 import sys
 import tempfile
@@ -99,6 +100,9 @@ class Sanitizer:
         sanitizer.sanitize_file(args.document)
     """
+    class ParseError(Exception):
+        pass
+
     def __init__(self, safe_dir, image, verbose, dryrun):
         self.safe_dir = safe_dir
         self.runner = DockerRunner(image=image, cmd_output=verbose, dryrun=dryrun)
@@ -131,8 +135,7 @@ class Sanitizer:
logging.info("stage 1 completed in container %s", container_id)
m = re.search(rb"Document has (\d+) pages", output)
if not m:
logging.error("failed to find page numbers")
sys.exit(1)
raise Sanitizer.ParseError("failed to find page numbers in %r" % output)
pages = int(m.group(1))
logging.info("generated %d pages", pages)
@@ -268,6 +271,9 @@ class DockerRunner(object):
         subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL)
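 
 
+# folders created or managed by this script; the trailing slashes match
+# the way WebDAV directory listings name folders (see
+# ProcessingClient.process_path below)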
+SPECIAL_FOLDERS = ("dangerzone-processed/", "dangerzone-processing/", "dangerzone-rejected/", "safe/")
+
+
 def main():
     logging.basicConfig(format="%(message)s")
     args = BatchArgParser().parse_args()
@@ -278,30 +284,117 @@ def main():
"webdav_login": args.username,
"webdav_password": args.password,
}
client = wc.Client(options)
client = ProcessingClient(options)
logging.info("authenticated with webdav %s", args.location)
# 1. periodically check a Nextcloud (WebDAV?) folder (called
# dangerzone) for new files
for folder in client.list():
print(folder)
+    with tempfile.TemporaryDirectory() as tmpdir:
+        safe_dir = tmpdir + "/safe/"
+        client.sanitizer = Sanitizer(
+            safe_dir, args.image, args.verbose or args.debug, dryrun=args.dryrun
+        )
+        client.dryrun = args.dryrun
+        # 1. periodically check a Nextcloud (WebDAV?) folder (called
+        # dangerzone) for new files
+        paths = client.list()
+        logging.debug("top level paths: %s", paths)
+        for folder in paths[1:]:
+            logging.debug("found path %s", folder)
+            if folder.endswith("/"):
+                subpaths = client.list(folder)
+                logging.debug("subpaths: %s", subpaths)
+                for path in subpaths[1:]:
+                    client.process_path(folder, path)
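+                    # reset the safe dir so the output of one document
+                    # never leaks into the next upload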
+                    shutil.rmtree(safe_dir)
+                    assert (
+                        not os.path.exists(safe_dir) or len(os.listdir(safe_dir)) == 0
+                    ), "safe dir should be cleaned before invocations"
+class ProcessingClient(wc.Client):
+    def process_path(self, folder, path):
+        if path in SPECIAL_FOLDERS:
+            logging.debug("found special folder %s, skipping", path)
+            return
+        # 2. when a file is found, move it to a dangerzone/processing folder as
+        # an ad-hoc locking mechanism
         try:
-            print(client.info(folder))
-            print(client.list(folder))
-        except RemoteResourceNotFound:
-            pass
-        # 2. when a file is found, move it to a dangerzone/processing folder as
-        # an ad-hoc locking mechanism
-        #
-        # 3. download the file locally
-        #
-        # 4. process the file with the dangerzone-converter
-        #
-        # 5. on failure, delete the failed file locally, and move it to a
-        # dangerzone/rejected folder remotely
-        #
-        # 6. on success, upload the sanitized file to a safe/ folder, move the
-        # original to dangerzone/processed
+            listing = self.list(folder + "/" + path)
+        except RemoteResourceNotFound as e:
+            # XXX: Nextcloud has this weird thing where it shows the
+            # actual folder we're listing as the first element, but
+            # then WebDAV raises an exception if we actually try to
+            # list it. this handles the exception and skips the item,
+            # assuming that is the scenario.
+            logging.info("skipping non-existent resource %s/%s: %s", folder, path, e)
+            return
logging.info("sanitizing %s %s", folder, path)
# non-empty folder or regular file
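+        # (a folder listing includes the folder itself, so a single
+        # entry means the folder is empty)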
+        if len(listing) > 1 or not path.endswith("/"):
+            remote_processing_path = folder + "/dangerzone-processing/" + path
+            logging.info("moving %s to %s before processing", path, remote_processing_path)
+            if not self.dryrun:
+                self.mkdir(folder + "/dangerzone-processing")
+                self.move(remote_path_from=folder + "/" + path, remote_path_to=remote_processing_path)
+            with tempfile.TemporaryDirectory() as tmpdir:
+                # TODO: sanitize path for local use
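+                # (a remote name could embed "../" or an absolute path,
+                # so it should not be trusted as a local file name)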
+                local_path = tmpdir + "/danger/" + path
+                # 3. download the file locally
+                logging.info("downloading %s to %s", remote_processing_path, local_path)
+                if not self.dryrun:
+                    self.download_sync(
+                        remote_path=remote_processing_path, local_path=local_path
+                    )
+                # 4. process the file with the dangerzone-converter
+                try:
+                    if os.path.isdir(local_path):
+                        self.sanitizer.sanitize_dir(local_path)
+                    else:
+                        self.sanitizer.sanitize_file(local_path)
+                except Sanitizer.ParseError:
+                    # TODO: error handling:
+                    # 5. on failure, delete the failed file locally, and
+                    # move it to a dangerzone/rejected folder remotely
+                    remote_rejected_path = folder + "/dangerzone-rejected/" + path
+                    logging.warning(
+                        "cannot process %s (%s), moving to %s",
+                        path,
+                        local_path,
+                        remote_rejected_path,
+                    )
+                    if not self.dryrun:
+                        self.mkdir(folder + "/dangerzone-rejected")
+                        self.move(
+                            remote_path_from=remote_processing_path,
+                            remote_path_to=remote_rejected_path,
+                        )
+                    return
+                # 6. on success, upload the sanitized file to a safe/
+                # folder, move the original to dangerzone/processed
+                remote_safe_path = folder + "/safe/"
+                logging.info(
+                    "uploading %s to %s", self.sanitizer.safe_dir, remote_safe_path
+                )
+                if not self.dryrun:
+                    self.upload_sync(
+                        # does that work when safe/ already has stuff?
+                        remote_path=remote_safe_path,
+                        local_path=self.sanitizer.safe_dir,
+                    )
+                remote_processed_path = folder + "/dangerzone-processed/" + path
+                logging.info(
+                    "renaming %s to %s", remote_processing_path, remote_processed_path,
+                )
+                if not self.dryrun:
+                    self.mkdir(folder + "/dangerzone-processed")
+                    self.move(
+                        remote_path_from=remote_processing_path,
+                        remote_path_to=remote_processed_path,
+                    )
 if __name__ == "__main__":
......