#!/bin/bash
#
# Fetch recently published files from the CollecTor mirrors into DATA_HOME,
# recording every downloaded URL in DOWNLOADS_LOG_FILE.
#
# Settings are read from a file named "config" located next to this script.
# It must define DATA_HOME, DOWNLOADS_LOG_FILE and the RECENT_PATHS array.
set -euo pipefail

# Load settings from the script's own directory.
source "$(dirname "$0")/config"

# Fail fast with a clear message when a required setting is missing.
: "${DATA_HOME:?ERROR: DATA_HOME must be set in config}"
: "${DOWNLOADS_LOG_FILE:?ERROR: DOWNLOADS_LOG_FILE must be set in config}"
# RECENT_PATHS is an array, so ${VAR:?} cannot validate it; without this check
# an unset or empty list would make the script silently do nothing.
if ! declare -p RECENT_PATHS >/dev/null 2>&1 || (( ${#RECENT_PATHS[@]} == 0 )); then
    echo "ERROR: RECENT_PATHS must be set to a non-empty array in config" >&2
    exit 1
fi

# Ensure the downloads log file exists
mkdir -p -- "$(dirname -- "$DOWNLOADS_LOG_FILE")"
touch -- "$DOWNLOADS_LOG_FILE"

#######################################
# Download files listed on the CollecTor "recent" index pages.
#
# For every entry in RECENT_PATHS the directory $DATA_HOME/<entry> is removed
# and recreated. Each mirror's HTML index for that entry is then fetched, and
# every listed file whose URL is not yet recorded in DOWNLOADS_LOG_FILE is
# downloaded into that directory. URLs of successful downloads are appended
# to the log, one per line.
#
# Globals:
#   RECENT_PATHS       (read)      paths relative to <mirror>/recent
#   DATA_HOME          (read)      root directory for downloaded files
#   DOWNLOADS_LOG_FILE (read/append) log of downloaded URLs
# Arguments:
#   None
# Returns:
#   Unreachable mirrors and failed downloads are skipped, not reported as
#   errors. An empty DATA_HOME or empty RECENT_PATHS entry aborts the shell.
#######################################
download_recent_files() {
    local -a base_urls=(
        "https://collector.torproject.org/recent"
        "https://collector-02.torproject.org/recent"
    )
    # Leading links on each index page that are navigation rather than files
    # (parent dir, column headers, etc.).
    local -r nav_links=5

    # Declare every loop variable so nothing leaks into the caller's scope.
    local p base_url page u download_url dest
    local -a urls
    local i

    for p in "${RECENT_PATHS[@]}"; do
        # ${...:?} aborts instead of letting an empty value turn this into
        # "rm -rf /" or a wipe of the whole DATA_HOME.
        rm -rf -- "${DATA_HOME:?}/${p:?}"
        mkdir -p -- "$DATA_HOME/$p"

        for base_url in "${base_urls[@]}"; do
            # Fetch directory listing; -f makes HTTP errors (404, 5xx) fail so
            # an error page is never parsed as if it were a listing.
            page=$(curl -fsS "$base_url/$p/") || continue

            # Parse HREFs into a proper array (empty when nothing matches;
            # the exit status of a process substitution is ignored).
            urls=()
            mapfile -t urls < <(grep -oP '(HREF|href)="\K[^"]+' <<<"$page")

            # Index-based loop: skips the navigation links and is a no-op
            # when the page has no file links at all.
            for (( i = nav_links; i < ${#urls[@]}; i++ )); do
                u=${urls[i]}

                # The listing is remote input. Accept plain file names only:
                # no empty entries, no directory or absolute links, nothing
                # that could escape the target directory, no query links.
                case "$u" in
                    '' | . | .. | */* | \?*) continue ;;
                esac

                download_url="$base_url/$p/$u"
                dest="$DATA_HOME/$p/$u"

                # Already downloaded this run (e.g. from the other mirror)
                if [[ -f "$dest" ]]; then
                    continue
                fi

                # Already in the log from a previous run. -x requires a whole
                # line match so a logged ".../file-10" does not hide
                # ".../file-1". A missing/unreadable log counts as "not logged".
                # NOTE(review): the log is keyed by full URL, so a file logged
                # from one mirror still looks new on the other mirror in a
                # later run; confirm whether that second download is intended.
                if grep -qxF -- "$download_url" "$DOWNLOADS_LOG_FILE" 2>/dev/null; then
                    continue
                fi

                # Download into the target directory
                if wget -q -O "$dest" "$download_url"; then
                    printf '%s\n' "$download_url" >> "$DOWNLOADS_LOG_FILE"
                else
                    rm -f -- "$dest"  # clean up partial download
                fi
            done
        done
    done
}

# Entry point: run one download pass using the settings loaded from config.
download_recent_files
