#!/bin/bash
# ------------------------------------------------------------------------------
# run: orchestrates download/parse/clean with month resume state.
# State file: $PARSER_HOME/state/last_month_processed (YYYY-MM)
# ------------------------------------------------------------------------------

# Strict mode: exit on unhandled command failure (-e), treat unset variables
# as errors (-u), and fail a pipeline when any of its stages fails (pipefail).
set -euo pipefail

# Absolute path of the directory holding this script.
# CDPATH is emptied for the `cd` so that a CDPATH hit cannot make `cd` print
# the directory on stdout and corrupt the captured value; `--` protects
# against a path that begins with a dash.
SCRIPT_DIR="$(CDPATH='' cd -- "$(dirname -- "$0")" && pwd)"
# Load settings from the `config` file that sits next to this script.
source "$SCRIPT_DIR/config"

# ------------------------------------------------------------------------------
# Defaults / layout (safe under -u)
# ------------------------------------------------------------------------------
# Installation root: keep a non-empty inherited value, else use the script dir.
: "${PARSER_HOME:=$SCRIPT_DIR}"
# Descriptors root: keep a non-empty inherited value, else a folder under the
# installation root.
: "${DATA_HOME:=${PARSER_HOME}/descriptors}"
# Fallback locations used by the preflight cleanup when ARCHIVE_DIR /
# ARCHIVES_DIR are not provided.
ARCHIVE_DIR_DEFAULT="${PARSER_HOME}/archive"    # tarballs
ARCHIVES_DIR_DEFAULT="${PARSER_HOME}/archives"  # extracted

# ------------------------------------------------------------------------------
# Logging
# ------------------------------------------------------------------------------
# Log directory (created on demand) and the single log file of this script.
LLOG_DIR="${PARSER_HOME}/logs"
mkdir -p "$LLOG_DIR"
LLOG_FILE="${LLOG_DIR}/run.log"

# Append one timestamped line to $LLOG_FILE.
# Arguments: the message; all arguments are joined into one line via "$*".
# Returns:   status of the append to the log file.
log(){
  local entry
  printf -v entry '[%s] %s' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
  printf '%s\n' "$entry" >>"$LLOG_FILE"
}
log "----- run: START -----"

# ------------------------------------------------------------------------------
# State (single source of truth)
# ------------------------------------------------------------------------------
# Directory holding resume state (created on demand).
STATE_DIR="${PARSER_HOME}/state"
mkdir -p "$STATE_DIR"
# File whose first line records the last processed month.
STATE_FILE="${STATE_DIR}/last_month_processed"

# Print the first line of the state file with trailing whitespace removed.
# Globals: STATE_FILE (read)
# Outputs: the stored value on stdout
# Returns: 1 when the state file is missing or empty, otherwise sed's status
read_last_month(){
  if [[ ! -s "$STATE_FILE" ]]; then
    return 1
  fi
  # Strip trailing whitespace, then quit after the first line; `q` lets sed's
  # auto-print emit that single line.
  sed -e 's/[[:space:]]*$//' -e 'q' "$STATE_FILE"
}
# Record the given month as the last one processed.
# Globals:   STATE_FILE (read)
# Arguments: $1 - month in YYYY-MM form
# Returns:   0 once the state file has been replaced; 2 when no month is
#            given; 1 when the temporary file cannot be written or renamed.
write_last_month(){
  local ym="${1-}" tmp="${STATE_FILE}.tmp"
  if [[ -z "$ym" ]]; then
    log "ERROR: write_last_month: missing YYYY-MM; state not updated"
    return 2
  fi
  # Write a sibling temp file and rename it over the state file, so the state
  # file is swapped in one step instead of being rewritten in place. Success
  # is only logged when both steps worked.
  if ! printf '%s\n' "$ym" >"$tmp" || ! mv -f "$tmp" "$STATE_FILE"; then
    log "ERROR: could not save last processed month $ym to $STATE_FILE"
    rm -f "$tmp"
    return 1
  fi
  log "Saved last processed month: $ym → $STATE_FILE"
  return 0
}

# ------------------------------------------------------------------------------
# Exit code aggregation
# ------------------------------------------------------------------------------
# Exit status reported by the script's final `exit`. Each step stores its code
# here only while the value is still 0, so the first recorded failure is kept.
JAVA_RC=0       # first non-zero failure (prefer parse)
# Set to 1 by parse_with_rc when the parse helper exits non-zero.
PARSE_FAILED=0
# Set to 1 as soon as run_recent starts; on_exit consults it so the recent
# ingestion is not started a second time from the EXIT trap.
RECENT_DONE=0

# ------------------------------------------------------------------------------
# Utils (defensive under `set -u`)
# ------------------------------------------------------------------------------
# Normalise a date or month to YYYY-MM.
# Arguments: $1 - YYYY-MM or YYYY-MM-DD
# Outputs:   YYYY-MM on stdout; an error message on stderr when rejected
# Returns:   0 on success, 2 when the argument is missing or malformed
normalize_to_ym() {
  local in="${1-}"
  if [[ -z "$in" ]]; then
    echo "ERROR: missing date/month" >&2
    return 2
  fi
  # Month must be 01-12 and an optional day 01-31, so impossible values such
  # as 2024-13 are rejected here instead of failing later inside `date`.
  local re='^[0-9]{4}-(0[1-9]|1[0-2])(-(0[1-9]|[12][0-9]|3[01]))?$'
  if [[ "$in" =~ $re ]]; then
    printf '%s\n' "${in:0:7}"
    return 0
  fi
  echo "ERROR: invalid date/month '$in' (use YYYY-MM or YYYY-MM-DD)" >&2
  return 2
}

# Convert a month to the epoch seconds of its first day, as computed by
# `date -d`.
# Arguments: $1 - month in YYYY-MM form
# Outputs:   epoch seconds on stdout
# Returns:   2 when the argument is missing, otherwise the status of `date`
ym_to_epoch() {
  local month="${1-}"
  [[ -n "$month" ]] || {
    echo "ERROR: ym_to_epoch: missing YYYY-MM" >&2
    return 2
  }
  local first_day="${month}-01"
  date -d "$first_day" +%s
}

# Print the month that follows the given one.
# Arguments: $1 - month in YYYY-MM form (month 01-12)
# Outputs:   the following month as YYYY-MM on stdout
# Returns:   0 on success, 2 when the argument is missing or malformed
ym_next() {
  local in="${1-}"
  if [[ -z "$in" ]]; then echo "ERROR: ym_next: missing YYYY-MM" >&2; return 2; fi
  local re='^([0-9]{4})-(0[1-9]|1[0-2])$'
  if [[ ! "$in" =~ $re ]]; then
    echo "ERROR: ym_next: invalid YYYY-MM '$in'" >&2
    return 2
  fi
  # Force base 10 on both fields: values with a leading zero (08, 09, or a
  # year such as 0099) would otherwise be read as octal.
  local y=$(( 10#${BASH_REMATCH[1]} )) m=$(( 10#${BASH_REMATCH[2]} ))
  if (( m == 12 )); then
    y=$(( y + 1 ))
    m=1
  else
    m=$(( m + 1 ))
  fi
  printf '%04d-%02d\n' "$y" "$m"
}

# Run the `parse` helper and record a failure in the exit-code globals.
# Globals:   SCRIPT_DIR, JAR_ARG (read); PARSE_FAILED, JAVA_RC (written)
# Returns:   0 when parse succeeded, 1 otherwise (the helper's real exit code
#            is logged and stored in JAVA_RC if no earlier failure is stored)
parse_with_rc(){
  local rc=0
  # ${arr[@]+...} expands to nothing for an empty or unset JAR_ARG, which keeps
  # `set -u` from aborting on bash releases older than 4.4.
  # `|| rc=$?` captures the status without touching the caller's errexit state.
  "$SCRIPT_DIR/parse" ${JAR_ARG[@]+"${JAR_ARG[@]}"} || rc=$?
  if (( rc == 0 )); then
    return 0
  fi
  PARSE_FAILED=1
  if [[ $JAVA_RC -eq 0 ]]; then JAVA_RC=$rc; fi
  log "parse exited with code $rc"
  return 1
}

# ------------------------------------------------------------------------------
# Preflight: tools & cleanup (best effort so we still reach "recent")
# ------------------------------------------------------------------------------
# Preflight problem indicator: 0 until ensure_xz or preflight_cleanup records a
# failure. Nothing in the visible code reads it back, and the final exit status
# is taken from JAVA_RC instead.
PRECHECK_RC=0

# Make sure an `xz` executable is available, installing xz-utils through
# apt-get when it is not. Best effort: problems are logged, never fatal.
# Globals:   LLOG_FILE (apt-get output is appended), PRECHECK_RC (set on the
#            first preflight failure)
# Returns:   0 when xz is present or the install command succeeded, 1 otherwise
ensure_xz(){
  local xz_path
  if xz_path="$(command -v xz 2>/dev/null)"; then
    log "xz present: $xz_path"
    return 0
  fi
  log "xz not found; attempting to install xz-utils ..."

  if ! command -v apt-get >/dev/null 2>&1; then
    log "WARN: apt-get not available; cannot install xz-utils; continuing."
    # Record the failure the same way the install-failure path does.
    if (( PRECHECK_RC == 0 )); then PRECHECK_RC=1; fi
    return 1
  fi

  # `|| rc=$?` captures each status without toggling errexit.
  local update_rc=0 apt_rc=0
  apt-get update >>"$LLOG_FILE" 2>&1 || update_rc=$?
  if (( update_rc != 0 )); then
    log "WARN: apt-get update failed (rc=$update_rc); trying install anyway."
  fi
  apt-get install -y --no-install-recommends xz-utils >>"$LLOG_FILE" 2>&1 || apt_rc=$?
  if (( apt_rc != 0 )); then
    log "WARN: failed to install xz-utils (rc=$apt_rc); continuing."
    if (( PRECHECK_RC == 0 )); then PRECHECK_RC=$apt_rc; fi
    return 1
  fi
  log "xz-utils installed."
  return 0
}

# Delete everything under a directory but keep the directory itself.
# Globals:   DRY_RUN (read) - when "1", entries are printed instead of deleted
# Arguments: $1 - directory to empty
# Returns:   0 when the directory is missing (skipped) or was emptied;
#            1 when the directory is the filesystem root (refused);
#            otherwise the non-zero status of `find`
empty_dir_preserve(){
  local dir="${1-}"
  [[ -n "$dir" && -d "$dir" ]] || { log "Preflight: skip (missing) ${dir:-<none>}"; return 0; }
  # -ef compares device and inode, so "/", "//" and "/." are all caught.
  if [[ "$dir" -ef / ]]; then
    log "Preflight: REFUSING to empty filesystem root ($dir)"
    return 1
  fi
  log "Preflight: emptying contents of $dir (preserving directory)"
  local action="-delete" rc=0
  if [[ "${DRY_RUN:-0}" == "1" ]]; then
    action="-print"
  fi
  # Report find's status instead of discarding it, so the caller's warning
  # about a failed cleanup can actually fire.
  find "$dir" -mindepth 1 -depth "$action" || rc=$?
  if (( rc != 0 )); then
    log "WARN: could not fully empty $dir (find rc=$rc)"
  fi
  return "$rc"
}

# Empty the tarball, extraction and descriptor directories before a run.
# Globals:   ARCHIVE_DIR, ARCHIVES_DIR, DESCRIPTORS_DIR (read, optional);
#            ARCHIVE_DIR_DEFAULT, ARCHIVES_DIR_DEFAULT, DATA_HOME (fallbacks);
#            PRECHECK_RC (set to 1 on the first preflight failure)
# Returns:   0 when every cleanup step returned 0, 1 otherwise
preflight_cleanup(){
  local archive_dir="${ARCHIVE_DIR:-$ARCHIVE_DIR_DEFAULT}"
  local archives_dir="${ARCHIVES_DIR:-$ARCHIVES_DIR_DEFAULT}"
  local descriptors_dir="${DESCRIPTORS_DIR:-$DATA_HOME}"

  # Every directory is attempted even when an earlier one fails; `|| rcN=$?`
  # captures each status without switching errexit off and on again.
  local rc1=0 rc2=0 rc3=0
  empty_dir_preserve "$archive_dir" || rc1=$?
  empty_dir_preserve "$archives_dir" || rc2=$?
  empty_dir_preserve "$descriptors_dir" || rc3=$?

  if (( rc1 == 0 && rc2 == 0 && rc3 == 0 )); then
    return 0
  fi
  log "WARN: preflight cleanup had non-zero rc (archive:$rc1 archives:$rc2 descriptors:$rc3)"
  if (( PRECHECK_RC == 0 )); then PRECHECK_RC=1; fi
  return 1
}

# ------------------------------------------------------------------------------
# Work units
# ------------------------------------------------------------------------------
# Download the recent data with `download-recent` and parse it.
# Globals:   SCRIPT_DIR (read); RECENT_DONE (set to 1 on entry, before any
#            work, so the EXIT fallback does not start a second attempt);
#            JAVA_RC (receives the download status if no failure is stored yet)
# Returns:   0 when download and parse both succeeded, 1 otherwise
run_recent(){
  log "run_recent: start"
  RECENT_DONE=1

  log "run_recent: downloading"
  # `|| dl_rc=$?` captures the status without toggling errexit, so the
  # caller's shell options are left exactly as they were.
  local dl_rc=0
  "$SCRIPT_DIR/download-recent" || dl_rc=$?
  if (( dl_rc != 0 )); then
    log "run_recent: download-recent FAILED rc=$dl_rc; skipping parse"
    if [[ $JAVA_RC -eq 0 ]]; then JAVA_RC=$dl_rc; fi
    log "run_recent: end (download failed)"
    return 1
  fi

  log "run_recent: parsing"
  if ! parse_with_rc; then
    # JAVA_RC holds the first recorded failure, which may predate this parse.
    log "run_recent: parse FAILED rc=$JAVA_RC"
    log "run_recent: end (parse failed)"
    return 1
  fi

  log "run_recent: end (ok)"
  return 0
}

# Process one month: download -> parse -> clean -> record state.
# Globals:   SCRIPT_DIR (read); JAVA_RC (receives the first failing status)
# Arguments: $1 - month in YYYY-MM form
# Returns:   0 when the month was processed and its state recorded (a failing
#            `clean` is only a warning); 2 when the month is missing; 1 when
#            download, parse or the state update failed
run_month(){
  local ym="${1-}"
  if [[ -z "$ym" ]]; then log "Monthly run: missing YYYY-MM"; return 2; fi
  log "Monthly run for $ym: download -> parse -> clean"

  # `|| rc=$?` captures each status without toggling errexit.
  local d_rc=0
  "$SCRIPT_DIR/download" "$ym" || d_rc=$?
  if (( d_rc != 0 )); then
    log "Monthly download FAILED for $ym rc=$d_rc"
    if [[ $JAVA_RC -eq 0 ]]; then JAVA_RC=$d_rc; fi
    return 1
  fi

  if ! parse_with_rc; then
    log "Monthly parse FAILED for $ym rc=$JAVA_RC; skipping clean/state"
    return 1
  fi

  local c_rc=0
  "$SCRIPT_DIR/clean" "$ym" || c_rc=$?
  if (( c_rc != 0 )); then
    log "WARN: clean FAILED for $ym rc=$c_rc"
    if [[ $JAVA_RC -eq 0 ]]; then JAVA_RC=$c_rc; fi
  fi

  # A month only counts as finished once its state has been recorded;
  # otherwise report failure so the caller stops instead of moving on.
  local s_rc=0
  write_last_month "$ym" || s_rc=$?
  if (( s_rc != 0 )); then
    log "Monthly state update FAILED for $ym rc=$s_rc"
    if [[ $JAVA_RC -eq 0 ]]; then JAVA_RC=$s_rc; fi
    return 1
  fi
  log "Finished monthly run for $ym"
  return 0
}

# ------------------------------------------------------------------------------
# Ensure "recent" runs at process end (fallback). Define AFTER functions exist.
# ------------------------------------------------------------------------------
# EXIT handler: if run_recent has not been started yet, run it once now
# (its failure is tolerated), then log the aggregated JAVA_RC.
# Globals: RECENT_DONE, JAVA_RC (read)
# Returns: 0
on_exit() {
  local recent_pending=0
  [[ $RECENT_DONE -eq 0 ]] && recent_pending=1
  if (( recent_pending )); then
    log "EXIT trap: ensuring recent ingestion runs"
    run_recent || true
  fi
  log "Exit: JAVA_RC=$JAVA_RC"
  return 0
}
trap on_exit EXIT

# ------------------------------------------------------------------------------
# CLI parsing (optional --jar, then optional month)
# ------------------------------------------------------------------------------
# Arguments forwarded to the parse helper: empty, or (--jar PATH).
JAR_ARG=()
# Consume leading `--jar PATH` options; an optional `--` ends option parsing,
# and anything else is left in place as the month argument.
while (( $# > 0 )); do
  if [[ "$1" == "--jar" ]]; then
    if [[ -z "${2:-}" ]]; then
      echo "ERROR: --jar requires a path" >&2
      exit 2
    fi
    JAR_ARG=(--jar "$2")
    shift 2
    continue
  fi
  if [[ "$1" == "--" ]]; then
    shift
  fi
  break
done

# Requested month, taken in this order:
#   1. two arguments "YYYY MM" (month 01-12),
#   2. one argument normalised by normalize_to_ym,
#   3. the START_YEARMONTH variable, when no argument is left.
YM=""
if (( $# >= 2 )) && [[ "$1" =~ ^[0-9]{4}$ && "$2" =~ ^(0[1-9]|1[0-2])$ ]]; then
  YM="$1-$2"
  shift 2
elif (( $# >= 1 )); then
  YM="$(normalize_to_ym "$1")"
  shift || true
elif [[ -n "${START_YEARMONTH:-}" ]]; then
  YM="$(normalize_to_ym "$START_YEARMONTH")"
fi

# Last processed month from the state file (empty when there is none) and the
# current calendar month.
LAST_YM="$(read_last_month || true)"
CURRENT_YM="$(date '+%Y-%m')"
log "Starting run (requested=${YM:-none}; state last_month=${LAST_YM:-none}; current=${CURRENT_YM})"

# ------------------------------------------------------------------------------
# Preflight (best effort; a failure here never aborts the run)
# ------------------------------------------------------------------------------
ensure_xz || true
preflight_cleanup || true

# Record whether each helper script is present and executable, to help
# diagnose runs in which "nothing happens".
for helper in download download-recent parse clean; do
  if [[ ! -x "$SCRIPT_DIR/$helper" ]]; then
    log "helper $helper: MISSING or not executable ($SCRIPT_DIR/$helper)"
    continue
  fi
  log "helper $helper: OK ($SCRIPT_DIR/$helper)"
done

# ------------------------------------------------------------------------------
# Decision logic (resume from state; skip repeats)
# ------------------------------------------------------------------------------
# Pick the first month to process (START_YM); it stays empty when there is
# neither a requested month nor saved state.
START_YM=""
if [[ -n "$YM" ]]; then
  # A month was requested. If it is not later than the last processed month,
  # continue from the month after the saved state instead of repeating work.
  START_YM="$YM"
  if [[ -n "${LAST_YM:-}" ]] && (( $(ym_to_epoch "$START_YM") <= $(ym_to_epoch "$LAST_YM") )); then
    next_from_state="$(ym_next "$LAST_YM")"
    log "Requested $START_YM ≤ last processed $LAST_YM; starting from $next_from_state instead."
    START_YM="$next_from_state"
  fi
elif [[ -n "${LAST_YM:-}" ]]; then
  # Nothing requested: resume with the month after the saved state.
  START_YM="$(ym_next "$LAST_YM")"
  log "No month provided; resuming from state → $START_YM"
elif [[ -n "${START_YEARMONTH:-}" ]]; then
  # NOTE(review): the CLI section earlier in this file already copies
  # START_YEARMONTH into YM when no month argument is given, so this branch
  # looks unreachable — confirm before relying on it.
  START_YM="$(normalize_to_ym "$START_YEARMONTH")"
  log "No state; using START_YEARMONTH → $START_YM"
fi

# Catch up month by month from START_YM through the current month, stopping
# at the first month that fails.
# The epoch of the current month does not change during the loop, so it is
# computed once instead of forking `date` for it on every iteration.
CURRENT_YM_EPOCH="$(ym_to_epoch "$CURRENT_YM")"
if [[ -n "$START_YM" ]] && (( $(ym_to_epoch "$START_YM") <= CURRENT_YM_EPOCH )); then
  CUR="$START_YM"
  while (( $(ym_to_epoch "$CUR") <= CURRENT_YM_EPOCH )); do
    log "Processing month: $CUR"
    if ! run_month "$CUR"; then
      log "Stopping monthly catch-up due to failure at $CUR"
      break
    fi
    CUR="$(ym_next "$CUR")"
  done
else
  log "No monthly work to do (START_YM=${START_YM:-none}, current=$CURRENT_YM)."
fi

# Always try recent explicitly (EXIT trap is a fallback)
# run_recent sets RECENT_DONE=1 on entry, so the on_exit handler will not
# start it a second time. A failure is tolerated here; it is reflected in the
# exit status only through JAVA_RC.
log "Calling run_recent after monthly sequence"
run_recent || true

# Exit with the first failure code recorded in JAVA_RC (0 when no step stored
# one). PRECHECK_RC is not part of the exit status.
log "----- run: END -----"
exit "$JAVA_RC"
