#!/usr/bin/env python3
"""
One-time seeder: pull all "send" rows from phase2_results/*.csv, distribute
their scheduled_at times across N consecutive calendar days (weekends
included), and INSERT them into the outreach_recipients table.

Idempotent on email: ON DUPLICATE KEY UPDATE leaves existing rows alone.

Usage:
    python3 seed_outreach_recipients.py [--days N] [--start-tomorrow] [--dry-run]
"""
import argparse
import csv
import glob
import json
import os
import random
import secrets
import string
import sys
from datetime import datetime, time, timedelta, timezone
from urllib.parse import urlparse

import pymysql

# Paths inside the cron container — the host's /home2/writeup/public_html/outreach
# is bind-mounted at /outreach by docker-compose.
ROOT    = "/outreach"
RESULTS = f"{ROOT}/phase2_results"  # directory scanned for *.csv result files
CONFIG  = f"{ROOT}/send/config.json"  # JSON file; its "db" section holds the MySQL connection settings

# Daily send window, expressed in UTC. It deliberately covers (almost) the
# whole day: recipients live in many timezones, so spreading sends across all
# hours means everyone gets the email during their day at some point, and the
# per-hour rate stays low and human-looking.
WORK_START_UTC = time(hour=0, minute=0)
WORK_END_UTC   = time(hour=23, minute=59)


def load_sendable():
    """Collect every row marked for sending from the phase-2 result files.

    Reads each ``*.csv`` file directly under ``RESULTS`` in sorted filename
    order and keeps the rows whose ``final_status`` column is exactly
    ``"send"``. Rows are de-duplicated on the stripped, lower-cased ``email``
    column; the first occurrence wins, so overlapping day files are not
    double-counted. Rows with a missing or blank email are dropped.

    Returns:
        list[dict]: the kept rows, unmodified, in the order encountered.
    """
    seen = set()
    rows = []
    for path in sorted(glob.glob(f"{RESULTS}/*.csv")):
        # newline="" is required by the csv module: it lets the reader do its
        # own line-ending handling, so "\r" / "\r\n" inside quoted fields are
        # preserved instead of being rewritten by universal-newline mode.
        with open(path, encoding="utf-8", errors="replace", newline="") as fh:
            for row in csv.DictReader(fh):
                if row.get("final_status") != "send":
                    continue
                email = (row.get("email") or "").strip().lower()
                if not email or email in seen:
                    continue
                seen.add(email)
                rows.append(row)
    return rows


def host_of(url):
    """Return the bare hostname of *url*, or "" if none can be extracted.

    A missing scheme is tolerated: input without "://" is parsed as if it
    were prefixed with "http://". The result is lower-cased and has any
    userinfo ("user:pass@"), port, IPv6 brackets and one leading "www."
    removed. ``None`` and blank input yield "".
    """
    s = (url or "").strip()
    if "://" not in s:
        s = "http://" + s
    try:
        # .hostname (unlike .netloc) already drops credentials and the port
        # and lower-cases the host; splitting netloc on ":" would return the
        # username for URLs of the form user:pass@host.
        h = urlparse(s).hostname or ""
    except ValueError:
        # urlparse rejects malformed authorities such as an unclosed "[".
        return ""
    if h.startswith("www."):
        h = h[4:]
    return h


def random_token(n=24):
    """Return a random token made of n ASCII letters and digits (default 24).

    Each character is picked independently with ``secrets.choice``, so the
    token comes from the OS's cryptographically secure random source.
    """
    alphabet = string.ascii_letters + string.digits
    picks = [secrets.choice(alphabet) for _ in range(n)]
    return "".join(picks)


def next_workday(d):
    """Return d unchanged.

    Intentionally a no-op: sends span all 7 days of the week — bloggers
    don't keep office hours — so no date is ever skipped or shifted.
    """
    return d


def schedule_times(n_rows, n_days, start_date_utc):
    """Build n_rows send times spread over consecutive days.

    Each day receives up to ceil(n_rows / n_days) slots, spaced evenly from
    WORK_START_UTC across the WORK_START_UTC..WORK_END_UTC window, and every
    slot is nudged by a random jitter of up to 90 seconds either way. A
    jittered slot is clamped to the window, so it always stays on its own
    day. Days are filled in order, so the last day used may receive fewer
    slots and fewer than n_days days may end up being used.

    Args:
        n_rows: number of slots to produce (0 yields an empty list).
        n_days: number of days to spread over; must be at least 1.
        start_date_utc: date of the first day to schedule on.

    Returns:
        Sorted list of n_rows timezone-aware (UTC) datetimes.

    Raises:
        ValueError: if n_days is less than 1.
    """
    if n_days < 1:
        # 0 would divide by zero below; negatives would silently degrade to
        # one slot per day.
        raise ValueError(f"n_days must be >= 1, got {n_days}")

    per_day = max(1, (n_rows + n_days - 1) // n_days)  # ceiling division
    work_secs = (
        (WORK_END_UTC.hour * 3600 + WORK_END_UTC.minute * 60)
        - (WORK_START_UTC.hour * 3600 + WORK_START_UTC.minute * 60)
    )
    spacing = work_secs / per_day  # ideal seconds between sends within a day

    slots = []
    day = next_workday(start_date_utc)
    remaining = n_rows
    while remaining > 0:
        window_open = datetime.combine(day, WORK_START_UTC, tzinfo=timezone.utc)
        window_close = datetime.combine(day, WORK_END_UTC, tzinfo=timezone.utc)
        todays_count = min(per_day, remaining)
        for i in range(todays_count):
            jitter = timedelta(seconds=random.randint(-90, 90))
            slot = window_open + timedelta(seconds=int(spacing * i)) + jitter
            # Keep the jittered slot inside the window; otherwise the first
            # slot of a day could land on the previous day.
            slots.append(max(window_open, min(window_close, slot)))
        remaining -= todays_count
        day = next_workday(day + timedelta(days=1))
    slots.sort()
    return slots


def main():
    """Command-line entry point: schedule the sendable rows and seed the DB.

    Parses --days / --start-tomorrow / --dry-run, loads rows with
    load_sendable(), pairs each row with a slot from schedule_times(), and
    inserts them into outreach_recipients in batches of 500, committing after
    every batch. Rows whose insert hits a duplicate key are left untouched
    (ON DUPLICATE KEY UPDATE id = id). With --dry-run only a preview of the
    first five rows is printed and the database is never contacted.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--days", type=int, default=19, help="Spread across N days (1888 / 100 per day = 19)")
    ap.add_argument("--start-tomorrow", action="store_true",
                    help="Begin scheduling from tomorrow (default: today)")
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()
    if args.days < 1:
        # schedule_times() divides by the day count, so 0 would crash there.
        ap.error("--days must be at least 1")

    rows = load_sendable()
    if not rows:
        print("No sendable rows found. Nothing to do.")
        return

    today = datetime.now(timezone.utc).date()
    start = today + timedelta(days=1) if args.start_tomorrow else today
    # NOTE(review): when starting today, slots earlier than the current time
    # are already in the past — confirm the sender handles that as intended.
    slots = schedule_times(len(rows), args.days, start)

    print(f"Loaded {len(rows):,} sendable rows; scheduling across "
          f"{args.days} days starting {start.isoformat()}")
    print(f"First slot: {slots[0]} UTC")
    print(f"Last  slot: {slots[-1]} UTC")

    if args.dry_run:
        print("[dry-run] not writing to DB")
        for row, slot in zip(rows[:5], slots[:5]):
            website = row.get("website") or ""
            print(f"  {slot.isoformat()}  {row['email']:40s}  {website}")
        return

    with open(CONFIG, encoding="utf-8") as fh:
        db = json.load(fh)["db"]

    sql = (
        "INSERT INTO outreach_recipients "
        "(email, blog_url, blog_host, category, token, scheduled_at, status) "
        "VALUES (%s, %s, %s, %s, %s, %s, 'pending') "
        "ON DUPLICATE KEY UPDATE id = id"
    )

    # Build every parameter tuple up front so batching is a plain slice.
    params = []
    for row, slot in zip(rows, slots):
        # DictReader fills short rows with None, hence the "or ''" guard.
        url = (row.get("website") or "").strip()
        params.append((
            row["email"].strip().lower(),
            url[:2048],
            host_of(url)[:255],
            (row.get("category") or "")[:255] or None,
            random_token(24),
            slot.strftime("%Y-%m-%d %H:%M:%S"),
        ))

    batch_size = 500
    inserted = 0
    cn = pymysql.connect(
        host=db["host"], port=db["port"],
        user=db["user"], password=db["password"],
        database=db["name"], autocommit=False, charset="utf8mb4",
    )
    try:
        with cn.cursor() as cur:
            for offset in range(0, len(params), batch_size):
                cur.executemany(sql, params[offset:offset + batch_size])
                inserted += cur.rowcount
                # Commit per batch so earlier batches survive a later failure.
                cn.commit()

            cur.execute("SELECT COUNT(*) FROM outreach_recipients")
            total = cur.fetchone()[0]
    finally:
        # Always release the connection, even if an insert raised.
        cn.close()

    print(f"DB rows after seed: {total:,}  (insert/update events: {inserted:,})")


# Run the seeder only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
