#!/usr/bin/env python3
"""
Phase 0 + 1: merge 3 CSVs, dedup by email, apply cheap pre-filters.

Outputs (written to OUT_DIR):
  merged.csv            — all unique rows with a syntactically valid email (deduped by lowercase email)
  role_based.csv        — rows that pass the website filters but whose email local-part is role-based (kept separate)
  phase1_survivors.csv  — clean candidates for Phase 2 (DNS/HTTP/SMTP)

Drops silently (per user direction):
  rows with empty website
  rows whose website domain is a social/aggregator/non-blog platform
  rows with malformed email syntax
"""

import csv
import re
import sys
from urllib.parse import urlparse

# Directory holding both the input CSVs and the generated output CSVs.
OUT_DIR = "/home2/writeup/public_html/outreach"
# Input files, read from OUT_DIR in this order. When an email appears more
# than once, the first occurrence wins and later ones count as duplicates.
INPUT_FILES = ["Update 01.csv", "Update 02.csv", "Update 03.csv"]

# Domains that aren't blogs. A row is dropped when its website host (after
# removing one leading "www.") is EXACTLY equal to an entry in this set.
# Other subdomains are not matched implicitly, which is why variants such as
# m.facebook.com or vm.tiktok.com are listed explicitly.
# Note: blog-platform domains like wordpress.com, substack.com, blogspot.com,
# medium.com are NOT here — subdomains/paths under those are real blogs.
SOCIAL_DOMAINS = {
    # Link aggregators / link-in-bio
    "linktr.ee", "lnk.bio", "linktree.com", "beacons.ai", "beacons.page",
    "bio.link", "bio.site", "campsite.bio", "linkin.bio", "later.com",
    "koji.to", "withkoji.com", "milkshake.app", "shor.by", "snipfeed.co",
    "hopp.bio", "msha.ke", "instabio.cc", "taplink.cc", "solo.to",
    "flow.page", "linkr.bio", "smart.bio", "carrd.co", "magic.ly",
    "byvora.com", "stan.store", "flowcode.com", "linkpop.com", "many.link",
    # Video
    "youtube.com", "youtu.be", "m.youtube.com", "music.youtube.com",
    "vimeo.com", "twitch.tv", "dailymotion.com", "hudl.com",
    # Social
    "facebook.com", "m.facebook.com", "fb.me", "fb.watch", "business.facebook.com",
    "instagram.com", "instagr.am",
    "twitter.com", "x.com", "t.co",
    "tiktok.com", "vm.tiktok.com",
    "pinterest.com", "pin.it",
    "linkedin.com", "lnkd.in",
    "snapchat.com",
    "reddit.com",
    "tumblr.com",
    "vk.com", "ok.ru",
    "tellonym.me",
    # Photo / portfolio (not blog-style)
    "vsco.co", "behance.net", "dribbble.com", "flickr.com", "500px.com",
    # Messaging
    "wa.me", "api.whatsapp.com", "chat.whatsapp.com", "wa.link",
    "whats.link", "wasap.my",
    "discord.gg", "discord.com",
    "telegram.me", "t.me",
    "messenger.com",
    # Audio / podcast / music distribution (not blogs)
    "open.spotify.com", "spotify.com", "spoti.fi",
    "music.apple.com", "podcasts.apple.com",
    "soundcloud.com", "m.soundcloud.com", "anchor.fm", "mixcloud.com",
    "bandcamp.com", "audiomack.com",
    "distrokid.com", "beatstars.com", "unitedmasters.com",
    "fanlink.to", "songwhip.com", "ffm.to", "smarturl.it", "orcd.co",
    "show.co", "lnk.to",
    # Commerce / marketplace
    "etsy.com", "etsy.me", "shopee.com", "amazon.com", "amzn.to",
    "ebay.com", "redbubble.com", "teespring.com", "depop.com",
    "opensea.io", "squareup.com", "square.site",
    # URL shorteners
    "bit.ly", "tinyurl.com", "rebrand.ly", "rb.gy", "ow.ly", "buff.ly",
    "shorturl.at", "cutt.ly", "t.ly", "is.gd", "goo.gl",
    # Payment / donation / crowdfunding
    "venmo.com", "cash.app", "paypal.me", "paypal.com",
    "patreon.com", "ko-fi.com", "buymeacoffee.com",
    "gofund.me", "gofundme.com", "kickstarter.com",
    # Adult / restricted (not blogs in our sense)
    "onlyfans.com", "fansly.com",
    # Booking / events / scheduling
    "calendly.com", "eventbrite.com", "meetup.com", "ticketmaster.com",
    "vagaro.com", "booksy.com",
    # Forms / surveys (not blogs)
    "forms.gle", "docs.google.com", "drive.google.com", "surveyheart.com",
    "typeform.com", "jotform.com", "surveymonkey.com", "google.com",
    # Reviews / listings
    "yelp.com", "tripadvisor.com",
    # Service marketplaces
    "fiverr.com", "upwork.com",
    # Code / dev
    "github.com",
    # App stores / deep links
    "play.google.com", "apps.apple.com", "googleplay.com",
    # Bogus self-references (someone put their email's domain as website)
    "gmail.com", "outlook.com", "yahoo.com", "hotmail.com", "aol.com",
    "mail.ru", "yandex.com", "icloud.com",
}

# Suffix matches — a host is dropped when it ends with one of these.
# Every entry starts with a dot, so only subdomains match here; a bare domain
# (e.g. carrd.co) has to be listed in SOCIAL_DOMAINS to be dropped as well.
# Kept as a tuple, the form str.endswith() accepts for multiple suffixes.
SOCIAL_HOST_SUFFIXES = (
    ".app.link",       # Branch deep links (hu48f.app.link/...)
    ".branch.io",
    ".onelink.me",
    ".page.link",      # Firebase dynamic links
    ".smart.link",
    ".app.goo.gl",     # Firebase dynamic links via Google
    ".carrd.co",       # username.carrd.co (one-page link sites)
    ".bandcamp.com",   # artist.bandcamp.com
    ".square.site",    # business.square.site
    ".myshopify.com",  # shop.myshopify.com (e-commerce)
    ".bigcartel.com",
    ".eventbrite.com",
)

# Role-based email local-parts (all lowercase).
# is_role_email() compares these against the whole local part and against
# its first token before any of ". - _ +", so "info.nyc@..." matches "info",
# while hyphenated entries such as "no-reply" only match as the whole local part.
ROLE_PARTS = {
    "info", "contact", "admin", "support", "sales", "hello", "hi", "team",
    "office", "mail", "email", "marketing", "press", "media", "pr",
    "billing", "accounts", "accounting", "help", "service", "customer",
    "noreply", "no-reply", "donotreply", "do-not-reply",
    "webmaster", "postmaster", "abuse", "hr", "jobs", "careers",
    "general", "inquiries", "enquiries", "booking", "bookings",
    "newsletter", "subscribe", "feedback", "events",
}

# Loose email syntax check: a local part of letters/digits/"._%+-", one "@",
# a domain of letters/digits/".-", and a final alphabetic TLD of 2+ letters.
# It does not reject oddities such as consecutive dots in the domain.
EMAIL_RE = re.compile(r"^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$")


def normalize_email(email):
    """Return *email* trimmed of surrounding whitespace and lowercased.

    Falsy input (None or an empty string) yields "".
    """
    if not email:
        return ""
    trimmed = email.strip()
    return trimmed.lower()


# Matches an explicit URL scheme ("http://", "HTTPS://", "ftp://", ...) only
# when it appears at the very start of the string.
_SCHEME_RE = re.compile(r"^[A-Za-z][A-Za-z0-9+.\-]*://")


def host_of(url):
    """Return the bare, lowercase hostname of *url*, or "" if there is none.

    The result has no userinfo ("user:pw@"), no port, no trailing root dot
    and no single leading "www." label, so it can be compared directly with
    the domain blocklists.

    Args:
        url: Raw website cell; may be None, empty, or lack a scheme.

    Returns:
        The normalized hostname, or "" for empty input, ``mailto:`` links
        and values that urlparse rejects.
    """
    if not url:
        return ""
    s = url.strip()
    if s.lower().startswith("mailto:"):
        return ""
    # Only a scheme at the START counts. Checking for "://" anywhere would be
    # fooled by scheme-less URLs that carry another URL in their query string
    # (e.g. "facebook.com/l.php?u=https://...") and yield an empty host.
    if not _SCHEME_RE.match(s):
        # lstrip handles protocol-relative input such as "//example.com".
        s = "http://" + s.lstrip("/")
    try:
        # .hostname (unlike .netloc) already drops userinfo and port and
        # lowercases the result; it is None when no host is present.
        host = urlparse(s).hostname or ""
    except ValueError:
        # urlparse raises ValueError for malformed netlocs (e.g. bad brackets).
        return ""
    # "example.com." and "example.com" name the same host.
    host = host.rstrip(".")
    if host.startswith("www."):
        host = host[4:]
    return host


def is_social(host):
    """Tell whether *host* belongs to a blocked (non-blog) platform.

    A host is blocked when it is an exact member of SOCIAL_DOMAINS or ends
    with one of SOCIAL_HOST_SUFFIXES. An empty host is never blocked.
    """
    if not host:
        return False
    # str.endswith takes the whole tuple of suffixes in a single call.
    return host in SOCIAL_DOMAINS or host.endswith(SOCIAL_HOST_SUFFIXES)


def is_role_email(email):
    """Tell whether *email* is addressed to a role rather than a person.

    The local part (everything before the first "@") is tested both as a
    whole and by its first token before any of ". - _ +" against ROLE_PARTS.
    The comparison is case-sensitive, so callers pass a lowercased address.
    """
    local_part, _, _ = email.partition("@")
    first_token = re.split(r"[.\-_+]", local_part, maxsplit=1)[0]
    return first_token in ROLE_PARTS or local_part in ROLE_PARTS


def main():
    """Merge the input CSVs, dedup by email, and write the three output CSVs.

    Every file in INPUT_FILES is read from OUT_DIR. The first row of each
    file is skipped as a header and columns 0-2 of each remaining row are
    taken as category, website, email. Rows go through these steps in order:

      1. Rows with fewer than 3 columns or an email failing EMAIL_RE are
         counted as ``bad_email`` and dropped.
      2. Rows whose normalized email was already seen are counted as
         ``duplicate_email`` and dropped; the first occurrence wins.
      3. Every remaining row is written to merged.csv.
      4. Rows with a blank website, then rows whose host is social, are dropped.
      5. Role-based emails go to role_based.csv; the rest go to
         phase1_survivors.csv.

    A summary of all counters is printed to stdout; progress lines go to
    stderr.

    Raises:
        OSError: If an input file cannot be read or an output file cannot be
            written. Output files opened so far are still closed.
    """
    header = ["category", "website", "email"]
    seen_emails = set()
    counts = {
        "rows_read": 0, "bad_email": 0, "duplicate_email": 0,
        "merged": 0, "no_website": 0, "social_dropped": 0,
        "role_kept_separate": 0, "survivors": 0,
    }

    # One `with` for all three outputs: they are flushed and closed even when
    # an input file is missing or a row raises midway, instead of leaking.
    with open(f"{OUT_DIR}/merged.csv", "w", newline="", encoding="utf-8") as f_merged, \
            open(f"{OUT_DIR}/role_based.csv", "w", newline="", encoding="utf-8") as f_role, \
            open(f"{OUT_DIR}/phase1_survivors.csv", "w", newline="", encoding="utf-8") as f_surv:
        w_merged = csv.writer(f_merged)
        w_role = csv.writer(f_role)
        w_surv = csv.writer(f_surv)
        for writer in (w_merged, w_role, w_surv):
            writer.writerow(header)

        for fname in INPUT_FILES:
            path = f"{OUT_DIR}/{fname}"
            sys.stderr.write(f"reading {fname}...\n")
            # errors="replace" keeps going on undecodable bytes rather than
            # aborting the whole run.
            with open(path, newline="", encoding="utf-8", errors="replace") as fh:
                reader = csv.reader(fh)
                next(reader, None)  # skip header
                for row in reader:
                    counts["rows_read"] += 1
                    # Short rows (including blank lines) have no email column.
                    if len(row) < 3:
                        counts["bad_email"] += 1
                        continue
                    category, website, email = row[0], row[1], row[2]
                    email_n = normalize_email(email)
                    if not email_n or not EMAIL_RE.match(email_n):
                        counts["bad_email"] += 1
                        continue
                    if email_n in seen_emails:
                        counts["duplicate_email"] += 1
                        continue
                    seen_emails.add(email_n)
                    counts["merged"] += 1
                    w_merged.writerow([category, website, email_n])

                    # Phase 1 filters
                    if not website.strip():
                        counts["no_website"] += 1
                        continue
                    host = host_of(website)
                    if is_social(host):
                        counts["social_dropped"] += 1
                        continue
                    if is_role_email(email_n):
                        counts["role_kept_separate"] += 1
                        w_role.writerow([category, website, email_n])
                        continue
                    counts["survivors"] += 1
                    w_surv.writerow([category, website, email_n])

    print("=== Phase 0+1 complete ===")
    width = max(len(k) for k in counts)
    for k, v in counts.items():
        print(f"  {k.ljust(width)}  {v:>10,}")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()