# COTexplorer/cftc_downloader.py

#!/usr/bin/env python3
"""
CFTC COT Report Downloader

Downloads the CFTC Commitments of Traders CBT Long Form Combined report
and stores historical copies locally with dated filenames.

Usage:
    python cftc_downloader.py

The report is typically published every Friday around 3:30 PM ET.
"""

import os
import re
import sys
from datetime import datetime
from pathlib import Path

import requests
from bs4 import BeautifulSoup

# Configuration: report source and local storage locations
URL = "https://www.cftc.gov/dea/options/deacbtlof.htm"
DATA_DIR = Path(__file__).parent.joinpath("data")
LOG_FILE = DATA_DIR.joinpath("download_log.txt")

# Browser-style request headers, sent to avoid 403 errors
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
}


def extract_report_date(html_content: str) -> str | None:
    """
    Extract the report date from the HTML content.
    Looks for patterns like "February 17, 2026" in the report.
    """
    # Pattern matches dates like "February 17, 2026" or "January 5, 2026"
    pattern = r"(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2}),\s+(\d{4})"
    match = re.search(pattern, html_content)

    if match:
        month_name, day, year = match.groups()
        # Convert to date object for formatting
        date_str = f"{month_name} {day}, {year}"
        try:
            date_obj = datetime.strptime(date_str, "%B %d, %Y")
            return date_obj.strftime("%Y-%m-%d")
        except ValueError:
            return None
    return None


def download_report() -> tuple[bool, str]:
    """
    Fetch the CFTC report page and store it under a date-stamped filename.

    The page at URL is requested, the report date is read from its text,
    and the HTML is written to DATA_DIR as "<date>_deacbtlof.htm". A newly
    saved file is recorded in the download log and then, if the optional
    ``app`` package can be imported, handed to its HTML importer.

    Returns:
        tuple: (success: bool, message: str). success is False when the
        request fails, no report date can be extracted, or a file for
        that report date already exists.
    """
    # The storage directory must exist before anything is written
    DATA_DIR.mkdir(exist_ok=True)

    # Fetch the page
    print(f"Downloading from {URL}...")
    try:
        page = requests.get(URL, headers=HEADERS, timeout=30)
        page.raise_for_status()
    except requests.RequestException as exc:
        return False, f"Download failed: {exc}"

    html = page.text

    # The report date drives the filename, so give up without one
    report_date = extract_report_date(html)
    if not report_date:
        return False, "Could not extract report date from content"
    print(f"Report date: {report_date}")

    # Skip reports that were already stored
    target = DATA_DIR / f"{report_date}_deacbtlof.htm"
    if target.exists():
        return False, f"Report for {report_date} already exists: {target}"

    # Persist the page and record the download
    target.write_text(html, encoding="utf-8")
    print(f"Saved to: {target}")
    log_download(report_date, target)

    # Database import is optional: it is skipped when the app package is
    # not importable
    try:
        from app.ingestion.importer import import_html_file
        outcome = import_html_file(str(target))
        if not outcome.error:
            print(f"Imported to DB: {outcome.rows_inserted} rows inserted")
        else:
            print(f"DB import warning: {outcome.error}")
    except ImportError:
        pass

    return True, f"Successfully downloaded report for {report_date}"


def log_download(report_date: str, filepath: Path) -> None:
    """Append a timestamped record of a saved report to LOG_FILE."""
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    fields = (stamp, f"Report date: {report_date}", f"File: {filepath.name}")

    # Append mode keeps earlier entries; one line per download
    with LOG_FILE.open("a", encoding="utf-8") as log:
        log.write(" | ".join(fields) + "\n")


def list_downloads() -> None:
    """Print the name and size of every saved report in DATA_DIR."""
    if not DATA_DIR.exists():
        print("No downloads yet.")
        return

    # Filenames start with the ISO report date, so sorting orders them by date
    reports = sorted(DATA_DIR.glob("*_deacbtlof.htm"))
    if not reports:
        print("No downloaded reports found.")
        return

    heading = f"\nDownloaded reports ({len(reports)} files):"
    print(heading)
    print("-" * 40)
    for report in reports:
        kilobytes = report.stat().st_size / 1024
        print(f"  {report.name} ({kilobytes:.1f} KB)")


def main():
    """Run one download attempt, list stored reports, and return an exit code."""
    print("CFTC COT Report Downloader")
    print("=" * 40)

    ok, summary = download_report()
    print(f"\n{summary}")

    # The listing is shown whether or not the download succeeded
    list_downloads()

    if ok:
        return 0
    return 1


# Run the downloader only when executed as a script; the value returned by
# main() becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())