#!/usr/bin/env python3
"""
CFTC COT Report Downloader

Downloads the CFTC Commitments of Traders CBT Long Form Combined report
and stores historical copies locally with dated filenames.

Usage:
    python cftc_downloader.py

The report is typically published every Friday around 3:30 PM ET.
"""

import os
import re
import sys
from datetime import datetime
from pathlib import Path

import requests
from bs4 import BeautifulSoup

# Configuration
URL = "https://www.cftc.gov/dea/options/deacbtlof.htm"
DATA_DIR = Path(__file__).parent / "data"
LOG_FILE = DATA_DIR / "download_log.txt"

# Headers to avoid 403 errors
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Matches dates like "February 17, 2026" or "January 5, 2026".
# Compiled once at import time so repeated calls reuse the same pattern object.
_REPORT_DATE_RE = re.compile(
    r"(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2}),\s+(\d{4})"
)


def extract_report_date(html_content: str) -> str | None:
    """
    Extract the report date from the HTML content.

    Scans for patterns like "February 17, 2026" and returns the first one
    that is a real calendar date.

    Args:
        html_content: Text of the downloaded report page.

    Returns:
        The date formatted as "YYYY-MM-DD", or None when the content holds
        no valid long-form date.
    """
    for match in _REPORT_DATE_RE.finditer(html_content):
        month_name, day, year = match.groups()
        # Rebuild the string with single spaces so the strptime format
        # applies no matter how the whitespace looked in the page.
        date_str = f"{month_name} {day}, {year}"
        try:
            date_obj = datetime.strptime(date_str, "%B %d, %Y")
        except ValueError:
            # Regex matched an impossible date (e.g. "February 30, 2026");
            # keep looking instead of giving up on the whole document.
            continue
        return date_obj.strftime("%Y-%m-%d")
    return None


def download_report() -> tuple[bool, str]:
    """
    Download the CFTC report and save it with a dated filename.

    The page at URL is fetched, its report date is extracted, and the HTML
    is written to DATA_DIR as "<date>_deacbtlof.htm". A successful save is
    appended to the download log and, when the optional ``app`` package is
    importable, handed to its importer.

    Returns:
        tuple: (success: bool, message: str). ``success`` is False when the
        download fails, no report date can be extracted, or a file for that
        report date already exists.
    """
    # Ensure data directory exists (parents=True keeps this working if
    # DATA_DIR is ever pointed at a nested location).
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    # Download the page
    print(f"Downloading from {URL}...")
    try:
        response = requests.get(URL, headers=HEADERS, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        return False, f"Download failed: {e}"

    html_content = response.text

    # Extract the report date
    report_date = extract_report_date(html_content)
    if not report_date:
        return False, "Could not extract report date from content"

    print(f"Report date: {report_date}")

    # Create filename
    filename = f"{report_date}_deacbtlof.htm"
    filepath = DATA_DIR / filename

    # Check if already downloaded.
    # NOTE: this is reported as a non-success, so main() exits with 1 when
    # the same report is fetched twice.
    if filepath.exists():
        return False, f"Report for {report_date} already exists: {filepath}"

    # Save the file
    filepath.write_text(html_content, encoding="utf-8")
    print(f"Saved to: {filepath}")

    # Log the download
    log_download(report_date, filepath)

    # Import into database
    try:
        from app.ingestion.importer import import_html_file

        result = import_html_file(str(filepath))
        if result.error:
            print(f"DB import warning: {result.error}")
        else:
            print(f"Imported to DB: {result.rows_inserted} rows inserted")
    except ImportError:
        pass  # app package not available, skip DB import

    return True, f"Successfully downloaded report for {report_date}"


def log_download(report_date: str, filepath: Path) -> None:
    """
    Log the download to the log file.

    Appends one line to LOG_FILE holding the current local timestamp, the
    report date and the saved file's name.

    Args:
        report_date: Report date string as used in the saved filename.
        filepath: Path of the saved report; only its name is logged.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log_entry = f"{timestamp} | Report date: {report_date} | File: {filepath.name}\n"

    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(log_entry)


def list_downloads() -> None:
    """
    List all downloaded reports.

    Prints the name and size in KB of every "*_deacbtlof.htm" file in
    DATA_DIR, sorted by path, or a short notice when there are none.
    """
    if not DATA_DIR.exists():
        print("No downloads yet.")
        return

    files = sorted(DATA_DIR.glob("*_deacbtlof.htm"))

    if not files:
        print("No downloaded reports found.")
        return

    print(f"\nDownloaded reports ({len(files)} files):")
    print("-" * 40)
    for path in files:
        size_kb = path.stat().st_size / 1024
        print(f" {path.name} ({size_kb:.1f} KB)")


def main() -> int:
    """
    Main entry point.

    Runs one download attempt, prints its outcome, then lists the reports
    currently stored.

    Returns:
        Process exit code: 0 when download_report() reports success, else 1.
    """
    print("CFTC COT Report Downloader")
    print("=" * 40)

    success, message = download_report()
    print(f"\n{message}")

    # Show current downloads
    list_downloads()

    return 0 if success else 1


if __name__ == "__main__":
    sys.exit(main())