Greg 37f8eac932 Initial commit: CFTC COT Explorer
FastAPI application that ingests CFTC Commitments of Traders data into SQLite
and exposes it via a REST API with analytics endpoints (screener, percentile rank,
concentration). Includes CLI for historical and weekly data ingestion, Docker setup,
and a frontend.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-22 11:23:00 +01:00

619 lines
22 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
CFTC COT Report Parser
Parses the fixed-width text format used by CFTC for Commitments of Traders
Long Reports. The format uses ':' as group separators within each data line.
Handles both:
- Weekly HTML files (text wrapped in <pre> tag)
- Historical ZIP files (.txt files, same format without HTML wrapper)
"""
import re
import zipfile
from dataclasses import dataclass, field
from datetime import date
from pathlib import Path
from typing import Iterator, Optional
# --- Module-level patterns and lookup tables ---------------------------------

# Commodity header line: text starting in column 0, a gap of two or more
# whitespace characters, then 'Code-<digits>' at the end of the line.
COMMODITY_HEADER_RE = re.compile(r'^(\S.+?)\s{2,}Code-(\d+)\s*$')

# Long-form date with a full English month name, e.g. 'February 17, 2026'.
DATE_RE = re.compile(
    r'(January|February|March|April|May|June|July|August|September|October|November|December)'
    r'\s+(\d{1,2}),\s+(\d{4})'
)

# Upper-cased exchange name -> short code.  Names are listed both with and
# without a trailing period where both spellings are expected.
EXCHANGE_ABBR = {
    'CHICAGO BOARD OF TRADE': 'CBT',
    'CHICAGO MERCANTILE EXCHANGE': 'CME',
    'NEW YORK MERCANTILE EXCHANGE': 'NYMEX',
    'COMMODITY EXCHANGE INC': 'COMEX',
    'COMMODITY EXCHANGE INC.': 'COMEX',
    'ICE FUTURES U.S.': 'ICE',
    'ICE FUTURES U.S': 'ICE',
    'ICE FUTURES EUROPE': 'ICE-EU',
    'KANSAS CITY BOARD OF TRADE': 'KCBT',
    'MINNEAPOLIS GRAIN EXCHANGE': 'MGE',
}
def _parse_date(text: str) -> Optional[str]:
    """Extract an ISO date string from text like 'February 17, 2026'.

    Only the first date matched by ``DATE_RE`` is considered.

    Returns:
        The date formatted as ``YYYY-MM-DD``, or ``None`` when ``text``
        contains no date or the matched values do not form a real calendar
        date (for example 'February 30, 2026').
    """
    m = DATE_RE.search(text)
    if m is None:
        return None
    month_name, day, year = m.groups()
    months = (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    )
    try:
        return date(int(year), months.index(month_name) + 1, int(day)).isoformat()
    except ValueError:
        # date() rejects impossible day/year values; tuple.index() would also
        # raise ValueError for an unknown month name.
        return None
def _nums(s: str, as_float: bool = False) -> list:
"""Parse whitespace-separated numbers from a string, stripping commas."""
clean = s.replace(',', '').strip()
if not clean:
return []
result = []
for tok in clean.split():
try:
result.append(float(tok) if as_float else int(float(tok)))
except ValueError:
pass
return result
def _parse_position_line(line: str, as_float: bool = False) -> tuple[str, list]:
    """
    Split a ':'-delimited data line into its row label and its numbers.

    Example input:
    'All : 544,127: 117,677 175,249 205,702 184,989 124,796 508,367 505,746: 35,760 38,381'

    The text before the first ':' is the label; every following ':'-separated
    group is handed to _nums() and the results are concatenated. For a full
    row that yields 10 values:
    [open_interest, noncomm_long, noncomm_short, spreading,
     comm_long, comm_short, total_long, total_short,
     nonrept_long, nonrept_short]

    Returns (row_label, flat_list_of_values).
    """
    head, _, tail = line.partition(':')
    numbers = [
        number
        for group in tail.split(':')
        for number in _nums(group, as_float=as_float)
    ]
    return head.strip(), numbers
def _parse_trader_line(line: str) -> tuple[str, list]:
    """
    Split a trader-count line into its row label and integer counts.

    Example input:
    'All : 375: 122 119 146 105 104 309 296:'

    Returns (label, [total, noncomm_long, noncomm_short, spread,
                     comm_long, comm_short, total_long, total_short]).
    """
    row_label, _, remainder = line.partition(':')
    # The remaining ':' separators only delimit groups of numbers, so turning
    # them into blanks lets a single _nums() call collect every value in order.
    counts = _nums(remainder.replace(':', ' '))
    return row_label.strip(), counts
def _parse_concentration_line(line: str) -> tuple[str, list]:
    """
    Split a concentration line into its row label and float percentages.

    Example input:
    'All : 12.5 11.6 21.6 20.1 9.2 7.2 15.3 12.5'

    Returns (label, [gross_long_4, gross_short_4, gross_long_8, gross_short_8,
                     net_long_4, net_short_4, net_long_8, net_short_8]).
    Raises ValueError when the line contains no ':'.
    """
    # Split on the first colon only; any further colons (header artefacts)
    # stay in the remainder, where tokens that are not numbers are ignored.
    row_label, remainder = line.split(':', 1)
    return row_label.strip(), _nums(remainder, as_float=True)
@dataclass
class PositionRow:
    """Integer position values for one report row; every field defaults to None."""

    # Field order matches the value order of a parsed position line.
    open_interest: Optional[int] = None

    # Non-commercial columns
    noncomm_long: Optional[int] = None
    noncomm_short: Optional[int] = None
    noncomm_spreading: Optional[int] = None

    # Commercial columns
    comm_long: Optional[int] = None
    comm_short: Optional[int] = None

    # Total columns
    total_long: Optional[int] = None
    total_short: Optional[int] = None

    # Non-reportable columns
    nonrept_long: Optional[int] = None
    nonrept_short: Optional[int] = None
@dataclass
class ChangesRow:
    """Integer change values, one ``chg_*`` field per position column; all default to None."""

    chg_open_interest: Optional[int] = None

    # Non-commercial columns
    chg_noncomm_long: Optional[int] = None
    chg_noncomm_short: Optional[int] = None
    chg_noncomm_spreading: Optional[int] = None

    # Commercial columns
    chg_comm_long: Optional[int] = None
    chg_comm_short: Optional[int] = None

    # Total columns
    chg_total_long: Optional[int] = None
    chg_total_short: Optional[int] = None

    # Non-reportable columns
    chg_nonrept_long: Optional[int] = None
    chg_nonrept_short: Optional[int] = None
@dataclass
class PctRow:
    """Float percentage values, one ``pct_*`` field per position column; all default to None."""

    pct_open_interest: Optional[float] = None

    # Non-commercial columns
    pct_noncomm_long: Optional[float] = None
    pct_noncomm_short: Optional[float] = None
    pct_noncomm_spreading: Optional[float] = None

    # Commercial columns
    pct_comm_long: Optional[float] = None
    pct_comm_short: Optional[float] = None

    # Total columns
    pct_total_long: Optional[float] = None
    pct_total_short: Optional[float] = None

    # Non-reportable columns
    pct_nonrept_long: Optional[float] = None
    pct_nonrept_short: Optional[float] = None
@dataclass
class TraderRow:
    """Integer trader counts for one report row; every field defaults to None."""

    # Field order matches the value order of a parsed traders line.
    traders_total: Optional[int] = None

    # Non-commercial columns
    traders_noncomm_long: Optional[int] = None
    traders_noncomm_short: Optional[int] = None
    traders_noncomm_spread: Optional[int] = None

    # Commercial columns
    traders_comm_long: Optional[int] = None
    traders_comm_short: Optional[int] = None

    # Total columns
    traders_total_long: Optional[int] = None
    traders_total_short: Optional[int] = None
@dataclass
class ConcentrationRow:
    """Float concentration percentages for one report row; every field defaults to None."""

    # Gross-position columns (the _4 and _8 variants)
    conc_gross_long_4: Optional[float] = None
    conc_gross_short_4: Optional[float] = None
    conc_gross_long_8: Optional[float] = None
    conc_gross_short_8: Optional[float] = None

    # Net-position columns (the _4 and _8 variants)
    conc_net_long_4: Optional[float] = None
    conc_net_short_4: Optional[float] = None
    conc_net_long_8: Optional[float] = None
    conc_net_short_8: Optional[float] = None
@dataclass
class CommodityBlock:
    """Everything parsed for one commodity on one report date."""

    # Identification (required)
    cftc_code: str
    name: str
    exchange: str
    exchange_abbr: str
    contract_unit: str

    # Dates as strings; the previous report date may be unknown
    report_date: str
    prev_report_date: Optional[str]

    # Parsed sections.  The dict-valued ones are keyed by row type.
    positions: dict = field(default_factory=dict)  # row_type -> PositionRow
    changes: Optional[ChangesRow] = None
    percentages: dict = field(default_factory=dict)  # row_type -> PctRow
    traders: dict = field(default_factory=dict)  # row_type -> TraderRow
    concentration: dict = field(default_factory=dict)  # row_type -> ConcentrationRow
def _assign_position_values(values: list, as_float: bool = False) -> dict:
"""Map a 10-value list to position field names."""
keys = ['open_interest', 'noncomm_long', 'noncomm_short', 'noncomm_spreading',
'comm_long', 'comm_short', 'total_long', 'total_short',
'nonrept_long', 'nonrept_short']
return {k: values[i] if i < len(values) else None for i, k in enumerate(keys)}
def _parse_block(lines: list[str]) -> Optional[CommodityBlock]:
    """Parse the lines of a single commodity block into a CommodityBlock.

    ``lines[0]`` must match ``COMMODITY_HEADER_RE`` ('NAME - EXCHANGE ...
    Code-XXXXXX') and ``lines[1]`` must contain the report date.  The remaining
    lines are walked with a small state machine: section-title lines switch
    the current section, and data lines are stored under their row label
    ('All', 'Old' or 'Other').  The changes row is the first blank-label line
    carrying numbers after the 'Changes in Commitments from' title.

    Returns:
        The parsed block, or None when ``lines`` is empty, the header or the
        report date cannot be parsed, or no position rows were found.
    """
    if not lines:
        return None

    # --- Header line (line 0): NAME - EXCHANGE ... Code-XXXXXX ---
    m = COMMODITY_HEADER_RE.match(lines[0].rstrip())
    if not m:
        return None
    full_name = m.group(1).strip()
    cftc_code = m.group(2)

    # Split "NAME - EXCHANGE" on the first " - "; no separator means no exchange.
    name, _, exchange = full_name.partition(' - ')
    name = name.strip()
    exchange = exchange.strip()
    # Unknown exchanges fall back to the first six characters, upper-cased,
    # with blanks removed.
    exchange_abbr = EXCHANGE_ABBR.get(exchange.upper(), exchange[:6].upper().replace(' ', ''))

    # --- Report date line (line 1) ---
    report_date = _parse_date(lines[1]) if len(lines) > 1 else None
    if not report_date:
        return None

    row_labels = ('All', 'Old', 'Other')
    pct_keys = (
        'pct_open_interest', 'pct_noncomm_long', 'pct_noncomm_short',
        'pct_noncomm_spreading', 'pct_comm_long', 'pct_comm_short',
        'pct_total_long', 'pct_total_short', 'pct_nonrept_long', 'pct_nonrept_short',
    )
    chg_keys = (
        'chg_open_interest', 'chg_noncomm_long', 'chg_noncomm_short',
        'chg_noncomm_spreading', 'chg_comm_long', 'chg_comm_short',
        'chg_total_long', 'chg_total_short', 'chg_nonrept_long', 'chg_nonrept_short',
    )
    trader_keys = (
        'traders_total', 'traders_noncomm_long', 'traders_noncomm_short',
        'traders_noncomm_spread', 'traders_comm_long', 'traders_comm_short',
        'traders_total_long', 'traders_total_short',
    )

    def _fill(keys, vals):
        """Pair keys with vals positionally; missing trailing values become None."""
        return {k: vals[i] if i < len(vals) else None for i, k in enumerate(keys)}

    contract_unit = ''
    prev_report_date = None
    positions: dict = {}
    changes: Optional[ChangesRow] = None
    percentages: dict = {}
    traders: dict = {}
    concentration: dict = {}

    # State machine: the block starts in the positions section.
    section = 'POSITIONS'
    for line in lines[2:]:
        stripped = line.strip()

        # Skip pure separator / empty lines
        if not stripped or stripped.startswith('---') or stripped == ':':
            continue

        # Contract unit, e.g. '(CONTRACTS OF 5,000 BUSHELS)'
        if '(CONTRACTS OF' in line:
            m2 = re.search(r'\(CONTRACTS OF[^)]+\)', line)
            if m2:
                contract_unit = m2.group(0)
            continue

        # Section triggers
        if 'Changes in Commitments from' in line:
            prev_report_date = _parse_date(line)
            section = 'CHANGES'
            continue
        if 'Percent of Open Interest Represented' in line:
            section = 'PERCENT'
            continue
        if '# Traders' in line or 'Number of Traders in Each Category' in line:
            section = 'TRADERS'
            continue
        if 'Percent of Open Interest Held by' in line:
            section = 'CONCENTRATION'
            continue

        # Skip other header/label-only lines
        if ':' not in line:
            continue

        label_part = line.split(':')[0].strip()

        if section == 'CHANGES':
            # The changes row has a blank label.  Blank-label lines without
            # numbers are spacer lines and leave the section open.
            if label_part == '':
                _, vals = _parse_position_line(line, as_float=False)
                if vals:
                    changes = ChangesRow(**_fill(chg_keys, vals))
                    # Only one changes row is read per block.
                    section = 'CHANGES_DONE'
            continue

        # Every other section only stores rows labelled All / Old / Other.
        if label_part not in row_labels:
            continue

        if section == 'POSITIONS':
            _, vals = _parse_position_line(line, as_float=False)
            if vals:
                positions[label_part] = PositionRow(**_assign_position_values(vals))
        elif section == 'PERCENT':
            _, vals = _parse_position_line(line, as_float=True)
            if vals:
                percentages[label_part] = PctRow(**_fill(pct_keys, vals))
        elif section == 'TRADERS':
            _, vals = _parse_trader_line(line)
            if vals:
                traders[label_part] = TraderRow(**_fill(trader_keys, vals))
        elif section == 'CONCENTRATION':
            _, vals = _parse_concentration_line(line)
            # Unlike the other sections, a concentration row is stored only
            # when all eight values are present.
            if len(vals) >= 8:
                concentration[label_part] = ConcentrationRow(
                    conc_gross_long_4=vals[0],
                    conc_gross_short_4=vals[1],
                    conc_gross_long_8=vals[2],
                    conc_gross_short_8=vals[3],
                    conc_net_long_4=vals[4],
                    conc_net_short_4=vals[5],
                    conc_net_long_8=vals[6],
                    conc_net_short_8=vals[7],
                )

    if not positions:
        return None

    return CommodityBlock(
        cftc_code=cftc_code,
        name=name,
        exchange=exchange,
        exchange_abbr=exchange_abbr,
        contract_unit=contract_unit,
        report_date=report_date,
        prev_report_date=prev_report_date,
        positions=positions,
        changes=changes,
        percentages=percentages,
        traders=traders,
        concentration=concentration,
    )
def parse_text_blocks(text: str) -> Iterator[CommodityBlock]:
    """
    Yield one CommodityBlock per commodity section found in raw report text.

    A new section begins at every line matching the commodity header pattern.
    Lines before the first header are collected as well and handed to
    _parse_block(), which returns None for them; sections that fail to parse
    are silently dropped.
    """
    pending: list[str] = []
    for raw_line in text.splitlines():
        starts_block = COMMODITY_HEADER_RE.match(raw_line.rstrip()) is not None
        if not starts_block:
            pending.append(raw_line)
            continue
        # A header closes whatever was collected so far.
        if pending:
            parsed = _parse_block(pending)
            if parsed is not None:
                yield parsed
        pending = [raw_line]

    # Flush the final section.
    if pending:
        parsed = _parse_block(pending)
        if parsed is not None:
            yield parsed
def extract_text_from_html(html: str) -> str:
    """Return the text of the first <pre> element of a CFTC HTML page.

    When the document has no <pre> element, the input is returned with
    everything that looks like an HTML tag removed.
    """
    from bs4 import BeautifulSoup

    pre_tag = BeautifulSoup(html, 'html.parser').find('pre')
    if not pre_tag:
        # Fallback: strip HTML tags
        return re.sub(r'<[^>]+>', '', html)
    return pre_tag.get_text()
def parse_html_file(path: str) -> Iterator[CommodityBlock]:
    """Read a weekly CFTC HTML file (decoded as latin-1) and yield its commodity blocks."""
    html = Path(path).read_text(encoding='latin-1')
    report_text = extract_text_from_html(html)
    for block in parse_text_blocks(report_text):
        yield block
def parse_zip_file(zip_path: str) -> Iterator[CommodityBlock]:
    """
    Yield the commodity blocks found in every '.txt' member of a CFTC ZIP archive.

    Each member is decoded as latin-1.  A member whose first non-blank text is
    a quoted '"Market...' header is treated as CSV ('annualof.txt' or similar);
    anything else goes through the fixed-width text parser.
    """
    with zipfile.ZipFile(zip_path) as archive:
        for member in archive.namelist():
            if not member.lower().endswith('.txt'):
                continue
            with archive.open(member) as handle:
                text = handle.read().decode('latin-1')
            # Detect CSV by checking for the quoted header on the first line
            looks_like_csv = text.lstrip().startswith('"Market')
            parser = parse_csv_text if looks_like_csv else parse_text_blocks
            yield from parser(text)
# ── CSV format (historical annual ZIPs) ────────────────────────────────────
# Map CSV column name suffixes to our field names
_POS_FIELDS = {
'Open Interest': 'open_interest',
'Noncommercial Positions-Long': 'noncomm_long',
'Noncommercial Positions-Short': 'noncomm_short',
'Noncommercial Positions-Spreading': 'noncomm_spreading',
'Commercial Positions-Long': 'comm_long',
'Commercial Positions-Short': 'comm_short',
'Total Reportable Positions-Long': 'total_long',
'Total Reportable Positions-Short': 'total_short',
'Nonreportable Positions-Long': 'nonrept_long',
'Nonreportable Positions-Short': 'nonrept_short',
}
_CHG_FIELDS = {
'Change in Open Interest': 'chg_open_interest',
'Change in Noncommercial-Long': 'chg_noncomm_long',
'Change in Noncommercial-Short': 'chg_noncomm_short',
'Change in Noncommercial-Spreading': 'chg_noncomm_spreading',
'Change in Commercial-Long': 'chg_comm_long',
'Change in Commercial-Short': 'chg_comm_short',
'Change in Total Reportable-Long': 'chg_total_long',
'Change in Total Reportable-Short': 'chg_total_short',
'Change in Nonreportable-Long': 'chg_nonrept_long',
'Change in Nonreportable-Short': 'chg_nonrept_short',
}
_PCT_FIELDS = {
'% of Open Interest (OI)': 'pct_open_interest',
'% of OI-Noncommercial-Long': 'pct_noncomm_long',
'% of OI-Noncommercial-Short': 'pct_noncomm_short',
'% of OI-Noncommercial-Spreading': 'pct_noncomm_spreading',
'% of OI-Commercial-Long': 'pct_comm_long',
'% of OI-Commercial-Short': 'pct_comm_short',
'% of OI-Total Reportable-Long': 'pct_total_long',
'% of OI-Total Reportable-Short': 'pct_total_short',
'% of OI-Nonreportable-Long': 'pct_nonrept_long',
'% of OI-Nonreportable-Short': 'pct_nonrept_short',
}
_TRD_FIELDS = {
'Traders-Total': 'traders_total',
'Traders-Noncommercial-Long': 'traders_noncomm_long',
'Traders-Noncommercial-Short':'traders_noncomm_short',
'Traders-Noncommercial-Spreading': 'traders_noncomm_spread',
'Traders-Commercial-Long': 'traders_comm_long',
'Traders-Commercial-Short': 'traders_comm_short',
'Traders-Total Reportable-Long': 'traders_total_long',
'Traders-Total Reportable-Short': 'traders_total_short',
}
_CONC_FIELDS = {} # populated dynamically — column names are inconsistent
def _csv_val(row: dict, key: str, as_float: bool = False):
"""Get a value from a CSV row by key prefix match, stripping whitespace."""
# Try exact key first, then strip leading/trailing spaces from all keys
for k, v in row.items():
if k.strip() == key.strip():
v = v.strip()
if not v:
return None
try:
return float(v) if as_float else int(float(v))
except ValueError:
return None
return None
def _build_position_row_from_csv(row: dict, suffix: str) -> PositionRow:
    """Build a PositionRow from the '<column> (<suffix>)' cells of a CSV row.

    Columns that are absent or not numeric leave the field as None.
    """
    values = {
        attr: _csv_val(row, f'{column} ({suffix})')
        for column, attr in _POS_FIELDS.items()
    }
    return PositionRow(**values)
def _build_changes_from_csv(row: dict) -> ChangesRow:
    """Build a ChangesRow from the '<column> (All)' change cells of a CSV row."""
    values = {
        attr: _csv_val(row, f'{column} (All)')
        for column, attr in _CHG_FIELDS.items()
    }
    return ChangesRow(**values)
def _build_pct_row_from_csv(row: dict, suffix: str) -> PctRow:
    """Build a PctRow from the percent-of-open-interest cells of a CSV row.

    Each column is first looked up as '<prefix> (<suffix>)'.  Percent columns
    are not spaced consistently between All and Old/Other, so when that
    lookup finds nothing the column is matched again ignoring all whitespace
    and letter case (e.g. '...(OI)(Old)' matches '...(OI) (Old)').

    Args:
        row: One CSV row (column name -> cell text).
        suffix: 'All', 'Old' or 'Other'.
    """
    def _squash(text: str) -> str:
        # Drop every whitespace character and fold case.
        return ''.join(text.split()).lower()

    # Whitespace-insensitive index of the header names actually present.
    # Non-string keys (DictReader's None key for surplus cells) are ignored.
    by_squashed: dict = {}
    for column in row:
        if isinstance(column, str):
            by_squashed.setdefault(_squash(column), column)

    kwargs = {}
    for prefix, field_name in _PCT_FIELDS.items():
        wanted = f'{prefix} ({suffix})'
        val = _csv_val(row, wanted, as_float=True)
        if val is None:
            actual = by_squashed.get(_squash(wanted))
            if actual is not None:
                val = _csv_val(row, actual, as_float=True)
        kwargs[field_name] = val
    return PctRow(**kwargs)
def _build_trader_row_from_csv(row: dict, suffix: str) -> TraderRow:
    """Build a TraderRow from the '<column> (<suffix>)' trader-count cells of a CSV row."""
    values = {
        attr: _csv_val(row, f'{column} ({suffix})')
        for column, attr in _TRD_FIELDS.items()
    }
    return TraderRow(**values)
def _build_concentration_from_csv(row: dict, suffix: str) -> ConcentrationRow:
    """
    Build a ConcentrationRow from the concentration cells of a CSV row.

    Concentration columns have inconsistent spacing in CFTC CSVs, e.g.:
    'Concentration-Gross LT = 4 TDR-Long (All)'
    'Concentration-Gross LT =4 TDR-Short (All)'
    so columns are matched after removing all whitespace and lower-casing.

    Cells that are missing, blank or not numeric become None.  Ragged rows are
    tolerated: non-string keys and non-string cells are treated as missing.
    """
    import re as _re

    def _norm(s: str) -> str:
        return _re.sub(r'\s+', '', s).lower()

    # Build a normalized lookup for this row, skipping the None key that
    # csv.DictReader uses for surplus cells.
    norm_row = {_norm(k): v for k, v in row.items() if isinstance(k, str)}

    def _get(pattern: str):
        cell = norm_row.get(_norm(pattern + f'({suffix})'))
        if not isinstance(cell, str):
            return None
        cell = cell.strip()
        if not cell:
            return None
        try:
            return float(cell)
        except ValueError:
            return None

    return ConcentrationRow(
        conc_gross_long_4=_get('Concentration-Gross LT =4 TDR-Long '),
        conc_gross_short_4=_get('Concentration-Gross LT =4 TDR-Short '),
        conc_gross_long_8=_get('Concentration-Gross LT =8 TDR-Long '),
        conc_gross_short_8=_get('Concentration-Gross LT =8 TDR-Short '),
        conc_net_long_4=_get('Concentration-Net LT =4 TDR-Long '),
        conc_net_short_4=_get('Concentration-Net LT =4 TDR-Short '),
        conc_net_long_8=_get('Concentration-Net LT =8 TDR-Long '),
        conc_net_short_8=_get('Concentration-Net LT =8 TDR-Short '),
    )
def _csv_row_to_block(row: dict) -> Optional[CommodityBlock]:
    """Convert one CSV row (one commodity on one date) to a CommodityBlock.

    Returns None when the market name, the ISO report date or the CFTC
    contract code is missing or blank.  ``contract_unit`` is always '' and
    ``prev_report_date`` is always None on this path.  Position, percentage,
    trader and concentration rows are built for 'All', 'Old' and 'Other' even
    when the row holds no data for them.
    """
    def _text(column: str) -> str:
        # A short row makes csv.DictReader fill the missing cells with None.
        cell = row.get(column)
        return cell.strip() if isinstance(cell, str) else ''

    full_name = _text('Market and Exchange Names')
    report_date = _text('As of Date in Form YYYY-MM-DD')
    cftc_code = _text('CFTC Contract Market Code')
    if not full_name or not report_date or not cftc_code:
        return None

    # Split "NAME - EXCHANGE" on the first " - "; no separator means no exchange.
    name, _, exchange = full_name.partition(' - ')
    name = name.strip()
    exchange = exchange.strip()
    exchange_abbr = EXCHANGE_ABBR.get(exchange.upper(),
                                      exchange[:6].upper().replace(' ', ''))

    positions = {}
    percentages = {}
    traders = {}
    concentration = {}
    # The column-name suffix doubles as the row label.
    for label in ('All', 'Old', 'Other'):
        positions[label] = _build_position_row_from_csv(row, label)
        percentages[label] = _build_pct_row_from_csv(row, label)
        traders[label] = _build_trader_row_from_csv(row, label)
        concentration[label] = _build_concentration_from_csv(row, label)

    changes = _build_changes_from_csv(row)

    return CommodityBlock(
        cftc_code=cftc_code,
        name=name,
        exchange=exchange,
        exchange_abbr=exchange_abbr,
        contract_unit='',
        report_date=report_date,
        prev_report_date=None,
        positions=positions,
        changes=changes,
        percentages=percentages,
        traders=traders,
        concentration=concentration,
    )
def parse_csv_text(text: str) -> Iterator[CommodityBlock]:
    """Parse a CFTC historical CSV file (annualof.txt format).

    Yields one CommodityBlock per data row; rows for which
    _csv_row_to_block() returns None are skipped.
    """
    import csv as _csv
    import io as _io

    # Feed csv an untranslated text stream (newline='') rather than
    # text.splitlines(): splitlines() also breaks on characters such as
    # '\x0c' and '\x85', and it splits quoted fields that contain newlines.
    reader = _csv.DictReader(_io.StringIO(text, newline=''))
    for row in reader:
        block = _csv_row_to_block(row)
        if block:
            yield block