From 4e4f37c9fc4e7436159a3ada01f9972633cabb22 Mon Sep 17 00:00:00 2001 From: Alex Fox Date: Sun, 25 May 2025 00:04:48 +0000 Subject: [PATCH] evade tls fingerprinting --- quickstart_etl/assets/nyboe.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/quickstart_etl/assets/nyboe.py b/quickstart_etl/assets/nyboe.py index 0a9ef27..190a748 100644 --- a/quickstart_etl/assets/nyboe.py +++ b/quickstart_etl/assets/nyboe.py @@ -2,7 +2,7 @@ import os from io import StringIO import pandas as pd -import requests +import curl_cffi import dagster as dg import datetime @@ -12,7 +12,7 @@ from sqlalchemy import create_engine IE_ENDPOINT = "https://publicreporting.elections.ny.gov/IndependentExpenditure" -def get_cookies(s: requests.Session, from_date: datetime.date, to_date: datetime.date): +def get_cookies(s: curl_cffi.Session, from_date: datetime.date, to_date: datetime.date): """Fetch cookies into session""" cookie_postdata = { 'lstUCOfficeType': '0', @@ -31,7 +31,11 @@ def get_cookies(s: requests.Session, from_date: datetime.date, to_date: datetime 'ddlSearchBy': 'All' } - return s.post(f"{IE_ENDPOINT}/BindIndExpData/", json=cookie_postdata) + return s.post( + f"{IE_ENDPOINT}/BindIndExpData/", + data=cookie_postdata, + impersonate="chrome", + ) def gen_ie_query(from_date: datetime.date, to_date: datetime.date): """Fill in query parameters for independent expenditures and date range""" @@ -58,7 +62,7 @@ def gen_ie_query(from_date: datetime.date, to_date: datetime.date): } NY_DAILY_PARTITION = dg.DailyPartitionsDefinition( - start_date="2025-01-01", + start_date="2024-01-01", timezone="America/New_York", ) @@ -72,19 +76,20 @@ def fetch_expenditures(context: dg.AssetExecutionContext) -> dg.MaterializeResul end_date = pendulum.parse(context.partition_key).subtract(days=1) start_date = end_date.subtract(days=1) - with requests.Session() as s: + with curl_cffi.Session() as s: res = get_cookies(s, start_date, end_date) if not res.json()["aaData"]: return None req = s.get(f"{IE_ENDPOINT}/IndependentExpenditure", params=gen_ie_query(start_date, end_date), + impersonate="chrome", ) df = pd.read_csv(StringIO(req.text), index_col=False) engine = create_engine("postgresql://superset:PASSWORD@IP_ADDR/superset") df.to_sql( - "independent_expenditures_raw", + "independent_expenditures_raw_redo", con=engine, if_exists="append", )