diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..671932d
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,32 @@
+beautifulsoup4==4.14.2
+certifi==2025.10.5
+cffi==2.0.0
+charset-normalizer==3.4.3
+curl_cffi==0.13.0
+exchange_calendars==4.11.1
+frozendict==2.4.6
+html5lib==1.1
+idna==3.10
+korean-lunar-calendar==0.3.1
+lxml==6.0.2
+multitasking==0.0.12
+numpy==2.3.3
+pandas==2.3.3
+peewee==3.18.2
+platformdirs==4.4.0
+protobuf==6.32.1
+pycparser==2.23
+pyluach==2.3.0
+python-dateutil==2.9.0.post0
+pytz==2025.2
+requests==2.32.5
+scipy==1.16.2
+six==1.17.0
+soupsieve==2.8
+toolz==1.0.0
+typing_extensions==4.15.0
+tzdata==2025.2
+urllib3==2.5.0
+webencodings==0.5.1
+websockets==15.0.1
+yfinance==0.2.66
diff --git a/scrap.sh b/scrap.sh
new file mode 100755
index 0000000..81336d3
--- /dev/null
+++ b/scrap.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+cd /home/westfarn/Documents/repos/smart_trading
+. venv/bin/activate
+python scrap_options_data.py
diff --git a/scrap_data.py b/scrap_data.py
new file mode 100644
index 0000000..5c90a51
--- /dev/null
+++ b/scrap_data.py
@@ -0,0 +1,152 @@
+import pandas as pd
+import yfinance as yf
+import requests
+import numpy as np
+from utils import get_sp500_tickers
+
+
+# --- 1. Main Function to Create Enhanced DataFrame ---
+def create_enhanced_repository(tickers, years=3):
+    """
+    Downloads enhanced historical data for all tickers, calculates metrics,
+    and returns a single combined DataFrame.
+    """
+    if not tickers:
+        print("Ticker list is empty. Cannot fetch data.")
+        return pd.DataFrame()
+
+    print(f"Starting data download for {len(tickers)} stocks for {years} years...")
+
+    # 2a. Download all historical OHLCV data
+    data = yf.download(tickers, period=f"{years}y", progress=False, auto_adjust=False)
+
+    if data.empty:
+        print("Failed to download data.")
+        return pd.DataFrame()
+
+    # Isolate the OHLCV data
+    df_ohlcv = data[["Open", "High", "Low", "Close", "Adj Close", "Volume"]].copy()
+
+    # --- CALCULATE NEW METRICS ---
+
+    # Calculate 250-day and 30-day Simple Moving Average (SMA) on Adj Close
+    adj_close_data = df_ohlcv["Adj Close"]
+    sma_250_data = adj_close_data.rolling(window=250).mean()
+    sma_30_data = adj_close_data.rolling(window=30).mean()
+
+    for ticker in tickers:
+        df_ohlcv.loc[:, ("SMA_250", ticker)] = sma_250_data[ticker]
+        df_ohlcv.loc[:, ("SMA_30", ticker)] = sma_30_data[ticker]
+
+    # Calculate Adjusted Open (using the Adj Close to Close ratio as the adjustment factor)
+    for ticker in tickers:
+        # The adjustment factor handles splits and dividends
+        adj_factor = df_ohlcv["Adj Close"][ticker] / df_ohlcv["Close"][ticker]
+        df_ohlcv.loc[:, ("Calculated_Adj_Open", ticker)] = (
+            df_ohlcv["Open"][ticker] * adj_factor
+        )
+
+    # --- RESTRUCTURE DATA ---
+
+    # Stack the multi-index columns to create a "tidy" format (Date, Ticker, Metric)
+    df_long = df_ohlcv.stack(level=1).reset_index()
+
+    # Assign column names based on the order of the multi-index columns after stacking
+    df_long.columns = [
+        "Date",
+        "Ticker",
+        "Open",
+        "High",
+        "Low",
+        "Close",
+        "Adj_Close",
+        "Volume",
+        "SMA_250",
+        "SMA_30",
+        "Calculated_Adj_Open",
+    ]
+
+    # Initialize columns for fundamental data
+    df_long["Market_Cap"] = np.nan
+    df_long["PE_Ratio"] = np.nan
+    df_long["Earnings_Release_Day"] = False
+
+    # --- FETCH FUNDAMENTAL DATA AND EARNINGS DATES ---
+    for ticker_symbol in tickers:
+        try:
+            ticker = yf.Ticker(ticker_symbol)
+
+            # Fundamentals (Latest values)
+            info = ticker.info
+            market_cap = info.get("marketCap")
+            pe_ratio = info.get("trailingPE")
info.get("trailingPE") + + # Apply latest fundamental data to all historical rows for the stock + # NOTE: Market Cap and P/E are the LATEST values, not historical time-series + mask = df_long["Ticker"] == ticker_symbol + df_long.loc[mask, "Market_Cap"] = market_cap + df_long.loc[mask, "PE_Ratio"] = pe_ratio + + # Earnings Release Dates + earnings_dates_df = ticker.earnings_dates + if earnings_dates_df is not None and not earnings_dates_df.empty: + # Use only the date part for comparison + earnings_dates = set(earnings_dates_df.index.date) + + # Set the Earnings_Release_Day flag + df_long.loc[ + mask & df_long["Date"].dt.date.isin(earnings_dates), + "Earnings_Release_Day", + ] = True + + except Exception as e: + # This is common for tickers that may have delisted or have bad data + print( + f"Warning: Could not fetch fundamental/earnings data for {ticker_symbol}. Error: {e}" + ) + + # Final cleanup and column selection + final_cols = [ + "Date", + "Ticker", + "Open", + "High", + "Low", + "Close", + "Adj_Close", + "Calculated_Adj_Open", + "Volume", + "SMA_250", + "SMA_30", + "Market_Cap", + "PE_Ratio", + "Earnings_Release_Day", + ] + + df_repository_final = ( + df_long[final_cols].sort_values(by=["Ticker", "Date"]).reset_index(drop=True) + ) + + print("\nEnhanced Data Repository created successfully.") + return df_repository_final + + +# --- Execution Block --- + +# 1. Get the list of tickers (Will fetch all 500+ when run locally) +sp500_tickers = get_sp500_tickers() +sp500_tickers = sp500_tickers[:100] + + +# 2. Create the final DataFrame (Consider reducing 'years' for the full list to speed up) +# Running on all 500 stocks for 3 years will take time. +enhanced_repository_df = create_enhanced_repository(sp500_tickers, years=4) + +# 3. Save the result +if not enhanced_repository_df.empty: + filename = "SP500_Enhanced_Data_Repository" + enhanced_repository_df.to_csv(f"{filename}.csv", index=False) + enhanced_repository_df.to_pickle(f"{filename}.pkl") + print(f"\n✅ Data saved to: {filename}") + print("\n--- Sample Data ---") + print(enhanced_repository_df.head()) diff --git a/scrap_options_data.py b/scrap_options_data.py new file mode 100644 index 0000000..49ed899 --- /dev/null +++ b/scrap_options_data.py @@ -0,0 +1,168 @@ +import yfinance as yf +import pandas as pd +import datetime +import os +import numpy as np +import exchange_calendars as xcals +from scipy.stats import norm +from utils import get_nasdaq100_tickers, get_sp500_tickers +from time import sleep + + +# --- Delta Calculation Function (Black-Scholes-Merton) --- +def bsm_delta(S, K, T, r, sigma, option_type): + """ + Calculates the option delta using the Black-Scholes-Merton model. 
+
+    S: Current stock price
+    K: Strike price
+    T: Time to expiration (in years)
+    r: Risk-free rate (annual)
+    sigma: Volatility (annualized, typically Implied Volatility)
+    option_type: 'call' or 'put'
+    """
+    if T <= 0:  # Handle options that have expired
+        if option_type == "call":
+            return 1.0 if S > K else 0.0
+        else:  # put
+            return 0.0 if S < K else -1.0
+
+    # Calculate d1
+    d1 = (np.log(S / K) + (r + 0.5 * sigma**2) * T) / (sigma * np.sqrt(T))
+
+    if option_type == "call":
+        # Delta for a call option is N(d1)
+        return norm.cdf(d1)
+    elif option_type == "put":
+        # Delta for a put option is N(d1) - 1
+        return norm.cdf(d1) - 1
+    return np.nan  # Should not happen
+
+
+def is_trading_day(check_date: datetime.date) -> bool:
+    nyse = xcals.get_calendar("XNYS")
+
+    # Check if the date is a valid trading day (excludes weekends and holidays)
+    is_trading = nyse.is_session(check_date.strftime("%Y-%m-%d"))
+
+    return is_trading
+
+
+if __name__ == "__main__":
+    # Only run if it is a trading day
+    if not is_trading_day(datetime.date.today()):
+        raise UserWarning("Today is not a trading day")
+
+    # --- Main Script ---
+
+    # 1. Set the risk-free rate (e.g., the current 3-month T-bill rate)
+    RISK_FREE_RATE = 0.01  # Annualized; 0.01 = 1%. Update to the current rate before running.
+
+    date_str = datetime.datetime.now().strftime("%Y_%m_%d")
+    base_folder = os.path.join("data", "options")
+    if not os.path.isdir(os.path.join(base_folder, date_str)):
+        os.makedirs(os.path.join(base_folder, date_str))
+    else:
+        # only run if we have not grabbed the data yet today
+        raise UserWarning("We already have the data, no need to get it again")
+    base_folder = os.path.join(base_folder, date_str)
+    TODAY = datetime.datetime.now()  # Current date/time for T calculation
+
+    for ticker_list in [get_sp500_tickers, get_nasdaq100_tickers]:
+        for ticker_symbol in ticker_list():
+            filename_start = f"{date_str}_{ticker_symbol}"
+            # ... (rest of setup)
+
+            # Create a Ticker object
+            ticker = yf.Ticker(ticker_symbol)
+
+            # 2. Get the current stock price
+            try:
+                stock_info = ticker.info
+                current_stock_price = stock_info.get("regularMarketPrice")
+                if current_stock_price is None:
+                    print(f"Could not get current price for {ticker_symbol}. Skipping.")
+                    continue
+            except Exception as e:
+                print(f"Error getting stock price for {ticker_symbol}: {e}. Skipping.")
+                continue
+
+            expirations = ticker.options
+            all_options_data = []
+
+            for date_str_exp in expirations:
+                try:
+                    # Calculate T (Time to Expiration in years)
+                    # Note: yfinance date format is YYYY-MM-DD
+                    expiration_date = datetime.datetime.strptime(
+                        date_str_exp, "%Y-%m-%d"
+                    )
+                    time_to_expiration_days = (expiration_date - TODAY).days
+                    # Use 252 or 365 as convention, 252 for trading days, 365 for calendar days
+                    # 365 is often used for options pricing
+                    T = time_to_expiration_days / 365.0
+
+                    options_chain = ticker.option_chain(date_str_exp)
+                    calls_df = options_chain.calls
+                    puts_df = options_chain.puts
+
+                    # Add expiration and option_type columns
+                    calls_df["expiration"] = date_str_exp
+                    puts_df["expiration"] = date_str_exp
+                    calls_df["option_type"] = "call"
+                    puts_df["option_type"] = "put"
+
+                    # 3. Calculate Delta for Calls
+                    calls_df["delta"] = calls_df.apply(
+                        lambda row: bsm_delta(
+                            S=current_stock_price,
+                            K=row["strike"],
+                            T=T,
+                            r=RISK_FREE_RATE,
+                            sigma=row["impliedVolatility"],
+                            option_type="call",
+                        ),
+                        axis=1,
+                    )
+
+                    # 4. Calculate Delta for Puts
+                    puts_df["delta"] = puts_df.apply(
+                        lambda row: bsm_delta(
+                            S=current_stock_price,
+                            K=row["strike"],
+                            T=T,
+                            r=RISK_FREE_RATE,
+                            sigma=row["impliedVolatility"],
+                            option_type="put",
+                        ),
+                        axis=1,
+                    )
+
+                    all_options_data.append(calls_df)
+                    all_options_data.append(puts_df)
+                except Exception as e:
+                    print(
+                        f"Could not retrieve or calculate delta for {date_str_exp} on {ticker_symbol}: {e}"
+                    )
+
+            # Concatenate and save the data
+            if all_options_data:
+                full_options_df = pd.concat(all_options_data)
+                print(
+                    f"\nFull Options Chain for {ticker_symbol} across all expirations (with Delta):"
+                )
+                # Display columns relevant to delta calculation
+                print(
+                    full_options_df[
+                        ["strike", "impliedVolatility", "option_type", "delta"]
+                    ].head()
+                )
+
+                full_options_df.to_csv(
+                    os.path.join(base_folder, f"{filename_start}.csv")
+                )
+                full_options_df.to_pickle(
+                    os.path.join(base_folder, f"{filename_start}.pkl")
+                )
+            else:
+                print(f"No options data retrieved for {ticker_symbol}.")
+            sleep(0.5)
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..677ccf8
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,30 @@
+import requests
+import pandas as pd
+from io import StringIO
+
+
+def get_slickchart_tickers(index: str) -> list[str]:
+    url = f"https://www.slickcharts.com/{index}"
+    user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0"  # Default user-agent fails.
+    response = requests.get(url, headers={"User-Agent": user_agent})
+    # pandas 2.x deprecates passing a literal HTML string to read_html, so wrap it in StringIO
+    data = pd.read_html(StringIO(response.text), match="Symbol", index_col="Symbol")[0]
+    return data.index.to_list()
+
+
+def get_sp500_tickers() -> list[str]:
+    try:
+        return get_slickchart_tickers("sp500")
+    except Exception as e:
+        print(f"Error fetching S&P 500 tickers. Using a sample list. Error: {e}")
+        # Fallback to a sample list if scraping fails
+        return ["AAPL", "MSFT", "AMZN", "GOOGL", "NVDA"]
+
+
+def get_nasdaq100_tickers() -> list[str]:
+    try:
+        return get_slickchart_tickers("nasdaq100")
+    except Exception as e:
+        print(f"Error fetching NASDAQ 100 tickers. Using a sample list. Error: {e}")
+        # Fallback to a sample list if scraping fails
+        return ["AAPL", "MSFT", "AMZN", "GOOGL", "NVDA"]
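
Example of consuming a saved snapshot (a minimal sketch, separate from the diff above): it assumes the data/options/YYYY_MM_DD/YYYY_MM_DD_TICKER.pkl layout written by scrap_options_data.py, and the ticker (AAPL) and the delta band are arbitrary illustration values.

import datetime
import os

import pandas as pd

# Load today's snapshot for one ticker (hypothetical example ticker)
date_str = datetime.datetime.now().strftime("%Y_%m_%d")
snapshot = pd.read_pickle(
    os.path.join("data", "options", date_str, f"{date_str}_AAPL.pkl")
)

# Puts carry negative delta; keep contracts roughly in the 25-35 delta band
puts = snapshot[snapshot["option_type"] == "put"]
candidates = puts[puts["delta"].between(-0.35, -0.25)]
print(candidates[["expiration", "strike", "impliedVolatility", "delta"]].head())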