import pandas as pd
import yfinance as yf
import requests
import numpy as np

from utils import get_sp500_tickers


# --- 1. Main Function to Create Enhanced DataFrame ---
def create_enhanced_repository(tickers, years=3):
    """
    Download historical data for ``tickers``, compute derived metrics,
    and return one combined long-format ("tidy") DataFrame.

    Parameters
    ----------
    tickers : list[str]
        Ticker symbols understood by Yahoo Finance. Assumes more than one
        symbol so yfinance returns multi-index columns — TODO confirm the
        single-ticker case if it ever applies.
    years : int, default 3
        Length of the download window in calendar years.

    Returns
    -------
    pandas.DataFrame
        One row per (Ticker, Date) with OHLCV, SMA_250/SMA_30,
        Calculated_Adj_Open, latest Market_Cap / PE_Ratio, and an
        Earnings_Release_Day flag. Empty DataFrame on empty input or
        download failure.
    """
    if not tickers:
        print("Ticker list is empty. Cannot fetch data.")
        return pd.DataFrame()

    print(f"Starting data download for {len(tickers)} stocks for {years} years...")

    # 2a. Download all historical OHLCV data. Columns come back as a
    # MultiIndex: level 0 = metric, level 1 = ticker.
    data = yf.download(tickers, period=f"{years}y", progress=False, auto_adjust=False)
    if data.empty:
        print("Failed to download data.")
        return pd.DataFrame()

    # Isolate the OHLCV data
    df_ohlcv = data[["Open", "High", "Low", "Close", "Adj Close", "Volume"]].copy()

    # --- CALCULATE NEW METRICS ---
    # 250-day and 30-day Simple Moving Averages on the adjusted close.
    adj_close_data = df_ohlcv["Adj Close"]
    sma_250_data = adj_close_data.rolling(window=250).mean()
    sma_30_data = adj_close_data.rolling(window=30).mean()

    # yfinance silently drops symbols that fail to download; restricting the
    # loops to the columns actually present avoids a KeyError on those.
    downloaded = [t for t in tickers if t in adj_close_data.columns]

    for ticker in downloaded:
        df_ohlcv.loc[:, ("SMA_250", ticker)] = sma_250_data[ticker]
        df_ohlcv.loc[:, ("SMA_30", ticker)] = sma_30_data[ticker]

    # Calculated adjusted open: scale the raw open by the split/dividend
    # adjustment factor implied by Adj Close / Close.
    for ticker in downloaded:
        adj_factor = df_ohlcv["Adj Close"][ticker] / df_ohlcv["Close"][ticker]
        df_ohlcv.loc[:, ("Calculated_Adj_Open", ticker)] = (
            df_ohlcv["Open"][ticker] * adj_factor
        )

    # --- RESTRUCTURE DATA ---
    # Stack the ticker level to get a tidy (Date, Ticker, metric...) frame.
    df_long = df_ohlcv.stack(level=1).reset_index()

    # NOTE(review): this positional rename relies on the column order that
    # stack() emits; keep the metric-insertion order above in sync with it.
    df_long.columns = [
        "Date",
        "Ticker",
        "Open",
        "High",
        "Low",
        "Close",
        "Adj_Close",
        "Volume",
        "SMA_250",
        "SMA_30",
        "Calculated_Adj_Open",
    ]

    # Placeholder columns for per-ticker fundamental data.
    df_long["Market_Cap"] = np.nan
    df_long["PE_Ratio"] = np.nan
    df_long["Earnings_Release_Day"] = False

    # --- FETCH FUNDAMENTAL DATA AND EARNINGS DATES ---
    for ticker_symbol in downloaded:
        try:
            ticker = yf.Ticker(ticker_symbol)

            # Fundamentals: LATEST snapshot values only, broadcast onto
            # every historical row for the stock (NOT a time series).
            info = ticker.info
            market_cap = info.get("marketCap")
            pe_ratio = info.get("trailingPE")

            mask = df_long["Ticker"] == ticker_symbol
            df_long.loc[mask, "Market_Cap"] = market_cap
            df_long.loc[mask, "PE_Ratio"] = pe_ratio

            # Flag rows whose calendar date matches an earnings release.
            earnings_dates_df = ticker.earnings_dates
            if earnings_dates_df is not None and not earnings_dates_df.empty:
                # Compare on the date part only (timestamps carry times).
                earnings_dates = set(earnings_dates_df.index.date)
                df_long.loc[
                    mask & df_long["Date"].dt.date.isin(earnings_dates),
                    "Earnings_Release_Day",
                ] = True
        except Exception as e:
            # Best-effort: delisted tickers / bad data are common, so log
            # the failure and keep building the rest of the repository.
            print(
                f"Warning: Could not fetch fundamental/earnings data for {ticker_symbol}. Error: {e}"
            )

    # Final cleanup and column selection
    final_cols = [
        "Date",
        "Ticker",
        "Open",
        "High",
        "Low",
        "Close",
        "Adj_Close",
        "Calculated_Adj_Open",
        "Volume",
        "SMA_250",
        "SMA_30",
        "Market_Cap",
        "PE_Ratio",
        "Earnings_Release_Day",
    ]
    df_repository_final = (
        df_long[final_cols].sort_values(by=["Ticker", "Date"]).reset_index(drop=True)
    )

    print("\nEnhanced Data Repository created successfully.")
    return df_repository_final


# --- Execution Block ---
# 1. Get the list of tickers (Will fetch all 500+ when run locally)
sp500_tickers = get_sp500_tickers()
sp500_tickers = sp500_tickers[:100]

# 2. Create the final DataFrame (Consider reducing 'years' for the full list
# to speed up). Running on all 500 stocks for several years will take time.
enhanced_repository_df = create_enhanced_repository(sp500_tickers, years=4)
# 3. Save the result
if not enhanced_repository_df.empty:
    filename = "SP500_Enhanced_Data_Repository"
    # Bug fix: the output paths previously contained a mangled literal
    # placeholder instead of {filename}, leaving `filename` unused.
    # Persist both CSV (interchange) and pickle (fast reload, keeps dtypes).
    enhanced_repository_df.to_csv(f"{filename}.csv", index=False)
    enhanced_repository_df.to_pickle(f"{filename}.pkl")
    print(f"\n✅ Data saved to: {filename}")

    print("\n--- Sample Data ---")
    print(enhanced_repository_df.head())