Initial commit
This commit is contained in:
152
scrap_data.py
Normal file
152
scrap_data.py
Normal file
@@ -0,0 +1,152 @@
|
||||
import pandas as pd
|
||||
import yfinance as yf
|
||||
import requests
|
||||
import numpy as np
|
||||
from utils import get_sp500_tickers
|
||||
|
||||
|
||||
# --- 1. Main Function to Create Enhanced DataFrame ---
|
||||
def create_enhanced_repository(tickers, years=3):
    """
    Download historical data for all tickers, compute derived metrics,
    and return a single tidy (long-format) DataFrame.

    Parameters
    ----------
    tickers : list[str]
        Ticker symbols to download (e.g. S&P 500 constituents).
    years : int, default 3
        Number of years of history to request from Yahoo Finance.

    Returns
    -------
    pandas.DataFrame
        One row per (Ticker, Date) holding OHLCV, 30/250-day SMAs on the
        adjusted close, a split/dividend-adjusted open, the LATEST
        Market_Cap / PE_Ratio, and an Earnings_Release_Day flag.
        Empty DataFrame if `tickers` is empty or the download fails.
    """
    if not tickers:
        print("Ticker list is empty. Cannot fetch data.")
        return pd.DataFrame()

    print(f"Starting data download for {len(tickers)} stocks for {years} years...")

    # 2a. Download all historical OHLCV data in one batch request.
    # auto_adjust=False keeps both the raw Close and the Adj Close columns.
    data = yf.download(tickers, period=f"{years}y", progress=False, auto_adjust=False)

    if data.empty:
        print("Failed to download data.")
        return pd.DataFrame()

    # Isolate the OHLCV data (columns are a (Metric, Ticker) MultiIndex).
    df_ohlcv = data[["Open", "High", "Low", "Close", "Adj Close", "Volume"]].copy()

    # --- CALCULATE NEW METRICS ---

    # 250-day and 30-day Simple Moving Average (SMA) on Adj Close.
    adj_close_data = df_ohlcv["Adj Close"]
    sma_250_data = adj_close_data.rolling(window=250).mean()
    sma_30_data = adj_close_data.rolling(window=30).mean()

    for ticker in tickers:
        df_ohlcv.loc[:, ("SMA_250", ticker)] = sma_250_data[ticker]
        df_ohlcv.loc[:, ("SMA_30", ticker)] = sma_30_data[ticker]

    # Adjusted Open, using the Adj Close / Close ratio as the adjustment
    # factor (the factor handles splits and dividends).
    for ticker in tickers:
        # BUGFIX: guard against a zero Close, which would otherwise
        # inject inf into Calculated_Adj_Open.
        close = df_ohlcv["Close"][ticker].replace(0, np.nan)
        adj_factor = df_ohlcv["Adj Close"][ticker] / close
        df_ohlcv.loc[:, ("Calculated_Adj_Open", ticker)] = (
            df_ohlcv["Open"][ticker] * adj_factor
        )

    # --- RESTRUCTURE DATA ---

    # Stack the ticker level to create a tidy (Date, Ticker, metrics) frame.
    df_long = df_ohlcv.stack(level=1).reset_index()

    # BUGFIX: rename columns by NAME instead of assigning a hard-coded
    # positional list -- the order of stacked metric columns is not
    # guaranteed, so a positional rename can silently mislabel every
    # metric. Only "Adj Close" needs renaming; the two index columns are
    # normalized positionally in case yfinance leaves them unnamed.
    df_long = df_long.rename(
        columns={
            df_long.columns[0]: "Date",
            df_long.columns[1]: "Ticker",
            "Adj Close": "Adj_Close",
        }
    )

    # Initialize columns for fundamental data.
    df_long["Market_Cap"] = np.nan
    df_long["PE_Ratio"] = np.nan
    df_long["Earnings_Release_Day"] = False

    # --- FETCH FUNDAMENTAL DATA AND EARNINGS DATES ---
    for ticker_symbol in tickers:
        try:
            ticker = yf.Ticker(ticker_symbol)

            # Fundamentals: these are the LATEST values, not a historical
            # time-series, broadcast to every row of the stock.
            info = ticker.info
            market_cap = info.get("marketCap")
            pe_ratio = info.get("trailingPE")

            mask = df_long["Ticker"] == ticker_symbol
            df_long.loc[mask, "Market_Cap"] = market_cap
            df_long.loc[mask, "PE_Ratio"] = pe_ratio

            # Earnings release dates: flag rows whose trade date matches
            # a reported earnings date (date part only, tz ignored).
            earnings_dates_df = ticker.earnings_dates
            if earnings_dates_df is not None and not earnings_dates_df.empty:
                earnings_dates = set(earnings_dates_df.index.date)
                df_long.loc[
                    mask & df_long["Date"].dt.date.isin(earnings_dates),
                    "Earnings_Release_Day",
                ] = True

        except Exception as e:
            # Best-effort: delisted tickers or bad data are common, so a
            # per-ticker failure only warns and moves on.
            print(
                f"Warning: Could not fetch fundamental/earnings data for {ticker_symbol}. Error: {e}"
            )

    # Final cleanup and column selection.
    final_cols = [
        "Date",
        "Ticker",
        "Open",
        "High",
        "Low",
        "Close",
        "Adj_Close",
        "Calculated_Adj_Open",
        "Volume",
        "SMA_250",
        "SMA_30",
        "Market_Cap",
        "PE_Ratio",
        "Earnings_Release_Day",
    ]

    df_repository_final = (
        df_long[final_cols].sort_values(by=["Ticker", "Date"]).reset_index(drop=True)
    )

    print("\nEnhanced Data Repository created successfully.")
    return df_repository_final
|
||||
|
||||
# --- Execution Block ---

# 1. Get the list of tickers (will fetch all 500+ when run locally);
#    limited to the first 100 here to keep runtime manageable.
sp500_tickers = get_sp500_tickers()
sp500_tickers = sp500_tickers[:100]


# 2. Create the final DataFrame (consider reducing 'years' for the full
#    list to speed up -- running on all 500 stocks takes time).
enhanced_repository_df = create_enhanced_repository(sp500_tickers, years=4)

# 3. Save the result.
if not enhanced_repository_df.empty:
    filename = "SP500_Enhanced_Data_Repository"
    # BUGFIX: the f-strings contained no placeholder, so `filename` was
    # never used and output went to a literal "(unknown)" path.
    enhanced_repository_df.to_csv(f"{filename}.csv", index=False)
    enhanced_repository_df.to_pickle(f"{filename}.pkl")
    print(f"\n✅ Data saved to: {filename}")
    print("\n--- Sample Data ---")
    print(enhanced_repository_df.head())
||||
Reference in New Issue
Block a user