Initial commit
This commit is contained in:
152
scrap_data.py
Normal file
152
scrap_data.py
Normal file
@@ -0,0 +1,152 @@
|
||||
import pandas as pd
|
||||
import yfinance as yf
|
||||
import requests
|
||||
import numpy as np
|
||||
from utils import get_sp500_tickers
|
||||
|
||||
|
||||
# --- 1. Main Function to Create Enhanced DataFrame ---
|
||||
def create_enhanced_repository(tickers, years=3):
    """
    Download historical data for all tickers, compute derived metrics,
    and return a single tidy (long-format) DataFrame.

    Parameters
    ----------
    tickers : list[str]
        Ticker symbols to download (e.g. S&P 500 constituents).
    years : int, default 3
        Number of years of history to request from Yahoo Finance.

    Returns
    -------
    pandas.DataFrame
        One row per (Ticker, Date) holding OHLCV, 30/250-day SMAs on the
        adjusted close, a split/dividend-adjusted open, the LATEST
        Market_Cap / PE_Ratio, and an Earnings_Release_Day flag.
        Empty DataFrame if `tickers` is empty or the download fails.
    """
    if not tickers:
        print("Ticker list is empty. Cannot fetch data.")
        return pd.DataFrame()

    print(f"Starting data download for {len(tickers)} stocks for {years} years...")

    # 2a. Download all historical OHLCV data in one batch request.
    # auto_adjust=False keeps both the raw Close and the Adj Close columns.
    data = yf.download(tickers, period=f"{years}y", progress=False, auto_adjust=False)

    if data.empty:
        print("Failed to download data.")
        return pd.DataFrame()

    # Isolate the OHLCV data (columns are a (Metric, Ticker) MultiIndex).
    df_ohlcv = data[["Open", "High", "Low", "Close", "Adj Close", "Volume"]].copy()

    # --- CALCULATE NEW METRICS ---

    # 250-day and 30-day Simple Moving Average (SMA) on Adj Close.
    adj_close_data = df_ohlcv["Adj Close"]
    sma_250_data = adj_close_data.rolling(window=250).mean()
    sma_30_data = adj_close_data.rolling(window=30).mean()

    for ticker in tickers:
        df_ohlcv.loc[:, ("SMA_250", ticker)] = sma_250_data[ticker]
        df_ohlcv.loc[:, ("SMA_30", ticker)] = sma_30_data[ticker]

    # Adjusted Open, using the Adj Close / Close ratio as the adjustment
    # factor (the factor handles splits and dividends).
    for ticker in tickers:
        # BUGFIX: guard against a zero Close, which would otherwise
        # inject inf into Calculated_Adj_Open.
        close = df_ohlcv["Close"][ticker].replace(0, np.nan)
        adj_factor = df_ohlcv["Adj Close"][ticker] / close
        df_ohlcv.loc[:, ("Calculated_Adj_Open", ticker)] = (
            df_ohlcv["Open"][ticker] * adj_factor
        )

    # --- RESTRUCTURE DATA ---

    # Stack the ticker level to create a tidy (Date, Ticker, metrics) frame.
    df_long = df_ohlcv.stack(level=1).reset_index()

    # BUGFIX: rename columns by NAME instead of assigning a hard-coded
    # positional list -- the order of stacked metric columns is not
    # guaranteed, so a positional rename can silently mislabel every
    # metric. Only "Adj Close" needs renaming; the two index columns are
    # normalized positionally in case yfinance leaves them unnamed.
    df_long = df_long.rename(
        columns={
            df_long.columns[0]: "Date",
            df_long.columns[1]: "Ticker",
            "Adj Close": "Adj_Close",
        }
    )

    # Initialize columns for fundamental data.
    df_long["Market_Cap"] = np.nan
    df_long["PE_Ratio"] = np.nan
    df_long["Earnings_Release_Day"] = False

    # --- FETCH FUNDAMENTAL DATA AND EARNINGS DATES ---
    for ticker_symbol in tickers:
        try:
            ticker = yf.Ticker(ticker_symbol)

            # Fundamentals: these are the LATEST values, not a historical
            # time-series, broadcast to every row of the stock.
            info = ticker.info
            market_cap = info.get("marketCap")
            pe_ratio = info.get("trailingPE")

            mask = df_long["Ticker"] == ticker_symbol
            df_long.loc[mask, "Market_Cap"] = market_cap
            df_long.loc[mask, "PE_Ratio"] = pe_ratio

            # Earnings release dates: flag rows whose trade date matches
            # a reported earnings date (date part only, tz ignored).
            earnings_dates_df = ticker.earnings_dates
            if earnings_dates_df is not None and not earnings_dates_df.empty:
                earnings_dates = set(earnings_dates_df.index.date)
                df_long.loc[
                    mask & df_long["Date"].dt.date.isin(earnings_dates),
                    "Earnings_Release_Day",
                ] = True

        except Exception as e:
            # Best-effort: delisted tickers or bad data are common, so a
            # per-ticker failure only warns and moves on.
            print(
                f"Warning: Could not fetch fundamental/earnings data for {ticker_symbol}. Error: {e}"
            )

    # Final cleanup and column selection.
    final_cols = [
        "Date",
        "Ticker",
        "Open",
        "High",
        "Low",
        "Close",
        "Adj_Close",
        "Calculated_Adj_Open",
        "Volume",
        "SMA_250",
        "SMA_30",
        "Market_Cap",
        "PE_Ratio",
        "Earnings_Release_Day",
    ]

    df_repository_final = (
        df_long[final_cols].sort_values(by=["Ticker", "Date"]).reset_index(drop=True)
    )

    print("\nEnhanced Data Repository created successfully.")
    return df_repository_final
|
||||
|
||||
# --- Execution Block ---

# 1. Get the list of tickers (will fetch all 500+ when run locally);
#    limited to the first 100 here to keep runtime manageable.
sp500_tickers = get_sp500_tickers()
sp500_tickers = sp500_tickers[:100]


# 2. Create the final DataFrame (consider reducing 'years' for the full
#    list to speed up -- running on all 500 stocks takes time).
enhanced_repository_df = create_enhanced_repository(sp500_tickers, years=4)

# 3. Save the result.
if not enhanced_repository_df.empty:
    filename = "SP500_Enhanced_Data_Repository"
    # BUGFIX: the f-strings contained no placeholder, so `filename` was
    # never used and output went to a literal "(unknown)" path.
    enhanced_repository_df.to_csv(f"{filename}.csv", index=False)
    enhanced_repository_df.to_pickle(f"{filename}.pkl")
    print(f"\n✅ Data saved to: {filename}")
    print("\n--- Sample Data ---")
    print(enhanced_repository_df.head())
||||
Reference in New Issue
Block a user