# Source snapshot: Hugging Face Space file viewer (status: Running; file size: 6,198 bytes)
import os
import pandas as pd
import numpy as np
import torch
import joblib
import time
from datetime import datetime
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
class Processor:
def __init__(self, scaler_path="models/robust_scaler.pkl"):
print("⚙️ Initializing AlphaProcessor...")
self.device = 0 if torch.cuda.is_available() else -1
self.model_name = "ProsusAI/finbert"
# 1. Load Scaler
try:
self.scaler = joblib.load(scaler_path)
print(f"✅ Scaler loaded from {scaler_path}")
except:
print("⚠️ Scaler not found in models/ folder.")
# 2. Initialize FinBERT
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.model = AutoModelForSequenceClassification.from_pretrained(
self.model_name, use_safetensors=True
)
self.sentiment_pipe = pipeline(
"sentiment-analysis",
model=self.model,
tokenizer=self.tokenizer,
device=self.device
)
def fetch_market_data(self, days=60):
"""
Loads market data from your provided CSV backup.
Bypasses Finnhub to avoid 403 errors during presentation.
"""
print(f"📁 System: Bypassing API. Loading local market data...")
backup_path = "data/market_data_backup.csv"
if not os.path.exists(backup_path):
print(f"🚨 FATAL: {backup_path} not found!")
return pd.DataFrame()
df = pd.read_csv(backup_path, index_col=0, parse_dates=True)
# Optional: Sync dates to today for presentation realism
# last_date = df.index[-1]
# offset = pd.Timestamp(datetime.now().date()) - last_date
# df.index = df.index + offset
return df.tail(days)
def process(self, df_market, df_news):
"""
Main pipeline: News Sentiment -> Feature Engineering -> GRU Input
"""
# 1. Process Sentiment from headlines
df_sent, df_news_scored = self._generate_sentiment_profile(df_news)
# 2. Merge and engineer all 14 features
df_features = self._engineer_14_features(df_market, df_sent)
# 3. Extract metadata for Streamlit UI
latest_metrics = {
"Sent_Mean": df_features['Sent_Mean'].iloc[-1],
"News_Volume": int(np.exp(df_features['News_Volume'].iloc[-1]) - 1),
"Panic_Interaction": df_features['Sent_x_VIX'].iloc[-1],
"RSI": df_features['RSI'].iloc[-1] * 100
}
# 4. Prepare 30-day window for GRU
final_window = df_features.tail(30).values
scaled_window = self.scaler.transform(final_window)
input_tensor = np.expand_dims(scaled_window, axis=0).astype('float32')
return input_tensor, latest_metrics, df_features, df_news_scored
def _generate_sentiment_profile(self, df_news):
print("🧠 Running FinBERT Batch Analysis...")
titles = df_news['Title'].astype(str).tolist()
# Batch processing to handle 1700+ headlines efficiently
results = self.sentiment_pipe(titles, batch_size=32, truncation=True)
scores = []
for res in results:
label, score = res['label'].lower(), res['score']
scores.append(score if label == 'positive' else -score if label == 'negative' else 0.0)
df_news['Score'] = scores
df_news['Date'] = pd.to_datetime(df_news['Date']).dt.date
grouped = df_news.groupby('Date')['Score']
daily = pd.DataFrame({
'Sent_Mean': grouped.mean(),
'Sent_Intensity': grouped.apply(lambda x: x.abs().mean()),
'News_Volume': np.log1p(grouped.count()),
'Net_Bull': grouped.apply(lambda x: x.sum() / (len(x) + 1))
}).fillna(0.0)
daily.index = pd.to_datetime(daily.index)
return daily, df_news
def _engineer_14_features(self, df, df_sent):
data = df.copy()
data.columns = [c.capitalize() for c in data.columns]
if 'Vix' in data.columns: data = data.rename(columns={'Vix': 'VIX'})
# --- QUANT BRANCH (7 Features) ---
tp = (data['High'] + data['Low'] + data['Close']) / 3
vwap = (tp * data['Volume']).rolling(20).sum() / (data['Volume'].rolling(20).sum() + 1e-9)
data['VWAP_Dist'] = np.log(data['Close'] / vwap)
delta = data['Close'].diff()
gain = (delta.where(delta > 0, 0)).rolling(14).mean()
loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
data['RSI'] = (100 - (100 / (1 + (gain/(loss + 1e-9))))) / 100.0
ema_12 = data['Close'].ewm(span=12).mean()
ema_26 = data['Close'].ewm(span=26).mean()
macd = ema_12 - ema_26
data['MACD_Hist'] = (macd - macd.ewm(span=9).mean()) / data['Close']
data['VIX_Norm'] = data['VIX'] / 100.0
data['VIX_Change'] = data['VIX'].pct_change()
tr = pd.concat([data['High']-data['Low'],
abs(data['High']-data['Close'].shift()),
abs(data['Low']-data['Close'].shift())], axis=1).max(axis=1)
data['ATR_Dist'] = np.tanh((data['Close'] - data['Close'].rolling(22).mean()) / (tr.rolling(14).mean() + 1e-9))
data['Realized_Vol'] = data['Close'].pct_change().rolling(10).std() * 10
# --- SENTIMENT BRANCH (7 Features) ---
data.index = pd.to_datetime(data.index)
data = data.join(df_sent, how='left').fillna(0.0)
data['Sent_Mean_Delta'] = data['Sent_Mean'].diff().fillna(0.0)
data['Sent_Mean_EMA'] = data['Sent_Mean'].ewm(span=3).mean()
data['Sent_x_VIX'] = data['Sent_Mean'] * data['VIX_Norm']
feature_cols = [
'VWAP_Dist', 'RSI', 'MACD_Hist', 'VIX_Norm', 'VIX_Change', 'ATR_Dist', 'Realized_Vol',
'Sent_Mean', 'Sent_Intensity', 'News_Volume', 'Net_Bull', 'Sent_Mean_Delta', 'Sent_Mean_EMA', 'Sent_x_VIX'
]
return data[feature_cols].dropna() |