-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbaseline_model.py
More file actions
97 lines (78 loc) · 2.93 KB
/
baseline_model.py
File metadata and controls
97 lines (78 loc) · 2.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python3
"""Baseline model: predict `is_profitable` from feature columns.
Trains a logistic regression and a random forest on the dataset.
Reports cross-validated accuracy and feature importances.
Usage:
pip install cross-signal-data[ml]
python notebooks/baseline_model.py
The point isn't to beat 80% accuracy — the bot already enters with ~80% WR
based on the trigger alone. The point is to understand WHICH features carry
the predictive signal, so you can design a smarter trigger.
"""
from __future__ import annotations
import sys
try:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
except ImportError:
print("ERROR: install ML deps with: pip install cross-signal-data[ml]")
sys.exit(1)
from cross_signal_data import load
def main():
    """Train two baseline classifiers on the trade dataset and print diagnostics.

    Loads the full trade table, reports class balance, cross-validates a
    logistic regression and a random forest on a fixed feature set, then
    prints feature importances, per-class feature means, and the win rate
    broken out by entry hour.
    """
    df = load()
    print(f"Loaded {len(df)} trades")
    print(f"Class balance: {df['is_profitable'].mean():.1%} profitable")
    print()

    # Columns fed to both models, in a fixed order so importances line up.
    features = [
        "entry_price",
        "pre_crash_high",
        "drop_pct",
        "size_usd",
        "shares",
        "entry_hour_utc",
        "entry_dow",
    ]
    X = df[features].astype(float).values
    y = df["is_profitable"].astype(int).values

    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Linear baseline: standardize inputs, then logistic regression.
    linear_model = Pipeline([
        ("scaler", StandardScaler()),
        ("lr", LogisticRegression(max_iter=1000, random_state=42)),
    ])
    linear_acc = cross_val_score(linear_model, X, y, cv=splitter, scoring="accuracy", n_jobs=1)
    print(f"Logistic Regression: {linear_acc.mean():.3f} ± {linear_acc.std():.3f}")

    # Nonlinear baseline: shallow random forest.
    forest = RandomForestClassifier(n_estimators=300, max_depth=6, random_state=42, n_jobs=1)
    forest_acc = cross_val_score(forest, X, y, cv=splitter, scoring="accuracy", n_jobs=1)
    print(f"Random Forest: {forest_acc.mean():.3f} ± {forest_acc.std():.3f}")

    # Refit on every row so the importances reflect the whole dataset.
    forest.fit(X, y)
    ranked = sorted(
        zip(features, forest.feature_importances_),
        key=lambda pair: pair[1],
        reverse=True,
    )
    print()
    print("Feature importance (random forest, full-data fit):")
    for name, imp in ranked:
        bar = "█" * int(imp * 50)
        print(f" {name:20} {imp:.4f} {bar}")

    # Per-class feature means, transposed so features are rows.
    print()
    print("Mean of each feature by class:")
    print("=" * 70)
    by_class = df.groupby("is_profitable")[features].mean().T
    print(by_class.to_string(float_format=lambda v: f"{v:.3f}"))

    # Diurnal pattern: win rate and sample count per UTC entry hour.
    print()
    print("Win rate by entry_hour_utc:")
    hourly = df.groupby("entry_hour_utc")["is_profitable"].agg(["mean", "count"])
    print(hourly.to_string())
# Run the baseline analysis only when executed as a script, not on import.
if __name__ == "__main__":
    main()