Skip to content

Commit 56dccce

Browse files
committed
add the script, and the else
1 parent 3cbb0ea commit 56dccce

File tree

17 files changed

+238
-10
lines changed

17 files changed

+238
-10
lines changed

README.md

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@ engineering, EDA, model building, reporting, and CI setup.
77
## Project Structure
88

99
end-to-end/
10-
10+
├── .github/workflows/
11+
│ ├── ci.yml
12+
│ └── codeql.yml
1113
├── data/
1214
│ ├── raw/ # Original data (untouched)
1315
│ └── processed/ # Cleaned, transformed data
@@ -25,18 +27,9 @@ engineering, EDA, model building, reporting, and CI setup.
2527
│ │ └── train_model.py
2628
│ └── utils/
2729
│ └── data_loader.py
28-
29-
├── reports/
30-
│ ├── interim_report.md
31-
│ └── final_report.md
32-
33-
├── ci/
34-
│ └── python-ci.yml # CI pipeline (GitHub Actions)
35-
3630
├── .gitignore
3731
├── requirements.txt
3832
├── README.md
39-
└── setup.py (optional)
4033

4134
## How to Run This Project
4235

requirements.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
pandas
2+
numpy
3+
matplotlib
4+
seaborn
5+
scikit-learn
6+
scipy
7+
statsmodels
8+
dvc
9+
pytest
4.58 KB
Binary file not shown.
File renamed without changes.

scripts/run_eda.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# scripts/run_eda.py
2+
import os
3+
from src.utils.data_loader import load_csv, save_csv
4+
from src.eda.eda_tools import (
5+
data_structure, descriptive_stats, overall_loss_ratio, loss_ratio_by_group,
6+
plot_loss_ratio_by_province, plot_totalclaims_distribution,
7+
plot_claims_premium_time_series, scatter_premium_vs_claims, outlier_summary
8+
)
9+
import argparse
10+
import json
11+
12+
def main(input_path, output_dir):
    """Run the EDA pass over one CSV file and write its artefacts to disk.

    Args:
        input_path: Path of the CSV file handed to ``load_csv``.
        output_dir: Root folder for results. ``figures`` and ``summaries``
            sub-folders are created beneath it.
    """
    figures_dir = os.path.join(output_dir, "figures")
    summaries_dir = os.path.join(output_dir, "summaries")
    for folder in (output_dir, figures_dir, summaries_dir):
        os.makedirs(folder, exist_ok=True)

    def summary_path(filename):
        # Every tabular/JSON summary lands in the same folder.
        return os.path.join(summaries_dir, filename)

    print("Loading data:", input_path)
    df = load_csv(input_path, parse_dates=["TransactionMonth", "VehicleIntroDate"])

    # Column-level overview of the frame.
    data_structure(df).to_csv(summary_path("data_structure.csv"))

    # Descriptive statistics, limited to the key numeric columns that exist.
    wanted = ("TotalPremium", "TotalClaims", "CustomValueEstimate")
    present = [name for name in wanted if name in df.columns]
    descriptive_stats(df, present).to_csv(summary_path("descriptive_stats.csv"))

    # Loss ratios: portfolio-wide and per province.
    overall_lr = overall_loss_ratio(df)
    lr_by_province = loss_ratio_by_group(df, "Province").reset_index()
    lr_by_province.to_csv(summary_path("loss_ratio_by_province.csv"))
    with open(summary_path("loss_ratio_overall.json"), "w") as handle:
        json.dump({"overall_loss_ratio": overall_lr}, handle, default=str)

    # Outlier summary for TotalClaims; an empty dict is written if the column is absent.
    if "TotalClaims" in df.columns:
        outlier_tc = outlier_summary(df, "TotalClaims")
    else:
        outlier_tc = {}
    with open(summary_path("outlier_totalclaims.json"), "w") as handle:
        json.dump(outlier_tc, handle, default=str)

    # Render the four figures, in the same order as they are reported below.
    plotters = (
        plot_loss_ratio_by_province,
        plot_totalclaims_distribution,
        plot_claims_premium_time_series,
        scatter_premium_vs_claims,
    )
    saved = [plotter(df, figures_dir) for plotter in plotters]

    print("Saved figures:", *saved)
    print("Summaries saved to", summaries_dir)
    print("Overall Loss Ratio:", overall_lr)
    print("EDA complete")
54+
55+
if __name__ == "__main__":
    # Command-line entry point: both options are optional and fall back to
    # the repository's default data and report locations.
    cli = argparse.ArgumentParser(description="Run EDA for ACIS dataset")
    cli.add_argument("--input", default="data/raw/data.csv")
    cli.add_argument("--output", default="reports")
    options = cli.parse_args()
    main(options.input, options.output)
File renamed without changes.
9.45 KB
Binary file not shown.

src/eda/eda_tools.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
# src/eda/eda_tools.py
2+
import os
3+
import pandas as pd
4+
import numpy as np
5+
import matplotlib.pyplot as plt
6+
import seaborn as sns
7+
from typing import Tuple
8+
9+
sns.set(style="whitegrid", rc={"figure.dpi": 150})
10+
11+
def ensure_dir(path: str):
    """Create the directory ``path``; do nothing if it already exists."""
    os.makedirs(name=path, exist_ok=True)
13+
14+
# ---------- Summaries ----------
15+
def data_structure(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise each column of ``df`` in one row.

    The result is indexed by column name and has four columns: ``dtype``
    (as a string), ``non_null_count``, ``null_count`` and ``unique``
    (distinct values, with missing values counted as a value).
    """
    per_column = {}
    per_column["dtype"] = df.dtypes.astype(str)
    per_column["non_null_count"] = df.count()
    per_column["null_count"] = df.isna().sum()
    per_column["unique"] = df.nunique(dropna=False)
    return pd.DataFrame(per_column)
24+
25+
def descriptive_stats(df: pd.DataFrame, cols: list) -> pd.DataFrame:
    """Return ``describe()`` statistics for ``cols``, one row per column.

    Args:
        df: Source data.
        cols: Names of the columns to summarise.

    Returns:
        The transposed ``describe()`` table. When ``cols`` is empty an empty
        frame with the usual numeric statistic columns is returned, because
        ``DataFrame.describe`` raises ``ValueError`` on a frame without
        columns.
    """
    selected = list(cols)
    if not selected:
        return pd.DataFrame(
            columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
            dtype=float,
        )
    return df[selected].describe().T
27+
28+
# ---------- Business Metric: Loss Ratio ----------
29+
def overall_loss_ratio(df: pd.DataFrame) -> float:
    """Return sum of ``TotalClaims`` divided by sum of ``TotalPremium``.

    Missing values are skipped in both sums. ``NaN`` is returned when the
    premium sum is zero.
    """
    claims_sum = df["TotalClaims"].sum(skipna=True)
    premium_sum = df["TotalPremium"].sum(skipna=True)
    return claims_sum / premium_sum if premium_sum != 0 else np.nan
35+
36+
def loss_ratio_by_group(df: pd.DataFrame, group_col: str) -> pd.DataFrame:
    """Return premium, claims and loss ratio totals per ``group_col`` value.

    Args:
        df: Data containing ``TotalPremium``, ``TotalClaims`` and ``group_col``.
        group_col: Column whose values define the groups.

    Returns:
        A frame indexed by group with ``TotalPremium``, ``TotalClaims`` and
        ``LossRatio`` columns, sorted by ``LossRatio`` in descending order.
        A group whose premium sums to zero gets ``NaN`` as its loss ratio,
        matching ``overall_loss_ratio``, and therefore sorts last instead of
        appearing first with an infinite ratio.
    """
    totals = df.groupby(group_col)[["TotalPremium", "TotalClaims"]].sum()
    # Mask zero premiums so the division yields NaN rather than +/-inf.
    premium = totals["TotalPremium"].where(totals["TotalPremium"] != 0)
    totals = totals.assign(LossRatio=totals["TotalClaims"] / premium)
    return totals.sort_values("LossRatio", ascending=False)
41+
42+
# ---------- Time series ----------
43+
def monthly_claims_premiums(df: pd.DataFrame, date_col: str = "TransactionMonth") -> pd.DataFrame:
    """Aggregate claims and premiums by calendar month.

    Args:
        df: Data containing ``TotalClaims``, ``TotalPremium`` and ``date_col``.
            The input frame is not modified.
        date_col: Column holding the transaction date. Values that cannot be
            parsed as dates are dropped.

    Returns:
        A frame indexed by month start with the columns ``TotalClaims`` and
        ``TotalPremium`` (monthly sums), ``ClaimFrequency`` (number of rows
        with ``TotalClaims > 0``) and ``ClaimSeverity`` (``TotalClaims``
        divided by ``ClaimFrequency``, using a divisor of 1 when the month
        has no claims). The result is empty when no row has a valid date.
    """
    work = df.copy()
    work[date_col] = pd.to_datetime(work[date_col], errors="coerce")
    work = work.dropna(subset=[date_col])
    # A 0/1 indicator lets the claim count be summed in the same group-by
    # pass as the monetary totals.
    work = work.assign(ClaimFrequency=work["TotalClaims"].gt(0).astype(int))

    value_cols = ["TotalClaims", "TotalPremium", "ClaimFrequency"]
    monthly = work.groupby(pd.Grouper(key=date_col, freq="MS"))[value_cols].sum()
    # Clip the divisor to 1 so claim-free months give 0 instead of dividing by zero.
    monthly["ClaimSeverity"] = monthly["TotalClaims"] / monthly["ClaimFrequency"].clip(lower=1)
    return monthly
52+
53+
# ---------- Outlier detection ----------
54+
def outlier_summary(df: pd.DataFrame, col: str) -> dict:
    """Summarise outliers in ``df[col]`` using the 1.5 * IQR rule.

    Missing values are ignored. Values below ``q1 - 1.5 * iqr`` or above
    ``q3 + 1.5 * iqr`` count as outliers.

    Returns:
        A dict with the keys ``q1``, ``q3``, ``iqr``, ``lower``, ``upper``
        (Python floats) and ``n_outliers`` (Python int). Built-in types are
        used so the dict can be passed to ``json.dump`` directly and the
        count is written as a number.
    """
    values = df[col].dropna()
    q1 = float(values.quantile(0.25))
    q3 = float(values.quantile(0.75))
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    flagged = (values < lower) | (values > upper)
    return {
        "q1": q1,
        "q3": q3,
        "iqr": iqr,
        "lower": lower,
        "upper": upper,
        "n_outliers": int(flagged.sum()),
    }
62+
63+
# ---------- Plots (3 required polished plots) ----------
64+
def plot_loss_ratio_by_province(df: pd.DataFrame, outdir: str):
    """Save a bar chart of loss ratio per province and return the file path.

    The chart is written to ``loss_ratio_by_province.png`` inside ``outdir``,
    which is created if needed.
    """
    ensure_dir(outdir)
    ratios = loss_ratio_by_group(df, "Province")
    path = os.path.join(outdir, "loss_ratio_by_province.png")

    plt.figure(figsize=(10, 6))
    sns.barplot(x=ratios.index, y=ratios["LossRatio"])
    plt.title("Loss Ratio by Province")
    plt.ylabel("Loss Ratio (TotalClaims / TotalPremium)")
    # Slanted, right-aligned tick labels.
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig(path)
    plt.close()
    return path
77+
78+
def plot_totalclaims_distribution(df: pd.DataFrame, outdir: str):
    """Save a histogram (with KDE) of ``TotalClaims`` and return the file path.

    Missing values are dropped before plotting. The x-axis uses a symlog
    scale. The image is written to ``totalclaims_distribution.png`` inside
    ``outdir``, which is created if needed.
    """
    ensure_dir(outdir)
    claims = df["TotalClaims"].dropna()
    path = os.path.join(outdir, "totalclaims_distribution.png")

    plt.figure(figsize=(8, 5))
    sns.histplot(claims, bins=100, kde=True)
    # Symmetric log keeps zero-valued claims on the axis.
    plt.xscale("symlog")
    plt.title("Distribution of TotalClaims (log-friendly)")
    plt.xlabel("TotalClaims (symlog scale)")
    plt.tight_layout()
    plt.savefig(path)
    plt.close()
    return path
91+
92+
def plot_claims_premium_time_series(df: pd.DataFrame, outdir: str, date_col="TransactionMonth"):
    """Save a line chart of monthly claims and premiums and return the file path.

    Args:
        df: Data passed to ``monthly_claims_premiums``.
        outdir: Destination folder, created if needed.
        date_col: Name of the date column used for the monthly grouping.

    Returns:
        Path of the written ``monthly_claims_premium.png`` file.
    """
    ensure_dir(outdir)
    monthly = monthly_claims_premiums(df, date_col=date_col)

    # Draw on an explicitly created axes: DataFrame.plot() opens its own
    # figure when no ``ax`` is given, which would ignore the requested size
    # and leave a blank, never-closed figure behind.
    fig, ax = plt.subplots(figsize=(10, 6))
    monthly[["TotalClaims", "TotalPremium"]].plot(
        ax=ax, title="Monthly TotalClaims vs TotalPremium"
    )
    ax.set_ylabel("Amount (local currency)")
    fig.tight_layout()

    path = os.path.join(outdir, "monthly_claims_premium.png")
    fig.savefig(path)
    plt.close(fig)
    return path
103+
104+
# ---------- Bivariate exploration ----------
105+
def scatter_premium_vs_claims(df: pd.DataFrame, outdir: str, sample=10000):
    """Save a premium-vs-claims scatter plot and return the file path.

    At most ``sample`` rows are drawn, chosen with a fixed random seed. Both
    axes use a symlog scale. The image is written to
    ``scatter_premium_vs_claims.png`` inside ``outdir``, which is created if
    needed.
    """
    ensure_dir(outdir)
    n = min(len(df), sample)
    subset = df.sample(n=n, random_state=42)
    path = os.path.join(outdir, "scatter_premium_vs_claims.png")

    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=subset["TotalPremium"], y=subset["TotalClaims"], alpha=0.6)
    for set_scale in (plt.xscale, plt.yscale):
        set_scale("symlog")
    plt.xlabel("TotalPremium (symlog)")
    plt.ylabel("TotalClaims (symlog)")
    plt.title(f"Scatter: TotalPremium vs TotalClaims (sample n={n})")
    plt.tight_layout()
    plt.savefig(path)
    plt.close()
    return path

src/features/build_features.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
def build_features(df, reference_year=2025):
    """Return a copy of ``df`` with the derived feature columns added.

    Args:
        df: Data containing ``RegistrationYear`` and ``TotalClaims``. The
            input frame is left untouched.
        reference_year: Year that vehicle age is measured against. Defaults
            to 2025.

    Returns:
        A new frame with two extra columns: ``VehicleAge``
        (``reference_year - RegistrationYear``) and ``ClaimOccurred``
        (1 where ``TotalClaims > 0``, else 0).
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    features = df.copy()
    features["VehicleAge"] = reference_year - features["RegistrationYear"]
    features["ClaimOccurred"] = (features["TotalClaims"] > 0).astype(int)
    return features

0 commit comments

Comments
 (0)