23 changes: 10 additions & 13 deletions local_api.py
@@ -1,17 +1,17 @@
 import json
 
 import requests
 
-# TODO: send a GET using the URL http://127.0.0.1:8000
-r = None # Your code here
+BASE_URL = "http://127.0.0.1:8000"
 
-# TODO: print the status code
-# print()
-# TODO: print the welcome message
-# print()
+# ---- GET request ----------------------------------------------------------
+r = requests.get(f"{BASE_URL}/")
+
+# print status code and welcome message
+print("GET / status:", r.status_code)
+print("GET / response:", r.json())
+
+
+# ---- POST request ---------------------------------------------------------
 data = {
     "age": 37,
     "workclass": "Private",
@@ -29,10 +29,7 @@
     "native-country": "United-States",
 }
 
-# TODO: send a POST using the data above
-r = None # Your code here
+r = requests.post(f"{BASE_URL}/predict/", json=data)
 
-# TODO: print the status code
-# print()
-# TODO: print the result
-# print()
+print("POST /predict/ status:", r.status_code)
+print("POST /predict/ response:", r.json())
54 changes: 36 additions & 18 deletions main.py
@@ -7,6 +7,7 @@
 from ml.data import apply_label, process_data
 from ml.model import inference, load_model
 
+
 # DO NOT MODIFY
 class Data(BaseModel):
     age: int = Field(..., example=37)
@@ -26,25 +27,32 @@ class Data(BaseModel):
     hours_per_week: int = Field(..., example=40, alias="hours-per-week")
     native_country: str = Field(..., example="United-States", alias="native-country")
 
-path = None # TODO: enter the path for the saved encoder
-encoder = load_model(path)
+# ---- load saved artifacts -------------------------------------------------
+project_path = "."
+model_dir = os.path.join(project_path, "model")
 
-path = None # TODO: enter the path for the saved model
-model = load_model(path)
+encoder_path = os.path.join(model_dir, "encoder.pkl")
+lb_path = os.path.join(model_dir, "lb.pkl")
+model_path = os.path.join(model_dir, "model.pkl")
+
+encoder = load_model(encoder_path)
+lb = load_model(lb_path)
+model = load_model(model_path)
 
-# TODO: create a RESTful API using FastAPI
-app = None # your code here
+# ---- create FastAPI app ---------------------------------------------------
+app = FastAPI(title="Income Classification API")
 
-# TODO: create a GET on the root giving a welcome message
+
+# ---- root endpoint --------------------------------------------------------
 @app.get("/")
 async def get_root():
-    """ Say hello!"""
-    # your code here
-    pass
+    """Simple welcome endpoint."""
+    return {"message": "Hello from the income classification API!"}
 
 
-# TODO: create a POST on a different path that does model inference
-@app.post("/data/")
+# ---- prediction endpoint --------------------------------------------------
+@app.post("/predict/")
 async def post_inference(data: Data):
     # DO NOT MODIFY: turn the Pydantic model into a dict.
     data_dict = data.dict()
@@ -64,11 +72,21 @@ async def post_inference(data: Data):
"sex",
"native-country",
]

# process data for inference
data_processed, _, _, _ = process_data(
# your code here
# use data as data input
# use training = False
# do not need to pass lb as input
data,
categorical_features=cat_features,
label=None,
training=False,
encoder=encoder,
lb=lb,
)
_inference = None # your code here to predict the result using data_processed
return {"result": apply_label(_inference)}

# make prediction
preds = inference(model, data_processed)
_inference = preds[0]

# convert 0/1 to <=50K />50K label
# NOTE: apply_label expects an indexable input, so wrap in a list
return {"result": apply_label([_inference])}
125 changes: 46 additions & 79 deletions ml/model.py
@@ -1,9 +1,12 @@
 import pickle
+import numpy as np
 
+from sklearn.ensemble import RandomForestClassifier
 from sklearn.metrics import fbeta_score, precision_score, recall_score
 
 from ml.data import process_data
-# TODO: add necessary import
 
-# Optional: implement hyperparameter tuning.
 
 def train_model(X_train, y_train):
     """
     Trains a machine learning model and returns it.
@@ -19,110 +22,74 @@ def train_model(X_train, y_train):
     model
         Trained machine learning model.
     """
-    # TODO: implement the function
-    pass
+    model = RandomForestClassifier(
+        n_estimators=100,
+        random_state=42,
+        n_jobs=-1,
+    )
+    model.fit(X_train, y_train)
+    return model
 
 
 def compute_model_metrics(y, preds):
     """
     Validates the trained machine learning model using precision, recall, and F1.
 
     Inputs
     ------
     y : np.array
         Known labels, binarized.
     preds : np.array
         Predicted labels, binarized.
     Returns
     -------
     precision : float
     recall : float
     fbeta : float
     """
+    y = np.array(y).reshape(-1)
+    preds = np.array(preds).reshape(-1)
+
     fbeta = fbeta_score(y, preds, beta=1, zero_division=1)
     precision = precision_score(y, preds, zero_division=1)
     recall = recall_score(y, preds, zero_division=1)
     return precision, recall, fbeta
 
 
 def inference(model, X):
-    """ Run model inferences and return the predictions.
-
-    Inputs
-    ------
-    model : ???
-        Trained machine learning model.
-    X : np.array
-        Data used for prediction.
-    Returns
-    -------
-    preds : np.array
-        Predictions from the model.
-    """
-    # TODO: implement the function
-    pass
+    """
+    Run model inferences and return the predictions.
+    """
+    preds = model.predict(X)
+    return np.array(preds).reshape(-1)
 
 
-def save_model(model, path):
-    """ Serializes model to a file.
-
-    Inputs
-    ------
-    model
-        Trained machine learning model or OneHotEncoder.
-    path : str
-        Path to save pickle file.
-    """
-    # TODO: implement the function
-    pass
+def save_model(model, path):
+    """
+    Serializes model or encoder or label binarizer to a file.
+    """
+    with open(path, "wb") as f:
+        pickle.dump(model, f)
 
 
 def load_model(path):
-    """ Loads pickle file from `path` and returns it."""
-    # TODO: implement the function
-    pass
+    """
+    Loads pickle file from path and returns it.
+    """
+    with open(path, "rb") as f:
+        obj = pickle.load(f)
+    return obj
 
 
 def performance_on_categorical_slice(
     data, column_name, slice_value, categorical_features, label, encoder, lb, model
 ):
-    """ Computes the model metrics on a slice of the data specified by a column name and
-
-    Processes the data using one hot encoding for the categorical features and a
-    label binarizer for the labels. This can be used in either training or
-    inference/validation.
-
-    Inputs
-    ------
-    data : pd.DataFrame
-        Dataframe containing the features and label. Columns in `categorical_features`
-    column_name : str
-        Column containing the sliced feature.
-    slice_value : str, int, float
-        Value of the slice feature.
-    categorical_features: list
-        List containing the names of the categorical features (default=[])
-    label : str
-        Name of the label column in `X`. If None, then an empty array will be returned
-        for y (default=None)
-    encoder : sklearn.preprocessing._encoders.OneHotEncoder
-        Trained sklearn OneHotEncoder, only used if training=False.
-    lb : sklearn.preprocessing._label.LabelBinarizer
-        Trained sklearn LabelBinarizer, only used if training=False.
-    model : ???
-        Model used for the task.
-
-    Returns
-    -------
-    precision : float
-    recall : float
-    fbeta : float
-
-    """
-    # TODO: implement the function
+    """
+    Computes the model metrics on a slice of the data specified by a column name
+    and slice value.
+    """
+    slice_data = data[data[column_name] == slice_value]
+
+    if slice_data.empty:
+        return 0.0, 0.0, 0.0
+
     X_slice, y_slice, _, _ = process_data(
-        # your code here
-        # for input data, use data in column given as "column_name", with the slice_value
-        # use training = False
+        slice_data,
+        categorical_features=categorical_features,
+        label=label,
+        training=False,
+        encoder=encoder,
+        lb=lb,
     )
-    preds = None # your code here to get prediction on X_slice using the inference function
+
+    preds = inference(model, X_slice)
     precision, recall, fbeta = compute_model_metrics(y_slice, preds)
     return precision, recall, fbeta
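
A quick way to exercise the implemented functions together, on synthetic data (a sketch, not part of the PR; the temp path assumes the `model/` directory exists):

```python
import numpy as np

from ml.model import (
    compute_model_metrics,
    inference,
    load_model,
    save_model,
    train_model,
)

# tiny synthetic binary-classification problem
rng = np.random.default_rng(42)
X = rng.normal(size=(200, 5))
y = (X[:, 0] + rng.normal(scale=0.5, size=200) > 0).astype(int)

model = train_model(X, y)
preds = inference(model, X)
precision, recall, fbeta = compute_model_metrics(y, preds)
print(f"precision={precision:.3f} recall={recall:.3f} f1={fbeta:.3f}")

# a pickle round-trip should preserve predictions exactly
save_model(model, "model/_tmp_model.pkl")
restored = load_model("model/_tmp_model.pkl")
assert (inference(restored, X) == preds).all()
```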
Binary file added model/encoder.pkl
Binary file added model/lb.pkl
Binary file added model/model.pkl
71 changes: 70 additions & 1 deletion model_card_template.md
@@ -2,17 +2,86 @@

For additional information see the Model Card paper: https://arxiv.org/pdf/1810.03993.pdf

---

## Model Details

This project uses a Random Forest classifier implemented with scikit-learn.
The model predicts whether an individual earns more than $50,000 per year.
It was trained locally using Python 3.10 as part of the Udacity Machine Learning DevOps project.

The model is saved as `model/model.pkl` and the preprocessing objects (encoder and label binarizer) are stored in the same folder.

---

## Intended Use

This model is intended purely for educational and demonstration purposes.
It shows how to build an end-to-end ML pipeline covering data processing, model training, automated testing, performance evaluation, CI, and API deployment with FastAPI.

It is **not** intended for real-world decision making in hiring, lending, housing, insurance, or other areas that affect people’s lives.

---

## Training Data

The model was trained on the provided **Census Income dataset**, which contains demographic and employment-related features.
Features include workclass, education, marital status, occupation, race, sex, and native country.

Eighty percent of the dataset is used for training.
Categorical features are one-hot encoded using the provided `process_data` function, roughly as sketched below.
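
A sketch of the training-time call (the data path, label column name, and full feature list are assumptions based on the standard Census Income schema; the `process_data` signature matches its inference-time use in `main.py`):

```python
import pandas as pd

from ml.data import process_data

data = pd.read_csv("data/census.csv")  # path assumed

cat_features = [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country",
]

# training=True fits and returns the OneHotEncoder and LabelBinarizer
X_train, y_train, encoder, lb = process_data(
    data,
    categorical_features=cat_features,
    label="salary",  # assumed label column name
    training=True,
)
```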

---

## Evaluation Data

Twenty percent of the dataset is held out as a test set.
The same preprocessing pipeline (encoder and label binarizer) is applied to the test data.

Evaluation includes both overall performance on the test set and performance on slices of data across individual categories (workclass, sex, race, etc.).
Slice performance is saved to `slice_output.txt`.
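
A sketch of how `slice_output.txt` can be produced with `performance_on_categorical_slice` from `ml/model.py` (assuming `test` is the held-out DataFrame and `cat_features`, `encoder`, `lb`, and `model` are loaded as above; the `"salary"` label name is an assumption):

```python
from ml.model import performance_on_categorical_slice

with open("slice_output.txt", "w") as f:
    for col in cat_features:
        for value in sorted(test[col].unique()):
            p, r, fb = performance_on_categorical_slice(
                test, col, value, cat_features, "salary", encoder, lb, model
            )
            f.write(f"{col}={value}: precision={p:.4f} recall={r:.4f} f1={fb:.4f}\n")
```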

---

## Metrics

Overall performance on the held-out test set:

- **Precision:** 0.7419
- **Recall:** 0.6384
- **F1 Score:** 0.6863

Slice performance varies across groups. Examples include:

- **Sex = Male:** higher recall and F1 than for **Sex = Female**
- **Workclass = Private:** moderate precision and recall
- **Education levels:** substantial variation across categories

These results indicate that model performance is not uniform across demographic groups.

---

## Ethical Considerations

The Census dataset contains real-world demographic information and may reflect historical social biases.
A model trained on such data can inherit or amplify those biases.
For this reason, the model should **not** be used for any real decision making that can affect individuals.

Potential risks include:

- Unequal performance for different demographic groups
- Unintended discrimination if used in hiring or income-related screening
- Misinterpretation of model predictions due to limited context

Care must be taken to avoid misuse.

---

## Caveats and Recommendations

- The model is not tuned or optimized; default Random Forest parameters are used (see the tuning sketch after this list).
- The underlying census data is outdated and does not reflect current populations.
- The model should be used for learning purposes only.
- If extended further, a fairness analysis, hyperparameter tuning, and more recent data are recommended.
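
If tuning were added, a standard scikit-learn grid search could slot in as a drop-in replacement for `train_model` (a sketch; the grid values and `scoring="f1"` are illustrative choices, not part of this project):

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


def train_model(X_train, y_train):
    """Grid-searched variant of ml.model.train_model."""
    param_grid = {
        "n_estimators": [100, 300],
        "max_depth": [None, 10, 20],
        "min_samples_leaf": [1, 5],
    }
    search = GridSearchCV(
        RandomForestClassifier(random_state=42, n_jobs=-1),
        param_grid,
        scoring="f1",  # mirrors the F1-centric evaluation above
        cv=3,
    )
    search.fit(X_train, y_train)
    return search.best_estimator_
```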

The model should always be accompanied by transparency about its limitations and intended educational use.
Binary file added screenshots/continuous_integration.png
Binary file added screenshots/local_api.png
Binary file added screenshots/unit_test.png