diff --git a/local_api.py b/local_api.py index a3bff2f988..5ad4eb0a7b 100644 --- a/local_api.py +++ b/local_api.py @@ -1,17 +1,17 @@ import json - import requests -# TODO: send a GET using the URL http://127.0.0.1:8000 -r = None # Your code here +BASE_URL = "http://127.0.0.1:8000" -# TODO: print the status code -# print() -# TODO: print the welcome message -# print() +# ---- GET request ---------------------------------------------------------- +r = requests.get(f"{BASE_URL}/") +# print status code and welcome message +print("GET / status:", r.status_code) +print("GET / response:", r.json()) +# ---- POST request --------------------------------------------------------- data = { "age": 37, "workclass": "Private", @@ -29,10 +29,7 @@ "native-country": "United-States", } -# TODO: send a POST using the data above -r = None # Your code here +r = requests.post(f"{BASE_URL}/predict/", json=data) -# TODO: print the status code -# print() -# TODO: print the result -# print() +print("POST /predict/ status:", r.status_code) +print("POST /predict/ response:", r.json()) diff --git a/main.py b/main.py index 638e2414de..53a058816c 100644 --- a/main.py +++ b/main.py @@ -7,6 +7,7 @@ from ml.data import apply_label, process_data from ml.model import inference, load_model + # DO NOT MODIFY class Data(BaseModel): age: int = Field(..., example=37) @@ -26,25 +27,32 @@ class Data(BaseModel): hours_per_week: int = Field(..., example=40, alias="hours-per-week") native_country: str = Field(..., example="United-States", alias="native-country") -path = None # TODO: enter the path for the saved encoder -encoder = load_model(path) -path = None # TODO: enter the path for the saved model -model = load_model(path) +# ---- load saved artifacts ------------------------------------------------- +project_path = "." 
+model_dir = os.path.join(project_path, "model") + +encoder_path = os.path.join(model_dir, "encoder.pkl") +lb_path = os.path.join(model_dir, "lb.pkl") +model_path = os.path.join(model_dir, "model.pkl") + +encoder = load_model(encoder_path) +lb = load_model(lb_path) +model = load_model(model_path) + +# ---- create FastAPI app --------------------------------------------------- +app = FastAPI(title="Income Classification API") -# TODO: create a RESTful API using FastAPI -app = None # your code here -# TODO: create a GET on the root giving a welcome message +# ---- root endpoint -------------------------------------------------------- @app.get("/") async def get_root(): - """ Say hello!""" - # your code here - pass + """Simple welcome endpoint.""" + return {"message": "Hello from the income classification API!"} -# TODO: create a POST on a different path that does model inference -@app.post("/data/") +# ---- prediction endpoint -------------------------------------------------- +@app.post("/predict/") async def post_inference(data: Data): # DO NOT MODIFY: turn the Pydantic model into a dict. 
data_dict = data.dict() @@ -64,11 +72,21 @@ async def post_inference(data: Data): "sex", "native-country", ] + + # process data for inference data_processed, _, _, _ = process_data( - # your code here - # use data as data input - # use training = False - # do not need to pass lb as input + data, + categorical_features=cat_features, + label=None, + training=False, + encoder=encoder, + lb=lb, ) - _inference = None # your code here to predict the result using data_processed - return {"result": apply_label(_inference)} + + # make prediction + preds = inference(model, data_processed) + _inference = preds[0] + + # convert 0/1 to <=50K />50K label + # NOTE: apply_label expects an indexable input, so wrap in a list + return {"result": apply_label([_inference])} diff --git a/ml/model.py b/ml/model.py index f361110f18..d676fa4929 100644 --- a/ml/model.py +++ b/ml/model.py @@ -1,9 +1,12 @@ import pickle +import numpy as np + +from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import fbeta_score, precision_score, recall_score + from ml.data import process_data -# TODO: add necessary import -# Optional: implement hyperparameter tuning. + def train_model(X_train, y_train): """ Trains a machine learning model and returns it. @@ -19,26 +22,22 @@ def train_model(X_train, y_train): model Trained machine learning model. """ - # TODO: implement the function - pass + model = RandomForestClassifier( + n_estimators=100, + random_state=42, + n_jobs=-1, + ) + model.fit(X_train, y_train) + return model def compute_model_metrics(y, preds): """ Validates the trained machine learning model using precision, recall, and F1. - - Inputs - ------ - y : np.array - Known labels, binarized. - preds : np.array - Predicted labels, binarized. 
- Returns - ------- - precision : float - recall : float - fbeta : float """ + y = np.array(y).reshape(-1) + preds = np.array(preds).reshape(-1) + fbeta = fbeta_score(y, preds, beta=1, zero_division=1) precision = precision_score(y, preds, zero_division=1) recall = recall_score(y, preds, zero_division=1) @@ -46,83 +45,51 @@ def compute_model_metrics(y, preds): def inference(model, X): - """ Run model inferences and return the predictions. - - Inputs - ------ - model : ??? - Trained machine learning model. - X : np.array - Data used for prediction. - Returns - ------- - preds : np.array - Predictions from the model. """ - # TODO: implement the function - pass + Run model inferences and return the predictions. + """ + preds = model.predict(X) + return np.array(preds).reshape(-1) -def save_model(model, path): - """ Serializes model to a file. - Inputs - ------ - model - Trained machine learning model or OneHotEncoder. - path : str - Path to save pickle file. +def save_model(model, path): + """ + Serializes model or encoder or label binarizer to a file. """ - # TODO: implement the function - pass + with open(path, "wb") as f: + pickle.dump(model, f) + def load_model(path): - """ Loads pickle file from `path` and returns it.""" - # TODO: implement the function - pass + """ + Loads pickle file from path and returns it. + """ + with open(path, "rb") as f: + obj = pickle.load(f) + return obj def performance_on_categorical_slice( data, column_name, slice_value, categorical_features, label, encoder, lb, model ): - """ Computes the model metrics on a slice of the data specified by a column name and - - Processes the data using one hot encoding for the categorical features and a - label binarizer for the labels. This can be used in either training or - inference/validation. + """ + Computes the model metrics on a slice of the data specified by a column name + and slice value. 
+ """ + slice_data = data[data[column_name] == slice_value] - Inputs - ------ - data : pd.DataFrame - Dataframe containing the features and label. Columns in `categorical_features` - column_name : str - Column containing the sliced feature. - slice_value : str, int, float - Value of the slice feature. - categorical_features: list - List containing the names of the categorical features (default=[]) - label : str - Name of the label column in `X`. If None, then an empty array will be returned - for y (default=None) - encoder : sklearn.preprocessing._encoders.OneHotEncoder - Trained sklearn OneHotEncoder, only used if training=False. - lb : sklearn.preprocessing._label.LabelBinarizer - Trained sklearn LabelBinarizer, only used if training=False. - model : ??? - Model used for the task. + if slice_data.empty: + return 0.0, 0.0, 0.0 - Returns - ------- - precision : float - recall : float - fbeta : float - - """ - # TODO: implement the function X_slice, y_slice, _, _ = process_data( - # your code here - # for input data, use data in column given as "column_name", with the slice_value - # use training = False + slice_data, + categorical_features=categorical_features, + label=label, + training=False, + encoder=encoder, + lb=lb, ) - preds = None # your code here to get prediction on X_slice using the inference function + + preds = inference(model, X_slice) precision, recall, fbeta = compute_model_metrics(y_slice, preds) return precision, recall, fbeta diff --git a/model/encoder.pkl b/model/encoder.pkl new file mode 100644 index 0000000000..ba9c2135af Binary files /dev/null and b/model/encoder.pkl differ diff --git a/model/lb.pkl b/model/lb.pkl new file mode 100644 index 0000000000..510550bf3e Binary files /dev/null and b/model/lb.pkl differ diff --git a/model/model.pkl b/model/model.pkl new file mode 100644 index 0000000000..10d3212a5c Binary files /dev/null and b/model/model.pkl differ diff --git a/model_card_template.md b/model_card_template.md index 
0392f3b9eb..d42c143a15 100644 --- a/model_card_template.md +++ b/model_card_template.md @@ -2,17 +2,86 @@ For additional information see the Model Card paper: https://arxiv.org/pdf/1810.03993.pdf +--- + ## Model Details +This project uses a Random Forest classifier implemented with scikit-learn. +The purpose of the model is to predict whether an individual earns more than fifty thousand dollars per year. +The model was trained locally using Python 3.10 as part of the Udacity Machine Learning DevOps project. + +The model is saved as `model/model.pkl`, and the preprocessing objects (encoder and label binarizer) are stored in the same folder. + +--- + ## Intended Use +This model is intended purely for educational and demonstration purposes. +It shows how to build an end-to-end ML pipeline, including data processing, model training, automated testing, performance evaluation, CI, and API deployment with FastAPI. + +It is **not** intended for real-world decision-making in hiring, lending, housing, insurance, or other areas that affect people’s lives. + +--- + ## Training Data +The model was trained on the provided **Census Income dataset**, which contains demographic and employment-related features. +Features include workclass, education, marital status, occupation, race, sex, and native country. + +Eighty percent of the dataset is used for training. +Categorical features are one-hot encoded using the provided `process_data` function. + +--- + ## Evaluation Data +Twenty percent of the dataset is held out as a test set. +The same preprocessing pipeline (encoder and label binarizer) is applied to the test data. + +Evaluation includes both overall performance on the test set and performance on slices of data across individual categories (workclass, sex, race, etc.). +Slice performance is saved to `slice_output.txt`. 
+ +--- + ## Metrics -_Please include the metrics used and your model's performance on those metrics._ + +Overall performance on the held-out test set: + +- **Precision:** 0.7419 +- **Recall:** 0.6384 +- **F1 Score:** 0.6863 + +Slice performance varies across groups. Examples include: + +- **Sex = Male:** Higher recall and F1 than Sex = Female (recall 0.6599 vs. 0.5150; F1 0.6997 vs. 0.6015) +- **Workclass = Private:** Moderate precision (0.7376) and recall (0.6404), close to the overall figures +- **Education levels:** Significant variation depending on category (for example, F1 of 0.5261 for HS-grad vs. 0.8409 for Masters) + +These results indicate that model performance is not uniform across demographic groups. Scores for very small slices are unreliable: the metrics are computed with `zero_division=1`, so an undefined precision or recall is reported as 1.0000. + +--- ## Ethical Considerations +The Census dataset contains real-world demographic information and may reflect historical social biases. +A model trained on such data can inherit or amplify these biases. +Because of this, the model should **not** be used for any real decision-making that can affect individuals. + +Potential risks include: + +- Unequal performance for different demographic groups +- Unintended discrimination if used in hiring or income-related screening +- Misinterpretation of model predictions due to limited context + +Care must be taken to avoid misuse. + +--- + ## Caveats and Recommendations + +- The model is not tuned or optimized; the Random Forest uses 100 trees and a fixed random seed, with all other hyperparameters left at their defaults. +- The data is outdated and does not reflect current populations. +- The model should be used only for learning purposes. +- If the project is extended further, a fairness analysis, hyperparameter tuning, and updated datasets are recommended. + +The model should always be accompanied by transparency about its limitations and intended educational use. 
diff --git a/screenshots/continuous_integration.png b/screenshots/continuous_integration.png new file mode 100644 index 0000000000..b92bd71819 Binary files /dev/null and b/screenshots/continuous_integration.png differ diff --git a/screenshots/local_api.png b/screenshots/local_api.png new file mode 100644 index 0000000000..05362e059c Binary files /dev/null and b/screenshots/local_api.png differ diff --git a/screenshots/unit_test.png b/screenshots/unit_test.png new file mode 100644 index 0000000000..de81ea496e Binary files /dev/null and b/screenshots/unit_test.png differ diff --git a/slice_output.txt b/slice_output.txt new file mode 100644 index 0000000000..d52839dd53 --- /dev/null +++ b/slice_output.txt @@ -0,0 +1,198 @@ +workclass: ?, Count: 389 +Precision: 0.6538 | Recall: 0.4048 | F1: 0.5000 +workclass: Federal-gov, Count: 191 +Precision: 0.7971 | Recall: 0.7857 | F1: 0.7914 +workclass: Local-gov, Count: 387 +Precision: 0.7576 | Recall: 0.6818 | F1: 0.7177 +workclass: Private, Count: 4,578 +Precision: 0.7376 | Recall: 0.6404 | F1: 0.6856 +workclass: Self-emp-inc, Count: 212 +Precision: 0.7807 | Recall: 0.7542 | F1: 0.7672 +workclass: Self-emp-not-inc, Count: 498 +Precision: 0.7064 | Recall: 0.4904 | F1: 0.5789 +workclass: State-gov, Count: 254 +Precision: 0.7424 | Recall: 0.6712 | F1: 0.7050 +workclass: Without-pay, Count: 4 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +education: 10th, Count: 183 +Precision: 0.4000 | Recall: 0.1667 | F1: 0.2353 +education: 11th, Count: 225 +Precision: 1.0000 | Recall: 0.2727 | F1: 0.4286 +education: 12th, Count: 98 +Precision: 1.0000 | Recall: 0.4000 | F1: 0.5714 +education: 1st-4th, Count: 23 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +education: 5th-6th, Count: 62 +Precision: 1.0000 | Recall: 0.5000 | F1: 0.6667 +education: 7th-8th, Count: 141 +Precision: 0.0000 | Recall: 0.0000 | F1: 0.0000 +education: 9th, Count: 115 +Precision: 1.0000 | Recall: 0.3333 | F1: 0.5000 +education: Assoc-acdm, Count: 198 +Precision: 
0.7000 | Recall: 0.5957 | F1: 0.6437 +education: Assoc-voc, Count: 273 +Precision: 0.6471 | Recall: 0.5238 | F1: 0.5789 +education: Bachelors, Count: 1,053 +Precision: 0.7523 | Recall: 0.7289 | F1: 0.7404 +education: Doctorate, Count: 77 +Precision: 0.8644 | Recall: 0.8947 | F1: 0.8793 +education: HS-grad, Count: 2,085 +Precision: 0.6594 | Recall: 0.4377 | F1: 0.5261 +education: Masters, Count: 369 +Precision: 0.8271 | Recall: 0.8551 | F1: 0.8409 +education: Preschool, Count: 10 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +education: Prof-school, Count: 116 +Precision: 0.8182 | Recall: 0.9643 | F1: 0.8852 +education: Some-college, Count: 1,485 +Precision: 0.6857 | Recall: 0.5199 | F1: 0.5914 +marital-status: Divorced, Count: 920 +Precision: 0.7600 | Recall: 0.3689 | F1: 0.4967 +marital-status: Married-AF-spouse, Count: 4 +Precision: 1.0000 | Recall: 0.0000 | F1: 0.0000 +marital-status: Married-civ-spouse, Count: 2,950 +Precision: 0.7346 | Recall: 0.6900 | F1: 0.7116 +marital-status: Married-spouse-absent, Count: 96 +Precision: 1.0000 | Recall: 0.2500 | F1: 0.4000 +marital-status: Never-married, Count: 2,126 +Precision: 0.8302 | Recall: 0.4272 | F1: 0.5641 +marital-status: Separated, Count: 209 +Precision: 1.0000 | Recall: 0.4211 | F1: 0.5926 +marital-status: Widowed, Count: 208 +Precision: 1.0000 | Recall: 0.1579 | F1: 0.2727 +occupation: ?, Count: 389 +Precision: 0.6538 | Recall: 0.4048 | F1: 0.5000 +occupation: Adm-clerical, Count: 726 +Precision: 0.6338 | Recall: 0.4688 | F1: 0.5389 +occupation: Armed-Forces, Count: 3 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +occupation: Craft-repair, Count: 821 +Precision: 0.6567 | Recall: 0.4862 | F1: 0.5587 +occupation: Exec-managerial, Count: 838 +Precision: 0.7952 | Recall: 0.7531 | F1: 0.7736 +occupation: Farming-fishing, Count: 193 +Precision: 0.5455 | Recall: 0.2143 | F1: 0.3077 +occupation: Handlers-cleaners, Count: 273 +Precision: 0.5714 | Recall: 0.3333 | F1: 0.4211 +occupation: Machine-op-inspct, Count: 
378 +Precision: 0.5938 | Recall: 0.4043 | F1: 0.4810 +occupation: Other-service, Count: 667 +Precision: 1.0000 | Recall: 0.1923 | F1: 0.3226 +occupation: Priv-house-serv, Count: 26 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +occupation: Prof-specialty, Count: 828 +Precision: 0.7880 | Recall: 0.7679 | F1: 0.7778 +occupation: Protective-serv, Count: 136 +Precision: 0.7353 | Recall: 0.5952 | F1: 0.6579 +occupation: Sales, Count: 729 +Precision: 0.7273 | Recall: 0.6667 | F1: 0.6957 +occupation: Tech-support, Count: 189 +Precision: 0.7143 | Recall: 0.6863 | F1: 0.7000 +occupation: Transport-moving, Count: 317 +Precision: 0.6250 | Recall: 0.4688 | F1: 0.5357 +relationship: Husband, Count: 2,590 +Precision: 0.7370 | Recall: 0.6923 | F1: 0.7140 +relationship: Not-in-family, Count: 1,702 +Precision: 0.7959 | Recall: 0.4149 | F1: 0.5455 +relationship: Other-relative, Count: 178 +Precision: 1.0000 | Recall: 0.3750 | F1: 0.5455 +relationship: Own-child, Count: 1,019 +Precision: 1.0000 | Recall: 0.1765 | F1: 0.3000 +relationship: Unmarried, Count: 702 +Precision: 0.9231 | Recall: 0.2667 | F1: 0.4138 +relationship: Wife, Count: 322 +Precision: 0.7132 | Recall: 0.6783 | F1: 0.6953 +race: Amer-Indian-Eskimo, Count: 71 +Precision: 0.6250 | Recall: 0.5000 | F1: 0.5556 +race: Asian-Pac-Islander, Count: 193 +Precision: 0.7857 | Recall: 0.7097 | F1: 0.7458 +race: Black, Count: 599 +Precision: 0.7273 | Recall: 0.6154 | F1: 0.6667 +race: Other, Count: 55 +Precision: 1.0000 | Recall: 0.6667 | F1: 0.8000 +race: White, Count: 5,595 +Precision: 0.7404 | Recall: 0.6373 | F1: 0.6850 +sex: Female, Count: 2,126 +Precision: 0.7229 | Recall: 0.5150 | F1: 0.6015 +sex: Male, Count: 4,387 +Precision: 0.7445 | Recall: 0.6599 | F1: 0.6997 +native-country: ?, Count: 125 +Precision: 0.7500 | Recall: 0.6774 | F1: 0.7119 +native-country: Cambodia, Count: 3 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +native-country: Canada, Count: 22 +Precision: 0.6667 | Recall: 0.7500 | F1: 0.7059 
+native-country: China, Count: 18 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +native-country: Columbia, Count: 6 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +native-country: Cuba, Count: 19 +Precision: 0.6667 | Recall: 0.8000 | F1: 0.7273 +native-country: Dominican-Republic, Count: 8 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +native-country: Ecuador, Count: 5 +Precision: 1.0000 | Recall: 0.5000 | F1: 0.6667 +native-country: El-Salvador, Count: 20 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +native-country: England, Count: 14 +Precision: 0.6667 | Recall: 0.5000 | F1: 0.5714 +native-country: France, Count: 5 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +native-country: Germany, Count: 32 +Precision: 0.8462 | Recall: 0.8462 | F1: 0.8462 +native-country: Greece, Count: 7 +Precision: 0.0000 | Recall: 0.0000 | F1: 0.0000 +native-country: Guatemala, Count: 13 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +native-country: Haiti, Count: 6 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +native-country: Honduras, Count: 4 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +native-country: Hong, Count: 8 +Precision: 0.5000 | Recall: 1.0000 | F1: 0.6667 +native-country: Hungary, Count: 3 +Precision: 1.0000 | Recall: 0.5000 | F1: 0.6667 +native-country: India, Count: 21 +Precision: 0.8750 | Recall: 0.8750 | F1: 0.8750 +native-country: Iran, Count: 12 +Precision: 0.3333 | Recall: 0.2000 | F1: 0.2500 +native-country: Ireland, Count: 5 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +native-country: Italy, Count: 14 +Precision: 0.7500 | Recall: 0.7500 | F1: 0.7500 +native-country: Jamaica, Count: 13 +Precision: 0.0000 | Recall: 1.0000 | F1: 0.0000 +native-country: Japan, Count: 11 +Precision: 0.7500 | Recall: 0.7500 | F1: 0.7500 +native-country: Laos, Count: 4 +Precision: 1.0000 | Recall: 0.0000 | F1: 0.0000 +native-country: Mexico, Count: 114 +Precision: 1.0000 | Recall: 0.3333 | F1: 0.5000 +native-country: Nicaragua, Count: 7 +Precision: 1.0000 | 
Recall: 1.0000 | F1: 1.0000 +native-country: Peru, Count: 5 +Precision: 0.0000 | Recall: 0.0000 | F1: 0.0000 +native-country: Philippines, Count: 35 +Precision: 1.0000 | Recall: 0.6875 | F1: 0.8148 +native-country: Poland, Count: 14 +Precision: 0.6667 | Recall: 1.0000 | F1: 0.8000 +native-country: Portugal, Count: 6 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +native-country: Puerto-Rico, Count: 22 +Precision: 0.8333 | Recall: 0.8333 | F1: 0.8333 +native-country: Scotland, Count: 3 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +native-country: South, Count: 13 +Precision: 0.3333 | Recall: 0.5000 | F1: 0.4000 +native-country: Taiwan, Count: 11 +Precision: 0.7500 | Recall: 0.7500 | F1: 0.7500 +native-country: Thailand, Count: 5 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +native-country: Trinadad&Tobago, Count: 3 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +native-country: United-States, Count: 5,870 +Precision: 0.7392 | Recall: 0.6321 | F1: 0.6814 +native-country: Vietnam, Count: 5 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 +native-country: Yugoslavia, Count: 2 +Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 diff --git a/test_ml.py b/test_ml.py index 5f8306f14c..2174148dbd 100644 --- a/test_ml.py +++ b/test_ml.py @@ -1,28 +1,101 @@ import pytest -# TODO: add necessary import +import numpy as np +from sklearn.ensemble import RandomForestClassifier -# TODO: implement the first test. Change the function name and input as needed -def test_one(): +from ml.data import process_data +from ml.model import ( + train_model, + inference, + compute_model_metrics +) + + +# --------------------------------------------------------- +# Test 1: process_data returns correct structure +# --------------------------------------------------------- +def test_process_data_shapes(): """ - # add description for the first test + Ensures that process_data correctly encodes categorical features + and splits labels. Checks correct output lengths and array types. 
""" - # Your code here - pass + sample = { + "age": [25, 40, 60], + "workclass": ["Private", "Self-emp", "Private"], + "salary": ["<=50K", ">50K", "<=50K"] + } + + import pandas as pd + df = pd.DataFrame(sample) + + cat_features = ["workclass"] + X, y, encoder, lb = process_data( + df, + categorical_features=cat_features, + label="salary", + training=True + ) -# TODO: implement the second test. Change the function name and input as needed -def test_two(): + # X must be 2D + assert len(X.shape) == 2 + # y must be 1D + assert len(y.shape) == 1 + # lengths must match + assert X.shape[0] == y.shape[0] + # encoder should not be None + assert encoder is not None + assert lb is not None + + +# --------------------------------------------------------- +# Test 2: Model trains and inference returns correct shape +# --------------------------------------------------------- +def test_model_training_and_inference(): """ - # add description for the second test + Ensures that train_model returns a sklearn model and inference() + produces predictions of correct shape. """ - # Your code here - pass + sample = { + "age": [22, 45, 37, 52], + "workclass": ["Private", "Self-emp", "Private", "State-gov"], + "salary": ["<=50K", ">50K", "<=50K", ">50K"] + } + + import pandas as pd + df = pd.DataFrame(sample) + + cat_features = ["workclass"] + X, y, encoder, lb = process_data( + df, + categorical_features=cat_features, + label="salary", + training=True + ) -# TODO: implement the third test. 
Change the function name and input as needed -def test_three(): + model = train_model(X, y) + + assert isinstance(model, RandomForestClassifier) + + preds = inference(model, X) + + assert len(preds) == len(y) + assert preds.shape == y.shape + + +# --------------------------------------------------------- +# Test 3: compute_model_metrics outputs valid values +# --------------------------------------------------------- +def test_compute_model_metrics_values(): """ - # add description for the third test + Ensures compute_model_metrics returns precision, recall, and fbeta + scores that fall between 0 and 1. """ - # Your code here - pass + y_true = np.array([0, 1, 1, 0]) + y_pred = np.array([0, 1, 0, 0]) + + precision, recall, fbeta = compute_model_metrics(y_true, y_pred) + + assert 0 <= precision <= 1 + assert 0 <= recall <= 1 + assert 0 <= fbeta <= 1 diff --git a/train_model.py b/train_model.py index ae783ed5b9..bfe6dba9d3 100644 --- a/train_model.py +++ b/train_model.py @@ -12,15 +12,15 @@ save_model, train_model, ) -# TODO: load the cencus.csv data -project_path = "Your path here" + +# Load the census.csv data +project_path = "." # working directory is the project root data_path = os.path.join(project_path, "data", "census.csv") print(data_path) -data = None # your code here +data = pd.read_csv(data_path) -# TODO: split the provided data to have a train dataset and a test dataset -# Optional enhancement, use K-fold cross validation instead of a train-test split. -train, test = None, None# Your code here +# Split the data into train and test +train, test = train_test_split(data, test_size=0.20, random_state=42) # DO NOT MODIFY cat_features = [ @@ -34,13 +34,13 @@ "native-country", ] -# TODO: use the process_data function provided to process the data. 
+# Process training and test data X_train, y_train, encoder, lb = process_data( - # your code here - # use the train dataset - # use training=True - # do not need to pass encoder and lb as input - ) + train, + categorical_features=cat_features, + label="salary", + training=True, +) X_test, y_test, _, _ = process_data( test, @@ -51,37 +51,50 @@ lb=lb, ) -# TODO: use the train_model function to train the model on the training dataset -model = None # your code here +# Train the model +model = train_model(X_train, y_train) -# save the model and the encoder +# Save the model and the encoder and label binarizer model_path = os.path.join(project_path, "model", "model.pkl") save_model(model, model_path) + encoder_path = os.path.join(project_path, "model", "encoder.pkl") save_model(encoder, encoder_path) -# load the model -model = load_model( - model_path -) +lb_path = os.path.join(project_path, "model", "lb.pkl") +save_model(lb, lb_path) + +# Load the model back +model = load_model(model_path) -# TODO: use the inference function to run the model inferences on the test dataset. 
-preds = None # your code here +# Run inference on the test data +preds = inference(model, X_test) -# Calculate and print the metrics +# Calculate and print the overall metrics p, r, fb = compute_model_metrics(y_test, preds) print(f"Precision: {p:.4f} | Recall: {r:.4f} | F1: {fb:.4f}") -# TODO: compute the performance on model slices using the performance_on_categorical_slice function -# iterate through the categorical features +# Remove old slice file if it exists so we start fresh +if os.path.exists("slice_output.txt"): + os.remove("slice_output.txt") + +# Compute performance on model slices and save to slice_output.txt for col in cat_features: - # iterate through the unique values in one categorical feature for slicevalue in sorted(test[col].unique()): count = test[test[col] == slicevalue].shape[0] p, r, fb = performance_on_categorical_slice( - # your code here - # use test, col and slicevalue as part of the input + data=test, + column_name=col, + slice_value=slicevalue, + categorical_features=cat_features, + label="salary", + encoder=encoder, + lb=lb, + model=model, ) with open("slice_output.txt", "a") as f: print(f"{col}: {slicevalue}, Count: {count:,}", file=f) - print(f"Precision: {p:.4f} | Recall: {r:.4f} | F1: {fb:.4f}", file=f) + print( + f"Precision: {p:.4f} | Recall: {r:.4f} | F1: {fb:.4f}", + file=f, + )