23 changes: 10 additions & 13 deletions local_api.py
@@ -1,17 +1,17 @@
 import json
 
 import requests
 
-# TODO: send a GET using the URL http://127.0.0.1:8000
-r = None # Your code here
+BASE_URL = "http://127.0.0.1:8000"
 
-# TODO: print the status code
-# print()
-# TODO: print the welcome message
-# print()
+# ---- GET request ----------------------------------------------------------
+r = requests.get(f"{BASE_URL}/")
+
+# print status code and welcome message
+print("GET / status:", r.status_code)
+print("GET / response:", r.json())
+
+
+# ---- POST request ---------------------------------------------------------
 data = {
     "age": 37,
     "workclass": "Private",
@@ -29,10 +29,7 @@
     "native-country": "United-States",
 }
 
-# TODO: send a POST using the data above
-r = None # Your code here
+r = requests.post(f"{BASE_URL}/predict/", json=data)
 
-# TODO: print the status code
-# print()
-# TODO: print the result
-# print()
+print("POST /predict/ status:", r.status_code)
+print("POST /predict/ response:", r.json())
54 changes: 36 additions & 18 deletions main.py
@@ -7,6 +7,7 @@
 from ml.data import apply_label, process_data
 from ml.model import inference, load_model
 
+
 # DO NOT MODIFY
 class Data(BaseModel):
     age: int = Field(..., example=37)
@@ -26,25 +27,32 @@ class Data(BaseModel):
     hours_per_week: int = Field(..., example=40, alias="hours-per-week")
     native_country: str = Field(..., example="United-States", alias="native-country")
 
-path = None # TODO: enter the path for the saved encoder
-encoder = load_model(path)
+# ---- load saved artifacts -------------------------------------------------
+project_path = "."
+model_dir = os.path.join(project_path, "model")
 
-path = None # TODO: enter the path for the saved model
-model = load_model(path)
+encoder_path = os.path.join(model_dir, "encoder.pkl")
+lb_path = os.path.join(model_dir, "lb.pkl")
+model_path = os.path.join(model_dir, "model.pkl")
+
+encoder = load_model(encoder_path)
+lb = load_model(lb_path)
+model = load_model(model_path)
 
-# TODO: create a RESTful API using FastAPI
-app = None # your code here
+# ---- create FastAPI app ---------------------------------------------------
+app = FastAPI(title="Income Classification API")
 
-# TODO: create a GET on the root giving a welcome message
+
+# ---- root endpoint --------------------------------------------------------
 @app.get("/")
 async def get_root():
-    """ Say hello!"""
-    # your code here
-    pass
+    """Simple welcome endpoint."""
+    return {"message": "Hello from the income classification API!"}
 
 
-# TODO: create a POST on a different path that does model inference
-@app.post("/data/")
+# ---- prediction endpoint --------------------------------------------------
+@app.post("/predict/")
 async def post_inference(data: Data):
     # DO NOT MODIFY: turn the Pydantic model into a dict.
     data_dict = data.dict()
@@ -64,11 +72,21 @@ async def post_inference(data: Data):
"sex",
"native-country",
]

# process data for inference
data_processed, _, _, _ = process_data(
# your code here
# use data as data input
# use training = False
# do not need to pass lb as input
data,
categorical_features=cat_features,
label=None,
training=False,
encoder=encoder,
lb=lb,
)
_inference = None # your code here to predict the result using data_processed
return {"result": apply_label(_inference)}

# make prediction
preds = inference(model, data_processed)
_inference = preds[0]

# convert 0/1 to <=50K />50K label
# NOTE: apply_label expects an indexable input, so wrap in a list
return {"result": apply_label([_inference])}
125 changes: 46 additions & 79 deletions ml/model.py
@@ -1,9 +1,12 @@
 import pickle
+import numpy as np
 
+from sklearn.ensemble import RandomForestClassifier
 from sklearn.metrics import fbeta_score, precision_score, recall_score
 
 from ml.data import process_data
-# TODO: add necessary import
 
-# Optional: implement hyperparameter tuning.
 
 def train_model(X_train, y_train):
     """
     Trains a machine learning model and returns it.
@@ -19,110 +22,74 @@ def train_model(X_train, y_train):
     model
         Trained machine learning model.
     """
-    # TODO: implement the function
-    pass
+    model = RandomForestClassifier(
+        n_estimators=100,
+        random_state=42,
+        n_jobs=-1,
+    )
+    model.fit(X_train, y_train)
+    return model
 
 
 def compute_model_metrics(y, preds):
     """
     Validates the trained machine learning model using precision, recall, and F1.
 
     Inputs
     ------
     y : np.array
         Known labels, binarized.
     preds : np.array
         Predicted labels, binarized.
     Returns
     -------
     precision : float
     recall : float
     fbeta : float
     """
+    y = np.array(y).reshape(-1)
+    preds = np.array(preds).reshape(-1)
+
     fbeta = fbeta_score(y, preds, beta=1, zero_division=1)
     precision = precision_score(y, preds, zero_division=1)
     recall = recall_score(y, preds, zero_division=1)
     return precision, recall, fbeta
 
 
 def inference(model, X):
-    """ Run model inferences and return the predictions.
-
-    Inputs
-    ------
-    model : ???
-        Trained machine learning model.
-    X : np.array
-        Data used for prediction.
-    Returns
-    -------
-    preds : np.array
-        Predictions from the model.
-    """
-    # TODO: implement the function
-    pass
+    """
+    Run model inferences and return the predictions.
+    """
+    preds = model.predict(X)
+    return np.array(preds).reshape(-1)
 
 
-def save_model(model, path):
-    """ Serializes model to a file.
-
-    Inputs
-    ------
-    model
-        Trained machine learning model or OneHotEncoder.
-    path : str
-        Path to save pickle file.
-    """
-    # TODO: implement the function
-    pass
+def save_model(model, path):
+    """
+    Serializes model or encoder or label binarizer to a file.
+    """
+    with open(path, "wb") as f:
+        pickle.dump(model, f)
 
 
 def load_model(path):
-    """ Loads pickle file from `path` and returns it."""
-    # TODO: implement the function
-    pass
+    """
+    Loads pickle file from path and returns it.
+    """
+    with open(path, "rb") as f:
+        obj = pickle.load(f)
+    return obj
 
 
 def performance_on_categorical_slice(
     data, column_name, slice_value, categorical_features, label, encoder, lb, model
 ):
-    """ Computes the model metrics on a slice of the data specified by a column name and
-
-    Processes the data using one hot encoding for the categorical features and a
-    label binarizer for the labels. This can be used in either training or
-    inference/validation.
-
-    Inputs
-    ------
-    data : pd.DataFrame
-        Dataframe containing the features and label. Columns in `categorical_features`
-    column_name : str
-        Column containing the sliced feature.
-    slice_value : str, int, float
-        Value of the slice feature.
-    categorical_features: list
-        List containing the names of the categorical features (default=[])
-    label : str
-        Name of the label column in `X`. If None, then an empty array will be returned
-        for y (default=None)
-    encoder : sklearn.preprocessing._encoders.OneHotEncoder
-        Trained sklearn OneHotEncoder, only used if training=False.
-    lb : sklearn.preprocessing._label.LabelBinarizer
-        Trained sklearn LabelBinarizer, only used if training=False.
-    model : ???
-        Model used for the task.
-
-    Returns
-    -------
-    precision : float
-    recall : float
-    fbeta : float
-
-    """
-    # TODO: implement the function
+    """
+    Computes the model metrics on a slice of the data specified by a column name
+    and slice value.
+    """
+    slice_data = data[data[column_name] == slice_value]
+
+    if slice_data.empty:
+        return 0.0, 0.0, 0.0
+
     X_slice, y_slice, _, _ = process_data(
-        # your code here
-        # for input data, use data in column given as "column_name", with the slice_value
-        # use training = False
+        slice_data,
+        categorical_features=categorical_features,
+        label=label,
+        training=False,
+        encoder=encoder,
+        lb=lb,
     )
-    preds = None # your code here to get prediction on X_slice using the inference function
+
+    preds = inference(model, X_slice)
     precision, recall, fbeta = compute_model_metrics(y_slice, preds)
     return precision, recall, fbeta
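
A quick way to exercise the implemented functions together, on synthetic data (a sketch, not part of the PR; the temp path assumes the `model/` directory exists):

```python
import numpy as np

from ml.model import (
    compute_model_metrics,
    inference,
    load_model,
    save_model,
    train_model,
)

# tiny synthetic binary-classification problem
rng = np.random.default_rng(42)
X = rng.normal(size=(200, 5))
y = (X[:, 0] + rng.normal(scale=0.5, size=200) > 0).astype(int)

model = train_model(X, y)
preds = inference(model, X)
precision, recall, fbeta = compute_model_metrics(y, preds)
print(f"precision={precision:.3f} recall={recall:.3f} f1={fbeta:.3f}")

# a pickle round-trip should preserve predictions exactly
save_model(model, "model/_tmp_model.pkl")
restored = load_model("model/_tmp_model.pkl")
assert (inference(restored, X) == preds).all()
```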
Binary file added model/encoder.pkl
Binary file added model/lb.pkl
Binary file added model/model.pkl
71 changes: 70 additions & 1 deletion model_card_template.md
@@ -2,17 +2,86 @@

For additional information see the Model Card paper: https://arxiv.org/pdf/1810.03993.pdf

---

## Model Details

This project uses a Random Forest classifier implemented with scikit-learn.
The model predicts whether an individual earns more than $50,000 per year.
It was trained locally using Python 3.10 as part of the Udacity Machine Learning DevOps project.

The model is saved as `model/model.pkl` and the preprocessing objects (encoder and label binarizer) are stored in the same folder.

---

## Intended Use

This model is intended purely for educational and demonstration purposes.
It shows how to build an end-to-end ML pipeline covering data processing, model training, automated testing, performance evaluation, CI, and API deployment with FastAPI.

It is **not** intended for real-world decision making in hiring, lending, housing, insurance, or other areas that affect people’s lives.

---

## Training Data

The model was trained on the provided **Census Income dataset**, which contains demographic and employment-related features.
Features include workclass, education, marital status, occupation, race, sex, and native country.

Eighty percent of the dataset is used for training.
Categorical features are one-hot encoded using the provided `process_data` function, roughly as sketched below.
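
A sketch of the training-time call (the data path, label column name, and full feature list are assumptions based on the standard Census Income schema; the `process_data` signature matches its inference-time use in `main.py`):

```python
import pandas as pd

from ml.data import process_data

data = pd.read_csv("data/census.csv")  # path assumed

cat_features = [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country",
]

# training=True fits and returns the OneHotEncoder and LabelBinarizer
X_train, y_train, encoder, lb = process_data(
    data,
    categorical_features=cat_features,
    label="salary",  # assumed label column name
    training=True,
)
```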

---

## Evaluation Data

Twenty percent of the dataset is held out as a test set.
The same preprocessing pipeline (encoder and label binarizer) is applied to the test data.

Evaluation includes both overall performance on the test set and performance on slices of data across individual categories (workclass, sex, race, etc.).
Slice performance is saved to `slice_output.txt`.
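
A sketch of how `slice_output.txt` can be produced with `performance_on_categorical_slice` from `ml/model.py` (assuming `test` is the held-out DataFrame and `cat_features`, `encoder`, `lb`, and `model` are loaded as above; the `"salary"` label name is an assumption):

```python
from ml.model import performance_on_categorical_slice

with open("slice_output.txt", "w") as f:
    for col in cat_features:
        for value in sorted(test[col].unique()):
            p, r, fb = performance_on_categorical_slice(
                test, col, value, cat_features, "salary", encoder, lb, model
            )
            f.write(f"{col}={value}: precision={p:.4f} recall={r:.4f} f1={fb:.4f}\n")
```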

---

## Metrics

Overall performance on the held-out test set:

- **Precision:** 0.7419
- **Recall:** 0.6384
- **F1 Score:** 0.6863

Slice performance varies across groups. Examples include:

- **Sex = Male:** higher recall and F1 than for **Sex = Female**
- **Workclass = Private:** moderate precision and recall
- **Education levels:** substantial variation across categories

These results indicate that model performance is not uniform across demographic groups.

---

## Ethical Considerations

The Census dataset contains real-world demographic information and may reflect historical social biases.
A model trained on such data can inherit or amplify those biases.
For this reason, the model should **not** be used for any real decision making that can affect individuals.

Potential risks include:

- Unequal performance for different demographic groups
- Unintended discrimination if used in hiring or income-related screening
- Misinterpretation of model predictions due to limited context

Care must be taken to avoid misuse.

---

## Caveats and Recommendations

- The model is not tuned or optimized; default Random Forest parameters are used (see the tuning sketch after this list).
- The underlying census data is outdated and does not reflect current populations.
- The model should be used for learning purposes only.
- If extended further, a fairness analysis, hyperparameter tuning, and more recent data are recommended.
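
If tuning were added, a standard scikit-learn grid search could slot in as a drop-in replacement for `train_model` (a sketch; the grid values and `scoring="f1"` are illustrative choices, not part of this project):

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


def train_model(X_train, y_train):
    """Grid-searched variant of ml.model.train_model."""
    param_grid = {
        "n_estimators": [100, 300],
        "max_depth": [None, 10, 20],
        "min_samples_leaf": [1, 5],
    }
    search = GridSearchCV(
        RandomForestClassifier(random_state=42, n_jobs=-1),
        param_grid,
        scoring="f1",  # mirrors the F1-centric evaluation above
        cv=3,
    )
    search.fit(X_train, y_train)
    return search.best_estimator_
```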

The model should always be accompanied by transparency about its limitations and intended educational use.
Binary file added screenshots/continuous_integration.png
Binary file added screenshots/local_api.png
Binary file added screenshots/unit_test.png