UMass-Rescue · jbardowski · Apr 26, 2025 · May 2, 2025 · May 2, 2025 · May 10, 2025
diff --git a/.gitignore b/.gitignore
@@ -170,4 +170,7 @@ rescuebox/bin/
 rescuebox/lib/
 rescuebox/pyvenv.cfg
 src/rb-api/rb/api/static/index/main.js
-**/*/.DS_Store
+**/*/.DS_Store
+src/age_gender_classifier/**/*.db
+src/age_gender_classifier/**/*.csv
+src/age_gender_classifier/**/*.onnx
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -22,14 +22,18 @@ onnxruntime = "1.21.0"
 opencv-python = ">=4.11.0.86,<5.0.0.0"
 ollama = ">=0.4.7,<0.5.0"
 pypdf2 = ">=3.0.1,<4.0.0"
+torch = ">=2.6.0,<3.0.0"
+pandas = ">=2.2.3,<3.0.0"
+sqlalchemy = ">=2.0.38,<3.0.0"
+seaborn = ">=0.13.2,<0.14.0"
 
 rb-lib = { path = "src/rb-lib", develop = true }
-
 file-utils = { path = "src/file-utils", develop = true }
 doc-parser = { path = "src/doc-parser", develop = true }
 audio-transcription = { path = "src/audio-transcription", develop = true }
 age-and-gender-detection = { path = "src/age_and_gender_detection", develop = true }
 text-summary = {path = "src/text-summary", develop = true}
+age-classifier = { path = "src/age_gender_classifier", develop = true }
 
 # Don't add new packages here, add them appropriately in the list above
 beautifulsoup4 = "^4.13.3"

diff --git a/src/age_gender_classifier/.env.sample b/src/age_gender_classifier/.env.sample
@@ -0,0 +1,2 @@
+# Replace with appropriate connection string
+export DB_CONN_STR="sqlite:///<databaseName>.db"
diff --git a/src/age_gender_classifier/README.md b/src/age_gender_classifier/README.md
@@ -0,0 +1,91 @@
+## Age and Gender Classification
+
+### Installation and Setup
+
+**Install `pipx`:** Using `pipx` to manage `poetry` is recommended because it isolates Poetry in its own virtual environment, preventing conflicts with system-wide Python packages.
+
+    # macOS
+    brew update
+    brew install pipx
+    pipx ensurepath
+
+    # linux (debian)
+    sudo apt update
+    sudo apt install pipx
+    export PATH="$HOME/.local/bin:$PATH"
+
+    # alt, not recommended
+    python -m pip install --user pipx
+    python -m pipx ensurepath
+
+**Install `poetry`:** Restart or refresh your shell, install Poetry, and set desired Python version (>=3.11).
+
+    source ~/.bashrc
+
+    pipx install poetry
+    poetry env use 3.11.1
+
+**Activate venv:** Set configurations, create a virtual environment, and activate it. Note: use `poetry init` when starting a new project from scratch, use `poetry install` to set up dependencies from an existing lock file.
+
+    poetry config virtualenvs.create true
+    poetry config virtualenvs.in-project true
+
+    poetry install
+    # If you are on macOS or Linux run:
+    source .venv/bin/activate
+    # If you are on Windows run instead:
+    source .venv/Scripts/activate
+
+To install or remove packages, respectively, use `poetry add` and `poetry remove`.
+
+    poetry add [--dev] <package_name>
+    poetry remove [--dev] <package_name>
+
+**Set environment variables:** From the project root directory, set the `PYTHONPATH` environment variable to make local directories accessible for import in your venv, and define any other vars in a `.env` file; follow `.env.sample`
+
+    export PYTHONPATH=$(pwd):$PYTHONPATH
+    set -a; source .env; set +a
+
+**Get the ONNX models**
+
+In order to run `survey_models.py` you must download the ONNX model files from this Google Drive [link](https://drive.google.com/drive/folders/1IgG6w6lJ9cd8Qlckd7HwdBUjWCd_-gxN), then copy them into their respective directories.
+
+    cp ~/Downloads/v001_model.onnx src/onnx_models/age_classify_v001/v001_model.onnx 
+    cp ~/Downloads/vit_model.onnx src/onnx_models/vit_age_classifier/vit_model.onnx
+    cp ~/Downloads/fareface_age.onnx src/onnx_models/fareface/fareface_age.onnx
+
+Alternatively, you could run the `convert_to_onnx.py` files in each directory to regenerate the respective ONNX files.
+
+Your models directory structure should then look like this:
+```
+/onnx_models
+--/age_classify_v001
+----convert_to_onnx.py
+----model.py
+----v001_model.onnx
+--/vit_age_classifier
+----convert_to_onnx.py
+----model.py
+----vit_model.onnx
+--survey_models.py
+```
+
+
+You are good to go!
+
+---
+
+### Start the server...
+    python src/server/server-onnx.py
+
+**... And open Rescue Box:** With the server running, register the models in the Rescue Box desktop application (`localhost:5000`), and use as inputs test images located in `src/onnx_models/test_images/`. 
+
+Results will be displayed as a JSON blob in the desktop app, and are written to a SQLite database at the project's root directory, in a table named `model_output`. All rows written by a single run (for every image and every model) share the same `created_at` timestamp.
+
+To confirm results were successfully written to the DB, open a Python interpreter inside the virtual environment and run the following:
+
+    from src.utils.common import read_db
+    df = read_db("model_output", "select * from MODEL_OUTPUT order by created_at desc")
+    df.head()
+
+
diff --git a/src/age_gender_classifier/img-app-info.md b/src/age_gender_classifier/img-app-info.md
@@ -0,0 +1 @@
+## TODO
diff --git a/src/age_gender_classifier/pyproject.toml b/src/age_gender_classifier/pyproject.toml
@@ -0,0 +1,31 @@
+[project]
+name = "age-classifier"
+version = "0.1.0"
+description = "UMass Rescue Box: Age and Gender Classification"
+authors = [
+    {name = "James Bardowski",email = "jbardowski@umass.edu"},
+    {name = "Juhi Manish Jain",email = "juhimanishja@umass.edu"},
+    {name = "Jacob Sweet",email = "jdsweet@umass.edu"}
+]
+readme = "README.md"
+
+requires-python = ">=3.11"
+dependencies = [
+    "torch",
+    "numpy",
+    "pandas",
+    "requests",
+    "opencv-python",
+    "sqlalchemy",
+    "seaborn",
+    "onnxruntime",
+]
+
+[tool.poetry]
+packages = [
+    { include = "server", from = "src" }
+]
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
diff --git a/src/age_gender_classifier/src/__init__.py b/src/age_gender_classifier/src/__init__.py
diff --git a/src/age_gender_classifier/src/eval/README.md b/src/age_gender_classifier/src/eval/README.md
@@ -0,0 +1,13 @@
+## Evaluating Labeled Data Set
+
+**Pre-requisite**  
+Ensure the labeled data set exists in the data store. If not, follow steps in the `init_data` directory.  
+
+**Analyze Model Efficacy**
+- Analyze raw labeled data set: `analyze_labeled_data_raw.py` writes charts for age distribution and gender counts to `imgs/` directory.  
+- Score labeled data: `score_labeled_data.py` runs labeled data through our models and writes scores to the DB.  
+- Analyze scored labeled data: `analyze_labeled_data_scored.py` visualizes model efficacy and writes charts to `imgs/` directory.  
+
+Process is grouped in `eval_main.py`
+
+    poetry run python src/eval/eval_main.py
diff --git a/src/age_gender_classifier/src/eval/analyze_labeled_data_raw.py b/src/age_gender_classifier/src/eval/analyze_labeled_data_raw.py
@@ -0,0 +1,37 @@
+from pathlib import Path
+import logging
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+from src.utils.common import read_db
+
+logging.basicConfig(level=logging.INFO)
+path = Path(__file__).parent
+
+
+def age_histogram(df: pd.DataFrame, saveFile: str="imgs/age_distribution.png") -> None:
+    """Plot the distribution of the ``age`` column and save it as an image.
+
+    Args:
+        df: Frame containing an ``age`` column.
+        saveFile: Output location, resolved relative to this module's directory.
+    """
+    # Draw on a dedicated figure and close it afterwards; pyplot otherwise keeps
+    # reusing the current axes, so later charts would be layered on top of this one.
+    fig, ax = plt.subplots()
+    sns.histplot(data=df, x='age', kde=True, ax=ax)
+    fig.savefig(path / saveFile)
+    plt.close(fig)
+    logging.info(f" Saved histogram to {saveFile}")
+
+
+def gender_counts(df: pd.DataFrame, saveFile: str="imgs/gender_counts.png") -> None:
+    """Plot how many rows carry each ``gender`` code and save it as an image.
+
+    Args:
+        df: Frame containing a ``gender`` column coded as 0 / 1.
+        saveFile: Output location, resolved relative to this module's directory.
+    """
+    labels = ["Male", "Female"]
+    # Code 0 is charted under "Male" and code 1 under "Female".
+    totals = [(df["gender"] == code).sum() for code in (0, 1)]
+
+    # Use a fresh figure so this chart is not drawn over whatever pyplot's
+    # current axes already hold (e.g. a histogram plotted just before).
+    fig, ax = plt.subplots()
+    sns.barplot(x=labels, y=totals, palette=["gray", "orange"], hue=labels, width=0.5, ax=ax)
+    ax.set_ylabel("Count")
+    fig.savefig(path / saveFile)
+    plt.close(fig)
+    logging.info(f" Saved bar chart to {saveFile}")
+
+
+def main(table: str="age_gender_labeled"):
+    """Load every row of ``table`` and write both summary charts.
+
+    The labeled data set must already exist in the database.
+    """
+    labeled = read_db(table_name=table, query=f"SELECT * FROM {table}")
+    for make_chart in (age_histogram, gender_counts):
+        make_chart(labeled)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/age_gender_classifier/src/eval/eval_main.py b/src/age_gender_classifier/src/eval/eval_main.py
@@ -0,0 +1,29 @@
+import pandas as pd
+from src.eval.analyze_labeled_data_raw import main as raw_main
+from src.eval.score_labeled_data import main as run_evaluation
+# Same package-qualified path as the sibling eval imports.
+from src.eval.transform_scores import main as transform_outputs
+
+
+def main(eval_table: str="age_gender_labeled", raw_plots: bool=False) -> pd.DataFrame:
+    """Orchestrate the full evaluation pipeline.
+
+    Steps:
+    - Optionally plot the raw labeled data (``raw_plots=True``).
+    - Score a sample of ``eval_table`` with the survey models.
+    - Expand the JSON scores stored for that run and attach the true labels.
+
+    Args:
+        eval_table: Name of the labeled table to plot and to score.
+        raw_plots: When True, write the raw-data charts before scoring.
+
+    Returns:
+        Dataframe produced by the transform step for this run's timestamp.
+    """
+    if raw_plots:
+        raw_main(eval_table)
+
+    # Score the same table that was plotted. Only the run timestamp is needed
+    # afterwards: the transform step looks the predictions up by timestamp.
+    ts, _ = run_evaluation(table=eval_table)
+
+    df = transform_outputs(t_stamp=ts)
+
+    # TODO: endpoint for visualizations here
+    # chart scores and true/predicted labels
+
+    return df
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/age_gender_classifier/src/eval/evaluate_inputs.py b/src/age_gender_classifier/src/eval/evaluate_inputs.py
@@ -0,0 +1,36 @@
+from pathlib import Path
+import logging
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+from src.utils.common import read_db
+
+logging.basicConfig(level=logging.INFO)
+path = Path(__file__).parent
+
+def age_histogram(df: pd.DataFrame, saveFile: str="imgs/age_distribution.png") -> None:
+    """Plot the distribution of the ``age`` column and save it as an image.
+
+    Args:
+        df: Frame containing an ``age`` column.
+        saveFile: Output location, resolved relative to this module's directory.
+    """
+    # A dedicated figure that is closed after saving keeps this histogram from
+    # lingering on pyplot's current axes and bleeding into later charts.
+    fig, ax = plt.subplots()
+    sns.histplot(data=df, x='age', kde=True, ax=ax)
+    fig.savefig(path / saveFile)
+    plt.close(fig)
+    logging.info(f" Saved histogram to {saveFile}")
+
+
+def gender_counts(df: pd.DataFrame, saveFile: str="imgs/gender_counts.png") -> None:
+    """Plot how many rows carry each ``gender`` code and save it as an image.
+
+    Args:
+        df: Frame containing a ``gender`` column coded as 0 / 1.
+        saveFile: Output location, resolved relative to this module's directory.
+    """
+    labels = ["Male", "Female"]
+    # Code 0 is charted under "Male" and code 1 under "Female".
+    totals = [(df["gender"] == code).sum() for code in (0, 1)]
+
+    # Start from an empty figure; pyplot's current axes may still contain an
+    # earlier chart, which would otherwise end up in this image as well.
+    fig, ax = plt.subplots()
+    sns.barplot(x=labels, y=totals, palette=["gray", "orange"], hue=labels, width=0.5, ax=ax)
+    ax.set_ylabel("Count")
+    fig.savefig(path / saveFile)
+    plt.close(fig)
+    logging.info(f" Saved bar chart to {saveFile}")
+
+
+def main():
+    """Load the ``age_gender_labeled`` table and write both summary charts."""
+    labeled = read_db(table_name="age_gender_labeled", query="SELECT * FROM age_gender_labeled")
+    for make_chart in (age_histogram, gender_counts):
+        make_chart(labeled)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/age_gender_classifier/src/eval/imgs/age_distribution.png b/src/age_gender_classifier/src/eval/imgs/age_distribution.png
diff --git a/src/age_gender_classifier/src/eval/imgs/gender_counts.png b/src/age_gender_classifier/src/eval/imgs/gender_counts.png
diff --git a/src/age_gender_classifier/src/eval/score_labeled_data.py b/src/age_gender_classifier/src/eval/score_labeled_data.py
@@ -0,0 +1,36 @@
+from pathlib import Path
+from typing import Tuple
+import logging
+import pandas as pd
+
+from src.utils.common import read_db
+from src.onnx_models.survey_models import SurveyModels
+
+logging.basicConfig(level=logging.INFO)
+path = Path(__file__).parent
+
+
+def main(table: str="age_gender_labeled", mod_size: int=200) -> Tuple[str, pd.DataFrame]:
+    """Score a sample of the labeled data with the survey models.
+
+    Rows of ``table`` whose ``id`` is a multiple of ``mod_size`` are selected
+    (ordered by age) and their pixels are passed to
+    ``SurveyModels.main_predict_eval``. The labeled data set must already
+    exist in the database.
+
+    Return: tuple of the run timestamp (``SurveyModels.now``) and the dataframe
+    returned by the prediction call
+    """
+    sample_query = f"SELECT id, age, img_name, pixels FROM {table} where id % {mod_size} = 0 order by age"
+    sample = read_db(table_name=table, query=sample_query)
+
+    pixel_rows = list(sample["pixels"])
+    image_names = list(sample["img_name"])
+    logging.info(f" About to run inference on true label subset of size = {len(sample)} ...")
+
+    surveyor = SurveyModels()
+    predictions = surveyor.main_predict_eval(pixel_rows, age_threshold=20, ids=image_names)
+
+    logging.info(" Completed prediction on true labels, returning timestamp and dataframe of predicted results")
+    return surveyor.now, predictions
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/age_gender_classifier/src/eval/transform_scores.py b/src/age_gender_classifier/src/eval/transform_scores.py
@@ -0,0 +1,47 @@
+from pathlib import Path
+import logging
+import json
+import pandas as pd
+from typing import Optional
+
+from src.utils.common import read_db
+
+logging.basicConfig(level=logging.INFO)
+path = Path(__file__).parent
+
+
+def flatten_json_scores(df: pd.DataFrame) -> pd.DataFrame:
+    """Expand the JSON ``scores`` column of model_output into separate columns.
+
+    Args:
+        df: Frame with a ``scores`` column holding one JSON document (as text) per row.
+
+    Returns:
+        A new frame with ``scores`` removed and one column per flattened JSON key.
+    """
+    parsed = [json.loads(raw) for raw in df["scores"]]
+    scores_expanded = pd.json_normalize(parsed)
+    # json_normalize numbers its rows 0..n-1; reuse the caller's index so the
+    # index-based join below pairs every row with its own scores even when the
+    # input frame does not carry a default RangeIndex.
+    scores_expanded.index = df.index
+    df = df.join(scores_expanded).drop("scores", axis=1)
+    logging.info(" Expanded JSON scores")
+    return df
+
+
+def main(t_stamp: Optional[str] = None) -> pd.DataFrame:
+    """Build the table of expanded model scores joined with their true labels.
+
+    With a ``t_stamp``, rows of ``model_output`` created at that timestamp are
+    read, their JSON scores are expanded, and the result is cached in
+    ``temp_output.csv`` next to this module. Without one, that cached CSV is
+    loaded instead. Either way the scores are merged with the true ages from
+    ``age_gender_labeled`` and written to ``temp_output_labeled.csv``.
+    """
+    cache_file = path / "temp_output.csv"
+
+    if t_stamp is not None:
+        raw_predictions = read_db(
+            table_name="model_output",
+            query=f"SELECT * FROM model_output where created_at = '{t_stamp}' order by created_at"
+        )
+        scores = flatten_json_scores(raw_predictions)
+        scores.to_csv(cache_file, index=False)
+    else:
+        scores = pd.read_csv(cache_file, header=0, index_col=False)
+
+    labels = read_db(
+        table_name="age_gender_labeled",
+        query="SELECT age AS true_label, img_name FROM age_gender_labeled"
+    )
+
+    # Match predictions to labels by image name, then drop the duplicate key column.
+    labeled_scores = pd.merge(scores, labels, left_on='imageId', right_on='img_name')
+    labeled_scores = labeled_scores.drop("img_name", axis=1)
+    labeled_scores.to_csv(path / "temp_output_labeled.csv", index=False)
+
+    return labeled_scores
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/age_gender_classifier/src/init_data/README.md b/src/age_gender_classifier/src/init_data/README.md
@@ -0,0 +1,10 @@
+### How to get the evaluation data set
+
+The evaluation dataset is 199MB in size, too large to check into GitHub.  
+Download it from [here](https://www.kaggle.com/datasets/nipunarora8/age-gender-and-ethnicity-face-data-csv/data), and save the file in this directory as `age_gender.csv`.  
+
+### Load into SQLite
+
+Run the `load_db.py` file. This reads the downloaded CSV file, connects to the database, creates a new table if it does not exist, truncates the table, and writes the data.
+
+    poetry run python src/init_data/load_db.py
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# Replace with appropriate connection string
		export DB_CONN_STR="sqlite:///<databaseName>.db"