Skip to content

Commit 5d63965

Browse files
author
Taylor Miller
committed May 24, 2017
* note about an issue in data_prep
* deprecated old code in data_prep
* first pass mermaid.js diagram
1 parent e8e18e5 commit 5d63965

File tree

2 files changed

+129
-19
lines changed

2 files changed

+129
-19
lines changed
 

‎docs/mermaid.js

+127
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
graph LR
2+
subgraph Data Sources
3+
database
4+
csv
5+
end
6+
7+
database-->dataframe
8+
csv-->dataframe
9+
dataframe-->SMT_init
10+
11+
subgraph TrainedSupervisedModel
12+
subgraph TSM_Properties
13+
algorithm_name
14+
is_classification
15+
is_regression
16+
best_hyperparameters
17+
model_type
18+
binary_classification_scores
19+
metrics
20+
end
21+
subgraph TSM_Properties2
22+
model
23+
feature_model
24+
fit_pipeline
25+
column_names
26+
_model_type
27+
grain_column
28+
prediction_column
29+
test_set_predictions
30+
test_set_class_labels
31+
test_set_actual
32+
_metric_by_name
33+
end
34+
35+
subgraph TSM_methods
36+
TSM_init[__init__]
37+
save
38+
make_predictions
39+
prepare_and_subset
40+
make_factors
41+
make_predictions_with_k_factors
42+
make_original_with_predictions_and_features
43+
create_catalyst_dataframe
44+
predict_to_catalyst_sam
45+
predict_to_sqlite
46+
roc_curve_plot
47+
roc
48+
pr_curve_plot
49+
pr
50+
validate_classification
51+
end
52+
end
53+
54+
subgraph SupervisedModelTrainer
55+
SMT_init-->ASMT_init
56+
SMT_init[__init__]-->full_pipeline
57+
58+
knn-->knn2
59+
random_forest-->random_forest_classification_a
60+
random_forest-->random_forest_regression_a
61+
logistic_regression-->logistic_regression2
62+
linear_regression-->linear_regression2
63+
subgraph AdvancedSupervisedModelTrainer
64+
ASMT_init[__init__]
65+
knn2-->TSM_init
66+
random_forest_classification_a-->TSM_init
67+
random_forest_regression_a-->TSM_init
68+
logistic_regression2-->TSM_init
69+
linear_regression2-->TSM_init
70+
end
71+
end
72+
73+
subgraph toolbox
74+
subgraph model_eval.py
75+
compute_roc
76+
compute_pr
77+
validate_predictions_and_labels_are_equal_length
78+
calculate_regression_metrics
79+
calculate_binary_classification_metrics
80+
tsm_classification_comparison_plots
81+
roc_plot_from_thresholds
82+
pr_plot_from_thresholds
83+
plot_rf_from_tsm
84+
plot_random_forest_feature_importance
85+
get_estimator_from_trained_supervised_model
86+
get_estimator_from_meta_estimator
87+
get_hyperparameters_from_meta_estimator
88+
end
89+
90+
subgraph data_preparation.py
91+
full_pipeline
92+
end
93+
94+
full_pipeline-->DataFrameImputer
95+
full_pipeline-->DataFrameConvertTargetToBinary
96+
full_pipeline-->DataFrameCreateDummyVariables
97+
full_pipeline-->DataFrameConvertColumnToNumeric
98+
full_pipeline-->DataFrameUnderSampling
99+
full_pipeline-->DataFrameOverSampling
100+
full_pipeline-->DataframeDateTimeColumnSuffixFilter
101+
full_pipeline-->DataframeColumnRemover
102+
full_pipeline-->DataframeNullValueFilter
103+
104+
subgraph transformers.py
105+
DataFrameImputer
106+
DataFrameConvertTargetToBinary
107+
DataFrameCreateDummyVariables
108+
DataFrameConvertColumnToNumeric
109+
DataFrameUnderSampling
110+
DataFrameOverSampling
111+
end
112+
113+
subgraph filters.py
114+
DataframeDateTimeColumnSuffixFilter
115+
DataframeColumnRemover
116+
DataframeNullValueFilter
117+
end
118+
end
119+
120+
class model_eval pythonModule;
121+
122+
classDef pythonClass fill:#00ff33;
123+
classDef pythonModule fill:#ff1100;
124+
125+
class Trainer pythonClass;
126+
class AdvancedTrainer pythonClass;
127+
class TSM pythonClass;

‎healthcareai/pipelines/data_preparation.py

+2-19
Original file line numberDiff line numberDiff line change
@@ -12,29 +12,12 @@ def full_pipeline(model_type, predicted_column, grain_column, impute=True):
1212
('remove_grain_column', filters.DataframeColumnRemover(grain_column)),
1313
# Perform one of two basic imputation methods
1414
# TODO we need to think about making this optional to solve the problem of rare and very predictive values
15-
# where neither imputation or dropping rows is appropriate
15+
# TODO This pipeline may drop nulls in prediction rows if impute=False
16+
# TODO See https://github.com/HealthCatalyst/healthcareai-py/issues/276
1617
('imputation', transformers.DataFrameImputer(impute=impute)),
1718
('null_row_filter', filters.DataframeNullValueFilter(excluded_columns=None)),
1819
('convert_target_to_binary', transformers.DataFrameConvertTargetToBinary(model_type, predicted_column)),
1920
('prediction_to_numeric', transformers.DataFrameConvertColumnToNumeric(predicted_column)),
2021
('create_dummy_variables', transformers.DataFrameCreateDummyVariables([predicted_column])),
2122
])
2223
return pipeline
23-
24-
25-
def dataframe_prediction(dataframe, model_type, grain_column_name, predicted_column, impute=True):
26-
# TODO Deprecate this
27-
"""
28-
Main prediction data preparation pipeline. Sequentially runs transformers and methods to clean and prepare the
29-
before dropping the prediction column
30-
"""
31-
32-
# Apply the pipelines
33-
# TODO do we want to enforce imputation so that entire rows with null values don't get dropped?
34-
# TODO ... or do we want to leave out the null dropping step - and if so, what impact will this have ML-wise?
35-
result_dataframe = full_pipeline(model_type, predicted_column, grain_column_name, impute=impute).transform(dataframe)
36-
37-
# Remove the predicted column
38-
result_dataframe = filters.DataframeColumnRemover(predicted_column).fit_transform(result_dataframe)
39-
40-
return result_dataframe

0 commit comments

Comments
 (0)
Please sign in to comment.