diff --git a/pyproject.toml b/pyproject.toml index bfc602c6d..82303d69d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,7 +68,8 @@ select = [ "F823", # undefined-local "F841", # unused-variable "I", # isort - "W292", # missing-newline-at-end-of-file + "W292", # missing-newline-at-end-of-file + "PD002", # pandas-use-of-inplace-argument ] [tool.setuptools.packages.find] diff --git a/spras/evaluation.py b/spras/evaluation.py index 507ffb10a..0145341c8 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -231,7 +231,7 @@ def visualize_precision_and_recall_plot(pr_df: pd.DataFrame, output_file: str | plt.close() # save dataframe - pr_df.drop(columns=['Algorithm'], inplace=True) + pr_df = pr_df.drop(columns=['Algorithm']) pr_df.to_csv(output_file, sep='\t', index=False) @staticmethod @@ -248,7 +248,7 @@ def precision_and_recall_per_pathway(pr_df: pd.DataFrame, output_file: str | Pat """ if not pr_df.empty: pr_df['Algorithm'] = pr_df['Pathway'].apply(lambda p: Path(p).parent.name.split('-')[1]) - pr_df.sort_values(by=['Recall', 'Pathway'], axis=0, ascending=True, inplace=True) + pr_df = pr_df.sort_values(by=['Recall', 'Pathway'], axis=0, ascending=True) if aggregate_per_algorithm: # Guaranteed to only have one algorithm in Algorithm column @@ -281,7 +281,7 @@ def precision_and_recall_pca_chosen_pathway(pr_df: pd.DataFrame, output_file: st if not pr_df.empty: pr_df['Algorithm'] = pr_df['Pathway'].apply(lambda p: Path(p).parent.name.split('-')[1]) - pr_df.sort_values(by=['Recall', 'Pathway'], axis=0, ascending=True, inplace=True) + pr_df = pr_df.sort_values(by=['Recall', 'Pathway'], axis=0, ascending=True) if aggregate_per_algorithm: title = "PCA-Chosen Pathway Per Algorithm Precision and Recall Plot" @@ -305,7 +305,7 @@ def precision_and_recall_pca_chosen_pathway(pr_df: pd.DataFrame, output_file: st plt.close() @staticmethod - def pca_chosen_pathway(coordinates_files: Iterable[Union[str, PathLike]], pathway_summary_file: str, output_dir: str): + def 
pca_chosen_pathway(coordinates_files: Iterable[Union[str, PathLike]], pathway_summary_file: str | PathLike, output_dir: str | PathLike) -> list[str]: """ Identifies the pathway closest to a specified highest kernel density estimated (KDE) peak based on PCA coordinates @@ -323,7 +323,7 @@ def pca_chosen_pathway(coordinates_files: Iterable[Union[str, PathLike]], pathwa """ # TODO update to add in the pathways for the algorithms that do not provide a pca chosen pathway https://github.com/Reed-CompBio/spras/issues/341 - rep_pathways = [] + rep_pathways: list[str] = [] for coordinates_file in coordinates_files: coord_df = pd.read_csv(coordinates_file, delimiter='\t', header=0) @@ -356,7 +356,7 @@ def pca_chosen_pathway(coordinates_files: Iterable[Union[str, PathLike]], pathwa return rep_pathways @staticmethod - def edge_frequency_node_ensemble(node_table: pd.DataFrame, ensemble_files: Iterable[Union[str, PathLike]], dataset_file: str) -> dict: + def edge_frequency_node_ensemble(node_table: pd.DataFrame, ensemble_files: Iterable[Union[str, PathLike]], dataset_file: str | PathLike) -> dict: """ Generates a dictionary of node ensembles using edge frequency data from a list of ensemble files. A list of ensemble files can contain an aggregated ensemble or algorithm-specific ensembles per dataset @@ -387,11 +387,11 @@ def edge_frequency_node_ensemble(node_table: pd.DataFrame, ensemble_files: Itera if interactome.empty: raise ValueError( - f"Cannot compute PR curve or generate node ensemble. Input network for dataset \"{dataset_file.split('-')[0]}\" is empty." + f"Cannot compute PR curve or generate node ensemble. Input network for dataset \"{Path(dataset_file).name.split('-')[0]}\" is empty." ) if node_table.empty: raise ValueError( - f"Cannot compute PR curve or generate node ensemble. Gold standard associated with dataset \"{dataset_file.split('-')[0]}\" is empty." + f"Cannot compute PR curve or generate node ensemble. 
Gold standard associated with dataset \"{Path(dataset_file).name.split('-')[0]}\" is empty." ) # set the initial default frequencies to 0 for all interactome and gold standard nodes diff --git a/spras/meo.py b/spras/meo.py index 6fe06e058..e6c40a326 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -215,7 +215,7 @@ def parse_output(raw_pathway_file, standardized_pathway_file, params): # Would need to load the paths output file to rank edges correctly df = add_rank_column(df) df = reinsert_direction_col_directed(df) - df.drop(columns=['Type', 'Oriented', 'Weight'], inplace=True) + df = df.drop(columns=['Type', 'Oriented', 'Weight']) df.columns = ['Node1', 'Node2', 'Rank', "Direction"] df, has_duplicates = duplicate_edges(df) if has_duplicates: diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 1bfa277e9..4cd7e9cf8 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -235,7 +235,7 @@ def parse_output(raw_pathway_file, standardized_pathway_file, params): df.columns = ["Edge1", "InteractionType", "Edge2"] df = add_rank_column(df) df = reinsert_direction_col_mixed(df, "InteractionType", "pd", "pp") - df.drop(columns=['InteractionType'], inplace=True) + df = df.drop(columns=['InteractionType']) df.columns = ['Node1', 'Node2', 'Rank', 'Direction'] df, has_duplicates = duplicate_edges(df) if has_duplicates: diff --git a/test/evaluate/expected/expected-pca-coordinates.txt b/test/evaluate/expected/expected-pca-coordinates.txt new file mode 100644 index 000000000..786861cc1 --- /dev/null +++ b/test/evaluate/expected/expected-pca-coordinates.txt @@ -0,0 +1,6 @@ +datapoint_labels PC1 PC2 +data-test-params-123 0.52704628 0.70710678 +data-test-params-456 -1.05409255 -0.0 +data-test-params-789 0.52704628 -0.70710678 +centroid 0.0 -0.0 +kde_peak 0.11419336 -0.00785674 diff --git a/test/evaluate/test_evaluate.py b/test/evaluate/test_evaluate.py index ce50350e5..aa6fcb020 100644 --- a/test/evaluate/test_evaluate.py +++ 
b/test/evaluate/test_evaluate.py @@ -9,11 +9,11 @@ from spras.dataset import Dataset from spras.evaluation import Evaluation -INPUT_DIR = 'test/evaluate/input/' -OUT_DIR = 'test/evaluate/output/' -EXPECT_DIR = 'test/evaluate/expected/' -GS_NODE_TABLE = pd.read_csv(INPUT_DIR + 'gs_node_table.csv', header=0) -SUMMARY_FILE = INPUT_DIR + 'example_summary.txt' +INPUT_DIR = Path('test', 'evaluate', 'input') +OUT_DIR = Path('test', 'evaluate', 'output') +EXPECT_DIR = Path('test', 'evaluate', 'expected') +GS_NODE_TABLE = pd.read_csv(INPUT_DIR / 'gs_node_table.csv', header=0) +SUMMARY_FILE = INPUT_DIR / 'example_summary.txt' class TestEvaluate: @@ -39,9 +39,9 @@ def setup_class(cls): pickle.dump(dataset, f) def test_node_precision_recall_per_pathway(self): - file_paths = [INPUT_DIR + 'data-test-params-123/pathway.txt', INPUT_DIR + 'data-test-params-456/pathway.txt', INPUT_DIR + 'data-test-params-789/pathway.txt', INPUT_DIR + 'data-test-params-empty/pathway.txt'] - output_file = Path(OUT_DIR + 'pr-per-pathway.txt') - output_png = Path(OUT_DIR + 'pr-per-pathway.png') + file_paths = [INPUT_DIR / 'data-test-params-123/pathway.txt', INPUT_DIR / 'data-test-params-456/pathway.txt', INPUT_DIR / 'data-test-params-789/pathway.txt', INPUT_DIR / 'data-test-params-empty/pathway.txt'] + output_file = Path(OUT_DIR, 'pr-per-pathway.txt') + output_png = Path(OUT_DIR, 'pr-per-pathway.png') output_file.unlink(missing_ok=True) output_png.unlink(missing_ok=True) @@ -49,16 +49,16 @@ def test_node_precision_recall_per_pathway(self): Evaluation.precision_and_recall_per_pathway(pr_df, output_file, output_png, True) output = pd.read_csv(output_file, sep='\t', header=0).round(8) - expected = pd.read_csv(EXPECT_DIR + 'expected-pr-per-pathway.txt', sep='\t', header=0).round(8) + expected = pd.read_csv(EXPECT_DIR / 'expected-pr-per-pathway.txt', sep='\t', header=0).round(8) assert output.equals(expected) assert output_png.exists() def test_node_precision_recall_per_pathway_empty(self): - file_paths = 
[INPUT_DIR + 'data-test-params-empty/pathway.txt'] - output_file = Path(OUT_DIR + 'pr-per-pathway-empty.txt') - output_png = Path(OUT_DIR + 'pr-per-pathway-empty.png') + file_paths = [INPUT_DIR / 'data-test-params-empty/pathway.txt'] + output_file = Path(OUT_DIR, 'pr-per-pathway-empty.txt') + output_png = Path(OUT_DIR, 'pr-per-pathway-empty.png') output_file.unlink(missing_ok=True) output_png.unlink(missing_ok=True) @@ -66,14 +66,14 @@ def test_node_precision_recall_per_pathway_empty(self): Evaluation.precision_and_recall_per_pathway(pr_df, output_file, output_png, True) output = pd.read_csv(output_file, sep='\t', header=0).round(8) - expected = pd.read_csv(EXPECT_DIR + 'expected-pr-per-pathway-empty.txt', sep='\t', header=0).round(8) + expected = pd.read_csv(EXPECT_DIR / 'expected-pr-per-pathway-empty.txt', sep='\t', header=0).round(8) assert output.equals(expected) assert output_png.exists() def test_node_precision_recall_per_pathway_not_provided(self): - output_file = OUT_DIR + 'pr-per-pathway-not-provided.txt' - output_png = OUT_DIR + 'pr-per-pathway-not-provided.png' + output_file = OUT_DIR / 'pr-per-pathway-not-provided.txt' + output_png = OUT_DIR / 'pr-per-pathway-not-provided.png' file_paths = [] pr_df = Evaluation.node_precision_and_recall(file_paths, GS_NODE_TABLE) @@ -81,9 +81,9 @@ def test_node_precision_recall_per_pathway_not_provided(self): Evaluation.precision_and_recall_per_pathway(pr_df, output_file, output_png) def test_node_precision_recall_pca_chosen_pathway_not_provided(self): - output_file = Path( OUT_DIR + 'pr-per-pathway-pca-chosen-not-provided.txt') + output_file = Path(OUT_DIR, 'pr-per-pathway-pca-chosen-not-provided.txt') output_file.unlink(missing_ok=True) - output_png = Path(OUT_DIR + 'pr-per-pathway-pca-chosen-not-provided.png') + output_png = Path(OUT_DIR, 'pr-per-pathway-pca-chosen-not-provided.png') output_png.unlink(missing_ok=True) file_paths = [] @@ -91,102 +91,106 @@ def 
test_node_precision_recall_pca_chosen_pathway_not_provided(self): Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, output_file, output_png) output = pd.read_csv(output_file, sep='\t', header=0).round(8) - expected = pd.read_csv(EXPECT_DIR + 'expected-pr-pca-chosen-not-provided.txt', sep='\t', header=0).round(8) + expected = pd.read_csv(EXPECT_DIR / 'expected-pr-pca-chosen-not-provided.txt', sep='\t', header=0).round(8) assert output.equals(expected) assert output_png.exists() def test_node_precision_recall_pca_chosen_pathway(self): - output_file = Path(OUT_DIR + 'pr-per-pathway-pca-chosen.txt') + output_file = Path(OUT_DIR / 'pr-per-pathway-pca-chosen.txt') output_file.unlink(missing_ok=True) - output_png = Path(OUT_DIR + 'pr-per-pathway-pca-chosen.png') + output_png = Path(OUT_DIR / 'pr-per-pathway-pca-chosen.png') output_png.unlink(missing_ok=True) - output_coordinates = Path(OUT_DIR + 'pca-coordinates.tsv') + output_coordinates = Path(OUT_DIR / 'pca-coordinates.tsv') output_coordinates.unlink(missing_ok=True) - file_paths = [INPUT_DIR + 'data-test-params-123/pathway.txt', INPUT_DIR + 'data-test-params-456/pathway.txt', - INPUT_DIR + 'data-test-params-789/pathway.txt', INPUT_DIR + 'data-test-params-empty/pathway.txt'] + file_paths = [INPUT_DIR / 'data-test-params-123' / 'pathway.txt', INPUT_DIR / 'data-test-params-456' / 'pathway.txt', + INPUT_DIR / 'data-test-params-789' / 'pathway.txt', INPUT_DIR / 'data-test-params-empty' / 'pathway.txt'] dataframe = ml.summarize_networks(file_paths) - ml.pca(dataframe, OUT_DIR + 'pca.png', OUT_DIR + 'pca-variance.txt', str(output_coordinates), kde=True, remove_empty_pathways=True) + ml.pca(dataframe, OUT_DIR / 'pca.png', OUT_DIR / 'pca-variance.txt', output_coordinates, kde=True, remove_empty_pathways=True) - pathway = Evaluation.pca_chosen_pathway([output_coordinates], SUMMARY_FILE, INPUT_DIR) + pathways = Evaluation.pca_chosen_pathway([output_coordinates], SUMMARY_FILE, INPUT_DIR) + assert len(pathways) == 1, 
f"There must only be one pathway, but got {len(pathways)} instead! ({pathways})" + pd.testing.assert_frame_equal( + pd.read_csv(output_coordinates, sep='\t', header=0), + pd.read_csv(EXPECT_DIR / 'expected-pca-coordinates.txt', sep='\t', header=0) + ) - pr_df = Evaluation.node_precision_and_recall(pathway, GS_NODE_TABLE) + pr_df = Evaluation.node_precision_and_recall(pathways, GS_NODE_TABLE) Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, output_file, output_png, True) - chosen = pd.read_csv(output_file, sep='\t', header=0).round(8) - expected = pd.read_csv(EXPECT_DIR + 'expected-pr-per-pathway-pca-chosen.txt', sep='\t', header=0).round(8) + expected = pd.read_csv(EXPECT_DIR / 'expected-pr-per-pathway-pca-chosen.txt', sep='\t', header=0).round(8) - assert chosen.equals(expected) + pd.testing.assert_frame_equal(chosen, expected) assert output_png.exists() def test_node_ensemble(self): - out_path_file = Path(OUT_DIR + 'node-ensemble.csv') + out_path_file = Path(OUT_DIR, 'node-ensemble.csv') out_path_file.unlink(missing_ok=True) - ensemble_network = [INPUT_DIR + 'ensemble-network.tsv'] - input_network = OUT_DIR + 'data.pickle' + ensemble_network = [INPUT_DIR / 'ensemble-network.tsv'] + input_network = OUT_DIR / 'data.pickle' node_ensemble_dict = Evaluation.edge_frequency_node_ensemble(GS_NODE_TABLE, ensemble_network, input_network) node_ensemble_dict['ensemble'].to_csv(out_path_file, sep='\t', index=False) - assert filecmp.cmp(out_path_file, EXPECT_DIR + 'expected-node-ensemble.csv', shallow=False) + assert filecmp.cmp(out_path_file, EXPECT_DIR / 'expected-node-ensemble.csv', shallow=False) def test_empty_node_ensemble(self): - out_path_file = Path(OUT_DIR + 'empty-node-ensemble.csv') + out_path_file = Path(OUT_DIR, 'empty-node-ensemble.csv') out_path_file.unlink(missing_ok=True) - empty_ensemble_network = [INPUT_DIR + 'empty-ensemble-network.tsv'] - input_network = OUT_DIR + 'data.pickle' + empty_ensemble_network = [INPUT_DIR / 'empty-ensemble-network.tsv'] 
+ input_network = OUT_DIR / 'data.pickle' node_ensemble_dict = Evaluation.edge_frequency_node_ensemble(GS_NODE_TABLE, empty_ensemble_network, input_network) node_ensemble_dict['empty'].to_csv(out_path_file, sep='\t', index=False) - assert filecmp.cmp(out_path_file, EXPECT_DIR + 'expected-empty-node-ensemble.csv', shallow=False) + assert filecmp.cmp(out_path_file, EXPECT_DIR / 'expected-empty-node-ensemble.csv', shallow=False) def test_multiple_node_ensemble(self): - out_path_file = Path(OUT_DIR + 'node-ensemble.csv') + out_path_file = Path(OUT_DIR / 'node-ensemble.csv') out_path_file.unlink(missing_ok=True) - out_path_empty_file = Path(OUT_DIR + 'empty-node-ensemble.csv') + out_path_empty_file = Path(OUT_DIR / 'empty-node-ensemble.csv') out_path_empty_file.unlink(missing_ok=True) - ensemble_networks = [INPUT_DIR + 'ensemble-network.tsv', INPUT_DIR + 'empty-ensemble-network.tsv'] - input_network = OUT_DIR + 'data.pickle' + ensemble_networks = [INPUT_DIR / 'ensemble-network.tsv', INPUT_DIR / 'empty-ensemble-network.tsv'] + input_network = OUT_DIR / 'data.pickle' node_ensemble_dict = Evaluation.edge_frequency_node_ensemble(GS_NODE_TABLE, ensemble_networks, input_network) node_ensemble_dict['ensemble'].to_csv(out_path_file, sep='\t', index=False) - assert filecmp.cmp(out_path_file, EXPECT_DIR + 'expected-node-ensemble.csv', shallow=False) + assert filecmp.cmp(out_path_file, EXPECT_DIR / 'expected-node-ensemble.csv', shallow=False) node_ensemble_dict['empty'].to_csv(out_path_empty_file, sep='\t', index=False) - assert filecmp.cmp(out_path_empty_file, EXPECT_DIR + 'expected-empty-node-ensemble.csv', shallow=False) + assert filecmp.cmp(out_path_empty_file, EXPECT_DIR / 'expected-empty-node-ensemble.csv', shallow=False) def test_precision_recall_curve_ensemble_nodes(self): - out_path_png = Path(OUT_DIR + 'pr-curve-ensemble-nodes.png') + out_path_png = Path(OUT_DIR, 'pr-curve-ensemble-nodes.png') out_path_png.unlink(missing_ok=True) - out_path_file = Path(OUT_DIR + 
'pr-curve-ensemble-nodes.txt') + out_path_file = Path(OUT_DIR, 'pr-curve-ensemble-nodes.txt') out_path_file.unlink(missing_ok=True) - ensemble_file = pd.read_csv(INPUT_DIR + 'node-ensemble.csv', sep='\t', header=0) + ensemble_file = pd.read_csv(INPUT_DIR / 'node-ensemble.csv', sep='\t', header=0) node_ensembles_dict = {'ensemble': ensemble_file} Evaluation.precision_recall_curve_node_ensemble(node_ensembles_dict, GS_NODE_TABLE, out_path_png, out_path_file) assert out_path_png.exists() - assert filecmp.cmp(out_path_file, EXPECT_DIR + 'expected-pr-curve-ensemble-nodes.txt', shallow=False) + assert filecmp.cmp(out_path_file, EXPECT_DIR / 'expected-pr-curve-ensemble-nodes.txt', shallow=False) def test_precision_recall_curve_ensemble_nodes_empty(self): - out_path_png = Path(OUT_DIR + 'pr-curve-ensemble-nodes-empty.png') + out_path_png = Path(OUT_DIR, 'pr-curve-ensemble-nodes-empty.png') out_path_png.unlink(missing_ok=True) - out_path_file = Path(OUT_DIR + 'pr-curve-ensemble-nodes-empty.txt') + out_path_file = Path(OUT_DIR, 'pr-curve-ensemble-nodes-empty.txt') out_path_file.unlink(missing_ok=True) - empty_ensemble_file = pd.read_csv(INPUT_DIR + 'node-ensemble-empty.csv', sep='\t', header=0) + empty_ensemble_file = pd.read_csv(INPUT_DIR / 'node-ensemble-empty.csv', sep='\t', header=0) node_ensembles_dict = {'ensemble': empty_ensemble_file} Evaluation.precision_recall_curve_node_ensemble(node_ensembles_dict, GS_NODE_TABLE, out_path_png, out_path_file) assert out_path_png.exists() - assert filecmp.cmp(out_path_file, EXPECT_DIR + 'expected-pr-curve-ensemble-nodes-empty.txt', shallow=False) + assert filecmp.cmp(out_path_file, EXPECT_DIR / 'expected-pr-curve-ensemble-nodes-empty.txt', shallow=False) def test_precision_recall_curve_multiple_ensemble_nodes(self): - out_path_png = Path(OUT_DIR + 'pr-curve-multiple-ensemble-nodes.png') + out_path_png = Path(OUT_DIR, 'pr-curve-multiple-ensemble-nodes.png') out_path_png.unlink(missing_ok=True) - out_path_file = Path(OUT_DIR + 
'pr-curve-multiple-ensemble-nodes.txt') + out_path_file = Path(OUT_DIR, 'pr-curve-multiple-ensemble-nodes.txt') out_path_file.unlink(missing_ok=True) - ensemble_file = pd.read_csv(INPUT_DIR + 'node-ensemble.csv', sep='\t', header=0) - empty_ensemble_file = pd.read_csv(INPUT_DIR + 'node-ensemble-empty.csv', sep='\t', header=0) + ensemble_file = pd.read_csv(INPUT_DIR / 'node-ensemble.csv', sep='\t', header=0) + empty_ensemble_file = pd.read_csv(INPUT_DIR / 'node-ensemble-empty.csv', sep='\t', header=0) node_ensembles_dict = {'ensemble1': ensemble_file, 'ensemble2': ensemble_file, 'ensemble3': empty_ensemble_file} Evaluation.precision_recall_curve_node_ensemble(node_ensembles_dict, GS_NODE_TABLE, out_path_png, out_path_file, True) assert out_path_png.exists() - assert filecmp.cmp(out_path_file, EXPECT_DIR + 'expected-pr-curve-multiple-ensemble-nodes.txt', shallow=False) + assert filecmp.cmp(out_path_file, EXPECT_DIR / 'expected-pr-curve-multiple-ensemble-nodes.txt', shallow=False) diff --git a/test/ml/expected/expected-pca-coordinates-kde-negated.tsv b/test/ml/expected/expected-pca-coordinates-kde-negated.tsv deleted file mode 100644 index 3c13c8c4e..000000000 --- a/test/ml/expected/expected-pca-coordinates-kde-negated.tsv +++ /dev/null @@ -1,7 +0,0 @@ -datapoint_labels PC1 PC2 -test-data-s1 -1.01220906 0.05003395 -test-data-s2 -0.84372464 -0.59953316 -test-data-s3 1.56185985 -0.48650911 -test-data-empty 0.29407385 1.03600832 -centroid 0.0 0.0 -kde_peak 0.65469949 0.06343901 diff --git a/test/ml/expected/expected-pca-coordinates-sorted-negated.tsv b/test/ml/expected/expected-pca-coordinates-sorted-negated.tsv deleted file mode 100644 index 4ccadef05..000000000 --- a/test/ml/expected/expected-pca-coordinates-sorted-negated.tsv +++ /dev/null @@ -1,5 +0,0 @@ -datapoint_labels PC1 PC2 -centroid 0.0 0.0 -test-data-s1 0.94594398 -0.46508182 -test-data-s2 0.72014153 0.5090913 -test-data-s3 -1.66608552 -0.04400948 diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index 
cc6620164..6b74cd681 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -99,23 +99,17 @@ def test_pca_kernel_density(self): OUT_DIR / 'pca-coordinates-kde.tsv', kde=True) coord = pd.read_table(OUT_DIR / 'pca-coordinates-kde.tsv') expected = pd.read_table(EXPECT_DIR / 'expected-pca-coordinates-kde.tsv') - expected_negated = pd.read_table(EXPECT_DIR / 'expected-pca-coordinates-kde-negated.tsv') coord_kde_peak = coord.loc[coord['datapoint_labels'] == 'kde_peak'].round(5) expected_kde_peak = expected.loc[expected['datapoint_labels'] == 'kde_peak'].round(5) - expected_kde_peak_negated = expected_negated.loc[expected_negated['datapoint_labels'] == 'kde_peak'].round(5) - assert coord_kde_peak.equals(expected_kde_peak) or coord_kde_peak.equals(expected_kde_peak_negated) + pd.testing.assert_frame_equal(coord_kde_peak, expected_kde_peak) def test_pca_robustness(self): dataframe = ml.summarize_networks([INPUT_DIR / 'test-data-s1/s1.txt', INPUT_DIR / 'test-data-s2/s2.txt', INPUT_DIR / 'test-data-s3/s3.txt']) - # PCA signage now depends on the input data: we need two differently signed PCA coordinate files. - # See https://scikit-learn.org/stable/whats_new/v1.5.html#changed-models for more info. 
expected = pd.read_table(EXPECT_DIR / 'expected-pca-coordinates-sorted.tsv') - expected_other = pd.read_table(EXPECT_DIR / 'expected-pca-coordinates-sorted-negated.tsv') expected = expected.round(5) - expected_other = expected_other.round(5) - expected.sort_values(by='datapoint_labels', ignore_index=True, inplace=True) + expected = expected.sort_values(by='datapoint_labels', ignore_index=True) for _ in range(5): dataframe_shuffled = dataframe.sample(frac=1, axis=1) # permute the columns @@ -123,8 +117,8 @@ def test_pca_robustness(self): OUT_DIR / 'pca-shuffled-columns-coordinates.tsv') coord = pd.read_table(OUT_DIR / 'pca-shuffled-columns-coordinates.tsv') coord = coord.round(5) # round values to 5 digits to account for numeric differences across machines - coord.sort_values(by='datapoint_labels', ignore_index=True, inplace=True) - assert coord.equals(expected) or coord.equals(expected_other) + coord = coord.sort_values(by='datapoint_labels', ignore_index=True) + pd.testing.assert_frame_equal(coord, expected) for _ in range(5): dataframe_shuffled = dataframe.sample(frac=1, axis=0) # permute the rows @@ -132,9 +126,9 @@ def test_pca_robustness(self): OUT_DIR / 'pca-shuffled-rows-coordinates.tsv') coord = pd.read_table(OUT_DIR / 'pca-shuffled-rows-coordinates.tsv') coord = coord.round(5) # round values to 5 digits to account for numeric differences across machines - coord.sort_values(by='datapoint_labels', ignore_index=True, inplace=True) + coord = coord.sort_values(by='datapoint_labels', ignore_index=True) - assert coord.equals(expected) or coord.equals(expected_other) + pd.testing.assert_frame_equal(coord, expected) def test_hac_horizontal(self): dataframe = ml.summarize_networks([INPUT_DIR / 'test-data-s1/s1.txt', INPUT_DIR / 'test-data-s2/s2.txt', INPUT_DIR / 'test-data-s3/s3.txt'])