diff --git a/environment.yml b/environment.yml
index 7c2a2e98c..a0fb02293 100644
--- a/environment.yml
+++ b/environment.yml
@@ -2,30 +2,36 @@ name: spras
 channels:
   - conda-forge
 dependencies:
-  - adjusttext=0.7.3.1
-  - bioconda::snakemake-minimal=8.17.0
-  - docker-py=5.0
-  - matplotlib=3.6
-  - networkx=2.8
-  - pandas=1.5
-  - numpy=1.26.4
-  - pre-commit=2.20 # Only required for development
-  - go=1.24 # Only required for development
-  - pytest=8.0 # Only required for development
-  - python=3.11
+  - adjusttext=1.3.0
+  - bioconda::snakemake-minimal=9.6.2
+  # Conda packages the PyPI docker library under the name docker-py.
+  - docker-py=7.1.0
+  - matplotlib=3.10.3
+  - networkx=3.5
+  - pandas=2.3.0
+  - numpy=2.3.1
+  - requests=2.32.4
+  - scikit-learn=1.7.0
+  - seaborn=0.13.2
+  - spython=0.3.14
+
+  # conda-specific dependencies for dsub
+  - python-dateutil=2.9.0
+  - pytz=2025.2
+  - pyyaml=6.0.2
+  - tenacity=9.1.2
+  - tabulate=0.9.0
+
+  # toolchain deps
   - pip=22.1
-  - requests=2.28
-  - scikit-learn=1.2
-  - seaborn=0.12
-  - spython=0.2
-  # for dsub
-  - python-dateutil<=2.9.0
-  - pytz<=2024.1
-  - pyyaml<=6.0.1
-  - tenacity<=8.2.3
-  - tabulate<=0.9.0
-  - sphinx=6.0
-  - sphinx-rtd-theme=2.0.0
+  # This should match requires-python in pyproject.toml, minus the ">=".
+  - python=3.11
+
+  # development dependencies
+  - pre-commit=4.2.0
+  - pytest=8.4.1
+  # development dependencies - conda-specific
+  - go=1.24
+
   - pip:
     - dsub==0.4.13
-
diff --git a/pyproject.toml b/pyproject.toml
index 89d4b3f32..eee89b240 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,27 +19,26 @@ classifiers = [
 requires-python = ">=3.11"
 dependencies = [
     "adjusttext==0.7.3",
-    # A bug was introduced in older versions of snakemake that prevent it from running. Update to fix
-    "snakemake==8.17.0",
-    "docker==5.0.3", # Switched from docker-py to docker because docker-py is not maintained in pypi. This appears to have no effect
-    "matplotlib==3.6",
-    "networkx==2.8",
-    "pandas==1.5",
-    "numpy==1.26.4",
+    "snakemake==9.6.2",
+    "docker==7.1.0",
+    "matplotlib==3.10.3",
+    "networkx==3.5",
+    "pandas==2.3.0",
+    "numpy==2.3.1",
+    "requests==2.32.4",
+    "scikit-learn==1.7.0",
+    "seaborn==0.13.2",
+    "spython==0.3.14",
+
+    # toolchain deps
     "pip==22.1",
-    "requests==2.28",
-    "scikit-learn==1.2",
-    "seaborn==0.12",
-    "spython==0.2",
-    "sphinx==6.0",
-    "sphinx-rtd-theme==2.0.0",
 ]
 
 [project.optional-dependencies]
 dev = [
     # Only required for development
-    "pre-commit==2.20",
-    "pytest==8.0",
+    "pre-commit==4.2.0",
+    "pytest==8.4.1",
 ]
 
 [project.urls]
diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py
index 436769f78..335cf3e10 100644
--- a/spras/analysis/ml.py
+++ b/spras/analysis/ml.py
@@ -382,7 +382,7 @@ def hac_horizontal(dataframe: pd.DataFrame, output_png: str, output_file: str, l
 
     # plotting figure
     plt.figure(figsize=(10, 7))
-    model = AgglomerativeClustering(linkage=linkage, affinity=metric,distance_threshold=0.5, n_clusters=None)
+    model = AgglomerativeClustering(linkage=linkage, metric=metric, distance_threshold=0.5, n_clusters=None)
     model = model.fit(df)
     plt.figure(figsize=(10, 7))
     plt.title("Hierarchical Agglomerative Clustering Dendrogram")
diff --git a/test/ml/expected/expected-pca-coordinates-kde-negated.tsv b/test/ml/expected/expected-pca-coordinates-kde-negated.tsv
new file mode 100644
index 000000000..3c13c8c4e
--- /dev/null
+++ b/test/ml/expected/expected-pca-coordinates-kde-negated.tsv
@@ -0,0 +1,7 @@
+datapoint_labels	PC1	PC2
+test-data-s1	-1.01220906	0.05003395
+test-data-s2	-0.84372464	-0.59953316
+test-data-s3	1.56185985	-0.48650911
+test-data-empty	0.29407385	1.03600832
+centroid	0.0	0.0
+kde_peak	0.65469949	0.06343901
diff --git a/test/ml/expected/expected-pca-coordinates-sorted-negated.tsv b/test/ml/expected/expected-pca-coordinates-sorted-negated.tsv
new file mode 100644
index 000000000..4ccadef05
--- /dev/null
+++ b/test/ml/expected/expected-pca-coordinates-sorted-negated.tsv
@@ -0,0 +1,5 @@
+datapoint_labels	PC1	PC2
+centroid	0.0	0.0
+test-data-s1	0.94594398	-0.46508182
+test-data-s2	0.72014153	0.5090913
+test-data-s3	-1.66608552	-0.04400948
diff --git a/test/ml/expected/expected-pca-coordinates-sorted.tsv b/test/ml/expected/expected-pca-coordinates-sorted.tsv
new file mode 100644
index 000000000..1a091964e
--- /dev/null
+++ b/test/ml/expected/expected-pca-coordinates-sorted.tsv
@@ -0,0 +1,5 @@
+datapoint_labels	PC1	PC2
+centroid	0.0	0.0
+test-data-s1	-0.94594398	-0.46508182
+test-data-s2	-0.72014153	0.5090913
+test-data-s3	1.66608552	-0.04400948
diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py
index 944e01c40..8a8cb1687 100644
--- a/test/ml/test_ml.py
+++ b/test/ml/test_ml.py
@@ -99,16 +99,22 @@ def test_pca_kernel_density(self):
                OUT_DIR + 'pca-coordinates-kde.tsv', kde=True)
         coord = pd.read_table(OUT_DIR + 'pca-coordinates-kde.tsv')
         expected = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates-kde.tsv')
+        expected_negated = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates-kde-negated.tsv')
 
         coord_kde_peak = coord.loc[coord['datapoint_labels'] == 'kde_peak'].round(5)
         expected_kde_peak = expected.loc[expected['datapoint_labels'] == 'kde_peak'].round(5)
+        expected_kde_peak_negated = expected_negated.loc[expected_negated['datapoint_labels'] == 'kde_peak'].round(5)
 
-        assert coord_kde_peak.equals(expected_kde_peak)
+        assert coord_kde_peak.equals(expected_kde_peak) or coord_kde_peak.equals(expected_kde_peak_negated)
 
     def test_pca_robustness(self):
         dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt'])
-        expected = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates.tsv')
+        # The sign of the PCA components now depends on the input data, so we need two differently signed PCA coordinate files.
+        # See https://scikit-learn.org/stable/whats_new/v1.5.html#changed-models for more info.
+        expected = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates-sorted.tsv')
+        expected_other = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates-sorted-negated.tsv')
         expected = expected.round(5)
+        expected_other = expected_other.round(5)
         expected.sort_values(by='datapoint_labels', ignore_index=True, inplace=True)
 
         for _ in range(5):
@@ -118,7 +124,7 @@ def test_pca_robustness(self):
             coord = pd.read_table(OUT_DIR + 'pca-shuffled-columns-coordinates.tsv')
             coord = coord.round(5) # round values to 5 digits to account for numeric differences across machines
             coord.sort_values(by='datapoint_labels', ignore_index=True, inplace=True)
-            assert coord.equals(expected)
+            assert coord.equals(expected) or coord.equals(expected_other)
 
         for _ in range(5):
             dataframe_shuffled = dataframe.sample(frac=1, axis=0) # permute the rows
@@ -128,7 +134,7 @@ def test_pca_robustness(self):
             coord = coord.round(5) # round values to 5 digits to account for numeric differences across machines
             coord.sort_values(by='datapoint_labels', ignore_index=True, inplace=True)
 
-            assert coord.equals(expected)
+            assert coord.equals(expected) or coord.equals(expected_other)
 
     def test_hac_horizontal(self):
         dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt'])
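Note on the dependency pins: the new comment in environment.yml asks that the conda python pin track requires-python in pyproject.toml. Below is a minimal sketch of that consistency check, assuming both files sit in the repository root; it uses the stdlib tomllib (Python 3.11+) and pyyaml, which is already pinned above. The script itself is hypothetical and not part of this diff:

```python
# Hypothetical sync check (not part of this diff): assert that the python pin
# in environment.yml equals requires-python in pyproject.toml minus the ">=".
import tomllib  # stdlib in Python 3.11+

import yaml  # provided by the pyyaml pin above

with open("environment.yml") as f:
    deps = yaml.safe_load(f)["dependencies"]
# Conda list entries are strings like "python=3.11"; the trailing "- pip:" entry is a dict.
conda_python = next(d.split("=", 1)[1] for d in deps
                    if isinstance(d, str) and d.startswith("python="))

with open("pyproject.toml", "rb") as f:
    requires = tomllib.load(f)["project"]["requires-python"]

assert requires == f">={conda_python}", (requires, conda_python)
```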
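Note on the spras/analysis/ml.py change: scikit-learn deprecated the affinity parameter of AgglomerativeClustering in 1.2 and removed it in 1.4 in favor of metric, so the scikit-learn==1.7.0 pin requires this rename. A minimal sketch of the renamed call on toy data (the array below is illustrative and not taken from the SPRAS tests):

```python
# Minimal sketch of the affinity -> metric rename on toy data.
import numpy as np
from sklearn.cluster import AgglomerativeClustering

X = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0]])

# scikit-learn < 1.4:  AgglomerativeClustering(linkage="average", affinity="euclidean", ...)
# scikit-learn >= 1.4: the same distance measure is passed as `metric`.
model = AgglomerativeClustering(linkage="average", metric="euclidean",
                                distance_threshold=0.5, n_clusters=None)
model = model.fit(X)
print(model.labels_)  # two clusters: the pairs are ~0.1 apart, the groups ~7 apart
```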
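Note on the PCA test changes: principal components are only defined up to sign, and the scikit-learn 1.5 release notes linked in the new test comment describe a change in how that sign is resolved, which is why the tests now accept either of two expected files. An alternative would be a sign-insensitive comparison per component; the helper below is a hypothetical sketch (coords_match_up_to_sign is not part of this diff) and assumes both frames are already sorted by datapoint_labels so rows align:

```python
# Hypothetical helper (not part of this diff): compare PCA coordinates up to an
# independent sign flip per component, instead of keeping *-negated files.
import numpy as np
import pandas as pd


def coords_match_up_to_sign(coord: pd.DataFrame, expected: pd.DataFrame,
                            cols=("PC1", "PC2"), decimals: int = 5) -> bool:
    # Assumes both frames are sorted identically so rows line up.
    a = coord[list(cols)].to_numpy()
    b = expected[list(cols)].to_numpy()
    for j in range(a.shape[1]):
        col_a = np.round(a[:, j], decimals)
        col_b = np.round(b[:, j], decimals)
        # Each principal component is defined only up to sign, so accept either.
        if not (np.array_equal(col_a, col_b) or np.array_equal(col_a, -col_b)):
            return False
    return True
```

The expected files above show that flips can indeed be per-component (the sorted pair differs only in the sign of PC1), but the two-file approach in the diff pins down exactly the observed outcomes, which is stricter than accepting any sign combination.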