54 changes: 30 additions & 24 deletions environment.yml
@@ -2,30 +2,36 @@ name: spras
 channels:
   - conda-forge
 dependencies:
-  - adjusttext=0.7.3.1
-  - bioconda::snakemake-minimal=8.17.0
-  - docker-py=5.0
-  - matplotlib=3.6
-  - networkx=2.8
-  - pandas=1.5
-  - numpy=1.26.4
-  - pre-commit=2.20 # Only required for development
-  - go=1.24 # Only required for development
-  - pytest=8.0 # Only required for development
-  - python=3.11
+  - adjusttext=1.3.0
+  - bioconda::snakemake-minimal=9.6.2
+  # Conda refers to pypi/docker as docker-py.
+  - docker-py=7.1.0
+  - matplotlib=3.10.3
+  - networkx=3.5
+  - pandas=2.3.0
+  - numpy=2.3.1
+  - requests=2.32.4
+  - scikit-learn=1.7.0
+  - seaborn=0.13.2
+  - spython=0.3.14
 
+  # conda-specific for dsub
+  - python-dateutil=2.9.0
+  - pytz=2025.2
+  - pyyaml=6.0.2
+  - tenacity=9.1.2
+  - tabulate=0.9.0
+
   # toolchain deps
-  - pip=22.1
-  - requests=2.28
-  - scikit-learn=1.2
-  - seaborn=0.12
-  - spython=0.2
-  # for dsub
-  - python-dateutil<=2.9.0
-  - pytz<=2024.1
-  - pyyaml<=6.0.1
-  - tenacity<=8.2.3
-  - tabulate<=0.9.0
   - sphinx=6.0
   - sphinx-rtd-theme=2.0.0
+  # This should be the same as requires-python minus the >=.
+  - python=3.11
 
+  # development dependencies
+  - pre-commit=4.2.0
+  - pytest=8.4.1
+  # development dependencies - conda-specific
+  - go=1.24
 
   - pip:
     - dsub==0.4.13

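A note on the pins above: two conda package names differ from their PyPI counterparts, as the inline comment says (conda's `docker-py` installs the `docker` distribution, and `snakemake-minimal` provides `snakemake`). The sketch below is not part of the PR; it shows one way to verify that an activated `spras` environment matches the pins, with an illustrative subset of packages chosen here as an assumption.

```python
# Illustrative sanity check (not part of this PR): compare a few installed
# versions against the pins in environment.yml. Keys are PyPI distribution
# names, which differ from the conda names for docker-py and snakemake-minimal.
from importlib.metadata import PackageNotFoundError, version

pins = {
    "snakemake": "9.6.2",   # conda package: snakemake-minimal
    "docker": "7.1.0",      # conda package: docker-py
    "pandas": "2.3.0",
    "numpy": "2.3.1",
    "scikit-learn": "1.7.0",
    "dsub": "0.4.13",       # installed through pip inside the environment
}

for name, pinned in pins.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        installed = "not installed"
    marker = "" if installed == pinned else "  <-- mismatch"
    print(f"{name}: pinned {pinned}, installed {installed}{marker}")
```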
29 changes: 14 additions & 15 deletions pyproject.toml
@@ -19,27 +19,26 @@ classifiers = [
 requires-python = ">=3.11"
 dependencies = [
     "adjusttext==0.7.3",
-    # A bug was introduced in older versions of snakemake that prevent it from running. Update to fix
-    "snakemake==8.17.0",
-    "docker==5.0.3", # Switched from docker-py to docker because docker-py is not maintained in pypi. This appears to have no effect
-    "matplotlib==3.6",
-    "networkx==2.8",
-    "pandas==1.5",
-    "numpy==1.26.4",
+    "snakemake==9.6.2",
+    "docker==7.1.0",
+    "matplotlib==3.10.3",
+    "networkx==3.5",
+    "pandas==2.3.0",
+    "numpy==2.3.1",
+    "requests==2.32.4",
+    "scikit-learn==1.7.0",
+    "seaborn==0.13.2",
+    "spython==0.3.14",
 
     # toolchain deps
-    "pip==22.1",
-    "requests==2.28",
-    "scikit-learn==1.2",
-    "seaborn==0.12",
-    "spython==0.2",
     "sphinx==6.0",
     "sphinx-rtd-theme==2.0.0",
 ]
 
 [project.optional-dependencies]
 dev = [
     # Only required for development
-    "pre-commit==2.20",
-    "pytest==8.0",
+    "pre-commit==4.2.0",
+    "pytest==8.4.1",
 ]
 
 [project.urls]
2 changes: 1 addition & 1 deletion spras/analysis/ml.py
@@ -382,7 +382,7 @@ def hac_horizontal(dataframe: pd.DataFrame, output_png: str, output_file: str, l
 
     # plotting figure
     plt.figure(figsize=(10, 7))
-    model = AgglomerativeClustering(linkage=linkage, affinity=metric,distance_threshold=0.5, n_clusters=None)
+    model = AgglomerativeClustering(linkage=linkage, metric=metric,distance_threshold=0.5, n_clusters=None)
     model = model.fit(df)
     plt.figure(figsize=(10, 7))
     plt.title("Hierarchical Agglomerative Clustering Dendrogram")
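For context on the one-line change above: scikit-learn deprecated `AgglomerativeClustering`'s `affinity` keyword in 1.2 and removed it in 1.4, so the old spelling fails outright under the scikit-learn 1.7.0 pinned by this PR; `metric` is the direct replacement. A minimal sketch of the updated call, using toy data and an assumed `average` linkage in place of `hac_horizontal`'s real inputs:

```python
import numpy as np
from sklearn.cluster import AgglomerativeClustering

# Toy 0/1 matrix standing in for the summarized-networks dataframe.
rng = np.random.default_rng(0)
df = rng.integers(0, 2, size=(6, 10))

# 'metric' replaces the removed 'affinity' keyword; distance_threshold
# requires n_clusters=None, matching the call in hac_horizontal.
model = AgglomerativeClustering(
    linkage="average",
    metric="euclidean",
    distance_threshold=0.5,
    n_clusters=None,
)
model = model.fit(df)
print(model.labels_)
```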
7 changes: 7 additions & 0 deletions test/ml/expected/expected-pca-coordinates-kde-negated.tsv
@@ -0,0 +1,7 @@
+datapoint_labels	PC1	PC2
+test-data-s1	-1.01220906	0.05003395
+test-data-s2	-0.84372464	-0.59953316
+test-data-s3	1.56185985	-0.48650911
+test-data-empty	0.29407385	1.03600832
+centroid	0.0	0.0
+kde_peak	0.65469949	0.06343901
5 changes: 5 additions & 0 deletions test/ml/expected/expected-pca-coordinates-sorted-negated.tsv
@@ -0,0 +1,5 @@
+datapoint_labels	PC1	PC2
+centroid	0.0	0.0
+test-data-s1	0.94594398	-0.46508182
+test-data-s2	0.72014153	0.5090913
+test-data-s3	-1.66608552	-0.04400948
5 changes: 5 additions & 0 deletions test/ml/expected/expected-pca-coordinates-sorted.tsv
@@ -0,0 +1,5 @@
+datapoint_labels	PC1	PC2
+centroid	0.0	0.0
+test-data-s1	-0.94594398	-0.46508182
+test-data-s2	-0.72014153	0.5090913
+test-data-s3	1.66608552	-0.04400948
14 changes: 10 additions & 4 deletions test/ml/test_ml.py
@@ -99,16 +99,22 @@ def test_pca_kernel_density(self):
                OUT_DIR + 'pca-coordinates-kde.tsv', kde=True)
         coord = pd.read_table(OUT_DIR + 'pca-coordinates-kde.tsv')
         expected = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates-kde.tsv')
+        expected_negated = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates-kde-negated.tsv')
         coord_kde_peak = coord.loc[coord['datapoint_labels'] == 'kde_peak'].round(5)
         expected_kde_peak = expected.loc[expected['datapoint_labels'] == 'kde_peak'].round(5)
+        expected_kde_peak_negated = expected_negated.loc[expected_negated['datapoint_labels'] == 'kde_peak'].round(5)
 
-        assert coord_kde_peak.equals(expected_kde_peak)
+        assert coord_kde_peak.equals(expected_kde_peak) or coord_kde_peak.equals(expected_kde_peak_negated)
 
     def test_pca_robustness(self):
         dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt',
                                            INPUT_DIR + 'test-data-s3/s3.txt'])
-        expected = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates.tsv')
+        # PCA signage now depends on the input data: we need two differently signed PCA coordinate files.
+        # See https://scikit-learn.org/stable/whats_new/v1.5.html#changed-models for more info.
+        expected = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates-sorted.tsv')
+        expected_other = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates-sorted-negated.tsv')
         expected = expected.round(5)
+        expected_other = expected_other.round(5)
         expected.sort_values(by='datapoint_labels', ignore_index=True, inplace=True)
 
         for _ in range(5):
@@ -118,7 +124,7 @@ def test_pca_robustness(self):
             coord = pd.read_table(OUT_DIR + 'pca-shuffled-columns-coordinates.tsv')
             coord = coord.round(5)  # round values to 5 digits to account for numeric differences across machines
             coord.sort_values(by='datapoint_labels', ignore_index=True, inplace=True)
-            assert coord.equals(expected)
+            assert coord.equals(expected) or coord.equals(expected_other)
 
         for _ in range(5):
             dataframe_shuffled = dataframe.sample(frac=1, axis=0)  # permute the rows
@@ -128,7 +134,7 @@ def test_pca_robustness(self):
             coord = coord.round(5)  # round values to 5 digits to account for numeric differences across machines
             coord.sort_values(by='datapoint_labels', ignore_index=True, inplace=True)
 
-            assert coord.equals(expected)
+            assert coord.equals(expected) or coord.equals(expected_other)
 
     def test_hac_horizontal(self):
         dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt'])
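The `or coord.equals(expected_other)` pattern above exists because principal components are only defined up to a per-component sign: shuffling the input rows or columns can legitimately negate PC1 or PC2 without changing the geometry, and the changed-models note linked in the code comment made the chosen sign data-dependent as of scikit-learn 1.5. A minimal sketch of the same idea on toy data (not the SPRAS test networks):

```python
import numpy as np
from sklearn.decomposition import PCA

# Toy matrix standing in for the summarized networks (rows are datapoints).
rng = np.random.default_rng(42)
data = rng.normal(size=(4, 6))

base = PCA(n_components=2).fit_transform(data)

# Permuting the columns leaves the projected geometry unchanged, but each
# principal component may come back with its sign flipped.
shuffled = data[:, rng.permutation(data.shape[1])]
other = PCA(n_components=2).fit_transform(shuffled)

# Compare component-wise up to sign, mirroring the paired expected files.
for i in range(base.shape[1]):
    assert np.allclose(other[:, i], base[:, i]) or np.allclose(other[:, i], -base[:, i])
```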