54 changes: 30 additions & 24 deletions environment.yml
@@ -2,30 +2,36 @@ name: spras
 channels:
   - conda-forge
 dependencies:
-  - adjusttext=0.7.3.1
-  - bioconda::snakemake-minimal=8.17.0
-  - docker-py=5.0
-  - matplotlib=3.6
-  - networkx=2.8
-  - pandas=1.5
-  - numpy=1.26.4
-  - pre-commit=2.20 # Only required for development
-  - go=1.24 # Only required for development
-  - pytest=8.0 # Only required for development
-  - python=3.11
+  - adjusttext=1.3.0
+  - bioconda::snakemake-minimal=9.6.2
+  # Conda refers to pypi/docker as docker-py.
+  - docker-py=7.1.0
+  - matplotlib=3.10.3
+  - networkx=3.5
+  - pandas=2.3.0
+  - numpy=2.3.1
+  - requests=2.32.4
+  - scikit-learn=1.7.0
+  - seaborn=0.13.2
+  - spython=0.3.14
 
+  # conda-specific for dsub
+  - python-dateutil=2.9.0
+  - pytz=2025.2
+  - pyyaml=6.0.2
+  - tenacity=9.1.2
+  - tabulate=0.9.0
+
   # toolchain deps
-  - pip=22.1
-  - requests=2.28
-  - scikit-learn=1.2
-  - seaborn=0.12
-  - spython=0.2
-  # for dsub
-  - python-dateutil<=2.9.0
-  - pytz<=2024.1
-  - pyyaml<=6.0.1
-  - tenacity<=8.2.3
-  - tabulate<=0.9.0
   - sphinx=6.0
   - sphinx-rtd-theme=2.0.0
+  # This should be the same as requires-python minus the >=.
+  - python=3.11
 
+  # development dependencies
+  - pre-commit=4.2.0
+  - pytest=8.4.1
+  # development dependencies - conda-specific
+  - go=1.24
 
   - pip:
     - dsub==0.4.13

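A note on the pins above: two conda package names differ from their PyPI counterparts, as the inline comment says (conda's `docker-py` installs the `docker` distribution, and `snakemake-minimal` provides `snakemake`). The sketch below is not part of the PR; it shows one way to verify that an activated `spras` environment matches the pins, with an illustrative subset of packages chosen here as an assumption.

```python
# Illustrative sanity check (not part of this PR): compare a few installed
# versions against the pins in environment.yml. Keys are PyPI distribution
# names, which differ from the conda names for docker-py and snakemake-minimal.
from importlib.metadata import PackageNotFoundError, version

pins = {
    "snakemake": "9.6.2",   # conda package: snakemake-minimal
    "docker": "7.1.0",      # conda package: docker-py
    "pandas": "2.3.0",
    "numpy": "2.3.1",
    "scikit-learn": "1.7.0",
    "dsub": "0.4.13",       # installed through pip inside the environment
}

for name, pinned in pins.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        installed = "not installed"
    marker = "" if installed == pinned else "  <-- mismatch"
    print(f"{name}: pinned {pinned}, installed {installed}{marker}")
```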
29 changes: 14 additions & 15 deletions pyproject.toml
@@ -19,27 +19,26 @@ classifiers = [
 requires-python = ">=3.11"
 dependencies = [
     "adjusttext==0.7.3",
-    # A bug was introduced in older versions of snakemake that prevent it from running. Update to fix
-    "snakemake==8.17.0",
-    "docker==5.0.3", # Switched from docker-py to docker because docker-py is not maintained in pypi. This appears to have no effect
-    "matplotlib==3.6",
-    "networkx==2.8",
-    "pandas==1.5",
-    "numpy==1.26.4",
+    "snakemake==9.6.2",
+    "docker==7.1.0",
+    "matplotlib==3.10.3",
+    "networkx==3.5",
+    "pandas==2.3.0",
+    "numpy==2.3.1",
+    "requests==2.32.4",
+    "scikit-learn==1.7.0",
+    "seaborn==0.13.2",
+    "spython==0.3.14",
 
     # toolchain deps
-    "pip==22.1",
-    "requests==2.28",
-    "scikit-learn==1.2",
-    "seaborn==0.12",
-    "spython==0.2",
     "sphinx==6.0",
     "sphinx-rtd-theme==2.0.0",
 ]
 
 [project.optional-dependencies]
 dev = [
     # Only required for development
-    "pre-commit==2.20",
-    "pytest==8.0",
+    "pre-commit==4.2.0",
+    "pytest==8.4.1",
 ]
 
 [project.urls]
2 changes: 1 addition & 1 deletion spras/analysis/ml.py
@@ -382,7 +382,7 @@ def hac_horizontal(dataframe: pd.DataFrame, output_png: str, output_file: str, l
 
     # plotting figure
     plt.figure(figsize=(10, 7))
-    model = AgglomerativeClustering(linkage=linkage, affinity=metric,distance_threshold=0.5, n_clusters=None)
+    model = AgglomerativeClustering(linkage=linkage, metric=metric,distance_threshold=0.5, n_clusters=None)
     model = model.fit(df)
     plt.figure(figsize=(10, 7))
     plt.title("Hierarchical Agglomerative Clustering Dendrogram")
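For context on the one-line change above: scikit-learn deprecated `AgglomerativeClustering`'s `affinity` keyword in 1.2 and removed it in 1.4, so the old spelling fails outright under the scikit-learn 1.7.0 pinned by this PR; `metric` is the direct replacement. A minimal sketch of the updated call, using toy data and an assumed `average` linkage in place of `hac_horizontal`'s real inputs:

```python
import numpy as np
from sklearn.cluster import AgglomerativeClustering

# Toy 0/1 matrix standing in for the summarized-networks dataframe.
rng = np.random.default_rng(0)
df = rng.integers(0, 2, size=(6, 10))

# 'metric' replaces the removed 'affinity' keyword; distance_threshold
# requires n_clusters=None, matching the call in hac_horizontal.
model = AgglomerativeClustering(
    linkage="average",
    metric="euclidean",
    distance_threshold=0.5,
    n_clusters=None,
)
model = model.fit(df)
print(model.labels_)
```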
7 changes: 7 additions & 0 deletions test/ml/expected/expected-pca-coordinates-kde-negated.tsv
@@ -0,0 +1,7 @@
+datapoint_labels	PC1	PC2
+test-data-s1	-1.01220906	0.05003395
+test-data-s2	-0.84372464	-0.59953316
+test-data-s3	1.56185985	-0.48650911
+test-data-empty	0.29407385	1.03600832
+centroid	0.0	0.0
+kde_peak	0.65469949	0.06343901
5 changes: 5 additions & 0 deletions test/ml/expected/expected-pca-coordinates-sorted-negated.tsv
@@ -0,0 +1,5 @@
+datapoint_labels	PC1	PC2
+centroid	0.0	0.0
+test-data-s1	0.94594398	-0.46508182
+test-data-s2	0.72014153	0.5090913
+test-data-s3	-1.66608552	-0.04400948
5 changes: 5 additions & 0 deletions test/ml/expected/expected-pca-coordinates-sorted.tsv
@@ -0,0 +1,5 @@
+datapoint_labels	PC1	PC2
+centroid	0.0	0.0
+test-data-s1	-0.94594398	-0.46508182
+test-data-s2	-0.72014153	0.5090913
+test-data-s3	1.66608552	-0.04400948
14 changes: 10 additions & 4 deletions test/ml/test_ml.py
@@ -99,16 +99,22 @@ def test_pca_kernel_density(self):
                OUT_DIR + 'pca-coordinates-kde.tsv', kde=True)
         coord = pd.read_table(OUT_DIR + 'pca-coordinates-kde.tsv')
         expected = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates-kde.tsv')
+        expected_negated = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates-kde-negated.tsv')
         coord_kde_peak = coord.loc[coord['datapoint_labels'] == 'kde_peak'].round(5)
         expected_kde_peak = expected.loc[expected['datapoint_labels'] == 'kde_peak'].round(5)
+        expected_kde_peak_negated = expected_negated.loc[expected_negated['datapoint_labels'] == 'kde_peak'].round(5)
 
-        assert coord_kde_peak.equals(expected_kde_peak)
+        assert coord_kde_peak.equals(expected_kde_peak) or coord_kde_peak.equals(expected_kde_peak_negated)
 
     def test_pca_robustness(self):
         dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt',
                                            INPUT_DIR + 'test-data-s3/s3.txt'])
-        expected = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates.tsv')
+        # PCA signage now depends on the input data: we need two differently signed PCA coordinate files.
+        # See https://scikit-learn.org/stable/whats_new/v1.5.html#changed-models for more info.
+        expected = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates-sorted.tsv')
+        expected_other = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates-sorted-negated.tsv')
         expected = expected.round(5)
+        expected_other = expected_other.round(5)
         expected.sort_values(by='datapoint_labels', ignore_index=True, inplace=True)
 
         for _ in range(5):
@@ -118,7 +124,7 @@ def test_pca_robustness(self):
             coord = pd.read_table(OUT_DIR + 'pca-shuffled-columns-coordinates.tsv')
             coord = coord.round(5)  # round values to 5 digits to account for numeric differences across machines
             coord.sort_values(by='datapoint_labels', ignore_index=True, inplace=True)
-            assert coord.equals(expected)
+            assert coord.equals(expected) or coord.equals(expected_other)
 
         for _ in range(5):
             dataframe_shuffled = dataframe.sample(frac=1, axis=0)  # permute the rows
@@ -128,7 +134,7 @@ def test_pca_robustness(self):
             coord = coord.round(5)  # round values to 5 digits to account for numeric differences across machines
             coord.sort_values(by='datapoint_labels', ignore_index=True, inplace=True)
 
-            assert coord.equals(expected)
+            assert coord.equals(expected) or coord.equals(expected_other)
 
     def test_hac_horizontal(self):
         dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt'])
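The `or coord.equals(expected_other)` pattern above exists because principal components are only defined up to a per-component sign: shuffling the input rows or columns can legitimately negate PC1 or PC2 without changing the geometry, and the changed-models note linked in the code comment made the chosen sign data-dependent as of scikit-learn 1.5. A minimal sketch of the same idea on toy data (not the SPRAS test networks):

```python
import numpy as np
from sklearn.decomposition import PCA

# Toy matrix standing in for the summarized networks (rows are datapoints).
rng = np.random.default_rng(42)
data = rng.normal(size=(4, 6))

base = PCA(n_components=2).fit_transform(data)

# Permuting the columns leaves the projected geometry unchanged, but each
# principal component may come back with its sign flipped.
shuffled = data[:, rng.permutation(data.shape[1])]
other = PCA(n_components=2).fit_transform(shuffled)

# Compare component-wise up to sign, mirroring the paired expected files.
for i in range(base.shape[1]):
    assert np.allclose(other[:, i], base[:, i]) or np.allclose(other[:, i], -base[:, i])
```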