
Commit b6dfc76

Merge pull request #86 from Arcadia-Science/release/0.5.0
[Sync] Reorganize output directories + deduplicate snakefiles + bugfixes
2 parents 3af6f09 + 128945c commit b6dfc76

88 files changed (+71840, -1182 lines)


.github/workflows/lint.yml

+1-1
```diff
@@ -12,7 +12,7 @@ jobs:
   lint:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4

       - name: Install Python
         uses: actions/setup-python@v4
```
+74 (new file)

```yaml
name: open-public-sync-pr

on:
  # allow this workflow to be run manually from the Actions tab in the private repo
  workflow_dispatch:

env:
  PUBLIC_REPO_URL: https://github.com/Arcadia-Science/ProteinCartography
  PUBLIC_REPO_REMOTE: public
  # the name of the branch to use for syncing the private repo with the public repo
  SYNC_BRANCH: sync-from-public

jobs:
  open-public-sync-pr:
    runs-on: ubuntu-latest
    # only run this job in the private repo
    if: github.repository == 'Arcadia-Science/ProteinCartography-private'
    steps:
      - name: Checkout the repo
        uses: actions/checkout@v4
        with:
          # this token should be a fine-grained PAT with write permissions
          # for repository "contents", "pull-requests", and "workflows"
          # note: a PAT is needed to allow the action to push the sync branch to the private repo;
          # it is not sufficient to grant `permissions: write-all` to the default GITHUB_TOKEN
          # (see https://github.com/orgs/community/discussions/35410)
          token: ${{ secrets.OPEN_PUBLIC_SYNC_PR_PAT }}

          # fetch the full history so that we can check for new commits
          fetch-depth: 0

      - name: Add and fetch the public repo
        run: |
          git remote add ${{ env.PUBLIC_REPO_REMOTE }} ${{ env.PUBLIC_REPO_URL }}
          git fetch --all

      - name: Check for new commits in the public repo and exit if there are none
        run: |
          NEW_COMMITS=$(git log --oneline --no-merges origin/main..${{ env.PUBLIC_REPO_REMOTE }}/main)
          if [ -z "$NEW_COMMITS" ]; then
            echo "There are no new commits in the public repo, so no PR will be opened."
            exit 1
          fi

      - name: Delete the sync branch if it exists
        # note: if the sync branch already exists and there is an open PR associated with it,
        # this will automatically close the PR
        run: |
          git branch -D ${{ env.SYNC_BRANCH }} || true
          git push --force --delete origin ${{ env.SYNC_BRANCH }} || true

      - name: Create and push the sync branch to the private repo
        # we use `git branch` here instead of `git checkout` followed by `git pull public main`
        # because the latter will fail if there are merge conflicts
        run: |
          git branch ${{ env.SYNC_BRANCH }} ${{ env.PUBLIC_REPO_REMOTE }}/main
          git branch --unset-upstream ${{ env.SYNC_BRANCH }}
          git push origin ${{ env.SYNC_BRANCH }}

      - name: Open a PR to merge the sync branch into the private repo's main branch
        env:
          GH_TOKEN: ${{ secrets.OPEN_PUBLIC_SYNC_PR_PAT }}
          PR_BODY: >
            This PR adds new contributions made to the public repo
            in order to keep the private repo up to date with the public repo.
            It was automatically generated by the "open-public-sync-pr"
            GitHub Actions workflow.
        run: |
          gh pr create \
            --repo "${{ github.repository }}" \
            --base main \
            --head "${{ env.SYNC_BRANCH }}" \
            --title "Sync with the public repo" \
            --body "${{ env.PR_BODY }}"
```

.github/workflows/test.yml

+10-3
```diff
@@ -9,6 +9,8 @@ on:
   pull_request:
     branches:
       - main
+    # 'synchronize' means the PR was updated with new commits
+    types: [opened, reopened, labeled, synchronize]

 jobs:
   test:
@@ -17,7 +19,7 @@ jobs:
     run:
       shell: bash -el {0}
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4

       - name: Setup conda
         uses: conda-incubator/setup-miniconda@v2
@@ -57,9 +59,14 @@ jobs:

       - name: Create the snakemake conda envs
         run: |
-          snakemake --configfile demo/config_actin.yml --use-conda --conda-create-envs-only --cores 1
+          snakemake --configfile demo/search-mode/config_actin.yml --use-conda --conda-create-envs-only --cores 1
         if: steps.cache-snakemake-conda-envs.outputs.cache-hit != 'true'

       - name: Run the tests
         run: |
-          pytest -vv -s .
+          make test
+
+      - name: Run the tests without mocks
+        if: contains(github.event.pull_request.labels.*.name, 'run-slow-tests')
+        run: |
+          make test-without-mocks
```
+44 (new file)

```yaml
name: verify-no-new-public-commits

on:
  pull_request:
    branches:
      - main
  push:
    branches:
      - main

env:
  PUBLIC_REPO_URL: https://github.com/Arcadia-Science/ProteinCartography

jobs:
  verify-no-new-public-commits:
    runs-on: ubuntu-latest
    # only run this job in the private repo
    # (but do not check the repo owner, because forks will have a different owner)
    if: github.event.repository.name == 'ProteinCartography-private'
    steps:
      - name: Checkout the repo
        uses: actions/checkout@v4
        with:
          # fetch the full history so that we can check for new commits
          fetch-depth: 0

      - name: Add and fetch the public repo
        run: |
          git remote add public ${{ env.PUBLIC_REPO_URL }}
          git fetch --all

      - name: Check for new commits in the public repo
        run: |
          NEW_COMMITS=$(git log --oneline --no-merges ..public/main)
          if [ -z "$NEW_COMMITS" ]; then
            echo "There are no new commits in the public repo."
            exit 0
          else
            echo -e "This action failed because there are new commits in the public repo:\n\
            $NEW_COMMITS\n\n\
            Please use the \"open-public-sync-pr\" GitHub Action to merge them \
            into the private repo before merging new PRs."
            exit 1
          fi
```

.gitignore

+1-2
```diff
@@ -5,8 +5,7 @@ envs/src/**
 .snakemake/
 /input/
 /output/
-/demo/output/
-/demo/input/
+/demo/**/output/
 /logs
 /tmp
```
CONTRIBUTING.md

+153
(new file)

# Contributing to ProteinCartography

Thanks for your interest in contributing to ProteinCartography!
Please read this document in its entirety before contributing to help ensure that your contribution meets our standards and is readily accepted.
## Getting Started

All the packages needed to develop ProteinCartography are found in the `envs/cartography_dev.yml` conda environment.
You can install this environment as follows:

1. Make sure `miniconda` is installed. Even if you're using an Apple Silicon (M1, M2, etc.) macOS laptop, you will need to install the macOS Intel x86-64 version of `miniconda` [here](https://docs.conda.io/projects/miniconda/en/latest/).

2. Create a conda environment from the `cartography_dev.yml` file in the `envs/` directory:
    ```sh
    conda env create -n cartography_dev --file envs/cartography_dev.yml
    ```

3. Activate the environment:
    ```sh
    conda activate cartography_dev
    ```
## How to contribute
22+
### Bug reports and feature requests
23+
We track all bugs, new feature requests, enhancements, etc. using [GitHub Issues](https://github.com/Arcadia-Science/ProteinCartography/issues). Please check to make sure that your issue has not already been reported. If it has, please add a comment to the existing issue instead of creating a new one.
24+
25+
### Making a contribution
26+
The steps below apply to both external and internal contributors and also apply to working both with this repo itself and with your own fork. However, if you are an external contributor, please fork this repository and make your changes in a new branch on your fork. Please see the GitHub docs on [working with forks](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo) for more details about how to do this.
27+
28+
1. Whenever you start work, you should make sure to `git pull` on the main branch to get the latest version of the `main` branch.
29+
30+
1. When you’re planning on working on an issue, you should claim it by assigning it to yourself. You should also add a comment briefly explaining how you plan to address the issue. If you are an external contributor, please wait for a maintainer to sign off on your plan before you start working; this will make it easier for us to accept your PR later on.
31+
32+
1. After claiming an issue, create a new branch from the `main` branch. __Make sure your branch begins with your initials, followed by a forward slash__. This is very important to keep everyone's branches well-organized. Use short descriptive names for the branch name itself. For example, if your initials are `abc` and you are adding a new feature to evaluate clustering, you might name your branch `abc/add-cluster-evaluation`. If you are working on an issue to fix a bug, you might name your branch `abc/fix-foldseek-format-bug`.
33+
Create the new branch using the following command:
34+
```sh
35+
git checkout -b <your-initials>/<branch-name>
36+
```
37+
38+
1. Once you’ve created a branch, push the branch to the GitHub repo so you can keep a remote record of your work.
39+
```sh
40+
git push -u origin <your-initials>/<branch-name>
41+
```
42+
43+
1. Once you’ve completed the feature or fixed the bug, you are ready to open a PR. Please aim to keep the PRs as small as possible to increase the speed and ease of review; a PR can be as small as changing a few characters or resolving a single bug. When you open a PR, please use a succinct and human-readable title and always add a description of the changes you made as well as a link to the issue that the PR addresses.
44+
45+
1. Check that your PR is passing the CI checks. These checks verify that the changes in your PR are formatted correctly (we use a tool called `ruff` for formatting; see below for details) and also that your PR passes the automated tests of the pipeline.
46+
47+
### Keeping your development branches up to date

Occasionally, your development branch will fall behind the `main` branch. If this happens and you're working on a file that was changed in the updated version of `main`, you may need to merge the updated `main` branch into your local development branch.

1. First, update the `main` branch in your local repo:
    ```sh
    git checkout main
    git pull
    ```

1. Next, check out your development branch:
    ```sh
    git checkout <your-initials>/<branch-name>
    ```

1. Now, merge the `main` branch into your local branch:
    ```sh
    git merge origin/main
    ```
    __Note that you must be on your local development branch when you call `git merge`!__

1. Once you've merged the `main` branch into your local development branch, use `git push` to push the merged changes to your branch on GitHub.
### Linting

We use `ruff` to lint and format our Python code. We also use snakemake's code formatter, `snakefmt`, to format the snakefiles. You can and should run these tools in your local repo using the commands `make format` and `make lint`. Note that `ruff` is also available as an extension in VS Code, allowing you to configure VS Code to automatically format your code whenever you save the file you are currently editing.
### Testing

Tests are found in the `ProteinCartography/tests/` directory. We use `pytest` for testing; you can run the tests locally using the command `make test`. Currently, we only have integration-like tests that run the pipeline in both 'search' and 'cluster' modes using a test dataset and a test config designed to allow the pipeline to run very quickly (2-3 min). The tests then check that the output files are created and have the correct shape. We plan to add unit tests in the future.
#### Running the tests without mocked API responses

When the pipeline is run in 'search' mode, it makes many calls to external APIs (including Foldseek, BLAST, and AlphaFold). By default, these calls are mocked during testing so that the tests do not depend on external APIs; this is important to ensure test reproducibility and also helps the tests run quickly. However, it is important to periodically test that the pipeline also runs correctly when real API calls are made. To do this, you can run the tests without mocks using `make test-without-mocks`.

When merging PRs on GitHub, it is likewise important to test that the pipeline runs correctly with real API calls. To do so, add the label `run-slow-tests` to your PR. This will trigger the CI actions (see below) to run again on your PR, but now without mocks. __Please add this label only when your PR is ready to merge, as it will cause the CI to run more slowly and will also result in unnecessary API calls.__
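For reference, the `--no-mocks` flag passed by `make test-without-mocks` is a custom pytest option defined in the test suite. The snippet below is only a minimal sketch of how such an option is typically registered in a `conftest.py`; the fixture name and wiring shown here are assumptions for illustration, not the repo's actual implementation.

```python
# Illustrative sketch of a conftest.py that registers a --no-mocks option.
# The option name matches the Makefile target above; the fixture is hypothetical.
import pytest


def pytest_addoption(parser):
    """Register a --no-mocks flag so the tests can run against real APIs."""
    parser.addoption(
        "--no-mocks",
        action="store_true",
        default=False,
        help="Run the tests against real external APIs instead of mocked responses.",
    )


@pytest.fixture
def use_mocks(request) -> bool:
    """Return True when API calls should be mocked (the default)."""
    return not request.config.getoption("--no-mocks")
```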
#### Updating the mocked API responses

When your changes affect the API calls made by the pipeline, you will need to update the mocked responses in order for the tests to pass. Currently, this is a manual process.

1. Enable API response logging in your local environment by setting the following environment variable (an illustrative sketch of how such a toggle is typically read appears after this list):
    ```sh
    export PROTEINCARTOGRAPHY_SHOULD_LOG_API_REQUESTS=true
    ```

1. Run the pipeline in 'search' mode using the 'small' search-mode demo (this demo uses the same input PDB file as the tests). The API responses made by the pipeline will be logged to a `logs/` directory in the root of the repo.
    ```sh
    snakemake --cores all --use-conda --configfile demo/search-mode/config_actin_small.yml
    ```

1. Use the logged responses to update the mocked responses constructed in the `ProteinCartography/tests/mocks.py` module. For large responses, response contents should be added to `ProteinCartography/tests/integration-test-artifacts/search-mode/actin/api_response_content/`.

1. When you're finished, don't forget to delete the `logs/` directory and unset the `PROTEINCARTOGRAPHY_SHOULD_LOG_API_REQUESTS` environment variable.
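The environment variable above acts as a simple run-time switch. The helpers below are purely illustrative (the pipeline's actual logging code is not part of this document); they only show the kind of check such a toggle implies.

```python
# Hypothetical helpers showing how an env-var toggle like this is commonly read;
# they are not taken from the ProteinCartography codebase.
import json
import os
import pathlib


def should_log_api_requests() -> bool:
    """Return True if API request/response logging is enabled via the environment."""
    return os.environ.get("PROTEINCARTOGRAPHY_SHOULD_LOG_API_REQUESTS", "").lower() == "true"


def maybe_log_response(name: str, payload: dict, log_dir: str = "logs") -> None:
    """Write an API response to the log directory when logging is enabled."""
    if not should_log_api_requests():
        return
    log_path = pathlib.Path(log_dir)
    log_path.mkdir(exist_ok=True)
    with open(log_path / f"{name}.json", "w") as file:
        json.dump(payload, file, indent=2)
```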
### CI

We use GitHub Actions for CI. Currently, there is one workflow for linting and one for testing. Both workflows run automatically on GitHub when a PR is opened and also whenever new commits are pushed to an open PR. PRs cannot be merged until the CI checks pass.

The linting workflow runs `ruff` and `snakefmt` in check-only mode on all Python files and snakefiles in the repo. This means that the workflow does not modify any files; it only checks that your code is formatted correctly. If the workflow fails for your PR, you can run `make format` locally to format your code and `make lint` to determine whether there are lint errors that need to be fixed.

The testing workflow runs pytest using the same `make test` command that is used locally. If the workflow fails for your PR, it is usually best to run `make test` locally to reproduce the failure and determine which tests are failing.
### Style guide

In addition to the formatting and lint rules imposed by `ruff` and `snakefmt`, we also have a few additional style rules that are not enforced by these tools. These rules are listed below.

- Function and variable names should be in `lower_snake_case` and should be descriptive; avoid abbreviations.
- Function arguments and return values should have [type hints](https://docs.python.org/3/library/typing.html).
- Functions should include a [Google-style docstring](https://github.com/google/styleguide/blob/gh-pages/pyguide.md#38-comments-and-docstrings) explaining the arguments and what the function returns (if not `None`).
- Comments should be written in complete sentences in the present tense and should end with a period.
- Comments should be used sparingly and only when necessary to explain something that is not obvious from the code itself.
- Class names should use `CapitalizedCamelCase` with descriptive names.
- Currently, we don't use many custom classes, but the conventions for functions apply to class methods as well.

Here is an example of a function that adheres to all of these rules:
```python
def add_integers(first_integer: int, second_integer: int) -> int:
    """
    Add two integers together, returning the result.

    Args:
        first_integer (int): first integer to add.
        second_integer (int): second integer to add.

    Returns:
        The sum of the two integers.
    """
    result = first_integer + second_integer
    return result
```
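The same conventions apply to classes. The class below is hypothetical (it does not exist in the codebase) and is included only to illustrate `CapitalizedCamelCase` naming, type hints, and Google-style docstrings applied to a class and its methods:

```python
class SequenceRecord:
    """
    Store a named protein sequence and expose simple derived properties.

    This class is a hypothetical example used only to illustrate the style guide.
    """

    def __init__(self, identifier: str, sequence: str) -> None:
        self.identifier = identifier
        self.sequence = sequence

    def sequence_length(self) -> int:
        """
        Count the residues in the sequence.

        Returns:
            The length of the sequence string.
        """
        return len(self.sequence)
```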
### Code organization

We strive to encapsulate new functionality within modular Python scripts that accept arguments from the command line using `argparse`. These scripts are then called from snakemake rules and can also be run directly from the command line by the user.
- Every script should include a `parse_args()` function and a `main()` function.
- Every script should begin with `#!/usr/bin/env python` (so that the scripts are executable from the command line on unix systems).
- An example template for new scripts is found in [`template.py`](./ProteinCartography/template.py); a minimal sketch of this structure is shown below.
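As a rough guide, the sketch below shows the overall shape this convention describes. The real reference is [`template.py`](./ProteinCartography/template.py); the argument names and docstrings here are placeholders rather than copies of that file.

```python
#!/usr/bin/env python
# Minimal sketch of the script structure described above; the arguments are placeholders.
import argparse


def parse_args() -> argparse.Namespace:
    """Parse the command-line arguments for this script."""
    parser = argparse.ArgumentParser(description="Describe what the script does.")
    parser.add_argument("--input", required=True, help="Path to the input file.")
    parser.add_argument("--output", required=True, help="Path to the output file.")
    return parser.parse_args()


def main() -> None:
    """Run the script using the parsed command-line arguments."""
    args = parse_args()
    # Replace this print statement with the script's actual logic.
    print(f"Reading from {args.input} and writing to {args.output}.")


if __name__ == "__main__":
    main()
```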
### Adding new dependencies

First, please consider carefully whether you need to add a new dependency to the project.
When your changes absolutely require new dependencies, please make sure that they are `conda`-installable.
Dependencies should be added to two environment files:
1. the `cartography_dev.yml` file in the `envs/` directory.
2. the appropriate snakemake rule environment file in the `envs/` directory.

In both files, please include the version of the dependency you are using (this is called "pinning" the dependency).
Include only the exact version number; do not include the package hash.
For example, if you are adding a new dependency called `new_dependency` and you are using version `1.2.3`, you would add the following line to the `cartography_dev.yml` file:
```yaml
- new_dependency=1.2.3
```
## Crediting Contributions

See how we recognize feedback and contributions to our code at Arcadia [here](https://github.com/Arcadia-Science/arcadia-software-handbook/blob/main/guides-and-standards/guide-credit-for-contributions.md).

Makefile

+27-2
```diff
@@ -15,8 +15,33 @@ pre-commit:

 .PHONY: test
 test:
-	pytest -v
+	pytest -vv -s .
+
+.PHONY: test-without-mocks
+test-without-mocks:
+	pytest -vv -s --no-mocks

 .PHONY: run-demo-workflow
 run-demo-workflow:
-	snakemake --snakefile Snakefile --configfile demo/config_actin.yml --use-conda $(ARGS)
+	snakemake \
+		--snakefile Snakefile \
+		--configfile demo/search-mode/config_actin.yml \
+		--use-conda \
+		--cores all \
+		$(ARGS)
+
+.PHONY: generate-search-mode-rulegraph
+generate-search-mode-rulegraph:
+	snakemake \
+		--configfile demo/search-mode/config_actin.yml \
+		--rulegraph \
+		| dot -Tpng \
+		> rulegraph-search-mode.png
+
+.PHONY: generate-cluster-mode-rulegraph
+generate-cluster-mode-rulegraph:
+	snakemake \
+		--configfile demo/cluster-mode/config.yml \
+		--rulegraph \
+		| dot -Tpng \
+		> rulegraph-cluster-mode.png
```
