refactor clip inference (#107)

* refactor clip inference, dividing it into:
  * reader: reads the files or wds into tensors of images and text
  * mapper: transforms tensors into embeddings and metadata
  * writer: writes the embeddings and metadata
  * runner: combines reader, mapper and writer
  * distributor: runs the runner using various distribution strategies
  * main: uses all of that to provide the whole feature
  Distribution is based on output partitions.
* add logger module
* add tool to build pex
* fix logger
* make ci better
* Remove pytest coverage
rom1504 · Feb 18, 2022 · 3082878 · 3082878
1 parent 2e351a9
commit 3082878
Show file tree

Hide file tree

Showing 43 changed files with 2,043 additions and 418 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -9,8 +9,25 @@ on:
     - main
 
 jobs:
-  build:
-
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python 3.8
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.8
+      - name: Install
+        run: |
+          python3 -m venv .env
+          source .env/bin/activate
+          python -m pip install -U pip
+          make install-dev
+      - name: Lint
+        run: |
+          source .env/bin/activate
+          make lint
+  tests:
     runs-on: ubuntu-latest
     strategy:
       matrix:
@@ -22,6 +39,13 @@ jobs:
       uses: actions/setup-python@v2
       with:
         python-version: ${{ matrix.python-version }}
-    - name: Install, lint and unit tests
+    - name: Install
+      run: |
+        python3 -m venv .env
+        source .env/bin/activate
+        make install
+        make install-dev
+    - name: Unit tests
       run: |
-        make venv-lint-test
+        source .env/bin/activate
+        make test
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
@@ -1,35 +1,40 @@
-# This workflows will upload a Python Package using Twine when a release is created
-# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
-
-name: Upload Python Package
+name: Release
 
 on:
-  release:
-    types: [created]
-  workflow_dispatch:
-
+  push:
+    branches:
+    - main
 jobs:
   deploy:
-
     runs-on: ubuntu-latest
-
     steps:
     - uses: actions/checkout@v2
-    - name: Use Node.js 14.x
-      uses: actions/setup-node@v1
+    - uses: actions-ecosystem/action-regex-match@v2
+      id: regex-match
       with:
-        node-version: 14.x
-    - run: cd front && npm install
-    - run: cd front && npm run build
+        text: ${{ github.event.head_commit.message }}
+        regex: '^Release ([^ ]+)'
     - name: Set up Python
       uses: actions/setup-python@v2
       with:
-        python-version: '3.x'
+        python-version: '3.8'
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install setuptools wheel twine
+        pip install setuptools wheel twine pex
+    - name: Build pex
+      run: |
+        make build-pex
+    - name: Release
+      if: ${{ steps.regex-match.outputs.match != '' }}
+      uses: softprops/action-gh-release@v1
+      with:
+        files: |
+          clip_retrieval_torch.tgz
+          clip_retrieval.tgz
+        tag_name: ${{ steps.regex-match.outputs.group1 }}
     - name: Build and publish
+      if: ${{ steps.regex-match.outputs.match != '' }}
       env:
         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}

diff --git a/.gitignore b/.gitignore
@@ -12,7 +12,11 @@ cat
 embedding_folder
 index_folder
 indices_paths.json
-.coverage
+.coverage*
 test_folder
 build
-dist
+dist
+wandb
+.pexing
+*.tgz
+*.pex
diff --git a/Makefile b/Makefile
@@ -13,14 +13,22 @@ lint: ## [Local development] Run mypy, pylint and black
 black: ## [Local development] Auto-format python code using black
 	python -m black -l 120 .
 
+build-pex:
+	python3 -m venv .pexing
+	. .pexing/bin/activate && python -m pip install -U pip && python -m pip install pex
+	. .pexing/bin/activate && python -m pex --layout packed  -f https://download.pytorch.org/whl/cu113/torch_stable.html setuptools s3fs==2021.11.0 pyspark==3.2.0 torch==1.10.2+cu113 torchvision==0.11.3+cu113 . -o clip_retrieval.pex -v
+	rm -rf .pexing
+	tar czf clip_retrieval_torch.tgz clip_retrieval.pex/.deps/torch-1.10.2+cu113-cp38-cp38-linux_x86_64.whl
+	tar czf clip_retrieval.tgz --exclude clip_retrieval.pex/.deps/torch-1.10.2+cu113-cp38-cp38-linux_x86_64.whl clip_retrieval.pex
+
 venv-lint-test: ## [Continuous integration]
 	python3 -m venv .env && . .env/bin/activate && make install install-dev lint test && rm -rf .env
 
 test: ## [Local development] Run unit tests
 	rm -rf tests/test_folder/
-	python -m pytest -v --cov=clip_retrieval --cov-report term-missing --cov-fail-under 0.0 tests
+	python -m pytest -x -s -v tests
 
 .PHONY: help
 
 help: # Run `make help` to get help on the make commands
-	@grep -E '^[0-9a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
+	@grep -E '^[0-9a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
diff --git a/README.md b/README.md
@@ -83,13 +83,17 @@ clip_inference turn a set of text+image into clip embeddings
 * **enable_image** Enable image processing (default *True*)
 * **enable_metadata** Enable metadata processing (default *False*)
 * **write_batch_size** Write batch size (default *10**6*)
-* **subset_size** Only process a subset of this size (default *None*)
 * **wds_image_key** Key to use for images in webdataset. (default *jpg*)
 * **wds_caption_key** Key to use for captions in webdataset. (default *txt*)
 * **clip_model** CLIP model to load (default *ViT-B/32*)
 * **mclip_model** MCLIP model to load (default *sentence-transformers/clip-ViT-B-32-multilingual-v1*)
 * **use_mclip** If False it performs the inference using CLIP; MCLIP otherwise (default *False*)
 * **use_jit** uses jit for the clip model (default *True*)
+* **distribution_strategy** choose how to distribute the job; see the distribution section for details (default *sequential*)
+* **wds_number_file_per_input_file** estimated number of samples per tar, used when reading wds without specifying output_partition_count (default *10000*)
+* **output_partition_count** number of output partitions (default *None*)
+* **wandb_project** wandb project to use (default *clip_retrieval*)
+* **enable_wandb** whether to use wandb (default *False*)
 
 
 ### Loading/writing files on hdfs
@@ -281,6 +285,8 @@ make test
 
 You can use `make black` to reformat the code
 
+Use `python -m pytest -x -s -v tests -k "test_runner"` to run a specific test
+
 If you want to use the front through the python backend or frontend, run
 ```
 cd front

diff --git a/clip_retrieval/__init__.py b/clip_retrieval/__init__.py
@@ -3,6 +3,8 @@
 from .clip_back import clip_back
 from .clip_filter import clip_filter
 from .clip_index import clip_index
-from .clip_inference import clip_inference
+from .clip_inference.main import main as clip_inference
+
+# from .clip_inference import clip_inference
 from .clip_end2end import clip_end2end
 from .clip_front import clip_front
diff --git a/clip_retrieval/cli.py b/clip_retrieval/cli.py
@@ -21,3 +21,7 @@ def main():
             "front": clip_front,
         }
     )
+
+
+if __name__ == "__main__":
+    main()
-Original file line number
+Diff line change
@@ Expand Up / @@ -21,3 +21,7 @@ def main(): @@
                 "front": clip_front,
             }
         )
+    if __name__ == "__main__":
+        main()