feat: get embedding, markdown; ci: github action
ValMystletainn committed Oct 16, 2024
1 parent 8f842a2 commit a852068
Showing 475 changed files with 742 additions and 93,608 deletions.
49 changes: 49 additions & 0 deletions .github/workflows/main.yml
@@ -0,0 +1,49 @@
name: CustomICLRRecommendation
on:
workflow_dispatch:
permissions:
contents: read
pages: write
id-token: write

jobs:
push:
runs-on: ubuntu-latest
env:
GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
BASE_URL: /${{ github.event.repository.name }}
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.10'
- uses: actions/setup-node@v4
with:
node-version: 22.x
- name: install dependencies
run: pip install -r requirements.txt
- name: build markdown
# --num_threshold 1000 can be replaced by the mutually exclusive option --score_threshold, e.g. --score_threshold 0.5
# --likes and --dislikes can be replaced by the mutually exclusive option --like_dislike_config, e.g. --like_dislike_config ./likes_dislikes.json
# --embedding_from also accepts title_abs, meaning the text embedding comes from the title and abstract together
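# e.g. a config-file variant of the command below (values illustrative):
#   python get_markdown.py --crawl_result_dir outputs --score_threshold 0.5 --like_dislike_config ./likes_dislikes.json --embedding_from title_abs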
run: >
python get_markdown.py
--crawl_result_dir outputs
--num_threshold 1000
--likes
"Distribution Backtracking Builds A Faster Convergence Trajectory for Diffusion Distillation"
"Diffusion Models for 4D Novel View Synthesis"
--dislikes
"CELL-Diff: Unified Diffusion Modeling for Protein Sequences and Microscopy Images"
"Build your own cell: Diffusion Models for Multichannel 3D Microscopy Image Generation"
--embedding_from title
- name: build myst pages
run: myst build --html
- uses: actions/upload-artifact@v4
with:
path: output.md
- uses: actions/upload-pages-artifact@v1
with:
path: ./_build/html
- uses: actions/deploy-pages@v2
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,6 +1,9 @@
.pixi/
.vscode/
pixi.lock
_build/
output.md
score_cdf.png

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2024 Wu Wenxu

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
32 changes: 32 additions & 0 deletions README.md
@@ -0,0 +1,32 @@
# Customize your ICLR 2025 Recommendation

This repo contains the crawl results for all active submissions to ICLR 2025.
Each submission is annotated with text embeddings from the Google Gemini API.
You can use these embeddings to build your own ICLR 2025 recommended paper list, driven by self-defined lists of liked and disliked titles or (title, abstract) pairs.

Rather than scanning that many papers directly, or filtering with the OpenReview keyword search engine, I think language-model embeddings strike a better balance for surfacing the papers you may be interested in.
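
Concretely, the recommendation is a linear "preference direction": `get_markdown.py` averages the embeddings of your liked titles, subtracts the average of the disliked ones, and ranks every paper by the dot product of its embedding with that direction. A minimal NumPy sketch (array names are illustrative):

```python
import numpy as np

def favor_scores(paper_embeddings: np.ndarray,
                 like_embeddings: np.ndarray,
                 dislike_embeddings: np.ndarray) -> np.ndarray:
    # preference direction: mean of liked embeddings minus mean of disliked ones
    w = like_embeddings.mean(axis=0) - dislike_embeddings.mean(axis=0)
    # score every paper by projecting its embedding onto that direction
    return paper_embeddings @ w
```

The ranked list is then cut off by either `--num_threshold` or `--score_threshold`.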

## Usage

### Basic Usage

1. Fork this repo.
2. Enable GitHub Pages for the forked repo.
3. **Do the customization**: adjust the parameters in `.github/workflows/main.yml` to set your paper preferences, the number of papers you want, etc.
4. Go to the GitHub Actions page and trigger the action.
5. Get the resulting Markdown from the build artifact, and the rendered pages on the fork's GitHub Pages site.

The page rendering is powered by [mystmd](https://github.com/jupyter-book/mystmd), in a very academic style.

If a title you put in `likes` or `dislikes` is not in the ICLR 2025 paper list, the action will query the Google Gemini API for a fresh embedding. So you need your own Gemini API key from [ai.google.dev](https://ai.google.dev/), set as the Actions secret `GOOGLE_API_KEY=<YOUR API KEY>`.
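
For reference, the fallback embedding call is `get_extra_embedding` in `get_markdown.py`; a self-contained sketch (the `google-generativeai` client reads `GOOGLE_API_KEY` from the environment; the helper name here is illustrative):

```python
import google.generativeai as genai
import numpy as np

genai.configure(transport="rest")  # picks up GOOGLE_API_KEY from the environment

def embed_title(title: str) -> np.ndarray:
    # same model and task type as the pre-computed embeddings in this repo
    result = genai.embed_content(
        model="models/embedding-001",
        content=title,
        task_type="clustering",
    )
    return np.array(result["embedding"])
```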

### The detailed process
```bash
pip install -r requirements.txt # install the dependencies
python main.py # crawl the submissions, powered by crawl4ai
python get_embeddings.py # annotate every paper with Gemini text embeddings
python get_markdown.py # score papers against your likes/dislikes and write output.md

pip install mystmd
myst start # preview the rendered pages locally
```
22 changes: 13 additions & 9 deletions get_embeddings.py
@@ -1,17 +1,18 @@
import argparse
import functools
import itertools
import json
import logging
import os
import random
import time
import itertools
from concurrent.futures import ThreadPoolExecutor

import google.generativeai as genai
import numpy as np

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG)
logging.basicConfig(level=logging.INFO)
genai.configure(transport="rest")

parser = argparse.ArgumentParser()
@@ -91,7 +92,7 @@ def combine_title_abs(title: str, abs: str):
]
embedding_to_file_name += [file_name] * batch_size
embedding_to_inner_file_indices += list(range(batch_size))
embedding_to_fields += ["title_embeddings"] * batch_size
embedding_to_fields += ["title_embedding"] * batch_size

## title + abstract embeddings

@@ -107,15 +108,15 @@ def combine_title_abs(title: str, abs: str):
]
embedding_to_file_name += [file_name] * batch_size
embedding_to_inner_file_indices += list(range(batch_size))
embedding_to_fields += ["title_abs_embeddings"] * batch_size
embedding_to_fields += ["title_abs_embedding"] * batch_size


# get_embedding_funcs = get_embedding_funcs[:100] # TODO: delete it
# get_embedding_funcs = get_embedding_funcs[:1000] # TODO: delete it

get_embedding_funcs = [
retry_with_timeout_decorator(
max_retries=3,
base_delay=10,
base_delay=60 / 1200, # API peak is 1500 RPM; pacing at 1200 RPM leaves some redundancy
factor=2,
jitter=True,
)(func)
@@ -126,19 +127,22 @@ def combine_title_abs(title: str, abs: str):
embeddings = list(executor.map(lambda func: func(), get_embedding_funcs))

embeddings = [d["embedding"] for d in embeddings]
embeddings = np.array(embeddings)
embeddings = embeddings.astype(np.float16)
np.save(os.path.join(args.crawl_result_dir, "embeddings.npy"), embeddings)

grouped_by_file_indices = list(range(len(embeddings)))
grouped_by_file_indices.sort(key=lambda x: embedding_to_file_name[x])

for file_name, indices in itertools.groupby(grouped_by_file_indices):
for file_name, indices in itertools.groupby(grouped_by_file_indices, key=lambda x: embedding_to_file_name[x]):
indices = list(indices)
with open(file_name, "rt") as f:
output_dicts = json.load(f)

for index, embedding in zip(indices, embeddings):
inner_index = embedding_to_inner_file_indices[index]
field = embedding_to_fields[index]
output_dicts[inner_index][embedding_to_fields[index]] = embedding
output_dicts[inner_index][f"{embedding_to_fields[index]}_index"] = index

with open(file_name + 'l', "wt") as f: # TODO: turn it back to .json (currently writes .jsonl)
json.dump(output_dicts, f, indent=4)
json.dump(output_dicts, f)
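
Note: `retry_with_timeout_decorator` is defined earlier in `get_embeddings.py` and does not appear in this hunk. As a rough sketch of what an exponential-backoff wrapper with these parameters typically looks like (the name and retry policy below are assumptions, not the author's implementation):

```python
import functools
import random
import time

def retry_with_backoff(max_retries=3, base_delay=60 / 1200, factor=2, jitter=True):
    """Retry a callable on exception, sleeping base_delay * factor**attempt between tries."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries + 1):
                try:
                    return func(*args, **kwargs)
                except Exception:
                    if attempt == max_retries:
                        raise  # out of retries, surface the error
                    delay = base_delay * (factor ** attempt)
                    if jitter:
                        delay *= random.uniform(0.5, 1.5)  # spread out concurrent retries
                    time.sleep(delay)
        return wrapper
    return decorator
```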
131 changes: 131 additions & 0 deletions get_markdown.py
@@ -0,0 +1,131 @@
import argparse
import json
import os

import matplotlib.pyplot as plt
import numpy as np


def get_extra_embedding(text: str) -> np.ndarray:
import google.generativeai as genai
genai.configure(transport="rest")
embedding_dict = genai.embed_content(
model="models/embedding-001",
content=text,
task_type="clustering",
)
embedding = embedding_dict["embedding"]
return np.array(embedding)

def build_paper_section(paper_dict: dict) -> str:
title = paper_dict["title"]
abstract = paper_dict.get("abstract", "No abstract")
openreview_link = paper_dict["link"]
pdf_link = paper_dict["pdf_link"]
result = ""
result += f"## {title}"
result += "\n\n"
result += f"\[[openreview]({openreview_link})\] \[[pdf]({pdf_link})\]"
result += "\n\n"
result += f"**Abstract** {abstract}"

return result

def dump_data_cdf(data: np.ndarray):
data_sorted = np.sort(data)
cdf = np.arange(1, len(data_sorted) + 1) / len(data_sorted)
plt.plot(data_sorted, cdf, marker='.', linestyle='none')
plt.xlabel('favor score')
plt.ylabel('CDF')
plt.title('CDF of scores for these papers')
plt.savefig("score_cdf.png", bbox_inches='tight')

def main():
parser = argparse.ArgumentParser()
parser.add_argument("--crawl_result_dir", type=str, default="outputs")
parser.add_argument("--score_threshold", type=float)
parser.add_argument("--num_threshold", type=int)
parser.add_argument("--likes", nargs='+')
parser.add_argument("--dislikes", nargs='+')
parser.add_argument("--like_dislike_config", type=str)
parser.add_argument("--embedding_from", type=str, choices=["title", "title_abs"], default="title")

args = parser.parse_args()
assert not (args.score_threshold is not None and args.num_threshold is not None), "--score_threshold and --num_threshold cannot both be set"
assert not ((args.likes is not None or args.dislikes is not None) and args.like_dislike_config is not None), "passing --likes/--dislikes on the command line conflicts with passing --like_dislike_config"

embeddings = np.load(os.path.join(args.crawl_result_dir, "embeddings.npy"))
crawl_results = [
os.path.join(args.crawl_result_dir, p)
for p in os.listdir(args.crawl_result_dir) if p.endswith(".json")
]
crawl_results.sort(key=lambda x: int(x.split('result')[1].split('.json')[0]))
paper_list = [] # type: list[dict[str, Any]]
for crawl_result in crawl_results:
with open(crawl_result, "r") as f:
paper_list.extend(json.load(f))
embedding_index_key = f"{args.embedding_from}_embedding_index"
title_to_embedding_index_lut = {
paper_dict["title"]: paper_dict[embedding_index_key]
for paper_dict in paper_list
}

## get projection weight by like and dislike
score_projection_weight = np.zeros(embeddings.shape[1])
if args.like_dislike_config is not None:
with open(args.like_dislike_config, "r") as f:
like_dislike_config = json.load(f)
likes = like_dislike_config["likes"]
dislikes = like_dislike_config["dislikes"]
else: # fall back to the command-line lists (either may be None)
likes = args.likes
dislikes = args.dislikes

if likes is not None and len(likes) > 0:
like_embeddings = np.array([
embeddings[title_to_embedding_index_lut[title], :]
if title in title_to_embedding_index_lut
else get_extra_embedding(title)
for title in likes
])
score_projection_weight += np.mean(like_embeddings, axis=0)
if dislikes is not None and len(dislikes) > 0:
dislike_embeddings = np.array([
embeddings[title_to_embedding_index_lut[title], :]
if title in title_to_embedding_index_lut
else get_extra_embedding(title)
for title in dislikes
])
score_projection_weight -= np.mean(dislike_embeddings, axis=0)

scores = [
score_projection_weight @ embeddings[d[embedding_index_key], :]
for d in paper_list
]
scores = np.array(scores)
favor_indices = np.argsort(scores)[::-1]
if args.score_threshold is not None:
favor_indices = favor_indices[scores[favor_indices] > args.score_threshold]
if args.num_threshold is not None:
favor_indices = favor_indices[:args.num_threshold]

favor_papers = [paper_list[i] for i in favor_indices]
favor_scores = scores[favor_indices]

## output the markdown
header = "# Your ICLR Recommendation list"
header += "\n\n"
header += f"There are {len(favor_papers)} papers for you in ICLR 2025"
header += "\n\n"
dump_data_cdf(favor_scores)
header += "![score_cdf](score_cdf.png)"

paper_section = "\n\n".join([build_paper_section(d) for d in favor_papers])

markdown_str = header + "\n\n" + paper_section
with open("output.md", "wt") as f:
f.write(markdown_str)


if __name__ == "__main__":
main()
10 changes: 10 additions & 0 deletions likes_dislikes.json
@@ -0,0 +1,10 @@
{
"likes": [
"Distribution Backtracking Builds A Faster Convergence Trajectory for Diffusion Distillation",
"Diffusion Models for 4D Novel View Synthesis"
],
"dislikes": [
"CELL-Diff: Unified Diffusion Modeling for Protein Sequences and Microscopy Images",
"Build your own cell: Diffusion Models for Multichannel 3D Microscopy Image Generation"
]
}
14 changes: 14 additions & 0 deletions myst.yml
@@ -0,0 +1,14 @@
# See docs at: https://mystmd.org/guide/frontmatter
version: 1
project:
numbering: true
toc:
# Auto-generated by `myst init --write-toc`
- file: output.md

site:
template: book-theme
options:
hide_toc: true
hide_outline: false
outline_maxdepth: 1
Binary file added outputs/embeddings.npy