fredriko
diff --git a/‎.pre-commit-config.yaml
+2-2 b/‎.pre-commit-config.yaml
+2-2
diff --git a/‎README.md
+63-14 b/‎README.md
+63-14
diff --git a/‎config.json
-25 b/‎config.json
-25
diff --git a/‎configs/metacurate_news_2022_1.json
+35 b/‎configs/metacurate_news_2022_1.json
+35
diff --git a/‎configs/metacurate_news_2022_2.json
+35 b/‎configs/metacurate_news_2022_2.json
+35
diff --git a/‎configs/metacurate_news_2022_3.json
+35 b/‎configs/metacurate_news_2022_3.json
+35
diff --git a/‎configs/metacurate_news_2022_4.json
+35 b/‎configs/metacurate_news_2022_4.json
+35
diff --git a/‎configs/metacurate_news_2022_5.json
+35 b/‎configs/metacurate_news_2022_5.json
+35
diff --git a/‎configs/metacurate_news_2022_6.json
+35 b/‎configs/metacurate_news_2022_6.json
+35
diff --git a/‎data/news_urls.csv ‎data/metacurate_news_2022.csv b/‎data/news_urls.csv ‎data/metacurate_news_2022.csv
@@ -1,4 +1,4 @@
-exclude: ^(data/|resources/)
+exclude: ^(data/|resources/|configs/)
 default_stages: [ commit ]
 
 repos:
@@ -13,7 +13,7 @@ repos:
     rev: 22.8.0
     hooks:
       - id: black
-        exclude: ^(data/|resources/)
+        exclude: ^(data/|resources/|configs/)
         language_version: python3
 
   - repo: https://github.com/PyCQA/flake8
 
@@ -1,23 +1,72 @@
-TODO
+# metacurate.io: Top _N_ AI/ML/data science news of 2022.
 
-* Viz: have plotly set width of image, but set height explicitly
-* Produce report/list of clusters w top n urls for Medium (markdown?)
-* pre-commit, black, linting
-* Integrate viz into main.
-* Make sure it works end-to-end.
-* Refactor code.
-* Add command line argument for selecting other config file.
+This repository contains the code required to generate...
 
-## Set up chart studio
+Live Plotly graph here...
 
-https://jennifer-banks8585.medium.com/how-to-embed-interactive-plotly-visualizations-on-medium-blogs-710209f93bd
+Link to list with top N news stories here...
 
+## TODO
+* Run final version of clustering, description, viz, report.
+* README
+* Medium/LinkedIn article:
+  * Top list
+  * Behind the scenes w code
 
+## Install
+This section contains instructions for how to install the code, resources, and dependencies
+needed to reproduce the clustering of the news headlines available in
+[metacurate_news_2022.csv](data/metacurate_news_2022.csv).
+
+### Requirements
+
+* git
+* Python (this repo was developed using Python 3.9)
+* pip
+* virtualenv
+* An API key from Cohere
+* Optional: Plotly Chart Studio credentials
+
+### Create and activate a virtual environment
+
+### Clone this repository
+
+### Install dependencies
+
+### Get and set up a Cohere API Key
+
+In order to use [Topically](https://github.com/cohere-ai/sandbox-topically) to describe the
+clusters, you need an API key from Cohere. Sign up for a free account and create a key in the
+[Cohere dashboard](https://dashboard.cohere.com/api-keys). Take note of the key, and set
+the environment variable `COHERE_API_KEY` like so:
+
+```bash
+export COHERE_API_KEY=<your_key>
 ```
-import chart_studio
 
-username = "<username>"
-api_key = "<api_key>"
 
-chart_studio.tools.set_credentials_file(username=username, api_key=api_key)
+### Optional: Get and set up Plotly Chart Studio credentials
+In order to publish the generated Plotly plot to the web (Plotly Chart Studio), you need to
+have an account and set up the credentials locally. Follow the instructions for getting an
+account
+[here](https://jennifer-banks8585.medium.com/how-to-embed-interactive-plotly-visualizations-on-medium-blogs-710209f93bd)
+and edit the file [set_up_plotly_credentials.py](src/set_up_plotly_credentials.py) to include
+your `username` and `api_key`.
+
+Run the file:
+
+```bash
+python src/set_up_plotly_credentials.py
 ```
+
+to generate and store the credentials. This only has to be done once.
+
+## Run
+
+To run the code, simply issue the following:
+
+```bash
+python main.py
+```
+
+NOTE that this is a long-running process: the vectorization step will take a long time (up to an
+hour) if you're running on a CPU, and the clustering takes quite some time too.
@@ -0,0 +1,35 @@
+{
+  "params": {
+    "visualize_top_n": 50,
+    "report_top_n": 200,
+    "cluster_probability": 0.75,
+    "title": "Top AI/ML/data science and related news of 2022",
+    "publish_to_plotly": true,
+    "plotly_file_name": "metacurate_top_ai_ml_news_2022_1"
+  },
+  "data": {
+    "raw": "./data/metacurate_news_2022.csv",
+    "normalized": "./data/transient/normalized.csv",
+    "clustered": "./data/transient/clustered.csv",
+    "cluster_info": "./data/transient/cluster_info.csv",
+    "cluster_descriptions": "./data/transient/cluster_descriptions.csv",
+    "cluster_viz_data": "./data/output/2022_1/cluster_viz_data.csv",
+    "cluster_viz_html": "./data/output/2022_1/metacurate_news_viz_2022.html",
+    "cluster_report": "./data/output/2022_1/metacurate_news_report_2022.md",
+    "cache": "./data/transient/.cache"
+  },
+  "resources": {
+    "omit_strings": "./resources/omit_strings.csv"
+  },
+  "vectorizer": {
+    "model_name_or_path": "all-mpnet-base-v2"
+  },
+  "clusterer": {
+    "metric": "precomputed",
+    "cluster_selection_method": "leaf",
+    "min_cluster_size": 10,
+    "min_samples": 2,
+    "cluster_selection_epsilon":0.05,
+    "memory": "./data/transient/.cache"
+  }
+}
@@ -0,0 +1,35 @@
+{
+  "params": {
+    "visualize_top_n": 50,
+    "report_top_n": 200,
+    "cluster_probability": 0.75,
+    "title": "Top AI/ML/data science and related news of 2022",
+    "publish_to_plotly": true,
+    "plotly_file_name": "metacurate_top_ai_ml_news_2022_2"
+  },
+  "data": {
+    "raw": "./data/metacurate_news_2022.csv",
+    "normalized": "./data/transient/normalized.csv",
+    "clustered": "./data/transient/clustered.csv",
+    "cluster_info": "./data/transient/cluster_info.csv",
+    "cluster_descriptions": "./data/transient/cluster_descriptions.csv",
+    "cluster_viz_data": "./data/output/2022_2/cluster_viz_data.csv",
+    "cluster_viz_html": "./data/output/2022_2/metacurate_news_viz_2022.html",
+    "cluster_report": "./data/output/2022_2/metacurate_news_report_2022.md",
+    "cache": "./data/transient/.cache"
+  },
+  "resources": {
+    "omit_strings": "./resources/omit_strings.csv"
+  },
+  "vectorizer": {
+    "model_name_or_path": "all-mpnet-base-v2"
+  },
+  "clusterer": {
+    "metric": "precomputed",
+    "cluster_selection_method": "leaf",
+    "min_cluster_size": 20,
+    "min_samples": 2,
+    "cluster_selection_epsilon":0.05,
+    "memory": "./data/transient/.cache"
+  }
+}
@@ -0,0 +1,35 @@
+{
+  "params": {
+    "visualize_top_n": 50,
+    "report_top_n": 200,
+    "cluster_probability": 0.75,
+    "title": "Top AI/ML/data science and related news of 2022",
+    "publish_to_plotly": true,
+    "plotly_file_name": "metacurate_top_ai_ml_news_2022_3"
+  },
+  "data": {
+    "raw": "./data/metacurate_news_2022.csv",
+    "normalized": "./data/transient/normalized.csv",
+    "clustered": "./data/transient/clustered.csv",
+    "cluster_info": "./data/transient/cluster_info.csv",
+    "cluster_descriptions": "./data/transient/cluster_descriptions.csv",
+    "cluster_viz_data": "./data/output/2022_3/cluster_viz_data.csv",
+    "cluster_viz_html": "./data/output/2022_3/metacurate_news_viz_2022.html",
+    "cluster_report": "./data/output/2022_3/metacurate_news_report_2022.md",
+    "cache": "./data/transient/.cache"
+  },
+  "resources": {
+    "omit_strings": "./resources/omit_strings.csv"
+  },
+  "vectorizer": {
+    "model_name_or_path": "all-mpnet-base-v2"
+  },
+  "clusterer": {
+    "metric": "precomputed",
+    "cluster_selection_method": "leaf",
+    "min_cluster_size": 15,
+    "min_samples": 2,
+    "cluster_selection_epsilon":0.05,
+    "memory": "./data/transient/.cache"
+  }
+}
@@ -0,0 +1,35 @@
+{
+  "params": {
+    "visualize_top_n": 50,
+    "report_top_n": 200,
+    "cluster_probability": 0.9,
+    "title": "Top AI/ML/data science and related news of 2022",
+    "publish_to_plotly": true,
+    "plotly_file_name": "metacurate_top_ai_ml_news_2022_4"
+  },
+  "data": {
+    "raw": "./data/metacurate_news_2022.csv",
+    "normalized": "./data/transient/normalized.csv",
+    "clustered": "./data/transient/clustered.csv",
+    "cluster_info": "./data/transient/cluster_info.csv",
+    "cluster_descriptions": "./data/transient/cluster_descriptions.csv",
+    "cluster_viz_data": "./data/output/2022_4/cluster_viz_data.csv",
+    "cluster_viz_html": "./data/output/2022_4/metacurate_news_viz_2022.html",
+    "cluster_report": "./data/output/2022_4/metacurate_news_report_2022.md",
+    "cache": "./data/transient/.cache"
+  },
+  "resources": {
+    "omit_strings": "./resources/omit_strings.csv"
+  },
+  "vectorizer": {
+    "model_name_or_path": "all-mpnet-base-v2"
+  },
+  "clusterer": {
+    "metric": "precomputed",
+    "cluster_selection_method": "leaf",
+    "min_cluster_size": 50,
+    "min_samples": 25,
+    "cluster_selection_epsilon":0.05,
+    "memory": "./data/transient/.cache"
+  }
+}
@@ -0,0 +1,35 @@
+{
+  "params": {
+    "visualize_top_n": 50,
+    "report_top_n": 200,
+    "cluster_probability": 0.9,
+    "title": "Top AI/ML/data science and related news of 2022",
+    "publish_to_plotly": true,
+    "plotly_file_name": "metacurate_top_ai_ml_news_2022_5"
+  },
+  "data": {
+    "raw": "./data/metacurate_news_2022.csv",
+    "normalized": "./data/transient/normalized.csv",
+    "clustered": "./data/transient/clustered.csv",
+    "cluster_info": "./data/transient/cluster_info.csv",
+    "cluster_descriptions": "./data/transient/cluster_descriptions.csv",
+    "cluster_viz_data": "./data/output/2022_5/cluster_viz_data.csv",
+    "cluster_viz_html": "./data/output/2022_5/metacurate_news_viz_2022.html",
+    "cluster_report": "./data/output/2022_5/metacurate_news_report_2022.md",
+    "cache": "./data/transient/.cache"
+  },
+  "resources": {
+    "omit_strings": "./resources/omit_strings.csv"
+  },
+  "vectorizer": {
+    "model_name_or_path": "all-mpnet-base-v2"
+  },
+  "clusterer": {
+    "metric": "precomputed",
+    "cluster_selection_method": "leaf",
+    "min_cluster_size": 3,
+    "min_samples": 1,
+    "cluster_selection_epsilon":0.2,
+    "memory": "./data/transient/.cache"
+  }
+}
@@ -0,0 +1,35 @@
+{
+  "params": {
+    "visualize_top_n": 50,
+    "report_top_n": 200,
+    "cluster_probability": 0.9,
+    "title": "Top AI/ML/data science and related news of 2022",
+    "publish_to_plotly": true,
+    "plotly_file_name": "metacurate_top_ai_ml_news_2022_6"
+  },
+  "data": {
+    "raw": "./data/metacurate_news_2022.csv",
+    "normalized": "./data/transient/normalized.csv",
+    "clustered": "./data/transient/clustered.csv",
+    "cluster_info": "./data/transient/cluster_info.csv",
+    "cluster_descriptions": "./data/transient/cluster_descriptions.csv",
+    "cluster_viz_data": "./data/output/2022_6/cluster_viz_data.csv",
+    "cluster_viz_html": "./data/output/2022_6/metacurate_news_viz_2022.html",
+    "cluster_report": "./data/output/2022_6/metacurate_news_report_2022.md",
+    "cache": "./data/transient/.cache"
+  },
+  "resources": {
+    "omit_strings": "./resources/omit_strings.csv"
+  },
+  "vectorizer": {
+    "model_name_or_path": "all-mpnet-base-v2"
+  },
+  "clusterer": {
+    "metric": "precomputed",
+    "cluster_selection_method": "eom",
+    "min_cluster_size": 3,
+    "min_samples": 1,
+    "cluster_selection_epsilon":0.2,
+    "memory": "./data/transient/.cache"
+  }
+}