Skip to content

Commit 657d9bf

Browse files
committed
add ability to have start and stop dates
* allows for a check of a single week * continues to support processing a month at a time * expands support for controlling function through .env file * provides example .env file
1 parent 597842a commit 657d9bf

File tree

3 files changed

+286
-61
lines changed

3 files changed

+286
-61
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
MONGO_CONNECTION_STRING="mongodb://localhost:27017/"
2+
BASE_AZURE_BLOB_URL = "https://storageaccount.blob.core.windows.net/container_name"
3+
OUTPUT_FILE = "invalid-data.json"
4+
# START_DATE = "2024-06-21"
5+
# END_DATE = "2024-06-28"
6+
START_MONTH = "2024-06"
7+
END_MONTH = "2024-06"
8+
MAX_DOCS = 500
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
# analyze_data_synchronization tool
2+
3+
## Usage
4+
5+
### Prerequisites
6+
7+
Set up the environment variables that drive how the tool runs. These can be set as system environment variables or in a `.env` file. You can
8+
rename `.env-example` to `.env` and modify as desired.
9+
10+
- MONGO_CONNECTION_STRING (required) - the connection string to the MongoDB database
11+
- BASE_AZURE_BLOB_URL (required) - the base Azure Blob Storage URL, including the container name (e.g. `https://storageaccount.blob.core.windows.net/container_name`)
12+
- START_DATE (optional) - the first date to include in the query (default: `""`)
13+
- END_DATE (optional) - the last date to include in the query (default: `""`)
14+
- START_MONTH (optional) - the first month to include in the query (default: `"2024-01"`)
15+
- END_MONTH (optional) - the last month to include in the query (default: `"2024-06"`)
16+
- MAX_DOCS (optional) - the max number of documents that will be processed for each month or during the custom date range (default: 5000)
17+
- OUTPUT_FILE (optional) - the file to write the output to (default: `"invalid_data.json"`)
18+
19+
### Set up virtual environment
20+
21+
This is best run in a Python virtual environment. Set up the .venv and install the required dependencies.
22+
23+
```bash
24+
python3 -m venv .venv
25+
source .venv/bin/activate
26+
python3 -m pip install -r requirements.txt
27+
```
28+
29+
### Run the script
30+
31+
```bash
32+
python3 analyze.py
33+
```
34+
35+
## Example
36+
37+
### Example coordinates
38+
39+
```text
40+
composer/packagist/00f100/fcphp-cache/revision/0.1.0.json
41+
```
42+
43+
### Example Mongo document with unused fields removed
44+
45+
```json
46+
{
47+
"_id": "composer/packagist/00f100/fcphp-cache/0.1.0",
48+
"_meta": {
49+
"schemaVersion": "1.6.1",
50+
"updated": "2019-08-29T02:06:54.498Z"
51+
},
52+
"coordinates": {
53+
"type": "composer",
54+
"provider": "packagist",
55+
"namespace": "00f100",
56+
"name": "fcphp-cache",
57+
"revision": "0.1.0"
58+
},
59+
"licensed": {
60+
"declared": "MIT",
61+
"toolScore": {
62+
"total": 17,
63+
"declared": 0,
64+
"discovered": 2,
65+
"consistency": 0,
66+
"spdx": 0,
67+
"texts": 15
68+
},
69+
"score": {
70+
"total": 17,
71+
"declared": 0,
72+
"discovered": 2,
73+
"consistency": 0,
74+
"spdx": 0,
75+
"texts": 15
76+
}
77+
}
78+
}
79+
```
80+
81+
### Example Output
82+
83+
The following shows the summary stats and an example of one of the 6 invalid samples.
84+
85+
```json
86+
{
87+
"2024-06": {
88+
"stats": {
89+
"sample_total": 500,
90+
"sample_invalid": 6,
91+
"percent_invalid": "1.2%",
92+
"sample_percent_of_total": "0.58%",
93+
"total_documents": 86576,
94+
"total_estimated_invalid": 1039
95+
},
96+
"sourcearchive/mavencentral/org.apache.kerby/kerby-util/1.0.1": {
97+
"db": {
98+
"licensed": null,
99+
"_meta": {
100+
"schemaVersion": "1.6.1",
101+
"updated": "2024-06-13T12:59:21.981Z"
102+
}
103+
},
104+
"blob": {
105+
"licensed": "Apache-2.0",
106+
"_meta": {
107+
"schemaVersion": "1.6.1",
108+
"updated": "2024-06-13T12:59:31.368Z"
109+
}
110+
}
111+
},
112+
...
113+
}
114+
...
115+
}
116+
```

0 commit comments

Comments
 (0)