main.py
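"""Crawl audio metadata from archive.org and YouTube into a CSV file.

A minimal usage sketch (the file names and search terms here are
hypothetical; only the flags defined below are real):

    $ cat inputs.txt
    youtube: lofi hip hop mix
    archive: librivoxaudio

    $ python main.py --input inputs.txt --csv --file_name results.csv

Any input line not starting with "youtube:" is split on its first space
and the remainder is passed to ArchiveCrawler, so the "archive:" prefix
above is a readable convention rather than a requirement.
"""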
import argparse
import logging
import datasets
from dotenv import load_dotenv
from multi_crawler import ArchiveCrawler, CSVExporter, YoutubeCrawler
logging.basicConfig(level=logging.INFO)
load_dotenv(override=True)  # load environment variables (e.g. a Hugging Face token) from a .env file
# Silence the very verbose YouTube session logger.
logging.getLogger("multi_crawler.ytb_session").setLevel(logging.CRITICAL)
if __name__ == "__main__":
    argparser = argparse.ArgumentParser(
        prog="multi_crawler",
        description="Utility to crawl audio files from the internet using archive.org and youtube.com",
    )
    argparser.add_argument(
        "--input",
        type=str,
        required=True,
        help="Input file with YouTube search terms or archive.org collection names",
    )
    argparser.add_argument(
        "--csv",
        action="store_true",
        help="Write the output in CSV format",
    )
    argparser.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite the CSV file if it exists",
    )
    argparser.add_argument(
        "--file_name",
        type=str,
        help="Name of the output file",
        required=False,
    )
    argparser.add_argument(
        "--tor_proxy",
        action="store_true",
        help="Use a Tor proxy for requests to YouTube",
        default=False,
    )
    argparser.add_argument(
        "--num_processes",
        type=int,
        help="Number of processes to use for crawling",
        default=40,
        required=False,
    )
    argparser.add_argument(
        "--huggingface_dataset",
        type=str,
        help="Name of the dataset to push to the Hugging Face Hub",
        required=False,
    )
    args = argparser.parse_args()

    if args.csv and args.file_name is None:
        raise ValueError("Please provide the name of the output file using --file_name")

    # The CSV exporter doubles as the crawl callback: each crawled item is
    # appended to the output file as it is found.
    exporter = CSVExporter(args.file_name, overwrite=args.overwrite)

    # Each line of the input file selects a crawler: lines prefixed with
    # "youtube:" are YouTube search terms; any other line is treated as an
    # archive.org collection, taking everything after the first space.
    with open(args.input, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            logging.info("Processing line: %s", line)
            if line.startswith("youtube:"):
                crawler = YoutubeCrawler(
                    line.split(" ", 1)[1],
                    callback=exporter,
                    num_processes=args.num_processes,
                )
            else:
                crawler = ArchiveCrawler(line.split(" ", 1)[1], callback=exporter)
            crawler.crawl()
    # Optionally publish the crawled metadata to the Hugging Face Hub.
    if args.huggingface_dataset:
        dataset = datasets.load_dataset("csv", data_files=args.file_name)
        dataset.push_to_hub(args.huggingface_dataset)
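        # A consumer could then load the pushed dataset back from the Hub,
        # e.g. (hypothetical repository name):
        #   ds = datasets.load_dataset("your-username/crawled-audio")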