Skip to content

Commit 1026624

Browse files
committed
adding python download script
1 parent 8b8432d commit 1026624

File tree

2 files changed

+297
-0
lines changed

2 files changed

+297
-0
lines changed

download_script.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
#!/usr/bin/env python
2+
3+
import requests
4+
import xml.etree.ElementTree as ET
5+
from html.parser import HTMLParser
6+
import subprocess
7+
from argparse import ArgumentParser
8+
from multiprocessing import Pool
9+
10+
def download_html(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout by default, so without one a stalled
            connection would block the script indefinitely.

    Returns:
        The response body as a string, or ``None`` when the request failed
        (connection error, timeout, or an HTTP error status). The error is
        printed in that case.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None
    return response.text
18+
19+
20+
class MyHTMLParser(HTMLParser):
    """Collect the text content of ``<span>`` elements carrying a given class.

    One string is produced per matching span: all text found inside the span
    (including text inside nested elements) is concatenated in document order.
    """

    def __init__(self):
        super().__init__()
        self.capture_data = False   # True while inside a matching span
        self.target_class = ""      # class name to look for
        self.captured_data = []     # one text entry per matching span
        self._span_depth = 0        # open <span> nesting level inside a match
        self._chunks = []           # text pieces of the span being captured

    def handle_starttag(self, tag, attrs):
        """Start capturing when a span with the target class opens."""
        if tag != "span":
            return
        if self.capture_data:
            # Nested span inside a match: remember it so that its closing tag
            # does not end the capture of the outer span.
            self._span_depth += 1
            return
        for name, value in attrs:
            if name != "class" or value is None:
                continue
            # The class attribute is a whitespace-separated list, so accept
            # the target as one of several classes as well as an exact match.
            if value == self.target_class or self.target_class in value.split():
                self.capture_data = True
                self._span_depth = 1
                self._chunks = []
                break

    def handle_endtag(self, tag):
        """Finish the current capture when the matching span closes."""
        if tag != "span" or not self.capture_data:
            return
        self._span_depth -= 1
        if self._span_depth == 0:
            self.capture_data = False
            self._flush_chunks()

    def handle_data(self, data):
        """Buffer text that appears inside a matching span."""
        if self.capture_data:
            self._chunks.append(data)

    def _flush_chunks(self):
        """Store the buffered text as one entry; empty spans add nothing."""
        if self._chunks:
            self.captured_data.append("".join(self._chunks))
            self._chunks = []

    def extract_span_by_class(self, html, class_value):
        """Return the text of every ``<span>`` whose class is *class_value*.

        Args:
            html: HTML document (or fragment) as a string.
            class_value: Class name the span must carry.

        Returns:
            A list with one string per matching span, in document order.
        """
        # Start from a clean slate so the parser instance can be reused.
        self.reset()
        self.target_class = class_value
        self.capture_data = False
        self.captured_data = []
        self._span_depth = 0
        self._chunks = []
        self.feed(html)
        self.close()  # Force processing of any text still buffered
        # Keep the text of a span that was never closed instead of losing it.
        self._flush_chunks()
        self.capture_data = False
        return self.captured_data
46+
47+
48+
49+
def handle_args(argv=None):
    """Parse the command-line options of the download script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            the arguments are taken from ``sys.argv``.

    Returns:
        An ``argparse.Namespace`` with the attributes ``url``, ``parallel``,
        ``outdir`` and ``single``.
    """
    p = ArgumentParser(
        description="Download files from AARNet FileSender URL. By default will download 8 files concurrently.",
        # Hand-written usage so the quoting hint is shown; it has to list
        # every option, including --outdir.
        usage="""download_script.py [-h] --url URL [--parallel PARALLEL] [--outdir OUTDIR] [--single {tar,zip}]
       (url string should be enclosed in quotation marks)""",
    )
    p.add_argument("--url", "-u", required=True,
                   help="URL to FileSender download page, url string should be enclosed in quotation marks")
    p.add_argument("--parallel", "-p", type=int, default=8,
                   help="Number of files to download in parallel (default behaviour), default=8")
    p.add_argument("--outdir", "-o", default="./", help="Output directory")
    p.add_argument("--single", "-s", choices=["tar", "zip"],
                   help="Download data in a single archive file (either zip or tar). If specified, overrides --parallel")
    return p.parse_args(argv)
59+
60+
# Default directory downloads are written to; overwritten by the script entry
# point with the value of --outdir.
OUTDIR = "./"


def download_url(url, outdir=None):
    """Download *url* with ``wget`` and wait for it to finish.

    The file name is taken from the server's Content-Disposition header
    (``--content-disposition``).

    Args:
        url: Address to download.
        outdir: Directory to save into. Defaults to the module-level
            ``OUTDIR`` when ``None``.
    """
    target_dir = OUTDIR if outdir is None else outdir
    print(f"downloading {url}")
    # Pass the arguments as a list (no shell): the URL is scraped from a web
    # page, so it must never be interpreted by a shell, and directories
    # containing spaces keep working.
    wget_cmd = ["wget", "-P", target_dir, "--content-disposition", url]
    try:
        result = subprocess.run(wget_cmd, check=False)
    except FileNotFoundError:
        print(f"wget executable not found, cannot download {url}")
        return
    if result.returncode != 0:
        print(f"wget exited with status {result.returncode} for {url}")
66+
67+
class FileSenderDownload:
    """Collect the download links offered by a FileSender download page.

    Attributes set by the constructor:
        archive_format: Archive type requested for a single-file download
            (``"tar"``, ``"zip"``) or ``None``.
        html_content: Raw HTML of the download page.
        directlinks: Per-file download URLs found on the page.
        token: Value of the ``token`` query parameter of the page URL.
        fileids: Value of the ``files_ids`` query parameter of each direct link.
    """

    def __init__(self, url, archive_format=None):
        """Download the page at *url* and extract the links and identifiers.

        Raises:
            ValueError: If the page cannot be downloaded, or if the page URL
                or one of the direct links lacks the expected query parameter.
        """
        self.archive_format = archive_format
        self.html_content = download_html(url)
        if self.html_content is None:
            # download_html already printed the cause; fail with a clear
            # message instead of crashing inside the HTML parser.
            raise ValueError(f"could not download FileSender page: {url}")
        parser = MyHTMLParser()
        spans = parser.extract_span_by_class(self.html_content, "directlink")
        self.directlinks = []
        for text in spans:
            # Each span reads "Direct Link: <url>"; ignore any captured text
            # that does not follow that pattern.
            _, marker, link = text.partition("Direct Link: ")
            link = link.strip()
            if marker and link:
                self.directlinks.append(link)
        self.token = self._query_value(url, "token")
        self.fileids = [self._query_value(x, "files_ids") for x in self.directlinks]

    @staticmethod
    def _query_value(url, key):
        """Return the first value of query parameter *key* in *url*.

        Works wherever the parameter sits in the query string and never
        includes the parameters that follow it.

        Raises:
            ValueError: If the parameter is missing or empty.
        """
        from urllib.parse import parse_qs, urlsplit

        values = parse_qs(urlsplit(url).query).get(key)
        if not values:
            raise ValueError(f"no {key!r} parameter found in URL: {url}")
        return values[0]

    def single_archive_link(self):
        """Build the URL that downloads all files as one archive.

        Returns:
            The ``download.php`` URL carrying the token, the comma-separated
            (percent-encoded as ``%2C``) file ids and the archive format.

        Raises:
            ValueError: If no archive format was given to the constructor.
        """
        from urllib.parse import quote

        if self.archive_format is None:
            raise ValueError("archive_format must be set to build a single archive link")
        token = quote(str(self.token), safe="")
        ids = "%2C".join(quote(str(file_id), safe="") for file_id in self.fileids)
        base_url = "https://filesender.aarnet.edu.au/download.php?"
        base_url += f"token={token}&files_ids={ids}&archive_format={self.archive_format}"
        return base_url
81+
82+
# Script entry point: download either one archive or every file individually.
if __name__ == "__main__":
    # wget does the actual work in a child process, so threads are enough to
    # run several downloads at once. Unlike worker processes started with the
    # "spawn"/"forkserver" methods, threads also see the OUTDIR value set below.
    from multiprocessing.pool import ThreadPool

    args = handle_args()
    # Validate before any network access. Checking the value directly (rather
    # than its truthiness) also rejects 0 instead of silently doing nothing.
    if not args.single and args.parallel < 1:
        raise ValueError("--parallel value must be positive integer")
    OUTDIR = args.outdir
    fsdownload = FileSenderDownload(args.url, archive_format=args.single)
    if args.single:
        print(f"downloading a single {args.single} file")
        download_url(fsdownload.single_archive_link())
    else:
        print(f"download {args.parallel} files in parallel")
        if args.parallel == 1:
            for url_ in fsdownload.directlinks:
                download_url(url_)
        else:
            # The context manager releases the pool once map() has returned.
            with ThreadPool(args.parallel) as pool:
                pool.map(download_url, fsdownload.directlinks)
100+
101+
102+
103+
104+
105+
106+
# https://filesender.aarnet.edu.au/download.php?token=d6be9395-9fc6-427f-b565-289cb32a16e2&files_ids=22322589%2C22322355%2C22322460%2C22322286%2C22322646%2C22322397%2C22322496%2C22322271%2C22322595%2C22322385%2C22322676%2C22322358%2C22322547%2C22322253%2C22322661%2C22322340%2C22322640%2C22322313%2C22322685%2C22322361%2C22322619%2C22322301%2C22322631%2C22322319%2C22322190%2C22322136%2C22322127%2C22322124%2C22322265%2C22322196%2C22322217%2C22322163%2C22322229%2C22322157%2C22322205%2C22322148%2C22322199%2C22322145%2C22322247%2C22322184%2C22322232%2C22322166%2C22322244%2C22322175%2C22322322%2C22322250%2C22322220%2C22322169%2C22322379%2C22322316%2C22322292%2C22322226%2C22322574%2C22322406%2C22322505%2C22322388%2C22322562%2C22322400%2C22322610%2C22322454%2C22322613%2C22322481%2C22322529%2C22322442%2C22322553%2C22322448%2C22322541%2C22322472%2C22322568%2C22322520%2C22322478%2C22322433%2C22322469%2C22322331%2C22322304%2C22322238%2C22322367%2C22322259%2C22322376%2C22322295%2C22322268%2C22322193%2C22322283%2C22322208%2C22322370%2C22322274%2C22322256%2C22322181%2C22322334%2C22322223%2C22322463%2C22322307%2C22322415%2C22322280%2C22322364%2C22322262%2C22322424%2C22322277%2C22322142%2C22322130%2C22322298%2C22322211%2C22322235%2C22322172%2C22322325%2C22322241%2C22322289%2C22322214%2C22322160%2C22322133%2C22322178%2C22322154%2C22322373%2C22322337%2C22322202%2C22322187%2C22322151%2C22322139%2C22322592%2C22322538%2C22322487%2C22322403%2C22322571%2C22322475%2C22322604%2C22322502%2C22322625%2C22322577%2C22322682%2C22322532%2C22322679%2C22322535%2C22322583%2C22322421%2C22322664%2C22322511%2C22322673%2C22322526%2C22322655%2C22322445%2C22322607%2C22322412%2C22322634%2C22322439%2C22322499%2C22322310%2C22322517%2C22322328%2C22322556%2C22322346%2C22322550%2C22322352%2C22322598%2C22322394%2C22322667%2C22322484%2C22322616%2C22322409%2C22322628%2C22322430%2C22322652%2C22322457%2C22322565%2C22322343%2C22322586%2C22322391%2C22322580%2C22322382%2C22322559%2C22322349%2C22322643%2C22322466%2C2232263
7%2C22322436%2C22322601%2C22322418%2C22322691%2C22322508%2C22322697%2C22322544%2C22322649%2C22322451%2C22322658%2C22322490%2C22322688%2C22322523%2C22322694%2C22322514%2C22322622%2C22322427%2C22322670%2C22322493%2C22322121%2C22322700%2C22322703&archive_format=zip&transaction_id=ac97c143-344b-452b-a017-006fbaddac7e

download_scripts.ipynb

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 13,
6+
"id": "c9cae81a-2aea-4fac-b3b9-8a25738e2b77",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"import requests\n",
11+
"import xml.etree.ElementTree as ET\n",
12+
"from html.parser import HTMLParser"
13+
]
14+
},
15+
{
16+
"cell_type": "code",
17+
"execution_count": null,
18+
"id": "0dbe8524-a52c-4689-bb92-a835c6141d37",
19+
"metadata": {},
20+
"outputs": [],
21+
"source": []
22+
},
23+
{
24+
"cell_type": "code",
25+
"execution_count": null,
26+
"id": "c55bc9b1-5592-4cad-933d-91062ef4450e",
27+
"metadata": {},
28+
"outputs": [],
29+
"source": []
30+
},
31+
{
32+
"cell_type": "code",
33+
"execution_count": null,
34+
"id": "7792f381-240d-4733-8807-adfcf8cd10d5",
35+
"metadata": {},
36+
"outputs": [],
37+
"source": []
38+
},
39+
{
40+
"cell_type": "code",
41+
"execution_count": null,
42+
"id": "dd9194b4-0a52-4493-974f-d117468d3f08",
43+
"metadata": {},
44+
"outputs": [],
45+
"source": []
46+
},
47+
{
48+
"cell_type": "code",
49+
"execution_count": 36,
50+
"id": "d6885753-934f-49a3-aa33-07b5dacb0504",
51+
"metadata": {},
52+
"outputs": [],
53+
"source": [
54+
"\n",
55+
"def download_html(url):\n",
56+
" try:\n",
57+
" response = requests.get(url)\n",
58+
" response.raise_for_status() # Check if the request was successful\n",
59+
" return response.text\n",
60+
" except requests.exceptions.RequestException as e:\n",
61+
" print(f\"An error occurred: {e}\")\n",
62+
" return None\n",
63+
"\n",
64+
"\n",
65+
"class MyHTMLParser(HTMLParser):\n",
66+
" def __init__(self):\n",
67+
" super().__init__()\n",
68+
" self.capture_data = False\n",
69+
" self.target_class = \"\"\n",
70+
" self.captured_data = []\n",
71+
"\n",
72+
" def handle_starttag(self, tag, attrs):\n",
73+
" if tag == \"span\":\n",
74+
" for attr in attrs:\n",
75+
" if attr[0] == \"class\" and attr[1] == self.target_class:\n",
76+
" self.capture_data = True\n",
77+
"\n",
78+
" def handle_endtag(self, tag):\n",
79+
" if tag == \"span\" and self.capture_data:\n",
80+
" self.capture_data = False\n",
81+
"\n",
82+
" def handle_data(self, data):\n",
83+
" if self.capture_data:\n",
84+
" self.captured_data.append(data)\n",
85+
"\n",
86+
" def extract_span_by_class(self, html, class_value):\n",
87+
" self.target_class = class_value\n",
88+
" self.captured_data = []\n",
89+
" self.feed(html)\n",
90+
" return self.captured_data\n"
91+
]
92+
},
93+
{
94+
"cell_type": "code",
95+
"execution_count": 31,
96+
"id": "37a457e7-fa4f-483c-84dd-bdd03f8a7888",
97+
"metadata": {},
98+
"outputs": [],
99+
"source": [
100+
"\n",
101+
"url = \"https://www.example.com\"\n",
102+
"url = \"https://filesender.aarnet.edu.au/?s=download&token=d6be9395-9fc6-427f-b565-289cb32a16e2\"\n",
103+
"html_content = download_html(url)"
104+
]
105+
},
106+
{
107+
"cell_type": "code",
108+
"execution_count": null,
109+
"id": "2c4637b2-04c8-40ad-8c0c-e2c1bb3f042f",
110+
"metadata": {},
111+
"outputs": [],
112+
"source": []
113+
},
114+
{
115+
"cell_type": "code",
116+
"execution_count": 21,
117+
"id": "8e829e58-0c4f-46a7-a876-6051cbe3c7a1",
118+
"metadata": {},
119+
"outputs": [],
120+
"source": []
121+
},
122+
{
123+
"cell_type": "code",
124+
"execution_count": 44,
125+
"id": "24c488c9-ba2a-403c-8a82-99c5ee35223a",
126+
"metadata": {},
127+
"outputs": [],
128+
"source": [
129+
"\n",
130+
"parser = MyHTMLParser()\n",
131+
"directlinks = parser.extract_span_by_class(html_content, \"directlink\")\n",
132+
"directlinks = [x.split(\"Direct Link: \")[1].strip() for x in directlinks]\n",
133+
"token = url.split(\"&token=\")[1]\n",
134+
"fileids = [x.split(\"&files_ids=\")[1] for x in directlinks]\n"
135+
]
136+
},
137+
{
138+
"cell_type": "code",
139+
"execution_count": null,
140+
"id": "60975e26-2bfc-4fd1-bcd1-dd2478be173f",
141+
"metadata": {},
142+
"outputs": [],
143+
"source": []
144+
},
145+
{
146+
"cell_type": "code",
147+
"execution_count": null,
148+
"id": "d973a5e6-c9b4-4e7d-80f1-6be0557c92a9",
149+
"metadata": {},
150+
"outputs": [],
151+
"source": []
152+
},
153+
{
154+
"cell_type": "code",
155+
"execution_count": null,
156+
"id": "9d855a73-8007-4a75-b5e7-5ca7aae01c3b",
157+
"metadata": {},
158+
"outputs": [],
159+
"source": []
160+
},
161+
{
162+
"cell_type": "code",
163+
"execution_count": null,
164+
"id": "ba26a9ca-02db-4041-99c3-264982b238b4",
165+
"metadata": {},
166+
"outputs": [],
167+
"source": []
168+
}
169+
],
170+
"metadata": {
171+
"kernelspec": {
172+
"display_name": "Python 3 (ipykernel)",
173+
"language": "python",
174+
"name": "python3"
175+
},
176+
"language_info": {
177+
"codemirror_mode": {
178+
"name": "ipython",
179+
"version": 3
180+
},
181+
"file_extension": ".py",
182+
"mimetype": "text/x-python",
183+
"name": "python",
184+
"nbconvert_exporter": "python",
185+
"pygments_lexer": "ipython3",
186+
"version": "3.10.14"
187+
}
188+
},
189+
"nbformat": 4,
190+
"nbformat_minor": 5
191+
}

0 commit comments

Comments
 (0)