Skip to content

Commit 46c51eb

Browse files
ThinkChaos authored and snejus committed
fix: use httpx instead of requests to avoid Bandcamp blocking
When using requests/urllib3, Bandcamp responds to all requests with 403 errors. Investigating why, I tried:
- using curl to send the same request: it worked
- writing a tiny Python script to `GET bandcamp.com/` with requests: it failed with 403
- waiting a week to see if it solved itself: no luck
- changing the above-mentioned script to use http.client or httpx: it worked

I think that in this case, Bandcamp's Web Application Firewall (WAF) blocks the requests based not on their contents but on an artifact of how urllib3 builds/sends the data, since curl with the exact same headers works. Instead of trying to identify the exact reason (which is quite hard without any info on Bandcamp's WAF) and then fixing or working around it, I rewrote the very little required HTTP code to use httpx and sidestep the issue.
1 parent ffacfe1 commit 46c51eb

File tree

6 files changed

+122
-177
lines changed

6 files changed

+122
-177
lines changed

CHANGELOG.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44

55
### Fixed
66

7-
- `search`: properly escape query strings for better results with special characters
7+
- `search`:
8+
- properly escape query strings for better results with special characters
9+
- change HTTP client implementation to avoid Bandcamp "403 Forbidden" responses
810

911
## [0.19.1] 2024-05-10
1012

beetsplug/bandcamp/__init__.py

+4-13
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,15 @@
2222
import re
2323
from contextlib import contextmanager
2424
from functools import lru_cache, partial
25-
from html import unescape
2625
from itertools import chain
2726
from operator import itemgetter
2827
from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Literal, Sequence
2928

30-
import requests
31-
from beets import IncludeLazyConfig, __version__, config, library, plugins
29+
from beets import IncludeLazyConfig, config, library, plugins
3230

3331
from beetsplug import fetchart # type: ignore[attr-defined]
3432

33+
from .http import HTTPError, http_get_text
3534
from .metaguru import Metaguru
3635
from .search import search_bandcamp
3736

@@ -57,12 +56,6 @@
5756

5857
ALBUM_URL_IN_TRACK = re.compile(r'<a id="buyAlbumLink" href="([^"]+)')
5958
LABEL_URL_IN_COMMENT = re.compile(r"Visit (https:[\w/.-]+\.[a-z]+)")
60-
USER_AGENT = f"beets/{__version__} +http://beets.radbox.org/"
61-
62-
63-
@lru_cache(maxsize=None)
64-
def get_response(url: str) -> requests.Response:
65-
return requests.get(url, headers={"User-Agent": USER_AGENT})
6659

6760

6861
class BandcampRequestsHandler:
@@ -79,13 +72,11 @@ def _info(self, msg_template: str, *args: Sequence[str]) -> None:
7972

8073
def _get(self, url: str) -> str:
8174
"""Return text contents of the url response."""
82-
response = get_response(url)
8375
try:
84-
response.raise_for_status()
85-
except requests.HTTPError as e:
76+
return http_get_text(url)
77+
except HTTPError as e:
8678
self._info("{}", e)
8779
return ""
88-
return unescape(response.text)
8980

9081
def guru(self, url: str) -> Metaguru:
9182
return Metaguru.from_html(self._get(url), config=self.config.flatten())

beetsplug/bandcamp/http.py

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
from functools import lru_cache
2+
from html import unescape
3+
from urllib.parse import urlsplit
4+
5+
from beets import __version__
6+
import httpx
7+
8+
HTTPError = httpx.HTTPError
9+
10+
USER_AGENT = f"beets/{__version__} +https://beets.io/"
11+
12+
_client = httpx.Client(headers={"User-Agent": USER_AGENT})
13+
14+
@lru_cache(maxsize=None)
15+
def http_get_text(url: str) -> str:
16+
"""Return text contents of the url."""
17+
18+
response = _client.get(url)
19+
response.raise_for_status()
20+
21+
return unescape(response.text)

beetsplug/bandcamp/search.py

+2-8
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from typing import Any, Callable, Dict, List
88
from urllib.parse import quote_plus
99

10-
import requests
10+
from .http import http_get_text
1111

1212
JSONDict = Dict[str, Any]
1313
SEARCH_URL = "https://bandcamp.com/search?page={}&q={}"
@@ -95,17 +95,11 @@ def parse_and_sort_results(html: str, **kwargs: str) -> List[JSONDict]:
9595
return [{"index": i + 1, **r} for i, r in enumerate(results)]
9696

9797

98-
def get_url(url: str) -> str:
99-
response = requests.get(url)
100-
response.raise_for_status()
101-
return unescape(response.text)
102-
103-
10498
def search_bandcamp(
10599
query: str = "",
106100
search_type: str = "",
107101
page: int = 1,
108-
get: Callable[[str], str] = get_url,
102+
get: Callable[[str], str] = http_get_text,
109103
**kwargs: Any,
110104
) -> List[JSONDict]:
111105
"""Return a list with item JSONs of type search_type matching the query."""

0 commit comments

Comments
 (0)