From 04218cfdefbbac7522e354f6ec02247c5252d8cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?= Date: Thu, 16 May 2024 17:20:19 +0100 Subject: [PATCH] Parse soundcloud album --- beetsplug/bandcamp/__init__.py | 41 +++++++++----- beetsplug/bandcamp/helpers.py | 47 ++++++++++++++-- beetsplug/bandcamp/metaguru.py | 40 +++++++++++--- beetsplug/bandcamp/soundcloud.py | 93 +++++++++++++++++++++----------- poetry.lock | 47 +++++++++++++--- pyproject.toml | 2 +- 6 files changed, 204 insertions(+), 66 deletions(-) diff --git a/beetsplug/bandcamp/__init__.py b/beetsplug/bandcamp/__init__.py index 81698d4..b016617 100644 --- a/beetsplug/bandcamp/__init__.py +++ b/beetsplug/bandcamp/__init__.py @@ -26,9 +26,11 @@ import json import logging import re +from contextlib import suppress from functools import lru_cache, partial from itertools import chain from operator import itemgetter, truth +from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union import requests @@ -198,6 +200,28 @@ def candidates(self, items, artist, album, va_likely, extra_tags=None): """Return a sequence of AlbumInfo objects that match the album whose items are provided or are being searched. """ + from pprint import pprint + + url = items[0].comments + parent_dir = Path(items[0].path.decode()).parent + with suppress(StopIteration): + playlist_info_path = next(parent_dir.glob("Playlist_*")) + with open(playlist_info_path) as f: + playlist_info = json.load(f) + + playlist_info["tracks"] = [] + for track_info_path in set(parent_dir.glob("*.info.json")) - { + playlist_info_path + }: + with open(track_info_path) as f: + track_data = {**json.load(f), "path": str(track_info_path)} + playlist_info["tracks"].append(track_data) + + pprint(playlist_info) + + # if url.startswith("https://"): + # yield from self.get_album_info(url) + label = "" if items and album == items[0].album and artist == items[0].albumartist: label = items[0].label @@ -260,16 +284,6 @@ def track_for_id(self, track_id: str) -> Optional[TrackInfo]: self._info("Not a bandcamp URL, skipping") return None - def handle(self, guru: Metaguru, attr: str, _id: str) -> Any: - try: - return getattr(guru, attr) - except (KeyError, ValueError, AttributeError, IndexError): - self._info("Failed obtaining {}", _id) - except Exception: # pylint: disable=broad-except - url = "https://github.com/snejus/beetcamp/issues/new" - self._exc("Unexpected error obtaining {}, please report at {}", _id, url) - return None - def get_album_info(self, url: str) -> Optional[List[AlbumInfo]]: """Return an AlbumInfo object for a bandcamp album page. If track url is given by mistake, find and fetch the album url instead. @@ -296,8 +310,8 @@ def _get_soundcloud_data(self, url: str) -> AlbumInfo | TrackInfo | None: sc_data_key = "sound" method = get_soundcloud_track - self._info("Fetching data from soundcloud url {} as {}", url, _type) - data = re.search(r"\[\{[^<]+[^;<)]", self._get(url)) + self._info("Fetching data from soundcloud url {}", url) + data = re.search(r"\[.*hydratable.*\]", self._get(url)) if not data: return None @@ -311,8 +325,7 @@ def get_track_info(self, url: str) -> Optional[TrackInfo]: if track: return track - guru = self.guru(url, "singleton") - return self.handle(guru, "singleton", url) if guru else None + return self.guru(url, "singleton") def _search(self, data: JSONDict) -> Iterable[JSONDict]: """Return a list of track/album URLs of type search_type matching the query.""" diff --git a/beetsplug/bandcamp/helpers.py b/beetsplug/bandcamp/helpers.py index 7fc830c..8a9e2e8 100644 --- a/beetsplug/bandcamp/helpers.py +++ b/beetsplug/bandcamp/helpers.py @@ -6,6 +6,7 @@ from typing import Any, Dict, Iterable, List, NamedTuple, Pattern from beets.autotag.hooks import AlbumInfo +from beets.ui import log from ordered_set import OrderedSet as ordset from .genres_lookup import GENRES @@ -262,6 +263,10 @@ def valid_for_mode(kw: str) -> bool: return valid_mb_genre(kw) or valid_mb_genre(list(words)[-1]) unique_genres: ordset[str] = ordset() + keywords = set(keywords) + for kw in list(keywords): + keywords.add(kw.replace(" ", "-")) + keywords.add(kw.replace("-", " ")) # expand badly delimited keywords split_kw = partial(re.split, r"[.] | #| - ") for kw in it.chain.from_iterable(map(split_kw, keywords)): @@ -280,13 +285,17 @@ def duplicate(genre: str) -> bool: others = others.union(x.replace(" ", "").replace("-", "") for x in others) # type: ignore[attr-defined] # noqa return any(genre in x for x in others) - return it.filterfalse(duplicate, unique_genres) + return list(it.filterfalse(duplicate, unique_genres)) @staticmethod - def unpack_props(obj: JSONDict) -> JSONDict: + def unpack_props(obj: Any) -> Any: """Add all 'additionalProperty'-ies to the parent dictionary.""" - for prop in obj.get("additionalProperty") or []: - obj[prop["name"]] = prop["value"] + if isinstance(obj, dict): + for prop in obj.pop("additionalProperty", []): + obj[prop["name"]] = prop["value"] + return {k: Helpers.unpack_props(v) for k, v in obj.items()} + if isinstance(obj, list): + return [Helpers.unpack_props(item) for item in obj] return obj @staticmethod @@ -363,3 +372,33 @@ def get_medium_total(medium: int) -> int: else: medium_index += 1 return album + + @staticmethod + def parse_additional_fields(meta: str, field_patterns: JSONDict) -> JSONDict: + additional_fields = {} + for field, pattern_item in field_patterns.items(): + # log.debug("Parsing [b]{}[/]", field) + try: + pat = pattern_item["pattern"] + if len(pat.splitlines()) > 1: + matches = list(re.finditer(pat, meta, re.VERBOSE)) + else: + matches = list(re.finditer(pat, meta)) + # log.debug("\n".join(map(str, matches))) + if matches: + if "replace" in pattern_item: + log.info(str(matches[0].expand(pattern_item["replace"]))) + value = matches[0].expand(pattern_item["replace"]) + elif "replace_expr" in pattern_item: + value = eval( + pattern_item["replace_expr"], + {"matches": matches, "match": matches[0]}, + ) + else: + value = matches[0].group() + if isinstance(value, str): + value = value.replace("\r", "").strip() + additional_fields[field] = value + except Exception: + log.error("Failed parsing {}", field, exc_info=True) + return additional_fields diff --git a/beetsplug/bandcamp/metaguru.py b/beetsplug/bandcamp/metaguru.py index bec8abe..28c52e6 100644 --- a/beetsplug/bandcamp/metaguru.py +++ b/beetsplug/bandcamp/metaguru.py @@ -1,4 +1,5 @@ """Module for parsing bandcamp metadata.""" + import itertools as it import json import operator as op @@ -6,7 +7,7 @@ import sys from collections import Counter from datetime import date, datetime -from functools import partial +from functools import partial, singledispatch from typing import Any, Dict, Iterable, List, Optional, Set from unicodedata import normalize @@ -15,9 +16,9 @@ from beets.autotag.hooks import AlbumInfo, TrackInfo from pycountry import countries, subdivisions +from .album import AlbumName from .helpers import PATTERNS, Helpers, MediaInfo from .tracks import Track, Tracks -from .album import AlbumName if sys.version_info.minor > 7: from functools import cached_property # pylint: disable=ungrouped-imports @@ -73,7 +74,7 @@ def from_html(cls, html: str, config: Optional[JSONDict] = None) -> "Metaguru": except AttributeError as exc: raise AttributeError("Could not find release metadata JSON") from exc else: - return cls(json.loads(meta), config) + return cls(cls.unpack_props(json.loads(meta)), config) @cached_property def excluded_fields(self) -> Set[str]: @@ -322,11 +323,11 @@ def is_comp(self) -> bool: def first_one(artist: str) -> str: return PATTERNS["split_artists"].split(artist.replace(" & ", ", "))[0] - truly_unique = set(map(first_one, self.tracks.artists)) - return ( + artist_count = len(set(map(first_one, self.tracks.artists))) + return artist_count > 1 and ( self._album_name.mentions_compilation or self._search_albumtype("compilation") - or (len(truly_unique) > 3 and len(self.tracks) > 4) + or (artist_count > 3 and len(self.tracks) > 4) ) @cached_property @@ -348,7 +349,7 @@ def albumtype(self) -> str: return "album" @cached_property - def albumtypes(self) -> str: + def albumtypes(self) -> list[str]: albumtypes = {self.albumtype} if self.is_comp: if self.albumtype == "ep": @@ -365,7 +366,7 @@ def albumtypes(self) -> str: if len(self.tracks.remixers) == len(self.tracks): albumtypes.add("remix") - return "; ".join(sorted(albumtypes)) + return sorted(albumtypes) @cached_property def va(self) -> bool: @@ -416,16 +417,39 @@ def get_fields(self, fields: Iterable[str], src: object = None) -> JSONDict: return {field: getattr(self, field)} return dict(zip(fields, iter(op.attrgetter(*fields)(src or self)))) + @cached_property + def parseable_meta(self) -> str: + @singledispatch + def to_text(x: Any, key: str = "") -> str: + return f"{key}: {x}".replace("\r", "") + "\r\n" + + @to_text.register(dict) + def _(x: JSONDict, key: str = "") -> str: + return "".join([to_text(v, f"{key}.{k}") for k, v in x.items()]) + + @to_text.register(list) + def _(x: List[Any], key: str = "") -> str: + return "".join([to_text(v, f"{key}[{i}]") for i, v in enumerate(x)]) + + return to_text(self.meta) + @property def _common_album(self) -> JSONDict: common_data: JSONDict = {"album": self.album_name} fields = ["label", "catalognum", "albumtype", "country"] if NEW_BEETS: fields.extend(["genre", "style", "comments", "albumtypes"]) + common_data.update(self.get_fields(fields)) reldate = self.release_date if reldate: common_data.update(self.get_fields(["year", "month", "day"], reldate)) + if "field_patterns" in self.config: + common_data.update( + self.parse_additional_fields( + self.parseable_meta, self.config["field_patterns"] + ) + ) return common_data diff --git a/beetsplug/bandcamp/soundcloud.py b/beetsplug/bandcamp/soundcloud.py index e04adc9..c196abc 100644 --- a/beetsplug/bandcamp/soundcloud.py +++ b/beetsplug/bandcamp/soundcloud.py @@ -43,7 +43,7 @@ class ParsedTrack(BaseModel): @cached_property def data(self) -> JSONDict: - return {k: v for k, v in self.dict().items() if k != "live" and v} + return {k: v for k, v in self.dict().items() if k != "live"} def parse_title(source: str, title: str, artist: str) -> ParsedTrack: @@ -76,6 +76,8 @@ def parse_title(source: str, title: str, artist: str) -> ParsedTrack: rf"^{album_pat} {index_pat}{_delim}{artist_pat}$", # SACHSENTRANCE PODCAST rf"^{artist_pat}{_delim}{album_pat} {index_pat}$", + # PURE Guest + rf"^{album_pat} Guest[.]{index_pat} {artist_pat}$", ): # print(pat) m = re.search(pat, title) @@ -135,8 +137,8 @@ class Visuals(BaseModel): class BasicUser(SCEntity): avatar_url: str # "https://i1.sndcdn.com/avatars-VdiyiKIAvTrN0eFz-bPJOIg-large.jpg" badges: dict[str, bool] # {"pro": false, "pro_unlimited": true, "verified": false} - city: str # "Berlin" - country_code: str # "DE" + city: str | None # "Berlin" + country_code: str | None # "DE" first_name: str # "" followers_count: int # 5982 full_name: str # "" @@ -168,19 +170,21 @@ class User(BasicUser): JSONDict ] # [{"product": {"id": "creator-pro-unlimited"}}] creator_subscription: JSONDict # {"product": {"id": "creator-pro-unlimited"}} - description: str # "✧ ☆ H<3core-Poet ☆ ✧ \n\n🌎booking via paolo@moonagency.xyz\n\nSYNDIKAET\nASYLUM \nDEESTRICTED \nENIGMA \nEHRENKLUB\nPARA//E/ \nPUBLIC ENERGY\n240KMH\n\n\n\nHigh in the ethereal skies, a majestic crystal castle stands, its translucent walls reflecting a kaleidoscope of colors. Within its walls, a hidden realm of enchantment unfolds, where whimsical fairies dance on shimmering petals, weaving dreams with their delicate wings.\nThis fairy world sparkles with magic, where laughter and wonder embrace every corner, and imagination reigns supreme.\n\nAexhy has been a long time in the scene as a dj which motivated him to push further and start his career as a producer. In his Productions and in his sets you can clearly notice what Aexhy is made of. Playfully guiding you through all styles which inspire him, aexhy pushes boundaries and combines everything in a playful style to suprise you each minute. Constantly rising energy and letting it drop just to change to another style to capture your emotions and make you feel something.\n__________________________________✘✘✘_________________\nalso performing as:\n\"Space Cowboys\" with Trancemaster Krause\n\"SAEXHY\" with SACID" + description: ( + str | None + ) # "✧ ☆ H<3core-Poet ☆ ✧ \n\n🌎booking via paolo@moonagency.xyz\n\nSYNDIKAET\nASYLUM \nDEESTRICTED \nENIGMA \nEHRENKLUB\nPARA//E/ \nPUBLIC ENERGY\n240KMH\n\n\n\nHigh in the ethereal skies, a majestic crystal castle stands, its translucent walls reflecting a kaleidoscope of colors. Within its walls, a hidden realm of enchantment unfolds, where whimsical fairies dance on shimmering petals, weaving dreams with their delicate wings.\nThis fairy world sparkles with magic, where laughter and wonder embrace every corner, and imagination reigns supreme.\n\nAexhy has been a long time in the scene as a dj which motivated him to push further and start his career as a producer. In his Productions and in his sets you can clearly notice what Aexhy is made of. Playfully guiding you through all styles which inspire him, aexhy pushes boundaries and combines everything in a playful style to suprise you each minute. Constantly rising energy and letting it drop just to change to another style to capture your emotions and make you feel something.\n__________________________________✘✘✘_________________\nalso performing as:\n\"Space Cowboys\" with Trancemaster Krause\n\"SAEXHY\" with SACID" followings_count: int # 524 groups_count: int # 0 - likes_count: int # 1472 + likes_count: int | None # 1472 playlist_likes_count: int # 91 playlist_count: int # 12 reposts_count: int | None # null track_count: int # 83 - visuals: Visuals + visuals: Visuals | None @property - def visual_url(self) -> str: - return self.visuals.visuals[0].visual_url + def visual_url(self) -> str | None: + return self.visuals.visuals[0].visual_url if self.visuals else None class SCMedia(SCEntity): @@ -189,7 +193,7 @@ class SCMedia(SCEntity): info_type: ClassVar[type[TrackInfo | AlbumInfo]] artwork_url: ( - str # "https://i1.sndcdn.com/artworks-1JYcoqeTzmZQOzYk-5ZSI7g-large.jpg" + str | None # "https://i1.sndcdn.com/artworks-1JYcoqeTzmZQOzYk-5ZSI7g-large.jpg" ) created_at: datetime # "2022-09-09T22:09:09Z" description: str # "" @@ -198,7 +202,7 @@ class SCMedia(SCEntity): embeddable_by: str # "all" label_name: str | None # null license: str # "all-rights-reserved" - likes_count: int # 115 + likes_count: int | None # 115 permalink: str # "02-aexhy-x-dj-traytex-glasversteck-fallen-shrine-nxc" public: bool # true purchase_title: str | None # null @@ -223,10 +227,9 @@ def artist(self) -> str: @property def data(self) -> JSONDict: return { - "albumstatus": "Official", "artist": self.artist, "artist_id": self.user.urn, - "artwork_url": self.artwork_url.replace("-large", "-t500x500"), + "artwork_url": (self.artwork_url or "").replace("-large", "-t500x500"), "comments": self.description or None, "country": self.user.country, "data_source": self.DATA_SOURCE, @@ -248,22 +251,24 @@ class PlaylistTrack(SCMedia): caption: str | None # null commentable: bool # true - comment_count: int # 4 + comment_count: int | None # 4 downloadable: bool # false - download_count: int # 0 + download_count: int | None # 0 full_duration: int # 173610 has_downloads_left: bool # false media: JSONDict monetization_model: str # "NOT_APPLICABLE" - playback_count: int # 5202 + playback_count: int | None # 5202 policy: str # "ALLOW" - publisher_metadata: JSONDict # {"id": int # 1341013456, "urn": str # "soundcloud:tracks:1341013456", "contains_music": bool # true} + publisher_metadata: ( + JSONDict | None + ) # {"id": int # 1341013456, "urn": str # "soundcloud:tracks:1341013456", "contains_music": bool # true} state: str # "finished" station_permalink: str # "track-stations:1341013456" station_urn: str # "soundcloud:system-playlists:track-stations:1341013456" streamable: bool # true track_authorization: str # "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJnZW8iOiJHQiIsInN1YiI6IiIsInJpZCI6IjBiZjA3OWRhLTI5ZGQtNDFkYi1hYzIxLTBmYjdiOTBkMzIyMCIsImlhdCI6MTcwODI2MjQxMX0.AmCfYHJ-jY4J6b6dEs91SF0dtyYZsfm4vnecMX29JaI" - track_format: str # "single-track" + # track_format: str | None # "single-track" urn: str # "soundcloud:tracks:1341013438" visuals: Visuals | None # null waveform_url: str # "https://wave.sndcdn.com/6FiJyRHgHJQ1_m.json" @@ -289,19 +294,29 @@ def data(self) -> JSONDict: class ReleaseMixin(BaseModel): - original_genre: str = Field(validation_alias="genre") # "" config: JSONDict + @cached_property + def albumtypes(self) -> list[str]: + return [self.albumtype] + @cached_property def genre(self) -> str: - keywords = list(map(str.casefold, re.split(r" ?[-,/] ", self.original_genre))) + keywords = [ + item.casefold().replace("\\", "") + for item in re.split(r" ?[-,/&] ", self.original_genre) + ] return ", ".join(Helpers.get_genre(keywords, self.config, "")) @property def data(self) -> JSONDict: return { **super().data, + "albumstatus": "Official", + "albumtype": self.albumtype, + "albumtypes": self.albumtypes, + "city": self.user.city, "genre": self.genre, } @@ -315,7 +330,7 @@ def albumtype(self) -> str: @cached_property def albumtypes(self) -> list[str]: - albumtypes = [self.albumtype] + albumtypes = super().albumtypes if self.albumtype == "broadcast": albumtypes.append("dj-mix") @@ -335,9 +350,8 @@ def albumartist(self) -> str: def data(self) -> JSONDict: return { **super().data, + "album": self.parsed_track.album, "albumartist": self.albumartist, - "albumtype": self.albumtype, - "albumtypes": self.albumtypes, "visual_url": self.user.visual_url, } @@ -347,20 +361,40 @@ class Playlist(ReleaseMixin, SCMedia): is_album: bool # true managed_by_feeds: bool # false - published_at: datetime # "2022-09-09T22:10:03Z" - set_type: str # "album" + published_at: datetime | None # "2022-09-09T22:10:03Z" + set_type: Literal["album", "compilation"] # "album" tracks: list[OnErrorOmit[PlaylistTrack]] track_count: int # 5 url: str # "/aexhy/sets/fallen-shrine-s-bday-present-4" user: User + @cached_property + def album(self) -> str: + return self.title + + @cached_property + def album_id(self) -> str: + return self.permalink_url + + @cached_property + def albumtype(self) -> str: + return "album" if len({t.artist for t in self.tracks}) == 1 else self.set_type + @property def data(self) -> JSONDict: + tracks = [t.info for t in self.tracks] + for idx, track in enumerate(tracks, 1): + track.index = track.medium_index = idx + track.medium_total = self.track_count + track.album = self.album + track.album_id = self.album_id + return { **super().data, - "album": self.title, - "album_id": self.permalink_url, - "tracks": [t.info for t in self.tracks], + "album": self.album, + "album_id": self.album_id, + "tracks": tracks, + "medium_total": self.track_count, } @@ -369,7 +403,4 @@ def get_soundcloud_track(data: JSONDict, config: IncludeLazyConfig) -> TrackInfo def get_soundcloud_album(data: JSONDict, config: IncludeLazyConfig) -> AlbumInfo: - playlist = Playlist(**data, config=config) - from pprint import pprint - pprint(playlist.dict()) - return playlist.info + return Playlist(**data, config=config).info diff --git a/poetry.lock b/poetry.lock index 1bb9c0d..8de3240 100644 --- a/poetry.lock +++ b/poetry.lock @@ -412,6 +412,17 @@ attrs = "*" eradicate = ">=2.0,<3.0" flake8 = ">5" +[[package]] +name = "funcy" +version = "2.0" +description = "A fancy and practical functional tools" +optional = false +python-versions = "*" +files = [ + {file = "funcy-2.0-py2.py3-none-any.whl", hash = "sha256:53df23c8bb1651b12f095df764bfb057935d49537a56de211b098f4c79614bb0"}, + {file = "funcy-2.0.tar.gz", hash = "sha256:3963315d59d41c6f30c04bc910e10ab50a3ac4a225868bfa96feed133df075cb"}, +] + [[package]] name = "idna" version = "3.6" @@ -1161,22 +1172,26 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"] [[package]] name = "rich-tables" -version = "0.3.0" +version = "0.4.0a1" description = "Ready-made rich tables for various purposes" optional = false -python-versions = ">=3.7,<4" -files = [ - {file = "rich_tables-0.3.0-py3-none-any.whl", hash = "sha256:364bbb1e8da7166aac675dfaad082be4b902534de9e1a35b39a1a907e3c62998"}, - {file = "rich_tables-0.3.0.tar.gz", hash = "sha256:0dc5f08c82565fc3f59b29aac4fb49d727d2c062a90057f09d52ed23f3baafde"}, -] +python-versions = ">=3.8,<4" +files = [] +develop = true [package.dependencies] +funcy = "^2.0" multimethod = "*" rich = ">=12.3.0" +sqlparse = ">=0.4.4" +typing-extensions = ">=4.7.1" [package.extras] hue = ["rgbxy (>=0.5)"] -sql = ["sqlparse (>=0.4.4,<0.5.0)"] + +[package.source] +type = "directory" +url = "../rich-tables" [[package]] name = "six" @@ -1189,6 +1204,22 @@ files = [ {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] +[[package]] +name = "sqlparse" +version = "0.4.4" +description = "A non-validating SQL parser." +optional = false +python-versions = ">=3.5" +files = [ + {file = "sqlparse-0.4.4-py3-none-any.whl", hash = "sha256:5430a4fe2ac7d0f93e66f1efc6e1338a41884b7ddf2a350cedd20ccc4d9d28f3"}, + {file = "sqlparse-0.4.4.tar.gz", hash = "sha256:d446183e84b8349fa3061f0fe7f06ca94ba65b426946ffebe6e3e8295332420c"}, +] + +[package.extras] +dev = ["build", "flake8"] +doc = ["sphinx"] +test = ["pytest", "pytest-cov"] + [[package]] name = "tomli" version = "2.0.1" @@ -1304,4 +1335,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.8, <4" -content-hash = "c0b1ae9038b046ae9ada5bc35a6b0ebc0256012e16480b45beb60f907db2a8b2" +content-hash = "0020c667742542a57cc7020cedc79433555c15757d1a3257379021af4d74d854" diff --git a/pyproject.toml b/pyproject.toml index 0ef2d5c..b8a3786 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,7 @@ pytest-cov = ">=2.10.1" pytest-randomly = ">=3.10" pytest-lazy-fixture = ">=0.6.3" pytest-xdist = ">=3.5.0" -rich-tables = "*" +rich-tables = { path = "../rich-tables", develop = true } types-setuptools = ">=57.0.0" types-requests = ">=2.25.0" types-six = ">=0.1.7"