diff --git a/beetsplug/_typing.py b/beetsplug/_typing.py
new file mode 100644
index 0000000000..915ea77e80
--- /dev/null
+++ b/beetsplug/_typing.py
@@ -0,0 +1,115 @@
+from __future__ import annotations
+
+from typing import Any
+
+from typing_extensions import NotRequired, TypedDict
+
+JSONDict = dict[str, Any]
+
+
+class LRCLibAPI:
+    class Item(TypedDict):
+        """Lyrics data item returned by the LRCLib API."""
+
+        id: int
+        name: str
+        trackName: str
+        artistName: str
+        albumName: str
+        duration: float | None
+        instrumental: bool
+        plainLyrics: str
+        syncedLyrics: str | None
+
+
+class GeniusAPI:
+    """Genius API data types.
+
+    This documents *only* the fields that are used in the plugin.
+    :attr:`SearchResult` is an exception: it keeps some extra fields that
+    may prove useful in the future.
+    """
+
+    class DateComponents(TypedDict):
+        year: int
+        month: int
+        day: int
+
+    class Artist(TypedDict):
+        api_path: str
+        header_image_url: str
+        id: int
+        image_url: str
+        is_meme_verified: bool
+        is_verified: bool
+        name: str
+        url: str
+
+    class Stats(TypedDict):
+        unreviewed_annotations: int
+        hot: bool
+
+    class SearchResult(TypedDict):
+        annotation_count: int
+        api_path: str
+        artist_names: str
+        full_title: str
+        header_image_thumbnail_url: str
+        header_image_url: str
+        id: int
+        lyrics_owner_id: int
+        lyrics_state: str
+        path: str
+        primary_artist_names: str
+        pyongs_count: int | None
+        relationships_index_url: str
+        release_date_components: GeniusAPI.DateComponents
+        release_date_for_display: str
+        release_date_with_abbreviated_month_for_display: str
+        song_art_image_thumbnail_url: str
+        song_art_image_url: str
+        stats: GeniusAPI.Stats
+        title: str
+        title_with_featured: str
+        url: str
+        featured_artists: list[GeniusAPI.Artist]
+        primary_artist: GeniusAPI.Artist
+        primary_artists: list[GeniusAPI.Artist]
+
+    class SearchHit(TypedDict):
+        result: GeniusAPI.SearchResult
+
+    class SearchResponse(TypedDict):
+        hits: list[GeniusAPI.SearchHit]
+
+    class Search(TypedDict):
+        response: GeniusAPI.SearchResponse
+
+
+class GoogleCustomSearchAPI:
+    class Response(TypedDict):
+        """Search response from the Google Custom Search API.
+
+        If the search returns no results, the :attr:`items` field is absent.
+        """
+
+        items: NotRequired[list[GoogleCustomSearchAPI.Item]]
+
+    class Item(TypedDict):
+        """A Google Custom Search API result item.
+
+        The :attr:`title` field is shown to the user in the search interface,
+        so longer titles get truncated with an ellipsis. For most results,
+        the full title is available in the ``og:title`` metatag found under
+        the :attr:`pagemap` field. Note that neither this metatag nor the
+        ``pagemap`` field is guaranteed to be present in the data.
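+
+        For example, the full title can be read, with fallbacks for missing
+        data, as
+        ``item.get("pagemap", {}).get("metatags", [{}])[0].get("og:title")``,
+        which is how the plugin's Google backend reads it.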
+ """ + + title: str + link: str + pagemap: NotRequired[GoogleCustomSearchAPI.Pagemap] + + class Pagemap(TypedDict): + """Pagemap data with a single meta tags dict in a list.""" + + metatags: list[JSONDict] diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index d1d715ce46..1732edbf77 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -16,52 +16,35 @@ from __future__ import annotations -import difflib +import atexit import errno import itertools -import json +import math import os.path import re -import struct -import unicodedata -import warnings -from contextlib import suppress +from contextlib import contextmanager, suppress from dataclasses import dataclass from functools import cached_property, partial, total_ordering +from html import unescape from http import HTTPStatus -from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator -from urllib.parse import quote, urlencode +from typing import TYPE_CHECKING, Iterable, Iterator, NamedTuple +from urllib.parse import quote, quote_plus, urlencode, urlparse +import langdetect import requests -from typing_extensions import TypedDict +from bs4 import BeautifulSoup from unidecode import unidecode import beets from beets import plugins, ui +from beets.autotag.hooks import string_dist if TYPE_CHECKING: from beets.importer import ImportTask from beets.library import Item -try: - import bs4 - from bs4 import SoupStrainer + from ._typing import GeniusAPI, GoogleCustomSearchAPI, JSONDict, LRCLibAPI - HAS_BEAUTIFUL_SOUP = True -except ImportError: - HAS_BEAUTIFUL_SOUP = False - -try: - import langdetect - - HAS_LANGDETECT = True -except ImportError: - HAS_LANGDETECT = False - -DIV_RE = re.compile(r"<(/?)div>?", re.I) -COMMENT_RE = re.compile(r"<!--.*-->", re.S) -TAG_RE = re.compile(r"<[^>]*>") -BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I) USER_AGENT = f"beets/{beets.__version__}" INSTRUMENTAL_LYRICS = "[Instrumental]" @@ -105,37 +88,36 @@ class NotFoundError(requests.exceptions.HTTPError): pass -# Utilities. +class CaptchaError(requests.exceptions.HTTPError): + pass + +class TimeoutSession(requests.Session): + def request(self, *args, **kwargs): + """Wrap the request method to raise an exception on HTTP errors.""" + kwargs.setdefault("timeout", 10) + r = super().request(*args, **kwargs) + if r.status_code == HTTPStatus.NOT_FOUND: + raise NotFoundError("HTTP Error: Not Found", response=r) + if 300 <= r.status_code < 400: + raise CaptchaError("Captcha is required", response=r) + + r.raise_for_status() -def unichar(i): - try: - return chr(i) - except ValueError: - return struct.pack("i", i).decode("utf-32") + return r -def unescape(text): - """Resolve &#xxx; HTML entities (and some others).""" - if isinstance(text, bytes): - text = text.decode("utf-8", "ignore") - out = text.replace(" ", " ") +r_session = TimeoutSession() +r_session.headers.update({"User-Agent": USER_AGENT}) - def replchar(m): - num = m.group(1) - return unichar(int(num)) - out = re.sub("&#(\\d+);", replchar, out) - return out +@atexit.register +def close_session(): + """Close the requests session on shut down.""" + r_session.close() -def extract_text_between(html, start_marker, end_marker): - try: - _, html = html.split(start_marker, 1) - html, _ = html.split(end_marker, 1) - except ValueError: - return "" - return html +# Utilities. def search_pairs(item): @@ -176,10 +158,20 @@ def generate_alternatives(string, patterns): # Remove any featuring artists from the artists name rf"(.*?) 
{plugins.feat_tokens()}" ] - artists = generate_alternatives(artist, patterns) + + # Skip various artists + artists = [] + lower_artist = artist.lower() + if "various" not in lower_artist: + artists.extend(generate_alternatives(artist, patterns)) # Use the artist_sort as fallback only if it differs from artist to avoid # repeated remote requests with the same search terms - if artist_sort and artist.lower() != artist_sort.lower(): + artist_sort_lower = artist_sort.lower() + if ( + artist_sort + and lower_artist != artist_sort_lower + and "various" not in artist_sort_lower + ): artists.append(artist_sort) patterns = [ @@ -198,13 +190,13 @@ def generate_alternatives(string, patterns): multi_titles = [] for title in titles: multi_titles.append([title]) - if "/" in title: - multi_titles.append([x.strip() for x in title.split("/")]) + if " / " in title: + multi_titles.append([x.strip() for x in title.split(" / ")]) return itertools.product(artists, multi_titles) -def slug(text): +def slug(text: str) -> str: """Make a URL-safe, human-readable version of the given text This will do the following: @@ -214,79 +206,78 @@ def slug(text): 3. strip whitespace 4. replace other non-word characters with dashes 5. strip extra dashes - - This somewhat duplicates the :func:`Google.slugify` function but - slugify is not as generic as this one, which can be reused - elsewhere. """ return re.sub(r"\W+", "-", unidecode(text).lower().strip()).strip("-") -if HAS_BEAUTIFUL_SOUP: +class RequestHandler: + _log: beets.logging.Logger - def try_parse_html(html, **kwargs): - return bs4.BeautifulSoup(html, "html.parser", **kwargs) + def debug(self, message: str, *args) -> None: + """Log a debug message with the class name.""" + self._log.debug(f"{self.__class__.__name__}: {message}", *args) -else: + def info(self, message: str, *args) -> None: + """Log an info message with the class name.""" + self._log.info(f"{self.__class__.__name__}: {message}", *args) - def try_parse_html(html, **kwargs): - return None + def warn(self, message: str, *args) -> None: + """Log warning with the class name.""" + self._log.warning(f"{self.__class__.__name__}: {message}", *args) + @staticmethod + def format_url(url: str, params: JSONDict | None) -> str: + if not params: + return url -class Backend: - REQUIRES_BS = False + return f"{url}?{urlencode(params)}" - def __init__(self, config, log): - self._log = log - self.config = config + def fetch_text( + self, url: str, params: JSONDict | None = None, **kwargs + ) -> str: + """Return text / HTML data from the given URL. - def fetch_url(self, url, **kwargs): - """Retrieve the content at a given URL, or return None if the source - is unreachable. + Set the encoding to None to let requests handle it because some sites + set it incorrectly. """ + url = self.format_url(url, params) + self.debug("Fetching HTML from {}", url) + r = r_session.get(url, **kwargs) + r.encoding = None + return r.text + + def fetch_json(self, url: str, params: JSONDict | None = None, **kwargs): + """Return JSON data from the given URL.""" + url = self.format_url(url, params) + self.debug("Fetching JSON from {}", url) + return r_session.get(url, **kwargs).json() + + @contextmanager + def handle_request(self) -> Iterator[None]: try: - # Disable the InsecureRequestWarning that comes from using - # `verify=false`. 
- # https://github.com/kennethreitz/requests/issues/2214 - # We're not overly worried about the NSA MITMing our lyrics scraper - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - r = requests.get( - url, - verify=False, - headers={ - "User-Agent": USER_AGENT, - }, - timeout=10, - **kwargs, - ) + yield + except requests.JSONDecodeError: + self.warn("Could not decode response JSON data") except requests.RequestException as exc: - self._log.debug("lyrics request failed: {0}", exc) - return - if r.status_code == requests.codes.ok: - return r.text - else: - self._log.debug("failed to fetch: {0} ({1})", url, r.status_code) - return None + self.warn("Request error: {}", exc) - def fetch( - self, artist: str, title: str, album: str, length: int - ) -> str | None: - raise NotImplementedError +class BackendClass(type): + @property + def name(cls) -> str: + """Return lowercase name of the backend class.""" + return cls.__name__.lower() -class LRCLibItem(TypedDict): - """Lyrics data item returned by the LRCLib API.""" - id: int - name: str - trackName: str - artistName: str - albumName: str - duration: float | None - instrumental: bool - plainLyrics: str - syncedLyrics: str | None +class Backend(RequestHandler, metaclass=BackendClass): + def __init__(self, config, log): + self._log = log + self.config = config + + def fetch( + self, artist: str, title: str, album: str, length: int + ) -> tuple[str, str] | None: + raise NotImplementedError @dataclass @@ -296,6 +287,7 @@ class LRCLyrics: DURATION_DIFF_TOLERANCE = 0.05 target_duration: float + id: int duration: float instrumental: bool plain: str @@ -306,9 +298,12 @@ def __le__(self, other: LRCLyrics) -> bool: return self.dist < other.dist @classmethod - def make(cls, candidate: LRCLibItem, target_duration: float) -> LRCLyrics: + def make( + cls, candidate: LRCLibAPI.Item, target_duration: float + ) -> LRCLyrics: return cls( target_duration, + candidate["id"], candidate["duration"] or 0.0, candidate["instrumental"], candidate["plainLyrics"], @@ -361,24 +356,9 @@ class LRCLib(Backend): GET_URL = f"{BASE_URL}/get" SEARCH_URL = f"{BASE_URL}/search" - def warn(self, message: str, *args) -> None: - """Log a warning message with the class name.""" - self._log.warning(f"{self.__class__.__name__}: {message}", *args) - - def fetch_json(self, *args, **kwargs): - """Wrap the request method to raise an exception on HTTP errors.""" - kwargs.setdefault("timeout", 10) - kwargs.setdefault("headers", {"User-Agent": USER_AGENT}) - r = requests.get(*args, **kwargs) - if r.status_code == HTTPStatus.NOT_FOUND: - raise NotFoundError("HTTP Error: Not Found", response=r) - r.raise_for_status() - - return r.json() - def fetch_candidates( self, artist: str, title: str, album: str, length: int - ) -> Iterator[list[LRCLibItem]]: + ) -> Iterator[list[LRCLibAPI.Item]]: """Yield lyrics candidates for the given song data. 
I found that the ``/get`` endpoint sometimes returns inaccurate or @@ -406,41 +386,20 @@ def pick_best_match(cls, lyrics: Iterable[LRCLyrics]) -> LRCLyrics | None: def fetch( self, artist: str, title: str, album: str, length: int - ) -> str | None: + ) -> tuple[str, str] | None: """Fetch lyrics text for the given song data.""" evaluate_item = partial(LRCLyrics.make, target_duration=length) - try: - for group in self.fetch_candidates(artist, title, album, length): - candidates = [evaluate_item(item) for item in group] - if item := self.pick_best_match(candidates): - return item.get_text(self.config["synced"]) - except StopIteration: - pass - except requests.JSONDecodeError: - self.warn("Could not decode response JSON data") - except requests.RequestException as exc: - self.warn("Request error: {}", exc) + for group in self.fetch_candidates(artist, title, album, length): + candidates = [evaluate_item(item) for item in group] + if item := self.pick_best_match(candidates): + lyrics = item.get_text(self.config["synced"]) + return lyrics, f"{self.GET_URL}/{item.id}" return None -class DirectBackend(Backend): - """A backend for fetching lyrics directly.""" - - URL_TEMPLATE: ClassVar[str] #: May include formatting placeholders - - @classmethod - def encode(cls, text: str) -> str: - """Encode the string for inclusion in a URL.""" - raise NotImplementedError - - @classmethod - def build_url(cls, *args: str) -> str: - return cls.URL_TEMPLATE.format(*map(cls.encode, args)) - - -class MusiXmatch(DirectBackend): +class MusiXmatch(Backend): URL_TEMPLATE = "https://www.musixmatch.com/lyrics/{}/{}" REPLACEMENTS = { @@ -459,22 +418,22 @@ def encode(cls, text: str) -> str: return quote(unidecode(text)) - def fetch(self, artist: str, title: str, *_) -> str | None: + @classmethod + def build_url(cls, *args: str) -> str: + return cls.URL_TEMPLATE.format(*map(cls.encode, args)) + + def fetch(self, artist: str, title: str, *_) -> tuple[str, str] | None: url = self.build_url(artist, title) - html = self.fetch_url(url) - if not html: - return None + html = self.fetch_text(url) if "We detected that your IP is blocked" in html: - self._log.warning( - "we are blocked at MusixMatch: url %s failed" % url - ) + self.warn("Failed: Blocked IP address") return None html_parts = html.split('<p class="mxm-lyrics__content') # Sometimes lyrics come in 2 or more parts lyrics_parts = [] for html_part in html_parts: - lyrics_parts.append(extract_text_between(html_part, ">", "</p>")) + lyrics_parts.append(re.sub(r"^[^>]+>|</p>.*", "", html_part)) lyrics = "\n".join(lyrics_parts) lyrics = lyrics.strip(',"').replace("\\n", "\n") # another odd case: sometimes only that string remains, for @@ -485,381 +444,338 @@ def fetch(self, artist: str, title: str, *_) -> str | None: # sometimes there are non-existent lyrics with some content if "Lyrics | Musixmatch" in lyrics: return None - return lyrics + return lyrics, url + + +class Html: + collapse_space = partial(re.compile(r"(^| ) +", re.M).sub, r"\1") + expand_br = partial(re.compile(r"\s*<br[^>]*>\s*", re.I).sub, "\n") + #: two newlines between paragraphs on the same line (musica, letras.mus.br) + merge_blocks = partial(re.compile(r"(?<!>)</p><p[^>]*>").sub, "\n\n") + #: a single new line between paragraphs on separate lines + #: (paroles.net, sweetslyrics.com, lacoccinelle.net) + merge_lines = partial(re.compile(r"</p>\s+<p[^>]*>(?!___)").sub, "\n") + #: remove empty divs (lacoccinelle.net) + remove_empty_tags = partial( + re.compile(r"(<(div|span)[^>]*>\s*</\2>)").sub, "" + ) + #: 
remove Google Ads tags (musica.com) + remove_aside = partial(re.compile("<aside .+?</aside>").sub, "") + #: remove adslot-Content_1 div from the lyrics text (paroles.net) + remove_adslot = partial( + re.compile(r"\n</div>[^\n]+-- Content_\d+ --.*?\n<div>", re.S).sub, + "\n", + ) + #: remove text formatting (azlyrics.com, lacocinelle.net) + remove_formatting = partial( + re.compile(r" *</?(i|em|pre|strong)[^>]*>").sub, "" + ) + @classmethod + def normalize_space(cls, text: str) -> str: + text = unescape(text).replace("\r", "").replace("\xa0", " ") + return cls.collapse_space(cls.expand_br(text)) -class Genius(Backend): - """Fetch lyrics from Genius via genius-api. + @classmethod + def remove_ads(cls, text: str) -> str: + return cls.remove_adslot(cls.remove_aside(text)) - Simply adapted from - bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/ - """ + @classmethod + def merge_paragraphs(cls, text: str) -> str: + return cls.merge_blocks(cls.merge_lines(cls.remove_empty_tags(text))) - REQUIRES_BS = True - base_url = "https://api.genius.com" +class SoupMixin: + @classmethod + def pre_process_html(cls, html: str) -> str: + """Pre-process the HTML content before scraping.""" + return Html.normalize_space(html) - def __init__(self, config, log): - super().__init__(config, log) - self.api_key = config["genius_api_key"].as_str() - self.headers = { - "Authorization": "Bearer %s" % self.api_key, - "User-Agent": USER_AGENT, - } + @classmethod + def get_soup(cls, html: str) -> BeautifulSoup: + return BeautifulSoup(cls.pre_process_html(html), "html.parser") - def fetch(self, artist: str, title: str, *_) -> str | None: - """Fetch lyrics from genius.com - Because genius doesn't allow accessing lyrics via the api, - we first query the api for a url matching our artist & title, - then attempt to scrape that url for the lyrics. - """ - json = self._search(artist, title) - if not json: - self._log.debug("Genius API request returned invalid JSON") - return None +class SearchResult(NamedTuple): + artist: str + title: str + url: str - # find a matching artist in the json - for hit in json["response"]["hits"]: - hit_artist = hit["result"]["primary_artist"]["name"] + @property + def source(self) -> str: + return urlparse(self.url).netloc - if slug(hit_artist) == slug(artist): - html = self.fetch_url(hit["result"]["url"]) - if not html: - return None - return self._scrape_lyrics_from_html(html) - self._log.debug( - "Genius failed to find a matching artist for '{0}'", artist +class SearchBackend(SoupMixin, Backend): + @cached_property + def dist_thresh(self) -> float: + return self.config["dist_thresh"].get(float) + + def check_match( + self, target_artist: str, target_title: str, result: SearchResult + ) -> bool: + """Check if the given search result is a 'good enough' match.""" + max_dist = max( + string_dist(target_artist, result.artist), + string_dist(target_title, result.title), ) - return None - - def _search(self, artist, title): - """Searches the genius api for a given artist and title - https://docs.genius.com/#search-h2 + if (max_dist := round(max_dist, 2)) <= self.dist_thresh: + return True - :returns: json response - """ - search_url = self.base_url + "/search" - data = {"q": title + " " + artist.lower()} - try: - response = requests.get( - search_url, - params=data, - headers=self.headers, - timeout=10, + if math.isclose(max_dist, self.dist_thresh, abs_tol=0.4): + # log out the candidate that did not make it but was close. 
+ # This may show a matching candidate with some noise in the name + self.debug( + "({}, {}) does not match ({}, {}) but dist was close: {:.2f}", + result.artist, + result.title, + target_artist, + target_title, + max_dist, ) - except requests.RequestException as exc: - self._log.debug("Genius API request failed: {0}", exc) - return None - - try: - return response.json() - except ValueError: - return None - - def replace_br(self, lyrics_div): - for br in lyrics_div.find_all("br"): - br.replace_with("\n") - def _scrape_lyrics_from_html(self, html): - """Scrape lyrics from a given genius.com html""" + return False - soup = try_parse_html(html) - if not soup: - return - - # Remove script tags that they put in the middle of the lyrics. - [h.extract() for h in soup("script")] - - # Most of the time, the page contains a div with class="lyrics" where - # all of the lyrics can be found already correctly formatted - # Sometimes, though, it packages the lyrics into separate divs, most - # likely for easier ad placement + def search(self, artist: str, title: str) -> Iterable[SearchResult]: + """Search for the given query and yield search results.""" + raise NotImplementedError - lyrics_divs = soup.find_all("div", {"data-lyrics-container": True}) - if not lyrics_divs: - self._log.debug("Received unusual song page html") - return self._try_extracting_lyrics_from_non_data_lyrics_container( - soup - ) - lyrics = "" - for lyrics_div in lyrics_divs: - self.replace_br(lyrics_div) - lyrics += lyrics_div.get_text() + "\n\n" - while lyrics[-1] == "\n": - lyrics = lyrics[:-1] - return lyrics - - def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup): - """Extract lyrics from a div without attribute data-lyrics-container - This is the second most common layout on genius.com - """ - verse_div = soup.find("div", class_=re.compile("Lyrics__Container")) - if not verse_div: - if soup.find( - "div", - class_=re.compile("LyricsPlaceholder__Message"), - string="This song is an instrumental", + def get_results(self, artist: str, title: str) -> Iterable[SearchResult]: + check_match = partial(self.check_match, artist, title) + for candidate in self.search(artist, title): + if check_match(candidate): + yield candidate + + def fetch(self, artist: str, title: str, *_) -> tuple[str, str] | None: + """Fetch lyrics for the given artist and title.""" + for result in self.get_results(artist, title): + if (html := self.fetch_text(result.url)) and ( + lyrics := self.scrape(html) ): - self._log.debug("Detected instrumental") - return INSTRUMENTAL_LYRICS - else: - self._log.debug("Couldn't scrape page using known layouts") - return None + return lyrics, result.url - lyrics_div = verse_div.parent - self.replace_br(lyrics_div) + return None - ads = lyrics_div.find_all( - "div", class_=re.compile("InreadAd__Container") - ) - for ad in ads: - ad.replace_with("\n") + @classmethod + def scrape(cls, html: str) -> str | None: + """Scrape the lyrics from the given HTML.""" + raise NotImplementedError - footers = lyrics_div.find_all( - "div", class_=re.compile("Lyrics__Footer") - ) - for footer in footers: - footer.replace_with("") - return lyrics_div.get_text() +class Genius(SearchBackend): + """Fetch lyrics from Genius via genius-api. -class Tekstowo(DirectBackend): - """Fetch lyrics from Tekstowo.pl.""" + Because genius doesn't allow accessing lyrics via the api, we first query + the api for a url matching our artist & title, then scrape the HTML text + for the JSON data containing the lyrics. 
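+
+    The lyrics are embedded in the page as escaped JSON;
+    :attr:`LYRICS_IN_JSON_RE` extracts the ``html`` payload, which is then
+    unescaped and parsed as HTML by :meth:`scrape`.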
+ """ - REQUIRES_BS = True - URL_TEMPLATE = "https://www.tekstowo.pl/piosenka,{},{}.html" + SEARCH_URL = "https://api.genius.com/search" + LYRICS_IN_JSON_RE = re.compile(r'(?<=.\\"html\\":\\").*?(?=(?<!\\)\\")') + remove_backslash = partial(re.compile(r"\\(?=[^\\])").sub, "") - non_alpha_to_underscore = partial(re.compile(r"\W").sub, "_") + @cached_property + def headers(self) -> dict[str, str]: + return {"Authorization": f'Bearer {self.config["genius_api_key"]}'} + + def search(self, artist: str, title: str) -> Iterable[SearchResult]: + search_data: GeniusAPI.Search = self.fetch_json( + self.SEARCH_URL, + params={"q": f"{artist} {title}"}, + headers=self.headers, + ) + for r in (hit["result"] for hit in search_data["response"]["hits"]): + yield SearchResult(r["artist_names"], r["title"], r["url"]) @classmethod - def encode(cls, text: str) -> str: - return cls.non_alpha_to_underscore(unidecode(text.lower())) - - def fetch(self, artist: str, title: str, *_) -> str | None: - if html := self.fetch_url(self.build_url(artist, title)): - return self.extract_lyrics(html) - - return None - - def extract_lyrics(self, html: str) -> str | None: - html = _scrape_strip_cruft(html) - html = _scrape_merge_paragraphs(html) - - soup = try_parse_html(html) - - if lyrics_div := soup.select_one("div.song-text > div.inner-text"): - return lyrics_div.get_text() + def scrape(cls, html: str) -> str | None: + if m := cls.LYRICS_IN_JSON_RE.search(html): + html_text = cls.remove_backslash(m[0]).replace(r"\n", "\n") + return cls.get_soup(html_text).get_text().strip() return None -def remove_credits(text): - """Remove first/last line of text if it contains the word 'lyrics' - eg 'Lyrics by songsdatabase.com' - """ - textlines = text.split("\n") - credits = None - for i in (0, -1): - if textlines and "lyrics" in textlines[i].lower(): - credits = textlines.pop(i) - if credits: - text = "\n".join(textlines) - return text - - -def _scrape_strip_cruft(html, plain_text_out=False): - """Clean up HTML""" - html = unescape(html) - - html = html.replace("\r", "\n") # Normalize EOL. - html = re.sub(r" +", " ", html) # Whitespaces collapse. - html = BREAK_RE.sub("\n", html) # <br> eats up surrounding '\n'. - html = re.sub(r"(?s)<(script).*?</\1>", "", html) # Strip script tags. - html = re.sub("\u2005", " ", html) # replace unicode with regular space - html = re.sub("<aside .+?</aside>", "", html) # remove Google Ads tags - html = re.sub(r"</?(em|strong)[^>]*>", "", html) # remove italics / bold - - if plain_text_out: # Strip remaining HTML tags - html = COMMENT_RE.sub("", html) - html = TAG_RE.sub("", html) - - html = "\n".join([x.strip() for x in html.strip().split("\n")]) - html = re.sub(r"\n{3,}", r"\n\n", html) - return html +class Tekstowo(SearchBackend): + """Fetch lyrics from Tekstowo.pl.""" + BASE_URL = "https://www.tekstowo.pl" + SEARCH_URL = BASE_URL + "/szukaj,{}.html" -def _scrape_merge_paragraphs(html): - html = re.sub(r"</p>\s*<p(\s*[^>]*)>", "\n", html) - return re.sub(r"<div .*>\s*</div>", "\n", html) + def build_url(self, artist, title): + artistitle = f"{artist.title()} {title.title()}" + return self.SEARCH_URL.format(quote_plus(unidecode(artistitle))) -def scrape_lyrics_from_html(html): - """Scrape lyrics from a URL. If no lyrics can be found, return None - instead. 
- """ + def search(self, artist: str, title: str) -> Iterable[SearchResult]: + if html := self.fetch_text(self.build_url(title, artist)): + soup = self.get_soup(html) + for tag in soup.select("div[class=flex-group] > a[title*=' - ']"): + artist, title = str(tag["title"]).split(" - ", 1) + yield SearchResult( + artist, title, f"{self.BASE_URL}{tag['href']}" + ) - def is_text_notcode(text): - if not text: - return False - length = len(text) - return ( - length > 20 - and text.count(" ") > length / 25 - and (text.find("{") == -1 or text.find(";") == -1) - ) + return None - html = _scrape_strip_cruft(html) - html = _scrape_merge_paragraphs(html) + @classmethod + def scrape(cls, html: str) -> str | None: + soup = cls.get_soup(html) - # extract all long text blocks that are not code - soup = try_parse_html(html, parse_only=SoupStrainer(string=is_text_notcode)) - if not soup: - return None + if lyrics_div := soup.select_one("div.song-text > div.inner-text"): + return lyrics_div.get_text() - # Get the longest text element (if any). - strings = sorted(soup.stripped_strings, key=len, reverse=True) - if strings: - return strings[0] - else: return None -class Google(Backend): +class Google(SearchBackend): """Fetch lyrics from Google search results.""" - REQUIRES_BS = True SEARCH_URL = "https://www.googleapis.com/customsearch/v1" - def is_lyrics(self, text, artist=None): - """Determine whether the text seems to be valid lyrics.""" - if not text: - return False - bad_triggers_occ = [] - nb_lines = text.count("\n") - if nb_lines <= 1: - self._log.debug("Ignoring too short lyrics '{0}'", text) - return False - elif nb_lines < 5: - bad_triggers_occ.append("too_short") - else: - # Lyrics look legit, remove credits to avoid being penalized - # further down - text = remove_credits(text) + #: Exclude some letras.mus.br pages which do not contain lyrics. + EXCLUDE_PAGES = [ + "significado.html", + "traduccion.html", + "traducao.html", + "significados.html", + ] - bad_triggers = ["lyrics", "copyright", "property", "links"] - if artist: - bad_triggers += [artist] + #: Regular expression to match noise in the URL title. + URL_TITLE_NOISE_RE = re.compile( + r""" +\b +( + paroles(\ et\ traduction|\ de\ chanson)? + | letras?(\ de)? + | liedtexte + | dainų\ žodžiai + | original\ song\ full\ text\. + | official + | 20[12]\d\ version + | (absolute\ |az)?lyrics(\ complete)? + | www\S+ + | \S+\.(com|net|mus\.br) +) +([^\w.]|$) +""", + re.IGNORECASE | re.VERBOSE, + ) + #: Split cleaned up URL title into artist and title parts. 
+ URL_TITLE_PARTS_RE = re.compile(r" +(?:[ :|-]+|par|by) +") - for item in bad_triggers: - bad_triggers_occ += [item] * len( - re.findall(r"\W%s\W" % item, text, re.I) - ) + SOURCE_DIST_FACTOR = {"www.azlyrics.com": 0.5, "www.songlyrics.com": 0.6} - if bad_triggers_occ: - self._log.debug("Bad triggers detected: {0}", bad_triggers_occ) - return len(bad_triggers_occ) < 2 + ignored_domains: set[str] = set() - def slugify(self, text): - """Normalize a string and remove non-alphanumeric characters.""" - text = re.sub(r"[-'_\s]", "_", text) - text = re.sub(r"_+", "_", text).strip("_") - pat = r"([^,\(]*)\((.*?)\)" # Remove content within parentheses - text = re.sub(pat, r"\g<1>", text).strip() - try: - text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore") - text = str(re.sub(r"[-\s]+", " ", text.decode("utf-8"))) - except UnicodeDecodeError: - self._log.exception("Failing to normalize '{0}'", text) - return text + @classmethod + def pre_process_html(cls, html: str) -> str: + """Pre-process the HTML content before scraping.""" + html = Html.remove_ads(super().pre_process_html(html)) + return Html.remove_formatting(Html.merge_paragraphs(html)) + + def fetch_text(self, *args, **kwargs) -> str: + """Handle an error so that we can continue with the next URL.""" + kwargs.setdefault("allow_redirects", False) + with self.handle_request(): + try: + return super().fetch_text(*args, **kwargs) + except CaptchaError: + self.ignored_domains.add(urlparse(args[0]).netloc) + raise - BY_TRANS = ["by", "par", "de", "von"] - LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"] + @staticmethod + def get_part_dist(artist: str, title: str, part: str) -> float: + """Return the distance between the given part and the artist and title. - def is_page_candidate(self, url_link, url_title, title, artist): - """Return True if the URL title makes it a good candidate to be a - page that contains lyrics of title by artist. + A number between -1 and 1 is returned, where -1 means the part is + closer to the artist and 1 means it is closer to the title. 
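+
+        For example, ``get_part_dist("Adele", "Hello", "Adele")`` is negative
+        because the part matches the artist exactly but not the title.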
""" - title = self.slugify(title.lower()) - artist = self.slugify(artist.lower()) - sitename = re.search( - "//([^/]+)/.*", self.slugify(url_link.lower()) - ).group(1) - url_title = self.slugify(url_title.lower()) - - # Check if URL title contains song title (exact match) - if url_title.find(title) != -1: - return True + return string_dist(artist, part) - string_dist(title, part) - # or try extracting song title from URL title and check if - # they are close enough - tokens = ( - [by + "_" + artist for by in self.BY_TRANS] - + [artist, sitename, sitename.replace("www.", "")] - + self.LYRICS_TRANS + @classmethod + def make_search_result( + cls, artist: str, title: str, item: GoogleCustomSearchAPI.Item + ) -> SearchResult: + """Parse artist and title from the URL title and return a search result.""" + url_title = ( + # get full title from metatags if available + item.get("pagemap", {}).get("metatags", [{}])[0].get("og:title") + # default to the dispolay title + or item["title"] ) - tokens = [re.escape(t) for t in tokens] - song_title = re.sub("(%s)" % "|".join(tokens), "", url_title) + clean_title = cls.URL_TITLE_NOISE_RE.sub("", url_title).strip(" .-|") + # split it into parts which may be part of the artist or the title + # `dict.fromkeys` removes duplicates keeping the order + parts = list(dict.fromkeys(cls.URL_TITLE_PARTS_RE.split(clean_title))) + + if len(parts) == 1: + part = parts[0] + if m := re.search(rf"(?i)\W*({re.escape(title)})\W*", part): + # artist and title may not have a separator + result_title = m[1] + result_artist = part.replace(m[0], "") + else: + # assume that this is the title + result_artist, result_title = "", parts[0] + else: + # sort parts by their similarity to the artist + parts.sort(key=lambda p: cls.get_part_dist(artist, title, p)) + result_artist, result_title = parts[0], " ".join(parts[1:]) - song_title = song_title.strip("_|") - typo_ratio = 0.9 - ratio = difflib.SequenceMatcher(None, song_title, title).ratio() - return ratio >= typo_ratio + return SearchResult(result_artist, result_title, item["link"]) - def fetch(self, artist: str, title: str, *_) -> str | None: + def search(self, artist: str, title: str) -> Iterable[SearchResult]: params = { "key": self.config["google_API_key"].as_str(), "cx": self.config["google_engine_ID"].as_str(), "q": f"{artist} {title}", + "siteSearch": "www.musixmatch.com", + "siteSearchFilter": "e", + "excludeTerms": ", ".join(self.EXCLUDE_PAGES), } - data = self.fetch_url(self.SEARCH_URL, params=params) - if not data: - self._log.debug("google backend returned no data") - return None - try: - data = json.loads(data) - except ValueError as exc: - self._log.debug("google backend returned malformed JSON: {}", exc) - if "error" in data: - reason = data["error"]["errors"][0]["reason"] - self._log.debug("google backend error: {0}", reason) - return None + data: GoogleCustomSearchAPI.Response = self.fetch_json( + self.SEARCH_URL, params=params + ) + for item in data.get("items", []): + yield self.make_search_result(artist, title, item) + + def get_results(self, *args) -> Iterable[SearchResult]: + """Try results from preferred sources first.""" + for result in sorted( + super().get_results(*args), + key=lambda r: self.SOURCE_DIST_FACTOR.get(r.source, 1), + ): + if result.source not in self.ignored_domains: + yield result - if "items" in data.keys(): - for item in data["items"]: - url_link = item["link"] - url_title = item.get("title", "") - if not self.is_page_candidate( - url_link, url_title, title, artist - ): - continue - html = 
self.fetch_url(url_link) - if not html: - continue - lyrics = scrape_lyrics_from_html(html) - if not lyrics: - continue - - if self.is_lyrics(lyrics, artist): - self._log.debug("got lyrics from {0}", item["displayLink"]) - return lyrics + @classmethod + def scrape(cls, html: str) -> str | None: + # Get the longest text element (if any). + if strings := sorted(cls.get_soup(html).stripped_strings, key=len): + return strings[-1] return None -class LyricsPlugin(plugins.BeetsPlugin): - SOURCES = ["lrclib", "google", "musixmatch", "genius", "tekstowo"] - SOURCE_BACKENDS = { - "google": Google, - "musixmatch": MusiXmatch, - "genius": Genius, - "tekstowo": Tekstowo, - "lrclib": LRCLib, +class LyricsPlugin(RequestHandler, plugins.BeetsPlugin): + BACKEND_BY_NAME = { + b.name: b for b in [LRCLib, Google, Genius, Tekstowo, MusiXmatch] } + @cached_property + def backends(self) -> list[Backend]: + user_sources = self.config["sources"].get() + + chosen = plugins.sanitize_choices(user_sources, self.BACKEND_BY_NAME) + if "google" in chosen and not self.config["google_API_key"].get(): + self.warn("Disabling Google source: no API key configured.") + chosen.remove("google") + + return [self.BACKEND_BY_NAME[c](self.config, self._log) for c in chosen] + def __init__(self): super().__init__() self.import_stages = [self.imported] @@ -869,18 +785,22 @@ def __init__(self): "bing_client_secret": None, "bing_lang_from": [], "bing_lang_to": None, + "dist_thresh": 0.11, "google_API_key": None, "google_engine_ID": "009217259823014548361:lndtuqkycfu", - "genius_api_key": "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W" - "76V-uFL5jks5dNvcGCdarqFjDhP9c", + "genius_api_key": ( + "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W" + "76V-uFL5jks5dNvcGCdarqFjDhP9c" + ), "fallback": None, "force": False, "local": False, "synced": False, # Musixmatch is disabled by default as they are currently blocking # requests with the beets user agent. - "sources": [s for s in self.SOURCES if s != "musixmatch"], - "dist_thresh": 0.1, + "sources": [ + n for n in self.BACKEND_BY_NAME if n != "musixmatch" + ], } ) self.config["bing_client_secret"].redact = True @@ -897,57 +817,12 @@ def __init__(self): # open yet. self.rest = None - available_sources = list(self.SOURCES) - sources = plugins.sanitize_choices( - self.config["sources"].as_str_seq(), available_sources - ) - - if not HAS_BEAUTIFUL_SOUP: - sources = self.sanitize_bs_sources(sources) - - if "google" in sources: - if not self.config["google_API_key"].get(): - # We log a *debug* message here because the default - # configuration includes `google`. This way, the source - # is silent by default but can be enabled just by - # setting an API key. - self._log.debug( - "Disabling google source: " "no API key configured." - ) - sources.remove("google") - self.config["bing_lang_from"] = [ x.lower() for x in self.config["bing_lang_from"].as_str_seq() ] - self.bing_auth_token = None - - if not HAS_LANGDETECT and self.config["bing_client_secret"].get(): - self._log.warning( - "To use bing translations, you need to " - "install the langdetect module. See the " - "documentation for further details." - ) - - self.backends = [ - self.SOURCE_BACKENDS[source](self.config, self._log) - for source in sources - ] - - def sanitize_bs_sources(self, sources): - enabled_sources = [] - for source in sources: - if self.SOURCE_BACKENDS[source].REQUIRES_BS: - self._log.debug( - "To use the %s lyrics source, you must " - "install the beautifulsoup4 module. See " - "the documentation for further details." 
% source - ) - else: - enabled_sources.append(source) - return enabled_sources - - def get_bing_access_token(self): + @cached_property + def bing_access_token(self) -> str | None: params = { "client_id": "beets", "client_secret": self.config["bing_client_secret"], @@ -956,20 +831,9 @@ def get_bing_access_token(self): } oauth_url = "https://datamarket.accesscontrol.windows.net/v2/OAuth2-13" - oauth_token = json.loads( - requests.post( - oauth_url, - data=urlencode(params), - timeout=10, - ).content - ) - if "access_token" in oauth_token: - return "Bearer " + oauth_token["access_token"] - else: - self._log.warning( - "Could not get Bing Translate API access token." - ' Check your "bing_client_secret" password' - ) + with self.handle_request(): + r = r_session.post(oauth_url, params=params) + return r.json()["access_token"] def commands(self): cmd = ui.Subcommand("lyrics", help="fetch song lyrics") @@ -1115,7 +979,7 @@ def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None: """ # Skip if the item already has lyrics. if not force and item.lyrics: - self._log.info("lyrics already present: {0}", item) + self.info("🔵 Lyrics already present: {}", item) return lyrics_matches = [] @@ -1131,8 +995,8 @@ def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None: lyrics = "\n\n---\n\n".join(filter(None, lyrics_matches)) if lyrics: - self._log.info("fetched lyrics: {0}", item) - if HAS_LANGDETECT and self.config["bing_client_secret"].get(): + self.info("🟢 Found lyrics: {0}", item) + if self.config["bing_client_secret"].get(): lang_from = langdetect.detect(lyrics) if self.config["bing_lang_to"].get() != lang_from and ( not self.config["bing_lang_from"] @@ -1142,62 +1006,51 @@ def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None: lyrics, self.config["bing_lang_to"] ) else: - self._log.info("lyrics not found: {0}", item) - fallback = self.config["fallback"].get() - if fallback: - lyrics = fallback - else: - return - item.lyrics = lyrics - if write: - item.try_write() - item.store() + self.info("🔴 Lyrics not found: {}", item) + lyrics = self.config["fallback"].get() + + if lyrics not in {None, item.lyrics}: + item.lyrics = lyrics + if write: + item.try_write() + item.store() def get_lyrics(self, artist: str, title: str, *args) -> str | None: """Fetch lyrics, trying each source in turn. Return a string or None if no lyrics were found. """ + self.info("Fetching lyrics for {} - {}", artist, title) for backend in self.backends: - lyrics = backend.fetch(artist, title, *args) - if lyrics: - self._log.debug( - "got lyrics from backend: {0}", backend.__class__.__name__ - ) - return _scrape_strip_cruft(lyrics, True) + with backend.handle_request(): + if lyrics_info := backend.fetch(artist, title, *args): + lyrics, url = lyrics_info + return f"{lyrics}\n\nSource: {url}" return None def append_translation(self, text, to_lang): from xml.etree import ElementTree - if not self.bing_auth_token: - self.bing_auth_token = self.get_bing_access_token() - if self.bing_auth_token: - # Extract unique lines to limit API request size per song - text_lines = set(text.split("\n")) - url = ( - "https://api.microsofttranslator.com/v2/Http.svc/" - "Translate?text=%s&to=%s" % ("|".join(text_lines), to_lang) + if not (token := self.bing_access_token): + self.warn( + "Could not get Bing Translate API access token. " + "Check your 'bing_client_secret' password." 
             )
-            r = requests.get(
+            return text
+
+        # Extract unique lines to limit API request size per song
+        lines = text.split("\n")
+        unique_lines = set(lines)
+        url = "https://api.microsofttranslator.com/v2/Http.svc/Translate"
+        with self.handle_request():
+            text = self.fetch_text(
                 url,
-                headers={"Authorization ": self.bing_auth_token},
-                timeout=10,
+                headers={"Authorization": f"Bearer {token}"},
+                params={"text": "|".join(unique_lines), "to": to_lang},
             )
-            if r.status_code != 200:
-                self._log.debug(
-                    "translation API error {}: {}", r.status_code, r.text
-                )
-                if "token has expired" in r.text:
-                    self.bing_auth_token = None
-                    return self.append_translation(text, to_lang)
-                return text
-            lines_translated = ElementTree.fromstring(
-                r.text.encode("utf-8")
-            ).text
-            # Use a translation mapping dict to build resulting lyrics
-            translations = dict(zip(text_lines, lines_translated.split("|")))
-            result = ""
-            for line in text.split("\n"):
-                result += "{} / {}\n".format(line, translations[line])
-            return result
+        if translated := ElementTree.fromstring(text.encode("utf-8")).text:
+            # Use a translation mapping dict to build resulting lyrics
+            translations = dict(zip(unique_lines, translated.split("|")))
+            return "".join(f"{ln} / {translations[ln]}\n" for ln in lines)
+
+        return text
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 46fa3b64e1..54d0855990 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -11,6 +11,15 @@ been dropped.
 
 New features:
 
+* :doc:`plugins/lastgenre`: The new configuration option, ``keep_existing``,
+  provides more fine-grained control over how pre-populated genre tags are
+  handled. The ``force`` option now behaves in a more conventional manner.
+  :bug:`4982`
+* :doc:`plugins/lyrics`: Add a new configuration option, ``dist_thresh``, to
+  control the maximum allowed distance between the lyrics search result and
+  the tagged item's artist and title. This is useful for preventing false
+  positives when fetching lyrics.
+
 Bug fixes:
 
 * :doc:`plugins/lyrics`: LRCLib will fallback to plain lyrics if synced lyrics
@@ -55,10 +64,12 @@ Bug fixes:
   ``lrclib`` over other sources since it returns reliable results quicker
   than others.
   :bug:`5102`
-* :doc:`plugins/lastgenre`: The new configuration option, ``keep_existing``,
-  provides more fine-grained control over how pre-populated genre tags are
-  handled. The ``force`` option now behaves in a more conventional manner.
-  :bug:`4982`
+* :doc:`plugins/lyrics`: Fix an issue where the ``genius`` backend could not
+  match lyrics when there was a slight variation in the artist name.
+  :bug:`4791`
+* :doc:`plugins/lyrics`: Fix a plugin crash when the ``genius`` backend
+  returns empty lyrics.
+  :bug:`5583`
 
 For packagers:
diff --git a/docs/plugins/lyrics.rst b/docs/plugins/lyrics.rst
index d1f434d70f..f034cf47a1 100644
--- a/docs/plugins/lyrics.rst
+++ b/docs/plugins/lyrics.rst
@@ -2,25 +2,27 @@ Lyrics Plugin
 =============
 
 The ``lyrics`` plugin fetches and stores song lyrics from databases on the Web.
-Namely, the current version of the plugin uses `Genius.com`_, `Tekstowo.pl`_, `LRCLIB`_
-and, optionally, the Google custom search API.
+Namely, the current version of the plugin uses `Genius.com`_, `Tekstowo.pl`_,
+`LRCLIB`_ and, optionally, the Google Custom Search API.
 
 .. _Genius.com: https://genius.com/
 .. _Tekstowo.pl: https://www.tekstowo.pl/
 .. _LRCLIB: https://lrclib.net/
 
-Fetch Lyrics During Import
---------------------------
+Install
+-------
 
-To automatically fetch lyrics for songs you import, first enable it in your
-configuration (see :ref:`using-plugins`). Then, install ``beets`` with
-``lyrics`` extra
+First, enable the ``lyrics`` plugin in your configuration (see
+:ref:`using-plugins`). Then install ``beets`` with the ``lyrics`` extra
 
 .. code-block:: bash
 
     pip install "beets[lyrics]"
 
+Fetch Lyrics During Import
+--------------------------
+
 When importing new files, beets will now fetch lyrics for files that don't
 already have them. The lyrics will be stored in the beets database. If the
 ``import.write`` config option is on, then the lyrics will also be written to
@@ -29,46 +31,52 @@ the files' tags.
 
 Configuration
 -------------
 
-To configure the plugin, make a ``lyrics:`` section in your
-configuration file. The available options are:
+To configure the plugin, make a ``lyrics:`` section in your configuration file.
+Default configuration:
+
+.. code-block:: yaml
+
+    lyrics:
+        auto: yes
+        bing_client_secret: null
+        bing_lang_from: []
+        bing_lang_to: null
+        dist_thresh: 0.11
+        fallback: null
+        force: no
+        google_API_key: null
+        google_engine_ID: 009217259823014548361:lndtuqkycfu
+        sources: [lrclib, google, genius, tekstowo]
+        synced: no
+
+The available options are:
 
 - **auto**: Fetch lyrics automatically during import.
-  Default: ``yes``.
 - **bing_client_secret**: Your Bing Translation application password
-  (to :ref:`lyrics-translation`)
+  (see :ref:`lyrics-translation`)
 - **bing_lang_from**: By default all lyrics with a language other than
   ``bing_lang_to`` are translated. Use a list of lang codes to restrict the set
   of source languages to translate.
-  Default: ``[]``
 - **bing_lang_to**: Language to translate lyrics into.
-  Default: None.
+- **dist_thresh**: The maximum distance between the artist and title
+  combination of the music file and lyrics candidate to consider them a match.
+  Lower values make the plugin stricter; higher values make it more lenient.
+  This does not apply to the ``lrclib`` backend as it matches durations.
 - **fallback**: By default, the file will be left unchanged when no lyrics are
   found. Use the empty string ``''`` to reset the lyrics in such a case.
-  Default: None.
 - **force**: By default, beets won't fetch lyrics if the files already have
   ones. To instead always fetch lyrics, set the ``force`` option to ``yes``.
-  Default: ``no``.
 - **google_API_key**: Your Google API key (to enable the Google Custom Search
   backend).
-  Default: None.
 - **google_engine_ID**: The custom search engine to use.
   Default: The `beets custom search engine`_, which gathers an updated list of
   sources known to be scrapeable.
 - **sources**: List of sources to search for lyrics. An asterisk ``*`` expands
-  to all available sources.
-  Default: ``lrclib google genius tekstowo``, i.e., all the available sources. The
-  ``google`` source will be automatically deactivated if no ``google_API_key``
-  is setup.
-  The ``google``, ``genius``, and ``tekstowo`` sources will only be enabled if
-  BeautifulSoup is installed.
-- **synced**: Prefer synced lyrics over plain lyrics if a source offers them.
-  Currently `lrclib` is the only source that provides them. Default: `no`.
-
-Here's an example of ``config.yaml``::
-
-    lyrics:
-        fallback: ''
-        google_API_key: AZERTYUIOPQSDFGHJKLMWXCVBN1234567890_ab
-        google_engine_ID: 009217259823014548361:lndtuqkycfu
+  to all available sources. The ``google`` source will be automatically
+  deactivated if no ``google_API_key`` is set up.
+- **synced**: Prefer synced lyrics over plain lyrics if a source offers them.
+  Currently ``lrclib`` is the only source that provides them.
 
 .. _beets custom search engine: https://www.google.com:443/cse/publicurl?cx=009217259823014548361:lndtuqkycfu
 
@@ -83,74 +91,74 @@ by that band, and ``beet lyrics`` will get lyrics for my entire library. The
 lyrics will be added to the beets database and, if ``import.write`` is on,
 embedded into files' metadata.
 
-The ``-p`` option to the ``lyrics`` command makes it print lyrics out to the
-console so you can view the fetched (or previously-stored) lyrics.
+The ``-p, --print`` option to the ``lyrics`` command makes it print lyrics out
+to the console so you can view the fetched (or previously-stored) lyrics.
 
-The ``-f`` option forces the command to fetch lyrics, even for tracks that
-already have lyrics. Inversely, the ``-l`` option restricts operations
-to lyrics that are locally available, which show lyrics faster without using
-the network at all.
+The ``-f, --force`` option forces the command to fetch lyrics, even for tracks
+that already have lyrics.
+
+Conversely, the ``-l, --local`` option restricts operations to lyrics that are
+locally available, which shows lyrics faster without using the network at all.
 
 Rendering Lyrics into Other Formats
 -----------------------------------
 
-The ``-r directory`` option renders all lyrics as `reStructuredText`_ (ReST)
-documents in ``directory`` (by default, the current directory). That
-directory, in turn, can be parsed by tools like `Sphinx`_ to generate HTML,
-ePUB, or PDF documents.
+The ``-r directory, --write-rest directory`` option renders all lyrics as
+`reStructuredText`_ (ReST) documents in ``directory`` (by default, the current
+directory). That directory, in turn, can be parsed by tools like `Sphinx`_ to
+generate HTML, ePUB, or PDF documents.
 
-A minimal ``conf.py`` and ``index.rst`` files are created the first time the
+Minimal ``conf.py`` and ``index.rst`` files are created the first time the
 command is run. They are not overwritten on subsequent runs, so you can safely
 modify these files to customize the output.
 
-.. _Sphinx: https://www.sphinx-doc.org/
-.. _reStructuredText: http://docutils.sourceforge.net/rst.html
+Sphinx supports various `builders`_; here are a few suggestions:
 
-Sphinx supports various `builders
-<https://www.sphinx-doc.org/en/stable/builders.html>`_, but here are a
-few suggestions.
 
- * Build an HTML version::
-
-       sphinx-build -b html . _build/html
+.. admonition:: Build an HTML version
+
+    ::
+
+        sphinx-build -b html . _build/html
 
- * Build an ePUB3 formatted file, usable on ebook readers::
-
-       sphinx-build -b epub3 . _build/epub
+.. admonition:: Build an ePUB3 formatted file, usable on ebook readers
+
+    ::
+
+        sphinx-build -b epub3 . _build/epub
 
- * Build a PDF file, which incidentally also builds a LaTeX file::
-
-       sphinx-build -b latex %s _build/latex && make -C _build/latex all-pdf
+.. admonition:: Build a PDF file, which incidentally also builds a LaTeX file
+
+    ::
+
+        sphinx-build -b latex %s _build/latex && make -C _build/latex all-pdf
+
+
+.. _Sphinx: https://www.sphinx-doc.org/
+.. _reStructuredText: http://docutils.sourceforge.net/rst.html
+.. _builders: https://www.sphinx-doc.org/en/stable/builders.html
 
-.. _activate-google-custom-search:
 
 Activate Google Custom Search
 ------------------------------
 
 You need to `register for a Google API key`_.
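+
+Once you have a key, a minimal configuration that enables this backend could
+look like the following (the key shown is only a placeholder):
+
+.. code-block:: yaml
+
+    lyrics:
+        google_API_key: AZERTYUIOPQSDFGHJKLMWXCVBN1234567890_ab
+        sources: [google]
+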
 Set the ``google_API_key`` configuration option to your key.
+Then add ``google`` to the list of sources in your configuration (or use the
+default list, which includes it as long as you have an API key). If you use
+the default ``google_engine_ID``, we recommend limiting the sources to
+``google`` as the other sources are already included in the Google results.
 
-.. _register for a Google API key: https://console.developers.google.com/
-
 Optionally, you can `define a custom search engine`_. Get your search
 engine's token and use it for your ``google_engine_ID`` configuration option.
 By default, beets use a list of sources known to be scrapeable.
 
-.. _define a custom search engine: https://www.google.com/cse/all
-
 Note that the Google custom search API is limited to 100 queries per day.
 After that, the lyrics plugin will fall back on other declared data sources.
 
-.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
-
-Activate Genius and Tekstowo.pl Lyrics
---------------------------------------
+.. _register for a Google API key: https://console.developers.google.com/
+.. _define a custom search engine: https://www.google.com/cse/all
 
-These backends are enabled by default.
 
 .. _lyrics-translation:
 
@@ -161,6 +169,6 @@ You need to register for a Microsoft Azure Marketplace free account and
 to the `Microsoft Translator API`_. Follow the four steps process, specifically
 at step 3 enter ``beets`` as *Client ID* and copy/paste the generated
 *Client secret* into your ``bing_client_secret`` configuration, alongside
-``bing_lang_to`` target `language code`.
+``bing_lang_to`` target ``language code``.
 
 .. _Microsoft Translator API: https://docs.microsoft.com/en-us/azure/cognitive-services/translator/translator-how-to-signup
diff --git a/setup.cfg b/setup.cfg
index 15ca23f658..8e3d7e3b82 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -21,8 +21,8 @@ omit = beets/test/*
 precision = 2
 skip_empty = true
 show_missing = true
-exclude_lines =
-    pragma: no cover
+exclude_also =
+    @atexit.register
     if TYPE_CHECKING
     if typing.TYPE_CHECKING
     raise AssertionError
diff --git a/test/plugins/lyrics_pages.py b/test/plugins/lyrics_pages.py
index 46a72076d4..84c9e24410 100644
--- a/test/plugins/lyrics_pages.py
+++ b/test/plugins/lyrics_pages.py
@@ -147,6 +147,27 @@ def backend(self) -> str:
         """,
         url_title="The Beatles Lady Madonna lyrics",
     ),
+    LyricsPage.make(
+        "https://www.dainuzodziai.lt/m/mergaites-nori-mylet-atlanta/",
+        """
+        Jos nesuspėja skriet paskui vėją
+        Bangos į krantą grąžina jas vėl
+        Jos karštą saulę paliesti norėjo
+        Ant kranto palikę visas negandas
+
+        Bet jos nori mylėt
+        Jos nenori liūdėt
+        Leisk mergaitėms mylėt
+        Kaip jos moka mylėt
+        Koks vakaras šiltas ir nieko nestinga
+        Veidus apšviečia žaisminga šviesa
+        Jos buvo laimingos prie jūros kur liko
+        Tik vėjas išmokęs visas jų dainas
+        """,
+        artist="Atlanta",
+        track_title="Mergaitės Nori Mylėt",
+        url_title="Mergaitės nori mylėt – Atlanta | Dainų Žodžiai",
+    ),
     LyricsPage.make(
         "https://genius.com/The-beatles-lady-madonna-lyrics",
         """
@@ -223,6 +244,20 @@ def backend(self) -> str:
         Mademoiselle Madonna, couchée sur votre lit
         Listen to the music playing in your head.
         Vous écoutez la musique qui joue dans votre tête
+
+        Tuesday afternoon is never ending.
+        Le mardi après-midi n'en finit pas
+        Wednesday morning papers didn't come.
+        Le mercredi matin les journaux ne sont pas arrivés
+        Thursday night you stockings needed mending.
+        Jeudi soir, vos bas avaient besoin d'être réparés
+        See how they run. 
+ Regardez comme ils filent + + Lady Madonna, children at your feet. + Mademoiselle Madonna, les enfants à vos pieds + Wonder how you manage to make ends meet. + Je me demande comment vous vous débrouillez pour joindre les deux bouts """, url_title="Paroles et traduction The Beatles : Lady Madonna - paroles de chanson", # noqa: E501 ), @@ -235,29 +270,35 @@ def backend(self) -> str: Children at your feet Wonder how you manage To make ends meet + Who finds the money When you pay the rent? Did you think that money Was Heaven sent? + Friday night arrives without a suitcase Sunday morning creeping like a nun Monday's child has learned To tie his bootlace See how they run + Lady Madonna Baby at your breast Wonders how you manage To feed the rest See how they run + Lady Madonna Lying on the bed Listen to the music Playing in your head + Tuesday afternoon is neverending Wednesday morning papers didn't come Thursday night your stockings Needed mending See how they run + Lady Madonna Children at your feet Wonder how you manage @@ -415,15 +456,29 @@ def backend(self) -> str: LyricsPage.make( "https://www.musica.com/letras.asp?letra=59862", """ + Lady Madonna, children at your feet + Wonder how you manage to make ends meet + Who finds the money when you pay the rent? + Did you think that money was heaven sent? + + Friday night arrives without a suitcase + Sunday morning creeping like a nun + Monday's child has learned to tie his bootlace + See how they run + Lady Madonna, baby at your breast Wonders how you manage to feed the rest + See how they run + Lady Madonna lying on the bed Listen to the music playing in your head + Tuesday afternoon is never ending Wednesday morning papers didn't come Thursday night your stockings needed mending See how they run + Lady Madonna, children at your feet Wonder how you manage to make ends meet """, @@ -448,6 +503,14 @@ def backend(self) -> str: See how they run. Lady Madonna, lying on the bed, Listen to the music playing in your head. + + Tuesday afternoon is never ending. + Wednesday morning papers didn't come. + Thursday night your stockings needed mending. + See how they run. + + Lady Madonna, children at your feet. + Wonder how you manage to make ends meet. 
""", url_title="Paroles Lady Madonna par The Beatles - Lyrics - Paroles.net", ), @@ -480,6 +543,7 @@ def backend(self) -> str: Wonder how you manage to make ends meet """, url_title="THE BEATLES - LADY MADONNA LYRICS", + marks=[xfail_on_ci("Songlyrics is blocked by Cloudflare")], ), LyricsPage.make( "https://sweetslyrics.com/the-beatles/lady-madonna-lyrics", diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py index 47c9837701..c6d48c3bdb 100644 --- a/test/plugins/test_lyrics.py +++ b/test/plugins/test_lyrics.py @@ -37,6 +37,7 @@ class TestLyricsUtils: @pytest.mark.parametrize( "artist, title", [ + ("Various Artists", "Title"), ("Artist", ""), ("", "Title"), (" ", ""), @@ -81,7 +82,7 @@ def test_search_pairs_artists( @pytest.mark.parametrize( "title, expected_extra_titles", [ - ("1/2", ["1", "2"]), + ("1/2", []), ("1 / 2", ["1", "2"]), ("Song (live)", ["Song"]), ("Song (live) (new)", ["Song"]), @@ -101,47 +102,6 @@ def test_search_pairs_titles(self, title, expected_extra_titles): assert list(actual_titles) == [title, *expected_extra_titles] - @pytest.mark.parametrize( - "initial_lyrics, expected", - [ - ("Verse\nLyrics credit in the last line", "Verse"), - ("Lyrics credit in the first line\nVerse", "Verse"), - ( - """Verse - Lyrics mentioned somewhere in the middle - Verse""", - """Verse - Lyrics mentioned somewhere in the middle - Verse""", - ), - ], - ) - def test_remove_credits(self, initial_lyrics, expected): - assert lyrics.remove_credits(initial_lyrics) == expected - - @pytest.mark.parametrize( - "initial_text, expected", - [ - ( - """<!--lyrics below--> - one - <br class='myclass'> - two ! - <br><br \\> - <blink>four</blink>""", - "one\ntwo !\n\nfour", - ), - ("foo<script>bar</script>baz", "foobaz"), - ("foo<!--<bar>-->qux", "fooqux"), - ], - ) - def test_scrape_strip_cruft(self, initial_text, expected): - assert lyrics._scrape_strip_cruft(initial_text, True) == expected - - def test_scrape_merge_paragraphs(self): - text = "one</p> <p class='myclass'>two</p><p>three" - assert lyrics._scrape_merge_paragraphs(text) == "one\ntwo\nthree" - @pytest.mark.parametrize( "text, expected", [ @@ -161,12 +121,67 @@ def test_slug(self, text, expected): assert lyrics.slug(text) == expected +class TestHtml: + def test_scrape_strip_cruft(self): + initial = """<!--lyrics below--> + one + <br class='myclass'> + two ! + <br><br \\> + <blink>four</blink>""" + expected = "<!--lyrics below-->\none\ntwo !\n\n<blink>four</blink>" + + assert lyrics.Html.normalize_space(initial) == expected + + def test_scrape_merge_paragraphs(self): + text = "one</p> <p class='myclass'>two</p><p>three" + expected = "one\ntwo\n\nthree" + + assert lyrics.Html.merge_paragraphs(text) == expected + + +class TestSearchBackend: + @pytest.fixture + def backend(self, dist_thresh): + plugin = lyrics.LyricsPlugin() + plugin.config.set({"dist_thresh": dist_thresh}) + return lyrics.SearchBackend(plugin.config, plugin._log) + + @pytest.mark.parametrize( + "dist_thresh, target_artist, artist, should_match", + [ + (0.11, "Target Artist", "Target Artist", True), + (0.11, "Target Artist", "Target Artis", True), + (0.11, "Target Artist", "Target Arti", False), + (0.11, "Psychonaut", "Psychonaut (BEL)", True), + (0.11, "beets song", "beats song", True), + (0.10, "beets song", "beats song", False), + ( + 0.11, + "Lucid Dreams (Forget Me)", + "Lucid Dreams (Remix) ft. Lil Uzi Vert", + False, + ), + ( + 0.12, + "Lucid Dreams (Forget Me)", + "Lucid Dreams (Remix) ft. 
Lil Uzi Vert", + True, + ), + ], + ) + def test_check_match(self, backend, target_artist, artist, should_match): + result = lyrics.SearchResult(artist, "", "") + + assert backend.check_match(target_artist, "", result) == should_match + + @pytest.fixture(scope="module") def lyrics_root_dir(pytestconfig: pytest.Config): return pytestconfig.rootpath / "test" / "rsrc" / "lyrics" -class LyricsBackendTest(PluginMixin): +class LyricsPluginMixin(PluginMixin): plugin = "lyrics" @pytest.fixture @@ -182,6 +197,42 @@ def lyrics_plugin(self, backend_name, plugin_config): return lyrics.LyricsPlugin() + +class TestLyricsPlugin(LyricsPluginMixin): + @pytest.fixture + def backend_name(self): + """Return lyrics configuration to test.""" + return "lrclib" + + @pytest.mark.parametrize( + "request_kwargs, expected_log_match", + [ + ( + {"status_code": HTTPStatus.BAD_GATEWAY}, + r"LRCLib: Request error: 502", + ), + ({"text": "invalid"}, r"LRCLib: Could not decode.*JSON"), + ], + ) + def test_error_handling( + self, + requests_mock, + lyrics_plugin, + caplog, + request_kwargs, + expected_log_match, + ): + """Errors are logged with the backend name.""" + requests_mock.get(lyrics.LRCLib.SEARCH_URL, **request_kwargs) + + assert lyrics_plugin.get_lyrics("", "", "", 0.0) is None + assert caplog.messages + last_log = caplog.messages[-1] + assert last_log + assert re.search(expected_log_match, last_log, re.I) + + +class LyricsBackendTest(LyricsPluginMixin): @pytest.fixture def backend(self, lyrics_plugin): """Return a lyrics backend instance.""" @@ -229,24 +280,23 @@ def _patch_google_search(self, requests_mock, lyrics_page): def test_backend_source(self, lyrics_plugin, lyrics_page: LyricsPage): """Test parsed lyrics from each of the configured lyrics pages.""" - lyrics = lyrics_plugin.get_lyrics( + lyrics_info = lyrics_plugin.get_lyrics( lyrics_page.artist, lyrics_page.track_title, "", 186 ) - assert lyrics + assert lyrics_info + lyrics, _ = lyrics_info.split("\n\nSource: ") assert lyrics == lyrics_page.lyrics class TestGoogleLyrics(LyricsBackendTest): """Test scraping heuristics on a fake html page.""" - TITLE = "Beets song" - @pytest.fixture(scope="class") def backend_name(self): return "google" - @pytest.fixture(scope="class") + @pytest.fixture def plugin_config(self): return {"google_API_key": "test"} @@ -254,54 +304,59 @@ def plugin_config(self): def file_name(self): return "examplecom/beetssong" + @pytest.fixture + def search_item(self, url_title, url): + return {"title": url_title, "link": url} + + @pytest.mark.parametrize("plugin_config", [{}]) + def test_disabled_without_api_key(self, lyrics_plugin): + assert not lyrics_plugin.backends + def test_mocked_source_ok(self, backend, lyrics_html): """Test that lyrics of the mocked page are correctly scraped""" - result = lyrics.scrape_lyrics_from_html(lyrics_html).lower() + result = backend.scrape(lyrics_html).lower() assert result - assert backend.is_lyrics(result) - assert PHRASE_BY_TITLE[self.TITLE] in result + assert PHRASE_BY_TITLE["Beets song"] in result @pytest.mark.parametrize( - "url_title, artist, should_be_candidate", + "url_title, expected_artist, expected_title", [ - ("John Doe - beets song Lyrics", "John Doe", True), - ("example.com | Beats song by John doe", "John Doe", True), - ("example.com | seets bong lyrics by John doe", "John Doe", False), - ("foo", "Sun O)))", False), + ("Artist - beets song Lyrics", "Artist", "beets song"), + ("www.azlyrics.com | Beats song by Artist", "Artist", "Beats song"), + ("lyric.com | seets bong lyrics by Artist", 
"Artist", "seets bong"), + ("foo", "", "foo"), + ("Artist - Beets Song lyrics | AZLyrics", "Artist", "Beets Song"), + ("Letra de Artist - Beets Song", "Artist", "Beets Song"), + ("Letra de Artist - Beets ...", "Artist", "Beets"), + ("Artist Beets Song", "Artist", "Beets Song"), + ("BeetsSong - Artist", "Artist", "BeetsSong"), + ("Artist - BeetsSong", "Artist", "BeetsSong"), + ("Beets Song", "", "Beets Song"), + ("Beets Song Artist", "Artist", "Beets Song"), + ( + "BeetsSong (feat. Other & Another) - Artist", + "Artist", + "BeetsSong (feat. Other & Another)", + ), + ( + ( + "Beets song lyrics by Artist - original song full text. " + "Official Beets song lyrics, 2024 version | LyricsMode.com" + ), + "Artist", + "Beets song", + ), ], ) - def test_is_page_candidate( - self, backend, lyrics_html, url_title, artist, should_be_candidate + @pytest.mark.parametrize("url", ["http://doesntmatter.com"]) + def test_make_search_result( + self, backend, search_item, expected_artist, expected_title ): - result = backend.is_page_candidate( - "http://www.example.com/lyrics/beetssong", - url_title, - self.TITLE, - artist, - ) - assert bool(result) == should_be_candidate - - @pytest.mark.parametrize( - "lyrics", - [ - "LyricsMania.com - Copyright (c) 2013 - All Rights Reserved", - """All material found on this site is property\n - of mywickedsongtext brand""", - """ -Lyricsmania staff is working hard for you to add $TITLE lyrics as soon -as they'll be released by $ARTIST, check back soon! -In case you have the lyrics to $TITLE and want to send them to us, fill out -the following form. -""", - ], - ) - def test_bad_lyrics(self, backend, lyrics): - assert not backend.is_lyrics(lyrics) + result = backend.make_search_result("Artist", "Beets song", search_item) - def test_slugify(self, backend): - text = "http://site.com/\xe7afe-au_lait(boisson)" - assert backend.slugify(text) == "http://site.com/cafe_au_lait" + assert result.artist == expected_artist + assert result.title == expected_title class TestGeniusLyrics(LyricsBackendTest): @@ -312,13 +367,13 @@ def backend_name(self): @pytest.mark.parametrize( "file_name, expected_line_count", [ - ("geniuscom/2pacalleyezonmelyrics", 134), + ("geniuscom/2pacalleyezonmelyrics", 131), ("geniuscom/Ttngchinchillalyrics", 29), ("geniuscom/sample", 0), # see https://github.com/beetbox/beets/issues/3535 ], ) # fmt: skip def test_scrape(self, backend, lyrics_html, expected_line_count): - result = backend._scrape_lyrics_from_html(lyrics_html) or "" + result = backend.scrape(lyrics_html) or "" assert len(result.splitlines()) == expected_line_count @@ -339,7 +394,7 @@ def backend_name(self): ], ) def test_scrape(self, backend, lyrics_html, expecting_lyrics): - assert bool(backend.extract_lyrics(lyrics_html)) == expecting_lyrics + assert bool(backend.scrape(lyrics_html)) == expecting_lyrics LYRICS_DURATION = 950 @@ -347,6 +402,7 @@ def test_scrape(self, backend, lyrics_html, expecting_lyrics): def lyrics_match(**overrides): return { + "id": 1, "instrumental": False, "duration": LYRICS_DURATION, "syncedLyrics": "synced", @@ -363,13 +419,9 @@ def backend_name(self): return "lrclib" @pytest.fixture - def request_kwargs(self, response_data): - return {"json": response_data} - - @pytest.fixture - def fetch_lyrics(self, backend, requests_mock, request_kwargs): + def fetch_lyrics(self, backend, requests_mock, response_data): requests_mock.get(backend.GET_URL, status_code=HTTPStatus.NOT_FOUND) - requests_mock.get(backend.SEARCH_URL, **request_kwargs) + requests_mock.get(backend.SEARCH_URL, 
json=response_data) return partial(backend.fetch, "la", "la", "la", self.ITEM_DURATION) @@ -379,7 +431,9 @@ def fetch_lyrics(self, backend, requests_mock, request_kwargs): [({"synced": True}, "synced"), ({"synced": False}, "plain")], ) def test_synced_config_option(self, fetch_lyrics, expected_lyrics): - assert fetch_lyrics() == expected_lyrics + lyrics, _ = fetch_lyrics() + + assert lyrics == expected_lyrics @pytest.mark.parametrize( "response_data, expected_lyrics", @@ -441,20 +495,10 @@ def test_synced_config_option(self, fetch_lyrics, expected_lyrics): ) @pytest.mark.parametrize("plugin_config", [{"synced": True}]) def test_fetch_lyrics(self, fetch_lyrics, expected_lyrics): - assert fetch_lyrics() == expected_lyrics + lyrics_info = fetch_lyrics() + if lyrics_info is None: + assert expected_lyrics is None + else: + lyrics, _ = fetch_lyrics() - @pytest.mark.parametrize( - "request_kwargs, expected_log_match", - [ - ( - {"status_code": HTTPStatus.BAD_GATEWAY}, - r"LRCLib: Request error: 502", - ), - ({"text": "invalid"}, r"LRCLib: Could not decode.*JSON"), - ], - ) - def test_error(self, caplog, fetch_lyrics, expected_log_match): - assert fetch_lyrics() is None - assert caplog.messages - assert (last_log := caplog.messages[-1]) - assert re.search(expected_log_match, last_log, re.I) + assert lyrics == expected_lyrics