import sys
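# Path hack: prepend the parent directory ("..") to the import search path.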
sys.path.insert(1, "..")
import traceback
import logging
import time
import re
from typing import Optional
from aiohttp import ClientTimeout, ClientSession
from bs4 import BeautifulSoup, ResultSet # type: ignore
import html as htm
from . import private, common, cache, redis_cache
from lyric_search import utils
from lyric_search.constructors import LyricsResult, InvalidGeniusResponseException
logger = logging.getLogger()
log_level = logging.getLevelName(logger.level)


class Genius:
    """
    Genius Search Module
    """

    def __init__(self) -> None:
        self.label: str = "Genius"
        self.genius_url: str = private.GENIUS_URL
        self.genius_search_url: str = f"{self.genius_url}api/search/song?q="
        self.headers: dict = common.SCRAPE_HEADERS
        self.timeout = ClientTimeout(connect=3, sock_read=5)
        self.datautils = utils.DataUtils()
        self.matcher = utils.TrackMatcher()
        self.cache = cache.Cache()
        self.redis_cache = redis_cache.RedisCache()

    async def search(self, artist: str, song: str, **kwargs) -> Optional[LyricsResult]:
        """
        Genius Search

        Args:
            artist (str): the artist to search
            song (str): the song to search

        Returns:
            Optional[LyricsResult]: The result, if found; None otherwise.
        """
        try:
            # Normalize inputs so matching is case-insensitive.
            artist = artist.strip().lower()
            song = song.strip().lower()
            time_start: float = time.time()
            logging.info("Searching %s - %s on %s", artist, song, self.label)
            search_term: str = f"{artist}%20{song}"
            returned_lyrics: str = ""
            async with ClientSession() as client:
                # First request: query the Genius search API for candidate tracks.
                async with client.get(
                    f"{self.genius_search_url}{search_term}",
                    timeout=self.timeout,
                    headers=self.headers,
                ) as request:
                    request.raise_for_status()
                    text: Optional[str] = await request.text()
                    if not text:
                        raise InvalidGeniusResponseException("No search response.")
                    if len(text) < 100:
                        raise InvalidGeniusResponseException(
                            "Search response text was invalid (len < 100 chars.)"
                        )
                    search_data = await request.json()
                    # Validate the expected response shape:
                    # {"response": {"sections": [{"hits": [...]}]}}
                    if not isinstance(search_data, dict):
                        raise InvalidGeniusResponseException("Invalid JSON.")
                    if not isinstance(search_data["response"], dict):
                        raise InvalidGeniusResponseException(
                            f"Invalid JSON: Cannot find response key.\n{search_data}"
                        )
                    if not isinstance(search_data["response"]["sections"], list):
                        raise InvalidGeniusResponseException(
                            f"Invalid JSON: Cannot find response->sections key.\n{search_data}"
                        )
                    if not isinstance(
                        search_data["response"]["sections"][0]["hits"], list
                    ):
                        raise InvalidGeniusResponseException(
                            "Invalid JSON: Cannot find response->sections[0]->hits key."
                        )
                    possible_matches: list = search_data["response"]["sections"][0][
                        "hits"
                    ]
                    # Pair each hit's URL path with an "artist - title" string
                    # for fuzzy matching against the requested track.
                    to_scrape: list[tuple] = [
                        (
                            returned["result"]["path"],
                            f"{returned['result']['artist_names']} - {returned['result']['title']}",
                        )
                        for returned in possible_matches
                    ]
                    searched: str = f"{artist} - {song}"
                    best_match: tuple = self.matcher.find_best_match(
                        input_track=searched, candidate_tracks=to_scrape
                    )
                    ((scrape_stub, track), confidence) = best_match
                    # Second request: fetch the lyrics page for the best match.
                    scrape_url: str = f"{self.genius_url}{scrape_stub[1:]}"
                    async with client.get(
                        scrape_url, timeout=self.timeout, headers=self.headers
                    ) as scrape_request:
                        scrape_request.raise_for_status()
                        scrape_text: Optional[str] = await scrape_request.text()
                        if not scrape_text:
                            raise InvalidGeniusResponseException("No scrape response.")
                        if len(scrape_text) < 100:
                            raise InvalidGeniusResponseException(
                                "Scrape response was invalid (len < 100 chars.)"
                            )
                        html = BeautifulSoup(
                            htm.unescape(scrape_text).replace("<br/>", "\n"),
                            "html.parser",
                        )
                        # Remove Genius page-header elements before extracting text.
                        header_tags_genius: Optional[ResultSet] = html.find_all(
                            class_=re.compile(r".*Header.*")
                        )
                        if header_tags_genius:
                            for tag in header_tags_genius:
                                tag.extract()
                        divs: Optional[ResultSet] = html.find_all(
                            "div", {"data-lyrics-container": "true"}
                        )
                        if not divs:
                            return None
                        for div in divs:
                            # Strip embedded section headers from each lyrics container.
                            header_tags: Optional[ResultSet] = div.find_all(
                                ["h1", "h2", "h3", "h4", "h5"]
                            )
                            if header_tags:
                                for tag in header_tags:
                                    tag.extract()
                            returned_lyrics += div.get_text()
                        returned_lyrics = self.datautils.scrub_lyrics(returned_lyrics)
                        # Report the artist/title of the matched track, not the raw query.
                        artist = track.split(" - ", maxsplit=1)[0]
                        song = track.split(" - ", maxsplit=1)[1]
                        logging.info("Result found on %s", self.label)
                        time_end: float = time.time()
                        time_diff: float = time_end - time_start
                        matched = LyricsResult(
                            artist=artist,
                            song=song,
                            src=self.label,
                            lyrics=returned_lyrics,
                            confidence=confidence,
                            time=time_diff,
                        )
                        # Record the hit and cache the result before returning.
                        await self.redis_cache.increment_found_count(self.label)
                        await self.cache.store(matched)
                        return matched
        except Exception:
            # Catch Exception (not a bare except) so KeyboardInterrupt and
            # SystemExit still propagate; log the traceback and return None.
            traceback.print_exc()
            return None
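
# Minimal usage sketch (not part of the original module): shows how a caller
# might drive Genius.search from asyncio. The artist/song values and the
# import path are placeholders; private.GENIUS_URL and the cache backends
# must be configured for a real run.
#
#     import asyncio
#     from lyric_search.sources.genius import Genius  # hypothetical path
#
#     async def main() -> None:
#         result = await Genius().search("some artist", "some song")
#         if result:
#             print(f"{result.artist} - {result.song} ({result.confidence})")
#             print(result.lyrics)
#
#     asyncio.run(main())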