From dcc6c7b24e8036dca566fad56df9201e16dedcba Mon Sep 17 00:00:00 2001 From: codey Date: Sat, 22 Nov 2025 13:13:03 -0500 Subject: [PATCH] More progress re: #34 - Change of direction, LRCLib searches from /lyric/search now use internal cache - which is a PGSQL import of the LRCLib SQLite database. Change to PGSQL was made for performance. --- base.py | 1 - endpoints/constructors.py | 19 --- endpoints/lrclib.py | 199 ----------------------- lyric_search/constructors.py | 21 +-- lyric_search/models.py | 110 +++++++++++++ lyric_search/sources/aggregate.py | 8 +- lyric_search/sources/genius.py | 19 ++- lyric_search/sources/lrclib.py | 261 +++++++++++++----------------- pgloader_config.load | 0 postgres_schema.sql | 0 10 files changed, 240 insertions(+), 398 deletions(-) delete mode 100644 endpoints/lrclib.py create mode 100644 lyric_search/models.py create mode 100644 pgloader_config.load create mode 100644 postgres_schema.sql diff --git a/base.py b/base.py index 2d33c3c..d29b4d4 100644 --- a/base.py +++ b/base.py @@ -108,7 +108,6 @@ routes: dict = { "lyrics": importlib.import_module("endpoints.lyric_search").LyricSearch( app, util, constants ), - "lrclib": importlib.import_module("endpoints.lrclib").LRCLib(app, util, constants), "yt": importlib.import_module("endpoints.yt").YT(app, util, constants), "radio": importlib.import_module("endpoints.radio").Radio( app, util, constants, loop diff --git a/endpoints/constructors.py b/endpoints/constructors.py index 491bb72..ac16050 100644 --- a/endpoints/constructors.py +++ b/endpoints/constructors.py @@ -110,25 +110,6 @@ class ValidLyricRequest(BaseModel): } -class ValidLRCLibRequest(BaseModel): - """ - Request model for lyric search. - - Attributes: - - **artist** (str): Artist. - - **song** (str): Song. - - **duration** (Optional[int]): Optional duration. - """ - - artist: Optional[str] = None - song: Optional[str] = None - duration: Optional[int] = None - - model_config = { - "json_schema_extra": {"examples": [{"artist": "eminem", "song": "rap god"}]} - } - - class ValidTypeAheadRequest(BaseModel): """ Request model for typeahead query. 
diff --git a/endpoints/lrclib.py b/endpoints/lrclib.py deleted file mode 100644 index 7397e68..0000000 --- a/endpoints/lrclib.py +++ /dev/null @@ -1,199 +0,0 @@ -import urllib.parse -from fastapi import FastAPI, HTTPException, Depends -from fastapi_throttle import RateLimiter -from fastapi.responses import JSONResponse -from typing import Type, Optional -from sqlalchemy import ( - and_, - true, - Column, - Integer, - String, - Float, - Boolean, - DateTime, - ForeignKey, - UniqueConstraint, - create_engine, -) -from sqlalchemy.orm import Session, relationship -from sqlalchemy.ext.declarative import declarative_base, DeclarativeMeta -from sqlalchemy.orm import sessionmaker -from .constructors import ValidLRCLibRequest -from lyric_search.constructors import LRCLibResult -from lyric_search import notifier -from sqlalchemy.orm import foreign - -Base: Type[DeclarativeMeta] = declarative_base() - - -class Tracks(Base): # type: ignore - __tablename__ = "tracks" - - id = Column(Integer, primary_key=True, autoincrement=True) - name = Column(String) - name_lower = Column(String, index=True) - artist_name = Column(String) - artist_name_lower = Column(String, index=True) - album_name = Column(String) - album_name_lower = Column(String, index=True) - duration = Column(Float, index=True) - last_lyrics_id = Column(Integer, ForeignKey("lyrics.id"), index=True) - created_at = Column(DateTime) - updated_at = Column(DateTime) - - # Relationships - lyrics = relationship( - "Lyrics", - back_populates="track", - foreign_keys=[last_lyrics_id], - primaryjoin="Tracks.id == foreign(Lyrics.track_id)", # Use string reference for Lyrics - ) - - # Constraints - __table_args__ = ( - UniqueConstraint( - "name_lower", - "artist_name_lower", - "album_name_lower", - "duration", - name="uq_tracks", - ), - ) - - -class Lyrics(Base): # type: ignore - __tablename__ = "lyrics" - - id = Column(Integer, primary_key=True, autoincrement=True) - plain_lyrics = Column(String) - synced_lyrics = Column(String) - track_id = Column(Integer, ForeignKey("tracks.id"), index=True) - has_plain_lyrics = Column(Boolean, index=True) - has_synced_lyrics = Column(Boolean, index=True) - instrumental = Column(Boolean) - source = Column(String, index=True) - created_at = Column(DateTime, index=True) - updated_at = Column(DateTime) - - # Relationships - track = relationship( - "Tracks", - back_populates="lyrics", - foreign_keys=[track_id], - primaryjoin=(Tracks.id == foreign(track_id)), - remote_side=Tracks.id, - ) - - -DATABASE_URL: str = "sqlite:////nvme/sqlite_dbs/lrclib.db" -engine = create_engine(DATABASE_URL, connect_args={"check_same_thread": False}) -SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) - - -def get_db(): - db = SessionLocal() - try: - yield db - finally: - db.close() - - -""" -TODO: - - Move retrieval to lyric_search.sources, with separate file for DB Model -""" - - -class LRCLib(FastAPI): - """ - LRCLib Cache Search Endpoint - """ - - def __init__(self, app: FastAPI, util, constants) -> None: - """Initialize LyricSearch endpoints.""" - self.app: FastAPI = app - self.util = util - self.constants = constants - self.declarative_base = declarative_base() - self.notifier = notifier.DiscordNotifier() - - self.endpoints: dict = { - "lrclib/search": self.lyric_search_handler, - } - - for endpoint, handler in self.endpoints.items(): - times: int = 20 - seconds: int = 2 - rate_limit: tuple[int, int] = (2, 3) # Default; (Times, Seconds) - (times, seconds) = rate_limit - - app.add_api_route( - f"/{endpoint}", - 
handler, - methods=["POST"], - include_in_schema=True, - dependencies=[Depends(RateLimiter(times=times, seconds=seconds))], - ) - - async def lyric_search_handler( - self, data: ValidLRCLibRequest, db: Session = Depends(get_db) - ) -> JSONResponse: - """ - Search for lyrics. - - Parameters: - - **data** (ValidLRCLibRequest): Request containing artist, song, and other parameters. - - Returns: - - **JSONResponse**: LRCLib data or error. - """ - if not data.artist or not data.song: - raise HTTPException(detail="Invalid request", status_code=500) - - search_artist: str = urllib.parse.unquote(data.artist).lower() - search_song: str = urllib.parse.unquote(data.song).lower() - search_duration: Optional[int] = data.duration - - if not isinstance(search_artist, str) or not isinstance(search_song, str): - return JSONResponse( - status_code=500, - content={ - "err": True, - "errorText": "Invalid request", - }, - ) - - query = ( - db.query( - Tracks.id.label("id"), - Tracks.artist_name.label("artist"), - Tracks.name.label("song"), - Lyrics.plain_lyrics.label("plainLyrics"), - Lyrics.synced_lyrics.label("syncedLyrics"), - ) - .join(Lyrics, Tracks.id == Lyrics.track_id) - .filter( - and_( - Tracks.artist_name_lower == search_artist, - Tracks.name == search_song, - Tracks.duration == search_duration if search_duration else true(), - ) - ) - ) - - db_result = query.first() - if not db_result: - return JSONResponse( - status_code=404, content={"err": True, "errorText": "No result found."} - ) - - result = LRCLibResult( - id=db_result.id, - artist=db_result.artist, - song=db_result.song, - plainLyrics=db_result.plainLyrics, - syncedLyrics=db_result.syncedLyrics, - ) - - return JSONResponse(content=vars(result)) diff --git a/lyric_search/constructors.py b/lyric_search/constructors.py index 616c02d..73bf626 100644 --- a/lyric_search/constructors.py +++ b/lyric_search/constructors.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Union, Optional +from typing import Union @dataclass @@ -22,25 +22,6 @@ class LyricsResult: time: float = 0.00 -@dataclass -class LRCLibResult: - """ - Class for returned Lyrics Results - Attributes: - id (int): returned id - artist (str): returned artist - song (str): returned song - plainLyrics (str): returned (plain) lyrics - syncedLyrics (str): returned synchronizedLyrics - """ - - id: int - artist: str - song: str - plainLyrics: Optional[str] = None - syncedLyrics: Optional[str] = None - - """ Generic """ diff --git a/lyric_search/models.py b/lyric_search/models.py new file mode 100644 index 0000000..51e006f --- /dev/null +++ b/lyric_search/models.py @@ -0,0 +1,110 @@ +""" +Database models for LRCLib lyrics cache. 
+""" +import os +import urllib.parse +from typing import Type, AsyncGenerator +from sqlalchemy import ( + Column, + Integer, + String, + Float, + Boolean, + DateTime, + ForeignKey, + UniqueConstraint, +) +from sqlalchemy.orm import relationship, foreign +from sqlalchemy.ext.declarative import declarative_base, DeclarativeMeta +from sqlalchemy.ext.asyncio import AsyncEngine, create_async_engine, AsyncSession +from sqlalchemy.ext.asyncio import async_sessionmaker + +Base: Type[DeclarativeMeta] = declarative_base() + + +class Tracks(Base): # type: ignore + """Tracks table - stores track metadata.""" + __tablename__ = "tracks" + + id = Column(Integer, primary_key=True, autoincrement=True) + name = Column(String, index=True) + name_lower = Column(String, index=True) + artist_name = Column(String, index=True) + artist_name_lower = Column(String, index=True) + album_name = Column(String) + album_name_lower = Column(String, index=True) + duration = Column(Float, index=True) + last_lyrics_id = Column(Integer, ForeignKey("lyrics.id"), index=True) + created_at = Column(DateTime) + updated_at = Column(DateTime) + + # Relationships + lyrics = relationship( + "Lyrics", + back_populates="track", + foreign_keys=[last_lyrics_id], + primaryjoin="Tracks.id == foreign(Lyrics.track_id)", + ) + + # Constraints + __table_args__ = ( + UniqueConstraint( + "name_lower", + "artist_name_lower", + "album_name_lower", + "duration", + name="uq_tracks", + ), + ) + + +class Lyrics(Base): # type: ignore + """Lyrics table - stores lyrics content.""" + __tablename__ = "lyrics" + + id = Column(Integer, primary_key=True, autoincrement=True) + plain_lyrics = Column(String) + synced_lyrics = Column(String) + track_id = Column(Integer, ForeignKey("tracks.id"), index=True) + has_plain_lyrics = Column(Boolean, index=True) + has_synced_lyrics = Column(Boolean, index=True) + instrumental = Column(Boolean) + source = Column(String, index=True) + created_at = Column(DateTime, index=True) + updated_at = Column(DateTime) + + # Relationships + track = relationship( + "Tracks", + back_populates="lyrics", + foreign_keys=[track_id], + primaryjoin=(Tracks.id == foreign(track_id)), + remote_side=Tracks.id, + ) + + +# PostgreSQL connection - using environment variables +POSTGRES_HOST = os.getenv("POSTGRES_HOST", "localhost") +POSTGRES_PORT = os.getenv("POSTGRES_PORT", "5432") +POSTGRES_DB = os.getenv("POSTGRES_DB", "lrclib") +POSTGRES_USER = os.getenv("POSTGRES_USER", "api") +POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "") + +# URL-encode the password to handle special characters +encoded_password = urllib.parse.quote_plus(POSTGRES_PASSWORD) + +DATABASE_URL: str = f"postgresql+asyncpg://{POSTGRES_USER}:{encoded_password}@{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}" +async_engine: AsyncEngine = create_async_engine( + DATABASE_URL, + pool_size=20, + max_overflow=10, + pool_pre_ping=True, + echo=False +) +AsyncSessionLocal = async_sessionmaker(bind=async_engine, expire_on_commit=False) + + +async def get_async_db(): + """Get async database session.""" + async with AsyncSessionLocal() as session: + yield session diff --git a/lyric_search/sources/aggregate.py b/lyric_search/sources/aggregate.py index 8bba919..492ecd0 100644 --- a/lyric_search/sources/aggregate.py +++ b/lyric_search/sources/aggregate.py @@ -14,9 +14,7 @@ class Aggregate: Aggregate all source methods """ - def __init__(self, exclude_methods=None) -> None: - if not exclude_methods: - exclude_methods: list = [] + def __init__(self, exclude_methods: list = []) -> None: 
self.exclude_methods = exclude_methods self.redis_cache = redis_cache.RedisCache() self.notifier = notifier.DiscordNotifier() @@ -70,14 +68,14 @@ class Aggregate: if plain: # do not record LRC fails try: await self.redis_cache.increment_found_count("failed") - self.notifier.send( + await self.notifier.send( "WARNING", f"Could not find {artist} - {song} via queried sources.", ) except Exception as e: traceback.print_exc() logging.info("Could not increment redis failed counter: %s", str(e)) - self.notifier.send( + await self.notifier.send( f"ERROR @ {__file__.rsplit('/', maxsplit=1)[-1]}", f"Could not increment redis failed counter: {str(e)}", ) diff --git a/lyric_search/sources/genius.py b/lyric_search/sources/genius.py index 81391fc..37a41b7 100644 --- a/lyric_search/sources/genius.py +++ b/lyric_search/sources/genius.py @@ -45,11 +45,11 @@ class Genius: Optional[LyricsResult]: The result, if found - None otherwise. """ try: - artist = artist.strip().lower() - song = song.strip().lower() + artist_name = artist.strip().lower() + song_name = song.strip().lower() time_start: float = time.time() - logging.info("Searching %s - %s on %s", artist, song, self.label) - search_term: str = f"{artist}%20{song}" + logging.info("Searching %s - %s on %s", artist_name, song_name, self.label) + search_term: str = f"{artist_name}%20{song_name}" returned_lyrics: str = "" async with ClientSession() as client: async with client.get( @@ -100,10 +100,13 @@ class Genius: ) for returned in possible_matches ] - searched: str = f"{artist} - {song}" - best_match: tuple = self.matcher.find_best_match( + searched: str = f"{artist_name} - {song_name}" + best_match: Optional[tuple] = self.matcher.find_best_match( input_track=searched, candidate_tracks=to_scrape ) + if not best_match: + raise InvalidGeniusResponseException("No matching result") + logging.info("To scrape: %s", to_scrape) ((scrape_stub, track), confidence) = best_match scrape_url: str = f"{self.genius_url}{scrape_stub[1:]}" @@ -157,8 +160,8 @@ class Genius: returned_lyrics: str = self.datautils.scrub_lyrics( returned_lyrics ) - artist: str = track.split(" - ", maxsplit=1)[0] - song: str = track.split(" - ", maxsplit=1)[1] + artist = track.split(" - ", maxsplit=1)[0] + song = track.split(" - ", maxsplit=1)[1] logging.info("Result found on %s", self.label) time_end: float = time.time() time_diff: float = time_end - time_start diff --git a/lyric_search/sources/lrclib.py b/lyric_search/sources/lrclib.py index 87c41ad..3c405de 100644 --- a/lyric_search/sources/lrclib.py +++ b/lyric_search/sources/lrclib.py @@ -1,33 +1,25 @@ import time - -import traceback import logging -from typing import Optional, Union -from aiohttp import ClientTimeout, ClientSession -from tenacity import retry, stop_after_attempt, wait_fixed +from typing import Optional +from sqlalchemy.future import select from lyric_search import utils from lyric_search.constructors import LyricsResult -from . import common, cache, redis_cache -from lyric_search.constructors import InvalidLRCLibResponseException +from lyric_search.models import Tracks, Lyrics, AsyncSessionLocal +from . 
import redis_cache logger = logging.getLogger() log_level = logging.getLevelName(logger.level) class LRCLib: - """LRCLib Search Module""" + """LRCLib Search Module - Local PostgreSQL Database""" def __init__(self) -> None: - self.label: str = "LRCLib" - self.lrclib_url: str = "https://lrclib.net/api/search" - self.headers: dict = common.SCRAPE_HEADERS - self.timeout = ClientTimeout(connect=3, sock_read=8) + self.label: str = "LRCLib-Cache" self.datautils = utils.DataUtils() self.matcher = utils.TrackMatcher() - self.cache = cache.Cache() self.redis_cache = redis_cache.RedisCache() - @retry(stop=stop_after_attempt(2), wait=wait_fixed(0.5)) async def search( self, artist: str, @@ -36,10 +28,12 @@ class LRCLib: duration: Optional[int] = None, ) -> Optional[LyricsResult]: """ - LRCLib Search + LRCLib Local Database Search Args: artist (str): the artist to search song (str): the song to search + plain (bool): return plain lyrics (True) or synced lyrics (False) + duration (int): optional track duration for better matching Returns: Optional[LyricsResult]: The result, if found - None otherwise. """ @@ -47,140 +41,115 @@ class LRCLib: artist = artist.strip().lower() song = song.strip().lower() time_start: float = time.time() - lrc_obj: Optional[list[dict]] = None logging.info("Searching %s - %s on %s", artist, song, self.label) - input_track: str = f"{artist} - {song}" - returned_lyrics: str = "" - async with ClientSession() as client: - async with await client.get( - self.lrclib_url, - params={ - "artist_name": artist, - "track_name": song, - **({"duration": duration} if duration else {}), - }, - timeout=self.timeout, - headers=self.headers, - ) as request: - request.raise_for_status() - - text: Optional[str] = await request.text() - if not text: - raise InvalidLRCLibResponseException("No search response.") - if len(text) < 100: - raise InvalidLRCLibResponseException( - "Search response text was invalid (len < 100 chars.)" - ) - - search_data: Optional[Union[list, dict]] = await request.json() - if not isinstance(search_data, list | dict): - raise InvalidLRCLibResponseException("No JSON search data.") - - # logging.info("Search Data:\n%s", search_data) - - if not isinstance(search_data, list): - raise InvalidLRCLibResponseException("Invalid JSON.") - - # Filter by duration if provided - if duration: - search_data = [ - r - for r in search_data - if abs(r.get("duration", 0) - duration) <= 10 - ] - - if plain: - possible_matches = [ - ( - x, - f"{result.get('artistName')} - {result.get('trackName')}", - ) - for x, result in enumerate(search_data) - ] - else: - logging.info( - "Limiting possible matches to only those with non-null syncedLyrics" - ) - possible_matches = [ - ( - x, - f"{result.get('artistName')} - {result.get('trackName')}", - ) - for x, result in enumerate(search_data) - if isinstance(result["syncedLyrics"], str) - ] - - best_match = None - try: - match_result = self.matcher.find_best_match( - input_track, - possible_matches, # type: ignore - ) - if match_result: - best_match = match_result[0] - except: # noqa - pass - - if not best_match: - return - best_match_id = best_match[0] - - if not isinstance(search_data[best_match_id]["artistName"], str): - raise InvalidLRCLibResponseException( - f"Invalid JSON: Cannot find artistName key.\n{search_data}" - ) - - if not isinstance(search_data[best_match_id]["trackName"], str): - raise InvalidLRCLibResponseException( - f"Invalid JSON: Cannot find trackName key.\n{search_data}" - ) - - returned_artist: str = 
search_data[best_match_id]["artistName"] - returned_song: str = search_data[best_match_id]["trackName"] - if plain: - if not isinstance( - search_data[best_match_id]["plainLyrics"], str - ): - raise InvalidLRCLibResponseException( - f"Invalid JSON: Cannot find plainLyrics key.\n{search_data}" - ) - returned_lyrics: str = search_data[best_match_id]["plainLyrics"] - returned_lyrics = self.datautils.scrub_lyrics(returned_lyrics) - else: - if not isinstance( - search_data[best_match_id]["syncedLyrics"], str - ): - raise InvalidLRCLibResponseException( - f"Invalid JSON: Cannot find syncedLyrics key.\n{search_data}" - ) - returned_lyrics: str = search_data[best_match_id][ - "syncedLyrics" - ] - lrc_obj = self.datautils.create_lrc_object(returned_lyrics) - returned_track: str = f"{returned_artist} - {returned_song}" - match_result = self.matcher.find_best_match( - input_track=input_track, candidate_tracks=[(0, returned_track)] + async with AsyncSessionLocal() as db: + best_match = None + + # Try exact match first (fastest) + result = await db.execute( + select( + Tracks.artist_name, + Tracks.name, + Lyrics.plain_lyrics, + Lyrics.synced_lyrics, ) - if not match_result: - return # No suitable match found - _matched, confidence = match_result - logging.info("Result found on %s", self.label) - time_end: float = time.time() - time_diff: float = time_end - time_start - matched = LyricsResult( - artist=returned_artist, - song=returned_song, - src=self.label, - lyrics=returned_lyrics if plain else lrc_obj, # type: ignore - confidence=confidence, - time=time_diff, + .join(Lyrics, Tracks.id == Lyrics.track_id) + .filter( + Tracks.artist_name_lower == artist, + Tracks.name_lower == song, ) - await self.redis_cache.increment_found_count(self.label) - if plain: - await self.cache.store(matched) - return matched + .limit(1) + ) + best_match = result.first() + + # If no exact match, try prefix match (faster than full ILIKE) + if not best_match: + result = await db.execute( + select( + Tracks.artist_name, + Tracks.name, + Lyrics.plain_lyrics, + Lyrics.synced_lyrics, + ) + .join(Lyrics, Tracks.id == Lyrics.track_id) + .filter( + Tracks.artist_name_lower.like(f"{artist}%"), + Tracks.name_lower.like(f"{song}%"), + ) + .limit(1) + ) + best_match = result.first() + + # If still no match, try full ILIKE (slowest) + if not best_match: + result = await db.execute( + select( + Tracks.artist_name, + Tracks.name, + Lyrics.plain_lyrics, + Lyrics.synced_lyrics, + ) + .join(Lyrics, Tracks.id == Lyrics.track_id) + .filter( + Tracks.artist_name_lower.ilike(f"%{artist}%"), + Tracks.name_lower.ilike(f"%{song}%"), + ) + .limit(1) + ) + best_match = result.first() + + if not best_match: + logging.info("No result found on %s", self.label) + return None + + returned_artist = best_match.artist_name + returned_song = best_match.name + + if plain: + if not best_match.plain_lyrics: + logging.info("No plain lyrics available on %s", self.label) + return None + returned_lyrics = best_match.plain_lyrics + returned_lyrics = self.datautils.scrub_lyrics(returned_lyrics) + lrc_obj = None + else: + if not best_match.synced_lyrics: + logging.info("No synced lyrics available on %s", self.label) + return None + returned_lyrics = best_match.synced_lyrics + lrc_obj = self.datautils.create_lrc_object(returned_lyrics) + + # Calculate match confidence + input_track = f"{artist} - {song}" + returned_track = f"{returned_artist} - {returned_song}" + match_result = self.matcher.find_best_match( + input_track=input_track, + candidate_tracks=[(0, 
returned_track)] + ) + + if not match_result: + return None + + _matched, confidence = match_result + + logging.info("Result found on %s", self.label) + time_end = time.time() + time_diff = time_end - time_start + + matched = LyricsResult( + artist=returned_artist, + song=returned_song, + src=self.label, + lyrics=returned_lyrics if plain else lrc_obj, # type: ignore + confidence=confidence, + time=time_diff, + ) + + await self.redis_cache.increment_found_count(self.label) + return matched + except Exception as e: - logging.debug("Exception: %s", str(e)) - traceback.print_exc() + logging.error("Exception in %s: %s", self.label, str(e)) + return None diff --git a/pgloader_config.load b/pgloader_config.load new file mode 100644 index 0000000..e69de29 diff --git a/postgres_schema.sql b/postgres_schema.sql new file mode 100644 index 0000000..e69de29
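Note: pgloader_config.load and postgres_schema.sql are added as empty placeholders in this patch; the actual pgloader definition and PostgreSQL DDL are not included here. As a rough sketch only, inferred from the Tracks/Lyrics models in lyric_search/models.py rather than taken from the real import, the cache schema could look something like the DDL below. Column types, index names, and the IF NOT EXISTS guards are assumptions, and the pgloader load file would presumably define the matching SQLite-to-PostgreSQL migration.

-- Sketch: possible starting point for postgres_schema.sql (empty in this patch).
-- Derived from lyric_search/models.py; types and index names are assumptions.
CREATE TABLE IF NOT EXISTS tracks (
    id BIGSERIAL PRIMARY KEY,
    name TEXT,
    name_lower TEXT,
    artist_name TEXT,
    artist_name_lower TEXT,
    album_name TEXT,
    album_name_lower TEXT,
    duration DOUBLE PRECISION,
    last_lyrics_id BIGINT,
    created_at TIMESTAMP,
    updated_at TIMESTAMP,
    CONSTRAINT uq_tracks UNIQUE (name_lower, artist_name_lower, album_name_lower, duration)
);

CREATE TABLE IF NOT EXISTS lyrics (
    id BIGSERIAL PRIMARY KEY,
    plain_lyrics TEXT,
    synced_lyrics TEXT,
    track_id BIGINT REFERENCES tracks (id),
    has_plain_lyrics BOOLEAN,
    has_synced_lyrics BOOLEAN,
    instrumental BOOLEAN,
    source TEXT,
    created_at TIMESTAMP,
    updated_at TIMESTAMP
);

-- Indexes backing the equality/LIKE lookups in lyric_search/sources/lrclib.py
CREATE INDEX IF NOT EXISTS idx_tracks_artist_name_lower ON tracks (artist_name_lower);
CREATE INDEX IF NOT EXISTS idx_tracks_name_lower ON tracks (name_lower);
CREATE INDEX IF NOT EXISTS idx_lyrics_track_id ON lyrics (track_id);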
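For a quick manual check of the refactored local-cache source, something like the script below should work. It is a sketch only: it assumes the lyric_search package from this patch is importable, and that the PostgreSQL cache (and Redis, used for the found-count counter) is reachable; the POSTGRES_* values are placeholders, not real deployment credentials.

# Sketch: exercise lyric_search.sources.lrclib.LRCLib directly.
# lyric_search.models builds the async engine at import time, so the
# POSTGRES_* environment variables must be set before that import.
import asyncio
import os

os.environ.setdefault("POSTGRES_HOST", "localhost")
os.environ.setdefault("POSTGRES_DB", "lrclib")
os.environ.setdefault("POSTGRES_USER", "api")
os.environ.setdefault("POSTGRES_PASSWORD", "changeme")  # placeholder only

from lyric_search.sources.lrclib import LRCLib  # noqa: E402


async def main() -> None:
    lrclib = LRCLib()
    # plain=True returns scrubbed plain lyrics; plain=False returns an LRC object.
    result = await lrclib.search(artist="eminem", song="rap god", plain=True)
    if result:
        print(result.src, result.confidence)
        print(result.lyrics[:200])
    else:
        print("No result found in the local cache.")


asyncio.run(main())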