import re
import time
import logging
from typing import Optional

from sqlalchemy import text
from sqlalchemy.future import select

from lyric_search import utils
from lyric_search.constructors import LyricsResult
from lyric_search.models import Tracks, Lyrics, AsyncSessionLocal
from . import redis_cache, cache

logger = logging.getLogger()
log_level = logging.getLevelName(logger.level)

# Escape character used for the SQL LIKE prefix pattern built in LRCLib.
_LIKE_ESCAPE = "/"


def normalize_for_search(s: str) -> str:
    """
    Normalize string for better matching.

    Removes common variations that cause exact match failures:
    parenthetical/bracketed suffixes, featured-artist credits, a leading
    "the", ampersands and punctuation.

    Args:
        s (str): the artist or song string to normalize
    Returns:
        str: lowercased, whitespace-collapsed form of the input
             (may be empty if the input held nothing but removable content)
    """
    s = s.lower().strip()
    # Remove parenthetical content: (Remastered), (feat. X), (2020 Remix), etc.
    s = re.sub(r"\s*\([^)]*\)\s*", " ", s)
    # Remove bracketed content: [Explicit], [Deluxe Edition], etc.
    s = re.sub(r"\s*\[[^\]]*\]\s*", " ", s)
    # Remove "feat.", "ft.", "featuring" and everything after.
    # The word boundary is required: without it the "ft" inside ordinary
    # words matched too ("daft punk" -> "da", "left behind" -> "le").
    s = re.sub(r"\s*\b(feat\.?|ft\.?|featuring)\s+.*$", "", s, flags=re.IGNORECASE)
    # Remove a leading "the " (applied to whatever string is passed in)
    s = re.sub(r"^the\s+", "", s)
    # Normalize & to "and"
    s = re.sub(r"\s*&\s*", " and ", s)
    # Remove punctuation except spaces (note: "_" is a word character and survives)
    s = re.sub(r"[^\w\s]", "", s)
    # Collapse multiple spaces
    s = re.sub(r"\s+", " ", s).strip()
    return s


def _escape_like(value: str, escape: str = _LIKE_ESCAPE) -> str:
    """Escape LIKE wildcards ("%", "_") and the escape character itself."""
    # The escape character must be handled first so the escapes added for
    # "%" and "_" are not themselves escaped again.
    for char in (escape, "%", "_"):
        value = value.replace(char, escape + char)
    return value


class LRCLib:
    """LRCLib Search Module - Local PostgreSQL Database"""

    def __init__(self) -> None:
        self.label: str = "LRCLib-Cache"
        self.datautils = utils.DataUtils()
        self.matcher = utils.TrackMatcher()
        self.redis_cache = redis_cache.RedisCache()
        self.cache = cache.Cache()

    @staticmethod
    def _lyrics_query(*conditions):
        """Build the single-row track + lyrics SELECT filtered by `conditions`."""
        return (
            select(
                Tracks.artist_name,
                Tracks.name,
                Lyrics.plain_lyrics,
                Lyrics.synced_lyrics,
            )
            .join(Lyrics, Tracks.id == Lyrics.track_id)
            .filter(*conditions)
            .limit(1)
        )

    async def _find_best_match(self, artist: str, song: str):
        """Run the lookup strategies in order; return the first row found or None."""
        artist_lower = artist.strip().lower()
        song_lower = song.strip().lower()

        async with AsyncSessionLocal() as db:
            # Strategy 1: Exact match on raw lowercase
            result = await db.execute(
                self._lyrics_query(
                    Tracks.artist_name_lower == artist_lower,
                    Tracks.name_lower == song_lower,
                )
            )
            best_match = result.first()
            if best_match:
                return best_match

            artist_norm = normalize_for_search(artist)
            song_norm = normalize_for_search(song)

            # Strategy 2: Exact match on normalized input. Skipped when
            # normalization changed nothing, since that would simply repeat
            # the query from strategy 1.
            if artist_norm != artist_lower or song_norm != song_lower:
                result = await db.execute(
                    self._lyrics_query(
                        Tracks.artist_name_lower == artist_norm,
                        Tracks.name_lower == song_norm,
                    )
                )
                best_match = result.first()
                if best_match:
                    return best_match

            # Strategy 3: Normalized artist with song prefix match.
            # An empty normalized title would produce the pattern "%" and
            # match an arbitrary track by the artist, so it is skipped.
            if not song_norm:
                return None

            # Escape LIKE wildcards so a literal "_" in the title is not
            # treated as "any single character".
            prefix_pattern = _escape_like(song_norm) + "%"
            result = await db.execute(
                self._lyrics_query(
                    Tracks.artist_name_lower == artist_norm,
                    Tracks.name_lower.like(prefix_pattern, escape=_LIKE_ESCAPE),
                )
            )
            return result.first()

    async def search(
        self,
        artist: str,
        song: str,
        plain: Optional[bool] = True,
        duration: Optional[int] = None,
        raw: bool = False,
    ) -> Optional[LyricsResult]:
        """
        LRCLib Local Database Search with normalization and smart fallback.

        Search strategy (first hit wins):
        1. Exact match on lowercased input
        2. Exact match on normalized input (see normalize_for_search)
        3. Exact match on normalized artist + prefix match on normalized song

        Args:
            artist (str): the artist to search
            song (str): the song to search
            plain (bool): return plain lyrics (True) or synced lyrics (False)
            duration (int): optional track duration; accepted for interface
                            compatibility but not used by this lookup
            raw (bool): return raw LRC string instead of parsed object (only for synced)
        Returns:
            Optional[LyricsResult]: The result, if found - None otherwise.
            None is also returned when the matched track lacks the requested
            lyrics type, or when any exception is raised (it is logged).
        """
        try:
            time_start: float = time.time()
            logging.info("Searching %s - %s on %s", artist, song, self.label)

            best_match = await self._find_best_match(artist, song)
            if not best_match:
                logging.info("No result found on %s", self.label)
                return None

            returned_artist = best_match.artist_name
            returned_song = best_match.name

            if plain:
                if not best_match.plain_lyrics:
                    logging.info("No plain lyrics available on %s", self.label)
                    return None
                returned_lyrics = self.datautils.scrub_lyrics(best_match.plain_lyrics)
                lrc_obj = None
            else:
                if not best_match.synced_lyrics:
                    logging.info("No synced lyrics available on %s", self.label)
                    return None
                returned_lyrics = best_match.synced_lyrics
                if raw:
                    lrc_obj = returned_lyrics
                else:
                    lrc_obj = self.datautils.create_lrc_object(returned_lyrics)

            # Calculate match confidence
            input_track = f"{artist} - {song}"
            returned_track = f"{returned_artist} - {returned_song}"
            match_result = self.matcher.find_best_match(
                input_track=input_track, candidate_tracks=[(0, returned_track)]
            )
            # Fall back to a fixed confidence when the matcher returns nothing
            confidence = match_result[1] if match_result else 85

            logging.info("Result found on %s", self.label)
            time_diff = time.time() - time_start
            matched = LyricsResult(
                artist=returned_artist,
                song=returned_song,
                src=self.label,
                lyrics=returned_lyrics if plain else lrc_obj,  # type: ignore
                confidence=confidence,
                time=time_diff,
            )
            await self.redis_cache.increment_found_count(self.label)
            # Store plain lyrics to Redis cache (like Genius does)
            if plain:
                await self.cache.store(matched)
            return matched
        except Exception as e:
            # Best-effort lookup: failures are logged and reported as "not found"
            logging.error("Exception in %s: %s", self.label, str(e))
            return None