# api/lyric_search/sources/lrclib.py

import re
import time
import logging
from typing import Optional
from sqlalchemy import text
from sqlalchemy.future import select
from lyric_search import utils
from lyric_search.constructors import LyricsResult
from lyric_search.models import Tracks, Lyrics, AsyncSessionLocal
from . import redis_cache, cache

# getLogger() with no name returns the root logger, i.e. the same logger the
# module-level logging.info()/logging.error() helpers write to.
logger = logging.getLogger()
# Textual name of the root logger's level as it was when this module was
# imported (later level changes are not reflected). Not referenced elsewhere in
# the visible part of this file.
log_level = logging.getLevelName(logger.level)


def normalize_for_search(s: str) -> str:
    """
    Normalize an artist or song string for better matching.

    Removes common variations that cause exact match failures. Steps, in order:
    lowercase and trim; drop "(...)" and "[...]" groups; drop a trailing
    "feat."/"ft."/"featuring" credit; drop a leading "the "; turn "&" into
    "and"; strip remaining punctuation; collapse whitespace.

    Args:
        s (str): the raw artist or song string
    Returns:
        str: the normalized string (may be empty, e.g. for "(Intro)")
    """
    s = s.lower().strip()

    # Remove parenthetical content: (Remastered), (feat. X), (2020 Remix), etc.
    s = re.sub(r"\s*\([^)]*\)\s*", " ", s)

    # Remove bracketed content: [Explicit], [Deluxe Edition], etc.
    s = re.sub(r"\s*\[[^\]]*\]\s*", " ", s)

    # Remove "feat.", "ft.", "featuring" and everything after.
    # The \b is required: without it the pattern also fires in the middle of
    # ordinary words ("defeat the ..." -> "de", "gift of ..." -> "gi").
    s = re.sub(
        r"\s*\b(?:feat\.?|ft\.?|featuring)\s+.*$", "", s, flags=re.IGNORECASE
    )

    # Bracket removal can leave a leading space ("(Live) The X" -> " the x"),
    # which would stop the anchored "the " pattern below from matching.
    s = s.strip()

    # Remove "The " prefix from artist names
    s = re.sub(r"^the\s+", "", s)

    # Normalize & to "and"
    s = re.sub(r"\s*&\s*", " and ", s)

    # Remove punctuation except spaces (underscore counts as \w and is kept)
    s = re.sub(r"[^\w\s]", "", s)

    # Collapse multiple spaces
    s = re.sub(r"\s+", " ", s).strip()

    return s


class LRCLib:
    """LRCLib Search Module - Local PostgreSQL Database"""

    def __init__(self) -> None:
        # Source label used in log messages, in the returned LyricsResult and
        # as the key passed to the Redis found-counter.
        self.label: str = "LRCLib-Cache"
        self.datautils = utils.DataUtils()
        self.matcher = utils.TrackMatcher()
        self.redis_cache = redis_cache.RedisCache()
        self.cache = cache.Cache()

    @staticmethod
    async def _first_match(db, *criteria):
        """
        Run the shared Tracks/Lyrics lookup with the given filter criteria.

        Args:
            db: an open session obtained from AsyncSessionLocal
            *criteria: SQLAlchemy filter expressions, combined by .filter()
        Returns:
            The first row (artist_name, name, plain_lyrics, synced_lyrics),
            or None when nothing matched.
        """
        result = await db.execute(
            select(
                Tracks.artist_name,
                Tracks.name,
                Lyrics.plain_lyrics,
                Lyrics.synced_lyrics,
            )
            .join(Lyrics, Tracks.id == Lyrics.track_id)
            .filter(*criteria)
            .limit(1)
        )
        return result.first()

    async def _find_track(self, artist: str, song: str):
        """
        Try each lookup strategy in turn and return the first matching row.

        The session is opened and closed here so that no database connection
        is held while the caller post-processes lyrics or talks to the caches.
        Only column values are selected (not ORM entities), so the returned
        row is expected to stay readable after the session closes.

        Returns:
            The matching row, or None when no strategy produced one.
        """
        artist_lower = artist.strip().lower()
        song_lower = song.strip().lower()

        async with AsyncSessionLocal() as db:
            # Strategy 1: Exact match on raw lowercase
            row = await self._first_match(
                db,
                Tracks.artist_name_lower == artist_lower,
                Tracks.name_lower == song_lower,
            )
            if row is not None:
                return row

            artist_norm = normalize_for_search(artist)
            song_norm = normalize_for_search(song)

            # Normalization can strip a name down to nothing (e.g. "(Intro)").
            # An empty song would turn the prefix match below into LIKE '%',
            # which matches every track by the artist, so give up instead.
            if not artist_norm or not song_norm:
                return None

            # Strategy 2: Exact match on normalized input. Skipped when
            # normalization changed nothing, as it would repeat strategy 1.
            if artist_norm != artist_lower or song_norm != song_lower:
                row = await self._first_match(
                    db,
                    Tracks.artist_name_lower == artist_norm,
                    Tracks.name_lower == song_norm,
                )
                if row is not None:
                    return row

            # Strategy 3: Normalized artist with song prefix match
            # Catches cases like "Song (Remastered)" when DB has "Song"
            return await self._first_match(
                db,
                Tracks.artist_name_lower == artist_norm,
                Tracks.name_lower.like(f"{song_norm}%"),
            )

    async def search(
        self,
        artist: str,
        song: str,
        plain: Optional[bool] = True,
        duration: Optional[int] = None,
        raw: bool = False,
    ) -> Optional[LyricsResult]:
        """
        LRCLib Local Database Search with normalization and smart fallback.

        Search strategy (first hit wins):
        1. Exact match on lowercased, trimmed input
        2. Exact match on normalized input (see normalize_for_search)
        3. Exact match on normalized artist + prefix match on normalized song

        Strategies 2 and 3 are skipped when normalization leaves the artist or
        the song empty.

        Args:
            artist (str): the artist to search
            song (str): the song to search
            plain (bool): return plain lyrics (True) or synced lyrics (False)
            duration (int): accepted for interface compatibility; currently
                not used by the lookup
            raw (bool): return raw LRC string instead of parsed object (only for synced)
        Returns:
            Optional[LyricsResult]: The result, if found - None otherwise.
            None is also returned when the matched track lacks the requested
            lyrics type, or when any exception occurs (it is logged, not raised).
        """
        try:
            time_start: float = time.time()

            logging.info("Searching %s - %s on %s", artist, song, self.label)

            best_match = await self._find_track(artist, song)

            if best_match is None:
                logging.info("No result found on %s", self.label)
                return None

            returned_artist = best_match.artist_name
            returned_song = best_match.name

            if plain:
                if not best_match.plain_lyrics:
                    logging.info("No plain lyrics available on %s", self.label)
                    return None
                lyrics = self.datautils.scrub_lyrics(best_match.plain_lyrics)
            else:
                if not best_match.synced_lyrics:
                    logging.info("No synced lyrics available on %s", self.label)
                    return None
                synced = best_match.synced_lyrics
                # raw=True hands back the LRC text untouched; otherwise it is
                # converted via DataUtils.create_lrc_object.
                lyrics = synced if raw else self.datautils.create_lrc_object(synced)

            # Calculate match confidence
            input_track = f"{artist} - {song}"
            returned_track = f"{returned_artist} - {returned_song}"
            match_result = self.matcher.find_best_match(
                input_track=input_track, candidate_tracks=[(0, returned_track)]
            )

            # Fall back to a fixed confidence of 85 when the matcher returns
            # nothing for the single candidate.
            confidence = match_result[1] if match_result else 85

            logging.info("Result found on %s", self.label)
            time_diff = time.time() - time_start

            matched = LyricsResult(
                artist=returned_artist,
                song=returned_song,
                src=self.label,
                lyrics=lyrics,  # type: ignore
                confidence=confidence,
                time=time_diff,
            )

            await self.redis_cache.increment_found_count(self.label)
            # Only plain-lyrics results are written to the cache.
            if plain:
                await self.cache.store(matched)
            return matched

        except Exception as e:
            # Best-effort source: never propagate, but keep the traceback so
            # failures can be diagnosed from the log.
            logging.exception("Exception in %s: %s", self.label, str(e))
            return None