208 lines
7.7 KiB
Python
208 lines
7.7 KiB
Python
import re
|
|
import time
|
|
import logging
|
|
from typing import Optional
|
|
from sqlalchemy import text
|
|
from sqlalchemy.future import select
|
|
from lyric_search import utils
|
|
from lyric_search.constructors import LyricsResult
|
|
from lyric_search.models import Tracks, Lyrics, AsyncSessionLocal
|
|
from . import redis_cache, cache
|
|
|
|
# Root logger (getLogger() with no name returns the root logger).
logger = logging.getLogger()

# Human-readable name of the root logger's level at import time.
log_level = logging.getLevelName(logger.level)
|
|
|
|
|
|
def normalize_for_search(s: str) -> str:
    """
    Normalize an artist or song string for more forgiving exact matching.

    Steps, applied in this order:
      1. lowercase and trim
      2. drop parenthesised segments: (Remastered), (feat. X), (2020 Remix)
      3. drop bracketed segments: [Explicit], [Deluxe Edition]
      4. drop a "feat." / "ft." / "featuring" credit and everything after it
      5. drop a leading "the "
      6. replace "&" with "and"
      7. strip remaining punctuation (underscores are kept, they match \\w)
      8. collapse runs of whitespace

    Args:
        s (str): raw artist or song name
    Returns:
        str: the normalized string; may be empty if the input consisted only
            of removable content (e.g. "(Intro)").
    """
    s = s.lower().strip()

    # Remove parenthetical content: (Remastered), (feat. X), (2020 Remix), etc.
    s = re.sub(r"\s*\([^)]*\)\s*", " ", s)

    # Remove bracketed content: [Explicit], [Deluxe Edition], etc.
    s = re.sub(r"\s*\[[^\]]*\]\s*", " ", s)

    # Remove "feat.", "ft.", "featuring" and everything after.
    # The \b is essential: without it the pattern also fires on the tail of
    # ordinary words ("daft punk" -> "da", "left behind" -> "le").
    s = re.sub(
        r"\s*\b(?:feat\.?|ft\.?|featuring)\s+.*$", "", s, flags=re.IGNORECASE
    )

    # Remove "The " prefix from artist names
    s = re.sub(r"^the\s+", "", s)

    # Normalize & to "and"
    s = re.sub(r"\s*&\s*", " and ", s)

    # Remove punctuation except spaces
    s = re.sub(r"[^\w\s]", "", s)

    # Collapse multiple spaces
    s = re.sub(r"\s+", " ", s).strip()

    return s
|
|
|
|
|
|
class LRCLib:
    """LRCLib Search Module - Local PostgreSQL Database"""

    def __init__(self) -> None:
        self.label: str = "LRCLib-Cache"
        self.datautils = utils.DataUtils()
        self.matcher = utils.TrackMatcher()
        self.redis_cache = redis_cache.RedisCache()
        self.cache = cache.Cache()

    @staticmethod
    async def _first_match(db, *criteria):
        """Run the shared track/lyrics SELECT with the given filter criteria.

        Returns the first row (artist_name, name, plain_lyrics, synced_lyrics)
        or None when nothing matches.
        """
        result = await db.execute(
            select(
                Tracks.artist_name,
                Tracks.name,
                Lyrics.plain_lyrics,
                Lyrics.synced_lyrics,
            )
            .join(Lyrics, Tracks.id == Lyrics.track_id)
            .filter(*criteria)
            .limit(1)
        )
        return result.first()

    async def search(
        self,
        artist: str,
        song: str,
        plain: Optional[bool] = True,
        duration: Optional[int] = None,
        raw: bool = False,
    ) -> Optional[LyricsResult]:
        """
        LRCLib Local Database Search with normalization and smart fallback.

        Search strategy (each step runs only if the previous found nothing):
            1. Exact match on the stripped, lowercased input
            2. Exact match on the normalized input (skipped when normalization
               changed neither artist nor song)
            3. Exact match on the normalized artist + prefix match on the
               normalized song (skipped when the normalized song is empty)

        Args:
            artist (str): the artist to search
            song (str): the song to search
            plain (bool): return plain lyrics (True) or synced lyrics (False)
            duration (int): accepted for interface compatibility; not
                currently used by this implementation
            raw (bool): return raw LRC string instead of parsed object (only for synced)
        Returns:
            Optional[LyricsResult]: The result, if found - None otherwise.
                None is also returned when the matched track lacks the
                requested lyrics type, or when any exception occurs (the
                exception is logged, not raised).
        """
        try:
            artist_lower = artist.strip().lower()
            song_lower = song.strip().lower()
            time_start: float = time.time()

            logging.info("Searching %s - %s on %s", artist, song, self.label)

            async with AsyncSessionLocal() as db:
                # Strategy 1: Exact match on raw lowercase (fastest)
                best_match = await self._first_match(
                    db,
                    Tracks.artist_name_lower == artist_lower,
                    Tracks.name_lower == song_lower,
                )

                if not best_match:
                    # Normalize once; both fallback strategies use the result.
                    artist_norm = normalize_for_search(artist)
                    song_norm = normalize_for_search(song)

                    # Strategy 2: Exact match on normalized input.
                    # Pointless to re-run if normalization changed nothing.
                    if artist_norm != artist_lower or song_norm != song_lower:
                        best_match = await self._first_match(
                            db,
                            Tracks.artist_name_lower == artist_norm,
                            Tracks.name_lower == song_norm,
                        )

                    # Strategy 3: Normalized artist with song prefix match.
                    # Catches cases like "Song (Remastered)" when DB has "Song".
                    # An empty prefix would match every track by the artist and
                    # return an arbitrary song, so it is skipped. autoescape
                    # keeps "_" (which survives normalization) from acting as
                    # a LIKE wildcard.
                    if not best_match and song_norm:
                        best_match = await self._first_match(
                            db,
                            Tracks.artist_name_lower == artist_norm,
                            Tracks.name_lower.startswith(
                                song_norm, autoescape=True
                            ),
                        )

            # The row holds plain column values, so the session is no longer
            # needed from here on.
            if not best_match:
                logging.info("No result found on %s", self.label)
                return None

            returned_artist = best_match.artist_name
            returned_song = best_match.name

            if plain:
                if not best_match.plain_lyrics:
                    logging.info("No plain lyrics available on %s", self.label)
                    return None
                lyrics = self.datautils.scrub_lyrics(best_match.plain_lyrics)
            else:
                if not best_match.synced_lyrics:
                    logging.info("No synced lyrics available on %s", self.label)
                    return None
                synced = best_match.synced_lyrics
                # raw=True hands back the LRC text untouched; otherwise parse it.
                lyrics = synced if raw else self.datautils.create_lrc_object(synced)

            # Calculate match confidence
            input_track = f"{artist} - {song}"
            returned_track = f"{returned_artist} - {returned_song}"
            match_result = self.matcher.find_best_match(
                input_track=input_track, candidate_tracks=[(0, returned_track)]
            )

            # Fall back to a fixed confidence when the matcher returns nothing.
            confidence = match_result[1] if match_result else 85

            logging.info("Result found on %s", self.label)
            time_diff = time.time() - time_start

            matched = LyricsResult(
                artist=returned_artist,
                song=returned_song,
                src=self.label,
                lyrics=lyrics,  # type: ignore
                confidence=confidence,
                time=time_diff,
            )

            await self.redis_cache.increment_found_count(self.label)
            # Only plain-lyrics results are written to the cache
            if plain:
                await self.cache.store(matched)
            return matched

        except Exception as e:
            # Best-effort source: log and report "not found" rather than raise.
            logging.error("Exception in %s: %s", self.label, str(e))
            return None
|