More progress re: #34

- Change of direction: LRCLib searches from /lyric/search now use an internal cache, which is a PostgreSQL import of the LRCLib SQLite database. The switch to PostgreSQL was made for performance.
2025-11-22 13:13:03 -05:00
parent c302b256d3
commit dcc6c7b24e
10 changed files with 240 additions and 398 deletions
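Note: the actual SQLite-to-PostgreSQL import in this commit is driven by the new pgloader_config.load and postgres_schema.sql files (contents not shown below). Purely as an illustration of the idea, a one-off copy of the LRCLib SQLite dump into PostgreSQL could look like the Python sketch below; the target URL, credentials, and batch size are assumptions, not values from the repository.

# Illustrative only: bulk-copy the LRCLib SQLite dump into PostgreSQL.
# Assumes the target schema exists without FK constraints yet (constraints
# and indexes are added after the load) and assumes local credentials;
# the commit itself performs the import via pgloader.
import sqlite3

from sqlalchemy import create_engine, text

SQLITE_PATH = "/nvme/sqlite_dbs/lrclib.db"  # source path used by the removed endpoint
PG_URL = "postgresql+psycopg2://api:CHANGE_ME@localhost:5432/lrclib"  # assumed target
BATCH = 5000

TABLES = {
    "tracks": [
        "id", "name", "name_lower", "artist_name", "artist_name_lower",
        "album_name", "album_name_lower", "duration", "last_lyrics_id",
        "created_at", "updated_at",
    ],
    "lyrics": [
        "id", "plain_lyrics", "synced_lyrics", "track_id", "has_plain_lyrics",
        "has_synced_lyrics", "instrumental", "source", "created_at", "updated_at",
    ],
}


def copy_table(src: sqlite3.Connection, dst, table: str, columns: list[str]) -> None:
    """Stream one table from SQLite into PostgreSQL in batches."""
    cols = ", ".join(columns)
    params = ", ".join(f":{c}" for c in columns)
    insert = text(f"INSERT INTO {table} ({cols}) VALUES ({params})")
    cursor = src.execute(f"SELECT {cols} FROM {table}")
    while rows := cursor.fetchmany(BATCH):
        dst.execute(insert, [dict(zip(columns, row)) for row in rows])


if __name__ == "__main__":
    source = sqlite3.connect(SQLITE_PATH)
    with create_engine(PG_URL).begin() as target:  # one transaction for the whole load
        for table_name, cols in TABLES.items():
            copy_table(source, target, table_name, cols)
    source.close()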

View File

@@ -108,7 +108,6 @@ routes: dict = {
"lyrics": importlib.import_module("endpoints.lyric_search").LyricSearch( "lyrics": importlib.import_module("endpoints.lyric_search").LyricSearch(
app, util, constants app, util, constants
), ),
"lrclib": importlib.import_module("endpoints.lrclib").LRCLib(app, util, constants),
"yt": importlib.import_module("endpoints.yt").YT(app, util, constants), "yt": importlib.import_module("endpoints.yt").YT(app, util, constants),
"radio": importlib.import_module("endpoints.radio").Radio( "radio": importlib.import_module("endpoints.radio").Radio(
app, util, constants, loop app, util, constants, loop

View File

@@ -110,25 +110,6 @@ class ValidLyricRequest(BaseModel):
     }
 
 
-class ValidLRCLibRequest(BaseModel):
-    """
-    Request model for lyric search.
-
-    Attributes:
-    - **artist** (str): Artist.
-    - **song** (str): Song.
-    - **duration** (Optional[int]): Optional duration.
-    """
-
-    artist: Optional[str] = None
-    song: Optional[str] = None
-    duration: Optional[int] = None
-
-    model_config = {
-        "json_schema_extra": {"examples": [{"artist": "eminem", "song": "rap god"}]}
-    }
-
-
 class ValidTypeAheadRequest(BaseModel):
     """
     Request model for typeahead query.

View File

@@ -1,199 +0,0 @@
-import urllib.parse
-from fastapi import FastAPI, HTTPException, Depends
-from fastapi_throttle import RateLimiter
-from fastapi.responses import JSONResponse
-from typing import Type, Optional
-from sqlalchemy import (
-    and_,
-    true,
-    Column,
-    Integer,
-    String,
-    Float,
-    Boolean,
-    DateTime,
-    ForeignKey,
-    UniqueConstraint,
-    create_engine,
-)
-from sqlalchemy.orm import Session, relationship
-from sqlalchemy.ext.declarative import declarative_base, DeclarativeMeta
-from sqlalchemy.orm import sessionmaker
-from .constructors import ValidLRCLibRequest
-from lyric_search.constructors import LRCLibResult
-from lyric_search import notifier
-from sqlalchemy.orm import foreign
-
-Base: Type[DeclarativeMeta] = declarative_base()
-
-
-class Tracks(Base):  # type: ignore
-    __tablename__ = "tracks"
-    id = Column(Integer, primary_key=True, autoincrement=True)
-    name = Column(String)
-    name_lower = Column(String, index=True)
-    artist_name = Column(String)
-    artist_name_lower = Column(String, index=True)
-    album_name = Column(String)
-    album_name_lower = Column(String, index=True)
-    duration = Column(Float, index=True)
-    last_lyrics_id = Column(Integer, ForeignKey("lyrics.id"), index=True)
-    created_at = Column(DateTime)
-    updated_at = Column(DateTime)
-
-    # Relationships
-    lyrics = relationship(
-        "Lyrics",
-        back_populates="track",
-        foreign_keys=[last_lyrics_id],
-        primaryjoin="Tracks.id == foreign(Lyrics.track_id)",  # Use string reference for Lyrics
-    )
-
-    # Constraints
-    __table_args__ = (
-        UniqueConstraint(
-            "name_lower",
-            "artist_name_lower",
-            "album_name_lower",
-            "duration",
-            name="uq_tracks",
-        ),
-    )
-
-
-class Lyrics(Base):  # type: ignore
-    __tablename__ = "lyrics"
-    id = Column(Integer, primary_key=True, autoincrement=True)
-    plain_lyrics = Column(String)
-    synced_lyrics = Column(String)
-    track_id = Column(Integer, ForeignKey("tracks.id"), index=True)
-    has_plain_lyrics = Column(Boolean, index=True)
-    has_synced_lyrics = Column(Boolean, index=True)
-    instrumental = Column(Boolean)
-    source = Column(String, index=True)
-    created_at = Column(DateTime, index=True)
-    updated_at = Column(DateTime)
-
-    # Relationships
-    track = relationship(
-        "Tracks",
-        back_populates="lyrics",
-        foreign_keys=[track_id],
-        primaryjoin=(Tracks.id == foreign(track_id)),
-        remote_side=Tracks.id,
-    )
-
-
-DATABASE_URL: str = "sqlite:////nvme/sqlite_dbs/lrclib.db"
-engine = create_engine(DATABASE_URL, connect_args={"check_same_thread": False})
-SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
-
-
-def get_db():
-    db = SessionLocal()
-    try:
-        yield db
-    finally:
-        db.close()
-
-
-"""
-TODO:
-- Move retrieval to lyric_search.sources, with separate file for DB Model
-"""
-
-
-class LRCLib(FastAPI):
-    """
-    LRCLib Cache Search Endpoint
-    """
-
-    def __init__(self, app: FastAPI, util, constants) -> None:
-        """Initialize LyricSearch endpoints."""
-        self.app: FastAPI = app
-        self.util = util
-        self.constants = constants
-        self.declarative_base = declarative_base()
-        self.notifier = notifier.DiscordNotifier()
-
-        self.endpoints: dict = {
-            "lrclib/search": self.lyric_search_handler,
-        }
-
-        for endpoint, handler in self.endpoints.items():
-            times: int = 20
-            seconds: int = 2
-            rate_limit: tuple[int, int] = (2, 3)  # Default; (Times, Seconds)
-            (times, seconds) = rate_limit
-
-            app.add_api_route(
-                f"/{endpoint}",
-                handler,
-                methods=["POST"],
-                include_in_schema=True,
-                dependencies=[Depends(RateLimiter(times=times, seconds=seconds))],
-            )
-
-    async def lyric_search_handler(
-        self, data: ValidLRCLibRequest, db: Session = Depends(get_db)
-    ) -> JSONResponse:
-        """
-        Search for lyrics.
-
-        Parameters:
-        - **data** (ValidLRCLibRequest): Request containing artist, song, and other parameters.
-
-        Returns:
-        - **JSONResponse**: LRCLib data or error.
-        """
-        if not data.artist or not data.song:
-            raise HTTPException(detail="Invalid request", status_code=500)
-
-        search_artist: str = urllib.parse.unquote(data.artist).lower()
-        search_song: str = urllib.parse.unquote(data.song).lower()
-        search_duration: Optional[int] = data.duration
-
-        if not isinstance(search_artist, str) or not isinstance(search_song, str):
-            return JSONResponse(
-                status_code=500,
-                content={
-                    "err": True,
-                    "errorText": "Invalid request",
-                },
-            )
-
-        query = (
-            db.query(
-                Tracks.id.label("id"),
-                Tracks.artist_name.label("artist"),
-                Tracks.name.label("song"),
-                Lyrics.plain_lyrics.label("plainLyrics"),
-                Lyrics.synced_lyrics.label("syncedLyrics"),
-            )
-            .join(Lyrics, Tracks.id == Lyrics.track_id)
-            .filter(
-                and_(
-                    Tracks.artist_name_lower == search_artist,
-                    Tracks.name == search_song,
-                    Tracks.duration == search_duration if search_duration else true(),
-                )
-            )
-        )
-
-        db_result = query.first()
-
-        if not db_result:
-            return JSONResponse(
-                status_code=404, content={"err": True, "errorText": "No result found."}
-            )
-
-        result = LRCLibResult(
-            id=db_result.id,
-            artist=db_result.artist,
-            song=db_result.song,
-            plainLyrics=db_result.plainLyrics,
-            syncedLyrics=db_result.syncedLyrics,
-        )
-
-        return JSONResponse(content=vars(result))

View File

@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Union, Optional
+from typing import Union
 
 
 @dataclass
@@ -22,25 +22,6 @@ class LyricsResult:
     time: float = 0.00
 
 
-@dataclass
-class LRCLibResult:
-    """
-    Class for returned Lyrics Results
-    Attributes:
-        id (int): returned id
-        artist (str): returned artist
-        song (str): returned song
-        plainLyrics (str): returned (plain) lyrics
-        syncedLyrics (str): returned synchronizedLyrics
-    """
-
-    id: int
-    artist: str
-    song: str
-    plainLyrics: Optional[str] = None
-    syncedLyrics: Optional[str] = None
-
-
 """
 Generic
 """

110
lyric_search/models.py Normal file
View File

@@ -0,0 +1,110 @@
"""
Database models for LRCLib lyrics cache.
"""
import os
import urllib.parse
from typing import Type, AsyncGenerator
from sqlalchemy import (
Column,
Integer,
String,
Float,
Boolean,
DateTime,
ForeignKey,
UniqueConstraint,
)
from sqlalchemy.orm import relationship, foreign
from sqlalchemy.ext.declarative import declarative_base, DeclarativeMeta
from sqlalchemy.ext.asyncio import AsyncEngine, create_async_engine, AsyncSession
from sqlalchemy.ext.asyncio import async_sessionmaker
Base: Type[DeclarativeMeta] = declarative_base()
class Tracks(Base): # type: ignore
"""Tracks table - stores track metadata."""
__tablename__ = "tracks"
id = Column(Integer, primary_key=True, autoincrement=True)
name = Column(String, index=True)
name_lower = Column(String, index=True)
artist_name = Column(String, index=True)
artist_name_lower = Column(String, index=True)
album_name = Column(String)
album_name_lower = Column(String, index=True)
duration = Column(Float, index=True)
last_lyrics_id = Column(Integer, ForeignKey("lyrics.id"), index=True)
created_at = Column(DateTime)
updated_at = Column(DateTime)
# Relationships
lyrics = relationship(
"Lyrics",
back_populates="track",
foreign_keys=[last_lyrics_id],
primaryjoin="Tracks.id == foreign(Lyrics.track_id)",
)
# Constraints
__table_args__ = (
UniqueConstraint(
"name_lower",
"artist_name_lower",
"album_name_lower",
"duration",
name="uq_tracks",
),
)
class Lyrics(Base): # type: ignore
"""Lyrics table - stores lyrics content."""
__tablename__ = "lyrics"
id = Column(Integer, primary_key=True, autoincrement=True)
plain_lyrics = Column(String)
synced_lyrics = Column(String)
track_id = Column(Integer, ForeignKey("tracks.id"), index=True)
has_plain_lyrics = Column(Boolean, index=True)
has_synced_lyrics = Column(Boolean, index=True)
instrumental = Column(Boolean)
source = Column(String, index=True)
created_at = Column(DateTime, index=True)
updated_at = Column(DateTime)
# Relationships
track = relationship(
"Tracks",
back_populates="lyrics",
foreign_keys=[track_id],
primaryjoin=(Tracks.id == foreign(track_id)),
remote_side=Tracks.id,
)
# PostgreSQL connection - using environment variables
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "localhost")
POSTGRES_PORT = os.getenv("POSTGRES_PORT", "5432")
POSTGRES_DB = os.getenv("POSTGRES_DB", "lrclib")
POSTGRES_USER = os.getenv("POSTGRES_USER", "api")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "")
# URL-encode the password to handle special characters
encoded_password = urllib.parse.quote_plus(POSTGRES_PASSWORD)
DATABASE_URL: str = f"postgresql+asyncpg://{POSTGRES_USER}:{encoded_password}@{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}"
async_engine: AsyncEngine = create_async_engine(
DATABASE_URL,
pool_size=20,
max_overflow=10,
pool_pre_ping=True,
echo=False
)
AsyncSessionLocal = async_sessionmaker(bind=async_engine, expire_on_commit=False)
async def get_async_db():
"""Get async database session."""
async with AsyncSessionLocal() as session:
yield session
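For orientation, here is a minimal usage sketch of the session factory and models added above. It assumes the PostgreSQL cache is populated and the POSTGRES_* environment variables are set; the lookup() helper is hypothetical and not part of the commit.

import asyncio

from sqlalchemy import select

from lyric_search.models import AsyncSessionLocal, Lyrics, Tracks


async def lookup(artist: str, song: str) -> None:
    """Print the first cached row matching an exact lowercase artist/title pair."""
    async with AsyncSessionLocal() as session:
        result = await session.execute(
            select(Tracks.artist_name, Tracks.name, Lyrics.plain_lyrics)
            .join(Lyrics, Tracks.id == Lyrics.track_id)
            .filter(
                Tracks.artist_name_lower == artist.strip().lower(),
                Tracks.name_lower == song.strip().lower(),
            )
            .limit(1)
        )
        row = result.first()
        if row is None:
            print("no cached match")
        else:
            print(f"{row.artist_name} - {row.name}")


if __name__ == "__main__":
    asyncio.run(lookup("eminem", "rap god"))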

View File

@@ -14,9 +14,7 @@ class Aggregate:
     Aggregate all source methods
     """
 
-    def __init__(self, exclude_methods=None) -> None:
-        if not exclude_methods:
-            exclude_methods: list = []
+    def __init__(self, exclude_methods: list = []) -> None:
         self.exclude_methods = exclude_methods
         self.redis_cache = redis_cache.RedisCache()
         self.notifier = notifier.DiscordNotifier()
@@ -70,14 +68,14 @@ class Aggregate:
             if plain:  # do not record LRC fails
                 try:
                     await self.redis_cache.increment_found_count("failed")
-                    self.notifier.send(
+                    await self.notifier.send(
                         "WARNING",
                         f"Could not find {artist} - {song} via queried sources.",
                     )
                 except Exception as e:
                     traceback.print_exc()
                     logging.info("Could not increment redis failed counter: %s", str(e))
-                    self.notifier.send(
+                    await self.notifier.send(
                         f"ERROR @ {__file__.rsplit('/', maxsplit=1)[-1]}",
                         f"Could not increment redis failed counter: {str(e)}",
                     )

View File

@@ -45,11 +45,11 @@ class Genius:
             Optional[LyricsResult]: The result, if found - None otherwise.
         """
         try:
-            artist = artist.strip().lower()
-            song = song.strip().lower()
+            artist_name = artist.strip().lower()
+            song_name = song.strip().lower()
             time_start: float = time.time()
-            logging.info("Searching %s - %s on %s", artist, song, self.label)
-            search_term: str = f"{artist}%20{song}"
+            logging.info("Searching %s - %s on %s", artist_name, song_name, self.label)
+            search_term: str = f"{artist_name}%20{song_name}"
             returned_lyrics: str = ""
             async with ClientSession() as client:
                 async with client.get(
@@ -100,10 +100,13 @@
                         )
                         for returned in possible_matches
                     ]
-                    searched: str = f"{artist} - {song}"
-                    best_match: tuple = self.matcher.find_best_match(
+                    searched: str = f"{artist_name} - {song_name}"
+                    best_match: Optional[tuple] = self.matcher.find_best_match(
                         input_track=searched, candidate_tracks=to_scrape
                     )
+                    if not best_match:
+                        raise InvalidGeniusResponseException("No matching result")
+
                     logging.info("To scrape: %s", to_scrape)
                     ((scrape_stub, track), confidence) = best_match
                     scrape_url: str = f"{self.genius_url}{scrape_stub[1:]}"
@@ -157,8 +160,8 @@
                                 returned_lyrics: str = self.datautils.scrub_lyrics(
                                     returned_lyrics
                                 )
-                                artist: str = track.split(" - ", maxsplit=1)[0]
-                                song: str = track.split(" - ", maxsplit=1)[1]
+                                artist = track.split(" - ", maxsplit=1)[0]
+                                song = track.split(" - ", maxsplit=1)[1]
                                 logging.info("Result found on %s", self.label)
                                 time_end: float = time.time()
                                 time_diff: float = time_end - time_start

View File

@@ -1,33 +1,25 @@
 import time
-import traceback
 import logging
-from typing import Optional, Union
-from aiohttp import ClientTimeout, ClientSession
-from tenacity import retry, stop_after_attempt, wait_fixed
+from typing import Optional
+from sqlalchemy.future import select
 from lyric_search import utils
 from lyric_search.constructors import LyricsResult
-from . import common, cache, redis_cache
-from lyric_search.constructors import InvalidLRCLibResponseException
+from lyric_search.models import Tracks, Lyrics, AsyncSessionLocal
+from . import redis_cache
 
 logger = logging.getLogger()
 log_level = logging.getLevelName(logger.level)
 
 
 class LRCLib:
-    """LRCLib Search Module"""
+    """LRCLib Search Module - Local PostgreSQL Database"""
 
     def __init__(self) -> None:
-        self.label: str = "LRCLib"
-        self.lrclib_url: str = "https://lrclib.net/api/search"
-        self.headers: dict = common.SCRAPE_HEADERS
-        self.timeout = ClientTimeout(connect=3, sock_read=8)
+        self.label: str = "LRCLib-Cache"
         self.datautils = utils.DataUtils()
         self.matcher = utils.TrackMatcher()
-        self.cache = cache.Cache()
         self.redis_cache = redis_cache.RedisCache()
 
-    @retry(stop=stop_after_attempt(2), wait=wait_fixed(0.5))
     async def search(
         self,
         artist: str,
@@ -36,10 +28,12 @@ class LRCLib:
         duration: Optional[int] = None,
     ) -> Optional[LyricsResult]:
         """
-        LRCLib Search
+        LRCLib Local Database Search
         Args:
             artist (str): the artist to search
            song (str): the song to search
+            plain (bool): return plain lyrics (True) or synced lyrics (False)
+            duration (int): optional track duration for better matching
         Returns:
             Optional[LyricsResult]: The result, if found - None otherwise.
         """
@@ -47,128 +41,103 @@ class LRCLib:
             artist = artist.strip().lower()
             song = song.strip().lower()
             time_start: float = time.time()
-            lrc_obj: Optional[list[dict]] = None
             logging.info("Searching %s - %s on %s", artist, song, self.label)
-            input_track: str = f"{artist} - {song}"
-            returned_lyrics: str = ""
-            async with ClientSession() as client:
-                async with await client.get(
-                    self.lrclib_url,
-                    params={
-                        "artist_name": artist,
-                        "track_name": song,
-                        **({"duration": duration} if duration else {}),
-                    },
-                    timeout=self.timeout,
-                    headers=self.headers,
-                ) as request:
-                    request.raise_for_status()
-                    text: Optional[str] = await request.text()
-                    if not text:
-                        raise InvalidLRCLibResponseException("No search response.")
-                    if len(text) < 100:
-                        raise InvalidLRCLibResponseException(
-                            "Search response text was invalid (len < 100 chars.)"
-                        )
-                    search_data: Optional[Union[list, dict]] = await request.json()
-                    if not isinstance(search_data, list | dict):
-                        raise InvalidLRCLibResponseException("No JSON search data.")
-                    # logging.info("Search Data:\n%s", search_data)
-                    if not isinstance(search_data, list):
-                        raise InvalidLRCLibResponseException("Invalid JSON.")
-                    # Filter by duration if provided
-                    if duration:
-                        search_data = [
-                            r
-                            for r in search_data
-                            if abs(r.get("duration", 0) - duration) <= 10
-                        ]
-                    if plain:
-                        possible_matches = [
-                            (
-                                x,
-                                f"{result.get('artistName')} - {result.get('trackName')}",
-                            )
-                            for x, result in enumerate(search_data)
-                        ]
-                    else:
-                        logging.info(
-                            "Limiting possible matches to only those with non-null syncedLyrics"
-                        )
-                        possible_matches = [
-                            (
-                                x,
-                                f"{result.get('artistName')} - {result.get('trackName')}",
-                            )
-                            for x, result in enumerate(search_data)
-                            if isinstance(result["syncedLyrics"], str)
-                        ]
-                    best_match = None
-                    try:
-                        match_result = self.matcher.find_best_match(
-                            input_track,
-                            possible_matches,  # type: ignore
-                        )
-                        if match_result:
-                            best_match = match_result[0]
-                    except:  # noqa
-                        pass
-                    if not best_match:
-                        return
-                    best_match_id = best_match[0]
-                    if not isinstance(search_data[best_match_id]["artistName"], str):
-                        raise InvalidLRCLibResponseException(
-                            f"Invalid JSON: Cannot find artistName key.\n{search_data}"
-                        )
-                    if not isinstance(search_data[best_match_id]["trackName"], str):
-                        raise InvalidLRCLibResponseException(
-                            f"Invalid JSON: Cannot find trackName key.\n{search_data}"
-                        )
-                    returned_artist: str = search_data[best_match_id]["artistName"]
-                    returned_song: str = search_data[best_match_id]["trackName"]
-                    if plain:
-                        if not isinstance(
-                            search_data[best_match_id]["plainLyrics"], str
-                        ):
-                            raise InvalidLRCLibResponseException(
-                                f"Invalid JSON: Cannot find plainLyrics key.\n{search_data}"
-                            )
-                        returned_lyrics: str = search_data[best_match_id]["plainLyrics"]
-                        returned_lyrics = self.datautils.scrub_lyrics(returned_lyrics)
-                    else:
-                        if not isinstance(
-                            search_data[best_match_id]["syncedLyrics"], str
-                        ):
-                            raise InvalidLRCLibResponseException(
-                                f"Invalid JSON: Cannot find syncedLyrics key.\n{search_data}"
-                            )
-                        returned_lyrics: str = search_data[best_match_id][
-                            "syncedLyrics"
-                        ]
-                        lrc_obj = self.datautils.create_lrc_object(returned_lyrics)
-                    returned_track: str = f"{returned_artist} - {returned_song}"
-                    match_result = self.matcher.find_best_match(
-                        input_track=input_track, candidate_tracks=[(0, returned_track)]
-                    )
-                    if not match_result:
-                        return  # No suitable match found
-                    _matched, confidence = match_result
-                    logging.info("Result found on %s", self.label)
-                    time_end: float = time.time()
-                    time_diff: float = time_end - time_start
-                    matched = LyricsResult(
-                        artist=returned_artist,
-                        song=returned_song,
+            async with AsyncSessionLocal() as db:
+                best_match = None
+
+                # Try exact match first (fastest)
+                result = await db.execute(
+                    select(
+                        Tracks.artist_name,
+                        Tracks.name,
+                        Lyrics.plain_lyrics,
+                        Lyrics.synced_lyrics,
+                    )
+                    .join(Lyrics, Tracks.id == Lyrics.track_id)
+                    .filter(
+                        Tracks.artist_name_lower == artist,
+                        Tracks.name_lower == song,
+                    )
+                    .limit(1)
+                )
+                best_match = result.first()
+
+                # If no exact match, try prefix match (faster than full ILIKE)
+                if not best_match:
+                    result = await db.execute(
+                        select(
+                            Tracks.artist_name,
+                            Tracks.name,
+                            Lyrics.plain_lyrics,
+                            Lyrics.synced_lyrics,
+                        )
+                        .join(Lyrics, Tracks.id == Lyrics.track_id)
+                        .filter(
+                            Tracks.artist_name_lower.like(f"{artist}%"),
+                            Tracks.name_lower.like(f"{song}%"),
+                        )
+                        .limit(1)
+                    )
+                    best_match = result.first()
+
+                # If still no match, try full ILIKE (slowest)
+                if not best_match:
+                    result = await db.execute(
+                        select(
+                            Tracks.artist_name,
+                            Tracks.name,
+                            Lyrics.plain_lyrics,
+                            Lyrics.synced_lyrics,
+                        )
+                        .join(Lyrics, Tracks.id == Lyrics.track_id)
+                        .filter(
+                            Tracks.artist_name_lower.ilike(f"%{artist}%"),
+                            Tracks.name_lower.ilike(f"%{song}%"),
+                        )
+                        .limit(1)
+                    )
+                    best_match = result.first()
+
+                if not best_match:
+                    logging.info("No result found on %s", self.label)
+                    return None
+
+                returned_artist = best_match.artist_name
+                returned_song = best_match.name
+
+                if plain:
+                    if not best_match.plain_lyrics:
+                        logging.info("No plain lyrics available on %s", self.label)
+                        return None
+                    returned_lyrics = best_match.plain_lyrics
+                    returned_lyrics = self.datautils.scrub_lyrics(returned_lyrics)
+                    lrc_obj = None
+                else:
+                    if not best_match.synced_lyrics:
+                        logging.info("No synced lyrics available on %s", self.label)
+                        return None
+                    returned_lyrics = best_match.synced_lyrics
+                    lrc_obj = self.datautils.create_lrc_object(returned_lyrics)
+
+                # Calculate match confidence
+                input_track = f"{artist} - {song}"
+                returned_track = f"{returned_artist} - {returned_song}"
+                match_result = self.matcher.find_best_match(
+                    input_track=input_track,
+                    candidate_tracks=[(0, returned_track)]
+                )
+                if not match_result:
+                    return None
+
+                _matched, confidence = match_result
+                logging.info("Result found on %s", self.label)
+                time_end = time.time()
+                time_diff = time_end - time_start
+
+                matched = LyricsResult(
+                    artist=returned_artist,
+                    song=returned_song,
@@ -177,10 +146,10 @@ class LRCLib:
                     confidence=confidence,
                     time=time_diff,
                 )
+
                 await self.redis_cache.increment_found_count(self.label)
-                if plain:
-                    await self.cache.store(matched)
                 return matched
         except Exception as e:
-            logging.debug("Exception: %s", str(e))
-            traceback.print_exc()
+            logging.error("Exception in %s: %s", self.label, str(e))
+            return None
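A small driver for the rewritten source above, as a sketch only: it assumes the module path lyric_search.sources.lrclib, the keyword signature shown in the docstring, a populated PostgreSQL cache, and a reachable Redis instance for the found-count bookkeeping.

import asyncio

from lyric_search.sources.lrclib import LRCLib


async def main() -> None:
    lrclib = LRCLib()
    # plain=True requests plain lyrics; the query falls back from an exact match
    # to a prefix LIKE and finally to a full ILIKE scan, as in the diff above.
    result = await lrclib.search(artist="eminem", song="rap god", plain=True)
    if result:
        print(f"{result.artist} - {result.song} (confidence: {result.confidence})")
    else:
        print("No result found in the LRCLib cache.")


if __name__ == "__main__":
    asyncio.run(main())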

0
pgloader_config.load Normal file
View File

0
postgres_schema.sql Normal file
View File