#!/usr/bin/env python3.12
"""Track matching and lyric-scrubbing utilities."""

import logging
from difflib import SequenceMatcher
from typing import List, Optional, Tuple

import regex


class TrackMatcher:
    """Fuzzy matcher for track strings in "ARTIST - SONG" format."""

    def __init__(self, threshold: float = 0.85):
        """
        Initialize the TrackMatcher with a similarity threshold.

        Args:
            threshold (float): Minimum similarity score to consider a match valid
                (between 0 and 1, default 0.85)
        """
        self.threshold = threshold

    def find_best_match(
        self,
        input_track: str,
        candidate_tracks: List[tuple[int | str, str]],
    ) -> Optional[Tuple[tuple[int | str, str], int]]:
        """
        Find the best matching track from the candidate list.

        Args:
            input_track (str): Input track in "ARTIST - SONG" format
            candidate_tracks (List[tuple[int | str, str]]): List of candidate tracks

        Returns:
            Optional[Tuple[tuple[int | str, str], int]]: Tuple of (best matching
                candidate, similarity score as a rounded percentage) or None if
                no good match was found
        """
        if not input_track or not candidate_tracks:
            return None

        # Normalize input track
        input_track = self._normalize_string(input_track)

        best_match = None
        best_score = 0.0

        for candidate in candidate_tracks:
            normalized_candidate = self._normalize_string(candidate[1])

            # Calculate various similarity scores
            exact_score = 1.0 if input_track == normalized_candidate else 0.0
            sequence_score = SequenceMatcher(None, input_track, normalized_candidate).ratio()
            token_score = self._calculate_token_similarity(input_track, normalized_candidate)

            # Take the maximum of the different scoring methods
            final_score = max(exact_score, sequence_score, token_score)

            if final_score > best_score:
                best_score = final_score
                best_match = candidate

        # Return the match only if it meets the threshold
        return (best_match, round(best_score * 100)) if best_score >= self.threshold else None

    def _normalize_string(self, text: str) -> str:
        """
        Normalize string for comparison by removing special characters,
        extra spaces, and converting to lowercase.
        """
        # Remove special characters and convert to lowercase
        text = regex.sub(r'[^\w\s-]', '', text).lower()
        # Normalize spaces
        text = ' '.join(text.split())
        return text

    def _calculate_token_similarity(self, str1: str, str2: str) -> float:
        """
        Calculate similarity based on matching tokens (words).
        """
        tokens1 = set(str1.split())
        tokens2 = set(str2.split())

        if not tokens1 or not tokens2:
            return 0.0

        # Jaccard similarity: shared tokens over all distinct tokens.
        intersection = tokens1.intersection(tokens2)
        union = tokens1.union(tokens2)

        return len(intersection) / len(union)


class DataUtils:
    """
    Utility helpers for cleaning lyrics and parsing LRC (synced lyrics) text.
    """

    def __init__(self):
        # Matches one LRC line: a "[mm:ss.x]" time tag optionally followed by words.
        self.lrc_regex = regex.compile(r'\[([0-9]{2}:[0-9]{2})\.[0-9]{1,3}\](\s(.*)){0,}')
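
    # Illustrative example of what lrc_regex captures (the line below is made up):
    #   "[00:12.34] La la la"  ->  group 1: "00:12" (time tag), group 2: " La la la"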

    def scrub_lyrics(self, lyrics: str) -> str:
        """Run the lyrics through a chain of clean-up regexes.

        @lyrics: The lyrics (str) to scrub
        """
        # Strip bracketed tags (e.g. "[Chorus]") and any trailing colon.
        lyrics = regex.sub(r'(\[.*?\])(\s){0,}(\:){0,1}', '', lyrics)
        # Drop the "Embed" marker (optionally preceded by a digit).
        lyrics = regex.sub(r'(\d?)(Embed\b)', '', lyrics, flags=regex.IGNORECASE)
        lyrics = regex.sub(r'\n{2}', '\n', lyrics)  # Gaps between verses
        # Remove a stray digit left at the very end of the lyrics.
        lyrics = regex.sub(r'[0-9]\b$', '', lyrics)
        return lyrics

    def create_lrc_object(self, lrc_str: str) -> list[dict]:
        """Create a list of {"timeTag", "words"} dicts from raw LRC text.

        @lrc_str: The raw LRCLib syncedLyrics (str)
        """
        lrc_out: list = []
        for line in lrc_str.split("\n"):
            _timetag = None
            _words = None
            if not line.strip():
                continue
            reg_helper = regex.findall(self.lrc_regex, line.strip())
            if not reg_helper:
                # Line does not look like "[mm:ss.x] ..."; skip it.
                continue
            reg_helper = reg_helper[0]
            logging.debug("Reg helper: %s for line: %s; len: %s",
                          reg_helper, line, len(reg_helper))
            _timetag = reg_helper[0]
            if not reg_helper[1].strip():
                # No words on this line; use a musical-note placeholder.
                _words = "♪"
            else:
                _words = reg_helper[1].strip()  # drop the separator whitespace captured by the regex
            lrc_out.append({
                "timeTag": _timetag,
                "words": _words,
            })
        logging.info("util: returning %s, type: %s",
                     lrc_out, type(lrc_out))
        return lrc_out
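

if __name__ == "__main__":
    # Minimal usage sketch; the tracks, lyrics, and LRC text below are
    # illustrative sample data, not taken from any real catalogue.
    logging.basicConfig(level=logging.DEBUG)

    matcher = TrackMatcher()
    candidates = [
        (1, "Some Artist - Another Song"),
        (2, "Some Artist - Example Song"),
    ]
    # Exact normalized match, so this prints ((2, "Some Artist - Example Song"), 100).
    print(matcher.find_best_match("some artist - example song", candidates))

    utils = DataUtils()
    sample_lyrics = "[Chorus]\nLa la la\n\nOh oh oh\n1Embed"
    print(utils.scrub_lyrics(sample_lyrics))

    sample_lrc = "[00:12.34] La la la\n[00:15.00]"
    print(utils.create_lrc_object(sample_lrc))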