#!/usr/bin/env python3.12 from difflib import SequenceMatcher from typing import List, Optional, Union, Any import logging import regex from regex import Pattern class TrackMatcher: """Track Matcher""" def __init__(self, threshold: float = 0.85): """ Initialize the TrackMatcher with a similarity threshold. Args: threshold (float): Minimum similarity score to consider a match valid (between 0 and 1, default 0.85) """ self.threshold = threshold def find_best_match(self, input_track: str, candidate_tracks: List[tuple[int|str, str]]) -> Optional[tuple]: """ Find the best matching track from the candidate list. Args: input_track (str): Input track in "ARTIST - SONG" format candidate_tracks (List[tuple[int|str, str]]): List of candidate tracks Returns: Optional[tuple[int, str, float]]: Tuple of (best matching track, similarity score) or None if no good match found """ if not input_track or not candidate_tracks: return None # Normalize input track input_track = self._normalize_string(input_track) best_match = None best_score: float = 0.0 for candidate in candidate_tracks: normalized_candidate = self._normalize_string(candidate[1]) # Calculate various similarity scores exact_score = 1.0 if input_track == normalized_candidate else 0.0 sequence_score = SequenceMatcher(None, input_track, normalized_candidate).ratio() token_score = self._calculate_token_similarity(input_track, normalized_candidate) # Take the maximum of the different scoring methods final_score = max(exact_score, sequence_score, token_score) if final_score > best_score: best_score = final_score best_match = candidate # Return the match only if it meets the threshold if best_score < self.threshold: return None match: tuple = (best_match, round(best_score * 100)) return match def _normalize_string(self, text: str) -> str: """ Normalize string for comparison by removing special characters, extra spaces, and converting to lowercase. Args: text (str): The text to normalize Returns: str: Normalized text """ # Remove special characters and convert to lowercase text = regex.sub(r'[^\w\s-]', '', text).lower() # Normalize spaces text = ' '.join(text.split()) return text def _calculate_token_similarity(self, str1: str, str2: str) -> float: """ Calculate similarity based on matching tokens (words). Args: str1 (str): string 1 to compare str2 (str): string 2 to compare Returns: float: The token similarity score """ tokens1 = set(str1.split()) tokens2 = set(str2.split()) if not tokens1 or not tokens2: return 0.0 intersection = tokens1.intersection(tokens2) union = tokens1.union(tokens2) return len(intersection) / len(union) class DataUtils: """ Data Utils """ def __init__(self) -> None: self.lrc_regex = regex.compile(r'\[([0-9]{2}:[0-9]{2})\.[0-9]{1,3}\](\s(.*)){0,}') self.scrub_regex_1: Pattern = regex.compile(r'(\[.*?\])(\s){0,}(\:){0,1}') self.scrub_regex_2: Pattern = regex.compile(r'(\d?)(Embed\b)', flags=regex.IGNORECASE) self.scrub_regex_3: Pattern = regex.compile(r'\n{2}') self.scrub_regex_4: Pattern = regex.compile(r'[0-9]\b$') def scrub_lyrics(self, lyrics: str) -> str: """ Lyric Scrub Regex Chain Args: lyrics (str): The lyrics to scrub Returns: str: Regex scrubbed lyrics """ lyrics = self.scrub_regex_1.sub('', lyrics) lyrics = self.scrub_regex_2.sub('', lyrics) lyrics = self.scrub_regex_3.sub('\n', lyrics) # Gaps between verses lyrics = self.scrub_regex_3.sub('', lyrics) return lyrics def create_lrc_object(self, lrc_str: str) -> list[dict]: """ Create LRC Object Args: lrc_str (str): The raw LRCLib syncedLyrics Returns: list[dict]: LRC Object comprised of timestamps/lyrics """ lrc_out: list = [] for line in lrc_str.split("\n"): _timetag = None _words = None if not line.strip(): continue reg_helper = regex.findall(self.lrc_regex, line.strip()) if not reg_helper: continue reg_helper = reg_helper[0] logging.debug("Reg helper: %s for line: %s; len: %s", reg_helper, line, len(reg_helper)) _timetag = reg_helper[0] if not reg_helper[1].strip(): _words = "♪" else: _words = reg_helper[1].strip() lrc_out.append({ "timeTag": _timetag, "words": _words, }) return lrc_out