| 
									
										
										
										
											2025-01-12 20:19:48 -05:00
										 |  |  | from difflib import SequenceMatcher | 
					
						
							| 
									
										
										
										
											2025-04-26 19:47:12 -04:00
										 |  |  | from typing import List, Optional | 
					
						
							| 
									
										
										
										
											2025-01-17 06:41:56 -05:00
										 |  |  | import logging | 
					
						
							| 
									
										
										
										
											2025-01-13 20:47:39 -05:00
										 |  |  | import regex | 
					
						
							| 
									
										
										
										
											2025-02-15 21:09:33 -05:00
										 |  |  | from regex import Pattern | 
					
						
							| 
									
										
										
										
											2025-01-12 20:19:48 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-04-17 07:28:05 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-12 20:19:48 -05:00
										 |  |  | class TrackMatcher: | 
					
						
							|  |  |  |     """Track Matcher""" | 
					
						
							| 
									
										
										
										
											2025-04-17 07:28:05 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-12 20:19:48 -05:00
										 |  |  |     def __init__(self, threshold: float = 0.85): | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  |         Initialize the TrackMatcher with a similarity threshold. | 
					
						
							| 
									
										
										
										
											2025-04-17 07:28:05 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-12 20:19:48 -05:00
										 |  |  |         Args: | 
					
						
							|  |  |  |             threshold (float): Minimum similarity score to consider a match valid | 
					
						
							|  |  |  |                              (between 0 and 1, default 0.85) | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  |         self.threshold = threshold | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-04-17 07:28:05 -04:00
										 |  |  |     def find_best_match( | 
					
						
							|  |  |  |         self, input_track: str, candidate_tracks: List[tuple[int | str, str]] | 
					
						
							|  |  |  |     ) -> Optional[tuple]: | 
					
						
							| 
									
										
										
										
											2025-01-12 20:19:48 -05:00
										 |  |  |         """
 | 
					
						
							|  |  |  |         Find the best matching track from the candidate list. | 
					
						
							| 
									
										
										
										
											2025-04-17 07:28:05 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-12 20:19:48 -05:00
										 |  |  |         Args: | 
					
						
							|  |  |  |             input_track (str): Input track in "ARTIST - SONG" format | 
					
						
							| 
									
										
										
										
											2025-01-14 09:06:40 -05:00
										 |  |  |             candidate_tracks (List[tuple[int|str, str]]): List of candidate tracks | 
					
						
							| 
									
										
										
										
											2025-04-17 07:28:05 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-12 20:19:48 -05:00
										 |  |  |         Returns: | 
					
						
							| 
									
										
										
										
											2025-02-15 21:09:33 -05:00
										 |  |  |             Optional[tuple[int, str, float]]: Tuple of (best matching track, similarity score) | 
					
						
							| 
									
										
										
										
											2025-01-12 20:19:48 -05:00
										 |  |  |                                        or None if no good match found | 
					
						
							|  |  |  |         """
 | 
					
						
							| 
									
										
										
										
											2025-01-17 06:41:56 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-12 20:19:48 -05:00
										 |  |  |         if not input_track or not candidate_tracks: | 
					
						
							|  |  |  |             return None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # Normalize input track | 
					
						
							|  |  |  |         input_track = self._normalize_string(input_track) | 
					
						
							| 
									
										
										
										
											2025-04-17 07:28:05 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-12 20:19:48 -05:00
										 |  |  |         best_match = None | 
					
						
							| 
									
										
										
										
											2025-02-15 21:09:33 -05:00
										 |  |  |         best_score: float = 0.0 | 
					
						
							| 
									
										
										
										
											2025-01-12 20:19:48 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  |         for candidate in candidate_tracks: | 
					
						
							| 
									
										
										
										
											2025-01-13 20:47:39 -05:00
										 |  |  |             normalized_candidate = self._normalize_string(candidate[1]) | 
					
						
							| 
									
										
										
										
											2025-02-26 20:47:29 -05:00
										 |  |  |             if normalized_candidate.strip().lower() == input_track.strip().lower(): | 
					
						
							|  |  |  |                 return (candidate, 100.0) | 
					
						
							| 
									
										
										
										
											2025-04-17 07:28:05 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-12 20:19:48 -05:00
										 |  |  |             # Calculate various similarity scores | 
					
						
							|  |  |  |             exact_score = 1.0 if input_track == normalized_candidate else 0.0 | 
					
						
							| 
									
										
										
										
											2025-04-17 07:28:05 -04:00
										 |  |  |             sequence_score = SequenceMatcher( | 
					
						
							|  |  |  |                 None, input_track, normalized_candidate | 
					
						
							|  |  |  |             ).ratio() | 
					
						
							|  |  |  |             token_score = self._calculate_token_similarity( | 
					
						
							|  |  |  |                 input_track, normalized_candidate | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-12 20:19:48 -05:00
										 |  |  |             # Take the maximum of the different scoring methods | 
					
						
							|  |  |  |             final_score = max(exact_score, sequence_score, token_score) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             if final_score > best_score: | 
					
						
							|  |  |  |                 best_score = final_score | 
					
						
							|  |  |  |                 best_match = candidate | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # Return the match only if it meets the threshold | 
					
						
							| 
									
										
										
										
											2025-02-15 21:18:20 -05:00
										 |  |  |         if best_score < self.threshold: | 
					
						
							| 
									
										
										
										
											2025-02-15 21:09:33 -05:00
										 |  |  |             return None | 
					
						
							| 
									
										
										
										
											2025-04-17 07:28:05 -04:00
										 |  |  |         match: tuple = (best_match, round(best_score * 100)) | 
					
						
							| 
									
										
										
										
											2025-02-15 21:09:33 -05:00
										 |  |  |         return match | 
					
						
							| 
									
										
										
										
											2025-01-12 20:19:48 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def _normalize_string(self, text: str) -> str: | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  |         Normalize string for comparison by removing special characters, | 
					
						
							|  |  |  |         extra spaces, and converting to lowercase. | 
					
						
							| 
									
										
										
										
											2025-01-19 07:01:07 -05:00
										 |  |  |         Args: | 
					
						
							|  |  |  |             text (str): The text to normalize | 
					
						
							|  |  |  |         Returns: | 
					
						
							|  |  |  |             str: Normalized text | 
					
						
							| 
									
										
										
										
											2025-01-12 20:19:48 -05:00
										 |  |  |         """
 | 
					
						
							|  |  |  |         # Remove special characters and convert to lowercase | 
					
						
							| 
									
										
										
										
											2025-04-17 07:28:05 -04:00
										 |  |  |         text = regex.sub(r"[^\w\s-]", "", text).lower() | 
					
						
							| 
									
										
										
										
											2025-01-12 20:19:48 -05:00
										 |  |  |         # Normalize spaces | 
					
						
							| 
									
										
										
										
											2025-04-17 07:28:05 -04:00
										 |  |  |         text = " ".join(text.split()) | 
					
						
							| 
									
										
										
										
											2025-01-12 20:19:48 -05:00
										 |  |  |         return text | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def _calculate_token_similarity(self, str1: str, str2: str) -> float: | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  |         Calculate similarity based on matching tokens (words). | 
					
						
							| 
									
										
										
										
											2025-01-19 07:01:07 -05:00
										 |  |  |         Args: | 
					
						
							|  |  |  |             str1 (str): string 1 to compare | 
					
						
							|  |  |  |             str2 (str): string 2 to compare | 
					
						
							|  |  |  |         Returns: | 
					
						
							|  |  |  |             float: The token similarity score | 
					
						
							| 
									
										
										
										
											2025-01-12 20:19:48 -05:00
										 |  |  |         """
 | 
					
						
							|  |  |  |         tokens1 = set(str1.split()) | 
					
						
							|  |  |  |         tokens2 = set(str2.split()) | 
					
						
							| 
									
										
										
										
											2025-04-17 07:28:05 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-12 20:19:48 -05:00
										 |  |  |         if not tokens1 or not tokens2: | 
					
						
							|  |  |  |             return 0.0 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         intersection = tokens1.intersection(tokens2) | 
					
						
							|  |  |  |         union = tokens1.union(tokens2) | 
					
						
							| 
									
										
										
										
											2025-04-17 07:28:05 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-13 20:47:39 -05:00
										 |  |  |         return len(intersection) / len(union) | 
					
						
							| 
									
										
										
										
											2025-04-17 07:28:05 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-13 20:47:39 -05:00
										 |  |  | class DataUtils: | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Data Utils | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2025-01-17 07:48:29 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-02-15 21:09:33 -05:00
										 |  |  |     def __init__(self) -> None: | 
					
						
							| 
									
										
										
										
											2025-08-09 07:48:07 -04:00
										 |  |  |         self.lrc_regex = ( | 
					
						
							|  |  |  |             regex.compile(  # capture mm:ss and optional .xxx, then the lyric text | 
					
						
							|  |  |  |                 r"""
 | 
					
						
							| 
									
										
										
										
											2025-08-07 11:47:57 -04:00
										 |  |  |             \[                 # literal “[” | 
					
						
							|  |  |  |             (                # 1st (and only) capture group: | 
					
						
							|  |  |  |                 [0-9]{2}       #   two-digit minutes | 
					
						
							|  |  |  |                 :[0-9]{2}      #   colon + two-digit seconds | 
					
						
							|  |  |  |                 (?:\.[0-9]{1,3})?  #   optional decimal part, e.g. .123 | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |             \]                 # literal “]” | 
					
						
							|  |  |  |             \s*                # optional whitespace | 
					
						
							|  |  |  |             (.*)               # capture the rest of the line as words | 
					
						
							|  |  |  |             """,
 | 
					
						
							| 
									
										
										
										
											2025-08-09 07:48:07 -04:00
										 |  |  |                 regex.VERBOSE, | 
					
						
							|  |  |  |             ) | 
					
						
							| 
									
										
										
										
											2025-04-17 07:28:05 -04:00
										 |  |  |         ) | 
					
						
							|  |  |  |         self.scrub_regex_1: Pattern = regex.compile(r"(\[.*?\])(\s){0,}(\:){0,1}") | 
					
						
							|  |  |  |         self.scrub_regex_2: Pattern = regex.compile( | 
					
						
							|  |  |  |             r"(\d?)(Embed\b)", flags=regex.IGNORECASE | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  |         self.scrub_regex_3: Pattern = regex.compile(r"\n{2}") | 
					
						
							|  |  |  |         self.scrub_regex_4: Pattern = regex.compile(r"[0-9]\b$") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-13 20:47:39 -05:00
										 |  |  |     def scrub_lyrics(self, lyrics: str) -> str: | 
					
						
							| 
									
										
										
										
											2025-01-19 07:01:07 -05:00
										 |  |  |         """
 | 
					
						
							|  |  |  |         Lyric Scrub Regex Chain | 
					
						
							|  |  |  |         Args: | 
					
						
							|  |  |  |             lyrics (str): The lyrics to scrub | 
					
						
							|  |  |  |         Returns: | 
					
						
							|  |  |  |             str: Regex scrubbed lyrics | 
					
						
							| 
									
										
										
										
											2025-01-17 07:54:17 -05:00
										 |  |  |         """
 | 
					
						
							| 
									
										
										
										
											2025-04-17 07:28:05 -04:00
										 |  |  |         lyrics = self.scrub_regex_1.sub("", lyrics) | 
					
						
							|  |  |  |         lyrics = self.scrub_regex_2.sub("", lyrics) | 
					
						
							|  |  |  |         lyrics = self.scrub_regex_3.sub("\n", lyrics)  # Gaps between verses | 
					
						
							|  |  |  |         lyrics = self.scrub_regex_3.sub("", lyrics) | 
					
						
							|  |  |  |         return lyrics | 
					
						
							| 
									
										
										
										
											2025-01-17 07:48:29 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def create_lrc_object(self, lrc_str: str) -> list[dict]: | 
					
						
							| 
									
										
										
										
											2025-01-19 07:01:07 -05:00
										 |  |  |         """
 | 
					
						
							|  |  |  |         Create LRC Object | 
					
						
							|  |  |  |         Args: | 
					
						
							|  |  |  |             lrc_str (str): The raw LRCLib syncedLyrics | 
					
						
							|  |  |  |         Returns: | 
					
						
							|  |  |  |             list[dict]: LRC Object comprised of timestamps/lyrics | 
					
						
							| 
									
										
										
										
											2025-01-17 07:54:17 -05:00
										 |  |  |         """
 | 
					
						
							| 
									
										
										
										
											2025-01-17 07:48:29 -05:00
										 |  |  |         lrc_out: list = [] | 
					
						
							|  |  |  |         for line in lrc_str.split("\n"): | 
					
						
							|  |  |  |             _timetag = None | 
					
						
							|  |  |  |             _words = None | 
					
						
							|  |  |  |             if not line.strip(): | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |             reg_helper = regex.findall(self.lrc_regex, line.strip()) | 
					
						
							|  |  |  |             if not reg_helper: | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |             reg_helper = reg_helper[0] | 
					
						
							| 
									
										
										
										
											2025-09-26 13:45:39 -04:00
										 |  |  |             # logging.debug( | 
					
						
							|  |  |  |             #     "Reg helper: %s for line: %s; len: %s", | 
					
						
							|  |  |  |             #     reg_helper, | 
					
						
							|  |  |  |             #     line, | 
					
						
							|  |  |  |             #     len(reg_helper), | 
					
						
							|  |  |  |             # ) | 
					
						
							| 
									
										
										
										
											2025-01-17 07:48:29 -05:00
										 |  |  |             _timetag = reg_helper[0] | 
					
						
							|  |  |  |             if not reg_helper[1].strip(): | 
					
						
							| 
									
										
										
										
											2025-08-07 11:47:57 -04:00
										 |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2025-01-17 07:48:29 -05:00
										 |  |  |             else: | 
					
						
							| 
									
										
										
										
											2025-01-20 05:47:09 -05:00
										 |  |  |                 _words = reg_helper[1].strip() | 
					
						
							| 
									
										
										
										
											2025-04-17 07:28:05 -04:00
										 |  |  |             lrc_out.append( | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "timeTag": _timetag, | 
					
						
							|  |  |  |                     "words": _words, | 
					
						
							|  |  |  |                 } | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |         return lrc_out |