lyric_search_new renamed to lyric_search

2025-01-24 09:10:54 -05:00
parent b0a9af0560
commit c3bbd27cd5
13 changed files with 14 additions and 14 deletions
--- a/lyric_search/utils.py
+++ b/lyric_search/utils.py
@@ -0,0 +1,148 @@
+#!/usr/bin/env python3.12
+
+from difflib import SequenceMatcher
+from typing import List, Optional, Tuple
+import logging
+import regex
+
+class TrackMatcher:
+    """Fuzzy matcher that picks the closest candidate track for an input track string."""
+
+    def __init__(self, threshold: float = 0.85):
+        """
+        Initialize the TrackMatcher with a similarity threshold.
+
+        Args:
+            threshold (float): Minimum similarity score to consider a match valid
+                             (between 0 and 1, default 0.85)
+        """
+        self.threshold = threshold
+
+    def find_best_match(self, input_track: str, candidate_tracks: List[tuple[int|str, str]]) -> Optional[Tuple[tuple[int|str, str], int]]:
+        """
+        Find the best matching track from the candidate list.
+
+        Each candidate is scored with the highest of three measures: exact
+        equality, difflib sequence ratio, and token overlap. All three are
+        computed on normalized strings. The first candidate reaching the
+        highest score wins.
+
+        Args:
+            input_track (str): Input track in "ARTIST - SONG" format
+            candidate_tracks (List[tuple[int|str, str]]): List of candidate tracks;
+                element [1] of each tuple is the text that gets compared
+
+        Returns:
+            Optional[Tuple[tuple[int|str, str], int]]: Tuple of (best matching
+                candidate exactly as passed in, similarity as a rounded
+                percentage 0-100), or None if no good match found
+        """
+        if not input_track or not candidate_tracks:
+            return None
+
+        normalized_input = self._normalize_string(input_track)
+        if not normalized_input:
+            # Input was only punctuation/whitespace. Comparing "" against a
+            # candidate that also normalizes to "" would otherwise be reported
+            # as an exact (100%) match.
+            return None
+
+        best_match = None
+        best_score = 0.0
+
+        for candidate in candidate_tracks:
+            normalized_candidate = self._normalize_string(candidate[1])
+
+            if normalized_input == normalized_candidate:
+                # Exact match: neither of the other measures can exceed 1.0.
+                final_score = 1.0
+            else:
+                # Take the maximum of the different scoring methods
+                final_score = max(
+                    SequenceMatcher(None, normalized_input, normalized_candidate).ratio(),
+                    self._calculate_token_similarity(normalized_input, normalized_candidate),
+                )
+
+            if final_score > best_score:
+                best_score = final_score
+                best_match = candidate
+                if best_score >= 1.0:
+                    # A perfect score cannot be beaten (strict ">" above), so
+                    # scanning the remaining candidates cannot change the result.
+                    break
+
+        # Return the match only if it meets the threshold. The explicit None
+        # check keeps a threshold of 0 from yielding (None, 0) when nothing
+        # scored above zero.
+        if best_match is None or best_score < self.threshold:
+            return None
+        return (best_match, round(best_score * 100))
+
+    def _normalize_string(self, text: str) -> str:
+        """
+        Normalize string for comparison by removing special characters,
+        extra spaces, and converting to lowercase.
+        Args:
+            text (str): The text to normalize
+        Returns:
+            str: Normalized text
+        """
+        # Keep only word characters, whitespace and hyphens, then lowercase
+        text = regex.sub(r'[^\w\s-]', '', text).lower()
+        # Collapse every run of whitespace to a single space and trim the ends
+        return ' '.join(text.split())
+
+    def _calculate_token_similarity(self, str1: str, str2: str) -> float:
+        """
+        Calculate similarity based on matching tokens (words).
+
+        The score is the size of the shared token set divided by the size of
+        the combined token set, so word order and repetition are ignored.
+        Args:
+            str1 (str): string 1 to compare
+            str2 (str): string 2 to compare
+        Returns:
+            float: The token similarity score (0.0 when either string has no tokens)
+        """
+        tokens1 = set(str1.split())
+        tokens2 = set(str2.split())
+
+        if not tokens1 or not tokens2:
+            return 0.0
+
+        return len(tokens1 & tokens2) / len(tokens1 | tokens2)
+    
+class DataUtils:
+    """
+    Data Utils
+
+    Helpers for scrubbing lyric text and for turning LRC-style synced lyrics
+    into a list of timestamp/lyric dicts.
+    """
+
+    def __init__(self):
+        # Groups: 1 = "mm:ss" (the fractional part is matched but not captured),
+        # 2 = remainder of the line including any leading whitespace,
+        # 3 = remainder after that whitespace.
+        # Whitespace after the tag is optional so "[00:12.34]words" keeps its text.
+        self.lrc_regex = regex.compile(r'\[([0-9]{2}:[0-9]{2})\.[0-9]{1,3}\](\s*(.*))')
+
+    def scrub_lyrics(self, lyrics: str) -> str:
+        """
+        Lyric Scrub Regex Chain
+        Args:
+            lyrics (str): The lyrics to scrub
+        Returns:
+            str: Regex scrubbed lyrics
+        """
+        # Drop bracketed tags, plus any whitespace and one optional colon after them
+        lyrics = regex.sub(r'(\[.*?\])(\s){0,}(\:){0,1}', '', lyrics)
+        # Drop the word "Embed" (any case) together with at most one digit before it.
+        # NOTE(review): a longer count such as "123Embed" leaves digits behind,
+        # and the final step below strips only one more — confirm whether
+        # r'\d*' is wanted here.
+        lyrics = regex.sub(r'(\d?)(Embed\b)', '', lyrics, flags=regex.IGNORECASE)
+        lyrics = regex.sub(r'\n{2}', '\n', lyrics)  # Gaps between verses
+        # Drop a single digit left at the very end of the text
+        lyrics = regex.sub(r'[0-9]\b$', '', lyrics)
+        return lyrics
+
+    def create_lrc_object(self, lrc_str: str) -> list[dict]:
+        """
+        Create LRC Object
+
+        Lines that are blank or that contain no "[mm:ss.xx]" tag are skipped.
+        A tagged line with no text gets the placeholder "♪".
+        Args:
+            lrc_str (str): The raw LRCLib syncedLyrics
+        Returns:
+            list[dict]: LRC Object comprised of timestamps/lyrics; each entry has
+                "timeTag" ("mm:ss") and "words"
+        """
+        lrc_out: list = []
+        for line in lrc_str.split("\n"):
+            stripped = line.strip()
+            if not stripped:
+                continue
+            matches = self.lrc_regex.findall(stripped)
+            if not matches:
+                continue
+            # Only the first tag found on a line is used
+            reg_helper = matches[0]
+            logging.debug("Reg helper: %s for line: %s; len: %s",
+                            reg_helper, line, len(reg_helper))
+            lrc_out.append({
+                "timeTag": reg_helper[0],
+                # Empty text means an instrumental/blank cue
+                "words": reg_helper[1].strip() or "♪",
+            })
+        return lrc_out