#!/usr/bin/env python3.12
"""Track matching and lyric-scrubbing utilities."""

import logging
from difflib import SequenceMatcher
from typing import List, Optional, Tuple

import regex


class TrackMatcher:
    """Fuzzy matcher for track strings in "ARTIST - SONG" format."""

    def __init__(self, threshold: float = 0.85):
        """
        Initialize the TrackMatcher with a similarity threshold.

        Args:
            threshold (float): Minimum similarity score to consider a match valid
                (between 0 and 1, default 0.85)
        """
        self.threshold = threshold

    def find_best_match(
        self,
        input_track: str,
        candidate_tracks: List[tuple[int | str, str]],
    ) -> Optional[Tuple[tuple[int | str, str], int]]:
        """
        Find the best matching track from the candidate list.

        Args:
            input_track (str): Input track in "ARTIST - SONG" format
            candidate_tracks (List[tuple[int | str, str]]): List of candidate tracks

        Returns:
            Optional[Tuple[tuple[int | str, str], int]]: Tuple of (best matching
                candidate, similarity score as a rounded percentage) or None if
                no good match was found
        """
        if not input_track or not candidate_tracks:
            return None

        # Normalize input track
        input_track = self._normalize_string(input_track)

        best_match = None
        best_score = 0.0

        for candidate in candidate_tracks:
            normalized_candidate = self._normalize_string(candidate[1])

            # Calculate various similarity scores
            exact_score = 1.0 if input_track == normalized_candidate else 0.0
            sequence_score = SequenceMatcher(None, input_track, normalized_candidate).ratio()
            token_score = self._calculate_token_similarity(input_track, normalized_candidate)

            # Take the maximum of the different scoring methods
            final_score = max(exact_score, sequence_score, token_score)

            if final_score > best_score:
                best_score = final_score
                best_match = candidate

        # Return the match only if it meets the threshold
        return (best_match, round(best_score * 100)) if best_score >= self.threshold else None

    def _normalize_string(self, text: str) -> str:
        """
        Normalize string for comparison by removing special characters,
        extra spaces, and converting to lowercase.
        """
        # Remove special characters and convert to lowercase
        text = regex.sub(r'[^\w\s-]', '', text).lower()
        # Normalize spaces
        text = ' '.join(text.split())
        return text

    def _calculate_token_similarity(self, str1: str, str2: str) -> float:
        """
        Calculate similarity based on matching tokens (words).
        """
        tokens1 = set(str1.split())
        tokens2 = set(str2.split())

        if not tokens1 or not tokens2:
            return 0.0

        # Jaccard similarity: shared tokens over all distinct tokens.
        intersection = tokens1.intersection(tokens2)
        union = tokens1.union(tokens2)

        return len(intersection) / len(union)


class DataUtils:
    """
    Utility helpers for cleaning lyrics and parsing LRC (synced lyrics) text.
    """

    def __init__(self):
        # Matches one LRC line: a "[mm:ss.x]" time tag optionally followed by words.
        self.lrc_regex = regex.compile(r'\[([0-9]{2}:[0-9]{2})\.[0-9]{1,3}\](\s(.*)){0,}')
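
    # Illustrative example of what lrc_regex captures (the line below is made up):
    #   "[00:12.34] La la la"  ->  group 1: "00:12" (time tag), group 2: " La la la"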

    def scrub_lyrics(self, lyrics: str) -> str:
        """Run the lyrics through a chain of clean-up regexes.

        @lyrics: The lyrics (str) to scrub
        """
        # Strip bracketed tags (e.g. "[Chorus]") and any trailing colon.
        lyrics = regex.sub(r'(\[.*?\])(\s){0,}(\:){0,1}', '', lyrics)
        # Drop the "Embed" marker (optionally preceded by a digit).
        lyrics = regex.sub(r'(\d?)(Embed\b)', '', lyrics, flags=regex.IGNORECASE)
        lyrics = regex.sub(r'\n{2}', '\n', lyrics)  # Gaps between verses
        # Remove a stray digit left at the very end of the lyrics.
        lyrics = regex.sub(r'[0-9]\b$', '', lyrics)
        return lyrics

    def create_lrc_object(self, lrc_str: str) -> list[dict]:
        """Create a list of {"timeTag", "words"} dicts from raw LRC text.

        @lrc_str: The raw LRCLib syncedLyrics (str)
        """
        lrc_out: list = []
        for line in lrc_str.split("\n"):
            _timetag = None
            _words = None
            if not line.strip():
                continue
            reg_helper = regex.findall(self.lrc_regex, line.strip())
            if not reg_helper:
                # Line does not look like "[mm:ss.x] ..."; skip it.
                continue
            reg_helper = reg_helper[0]
            logging.debug("Reg helper: %s for line: %s; len: %s",
                          reg_helper, line, len(reg_helper))
            _timetag = reg_helper[0]
            if not reg_helper[1].strip():
                # No words on this line; use a musical-note placeholder.
                _words = "♪"
            else:
                _words = reg_helper[1].strip()  # drop the separator whitespace captured by the regex
            lrc_out.append({
                "timeTag": _timetag,
                "words": _words,
            })
        logging.info("util: returning %s, type: %s",
                     lrc_out, type(lrc_out))
        return lrc_out
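

if __name__ == "__main__":
    # Minimal usage sketch; the tracks, lyrics, and LRC text below are
    # illustrative sample data, not taken from any real catalogue.
    logging.basicConfig(level=logging.DEBUG)

    matcher = TrackMatcher()
    candidates = [
        (1, "Some Artist - Another Song"),
        (2, "Some Artist - Example Song"),
    ]
    # Exact normalized match, so this prints ((2, "Some Artist - Example Song"), 100).
    print(matcher.find_best_match("some artist - example song", candidates))

    utils = DataUtils()
    sample_lyrics = "[Chorus]\nLa la la\n\nOh oh oh\n1Embed"
    print(utils.scrub_lyrics(sample_lyrics))

    sample_lrc = "[00:12.34] La la la\n[00:15.00]"
    print(utils.create_lrc_object(sample_lrc))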