api/lyric_search_new/utils.py

#!/usr/bin/env python3.12

from difflib import SequenceMatcher
from typing import List, Optional, Tuple
import regex

class TrackMatcher:
    """Track Matcher"""
    def __init__(self, threshold: float = 0.85):
        """
        Initialize the TrackMatcher with a similarity threshold.

        Args:
            threshold (float): Minimum similarity score to consider a match valid
                             (between 0 and 1, default 0.85)
        """
        self.threshold = threshold

    def find_best_match(self, input_track: str, candidate_tracks: List[tuple[int|str, str]]) -> Optional[Tuple[str, float]]:
        """
        Find the best matching track from the candidate list.

        Args:
            input_track (str): Input track in "ARTIST - SONG" format
            candidate_tracks (List[tuple[int|str, str]]): List of candidate tracks

        Returns:
            Optional[Tuple[int, str, float]]: Tuple of (best matching track, similarity score)
                                       or None if no good match found
        """
        if not input_track or not candidate_tracks:
            return None

        # Normalize input track
        input_track = self._normalize_string(input_track)

        print(f"input_track: {input_track}")

        best_match = None
        best_score = 0

        for candidate in candidate_tracks:
            normalized_candidate = self._normalize_string(candidate[1])

            # Calculate various similarity scores
            exact_score = 1.0 if input_track == normalized_candidate else 0.0
            sequence_score = SequenceMatcher(None, input_track, normalized_candidate).ratio()
            token_score = self._calculate_token_similarity(input_track, normalized_candidate)

            # Take the maximum of the different scoring methods
            final_score = max(exact_score, sequence_score, token_score)

            if final_score > best_score:
                best_score = final_score
                best_match = candidate

        # Return the match only if it meets the threshold
        return (best_match, round(best_score, 2)) if best_score >= self.threshold else None

    def _normalize_string(self, text: str) -> str:
        """
        Normalize string for comparison by removing special characters,
        extra spaces, and converting to lowercase.
        """
        # Remove special characters and convert to lowercase
        text = regex.sub(r'[^\w\s-]', '', text).lower()
        print(f"Text: {text}")
        # Normalize spaces
        text = ' '.join(text.split())
        return text

    def _calculate_token_similarity(self, str1: str, str2: str) -> float:
        """
        Calculate similarity based on matching tokens (words).
        """
        tokens1 = set(str1.split())
        tokens2 = set(str2.split())

        if not tokens1 or not tokens2:
            return 0.0

        intersection = tokens1.intersection(tokens2)
        union = tokens1.union(tokens2)

        return len(intersection) / len(union)

class DataUtils:
    """
    Data Utils

    Text clean-up helpers.
    """
    def scrub_lyrics(self, lyrics: str) -> str:
        """
        Strip markup residue from a lyrics string.

        Applies, in order:
          1. removal of every ``[...]`` bracketed span together with any
             whitespace and one optional colon that follow it;
          2. removal of the word ``Embed`` (any case) and at most one digit
             directly before it;
          3. replacement of each pair of consecutive newlines by a single
             newline;
          4. removal of a single digit sitting at the end of the text.

        Args:
            lyrics (str): Raw lyrics text

        Returns:
            str: The scrubbed lyrics
        """
        # (pattern, replacement, flags) applied sequentially; order matters
        # because each step operates on the previous step's output.
        substitutions = (
            (r'(\[.*?\])(\s){0,}(\:){0,1}', '', 0),
            (r'(\d?)(Embed\b)', '', regex.IGNORECASE),
            (r'\n{2}', '\n', 0),  # Gaps between verses
            (r'[0-9]\b$', '', 0),
        )

        cleaned = lyrics
        for pattern, replacement, flags in substitutions:
            cleaned = regex.sub(pattern, replacement, cleaned, flags=flags)
        return cleaned