#!/usr/bin/env python3.12
"""Fuzzy matching of "ARTIST - SONG" track strings against a candidate list."""

from difflib import SequenceMatcher
from typing import List, Optional, Tuple
import re


class TrackMatcher:
    """Find the candidate track string most similar to an input track string.

    Both strings are normalized (lowercased, punctuation other than ``-``
    removed, whitespace collapsed) and then scored with two measures:

    * the character-level ``difflib.SequenceMatcher`` ratio, and
    * the Jaccard similarity of their whitespace-separated token sets.

    The larger of the two is the candidate's score. A match is reported only
    when the best score is at least ``threshold``.
    """

    # Anything that is not a word character, whitespace, or a hyphen is
    # stripped during normalization. Compiled once instead of on every call.
    _SPECIAL_CHARS = re.compile(r'[^\w\s-]')

    def __init__(self, threshold: float = 0.85):
        """
        Initialize the TrackMatcher with a similarity threshold.

        Args:
            threshold (float): Minimum similarity score to consider a match
                valid (between 0 and 1, default 0.85)
        """
        self.threshold = threshold

    def find_best_match(
        self, input_track: str, candidate_tracks: List[str]
    ) -> Optional[Tuple[str, float]]:
        """
        Find the best matching track from the candidate list.

        Args:
            input_track (str): Input track in "ARTIST - SONG" format
            candidate_tracks (List[str]): List of candidate tracks in same format

        Returns:
            Optional[Tuple[str, float]]: Tuple of (best matching track as it
                appears in ``candidate_tracks``, similarity score), or None if
                either argument is empty or no candidate reaches the threshold.
                When several candidates tie, the earliest one wins.
        """
        if not input_track or not candidate_tracks:
            return None

        normalized_input = self._normalize_string(input_track)

        best_match: Optional[str] = None
        # Start below the lowest possible score (0.0) so that a candidate is
        # always selected; otherwise a threshold of 0 could yield (None, 0).
        best_score = -1.0

        for candidate in candidate_tracks:
            normalized_candidate = self._normalize_string(candidate)

            # SequenceMatcher.ratio() is already 1.0 for identical strings,
            # so no separate exact-match score is needed.
            sequence_score = SequenceMatcher(
                None, normalized_input, normalized_candidate
            ).ratio()
            token_score = self._calculate_token_similarity(
                normalized_input, normalized_candidate
            )
            final_score = max(sequence_score, token_score)

            # Strict comparison keeps the earliest candidate on ties.
            if final_score > best_score:
                best_score = final_score
                best_match = candidate
                # Nothing can beat a perfect score; stop scanning.
                if best_score >= 1.0:
                    break

        if best_match is None or best_score < self.threshold:
            return None
        return (best_match, best_score)

    def _normalize_string(self, text: str) -> str:
        """
        Normalize string for comparison by removing special characters,
        extra spaces, and converting to lowercase. Hyphens are kept.
        """
        stripped = self._SPECIAL_CHARS.sub('', text.lower())
        # split()/join collapses runs of whitespace and trims both ends.
        return ' '.join(stripped.split())

    def _calculate_token_similarity(self, str1: str, str2: str) -> float:
        """
        Calculate similarity based on matching tokens (words).

        Returns the Jaccard index of the two whitespace-separated token sets
        (size of intersection divided by size of union), or 0.0 if either
        string has no tokens.
        """
        tokens1 = set(str1.split())
        tokens2 = set(str2.split())

        if not tokens1 or not tokens2:
            return 0.0

        return len(tokens1 & tokens2) / len(tokens1 | tokens2)


def main() -> None:
    """Run the example usage: match a few sample tracks and print results."""
    matcher = TrackMatcher(threshold=0.85)

    candidate_tracks = [
        "The Beatles - Hey Jude",
        "Led Zeppelin - Stairway to Heaven",
        "Queen - Bohemian Rhapsody",
        "Pink Floyd - Comfortably Numb",
        "The Beatles - Hey Jules",  # Intentionally similar to "Hey Jude"
    ]

    # Test cases
    test_tracks = [
        "The Beatles - Hey Jude",  # Exact match
        "Beatles - Hey Jude",  # Similar match
        "The Beatles - Hey Jules",  # Similar but different
        "Metallica - Nothing Else Matters",  # No match
        "Queen - bohemian rhapsody",  # Different case
    ]

    for test_track in test_tracks:
        result = matcher.find_best_match(test_track, candidate_tracks)
        if result:
            match, score = result
            print(f"Input: {test_track}")
            print(f"Best match: {match}")
            print(f"Similarity score: {score:.3f}\n")
        else:
            print(f"No good match found for: {test_track}\n")


# Example usage (must come after the class definition so TrackMatcher exists):
if __name__ == "__main__":
    main()