82 lines
2.9 KiB
Python
Raw Normal View History

2025-01-12 20:19:48 -05:00
#!/usr/bin/env python3.12
from difflib import SequenceMatcher
from typing import List, Optional, Tuple
import re
class TrackMatcher:
    """Fuzzy matcher that picks the closest candidate for a track string.

    Strings are normalized (lowercased, punctuation stripped, whitespace
    collapsed) and then compared with three measures: exact equality,
    ``difflib.SequenceMatcher`` ratio, and token-set (Jaccard) overlap.
    The highest of the three is used as the candidate's score.
    """

    # Matches every character that is not a word character, whitespace or a
    # hyphen. Compiled once so it is not looked up on every normalization.
    _SPECIAL_CHARS = re.compile(r'[^\w\s-]')

    def __init__(self, threshold: float = 0.85):
        """
        Initialize the TrackMatcher with a similarity threshold.

        Args:
            threshold (float): Minimum similarity score to consider a match
                valid (between 0 and 1, default 0.85)
        """
        self.threshold = threshold

    def find_best_match(self, input_track: str, candidate_tracks: List[str]) -> Optional[Tuple[str, float]]:
        """
        Find the best matching track from the candidate list.

        Args:
            input_track (str): Input track in "ARTIST - SONG" format
            candidate_tracks (List[str]): List of candidate tracks in same format

        Returns:
            Optional[Tuple[str, float]]: Tuple of (best matching track as it
                appears in ``candidate_tracks``, similarity score), or None
                if no candidate reaches the threshold. When several
                candidates tie, the earliest one in the list is returned.
        """
        if not input_track or not candidate_tracks:
            return None

        normalized_input = self._normalize_string(input_track)
        # An input made only of punctuation/whitespace normalizes to "".
        # Comparing "" with "" would report a perfect score, so bail out.
        if not normalized_input:
            return None

        best_match: Optional[str] = None
        best_score = 0.0

        for candidate in candidate_tracks:
            normalized_candidate = self._normalize_string(candidate)
            if not normalized_candidate:
                # Nothing comparable is left after normalization.
                continue

            if normalized_input == normalized_candidate:
                final_score = 1.0
            else:
                # Take the maximum of the different scoring methods.
                final_score = max(
                    SequenceMatcher(None, normalized_input, normalized_candidate).ratio(),
                    self._calculate_token_similarity(normalized_input, normalized_candidate),
                )

            # Strict ">" keeps the first candidate on ties.
            if final_score > best_score:
                best_score = final_score
                best_match = candidate
                if best_score >= 1.0:
                    # No later candidate can beat a perfect score.
                    break

        # Return the match only if one was found and it meets the threshold.
        if best_match is None or best_score < self.threshold:
            return None
        return best_match, best_score

    def _normalize_string(self, text: str) -> str:
        """
        Normalize string for comparison by removing special characters,
        extra spaces, and converting to lowercase.

        Hyphens are kept, so the "-" separating artist and song survives
        normalization.

        Args:
            text (str): Raw track string

        Returns:
            str: Lowercased string without special characters, with runs of
                whitespace collapsed to single spaces and no leading or
                trailing whitespace
        """
        # Remove special characters and convert to lowercase
        stripped = self._SPECIAL_CHARS.sub('', text.lower())
        # Normalize spaces
        return ' '.join(stripped.split())

    def _calculate_token_similarity(self, str1: str, str2: str) -> float:
        """
        Calculate similarity based on matching tokens (words).

        The score is the size of the intersection of the two whitespace-
        separated token sets divided by the size of their union.

        Args:
            str1 (str): First string
            str2 (str): Second string

        Returns:
            float: Value between 0.0 and 1.0; 0.0 if either string has no
                tokens
        """
        tokens1 = set(str1.split())
        tokens2 = set(str2.split())
        if not tokens1 or not tokens2:
            return 0.0
        return len(tokens1 & tokens2) / len(tokens1 | tokens2)