progress
This commit is contained in:
@@ -2,7 +2,7 @@
|
||||
|
||||
from difflib import SequenceMatcher
|
||||
from typing import List, Optional, Tuple
|
||||
import re
|
||||
import regex
|
||||
|
||||
class TrackMatcher:
|
||||
"""Track Matcher"""
|
||||
@@ -16,16 +16,16 @@ class TrackMatcher:
|
||||
"""
|
||||
self.threshold = threshold
|
||||
|
||||
def find_best_match(self, input_track: str, candidate_tracks: List[str]) -> Optional[Tuple[str, float]]:
|
||||
def find_best_match(self, input_track: str, candidate_tracks: List[tuple[int|str, str]]) -> Optional[Tuple[str, float]]:
|
||||
"""
|
||||
Find the best matching track from the candidate list.
|
||||
|
||||
Args:
|
||||
input_track (str): Input track in "ARTIST - SONG" format
|
||||
candidate_tracks (List[str]): List of candidate tracks in same format
|
||||
candidate_tracks (List[tuple[int, str]]): List of candidate tracks
|
||||
|
||||
Returns:
|
||||
Optional[Tuple[str, float]]: Tuple of (best matching track, similarity score)
|
||||
Optional[Tuple[int, str, float]]: Tuple of (best matching track, similarity score)
|
||||
or None if no good match found
|
||||
"""
|
||||
if not input_track or not candidate_tracks:
|
||||
@@ -33,12 +33,14 @@ class TrackMatcher:
|
||||
|
||||
# Normalize input track
|
||||
input_track = self._normalize_string(input_track)
|
||||
|
||||
print(f"input_track: {input_track}")
|
||||
|
||||
best_match = None
|
||||
best_score = 0
|
||||
|
||||
for candidate in candidate_tracks:
|
||||
normalized_candidate = self._normalize_string(candidate)
|
||||
normalized_candidate = self._normalize_string(candidate[1])
|
||||
|
||||
# Calculate various similarity scores
|
||||
exact_score = 1.0 if input_track == normalized_candidate else 0.0
|
||||
@@ -61,7 +63,7 @@ class TrackMatcher:
|
||||
extra spaces, and converting to lowercase.
|
||||
"""
|
||||
# Remove special characters and convert to lowercase
|
||||
text = re.sub(r'[^\w\s-]', '', text.lower())
|
||||
text = regex.sub(r'[^\w\s-]', '', text.lower())
|
||||
# Normalize spaces
|
||||
text = ' '.join(text.split())
|
||||
return text
|
||||
@@ -79,4 +81,16 @@ class TrackMatcher:
|
||||
intersection = tokens1.intersection(tokens2)
|
||||
union = tokens1.union(tokens2)
|
||||
|
||||
return len(intersection) / len(union)
|
||||
return len(intersection) / len(union)
|
||||
|
||||
class DataUtils:
    """Utility helpers for cleaning up text data."""

    def scrub_lyrics(self, lyrics: str) -> str:
        """Return *lyrics* with markup and trailing residue stripped.

        The substitutions run in a fixed order, each one operating on the
        output of the previous one:

        1. Remove every ``[...]`` span that sits on a single line, plus
           any whitespace and one optional colon that follow it.
        2. Remove the word ``Embed`` (case-insensitive, must end at a
           word boundary) together with at most one digit directly
           before it.
        3. Replace each pair of consecutive newlines with one newline.
        4. Remove a single digit located at the end of the text.

        Args:
            lyrics: Raw lyrics text.

        Returns:
            The cleaned lyrics string.
        """
        # (pattern, replacement, flags) -- order matters: step 4 can only
        # see a trailing digit once step 2 has removed the "Embed" marker.
        steps = (
            (r'(\[.*?\])(\s){0,}(\:){0,1}', '', 0),
            (r'(\d?)(Embed\b)', '', regex.IGNORECASE),
            (r'\n{2}', '\n', 0),  # Gaps between verses
            (r'[0-9]\b$', '', 0),
        )

        cleaned = lyrics
        for pattern, replacement, flags in steps:
            cleaned = regex.sub(pattern, replacement, cleaned, flags=flags)
        return cleaned
|
||||
|
Reference in New Issue
Block a user