radio_util: open tracks SQLite DB in readonly mode; black: reformat files
This commit is contained in:
@ -4,38 +4,41 @@ import logging
|
||||
import regex
|
||||
from regex import Pattern
|
||||
|
||||
|
||||
class TrackMatcher:
|
||||
"""Track Matcher"""
|
||||
|
||||
def __init__(self, threshold: float = 0.85):
|
||||
"""
|
||||
Initialize the TrackMatcher with a similarity threshold.
|
||||
|
||||
|
||||
Args:
|
||||
threshold (float): Minimum similarity score to consider a match valid
|
||||
(between 0 and 1, default 0.85)
|
||||
"""
|
||||
self.threshold = threshold
|
||||
|
||||
def find_best_match(self, input_track: str, candidate_tracks: List[tuple[int|str, str]]) -> Optional[tuple]:
|
||||
def find_best_match(
|
||||
self, input_track: str, candidate_tracks: List[tuple[int | str, str]]
|
||||
) -> Optional[tuple]:
|
||||
"""
|
||||
Find the best matching track from the candidate list.
|
||||
|
||||
|
||||
Args:
|
||||
input_track (str): Input track in "ARTIST - SONG" format
|
||||
candidate_tracks (List[tuple[int|str, str]]): List of candidate tracks
|
||||
|
||||
|
||||
Returns:
|
||||
Optional[tuple[int, str, float]]: Tuple of (best matching track, similarity score)
|
||||
or None if no good match found
|
||||
"""
|
||||
|
||||
|
||||
if not input_track or not candidate_tracks:
|
||||
return None
|
||||
|
||||
# Normalize input track
|
||||
input_track = self._normalize_string(input_track)
|
||||
|
||||
|
||||
best_match = None
|
||||
best_score: float = 0.0
|
||||
|
||||
@ -43,12 +46,16 @@ class TrackMatcher:
|
||||
normalized_candidate = self._normalize_string(candidate[1])
|
||||
if normalized_candidate.strip().lower() == input_track.strip().lower():
|
||||
return (candidate, 100.0)
|
||||
|
||||
|
||||
# Calculate various similarity scores
|
||||
exact_score = 1.0 if input_track == normalized_candidate else 0.0
|
||||
sequence_score = SequenceMatcher(None, input_track, normalized_candidate).ratio()
|
||||
token_score = self._calculate_token_similarity(input_track, normalized_candidate)
|
||||
|
||||
sequence_score = SequenceMatcher(
|
||||
None, input_track, normalized_candidate
|
||||
).ratio()
|
||||
token_score = self._calculate_token_similarity(
|
||||
input_track, normalized_candidate
|
||||
)
|
||||
|
||||
# Take the maximum of the different scoring methods
|
||||
final_score = max(exact_score, sequence_score, token_score)
|
||||
|
||||
@ -59,7 +66,7 @@ class TrackMatcher:
|
||||
# Return the match only if it meets the threshold
|
||||
if best_score < self.threshold:
|
||||
return None
|
||||
match: tuple = (best_match, round(best_score * 100))
|
||||
match: tuple = (best_match, round(best_score * 100))
|
||||
return match
|
||||
|
||||
def _normalize_string(self, text: str) -> str:
    """
    Produce a canonical, comparison-friendly form of a string.

    Args:
        text (str): Raw text to normalize

    Returns:
        str: Lowercased text in which every character other than word
            characters, whitespace and hyphens has been removed, and
            whitespace runs are collapsed to single spaces
    """
    # Drop punctuation first (hyphens survive), then fold case
    without_punctuation = regex.sub(r"[^\w\s-]", "", text)
    lowered = without_punctuation.lower()
    # split() with no argument trims both ends and treats any run of
    # whitespace as one separator, so re-joining yields single spaces
    words = lowered.split()
    return " ".join(words)
|
||||
|
||||
def _calculate_token_similarity(self, str1: str, str2: str) -> float:
|
||||
@ -88,28 +95,32 @@ class TrackMatcher:
|
||||
"""
|
||||
tokens1 = set(str1.split())
|
||||
tokens2 = set(str2.split())
|
||||
|
||||
|
||||
if not tokens1 or not tokens2:
|
||||
return 0.0
|
||||
|
||||
intersection = tokens1.intersection(tokens2)
|
||||
union = tokens1.union(tokens2)
|
||||
|
||||
|
||||
return len(intersection) / len(union)
|
||||
|
||||
|
||||
|
||||
class DataUtils:
|
||||
"""
|
||||
Data Utils
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
    """Compile the regular expressions stored on this instance."""
    # Bracketed "[NN:NN.N]" tag (1-3 digits after the dot); the "NN:NN"
    # part is captured, followed by optional whitespace-led text
    lrc_pattern = r"\[([0-9]{2}:[0-9]{2})\.[0-9]{1,3}\](\s(.*)){0,}"
    # Bracketed segment, then optional whitespace and an optional colon
    bracket_pattern = r"(\[.*?\])(\s){0,}(\:){0,1}"
    # Optional single digit directly followed by the word "Embed"
    embed_pattern = r"(\d?)(Embed\b)"
    # Exactly two consecutive newlines
    double_newline_pattern = r"\n{2}"
    # A digit sitting at a word boundary at the end of the text
    trailing_digit_pattern = r"[0-9]\b$"

    self.lrc_regex = regex.compile(lrc_pattern)
    self.scrub_regex_1: Pattern = regex.compile(bracket_pattern)
    self.scrub_regex_2: Pattern = regex.compile(
        embed_pattern, flags=regex.IGNORECASE
    )
    self.scrub_regex_3: Pattern = regex.compile(double_newline_pattern)
    self.scrub_regex_4: Pattern = regex.compile(trailing_digit_pattern)
|
||||
|
||||
def scrub_lyrics(self, lyrics: str) -> str:
    """
    Lyric Scrub Regex Chain

    Runs the four precompiled scrub patterns over the lyrics in order.

    Args:
        lyrics (str): The lyrics to scrub

    Returns:
        str: Regex scrubbed lyrics
    """
    # 1) Bracketed segments, plus any whitespace/colon that follows them
    lyrics = self.scrub_regex_1.sub("", lyrics)
    # 2) "Embed" marker together with the single digit preceding it
    lyrics = self.scrub_regex_2.sub("", lyrics)
    # 3) Gaps between verses: each pair of newlines becomes one
    lyrics = self.scrub_regex_3.sub("\n", lyrics)
    # 4) Digit left at the very end of the text. This step previously
    #    re-applied scrub_regex_3 with an empty replacement, which deleted
    #    any blank line that survived step 3 (fusing the neighbouring
    #    lyric lines together) and left scrub_regex_4 unused.
    lyrics = self.scrub_regex_4.sub("", lyrics)
    return lyrics
|
||||
|
||||
def create_lrc_object(self, lrc_str: str) -> list[dict]:
|
||||
"""
|
||||
@ -142,15 +153,21 @@ class DataUtils:
|
||||
if not reg_helper:
|
||||
continue
|
||||
reg_helper = reg_helper[0]
|
||||
logging.debug("Reg helper: %s for line: %s; len: %s",
|
||||
reg_helper, line, len(reg_helper))
|
||||
logging.debug(
|
||||
"Reg helper: %s for line: %s; len: %s",
|
||||
reg_helper,
|
||||
line,
|
||||
len(reg_helper),
|
||||
)
|
||||
_timetag = reg_helper[0]
|
||||
if not reg_helper[1].strip():
|
||||
_words = "♪"
|
||||
else:
|
||||
_words = reg_helper[1].strip()
|
||||
lrc_out.append({
|
||||
"timeTag": _timetag,
|
||||
"words": _words,
|
||||
})
|
||||
return lrc_out
|
||||
lrc_out.append(
|
||||
{
|
||||
"timeTag": _timetag,
|
||||
"words": _words,
|
||||
}
|
||||
)
|
||||
return lrc_out
|
||||
|
Reference in New Issue
Block a user