misc / TRip: folder structure / tar naming

2025-08-15 14:58:06 -04:00
parent 27fa1f78ed
commit 3cebe14674


@@ -4,8 +4,8 @@ import random
 import os
 import tarfile
 import uuid
-import re
 import shutil
+import re
 from pathlib import Path
 from urllib.parse import urlparse, unquote
@@ -13,14 +13,8 @@ import aiohttp
 from rq import get_current_job
 from utils.sr_wrapper import SRUtil
 
-# Configure logging
-logging.basicConfig(
-    level=logging.DEBUG,
-    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
-)
-
-# Constants
-ROOT_DIR = Path("/storage/music2")  # Change to your music folder
+# ---------- Config ----------
+ROOT_DIR = Path("/storage/music2")  # change to your music folder
 MAX_RETRIES = 3
 THROTTLE_MIN = 0.2
 THROTTLE_MAX = 1.5
@@ -36,44 +30,63 @@ HEADERS = {
"Connection": "keep-alive", "Connection": "keep-alive",
} }
# StreamRip utility logging.basicConfig(
level=logging.DEBUG,
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
sr = SRUtil() sr = SRUtil()
import re # ---------- Helpers ----------
def sanitize_filename(name: str) -> str: def sanitize_filename(name: str) -> str:
""" """Make a string safe for file/dir names."""
Remove or replace characters not allowed in filenames. if not name:
Also trims whitespace and collapses consecutive spaces. return "Unknown"
""" # Replace path separators first
# Replace slashes/backslashes with a dash
name = name.replace("/", "-").replace("\\", "-") name = name.replace("/", "-").replace("\\", "-")
# Remove illegal characters for most OSes # Remove illegal characters on common filesystems
name = re.sub(r'[<>:"|?*\x00-\x1F]', "", name) name = re.sub(r'[<>:"|?*\x00-\x1F]', "", name)
# Strip leading/trailing spaces and dots # Trim spaces and trailing dots
name = name.strip().strip(".") name = name.strip().strip(".")
# Collapse multiple spaces into one # Collapse whitespace
name = re.sub(r"\s+", " ", name) name = re.sub(r"\s+", " ", name)
return name or "Unknown" # Reasonable length cap
return name[:180] or "Unknown"
def ensure_unique_path(p: Path) -> Path:
"""If path exists, append ' (n)' before extension."""
if not p.exists():
return p
stem, suffix = p.stem, p.suffix
parent = p.parent
n = 2
while True:
candidate = parent / f"{stem} ({n}){suffix}"
if not candidate.exists():
return candidate
n += 1
# ---------- Job ----------
def bulk_download(track_list: list): def bulk_download(track_list: list):
""" """
Full RQ-compatible bulk download job with: RQ job:
- async per-track URL fetching - fetches stream URLs
- retry on failure - downloads with retries + throttling
- per-track success/failure - uses SR metadata to name/organize files
- metadata extraction - creates ONE tarball for all tracks, with all artist names in the filename
- organized file storage - returns [tarball_path]
- throttling
- per-artist tarball creation
- progress updates
""" """
job = get_current_job() job = get_current_job()
async def process_tracks(): async def process_tracks():
per_track_meta = [] per_track_meta = []
artist_files = {} # artist -> list of files all_final_files: list[Path] = []
all_artists: set[str] = set()
(ROOT_DIR / "completed").mkdir(parents=True, exist_ok=True)
async with aiohttp.ClientSession(headers=HEADERS) as session: async with aiohttp.ClientSession(headers=HEADERS) as session:
total = len(track_list) total = len(track_list)
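
Not part of the commit: a minimal sketch of how the two new helpers above combine when building a destination path. The artist/title strings and the "already taken" case are purely illustrative.

    # Illustrative only -- exercises sanitize_filename / ensure_unique_path as defined above.
    artist = sanitize_filename('AC/DC: "Back in Black"?')   # -> 'AC-DC Back in Black'
    target = ROOT_DIR / artist / "Back in Black" / "Hells Bells.flac"
    target.parent.mkdir(parents=True, exist_ok=True)
    final = ensure_unique_path(target)   # -> '... Hells Bells (2).flac' if the first name is taken
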
@@ -89,96 +102,124 @@ def bulk_download(track_list: list):
                 attempt = 0
                 while attempt < MAX_RETRIES:
+                    tmp_file: Path | None = None
                     attempt += 1
                     try:
-                        # Get track URL
+                        # 1) Stream URL
                         url = await sr.get_stream_url_by_track_id(track_id)
                         if not url:
-                            logging.critical(
-                                "Failed to get URL for track: %s", track_id
-                            )
-                            await asyncio.sleep(
-                                random.uniform(THROTTLE_MIN, THROTTLE_MAX)
-                            )
-                            continue
+                            raise RuntimeError("No stream URL")
 
-                        # Download file (chunked)
+                        # 2) Extension from URL path only (no query)
                         parsed = urlparse(url)
-                        ext = Path(unquote(parsed.path)).suffix or ".mp3"
-                        tmp_file = Path(f"/tmp/{track_id}{ext}")
+                        clean_path = unquote(parsed.path)  # path has no query; just in case we unquote
+                        ext = Path(clean_path).suffix or ".mp3"
+
+                        # Unique temp file
+                        tmp_file = Path(f"/tmp/{uuid.uuid4().hex}{ext}")
 
+                        # 3) Download (chunked)
                         async with session.get(url) as resp:
                             resp.raise_for_status()
                             with open(tmp_file, "wb") as f:
                                 async for chunk in resp.content.iter_chunked(64 * 1024):
                                     f.write(chunk)
 
-                        # Extract metadata
-                        metadata = await sr.get_metadata_by_track_id(track_id)
-                        if not metadata:
-                            logging.critical(
-                                "Failed to retrieve metadata for track ID: %s. Skipping",
-                                track_id,
-                            )
-                            continue
-                        artist = sanitize_filename(metadata.get("artist", "Unknown Artist"))
-                        album = sanitize_filename(metadata.get("album", "Unknown Album"))
-                        title = sanitize_filename(metadata.get("song", "Unknown Song"))
-                        logging.critical("Got metadata: %s/%s/%s", artist, album, title)
+                        # 4) Metadata from SR (prefer API over tags)
+                        md = await sr.get_metadata_by_track_id(track_id) or {}
+                        artist_raw = md.get("artist") or "Unknown Artist"
+                        album_raw = md.get("album") or "Unknown Album"
+                        title_raw = md.get("song") or f"Track {track_id}"
+
+                        artist = sanitize_filename(artist_raw)
+                        album = sanitize_filename(album_raw)
+                        title = sanitize_filename(title_raw)
+                        all_artists.add(artist)
 
-                        # Organize path
+                        # 5) Final path
                         final_dir = ROOT_DIR / artist / album
                         final_dir.mkdir(parents=True, exist_ok=True)
-                        final_file = final_dir / f"{title}{ext}"
-                        tmp_file.rename(final_file)
+                        final_file = ensure_unique_path(final_dir / f"{title}{ext}")
 
-                        # Track per-track info
+                        tmp_file.rename(final_file)
+                        tmp_file = None  # consumed
+
+                        # Track success
                         track_info.update(
                             {"status": "success", "file_path": str(final_file)}
                         )
-                        artist_files.setdefault(artist, []).append(final_file)
-                        break  # success
+                        all_final_files.append(final_file)
+                        break  # success; exit retry loop
 
                     except Exception as e:
-                        logging.error("Error downloading track %s: %s", track_id, e)
+                        logging.error("Track %s attempt %s failed: %s", track_id, attempt, e)
                         track_info["error"] = str(e)
                         if attempt >= MAX_RETRIES:
                             track_info["status"] = "failed"
-                        else:
-                            # small delay before retry
-                            await asyncio.sleep(
-                                random.uniform(THROTTLE_MIN, THROTTLE_MAX)
-                            )
+                        # small backoff before next attempt (or next track)
+                        await asyncio.sleep(random.uniform(THROTTLE_MIN, THROTTLE_MAX))
+                    finally:
+                        # Clean partial temp file on failure
+                        if tmp_file and tmp_file.exists():
+                            try:
+                                tmp_file.unlink()
+                            except Exception:
+                                pass
 
-                # Update RQ job meta
+                # Update RQ meta after each track
                 per_track_meta.append(track_info)
                 if job:
-                    job.meta["progress"] = int((i + 1) / total * 100)
+                    job.meta["progress"] = int((i + 1) / max(total, 1) * 100)
                     job.meta["tracks"] = track_list
                     job.save_meta()
 
-                # Throttle between downloads
+                # Throttle between tracks
                 await asyncio.sleep(random.uniform(THROTTLE_MIN, THROTTLE_MAX))
 
-        # Create per-artist tarballs
-        tarballs = []
-        for artist, files in artist_files.items():
-            short_id = uuid.uuid4().hex[:8]
-            tarball_name = ROOT_DIR / "completed" / f"{artist}_{short_id}.tar.gz"
-            with tarfile.open(tarball_name, "w:gz") as tar:
-                for f in files:
-                    tar.add(f, arcname=f.name)
-                    os.remove(f)  # remove original file
-            logging.critical("Created tarball: %s", tarball_name)
-            tarballs.append(str(tarball_name))
-            artist_dir = ROOT_DIR / artist
-            shutil.rmtree(artist_dir, ignore_errors=True)
-
-        return tarballs
+        # ---- Single combined tarball for all tracks ----
+        if not all_final_files:
+            # nothing succeeded
+            return []
+
+        combined_artists = sanitize_filename(" & ".join(sorted(all_artists))) or "Unknown Artist"
+        short_id = uuid.uuid4().hex[:8]
+        tarball_path = ROOT_DIR / "completed" / f"{combined_artists}_{short_id}.tar.gz"
+        tarball_path.parent.mkdir(parents=True, exist_ok=True)
+
+        with tarfile.open(tarball_path, "w:gz") as tar:
+            for f in all_final_files:
+                # Preserve relative Artist/Album/Song.ext structure inside the tar
+                try:
+                    arcname = f.relative_to(ROOT_DIR)
+                except ValueError:
+                    arcname = f.name  # fallback
+                tar.add(f, arcname=str(arcname))
+                # remove original file after adding
+                try:
+                    os.remove(f)
+                except Exception:
+                    pass
+
+        logging.critical("Created tarball: %s", tarball_path)
+
+        # Cleanup empty artist/album dirs (best-effort):
+        # remove any directories under ROOT_DIR that are now empty
+        to_check = {p.parent for p in all_final_files} | {p.parent.parent for p in all_final_files}
+        for d in sorted(to_check, key=lambda p: len(p.parts), reverse=True):
+            if d.is_dir():
+                try:
+                    # remove only if empty
+                    next(d.iterdir())
+                except StopIteration:
+                    # empty
+                    shutil.rmtree(d, ignore_errors=True)
+                except Exception:
+                    pass
+
+        return [str(tarball_path)]
 
-    # Run the async function synchronously
+    # Run async part synchronously for RQ
     loop = asyncio.new_event_loop()
     asyncio.set_event_loop(loop)
     try:
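
Not part of the commit: a minimal sketch of how this job could be enqueued and what it returns, assuming a standard RQ + Redis setup; the queue name, track IDs, and example result path are placeholders.

    # Hypothetical caller -- assumes a Redis server and an RQ worker that imports this module.
    from redis import Redis
    from rq import Queue

    q = Queue("downloads", connection=Redis())          # queue name is an assumption
    job = q.enqueue(bulk_download, ["track-id-1", "track-id-2"], job_timeout=3600)

    # The worker updates job.meta["progress"] per track (call job.refresh() to see it).
    # On success the job's return value is a one-element list such as:
    #   ['/storage/music2/completed/Artist A & Artist B_1a2b3c4d.tar.gz']
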