# api/utils/rip_background.py
import logging
import asyncio
import random
import os
import tarfile
import uuid
import shutil
import re

from pathlib import Path
from urllib.parse import urlparse, unquote

import aiohttp
from rq import get_current_job

from utils.sr_wrapper import SRUtil

# ---------- Config ----------
ROOT_DIR = Path("/storage/music2")

MAX_RETRIES = 3
THROTTLE_MIN = 0.3
THROTTLE_MAX = 1.0

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/116.0.5845.97 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
}

logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)

sr = SRUtil()


# ---------- Helpers ----------
def cleanup_empty_dirs(root: Path):
    """
    Recursively remove any directories under root that contain no files
    (empty or only empty subdirectories).
    """
    for dirpath, dirnames, filenames in os.walk(root, topdown=False):
        p = Path(dirpath)
        # Check if there are any files in this directory or subdirectories
        has_file = any(f.is_file() for f in p.rglob("*"))
        if not has_file:
            try:
                p.rmdir()  # safe to remove empty dirs
            except Exception:
                pass


def sanitize_filename(name: str) -> str:
    """Make a string safe for file/dir names."""
    if not name:
        return "Unknown"
    # Replace path separators first
    name = name.replace("/", "-").replace("\\", "-")
    # Remove illegal characters on common filesystems
    name = re.sub(r'[<>:"|?*\x00-\x1F]', "", name)
    # Trim spaces and trailing dots
    name = name.strip().strip(".")
    # Collapse whitespace
    name = re.sub(r"\s+", " ", name)
    # Reasonable length cap
    return name[:180] or "Unknown"
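

# Example of the rules above (for illustration):
#   sanitize_filename('AC/DC: "Back in Black"') -> 'AC-DC Back in Black'
# (path separators become dashes, illegal characters are dropped, whitespace collapses)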


def ensure_unique_path(p: Path) -> Path:
    """Always append a short UUID fragment before the extension."""
    stem, suffix = p.stem, p.suffix
    parent = p.parent
    short_id = uuid.uuid4().hex[:8]
    return parent / f"{stem}_{short_id}{suffix}"
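

# Example (illustrative; the 8-char hex fragment is random per call):
#   ensure_unique_path(Path("Artist/Album/Song.flac")) -> Path("Artist/Album/Song_1a2b3c4d.flac")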
# ---------- Job ----------
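# Illustrative usage (assumption; the enqueueing code is not part of this module):
# the API side is expected to submit this function as an RQ job, e.g.
#   from redis import Redis
#   from rq import Queue
#   Queue("default", connection=Redis()).enqueue(bulk_download, ["123", "456"], "FLAC")
# Callers can then poll job.meta ("progress", "tracks", "tarball", "status").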
def bulk_download(track_list: list, quality: str = "FLAC"):
    """
    RQ job:
      - fetches stream URLs
      - downloads with retries + throttling
      - uses SR metadata to name/organize files
      - creates ONE tarball for all tracks
      - returns [tarball_path]
    """
    job = get_current_job()

    # Initialize job meta in a JSON/pickle-safe way
    if job:
        try:
            job.meta["track_ids"] = [str(t) for t in (track_list or [])]
            job.meta["tracks"] = []  # will hold per-track dicts
            job.meta["progress"] = 0
            job.meta["tarball"] = None
            job.meta["status"] = "started"
            job.save_meta()
        except Exception as e:
            logging.warning("Failed to init job.meta: %s", e)
    async def process_tracks():
        per_track_meta = []  # list of per-track dicts (JSON-safe)
        all_final_files = []  # list[Path]
        all_artists = set()  # set[str]
        (ROOT_DIR / "completed").mkdir(parents=True, exist_ok=True)

        async with aiohttp.ClientSession(headers=HEADERS) as session:
            total = len(track_list or [])
            logging.critical("Total tracks to process: %s", total)
            if job:
                job.meta["progress"] = 0
                job.save_meta()

            for i, track_id in enumerate(track_list or []):
                track_info = {
                    "track_id": str(track_id),
                    "status": "pending",  # pending | success | failed
                    "file_path": None,  # str | None
                    "error": None,  # str | None
                    "attempts": 0,  # int
                }
                attempt = 0
                while attempt < MAX_RETRIES:
                    tmp_file = None
                    attempt += 1
                    track_info["attempts"] = attempt
                    try:
                        # 1) Stream URL
                        url = await sr.get_stream_url_by_track_id(track_id, quality)
                        if not url:
                            raise RuntimeError("No stream URL")

                        # 2) Extension from URL path only (no query)
                        parsed = urlparse(url)
                        clean_path = unquote(parsed.path)
                        ext = Path(clean_path).suffix or ".mp3"

                        # Unique temp file
                        tmp_file = Path(f"/tmp/{uuid.uuid4().hex}{ext}")

                        # 3) Download (chunked)
                        async with session.get(url) as resp:
                            resp.raise_for_status()
                            with open(tmp_file, "wb") as f:
                                async for chunk in resp.content.iter_chunked(64 * 1024):
                                    f.write(chunk)

                        # 4) Metadata from SR (prefer API over tags)
                        md = await sr.get_metadata_by_track_id(track_id) or {}
                        artist_raw = md.get("artist") or "Unknown Artist"
                        album_raw = md.get("album") or "Unknown Album"
                        title_raw = md.get("song") or f"Track {track_id}"
                        artist = sanitize_filename(artist_raw)
                        album = sanitize_filename(album_raw)
                        title = sanitize_filename(title_raw)
                        all_artists.add(artist)

                        # 5) Final path
                        final_dir = ROOT_DIR / artist / album
                        final_dir.mkdir(parents=True, exist_ok=True)
                        final_file = ensure_unique_path(final_dir / f"{title}{ext}")
                        tmp_file.rename(final_file)
                        tmp_file = None  # consumed

                        # Track success
                        track_info["status"] = "success"
                        track_info["file_path"] = str(final_file)
                        track_info["error"] = None
                        all_final_files.append(final_file)

                        if job:
                            job.meta["progress"] = int(((i + 1) / total) * 100)
                            job.save_meta()
                        break  # success; exit retry loop

                    except Exception as e:
                        logging.error("Track %s attempt %s failed: %s", track_id, attempt, e)
                        track_info["error"] = str(e)
                        if attempt >= MAX_RETRIES:
                            track_info["status"] = "failed"
                        # small backoff before next attempt (or next track)
                        await asyncio.sleep(random.uniform(THROTTLE_MIN, THROTTLE_MAX))
                    finally:
                        # Clean partial temp file on failure
                        try:
                            if tmp_file and tmp_file.exists():
                                tmp_file.unlink()
                        except Exception:
                            pass

                # Update RQ meta after each track
                per_track_meta.append(track_info)
                if job:
                    try:
                        job.meta["tracks"] = per_track_meta
                        job.save_meta()
                    except Exception as e:
                        logging.warning("Failed to update job.meta after track %s: %s", track_id, e)

                # Throttle between tracks
                await asyncio.sleep(random.uniform(THROTTLE_MIN, THROTTLE_MAX))

        # ---- Single combined tarball for all tracks ----
        if not all_final_files:
            if job:
                try:
                    job.meta["tarball"] = None
                    job.meta["status"] = "failed"
                    job.save_meta()
                except Exception:
                    pass
            return []

        # Pick artist with the most tracks
        artist_counts: dict[str, int] = {}
        for t in per_track_meta:
            if t["status"] == "success" and t.get("file_path"):
                try:
                    artist = Path(t["file_path"]).relative_to(ROOT_DIR).parts[0]
                except Exception:
                    artist = "Unknown Artist"
                artist_counts[artist] = artist_counts.get(artist, 0) + 1

        if artist_counts:
            top_artist = sorted(artist_counts.items(), key=lambda kv: (-kv[1], kv[0]))[0][0]
        else:
            top_artist = "Unknown Artist"

        combined_artist = sanitize_filename(top_artist)
        short_id = uuid.uuid4().hex[:8]

        # Stage tarball in ROOT_DIR first
        staged_tarball = ROOT_DIR / f"{combined_artist}_{short_id}.tar.gz"
        final_tarball = ROOT_DIR / "completed" / quality / staged_tarball.name
        final_tarball.parent.mkdir(parents=True, exist_ok=True)

        if job:
            try:
                job.meta["status"] = "compressing"
                job.save_meta()
            except Exception:
                pass

        logging.info("Creating tarball: %s", staged_tarball)

        # Run blocking tar creation in a background thread
        def _create_tar_sync():
            with tarfile.open(staged_tarball, "w:gz") as tar:
                for f in all_final_files:
                    try:
                        arcname = f.relative_to(ROOT_DIR)
                    except ValueError:
                        arcname = f.name
                    tar.add(f, arcname=str(arcname))
                    try:
                        os.remove(f)
                    except Exception:
                        pass

        await asyncio.to_thread(_create_tar_sync)

        # Sanity check
        if not staged_tarball.exists():
            logging.error("Tarball was not created: %s", staged_tarball)
            if job:
                try:
                    job.meta["status"] = "compress_failed"
                    job.save_meta()
                except Exception:
                    pass
            return []

        logging.critical("Tarball created: %s", staged_tarball)

        # Now move tarball into completed folder
        try:
            staged_tarball.rename(final_tarball)
        except Exception:
            shutil.move(str(staged_tarball), str(final_tarball))

        logging.critical("Tarball finalized: %s", final_tarball)

        await asyncio.to_thread(cleanup_empty_dirs, ROOT_DIR)

        if job:
            job.meta["tarball"] = str(final_tarball)
            job.meta["progress"] = 100
            job.meta["status"] = "completed"
            job.save_meta()

        return [str(final_tarball)]

    # Run async part synchronously for RQ
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        return loop.run_until_complete(process_tracks())
    except Exception as e:
        if job:
            job.meta["status"] = "failed"
            job.save_meta()
        logging.critical("Exception: %s", str(e))
    finally:
        loop.close()
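

# Worker-side note (assumption, not part of this module): an RQ worker process,
# e.g. `rq worker -u redis://localhost:6379 <queue>`, executes bulk_download and
# persists job.meta to Redis so the API can poll progress/status while it runs.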