Files
api/utils/rip_background.py
2025-09-09 15:50:13 -04:00

398 lines
14 KiB
Python

import logging
import asyncio
import random
import os
import tarfile
import traceback
import uuid
import subprocess
import shutil
import re
from pathlib import Path
from urllib.parse import urlparse, unquote
import aiohttp
from datetime import datetime
from mediafile import MediaFile # type: ignore[import]
from rq import get_current_job
from utils.sr_wrapper import SRUtil
# ---------- Config ----------
ROOT_DIR = Path("/storage/music2")
MAX_RETRIES = 5
THROTTLE_MIN = 1.7
THROTTLE_MAX = 10.0
HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/116.0.5845.97 Safari/537.36"
),
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.9",
"Connection": "keep-alive",
}
logging.basicConfig(
level=logging.DEBUG,
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
sr = SRUtil()
# ---------- Helpers ----------
def tag_with_mediafile(file_path: str, meta: dict):
f = MediaFile(file_path)
# --- Helper to safely set textual/number fields ---
def safe_set(attr, value, default=None, cast=None):
if value is None:
value = default
if value is not None:
if cast is not None:
setattr(f, attr, cast(value))
else:
setattr(f, attr, str(value))
# --- Basic textual metadata ---
safe_set("title", meta.get("title"), default="Unknown Title")
safe_set("artist", meta.get("artist"), default="Unknown Artist")
safe_set("albumartist", meta.get("album_artist"), default="Unknown Artist")
safe_set("album", meta.get("album"), default="Unknown Album")
safe_set("track", meta.get("track_number"), default=0, cast=int)
safe_set("disc", meta.get("disc_number"), default=0, cast=int)
safe_set("isrc", meta.get("isrc"), default="")
safe_set("bpm", meta.get("bpm"), default=0, cast=int)
# --- Release date ---
release_date_str = meta.get("release_date")
release_date_obj = None
if release_date_str:
try:
release_date_obj = datetime.fromisoformat(release_date_str).date()
except ValueError:
try:
# fallback if only year string
release_date_obj = datetime(int(release_date_str[:4]), 1, 1).date()
except Exception:
pass
if release_date_obj:
f.date = release_date_obj
# --- Save all tags ---
f.save()
def cleanup_empty_dirs(root: Path):
"""
Recursively remove any directories under root that contain no files
(empty or only empty subdirectories).
"""
for dirpath, dirnames, filenames in os.walk(root, topdown=False):
p = Path(dirpath)
has_file = any(f.is_file() for f in p.rglob("*"))
if not has_file:
try:
p.rmdir()
except Exception:
pass
def sanitize_filename(name: str) -> str:
"""Make a string safe for file/dir names."""
if not name:
return "Unknown"
name = name.replace("/", "-").replace("\\", "-")
name = re.sub(r'[<>:"|?*\x00-\x1F]', "", name)
name = name.strip().strip(".")
name = re.sub(r"\s+", " ", name)
return name[:180] or "Unknown"
def ensure_unique_path(p: Path) -> Path:
"""
Ensure the given file or directory path is unique *within its parent folder*.
Only appends (2), (3)... if a real conflict exists in that folder.
"""
parent = p.parent
stem, suffix = p.stem, p.suffix
existing = {f.name for f in parent.glob(f"*{suffix}") if f.is_file()}
candidate = f"{stem}{suffix}"
if candidate not in existing:
return parent / candidate
counter = 2
while True:
candidate = f"{stem} ({counter}){suffix}"
if candidate not in existing:
return parent / candidate
counter += 1
# ---------- Job ----------
def bulk_download(track_list: list, quality: str = "FLAC"):
"""
RQ job:
- fetches stream URLs
- downloads with retries + throttling
- uses SR metadata to name/organize files
- creates ONE tarball for all tracks
- returns [tarball_path]
"""
job = get_current_job()
job_id = job.id if job else uuid.uuid4().hex
staging_root = ROOT_DIR / job_id
if job:
try:
job.meta["track_ids"] = [str(t) for t in (track_list or [])]
job.meta["tracks"] = []
job.meta["progress"] = 0
job.meta["tarball"] = None
job.meta["status"] = "Started"
job.save_meta()
except Exception as e:
logging.warning("Failed to init job.meta: %s", e)
async def process_tracks():
per_track_meta = []
all_final_files = []
all_artists = set()
(ROOT_DIR / "completed").mkdir(parents=True, exist_ok=True)
async with aiohttp.ClientSession(headers=HEADERS) as session:
total = len(track_list or [])
logging.critical("Total tracks to process: %s", total)
if job:
job.meta["progress"] = 0
job.save_meta()
for i, track_id in enumerate(track_list or []):
track_info = {
"track_id": str(track_id),
"status": "Pending",
"file_path": None,
"error": None,
"attempts": 0,
}
attempt = 0
while attempt < MAX_RETRIES:
tmp_file = None
attempt += 1
track_info["attempts"] = attempt
try:
url = await sr.get_stream_url_by_track_id(track_id, quality)
if not url:
raise RuntimeError("No stream URL")
parsed = urlparse(url)
clean_path = unquote(parsed.path)
ext = Path(clean_path).suffix or ".mp3"
tmp_file = Path(f"/tmp/{uuid.uuid4().hex}{ext}")
async with session.get(url) as resp:
resp.raise_for_status()
with open(tmp_file, "wb") as f:
async for chunk in resp.content.iter_chunked(64 * 1024):
f.write(chunk)
md = await sr.get_metadata_by_track_id(track_id) or {}
logging.info("Metadata for %s: %s", track_id, md)
artist_raw = md.get("artist") or "Unknown Artist"
album_raw = md.get("album") or "Unknown Album"
title_raw = md.get("title") or f"Track {track_id}"
artist = sanitize_filename(artist_raw)
album = sanitize_filename(album_raw)
title = sanitize_filename(title_raw)
all_artists.add(artist)
artist_dir = staging_root / artist
album_dir = artist_dir / album
album_dir.mkdir(parents=True, exist_ok=True)
final_file = ensure_unique_path(album_dir / f"{title}{ext}")
tag_with_mediafile(str(tmp_file), md)
tmp_file.rename(final_file)
tmp_file = None
track_info["status"] = "Success"
track_info["file_path"] = str(final_file)
track_info["error"] = None
all_final_files.append(final_file)
if job:
job.meta["progress"] = int(((i + 1) / total) * 100)
job.save_meta()
break
except aiohttp.ClientResponseError as e:
if e.status == 429:
wait_time = min(60, 2**attempt) # exponential up to 60s
logging.warning(
"Rate limited (429). Sleeping %s seconds", wait_time
)
await asyncio.sleep(wait_time)
else:
await asyncio.sleep(
random.uniform(THROTTLE_MIN, THROTTLE_MAX)
)
except Exception as e:
logging.error(
"Track %s attempt %s failed: %s", track_id, attempt, e
)
traceback.print_exc()
track_info["error"] = str(e)
if attempt >= MAX_RETRIES:
track_info["status"] = "Failed"
await asyncio.sleep(random.uniform(THROTTLE_MIN, THROTTLE_MAX))
finally:
try:
await session.close()
if tmp_file and tmp_file.exists():
tmp_file.unlink()
except Exception:
pass
per_track_meta.append(track_info)
if job:
try:
job.meta["tracks"] = per_track_meta
job.save_meta()
except Exception as e:
logging.warning(
"Failed to update job.meta after track %s: %s", track_id, e
)
await asyncio.sleep(random.uniform(THROTTLE_MIN, THROTTLE_MAX))
if not all_final_files:
if job:
try:
job.meta["tarball"] = None
job.meta["status"] = "Failed"
job.save_meta()
except Exception:
pass
return []
artist_counts: dict[str, int] = {}
for t in per_track_meta:
if t["status"] == "Success" and t.get("file_path"):
try:
artist = Path(t["file_path"]).relative_to(staging_root).parts[0]
except Exception:
artist = "Unknown Artist"
artist_counts[artist] = artist_counts.get(artist, 0) + 1
if artist_counts:
top_artist = sorted(artist_counts.items(), key=lambda kv: (-kv[1], kv[0]))[
0
][0]
else:
top_artist = "Unknown Artist"
combined_artist = sanitize_filename(top_artist)
short_id = uuid.uuid4().hex[:8]
staged_tarball = staging_root / f"{combined_artist}_{short_id}.tar.gz"
final_tarball = ROOT_DIR / "completed" / quality / staged_tarball.name
final_tarball.parent.mkdir(parents=True, exist_ok=True)
if job:
try:
job.meta["status"] = "Compressing"
job.save_meta()
except Exception:
pass
logging.info("Creating tarball: %s", staged_tarball)
def _create_tar_sync():
try:
subprocess.run(
[
"tar",
"-I",
"pigz -9",
"-cf",
str(staged_tarball),
"-C",
str(staging_root),
]
+ [str(f.relative_to(staging_root)) for f in all_final_files],
check=True,
)
for f in all_final_files:
try:
os.remove(f)
except Exception:
pass
except FileNotFoundError:
logging.warning("pigz not available, falling back to tarfile (slower).")
with tarfile.open(staged_tarball, "w:gz") as tar:
for f in all_final_files:
try:
arcname = f.relative_to(staging_root)
except ValueError:
arcname = f.name
tar.add(f, arcname=str(arcname))
try:
os.remove(f)
except Exception:
pass
await asyncio.to_thread(_create_tar_sync)
if not staged_tarball.exists():
logging.error("Tarball was not created: %s", staged_tarball)
if job:
try:
job.meta["status"] = "compress_failed"
job.save_meta()
except Exception:
pass
return []
logging.critical("Tarball created: %s", staged_tarball)
try:
staged_tarball.rename(final_tarball)
except Exception:
shutil.move(str(staged_tarball), str(final_tarball))
logging.critical("Tarball finalized: %s", final_tarball)
await asyncio.to_thread(shutil.rmtree, staging_root, ignore_errors=True)
if job:
job.meta["tarball"] = str(final_tarball)
job.meta["progress"] = 100
job.meta["status"] = "Completed"
job.save_meta()
return [str(final_tarball)]
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
return loop.run_until_complete(process_tracks())
except Exception as e:
if job:
job.meta["status"] = "Failed"
job.save_meta()
logging.critical("Exception: %s", str(e))
finally:
loop.close()