# api/utils/rip_background.py
# Standard library
import logging
import asyncio
import random
import os
import tarfile
import traceback
import uuid
import subprocess
import shutil
from pathlib import Path
from typing import Optional
from urllib.parse import urlparse, unquote

# Third-party
import aiohttp  # async HTTP client: track downloads + Discord webhook posts
from datetime import datetime, timezone
from mediafile import MediaFile  # type: ignore[import]
from rq import get_current_job
from utils.sr_wrapper import SRUtil
from dotenv import load_dotenv
import re
2025-08-15 13:31:15 -04:00
# ---------- Config ----------
ROOT_DIR = Path("/storage/music2")  # staging dirs + completed/ output live here

MAX_RETRIES = 5      # per-track download attempts
THROTTLE_MIN = 1.0   # seconds — floor of the random inter-request delay
THROTTLE_MAX = 3.5   # seconds — ceiling of the random inter-request delay

# Load .env BEFORE reading environment variables. Previously load_dotenv()
# ran after os.getenv("TRIP_WEBHOOK_URI"), so a webhook URL supplied only
# via a .env file was never picked up and Discord notifications silently
# targeted an empty URL.
load_dotenv()
DISCORD_WEBHOOK = os.getenv("TRIP_WEBHOOK_URI", "").strip()

# Browser-like headers for the download session.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/116.0.5845.97 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
}

logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)

# Shared SR API wrapper instance used by the job below.
sr = SRUtil()
2025-08-15 14:15:13 -04:00
2025-09-18 08:13:21 -04:00
# ---------- Discord helper ----------
async def discord_notify(webhook_url: str, title: str, description: str, target: Optional[str] = None, color: int = 0x00FF00):
    """POST a single embed to a Discord webhook.

    Args:
        webhook_url: full webhook URL; a falsy value makes this a silent no-op.
        title: embed title.
        description: embed body (truncated to 1900 chars for Discord's limit).
        target: optional value rendered as a "Target" embed field.
        color: embed accent color (default green).

    Retries transient failures with a fixed 5s delay but gives up after a
    bounded number of attempts — the previous unbounded `while True` retry
    loop could hang a worker forever on a misconfigured (or empty) webhook.
    """
    if not webhook_url:
        # TRIP_WEBHOOK_URI not configured — nothing to send.
        return
    embed = {
        "title": title,
        "description": description[:1900] if description else "",
        "color": color,
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
    if target:
        embed["fields"] = [{"name": "Target", "value": str(target), "inline": True}]
    payload = {
        "embeds": [embed],
    }
    max_attempts = 5
    for attempt in range(1, max_attempts + 1):
        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(webhook_url, json=payload, timeout=aiohttp.ClientTimeout(total=10)) as resp:
                    if resp.status >= 400:
                        text = await resp.text()
                        raise RuntimeError(f"Discord webhook failed ({resp.status}): {text}")
            return
        except Exception as e:
            # Use logging (not print) so failures land in the worker log.
            logging.warning("Discord send failed (attempt %d/%d): %s", attempt, max_attempts, e)
            if attempt < max_attempts:
                await asyncio.sleep(5)
# Strong references to in-flight notification tasks: the event loop holds
# only weak refs to tasks, so a fire-and-forget task may be GC'd mid-flight.
_pending_discord_tasks: set = set()


def send_log_to_discord(message: str, level: str, target: Optional[str] = None):
    """Forward a log message to the Discord webhook, color-coded by level.

    Works from both sync and async contexts: inside a running event loop the
    send is scheduled as a background task; otherwise it runs to completion
    synchronously.

    Args:
        message: text to post (truncated downstream to Discord's limit).
        level: severity label ("WARNING", "ERROR", "CRITICAL", ...).
        target: optional target identifier shown as an embed field.
    """
    if not DISCORD_WEBHOOK:
        # No webhook configured — avoid pointless sends to an empty URL.
        return
    colors = {"WARNING": 0xFFA500, "ERROR": 0xFF0000, "CRITICAL": 0xFF0000}
    color = colors.get(level.upper(), 0xFFFF00)

    async def _send():
        await discord_notify(
            webhook_url=DISCORD_WEBHOOK,
            title=f"{level} in bulk_download",
            description=message,
            target=target,
            color=color,
        )

    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # Not in an event loop — safe to run synchronously.
        asyncio.run(_send())
        return
    # Already in an event loop: schedule the send and keep a strong reference
    # until it completes (see module-level _pending_discord_tasks note).
    task = loop.create_task(_send())
    _pending_discord_tasks.add(task)
    task.add_done_callback(_pending_discord_tasks.discard)
2025-08-21 15:08:13 -04:00
2025-09-18 08:13:21 -04:00
# ---------- Helpers ----------
def tag_with_mediafile(file_path: str, meta: dict):
    """Stamp the fields of `meta` onto the audio file at `file_path`.

    Missing values fall back to neutral defaults; the release date accepts
    either a full ISO date string or just a leading year.
    """
    audio = MediaFile(file_path)

    def _write(field, raw_value, fallback=None, cast=None):
        # Prefer the supplied value, then the fallback; skip when both are None.
        value = fallback if raw_value is None else raw_value
        if value is None:
            return
        setattr(audio, field, cast(value) if cast else str(value))

    _write("title", meta.get("title"), fallback="Unknown Title")
    _write("artist", meta.get("artist"), fallback="Unknown Artist")
    _write("albumartist", meta.get("album_artist"), fallback="Unknown Artist")
    _write("album", meta.get("album"), fallback="Unknown Album")
    _write("track", meta.get("track_number"), fallback=0, cast=int)
    _write("disc", meta.get("disc_number"), fallback=0, cast=int)
    _write("isrc", meta.get("isrc"), fallback="")
    _write("bpm", meta.get("bpm"), fallback=0, cast=int)

    raw_date = meta.get("release_date")
    parsed_date = None
    if raw_date:
        try:
            parsed_date = datetime.fromisoformat(raw_date).date()
        except ValueError:
            # Not a full ISO date — salvage just the year if possible.
            try:
                parsed_date = datetime(int(raw_date[:4]), 1, 1).date()
            except Exception:
                parsed_date = None
    if parsed_date:
        audio.date = parsed_date
    audio.save()
2025-08-15 14:15:13 -04:00
# Characters that are illegal or risky in filenames on common filesystems.
_FS_ILLEGAL_RE = re.compile(r'[<>:"|?*\x00-\x1F]')
_WHITESPACE_RE = re.compile(r"\s+")


def sanitize_filename(name: str) -> str:
    """Return *name* made safe for use as a single path component.

    Path separators become dashes, filesystem-hostile characters are
    removed, surrounding whitespace and dots are trimmed, interior
    whitespace runs collapse to one space, and the result is capped at
    180 characters. Falls back to "Unknown" when nothing usable remains.
    """
    if not name:
        return "Unknown"
    safe = _FS_ILLEGAL_RE.sub("", name.replace("/", "-").replace("\\", "-"))
    safe = _WHITESPACE_RE.sub(" ", safe.strip().strip("."))
    return safe[:180] or "Unknown"
2025-09-09 15:50:13 -04:00
def ensure_unique_path(p: Path) -> Path:
    """Return *p*, or a "name (N)" variant when *p* already exists.

    Counting starts at 2, matching "file (2).ext"-style de-duplication.
    """
    directory, stem, suffix = p.parent, p.stem, p.suffix
    # Snapshot sibling filenames sharing the extension once, so every
    # candidate check is a set lookup rather than a disk stat.
    taken = {entry.name for entry in directory.glob(f"*{suffix}") if entry.is_file()}
    if f"{stem}{suffix}" not in taken:
        return directory / f"{stem}{suffix}"
    n = 2
    while f"{stem} ({n}){suffix}" in taken:
        n += 1
    return directory / f"{stem} ({n}){suffix}"
2025-09-09 15:50:13 -04:00
2025-08-21 15:08:13 -04:00
2025-09-18 08:13:21 -04:00
# ---------- bulk_download ----------
def bulk_download(track_list: list, quality: str = "FLAC") -> Optional[list]:
    """
    RQ job:
      - fetches stream URLs
      - downloads with retries + throttling
      - uses SR metadata to name/organize files
      - creates ONE tarball for all tracks
      - returns [tarball_path] on success, [] when nothing could be
        downloaded or compressed, or None on an unexpected top-level failure
      - sends relevant messages to Discord

    Args:
        track_list: track IDs understood by SRUtil.
        quality: stream-quality label forwarded to SRUtil (e.g. "FLAC").
    """
    job = get_current_job()
    # Random fallback ID keeps the staging dir unique when run outside RQ.
    job_id = job.id if job else uuid.uuid4().hex
    target = job.meta.get("target") if job else None
    staging_root = ROOT_DIR / job_id

    # Seed job.meta so API consumers can poll status/progress immediately.
    if job:
        try:
            job.meta["track_ids"] = [str(t) for t in (track_list or [])]
            job.meta["tracks"] = []
            job.meta["progress"] = 0
            job.meta["tarball"] = None
            job.meta["status"] = "Started"
            job.save_meta()
        except Exception as e:
            send_log_to_discord(f"Failed to init job.meta: {e}", "WARNING", target)

    # Job started Discord message.
    # NOTE(review): this blocks the worker until the webhook call completes —
    # confirm discord_notify cannot stall indefinitely on a bad webhook URL.
    asyncio.run(discord_notify(
        DISCORD_WEBHOOK,
        title=f"Job Started: {job_id}",
        description=f"Processing `{len(track_list)}` track(s)",
        target=target,
        color=0x00FFFF
    ))

    async def process_tracks() -> list:
        """Download every track into staging_root, then build one tarball."""
        per_track_meta = []   # per-track status dicts (mirrored into job.meta)
        all_final_files = []  # Paths of successfully staged audio files
        all_artists = set()
        (ROOT_DIR / "completed").mkdir(parents=True, exist_ok=True)
        async with aiohttp.ClientSession(headers=HEADERS) as session:
            total = len(track_list or [])
            for i, track_id in enumerate(track_list or []):
                track_info = {"track_id": str(track_id), "status": "Pending", "file_path": None, "error": None, "attempts": 0}
                attempt = 0
                while attempt < MAX_RETRIES:
                    tmp_file = None
                    attempt += 1
                    track_info["attempts"] = attempt
                    try:
                        url = await sr.get_stream_url_by_track_id(track_id, quality)
                        if not url:
                            raise RuntimeError("No stream URL")
                        # Take the extension from the decoded URL path;
                        # default to .mp3 when the path has none.
                        parsed = urlparse(url)
                        clean_path = unquote(parsed.path)
                        ext = Path(clean_path).suffix or ".mp3"
                        tmp_file = Path(f"/tmp/{uuid.uuid4().hex}{ext}")
                        # Stream the payload to the temp file in 64 KiB chunks.
                        async with session.get(url) as resp:
                            resp.raise_for_status()
                            with open(tmp_file, "wb") as f:
                                async for chunk in resp.content.iter_chunked(64 * 1024):
                                    f.write(chunk)
                        # Organize as <staging>/<Artist>/<Album>/<Title><ext>,
                        # every component sanitized for the filesystem.
                        md = await sr.get_metadata_by_track_id(track_id) or {}
                        artist_raw = md.get("artist") or "Unknown Artist"
                        album_raw = md.get("album") or "Unknown Album"
                        title_raw = md.get("title") or f"Track {track_id}"
                        artist = sanitize_filename(artist_raw)
                        album = sanitize_filename(album_raw)
                        title = sanitize_filename(title_raw)
                        all_artists.add(artist)
                        album_dir = staging_root / artist / album
                        album_dir.mkdir(parents=True, exist_ok=True)
                        final_file = ensure_unique_path(album_dir / f"{title}{ext}")
                        # Tag the temp file before moving it into place.
                        tag_with_mediafile(str(tmp_file), md)
                        tmp_file.rename(final_file)
                        tmp_file = None  # consumed — disables the finally-cleanup
                        track_info["status"] = "Success"
                        track_info["file_path"] = str(final_file)
                        track_info["error"] = None
                        all_final_files.append(final_file)
                        if job:
                            # total >= 1 here: this line only runs inside the loop.
                            job.meta["progress"] = int(((i + 1) / total) * 100)
                            # Include the current (not-yet-appended) track too.
                            job.meta["tracks"] = per_track_meta + [track_info]
                            job.save_meta()
                        break
                    except aiohttp.ClientResponseError as e:
                        msg = f"Track {track_id} attempt {attempt} ClientResponseError: {e}"
                        send_log_to_discord(msg, "WARNING", target)
                        if e.status == 429:
                            # Rate-limited: exponential backoff capped at 60s.
                            wait_time = min(60, 2**attempt)
                            await asyncio.sleep(wait_time)
                        else:
                            await asyncio.sleep(random.uniform(THROTTLE_MIN, THROTTLE_MAX))
                    except Exception as e:
                        tb = traceback.format_exc()
                        msg = f"Track {track_id} attempt {attempt} failed: {e}\n{tb}"
                        send_log_to_discord(msg, "ERROR", target)
                        track_info["error"] = str(e)
                        if attempt >= MAX_RETRIES:
                            track_info["status"] = "Failed"
                            send_log_to_discord(f"Track {track_id} failed after {attempt} attempts", "ERROR", target)
                        await asyncio.sleep(random.uniform(THROTTLE_MIN, THROTTLE_MAX))
                    finally:
                        # Best-effort cleanup of a partial temp download.
                        try:
                            if tmp_file and tmp_file.exists():
                                os.remove(tmp_file)
                        except Exception:
                            pass
                per_track_meta.append(track_info)

        if not all_final_files:
            if job:
                job.meta["tarball"] = None
                job.meta["status"] = "Failed"
                job.save_meta()
            send_log_to_discord(f"No tracks were successfully downloaded for job `{job_id}`", "CRITICAL", target)
            return []

        # Tarball creation — name it after the most frequent artist
        # (ties broken alphabetically).
        artist_counts = {}
        for t in per_track_meta:
            if t["status"] == "Success" and t.get("file_path"):
                try:
                    artist = Path(t["file_path"]).relative_to(staging_root).parts[0]
                except Exception:
                    artist = "Unknown Artist"
                artist_counts[artist] = artist_counts.get(artist, 0) + 1
        top_artist = sorted(artist_counts.items(), key=lambda kv: (-kv[1], kv[0]))[0][0] if artist_counts else "Unknown Artist"
        combined_artist = sanitize_filename(top_artist)
        staged_tarball = staging_root / f"{combined_artist}.tar.gz"

        # NOTE(review): .stem on "Name.tar.gz" is "Name.tar", so duplicates
        # come out as "Name.tar (2).tar.gz" — likely meant "Name (2).tar.gz".
        counter = 1
        base_name = staged_tarball.stem
        while staged_tarball.exists():
            counter += 1
            staged_tarball = staging_root / f"{base_name} ({counter}).tar.gz"
        final_tarball = ROOT_DIR / "completed" / quality / staged_tarball.name
        final_tarball.parent.mkdir(parents=True, exist_ok=True)

        if job:
            job.meta["status"] = "Compressing"
            job.save_meta()

        logging.info("Creating tarball: %s", staged_tarball)
        await discord_notify(DISCORD_WEBHOOK,
            title=f"Compressing: Job {job_id}",
            description=f"Creating tarball (`{len(track_list)}` track(s)).\nStaging path: {staged_tarball}",
            color=0xFFA500,
            target=target)

        try:
            # Prefer system tar with pigz (-I) for parallel gzip compression.
            subprocess.run(
                ["tar", "-I", "pigz -9", "-cf", str(staged_tarball), "-C", str(staging_root)]
                + [str(f.relative_to(staging_root)) for f in all_final_files],
                check=True,
            )
            for f in all_final_files:
                try:
                    os.remove(f)
                except Exception:
                    pass
        except FileNotFoundError:
            # NOTE(review): this branch fires only when `tar` itself is missing;
            # a missing pigz makes tar exit non-zero (CalledProcessError), which
            # propagates instead of falling back — confirm that's intended.
            send_log_to_discord("pigz not available, falling back to tarfile (slower).", "WARNING", target)
            with tarfile.open(staged_tarball, "w:gz") as tar:
                for f in all_final_files:
                    try:
                        arcname = f.relative_to(staging_root)
                    except ValueError:
                        arcname = f.name
                    tar.add(f, arcname=str(arcname))
                    try:
                        os.remove(f)
                    except Exception:
                        pass

        if not staged_tarball.exists():
            send_log_to_discord(f"Tarball was not created: `{staged_tarball}`", "CRITICAL", target)
            if job:
                job.meta["status"] = "compress_failed"
                job.save_meta()
            return []

        # Move into completed/<quality>/; rename() can fail across
        # filesystems, so fall back to shutil.move.
        try:
            staged_tarball.rename(final_tarball)
        except Exception:
            shutil.move(str(staged_tarball), str(final_tarball))
        await asyncio.to_thread(shutil.rmtree, staging_root, ignore_errors=True)

        if job:
            job.meta["tarball"] = str(final_tarball)
            job.meta["progress"] = 100
            job.meta["status"] = "Completed"
            job.save_meta()

        # Job completed Discord message
        await discord_notify(
            DISCORD_WEBHOOK,
            title=f"Job Completed: {job_id}",
            description=f"Processed `{len(track_list)}` track(s). Tarball: `{final_tarball}`",
            target=target,
            color=0x00FF00
        )
        return [str(final_tarball)]

    # RQ workers are synchronous, so drive the async pipeline on a private
    # event loop and always close it afterwards.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        return loop.run_until_complete(process_tracks())
    except Exception as e:
        send_log_to_discord(f"bulk_download failed: {e}\n{traceback.format_exc()}", "CRITICAL", target)
        if job:
            job.meta["status"] = "Failed"
            job.save_meta()
        # Falls through: implicitly returns None on unexpected failure.
    finally:
        loop.close()