misc / TRip: folder structure / tar naming
@@ -4,8 +4,8 @@ import random
 import os
 import tarfile
 import uuid
-import re
 import shutil
 import re
 from pathlib import Path
+from urllib.parse import urlparse, unquote
 
@@ -13,14 +13,8 @@ import aiohttp
 from rq import get_current_job
 from utils.sr_wrapper import SRUtil
 
-# Configure logging
-logging.basicConfig(
-    level=logging.DEBUG,
-    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
-)
-
-# Constants
-ROOT_DIR = Path("/storage/music2")  # Change to your music folder
+# ---------- Config ----------
+ROOT_DIR = Path("/storage/music2")  # change to your music folder
 MAX_RETRIES = 3
 THROTTLE_MIN = 0.2
 THROTTLE_MAX = 1.5
@@ -36,44 +30,63 @@ HEADERS = {
     "Connection": "keep-alive",
 }
 
-# StreamRip utility
+logging.basicConfig(
+    level=logging.DEBUG,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+)
+
 sr = SRUtil()
 
-import re
-
+
+# ---------- Helpers ----------
 def sanitize_filename(name: str) -> str:
-    """
-    Remove or replace characters not allowed in filenames.
-    Also trims whitespace and collapses consecutive spaces.
-    """
-    # Replace slashes/backslashes with a dash
+    """Make a string safe for file/dir names."""
+    if not name:
+        return "Unknown"
+    # Replace path separators first
     name = name.replace("/", "-").replace("\\", "-")
-    # Remove illegal characters for most OSes
+    # Remove illegal characters on common filesystems
     name = re.sub(r'[<>:"|?*\x00-\x1F]', "", name)
-    # Strip leading/trailing spaces and dots
+    # Trim spaces and trailing dots
     name = name.strip().strip(".")
-    # Collapse multiple spaces into one
+    # Collapse whitespace
     name = re.sub(r"\s+", " ", name)
-    return name or "Unknown"
+    # Reasonable length cap
+    return name[:180] or "Unknown"
 
 
+def ensure_unique_path(p: Path) -> Path:
+    """If path exists, append ' (n)' before extension."""
+    if not p.exists():
+        return p
+    stem, suffix = p.stem, p.suffix
+    parent = p.parent
+    n = 2
+    while True:
+        candidate = parent / f"{stem} ({n}){suffix}"
+        if not candidate.exists():
+            return candidate
+        n += 1
+
+
+# ---------- Job ----------
 def bulk_download(track_list: list):
     """
-    Full RQ-compatible bulk download job with:
-    - async per-track URL fetching
-    - retry on failure
-    - per-track success/failure
-    - metadata extraction
-    - organized file storage
-    - throttling
-    - per-artist tarball creation
-    - progress updates
+    RQ job:
+    - fetches stream URLs
+    - downloads with retries + throttling
+    - uses SR metadata to name/organize files
+    - creates ONE tarball for all tracks, with all artist names in the filename
+    - returns [tarball_path]
     """
     job = get_current_job()
 
     async def process_tracks():
         per_track_meta = []
-        artist_files = {}  # artist -> list of files
+        all_final_files: list[Path] = []
+        all_artists: set[str] = set()
+
+        (ROOT_DIR / "completed").mkdir(parents=True, exist_ok=True)
 
         async with aiohttp.ClientSession(headers=HEADERS) as session:
             total = len(track_list)
@@ -89,96 +102,124 @@ def bulk_download(track_list: list):
                 attempt = 0
 
                 while attempt < MAX_RETRIES:
+                    tmp_file: Path | None = None
                     attempt += 1
                     try:
-                        # Get track URL
+                        # 1) Stream URL
                         url = await sr.get_stream_url_by_track_id(track_id)
                         if not url:
-                            logging.critical(
-                                "Failed to get URL for track: %s", track_id
-                            )
-                            await asyncio.sleep(
-                                random.uniform(THROTTLE_MIN, THROTTLE_MAX)
-                            )
-                            continue
+                            raise RuntimeError("No stream URL")
 
-                        # Download file (chunked)
+                        # 2) Extension from URL path only (no query)
                         parsed = urlparse(url)
-                        ext = Path(unquote(parsed.path)).suffix or ".mp3"
-                        tmp_file = Path(f"/tmp/{track_id}{ext}")
+                        clean_path = unquote(parsed.path)  # path has no query; just in case we unquote
+                        ext = Path(clean_path).suffix or ".mp3"
+
+                        # Unique temp file
+                        tmp_file = Path(f"/tmp/{uuid.uuid4().hex}{ext}")
+
+                        # 3) Download (chunked)
                         async with session.get(url) as resp:
                             resp.raise_for_status()
                             with open(tmp_file, "wb") as f:
                                 async for chunk in resp.content.iter_chunked(64 * 1024):
                                     f.write(chunk)
 
-                        # Extract metadata
-                        metadata = await sr.get_metadata_by_track_id(track_id)
-                        if not metadata:
-                            logging.critical(
-                                "Failed to retrieve metadata for track ID: %s. Skipping",
-                                track_id,
-                            )
-                            continue
-                        artist = sanitize_filename(metadata.get("artist", "Unknown Artist"))
-                        album = sanitize_filename(metadata.get("album", "Unknown Album"))
-                        title = sanitize_filename(metadata.get("song", "Unknown Song"))
+                        # 4) Metadata from SR (prefer API over tags)
+                        md = await sr.get_metadata_by_track_id(track_id) or {}
+                        artist_raw = md.get("artist") or "Unknown Artist"
+                        album_raw = md.get("album") or "Unknown Album"
+                        title_raw = md.get("song") or f"Track {track_id}"
 
-                        logging.critical("Got metadata: %s/%s/%s", artist, album, title)
+                        artist = sanitize_filename(artist_raw)
+                        album = sanitize_filename(album_raw)
+                        title = sanitize_filename(title_raw)
 
-                        # Organize path
+                        all_artists.add(artist)
+
+                        # 5) Final path
                         final_dir = ROOT_DIR / artist / album
                         final_dir.mkdir(parents=True, exist_ok=True)
-                        final_file = final_dir / f"{title}{ext}"
-                        tmp_file.rename(final_file)
+                        final_file = ensure_unique_path(final_dir / f"{title}{ext}")
 
-                        # Track per-track info
+                        tmp_file.rename(final_file)
+                        tmp_file = None  # consumed
+
+                        # Track success
                         track_info.update(
                             {"status": "success", "file_path": str(final_file)}
                         )
-                        artist_files.setdefault(artist, []).append(final_file)
-
-                        break  # success
+                        all_final_files.append(final_file)
+                        break  # success; exit retry loop
 
                     except Exception as e:
-                        logging.error("Error downloading track %s: %s", track_id, e)
+                        logging.error("Track %s attempt %s failed: %s", track_id, attempt, e)
                         track_info["error"] = str(e)
                         if attempt >= MAX_RETRIES:
                             track_info["status"] = "failed"
-                        else:
-                            # small delay before retry
-                            await asyncio.sleep(
-                                random.uniform(THROTTLE_MIN, THROTTLE_MAX)
-                            )
+                        # small backoff before next attempt (or next track)
+                        await asyncio.sleep(random.uniform(THROTTLE_MIN, THROTTLE_MAX))
+                    finally:
+                        # Clean partial temp file on failure
+                        if tmp_file and tmp_file.exists():
+                            try:
+                                tmp_file.unlink()
+                            except Exception:
+                                pass
 
-                # Update RQ job meta
+                # Update RQ meta after each track
                 per_track_meta.append(track_info)
                 if job:
-                    job.meta["progress"] = int((i + 1) / total * 100)
+                    job.meta["progress"] = int((i + 1) / max(total, 1) * 100)
                     job.meta["tracks"] = track_list
                     job.save_meta()
 
-                # Throttle between downloads
+                # Throttle between tracks
                 await asyncio.sleep(random.uniform(THROTTLE_MIN, THROTTLE_MAX))
 
-        # Create per-artist tarballs
-        tarballs = []
-        for artist, files in artist_files.items():
-            short_id = uuid.uuid4().hex[:8]
-            tarball_name = ROOT_DIR / "completed" / f"{artist}_{short_id}.tar.gz"
-            with tarfile.open(tarball_name, "w:gz") as tar:
-                for f in files:
-                    tar.add(f, arcname=f.name)
-                    os.remove(f)  # remove original file
-            logging.critical("Created tarball: %s", tarball_name)
-            tarballs.append(str(tarball_name))
-            artist_dir = ROOT_DIR / artist
-            shutil.rmtree(artist_dir, ignore_errors=True)
-
-        return tarballs
+        # ---- Single combined tarball for all tracks ----
+        if not all_final_files:
+            # nothing succeeded
+            return []
+
+        combined_artists = sanitize_filename(" & ".join(sorted(all_artists))) or "Unknown Artist"
+        short_id = uuid.uuid4().hex[:8]
+        tarball_path = (ROOT_DIR / "completed" / f"{combined_artists}_{short_id}.tar.gz")
+        tarball_path.parent.mkdir(parents=True, exist_ok=True)
+
+        with tarfile.open(tarball_path, "w:gz") as tar:
+            for f in all_final_files:
+                # Preserve relative Artist/Album/Song.ext structure inside the tar
+                try:
+                    arcname = f.relative_to(ROOT_DIR)
+                except ValueError:
+                    arcname = f.name  # fallback
+                tar.add(f, arcname=str(arcname))
+                # remove original file after adding
+                try:
+                    os.remove(f)
+                except Exception:
+                    pass
+
+        logging.critical("Created tarball: %s", tarball_path)
+
+        # Cleanup empty artist/album dirs (best-effort)
+        # Remove any directories under ROOT_DIR that are now empty
+        to_check = {p.parent for p in all_final_files} | {p.parent.parent for p in all_final_files}
+        for d in sorted(to_check, key=lambda p: len(p.parts), reverse=True):
+            if d.is_dir():
+                try:
+                    # remove only if empty
+                    next(d.iterdir())
+                except StopIteration:
+                    # empty
+                    shutil.rmtree(d, ignore_errors=True)
+                except Exception:
+                    pass
+
+        return [str(tarball_path)]
 
-    # Run the async function synchronously
+    # Run async part synchronously for RQ
     loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
     try:
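
For context, the job above is meant to be executed by an RQ worker. Below is a minimal sketch of how it might be enqueued and polled; the import path, queue name, Redis address, timeout, and the contents of track_list are illustrative assumptions, not part of this commit. Per the new docstring, the job stores progress in job.meta and returns a single-element list with the combined tarball path (named like "Artist A & Artist B_<8-hex>.tar.gz" under /storage/music2/completed/, with Artist/Album/Song.ext entries inside).

# Sketch only: import path, queue name, connection details, and track_list
# contents are assumptions for illustration.
from redis import Redis
from rq import Queue

from trip.jobs import bulk_download  # hypothetical module path for this job

queue = Queue("downloads", connection=Redis(host="localhost", port=6379))

track_list: list = []  # populate with the per-track entries this job expects

# Enqueue the job; the worker runs bulk_download(track_list).
job = queue.enqueue(bulk_download, track_list, job_timeout=3600)

# Later: re-read the meta the job writes after each track, then the result.
job.refresh()
print(job.get_status(), job.meta.get("progress"), job.result)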