// codey.lol/src/pages/api/link-preview.js

/**
 * Server-side link preview API endpoint
 * Fetches Open Graph / meta data for URLs to prevent user IP exposure
 * Returns signed proxy URLs for images from untrusted domains
 */

import {
  checkRateLimit,
  recordRequest,
  getCookieId,
  generateNonce,
  createNonceCookie,
} from '../../utils/rateLimit.js';
import { signImageUrl } from './image-proxy.js';

// Hostnames treated as trusted: URLs on these hosts are handed to the client
// as-is instead of being routed through the signed image proxy.
// Matching is by exact hostname, so each subdomain must be listed explicitly.
const TRUSTED_DOMAINS = new Set([
  // YouTube
  'youtube.com', 'www.youtube.com', 'youtu.be', 'img.youtube.com', 'i.ytimg.com',
  // Instagram
  'instagram.com', 'www.instagram.com',
  // Twitter / X
  'twitter.com', 'x.com', 'www.twitter.com', 'pbs.twimg.com', 'abs.twimg.com',
  // Twitch
  'twitch.tv', 'www.twitch.tv', 'clips.twitch.tv',
  // Audio
  'spotify.com', 'open.spotify.com', 'soundcloud.com', 'www.soundcloud.com',
  // Vimeo
  'vimeo.com', 'www.vimeo.com',
  // Image and GIF hosts
  'imgur.com', 'i.imgur.com',
  'giphy.com', 'media.giphy.com',
  'tenor.com', 'media.tenor.com',
  'gfycat.com',
  // Reddit
  'reddit.com', 'www.reddit.com', 'v.redd.it', 'i.redd.it', 'preview.redd.it',
  // GitHub
  'github.com', 'gist.github.com', 'raw.githubusercontent.com',
  'avatars.githubusercontent.com', 'user-images.githubusercontent.com',
  // Code playgrounds
  'codepen.io', 'codesandbox.io',
  // Clip hosts
  'streamable.com', 'medal.tv',
  // Discord
  'discord.com', 'cdn.discordapp.com', 'media.discordapp.net',
  // Stock / placeholder images
  'picsum.photos', 'images.unsplash.com',
]);

/**
 * Check whether a URL points at a trusted domain.
 *
 * A URL is trusted only when it parses, uses http(s), and its hostname is an
 * exact member of TRUSTED_DOMAINS. The scheme check matters because the URL
 * parser also reports a hostname for non-web schemes (for example
 * `javascript://youtube.com/...`), which must never be treated as trusted.
 *
 * @param {string} url - Absolute URL to classify.
 * @returns {boolean} True when the URL is http(s) on a trusted hostname;
 *   false otherwise, including when the URL cannot be parsed.
 */
function isTrustedDomain(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    // Unparseable input is simply not trusted.
    return false;
  }
  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
    return false;
  }
  return TRUSTED_DOMAINS.has(parsed.hostname);
}

/**
 * Get a safe image URL: the original URL when its domain is trusted,
 * otherwise a signed URL pointing at this site's image proxy.
 *
 * @param {string|null|undefined} imageUrl - Absolute image URL.
 * @returns {Promise<string|null>} The URL the client should load, or null
 *   when no image URL was supplied.
 */
async function getSafeImageUrl(imageUrl) {
  if (!imageUrl) return null;
  if (isTrustedDomain(imageUrl)) {
    return imageUrl; // Trusted, return as-is
  }
  // Create signed proxy URL. Both values are percent-encoded so they survive
  // query-string parsing: an unescaped "+" would be read back as a space.
  // NOTE(review): the signature format comes from image-proxy.js, which is not
  // visible here; encoding is a no-op for hex or base64url signatures.
  const signature = await signImageUrl(imageUrl);
  return `/api/image-proxy?url=${encodeURIComponent(imageUrl)}&sig=${encodeURIComponent(signature)}`;
}

/**
 * Parse Open Graph, Twitter card and standard meta tags from HTML.
 *
 * Lookup order per field: Open Graph (`property="og:*"`), then Twitter card
 * (`name="twitter:*"`), then the plain `<title>` / `description` tags.
 * Attribute values are HTML-entity decoded.
 *
 * @param {string} html - Raw HTML, typically only the start of the document.
 * @param {string} url - URL of the page the HTML was fetched from; used as the
 *   base for relative image URLs and as the site-name fallback.
 * @returns {{url: string, title: ?string, description: ?string, image: ?string,
 *   siteName: ?string, type: ?string, video: ?string, themeColor: ?string}}
 *   Extracted metadata; fields that were not found are null.
 */
function parseMetaTags(html, url) {
  const meta = {
    url,
    title: null,
    description: null,
    image: null,
    siteName: null,
    type: null,
    video: null,
    themeColor: null,
  };

  // A non-empty quoted value whose closing quote matches its opening quote,
  // so content="It's here" is not cut off at the apostrophe.
  const QUOTED_VALUE = /(?<quote>["'])(?<value>(?:(?!\k<quote>)[\s\S])+)\k<quote>/.source;

  /**
   * Find the content of the first <meta> tag whose `attr` equals `key`.
   * `key` is a regex fragment. Both attribute orders are tried
   * (key before content, then content before key).
   */
  const findMeta = (attr, key) => {
    const keyPart = `${attr}=["']${key}["']`;
    const contentPart = `content=${QUOTED_VALUE}`;
    const patterns = [
      new RegExp(`<meta[^>]+${keyPart}[^>]+${contentPart}`, 'i'),
      new RegExp(`<meta[^>]+${contentPart}[^>]+${keyPart}`, 'i'),
    ];
    for (const pattern of patterns) {
      const raw = html.match(pattern)?.groups.value;
      const decoded = raw ? decodeHTMLEntities(raw) : null;
      if (decoded) return decoded;
    }
    return null;
  };

  // Open Graph tags
  meta.title = findMeta('property', 'og:title');
  meta.description = findMeta('property', 'og:description');
  meta.image = findMeta('property', 'og:image');
  meta.siteName = findMeta('property', 'og:site_name');
  meta.type = findMeta('property', 'og:type');
  meta.video = findMeta('property', 'og:video(?::url)?');

  // Twitter cards fallback
  if (!meta.title) {
    meta.title = findMeta('name', 'twitter:title');
  }
  if (!meta.description) {
    meta.description = findMeta('name', 'twitter:description');
  }
  if (!meta.image) {
    meta.image = findMeta('name', 'twitter:image');
  }

  // Theme color
  meta.themeColor = findMeta('name', 'theme-color');

  // Fallback to standard meta tags and title
  if (!meta.title) {
    const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
    meta.title = titleMatch ? decodeHTMLEntities(titleMatch[1]) : null;
  }
  if (!meta.description) {
    meta.description = findMeta('name', 'description');
  }

  // Resolve non-absolute image URLs against the page URL (not just its
  // origin), so "img/a.png" keeps the page's directory. Anything that does
  // not end up as http(s) (data:, javascript:, ...) is dropped.
  if (meta.image && !/^https?:\/\//i.test(meta.image)) {
    try {
      const resolved = new URL(meta.image, url);
      const isWeb = resolved.protocol === 'http:' || resolved.protocol === 'https:';
      meta.image = isWeb ? resolved.href : null;
    } catch {
      meta.image = null;
    }
  }

  // Get site name from domain if not found
  if (!meta.siteName) {
    try {
      meta.siteName = new URL(url).hostname.replace(/^www\./, '');
    } catch {
      // Page URL is not parseable; leave siteName as null.
    }
  }

  return meta;
}

// Named entities recognised by decodeHTMLEntities; any other name is left as-is.
const NAMED_HTML_ENTITIES = new Map([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', '\u00A0'],
]);

/**
 * Decode HTML entities in a string.
 *
 * Handles a small set of named entities plus decimal (`&#39;`) and
 * hexadecimal (`&#x27;`) numeric references. Decoding is done in a single
 * pass so already-decoded output is never decoded again (`&amp;lt;` yields
 * the text `&lt;`, not `<`). Numeric references use full code points, so
 * characters above U+FFFF such as emoji decode correctly.
 *
 * @param {string|null|undefined} text - Text that may contain entities.
 * @returns {string|null|undefined} Decoded text; falsy input is returned
 *   unchanged. Unknown names and out-of-range numbers are left untouched.
 */
function decodeHTMLEntities(text) {
  if (!text) return text;
  return text.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z]+));/g,
    (entity, decimal, hex, name) => {
      if (name !== undefined) {
        return NAMED_HTML_ENTITIES.get(name) ?? entity;
      }
      const codePoint = decimal !== undefined
        ? Number.parseInt(decimal, 10)
        : Number.parseInt(hex, 16);
      // String.fromCodePoint throws above U+10FFFF; keep such input verbatim.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    },
  );
}

// Maximum number of HTML bytes scanned for meta tags (they are usually in <head>).
const PREVIEW_MAX_HTML_BYTES = 50 * 1024;
// Upper bound for the whole upstream exchange (headers and body), in ms.
const PREVIEW_FETCH_TIMEOUT_MS = 8000;
// Maximum number of redirect hops followed for a single preview.
const PREVIEW_MAX_REDIRECTS = 5;

/** Build a JSON Response with the given status and optional extra headers. */
function jsonResponse(payload, status, extraHeaders = {}) {
  return new Response(JSON.stringify(payload), {
    status,
    headers: { 'Content-Type': 'application/json', ...extraHeaders },
  });
}

/**
 * Best-effort check for hostnames that must not be fetched server-side:
 * loopback, private, link-local and similar internal addresses.
 * NOTE(review): this only inspects the literal hostname. A public DNS name
 * that resolves to an internal address is not caught here; that needs a
 * resolver-level check.
 */
function isInternalHost(hostname) {
  const host = hostname.toLowerCase().replace(/\.$/, '');
  if (
    host === 'localhost'
    || host.endsWith('.localhost')
    || host.endsWith('.local')
    || host.endsWith('.internal')
  ) {
    return true;
  }

  // The URL parser normalises IPv4 hosts to dotted-decimal form.
  const v4 = host.match(/^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/);
  if (v4) {
    const a = Number(v4[1]);
    const b = Number(v4[2]);
    return a === 0
      || a === 10
      || a === 127
      || (a === 100 && b >= 64 && b <= 127)
      || (a === 169 && b === 254)
      || (a === 172 && b >= 16 && b <= 31)
      || (a === 192 && b === 168)
      || a >= 224;
  }

  // IPv6 hostnames are serialised in brackets.
  if (host.startsWith('[') && host.endsWith(']')) {
    const v6 = host.slice(1, -1);
    return v6 === '::'
      || v6 === '::1'
      || v6.startsWith('::ffff:') // IPv4-mapped
      || /^f[cd][0-9a-f]{2}:/.test(v6) // unique local fc00::/7
      || /^fe[89ab][0-9a-f]:/.test(v6); // link-local fe80::/10
  }

  return false;
}

/** Cancel an unread response body so the upstream connection is released. */
async function discardBody(response) {
  try {
    await response.body?.cancel();
  } catch {
    // Body already consumed or closed; nothing left to release.
  }
}

/**
 * Fetch a URL, following redirects manually so every hop is validated
 * (http/https only, no internal hosts) before it is requested.
 */
async function fetchWithCheckedRedirects(startUrl, init) {
  let currentUrl = startUrl;
  for (let hop = 0; hop <= PREVIEW_MAX_REDIRECTS; hop += 1) {
    const response = await fetch(currentUrl, { ...init, redirect: 'manual' });
    const location = response.headers.get('location');
    const isRedirect = response.status >= 300 && response.status < 400 && location;
    if (!isRedirect) return response;

    await discardBody(response);
    const nextUrl = new URL(location, currentUrl);
    if (!['http:', 'https:'].includes(nextUrl.protocol) || isInternalHost(nextUrl.hostname)) {
      throw new Error('Redirect to disallowed URL');
    }
    currentUrl = nextUrl.href;
  }
  throw new Error('Too many redirects');
}

/**
 * Read the beginning of an HTML response: stops after `maxBytes` bytes or once
 * `</head>` has been seen, then cancels the rest of the body.
 */
async function readHtmlHead(response, maxBytes) {
  if (!response.body) return '';
  const reader = response.body.getReader();
  // One streaming decoder for all chunks, so a multi-byte character split
  // across two chunks is decoded correctly instead of becoming U+FFFD.
  const decoder = new TextDecoder();
  let html = '';
  let bytesRead = 0;
  try {
    while (bytesRead < maxBytes) {
      const { done, value } = await reader.read();
      if (done) break;
      html += decoder.decode(value, { stream: true });
      bytesRead += value.length;
      // Stop early if we've passed </head>
      if (html.includes('</head>')) break;
    }
  } finally {
    await reader.cancel().catch(() => {});
  }
  return html;
}

/**
 * Link preview endpoint: `GET /api/link-preview?url=<absolute http(s) URL>`.
 *
 * Fetches the target server-side and responds with JSON:
 * - image responses: `{ url, type: 'image', image, trusted }`
 * - video responses (trusted domains only): `{ url, type: 'video', video, trusted }`
 * - HTML responses: the parsed meta tags plus `trusted`
 *
 * Error statuses: 400 (missing/invalid URL, non-HTML content), 403 (internal
 * host or untrusted video), 429 (rate limited), 502 (upstream returned a
 * non-OK status), 500 (fetch failed or timed out).
 *
 * A nonce cookie is set on 200 and 429 responses when the request had none.
 *
 * @param {{request: Request}} context - Endpoint context carrying the request.
 * @returns {Promise<Response>} JSON response.
 */
export async function GET({ request }) {
  // Rate limit check
  const rateCheck = checkRateLimit(request, {
    limit: 10,
    windowMs: 1000,
    burstLimit: 50,
    burstWindowMs: 10_000,
  });

  let cookieId = getCookieId(request);
  const hadCookie = !!cookieId;
  if (!cookieId) {
    cookieId = generateNonce();
  }

  // Attach the nonce cookie when the client did not send one.
  const withCookie = (response) => {
    if (!hadCookie) {
      response.headers.set('Set-Cookie', createNonceCookie(cookieId));
    }
    return response;
  };
  const cachedOk = (payload) => withCookie(
    jsonResponse(payload, 200, { 'Cache-Control': 'public, max-age=3600' }),
  );

  if (!rateCheck.allowed) {
    const errorMsg = rateCheck.isFlooding
      ? { error: 'Too many requests - please slow down' }
      : { error: 'Rate limit exceeded' };
    return withCookie(jsonResponse(errorMsg, 429, { 'Retry-After': '1' }));
  }

  recordRequest(request, 1000);

  const targetUrl = new URL(request.url).searchParams.get('url');
  if (!targetUrl) {
    return jsonResponse({ error: 'Missing url parameter' }, 400);
  }

  // Validate URL format
  let parsedUrl;
  try {
    parsedUrl = new URL(targetUrl);
  } catch {
    return jsonResponse({ error: 'Invalid URL' }, 400);
  }
  if (!['http:', 'https:'].includes(parsedUrl.protocol)) {
    return jsonResponse({ error: 'Invalid URL' }, 400);
  }

  // Refuse to act as a proxy into loopback/private networks (SSRF).
  if (isInternalHost(parsedUrl.hostname)) {
    return jsonResponse({ error: 'URL not allowed' }, 403);
  }

  // Check if it's a trusted domain (client can fetch directly)
  const trusted = isTrustedDomain(targetUrl);

  // The timer covers headers AND body reading; it is cleared in `finally`.
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), PREVIEW_FETCH_TIMEOUT_MS);

  try {
    const response = await fetchWithCheckedRedirects(targetUrl, {
      method: 'GET',
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; DiscordBot/2.0; +https://discordapp.com)',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
      },
      signal: controller.signal,
    });

    if (!response.ok) {
      await discardBody(response);
      return jsonResponse({
        error: 'Failed to fetch URL',
        status: response.status,
      }, 502);
    }

    const contentType = response.headers.get('content-type') || '';

    // Handle image URLs directly - return safe (possibly proxied) URL
    if (contentType.startsWith('image/')) {
      await discardBody(response);
      const safeImageUrl = await getSafeImageUrl(targetUrl);
      return cachedOk({
        url: targetUrl,
        type: 'image',
        image: safeImageUrl,
        trusted,
      });
    }

    // Handle video URLs directly (no proxy for video - too large)
    if (contentType.startsWith('video/')) {
      await discardBody(response);
      // Only allow trusted video sources
      if (!trusted) {
        return jsonResponse({ error: 'Untrusted video source' }, 403);
      }
      return cachedOk({
        url: targetUrl,
        type: 'video',
        video: targetUrl,
        trusted,
      });
    }

    // Only parse HTML
    if (!contentType.includes('text/html') && !contentType.includes('application/xhtml')) {
      await discardBody(response);
      return jsonResponse({
        error: 'URL is not an HTML page',
        contentType,
      }, 400);
    }

    const html = await readHtmlHead(response, PREVIEW_MAX_HTML_BYTES);

    const meta = parseMetaTags(html, targetUrl);
    meta.trusted = trusted;

    // Convert image URL to safe URL (proxy if untrusted)
    if (meta.image) {
      meta.image = await getSafeImageUrl(meta.image);
    }

    return cachedOk(meta);
  } catch (err) {
    console.error('[link-preview] Error fetching URL:', err.message);
    return jsonResponse({
      error: 'Failed to fetch preview',
      message: err.message,
    }, 500);
  } finally {
    clearTimeout(timeout);
  }
}