#!/usr/bin/env python3
"""Incremental downloader for Grok "Imagine" saved favorites.

Workflow:
  1. Launch Chrome via Playwright and restore an authenticated session
     from a ``cookies.json`` export.
  2. Open the favorites page and auto-scroll until no new content loads.
  3. Scrape every favorite card (prompt, timestamp, model, media URLs).
  4. Download each media file plus a JSON metadata sidecar into
     OUTPUT_DIR, skipping files that already exist so re-runs are
     incremental.
"""

import asyncio
import hashlib
import json
import logging
import os
from pathlib import Path

from playwright.async_api import async_playwright

# -------------------------------
# CONFIGURATION & GLOBAL CONSTANTS
# -------------------------------
OUTPUT_DIR = Path('grok_favorites')
COOKIES_FILE = 'cookies.json'
FAVORITES_URL = 'https://grok.com/imagine/saved'
MAX_SCROLL_ATTEMPTS = 30      # hard cap on infinite-scroll iterations
SCROLL_PAUSE_MS = 2500        # wait after each scroll for lazy content to load
CONCURRENCY_LIMIT = 4         # simultaneous per-post download tasks

# UI selectors — robust ARIA-role + common class patterns.
# These are heuristics; update them by inspecting the page in DevTools.
SELECTOR_LIST_CONTAINER = '[role="list"]'                 # the main list wrapper
SELECTOR_POST_ITEMS = '[role="list"] [role="listitem"]'   # individual favorite cards

# Absolute (page-scoped) variants — kept for reference/debugging; only the
# *_RELATIVE selectors below are used by the code.
SELECTOR_PROMPT = '[role="listitem"] .prompt-text, [role="listitem"] [class*="prompt"], [role="listitem"] p, [role="listitem"] div[class*="text"]'
SELECTOR_TIMESTAMP = '[role="listitem"] time, [role="listitem"] [class*="time"], [role="listitem"] [data-timestamp]'
SELECTOR_MODEL = '[role="listitem"] [class*="model"], [role="listitem"] span[class*="tag"], [role="listitem"] [data-model]'
SELECTOR_MEDIA = '[role="listitem"] img, [role="listitem"] video, [role="listitem"] [class*="media"], [role="listitem"] source, [role="listitem"] a[href*="mp4"], [role="listitem"] a[href*="jpg"]'

# Relative selectors, chained from each listitem Locator so they only match
# inside that specific card. Adjust after inspecting the live page.
SELECTOR_PROMPT_RELATIVE = '.prompt-text, [class*="prompt"], p, div[class*="text"]'
SELECTOR_TIMESTAMP_RELATIVE = 'time, [class*="time"], [data-timestamp]'
SELECTOR_MODEL_RELATIVE = '[class*="model"], span[class*="tag"], [data-model]'
SELECTOR_MEDIA_RELATIVE = 'img, video, [class*="media"], source, a[href*="mp4"], a[href*="jpg"]'

# Logging setup: identical format to the console and to a log file.
_LOGGER = logging.getLogger('grok_favorites_downloader')
_LOGGER.setLevel(logging.INFO)

console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
console_handler.setFormatter(console_formatter)
_LOGGER.addHandler(console_handler)

file_handler = logging.FileHandler('grok_downloader.log')
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(console_formatter)
_LOGGER.addHandler(file_handler)


# -------------------------------
# HELPER FUNCTIONS
# -------------------------------
def ensure_output_dir():
    """Create OUTPUT_DIR (and any parents) if it does not already exist."""
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    _LOGGER.info(f"Output directory ensured: {OUTPUT_DIR}")


async def load_authenticated_context(playwright):
    """Launch headless Chrome and return (browser, context) with cookies restored.

    Raises:
        FileNotFoundError: if COOKIES_FILE does not exist.
    """
    browser = await playwright.chromium.launch(
        channel="chrome",
        headless=True,
        args=['--disable-gpu', '--no-sandbox']  # helpful on Linux servers
    )
    context = await browser.new_context(
        viewport={'width': 1920, 'height': 1080},
        user_agent=('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                    '(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36')
    )
    if not os.path.exists(COOKIES_FILE):
        _LOGGER.error(f"Cookies file not found: {COOKIES_FILE}")
        raise FileNotFoundError(f"{COOKIES_FILE} missing — export cookies first")
    with open(COOKIES_FILE, 'r', encoding='utf-8') as f:
        cookies = json.load(f)
    await context.add_cookies(cookies)
    _LOGGER.info("Browser context loaded with cookies")
    return browser, context


async def scroll_to_load_all(page):
    """Scroll to the bottom repeatedly until the page height stops growing.

    Forces the infinite-scroll list to load every favorite, bounded by
    MAX_SCROLL_ATTEMPTS iterations with SCROLL_PAUSE_MS between scrolls.
    """
    _LOGGER.info("Starting auto-scroll to load all favorites...")
    loaded_count = 0
    for attempt in range(1, MAX_SCROLL_ATTEMPTS + 1):
        prev_height = await page.evaluate('document.body.scrollHeight')
        await page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
        await page.wait_for_timeout(SCROLL_PAUSE_MS)
        new_height = await page.evaluate('document.body.scrollHeight')
        if new_height == prev_height:
            # Height unchanged after scroll + pause: nothing more to load.
            _LOGGER.info(f"Scroll complete — no more content after {attempt} attempts")
            break
        loaded_count += 1
        if loaded_count % 5 == 0:
            _LOGGER.info(f"Scrolled {loaded_count} times...")
    _LOGGER.info("Scroll finished")


async def scrape_posts(page):
    """Locate all favorite cards on the page and return their Locator."""
    # Count list containers purely for logging/debugging.
    list_containers = await page.query_selector_all(SELECTOR_LIST_CONTAINER)
    _LOGGER.info(f"Found {len(list_containers)} elements with role='list'")
    posts_locator = page.locator(SELECTOR_POST_ITEMS)
    count_posts = await posts_locator.count()
    _LOGGER.info(f"Found {count_posts} items with role='listitem'")
    if count_posts == 0:
        _LOGGER.warning("No listitems found. Page may not be fully loaded, wrong URL, or selectors need update.")
        # Optional: dump a screenshot for debugging
        # await page.screenshot(path="debug_favorites.png")
        # _LOGGER.info("Saved screenshot: debug_favorites.png")
    return posts_locator


async def process_post(post_locator, semaphore):
    """Scrape one favorite card and download its media + JSON metadata.

    Bounded by `semaphore` so at most CONCURRENCY_LIMIT posts are processed
    concurrently. Errors are logged, never raised, so one malformed card
    cannot abort the whole batch.
    """
    # Initialized before the try so the except handler can always log it.
    post_id = 'unknown'
    async with semaphore:
        try:
            # Post id: prefer explicit data-id/id attributes; otherwise derive
            # a STABLE id from the card text. (hashlib, not built-in hash(),
            # because hash() is salted per process and would defeat the
            # skip-existing-files incremental check across runs.)
            post_id = (await post_locator.get_attribute('data-id')
                       or await post_locator.get_attribute('id'))
            if not post_id:
                text = await post_locator.inner_text()
                post_id = f"unknown_{hashlib.md5(text.encode('utf-8')).hexdigest()[:12]}"
            post_id = post_id.replace('/', '_').replace(':', '_').replace(' ', '_')

            # Prompt — .first avoids a Playwright strict-mode error when the
            # deliberately loose selector matches several nodes in the card.
            prompt_locator = post_locator.locator(SELECTOR_PROMPT_RELATIVE).first
            if await prompt_locator.count() > 0:
                prompt = (await prompt_locator.inner_text()).strip()
            else:
                prompt = "No prompt found"

            # Timestamp: prefer the machine-readable datetime attribute,
            # falling back to the element's visible text.
            ts_locator = post_locator.locator(SELECTOR_TIMESTAMP_RELATIVE).first
            if await ts_locator.count() > 0:
                timestamp = (await ts_locator.get_attribute('datetime')
                             or (await ts_locator.inner_text()).strip())
            else:
                timestamp = "Unknown"

            # Model tag
            model_locator = post_locator.locator(SELECTOR_MODEL_RELATIVE).first
            if await model_locator.count() > 0:
                model = (await model_locator.inner_text()).strip()
            else:
                model = "Unknown"

            # Media URLs (images / videos / <source> tags / download links)
            media_locator = post_locator.locator(SELECTOR_MEDIA_RELATIVE)
            media_urls = []
            for i in range(await media_locator.count()):
                elem_locator = media_locator.nth(i)
                src = (await elem_locator.get_attribute('src')
                       or await elem_locator.get_attribute('href')
                       or await elem_locator.get_attribute('data-src'))
                if src:
                    if src.startswith('//'):
                        # Protocol-relative URL — make it absolute.
                        src = 'https:' + src
                    media_urls.append(src)

            if not media_urls:
                _LOGGER.warning(f"No media URLs found for {post_id}")
                return

            metadata = {
                'post_id': post_id,
                'prompt': prompt,
                'timestamp': timestamp,
                'model': model,
                'media_urls': media_urls,
                'media_count': len(media_urls),
            }

            # Download each media file through the authenticated browser
            # session (context.request reuses its cookies) and write a JSON
            # sidecar next to it. Existing files are skipped (incremental).
            for idx, url in enumerate(media_urls):
                ext = url.split('.')[-1].split('?')[0].lower() or 'jpg'
                filename = f"{post_id}_{idx}.{ext}"
                filepath = OUTPUT_DIR / filename
                jsonpath = OUTPUT_DIR / f"{post_id}_{idx}.json"
                if filepath.exists():
                    _LOGGER.debug(f"Skipping existing: {filename}")
                    continue
                _LOGGER.info(f"Downloading {filename} from {url[:80]}...")
                response = await post_locator.page.context.request.get(url, timeout=60000)
                if response.ok:
                    content = await response.body()
                    with open(filepath, 'wb') as f:
                        f.write(content)
                    with open(jsonpath, 'w', encoding='utf-8') as j:
                        json.dump(metadata, j, indent=2, ensure_ascii=False)
                    _LOGGER.info(f"Saved: {filename} + JSON metadata")
                else:
                    _LOGGER.warning(f"Download failed {url} — status {response.status}")
        except Exception as e:
            # Deliberately broad: a single bad card must not kill the batch.
            _LOGGER.error(f"Error on post {post_id}: {e}", exc_info=True)


async def process_all_posts(posts, concurrency_limit):
    """Process every scraped post concurrently, bounded by a semaphore."""
    semaphore = asyncio.Semaphore(concurrency_limit)
    all_post_locators = await posts.all()  # one Locator per card
    tasks = [process_post(loc, semaphore) for loc in all_post_locators]
    await asyncio.gather(*tasks, return_exceptions=True)


# -------------------------------
# MAIN ENTRY POINT
# -------------------------------
async def main():
    """Run the full scrape-and-download session end to end."""
    _LOGGER.info("Starting Grok favorites incremental downloader")
    ensure_output_dir()
    async with async_playwright() as playwright:
        browser, context = await load_authenticated_context(playwright)
        page = await context.new_page()
        try:
            _LOGGER.info(f"Navigating to {FAVORITES_URL}")
            await page.goto(FAVORITES_URL, wait_until='networkidle', timeout=60000)
            await scroll_to_load_all(page)
            posts_locator = await scrape_posts(page)
            if await posts_locator.count() > 0:
                await process_all_posts(posts_locator, CONCURRENCY_LIMIT)
            else:
                _LOGGER.warning("No posts found — check selectors or login status")
        except Exception as e:
            _LOGGER.critical(f"Fatal error during execution: {e}", exc_info=True)
        finally:
            # Always close the browser, even after a fatal error.
            await browser.close()
            _LOGGER.info("Browser closed. Download session finished.")


if __name__ == '__main__':
    asyncio.run(main())