#!/usr/bin/env python3
"""Incremental downloader for Grok "Imagine" saved favorites.

Workflow:
  1. Launch Chrome via Playwright and restore an authenticated session
     from a ``cookies.json`` export.
  2. Open the favorites page and auto-scroll until no new content loads.
  3. Scrape every favorite card (prompt, timestamp, model, media URLs).
  4. Download each media file plus a JSON metadata sidecar into
     OUTPUT_DIR, skipping files that already exist so re-runs are
     incremental.
"""

import asyncio
import hashlib
import json
import logging
import os
from pathlib import Path

from playwright.async_api import async_playwright

# -------------------------------
# CONFIGURATION & GLOBAL CONSTANTS
# -------------------------------
OUTPUT_DIR = Path('grok_favorites')
COOKIES_FILE = 'cookies.json'
FAVORITES_URL = 'https://grok.com/imagine/saved'
MAX_SCROLL_ATTEMPTS = 30      # hard cap on infinite-scroll iterations
SCROLL_PAUSE_MS = 2500        # wait after each scroll for lazy content to load
CONCURRENCY_LIMIT = 4         # simultaneous per-post download tasks

# UI selectors — robust ARIA-role + common class patterns.
# These are heuristics; update them by inspecting the page in DevTools.
SELECTOR_LIST_CONTAINER = '[role="list"]'                 # the main list wrapper
SELECTOR_POST_ITEMS = '[role="list"] [role="listitem"]'   # individual favorite cards

# Absolute (page-scoped) variants — kept for reference/debugging; only the
# *_RELATIVE selectors below are used by the code.
SELECTOR_PROMPT = '[role="listitem"] .prompt-text, [role="listitem"] [class*="prompt"], [role="listitem"] p, [role="listitem"] div[class*="text"]'
SELECTOR_TIMESTAMP = '[role="listitem"] time, [role="listitem"] [class*="time"], [role="listitem"] [data-timestamp]'
SELECTOR_MODEL = '[role="listitem"] [class*="model"], [role="listitem"] span[class*="tag"], [role="listitem"] [data-model]'
SELECTOR_MEDIA = '[role="listitem"] img, [role="listitem"] video, [role="listitem"] [class*="media"], [role="listitem"] source, [role="listitem"] a[href*="mp4"], [role="listitem"] a[href*="jpg"]'

# Relative selectors, chained from each listitem Locator so they only match
# inside that specific card. Adjust after inspecting the live page.
SELECTOR_PROMPT_RELATIVE = '.prompt-text, [class*="prompt"], p, div[class*="text"]'
SELECTOR_TIMESTAMP_RELATIVE = 'time, [class*="time"], [data-timestamp]'
SELECTOR_MODEL_RELATIVE = '[class*="model"], span[class*="tag"], [data-model]'
SELECTOR_MEDIA_RELATIVE = 'img, video, [class*="media"], source, a[href*="mp4"], a[href*="jpg"]'

# Logging setup: identical format to the console and to a log file.
_LOGGER = logging.getLogger('grok_favorites_downloader')
_LOGGER.setLevel(logging.INFO)

console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
console_handler.setFormatter(console_formatter)
_LOGGER.addHandler(console_handler)

file_handler = logging.FileHandler('grok_downloader.log')
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(console_formatter)
_LOGGER.addHandler(file_handler)


# -------------------------------
# HELPER FUNCTIONS
# -------------------------------
def ensure_output_dir():
    """Create OUTPUT_DIR (and any parents) if it does not already exist."""
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    _LOGGER.info(f"Output directory ensured: {OUTPUT_DIR}")


async def load_authenticated_context(playwright):
    """Launch headless Chrome and return (browser, context) with cookies restored.

    Raises:
        FileNotFoundError: if COOKIES_FILE does not exist.
    """
    browser = await playwright.chromium.launch(
        channel="chrome",
        headless=True,
        args=['--disable-gpu', '--no-sandbox']  # helpful on Linux servers
    )
    context = await browser.new_context(
        viewport={'width': 1920, 'height': 1080},
        user_agent=('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                    '(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36')
    )
    if not os.path.exists(COOKIES_FILE):
        _LOGGER.error(f"Cookies file not found: {COOKIES_FILE}")
        raise FileNotFoundError(f"{COOKIES_FILE} missing — export cookies first")
    with open(COOKIES_FILE, 'r', encoding='utf-8') as f:
        cookies = json.load(f)
    await context.add_cookies(cookies)
    _LOGGER.info("Browser context loaded with cookies")
    return browser, context


async def scroll_to_load_all(page):
    """Scroll to the bottom repeatedly until the page height stops growing.

    Forces the infinite-scroll list to load every favorite, bounded by
    MAX_SCROLL_ATTEMPTS iterations with SCROLL_PAUSE_MS between scrolls.
    """
    _LOGGER.info("Starting auto-scroll to load all favorites...")
    loaded_count = 0
    for attempt in range(1, MAX_SCROLL_ATTEMPTS + 1):
        prev_height = await page.evaluate('document.body.scrollHeight')
        await page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
        await page.wait_for_timeout(SCROLL_PAUSE_MS)
        new_height = await page.evaluate('document.body.scrollHeight')
        if new_height == prev_height:
            # Height unchanged after scroll + pause: nothing more to load.
            _LOGGER.info(f"Scroll complete — no more content after {attempt} attempts")
            break
        loaded_count += 1
        if loaded_count % 5 == 0:
            _LOGGER.info(f"Scrolled {loaded_count} times...")
    _LOGGER.info("Scroll finished")


async def scrape_posts(page):
    """Locate all favorite cards on the page and return their Locator."""
    # Count list containers purely for logging/debugging.
    list_containers = await page.query_selector_all(SELECTOR_LIST_CONTAINER)
    _LOGGER.info(f"Found {len(list_containers)} elements with role='list'")
    posts_locator = page.locator(SELECTOR_POST_ITEMS)
    count_posts = await posts_locator.count()
    _LOGGER.info(f"Found {count_posts} items with role='listitem'")
    if count_posts == 0:
        _LOGGER.warning("No listitems found. Page may not be fully loaded, wrong URL, or selectors need update.")
        # Optional: dump a screenshot for debugging
        # await page.screenshot(path="debug_favorites.png")
        # _LOGGER.info("Saved screenshot: debug_favorites.png")
    return posts_locator


async def process_post(post_locator, semaphore):
    """Scrape one favorite card and download its media + JSON metadata.

    Bounded by `semaphore` so at most CONCURRENCY_LIMIT posts are processed
    concurrently. Errors are logged, never raised, so one malformed card
    cannot abort the whole batch.
    """
    # Initialized before the try so the except handler can always log it.
    post_id = 'unknown'
    async with semaphore:
        try:
            # Post id: prefer explicit data-id/id attributes; otherwise derive
            # a STABLE id from the card text. (hashlib, not built-in hash(),
            # because hash() is salted per process and would defeat the
            # skip-existing-files incremental check across runs.)
            post_id = (await post_locator.get_attribute('data-id')
                       or await post_locator.get_attribute('id'))
            if not post_id:
                text = await post_locator.inner_text()
                post_id = f"unknown_{hashlib.md5(text.encode('utf-8')).hexdigest()[:12]}"
            post_id = post_id.replace('/', '_').replace(':', '_').replace(' ', '_')

            # Prompt — .first avoids a Playwright strict-mode error when the
            # deliberately loose selector matches several nodes in the card.
            prompt_locator = post_locator.locator(SELECTOR_PROMPT_RELATIVE).first
            if await prompt_locator.count() > 0:
                prompt = (await prompt_locator.inner_text()).strip()
            else:
                prompt = "No prompt found"

            # Timestamp: prefer the machine-readable datetime attribute,
            # falling back to the element's visible text.
            ts_locator = post_locator.locator(SELECTOR_TIMESTAMP_RELATIVE).first
            if await ts_locator.count() > 0:
                timestamp = (await ts_locator.get_attribute('datetime')
                             or (await ts_locator.inner_text()).strip())
            else:
                timestamp = "Unknown"

            # Model tag
            model_locator = post_locator.locator(SELECTOR_MODEL_RELATIVE).first
            if await model_locator.count() > 0:
                model = (await model_locator.inner_text()).strip()
            else:
                model = "Unknown"

            # Media URLs (images / videos / <source> tags / download links)
            media_locator = post_locator.locator(SELECTOR_MEDIA_RELATIVE)
            media_urls = []
            for i in range(await media_locator.count()):
                elem_locator = media_locator.nth(i)
                src = (await elem_locator.get_attribute('src')
                       or await elem_locator.get_attribute('href')
                       or await elem_locator.get_attribute('data-src'))
                if src:
                    if src.startswith('//'):
                        # Protocol-relative URL — make it absolute.
                        src = 'https:' + src
                    media_urls.append(src)

            if not media_urls:
                _LOGGER.warning(f"No media URLs found for {post_id}")
                return

            metadata = {
                'post_id': post_id,
                'prompt': prompt,
                'timestamp': timestamp,
                'model': model,
                'media_urls': media_urls,
                'media_count': len(media_urls),
            }

            # Download each media file through the authenticated browser
            # session (context.request reuses its cookies) and write a JSON
            # sidecar next to it. Existing files are skipped (incremental).
            for idx, url in enumerate(media_urls):
                ext = url.split('.')[-1].split('?')[0].lower() or 'jpg'
                filename = f"{post_id}_{idx}.{ext}"
                filepath = OUTPUT_DIR / filename
                jsonpath = OUTPUT_DIR / f"{post_id}_{idx}.json"
                if filepath.exists():
                    _LOGGER.debug(f"Skipping existing: {filename}")
                    continue
                _LOGGER.info(f"Downloading {filename} from {url[:80]}...")
                response = await post_locator.page.context.request.get(url, timeout=60000)
                if response.ok:
                    content = await response.body()
                    with open(filepath, 'wb') as f:
                        f.write(content)
                    with open(jsonpath, 'w', encoding='utf-8') as j:
                        json.dump(metadata, j, indent=2, ensure_ascii=False)
                    _LOGGER.info(f"Saved: {filename} + JSON metadata")
                else:
                    _LOGGER.warning(f"Download failed {url} — status {response.status}")
        except Exception as e:
            # Deliberately broad: a single bad card must not kill the batch.
            _LOGGER.error(f"Error on post {post_id}: {e}", exc_info=True)


async def process_all_posts(posts, concurrency_limit):
    """Process every scraped post concurrently, bounded by a semaphore."""
    semaphore = asyncio.Semaphore(concurrency_limit)
    all_post_locators = await posts.all()  # one Locator per card
    tasks = [process_post(loc, semaphore) for loc in all_post_locators]
    await asyncio.gather(*tasks, return_exceptions=True)


# -------------------------------
# MAIN ENTRY POINT
# -------------------------------
async def main():
    """Run the full scrape-and-download session end to end."""
    _LOGGER.info("Starting Grok favorites incremental downloader")
    ensure_output_dir()
    async with async_playwright() as playwright:
        browser, context = await load_authenticated_context(playwright)
        page = await context.new_page()
        try:
            _LOGGER.info(f"Navigating to {FAVORITES_URL}")
            await page.goto(FAVORITES_URL, wait_until='networkidle', timeout=60000)
            await scroll_to_load_all(page)
            posts_locator = await scrape_posts(page)
            if await posts_locator.count() > 0:
                await process_all_posts(posts_locator, CONCURRENCY_LIMIT)
            else:
                _LOGGER.warning("No posts found — check selectors or login status")
        except Exception as e:
            _LOGGER.critical(f"Fatal error during execution: {e}", exc_info=True)
        finally:
            # Always close the browser, even after a fatal error.
            await browser.close()
            _LOGGER.info("Browser closed. Download session finished.")


if __name__ == '__main__':
    asyncio.run(main())