/**
* Page utilities for extracting responses from NotebookLM web UI
*
* This module provides functions to:
* - Extract latest assistant responses from the page
* - Wait for new responses with streaming detection
* - Detect placeholders and loading states
* - Snapshot existing responses for comparison
*
* Based on the Python implementation from page_utils.py
*/
import type { Page } from 'patchright';
import { log } from './logger.js';
// ============================================================================
// Constants
// ============================================================================
/**
* CSS selectors to find assistant response elements
* Ordered by priority (most specific first)
*/
const RESPONSE_SELECTORS = [
'.to-user-container .message-text-content',
"[data-message-author='bot']",
"[data-message-author='assistant']",
"[data-message-role='assistant']",
"[data-author='assistant']",
"[data-renderer*='assistant']",
"[data-automation-id='response-text']",
"[data-automation-id='assistant-response']",
"[data-automation-id='chat-response']",
"[data-testid*='assistant']",
"[data-testid*='response']",
"[aria-live='polite']",
"[role='listitem'][data-message-author]",
];
/**
* Text snippets that indicate a placeholder/loading state
*/
const PLACEHOLDER_SNIPPETS = [
'antwort wird erstellt',
'answer wird erstellt',
'answer is being created',
'answer is being generated',
'creating answer',
'generating answer',
'wird erstellt',
'getting the context', // NotebookLM initial loading message
'loading',
'please wait',
];
// ============================================================================
// Helper Functions
// ============================================================================
/**
* Simple string hash function (for efficient comparison)
*/
function hashString(str: string): number {
let hash = 0;
for (let i = 0; i < str.length; i++) {
const char = str.charCodeAt(i);
hash = (hash << 5) - hash + char;
hash = hash & hash; // Convert to 32bit integer
}
return hash;
}
/**
* Check if text is a placeholder/loading message
*/
function isPlaceholder(text: string): boolean {
const lower = text.toLowerCase();
return PLACEHOLDER_SNIPPETS.some((snippet) => lower.includes(snippet));
}
// ============================================================================
// Main Functions
// ============================================================================
/**
* Snapshot the latest response text currently visible
* Returns null if no response found
*/
export async function snapshotLatestResponse(page: Page): Promise<string | null> {
return await extractLatestText(page, new Set(), false, 0);
}
/**
* Snapshot ALL existing assistant response texts
* Used to capture visible responses BEFORE submitting a new question
*/
export async function snapshotAllResponses(page: Page): Promise<string[]> {
const allTexts: string[] = [];
const primarySelector = '.to-user-container';
try {
const containers = await page.$$(primarySelector);
if (containers.length > 0) {
for (const container of containers) {
try {
const textElement = await container.$('.message-text-content');
if (textElement) {
const text = await textElement.innerText();
if (text && text.trim()) {
allTexts.push(text.trim());
}
}
} catch {
continue;
}
}
log.info(`πΈ [SNAPSHOT] Captured ${allTexts.length} existing responses`);
}
} catch (error) {
log.warning(`β οΈ [SNAPSHOT] Failed to snapshot responses: ${error}`);
}
return allTexts;
}
/**
* Count the number of visible assistant response elements
*/
export async function countResponseElements(page: Page): Promise<number> {
let count = 0;
for (const selector of RESPONSE_SELECTORS) {
try {
const elements = await page.$$(selector);
if (elements.length > 0) {
// Count only visible elements
for (const el of elements) {
try {
const isVisible = await el.isVisible();
if (isVisible) {
count++;
}
} catch {
continue;
}
}
// If we found elements with this selector, stop trying others
if (count > 0) {
break;
}
}
} catch {
continue;
}
}
return count;
}
/**
* Wait for a new assistant response with streaming detection
*
* This function:
* 1. Polls the page for new response text
* 2. Detects streaming (text changes) vs. complete (text stable)
* 3. Requires text to be stable for 3 consecutive polls before returning
* 4. Ignores placeholders, question echoes, and known responses
*
* @param page Playwright page instance
* @param options Options for waiting
* @returns The new response text, or null if timeout
*/
export async function waitForLatestAnswer(
page: Page,
options: {
question?: string;
timeoutMs?: number;
pollIntervalMs?: number;
ignoreTexts?: string[];
debug?: boolean;
} = {}
): Promise<string | null> {
const {
question = '',
timeoutMs = 120000,
pollIntervalMs = 1000,
ignoreTexts = [],
debug = false,
} = options;
const deadline = Date.now() + timeoutMs;
const sanitizedQuestion = question.trim().toLowerCase();
// Track ALL known texts as HASHES (memory efficient!)
const knownHashes = new Set<number>();
for (const text of ignoreTexts) {
if (typeof text === 'string' && text.trim()) {
knownHashes.add(hashString(text.trim()));
}
}
if (debug) {
log.debug(`π [DEBUG] Waiting for NEW answer. Ignoring ${knownHashes.size} known responses`);
}
let pollCount = 0;
let lastCandidate: string | null = null;
let stableCount = 0; // Track how many times we see the same text
const requiredStablePolls = 8; // Text must be stable for 8 consecutive polls (~8 seconds)
while (Date.now() < deadline) {
pollCount++;
// Extract latest NEW text
const candidate = await extractLatestText(page, knownHashes, debug, pollCount);
if (candidate) {
const normalized = candidate.trim();
if (normalized) {
const lower = normalized.toLowerCase();
// Check if it's a placeholder
if (isPlaceholder(lower)) {
if (debug) {
log.debug(
`π [DEBUG] Found placeholder: "${normalized.substring(0, 50)}..." - continuing...`
);
}
await page.waitForTimeout(250);
continue;
}
// DEBUG: Log the candidate text to see what we're getting
if (debug && normalized !== lastCandidate) {
log.debug(
`π [DEBUG] New candidate text (${normalized.length} chars): "${normalized.substring(0, 100)}..."`
);
}
// Check if it's the question echo
if (lower === sanitizedQuestion) {
if (debug) {
log.debug('π [DEBUG] Found question echo, ignoring');
}
knownHashes.add(hashString(normalized)); // Mark as seen
await page.waitForTimeout(pollIntervalMs);
continue;
}
// ========================================
// STREAMING DETECTION: Check if text is stable
// ========================================
if (normalized === lastCandidate) {
// Text hasn't changed - it's stable
stableCount++;
if (debug && stableCount === requiredStablePolls) {
log.debug(
`β
[DEBUG] Text stable for ${stableCount} polls (${normalized.length} chars)`
);
}
} else {
// Text changed - streaming in progress
if (debug && lastCandidate) {
log.debug(
`π [DEBUG] Text changed (${normalized.length} chars, was ${lastCandidate.length})`
);
}
stableCount = 1;
lastCandidate = normalized;
}
// Only return once text is stable
if (stableCount >= requiredStablePolls) {
if (debug) {
log.debug(`β
[DEBUG] Returning stable answer (${normalized.length} chars)`);
}
return normalized;
}
}
}
await page.waitForTimeout(pollIntervalMs);
}
if (debug) {
log.debug(`β±οΈ [DEBUG] Timeout after ${pollCount} polls`);
}
return null;
}
/**
* Extract the latest NEW response text from the page
* Uses hash-based comparison for efficiency
*
* @param page Playwright page instance
* @param knownHashes Set of hashes of already-seen response texts
* @param debug Enable debug logging
* @param pollCount Current poll number (for conditional logging)
* @returns First NEW response text found, or null
*/
async function extractLatestText(
page: Page,
knownHashes: Set<number>,
debug: boolean,
pollCount: number
): Promise<string | null> {
// Try the primary selector first (most specific for NotebookLM)
const primarySelector = '.to-user-container';
try {
const containers = await page.$$(primarySelector);
const totalContainers = containers.length;
// Early exit if no new containers possible
if (totalContainers <= knownHashes.size) {
if (debug && pollCount % 5 === 0) {
log.dim(
`βοΈ [EXTRACT] No new containers (${totalContainers} total, ${knownHashes.size} known)`
);
}
return null;
}
if (containers.length > 0) {
// Only log every 5th poll to reduce noise
if (debug && pollCount % 5 === 0) {
log.dim(`π [EXTRACT] Scanning ${totalContainers} containers (${knownHashes.size} known)`);
}
let skipped = 0;
let empty = 0;
// Scan ALL containers to find the FIRST with NEW text
for (let idx = 0; idx < containers.length; idx++) {
const container = containers[idx];
try {
const textElement = await container.$('.message-text-content');
if (textElement) {
const text = await textElement.innerText();
if (text && text.trim()) {
// Hash-based comparison (faster & less memory)
const textHash = hashString(text.trim());
if (!knownHashes.has(textHash)) {
log.success(
`β
[EXTRACT] Found NEW text in container[${idx}]: ${text.trim().length} chars`
);
return text.trim();
} else {
skipped++;
}
} else {
empty++;
}
}
} catch {
continue;
}
}
// Only log summary if debug enabled
if (debug && pollCount % 5 === 0) {
log.dim(`βοΈ [EXTRACT] No NEW text (skipped ${skipped} known, ${empty} empty)`);
}
return null; // Don't fall through to fallback!
} else {
if (debug) {
log.warning('β οΈ [EXTRACT] No containers found');
}
}
} catch (error) {
log.error(`β [EXTRACT] Primary selector failed: ${error}`);
}
// Fallback: Try other selectors (only if primary selector failed/found nothing)
if (debug) {
log.dim('π [EXTRACT] Trying fallback selectors...');
}
for (const selector of RESPONSE_SELECTORS) {
try {
const elements = await page.$$(selector);
if (elements.length === 0) continue;
// Scan ALL elements to find the first with NEW text
for (const element of elements) {
try {
// Prefer full container text when available
let container = element;
try {
const closest = await element.evaluateHandle((el) => {
return el.closest(
'[data-message-author], [data-message-role], [data-author], ' +
"[data-testid*='assistant'], [data-automation-id*='response'], article, section"
);
});
if (closest) {
container = closest.asElement() || element;
}
} catch {
container = element;
}
const text = await container.innerText();
if (text && text.trim() && !knownHashes.has(hashString(text.trim()))) {
return text.trim();
}
} catch {
continue;
}
}
} catch {
continue;
}
}
// Final fallback: JavaScript evaluation
try {
const fallbackText = await page.evaluate((): string | null => {
// @ts-expect-error - DOM types available in browser context
const unique = new Set<Element>();
// @ts-expect-error - DOM types available in browser context
const isVisible = (el: Element): boolean => {
// @ts-expect-error - DOM types available in browser context
if (!el || !(el as HTMLElement).isConnected) return false;
const rect = el.getBoundingClientRect();
if (rect.width === 0 || rect.height === 0) return false;
// @ts-expect-error - window available in browser context
const style = window.getComputedStyle(el as HTMLElement);
if (
style.visibility === 'hidden' ||
style.display === 'none' ||
parseFloat(style.opacity || '1') === 0
) {
return false;
}
return true;
};
const selectors = [
'[data-message-author]',
'[data-message-role]',
'[data-author]',
"[data-renderer*='assistant']",
"[data-testid*='assistant']",
"[data-automation-id*='response']",
];
const candidates: string[] = [];
for (const selector of selectors) {
// @ts-expect-error - document available in browser context
for (const el of document.querySelectorAll(selector)) {
if (!isVisible(el)) continue;
if (unique.has(el)) continue;
unique.add(el);
// @ts-expect-error - DOM types available in browser context
const text = (el as HTMLElement).innerText || (el as HTMLElement).textContent || '';
if (!text.trim()) continue;
candidates.push(text.trim());
}
}
if (candidates.length > 0) {
return candidates[candidates.length - 1];
}
return null;
});
if (typeof fallbackText === 'string' && fallbackText.trim()) {
return fallbackText.trim();
}
} catch {
// Ignore evaluation errors
}
return null;
}
// ============================================================================
// Exports
// ============================================================================
export default {
snapshotLatestResponse,
snapshotAllResponses,
countResponseElements,
waitForLatestAnswer,
};