author | A.J. Shulman <Shulman.aj@gmail.com> | 2025-05-11 17:18:18 -0400 |
---|---|---|
committer | A.J. Shulman <Shulman.aj@gmail.com> | 2025-05-11 17:18:18 -0400 |
commit | e141307dbd9b951f76c908610e7b89e296ad92b8 (patch) | |
tree | a48d7cae7a7702519d2099dfff5a503fcfc7875f /src/server/ApiManagers/AssistantManager.ts | |
parent | e5cb67b92d9b3c84dc90b1e64cc7128621523801 (diff) |
changed everything to be more consistent
- made both web related tools use doc manager and chunk Ids
Diffstat (limited to 'src/server/ApiManagers/AssistantManager.ts')
-rw-r--r-- | src/server/ApiManagers/AssistantManager.ts | 160 |
1 files changed, 119 insertions, 41 deletions
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index 378f14094..b7ce4f663 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -514,30 +514,37 @@ export default class AssistantManager extends ApiManager {
                 await browser.close();
                 browser = null;

-                // Use a try-catch block specifically for JSDOM parsing
+                let extractedText = '';
+
+                // First try with Readability
                 try {
                     // Parse HTML content using JSDOM
                     const dom = new JSDOM(htmlContent, { url });

                     // Extract readable content using Mozilla's Readability API
-                    const reader = new Readability(dom.window.document);
+                    const reader = new Readability(dom.window.document, {
+                        // Readability configuration to focus on text content
+                        charThreshold: 100,
+                        keepClasses: false,
+                    });
                     const article = reader.parse();

-                    if (article) {
-                        const plainText = article.textContent;
-                        res.send({ website_plain_text: plainText });
+                    if (article && article.textContent) {
+                        extractedText = article.textContent;
                     } else {
-                        // If Readability fails, fallback to extracting main content
-                        const mainContent = await extractMainContent(htmlContent);
-                        res.send({ website_plain_text: mainContent });
+                        // If Readability doesn't return useful content, try alternate method
+                        extractedText = await extractEnhancedContent(htmlContent);
                     }
                 } catch (parsingError) {
-                    console.error('Error parsing website content:', parsingError);
-
-                    // Fallback to a simplified extraction method
-                    const mainContent = await extractMainContent(htmlContent);
-                    res.send({ website_plain_text: mainContent });
+                    console.error('Error parsing website content with Readability:', parsingError);
+                    // Fallback to enhanced content extraction
+                    extractedText = await extractEnhancedContent(htmlContent);
                 }
+
+                // Clean up the extracted text
+                extractedText = cleanupText(extractedText);
+
+                res.send({ website_plain_text: extractedText });
             } catch (error) {
                 console.error('Error scraping website:', error);
@@ -985,48 +992,119 @@ function spawnPythonProcess(jobId: string, file_path: string, doc_id: string) {
 }

 /**
- * Extracts main content from HTML by removing scripts, styles, and non-content elements
- * Used as a fallback when Readability fails
+ * Enhanced content extraction that focuses on meaningful text content.
  * @param html The HTML content to process
- * @returns Extracted main text content
+ * @returns Extracted and cleaned text content
  */
-async function extractMainContent(html: string): Promise<string> {
+async function extractEnhancedContent(html: string): Promise<string> {
     try {
-        // Create a simple DOM to extract content
+        // Create DOM to extract content
         const dom = new JSDOM(html, { runScripts: 'outside-only' });
         const document = dom.window.document;

-        // Remove scripts, styles, and other non-content elements
-        const elementsToRemove = ['script', 'style', 'iframe', 'noscript', 'svg', 'header', 'footer', 'nav', 'aside', 'ads', 'banner', 'form', 'button', 'input'];
-
-        elementsToRemove.forEach(tag => {
-            const elements = document.querySelectorAll(tag);
+        // Remove all non-content elements
+        const elementsToRemove = [
+            'script',
+            'style',
+            'iframe',
+            'noscript',
+            'svg',
+            'canvas',
+            'header',
+            'footer',
+            'nav',
+            'aside',
+            'form',
+            'button',
+            'input',
+            'select',
+            'textarea',
+            'meta',
+            'link',
+            'img',
+            'video',
+            'audio',
+            '.ad',
+            '.ads',
+            '.advertisement',
+            '.banner',
+            '.cookie',
+            '.popup',
+            '.modal',
+            '.newsletter',
+            '[role="banner"]',
+            '[role="navigation"]',
+            '[role="complementary"]',
+        ];
+
+        elementsToRemove.forEach(selector => {
+            const elements = document.querySelectorAll(selector);
             elements.forEach(el => el.remove());
         });

-        // Try to find the main content container using common selectors
-        const mainSelectors = ['main', 'article', '#content', '.content', '#main', '.main', '.post-content', '.article-content', '.entry-content'];
-
-        let mainContent = '';
-
-        // Try each selector to find main content
-        for (const selector of mainSelectors) {
-            const element = document.querySelector(selector);
-            if (element && element.textContent && element.textContent.trim().length > 100) {
-                mainContent = element.textContent;
-                break;
+        // Get all text paragraphs with meaningful content
+        const contentElements = [
+            ...Array.from(document.querySelectorAll('p')),
+            ...Array.from(document.querySelectorAll('h1')),
+            ...Array.from(document.querySelectorAll('h2')),
+            ...Array.from(document.querySelectorAll('h3')),
+            ...Array.from(document.querySelectorAll('h4')),
+            ...Array.from(document.querySelectorAll('h5')),
+            ...Array.from(document.querySelectorAll('h6')),
+            ...Array.from(document.querySelectorAll('li')),
+            ...Array.from(document.querySelectorAll('td')),
+            ...Array.from(document.querySelectorAll('article')),
+            ...Array.from(document.querySelectorAll('section')),
+            ...Array.from(document.querySelectorAll('div:not([class]):not([id])')),
+        ];
+
+        // Extract text from content elements that have meaningful text
+        let contentParts: string[] = [];
+        contentElements.forEach(el => {
+            const text = el.textContent?.trim();
+            // Only include elements with substantial text (more than just a few characters)
+            if (text && text.length > 10 && !contentParts.includes(text)) {
+                contentParts.push(text);
             }
-        }
+        });

-        // If no main content found with selectors, use body content
-        if (!mainContent || mainContent.length < 200) {
-            mainContent = document.body.textContent || '';
+        // If no significant content found with selective approach, fallback to body
+        if (contentParts.length < 3) {
+            return document.body.textContent || '';
         }

-        // Clean up the text
-        return mainContent.replace(/\s+/g, ' ').replace(/\n+/g, '\n').trim();
+        return contentParts.join('\n\n');
     } catch (error) {
-        console.error('Error extracting main content:', error);
+        console.error('Error extracting enhanced content:', error);
         return 'Failed to extract content from the webpage.';
     }
 }
+
+/**
+ * Cleans up extracted text to improve readability and focus on useful content.
+ * @param text The raw extracted text
+ * @returns Cleaned and formatted text
+ */
+function cleanupText(text: string): string {
+    if (!text) return '';
+
+    return (
+        text
+            // Remove excessive whitespace and normalize line breaks
+            .replace(/\s+/g, ' ')
+            .replace(/\n\s*\n\s*\n+/g, '\n\n')
+            // Remove common boilerplate phrases
+            .replace(/cookie policy|privacy policy|terms of service|all rights reserved|copyright ©/gi, '')
+            // Remove email addresses
+            .replace(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g, '')
+            // Remove URLs
+            .replace(/https?:\/\/[^\s]+/g, '')
+            // Remove social media handles
+            .replace(/@[a-zA-Z0-9_]+/g, '')
+            // Clean up any remaining HTML tags that might have been missed
+            .replace(/<[^>]*>/g, '')
+            // Fix spacing issues after cleanup
+            .replace(/ +/g, ' ')
+            .trim()
+    );
+}
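Taken together, the patched handler now runs a three-stage pipeline: Readability first, the DOM-walking extractEnhancedContent() as a fallback, and cleanupText() applied to whichever result survives before it is sent to the client. The sketch below is a minimal, hypothetical standalone rendering of that flow rather than code from the repository; the wrapper name extractWebsiteText is invented, and it assumes the jsdom and @mozilla/readability packages plus the two helpers defined in this patch are in scope.

```typescript
// Hypothetical wrapper illustrating the extraction flow this patch wires into the
// scraping route; extractEnhancedContent and cleanupText are the helpers defined
// in the diff above and are assumed to be in scope.
import { JSDOM } from 'jsdom';
import { Readability } from '@mozilla/readability';

async function extractWebsiteText(htmlContent: string, url: string): Promise<string> {
    let extractedText = '';
    try {
        // Stage 1: let Readability pull the article body out of the scraped HTML
        const dom = new JSDOM(htmlContent, { url });
        const reader = new Readability(dom.window.document, { charThreshold: 100, keepClasses: false });
        const article = reader.parse();
        extractedText = article?.textContent
            ? article.textContent
            : // Stage 2: DOM-walking fallback when Readability finds nothing useful
              await extractEnhancedContent(htmlContent);
    } catch (parsingError) {
        console.error('Error parsing website content with Readability:', parsingError);
        extractedText = await extractEnhancedContent(htmlContent);
    }
    // Stage 3: regex cleanup of boilerplate phrases, URLs, emails, and stray tags
    return cleanupText(extractedText);
}
```

In the handler itself the cleaned text is returned as `res.send({ website_plain_text: extractedText })`, as shown in the first hunk.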