1 files changed, 119 insertions, 41 deletions
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index 378f14094..b7ce4f663 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -514,30 +514,37 @@ export default class AssistantManager extends ApiManager {
                     await browser.close();
                     browser = null;
 
-                    // Use a try-catch block specifically for JSDOM parsing
+                    let extractedText = '';
+
+                    // First try with Readability
                     try {
                         // Parse HTML content using JSDOM
                         const dom = new JSDOM(htmlContent, { url });
 
                         // Extract readable content using Mozilla's Readability API
-                        const reader = new Readability(dom.window.document);
+                        const reader = new Readability(dom.window.document, {
+                            // Readability configuration to focus on text content
+                            charThreshold: 100,
+                            keepClasses: false,
+                        });
                         const article = reader.parse();
 
-                        if (article) {
-                            const plainText = article.textContent;
-                            res.send({ website_plain_text: plainText });
+                        if (article && article.textContent) {
+                            extractedText = article.textContent;
                         } else {
-                            // If Readability fails, fallback to extracting main content
-                            const mainContent = await extractMainContent(htmlContent);
-                            res.send({ website_plain_text: mainContent });
+                            // If Readability doesn't return useful content, try alternate method
+                            extractedText = await extractEnhancedContent(htmlContent);
                         }
                     } catch (parsingError) {
-                        console.error('Error parsing website content:', parsingError);
-
-                        // Fallback to a simplified extraction method
-                        const mainContent = await extractMainContent(htmlContent);
-                        res.send({ website_plain_text: mainContent });
+                        console.error('Error parsing website content with Readability:', parsingError);
+                        // Fallback to enhanced content extraction
+                        extractedText = await extractEnhancedContent(htmlContent);
                     }
+
+                    // Clean up the extracted text
+                    extractedText = cleanupText(extractedText);
+
+                    res.send({ website_plain_text: extractedText });
                 } catch (error) {
                     console.error('Error scraping website:', error);
 
@@ -985,48 +992,119 @@ function spawnPythonProcess(jobId: string, file_path: string, doc_id: string) {
 }
 
 /**
- * Extracts main content from HTML by removing scripts, styles, and non-content elements
- * Used as a fallback when Readability fails
+ * Fallback extractor: strips non-content elements, then collects the text of block-level elements in document order.
  * @param html The HTML content to process
- * @returns Extracted main text content
+ * @returns Text blocks joined by blank lines; the raw body text if fewer than 3 blocks are found; a failure message on error
  */
-async function extractMainContent(html: string): Promise<string> {
+async function extractEnhancedContent(html: string): Promise<string> {
     try {
-        // Create a simple DOM to extract content
+        // Parse the HTML into a DOM so elements can be removed and queried with CSS selectors
         const dom = new JSDOM(html, { runScripts: 'outside-only' });
         const document = dom.window.document;
 
-        // Remove scripts, styles, and other non-content elements
-        const elementsToRemove = ['script', 'style', 'iframe', 'noscript', 'svg', 'header', 'footer', 'nav', 'aside', 'ads', 'banner', 'form', 'button', 'input'];
-
-        elementsToRemove.forEach(tag => {
-            const elements = document.querySelectorAll(tag);
+        // Remove all non-content elements (tags, ad/popup classes, and ARIA landmark roles)
+        const elementsToRemove = [
+            'script',
+            'style',
+            'iframe',
+            'noscript',
+            'svg',
+            'canvas',
+            'header',
+            'footer',
+            'nav',
+            'aside',
+            'form',
+            'button',
+            'input',
+            'select',
+            'textarea',
+            'meta',
+            'link',
+            'img',
+            'video',
+            'audio',
+            '.ad',
+            '.ads',
+            '.advertisement',
+            '.banner',
+            '.cookie',
+            '.popup',
+            '.modal',
+            '.newsletter',
+            '[role="banner"]',
+            '[role="navigation"]',
+            '[role="complementary"]',
+        ];
+
+        elementsToRemove.forEach(selector => {
+            const elements = document.querySelectorAll(selector);
             elements.forEach(el => el.remove());
         });
 
-        // Try to find the main content container using common selectors
-        const mainSelectors = ['main', 'article', '#content', '.content', '#main', '.main', '.post-content', '.article-content', '.entry-content'];
-
-        let mainContent = '';
-
-        // Try each selector to find main content
-        for (const selector of mainSelectors) {
-            const element = document.querySelector(selector);
-            if (element && element.textContent && element.textContent.trim().length > 100) {
-                mainContent = element.textContent;
-                break;
+        // Use ONE combined selector so matches come back in document order (separate per-tag queries group by tag)
+        const contentSelectors = [
+            'p',
+            'h1',
+            'h2',
+            'h3',
+            'h4',
+            'h5',
+            'h6',
+            'li',
+            'td',
+            'article',
+            'section',
+            'div:not([class]):not([id])',
+        ];
+        const contentElements = document.querySelectorAll(contentSelectors.join(', '));
+        // A Set keeps insertion order and drops exact duplicates without rescanning an array for every element
+        const contentParts = new Set<string>();
+        contentElements.forEach(el => {
+            const text = el.textContent?.trim();
+            // Only keep substantial text. NOTE(review): containers (article/section/div) still repeat their children's text
+            if (text && text.length > 10) {
+                contentParts.add(text);
             }
-        }
+        });
 
-        // If no main content found with selectors, use body content
-        if (!mainContent || mainContent.length < 200) {
-            mainContent = document.body.textContent || '';
+        // If the selective approach found fewer than 3 text blocks, fall back to the whole body text
+        if (contentParts.size < 3) {
+            return document.body.textContent || '';
         }
 
-        // Clean up the text
-        return mainContent.replace(/\s+/g, ' ').replace(/\n+/g, '\n').trim();
+        return Array.from(contentParts).join('\n\n');
     } catch (error) {
-        console.error('Error extracting main content:', error);
+        console.error('Error extracting enhanced content:', error);
         return 'Failed to extract content from the webpage.';
     }
 }
+
+/**
+ * Cleans up extracted page text: normalizes whitespace and strips boilerplate phrases, emails, URLs, handles and tags.
+ * @param text The raw extracted text
+ * @returns The cleaned text with paragraph breaks (blank lines) preserved; '' for empty input
+ */
+function cleanupText(text: string): string {
+    if (!text) return '';
+
+    return (
+        text
+            // Collapse horizontal whitespace only; '\n' is kept so paragraph breaks survive
+            .replace(/[^\S\n]+/g, ' ')
+            // Remove common boilerplate phrases
+            .replace(/cookie policy|privacy policy|terms of service|all rights reserved|copyright ©/gi, '')
+            // Remove email addresses, then URLs
+            .replace(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g, '')
+            .replace(/https?:\/\/[^\s]+/g, '')
+            // Remove social media handles
+            .replace(/@[a-zA-Z0-9_]+/g, '')
+            // Clean up any remaining HTML tags that might have been missed
+            .replace(/<[^>]*>/g, '')
+            // Tidy what the removals leave behind: no spaces around line breaks, at most one blank line, single spaces
+            .replace(/ *\n */g, '\n')
+            .replace(/\n{3,}/g, '\n\n')
+            .replace(/ +/g, ' ')
+            .trim()
+    );
+}