aboutsummaryrefslogtreecommitdiff
path: root/src/server/ApiManagers/AssistantManager.ts
diff options
context:
space:
mode:
Diffstat (limited to 'src/server/ApiManagers/AssistantManager.ts')
-rw-r--r--src/server/ApiManagers/AssistantManager.ts160
1 files changed, 119 insertions, 41 deletions
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index 378f14094..b7ce4f663 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -514,30 +514,37 @@ export default class AssistantManager extends ApiManager {
await browser.close();
browser = null;
- // Use a try-catch block specifically for JSDOM parsing
+ let extractedText = '';
+
+ // First try with Readability
try {
// Parse HTML content using JSDOM
const dom = new JSDOM(htmlContent, { url });
// Extract readable content using Mozilla's Readability API
- const reader = new Readability(dom.window.document);
+ const reader = new Readability(dom.window.document, {
+ // Readability configuration to focus on text content
+ charThreshold: 100,
+ keepClasses: false,
+ });
const article = reader.parse();
- if (article) {
- const plainText = article.textContent;
- res.send({ website_plain_text: plainText });
+ if (article && article.textContent) {
+ extractedText = article.textContent;
} else {
- // If Readability fails, fallback to extracting main content
- const mainContent = await extractMainContent(htmlContent);
- res.send({ website_plain_text: mainContent });
+ // If Readability doesn't return useful content, try alternate method
+ extractedText = await extractEnhancedContent(htmlContent);
}
} catch (parsingError) {
- console.error('Error parsing website content:', parsingError);
-
- // Fallback to a simplified extraction method
- const mainContent = await extractMainContent(htmlContent);
- res.send({ website_plain_text: mainContent });
+ console.error('Error parsing website content with Readability:', parsingError);
+ // Fallback to enhanced content extraction
+ extractedText = await extractEnhancedContent(htmlContent);
}
+
+ // Clean up the extracted text
+ extractedText = cleanupText(extractedText);
+
+ res.send({ website_plain_text: extractedText });
} catch (error) {
console.error('Error scraping website:', error);
@@ -985,48 +992,119 @@ function spawnPythonProcess(jobId: string, file_path: string, doc_id: string) {
}
/**
- * Extracts main content from HTML by removing scripts, styles, and non-content elements
- * Used as a fallback when Readability fails
+ * Enhanced content extraction that focuses on meaningful text content.
+ *
+ * Parses the HTML with JSDOM, removes every element matching a list of non-content
+ * selectors, then collects the trimmed text of <p>, <h1>-<h6>, <li>, <td>, <article>,
+ * <section> and attribute-free <div> elements. Entries of 10 characters or fewer and
+ * exact repeats of an earlier entry are dropped; the rest are joined with blank lines.
+ * Entries are grouped by selector in the order listed below, not in document order.
+ * When fewer than three entries survive, the text of the whole <body> is returned instead.
+ *
+ * NOTE(review): the text of an <article>, <section> or <div> includes the text of its
+ * descendants, so a paragraph collected on its own can appear again inside its
+ * container's entry; the duplicate check only catches identical strings.
+ * NOTE(review): declared async although the body contains no await.
* @param html The HTML content to process
- * @returns Extracted main text content
+ * @returns The joined entries, the <body> text fallback, or a fixed failure sentence if extraction throws
*/
-async function extractMainContent(html: string): Promise<string> {
+async function extractEnhancedContent(html: string): Promise<string> {
try {
- // Create a simple DOM to extract content
+ // Create DOM to extract content
const dom = new JSDOM(html, { runScripts: 'outside-only' }); // NOTE(review): nothing visible here runs a script through this DOM; confirm the option is needed
const document = dom.window.document;
- // Remove scripts, styles, and other non-content elements
- const elementsToRemove = ['script', 'style', 'iframe', 'noscript', 'svg', 'header', 'footer', 'nav', 'aside', 'ads', 'banner', 'form', 'button', 'input'];
-
- elementsToRemove.forEach(tag => {
- const elements = document.querySelectorAll(tag);
+ // Remove all non-content elements
+ const elementsToRemove = [
+ 'script',
+ 'style',
+ 'iframe',
+ 'noscript',
+ 'svg',
+ 'canvas',
+ 'header',
+ 'footer',
+ 'nav',
+ 'aside',
+ 'form',
+ 'button',
+ 'input',
+ 'select',
+ 'textarea',
+ 'meta',
+ 'link',
+ 'img',
+ 'video',
+ 'audio',
+ '.ad',
+ '.ads',
+ '.advertisement',
+ '.banner',
+ '.cookie',
+ '.popup',
+ '.modal',
+ '.newsletter',
+ '[role="banner"]',
+ '[role="navigation"]',
+ '[role="complementary"]',
+ ];
+
+ elementsToRemove.forEach(selector => {
+ const elements = document.querySelectorAll(selector);
elements.forEach(el => el.remove());
});
- // Try to find the main content container using common selectors
- const mainSelectors = ['main', 'article', '#content', '.content', '#main', '.main', '.post-content', '.article-content', '.entry-content'];
-
- let mainContent = '';
-
- // Try each selector to find main content
- for (const selector of mainSelectors) {
- const element = document.querySelector(selector);
- if (element && element.textContent && element.textContent.trim().length > 100) {
- mainContent = element.textContent;
- break;
+ // Get all text paragraphs with meaningful content
+ const contentElements = [
+ ...Array.from(document.querySelectorAll('p')),
+ ...Array.from(document.querySelectorAll('h1')),
+ ...Array.from(document.querySelectorAll('h2')),
+ ...Array.from(document.querySelectorAll('h3')),
+ ...Array.from(document.querySelectorAll('h4')),
+ ...Array.from(document.querySelectorAll('h5')),
+ ...Array.from(document.querySelectorAll('h6')),
+ ...Array.from(document.querySelectorAll('li')),
+ ...Array.from(document.querySelectorAll('td')),
+ ...Array.from(document.querySelectorAll('article')),
+ ...Array.from(document.querySelectorAll('section')),
+ ...Array.from(document.querySelectorAll('div:not([class]):not([id])')),
+ ];
+
+ // Extract text from content elements that have meaningful text
+ let contentParts: string[] = [];
+ contentElements.forEach(el => {
+ const text = el.textContent?.trim();
+ // Only include elements with substantial text (more than just a few characters)
+ if (text && text.length > 10 && !contentParts.includes(text)) {
+ contentParts.push(text);
}
- }
+ });
- // If no main content found with selectors, use body content
- if (!mainContent || mainContent.length < 200) {
- mainContent = document.body.textContent || '';
+ // If no significant content found with selective approach, fallback to body
+ if (contentParts.length < 3) {
+ return document.body.textContent || '';
}
- // Clean up the text
- return mainContent.replace(/\s+/g, ' ').replace(/\n+/g, '\n').trim();
+ return contentParts.join('\n\n');
} catch (error) {
- console.error('Error extracting main content:', error);
+ console.error('Error extracting enhanced content:', error);
return 'Failed to extract content from the webpage.'; // the caller receives this sentence in place of page text
}
}
+
/**
 * Cleans up extracted text to improve readability and focus on useful content.
 *
 * Removes common boilerplate phrases, email addresses, URLs, social media handles and
 * leftover HTML tags, then normalizes whitespace. Line breaks are preserved: runs of
 * spaces/tabs collapse to a single space, whitespace around a line break is trimmed,
 * and three or more consecutive line breaks collapse to one blank line, so paragraph
 * boundaries survive the cleanup.
 *
 * @param text The raw extracted text
 * @returns Cleaned and formatted text; an empty string when `text` is empty or missing
 */
function cleanupText(text: string): string {
    if (!text) return '';

    const stripped = text
        // Normalize Windows / old-Mac line endings so only '\n' remains
        .replace(/\r\n?/g, '\n')
        // Remove common boilerplate phrases (\s+ so a phrase wrapped across lines still matches)
        .replace(/cookie\s+policy|privacy\s+policy|terms\s+of\s+service|all\s+rights\s+reserved|copyright\s+©/gi, '')
        // Remove email addresses (must run before handle removal, which would eat '@domain')
        .replace(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g, '')
        // Remove URLs
        .replace(/https?:\/\/[^\s]+/g, '')
        // Remove social media handles
        .replace(/@[a-zA-Z0-9_]+/g, '')
        // Clean up any remaining HTML tags that might have been missed
        .replace(/<[^>]*>/g, '');

    // Whitespace is normalized last so the gaps left by the removals above are tidied too.
    return (
        stripped
            // Collapse runs of whitespace other than line breaks into one space
            .replace(/[^\S\n]+/g, ' ')
            // Drop the single space that may remain on either side of a line break
            .replace(/ ?\n ?/g, '\n')
            // Allow at most one blank line between paragraphs
            .replace(/\n{3,}/g, '\n\n')
            .trim()
    );
}