diff options
Diffstat (limited to 'src')
10 files changed, 215 insertions, 278 deletions
diff --git a/src/client/views/nodes/chatbot/agentsystem/Agent.ts b/src/client/views/nodes/chatbot/agentsystem/Agent.ts index 24471bf5b..86d40864e 100644 --- a/src/client/views/nodes/chatbot/agentsystem/Agent.ts +++ b/src/client/views/nodes/chatbot/agentsystem/Agent.ts @@ -63,10 +63,8 @@ export class Agent { */ constructor( _vectorstore: Vectorstore, - summaries: () => string, history: () => string, csvData: () => { filename: string; id: string; text: string }[], - getLinkedUrlDocId: (url: string) => string[], createImage: (result: Upload.FileInformation & Upload.InspectionResults, options: DocumentOptions) => void, createCSVInDash: (url: string, title: string, id: string, data: string) => void, docManager: AgentDocumentManager @@ -83,7 +81,7 @@ export class Agent { calculate: new CalculateTool(), rag: new RAGTool(this.vectorstore), dataAnalysis: new DataAnalysisTool(csvData), - websiteInfoScraper: new WebsiteInfoScraperTool(getLinkedUrlDocId), + websiteInfoScraper: new WebsiteInfoScraperTool(this._docManager), searchTool: new SearchTool(this._docManager), noTool: new NoTool(), //imageCreationTool: new ImageCreationTool(createImage), @@ -125,11 +123,8 @@ export class Agent { // Retrieve chat history and generate system prompt const chatHistory = this._history(); // Get document summaries directly from document manager - const documentSummaries = this._docManager.getAllDocumentSummaries(); - // Create a function that returns document summaries for the prompt - const getSummaries = () => documentSummaries; // Generate the system prompt with the summaries - const systemPrompt = getReactPrompt(Object.values(this.tools), getSummaries, chatHistory); + const systemPrompt = getReactPrompt(Object.values(this.tools), () => JSON.stringify(this._docManager.listDocs), chatHistory); // Initialize intermediate messages this.interMessages = [{ role: 'system', content: systemPrompt }]; diff --git a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx index 6349e554e..867e78860 100644 --- a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx +++ b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx @@ -121,16 +121,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { this.vectorstore = new Vectorstore(this.vectorstore_id, this.docManager); // Create an agent with the vectorstore - this.agent = new Agent( - this.vectorstore, - this.retrieveSummaries.bind(this), - this.retrieveFormattedHistory.bind(this), - this.retrieveCSVData.bind(this), - this.retrieveDocIds.bind(this), - this.createImageInDash.bind(this), - this.createCSVInDash.bind(this), - this.docManager - ); + this.agent = new Agent(this.vectorstore, this.retrieveFormattedHistory.bind(this), this.retrieveCSVData.bind(this), this.createImageInDash.bind(this), this.createCSVInDash.bind(this), this.docManager); // Add event listeners this.addScrollListener(); @@ -228,6 +219,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { } }; + //TODO: Update for new chunk_simpl on agentDocument /** * Adds a CSV file for analysis by sending it to OpenAI and generating a summary. * @param newLinkedDoc The linked document representing the CSV file. @@ -650,18 +642,15 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { citation: JSON.stringify(citation, null, 2), }); - // First try to find the document using the document manager's chunk ID lookup - const doc: Doc | undefined = this.docManager.getDocByChunkId(chunkId); - if (!doc) { - console.warn(`Document not found for citation with chunk_id: ${chunkId}`); - return; - } - // Get the simplified chunk using the document manager - const foundChunk = this.docManager.getSimplifiedChunkById(doc, chunkId); + const { foundChunk, doc } = this.docManager.getSimplifiedChunkById(chunkId); if (!foundChunk) { - console.warn(`Chunk not found in document for chunk ID: ${chunkId}`); - DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {}); + if (doc) { + console.warn(`Chunk not found in document, ${doc.id}, for chunk ID: ${chunkId}`); + DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {}); + } else { + console.warn(`Chunk not found for chunk ID: ${chunkId}`); + } return; } @@ -678,6 +667,10 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { } else if (foundChunk.chunkType === CHUNK_TYPE.TABLE || foundChunk.chunkType === CHUNK_TYPE.IMAGE) { this.handleOtherChunkTypes(foundChunk, citation, doc); } else { + if (doc.type === 'web') { + DocumentManager.Instance.showDocument(doc, { openLocation: OpenWhere.addRight }, () => {}); + return; + } // Show the chunk text in citation popup let chunkText = citation.direct_text || 'Text content not available'; this.showCitationPopup(chunkText); @@ -987,16 +980,6 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { } /** - * Getter that retrieves summaries of all linked documents. - */ - @computed - get summaries(): string { - // Use the document manager to get all summaries - console.log(this.docManager.listDocs); - return JSON.stringify(this.docManager.listDocs); - } - - /** * Getter that retrieves all linked CSV files for analysis. */ @computed get linkedCSVs(): { filename: string; id: string; text: string }[] { @@ -1022,7 +1005,8 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { // Other helper methods for retrieving document data and processing retrieveSummaries = (): string => { - return this.docManager.getAllDocumentSummaries(); + console.log(this.docManager.listDocs); + return JSON.stringify(this.docManager.listDocs); }; retrieveCSVData = () => { @@ -1033,10 +1017,6 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { return this.formattedHistory; }; - retrieveDocIds = (): string[] => { - return Array.from(this.docManager.docIds); - }; - /** * Handles follow-up questions when the user clicks on them. * Automatically sets the input value to the clicked follow-up question. diff --git a/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts b/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts index 5297292bf..405949c1e 100644 --- a/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts +++ b/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts @@ -408,7 +408,7 @@ export class DocumentMetadataTool extends BaseTool<DocumentMetadataToolParamsTyp const title = String(args.title); const data = String(args.data); - const id = this._docManager.createDocInDash(docType, data, { title: title }); + const id = await this._docManager.createDocInDash(docType, data, { title: title }); if (!id) { return [ diff --git a/src/client/views/nodes/chatbot/tools/SearchTool.ts b/src/client/views/nodes/chatbot/tools/SearchTool.ts index 53f5fc109..43f14ea83 100644 --- a/src/client/views/nodes/chatbot/tools/SearchTool.ts +++ b/src/client/views/nodes/chatbot/tools/SearchTool.ts @@ -48,19 +48,21 @@ export class SearchTool extends BaseTool<SearchToolParamsType> { query, max_results: this._max_results, })) as { results: { url: string; snippet: string }[] }; - const data = results.map((result: { url: string; snippet: string }) => { - // Create a web document with the URL - const id = this._docManager.createDocInDash('web', result.url, { - title: `Search Result: ${result.url}`, - text_html: result.snippet, - data_useCors: true, - }); + const data = await Promise.all( + results.map(async (result: { url: string; snippet: string }) => { + // Create a web document with the URL + const id = await this._docManager.createDocInDash('web', result.url, { + title: `Search Result: ${result.url}`, + text_html: result.snippet, + data_useCors: true, + }); - return { - type: 'text' as const, - text: `<chunk chunk_id="${id}" chunk_type="url"><url>${result.url}</url><overview>${result.snippet}</overview></chunk>`, - }; - }); + return { + type: 'text' as const, + text: `<chunk chunk_id="${id}" chunk_type="url"><url>${result.url}</url><overview>${result.snippet}</overview></chunk>`, + }; + }) + ); return data; } catch (error) { console.log(error); diff --git a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts index 3c7b4e3db..495a985cb 100644 --- a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts +++ b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts @@ -3,12 +3,14 @@ import { Networking } from '../../../../Network'; import { BaseTool } from './BaseTool'; import { Observation } from '../types/types'; import { ParametersType, ToolInfo } from '../types/tool_types'; - +import { AgentDocumentManager } from '../utils/AgentDocumentManager'; +import { Doc } from '../../../../../fields/Doc'; +import { StrCast, WebCast } from '../../../../../fields/Types'; const websiteInfoScraperToolParams = [ { - name: 'urls', + name: 'chunk_ids', type: 'string[]', - description: 'The URLs of the websites to scrape', + description: 'The chunk_ids of the urls to scrape from the SearchTool.', required: true, max_inputs: 3, }, @@ -66,11 +68,11 @@ const websiteInfoScraperToolInfo: ToolInfo<WebsiteInfoScraperToolParamsType> = { }; export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParamsType> { - private _getLinkedUrlDocId: (url: string) => string[]; + private _docManager: AgentDocumentManager; - constructor(getLinkedUrlDocIds: (url: string) => string[]) { + constructor(docManager: AgentDocumentManager) { super(websiteInfoScraperToolInfo); - this._getLinkedUrlDocId = getLinkedUrlDocIds; + this._docManager = docManager; } /** @@ -79,10 +81,13 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam * @param maxRetries Maximum number of retry attempts * @returns The scraped content or error message */ - private async scrapeWithRetry(url: string, maxRetries = 2): Promise<Observation> { + private async scrapeWithRetry(chunkDoc: Doc, maxRetries = 2): Promise<Observation> { let lastError = ''; let retryCount = 0; - + const url = WebCast(chunkDoc.data!)!.url.href; + console.log(url); + console.log(chunkDoc); + console.log(chunkDoc.data); // Validate URL format try { new URL(url); // This will throw if URL is invalid @@ -110,7 +115,6 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam } const { website_plain_text } = response as { website_plain_text: string }; - const id = this._getLinkedUrlDocId(url); // Validate content quality if (!website_plain_text) { @@ -126,7 +130,7 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam if (retryCount === maxRetries) { return { type: 'text', - text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\nNote: Limited content was retrieved from this URL.\n</chunk>`, + text: `<chunk chunk_id="${chunkDoc.id}" chunk_type="url">\n${website_plain_text}\nNote: Limited content was retrieved from this URL.\n</chunk>`, } as Observation; } @@ -138,7 +142,7 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam // Process and return content if it looks good return { type: 'text', - text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\n</chunk>`, + text: `<chunk chunk_id="${chunkDoc.id}" chunk_type="url">\n${website_plain_text}\n</chunk>`, } as Observation; } catch (error) { lastError = error instanceof Error ? error.message : 'Unknown error'; @@ -156,10 +160,10 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam } async execute(args: ParametersType<WebsiteInfoScraperToolParamsType>): Promise<Observation[]> { - const urls = args.urls; + const chunk_ids = args.chunk_ids; // Create an array of promises, each one handling a website scrape for a URL - const scrapingPromises = urls.map(url => this.scrapeWithRetry(url)); + const scrapingPromises = chunk_ids.map(chunk_id => this.scrapeWithRetry(this._docManager.getDocument(chunk_id)!)); // Wait for all scraping promises to resolve const results = await Promise.all(scrapingPromises); diff --git a/src/client/views/nodes/chatbot/tools/WikipediaTool.ts b/src/client/views/nodes/chatbot/tools/WikipediaTool.ts index ee815532a..ec5d83e52 100644 --- a/src/client/views/nodes/chatbot/tools/WikipediaTool.ts +++ b/src/client/views/nodes/chatbot/tools/WikipediaTool.ts @@ -32,7 +32,7 @@ export class WikipediaTool extends BaseTool<WikipediaToolParamsType> { async execute(args: ParametersType<WikipediaToolParamsType>): Promise<Observation[]> { try { - const { text } = await Networking.PostToServer('/getWikipediaSummary', { title: args.title }); + const { text } = (await Networking.PostToServer('/getWikipediaSummary', { title: args.title })) as { text: string }; const id = uuidv4(); const url = `https://en.wikipedia.org/wiki/${args.title.replace(/ /g, '_')}`; this._addLinkedUrlDoc(url, id); diff --git a/src/client/views/nodes/chatbot/types/types.ts b/src/client/views/nodes/chatbot/types/types.ts index 90b5e7e11..0d1804b2d 100644 --- a/src/client/views/nodes/chatbot/types/types.ts +++ b/src/client/views/nodes/chatbot/types/types.ts @@ -101,6 +101,7 @@ export interface RAGChunk { export interface SimplifiedChunk { chunkId: string; + doc_id: string; startPage?: number; endPage?: number; location?: string; diff --git a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts index c8a6bb16b..5a09b945b 100644 --- a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts +++ b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts @@ -13,7 +13,7 @@ import { LinkManager, UPDATE_SERVER_CACHE } from '../../../../util/LinkManager'; import { DocumentView } from '../../DocumentView'; import { ChatBox, parsedDoc } from '../chatboxcomponents/ChatBox'; import { supportedDocTypes } from '../types/tool_types'; -import { CHUNK_TYPE, RAGChunk } from '../types/types'; +import { CHUNK_TYPE, RAGChunk, SimplifiedChunk } from '../types/types'; /** * Interface representing a document in the freeform view @@ -31,7 +31,7 @@ export class AgentDocumentManager { private chatBox: ChatBox; private chatBoxDocument: Doc | null = null; private fieldMetadata: Record<string, any> = {}; - @observable private documentIdsFromChunkIds: ObservableMap<string, string>; + @observable private simplifiedChunks: ObservableMap<string, SimplifiedChunk>; /** * Creates a new DocumentManager @@ -40,17 +40,21 @@ export class AgentDocumentManager { constructor(chatBox: ChatBox) { makeObservable(this); const agentDoc = DocCast(chatBox.Document.agentDocument) ?? new Doc(); - const chunkIds = DocCast(agentDoc.chunkIds) ?? new Doc(); + const chunk_simpl = DocCast(agentDoc.chunk_simpl) ?? new Doc(); agentDoc.title = chatBox.Document.title + '_agentDocument'; - chunkIds.title = '_chunkIds'; + chunk_simpl.title = '_chunk_simpl'; chatBox.Document.agentDocument = agentDoc; - DocCast(chatBox.Document.agentDocument)!.chunkIds = chunkIds; - this.documentIdsFromChunkIds = StrListCast(chunkIds.mapping).reduce((mapping, content) => { - const [chunkId, docId] = content.split(':'); - mapping.set(chunkId, docId); + DocCast(chatBox.Document.agentDocument)!.chunk_simpl = chunk_simpl; + + this.simplifiedChunks = StrListCast(chunk_simpl.mapping).reduce((mapping, chunks) => { + StrListCast(chunks).forEach(chunk => { + const parsed = JSON.parse(StrCast(chunk)); + mapping.set(parsed.chunkId, parsed); + }); return mapping; - }, new ObservableMap<string, string>()); + }, new ObservableMap<string, SimplifiedChunk>()); + this.documentsById = StrListCast(agentDoc.mapping).reduce((mapping, content) => { const [id, layoutId, docId] = content.split(':'); const layoutDoc = DocServer.GetCachedRefField(layoutId); @@ -76,14 +80,10 @@ export class AgentDocumentManager { //{ fireImmediately: true } ); reaction( - () => this.documentIdsFromChunkIds.values(), + () => this.simplifiedChunks.values(), () => { if (this.chatBoxDocument && DocCast(this.chatBoxDocument.agentDocument)) { - // Store the mapping with chunkId:docId format for consistency - const chunkIdsDoc = DocCast(DocCast(this.chatBoxDocument.agentDocument)!.chunkIds); - if (chunkIdsDoc) { - chunkIdsDoc.mapping = new List<string>(Array.from(this.documentIdsFromChunkIds.entries()).map(([chunkId, docId]) => `${chunkId}:${docId}`)); - } + DocCast(DocCast(this.chatBoxDocument.agentDocument)!.chunk_simpl)!.mapping = new List<string>(Array.from(this.simplifiedChunks.values()).map(chunk => JSON.stringify(chunk))); } } //{ fireImmediately: true } @@ -831,7 +831,8 @@ export class AgentDocumentManager { * @param options Optional configuration options * @returns The ID of the created document */ - public createDocInDash(docType: string, data: string, options?: any): string { + + public async createDocInDash(docType: string, data: string, options?: any): Promise<string> { // Validate doc_type if (!this.isValidDocType(docType)) { throw new Error(`Invalid document type: ${docType}`); @@ -877,14 +878,15 @@ export class AgentDocumentManager { // Create link and add it to the document system const linkDoc = Docs.Create.LinkDocument(this.chatBoxDocument, doc); LinkManager.Instance.addLink(linkDoc); - - // Add document to view - this.chatBox._props.addDocument?.(doc); - - // Show document - defer actual display to prevent immediate resource loading - setTimeout(() => { - DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {}); - }, 100); + if (doc.type !== 'web') { + // Add document to view + this.chatBox._props.addDocument?.(doc); + + // Show document - defer actual display to prevent immediate resource loading + setTimeout(() => { + DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {}); + }, 100); + } } }); @@ -986,88 +988,19 @@ export class AgentDocumentManager { } /** - * Registers chunk IDs associated with a document in the manager - * @param docId The parent document ID - * @param chunkIds Array of chunk IDs associated with this document - */ - @action - public registerChunkIds(docId: string, chunkIds: string[]): void { - // Get the document if it exists - const docInfo = this.documentsById.get(docId); - if (!docInfo) { - console.warn(`Cannot register chunks for unknown document ID: ${docId}`); - return; - } - - // Store chunk IDs on the document for future reference - const doc = docInfo.layoutDoc; - if (!doc.chunk_ids) { - doc.chunk_ids = JSON.stringify(chunkIds); - } else { - // Merge with existing chunk IDs if they exist - const existingIds = JSON.parse(doc.chunk_ids as string); - const updatedIds = [...new Set([...existingIds, ...chunkIds])]; // Remove duplicates - doc.chunk_ids = JSON.stringify(updatedIds); - } - for (const chunkId of chunkIds) { - // Ensure each chunk ID can be linked back to its parent document - // Store a mapping from chunk ID to parent document ID - // This allows us to easily find a document by any of its chunk IDs - if (!this.documentIdsFromChunkIds.has(chunkId) && doc) { - this.documentIdsFromChunkIds.set(chunkId, doc[Id]); - } - } - } - - /** - * Gets a document ID by a chunk ID - * @param chunkId The chunk ID to look up - * @returns The parent document ID if found - */ - public getDocByChunkId(chunkId: string): Doc | undefined { - // First, look up the document ID using the chunk ID mapping - const docId = this.documentIdsFromChunkIds.get(chunkId); - console.log('this.documentIdsFromChunkIds', this.documentIdsFromChunkIds); - console.log('docId', docId); - if (!docId) { - if (this.documentsById.has(chunkId)) { - return this.documentsById.get(chunkId)?.layoutDoc; - } else { - console.error('No document found for chunkId and docId', chunkId); - return undefined; - } - } - // Then get the document using the document ID - const docInfo = this.documentsById.get(docId); - if (docInfo) { - return docInfo.layoutDoc; - } - console.error('No document found for docId', docId); - return undefined; - } - - /** * Adds simplified chunks to a document for citation handling * @param doc The document to add simplified chunks to * @param chunks Array of full RAG chunks to simplify * @param docType The type of document (e.g., 'pdf', 'video', 'audio', etc.) * @returns The updated document with simplified chunks */ - public addSimplifiedChunks(doc: Doc, chunks: RAGChunk[], docType: string): Doc { - if (!doc) { - console.error('Cannot add simplified chunks to null document'); - return doc; - } - - // Initialize empty chunks array if not exists - if (!doc.chunk_simpl) { - doc.chunk_simpl = JSON.stringify({ chunks: [] }); - } - + @action + public addSimplifiedChunks(chunks: RAGChunk[], docType: string) { + console.log('chunks', chunks, 'simplifiedChunks', this.simplifiedChunks); // Create array of simplified chunks based on document type - const simplifiedChunks = chunks.map(chunk => { + for (const chunk of chunks) { // Common properties across all chunk types - const baseChunk = { + const baseChunk: SimplifiedChunk = { chunkId: chunk.id, text: chunk.metadata.text, doc_id: chunk.metadata.doc_id, @@ -1076,38 +1009,33 @@ export class AgentDocumentManager { // Add type-specific properties if (docType === 'video' || docType === 'audio') { - return { + this.simplifiedChunks.set(chunk.id, { ...baseChunk, start_time: chunk.metadata.start_time, end_time: chunk.metadata.end_time, indexes: chunk.metadata.indexes, chunkType: docType === 'video' ? CHUNK_TYPE.VIDEO : CHUNK_TYPE.AUDIO, - }; + } as SimplifiedChunk); } else if (docType === 'pdf') { - return { + this.simplifiedChunks.set(chunk.id, { ...baseChunk, startPage: chunk.metadata.start_page, endPage: chunk.metadata.end_page, location: chunk.metadata.location, - }; + } as SimplifiedChunk); } else if (docType === 'csv') { - return { + this.simplifiedChunks.set(chunk.id, { ...baseChunk, rowStart: (chunk.metadata as any).row_start, rowEnd: (chunk.metadata as any).row_end, colStart: (chunk.metadata as any).col_start, colEnd: (chunk.metadata as any).col_end, - }; + } as SimplifiedChunk); } else { // Default for other document types - return baseChunk; + this.simplifiedChunks.set(chunk.id, baseChunk as SimplifiedChunk); } - }); - console.log('simplifiedChunks', simplifiedChunks); - // Update the document with all simplified chunks at once - doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks }); - - return doc; + } } /** @@ -1116,21 +1044,10 @@ export class AgentDocumentManager { * @param chunkId The ID of the chunk to retrieve * @returns The simplified chunk if found, undefined otherwise */ - public getSimplifiedChunkById(doc: Doc, chunkId: string): any | undefined { - let chunks: any[] = []; - if (!doc || !doc.chunk_simpl) { - chunks = []; - console.warn('No chunk found for chunkId', chunkId, '. Checking if document exists in documentsById.'); - return []; - } - try { - const parsed = JSON.parse(StrCast(doc.chunk_simpl)); - chunks = parsed.chunks || []; - } catch (e) { - console.error('Error parsing simplified chunks:', e); - return []; - } - return chunks.find(chunk => chunk.chunkId === chunkId); + public getSimplifiedChunkById(chunkId: string): any | undefined { + console.log('chunkId', chunkId, 'simplifiedChunks', this.simplifiedChunks); + console.log('doc', this.getDocument(this.simplifiedChunks.get(chunkId)?.doc_id || '')); + return { foundChunk: this.simplifiedChunks.get(chunkId), doc: this.getDocument(this.simplifiedChunks.get(chunkId)?.doc_id || '') }; } /** @@ -1150,27 +1067,4 @@ export class AgentDocumentManager { return []; } } - - /** - * Gets all document summaries combined into a single string - * @returns String containing all document summaries - */ - public getAllDocumentSummaries(): string { - const summaries = Array.from(this.documentsById.keys()) - .map(id => { - const doc = this.getDocument(id); - if (doc) { - // Try to get summary from either the document or its data document - const summary = doc.summary || (doc[DocData] && doc[DocData].summary); - if (summary) { - return StrCast(summary); - } - } - return null; - }) - .filter(Boolean) - .join('\n\n'); - - return summaries; - } } diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts index 1349df483..f1fae6f11 100644 --- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts +++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts @@ -148,10 +148,6 @@ export class Vectorstore { // Generate chunk IDs upfront so we can register them const chunkIds = segmentedTranscript.map(() => uuidv4()); - - // Register all chunk IDs with the document manager - this.docManager.registerChunkIds(doc_id, chunkIds); - // Add transcript and embeddings to metadata result = { doc_id, @@ -185,7 +181,7 @@ export class Vectorstore { doc.segmented_transcript = JSON.stringify(segmentedTranscript); // Use doc manager to add simplified chunks const docType = local_file_path.endsWith('.mp3') ? 'audio' : 'video'; - this.docManager.addSimplifiedChunks(doc, result.chunks, docType); + this.docManager.addSimplifiedChunks(result.chunks, docType); } else { // Process regular document console.log('Processing regular document...'); @@ -216,13 +212,10 @@ export class Vectorstore { console.log('doc_id in vectorstore', result.doc_id, 'does not match doc_id in doc', doc[Id]); } - // Register chunks with the document manager - this.docManager.registerChunkIds(result.doc_id, chunkIds); - // Use doc manager to add simplified chunks - determine document type from file extension const fileExt = path.extname(local_file_path).toLowerCase(); const docType = fileExt === '.pdf' ? 'pdf' : fileExt === '.csv' ? 'csv' : 'text'; - this.docManager.addSimplifiedChunks(doc, result.chunks, docType); + this.docManager.addSimplifiedChunks(result.chunks, docType); doc.summary = result.summary; doc.ai_purpose = result.purpose; @@ -351,16 +344,6 @@ export class Vectorstore { }, } as RAGChunk; - // Ensure the document manager knows about this chunk - // This is important for maintaining backwards compatibility - if (chunk.id && !this.docManager.getDocByChunkId(chunk.id)) { - // If the chunk ID isn't registered but we have a doc_id in metadata - if (chunk.metadata.doc_id && this.docManager.has(chunk.metadata.doc_id)) { - // Register the chunk with its parent document - this.docManager.registerChunkIds(chunk.metadata.doc_id, [chunk.id]); - } - } - return chunk; }); diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts index 378f14094..b7ce4f663 100644 --- a/src/server/ApiManagers/AssistantManager.ts +++ b/src/server/ApiManagers/AssistantManager.ts @@ -514,30 +514,37 @@ export default class AssistantManager extends ApiManager { await browser.close(); browser = null; - // Use a try-catch block specifically for JSDOM parsing + let extractedText = ''; + + // First try with Readability try { // Parse HTML content using JSDOM const dom = new JSDOM(htmlContent, { url }); // Extract readable content using Mozilla's Readability API - const reader = new Readability(dom.window.document); + const reader = new Readability(dom.window.document, { + // Readability configuration to focus on text content + charThreshold: 100, + keepClasses: false, + }); const article = reader.parse(); - if (article) { - const plainText = article.textContent; - res.send({ website_plain_text: plainText }); + if (article && article.textContent) { + extractedText = article.textContent; } else { - // If Readability fails, fallback to extracting main content - const mainContent = await extractMainContent(htmlContent); - res.send({ website_plain_text: mainContent }); + // If Readability doesn't return useful content, try alternate method + extractedText = await extractEnhancedContent(htmlContent); } } catch (parsingError) { - console.error('Error parsing website content:', parsingError); - - // Fallback to a simplified extraction method - const mainContent = await extractMainContent(htmlContent); - res.send({ website_plain_text: mainContent }); + console.error('Error parsing website content with Readability:', parsingError); + // Fallback to enhanced content extraction + extractedText = await extractEnhancedContent(htmlContent); } + + // Clean up the extracted text + extractedText = cleanupText(extractedText); + + res.send({ website_plain_text: extractedText }); } catch (error) { console.error('Error scraping website:', error); @@ -985,48 +992,119 @@ function spawnPythonProcess(jobId: string, file_path: string, doc_id: string) { } /** - * Extracts main content from HTML by removing scripts, styles, and non-content elements - * Used as a fallback when Readability fails + * Enhanced content extraction that focuses on meaningful text content. * @param html The HTML content to process - * @returns Extracted main text content + * @returns Extracted and cleaned text content */ -async function extractMainContent(html: string): Promise<string> { +async function extractEnhancedContent(html: string): Promise<string> { try { - // Create a simple DOM to extract content + // Create DOM to extract content const dom = new JSDOM(html, { runScripts: 'outside-only' }); const document = dom.window.document; - // Remove scripts, styles, and other non-content elements - const elementsToRemove = ['script', 'style', 'iframe', 'noscript', 'svg', 'header', 'footer', 'nav', 'aside', 'ads', 'banner', 'form', 'button', 'input']; - - elementsToRemove.forEach(tag => { - const elements = document.querySelectorAll(tag); + // Remove all non-content elements + const elementsToRemove = [ + 'script', + 'style', + 'iframe', + 'noscript', + 'svg', + 'canvas', + 'header', + 'footer', + 'nav', + 'aside', + 'form', + 'button', + 'input', + 'select', + 'textarea', + 'meta', + 'link', + 'img', + 'video', + 'audio', + '.ad', + '.ads', + '.advertisement', + '.banner', + '.cookie', + '.popup', + '.modal', + '.newsletter', + '[role="banner"]', + '[role="navigation"]', + '[role="complementary"]', + ]; + + elementsToRemove.forEach(selector => { + const elements = document.querySelectorAll(selector); elements.forEach(el => el.remove()); }); - // Try to find the main content container using common selectors - const mainSelectors = ['main', 'article', '#content', '.content', '#main', '.main', '.post-content', '.article-content', '.entry-content']; - - let mainContent = ''; - - // Try each selector to find main content - for (const selector of mainSelectors) { - const element = document.querySelector(selector); - if (element && element.textContent && element.textContent.trim().length > 100) { - mainContent = element.textContent; - break; + // Get all text paragraphs with meaningful content + const contentElements = [ + ...Array.from(document.querySelectorAll('p')), + ...Array.from(document.querySelectorAll('h1')), + ...Array.from(document.querySelectorAll('h2')), + ...Array.from(document.querySelectorAll('h3')), + ...Array.from(document.querySelectorAll('h4')), + ...Array.from(document.querySelectorAll('h5')), + ...Array.from(document.querySelectorAll('h6')), + ...Array.from(document.querySelectorAll('li')), + ...Array.from(document.querySelectorAll('td')), + ...Array.from(document.querySelectorAll('article')), + ...Array.from(document.querySelectorAll('section')), + ...Array.from(document.querySelectorAll('div:not([class]):not([id])')), + ]; + + // Extract text from content elements that have meaningful text + let contentParts: string[] = []; + contentElements.forEach(el => { + const text = el.textContent?.trim(); + // Only include elements with substantial text (more than just a few characters) + if (text && text.length > 10 && !contentParts.includes(text)) { + contentParts.push(text); } - } + }); - // If no main content found with selectors, use body content - if (!mainContent || mainContent.length < 200) { - mainContent = document.body.textContent || ''; + // If no significant content found with selective approach, fallback to body + if (contentParts.length < 3) { + return document.body.textContent || ''; } - // Clean up the text - return mainContent.replace(/\s+/g, ' ').replace(/\n+/g, '\n').trim(); + return contentParts.join('\n\n'); } catch (error) { - console.error('Error extracting main content:', error); + console.error('Error extracting enhanced content:', error); return 'Failed to extract content from the webpage.'; } } + +/** + * Cleans up extracted text to improve readability and focus on useful content. + * @param text The raw extracted text + * @returns Cleaned and formatted text + */ +function cleanupText(text: string): string { + if (!text) return ''; + + return ( + text + // Remove excessive whitespace and normalize line breaks + .replace(/\s+/g, ' ') + .replace(/\n\s*\n\s*\n+/g, '\n\n') + // Remove common boilerplate phrases + .replace(/cookie policy|privacy policy|terms of service|all rights reserved|copyright ©/gi, '') + // Remove email addresses + .replace(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g, '') + // Remove URLs + .replace(/https?:\/\/[^\s]+/g, '') + // Remove social media handles + .replace(/@[a-zA-Z0-9_]+/g, '') + // Clean up any remaining HTML tags that might have been missed + .replace(/<[^>]*>/g, '') + // Fix spacing issues after cleanup + .replace(/ +/g, ' ') + .trim() + ); +} |