diff options
| author | A.J. Shulman <Shulman.aj@gmail.com> | 2025-04-27 14:57:39 -0400 |
|---|---|---|
| committer | A.J. Shulman <Shulman.aj@gmail.com> | 2025-04-27 14:57:39 -0400 |
| commit | 393b7f8286422c933102449eba1ba82874a48896 (patch) | |
| tree | c34cd5dffc7306a66fcfe54c81d8656c341facb9 /src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts | |
| parent | 67a7996278ce176e227393fa410e7afc80228a83 (diff) | |
improved consistency across doc types and parsing
Diffstat (limited to 'src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts')
| -rw-r--r-- | src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts | 234 |
1 files changed, 225 insertions, 9 deletions
diff --git a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts index c3beebcde..cff8380db 100644 --- a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts +++ b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts @@ -14,6 +14,8 @@ import { parsedDoc } from '../chatboxcomponents/ChatBox'; import { faThumbTackSlash } from '@fortawesome/free-solid-svg-icons'; import { DocumentManager } from '../../../../util/DocumentManager'; import { DocumentView } from '../../DocumentView'; +import { RAGChunk, CHUNK_TYPE } from '../types/types'; +import { runInAction } from 'mobx'; /** * Interface representing a document in the freeform view @@ -869,20 +871,43 @@ export class AgentDocumentManager { _layout_autoHeight: true, }; - // Use the chatBox's createDocInDash method to create and link the document + // Additional handling for web documents + if (docType === 'web') { + // For web documents, don't sanitize the URL here + // Instead, set properties to handle content safely when loaded + simpleDoc._disable_resource_loading = true; + simpleDoc._sandbox_iframe = true; + simpleDoc.data_useCors = true; + + // Specify a more permissive sandbox to allow content to render properly + // but still maintain security + simpleDoc._iframe_sandbox = 'allow-same-origin allow-scripts allow-popups allow-forms'; + } + + // Use the chatBox's createDocInDash method to create the document if (!this.chatBox) { throw new Error('ChatBox instance not available for creating document'); } - const linkAndShowDoc = (doc: Opt<Doc>) => { - if (doc) { - LinkManager.Instance.addLink(Docs.Create.LinkDocument(this.chatBoxDocument!, doc)); - this.chatBox._props.addDocument?.(doc); - DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {}); - } - }; + const doc = this.chatBox.whichDoc(simpleDoc, false); if (doc) { - linkAndShowDoc(doc); + // Use MobX runInAction to properly modify observable state + 
runInAction(() => { + if (this.chatBoxDocument && doc) { + // Create link and add it to the document system + const linkDoc = Docs.Create.LinkDocument(this.chatBoxDocument, doc); + LinkManager.Instance.addLink(linkDoc); + + // Add document to view + this.chatBox._props.addDocument?.(doc); + + // Show document - defer actual display to prevent immediate resource loading + setTimeout(() => { + DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {}); + }, 100); + } + }); + const id = this.processDocument(doc); return id; } else { @@ -893,6 +918,62 @@ export class AgentDocumentManager { } } + /** + * Sanitizes web content to prevent errors with external resources + * @param content The web content to sanitize + * @returns Sanitized content + */ + private sanitizeWebContent(content: string): string { + if (!content) return content; + + try { + // Replace problematic resource references that might cause errors + const sanitized = content + // Remove preload links that might cause errors + .replace(/<link[^>]*rel=["']preload["'][^>]*>/gi, '') + // Remove map file references + .replace(/\/\/# sourceMappingURL=.*\.map/gi, '') + // Remove external CSS map files references + .replace(/\/\*# sourceMappingURL=.*\.css\.map.*\*\//gi, '') + // Add sandbox to iframes + .replace(/<iframe/gi, '<iframe sandbox="allow-same-origin" loading="lazy"') + // Prevent automatic resource loading for images + .replace(/<img/gi, '<img loading="lazy"') + // Prevent automatic resource loading for scripts + .replace(/<script/gi, '<script type="text/disabled"') + // Handle invalid URIs by converting relative URLs to absolute ones + .replace(/href=["'](\/[^"']+)["']/gi, (match, p1) => { + // Only handle relative URLs starting with / + if (p1.startsWith('/')) { + return `href="#disabled-link"`; + } + return match; + }) + // Prevent automatic loading of CSS + .replace(/<link[^>]*rel=["']stylesheet["'][^>]*href=["']([^"']+)["']/gi, (match, href) => `<link rel="prefetch" 
data-original-href="${href}" />`); + + // Wrap the content in a sandboxed container + return ` + <div class="sandboxed-web-content"> + <style> + /* Override styles to prevent external resource loading */ + @font-face { font-family: 'disabled'; src: local('Arial'); } + * { font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, 'Open Sans', 'Helvetica Neue', sans-serif !important; } + img, iframe, frame, embed, object { max-width: 100%; } + </style> + ${sanitized} + </div>`; + } catch (e) { + console.warn('Error sanitizing web content:', e); + // Fall back to a safe container with the content as text + return ` + <div class="sandboxed-web-content"> + <p>Content could not be safely displayed. Raw content:</p> + <pre>${content.replace(/</g, '&lt;').replace(/>/g, '&gt;')}</pre> + </div>`; + } + } + public has(docId: string) { return this.documentsById.has(docId); } @@ -988,4 +1069,139 @@ export class AgentDocumentManager { } return undefined; } + + /** + * Adds simplified chunks to a document for citation handling + * @param doc The document to add simplified chunks to + * @param chunks Array of full RAG chunks to simplify + * @param docType The type of document (e.g., 'pdf', 'video', 'audio', etc.) 
+ * @returns The updated document with simplified chunks + */ + public addSimplifiedChunks(doc: Doc, chunks: RAGChunk[], docType: string): Doc { + if (!doc) { + console.error('Cannot add simplified chunks to null document'); + return doc; + } + + // Initialize empty chunks array if not exists + if (!doc.chunk_simpl) { + doc.chunk_simpl = JSON.stringify({ chunks: [] }); + } + + // Create array of simplified chunks based on document type + const simplifiedChunks = chunks.map(chunk => { + // Common properties across all chunk types + const baseChunk = { + chunkId: chunk.id, + text: chunk.metadata.text, + doc_id: chunk.metadata.doc_id, + chunkType: chunk.metadata.type || CHUNK_TYPE.TEXT, + }; + + // Add type-specific properties + if (docType === 'video' || docType === 'audio') { + return { + ...baseChunk, + start_time: chunk.metadata.start_time, + end_time: chunk.metadata.end_time, + indexes: chunk.metadata.indexes, + chunkType: docType === 'video' ? CHUNK_TYPE.VIDEO : CHUNK_TYPE.AUDIO, + }; + } else if (docType === 'pdf') { + return { + ...baseChunk, + startPage: chunk.metadata.start_page, + endPage: chunk.metadata.end_page, + location: chunk.metadata.location, + }; + } else if (docType === 'csv') { + return { + ...baseChunk, + rowStart: (chunk.metadata as any).row_start, + rowEnd: (chunk.metadata as any).row_end, + colStart: (chunk.metadata as any).col_start, + colEnd: (chunk.metadata as any).col_end, + }; + } else { + // Default for other document types + return baseChunk; + } + }); + + // Update the document with all simplified chunks at once + doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks }); + + return doc; + } + + /** + * Gets the simplified chunks from a document + * @param doc The document to get simplified chunks from + * @returns Array of simplified chunks or empty array if none exist + */ + public getSimplifiedChunks(doc: Doc): any[] { + if (!doc || !doc.chunk_simpl) { + return []; + } + + try { + const parsed = 
JSON.parse(StrCast(doc.chunk_simpl)); + return parsed.chunks || []; + } catch (e) { + console.error('Error parsing simplified chunks:', e); + return []; + } + } + + /** + * Gets a specific simplified chunk by ID + * @param doc The document containing chunks + * @param chunkId The ID of the chunk to retrieve + * @returns The simplified chunk if found, undefined otherwise + */ + public getSimplifiedChunkById(doc: Doc, chunkId: string): any | undefined { + const chunks = this.getSimplifiedChunks(doc); + return chunks.find(chunk => chunk.chunkId === chunkId); + } + + /** + * Gets the original segments from a media document + * @param doc The document containing original media segments + * @returns Array of media segments or empty array if none exist + */ + public getOriginalSegments(doc: Doc): any[] { + if (!doc || !doc.original_segments) { + return []; + } + + try { + return JSON.parse(StrCast(doc.original_segments)) || []; + } catch (e) { + console.error('Error parsing original segments:', e); + return []; + } + } + + /** + * Gets all document summaries combined into a single string + * @returns String containing all document summaries + */ + public getAllDocumentSummaries(): string { + const summaries = Array.from(this.documentsById.keys()) + .map(id => { + const doc = this.getDocument(id); + if (doc) { + // Try to get summary from either the document or its data document + const summary = doc.summary || (doc[DocData] && doc[DocData].summary); + if (summary) { + return StrCast(summary); + } + } + return null; + }) + .filter(Boolean) + .join('\n\n'); + + return summaries; + } } |
