aboutsummaryrefslogtreecommitdiff
path: root/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
diff options
context:
space:
mode:
authorA.J. Shulman <Shulman.aj@gmail.com>2025-04-27 14:57:39 -0400
committerA.J. Shulman <Shulman.aj@gmail.com>2025-04-27 14:57:39 -0400
commit393b7f8286422c933102449eba1ba82874a48896 (patch)
treec34cd5dffc7306a66fcfe54c81d8656c341facb9 /src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
parent67a7996278ce176e227393fa410e7afc80228a83 (diff)
improved consistency across doc types and parsing
Diffstat (limited to 'src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts')
-rw-r--r--src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts234
1 files changed, 225 insertions, 9 deletions
diff --git a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
index c3beebcde..cff8380db 100644
--- a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
+++ b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
@@ -14,6 +14,8 @@ import { parsedDoc } from '../chatboxcomponents/ChatBox';
import { faThumbTackSlash } from '@fortawesome/free-solid-svg-icons';
import { DocumentManager } from '../../../../util/DocumentManager';
import { DocumentView } from '../../DocumentView';
+import { RAGChunk, CHUNK_TYPE } from '../types/types';
+import { runInAction } from 'mobx';
/**
* Interface representing a document in the freeform view
@@ -869,20 +871,43 @@ export class AgentDocumentManager {
_layout_autoHeight: true,
};
- // Use the chatBox's createDocInDash method to create and link the document
+ // Additional handling for web documents
+ if (docType === 'web') {
+ // For web documents, don't sanitize the URL here
+ // Instead, set properties to handle content safely when loaded
+ simpleDoc._disable_resource_loading = true;
+ simpleDoc._sandbox_iframe = true;
+ simpleDoc.data_useCors = true;
+
+ // Specify a more permissive sandbox to allow content to render properly
+ // but still maintain security
+ simpleDoc._iframe_sandbox = 'allow-same-origin allow-scripts allow-popups allow-forms';
+ }
+
+ // Use the chatBox's createDocInDash method to create the document
if (!this.chatBox) {
throw new Error('ChatBox instance not available for creating document');
}
- const linkAndShowDoc = (doc: Opt<Doc>) => {
- if (doc) {
- LinkManager.Instance.addLink(Docs.Create.LinkDocument(this.chatBoxDocument!, doc));
- this.chatBox._props.addDocument?.(doc);
- DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {});
- }
- };
+
const doc = this.chatBox.whichDoc(simpleDoc, false);
if (doc) {
- linkAndShowDoc(doc);
+ // Use MobX runInAction to properly modify observable state
+ runInAction(() => {
+ if (this.chatBoxDocument && doc) {
+ // Create link and add it to the document system
+ const linkDoc = Docs.Create.LinkDocument(this.chatBoxDocument, doc);
+ LinkManager.Instance.addLink(linkDoc);
+
+ // Add document to view
+ this.chatBox._props.addDocument?.(doc);
+
+ // Show document - defer actual display to prevent immediate resource loading
+ setTimeout(() => {
+ DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {});
+ }, 100);
+ }
+ });
+
const id = this.processDocument(doc);
return id;
} else {
@@ -893,6 +918,62 @@ export class AgentDocumentManager {
}
}
+    /**
+     * Rewrites fetched HTML so that it triggers fewer external resource loads,
+     * then wraps the result in a `sandboxed-web-content` container.
+     *
+     * Rewrites applied, in order:
+     *  1. `<link rel="preload">` tags are removed.
+     *  2. JS and CSS `sourceMappingURL` comments are removed.
+     *  3. `<iframe>` gets `sandbox="allow-same-origin" loading="lazy"`, `<img>` gets
+     *     `loading="lazy"`, `<script>` gets `type="text/disabled"`. The attributes are
+     *     inserted first in the tag; for duplicated attributes HTML parsers keep the
+     *     first occurrence, so these take precedence over any the page supplied.
+     *  4. Stylesheet `<link>` tags are replaced by an inert `<link rel="prefetch">`
+     *     that records the original URL in `data-original-href`.
+     *  5. `href` values starting with `/` are replaced by `#disabled-link`.
+     *
+     * NOTE(review): this is best-effort regex rewriting, not a full HTML sanitizer.
+     *
+     * @param content The web content to sanitize; falsy input is returned unchanged
+     * @returns The rewritten content inside the wrapper, or an escaped plain-text
+     *          fallback if rewriting throws
+     */
+    private sanitizeWebContent(content: string): string {
+        if (!content) return content;
+
+        try {
+            const sanitized = content
+                // Remove preload links that might cause errors
+                .replace(/<link[^>]*rel=["']preload["'][^>]*>/gi, '')
+                // Remove map file references
+                .replace(/\/\/# sourceMappingURL=.*\.map/gi, '')
+                // Remove external CSS map files references
+                .replace(/\/\*# sourceMappingURL=.*\.css\.map.*\*\//gi, '')
+                // Add sandbox to iframes
+                .replace(/<iframe/gi, '<iframe sandbox="allow-same-origin" loading="lazy"')
+                // Defer image loading
+                .replace(/<img/gi, '<img loading="lazy"')
+                // Give scripts a type the browser will not execute
+                .replace(/<script/gi, '<script type="text/disabled"')
+                // Neutralize stylesheets. The whole tag is matched (including its closing
+                // `>`) so no fragment of the original tag is left behind, and the href is
+                // looked up inside the tag so attribute order does not matter. This runs
+                // before hrefs are disabled so the original URL is what gets recorded.
+                .replace(/<link\b[^>]*\brel=["']stylesheet["'][^>]*>/gi, tag => {
+                    const href = /\shref=["']([^"']+)["']/i.exec(tag)?.[1];
+                    return href ? `<link rel="prefetch" data-original-href="${href}" />` : tag;
+                })
+                // Disable links whose target starts with `/` (root-relative, and also
+                // protocol-relative `//host/...`). The lookbehind keeps the
+                // `data-original-href` recorded above intact.
+                .replace(/(?<!data-original-)href=["'](\/[^"']+)["']/gi, 'href="#disabled-link"');
+
+            // Wrap the content in a container that overrides fonts and caps media width
+            return `
+                <div class="sandboxed-web-content">
+                    <style>
+                        /* Override styles to prevent external resource loading */
+                        @font-face { font-family: 'disabled'; src: local('Arial'); }
+                        * { font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, 'Open Sans', 'Helvetica Neue', sans-serif !important; }
+                        img, iframe, frame, embed, object { max-width: 100%; }
+                    </style>
+                    ${sanitized}
+                </div>`;
+        } catch (e) {
+            console.warn('Error sanitizing web content:', e);
+            // Fall back to showing the content as text. `&` must be escaped first so the
+            // entities produced for `<` and `>` are not themselves re-escaped.
+            const escaped = content.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
+            return `
+                <div class="sandboxed-web-content">
+                    <p>Content could not be safely displayed. Raw content:</p>
+                    <pre>${escaped}</pre>
+                </div>`;
+        }
+    }
+
/**
 * Reports whether `documentsById` contains an entry for the given ID.
 * @param docId The document ID to look up
 * @returns The result of `documentsById.has(docId)`
 */
public has(docId: string) {
return this.documentsById.has(docId);
}
@@ -988,4 +1069,139 @@ export class AgentDocumentManager {
}
return undefined;
}
+
+    /**
+     * Stores a simplified form of the given RAG chunks on a document for citation handling.
+     *
+     * The simplified chunks are serialized as `{ chunks: [...] }` into `doc.chunk_simpl`,
+     * replacing whatever was stored there before (existing chunks are not merged).
+     *
+     * Every simplified chunk carries `chunkId`, `text`, `doc_id` and `chunkType`; the
+     * remaining fields depend on `docType`:
+     *  - `'video'` / `'audio'`: `start_time`, `end_time`, `indexes`, and `chunkType` is
+     *    forced to the matching media chunk type
+     *  - `'pdf'`: `startPage`, `endPage`, `location`
+     *  - `'csv'`: `rowStart`, `rowEnd`, `colStart`, `colEnd`
+     *  - anything else: only the common fields
+     *
+     * @param doc The document to store simplified chunks on
+     * @param chunks Array of full RAG chunks to simplify; a missing array is treated as empty
+     * @param docType The type of document (e.g., 'pdf', 'video', 'audio', etc.)
+     * @returns The same document, after `chunk_simpl` has been written
+     */
+    public addSimplifiedChunks(doc: Doc, chunks: RAGChunk[], docType: string): Doc {
+        if (!doc) {
+            console.error('Cannot add simplified chunks to null document');
+            return doc;
+        }
+
+        const simplifiedChunks = (chunks ?? []).map(chunk => {
+            // Common properties across all chunk types
+            const baseChunk = {
+                chunkId: chunk.id,
+                text: chunk.metadata.text,
+                doc_id: chunk.metadata.doc_id,
+                chunkType: chunk.metadata.type || CHUNK_TYPE.TEXT,
+            };
+
+            switch (docType) {
+                case 'video':
+                case 'audio':
+                    return {
+                        ...baseChunk,
+                        start_time: chunk.metadata.start_time,
+                        end_time: chunk.metadata.end_time,
+                        indexes: chunk.metadata.indexes,
+                        chunkType: docType === 'video' ? CHUNK_TYPE.VIDEO : CHUNK_TYPE.AUDIO,
+                    };
+                case 'pdf':
+                    return {
+                        ...baseChunk,
+                        startPage: chunk.metadata.start_page,
+                        endPage: chunk.metadata.end_page,
+                        location: chunk.metadata.location,
+                    };
+                case 'csv': {
+                    // The row/column fields are read through a single untyped view of the metadata.
+                    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+                    const csvMeta = chunk.metadata as any;
+                    return {
+                        ...baseChunk,
+                        rowStart: csvMeta.row_start,
+                        rowEnd: csvMeta.row_end,
+                        colStart: csvMeta.col_start,
+                        colEnd: csvMeta.col_end,
+                    };
+                }
+                default:
+                    // Other document types only need the common fields
+                    return baseChunk;
+            }
+        });
+
+        // Single write: the field is always overwritten, so no prior initialization is needed
+        doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks });
+
+        return doc;
+    }
+
+    /**
+     * Reads the simplified chunks stored in `doc.chunk_simpl`.
+     *
+     * The field is expected to hold JSON of the form `{ chunks: [...] }`. Anything else
+     * (missing field, JSON `null`, a `chunks` value that is not an array) yields an empty
+     * array; text that is not valid JSON is additionally logged.
+     *
+     * @param doc The document to get simplified chunks from
+     * @returns Array of simplified chunks or empty array if none exist
+     */
+    public getSimplifiedChunks(doc: Doc): any[] {
+        if (!doc || !doc.chunk_simpl) {
+            return [];
+        }
+
+        try {
+            const parsed: unknown = JSON.parse(StrCast(doc.chunk_simpl));
+            const chunks = (parsed as { chunks?: unknown } | null)?.chunks;
+            // Only hand back a real array so callers can safely use array methods on the result
+            return Array.isArray(chunks) ? chunks : [];
+        } catch (e) {
+            console.error('Error parsing simplified chunks:', e);
+            return [];
+        }
+    }
+
+    /**
+     * Looks up one simplified chunk on a document by its chunk ID.
+     * @param doc The document whose simplified chunks are searched
+     * @param chunkId The `chunkId` value to match
+     * @returns The first simplified chunk with that ID, or undefined when none matches
+     */
+    public getSimplifiedChunkById(doc: Doc, chunkId: string): any | undefined {
+        for (const candidate of this.getSimplifiedChunks(doc)) {
+            if (candidate.chunkId === chunkId) {
+                return candidate;
+            }
+        }
+        return undefined;
+    }
+
+    /**
+     * Reads the original media segments stored in `doc.original_segments`.
+     *
+     * The field is expected to hold a JSON array. A missing field, or JSON that parses
+     * to anything other than an array, yields an empty array; text that is not valid
+     * JSON is additionally logged.
+     *
+     * @param doc The document containing original media segments
+     * @returns Array of media segments or empty array if none exist
+     */
+    public getOriginalSegments(doc: Doc): any[] {
+        if (!doc || !doc.original_segments) {
+            return [];
+        }
+
+        try {
+            const parsed: unknown = JSON.parse(StrCast(doc.original_segments));
+            // Guard against non-array payloads (objects, numbers, strings) leaking out as any[]
+            return Array.isArray(parsed) ? parsed : [];
+        } catch (e) {
+            console.error('Error parsing original segments:', e);
+            return [];
+        }
+    }
+
+    /**
+     * Collects the summary of every tracked document into one string.
+     *
+     * For each ID in `documentsById`, the summary is taken from the document itself or,
+     * failing that, from its data document. Documents without a summary are skipped.
+     *
+     * @returns The summaries joined by blank lines (empty string when there are none)
+     */
+    public getAllDocumentSummaries(): string {
+        const collected: string[] = [];
+
+        // Snapshot the keys first, then visit each document in turn
+        for (const id of Array.from(this.documentsById.keys())) {
+            const doc = this.getDocument(id);
+            if (!doc) {
+                continue;
+            }
+
+            // Prefer the document's own summary, then fall back to its data document
+            const rawSummary = doc.summary || (doc[DocData] && doc[DocData].summary);
+            if (!rawSummary) {
+                continue;
+            }
+
+            const text = StrCast(rawSummary);
+            if (text) {
+                collected.push(text);
+            }
+        }
+
+        return collected.join('\n\n');
+    }
}