10 files changed, 215 insertions, 278 deletions
diff --git a/src/client/views/nodes/chatbot/agentsystem/Agent.ts b/src/client/views/nodes/chatbot/agentsystem/Agent.ts
index 24471bf5b..86d40864e 100644
--- a/src/client/views/nodes/chatbot/agentsystem/Agent.ts
+++ b/src/client/views/nodes/chatbot/agentsystem/Agent.ts
@@ -63,10 +63,8 @@ export class Agent {
      */
     constructor(
         _vectorstore: Vectorstore,
-        summaries: () => string,
         history: () => string,
         csvData: () => { filename: string; id: string; text: string }[],
-        getLinkedUrlDocId: (url: string) => string[],
         createImage: (result: Upload.FileInformation & Upload.InspectionResults, options: DocumentOptions) => void,
         createCSVInDash: (url: string, title: string, id: string, data: string) => void,
         docManager: AgentDocumentManager
@@ -83,7 +81,7 @@ export class Agent {
             calculate: new CalculateTool(),
             rag: new RAGTool(this.vectorstore),
             dataAnalysis: new DataAnalysisTool(csvData),
-            websiteInfoScraper: new WebsiteInfoScraperTool(getLinkedUrlDocId),
+            websiteInfoScraper: new WebsiteInfoScraperTool(this._docManager),
             searchTool: new SearchTool(this._docManager),
             noTool: new NoTool(),
             //imageCreationTool: new ImageCreationTool(createImage),
@@ -125,11 +123,8 @@ export class Agent {
         // Retrieve chat history and generate system prompt
         const chatHistory = this._history();
         // Get document summaries directly from document manager
-        const documentSummaries = this._docManager.getAllDocumentSummaries();
-        // Create a function that returns document summaries for the prompt
-        const getSummaries = () => documentSummaries;
         // Generate the system prompt with the summaries
-        const systemPrompt = getReactPrompt(Object.values(this.tools), getSummaries, chatHistory);
+        const systemPrompt = getReactPrompt(Object.values(this.tools), () => JSON.stringify(this._docManager.listDocs), chatHistory);
 
         // Initialize intermediate messages
         this.interMessages = [{ role: 'system', content: systemPrompt }];
diff --git a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx
index 6349e554e..867e78860 100644
--- a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx
+++ b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx
@@ -121,16 +121,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
         this.vectorstore = new Vectorstore(this.vectorstore_id, this.docManager);
 
         // Create an agent with the vectorstore
-        this.agent = new Agent(
-            this.vectorstore,
-            this.retrieveSummaries.bind(this),
-            this.retrieveFormattedHistory.bind(this),
-            this.retrieveCSVData.bind(this),
-            this.retrieveDocIds.bind(this),
-            this.createImageInDash.bind(this),
-            this.createCSVInDash.bind(this),
-            this.docManager
-        );
+        this.agent = new Agent(this.vectorstore, this.retrieveFormattedHistory.bind(this), this.retrieveCSVData.bind(this), this.createImageInDash.bind(this), this.createCSVInDash.bind(this), this.docManager);
 
         // Add event listeners
         this.addScrollListener();
@@ -228,6 +219,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
         }
     };
 
+    //TODO: Update for new chunk_simpl on agentDocument
     /**
      * Adds a CSV file for analysis by sending it to OpenAI and generating a summary.
      * @param newLinkedDoc The linked document representing the CSV file.
@@ -650,18 +642,15 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
                 citation: JSON.stringify(citation, null, 2),
             });
 
-            // First try to find the document using the document manager's chunk ID lookup
-            const doc: Doc | undefined = this.docManager.getDocByChunkId(chunkId);
-            if (!doc) {
-                console.warn(`Document not found for citation with chunk_id: ${chunkId}`);
-                return;
-            }
-
             // Get the simplified chunk using the document manager
-            const foundChunk = this.docManager.getSimplifiedChunkById(doc, chunkId);
+            const { foundChunk, doc } = this.docManager.getSimplifiedChunkById(chunkId);
             if (!foundChunk) {
-                console.warn(`Chunk not found in document for chunk ID: ${chunkId}`);
-                DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {});
+                if (doc) {
+                    console.warn(`Chunk not found in document, ${doc.id}, for chunk ID: ${chunkId}`);
+                    DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {});
+                } else {
+                    console.warn(`Chunk not found for chunk ID: ${chunkId}`);
+                }
                 return;
             }
 
@@ -678,6 +667,10 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
             } else if (foundChunk.chunkType === CHUNK_TYPE.TABLE || foundChunk.chunkType === CHUNK_TYPE.IMAGE) {
                 this.handleOtherChunkTypes(foundChunk, citation, doc);
             } else {
+                if (doc.type === 'web') {
+                    DocumentManager.Instance.showDocument(doc, { openLocation: OpenWhere.addRight }, () => {});
+                    return;
+                }
                 // Show the chunk text in citation popup
                 let chunkText = citation.direct_text || 'Text content not available';
                 this.showCitationPopup(chunkText);
@@ -987,16 +980,6 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
     }
 
     /**
-     * Getter that retrieves summaries of all linked documents.
-     */
-    @computed
-    get summaries(): string {
-        // Use the document manager to get all summaries
-        console.log(this.docManager.listDocs);
-        return JSON.stringify(this.docManager.listDocs);
-    }
-
-    /**
      * Getter that retrieves all linked CSV files for analysis.
      */
     @computed get linkedCSVs(): { filename: string; id: string; text: string }[] {
@@ -1022,7 +1005,8 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
     // Other helper methods for retrieving document data and processing
 
     retrieveSummaries = (): string => {
-        return this.docManager.getAllDocumentSummaries();
+        // JSON-encode the document manager's current listDocs value and return it as a string.
+        return JSON.stringify(this.docManager.listDocs);
     };
 
     retrieveCSVData = () => {
@@ -1033,10 +1017,6 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
         return this.formattedHistory;
     };
 
-    retrieveDocIds = (): string[] => {
-        return Array.from(this.docManager.docIds);
-    };
-
     /**
      * Handles follow-up questions when the user clicks on them.
      * Automatically sets the input value to the clicked follow-up question.
diff --git a/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts b/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts
index 5297292bf..405949c1e 100644
--- a/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts
+++ b/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts
@@ -408,7 +408,7 @@ export class DocumentMetadataTool extends BaseTool<DocumentMetadataToolParamsTyp
                     const title = String(args.title);
                     const data = String(args.data);
 
-                    const id = this._docManager.createDocInDash(docType, data, { title: title });
+                    const id = await this._docManager.createDocInDash(docType, data, { title: title });
 
                     if (!id) {
                         return [
diff --git a/src/client/views/nodes/chatbot/tools/SearchTool.ts b/src/client/views/nodes/chatbot/tools/SearchTool.ts
index 53f5fc109..43f14ea83 100644
--- a/src/client/views/nodes/chatbot/tools/SearchTool.ts
+++ b/src/client/views/nodes/chatbot/tools/SearchTool.ts
@@ -48,19 +48,21 @@ export class SearchTool extends BaseTool<SearchToolParamsType> {
                     query,
                     max_results: this._max_results,
                 })) as { results: { url: string; snippet: string }[] };
-                const data = results.map((result: { url: string; snippet: string }) => {
-                    // Create a web document with the URL
-                    const id = this._docManager.createDocInDash('web', result.url, {
-                        title: `Search Result: ${result.url}`,
-                        text_html: result.snippet,
-                        data_useCors: true,
-                    });
+                const data = await Promise.all(
+                    results.map(async (result: { url: string; snippet: string }) => {
+                        // Create a web document with the URL
+                        const id = await this._docManager.createDocInDash('web', result.url, {
+                            title: `Search Result: ${result.url}`,
+                            text_html: result.snippet,
+                            data_useCors: true,
+                        });
 
-                    return {
-                        type: 'text' as const,
-                        text: `<chunk chunk_id="${id}" chunk_type="url"><url>${result.url}</url><overview>${result.snippet}</overview></chunk>`,
-                    };
-                });
+                        return {
+                            type: 'text' as const,
+                            text: `<chunk chunk_id="${id}" chunk_type="url"><url>${result.url}</url><overview>${result.snippet}</overview></chunk>`,
+                        };
+                    })
+                );
                 return data;
             } catch (error) {
                 console.log(error);
diff --git a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
index 3c7b4e3db..495a985cb 100644
--- a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
+++ b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
@@ -3,12 +3,14 @@ import { Networking } from '../../../../Network';
 import { BaseTool } from './BaseTool';
 import { Observation } from '../types/types';
 import { ParametersType, ToolInfo } from '../types/tool_types';
-
+import { AgentDocumentManager } from '../utils/AgentDocumentManager';
+import { Doc } from '../../../../../fields/Doc';
+import { StrCast, WebCast } from '../../../../../fields/Types';
 const websiteInfoScraperToolParams = [
     {
-        name: 'urls',
+        name: 'chunk_ids',
         type: 'string[]',
-        description: 'The URLs of the websites to scrape',
+        description: 'The chunk_ids of the urls to scrape from the SearchTool.',
         required: true,
         max_inputs: 3,
     },
@@ -66,11 +68,11 @@ const websiteInfoScraperToolInfo: ToolInfo<WebsiteInfoScraperToolParamsType> = {
 };
 
 export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParamsType> {
-    private _getLinkedUrlDocId: (url: string) => string[];
+    private _docManager: AgentDocumentManager;
 
-    constructor(getLinkedUrlDocIds: (url: string) => string[]) {
+    constructor(docManager: AgentDocumentManager) {
         super(websiteInfoScraperToolInfo);
-        this._getLinkedUrlDocId = getLinkedUrlDocIds;
+        this._docManager = docManager; // kept so execute() can resolve each incoming chunk_id to its document
     }
 
     /**
@@ -79,10 +81,13 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam
      * @param maxRetries Maximum number of retry attempts
      * @returns The scraped content or error message
      */
-    private async scrapeWithRetry(url: string, maxRetries = 2): Promise<Observation> {
+    private async scrapeWithRetry(chunkDoc: Doc, maxRetries = 2): Promise<Observation> {
         let lastError = '';
         let retryCount = 0;
-
+        const url = WebCast(chunkDoc.data!)!.url.href;
+        console.log(url);
+        console.log(chunkDoc);
+        console.log(chunkDoc.data);
         // Validate URL format
         try {
             new URL(url); // This will throw if URL is invalid
@@ -110,7 +115,6 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam
                 }
 
                 const { website_plain_text } = response as { website_plain_text: string };
-                const id = this._getLinkedUrlDocId(url);
 
                 // Validate content quality
                 if (!website_plain_text) {
@@ -126,7 +130,7 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam
                     if (retryCount === maxRetries) {
                         return {
                             type: 'text',
-                            text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\nNote: Limited content was retrieved from this URL.\n</chunk>`,
+                            text: `<chunk chunk_id="${chunkDoc.id}" chunk_type="url">\n${website_plain_text}\nNote: Limited content was retrieved from this URL.\n</chunk>`,
                         } as Observation;
                     }
 
@@ -138,7 +142,7 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam
                 // Process and return content if it looks good
                 return {
                     type: 'text',
-                    text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\n</chunk>`,
+                    text: `<chunk chunk_id="${chunkDoc.id}" chunk_type="url">\n${website_plain_text}\n</chunk>`,
                 } as Observation;
             } catch (error) {
                 lastError = error instanceof Error ? error.message : 'Unknown error';
@@ -156,10 +160,10 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam
     }
 
     async execute(args: ParametersType<WebsiteInfoScraperToolParamsType>): Promise<Observation[]> {
-        const urls = args.urls;
+        const chunk_ids = args.chunk_ids;
 
         // Create an array of promises, each one handling a website scrape for a URL
-        const scrapingPromises = urls.map(url => this.scrapeWithRetry(url));
+        const scrapingPromises = chunk_ids.map(chunk_id => this.scrapeWithRetry(this._docManager.getDocument(chunk_id)!));
 
         // Wait for all scraping promises to resolve
         const results = await Promise.all(scrapingPromises);
diff --git a/src/client/views/nodes/chatbot/tools/WikipediaTool.ts b/src/client/views/nodes/chatbot/tools/WikipediaTool.ts
index ee815532a..ec5d83e52 100644
--- a/src/client/views/nodes/chatbot/tools/WikipediaTool.ts
+++ b/src/client/views/nodes/chatbot/tools/WikipediaTool.ts
@@ -32,7 +32,7 @@ export class WikipediaTool extends BaseTool<WikipediaToolParamsType> {
 
     async execute(args: ParametersType<WikipediaToolParamsType>): Promise<Observation[]> {
         try {
-            const { text } = await Networking.PostToServer('/getWikipediaSummary', { title: args.title });
+            const { text } = (await Networking.PostToServer('/getWikipediaSummary', { title: args.title })) as { text: string };
             const id = uuidv4();
             const url = `https://en.wikipedia.org/wiki/${args.title.replace(/ /g, '_')}`;
             this._addLinkedUrlDoc(url, id);
diff --git a/src/client/views/nodes/chatbot/types/types.ts b/src/client/views/nodes/chatbot/types/types.ts
index 90b5e7e11..0d1804b2d 100644
--- a/src/client/views/nodes/chatbot/types/types.ts
+++ b/src/client/views/nodes/chatbot/types/types.ts
@@ -101,6 +101,7 @@ export interface RAGChunk {
 
 export interface SimplifiedChunk {
     chunkId: string;
+    doc_id: string;
     startPage?: number;
     endPage?: number;
     location?: string;
diff --git a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
index c8a6bb16b..5a09b945b 100644
--- a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
+++ b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
@@ -13,7 +13,7 @@ import { LinkManager, UPDATE_SERVER_CACHE } from '../../../../util/LinkManager';
 import { DocumentView } from '../../DocumentView';
 import { ChatBox, parsedDoc } from '../chatboxcomponents/ChatBox';
 import { supportedDocTypes } from '../types/tool_types';
-import { CHUNK_TYPE, RAGChunk } from '../types/types';
+import { CHUNK_TYPE, RAGChunk, SimplifiedChunk } from '../types/types';
 
 /**
  * Interface representing a document in the freeform view
@@ -31,7 +31,7 @@ export class AgentDocumentManager {
     private chatBox: ChatBox;
     private chatBoxDocument: Doc | null = null;
     private fieldMetadata: Record<string, any> = {};
-    @observable private documentIdsFromChunkIds: ObservableMap<string, string>;
+    @observable private simplifiedChunks: ObservableMap<string, SimplifiedChunk>;
 
     /**
      * Creates a new DocumentManager
@@ -40,17 +40,21 @@ export class AgentDocumentManager {
     constructor(chatBox: ChatBox) {
         makeObservable(this);
         const agentDoc = DocCast(chatBox.Document.agentDocument) ?? new Doc();
-        const chunkIds = DocCast(agentDoc.chunkIds) ?? new Doc();
+        const chunk_simpl = DocCast(agentDoc.chunk_simpl) ?? new Doc();
 
         agentDoc.title = chatBox.Document.title + '_agentDocument';
-        chunkIds.title = '_chunkIds';
+        chunk_simpl.title = '_chunk_simpl';
         chatBox.Document.agentDocument = agentDoc;
-        DocCast(chatBox.Document.agentDocument)!.chunkIds = chunkIds;
-        this.documentIdsFromChunkIds = StrListCast(chunkIds.mapping).reduce((mapping, content) => {
-            const [chunkId, docId] = content.split(':');
-            mapping.set(chunkId, docId);
+        DocCast(chatBox.Document.agentDocument)!.chunk_simpl = chunk_simpl;
+
+        this.simplifiedChunks = StrListCast(chunk_simpl.mapping).reduce((mapping, chunks) => {
+            StrListCast(chunks).forEach(chunk => {
+                const parsed = JSON.parse(StrCast(chunk));
+                mapping.set(parsed.chunkId, parsed);
+            });
             return mapping;
-        }, new ObservableMap<string, string>());
+        }, new ObservableMap<string, SimplifiedChunk>());
+
         this.documentsById = StrListCast(agentDoc.mapping).reduce((mapping, content) => {
             const [id, layoutId, docId] = content.split(':');
             const layoutDoc = DocServer.GetCachedRefField(layoutId);
@@ -76,14 +80,10 @@ export class AgentDocumentManager {
             //{ fireImmediately: true }
         );
         reaction(
-            () => this.documentIdsFromChunkIds.values(),
+            () => this.simplifiedChunks.values(),
             () => {
                 if (this.chatBoxDocument && DocCast(this.chatBoxDocument.agentDocument)) {
-                    // Store the mapping with chunkId:docId format for consistency
-                    const chunkIdsDoc = DocCast(DocCast(this.chatBoxDocument.agentDocument)!.chunkIds);
-                    if (chunkIdsDoc) {
-                        chunkIdsDoc.mapping = new List<string>(Array.from(this.documentIdsFromChunkIds.entries()).map(([chunkId, docId]) => `${chunkId}:${docId}`));
-                    }
+                    DocCast(DocCast(this.chatBoxDocument.agentDocument)!.chunk_simpl)!.mapping = new List<string>(Array.from(this.simplifiedChunks.values()).map(chunk => JSON.stringify(chunk)));
                 }
             }
             //{ fireImmediately: true }
@@ -831,7 +831,8 @@ export class AgentDocumentManager {
      * @param options Optional configuration options
      * @returns The ID of the created document
      */
-    public createDocInDash(docType: string, data: string, options?: any): string {
+
+    public async createDocInDash(docType: string, data: string, options?: any): Promise<string> {
         // Validate doc_type
         if (!this.isValidDocType(docType)) {
             throw new Error(`Invalid document type: ${docType}`);
@@ -877,14 +878,15 @@ export class AgentDocumentManager {
                         // Create link and add it to the document system
                         const linkDoc = Docs.Create.LinkDocument(this.chatBoxDocument, doc);
                         LinkManager.Instance.addLink(linkDoc);
-
-                        // Add document to view
-                        this.chatBox._props.addDocument?.(doc);
-
-                        // Show document - defer actual display to prevent immediate resource loading
-                        setTimeout(() => {
-                            DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {});
-                        }, 100);
+                        if (doc.type !== 'web') {
+                            // Add document to view
+                            this.chatBox._props.addDocument?.(doc);
+
+                            // Show document - defer actual display to prevent immediate resource loading
+                            setTimeout(() => {
+                                DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {});
+                            }, 100);
+                        }
                     }
                 });
 
@@ -986,88 +988,19 @@ export class AgentDocumentManager {
     }
 
     /**
-     * Registers chunk IDs associated with a document in the manager
-     * @param docId The parent document ID
-     * @param chunkIds Array of chunk IDs associated with this document
-     */
-    @action
-    public registerChunkIds(docId: string, chunkIds: string[]): void {
-        // Get the document if it exists
-        const docInfo = this.documentsById.get(docId);
-        if (!docInfo) {
-            console.warn(`Cannot register chunks for unknown document ID: ${docId}`);
-            return;
-        }
-
-        // Store chunk IDs on the document for future reference
-        const doc = docInfo.layoutDoc;
-        if (!doc.chunk_ids) {
-            doc.chunk_ids = JSON.stringify(chunkIds);
-        } else {
-            // Merge with existing chunk IDs if they exist
-            const existingIds = JSON.parse(doc.chunk_ids as string);
-            const updatedIds = [...new Set([...existingIds, ...chunkIds])]; // Remove duplicates
-            doc.chunk_ids = JSON.stringify(updatedIds);
-        }
-        for (const chunkId of chunkIds) {
-            // Ensure each chunk ID can be linked back to its parent document
-            // Store a mapping from chunk ID to parent document ID
-            // This allows us to easily find a document by any of its chunk IDs
-            if (!this.documentIdsFromChunkIds.has(chunkId) && doc) {
-                this.documentIdsFromChunkIds.set(chunkId, doc[Id]);
-            }
-        }
-    }
-
-    /**
-     * Gets a document ID by a chunk ID
-     * @param chunkId The chunk ID to look up
-     * @returns The parent document ID if found
-     */
-    public getDocByChunkId(chunkId: string): Doc | undefined {
-        // First, look up the document ID using the chunk ID mapping
-        const docId = this.documentIdsFromChunkIds.get(chunkId);
-        console.log('this.documentIdsFromChunkIds', this.documentIdsFromChunkIds);
-        console.log('docId', docId);
-        if (!docId) {
-            if (this.documentsById.has(chunkId)) {
-                return this.documentsById.get(chunkId)?.layoutDoc;
-            } else {
-                console.error('No document found for chunkId and docId', chunkId);
-                return undefined;
-            }
-        }
-        // Then get the document using the document ID
-        const docInfo = this.documentsById.get(docId);
-        if (docInfo) {
-            return docInfo.layoutDoc;
-        }
-        console.error('No document found for docId', docId);
-        return undefined;
-    }
-
-    /**
      * Adds simplified chunks to a document for citation handling
      * @param doc The document to add simplified chunks to
      * @param chunks Array of full RAG chunks to simplify
      * @param docType The type of document (e.g., 'pdf', 'video', 'audio', etc.)
      * @returns The updated document with simplified chunks
      */
-    public addSimplifiedChunks(doc: Doc, chunks: RAGChunk[], docType: string): Doc {
-        if (!doc) {
-            console.error('Cannot add simplified chunks to null document');
-            return doc;
-        }
-
-        // Initialize empty chunks array if not exists
-        if (!doc.chunk_simpl) {
-            doc.chunk_simpl = JSON.stringify({ chunks: [] });
-        }
-
+    @action
+    public addSimplifiedChunks(chunks: RAGChunk[], docType: string) {
+        console.log('chunks', chunks, 'simplifiedChunks', this.simplifiedChunks);
         // Create array of simplified chunks based on document type
-        const simplifiedChunks = chunks.map(chunk => {
+        for (const chunk of chunks) {
             // Common properties across all chunk types
-            const baseChunk = {
+            const baseChunk: SimplifiedChunk = {
                 chunkId: chunk.id,
                 text: chunk.metadata.text,
                 doc_id: chunk.metadata.doc_id,
@@ -1076,38 +1009,33 @@ export class AgentDocumentManager {
 
             // Add type-specific properties
             if (docType === 'video' || docType === 'audio') {
-                return {
+                this.simplifiedChunks.set(chunk.id, {
                     ...baseChunk,
                     start_time: chunk.metadata.start_time,
                     end_time: chunk.metadata.end_time,
                     indexes: chunk.metadata.indexes,
                     chunkType: docType === 'video' ? CHUNK_TYPE.VIDEO : CHUNK_TYPE.AUDIO,
-                };
+                } as SimplifiedChunk);
             } else if (docType === 'pdf') {
-                return {
+                this.simplifiedChunks.set(chunk.id, {
                     ...baseChunk,
                     startPage: chunk.metadata.start_page,
                     endPage: chunk.metadata.end_page,
                     location: chunk.metadata.location,
-                };
+                } as SimplifiedChunk);
             } else if (docType === 'csv') {
-                return {
+                this.simplifiedChunks.set(chunk.id, {
                     ...baseChunk,
                     rowStart: (chunk.metadata as any).row_start,
                     rowEnd: (chunk.metadata as any).row_end,
                     colStart: (chunk.metadata as any).col_start,
                     colEnd: (chunk.metadata as any).col_end,
-                };
+                } as SimplifiedChunk);
             } else {
                 // Default for other document types
-                return baseChunk;
+                this.simplifiedChunks.set(chunk.id, baseChunk as SimplifiedChunk);
             }
-        });
-        console.log('simplifiedChunks', simplifiedChunks);
-        // Update the document with all simplified chunks at once
-        doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks });
-
-        return doc;
+        }
     }
 
     /**
@@ -1116,21 +1044,10 @@ export class AgentDocumentManager {
      * @param chunkId The ID of the chunk to retrieve
      * @returns The simplified chunk if found, undefined otherwise
      */
-    public getSimplifiedChunkById(doc: Doc, chunkId: string): any | undefined {
-        let chunks: any[] = [];
-        if (!doc || !doc.chunk_simpl) {
-            chunks = [];
-            console.warn('No chunk found for chunkId', chunkId, '. Checking if document exists in documentsById.');
-            return [];
-        }
-        try {
-            const parsed = JSON.parse(StrCast(doc.chunk_simpl));
-            chunks = parsed.chunks || [];
-        } catch (e) {
-            console.error('Error parsing simplified chunks:', e);
-            return [];
-        }
-        return chunks.find(chunk => chunk.chunkId === chunkId);
+    public getSimplifiedChunkById(chunkId: string): any | undefined {
+        const foundChunk = this.simplifiedChunks.get(chunkId); // single map lookup; undefined when no chunk is registered under this id
+        // Returns { foundChunk, doc }. With no registered chunk, chunkId is tried as a document id (SearchTool emits the created doc's id as chunk_id).
+        return { foundChunk, doc: this.getDocument(foundChunk?.doc_id || chunkId) };
     }
 
     /**
@@ -1150,27 +1067,4 @@ export class AgentDocumentManager {
             return [];
         }
     }
-
-    /**
-     * Gets all document summaries combined into a single string
-     * @returns String containing all document summaries
-     */
-    public getAllDocumentSummaries(): string {
-        const summaries = Array.from(this.documentsById.keys())
-            .map(id => {
-                const doc = this.getDocument(id);
-                if (doc) {
-                    // Try to get summary from either the document or its data document
-                    const summary = doc.summary || (doc[DocData] && doc[DocData].summary);
-                    if (summary) {
-                        return StrCast(summary);
-                    }
-                }
-                return null;
-            })
-            .filter(Boolean)
-            .join('\n\n');
-
-        return summaries;
-    }
 }
diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
index 1349df483..f1fae6f11 100644
--- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
+++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
@@ -148,10 +148,6 @@ export class Vectorstore {
 
                     // Generate chunk IDs upfront so we can register them
                     const chunkIds = segmentedTranscript.map(() => uuidv4());
-
-                    // Register all chunk IDs with the document manager
-                    this.docManager.registerChunkIds(doc_id, chunkIds);
-
                     // Add transcript and embeddings to metadata
                     result = {
                         doc_id,
@@ -185,7 +181,7 @@ export class Vectorstore {
                 doc.segmented_transcript = JSON.stringify(segmentedTranscript);
                 // Use doc manager to add simplified chunks
                 const docType = local_file_path.endsWith('.mp3') ? 'audio' : 'video';
-                this.docManager.addSimplifiedChunks(doc, result.chunks, docType);
+                this.docManager.addSimplifiedChunks(result.chunks, docType);
             } else {
                 // Process regular document
                 console.log('Processing regular document...');
@@ -216,13 +212,10 @@ export class Vectorstore {
                     console.log('doc_id in vectorstore', result.doc_id, 'does not match doc_id in doc', doc[Id]);
                 }
 
-                // Register chunks with the document manager
-                this.docManager.registerChunkIds(result.doc_id, chunkIds);
-
                 // Use doc manager to add simplified chunks - determine document type from file extension
                 const fileExt = path.extname(local_file_path).toLowerCase();
                 const docType = fileExt === '.pdf' ? 'pdf' : fileExt === '.csv' ? 'csv' : 'text';
-                this.docManager.addSimplifiedChunks(doc, result.chunks, docType);
+                this.docManager.addSimplifiedChunks(result.chunks, docType);
 
                 doc.summary = result.summary;
                 doc.ai_purpose = result.purpose;
@@ -351,16 +344,6 @@ export class Vectorstore {
                     },
                 } as RAGChunk;
 
-                // Ensure the document manager knows about this chunk
-                // This is important for maintaining backwards compatibility
-                if (chunk.id && !this.docManager.getDocByChunkId(chunk.id)) {
-                    // If the chunk ID isn't registered but we have a doc_id in metadata
-                    if (chunk.metadata.doc_id && this.docManager.has(chunk.metadata.doc_id)) {
-                        // Register the chunk with its parent document
-                        this.docManager.registerChunkIds(chunk.metadata.doc_id, [chunk.id]);
-                    }
-                }
-
                 return chunk;
             });
 
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index 378f14094..b7ce4f663 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -514,30 +514,37 @@ export default class AssistantManager extends ApiManager {
                     await browser.close();
                     browser = null;
 
-                    // Use a try-catch block specifically for JSDOM parsing
+                    let extractedText = '';
+
+                    // First try with Readability
                     try {
                         // Parse HTML content using JSDOM
                         const dom = new JSDOM(htmlContent, { url });
 
                         // Extract readable content using Mozilla's Readability API
-                        const reader = new Readability(dom.window.document);
+                        const reader = new Readability(dom.window.document, {
+                            // Readability configuration to focus on text content
+                            charThreshold: 100,
+                            keepClasses: false,
+                        });
                         const article = reader.parse();
 
-                        if (article) {
-                            const plainText = article.textContent;
-                            res.send({ website_plain_text: plainText });
+                        if (article && article.textContent) {
+                            extractedText = article.textContent;
                         } else {
-                            // If Readability fails, fallback to extracting main content
-                            const mainContent = await extractMainContent(htmlContent);
-                            res.send({ website_plain_text: mainContent });
+                            // If Readability doesn't return useful content, try alternate method
+                            extractedText = await extractEnhancedContent(htmlContent);
                         }
                     } catch (parsingError) {
-                        console.error('Error parsing website content:', parsingError);
-
-                        // Fallback to a simplified extraction method
-                        const mainContent = await extractMainContent(htmlContent);
-                        res.send({ website_plain_text: mainContent });
+                        console.error('Error parsing website content with Readability:', parsingError);
+                        // Fallback to enhanced content extraction
+                        extractedText = await extractEnhancedContent(htmlContent);
                     }
+
+                    // Clean up the extracted text
+                    extractedText = cleanupText(extractedText);
+
+                    res.send({ website_plain_text: extractedText });
                 } catch (error) {
                     console.error('Error scraping website:', error);
 
@@ -985,48 +992,119 @@ function spawnPythonProcess(jobId: string, file_path: string, doc_id: string) {
 }
 
 /**
- * Extracts main content from HTML by removing scripts, styles, and non-content elements
- * Used as a fallback when Readability fails
+ * Fallback text extractor for pages where Readability returns no text or throws.
+ * Removes non-content elements (scripts, navigation, forms, media, ad/cookie/popup blocks), then
+ * collects the text of paragraphs, headings, list items, table cells and generic containers in
+ * document order, de-duplicated and joined with blank lines. If fewer than three text parts are
+ * found, the remaining body text is returned instead. Any error is logged and a fixed failure
+ * message is returned rather than thrown.
  * @param html The HTML content to process
- * @returns Extracted main text content
+ * @returns The extracted text, or a fixed failure message when extraction throws
  */
-async function extractMainContent(html: string): Promise<string> {
+async function extractEnhancedContent(html: string): Promise<string> {
     try {
-        // Create a simple DOM to extract content
+        // Create DOM to extract content
         const dom = new JSDOM(html, { runScripts: 'outside-only' });
         const document = dom.window.document;
 
-        // Remove scripts, styles, and other non-content elements
-        const elementsToRemove = ['script', 'style', 'iframe', 'noscript', 'svg', 'header', 'footer', 'nav', 'aside', 'ads', 'banner', 'form', 'button', 'input'];
-
-        elementsToRemove.forEach(tag => {
-            const elements = document.querySelectorAll(tag);
+        // Remove all non-content elements
+        const elementsToRemove = [
+            'script',
+            'style',
+            'iframe',
+            'noscript',
+            'svg',
+            'canvas',
+            'header',
+            'footer',
+            'nav',
+            'aside',
+            'form',
+            'button',
+            'input',
+            'select',
+            'textarea',
+            'meta',
+            'link',
+            'img',
+            'video',
+            'audio',
+            '.ad',
+            '.ads',
+            '.advertisement',
+            '.banner',
+            '.cookie',
+            '.popup',
+            '.modal',
+            '.newsletter',
+            '[role="banner"]',
+            '[role="navigation"]',
+            '[role="complementary"]',
+        ];
+
+        elementsToRemove.forEach(selector => {
+            const elements = document.querySelectorAll(selector);
             elements.forEach(el => el.remove());
         });
 
-        // Try to find the main content container using common selectors
-        const mainSelectors = ['main', 'article', '#content', '.content', '#main', '.main', '.post-content', '.article-content', '.entry-content'];
-
-        let mainContent = '';
-
-        // Try each selector to find main content
-        for (const selector of mainSelectors) {
-            const element = document.querySelector(selector);
-            if (element && element.textContent && element.textContent.trim().length > 100) {
-                mainContent = element.textContent;
-                break;
+        // Match every text-bearing element with ONE combined selector so that querySelectorAll yields
+        // them in document order; querying tag by tag (all <p>, then all <h1>, ...) would emit every
+        // paragraph before any heading and scramble the reading order of the page.
+        const contentSelector = 'p, h1, h2, h3, h4, h5, h6, li, td, article, section, div:not([class]):not([id])';
+
+        // NOTE(review): containers (article/section/div/li/td) and their descendants can both match,
+        // so a container's text repeats what its children contribute unless the two trimmed strings
+        // are identical. Left unchanged here; confirm whether only innermost matches should be kept.
+
+        // Keep each distinct text once; a Set lookup replaces rescanning the whole array per element
+        const seenTexts = new Set<string>();
+        const contentParts: string[] = [];
+        document.querySelectorAll(contentSelector).forEach(el => {
+            const text = el.textContent?.trim();
+            // Only include elements with substantial text (more than just a few characters)
+            if (text && text.length > 10 && !seenTexts.has(text)) {
+                seenTexts.add(text);
+                contentParts.push(text);
             }
-        }
+        });
 
-        // If no main content found with selectors, use body content
-        if (!mainContent || mainContent.length < 200) {
-            mainContent = document.body.textContent || '';
+        // If no significant content found with selective approach, fallback to body
+        if (contentParts.length < 3) {
+            return document.body.textContent || '';
         }
 
-        // Clean up the text
-        return mainContent.replace(/\s+/g, ' ').replace(/\n+/g, '\n').trim();
+        return contentParts.join('\n\n');
     } catch (error) {
-        console.error('Error extracting main content:', error);
+        console.error('Error extracting enhanced content:', error);
         return 'Failed to extract content from the webpage.';
     }
 }
+
+/**
+ * Cleans up extracted text: normalizes whitespace while keeping paragraph breaks, and strips boilerplate phrases, emails, URLs, handles and leftover tags.
+ * @param text The raw extracted text
+ * @returns Cleaned and formatted text; an empty string when the input is empty
+ */
+function cleanupText(text: string): string {
+    if (!text) return '';
+    return (
+        text
+            // Collapse runs of non-newline whitespace only; collapsing \s+ would also erase the line breaks normalized at the end
+            .replace(/[^\S\n]+/g, ' ')
+            // Remove common boilerplate phrases
+            .replace(/cookie policy|privacy policy|terms of service|all rights reserved|copyright ©/gi, '')
+            // Remove email addresses
+            .replace(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g, '')
+            // Remove URLs
+            .replace(/https?:\/\/[^\s]+/g, '')
+            // Remove social media handles
+            .replace(/@[a-zA-Z0-9_]+/g, '')
+            // Clean up any remaining HTML tags that might have been missed
+            .replace(/<[^>]*>/g, '')
+            // Fix spacing left behind by the removals, then cap consecutive line breaks at one blank line
+            .replace(/ +/g, ' ')
+            .replace(/ ?\n ?/g, '\n')
+            .replace(/\n{3,}/g, '\n\n')
+            .trim()
+    );
+}