aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/client/views/nodes/chatbot/agentsystem/Agent.ts9
-rw-r--r--src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx50
-rw-r--r--src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts2
-rw-r--r--src/client/views/nodes/chatbot/tools/SearchTool.ts26
-rw-r--r--src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts30
-rw-r--r--src/client/views/nodes/chatbot/tools/WikipediaTool.ts2
-rw-r--r--src/client/views/nodes/chatbot/types/types.ts1
-rw-r--r--src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts192
-rw-r--r--src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts21
-rw-r--r--src/server/ApiManagers/AssistantManager.ts160
10 files changed, 215 insertions, 278 deletions
diff --git a/src/client/views/nodes/chatbot/agentsystem/Agent.ts b/src/client/views/nodes/chatbot/agentsystem/Agent.ts
index 24471bf5b..86d40864e 100644
--- a/src/client/views/nodes/chatbot/agentsystem/Agent.ts
+++ b/src/client/views/nodes/chatbot/agentsystem/Agent.ts
@@ -63,10 +63,8 @@ export class Agent {
*/
constructor(
_vectorstore: Vectorstore,
- summaries: () => string,
history: () => string,
csvData: () => { filename: string; id: string; text: string }[],
- getLinkedUrlDocId: (url: string) => string[],
createImage: (result: Upload.FileInformation & Upload.InspectionResults, options: DocumentOptions) => void,
createCSVInDash: (url: string, title: string, id: string, data: string) => void,
docManager: AgentDocumentManager
@@ -83,7 +81,7 @@ export class Agent {
calculate: new CalculateTool(),
rag: new RAGTool(this.vectorstore),
dataAnalysis: new DataAnalysisTool(csvData),
- websiteInfoScraper: new WebsiteInfoScraperTool(getLinkedUrlDocId),
+ websiteInfoScraper: new WebsiteInfoScraperTool(this._docManager),
searchTool: new SearchTool(this._docManager),
noTool: new NoTool(),
//imageCreationTool: new ImageCreationTool(createImage),
@@ -125,11 +123,8 @@ export class Agent {
// Retrieve chat history and generate system prompt
const chatHistory = this._history();
// Get document summaries directly from document manager
- const documentSummaries = this._docManager.getAllDocumentSummaries();
- // Create a function that returns document summaries for the prompt
- const getSummaries = () => documentSummaries;
// Generate the system prompt with the summaries
- const systemPrompt = getReactPrompt(Object.values(this.tools), getSummaries, chatHistory);
+ const systemPrompt = getReactPrompt(Object.values(this.tools), () => JSON.stringify(this._docManager.listDocs), chatHistory);
// Initialize intermediate messages
this.interMessages = [{ role: 'system', content: systemPrompt }];
diff --git a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx
index 6349e554e..867e78860 100644
--- a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx
+++ b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx
@@ -121,16 +121,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
this.vectorstore = new Vectorstore(this.vectorstore_id, this.docManager);
// Create an agent with the vectorstore
- this.agent = new Agent(
- this.vectorstore,
- this.retrieveSummaries.bind(this),
- this.retrieveFormattedHistory.bind(this),
- this.retrieveCSVData.bind(this),
- this.retrieveDocIds.bind(this),
- this.createImageInDash.bind(this),
- this.createCSVInDash.bind(this),
- this.docManager
- );
+ this.agent = new Agent(this.vectorstore, this.retrieveFormattedHistory.bind(this), this.retrieveCSVData.bind(this), this.createImageInDash.bind(this), this.createCSVInDash.bind(this), this.docManager);
// Add event listeners
this.addScrollListener();
@@ -228,6 +219,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
}
};
+ //TODO: Update for new chunk_simpl on agentDocument
/**
* Adds a CSV file for analysis by sending it to OpenAI and generating a summary.
* @param newLinkedDoc The linked document representing the CSV file.
@@ -650,18 +642,15 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
citation: JSON.stringify(citation, null, 2),
});
- // First try to find the document using the document manager's chunk ID lookup
- const doc: Doc | undefined = this.docManager.getDocByChunkId(chunkId);
- if (!doc) {
- console.warn(`Document not found for citation with chunk_id: ${chunkId}`);
- return;
- }
-
// Get the simplified chunk using the document manager
- const foundChunk = this.docManager.getSimplifiedChunkById(doc, chunkId);
+ const { foundChunk, doc } = this.docManager.getSimplifiedChunkById(chunkId);
if (!foundChunk) {
- console.warn(`Chunk not found in document for chunk ID: ${chunkId}`);
- DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {});
+ if (doc) {
+ console.warn(`Chunk not found in document, ${doc.id}, for chunk ID: ${chunkId}`);
+ DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {});
+ } else {
+ console.warn(`Chunk not found for chunk ID: ${chunkId}`);
+ }
return;
}
@@ -678,6 +667,10 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
} else if (foundChunk.chunkType === CHUNK_TYPE.TABLE || foundChunk.chunkType === CHUNK_TYPE.IMAGE) {
this.handleOtherChunkTypes(foundChunk, citation, doc);
} else {
+ if (doc.type === 'web') {
+ DocumentManager.Instance.showDocument(doc, { openLocation: OpenWhere.addRight }, () => {});
+ return;
+ }
// Show the chunk text in citation popup
let chunkText = citation.direct_text || 'Text content not available';
this.showCitationPopup(chunkText);
@@ -987,16 +980,6 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
}
/**
- * Getter that retrieves summaries of all linked documents.
- */
- @computed
- get summaries(): string {
- // Use the document manager to get all summaries
- console.log(this.docManager.listDocs);
- return JSON.stringify(this.docManager.listDocs);
- }
-
- /**
* Getter that retrieves all linked CSV files for analysis.
*/
@computed get linkedCSVs(): { filename: string; id: string; text: string }[] {
@@ -1022,7 +1005,8 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
// Other helper methods for retrieving document data and processing
retrieveSummaries = (): string => {
- return this.docManager.getAllDocumentSummaries();
+ console.log(this.docManager.listDocs);
+ return JSON.stringify(this.docManager.listDocs);
};
retrieveCSVData = () => {
@@ -1033,10 +1017,6 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
return this.formattedHistory;
};
- retrieveDocIds = (): string[] => {
- return Array.from(this.docManager.docIds);
- };
-
/**
* Handles follow-up questions when the user clicks on them.
* Automatically sets the input value to the clicked follow-up question.
diff --git a/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts b/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts
index 5297292bf..405949c1e 100644
--- a/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts
+++ b/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts
@@ -408,7 +408,7 @@ export class DocumentMetadataTool extends BaseTool<DocumentMetadataToolParamsTyp
const title = String(args.title);
const data = String(args.data);
- const id = this._docManager.createDocInDash(docType, data, { title: title });
+ const id = await this._docManager.createDocInDash(docType, data, { title: title });
if (!id) {
return [
diff --git a/src/client/views/nodes/chatbot/tools/SearchTool.ts b/src/client/views/nodes/chatbot/tools/SearchTool.ts
index 53f5fc109..43f14ea83 100644
--- a/src/client/views/nodes/chatbot/tools/SearchTool.ts
+++ b/src/client/views/nodes/chatbot/tools/SearchTool.ts
@@ -48,19 +48,21 @@ export class SearchTool extends BaseTool<SearchToolParamsType> {
query,
max_results: this._max_results,
})) as { results: { url: string; snippet: string }[] };
- const data = results.map((result: { url: string; snippet: string }) => {
- // Create a web document with the URL
- const id = this._docManager.createDocInDash('web', result.url, {
- title: `Search Result: ${result.url}`,
- text_html: result.snippet,
- data_useCors: true,
- });
+ const data = await Promise.all(
+ results.map(async (result: { url: string; snippet: string }) => {
+ // Create a web document with the URL
+ const id = await this._docManager.createDocInDash('web', result.url, {
+ title: `Search Result: ${result.url}`,
+ text_html: result.snippet,
+ data_useCors: true,
+ });
- return {
- type: 'text' as const,
- text: `<chunk chunk_id="${id}" chunk_type="url"><url>${result.url}</url><overview>${result.snippet}</overview></chunk>`,
- };
- });
+ return {
+ type: 'text' as const,
+ text: `<chunk chunk_id="${id}" chunk_type="url"><url>${result.url}</url><overview>${result.snippet}</overview></chunk>`,
+ };
+ })
+ );
return data;
} catch (error) {
console.log(error);
diff --git a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
index 3c7b4e3db..495a985cb 100644
--- a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
+++ b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
@@ -3,12 +3,14 @@ import { Networking } from '../../../../Network';
import { BaseTool } from './BaseTool';
import { Observation } from '../types/types';
import { ParametersType, ToolInfo } from '../types/tool_types';
-
+import { AgentDocumentManager } from '../utils/AgentDocumentManager';
+import { Doc } from '../../../../../fields/Doc';
+import { StrCast, WebCast } from '../../../../../fields/Types';
const websiteInfoScraperToolParams = [
{
- name: 'urls',
+ name: 'chunk_ids',
type: 'string[]',
- description: 'The URLs of the websites to scrape',
+ description: 'The chunk_ids of the urls to scrape from the SearchTool.',
required: true,
max_inputs: 3,
},
@@ -66,11 +68,11 @@ const websiteInfoScraperToolInfo: ToolInfo<WebsiteInfoScraperToolParamsType> = {
};
export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParamsType> {
- private _getLinkedUrlDocId: (url: string) => string[];
+ private _docManager: AgentDocumentManager;
- constructor(getLinkedUrlDocIds: (url: string) => string[]) {
+ constructor(docManager: AgentDocumentManager) {
super(websiteInfoScraperToolInfo);
- this._getLinkedUrlDocId = getLinkedUrlDocIds;
+ this._docManager = docManager;
}
/**
@@ -79,10 +81,13 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam
* @param maxRetries Maximum number of retry attempts
* @returns The scraped content or error message
*/
- private async scrapeWithRetry(url: string, maxRetries = 2): Promise<Observation> {
+ private async scrapeWithRetry(chunkDoc: Doc, maxRetries = 2): Promise<Observation> {
let lastError = '';
let retryCount = 0;
-
+ const url = WebCast(chunkDoc.data!)!.url.href;
+ console.log(url);
+ console.log(chunkDoc);
+ console.log(chunkDoc.data);
// Validate URL format
try {
new URL(url); // This will throw if URL is invalid
@@ -110,7 +115,6 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam
}
const { website_plain_text } = response as { website_plain_text: string };
- const id = this._getLinkedUrlDocId(url);
// Validate content quality
if (!website_plain_text) {
@@ -126,7 +130,7 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam
if (retryCount === maxRetries) {
return {
type: 'text',
- text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\nNote: Limited content was retrieved from this URL.\n</chunk>`,
+ text: `<chunk chunk_id="${chunkDoc.id}" chunk_type="url">\n${website_plain_text}\nNote: Limited content was retrieved from this URL.\n</chunk>`,
} as Observation;
}
@@ -138,7 +142,7 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam
// Process and return content if it looks good
return {
type: 'text',
- text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\n</chunk>`,
+ text: `<chunk chunk_id="${chunkDoc.id}" chunk_type="url">\n${website_plain_text}\n</chunk>`,
} as Observation;
} catch (error) {
lastError = error instanceof Error ? error.message : 'Unknown error';
@@ -156,10 +160,10 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam
}
async execute(args: ParametersType<WebsiteInfoScraperToolParamsType>): Promise<Observation[]> {
- const urls = args.urls;
+ const chunk_ids = args.chunk_ids;
// Create an array of promises, each one handling a website scrape for a URL
- const scrapingPromises = urls.map(url => this.scrapeWithRetry(url));
+ const scrapingPromises = chunk_ids.map(chunk_id => this.scrapeWithRetry(this._docManager.getDocument(chunk_id)!));
// Wait for all scraping promises to resolve
const results = await Promise.all(scrapingPromises);
diff --git a/src/client/views/nodes/chatbot/tools/WikipediaTool.ts b/src/client/views/nodes/chatbot/tools/WikipediaTool.ts
index ee815532a..ec5d83e52 100644
--- a/src/client/views/nodes/chatbot/tools/WikipediaTool.ts
+++ b/src/client/views/nodes/chatbot/tools/WikipediaTool.ts
@@ -32,7 +32,7 @@ export class WikipediaTool extends BaseTool<WikipediaToolParamsType> {
async execute(args: ParametersType<WikipediaToolParamsType>): Promise<Observation[]> {
try {
- const { text } = await Networking.PostToServer('/getWikipediaSummary', { title: args.title });
+ const { text } = (await Networking.PostToServer('/getWikipediaSummary', { title: args.title })) as { text: string };
const id = uuidv4();
const url = `https://en.wikipedia.org/wiki/${args.title.replace(/ /g, '_')}`;
this._addLinkedUrlDoc(url, id);
diff --git a/src/client/views/nodes/chatbot/types/types.ts b/src/client/views/nodes/chatbot/types/types.ts
index 90b5e7e11..0d1804b2d 100644
--- a/src/client/views/nodes/chatbot/types/types.ts
+++ b/src/client/views/nodes/chatbot/types/types.ts
@@ -101,6 +101,7 @@ export interface RAGChunk {
export interface SimplifiedChunk {
chunkId: string;
+ doc_id: string;
startPage?: number;
endPage?: number;
location?: string;
diff --git a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
index c8a6bb16b..5a09b945b 100644
--- a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
+++ b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
@@ -13,7 +13,7 @@ import { LinkManager, UPDATE_SERVER_CACHE } from '../../../../util/LinkManager';
import { DocumentView } from '../../DocumentView';
import { ChatBox, parsedDoc } from '../chatboxcomponents/ChatBox';
import { supportedDocTypes } from '../types/tool_types';
-import { CHUNK_TYPE, RAGChunk } from '../types/types';
+import { CHUNK_TYPE, RAGChunk, SimplifiedChunk } from '../types/types';
/**
* Interface representing a document in the freeform view
@@ -31,7 +31,7 @@ export class AgentDocumentManager {
private chatBox: ChatBox;
private chatBoxDocument: Doc | null = null;
private fieldMetadata: Record<string, any> = {};
- @observable private documentIdsFromChunkIds: ObservableMap<string, string>;
+ @observable private simplifiedChunks: ObservableMap<string, SimplifiedChunk>;
/**
* Creates a new DocumentManager
@@ -40,17 +40,21 @@ export class AgentDocumentManager {
constructor(chatBox: ChatBox) {
makeObservable(this);
const agentDoc = DocCast(chatBox.Document.agentDocument) ?? new Doc();
- const chunkIds = DocCast(agentDoc.chunkIds) ?? new Doc();
+ const chunk_simpl = DocCast(agentDoc.chunk_simpl) ?? new Doc();
agentDoc.title = chatBox.Document.title + '_agentDocument';
- chunkIds.title = '_chunkIds';
+ chunk_simpl.title = '_chunk_simpl';
chatBox.Document.agentDocument = agentDoc;
- DocCast(chatBox.Document.agentDocument)!.chunkIds = chunkIds;
- this.documentIdsFromChunkIds = StrListCast(chunkIds.mapping).reduce((mapping, content) => {
- const [chunkId, docId] = content.split(':');
- mapping.set(chunkId, docId);
+ DocCast(chatBox.Document.agentDocument)!.chunk_simpl = chunk_simpl;
+
+ this.simplifiedChunks = StrListCast(chunk_simpl.mapping).reduce((mapping, chunks) => {
+ StrListCast(chunks).forEach(chunk => {
+ const parsed = JSON.parse(StrCast(chunk));
+ mapping.set(parsed.chunkId, parsed);
+ });
return mapping;
- }, new ObservableMap<string, string>());
+ }, new ObservableMap<string, SimplifiedChunk>());
+
this.documentsById = StrListCast(agentDoc.mapping).reduce((mapping, content) => {
const [id, layoutId, docId] = content.split(':');
const layoutDoc = DocServer.GetCachedRefField(layoutId);
@@ -76,14 +80,10 @@ export class AgentDocumentManager {
//{ fireImmediately: true }
);
reaction(
- () => this.documentIdsFromChunkIds.values(),
+ () => this.simplifiedChunks.values(),
() => {
if (this.chatBoxDocument && DocCast(this.chatBoxDocument.agentDocument)) {
- // Store the mapping with chunkId:docId format for consistency
- const chunkIdsDoc = DocCast(DocCast(this.chatBoxDocument.agentDocument)!.chunkIds);
- if (chunkIdsDoc) {
- chunkIdsDoc.mapping = new List<string>(Array.from(this.documentIdsFromChunkIds.entries()).map(([chunkId, docId]) => `${chunkId}:${docId}`));
- }
+ DocCast(DocCast(this.chatBoxDocument.agentDocument)!.chunk_simpl)!.mapping = new List<string>(Array.from(this.simplifiedChunks.values()).map(chunk => JSON.stringify(chunk)));
}
}
//{ fireImmediately: true }
@@ -831,7 +831,8 @@ export class AgentDocumentManager {
* @param options Optional configuration options
* @returns The ID of the created document
*/
- public createDocInDash(docType: string, data: string, options?: any): string {
+
+ public async createDocInDash(docType: string, data: string, options?: any): Promise<string> {
// Validate doc_type
if (!this.isValidDocType(docType)) {
throw new Error(`Invalid document type: ${docType}`);
@@ -877,14 +878,15 @@ export class AgentDocumentManager {
// Create link and add it to the document system
const linkDoc = Docs.Create.LinkDocument(this.chatBoxDocument, doc);
LinkManager.Instance.addLink(linkDoc);
-
- // Add document to view
- this.chatBox._props.addDocument?.(doc);
-
- // Show document - defer actual display to prevent immediate resource loading
- setTimeout(() => {
- DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {});
- }, 100);
+ if (doc.type !== 'web') {
+ // Add document to view
+ this.chatBox._props.addDocument?.(doc);
+
+ // Show document - defer actual display to prevent immediate resource loading
+ setTimeout(() => {
+ DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {});
+ }, 100);
+ }
}
});
@@ -986,88 +988,19 @@ export class AgentDocumentManager {
}
/**
- * Registers chunk IDs associated with a document in the manager
- * @param docId The parent document ID
- * @param chunkIds Array of chunk IDs associated with this document
- */
- @action
- public registerChunkIds(docId: string, chunkIds: string[]): void {
- // Get the document if it exists
- const docInfo = this.documentsById.get(docId);
- if (!docInfo) {
- console.warn(`Cannot register chunks for unknown document ID: ${docId}`);
- return;
- }
-
- // Store chunk IDs on the document for future reference
- const doc = docInfo.layoutDoc;
- if (!doc.chunk_ids) {
- doc.chunk_ids = JSON.stringify(chunkIds);
- } else {
- // Merge with existing chunk IDs if they exist
- const existingIds = JSON.parse(doc.chunk_ids as string);
- const updatedIds = [...new Set([...existingIds, ...chunkIds])]; // Remove duplicates
- doc.chunk_ids = JSON.stringify(updatedIds);
- }
- for (const chunkId of chunkIds) {
- // Ensure each chunk ID can be linked back to its parent document
- // Store a mapping from chunk ID to parent document ID
- // This allows us to easily find a document by any of its chunk IDs
- if (!this.documentIdsFromChunkIds.has(chunkId) && doc) {
- this.documentIdsFromChunkIds.set(chunkId, doc[Id]);
- }
- }
- }
-
- /**
- * Gets a document ID by a chunk ID
- * @param chunkId The chunk ID to look up
- * @returns The parent document ID if found
- */
- public getDocByChunkId(chunkId: string): Doc | undefined {
- // First, look up the document ID using the chunk ID mapping
- const docId = this.documentIdsFromChunkIds.get(chunkId);
- console.log('this.documentIdsFromChunkIds', this.documentIdsFromChunkIds);
- console.log('docId', docId);
- if (!docId) {
- if (this.documentsById.has(chunkId)) {
- return this.documentsById.get(chunkId)?.layoutDoc;
- } else {
- console.error('No document found for chunkId and docId', chunkId);
- return undefined;
- }
- }
- // Then get the document using the document ID
- const docInfo = this.documentsById.get(docId);
- if (docInfo) {
- return docInfo.layoutDoc;
- }
- console.error('No document found for docId', docId);
- return undefined;
- }
-
- /**
* Adds simplified chunks to a document for citation handling
* @param doc The document to add simplified chunks to
* @param chunks Array of full RAG chunks to simplify
* @param docType The type of document (e.g., 'pdf', 'video', 'audio', etc.)
* @returns The updated document with simplified chunks
*/
- public addSimplifiedChunks(doc: Doc, chunks: RAGChunk[], docType: string): Doc {
- if (!doc) {
- console.error('Cannot add simplified chunks to null document');
- return doc;
- }
-
- // Initialize empty chunks array if not exists
- if (!doc.chunk_simpl) {
- doc.chunk_simpl = JSON.stringify({ chunks: [] });
- }
-
+ @action
+ public addSimplifiedChunks(chunks: RAGChunk[], docType: string) {
+ console.log('chunks', chunks, 'simplifiedChunks', this.simplifiedChunks);
// Create array of simplified chunks based on document type
- const simplifiedChunks = chunks.map(chunk => {
+ for (const chunk of chunks) {
// Common properties across all chunk types
- const baseChunk = {
+ const baseChunk: SimplifiedChunk = {
chunkId: chunk.id,
text: chunk.metadata.text,
doc_id: chunk.metadata.doc_id,
@@ -1076,38 +1009,33 @@ export class AgentDocumentManager {
// Add type-specific properties
if (docType === 'video' || docType === 'audio') {
- return {
+ this.simplifiedChunks.set(chunk.id, {
...baseChunk,
start_time: chunk.metadata.start_time,
end_time: chunk.metadata.end_time,
indexes: chunk.metadata.indexes,
chunkType: docType === 'video' ? CHUNK_TYPE.VIDEO : CHUNK_TYPE.AUDIO,
- };
+ } as SimplifiedChunk);
} else if (docType === 'pdf') {
- return {
+ this.simplifiedChunks.set(chunk.id, {
...baseChunk,
startPage: chunk.metadata.start_page,
endPage: chunk.metadata.end_page,
location: chunk.metadata.location,
- };
+ } as SimplifiedChunk);
} else if (docType === 'csv') {
- return {
+ this.simplifiedChunks.set(chunk.id, {
...baseChunk,
rowStart: (chunk.metadata as any).row_start,
rowEnd: (chunk.metadata as any).row_end,
colStart: (chunk.metadata as any).col_start,
colEnd: (chunk.metadata as any).col_end,
- };
+ } as SimplifiedChunk);
} else {
// Default for other document types
- return baseChunk;
+ this.simplifiedChunks.set(chunk.id, baseChunk as SimplifiedChunk);
}
- });
- console.log('simplifiedChunks', simplifiedChunks);
- // Update the document with all simplified chunks at once
- doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks });
-
- return doc;
+ }
}
/**
@@ -1116,21 +1044,10 @@ export class AgentDocumentManager {
* @param chunkId The ID of the chunk to retrieve
* @returns The simplified chunk if found, undefined otherwise
*/
- public getSimplifiedChunkById(doc: Doc, chunkId: string): any | undefined {
- let chunks: any[] = [];
- if (!doc || !doc.chunk_simpl) {
- chunks = [];
- console.warn('No chunk found for chunkId', chunkId, '. Checking if document exists in documentsById.');
- return [];
- }
- try {
- const parsed = JSON.parse(StrCast(doc.chunk_simpl));
- chunks = parsed.chunks || [];
- } catch (e) {
- console.error('Error parsing simplified chunks:', e);
- return [];
- }
- return chunks.find(chunk => chunk.chunkId === chunkId);
+ public getSimplifiedChunkById(chunkId: string): any | undefined {
+ console.log('chunkId', chunkId, 'simplifiedChunks', this.simplifiedChunks);
+ console.log('doc', this.getDocument(this.simplifiedChunks.get(chunkId)?.doc_id || ''));
+ return { foundChunk: this.simplifiedChunks.get(chunkId), doc: this.getDocument(this.simplifiedChunks.get(chunkId)?.doc_id || '') };
}
/**
@@ -1150,27 +1067,4 @@ export class AgentDocumentManager {
return [];
}
}
-
- /**
- * Gets all document summaries combined into a single string
- * @returns String containing all document summaries
- */
- public getAllDocumentSummaries(): string {
- const summaries = Array.from(this.documentsById.keys())
- .map(id => {
- const doc = this.getDocument(id);
- if (doc) {
- // Try to get summary from either the document or its data document
- const summary = doc.summary || (doc[DocData] && doc[DocData].summary);
- if (summary) {
- return StrCast(summary);
- }
- }
- return null;
- })
- .filter(Boolean)
- .join('\n\n');
-
- return summaries;
- }
}
diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
index 1349df483..f1fae6f11 100644
--- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
+++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
@@ -148,10 +148,6 @@ export class Vectorstore {
// Generate chunk IDs upfront so we can register them
const chunkIds = segmentedTranscript.map(() => uuidv4());
-
- // Register all chunk IDs with the document manager
- this.docManager.registerChunkIds(doc_id, chunkIds);
-
// Add transcript and embeddings to metadata
result = {
doc_id,
@@ -185,7 +181,7 @@ export class Vectorstore {
doc.segmented_transcript = JSON.stringify(segmentedTranscript);
// Use doc manager to add simplified chunks
const docType = local_file_path.endsWith('.mp3') ? 'audio' : 'video';
- this.docManager.addSimplifiedChunks(doc, result.chunks, docType);
+ this.docManager.addSimplifiedChunks(result.chunks, docType);
} else {
// Process regular document
console.log('Processing regular document...');
@@ -216,13 +212,10 @@ export class Vectorstore {
console.log('doc_id in vectorstore', result.doc_id, 'does not match doc_id in doc', doc[Id]);
}
- // Register chunks with the document manager
- this.docManager.registerChunkIds(result.doc_id, chunkIds);
-
// Use doc manager to add simplified chunks - determine document type from file extension
const fileExt = path.extname(local_file_path).toLowerCase();
const docType = fileExt === '.pdf' ? 'pdf' : fileExt === '.csv' ? 'csv' : 'text';
- this.docManager.addSimplifiedChunks(doc, result.chunks, docType);
+ this.docManager.addSimplifiedChunks(result.chunks, docType);
doc.summary = result.summary;
doc.ai_purpose = result.purpose;
@@ -351,16 +344,6 @@ export class Vectorstore {
},
} as RAGChunk;
- // Ensure the document manager knows about this chunk
- // This is important for maintaining backwards compatibility
- if (chunk.id && !this.docManager.getDocByChunkId(chunk.id)) {
- // If the chunk ID isn't registered but we have a doc_id in metadata
- if (chunk.metadata.doc_id && this.docManager.has(chunk.metadata.doc_id)) {
- // Register the chunk with its parent document
- this.docManager.registerChunkIds(chunk.metadata.doc_id, [chunk.id]);
- }
- }
-
return chunk;
});
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index 378f14094..b7ce4f663 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -514,30 +514,37 @@ export default class AssistantManager extends ApiManager {
await browser.close();
browser = null;
- // Use a try-catch block specifically for JSDOM parsing
+ let extractedText = '';
+
+ // First try with Readability
try {
// Parse HTML content using JSDOM
const dom = new JSDOM(htmlContent, { url });
// Extract readable content using Mozilla's Readability API
- const reader = new Readability(dom.window.document);
+ const reader = new Readability(dom.window.document, {
+ // Readability configuration to focus on text content
+ charThreshold: 100,
+ keepClasses: false,
+ });
const article = reader.parse();
- if (article) {
- const plainText = article.textContent;
- res.send({ website_plain_text: plainText });
+ if (article && article.textContent) {
+ extractedText = article.textContent;
} else {
- // If Readability fails, fallback to extracting main content
- const mainContent = await extractMainContent(htmlContent);
- res.send({ website_plain_text: mainContent });
+ // If Readability doesn't return useful content, try alternate method
+ extractedText = await extractEnhancedContent(htmlContent);
}
} catch (parsingError) {
- console.error('Error parsing website content:', parsingError);
-
- // Fallback to a simplified extraction method
- const mainContent = await extractMainContent(htmlContent);
- res.send({ website_plain_text: mainContent });
+ console.error('Error parsing website content with Readability:', parsingError);
+ // Fallback to enhanced content extraction
+ extractedText = await extractEnhancedContent(htmlContent);
}
+
+ // Clean up the extracted text
+ extractedText = cleanupText(extractedText);
+
+ res.send({ website_plain_text: extractedText });
} catch (error) {
console.error('Error scraping website:', error);
@@ -985,48 +992,119 @@ function spawnPythonProcess(jobId: string, file_path: string, doc_id: string) {
}
/**
- * Extracts main content from HTML by removing scripts, styles, and non-content elements
- * Used as a fallback when Readability fails
+ * Enhanced content extraction that focuses on meaningful text content.
* @param html The HTML content to process
- * @returns Extracted main text content
+ * @returns Extracted and cleaned text content
*/
-async function extractMainContent(html: string): Promise<string> {
+async function extractEnhancedContent(html: string): Promise<string> {
try {
- // Create a simple DOM to extract content
+ // Create DOM to extract content
const dom = new JSDOM(html, { runScripts: 'outside-only' });
const document = dom.window.document;
- // Remove scripts, styles, and other non-content elements
- const elementsToRemove = ['script', 'style', 'iframe', 'noscript', 'svg', 'header', 'footer', 'nav', 'aside', 'ads', 'banner', 'form', 'button', 'input'];
-
- elementsToRemove.forEach(tag => {
- const elements = document.querySelectorAll(tag);
+ // Remove all non-content elements
+ const elementsToRemove = [
+ 'script',
+ 'style',
+ 'iframe',
+ 'noscript',
+ 'svg',
+ 'canvas',
+ 'header',
+ 'footer',
+ 'nav',
+ 'aside',
+ 'form',
+ 'button',
+ 'input',
+ 'select',
+ 'textarea',
+ 'meta',
+ 'link',
+ 'img',
+ 'video',
+ 'audio',
+ '.ad',
+ '.ads',
+ '.advertisement',
+ '.banner',
+ '.cookie',
+ '.popup',
+ '.modal',
+ '.newsletter',
+ '[role="banner"]',
+ '[role="navigation"]',
+ '[role="complementary"]',
+ ];
+
+ elementsToRemove.forEach(selector => {
+ const elements = document.querySelectorAll(selector);
elements.forEach(el => el.remove());
});
- // Try to find the main content container using common selectors
- const mainSelectors = ['main', 'article', '#content', '.content', '#main', '.main', '.post-content', '.article-content', '.entry-content'];
-
- let mainContent = '';
-
- // Try each selector to find main content
- for (const selector of mainSelectors) {
- const element = document.querySelector(selector);
- if (element && element.textContent && element.textContent.trim().length > 100) {
- mainContent = element.textContent;
- break;
+ // Get all text paragraphs with meaningful content
+ const contentElements = [
+ ...Array.from(document.querySelectorAll('p')),
+ ...Array.from(document.querySelectorAll('h1')),
+ ...Array.from(document.querySelectorAll('h2')),
+ ...Array.from(document.querySelectorAll('h3')),
+ ...Array.from(document.querySelectorAll('h4')),
+ ...Array.from(document.querySelectorAll('h5')),
+ ...Array.from(document.querySelectorAll('h6')),
+ ...Array.from(document.querySelectorAll('li')),
+ ...Array.from(document.querySelectorAll('td')),
+ ...Array.from(document.querySelectorAll('article')),
+ ...Array.from(document.querySelectorAll('section')),
+ ...Array.from(document.querySelectorAll('div:not([class]):not([id])')),
+ ];
+
+ // Extract text from content elements that have meaningful text
+ let contentParts: string[] = [];
+ contentElements.forEach(el => {
+ const text = el.textContent?.trim();
+ // Only include elements with substantial text (more than just a few characters)
+ if (text && text.length > 10 && !contentParts.includes(text)) {
+ contentParts.push(text);
}
- }
+ });
- // If no main content found with selectors, use body content
- if (!mainContent || mainContent.length < 200) {
- mainContent = document.body.textContent || '';
+ // If no significant content found with selective approach, fallback to body
+ if (contentParts.length < 3) {
+ return document.body.textContent || '';
}
- // Clean up the text
- return mainContent.replace(/\s+/g, ' ').replace(/\n+/g, '\n').trim();
+ return contentParts.join('\n\n');
} catch (error) {
- console.error('Error extracting main content:', error);
+ console.error('Error extracting enhanced content:', error);
return 'Failed to extract content from the webpage.';
}
}
+
+/**
+ * Cleans up extracted text to improve readability and focus on useful content.
+ * @param text The raw extracted text
+ * @returns Cleaned and formatted text
+ */
+function cleanupText(text: string): string {
+ if (!text) return '';
+
+ return (
+ text
+ // Remove excessive whitespace and normalize line breaks
+ .replace(/\s+/g, ' ')
+ .replace(/\n\s*\n\s*\n+/g, '\n\n')
+ // Remove common boilerplate phrases
+ .replace(/cookie policy|privacy policy|terms of service|all rights reserved|copyright ©/gi, '')
+ // Remove email addresses
+ .replace(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g, '')
+ // Remove URLs
+ .replace(/https?:\/\/[^\s]+/g, '')
+ // Remove social media handles
+ .replace(/@[a-zA-Z0-9_]+/g, '')
+ // Clean up any remaining HTML tags that might have been missed
+ .replace(/<[^>]*>/g, '')
+ // Fix spacing issues after cleanup
+ .replace(/ +/g, ' ')
+ .trim()
+ );
+}