diff options
author | A.J. Shulman <Shulman.aj@gmail.com> | 2024-07-15 12:31:35 -0400 |
---|---|---|
committer | A.J. Shulman <Shulman.aj@gmail.com> | 2024-07-15 12:31:35 -0400 |
commit | ef79b7d617035c52fea159225ba9a39b8222e8f4 (patch) | |
tree | 2ad28baadada16e2688f922c906216b39652c28b | |
parent | f1cdfc1d02488c4a513fbf67f729f702526a345d (diff) |
citation parsing working much better
-rw-r--r-- | .vscode/launch.json | 15 | ||||
-rw-r--r-- | src/client/views/nodes/ChatBox/Agent.ts | 8 | ||||
-rw-r--r-- | src/client/views/nodes/ChatBox/AnswerParser.ts | 60 | ||||
-rw-r--r-- | src/client/views/nodes/ChatBox/ChatBox.tsx | 52 | ||||
-rw-r--r-- | src/client/views/nodes/ChatBox/MessageComponent.tsx | 95 | ||||
-rw-r--r-- | src/client/views/nodes/ChatBox/types.ts | 18 | ||||
-rw-r--r-- | src/client/views/nodes/ChatBox/vectorstore/VectorstoreUpload.ts | 5 |
7 files changed, 138 insertions, 115 deletions
diff --git a/.vscode/launch.json b/.vscode/launch.json index e4c31361c..e9a07cf93 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -59,7 +59,7 @@ "webRoot": "${workspaceFolder}" }, { - "type": "node", + "type": "pwa-node", "request": "attach", "name": "Typescript Server", "protocol": "inspector", @@ -68,7 +68,7 @@ "remoteRoot": "${workspaceFolder}" }, { - "type": "node", + "type": "pwa-node", "request": "launch", "name": "Current TS File", "runtimeExecutable": "npx", @@ -76,7 +76,7 @@ "port": 9229 }, { - "type": "node", + "type": "pwa-node", "request": "launch", "name": "Mocha Tests", "program": "${workspaceFolder}/node_modules/mocha/bin/_mocha", @@ -86,7 +86,7 @@ "protocol": "inspector" }, { - "type": "node", + "type": "pwa-node", "request": "launch", "name": "Mocha Current File", "program": "${workspaceFolder}/node_modules/mocha/bin/_mocha", @@ -96,9 +96,6 @@ "protocol": "inspector" } ], - - "resolveSourceMapLocations": [ - "${workspaceFolder}/**", - "!**/node_modules/**" - ] + + "resolveSourceMapLocations": ["${workspaceFolder}/**", "!**/node_modules/**"] } diff --git a/src/client/views/nodes/ChatBox/Agent.ts b/src/client/views/nodes/ChatBox/Agent.ts index 6757b2ce8..fd3c6e5e8 100644 --- a/src/client/views/nodes/ChatBox/Agent.ts +++ b/src/client/views/nodes/ChatBox/Agent.ts @@ -95,6 +95,8 @@ export class Agent { break; } else { console.log('Error: No valid action'); + this.interMessages.push({ role: 'user', content: 'No valid action, try again.' }); + break; } } else if (key === 'action_input') { const actionInput = builder.build({ action_input: step[key] }); @@ -116,11 +118,7 @@ export class Agent { } } else if (key === 'answer') { console.log('Answer found. Ending query.'); - const answerContent = builder.build({ answer: step[key] }); - this.messages.push({ role: 'assistant', content: answerContent }); - this.interMessages = []; - console.log(this.messages); - return answerContent; + return result; } } } diff --git a/src/client/views/nodes/ChatBox/AnswerParser.ts b/src/client/views/nodes/ChatBox/AnswerParser.ts new file mode 100644 index 000000000..f77d2261d --- /dev/null +++ b/src/client/views/nodes/ChatBox/AnswerParser.ts @@ -0,0 +1,60 @@ +import { ASSISTANT_ROLE, AssistantMessage, Citation, getChunkType } from './types'; + +export class AnswerParser { + static parse(xml: string): AssistantMessage { + const answerRegex = /<answer>([\s\S]*?)<\/answer>/; + const citationRegex = /<citation chunk_id="([^"]+)" type="([^"]+)">(.*?)<\/citation>/g; + const followUpQuestionsRegex = /<follow_up_questions>([\s\S]*?)<\/follow_up_questions>/; + const questionRegex = /<question>(.*?)<\/question>/g; + + const answerMatch = answerRegex.exec(xml); + const followUpQuestionsMatch = followUpQuestionsRegex.exec(xml); + + if (!answerMatch) { + throw new Error('Invalid XML: Missing <answer> tag.'); + } + + const rawTextContent = answerMatch[1].trim(); + const textContentWithCitations = rawTextContent.replace(citationRegex, ''); + const textContent = textContentWithCitations.replace(followUpQuestionsRegex, '').trim(); + + let citations: Citation[] = []; + let match: RegExpExecArray | null; + + let plainTextOffset = 0; + let citationOffset = 0; + + while ((match = citationRegex.exec(rawTextContent)) !== null) { + const [fullMatch, chunk_id, type, direct_text] = match; + const citationStartIndex = match.index; + const citationPlainStart = citationStartIndex - citationOffset; + + citations.push({ + direct_text: direct_text.trim(), + type: getChunkType(type), + chunk_id: chunk_id, + location: citationPlainStart, + }); + + citationOffset += fullMatch.length; + } + + let followUpQuestions: string[] = []; + if (followUpQuestionsMatch) { + const questionsText = followUpQuestionsMatch[1]; + let questionMatch: RegExpExecArray | null; + + while ((questionMatch = questionRegex.exec(questionsText)) !== null) { + followUpQuestions.push(questionMatch[1].trim()); + } + } + const assistantResponse: AssistantMessage = { + role: ASSISTANT_ROLE.ASSISTANT, + text_content: textContent, + follow_up_questions: followUpQuestions, + citations: citations, + }; + + return assistantResponse; + } +} diff --git a/src/client/views/nodes/ChatBox/ChatBox.tsx b/src/client/views/nodes/ChatBox/ChatBox.tsx index 9b2a92564..bae6bbaa6 100644 --- a/src/client/views/nodes/ChatBox/ChatBox.tsx +++ b/src/client/views/nodes/ChatBox/ChatBox.tsx @@ -12,7 +12,7 @@ import { ViewBoxAnnotatableComponent } from '../../DocComponent'; import { FieldView, FieldViewProps } from '../FieldView'; import './ChatBox.scss'; import MessageComponentBox from './MessageComponent'; -import { ASSISTANT_ROLE, AssistantMessage, AI_Document, convertToAIDocument, Citation } from './types'; +import { ASSISTANT_ROLE, AssistantMessage, AI_Document, convertToAIDocument, Citation, CHUNK_TYPE } from './types'; import { Vectorstore } from './vectorstore/VectorstoreUpload'; import { CollectionFreeFormDocumentView } from '../CollectionFreeFormDocumentView'; import { CollectionFreeFormView } from '../../collections/collectionFreeForm'; @@ -20,6 +20,7 @@ import { Agent } from './Agent'; import dotenv from 'dotenv'; import { DocData } from '../../../../fields/DocSymbols'; import { DocumentView } from '../DocumentView'; +import { AnswerParser } from './AnswerParser'; dotenv.config(); @observer @@ -47,14 +48,13 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { constructor(props: FieldViewProps) { super(props); makeObservable(this); - this.openai = this.initializeOpenAI(); - this.history = [{ role: ASSISTANT_ROLE.ASSISTANT, text: 'Welcome to the Document Analyser Assistant! Link a document or ask questions to get started.' }]; + this.history = [{ role: ASSISTANT_ROLE.ASSISTANT, text_content: 'Welcome to the Document Analyser Assistant! Link a document or ask questions to get started.' }]; this.openai = this.initializeOpenAI(); this.vectorstore = new Vectorstore(); this.agent = new Agent(this.vectorstore); // Initialize the Agent reaction( - () => this.history.map((msg: AssistantMessage) => ({ role: msg.role, text: msg.text, follow_up_questions: msg.follow_up_questions, citations: msg.citations })), + () => this.history.map((msg: AssistantMessage) => ({ role: msg.role, text_content: msg.text_content, follow_up_questions: msg.follow_up_questions, citations: msg.citations })), serializableHistory => { this.dataDoc.data = JSON.stringify(serializableHistory); } @@ -110,19 +110,19 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { try { textInput.value = ''; runInAction(() => { - this.history.push({ role: ASSISTANT_ROLE.USER, text: trimmedText }); + this.history.push({ role: ASSISTANT_ROLE.USER, text_content: trimmedText }); this.isLoading = true; }); const response = await this.agent.askAgent(trimmedText); // Use the chatbot to get the response runInAction(() => { - this.history.push(this.parseAssistantResponse(response)); + this.history.push(AnswerParser.parse(response)); }); this.dataDoc.data = JSON.stringify(this.history); } catch (err) { console.error('Error:', err); runInAction(() => { - this.history.push({ role: ASSISTANT_ROLE.ASSISTANT, text: 'Sorry, I encountered an error while processing your request.' }); + this.history.push({ role: ASSISTANT_ROLE.ASSISTANT, text_content: 'Sorry, I encountered an error while processing your request.' }); }); } finally { runInAction(() => { @@ -132,35 +132,6 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { } }; - parseAssistantResponse(response: string): AssistantMessage { - const parser = new DOMParser(); - const xmlDoc = parser.parseFromString(response, 'text/xml'); - const answerElement = xmlDoc.querySelector('answer'); - const followUpQuestionsElement = xmlDoc.querySelector('follow_up_questions'); - - let text = ''; - let followUpQuestions: string[] = []; - - if (answerElement) { - // Remove the follow_up_questions element from the answer - const followUpElement = answerElement.querySelector('follow_up_questions'); - if (followUpElement) { - followUpElement.remove(); - } - text = answerElement.innerHTML.trim(); - } - - if (followUpQuestionsElement) { - followUpQuestions = Array.from(followUpQuestionsElement.querySelectorAll('question')).map(q => q.textContent || ''); - } - - return { - role: ASSISTANT_ROLE.ASSISTANT, - text, - follow_up_questions: followUpQuestions, - }; - } - @action updateMessageCitations = (index: number, citations: Citation[]) => { if (this.history[index]) { @@ -219,7 +190,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { this.history.push( ...storedHistory.map((msg: AssistantMessage) => ({ role: msg.role, - text: msg.text, + text_content: msg.text_content, follow_up_questions: msg.follow_up_questions, citations: msg.citations, })) @@ -246,6 +217,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { if ((change as any).addedCount > 0) { // maybe check here if its already in the urls datadoc array so doesn't add twice console.log((change as any).added as Doc[]); + console.log('here!'); this.addDocsToVectorstore((change as any).added as Doc[]); } // (change as any).removed.forEach((link: any) => remLinkFromDoc(toRealField(link))); @@ -265,7 +237,11 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { @computed get visibleDocs() { - return (CollectionFreeFormDocumentView.from(this._props.DocumentView?.())?._props.parent as CollectionFreeFormView)?.childDocs.filter(doc => doc != this.Document) ?? []; + //return (CollectionFreeFormDocumentView.from(this._props.DocumentView?.())?._props.parent as CollectionFreeFormView)?.childDocs.filter(doc => doc != this.Document) ?? []; + return LinkManager.Instance.getAllRelatedLinks(this.Document) + .map(d => DocCast(LinkManager.getOppositeAnchor(d, this.Document))) + .map(d => DocCast(d?.annotationOn, d)) + .filter(d => d); } @action diff --git a/src/client/views/nodes/ChatBox/MessageComponent.tsx b/src/client/views/nodes/ChatBox/MessageComponent.tsx index 38faf7e00..76faff10b 100644 --- a/src/client/views/nodes/ChatBox/MessageComponent.tsx +++ b/src/client/views/nodes/ChatBox/MessageComponent.tsx @@ -1,9 +1,6 @@ import React from 'react'; import { observer } from 'mobx-react'; -import { AssistantMessage, CHUNK_TYPE, Citation } from './types'; -import { TbInfoCircleFilled } from 'react-icons/tb'; -import { Docs } from '../../../documents/Documents'; -import { DocumentType } from '../../../documents/DocumentTypes'; +import { AssistantMessage, Citation } from './types'; interface MessageComponentProps { message: AssistantMessage; @@ -14,59 +11,53 @@ interface MessageComponentProps { } const MessageComponentBox: React.FC<MessageComponentProps> = function ({ message, index, onFollowUpClick, onCitationClick, updateMessageCitations }) { - // public static LayoutString(fieldKey: string) { - // return FieldView.LayoutString(MessageComponentBox, fieldKey); - // } - - // the presentation view that renders this slide - - // @computed - // get chatBoxView() { - // return this.DocumentView?.().containerViewPath?.().lastElement()?.ComponentView as ChatBox; - // } + const renderContent = (content: string) => { + if (!message.citations || message.citations.length === 0) { + return content; + } - const renderContent = (text: string) => { - const citationRegex = /<citation chunk_id="([^"]*)" type="([^"]*)">([^<]*)<\/citation>/g; const parts = []; let lastIndex = 0; - let match; - const citations: Citation[] = []; - - while ((match = citationRegex.exec(text)) !== null) { - const [fullMatch, chunkId, type, content] = match; - const citation: Citation = { chunk_id: chunkId, type: type as CHUNK_TYPE, text: content }; - citations.push(citation); - parts.push(text.slice(lastIndex, match.index)); - parts.push( - <a - key={chunkId} - href="#" - onClick={e => { - e.preventDefault(); - onCitationClick(citation); - }} + message.citations.forEach((citation, idx) => { + const location = citation.location; + const textBefore = content.slice(lastIndex, location); + const citationButton = ( + <button + key={idx} + className="citation-button" + onClick={() => onCitationClick(citation)} style={{ - color: 'lightblue', - verticalAlign: 'super', - fontSize: 'smaller', + display: 'inline-flex', + alignItems: 'center', + justifyContent: 'center', + width: '20px', + height: '20px', + borderRadius: '50%', + border: 'none', + background: '#007bff', + color: 'white', + fontSize: '12px', + fontWeight: 'bold', + cursor: 'pointer', + margin: '0 2px', + padding: 0, }}> - <TbInfoCircleFilled /> - </a> + {idx + 1} + </button> ); - lastIndex = match.index + fullMatch.length; - } - - parts.push(text.slice(lastIndex)); + parts.push(textBefore, citationButton); + lastIndex = location; + }); - updateMessageCitations(index, citations); + parts.push(content.slice(lastIndex)); - return <>{parts}</>; + return parts; }; return ( <div className={`message ${message.role}`}> - <div>{renderContent(message.text)}</div> + <div>{renderContent(message.text_content)}</div> {message.follow_up_questions && message.follow_up_questions.length > 0 && ( <div className="follow-up-questions"> <h4>Follow-up Questions:</h4> @@ -81,20 +72,4 @@ const MessageComponentBox: React.FC<MessageComponentProps> = function ({ message ); }; -// Docs.Prototypes.TemplateMap.set(DocumentType.MESSAGE, { -// layout: { view: MessageComponentBox, dataField: 'data' }, -// options: { -// acl: '', -// _height: 35, -// _xMargin: 10, -// _yMargin: 10, -// _layout_nativeDimEditable: true, -// _layout_reflowVertical: true, -// _layout_reflowHorizontal: true, -// defaultDoubleClick: 'ignore', -// systemIcon: 'BsFileEarmarkTextFill', -// layout_borderRounding: '10px', -// }, -// }); - export default observer(MessageComponentBox); diff --git a/src/client/views/nodes/ChatBox/types.ts b/src/client/views/nodes/ChatBox/types.ts index 0270b6256..a0d295e92 100644 --- a/src/client/views/nodes/ChatBox/types.ts +++ b/src/client/views/nodes/ChatBox/types.ts @@ -9,17 +9,31 @@ export enum CHUNK_TYPE { TABLE = 'table', } +export function getChunkType(type: string): CHUNK_TYPE { + switch (type.toLowerCase()) { + case 'text': + return CHUNK_TYPE.TEXT; + case 'image': + return CHUNK_TYPE.IMAGE; + case 'table': + return CHUNK_TYPE.TABLE; + default: + return CHUNK_TYPE.TEXT; + } +} + export interface AssistantMessage { role: ASSISTANT_ROLE; - text: string; + text_content: string; follow_up_questions?: string[]; citations?: Citation[]; } export interface Citation { - text: string; + direct_text?: string; type: CHUNK_TYPE; chunk_id: string; + location: number; } export interface Chunk { diff --git a/src/client/views/nodes/ChatBox/vectorstore/VectorstoreUpload.ts b/src/client/views/nodes/ChatBox/vectorstore/VectorstoreUpload.ts index 5e8e6b23a..d3b1cb4e7 100644 --- a/src/client/views/nodes/ChatBox/vectorstore/VectorstoreUpload.ts +++ b/src/client/views/nodes/ChatBox/vectorstore/VectorstoreUpload.ts @@ -58,14 +58,17 @@ export class Vectorstore { visible_docs?.forEach(async doc => { await this.addAIDoc(doc); }); + return; } async addAIDoc(doc: Doc) { if (doc[DocData]?.ai_document) { this.documents.push(convertToAIDocument(JSON.parse(StrCast(doc[DocData].ai_document)))); - console.log(`Document already added: ${doc[DocData].file_name}`); + console.log(`Document already added: ${doc.file_name}`); } else { console.log(doc); + console.log(PDFCast(doc.data)?.url?.pathname); + console.log(CsvCast(doc.data)?.url?.pathname); const local_file_path: string = CsvCast(doc.data)?.url?.pathname ?? PDFCast(doc.data)?.url?.pathname; console.log('Local File Path:', local_file_path); if (local_file_path) { |