/**
 * @file AnswerParser.ts
 * @description Defines the AnswerParser class, which processes structured XML-like responses
 * from the AI system: grounded text, normal text, citations, follow-up questions, and loop
 * summaries. The parser converts the response into an AssistantMessage, extracting the
 * citations and attaching the processing steps for further use in the assistant's workflow.
 */
import { v4 as uuid } from 'uuid';
import {
  ASSISTANT_ROLE,
  AssistantMessage,
  Citation,
  ProcessingInfo,
  TEXT_TYPE,
  getChunkType,
} from '../types/types';

/**
 * Parses the XML-like answer emitted by the AI system into an AssistantMessage.
 *
 * Recognised tags: `<answer>`, `<normal_text>`, `<grounded_text>`, `<citations>` /
 * `<citation>`, `<follow_up_questions>` / `<question>` and `<loop_summary>`.
 *
 * NOTE(review): the attribute names read from `<citation>` (`index`, `chunk_id`, `type`)
 * and the assumption that `<grounded_text>` carries its comma-separated citation indexes
 * in its first attribute are reconstructed from how the captured values are used here.
 * Confirm them against the prompt/template that produces this XML.
 */
export class AnswerParser {
  /**
   * Converts a raw XML-like response into an AssistantMessage.
   *
   * @param xml - Raw response text; must contain an `<answer>…</answer>` element.
   * @param processingInfo - Processing steps, copied unchanged into `processing_info`.
   * @returns The assistant message with ordered content segments, citations (each given a
   * freshly generated `citation_id`), follow-up questions and the optional loop summary.
   * @throws Error when no `<answer>` element is present.
   */
  static parse(xml: string, processingInfo: ProcessingInfo[]): AssistantMessage {
    const answerMatch = /<answer>([\s\S]*?)<\/answer>/.exec(xml);
    if (!answerMatch) {
      throw new Error('Invalid XML: Missing <answer> tag.');
    }

    // These sections are searched for in the whole input, not only inside <answer>.
    const citationsMatch = /<citations>([\s\S]*?)<\/citations>/.exec(xml);
    const followUpQuestionsMatch = /<follow_up_questions>([\s\S]*?)<\/follow_up_questions>/.exec(xml);
    const loopSummaryMatch = /<loop_summary>([\s\S]*?)<\/loop_summary>/.exec(xml);

    // Cut the structured sections out of the answer body so only display text remains.
    let rawTextContent = (answerMatch[1] ?? '').trim();
    for (const section of [citationsMatch, followUpQuestionsMatch, loopSummaryMatch]) {
      if (section) {
        rawTextContent = rawTextContent.replace(section[0], '').trim();
      }
    }

    const { citations, citationIdByIndex } = AnswerParser.parseCitations(citationsMatch?.[1] ?? '');

    // Unwrap <normal_text>: whatever is not inside <grounded_text> is treated as normal text.
    rawTextContent = rawTextContent.replace(/<normal_text>([\s\S]*?)<\/normal_text>/g, '$1');

    return {
      role: ASSISTANT_ROLE.ASSISTANT,
      content: AnswerParser.parseContent(rawTextContent, citationIdByIndex),
      follow_up_questions: AnswerParser.parseFollowUpQuestions(followUpQuestionsMatch?.[1] ?? ''),
      citations,
      processing_info: processingInfo,
      loop_summary: loopSummaryMatch ? (loopSummaryMatch[1] ?? '').trim() : undefined,
    };
  }

  /** Collects the `name="value"` pairs found in the attribute part of an opening tag. */
  private static readAttributes(rawAttributes: string): Map<string, string> {
    const attributeRegex = /([\w-]+)\s*=\s*"([^"]*)"/g;
    const attributes = new Map<string, string>();
    let attributeMatch: RegExpExecArray | null;
    while ((attributeMatch = attributeRegex.exec(rawAttributes)) !== null) {
      attributes.set(attributeMatch[1] ?? '', attributeMatch[2] ?? '');
    }
    return attributes;
  }

  /** Extracts every well-formed `<citation>` and maps its `index` attribute to a new citation id. */
  private static parseCitations(citationsContent: string): {
    citations: Citation[];
    citationIdByIndex: Map<string, string>;
  } {
    // `\b` keeps this pattern from matching the enclosing <citations> tag.
    const citationRegex = /<citation\b([^>]*)>([\s\S]*?)<\/citation>/g;
    const citations: Citation[] = [];
    const citationIdByIndex = new Map<string, string>();

    let citationMatch: RegExpExecArray | null;
    while ((citationMatch = citationRegex.exec(citationsContent)) !== null) {
      const attributes = AnswerParser.readAttributes(citationMatch[1] ?? '');
      const index = attributes.get('index');
      const chunk_id = attributes.get('chunk_id');
      const type = attributes.get('type');
      if (index === undefined || chunk_id === undefined || type === undefined) {
        // A citation missing any attribute cannot be referenced or classified; skip it
        // instead of failing the whole parse.
        continue;
      }

      const citation_id = uuid();
      citationIdByIndex.set(index.trim(), citation_id);
      citations.push({
        direct_text: (citationMatch[2] ?? '').trim(),
        type: getChunkType(type),
        chunk_id,
        citation_id,
      });
    }

    return { citations, citationIdByIndex };
  }

  /** Splits the answer body into ordered normal and grounded text segments. */
  private static parseContent(
    text: string,
    citationIdByIndex: Map<string, string>,
  ): AssistantMessage['content'] {
    // Group 1: value of the first double-quoted attribute (comma-separated citation
    // indexes), if any. Group 2: the grounded text itself.
    const groundedTextRegex =
      /<grounded_text\b(?:\s+[\w-]+\s*=\s*"([^"]*)")?[^>]*>([\s\S]*?)<\/grounded_text>/g;
    const content: AssistantMessage['content'] = [];
    let contentIndex = 0;
    let cursor = 0;

    // Whitespace-only gaps between grounded segments produce no content entry.
    const pushNormalText = (segment: string): void => {
      const normalText = segment.trim();
      if (normalText) {
        content.push({
          index: contentIndex++,
          type: TEXT_TYPE.NORMAL,
          text: normalText,
          citation_ids: null,
        });
      }
    };

    let groundedMatch: RegExpExecArray | null;
    while ((groundedMatch = groundedTextRegex.exec(text)) !== null) {
      pushNormalText(text.slice(cursor, groundedMatch.index));

      // Unknown indexes keep their slot as '' so positions still line up with the index list.
      const citation_ids = (groundedMatch[1] ?? '')
        .split(',')
        .map(token => token.trim())
        .filter(token => token !== '')
        .map(token => citationIdByIndex.get(token) ?? '');

      content.push({
        index: contentIndex++,
        type: TEXT_TYPE.GROUNDED,
        text: (groundedMatch[2] ?? '').trim(),
        citation_ids,
      });
      cursor = groundedMatch.index + groundedMatch[0].length;
    }

    // Trailing text after the last grounded segment (or the whole body when there is none).
    pushNormalText(text.slice(cursor));
    return content;
  }

  /** Extracts the trimmed text of every `<question>` element. */
  private static parseFollowUpQuestions(questionsText: string): string[] {
    const questionRegex = /<question>([\s\S]*?)<\/question>/g;
    const questions: string[] = [];
    let questionMatch: RegExpExecArray | null;
    while ((questionMatch = questionRegex.exec(questionsText)) !== null) {
      questions.push((questionMatch[1] ?? '').trim());
    }
    return questions;
  }
}