aboutsummaryrefslogtreecommitdiff
path: root/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
blob: f10e889e23c45c37d88935cb024f92e2b32981d8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
/**
 * @file Vectorstore.ts
 * @description This file defines the Vectorstore class, which integrates with Pinecone for vector-based document indexing and OpenAI text-embedding-3-large for text embeddings.
 * It manages AI document handling, including adding documents, processing media files, combining document chunks, indexing documents,
 * and retrieving relevant sections based on user queries.
 */

import { Index, IndexList, Pinecone, PineconeRecord, QueryResponse, RecordMetadata } from '@pinecone-database/pinecone';
import dotenv from 'dotenv';
import path from 'path';
import { v4 as uuidv4 } from 'uuid';
import { Doc } from '../../../../../fields/Doc';
import { AudioCast, CsvCast, PDFCast, StrCast, VideoCast } from '../../../../../fields/Types';
import { Networking } from '../../../../Network';
import { AI_Document, CHUNK_TYPE, RAGChunk } from '../types/types';
import OpenAI from 'openai';
import { Embedding } from 'openai/resources';
import { AgentDocumentManager } from '../utils/AgentDocumentManager';
import { Id } from '../../../../../fields/FieldSymbols';

dotenv.config();

/**
 * The Vectorstore class integrates with Pinecone for vector-based document indexing and retrieval,
 * and OpenAI text-embedding-3-large for text embedding. It handles AI document management, uploads, and query-based retrieval.
 */
export class Vectorstore {
    private pinecone!: Pinecone; // Pinecone client for managing the vector index.
    private index!: Index; // The specific Pinecone index used for document chunks.
    private summaryIndex!: Index; // The Pinecone index used for file summaries.
    private openai!: OpenAI; // OpenAI client for generating embeddings.
    private indexName: string = 'pdf-chatbot'; // Default name for the document-chunk index.
    private summaryIndexName: string = 'file-summaries'; // Name for the summaries index.
    private _id!: string; // Unique ID for the Vectorstore instance.
    private docManager!: AgentDocumentManager; // Document manager for handling documents.
    private summaryCacheCount: number = 0; // Number of file summaries fetched from the server (set in processFileSummaries).
    documents: AI_Document[] = []; // Store the documents indexed in the vectorstore.
    private debug: boolean = true; // Enables the verbose [DEBUG] console logging used throughout this class.
    private initialized: boolean = false; // True once both Pinecone indexes are set up; searchFileSummaries refuses to run before then.

    /**
     * Initializes the Pinecone and OpenAI clients, sets up the document ID list,
     * and kicks off asynchronous index initialization.
     *
     * If the Pinecone API key is missing from the environment, the instance is left
     * uninitialized (searchFileSummaries will refuse to run) rather than throwing.
     * @param id The unique identifier for the vectorstore instance.
     * @param docManager An instance of AgentDocumentManager to handle document management.
     */
    constructor(id: string, docManager: AgentDocumentManager) {
        if (this.debug) console.log(`[DEBUG] Initializing Vectorstore with ID: ${id}`);
        // SECURITY: the API key must come from the environment (loaded via dotenv above),
        // never be hard-coded in source — a committed key is a leaked key.
        const pineconeApiKey = process.env.PINECONE_API_KEY;
        if (!pineconeApiKey) {
            console.log('PINECONE_API_KEY is not defined - Vectorstore will be unavailable');
            return;
        }

        // Initialize Pinecone and OpenAI clients with API keys from the environment.
        this.pinecone = new Pinecone({ apiKey: pineconeApiKey });
        this.openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY, dangerouslyAllowBrowser: true });
        this._id = id;
        this.docManager = docManager;

        // Fire-and-forget: initializeAsync catches and logs its own errors.
        void this.initializeAsync(id);
    }

    /**
     * Runs the full asynchronous setup sequence in order: the main document index,
     * then the file-summary index, then a single validation query. Any failure is
     * logged rather than rethrown, leaving `initialized` false.
     * @param id The vectorstore instance ID (used only for log context).
     */
    private async initializeAsync(id: string) {
        try {
            if (this.debug) console.log(`[DEBUG] Starting async initialization sequence for Vectorstore ID: ${id}`);
            await this.initializeIndex();
            await this.initializeSummaryIndex();

            this.initialized = true;
            if (this.debug) console.log(`[DEBUG] ✅ Vectorstore initialization complete, running test query...`);
            await this.runSingleTestQuery();
        } catch (err) {
            console.error('[ERROR] Failed to initialize Vectorstore:', err);
        }
    }

    /**
     * Retrieves the list of file paths known to the server.
     * @returns The parsed JSON payload of the /getFileNames endpoint.
     */
    async getFileNames() {
        const raw = await Networking.FetchFromServer('/getFileNames');
        return JSON.parse(raw);
    }

    /**
     * Ensures the main document index exists in Pinecone, creating it (cosine metric,
     * 3072-dimension vectors, serverless on aws/us-east-1) when absent, then binds
     * the handle to `this.index` for later queries and upserts.
     */
    private async initializeIndex() {
        if (this.debug) console.log(`[DEBUG] Initializing main document index: ${this.indexName}`);
        const indexList: IndexList = await this.pinecone.listIndexes();
        if (this.debug) console.log(`[DEBUG] Available Pinecone indexes: ${indexList.indexes?.map(i => i.name).join(', ') || 'none'}`);

        const alreadyExists = indexList.indexes?.some(idx => idx.name === this.indexName) ?? false;
        if (alreadyExists) {
            if (this.debug) console.log(`[DEBUG] ✅ Using existing index: ${this.indexName}`);
        } else {
            if (this.debug) console.log(`[DEBUG] Creating new index: ${this.indexName}`);
            await this.pinecone.createIndex({
                name: this.indexName,
                dimension: 3072,
                metric: 'cosine',
                spec: { serverless: { cloud: 'aws', region: 'us-east-1' } },
            });
            if (this.debug) console.log(`[DEBUG] ✅ Index ${this.indexName} created successfully`);
        }

        // Bind the index handle for future use.
        this.index = this.pinecone.Index(this.indexName);
    }

    /**
     * Ensures the file-summaries index exists in Pinecone, creating it (cosine metric,
     * 3072-dimension vectors, serverless on aws/us-east-1) when absent, binds it to
     * `this.summaryIndex`, and then triggers summary processing/indexing.
     */
    private async initializeSummaryIndex() {
        if (this.debug) console.log(`[DEBUG] Initializing file summaries index: ${this.summaryIndexName}`);
        const indexList: IndexList = await this.pinecone.listIndexes();

        const alreadyExists = indexList.indexes?.some(idx => idx.name === this.summaryIndexName) ?? false;
        if (alreadyExists) {
            if (this.debug) console.log(`[DEBUG] ✅ Using existing summary index: ${this.summaryIndexName}`);
        } else {
            if (this.debug) console.log(`[DEBUG] Creating new summary index: ${this.summaryIndexName}`);
            await this.pinecone.createIndex({
                name: this.summaryIndexName,
                dimension: 3072,
                metric: 'cosine',
                spec: { serverless: { cloud: 'aws', region: 'us-east-1' } },
            });
            if (this.debug) console.log(`[DEBUG] ✅ Summary index ${this.summaryIndexName} created successfully`);
        }

        // Bind the summaries index handle for future use.
        this.summaryIndex = this.pinecone.Index(this.summaryIndexName);

        // Embed/index the file summaries if the index is out of date.
        await this.processFileSummaries();
    }

    /**
     * Processes file summaries fetched from the server.
     * Compares the number of records already in the summary index with the number of
     * summaries returned; embedding is skipped when they match, otherwise the full
     * set is (re-)embedded and indexed.
     */
    private async processFileSummaries() {
        if (this.debug) console.log(`[DEBUG] Starting file summaries processing`);
        try {
            // Get file summaries from the server
            if (this.debug) console.log(`[DEBUG] Fetching file summaries from server...`);
            const response = await Networking.FetchFromServer('/getFileSummaries');

            if (!response) {
                console.error('[ERROR] Failed to fetch file summaries');
                return;
            }
            if (this.debug) console.log(`[DEBUG] File summaries response received (${response.length} bytes)`);

            const summaries = JSON.parse(response);
            const filepaths = Object.keys(summaries);
            const summaryCount = filepaths.length;
            this.summaryCacheCount = summaryCount;

            // Guard the empty case: with no filepaths, the sample-summary debug log below
            // would dereference summaries[undefined] and throw, and there is nothing to embed.
            if (summaryCount === 0) {
                if (this.debug) console.log(`[DEBUG] No file summaries returned; skipping indexing.`);
                return;
            }

            if (this.debug) {
                console.log(`[DEBUG] File summaries parsed: ${summaryCount} files`);
                console.log(`[DEBUG] Sample filepaths: ${filepaths.slice(0, 3).join(', ')}...`);
                console.log(`[DEBUG] Sample summary: "${summaries[filepaths[0]].substring(0, 100)}..."`);
            }

            // Check if index already has the correct number of summaries
            try {
                if (this.debug) console.log(`[DEBUG] Checking summary index stats...`);
                const indexStats = await this.summaryIndex.describeIndexStats();
                const vectorCount = indexStats.totalRecordCount;

                if (this.debug) console.log(`[DEBUG] Summary index has ${vectorCount} records, expecting ${summaryCount}`);

                if (vectorCount === summaryCount) {
                    console.log(`[DEBUG] ✅ Summary index already contains ${vectorCount} entries, skipping embedding.`);
                    return;
                }

                if (this.debug) console.log(`[DEBUG] ⚠️ Summary index contains ${vectorCount} entries, but there are ${summaryCount} summaries. Re-indexing.`);
            } catch (error) {
                // A stats failure is non-fatal: fall through and attempt re-indexing anyway.
                console.error('[ERROR] Error checking summary index stats:', error);
            }

            // If we get here, we need to embed the summaries
            await this.embedAndIndexFileSummaries(summaries);
        } catch (error) {
            console.error('[ERROR] Error processing file summaries:', error);
        }
    }

    /**
     * Embeds and indexes file summaries into the summary index.
     * Summaries are embedded with OpenAI text-embedding-3-large in batches of 100
     * and upserted to Pinecone. If an upsert fails for a batch larger than 20, the
     * batch is split in half and each half is re-embedded and retried once.
     * NOTE(review): a failed batch of 20 or fewer records is logged but never
     * retried — confirm that dropping those summaries is intended.
     * @param summaries Object mapping filepaths to summaries
     */
    private async embedAndIndexFileSummaries(summaries: Record<string, string>) {
        if (this.debug) console.log(`[DEBUG] Starting embedding and indexing of file summaries...`);

        // Object.keys/Object.values iterate in the same order, so these stay aligned.
        const filepaths = Object.keys(summaries);
        const summaryTexts = Object.values(summaries);

        // Split into batches of 100 to avoid exceeding API limits
        const batchSize = 100;
        const totalBatches = Math.ceil(filepaths.length / batchSize);

        if (this.debug) console.log(`[DEBUG] Processing ${filepaths.length} files in ${totalBatches} batches of size ${batchSize}`);

        for (let i = 0; i < filepaths.length; i += batchSize) {
            const batchFilepaths = filepaths.slice(i, i + batchSize);
            const batchTexts = summaryTexts.slice(i, i + batchSize);

            if (this.debug) {
                console.log(`[DEBUG] Processing batch ${Math.floor(i / batchSize) + 1}/${totalBatches}`);
                console.log(`[DEBUG] First file in batch: ${batchFilepaths[0]}`);
                console.log(`[DEBUG] First summary in batch: "${batchTexts[0].substring(0, 50)}..."`);
            }

            try {
                // Generate embeddings for this batch
                if (this.debug) console.log(`[DEBUG] Generating embeddings for batch of ${batchTexts.length} summaries...`);
                const startTime = Date.now();
                const embeddingResponse = await this.openai.embeddings.create({
                    model: 'text-embedding-3-large',
                    input: batchTexts,
                    encoding_format: 'float',
                });
                const duration = Date.now() - startTime;
                if (this.debug) console.log(`[DEBUG] ✅ Embeddings generated in ${duration}ms`);

                // Prepare Pinecone records: one record per summary, keyed by a fresh UUID,
                // carrying the filepath and the summary text as metadata.
                if (this.debug) console.log(`[DEBUG] Preparing Pinecone records...`);
                const pineconeRecords: PineconeRecord[] = batchTexts.map((text, index) => {
                    const embedding = (embeddingResponse.data as Embedding[])[index].embedding;
                    if (this.debug && index === 0) console.log(`[DEBUG] Sample embedding dimensions: ${embedding.length}, first few values: [${embedding.slice(0, 5).join(', ')}...]`);

                    return {
                        id: uuidv4(), // Generate a unique ID for each summary
                        values: embedding,
                        metadata: {
                            filepath: batchFilepaths[index],
                            summary: text,
                        } as RecordMetadata,
                    };
                });

                // Upload to Pinecone
                if (this.debug) console.log(`[DEBUG] Upserting ${pineconeRecords.length} records to Pinecone...`);
                const upsertStart = Date.now();
                try {
                    await this.summaryIndex.upsert(pineconeRecords);
                    const upsertDuration = Date.now() - upsertStart;
                    if (this.debug) console.log(`[DEBUG] ✅ Batch ${Math.floor(i / batchSize) + 1}/${totalBatches} indexed in ${upsertDuration}ms`);
                } catch (upsertError) {
                    console.error(`[ERROR] Failed to upsert batch ${Math.floor(i / batchSize) + 1}/${totalBatches} to Pinecone:`, upsertError);
                    // Try again with smaller batch
                    if (batchTexts.length > 20) {
                        console.log(`[DEBUG] 🔄 Retrying with smaller batch size...`);
                        // Split the batch in half and retry recursively
                        const midpoint = Math.floor(batchTexts.length / 2);
                        const firstHalf = {
                            filepaths: batchFilepaths.slice(0, midpoint),
                            texts: batchTexts.slice(0, midpoint),
                        };
                        const secondHalf = {
                            filepaths: batchFilepaths.slice(midpoint),
                            texts: batchTexts.slice(midpoint),
                        };

                        // Create a helper function to retry smaller batches.
                        // NOTE(review): the retry re-embeds the texts even though only the
                        // upsert failed — an extra embedding API call per retried half.
                        const retryBatch = async (paths: string[], texts: string[], batchNum: string) => {
                            try {
                                if (this.debug) console.log(`[DEBUG] Generating embeddings for sub-batch ${batchNum}...`);
                                const embRes = await this.openai.embeddings.create({
                                    model: 'text-embedding-3-large',
                                    input: texts,
                                    encoding_format: 'float',
                                });

                                const records = texts.map((t, idx) => ({
                                    id: uuidv4(),
                                    values: (embRes.data as Embedding[])[idx].embedding,
                                    metadata: {
                                        filepath: paths[idx],
                                        summary: t,
                                    } as RecordMetadata,
                                }));

                                if (this.debug) console.log(`[DEBUG] Upserting sub-batch ${batchNum} (${records.length} records)...`);
                                await this.summaryIndex.upsert(records);
                                if (this.debug) console.log(`[DEBUG] ✅ Sub-batch ${batchNum} upserted successfully`);
                            } catch (retryError) {
                                // A sub-batch that fails again is dropped (logged only).
                                console.error(`[ERROR] Failed to upsert sub-batch ${batchNum}:`, retryError);
                            }
                        };

                        await retryBatch(firstHalf.filepaths, firstHalf.texts, `${Math.floor(i / batchSize) + 1}.1`);
                        await retryBatch(secondHalf.filepaths, secondHalf.texts, `${Math.floor(i / batchSize) + 1}.2`);
                    }
                }
            } catch (error) {
                // Embedding failure for a batch: logged and skipped; later batches still run.
                console.error('[ERROR] Error processing batch:', error);
            }
        }

        if (this.debug) console.log(`[DEBUG] ✅ File summary indexing complete for all ${filepaths.length} files`);

        // Verify the index was populated correctly (informational only — mismatches are not corrected here).
        try {
            const indexStats = await this.summaryIndex.describeIndexStats();
            const vectorCount = indexStats.totalRecordCount;
            if (this.debug) console.log(`[DEBUG] 🔍 Final index verification: ${vectorCount} records in Pinecone index (expected ${filepaths.length})`);
        } catch (error) {
            console.error('[ERROR] Failed to verify index stats:', error);
        }
    }

    /**
     * Searches for file summaries similar to the given query.
     * Embeds the query with text-embedding-3-large and runs a vector similarity
     * query against the summaries index. All failures are logged and reported as
     * an empty result array rather than thrown.
     * @param query The search query
     * @param topK Number of results to return (default: 5)
     * @returns Array of filepath and summary pairs with relevance scores
     */
    async searchFileSummaries(query: string, topK: number = 5): Promise<Array<{ filepath: string; summary: string; score?: number }>> {
        // Refuse to search until initializeAsync has completed both index setups.
        if (!this.initialized) {
            console.error('[ERROR] Cannot search - Vectorstore not fully initialized');
            return [];
        }

        if (this.debug) console.log(`[DEBUG] Searching file summaries for query: "${query}" (topK=${topK})`);
        try {
            // Generate embedding for the query
            if (this.debug) console.log(`[DEBUG] Generating embedding for query...`);
            const startTime = Date.now();
            const queryEmbeddingResponse = await this.openai.embeddings.create({
                model: 'text-embedding-3-large',
                input: query,
                encoding_format: 'float',
            });
            const duration = Date.now() - startTime;

            const queryEmbedding = queryEmbeddingResponse.data[0].embedding;
            if (this.debug) {
                console.log(`[DEBUG] ✅ Query embedding generated in ${duration}ms`);
                console.log(`[DEBUG] Query embedding dimensions: ${queryEmbedding.length}`);
            }

            // Check if summary index is ready; an empty index short-circuits the search,
            // but a stats *failure* is non-fatal and the query is still attempted.
            try {
                const indexStats = await this.summaryIndex.describeIndexStats();
                const vectorCount = indexStats.totalRecordCount;
                if (this.debug) console.log(`[DEBUG] Summary index contains ${vectorCount} records`);

                if (vectorCount === 0) {
                    console.error('[ERROR] Summary index is empty, cannot perform search');
                    return [];
                }
            } catch (statsError) {
                console.error('[ERROR] Failed to check summary index stats:', statsError);
                console.error('[ERROR] Stats error details:', JSON.stringify(statsError));
            }

            // Test direct API access to Pinecone.
            // NOTE(review): this connectivity probe (and its console.log) runs even when
            // this.debug is false — only the "Testing" message above is gated.
            if (this.debug) console.log(`[DEBUG] Testing Pinecone connection...`);
            try {
                const indexes = await this.pinecone.listIndexes();
                console.log(`[DEBUG] Available Pinecone indexes: ${indexes.indexes?.map(idx => idx.name).join(', ')}`);
            } catch (connectionError) {
                console.error('[ERROR] Could not connect to Pinecone:', connectionError);
            }

            // Query the summaries index
            if (this.debug) console.log(`[DEBUG] Querying Pinecone summary index (${this.summaryIndexName})...`);
            const queryStart = Date.now();

            let queryResponse;
            try {
                // First, make sure we can access the index
                const indexInfo = await this.summaryIndex.describeIndexStats();
                if (this.debug) console.log(`[DEBUG] Index stats:`, indexInfo);

                queryResponse = await this.summaryIndex.query({
                    vector: queryEmbedding,
                    topK,
                    includeMetadata: true,
                });

                const queryDuration = Date.now() - queryStart;

                if (this.debug) {
                    console.log(`[DEBUG] ✅ Pinecone query completed in ${queryDuration}ms`);
                    console.log(`[DEBUG] Raw Pinecone response:`, JSON.stringify(queryResponse, null, 2));
                    if (queryResponse.matches) {
                        console.log(`[DEBUG] Found ${queryResponse.matches.length} matching summaries`);
                        console.log(`[DEBUG] Match scores: ${queryResponse.matches.map(m => m.score?.toFixed(4)).join(', ')}`);
                    } else {
                        console.log(`[DEBUG] No matches in response`);
                    }
                }
            } catch (queryError) {
                console.error('[ERROR] Pinecone query failed:', queryError);
                if (typeof queryError === 'object' && queryError !== null) {
                    console.error('[ERROR] Query error details:', JSON.stringify(queryError, null, 2));
                }
                return [];
            }

            if (!queryResponse || !queryResponse.matches || queryResponse.matches.length === 0) {
                console.log('[DEBUG] ⚠️ No matches found in Pinecone for query');
                return [];
            }

            // Format results: pull filepath/summary out of the record metadata,
            // substituting placeholders when metadata is missing.
            const results = queryResponse.matches.map(match => {
                if (!match.metadata) {
                    console.error('[ERROR] Match is missing metadata:', match);
                    return { filepath: 'unknown', summary: 'No summary available' };
                }

                return {
                    filepath: (match.metadata as { filepath: string }).filepath || 'unknown',
                    summary: (match.metadata as { summary: string }).summary || 'No summary available',
                    score: match.score,
                };
            });

            if (this.debug) {
                if (results.length > 0) {
                    console.log(`[DEBUG] Top result filepath: ${results[0]?.filepath}`);
                    console.log(`[DEBUG] Top result score: ${results[0]?.score}`);
                    console.log(`[DEBUG] Top result summary excerpt: "${results[0]?.summary?.substring(0, 100)}..."`);
                } else {
                    console.log(`[DEBUG] No results returned after processing`);
                }
            }

            return results;
        } catch (error) {
            console.error('[ERROR] Error searching file summaries:', error);
            if (typeof error === 'object' && error !== null) {
                console.error('[ERROR] Full error details:', JSON.stringify(error, null, 2));
            }
            return [];
        }
    }

    /**
     * Runs a single test query after setup to validate the file summary search functionality.
     * Checks index stats, waits briefly for Pinecone to settle, runs one fixed query,
     * and fetches the full content of the top result. Purely diagnostic: all output
     * goes to the console and nothing is returned.
     */
    private async runSingleTestQuery() {
        console.log(`\n[TEST] Running single test query to validate file summary search functionality...`);

        // Verify the index is accessible; abort the test if stats can't be read.
        try {
            const indexStats = await this.summaryIndex.describeIndexStats();
            console.log(`[TEST] Pinecone index stats:`, JSON.stringify(indexStats, null, 2));
            console.log(`[TEST] Summary index contains ${indexStats.totalRecordCount} indexed summaries`);
        } catch (error) {
            console.error('[TEST] ❌ Failed to access Pinecone index:', error);
            return;
        }

        // Add a brief delay to ensure Pinecone has finished processing
        console.log('[TEST] Waiting 2 seconds for Pinecone indexing to complete...');
        await new Promise(resolve => setTimeout(resolve, 2000));

        // Run a single test query with a fixed, representative prompt.
        const query = 'React components for the UI';
        console.log(`\n[TEST] Executing query: "${query}"`);

        try {
            const results = await this.searchFileSummaries(query);
            console.log(`[TEST] Search returned ${results.length} results:`);

            results.forEach((result, i) => {
                console.log(`\n[TEST] Result ${i + 1}:`);
                console.log(`[TEST] File: ${result.filepath}`);
                console.log(`[TEST] Score: ${result.score}`);
                console.log(`[TEST] Summary: "${result.summary?.substring(0, 150)}..."`);
            });

            // If we have results, fetch the content for the first one
            if (results.length > 0) {
                const topFilepath = results[0].filepath;
                console.log(`\n[TEST] Fetching full content for top result: ${topFilepath}`);
                const content = await this.getFileContent(topFilepath);

                if (content) {
                    console.log(`[TEST] ✅ Content retrieved successfully (${content.length} chars)`);
                    console.log(`[TEST] Content excerpt:\n---\n${content.substring(0, 300)}...\n---`);
                } else {
                    console.log(`[TEST] ❌ Failed to retrieve content for ${topFilepath}`);
                }
            } else {
                console.log(`\n[TEST] ⚠️ No results to fetch content for`);
            }

            console.log(`\n[TEST] ✅ Test query completed`);
        } catch (testError) {
            console.error(`[TEST] ❌ Test query failed:`, testError);
            if (typeof testError === 'object' && testError !== null) {
                console.error('[TEST] Full error details:', JSON.stringify(testError, null, 2));
            }
        }
    }

    /**
     * Gets the full content of a file by its filepath.
     * POSTs the path to /getRawFileContent and reads the body as raw text
     * (no JSON parsing, since the endpoint returns the file verbatim).
     * @param filepath The filepath to look up
     * @returns The file content or null if not found
     */
    async getFileContent(filepath: string): Promise<string | null> {
        if (this.debug) console.log(`[DEBUG] Getting file content for: ${filepath}`);
        try {
            const t0 = Date.now();

            const res = await fetch('/getRawFileContent', {
                method: 'POST',
                headers: { 'Content-Type': 'application/json' },
                body: JSON.stringify({ filepath }),
            });

            // Non-2xx: surface the server's error text and report failure.
            if (!res.ok) {
                const errorText = await res.text();
                console.error(`[ERROR] Server returned error ${res.status}: ${errorText}`);
                return null;
            }

            const text = await res.text();
            const elapsed = Date.now() - t0;

            if (this.debug) {
                console.log(`[DEBUG] ✅ File content retrieved in ${elapsed}ms`);
                console.log(`[DEBUG] Content length: ${text.length} chars`);
                console.log(`[DEBUG] Content excerpt: "${text.substring(0, 100)}..."`);
            }

            return text;
        } catch (error) {
            console.error('[ERROR] Error getting file content:', error);
            if (typeof error === 'object' && error !== null) {
                console.error('[ERROR] Full error details:', JSON.stringify(error, null, 2));
            }
            return null;
        }
    }

    /**
     * Adds an AI document to the vectorstore. Handles media file processing for audio/video,
     * and text embedding for all document types. Updates document metadata during processing.
     * Documents already in PROGRESS or COMPLETED are skipped; a document whose previous
     * attempt ended in another state (e.g. ERROR) is retried instead of being stuck forever.
     * @param doc The document to add.
     * @param progressCallback Callback to track the progress of the addition process.
     */
    async addAIDoc(doc: Doc, progressCallback: (progress: number, step: string) => void) {
        const ai_document_status: string = StrCast(doc.ai_document_status);

        // Skip if the document is already in progress or completed.
        // StrCast always yields a string, so "never processed" shows up as '' (or the legacy '{}').
        if (ai_document_status.trim() !== '' && ai_document_status !== '{}') {
            if (ai_document_status === 'PROGRESS') {
                console.log('Already in progress.');
                return;
            }
            if (ai_document_status === 'COMPLETED') {
                console.log('Already completed.');
                return;
            }
            // Any other non-empty status (typically 'ERROR') falls through so a failed
            // document can be reprocessed rather than silently ignored forever.
            console.log(`Retrying document with previous status: ${ai_document_status}`);
        }

        // Start processing the document.
        doc.ai_document_status = 'PROGRESS';
        const local_file_path = CsvCast(doc.data)?.url?.pathname ?? PDFCast(doc.data)?.url?.pathname ?? VideoCast(doc.data)?.url?.pathname ?? AudioCast(doc.data)?.url?.pathname;

        if (!local_file_path) {
            // Reset the status — otherwise an invalid path would leave the doc stuck in PROGRESS.
            doc.ai_document_status = 'ERROR';
            console.log('Not adding to vectorstore. Invalid file path for vectorstore addition.');
            return;
        }

        const isAudioOrVideo = local_file_path.endsWith('.mp3') || local_file_path.endsWith('.mp4');
        let result: AI_Document & { doc_id: string };

        if (isAudioOrVideo) {
            // --- Media path: transcribe on the server, then embed the transcript locally. ---
            console.log('Processing media file...');
            progressCallback(10, 'Preparing media file for transcription...');

            // Post to processMediaFile endpoint to get the transcript.
            const response = await Networking.PostToServer('/processMediaFile', { fileName: path.basename(local_file_path) });
            progressCallback(60, 'Transcription completed. Processing transcript...');

            // Type assertion to handle the response properties.
            const typedResponse = response as {
                condensed: Array<{ text: string; indexes: string[]; start: number; end: number }>;
                full: Array<unknown>;
                summary: string;
            };

            const segmentedTranscript = typedResponse.condensed;
            console.log(segmentedTranscript);
            const summary = typedResponse.summary;
            doc.summary = summary;

            // Generate embeddings for each transcript chunk in a single batched call.
            const texts = segmentedTranscript.map(chunk => chunk.text);

            try {
                const embeddingsResponse = await this.openai.embeddings.create({
                    model: 'text-embedding-3-large',
                    input: texts,
                    encoding_format: 'float',
                });
                progressCallback(85, 'Embeddings generated. Finalizing document...');

                doc.original_segments = JSON.stringify(typedResponse.full);
                const doc_id = doc[Id];
                console.log('doc_id in vectorstore', doc_id);

                // Generate chunk IDs upfront so we can register them.
                const chunkIds = segmentedTranscript.map(() => uuidv4());
                // Add transcript and embeddings to metadata.
                result = {
                    doc_id,
                    purpose: '',
                    file_name: local_file_path,
                    num_pages: 0,
                    summary: summary,
                    chunks: segmentedTranscript.map((chunk, index) => ({
                        id: chunkIds[index], // Use pre-generated chunk ID
                        values: (embeddingsResponse.data as Embedding[])[index].embedding, // Assign embedding
                        metadata: {
                            indexes: chunk.indexes,
                            original_document: local_file_path,
                            doc_id: doc_id, // Ensure doc_id is consistent
                            file_path: local_file_path,
                            start_time: chunk.start,
                            end_time: chunk.end,
                            text: chunk.text,
                            type: local_file_path.endsWith('.mp3') ? CHUNK_TYPE.AUDIO : CHUNK_TYPE.VIDEO,
                        },
                    })),
                    type: 'media',
                };
                progressCallback(95, 'Adding document to vectorstore...');
            } catch (error) {
                console.error('Error generating embeddings:', error);
                doc.ai_document_status = 'ERROR';
                throw new Error('Embedding generation failed');
            }

            doc.segmented_transcript = JSON.stringify(segmentedTranscript);
            // Use doc manager to add simplified chunks.
            const docType = local_file_path.endsWith('.mp3') ? 'audio' : 'video';
            const simplifiedChunks = this.docManager.getSimplifiedChunks(result.chunks, docType);
            doc.chunk_simplified = JSON.stringify(simplifiedChunks);
            this.docManager.addSimplifiedChunks(simplifiedChunks);
        } else {
            // --- Regular-document path: the server extracts, chunks, and embeds; we poll for the result. ---
            console.log('Processing regular document...');
            const createDocumentResponse = await Networking.PostToServer('/createDocument', { file_path: local_file_path, doc_id: doc[Id] });

            // Type assertion for the response.
            const { jobId } = createDocumentResponse as { jobId: string };

            // Poll every 2s until the job completes; surface a failed job instead of spinning forever.
            while (true) {
                await new Promise(resolve => setTimeout(resolve, 2000));
                const resultResponse = await Networking.FetchFromServer(`/getResult/${jobId}`);
                const resultResponseJson = JSON.parse(resultResponse);
                if (resultResponseJson.status === 'completed') {
                    result = resultResponseJson;
                    break;
                }
                if (resultResponseJson.status === 'failed') {
                    doc.ai_document_status = 'ERROR';
                    throw new Error(`Document processing job ${jobId} failed`);
                }
                const progressResponse = await Networking.FetchFromServer(`/getProgress/${jobId}`);
                const progressResponseJson = JSON.parse(progressResponse);
                if (progressResponseJson) {
                    progressCallback(progressResponseJson.progress, progressResponseJson.step);
                }
            }

            if (result.doc_id !== doc[Id]) {
                console.log('doc_id in vectorstore', result.doc_id, 'does not match doc_id in doc', doc[Id]);
            }

            // Use doc manager to add simplified chunks - determine document type from file extension.
            const fileExt = path.extname(local_file_path).toLowerCase();
            const docType = fileExt === '.pdf' ? 'pdf' : fileExt === '.csv' ? 'csv' : 'text';
            const simplifiedChunks = this.docManager.getSimplifiedChunks(result.chunks, docType);
            doc.chunk_simplified = JSON.stringify(simplifiedChunks);
            this.docManager.addSimplifiedChunks(simplifiedChunks);

            doc.summary = result.summary;
            doc.ai_purpose = result.purpose;
        }

        // Index the document.
        await this.indexDocument(result);
        progressCallback(100, 'Document added successfully!');

        // Record which vectorstore(s) hold this document, without duplicating this store's id.
        const vectorstoreIds: string[] = doc.vectorstore_id ? JSON.parse(StrCast(doc.vectorstore_id)) : [];
        if (!vectorstoreIds.includes(this._id)) {
            vectorstoreIds.push(this._id);
        }
        doc.vectorstore_id = JSON.stringify(vectorstoreIds);

        doc.ai_doc_id = result.doc_id;

        console.log(`Document added: ${result.file_name}`);
        doc.ai_document_status = 'COMPLETED';
    }

    /**
     * Uploads the document's vector chunks to the Pinecone index.
     * Each chunk is turned into one Pinecone record (id + embedding + copied metadata)
     * and the whole batch is sent with a single upsert call.
     * @param document The processed document containing its chunks and metadata.
     */
    private async indexDocument(document: AI_Document) {
        console.log('Uploading vectors to content namespace...');

        // Build one Pinecone record per chunk; metadata is shallow-copied so the
        // record does not alias the chunk's own metadata object.
        const records: PineconeRecord[] = [];
        for (const chunk of document.chunks as RAGChunk[]) {
            records.push({
                id: chunk.id,
                values: chunk.values,
                metadata: { ...chunk.metadata } as RecordMetadata,
            });
        }

        // Upload the records to Pinecone.
        await this.index.upsert(records);
    }

    /**
     * Merges consecutive chunks while their combined text stays under 500 words,
     * producing fewer, larger chunks to optimize retrieval and indexing.
     * A chunk that would push the running total to 500+ words starts a new group.
     * @param chunks The original chunks to combine.
     * @returns Combined chunks with updated text and copied metadata.
     */
    private combineChunks(chunks: RAGChunk[]): RAGChunk[] {
        const merged: RAGChunk[] = [];
        let pending: RAGChunk | null = null;
        let pendingWords = 0;

        for (const chunk of chunks) {
            const words = chunk.metadata.text.split(' ').length;

            if (pending && pendingWords + words < 500) {
                // Still under the cap: append this chunk's text to the group in progress.
                pending.metadata.text += ` ${chunk.metadata.text}`;
                pendingWords += words;
            } else {
                // Either nothing accumulated yet, or adding would reach 500 words:
                // flush the current group and start a fresh one from this chunk.
                if (pending) {
                    merged.push(pending);
                }
                pending = { ...chunk, metadata: { ...chunk.metadata } };
                pendingWords = words;
            }
        }

        // Flush the final group, if any.
        if (pending) {
            merged.push(pending);
        }

        return merged;
    }

    /**
     * Retrieves the most relevant document chunks for a given query.
     * The query is embedded with OpenAI, then Pinecone is queried by vector
     * similarity, filtered to a set of document IDs. On any failure an empty
     * list is returned (best-effort retrieval).
     * @param query The search query string.
     * @param topK The number of top results to return (default is 15).
     * @param docIds Optional document IDs to restrict the search; when omitted or
     *               empty, all IDs registered in the document manager are used.
     * @returns A list of document chunks that match the query.
     */
    async retrieve(query: string, topK: number = 15, docIds?: string[]): Promise<RAGChunk[]> {
        console.log(`Retrieving chunks for query: ${query}`);
        try {
            // Embed the query text using OpenAI.
            const embeddingResponse = await this.openai.embeddings.create({
                model: 'text-embedding-3-large',
                input: query,
                encoding_format: 'float',
            });
            const queryVector = embeddingResponse.data[0].embedding;

            // Fall back to every registered document when no explicit filter was supplied.
            const filterIds = !docIds || docIds.length === 0 ? this.docManager.docIds : docIds;

            console.log('Using document IDs for retrieval:', filterIds);

            // Query the Pinecone index with the embedding, filtered by document IDs.
            const queryResponse: QueryResponse = await this.index.query({
                vector: queryVector,
                filter: {
                    doc_id: { $in: filterIds },
                },
                topK,
                includeValues: true,
                includeMetadata: true,
            });
            console.log(`Found ${queryResponse.matches.length} matching chunks`);

            // Shape each Pinecone match into a RAGChunk for downstream consumers.
            return queryResponse.matches.map(
                match =>
                    ({
                        id: match.id,
                        values: match.values as number[],
                        metadata: match.metadata as {
                            text: string;
                            type: string;
                            original_document: string;
                            file_path: string;
                            doc_id: string;
                            location: string;
                            start_page: number;
                            end_page: number;
                        },
                    }) as RAGChunk
            );
        } catch (error) {
            console.error(`Error retrieving chunks: ${error}`);
            return [];
        }
    }
}