import { Networking } from '../../../../Network';
import { BaseTool } from './BaseTool';
import { Observation } from '../types/types';
import { ParametersType, ToolInfo } from '../types/tool_types';
import { AgentDocumentManager } from '../utils/AgentDocumentManager';
import { Doc } from '../../../../../fields/Doc';
import { WebCast } from '../../../../../fields/Types';

const websiteInfoScraperToolParams = [
    {
        name: 'chunk_ids',
        type: 'string[]',
        description: 'The chunk_ids of the urls to scrape from the SearchTool.',
        required: true,
        max_inputs: 3,
    },
] as const;

type WebsiteInfoScraperToolParamsType = typeof websiteInfoScraperToolParams;

const websiteInfoScraperToolInfo: ToolInfo<WebsiteInfoScraperToolParamsType> = {
    name: 'websiteInfoScraper',
    description: 'Scrape detailed information from specific websites relevant to the user query. Returns the text content of the webpages for further analysis and grounding.',
    citationRules: `
    !IMPORTANT! THESE CHUNKS REPLACE THE CHUNKS THAT ARE RETURNED FROM THE SEARCHTOOL.
    Your task is to provide a comprehensive response to the user's prompt using the content scraped from relevant websites. Ensure you follow these guidelines for structuring your response:
    1. Grounded Text Tag Structure:
       - Wrap all text derived from the scraped website(s) in <grounded_text> tags.
       - **Do not include non-sourced information** in <grounded_text> tags.
       - Use a single <grounded_text> tag for content derived from a single website. If citing multiple websites, create a new <grounded_text> tag for each.
       - Ensure each <grounded_text> tag has a citation index corresponding to the scraped URL.
    2. Citation Tag Structure:
       - Create a <citation> tag for each distinct piece of information used from the website(s).
       - Each <citation> tag must reference a URL chunk using the chunk_id attribute.
       - For URL-based citations, leave the citation content empty, but reference the chunk_id and set the type to 'url'.
    3. Structural Integrity Checks:
       - Ensure all opening and closing tags are matched properly.
       - Verify that all citation_index attributes in <grounded_text> tags correspond to valid citations.
       - Do not over-cite; cite only the most relevant parts of the websites.
    Example Usage:
    <answer>
        <grounded_text citation_index="1">
            Based on data from the World Bank, economic growth has stabilized in recent years, following a surge in investments.
        </grounded_text>
        <grounded_text citation_index="2">
            According to information retrieved from the International Monetary Fund, the inflation rate has been gradually decreasing since 2020.
        </grounded_text>
        <citations>
            <citation index="1" chunk_id="1234" type="url"></citation>
            <citation index="2" chunk_id="5678" type="url"></citation>
        </citations>
        <follow_up_questions>
            <question>What are the long-term economic impacts of increased investments on GDP?</question>
            <question>How might inflation trends affect future monetary policy?</question>
            <question>Are there additional factors that could influence economic growth beyond investments and inflation?</question>
        </follow_up_questions>
    </answer>
    ***NOTE***: Ensure that the response is structured correctly and adheres to the guidelines provided. Also, where possible, cite multiple websites to provide a comprehensive response.
    `,
    parameterRules: websiteInfoScraperToolParams,
};
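
/**
 * Tool that scrapes the plain-text content of web pages referenced by SearchTool result
 * chunks so the agent can ground its response in the retrieved material. Each scraped
 * page is returned as a <chunk> observation keyed by the chunk document's id.
 */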
export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParamsType> {
    private _docManager: AgentDocumentManager;
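
    /**
     * @param docManager Document manager used to resolve chunk_ids into the Docs that hold the URLs to scrape
     */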
    constructor(docManager: AgentDocumentManager) {
        super(websiteInfoScraperToolInfo);
        this._docManager = docManager;
    }

    /**
     * Attempts to scrape the website referenced by a chunk document, retrying on failure.
     * @param chunkDoc Document holding the URL to scrape (produced by the SearchTool)
     * @param maxRetries Maximum number of retry attempts
     * @returns The scraped content, or an error message if all attempts fail
     */
    private async scrapeWithRetry(chunkDoc: Doc, maxRetries = 2): Promise<Observation> {
        let lastError = '';
        let retryCount = 0;
        // The chunk document stores the target page as a web field; extract its URL and id.
        const url = WebCast(chunkDoc.data!)!.url.href;
        const id = chunkDoc.id;
        // Validate URL format
        try {
            new URL(url); // This will throw if the URL is invalid
        } catch {
            return {
                type: 'text',
                text: `Invalid URL format: ${url}. Please provide a valid URL including http:// or https://`,
            } as Observation;
        }
        while (retryCount <= maxRetries) {
            try {
                // Add a slight delay between retries
                if (retryCount > 0) {
                    console.log(`Retry attempt ${retryCount} for ${url}`);
                    await new Promise(resolve => setTimeout(resolve, retryCount * 2000)); // Increasing delay for each retry
                }
                const response = await Networking.PostToServer('/scrapeWebsite', { url });
                if (!response || typeof response !== 'object') {
                    lastError = 'Empty or invalid response from server';
                    retryCount++;
                    continue;
                }
                const { website_plain_text } = response as { website_plain_text: string };
                // Validate content quality
                if (!website_plain_text) {
                    lastError = 'Retrieved content was empty';
                    retryCount++;
                    continue;
                }
                if (website_plain_text.length < 100) {
                    console.warn(`Warning: Content from ${url} is very short (${website_plain_text.length} chars)`);
                    // Still return it if this is our last try
                    if (retryCount === maxRetries) {
                        return {
                            type: 'text',
                            text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\nNote: Limited content was retrieved from this URL.\n</chunk>`,
                        } as Observation;
                    }
                    lastError = 'Retrieved content was too short, trying again';
                    retryCount++;
                    continue;
                }
                // Process and return content if it looks good
                return {
                    type: 'text',
                    text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\n</chunk>`,
                } as Observation;
            } catch (error) {
                lastError = error instanceof Error ? error.message : 'Unknown error';
                console.log(`Error scraping ${url} (attempt ${retryCount + 1}):`, error);
            }
            retryCount++;
        }
        // All attempts failed
        return {
            type: 'text',
            text: `Unable to scrape website: ${url}. Error: ${lastError}`,
        } as Observation;
    }
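
    /**
     * Scrapes the websites referenced by the given chunk_ids in parallel and returns one
     * Observation per chunk. If every scrape fails, a summary note is appended to the results.
     * @param args Tool arguments containing the chunk_ids returned by the SearchTool
     * @returns The list of Observations produced by the scraping attempts
     */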
    async execute(args: ParametersType<WebsiteInfoScraperToolParamsType>): Promise<Observation[]> {
        const chunk_ids = args.chunk_ids;
        // Create an array of promises, each one handling a website scrape for a chunk's URL.
        // Guard against chunk_ids that do not resolve to a document rather than crashing on a non-null assertion.
        const scrapingPromises = chunk_ids.map(chunk_id => {
            const chunkDoc = this._docManager.getDocument(chunk_id);
            if (!chunkDoc) {
                return Promise.resolve({
                    type: 'text',
                    text: `No document was found for chunk id '${chunk_id}'. Please use ids returned by the SearchTool.`,
                } as Observation);
            }
            return this.scrapeWithRetry(chunkDoc);
        });
        // Wait for all scraping promises to resolve
        const results = await Promise.all(scrapingPromises);
        // Check if we got any successful results (successful scrapes are wrapped in a <chunk> tag)
        const successfulResults = results.filter(result => {
            if (result.type !== 'text') return false;
            const text = (result as { type: 'text'; text: string }).text;
            return text.includes('<chunk chunk_id=') && !text.includes('Unable to scrape');
        });
        // If all scrapes failed, provide a more helpful error message
        if (successfulResults.length === 0 && results.length > 0) {
            results.push({
                type: 'text',
                text: `Note: All website scraping attempts failed. Please try with different URLs or try again later.`,
            } as Observation);
        }
        return results;
    }
}
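
// Illustrative usage sketch (not part of the tool): assumes an AgentDocumentManager that has been
// populated by the SearchTool and chunk ids that resolve to web documents. Names not defined in
// this file (e.g. `docManager`) are placeholders.
//
//   const scraper = new WebsiteInfoScraperTool(docManager);
//   const observations = await scraper.execute({ chunk_ids: ['1234', '5678'] });
//   observations.forEach(obs => obs.type === 'text' && console.log(obs.text));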