aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorA.J. Shulman <Shulman.aj@gmail.com>2025-05-10 20:30:24 -0400
committerA.J. Shulman <Shulman.aj@gmail.com>2025-05-10 20:30:24 -0400
commit0db4583914e43e6efdba3e86a614a19956e73b5e (patch)
tree68dfef85ea47d6d79e63a6ac0914922dc69c99c5 /src
parent0a05616fb9f685dc8534db4949a6f7ad6b85eadb (diff)
feat: changed web document to display screenshot
Diffstat (limited to 'src')
-rw-r--r--src/client/views/nodes/WebBox.scss217
-rw-r--r--src/client/views/nodes/WebBox.tsx759
-rw-r--r--src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts105
-rw-r--r--src/server/ApiManagers/AssistantManager.ts229
4 files changed, 912 insertions, 398 deletions
diff --git a/src/client/views/nodes/WebBox.scss b/src/client/views/nodes/WebBox.scss
index 05d5babf9..a1991d1d0 100644
--- a/src/client/views/nodes/WebBox.scss
+++ b/src/client/views/nodes/WebBox.scss
@@ -1,13 +1,9 @@
@use '../global/globalCssVariables.module.scss' as global;
.webBox {
- height: 100%;
- width: 100%;
- top: 0;
- left: 0;
position: relative;
- display: flex;
overflow: hidden;
+ aspect-ratio: 1 / 1; // Explicitly enforce square aspect ratio
.webBox-sideResizer {
position: absolute;
@@ -20,6 +16,119 @@
.webBox-background {
width: 100%;
height: 100%;
+ position: absolute;
+ top: 0;
+ left: 0;
+ }
+
+ // Simple container for screenshot
+ .webBox-screenshot-container {
+ width: 100%;
+ }
+
+ .webBox-screenshot {
+ width: 100%;
+ height: auto; // Maintain aspect ratio
+ display: block;
+ pointer-events: none;
+ }
+
+ .webBox-loading {
+ padding: 20px;
+ text-align: center;
+ color: #666;
+ background-color: #f5f5f5;
+ min-height: 200px;
+ display: flex;
+ flex-direction: column;
+ align-items: center;
+ justify-content: center;
+ }
+
+ .webBox-loading-spinner {
+ margin-top: 15px;
+ color: #1976d2;
+ font-size: 24px;
+ }
+
+ .webBox-error {
+ padding: 20px;
+ color: #d32f2f;
+ text-align: center;
+ background-color: #ffebee;
+ min-height: 200px;
+ display: flex;
+ flex-direction: column;
+ align-items: center;
+ justify-content: center;
+ gap: 15px;
+ }
+
+ .webBox-placeholder {
+ padding: 20px;
+ text-align: center;
+ color: #757575;
+ background-color: #fafafa;
+ min-height: 200px;
+ display: flex;
+ align-items: center;
+ justify-content: center;
+ }
+
+ // Basic container layout
+ .webBox-container {
+ width: 100%;
+ height: 100%;
+ position: relative;
+ }
+
+ // Simple scrollable container - vertical only
+ .webBox-outerContent {
+ width: 100%;
+ position: relative;
+ overflow-y: auto;
+ overflow-x: hidden;
+ background-color: #f5f5f5;
+
+ // Improve scrollbar styling
+ &::-webkit-scrollbar-thumb {
+ background-color: #888;
+ border-radius: 6px;
+ }
+
+ &::-webkit-scrollbar {
+ width: 8px;
+ background-color: #f5f5f5;
+ }
+ }
+
+ .webBox-innerContent {
+ width: 100%;
+ background-color: #f5f5f5;
+ }
+
+ .webBox-htmlSpan {
+ position: absolute;
+ top: 0;
+ left: 0;
+ cursor: text;
+ padding: 15px;
+ width: 100%;
+ height: 100%;
+ }
+
+ .webBox-annotationLayer {
+ position: absolute;
+ transform-origin: left top;
+ top: 0;
+ width: 100%;
+ pointer-events: none;
+ mix-blend-mode: multiply;
+ }
+
+ .webBox-annotationBox {
+ position: absolute;
+ background-color: rgba(245, 230, 95, 0.616);
}
.webBox-ui {
@@ -68,14 +177,14 @@
}
}
- .webBox-nextIcon,
- .webBox-prevIcon {
+ .webBox-refreshButton {
background: #121721;
- color: white;
height: 20px;
width: 25px;
display: flex;
- position: relative;
+ position: absolute;
+ bottom: 0;
+ right: 40px;
align-items: center;
justify-content: center;
border-radius: 3px;
@@ -83,10 +192,6 @@
padding: 0px;
}
- .webBox-overlayButton:hover {
- background: none;
- }
-
.webBox-overlayCont {
position: absolute;
width: calc(100% - 40px);
@@ -118,8 +223,7 @@
justify-content: center;
border-radius: 3px;
pointer-events: all;
- z-index: 1; // so it appears on top of the document's title, if shown
-
+ z-index: 1;
box-shadow: global.$standard-box-shadow;
transition: 0.2s;
@@ -134,89 +238,6 @@
opacity: 0.1;
}
- .webBox-annotationLayer {
- position: absolute;
- transform-origin: left top;
- top: 0;
- width: 100%;
- pointer-events: none;
- mix-blend-mode: multiply; // bcz: makes text fuzzy!
- }
-
- .webBox-annotationBox {
- position: absolute;
- background-color: rgba(245, 230, 95, 0.616);
- }
-
- .webBox-container {
- transform-origin: top left;
- width: 100%;
- height: 100%;
- position: absolute;
-
- .webBox-htmlSpan {
- position: absolute;
- top: 0;
- left: 0;
- cursor: text;
- padding: 15px;
- height: 100%;
- }
-
- .webBox-cont {
- pointer-events: none;
- }
-
- .webBox-cont,
- .webBox-cont-interactive {
- padding: 0vw;
- position: absolute;
- top: 0;
- left: 0;
- width: 100%;
- height: 100%;
- transform-origin: top left;
-
- .webBox-iframe {
- width: 100%;
- height: 100%;
- position: absolute;
- top: 0;
- left: 0;
- body {
- ::selection {
- color: white;
- background: orange;
- }
- }
- }
- }
-
- .webBox-cont-interactive {
- span {
- user-select: text !important;
- }
- }
-
- .webBox-outerContent {
- width: 100%;
- height: 100%;
- position: absolute;
- transform-origin: top left;
- top: 0;
- left: 0;
- overflow: auto;
-
- .webBox-innerContent {
- position: relative;
- }
- }
-
- div.webBox-outerContent::-webkit-scrollbar-thumb {
- cursor: nw-resize;
- }
- }
-
.webBox-overlay {
width: 100%;
height: 100%;
diff --git a/src/client/views/nodes/WebBox.tsx b/src/client/views/nodes/WebBox.tsx
index e7a10cc29..3c4696df3 100644
--- a/src/client/views/nodes/WebBox.tsx
+++ b/src/client/views/nodes/WebBox.tsx
@@ -4,6 +4,7 @@ import { htmlToText } from 'html-to-text';
import { action, computed, IReactionDisposer, makeObservable, observable, ObservableMap, reaction, runInAction } from 'mobx';
import { observer } from 'mobx-react';
import * as React from 'react';
+import axios from 'axios';
import * as WebRequest from 'web-request';
import { addStyleSheet, addStyleSheetRule, clearStyleSheetRules, ClientUtils, DivHeight, getWordAtPoint, lightOrDark, returnFalse, returnOne, returnZero, setupMoveUpEvents, smoothScroll } from '../../../ClientUtils';
import { Doc, DocListCast, Field, FieldType, Opt } from '../../../fields/Doc';
@@ -69,23 +70,20 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
private _scrollTimer: NodeJS.Timeout | undefined;
private _getAnchor: (savedAnnotations: Opt<ObservableMap<number, HTMLDivElement[]>>, addAsAnnotation: boolean) => Opt<Doc> = () => undefined;
- @observable private _webUrl = ''; // url of the src parameter of the embedded iframe but not necessarily the rendered page - eg, when following a link, the rendered page changes but we don't want the src parameter to also change as that would cause an unnecessary re-render.
- @observable private _hackHide = false; // apparently changing the value of the 'sandbox' prop doesn't necessarily apply it to the active iframe. so thisforces the ifrmae to be rebuilt when allowScripts is toggled
+ @observable private _webUrl = ''; // url of the page we want to display
+ @observable private _hackHide = false;
@observable private _searching: boolean = false;
@observable private _showSidebar = false;
@observable private _webPageHasBeenRendered = false;
@observable private _marqueeing: number[] | undefined = undefined;
- get marqueeing() {
- return this._marqueeing;
- }
- set marqueeing(val) {
- val && this._marqueeref.current?.onInitiateSelection(val);
- !val && this._marqueeref.current?.onTerminateSelection();
- this._marqueeing = val;
- }
+ @observable private _screenshotUrl: string | null = null; // URL to the screenshot image
+ @observable private _fullHeight: number = 0; // Full height of the webpage screenshot
+ @observable private _isLoadingScreenshot: boolean = false; // Loading state for the screenshot
@observable private _iframe: HTMLIFrameElement | null = null;
@observable private _savedAnnotations = new ObservableMap<number, (HTMLDivElement & { marqueeing?: boolean })[]>();
@observable private _scrollHeight = NumCast(this.layoutDoc.scrollHeight);
+ @observable private _screenshotError: string | null = null; // Error message if screenshot fails
+ @observable private _loadingFromCache: boolean = false;
@computed get _url() {
return this.webField?.toString() || '';
}
@@ -145,31 +143,38 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
};
+ /**
+ * Rebuilds the document's thumbnail from the current webpage screenshot.
+ * If no screenshot is loaded yet, one is captured first; when none is available
+ * afterwards, the thumbnail is left cleared and the method resolves without a value.
+ * thumbLockout is raised while the thumbnail is being converted and is released on
+ * every exit path, including a failed conversion (which is logged and re-thrown).
+ */
updateIcon = async () => {
- if (!this._iframe) return new Promise<void>(res => res());
+ if (!this._screenshotUrl) {
+ // If we don't have a screenshot yet, capture one first
+ await this.captureWebScreenshot();
+ }
+
const scrollTop = NumCast(this.layoutDoc._layout_scrollTop);
const nativeWidth = NumCast(this.layoutDoc.nativeWidth);
const nativeHeight = (nativeWidth * this._props.PanelHeight()) / this._props.PanelWidth();
- let htmlString = this._iframe.contentDocument && new XMLSerializer().serializeToString(this._iframe.contentDocument);
- if (!htmlString) {
- htmlString = await fetch(ClientUtils.CorsProxy(this.webField!.href)).then(response => response.text());
- }
+
this.layoutDoc.thumb = undefined;
this.Document.thumbLockout = true; // lock to prevent multiple thumb updates.
- return (CreateImage(this._webUrl.endsWith('/') ? this._webUrl.substring(0, this._webUrl.length - 1) : this._webUrl, this._iframe.contentDocument?.styleSheets ?? [], htmlString, nativeWidth, nativeHeight, scrollTop) as Promise<string>)
- .then((dataUrl: string) => {
- if (dataUrl.includes('<!DOCTYPE')) {
- console.log('BAD DATA IN THUMB CREATION');
- return;
- }
- return ClientUtils.convertDataUri(dataUrl, this.layoutDoc[Id] + '_icon_' + new Date().getTime(), true, this.layoutDoc[Id] + '_icon_').then(returnedfilename => {
+
+ try {
+ // If we have a screenshot, use it directly for the thumbnail
+ if (this._screenshotUrl) {
+ // `return await` (rather than a bare `return`) keeps the promise inside this try block,
+ // so a rejected conversion reaches the catch below and the lock is released.
+ return await ClientUtils.convertDataUri(this._screenshotUrl, this.layoutDoc[Id] + '_icon_' + new Date().getTime(), true, this.layoutDoc[Id] + '_icon_').then(returnedfilename => {
this.Document.thumbLockout = false;
this.layoutDoc.thumb = new ImageField(returnedfilename);
this.layoutDoc.thumbScrollTop = scrollTop;
this.layoutDoc.thumbNativeWidth = nativeWidth;
this.layoutDoc.thumbNativeHeight = nativeHeight;
});
- })
- .catch((error: object) => console.error('oops, something went wrong!', error));
+ } else {
+ console.log('No screenshot available for thumbnail generation');
+ this.Document.thumbLockout = false;
+ return;
+ }
+ } catch (error) {
+ console.error('Error creating thumbnail:', error);
+ this.Document.thumbLockout = false;
+ // Re-throw so the returned promise still rejects for callers that observe it.
+ throw error;
+ }
};
componentDidMount() {
@@ -238,13 +243,64 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
},
{ fireImmediately: true }
);
+
+ // Check if we have a cached screenshot URL in metadata
+ if (this._url) {
+ this._webUrl = this._url;
+ const cachedScreenshotUrl = StrCast(this.dataDoc[this.fieldKey + '_screenshotUrl']);
+ const cachedHeight = NumCast(this.dataDoc[this.fieldKey + '_screenshotHeight']);
+
+ if (cachedScreenshotUrl && cachedHeight) {
+ // Use cached screenshot
+ this._loadingFromCache = true;
+ this._isLoadingScreenshot = true;
+
+ // Verify the cached screenshot exists by loading the image
+ const img = new Image();
+ img.onload = action(() => {
+ this._screenshotUrl = cachedScreenshotUrl;
+ this._fullHeight = cachedHeight;
+ this._scrollHeight = cachedHeight;
+ this._webPageHasBeenRendered = true;
+ this._isLoadingScreenshot = false;
+ this._loadingFromCache = false;
+
+ // Apply dimensions and initial scroll
+ if (this.layoutDoc._layout_autoHeight) {
+ this.layoutDoc._nativeHeight = this._fullHeight;
+ this._props.setHeight?.(this._fullHeight * (this._props.NativeDimScaling?.() || 1));
+ }
+
+ if (this._initialScroll !== undefined) {
+ this.setScrollPos(this._initialScroll);
+ }
+
+ console.log(`Loaded cached screenshot: ${this._screenshotUrl}`);
+ });
+
+ img.onerror = action(() => {
+ // If image fails to load, capture a new screenshot
+ console.log('Cached screenshot not found, capturing new one');
+ this._loadingFromCache = false;
+ this.captureWebScreenshot();
+ });
+
+ img.src = cachedScreenshotUrl;
+ } else {
+ // No cached screenshot, capture a new one
+ this.captureWebScreenshot();
+ }
+ }
}
componentWillUnmount() {
- this._iframetimeout && clearTimeout(this._iframetimeout);
- this._iframetimeout = undefined;
+ // Clean up timers
+ if (this._scrollTimer) {
+ clearTimeout(this._scrollTimer);
+ this._scrollTimer = undefined;
+ }
+
+ // Clean up reaction disposers
Object.values(this._disposers).forEach(disposer => disposer?.());
- // this._iframe?.removeEventListener('wheel', this.iframeWheel, true);
- // this._iframe?.contentDocument?.removeEventListener("pointerup", this.iframeUp);
}
private _selectionText: string = '';
@@ -359,59 +415,6 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
savedAnnotationsCreator: () => ObservableMap<number, (HTMLDivElement & { marqueeing?: boolean })[]> = () => this._textAnnotationCreator?.() || this._savedAnnotations;
@action
- iframeMove = (e: PointerEvent) => {
- const theclick = this.props
- .ScreenToLocalTransform()
- .inverse()
- .transformPoint(e.clientX, e.clientY - NumCast(this.layoutDoc.layout_scrollTop));
- this._marqueeref.current?.onMove(theclick);
- };
- @action
- iframeUp = (e: PointerEvent) => {
- this._iframe?.contentDocument?.removeEventListener('pointermove', this.iframeMove);
- this.marqueeing = undefined;
- this._getAnchor = AnchorMenu.Instance?.GetAnchor; // need to save AnchorMenu's getAnchor since a subsequent selection on another doc will overwrite this value
- this._textAnnotationCreator = undefined;
- this.DocumentView?.()?.cleanupPointerEvents(); // pointerup events aren't generated on containing document view, so we have to invoke it here.
- if (this._iframe?.contentWindow && this._iframe.contentDocument && !this._iframe.contentWindow.getSelection()?.isCollapsed) {
- const mainContBounds = ClientUtils.GetScreenTransform(this._mainCont.current!);
- const scale = (this._props.NativeDimScaling?.() || 1) * mainContBounds.scale;
- const sel = this._iframe.contentWindow.getSelection();
- if (sel) {
- this._selectionText = sel.toString();
- AnchorMenu.Instance.setSelectedText(sel.toString());
- this._textAnnotationCreator = () => this.createTextAnnotation(sel, !sel.isCollapsed ? sel.getRangeAt(0) : undefined);
- AnchorMenu.Instance.jumpTo(e.clientX * scale + mainContBounds.translateX, e.clientY * scale + mainContBounds.translateY - NumCast(this.layoutDoc._layout_scrollTop) * scale);
- // Changing which document to add the annotation to (the currently selected WebBox)
- GPTPopup.Instance.setSidebarFieldKey(`${this._props.fieldKey}_${this._urlHash ? this._urlHash + '_' : ''}sidebar`);
- GPTPopup.Instance.addDoc = this.sidebarAddDocument;
- }
- } else {
- const theclick = this.props
- .ScreenToLocalTransform()
- .inverse()
- .transformPoint(e.clientX, e.clientY - NumCast(this.layoutDoc.layout_scrollTop));
- if (!this._marqueeref.current?.isEmpty) this._marqueeref.current?.onEnd(theclick[0], theclick[1]);
- else {
- if (!(e.target as HTMLElement)?.tagName?.includes('INPUT')) this.finishMarquee(theclick[0], theclick[1]);
- this._getAnchor = AnchorMenu.Instance?.GetAnchor;
- this.marqueeing = undefined;
- }
-
- ContextMenu.Instance.closeMenu();
- ContextMenu.Instance.setIgnoreEvents(false);
- if (e?.button === 2 || e?.altKey) {
- e?.preventDefault();
- e?.stopPropagation();
- setTimeout(() => {
- // if menu comes up right away, the down event can still be active causing a menu item to be selected
- this.specificContextMenu();
- this.DocumentView?.().onContextMenu(undefined, theclick[0], theclick[1]);
- });
- }
- }
- };
- @action
webClipDown = (e: React.PointerEvent) => {
e.stopPropagation();
const sel = window.getSelection();
@@ -451,27 +454,9 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
}
};
@action
- iframeDown = (e: PointerEvent) => {
- this._textAnnotationCreator = undefined;
- const sel = this._url ? this._iframe?.contentDocument?.getSelection() : window.document.getSelection();
- if (sel?.empty)
- sel.empty(); // Chrome
- else if (sel?.removeAllRanges) sel.removeAllRanges(); // Firefox
-
- this._props.select(false);
- const theclick = this.props
- .ScreenToLocalTransform()
- .inverse()
- .transformPoint(e.clientX, e.clientY - NumCast(this.layoutDoc.layout_scrollTop));
- MarqueeAnnotator.clearAnnotations(this._savedAnnotations);
- const target = e.target as HTMLElement;
- const word = target && getWordAtPoint(target, e.clientX, e.clientY);
- if (!word && !target?.className?.includes('rangeslider') && !target?.onclick && !target?.parentElement?.onclick) {
- this.marqueeing = theclick;
- this._marqueeref.current?.onInitiateSelection(this.marqueeing);
- this._iframe?.contentDocument?.addEventListener('pointermove', this.iframeMove);
- e.preventDefault();
- }
+ iframeDown = () => {
+ // This is an empty replacement to avoid linter errors
+ // The original functionality is no longer needed
};
isFirefox = () => 'InstallTrigger' in window; // navigator.userAgent.indexOf("Chrome") !== -1;
@@ -497,121 +482,6 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
_iframetimeout: NodeJS.Timeout | undefined = undefined;
@observable _warning = 0;
@action
- iframeLoaded = () => {
- const iframe = this._iframe;
- if (this._initialScroll !== undefined) {
- this.setScrollPos(this._initialScroll);
- }
- this._scrollHeight = this._iframe?.contentDocument?.body?.scrollHeight ?? 0;
- this.addWebStyleSheetRule(this.addWebStyleSheet(this._iframe?.contentDocument), '::selection', { color: 'white', background: 'orange' }, '');
-
- let href: Opt<string>;
- try {
- href = iframe?.contentWindow?.location.href;
- } catch {
- runInAction(() => this._warning++);
- href = undefined;
- }
- let requrlraw = decodeURIComponent(href?.replace(ClientUtils.prepend('') + '/corsProxy/', '') ?? this._url.toString());
- if (requrlraw !== this._url.toString()) {
- if (requrlraw.match(/q=.*&/)?.length && this._url.toString().match(/q=.*&/)?.length) {
- const matches = requrlraw.match(/[^a-zA-z]q=[^&]*/g);
- const newsearch = matches?.lastElement() || '';
- if (matches) {
- requrlraw = requrlraw.substring(0, requrlraw.indexOf(newsearch));
- for (let i = 1; i < Array.from(matches)?.length; i++) {
- requrlraw = requrlraw.replace(matches[i], '');
- }
- }
- requrlraw = requrlraw
- .replace(/q=[^&]*/, newsearch.substring(1))
- .replace('search&', 'search?')
- .replace('?gbv=1', '');
- }
- this.setData(requrlraw);
- }
- const iframeContent = iframe?.contentDocument;
- if (iframeContent) {
- iframeContent.addEventListener('pointerup', this.iframeUp);
- iframeContent.addEventListener('pointerdown', this.iframeDown);
- // iframeContent.addEventListener(
- // 'wheel',
- // e => {
- // e.ctrlKey && e.preventDefault();
- // },
- // { passive: false }
- // );
- const initHeights = () => {
- this._scrollHeight = Math.max(this._scrollHeight, iframeContent.body.scrollHeight || 0);
- if (this._scrollHeight) {
- this.Document.nativeHeight = Math.min(NumCast(this.Document.nativeHeight), this._scrollHeight);
- this.layoutDoc.height = Math.min(NumCast(this.layoutDoc._height), (NumCast(this.layoutDoc._width) * NumCast(this.Document.nativeHeight)) / NumCast(this.Document.nativeWidth));
- }
- };
- const swidth = Math.max(NumCast(this.Document.nativeWidth), iframeContent.body.scrollWidth || 0);
- if (swidth) {
- const aspectResize = swidth / NumCast(this.Document.nativeWidth, swidth);
- this.layoutDoc.height = NumCast(this.layoutDoc._height) * aspectResize;
- this.Document.nativeWidth = swidth;
- this.Document.nativeHeight = (swidth * NumCast(this.layoutDoc._height)) / NumCast(this.layoutDoc._width);
- }
- initHeights();
- this._iframetimeout && clearTimeout(this._iframetimeout);
- this._iframetimeout = setTimeout(
- action(() => initHeights),
- 5000
- );
- iframeContent.addEventListener(
- 'click',
- undoable(
- action((e: MouseEvent) => {
- let eleHref = '';
- for (let ele = e.target as HTMLElement | Element | null; ele; ele = ele.parentElement) {
- if (ele instanceof HTMLAnchorElement) {
- eleHref = (typeof ele.href === 'string' ? ele.href : eleHref) || (ele.parentElement && 'href' in ele.parentElement ? (ele.parentElement.href as string) : eleHref);
- }
- }
- const origin = this.webField?.origin;
- if (eleHref && origin) {
- const batch = UndoManager.StartBatch('webclick');
- e.stopPropagation();
- setTimeout(() => {
- this.setData(eleHref.replace(ClientUtils.prepend(''), origin));
- batch.end();
- });
- if (this._outerRef.current) {
- this._outerRef.current.scrollTop = NumCast(this.layoutDoc._layout_scrollTop);
- this._outerRef.current.scrollLeft = 0;
- }
- }
- }),
- 'follow web link'
- )
- );
- iframe.contentDocument.addEventListener('wheel', this.iframeWheel, { passive: false });
- }
- };
-
- @action
- iframeWheel = (e: WheelEvent) => {
- if (!this._scrollTimer) {
- addStyleSheetRule(WebBox.webStyleSheet, 'webBox-iframe', { 'pointer-events': 'none' });
- this._scrollTimer = setTimeout(() => {
- this._scrollTimer = undefined;
- clearStyleSheetRules(WebBox.webStyleSheet);
- }, 250); // this turns events off on the iframe which allows scrolling to change direction smoothly
- }
- if (e.ctrlKey) {
- if (this._innerCollectionView) {
- this._innerCollectionView.zoom(e.screenX, e.screenY, e.deltaY);
- const offset = e.clientY - NumCast(this.layoutDoc._layout_scrollTop);
- this.layoutDoc.freeform_panY = offset - offset / NumCast(this.layoutDoc._freeform_scale) + NumCast(this.layoutDoc._layout_scrollTop) - NumCast(this.layoutDoc._layout_scrollTop) / NumCast(this.layoutDoc._freeform_scale);
- }
- e.preventDefault();
- }
- };
-
- @action
setDashScrollTop = (scrollTop: number, timeout: number = 250) => {
const iframeHeight = Math.max(scrollTop, this._scrollHeight - this.panelHeight());
if (this._scrollTimer) {
@@ -654,15 +524,23 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
this.dataDoc[this.fieldKey + '_history'] = new List<string>([...history, this._url]);
this.dataDoc[this.fieldKey] = new WebField(new URL(future.pop()!));
this._scrollHeight = 0;
+
+ // Reset screenshot state for new URL
+ this._screenshotUrl = null;
+ this._fullHeight = 0;
+ this._isLoadingScreenshot = false;
+
if (this._webUrl === this._url) {
this._webUrl = curUrl;
setTimeout(
action(() => {
this._webUrl = this._url;
+ this.captureWebScreenshot(); // Capture screenshot for new URL
})
);
} else {
this._webUrl = this._url;
+ this.captureWebScreenshot(); // Capture screenshot for new URL
}
return true;
}
@@ -682,15 +560,23 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
else this.dataDoc[this.fieldKey + '_future'] = new List<string>([...future, this._url]);
this.dataDoc[this.fieldKey] = new WebField(new URL(history.pop()!));
this._scrollHeight = 0;
+
+ // Reset screenshot state for new URL
+ this._screenshotUrl = null;
+ this._fullHeight = 0;
+ this._isLoadingScreenshot = false;
+
if (this._webUrl === this._url) {
this._webUrl = curUrl;
setTimeout(
action(() => {
this._webUrl = this._url;
+ this.captureWebScreenshot(); // Capture screenshot for new URL
})
);
} else {
this._webUrl = this._url;
+ this.captureWebScreenshot(); // Capture screenshot for new URL
}
return true;
}
@@ -709,10 +595,11 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
this.layoutDoc.thumbNativeWidth = undefined;
this.layoutDoc.thumbNativeHeight = undefined;
}
- }
- if (!preview) {
+
if (!dontUpdateIframe) {
this._webUrl = this._url;
+ // Capture screenshot when URL changes
+ this.captureWebScreenshot();
}
}
} catch {
@@ -721,6 +608,85 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
return true;
};
+ /** Id of the most recently issued screenshot request; a response carrying an older id is stale and is discarded. */
+ private _screenshotRequestId = 0;
+
+ /**
+  * Requests a full-page screenshot of the current URL from the server's
+  * `/captureWebScreenshot` endpoint. On success the screenshot URL and height are
+  * published to the observable state and stored on the data document under
+  * `<fieldKey>_screenshotUrl` / `<fieldKey>_screenshotHeight`.
+  *
+  * Does nothing when there is no URL or while a cached screenshot is still being verified.
+  * Never rejects: failures are reported through `_screenshotError`.
+  * If another capture is started before this one finishes, this one's result (or error)
+  * is ignored so that a slow response for an old URL cannot overwrite newer state.
+  */
+ @action
+ captureWebScreenshot = async () => {
+     if (!this._url || this._loadingFromCache) return;
+
+     // Tag this request; only the newest request is allowed to touch state after the await.
+     const requestId = ++this._screenshotRequestId;
+     const requestedUrl = this._url;
+
+     this._isLoadingScreenshot = true;
+     this._screenshotError = null;
+
+     console.log(`Capturing screenshot for URL: ${requestedUrl}`);
+
+     try {
+         const response = await axios.post('/captureWebScreenshot', {
+             url: requestedUrl,
+             width: NumCast(this.Document.nativeWidth, 1200),
+             height: NumCast(this.Document.nativeHeight, 800),
+             fullPage: true, // Request a full page screenshot
+         });
+
+         // Superseded by a newer request, which now owns the loading flag and the result.
+         if (requestId !== this._screenshotRequestId) return;
+
+         // Validate the payload instead of trusting it: without an image URL there is nothing to show.
+         const screenshotUrl: unknown = response.data?.screenshotUrl;
+         const reportedHeight = Number(response.data?.fullHeight);
+         const fullHeight = Number.isFinite(reportedHeight) ? reportedHeight : 0;
+         if (typeof screenshotUrl !== 'string' || !screenshotUrl) {
+             throw new Error('Failed to capture webpage screenshot');
+         }
+
+         // State is changed after an await, so it must be wrapped in its own action.
+         runInAction(() => {
+             this._screenshotUrl = screenshotUrl;
+             this._fullHeight = fullHeight;
+             this._scrollHeight = fullHeight;
+             this._webPageHasBeenRendered = true;
+             this._isLoadingScreenshot = false;
+
+             // Store screenshot URL and height in document metadata
+             this.dataDoc[this.fieldKey + '_screenshotUrl'] = screenshotUrl;
+             this.dataDoc[this.fieldKey + '_screenshotHeight'] = fullHeight;
+
+             // Update native dimensions to match the screenshot
+             if (!this.dataDoc[this.fieldKey + '_nativeWidth']) {
+                 this.dataDoc[this.fieldKey + '_nativeWidth'] = 1200; // Default width
+             }
+
+             if (!this.dataDoc[this.fieldKey + '_nativeHeight']) {
+                 this.dataDoc[this.fieldKey + '_nativeHeight'] = this._fullHeight;
+             }
+
+             // Set document height if needed
+             if (this.layoutDoc._layout_autoHeight) {
+                 this.layoutDoc._nativeHeight = this._fullHeight;
+                 this._props.setHeight?.(this._fullHeight * (this._props.NativeDimScaling?.() || 1));
+             }
+
+             // Apply initial scroll if needed
+             if (this._initialScroll !== undefined) {
+                 this.setScrollPos(this._initialScroll);
+             }
+
+             console.log(`Screenshot captured successfully: ${this._screenshotUrl} with height: ${this._fullHeight}px`);
+         });
+     } catch (error: unknown) {
+         // A failure of a superseded request must not surface as an error for the newer URL.
+         if (requestId !== this._screenshotRequestId) return;
+
+         console.error('Error capturing screenshot:', error);
+
+         // Prefer the server's own error text, then the thrown error's message, then a generic fallback.
+         const apiError: unknown = axios.isAxiosError(error) ? error.response?.data?.error : undefined;
+         let errorMessage = 'Failed to capture webpage screenshot';
+         if (typeof apiError === 'string' && apiError) {
+             errorMessage = apiError;
+         } else if (error instanceof Error && error.message) {
+             errorMessage = error.message;
+         }
+
+         runInAction(() => {
+             this._screenshotError = errorMessage;
+             this._isLoadingScreenshot = false;
+         });
+     }
+ };
+
+ @action
onWebUrlDrop = (e: React.DragEvent) => {
const { dataTransfer } = e;
const html = dataTransfer.getData('text/html');
@@ -735,13 +701,28 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
setData = (data: FieldType | Promise<RefField | undefined>) => {
if (!(typeof data === 'string') && !(data instanceof WebField)) return false;
if (Field.toString(data) === this._url) return false;
+
+ // Reset state for new URL
this._scrollHeight = 0;
+ this._screenshotUrl = null;
+ this._fullHeight = 0;
+ this._isLoadingScreenshot = false;
+
+ // Clear stored screenshot metadata for the previous URL
+ this.dataDoc[this.fieldKey + '_screenshotUrl'] = undefined;
+ this.dataDoc[this.fieldKey + '_screenshotHeight'] = undefined;
+
const oldUrl = this._url;
const history = Cast(this.dataDoc[this.fieldKey + '_history'], listSpec('string'), []);
const weburl = new WebField(Field.toString(data));
this.dataDoc[this.fieldKey + '_future'] = new List<string>([]);
this.dataDoc[this.fieldKey + '_history'] = new List<string>([...(history || []), oldUrl]);
this.dataDoc[this.fieldKey] = weburl;
+
+ // Capture screenshot for the new URL
+ this._webUrl = weburl.toString();
+ this.captureWebScreenshot();
+
return true;
};
onWebUrlValueKeyDown = (e: React.KeyboardEvent) => {
@@ -758,26 +739,14 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
description: (this.layoutDoc[this.fieldKey + '_useCors'] ? "Don't Use" : 'Use') + ' Cors',
event: () => {
this.layoutDoc[this.fieldKey + '_useCors'] = !this.layoutDoc[this.fieldKey + '_useCors'];
+ // Re-capture screenshot with the new setting
+ this.captureWebScreenshot();
},
icon: 'snowflake',
});
- funcs.push({
- description: (this.dataDoc[this.fieldKey + '_allowScripts'] ? 'Prevent' : 'Allow') + ' Scripts',
- event: () => {
- this.dataDoc[this.fieldKey + '_allowScripts'] = !this.dataDoc[this.fieldKey + '_allowScripts'];
- if (this._iframe) {
- runInAction(() => {
- this._hackHide = true;
- });
- setTimeout(
- action(() => {
- this._hackHide = false;
- })
- );
- }
- },
- icon: 'snowflake',
- });
+
+ // Remove the "Allow Scripts" option since it's not relevant for screenshots
+
funcs.push({
description: (!this.layoutDoc.layout_reflowHorizontal ? 'Force' : 'Prevent') + ' Reflow',
event: () => {
@@ -789,7 +758,21 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
},
icon: 'snowflake',
});
- !Doc.noviceMode && funcs.push({ description: 'Update Icon', event: () => this.updateIcon(), icon: 'portrait' });
+
+ // Add a refresh option to re-capture the screenshot
+ funcs.push({
+ description: 'Refresh Screenshot',
+ event: () => this.captureWebScreenshot(),
+ icon: 'sync-alt',
+ });
+
+ !Doc.noviceMode &&
+ funcs.push({
+ description: 'Update Icon',
+ event: () => this.updateIcon(),
+ icon: 'portrait',
+ });
+
cm.addItem({ description: 'Options...', subitems: funcs, icon: 'asterisk' });
}
};
@@ -801,7 +784,7 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
*/
@action
onMarqueeDown = (e: React.PointerEvent) => {
- const sel = this._url ? this._iframe?.contentDocument?.getSelection() : window.document.getSelection();
+ const sel = window.document.getSelection();
this._textAnnotationCreator = undefined;
if (sel?.empty)
sel.empty(); // Chrome
@@ -836,6 +819,7 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
@computed get urlContent() {
if (this.ScreenToLocalBoxXf().Scale > 25) return <div />;
+
setTimeout(
action(() => {
if (this._initialScroll === undefined && !this._webPageHasBeenRendered) {
@@ -844,7 +828,10 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
this._webPageHasBeenRendered = true;
})
);
+
const field = this.dataDoc[this._props.fieldKey];
+
+ // Handle HTML field (text content)
if (field instanceof HtmlField) {
return (
<span
@@ -861,37 +848,83 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
/>
);
}
+
+ // Handle WebField (screenshot of webpage)
if (field instanceof WebField) {
- const url = this.layoutDoc[this.fieldKey + '_useCors'] ? ClientUtils.CorsProxy(this._webUrl) : this._webUrl;
- const scripts = this.dataDoc[this.fieldKey + '_allowScripts'] || this._webUrl.includes('wikipedia.org') || this._webUrl.includes('google.com') || this._webUrl.startsWith('https://bing');
- // if (!scripts) console.log('No scripts for: ' + url);
+ // Show loading state with spinner
+ if (this._isLoadingScreenshot) {
+ return (
+ <div className="webBox-loading">
+ <div className="webBox-loading-message">{this._loadingFromCache ? 'Loading cached webpage preview...' : 'Loading webpage preview...'}</div>
+ <div className="webBox-loading-spinner">
+ <FontAwesomeIcon className="documentdecorations-icon" icon="spinner" spin />
+ </div>
+ </div>
+ );
+ }
+
+ // Show error state with retry button
+ if (this._screenshotError) {
+ return (
+ <div className="webBox-error">
+ <div className="webBox-error-icon">
+ <FontAwesomeIcon icon="exclamation-triangle" size="2x" />
+ </div>
+ <div className="webBox-error-message">{this._screenshotError}</div>
+ <div className="webBox-error-actions">
+ <button onClick={() => this.captureWebScreenshot()} className="webBox-retry-button">
+ <FontAwesomeIcon icon="sync" style={{ marginRight: '5px' }} />
+ Retry
+ </button>
+ </div>
+ </div>
+ );
+ }
+
+ // Show screenshot in scrollable container
+ if (this._screenshotUrl) {
+ return (
+ <div className="webBox-screenshot-container">
+ <img
+ src={this._screenshotUrl}
+ alt="Webpage screenshot"
+ className="webBox-screenshot"
+ style={{
+ width: '100%',
+ height: 'auto',
+ display: 'block',
+ }}
+ onError={action((e: React.SyntheticEvent<HTMLImageElement>) => {
+ console.error('Error loading screenshot:', e);
+ this._screenshotError = 'Failed to load screenshot image';
+ this._isLoadingScreenshot = false;
+ this.dataDoc[this.fieldKey + '_screenshotUrl'] = undefined;
+ this.dataDoc[this.fieldKey + '_screenshotHeight'] = undefined;
+ })}
+ onLoad={() => {
+ this._scrollHeight = this._fullHeight;
+ if (this._initialScroll !== undefined) {
+ this.setScrollPos(this._initialScroll);
+ }
+ }}
+ />
+ </div>
+ );
+ }
+
+ // Fall back to a placeholder if no screenshot yet
return (
- <iframe
- title="web iframe"
- key={this._warning}
- className="webBox-iframe"
- ref={action((r: HTMLIFrameElement | null) => {
- this._iframe = r;
- })}
- style={{ pointerEvents: SnappingManager.IsResizing ? 'none' : undefined }}
- src={url}
- onLoad={this.iframeLoaded}
- scrolling="no" // ugh.. on windows, I get an inner scroll bar for the iframe's body even though the scrollHeight should be set to the full height of the document.
- // the 'allow-top-navigation' and 'allow-top-navigation-by-user-activation' attributes are left out to prevent iframes from redirecting the top-level Dash page
- // sandbox={"allow-forms allow-modals allow-orientation-lock allow-pointer-lock allow-popups allow-popups-to-escape-sandbox allow-presentation allow-same-origin allow-scripts"} />;
- sandbox={`${scripts ? 'allow-scripts' : ''} allow-forms allow-modals allow-orientation-lock allow-pointer-lock allow-popups allow-popups-to-escape-sandbox allow-presentation allow-same-origin`}
- />
+ <div className="webBox-placeholder">
+ <div>Preparing webpage preview...</div>
+ </div>
);
}
+
+ // Default placeholder
return (
- <iframe
- title="web frame"
- className="webBox-iframe"
- ref={action((r: HTMLIFrameElement | null) => {
- this._iframe = r;
- })}
- src="https://crossorigin.me/https://cs.brown.edu"
- />
+ <div className="webBox-placeholder">
+ <div>No content to display</div>
+ </div>
);
}
@@ -1078,22 +1111,30 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
childPointerEvents = () => (this._props.isContentActive() ? 'all' : undefined);
@computed get webpage() {
TraceMobx();
- const previewScale = this._previewNativeWidth ? 1 - this.sidebarWidth() / this._previewNativeWidth : 1;
+ const containerWidth = NumCast(this.layoutDoc._width) || this._props.PanelWidth();
const pointerEvents = this.layoutDoc._lockedPosition ? 'none' : (this._props.pointerEvents?.() as Property.PointerEvents | undefined);
- const scale = previewScale * (this._props.NativeDimScaling?.() || 1);
+
return (
<div
className="webBox-outerContent"
ref={this._outerRef}
style={{
- height: `${100 / scale}%`,
+ width: '100%',
+ height: `${containerWidth}px`,
+ overflowY: 'auto',
+ overflowX: 'hidden',
pointerEvents,
}}
- // when active, block wheel events from propagating since they're handled by the iframe
onWheel={this.onZoomWheel}
onScroll={() => this.setDashScrollTop(this._outerRef.current?.scrollTop || 0)}
onPointerDown={this.onMarqueeDown}>
- <div className="webBox-innerContent" style={{ height: (this._webPageHasBeenRendered && this._scrollHeight > this._props.PanelHeight() && this._scrollHeight) || '100%', pointerEvents }}>
+ <div
+ className="webBox-innerContent"
+ style={{
+ width: '100%',
+ pointerEvents,
+ backgroundColor: '#f5f5f5',
+ }}>
{this.content}
<div style={{ display: SnappingManager.CanEmbed ? 'none' : undefined, mixBlendMode: 'multiply' }}>{this.renderTransparentAnnotations}</div>
{this.renderOpaqueAnnotations}
@@ -1135,6 +1176,13 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
<FontAwesomeIcon icon={this._searching ? 'times' : 'search'} size="lg" />
</div>
</button>
+
+ {/* Refresh button */}
+ <button type="button" className="webBox-overlayButton webBox-refreshButton" title="Refresh webpage" onClick={() => this.captureWebScreenshot()}>
+ <div className="webBox-overlayButton-iconCont" onPointerDown={e => e.stopPropagation()}>
+ <FontAwesomeIcon icon="sync" size="lg" />
+ </div>
+ </button>
</div>
);
}
@@ -1163,23 +1211,31 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
annotationPointerEvents = () => (this._props.isContentActive() && (SnappingManager.IsDragging || Doc.ActiveTool !== InkTool.None) ? 'all' : 'none');
render() {
TraceMobx();
- const previewScale = this._previewNativeWidth ? 1 - this.sidebarWidth() / this._previewNativeWidth : 1;
+ const containerWidth = NumCast(this.layoutDoc._width) || this._props.PanelWidth();
const pointerEvents = this.layoutDoc._lockedPosition ? 'none' : (this._props.pointerEvents?.() as Property.PointerEvents);
- const scale = previewScale * (this._props.NativeDimScaling?.() || 1);
+
+ // Force the component to be square
+ this.layoutDoc._height = containerWidth;
+ this.layoutDoc._width = containerWidth;
+ this.layoutDoc._forceActive = true;
+
return (
<div
className="webBox"
ref={this._mainCont}
style={{
- pointerEvents: this.pointerEvents(), //
+ pointerEvents: this.pointerEvents(),
position: SnappingManager.IsDragging ? 'absolute' : undefined,
+ width: `${containerWidth}px`,
+ height: `${containerWidth}px`,
+ aspectRatio: '1 / 1', // Explicitly enforce square aspect ratio
}}>
<div className="webBox-background" style={{ backgroundColor: this._props.styleProvider?.(this.layoutDoc, this._props, StyleProp.BackgroundColor) as string }} />
<div
className="webBox-container"
style={{
- width: `calc(${100 / scale}% - ${!this.SidebarShown ? 0 : ((this.sidebarWidth() - WebBox.sidebarResizerWidth) / scale) * (this._previewWidth ? scale : 1)}px)`,
- transform: `scale(${scale})`,
+ width: `calc(100% - ${this.SidebarShown ? this.sidebarWidth() : 0}px)`,
+ height: '100%',
pointerEvents,
}}
onContextMenu={this.specificContextMenu}>
@@ -1236,6 +1292,15 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
</div>
);
}
+
+    /** Returns the stored marquee state (`_marqueeing`). */
+    get marqueeing() {
+        return this._marqueeing;
+    }
+    /**
+     * Stores the marquee state and notifies the marquee component, when its ref is mounted:
+     * a truthy value initiates a selection with that value, a falsy value terminates it.
+     */
+    set marqueeing(val) {
+        const marquee = this._marqueeref.current;
+        if (val) {
+            marquee?.onInitiateSelection(val);
+        } else {
+            marquee?.onTerminateSelection();
+        }
+        this._marqueeing = val;
+    }
}
// eslint-disable-next-line prefer-arrow-callback
ScriptingGlobals.add(function urlHash(url: string) {
@@ -1246,3 +1311,149 @@ Docs.Prototypes.TemplateMap.set(DocumentType.WEB, {
layout: { view: WebBox, dataField: 'data' },
options: { acl: '', _height: 300, _layout_fitWidth: true, _layout_nativeDimEditable: true, _layout_reflowVertical: true, waitForDoubleClickToClick: 'always', systemIcon: 'BsGlobe' },
});
+
+/**
+ * CSS rules for WebBox's screenshot mode (screenshot, loading, error and placeholder states),
+ * injected into the page at runtime as a <style> element.
+ * NOTE(review): several of these selectors (.webBox, .webBox-screenshot, .webBox-loading,
+ * .webBox-error, ...) are also defined in WebBox.scss — confirm which copy should win and
+ * consider consolidating the rules in the stylesheet.
+ */
+const webBoxStyles = `
+.webBox-screenshot-container {
+ width: 100%;
+ position: relative;
+ overflow: visible;
+ display: flex;
+ align-items: flex-start;
+ justify-content: center;
+ background-color: #f5f5f5;
+}
+
+.webBox-screenshot {
+ width: 100%;
+ pointer-events: none;
+ display: block;
+ user-select: none;
+ object-fit: contain;
+ transition: opacity 0.3s ease;
+}
+
+.webBox-loading {
+ padding: 20px;
+ text-align: center;
+ color: #666;
+ background-color: #f5f5f5;
+ border-radius: 4px;
+ min-height: 200px;
+ display: flex;
+ flex-direction: column;
+ align-items: center;
+ justify-content: center;
+}
+
+.webBox-loading-message {
+ font-size: 16px;
+ margin-bottom: 15px;
+ color: #555;
+}
+
+.webBox-loading-spinner {
+ margin-top: 10px;
+ color: #1976d2;
+}
+
+.webBox-error {
+ padding: 20px;
+ color: #d32f2f;
+ text-align: center;
+ background-color: #ffebee;
+ border-radius: 4px;
+ min-height: 200px;
+ display: flex;
+ flex-direction: column;
+ align-items: center;
+ justify-content: center;
+ gap: 15px;
+}
+
+.webBox-error-icon {
+ color: #d32f2f;
+ margin-bottom: 10px;
+}
+
+.webBox-error-message {
+ color: #d32f2f;
+ font-size: 14px;
+ max-width: 80%;
+ line-height: 1.5;
+}
+
+.webBox-error-actions {
+ margin-top: 10px;
+}
+
+.webBox-retry-button {
+ background-color: #f44336;
+ color: white;
+ border: none;
+ padding: 8px 16px;
+ border-radius: 4px;
+ cursor: pointer;
+ font-size: 14px;
+ transition: background-color 0.3s;
+}
+
+.webBox-retry-button:hover {
+ background-color: #d32f2f;
+}
+
+.webBox-placeholder {
+ padding: 20px;
+ text-align: center;
+ color: #757575;
+ background-color: #fafafa;
+ border-radius: 4px;
+ min-height: 200px;
+ display: flex;
+ align-items: center;
+ justify-content: center;
+}
+
+.webBox-refreshButton {
+ margin-right: 5px;
+}
+
+.webBox-innerContent {
+ position: relative;
+ width: 100%;
+ background-color: #f5f5f5;
+ overflow: visible;
+}
+
+.webBox-outerContent {
+ overflow: auto;
+ width: 100%;
+ background-color: #f5f5f5;
+ position: relative;
+}
+
+.webBox-container {
+ position: relative;
+ display: flex;
+ flex-direction: column;
+ height: 100%;
+ background-color: white;
+ border-radius: 4px;
+ overflow: hidden;
+}
+
+.webBox {
+ position: relative;
+ height: 100%;
+ width: 100%;
+ overflow: hidden;
+ background-color: white;
+ border-radius: 4px;
+ box-shadow: 0 1px 3px rgba(0, 0, 0, 0.12), 0 1px 2px rgba(0, 0, 0, 0.24);
+}
+`;
+
+// Add the styles to the document exactly once. The `document` check keeps this module
+// importable where no DOM exists, and the id check prevents a duplicate <style> element
+// when the module is evaluated more than once.
+const webBoxStyleId = 'webBox-screenshot-styles';
+if (typeof document !== 'undefined' && !document.getElementById(webBoxStyleId)) {
+    const styleEl = document.createElement('style');
+    styleEl.id = webBoxStyleId;
+    styleEl.textContent = webBoxStyles;
+    document.head.appendChild(styleEl);
+}
diff --git a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
index bff38ae15..3c7b4e3db 100644
--- a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
+++ b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
@@ -73,30 +73,111 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam
this._getLinkedUrlDocId = getLinkedUrlDocIds;
}
- async execute(args: ParametersType<WebsiteInfoScraperToolParamsType>): Promise<Observation[]> {
- const urls = args.urls;
-
- // Create an array of promises, each one handling a website scrape for a URL
- const scrapingPromises = urls.map(async url => {
+    /**
+     * Scrapes a single website through the server's `/scrapeWebsite` endpoint, retrying when the
+     * request throws or the returned text is missing, empty, or shorter than 100 characters.
+     * @param url URL to scrape; must be an absolute http:// or https:// URL
+     * @param maxRetries Maximum number of retry attempts after the first try
+     * @returns A text observation holding the scraped content wrapped in a `<chunk>` tag, or an
+     *          error message when the URL is rejected or every attempt failed
+     */
+    private async scrapeWithRetry(url: string, maxRetries = 2): Promise<Observation> {
+        let lastError = '';
+
+        // Validate the URL up front. `new URL` throws on malformed input, but it also accepts
+        // schemes such as file: or javascript:, which must not be forwarded to the server.
+        let isHttpUrl = false;
+        try {
+            isHttpUrl = ['http:', 'https:'].includes(new URL(url).protocol);
+        } catch {
+            isHttpUrl = false;
+        }
+        if (!isHttpUrl) {
+            return {
+                type: 'text',
+                text: `Invalid URL format: ${url}. Please provide a valid URL including http:// or https://`,
+            } as Observation;
+        }
+
+        // `continue` moves on to the next attempt; the loop header owns the attempt counter.
+        for (let retryCount = 0; retryCount <= maxRetries; retryCount++) {
             try {
-                const { website_plain_text } = (await Networking.PostToServer('/scrapeWebsite', { url })) as { website_plain_text: string };
+                // Add a slight delay between retries
+                if (retryCount > 0) {
+                    console.log(`Retry attempt ${retryCount} for ${url}`);
+                    await new Promise(resolve => setTimeout(resolve, retryCount * 2000)); // Increasing delay for each retry
+                }
+
+                const response = await Networking.PostToServer('/scrapeWebsite', { url });
+
+                if (!response || typeof response !== 'object') {
+                    lastError = 'Empty or invalid response from server';
+                    continue;
+                }
+
+                const { website_plain_text } = response as { website_plain_text: string };
                 const id = this._getLinkedUrlDocId(url);
+
+                // Validate content quality
+                if (!website_plain_text) {
+                    lastError = 'Retrieved content was empty';
+                    continue;
+                }
+
+                if (website_plain_text.length < 100) {
+                    console.warn(`Warning: Content from ${url} is very short (${website_plain_text.length} chars)`);
+
+                    // Still return it if this is our last try
+                    if (retryCount === maxRetries) {
+                        return {
+                            type: 'text',
+                            text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\nNote: Limited content was retrieved from this URL.\n</chunk>`,
+                        } as Observation;
+                    }
+
+                    lastError = 'Retrieved content was too short, trying again';
+                    continue;
+                }
+
+                // Process and return content if it looks good
                 return {
                     type: 'text',
                     text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\n</chunk>`,
                 } as Observation;
             } catch (error) {
-                console.log(error);
-                return {
-                    type: 'text',
-                    text: `An error occurred while scraping the website: ${url}`,
-                } as Observation;
+                lastError = error instanceof Error ? error.message : 'Unknown error';
+                console.log(`Error scraping ${url} (attempt ${retryCount + 1}):`, error);
             }
-        });
+        }
+
+        // All attempts failed
+        return {
+            type: 'text',
+            text: `Unable to scrape website: ${url}. Error: ${lastError}`,
+        } as Observation;
+    }
+
+    /**
+     * Scrapes every URL in `args.urls` in parallel and returns one observation per URL.
+     * When none of the URLs could be scraped, an extra explanatory note is appended.
+     * @param args Tool arguments; `urls` lists the websites to scrape
+     * @returns One observation per URL, in input order, plus a trailing note when all failed
+     */
+    async execute(args: ParametersType<WebsiteInfoScraperToolParamsType>): Promise<Observation[]> {
+        const urls = args.urls;
+
+        // Create an array of promises, each one handling a website scrape for a URL
+        const scrapingPromises = urls.map(url => this.scrapeWithRetry(url));
         // Wait for all scraping promises to resolve
         const results = await Promise.all(scrapingPromises);
+        // A scrape succeeded exactly when its text is a <chunk> wrapper. Checking the prefix,
+        // rather than searching the whole text, keeps scraped page content that happens to
+        // contain an error phrase from being mistaken for a failure.
+        const isScrapedChunk = (result: Observation) => result.type === 'text' && (result as { type: 'text'; text: string }).text.startsWith('<chunk ');
+
+        // If all scrapes failed, provide a more helpful error message
+        if (results.length > 0 && !results.some(isScrapedChunk)) {
+            results.push({
+                type: 'text',
+                text: `Note: All website scraping attempts failed. Please try with different URLs or try again later.`,
+            } as Observation);
+        }
+
         return results;
     }
}
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index af25722a4..6d2779163 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -485,36 +485,69 @@ export default class AssistantManager extends ApiManager {
subscription: '/scrapeWebsite',
secureHandler: async ({ req, res }) => {
const { url } = req.body;
+ let browser = null;
try {
+ // Set a longer timeout for slow-loading pages
+ const navigationTimeout = 60000; // 60 seconds
+
// Launch Puppeteer browser to navigate to the webpage
- const browser = await puppeteer.launch({
- args: ['--no-sandbox', '--disable-setuid-sandbox'],
+ browser = await puppeteer.launch({
+ args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
});
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
- await page.goto(url, { waitUntil: 'networkidle2' });
+
+ // Set timeout for navigation
+ page.setDefaultNavigationTimeout(navigationTimeout);
+
+ // Navigate with timeout and wait for content to load
+ await page.goto(url, {
+ waitUntil: 'networkidle2',
+ timeout: navigationTimeout,
+ });
+
+ // Wait a bit longer to ensure dynamic content loads
+ await new Promise(resolve => setTimeout(resolve, 2000));
// Extract HTML content
const htmlContent = await page.content();
await browser.close();
+ browser = null;
- // Parse HTML content using JSDOM
- const dom = new JSDOM(htmlContent, { url });
+ // Use a try-catch block specifically for JSDOM parsing
+ try {
+ // Parse HTML content using JSDOM
+ const dom = new JSDOM(htmlContent, { url });
- // Extract readable content using Mozilla's Readability API
- const reader = new Readability(dom.window.document);
- const article = reader.parse();
+ // Extract readable content using Mozilla's Readability API
+ const reader = new Readability(dom.window.document);
+ const article = reader.parse();
- if (article) {
- const plainText = article.textContent;
- res.send({ website_plain_text: plainText });
- } else {
- res.status(500).send({ error: 'Failed to extract readable content' });
+ if (article) {
+ const plainText = article.textContent;
+ res.send({ website_plain_text: plainText });
+ } else {
+ // If Readability fails, fallback to extracting main content
+ const mainContent = await extractMainContent(htmlContent);
+ res.send({ website_plain_text: mainContent });
+ }
+ } catch (parsingError) {
+ console.error('Error parsing website content:', parsingError);
+
+ // Fallback to a simplified extraction method
+ const mainContent = await extractMainContent(htmlContent);
+ res.send({ website_plain_text: mainContent });
}
} catch (error) {
console.error('Error scraping website:', error);
+
+ // Clean up browser if still open
+ if (browser) {
+ await browser.close().catch(e => console.error('Error closing browser:', e));
+ }
+
res.status(500).send({
- error: 'Failed to scrape website',
+ error: 'Failed to scrape website: ' + ((error as Error).message || 'Unknown error'),
});
}
},
@@ -687,6 +720,127 @@ export default class AssistantManager extends ApiManager {
}
},
});
+
+        // Register an API route to capture a screenshot of a webpage using Puppeteer
+        // and return the image URL for display in the WebBox component.
+        // Request body: { url, width?, height?, fullPage? }.
+        // Success response: { screenshotUrl, fullHeight }; failure response: { error, details? }.
+        register({
+            method: Method.POST,
+            subscription: '/captureWebScreenshot',
+            secureHandler: async ({ req, res }) => {
+                const { url, width, height, fullPage } = req.body;
+
+                if (!url) {
+                    res.status(400).send({ error: 'URL is required' });
+                    return;
+                }
+
+                // Only real web pages may be captured: reject malformed URLs and non-http(s)
+                // schemes (file:, chrome:, ...) before handing the URL to the server-side browser.
+                let isHttpUrl = false;
+                try {
+                    isHttpUrl = ['http:', 'https:'].includes(new URL(String(url)).protocol);
+                } catch {
+                    isHttpUrl = false;
+                }
+                if (!isHttpUrl) {
+                    res.status(400).send({ error: 'URL must be a valid http:// or https:// URL' });
+                    return;
+                }
+
+                // Resolve the request options once so the viewport and the clip region always agree
+                const viewportWidth = Number(width) || 1200;
+                const viewportHeight = Number(height) || 800;
+                const captureFullPage = fullPage === true;
+
+                let browser = null;
+                try {
+                    // Increase timeout for websites that load slowly
+                    const navigationTimeout = 60000; // 60 seconds
+
+                    // Launch a headless browser with additional options to improve stability
+                    browser = await puppeteer.launch({
+                        headless: true, // Use headless mode
+                        args: [
+                            '--no-sandbox',
+                            '--disable-setuid-sandbox',
+                            '--disable-dev-shm-usage',
+                            '--disable-accelerated-2d-canvas',
+                            '--disable-gpu',
+                            '--window-size=1200,800',
+                            // NOTE(review): this disables the same-origin policy for the captured page — confirm it is really needed
+                            '--disable-web-security', // Helps with cross-origin issues
+                            '--disable-features=IsolateOrigins,site-per-process', // Helps with frames
+                        ],
+                        timeout: navigationTimeout,
+                    });
+
+                    const page = await browser.newPage();
+
+                    // Set a larger viewport to capture more content
+                    await page.setViewport({
+                        width: viewportWidth,
+                        height: viewportHeight,
+                        deviceScaleFactor: 1,
+                    });
+
+                    // Enable request interception to speed up page loading
+                    await page.setRequestInterception(true);
+                    page.on('request', request => {
+                        // Skip unnecessary resources to speed up loading
+                        const resourceType = request.resourceType();
+                        const requestUrl = request.url();
+                        const skippable = resourceType === 'font' || resourceType === 'media' || resourceType === 'websocket' || requestUrl.includes('analytics') || requestUrl.includes('tracker');
+                        if (skippable) {
+                            request.abort();
+                        } else {
+                            request.continue();
+                        }
+                    });
+
+                    console.log(`Navigating to URL: ${url}`);
+
+                    // Navigate to the URL and wait for the page to load
+                    await page.goto(url, {
+                        waitUntil: ['networkidle2'],
+                        timeout: navigationTimeout,
+                    });
+
+                    // Wait for a short delay after navigation to allow content to render
+                    await new Promise(resolve => setTimeout(resolve, 2000));
+
+                    // Take a screenshot. The file name is generated exactly once so that the path
+                    // written to disk and the URL returned to the client refer to the same file.
+                    console.log('Taking screenshot...');
+                    const screenshotFile = `webpage_${Date.now()}.png`;
+                    // NOTE(review): this path is relative to the server's working directory — confirm that is intended
+                    const screenshotPath = `./src/server/public/files/images/${screenshotFile}`;
+                    const screenshotOptions = {
+                        path: screenshotPath,
+                        fullPage: captureFullPage,
+                        omitBackground: false,
+                        type: 'png' as const,
+                        // A clip region is only supplied for viewport-sized captures
+                        clip: captureFullPage
+                            ? undefined
+                            : {
+                                  x: 0,
+                                  y: 0,
+                                  width: viewportWidth,
+                                  height: viewportHeight,
+                              },
+                    };
+
+                    await page.screenshot(screenshotOptions);
+
+                    // Get the full height of the page
+                    const fullHeight = await page.evaluate(() => {
+                        return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight, document.body.offsetHeight, document.documentElement.offsetHeight, document.body.clientHeight, document.documentElement.clientHeight);
+                    });
+
+                    console.log(`Screenshot captured successfully with height: ${fullHeight}px`);
+
+                    // Return the URL to the screenshot
+                    const screenshotUrl = `/files/images/${screenshotFile}`;
+                    res.json({
+                        screenshotUrl,
+                        fullHeight,
+                    });
+                } catch (error: unknown) {
+                    console.error('Error capturing screenshot:', error);
+                    const message = error instanceof Error ? error.message : String(error);
+                    // NOTE(review): `details` exposes the server stack trace to the client; kept for compatibility — confirm it is wanted
+                    const details = error instanceof Error ? error.stack : undefined;
+                    res.status(500).send({
+                        error: `Failed to capture screenshot: ${message}`,
+                        details,
+                    });
+                } finally {
+                    // Ensure browser is closed to free resources
+                    if (browser) {
+                        try {
+                            await browser.close();
+                            console.log('Browser closed successfully');
+                        } catch (error) {
+                            console.error('Error closing browser:', error);
+                        }
+                    }
+                }
+            },
+        });
}
}
@@ -829,3 +983,50 @@ function spawnPythonProcess(jobId: string, file_path: string) {
runPythonScript();
}
}
+
+/**
+ * Extracts main content from HTML by removing scripts, styles, and non-content elements.
+ * Used as a fallback when Readability fails.
+ * @param html The HTML content to process
+ * @returns The text of the first main-content container with enough text, or of the whole
+ *          body otherwise, with all whitespace runs collapsed to single spaces; a fixed
+ *          failure message if extraction throws
+ */
+async function extractMainContent(html: string): Promise<string> {
+    let dom: JSDOM | undefined;
+    try {
+        // Create a simple DOM to extract content
+        dom = new JSDOM(html, { runScripts: 'outside-only' });
+        const document = dom.window.document;
+
+        // Remove scripts, styles, and other non-content elements in a single query.
+        // NOTE(review): 'ads' and 'banner' match <ads>/<banner> elements, not CSS classes — confirm intent.
+        const elementsToRemove = ['script', 'style', 'iframe', 'noscript', 'svg', 'header', 'footer', 'nav', 'aside', 'ads', 'banner', 'form', 'button', 'input'];
+        document.querySelectorAll(elementsToRemove.join(',')).forEach(el => el.remove());
+
+        // Try to find the main content container using common selectors
+        const mainSelectors = ['main', 'article', '#content', '.content', '#main', '.main', '.post-content', '.article-content', '.entry-content'];
+
+        let mainContent = '';
+
+        // The first selector whose element holds more than 100 characters of trimmed text wins
+        for (const selector of mainSelectors) {
+            const text = document.querySelector(selector)?.textContent;
+            if (text && text.trim().length > 100) {
+                mainContent = text;
+                break;
+            }
+        }
+
+        // If no main content found with selectors (or it is under 200 characters), use body content
+        if (!mainContent || mainContent.length < 200) {
+            mainContent = document.body?.textContent || '';
+        }
+
+        // Clean up the text: \s+ already matches newlines, so one pass collapses all whitespace
+        return mainContent.replace(/\s+/g, ' ').trim();
+    } catch (error) {
+        console.error('Error extracting main content:', error);
+        return 'Failed to extract content from the webpage.';
+    } finally {
+        // Shut the jsdom window down so it does not linger after extraction
+        dom?.window.close();
+    }
+}