diff --git a/Dockerfile b/Dockerfile index 1e97fb520..0cadfc14e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -ARG BROWSER_VERSION=1.79.118 +ARG BROWSER_VERSION=1.80.125 ARG BROWSER_IMAGE_BASE=webrecorder/browsertrix-browser-base:brave-${BROWSER_VERSION} FROM ${BROWSER_IMAGE_BASE} @@ -39,7 +39,7 @@ ADD config/ /app/ ADD html/ /app/html/ -ARG RWP_VERSION=2.3.7 +ARG RWP_VERSION=2.3.15 ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/ui.js /app/html/rwp/ ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/sw.js /app/html/rwp/ ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/adblock/adblock.gz /app/html/rwp/adblock.gz diff --git a/docs/docs/user-guide/behaviors.md b/docs/docs/user-guide/behaviors.md index 224211f33..b0c34a612 100644 --- a/docs/docs/user-guide/behaviors.md +++ b/docs/docs/user-guide/behaviors.md @@ -35,14 +35,15 @@ To disable all behaviors, use `--behaviors ""`. ## Behavior and Page Timeouts Browsertrix includes a number of timeouts, including before, during and after running behaviors. + The timeouts are as follows: -- `--waitUntil`: how long to wait for page to finish loading, *before* doing anything else. +- `--pageLoadTimeout`: how long to wait for page to finish loading, *before* doing anything else. - `--postLoadDelay`: how long to wait *before* starting any behaviors, but after page has finished loading. A custom behavior can override this (see below). - `--behaviorTimeout`: maximum time to spend on running site-specific / Autoscroll behaviors (can be less if behavior finishes early). - `--pageExtraDelay`: how long to wait *after* finishing behaviors (or after `behaviorTimeout` has been reached) before moving on to next page. -A site-specific behavior (or Autoscroll) will start after the page is loaded (at most after `--waitUntil` seconds) and exactly after `--postLoadDelay` seconds. +A site-specific behavior (or Autoscroll) will start after the page is loaded (at most after `--pageLoadTimeout` seconds) and exactly after `--postLoadDelay` seconds. The behavior will then run until finished or at most until `--behaviorTimeout` is reached (90 seconds by default). @@ -267,3 +268,26 @@ Some of these functions which may be of use to behaviors authors are: - `getState`: increment a state counter and return all state counters + string message More detailed references will be added in the future. + +## Fail On Content Check + +In Browsertrix Crawler 1.7.0 and higher, the `--failOnContentCheck` option will result in a crawl failing if a behavior detects the presence or absence of certain content on a page in its `awaitPageLoad()` callback. By default, this is used to fail a crawl if site-specific behaviors determine that the user is not logged in on the following sites: + +- Facebook +- Instagram +- TikTok +- X + +It is also used to fail crawls with YouTube videos if one of the videos is found not to play. + +It is possible to add content checks to custom behaviors. To do so, include an `awaitPageLoad` method on the behavior and use the `ctx.Lib` function `assertContentValid` to check for content and fail the behavior with a specified reason if it is not found. 
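+
+A minimal sketch of the shape (aside from `awaitPageLoad`, `ctx.Lib`, and `assertContentValid`, everything here, including the class name, the selector, and the other method names, is a hypothetical placeholder):
+
+```javascript
+class MyLoginCheckBehavior {
+  // ...the behavior's other methods (isMatch, run, etc.) omitted...
+
+  async awaitPageLoad(ctx) {
+    const { assertContentValid } = ctx.Lib;
+    // with --failOnContentCheck set, a failed assertion fails the crawl
+    // with the given reason
+    assertContentValid(
+      () => !!document.querySelector(".account-menu"),
+      "not_logged_in",
+    );
+  }
+}
+```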
+ +For an example, see the following `awaitPageLoad` example from the site-specific behavior for X: + +```javascript +async awaitPageLoad(ctx: any) { + const { sleep, assertContentValid } = ctx.Lib; + await sleep(5); + assertContentValid(() => !document.documentElement.outerHTML.match(/Log In/i), "not_logged_in"); +} +``` diff --git a/docs/docs/user-guide/cli-options.md b/docs/docs/user-guide/cli-options.md index 298b366eb..d37160b05 100644 --- a/docs/docs/user-guide/cli-options.md +++ b/docs/docs/user-guide/cli-options.md @@ -261,6 +261,10 @@ Options: ailOnFailedSeed may result in crawl failing due to non-200 responses [boolean] [default: false] + --failOnContentCheck If set, allows for behaviors to fail + a crawl with custom reason based on + content (e.g. logged out) + [boolean] [default: false] --customBehaviors Custom behavior files to inject. Val id values: URL to file, path to file , path to directory of behaviors, UR @@ -272,6 +276,10 @@ Options: git+https://git.example.com/repo.git ?branch=dev&path=some/dir" [array] [default: []] + --saveStorage if set, will store the localStorage/ + sessionStorage data for each page as + part of WARC-JSON-Metadata field + [boolean] --debugAccessRedis if set, runs internal redis without protected mode to allow external acc ess (for debugging) [boolean] diff --git a/package.json b/package.json index 0b219ed40..eb545f8e7 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-crawler", - "version": "1.6.3", + "version": "1.7.0", "main": "browsertrix-crawler", "type": "module", "repository": "https://github.com/webrecorder/browsertrix-crawler", @@ -18,8 +18,8 @@ "dependencies": { "@novnc/novnc": "1.4.0", "@puppeteer/replay": "^3.1.1", - "@webrecorder/wabac": "^2.23.3", - "browsertrix-behaviors": "^0.8.5", + "@webrecorder/wabac": "^2.23.8", + "browsertrix-behaviors": "^0.9.2", "client-zip": "^2.4.5", "css-selector-parser": "^3.0.5", "fetch-socks": "^1.3.0", diff --git a/src/crawler.ts b/src/crawler.ts index e7276e95d..5adf42c07 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -62,7 +62,7 @@ import { } from "puppeteer-core"; import { Recorder } from "./util/recorder.js"; import { SitemapReader } from "./util/sitemapper.js"; -import { ScopedSeed } from "./util/seeds.js"; +import { ScopedSeed, parseSeeds } from "./util/seeds.js"; import { WARCWriter, createWARCInfo, @@ -134,7 +134,7 @@ export class Crawler { maxPageTime: number; - seeds: ScopedSeed[]; + seeds: ScopedSeed[] = []; numOriginalSeeds = 0; // eslint-disable-next-line @typescript-eslint/no-explicit-any @@ -178,7 +178,6 @@ export class Crawler { customBehaviors = ""; behaviorsChecked = false; - behaviorLastLine?: string; browser: Browser; storage: S3StorageSync | null = null; @@ -255,9 +254,6 @@ export class Crawler { this.saveStateFiles = []; this.lastSaveTime = 0; - this.seeds = this.params.scopedSeeds as ScopedSeed[]; - this.numOriginalSeeds = this.seeds.length; - // sum of page load + behavior timeouts + 2 x pageop timeouts (for cloudflare, link extraction) + extra page delay // if exceeded, will interrupt and move on to next page (likely behaviors or some other operation is stuck) this.maxPageTime = @@ -514,6 +510,9 @@ export class Crawler { this.proxyServer = await initProxy(this.params, RUN_DETACHED); + this.seeds = await parseSeeds(this.params); + this.numOriginalSeeds = this.seeds.length; + logger.info("Seeds", this.seeds); logger.info("Link Selectors", this.params.selectLinks); @@ -645,6 +644,11 @@ export class Crawler { } } } + if (await 
this.crawlState.isFailed()) { + logger.error("Crawl failed, no pages crawled successfully"); + status = "failed"; + exitCode = ExitCodes.Failed; + } } catch (e) { logger.error("Crawl failed", e); exitCode = ExitCodes.Failed; @@ -662,7 +666,6 @@ export class Crawler { pageUrl: string, workerid: WorkerId, ) { - let behaviorLine; let message; let details; @@ -706,11 +709,7 @@ export class Crawler { switch (type) { case "info": - behaviorLine = JSON.stringify(data); - if (behaviorLine !== this.behaviorLastLine) { - logger.info(message, details, context); - this.behaviorLastLine = behaviorLine; - } + logger.info(message, details, context); break; case "error": @@ -938,7 +937,24 @@ self.__bx_behaviors.selectMainBehavior(); return nextFlowStep(id, page, workerid); }); - // await page.exposeFunction("__bx_hasSet", (data: string) => this.crawlState.hasUserSet(data)); + if (this.params.failOnContentCheck) { + await page.exposeFunction( + BxFunctionBindings.ContentCheckFailed, + (reason: string) => { + // if called outside of awaitPageLoad(), ignore + if (!opts.data.contentCheckAllowed) { + return; + } + void this.crawlState.setFailReason(reason); + logger.fatal( + "Content check failed, failing crawl", + { reason }, + "behavior", + ExitCodes.Failed, + ); + }, + ); + } } async setupExecContextEvents( @@ -1299,7 +1315,7 @@ self.__bx_behaviors.selectMainBehavior(); "Seed Page Load Failed, failing crawl", {}, "general", - 1, + ExitCodes.GenericError, ); } } @@ -1940,6 +1956,8 @@ self.__bx_behaviors.selectMainBehavior(); logger.error("Error creating WACZ", e); if (!streaming) { logger.fatal("Unable to write WACZ successfully"); + } else if (this.params.restartsOnError) { + await this.setStatusAndExit(ExitCodes.UploadFailed, "interrupted"); } } } @@ -2236,8 +2254,13 @@ self.__bx_behaviors.selectMainBehavior(); await this.netIdle(page, logDetails); + // allow failing crawl via script only within awaitPageLoad() for now + data.contentCheckAllowed = true; + await this.awaitPageLoad(page.mainFrame(), logDetails); + data.contentCheckAllowed = false; + // skip extraction if at max depth if (seed.isAtMaxDepth(depth, extraHops)) { logger.debug("Skipping Link Extraction, At Max Depth", {}, "links"); @@ -2278,7 +2301,7 @@ self.__bx_behaviors.selectMainBehavior(); frame.evaluate( "self.__bx_behaviors && self.__bx_behaviors.awaitPageLoad();", ), - PAGE_OP_TIMEOUT_SECS, + PAGE_OP_TIMEOUT_SECS * 4, "Custom page load check timed out", logDetails, ); @@ -2432,25 +2455,30 @@ self.__bx_behaviors.selectMainBehavior(); this.pageLimit, ); + const logContext = depth === 0 ? "scope" : "links"; + const logLevel = depth === 0 ? 
"error" : "debug"; + switch (result) { case QueueState.ADDED: - logger.debug("Queued new page url", { url, ...logDetails }, "links"); + logger.debug("Queued new page URL", { url, ...logDetails }, logContext); return true; case QueueState.LIMIT_HIT: - logger.debug( - "Not queued page url, at page limit", + logger.logAsJSON( + "Page URL not queued, at page limit", { url, ...logDetails }, - "links", + logContext, + logLevel, ); this.limitHit = true; return false; case QueueState.DUPE_URL: - logger.debug( - "Not queued page url, already seen", + logger.logAsJSON( + "Page URL not queued, already seen", { url, ...logDetails }, - "links", + logContext, + logLevel, ); return false; } diff --git a/src/replaycrawler.ts b/src/replaycrawler.ts index 75abfc4ef..819bcf393 100644 --- a/src/replaycrawler.ts +++ b/src/replaycrawler.ts @@ -96,8 +96,6 @@ export class ReplayCrawler extends Crawler { // skip text from first two frames, as they are RWP boilerplate this.skipTextDocs = SKIP_FRAMES; - this.params.scopedSeeds = []; - this.params.screenshot = ["view"]; this.params.text = ["to-warc"]; diff --git a/src/util/argParser.ts b/src/util/argParser.ts index 73da878ba..95ee3e1a1 100644 --- a/src/util/argParser.ts +++ b/src/util/argParser.ts @@ -20,7 +20,6 @@ import { BxFunctionBindings, DEFAULT_CRAWL_ID_TEMPLATE, } from "./constants.js"; -import { ScopedSeed } from "./seeds.js"; import { interpolateFilename } from "./storage.js"; import { screenshotTypes } from "./screenshots.js"; import { @@ -37,12 +36,14 @@ export type CrawlerArgs = ReturnType & { logExcludeContext: LogContext[]; text: string[]; - scopedSeeds: ScopedSeed[]; - customBehaviors: string[]; selectLinks: ExtractSelector[]; + include: string[]; + exclude: string[]; + sitemap: boolean; + crawlId: string; // eslint-disable-next-line @typescript-eslint/no-explicit-any @@ -586,6 +587,13 @@ class ArgParser { default: false, }, + failOnContentCheck: { + describe: + "If set, allows for behaviors to fail a crawl with custom reason based on content (e.g. logged out)", + type: "boolean", + default: false, + }, + customBehaviors: { describe: "Custom behavior files to inject. Valid values: URL to file, path to file, path to directory" + @@ -596,6 +604,12 @@ class ArgParser { default: [], }, + saveStorage: { + describe: + "if set, will store the localStorage/sessionStorage data for each page as part of WARC-JSON-Metadata field", + type: "boolean", + }, + debugAccessRedis: { describe: "if set, runs internal redis without protected mode to allow external access (for debugging)", @@ -770,22 +784,6 @@ class ArgParser { } } - if (argv.seedFile) { - const urlSeedFile = fs.readFileSync(argv.seedFile, "utf8"); - const urlSeedFileList = urlSeedFile.split("\n"); - - if (typeof argv.seeds === "string") { - argv.seeds = [argv.seeds]; - } - - for (const seed of urlSeedFileList) { - if (seed) { - // eslint-disable-next-line @typescript-eslint/no-explicit-any - (argv.seeds as any).push(seed); - } - } - } - let selectLinks: ExtractSelector[]; if (argv.selectLinks) { @@ -817,50 +815,10 @@ class ArgParser { //logger.debug(`Set netIdleWait to ${argv.netIdleWait} seconds`); } - const scopedSeeds: ScopedSeed[] = []; - - if (!isQA) { - const scopeOpts = { - scopeType: argv.scopeType, - sitemap: argv.sitemap, - include: argv.include, - exclude: argv.exclude, - depth: argv.depth, - extraHops: argv.extraHops, - }; - - for (const seed of argv.seeds) { - const newSeed = typeof seed === "string" ? 
{ url: seed } : seed; - - try { - scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...newSeed })); - // eslint-disable-next-line @typescript-eslint/no-explicit-any - } catch (e: any) { - logger.error("Failed to create seed", { - error: e.toString(), - ...scopeOpts, - ...newSeed, - }); - if (argv.failOnFailedSeed) { - logger.fatal( - "Invalid seed specified, aborting crawl", - { url: newSeed.url }, - "general", - 1, - ); - } - } - } - - if (!scopedSeeds.length) { - logger.fatal("No valid seeds specified, aborting crawl"); - } - } else if (!argv.qaSource) { + if (isQA && !argv.qaSource) { logger.fatal("--qaSource required for QA mode"); } - argv.scopedSeeds = scopedSeeds; - // Resolve statsFilename if (argv.statsFilename) { argv.statsFilename = path.resolve(argv.cwd, argv.statsFilename); @@ -870,6 +828,10 @@ class ArgParser { argv.diskUtilization = 90; } + if (argv.saveStorage) { + logger.info("Saving localStorage and sessionStorage"); + } + return true; } } diff --git a/src/util/browser.ts b/src/util/browser.ts index a4d1c6276..69f8ff27b 100644 --- a/src/util/browser.ts +++ b/src/util/browser.ts @@ -76,7 +76,11 @@ export class Browser { screenWHRatio: number; constructor() { - this.profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-")); + this.profileDir = path.join(os.tmpdir(), "btrixProfile"); + if (fs.existsSync(this.profileDir)) { + fs.rmSync(this.profileDir, { recursive: true, force: true }); + } + fs.mkdirSync(this.profileDir); // must be provided, part of Dockerfile assert(process.env.GEOMETRY); diff --git a/src/util/constants.ts b/src/util/constants.ts index d6185d4e2..15b00bd70 100644 --- a/src/util/constants.ts +++ b/src/util/constants.ts @@ -30,6 +30,8 @@ export enum BxFunctionBindings { InitFlow = "__bx_initFlow", NextFlowStep = "__bx_nextFlowStep", + + ContentCheckFailed = "__bx_contentCheckFailed", } export const MAX_DEPTH = 1000000; @@ -79,6 +81,7 @@ export enum ExitCodes { DiskUtilization = 16, Fatal = 17, ProxyError = 21, + UploadFailed = 22, } export enum InterruptReason { diff --git a/src/util/file_reader.ts b/src/util/file_reader.ts index fa8ad0bc1..f0908d12b 100644 --- a/src/util/file_reader.ts +++ b/src/util/file_reader.ts @@ -24,6 +24,48 @@ export type FileSource = { export type FileSources = FileSource[]; +async function getTempFile( + filename: string, + dirPrefix: string, +): Promise { + const tmpDir = path.join( + os.tmpdir(), + `${dirPrefix}-${crypto.randomBytes(4).toString("hex")}`, + ); + await fsp.mkdir(tmpDir, { recursive: true }); + return path.join(tmpDir, filename); +} + +async function writeUrlContentsToFile( + url: string, + pathPrefix: string, + pathDefaultExt: string, +) { + const res = await fetch(url, { dispatcher: getProxyDispatcher() }); + const fileContents = await res.text(); + + const filename = + path.basename(new URL(url).pathname) || "index." 
+ pathDefaultExt; + const filepath = await getTempFile(filename, pathPrefix); + + await fsp.writeFile(filepath, fileContents); + return filepath; +} + +export async function collectOnlineSeedFile(url: string): Promise { + try { + const filepath = await writeUrlContentsToFile(url, "seeds-", ".txt"); + logger.info("Seed file downloaded", { url, path: filepath }); + return filepath; + } catch (e) { + logger.fatal("Error downloading seed file from URL", { + url, + ...formatErr(e), + }); + throw e; + } +} + export async function collectCustomBehaviors( sources: string[], ): Promise { @@ -79,7 +121,7 @@ async function collectGitBehaviors(gitUrl: string): Promise { } catch (e) { logger.fatal( "Error downloading custom behaviors from Git repo", - { url: urlStripped, error: e }, + { url: urlStripped, ...formatErr(e) }, "behavior", ); } @@ -87,18 +129,12 @@ async function collectGitBehaviors(gitUrl: string): Promise { } async function collectOnlineBehavior(url: string): Promise { - const filename = path.basename(new URL(url).pathname); - const tmpDir = path.join( - os.tmpdir(), - `behaviors-${crypto.randomBytes(4).toString("hex")}`, - ); - await fsp.mkdir(tmpDir, { recursive: true }); - const behaviorFilepath = path.join(tmpDir, filename); - try { - const res = await fetch(url, { dispatcher: getProxyDispatcher() }); - const fileContents = await res.text(); - await fsp.writeFile(behaviorFilepath, fileContents); + const behaviorFilepath = await writeUrlContentsToFile( + url, + "behaviors-", + ".js", + ); logger.info( "Custom behavior file downloaded", { url, path: behaviorFilepath }, @@ -108,7 +144,7 @@ async function collectOnlineBehavior(url: string): Promise { } catch (e) { logger.fatal( "Error downloading custom behavior from URL", - { url, error: e }, + { url, ...formatErr(e) }, "behavior", ); } @@ -190,7 +226,7 @@ async function collectLocalPathBehaviors( } catch (e) { logger.fatal( "Error fetching local custom behaviors", - { path: resolvedPath, error: e }, + { path: resolvedPath, ...formatErr(e) }, "behavior", ); } diff --git a/src/util/logger.ts b/src/util/logger.ts index 0325924b0..7d10939ee 100644 --- a/src/util/logger.ts +++ b/src/util/logger.ts @@ -56,10 +56,13 @@ export const LOG_CONTEXT_TYPES = [ "wacz", "replay", "proxy", + "scope", ] as const; export type LogContext = (typeof LOG_CONTEXT_TYPES)[number]; +export type LogLevel = "debug" | "info" | "warn" | "error" | "fatal"; + export const DEFAULT_EXCLUDE_LOG_CONTEXTS: LogContext[] = [ "recorderNetwork", "jsError", @@ -118,7 +121,7 @@ class Logger { message: string, dataUnknown: unknown, context: LogContext, - logLevel = "info", + logLevel: LogLevel, ) { // eslint-disable-next-line @typescript-eslint/no-explicit-any const data: Record = formatErr(dataUnknown); @@ -182,7 +185,7 @@ class Logger { } info(message: string, data: unknown = {}, context: LogContext = "general") { - this.logAsJSON(message, data, context); + this.logAsJSON(message, data, context, "info"); } error(message: string, data: unknown = {}, context: LogContext = "general") { diff --git a/src/util/recorder.ts b/src/util/recorder.ts index aaa4a425f..5c2c96e0f 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -26,6 +26,7 @@ import { Crawler } from "../crawler.js"; import { getProxyDispatcher } from "./proxy.js"; import { ScopedSeed } from "./seeds.js"; import EventEmitter from "events"; +import { DEFAULT_MAX_RETRIES } from "./constants.js"; const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000; const MAX_TEXT_REWRITE_SIZE = 25_000_000; @@ -148,12 +149,15 @@ export 
class Recorder extends EventEmitter { writer: WARCWriter; pageUrl!: string; + finalPageUrl = ""; pageid!: string; pageSeed?: ScopedSeed; frameIdToExecId: Map | null; + shouldSaveStorage = false; + constructor({ workerid, writer, @@ -168,6 +172,8 @@ export class Recorder extends EventEmitter { this.crawler = crawler; this.crawlState = crawler.crawlState; + this.shouldSaveStorage = !!crawler.params.saveStorage; + this.writer = writer; this.fetcherQ = new PQueue({ concurrency: 1 }); @@ -219,7 +225,7 @@ export class Recorder extends EventEmitter { // Loading cdp.on("Network.loadingFinished", (params) => - this.handleLoadingFinished(params), + this.handleLoadingFinished(params, cdp), ); cdp.on("Network.loadingFailed", (params) => @@ -407,6 +413,10 @@ export class Recorder extends EventEmitter { return; } + if (reqresp.url === this.finalPageUrl) { + this.finalPageUrl = reqresp.getRedirectUrl(); + } + this.serializeToWARC(reqresp).catch((e) => logger.warn("Error Serializing to WARC", e, "recorder"), ); @@ -484,7 +494,10 @@ export class Recorder extends EventEmitter { this.removeReqResp(requestId); } - handleLoadingFinished(params: Protocol.Network.LoadingFinishedEvent) { + async handleLoadingFinished( + params: Protocol.Network.LoadingFinishedEvent, + cdp: CDPSession, + ) { const { requestId } = params; const reqresp = this.pendingReqResp(requestId, true); @@ -507,9 +520,38 @@ export class Recorder extends EventEmitter { return; } - this.serializeToWARC(reqresp).catch((e) => - logger.warn("Error Serializing to WARC", e, "recorder"), - ); + if (this.shouldSaveStorage && url === this.finalPageUrl) { + await this.saveStorage(reqresp, cdp); + } + + try { + await this.serializeToWARC(reqresp); + } catch (e) { + logger.warn("Error Serializing to WARC", e, "recorder"); + } + } + + async saveStorage(reqresp: RequestResponseInfo, cdp: CDPSession) { + try { + const { url, extraOpts } = reqresp; + const securityOrigin = new URL(url).origin; + + const local = await cdp.send("DOMStorage.getDOMStorageItems", { + storageId: { securityOrigin, isLocalStorage: true }, + }); + const session = await cdp.send("DOMStorage.getDOMStorageItems", { + storageId: { securityOrigin, isLocalStorage: false }, + }); + + if (local.entries.length || session.entries.length) { + extraOpts.storage = JSON.stringify({ + local: local.entries, + session: session.entries, + }); + } + } catch (e) { + logger.warn("Error getting local/session storage", e, "recorder"); + } } async handleRequestPaused( @@ -910,6 +952,7 @@ export class Recorder extends EventEmitter { startPage({ pageid, url }: { pageid: string; url: string }) { this.pageid = pageid; this.pageUrl = url; + this.finalPageUrl = this.pageUrl; this.logDetails = { page: url, workerid: this.workerid }; if (this.pendingRequests && this.pendingRequests.size) { logger.debug( @@ -1468,6 +1511,8 @@ class AsyncFetcher { manualRedirect = false; + maxRetries = DEFAULT_MAX_RETRIES; + constructor({ reqresp, expectedSize = -1, @@ -1513,122 +1558,165 @@ class AsyncFetcher { } } - const body = await this._doFetch(); - fetched = "fetched"; + let retries = 0; - const responseRecord = createResponse(reqresp, pageid, body); - const requestRecord = createRequest(reqresp, responseRecord, pageid); + while (retries <= this.maxRetries) { + try { + reqresp.truncated = undefined; + const body = await this._doFetch(); + fetched = "fetched"; - const serializer = new WARCSerializer(responseRecord, { - gzip, - maxMemSize: this.maxFetchSize, - }); + const responseRecord = createResponse(reqresp, pageid, body); + 
const requestRecord = createRequest(reqresp, responseRecord, pageid); - try { - let readSize = await serializer.digestRecord(); - if (serializer.httpHeadersBuff) { - readSize -= serializer.httpHeadersBuff.length; - } - reqresp.readSize = readSize; - // set truncated field and recompute header buff - if (reqresp.truncated) { - responseRecord.warcHeaders.headers.set( - "WARC-Truncated", - reqresp.truncated, - ); - // todo: keep this internal in warcio after adding new header - serializer.warcHeadersBuff = encoder.encode( - responseRecord.warcHeaders.toString(), - ); - } - } catch (e) { - logger.error( - "Error reading + digesting payload", - { url, ...formatErr(e), ...logDetails }, - "recorder", - ); - } + const serializer = new WARCSerializer(responseRecord, { + gzip, + maxMemSize: this.maxFetchSize, + }); - if ( - reqresp.readSize === reqresp.expectedSize || - reqresp.expectedSize < 0 - ) { - logger.debug( - "Async fetch: streaming done", - { - size: reqresp.readSize, - expected: reqresp.expectedSize, - networkId, - url, - ...logDetails, - }, - "recorder", - ); - } else { - logger.warn( - "Async fetch: possible response size mismatch", - { - type: this.constructor.name, - size: reqresp.readSize, - expected: reqresp.expectedSize, - url, - ...logDetails, - }, - "recorder", - ); - if (status === 206 || status === 200) { - void serializer.externalBuffer?.purge(); - await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url, status); - return "notfetched"; - } - } + try { + let readSize = await serializer.digestRecord(); + if (serializer.httpHeadersBuff) { + readSize -= serializer.httpHeadersBuff.length; + } + reqresp.readSize = readSize; + // set truncated field and recompute header buff + if (reqresp.truncated) { + const retry = retries < this.maxRetries; + logger.warn( + "Response truncated", + { url, retry, ...logDetails }, + "recorder", + ); + // if retries available, just retry + if (retry) { + void serializer.externalBuffer?.purge(); + retries++; + continue; + } + responseRecord.warcHeaders.headers.set( + "WARC-Truncated", + reqresp.truncated, + ); + // todo: keep this internal in warcio after adding new header + serializer.warcHeadersBuff = encoder.encode( + responseRecord.warcHeaders.toString(), + ); + } + } catch (e) { + const retry = retries < this.maxRetries; + logger.error( + "Error reading + digesting payload", + { url, retry, ...formatErr(e), ...logDetails }, + "recorder", + ); + if (retry) { + void serializer.externalBuffer?.purge(); + retries++; + continue; + } + } + + if ( + reqresp.readSize === reqresp.expectedSize || + reqresp.expectedSize < 0 + ) { + logger.debug( + "Async fetch: streaming done", + { + size: reqresp.readSize, + expected: reqresp.expectedSize, + networkId, + url, + ...logDetails, + }, + "recorder", + ); + } else { + logger.warn( + "Async fetch: possible response size mismatch", + { + type: this.constructor.name, + size: reqresp.readSize, + expected: reqresp.expectedSize, + url, + retry: + retries < this.maxRetries && + (status === 206 || status === 200), + ...logDetails, + }, + "recorder", + ); + if (status === 206 || status === 200) { + void serializer.externalBuffer?.purge(); + await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url, status); + if (retries < this.maxRetries) { + retries++; + continue; + } + return "notfetched"; + } + } + + const externalBuffer: TempFileBuffer = + serializer.externalBuffer as TempFileBuffer; + + if (externalBuffer) { + const { currSize, buffers, fh } = externalBuffer; + + // if fully buffered in memory, then populate the payload 
to return to browser + if (buffers && buffers.length && !fh) { + reqresp.payload = Buffer.concat(buffers, currSize); + externalBuffer.buffers = [reqresp.payload]; + } else if (fh) { + logger.debug( + "Large payload written to WARC, but not returned to browser (would require rereading into memory)", + { + url, + actualSize: reqresp.readSize, + maxSize: this.maxFetchSize, + }, + "recorder", + ); + } + } - const externalBuffer: TempFileBuffer = - serializer.externalBuffer as TempFileBuffer; + if (Object.keys(reqresp.extraOpts).length) { + responseRecord.warcHeaders.headers.set( + "WARC-JSON-Metadata", + JSON.stringify(reqresp.extraOpts), + ); + } - if (externalBuffer) { - const { currSize, buffers, fh } = externalBuffer; + recorder.writer.writeRecordPair( + responseRecord, + requestRecord, + serializer, + ); - // if fully buffered in memory, then populate the payload to return to browser - if (buffers && buffers.length && !fh) { - reqresp.payload = Buffer.concat(buffers, currSize); - externalBuffer.buffers = [reqresp.payload]; - } else if (fh) { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + } catch (e: any) { + await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url!, status); + if (e.message === "response-filtered-out") { + throw e; + } + const retry = retries < this.maxRetries; logger.debug( - "Large payload written to WARC, but not returned to browser (would require rereading into memory)", - { url, actualSize: reqresp.readSize, maxSize: this.maxFetchSize }, + "Streaming Fetch Error", + { url, networkId, retry, ...formatErr(e), ...logDetails }, "recorder", ); + if (retry) { + retries++; + continue; + } + // indicate response is ultimately not valid + reqresp.status = 0; + reqresp.errorText = e.message; } + // if we get here, successful (or out of retries), break out of loop + break; } - - if (Object.keys(reqresp.extraOpts).length) { - responseRecord.warcHeaders.headers.set( - "WARC-JSON-Metadata", - JSON.stringify(reqresp.extraOpts), - ); - } - - recorder.writer.writeRecordPair( - responseRecord, - requestRecord, - serializer, - ); - - // eslint-disable-next-line @typescript-eslint/no-explicit-any - } catch (e: any) { - await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url!, status); - if (e.message === "response-filtered-out") { - throw e; - } - logger.debug( - "Streaming Fetch Error", - { url, networkId, ...formatErr(e), ...logDetails }, - "recorder", - ); - // indicate response is ultimately not valid - reqresp.status = 0; - reqresp.errorText = e.message; } finally { recorder.addPageRecord(reqresp); // exclude direct fetch request with fake id @@ -1769,6 +1857,8 @@ class ResponseStreamAsyncFetcher extends AsyncFetcher { super(opts); this.cdp = opts.cdp; this.requestId = opts.requestId; + // can't retry this type of fetch + this.maxRetries = 0; } async _doFetch() { diff --git a/src/util/reqresp.ts b/src/util/reqresp.ts index b1efa240e..dba22a5e8 100644 --- a/src/util/reqresp.ts +++ b/src/util/reqresp.ts @@ -1,4 +1,8 @@ -import { getCustomRewriter, getStatusText } from "@webrecorder/wabac"; +import { + getCustomRewriter, + getStatusText, + ExtraOpts, +} from "@webrecorder/wabac"; import { Protocol } from "puppeteer-core"; import { postToGetUrl } from "warcio"; @@ -66,9 +70,7 @@ export class RequestResponseInfo { resourceType?: string; - // TODO: Fix this the next time the file is edited. 
- // eslint-disable-next-line @typescript-eslint/no-explicit-any - extraOpts: Record = {}; + extraOpts: ExtraOpts = {}; // stats readSize: number = 0; @@ -181,19 +183,22 @@ export class RequestResponseInfo { return isRedirectStatus(this.status); } - isSelfRedirect() { - if (!this.isRedirectStatus()) { - return false; - } - + getRedirectUrl() { try { const headers = new Headers(this.getResponseHeadersDict()); const location = headers.get("location") || ""; - const redirUrl = new URL(location, this.url).href; - return this.url === redirUrl; + return new URL(location, this.url).href; } catch (e) { + return ""; + } + } + + isSelfRedirect() { + if (!this.isRedirectStatus()) { return false; } + + return this.url === this.getRedirectUrl(); } fillResponseReceivedExtraInfo( diff --git a/src/util/seeds.ts b/src/util/seeds.ts index d0e244459..ade152084 100644 --- a/src/util/seeds.ts +++ b/src/util/seeds.ts @@ -1,5 +1,9 @@ -import { logger } from "./logger.js"; +import fs from "fs"; + import { MAX_DEPTH } from "./constants.js"; +import { collectOnlineSeedFile } from "./file_reader.js"; +import { logger } from "./logger.js"; +import { type CrawlerArgs } from "./argParser.js"; type ScopeType = | "prefix" @@ -39,14 +43,14 @@ export class ScopedSeed { auth = null, }: { url: string; - scopeType: ScopeType; + scopeType: ScopeType | undefined; include: string[]; exclude: string[]; allowHash?: boolean; depth?: number; sitemap?: string | boolean | null; extraHops?: number; - auth: string | null; + auth?: string | null; }) { const parsedUrl = this.parseUrl(url); if (!parsedUrl) { @@ -62,14 +66,14 @@ export class ScopedSeed { this.url = parsedUrl.href; this.include = parseRx(include); this.exclude = parseRx(exclude); - this.scopeType = scopeType; this._includeStr = include; this._excludeStr = exclude; - if (!this.scopeType) { - this.scopeType = this.include.length ? "custom" : "prefix"; + if (!scopeType) { + scopeType = this.include.length ? "custom" : "prefix"; } + this.scopeType = scopeType; if (this.scopeType !== "custom") { const [includeNew, allowHashNew] = this.scopeFromType( @@ -300,6 +304,72 @@ export class ScopedSeed { } } +export async function parseSeeds(params: CrawlerArgs): Promise { + let seeds = params.seeds as string[]; + const scopedSeeds: ScopedSeed[] = []; + + if (params.seedFile) { + let seedFilePath = params.seedFile as string; + if ( + seedFilePath.startsWith("http://") || + seedFilePath.startsWith("https://") + ) { + seedFilePath = await collectOnlineSeedFile(seedFilePath); + } + + const urlSeedFile = fs.readFileSync(seedFilePath, "utf8"); + const urlSeedFileList = urlSeedFile.split("\n"); + + if (typeof seeds === "string") { + seeds = [seeds]; + } + + for (const seed of urlSeedFileList) { + if (seed) { + seeds.push(seed); + } + } + } + + const scopeOpts = { + scopeType: params.scopeType as ScopeType | undefined, + sitemap: params.sitemap, + include: params.include, + exclude: params.exclude, + depth: params.depth, + extraHops: params.extraHops, + }; + + for (const seed of seeds) { + const newSeed = typeof seed === "string" ? 
{ url: seed } : seed; + + try { + scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...newSeed })); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + } catch (e: any) { + logger.error("Failed to create seed", { + error: e.toString(), + ...scopeOpts, + ...newSeed, + }); + if (params.failOnFailedSeed) { + logger.fatal( + "Invalid seed specified, aborting crawl", + { url: newSeed.url }, + "general", + 1, + ); + } + } + } + + if (!params.qaSource && !scopedSeeds.length) { + logger.fatal("No valid seeds specified, aborting crawl"); + } + + return scopedSeeds; +} + export function rxEscape(string: string) { return string.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&"); } diff --git a/src/util/state.ts b/src/util/state.ts index 107683af5..ec18145eb 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -87,6 +87,7 @@ export class PageState { pageSkipped = false; filteredFrames: Frame[] = []; loadState: LoadState = LoadState.FAILED; + contentCheckAllowed = false; logDetails = {}; @@ -447,7 +448,20 @@ return inx; return (await this.queueSize()) == 0 && (await this.numDone()) > 0; } + async isFailed() { + return ( + (await this.numDone()) === 0 && + (await this.queueSize()) === 0 && + (await this.numPending()) === 0 && + (await this.numFailed()) > 0 + ); + } + async trimToLimit(limit: number) { + if (limit === 0) { + return; + } + const totalComplete = (await this.numPending()) + (await this.numDone()) + @@ -465,6 +479,10 @@ return inx; } } + async setFailReason(reason: string) { + await this.redis.set(`${this.key}:failReason`, reason); + } + async setStatus(status_: string) { await this.redis.hset(`${this.key}:status`, this.uid, status_); } diff --git a/src/util/worker.ts b/src/util/worker.ts index f4dc8ddf4..dee9cebab 100644 --- a/src/util/worker.ts +++ b/src/util/worker.ts @@ -351,7 +351,7 @@ export class PageWorker { let loggedWaiting = false; while (await this.crawler.isCrawlRunning()) { - await crawlState.processMessage(this.crawler.params.scopedSeeds); + await crawlState.processMessage(this.crawler.seeds); const data = await crawlState.nextFromQueue(); diff --git a/tests/scopes.test.js b/tests/scopes.test.js index ddb64e04d..9717fb11d 100644 --- a/tests/scopes.test.js +++ b/tests/scopes.test.js @@ -1,8 +1,9 @@ import { parseArgs } from "../dist/util/argParser.js"; +import { parseSeeds } from "../dist/util/seeds.js"; import fs from "fs"; -function getSeeds(config) { +async function getSeeds(config) { const orig = fs.readFileSync; fs.readFileSync = (name, ...args) => { @@ -12,12 +13,12 @@ function getSeeds(config) { return orig(name, ...args); }; - const res = parseArgs(["node", "crawler", "--config", "stdinconfig"]); - return res.scopedSeeds; + const params = parseArgs(["node", "crawler", "--config", "stdinconfig"]); + return await parseSeeds(params); } test("default scope", async () => { - const seeds = getSeeds(` + const seeds = await getSeeds(` seeds: - https://example.com/ @@ -30,7 +31,7 @@ seeds: }); test("default scope + exclude", async () => { - const seeds = getSeeds(` + const seeds = await getSeeds(` seeds: - https://example.com/ @@ -45,7 +46,7 @@ exclude: https://example.com/pathexclude }); test("default scope + exclude is numeric", async () => { - const seeds = getSeeds(` + const seeds = await getSeeds(` seeds: - https://example.com/ @@ -60,7 +61,7 @@ exclude: "2022" }); test("prefix scope global + exclude", async () => { - const seeds = getSeeds(` + const seeds = await getSeeds(` seeds: - https://example.com/ @@ -76,7 +77,7 @@ exclude: https://example.com/pathexclude }); 
test("prefix scope per seed + exclude", async () => { - const seeds = getSeeds(` + const seeds = await getSeeds(` seeds: - url: https://example.com/ scopeType: prefix @@ -92,7 +93,7 @@ exclude: https://example.com/pathexclude }); test("host scope and domain scope", async () => { - const seeds = getSeeds(` + const seeds = await getSeeds(` seeds: - url: https://example.com/ @@ -127,7 +128,7 @@ seeds: }); test("domain scope drop www.", async () => { - const seeds = getSeeds(` + const seeds = await getSeeds(` seeds: - url: https://www.example.com/ scopeType: domain @@ -139,7 +140,7 @@ seeds: }); test("custom scope", async () => { - const seeds = getSeeds(` + const seeds = await getSeeds(` seeds: - url: https://example.com/ include: https?://example.com/(path|other) @@ -153,7 +154,7 @@ seeds: }); test("inherit scope", async () => { - const seeds = getSeeds(` + const seeds = await getSeeds(` seeds: - url: https://example.com/1 @@ -177,7 +178,7 @@ exclude: https://example.com/pathexclude }); test("override scope", async () => { - const seeds = getSeeds(` + const seeds = await getSeeds(` seeds: - url: https://example.com/1 @@ -220,7 +221,7 @@ include: https://example.com/onlythispath }); test("override scope with exclude", async () => { - const seeds = getSeeds(` + const seeds = await getSeeds(` seeds: - url: https://example.com/1 @@ -275,7 +276,7 @@ exclude: }); test("with exclude non-string types", async () => { - const seeds = getSeeds(` + const seeds = await getSeeds(` seeds: - url: https://example.com/ exclude: "2023" diff --git a/tests/upload-wacz.test.js b/tests/upload-wacz.test.js index 56039435d..8a039eeb2 100644 --- a/tests/upload-wacz.test.js +++ b/tests/upload-wacz.test.js @@ -72,7 +72,7 @@ test("run crawl with upload", async () => { } // ensure bucket is public - execSync(`docker exec ${minioId.trim()} mc config host add local http://127.0.0.1:9000 minioadmin minioadmin`); + execSync(`docker exec ${minioId.trim()} mc alias set local http://127.0.0.1:9000 minioadmin minioadmin`); execSync(`docker exec ${minioId.trim()} mc anonymous set download local/test-bucket`); // wait for crawler to finish diff --git a/tests/url_file_list.test.js b/tests/url_file_list.test.js index 9901ff366..c76afa6e6 100644 --- a/tests/url_file_list.test.js +++ b/tests/url_file_list.test.js @@ -38,3 +38,39 @@ test("check that URLs in seed-list are crawled", async () => { } expect(foundSeedUrl).toBe(true); }); + + +test("check that URLs in seed-list hosted at URL are crawled", async () => { + try { + await exec( + 'docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection onlinefilelisttest --urlFile "https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/fixtures/urlSeedFile.txt" --timeout 90000', + ); + } catch (error) { + console.log(error); + } + + let crawled_pages = fs.readFileSync( + "test-crawls/collections/onlinefilelisttest/pages/pages.jsonl", + "utf8", + ); + let seed_file = fs + .readFileSync("tests/fixtures/urlSeedFile.txt", "utf8") + .split("\n") + .sort(); + + let seed_file_list = []; + for (var j = 0; j < seed_file.length; j++) { + if (seed_file[j] != undefined) { + seed_file_list.push(seed_file[j]); + } + } + + let foundSeedUrl = true; + + for (var i = 1; i < seed_file_list.length; i++) { + if (crawled_pages.indexOf(seed_file_list[i]) == -1) { + foundSeedUrl = false; + } + } + expect(foundSeedUrl).toBe(true); +}); diff --git a/tsconfig.json b/tsconfig.json index e913ef61c..61251e386 100644 --- 
a/tsconfig.json +++ b/tsconfig.json @@ -101,7 +101,7 @@ // "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */ /* Completeness */ - "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */ + //"skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */ "skipLibCheck": true /* Skip type checking all .d.ts files. */ }, diff --git a/yarn.lock b/yarn.lock index 4cb40b072..faa1fa159 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1134,16 +1134,16 @@ resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406" integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ== -"@webrecorder/wabac@^2.23.3": - version "2.23.3" - resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.23.3.tgz#405f53649183c54fd116e334eae2666d6514a341" - integrity sha512-NlPNGNmilNf/NEqHbCNPcib4GNnZKQJKK3PIiI0BvEdem/TEjvcn5wEBbUntTYn+VwrhX36QY2HC7Iag+dVnvw== +"@webrecorder/wabac@^2.23.8": + version "2.23.8" + resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.23.8.tgz#a3eb1e605acb706b6f043ec9e7fae9ff412ccc8a" + integrity sha512-+ShHsaBHwFC0SPFTpMWrwJHd47MzT6o1Rg12FSfGfpycrcmrBV447+JR28NitLJIsfcIif8xAth9Vh5Z7tHWlQ== dependencies: "@peculiar/asn1-ecc" "^2.3.4" "@peculiar/asn1-schema" "^2.3.3" "@peculiar/x509" "^1.9.2" "@types/js-levenshtein" "^1.1.3" - "@webrecorder/wombat" "^3.8.13" + "@webrecorder/wombat" "^3.8.14" acorn "^8.10.0" auto-js-ipfs "^2.1.1" base64-js "^1.5.1" @@ -1164,10 +1164,10 @@ stream-browserify "^3.0.0" warcio "^2.4.3" -"@webrecorder/wombat@^3.8.13": - version "3.8.13" - resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.8.13.tgz#264f639dd102dca415f5d01a649d6b95dfac9779" - integrity sha512-gg80bEpJE+2Wn0ZTbfCkt9+vTftJemBwAWe9TYXo7ErCX1v7RbIrZ5LfkjSWx3vCx6R4V31DxXk1mycsVrEapA== +"@webrecorder/wombat@^3.8.14": + version "3.8.14" + resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.8.14.tgz#fde951519ed9ab8271107a013fc1abd6a9997424" + integrity sha512-1CaL8Oel02V321SS+wOomV+cSDo279eVEAuiamO9jl9YoijRsGL9z/xZKE6sz6npLltE3YYziEBYO81xnaeTcA== dependencies: warcio "^2.4.0" @@ -1595,10 +1595,10 @@ browserslist@^4.24.0: node-releases "^2.0.18" update-browserslist-db "^1.1.1" -browsertrix-behaviors@^0.8.5: - version "0.8.5" - resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.8.5.tgz#f93dc6fed15cb2266664c85eec7f0796c1634fa5" - integrity sha512-v6wv6NLJEhj3NbrmGEfOWyXf2TuJgj95Em+KfCTPRJxakTtsvH/A7n2FSNvqMhwusqrjpIR4ch6cEkDp4hblvQ== +browsertrix-behaviors@^0.9.2: + version "0.9.2" + resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.9.2.tgz#b5bee47d15014a05a873d8cc6ea8917bfa61d5c8" + integrity sha512-d7rLNKXaiD83S4uXKBUf2x9UzmMjbrqKoO820KVqzWtlpzqnXFUsqN/wKvMSiNbDzmL1+G9Um7Gwb1AjD0djCw== dependencies: query-selector-shadow-dom "^1.0.1"
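A side note on the `--saveStorage` changes to `src/util/recorder.ts` above: the local/session storage entries are serialized as a JSON string under `extraOpts.storage` and end up in the response record's `WARC-JSON-Metadata` header. Below is a rough sketch of reading that data back with warcio; the parser and record accessors follow warcio's usual usage as best recalled, and the archive path is a made-up placeholder, so treat the details as assumptions rather than part of this patch.

```javascript
import fs from "fs";
import { WARCParser } from "warcio";

// Sketch: print any localStorage/sessionStorage data that --saveStorage
// attached to response records via the WARC-JSON-Metadata header.
async function dumpSavedStorage(warcPath) {
  const parser = new WARCParser(fs.createReadStream(warcPath));

  for await (const record of parser) {
    const meta = record.warcHeaders.headers.get("WARC-JSON-Metadata");
    if (!meta) {
      continue;
    }
    const extraOpts = JSON.parse(meta);
    if (extraOpts.storage) {
      // storage is itself a JSON string: { local: [...], session: [...] }
      console.log(record.warcTargetURI, JSON.parse(extraOpts.storage));
    }
  }
}

// hypothetical path to a crawl's WARC output
dumpSavedStorage("collections/my-crawl/archive/rec-00000.warc.gz").catch(
  console.error,
);
```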