diff --git a/docs/upgrading/upgrading_v4.md b/docs/upgrading/upgrading_v4.md
index 22a3ff28d206..35292bd78420 100644
--- a/docs/upgrading/upgrading_v4.md
+++ b/docs/upgrading/upgrading_v4.md
@@ -94,3 +94,16 @@ This experimental option relied on an outdated manifest version for browser exte
 ## Available resource detection
 
 In v3, we introduced a new way to detect available resources for the crawler, available via `systemInfoV2` flag. In v4, this is the default way to detect available resources. The old way is removed completely together with the `systemInfoV2` flag.
+
+## `HttpClient` instances return `Response` objects
+
+`HttpClient` instances now return [native `Response` objects](https://developer.mozilla.org/en-US/docs/Web/API/Response) instead of the custom `HttpResponse` objects.
+
+## `CrawlingContext.response` is now of type `Response`
+
+The `CrawlingContext.response` property is now of type [`Response`](https://developer.mozilla.org/en-US/docs/Web/API/Response) instead of `HttpResponse`. The `CrawlingContext.sendRequest` method now returns `Response` objects as well.
+
+## Crawling context in the `FileDownload` crawler no longer includes `body` and `stream` properties
+
+The crawling context in the `FileDownload` crawler no longer includes the `body` and `stream` properties. The downloaded data can be accessed via the `response` property instead, e.g. `context.response.bytes()` or `context.response.body`.
+
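+As an illustrative sketch (the URLs, the extra request and the logging below are placeholders, not part of this release), a request handler can consume the new `Response` objects like this:
+
+```ts
+import { HttpCrawler } from 'crawlee';
+
+const crawler = new HttpCrawler({
+    async requestHandler({ request, response, sendRequest, log }) {
+        // `response` is a native fetch `Response` now.
+        log.info(`${request.url} responded with status ${response.status}`);
+        log.info(`Content-Type: ${response.headers.get('content-type')}`);
+
+        // `sendRequest()` resolves to a native `Response` as well.
+        const extra = await sendRequest({ url: 'https://example.com/api' });
+        log.info(`Fetched ${(await extra.text()).length} characters`);
+    },
+});
+
+await crawler.run(['https://example.com']);
+```
+
+Similarly, a `FileDownload` request handler can read the downloaded data from the response itself (again, only a sketch):
+
+```ts
+import { FileDownload } from 'crawlee';
+
+const downloader = new FileDownload({
+    async requestHandler({ request, response, log }) {
+        // Buffer the whole file at once...
+        const data = await response.bytes();
+        log.info(`Downloaded ${data.byteLength} bytes from ${request.url}`);
+        // ...or consume `response.body` as a ReadableStream instead.
+    },
+});
+
+await downloader.run(['https://example.com/file.bin']);
+```
+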
diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index 037107a10518..a8e4c015bac1 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -1405,7 +1405,7 @@ export class BasicCrawler< }, pushData: this.pushData.bind(this), useState: this.useState.bind(this), - sendRequest: createSendRequest(this.httpClient, request!, session), + sendRequest: createSendRequest(this.httpClient, request!, session) as CrawlingContext['sendRequest'], getKeyValueStore: async (idOrName?: string) => KeyValueStore.open(idOrName, { config: this.config }), registerDeferredCleanup: (cleanup) => { deferredCleanup.push(cleanup);
diff --git a/packages/basic-crawler/src/internals/send-request.ts b/packages/basic-crawler/src/internals/send-request.ts index 249a968ae821..11e560400189 100644 --- a/packages/basic-crawler/src/internals/send-request.ts +++ b/packages/basic-crawler/src/internals/send-request.ts @@ -5,7 +5,6 @@ import { type Request, type Session, } from '@crawlee/core'; -import type { Method, Response as GotResponse } from 'got-scraping'; /** * Prepares a function to be used as the `sendRequest` context helper. @@ -17,10 +16,7 @@ import type { Method, Response as GotResponse } from 'got-scraping'; * @param getProxyUrl A function that will return the proxy URL that should be used for handling the request. */ export function createSendRequest(httpClient: BaseHttpClient, originRequest: Request, session: Session | undefined) { - return async ( - // TODO the type information here (and in crawler_commons) is outright wrong... for BC - replace this with generic HttpResponse in v4 - overrideOptions: Partial = {}, - ): Promise> => { + return async (overrideOptions: Partial = {}): Promise => { const cookieJar = session ? { getCookieString: async (url: string) => session.getCookieString(url), @@ -31,7 +27,7 @@ export function createSendRequest(httpClient: BaseHttpClient, originRequest: Req const requestOptions = processHttpRequestOptions({ url: originRequest.url, - method: originRequest.method as Method, // Narrow type to omit CONNECT + method: originRequest.method, headers: originRequest.headers, proxyUrl: session?.proxyInfo?.url, sessionToken: session, @@ -43,6 +39,6 @@ export function createSendRequest(httpClient: BaseHttpClient, originRequest: Req // Fill in body as the last step - `processHttpRequestOptions` may use either `body`, `json` or `form` so we cannot override it beforehand requestOptions.body ??= originRequest.payload; - return httpClient.sendRequest(requestOptions) as unknown as GotResponse; + return httpClient.sendRequest(requestOptions); }; }
diff --git a/packages/core/src/cookie_utils.ts b/packages/core/src/cookie_utils.ts index 60083fdca3b4..6598ec3f30c3 100644 --- a/packages/core/src/cookie_utils.ts +++ b/packages/core/src/cookie_utils.ts @@ -12,16 +12,14 @@ export interface ResponseLike { /** * @internal */ -export function getCookiesFromResponse(response: ResponseLike): Cookie[] { - const headers = typeof response.headers === 'function' ? response.headers() : response.headers; - const cookieHeader = headers?.['set-cookie'] || ''; +export function getCookiesFromResponse(response: Response): Cookie[] { + const headers = response.headers; + const cookieHeaders = headers.getSetCookie(); try { - return Array.isArray(cookieHeader) ? cookieHeader.map((cookie) => Cookie.parse(cookie)!) : [Cookie.parse(cookieHeader)!]; + return cookieHeaders.map((cookie) => Cookie.parse(cookie)!); } catch (e) { - throw new CookieParseError(cookieHeader); + throw new CookieParseError(cookieHeaders); } }
diff --git a/packages/core/src/crawlers/crawler_commons.ts b/packages/core/src/crawlers/crawler_commons.ts index 0ac76d838d26..1f4a3e6bc6f1 100644 --- a/packages/core/src/crawlers/crawler_commons.ts +++ b/packages/core/src/crawlers/crawler_commons.ts @@ -1,5 +1,5 @@ import type { Dictionary } from '@crawlee/types'; -import type { OptionsInit, Response as GotResponse } from 'got-scraping'; +import type { OptionsInit } from 'got-scraping'; import type { ReadonlyDeep, SetRequired } from 'type-fest'; import type { Configuration } from '../configuration.js'; @@ -156,7 +156,7 @@ export interface CrawlingContext exten * }, * ``` */ - sendRequest(overrideOptions?: Partial): Promise>; + sendRequest(overrideOptions?: Partial): Promise; /** * Register a function to be called at the very end of the request handling process. This is useful for resources that should be accessible to error handlers, for instance.
diff --git a/packages/core/src/http_clients/base-http-client.ts b/packages/core/src/http_clients/base-http-client.ts index 053b99b08a2c..8adf71e9a32b 100644 --- a/packages/core/src/http_clients/base-http-client.ts +++ b/packages/core/src/http_clients/base-http-client.ts @@ -1,5 +1,6 @@ import type { Readable } from 'node:stream'; +import type { AllowedHttpMethods } from '@crawlee/types'; import { applySearchParams, type SearchParams } from '@crawlee/utils'; import type { FormDataLike } from './form-data-like.js'; @@ -15,24 +16,6 @@ type Timeout = } | { request: number }; -type Method = - | 'GET' - | 'POST' - | 'PUT' - | 'PATCH' - | 'HEAD' - | 'DELETE' - | 'OPTIONS' - | 'TRACE' - | 'get' - | 'post' - | 'put' - | 'patch' - | 'head' - | 'delete' - | 'options' - | 'trace'; - /** * Maps permitted values of the `responseType` option on {@apilink HttpRequest} to the types that they produce. */ @@ -79,7 +62,7 @@ export interface HttpRequest [k: string]: unknown; // TODO BC with got - remove in 4.0 url: string | URL; - method?: Method; + method?: AllowedHttpMethods; headers?: SimpleHeaders; body?: string | Buffer | Readable | Generator | AsyncGenerator | FormDataLike; @@ -146,6 +129,14 @@ interface HttpResponseWithoutBody; } +export class ResponseWithUrl extends Response { + override url: string; + constructor(body: BodyInit | null, init: ResponseInit & { url?: string }) { + super(body, init); + this.url = init.url ?? ''; + } +} + /** * HTTP response data as returned by the {@apilink BaseHttpClient.sendRequest} method. */ @@ -169,7 +160,7 @@ export interface StreamingHttpResponse extends HttpResponseWithoutBody { * Type of a function called when an HTTP redirect takes place. It is allowed to mutate the `updatedRequest` argument. */ export type RedirectHandler = ( - redirectResponse: BaseHttpResponseData, + redirectResponse: Response, updatedRequest: { url?: string | URL; headers: SimpleHeaders }, ) => void; @@ -182,12 +173,12 @@ export interface BaseHttpClient { */ sendRequest( request: HttpRequest, - ): Promise>; + ): Promise; /** * Perform an HTTP Request and return after the response headers are received. The body may be read from a stream contained in the response. */ - stream(request: HttpRequest, onRedirect?: RedirectHandler): Promise; + stream(request: HttpRequest, onRedirect?: RedirectHandler): Promise; } /** diff --git a/packages/core/src/http_clients/got-scraping-http-client.ts b/packages/core/src/http_clients/got-scraping-http-client.ts index 408906fbd1d8..3f3d6df79ae8 100644 --- a/packages/core/src/http_clients/got-scraping-http-client.ts +++ b/packages/core/src/http_clients/got-scraping-http-client.ts @@ -1,25 +1,40 @@ +import { Readable } from 'node:stream'; + import type { Options, PlainResponse } from 'got-scraping'; import { gotScraping } from 'got-scraping'; -import type { - BaseHttpClient, - HttpRequest, - HttpResponse, - RedirectHandler, - ResponseTypes, - StreamingHttpResponse, +import { + type BaseHttpClient, + type HttpRequest, + type RedirectHandler, + type ResponseTypes, + ResponseWithUrl, } from './base-http-client.js'; /** * A HTTP client implementation based on the `got-scraping` library. */ export class GotScrapingHttpClient implements BaseHttpClient { + /** + * Type guard that validates the HTTP method (excluding CONNECT). 
+ * @param request - The HTTP request to validate + */ + private validateRequest>( + request: T, + ): request is T & { method: Exclude } { + return !['CONNECT', 'connect'].includes(request.method!); + } + /** * @inheritDoc */ async sendRequest( request: HttpRequest, - ): Promise> { + ): Promise { + if (!this.validateRequest(request)) { + throw new Error(`The HTTP method CONNECT is not supported by the GotScrapingHttpClient.`); + } + const gotResult = await gotScraping({ ...request, retry: { @@ -28,23 +43,45 @@ export class GotScrapingHttpClient implements BaseHttpClient { }, }); - return { - ...gotResult, - body: gotResult.body as ResponseTypes[TResponseType], - request: { url: request.url, ...gotResult.request }, - }; + const parsedHeaders = Object.entries(gotResult.headers) + .map(([key, value]) => { + if (value === undefined) return []; + + if (Array.isArray(value)) { + return value.map((v) => [key, v]); + } + + return [[key, value]]; + }) + .flat() as [string, string][]; + + return new ResponseWithUrl(new Uint8Array(gotResult.rawBody), { + headers: new Headers(parsedHeaders), + status: gotResult.statusCode, + statusText: gotResult.statusMessage ?? '', + url: gotResult.url, + }); } /** * @inheritDoc */ - async stream(request: HttpRequest, handleRedirect?: RedirectHandler): Promise { + async stream(request: HttpRequest, handleRedirect?: RedirectHandler): Promise { + if (!this.validateRequest(request)) { + throw new Error(`The HTTP method CONNECT is not supported by the GotScrapingHttpClient.`); + } // eslint-disable-next-line no-async-promise-executor return new Promise(async (resolve, reject) => { const stream = gotScraping({ ...request, isStream: true }); - stream.on('redirect', (updatedOptions: Options, redirectResponse: PlainResponse) => { - handleRedirect?.(redirectResponse, updatedOptions); + stream.on('redirect', (updatedOptions: Options, redirectResponse: any) => { + const nativeRedirectResponse = new ResponseWithUrl(redirectResponse.rawBody, { + headers: redirectResponse.headers, + status: redirectResponse.statusCode, + statusText: redirectResponse.statusMessage, + url: redirectResponse.url, + }); + handleRedirect?.(nativeRedirectResponse, updatedOptions); }); // We need to end the stream for DELETE requests, otherwise it will hang. @@ -55,37 +92,15 @@ export class GotScrapingHttpClient implements BaseHttpClient { stream.on('error', reject); stream.on('response', (response: PlainResponse) => { - const result: StreamingHttpResponse = { - stream, - request, - redirectUrls: response.redirectUrls, - url: response.url, - ip: response.ip, - statusCode: response.statusCode, - headers: response.headers, - trailers: response.trailers, - complete: response.complete, - get downloadProgress() { - return stream.downloadProgress; - }, - get uploadProgress() { - return stream.uploadProgress; - }, - }; - - Object.assign(result, response); // TODO BC - remove in 4.0 - - resolve(result); - - stream.on('end', () => { - result.complete = response.complete; - - result.trailers ??= {}; - Object.assign(result.trailers, response.trailers); - - (result as any).rawTrailers ??= []; // TODO BC - remove in 4.0 - Object.assign((result as any).rawTrailers, response.rawTrailers); - }); + // Cast shouldn't be needed here, undici might have a different `ReadableStream` type + resolve( + new ResponseWithUrl(Readable.toWeb(stream) as any, { + status: response.statusCode, + statusText: response.statusMessage ?? 
'', + headers: response.headers as HeadersInit, + url: response.url, + }), + ); }); }); } diff --git a/packages/core/src/request.ts b/packages/core/src/request.ts index a8dc7f6d7008..a41b6234712d 100644 --- a/packages/core/src/request.ts +++ b/packages/core/src/request.ts @@ -79,7 +79,7 @@ export enum RequestState { * ``` * @category Sources */ -export class Request { +class CrawleeRequest { /** Request ID */ id?: string; @@ -193,7 +193,8 @@ export class Request { this.url = url; this.loadedUrl = loadedUrl; this.uniqueKey = - uniqueKey || Request.computeUniqueKey({ url, method, payload, keepUrlFragment, useExtendedUniqueKey }); + uniqueKey || + CrawleeRequest.computeUniqueKey({ url, method, payload, keepUrlFragment, useExtendedUniqueKey }); this.method = method; this.payload = payload; this.noRetry = noRetry; @@ -255,6 +256,18 @@ export class Request { } } + /** + * Converts the Crawlee Request object to a `fetch` API Request object. + * @returns The native `fetch` API Request object. + */ + public intoFetchAPIRequest(): Request { + return new Request(this.url, { + method: this.method, + headers: this.headers, + body: this.payload, + }); + } + /** Tells the crawler processing this request to skip the navigation and process the request directly. */ get skipNavigation(): boolean { return this.userData.__crawlee?.skipNavigation ?? false; @@ -398,7 +411,7 @@ export class Request { } return normalizedUrl; } - const payloadHash = payload ? Request.hashPayload(payload) : ''; + const payloadHash = payload ? CrawleeRequest.hashPayload(payload) : ''; return `${normalizedMethod}(${payloadHash}):${normalizedUrl}`; } @@ -526,10 +539,12 @@ interface ComputeUniqueKeyOptions { useExtendedUniqueKey?: boolean; } -export type Source = (Partial & { requestsFromUrl?: string; regex?: RegExp }) | Request; +export type Source = (Partial & { requestsFromUrl?: string; regex?: RegExp }) | CrawleeRequest; /** @internal */ export interface InternalSource { requestsFromUrl: string; regex?: RegExp; } + +export { CrawleeRequest as Request }; diff --git a/packages/core/src/session_pool/session.ts b/packages/core/src/session_pool/session.ts index 6bb783ddae1c..fc5d8aa362d3 100644 --- a/packages/core/src/session_pool/session.ts +++ b/packages/core/src/session_pool/session.ts @@ -8,7 +8,6 @@ import { CookieJar } from 'tough-cookie'; import type { Log } from '@apify/log'; import { cryptoRandomObjectId } from '@apify/utilities'; -import type { ResponseLike } from '../cookie_utils.js'; import { browserPoolCookieToToughCookie, getCookiesFromResponse, @@ -331,10 +330,10 @@ export class Session { * * It then parses and saves the cookies from the `set-cookie` header, if available. */ - setCookiesFromResponse(response: ResponseLike) { + setCookiesFromResponse(response: Response) { try { const cookies = getCookiesFromResponse(response).filter((c) => c); - this._setCookies(cookies, typeof response.url === 'function' ? response.url() : response.url!); + this._setCookies(cookies, response.url); } catch (e) { const err = e as Error; // if invalid Cookie header is provided just log the exception. 
diff --git a/packages/core/src/typedefs.ts b/packages/core/src/typedefs.ts index 49f7f49c1d2a..9564cda5fa86 100644 --- a/packages/core/src/typedefs.ts +++ b/packages/core/src/typedefs.ts @@ -14,13 +14,4 @@ export function keys(obj: T) { return Object.keys(obj) as (keyof T)[]; } -export declare type AllowedHttpMethods = - | 'GET' - | 'HEAD' - | 'POST' - | 'PUT' - | 'DELETE' - | 'TRACE' - | 'OPTIONS' - | 'CONNECT' - | 'PATCH'; +export { AllowedHttpMethods } from '@crawlee/types'; diff --git a/packages/http-crawler/src/internals/file-download.ts b/packages/http-crawler/src/internals/file-download.ts index 614bd9583097..c814c3e63762 100644 --- a/packages/http-crawler/src/internals/file-download.ts +++ b/packages/http-crawler/src/internals/file-download.ts @@ -1,10 +1,8 @@ -import type { Readable } from 'node:stream'; -import { buffer } from 'node:stream/consumers'; import { finished } from 'node:stream/promises'; import type { BasicCrawlerOptions } from '@crawlee/basic'; import { BasicCrawler, ContextPipeline } from '@crawlee/basic'; -import type { CrawlingContext, HttpResponse, LoadedRequest, Request, StreamingHttpResponse } from '@crawlee/core'; +import type { CrawlingContext, LoadedRequest, Request } from '@crawlee/core'; import type { Dictionary } from '@crawlee/types'; import type { ErrorHandler, GetUserDataFromRequest, InternalHttpHook, RequestHandler, RouterRoutes } from '../index.js'; @@ -23,9 +21,7 @@ export interface FileDownloadCrawlingContext< UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler > extends CrawlingContext { request: LoadedRequest>; - response: HttpResponse<'buffer'> | StreamingHttpResponse; - body: Promise; - stream: Readable; + response: Response; contentType: { type: string; encoding: BufferEncoding }; } @@ -33,10 +29,6 @@ export type FileDownloadRequestHandler< UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler > = RequestHandler>; -interface ContextInternals { - pollingInterval?: NodeJS.Timeout; -} - /** * Provides a framework for downloading files in parallel using plain HTTP requests. The URLs to download are fed either from a static list of URLs or they can be added on the fly from another crawler. * @@ -81,16 +73,16 @@ interface ContextInternals { * ``` */ export class FileDownload extends BasicCrawler { - #contextInternals = Symbol('contextInternals'); - // TODO hooks constructor(options: BasicCrawlerOptions = {}) { super({ ...options, contextPipelineBuilder: () => ContextPipeline.create().compose({ - action: this.initiateDownload.bind(this), - cleanup: this.cleanupDownload.bind(this), + action: async (context) => this.initiateDownload(context), + cleanup: async (context) => { + await (context.response.body ? finished(context.response.body as any) : Promise.resolve()); + }, }), }); } @@ -106,49 +98,14 @@ export class FileDownload extends BasicCrawler { context.request.url = response.url; - const pollingInterval = setInterval(() => { - const { total, transferred } = response.downloadProgress; - - if (transferred > 0) { - context.log.debug( - `Downloaded ${transferred} bytes of ${total ?? 
0} bytes from ${context.request.url}.`, - ); - } - }, 5000); - const contextExtension = { - [this.#contextInternals]: { pollingInterval } as ContextInternals, request: context.request as LoadedRequest, response, contentType: { type, encoding }, - stream: response.stream, - get body() { - return buffer(response.stream); - }, }; return contextExtension; } - - private async cleanupDownload( - context: FileDownloadCrawlingContext & { [k: symbol]: ContextInternals }, - error?: unknown, - ) { - clearInterval(context[this.#contextInternals].pollingInterval); - - // If there was no error and the stream is still readable, wait for it to be consumed before proceeding - if (error === undefined) { - if (!context.stream.destroyed && context.stream.readable) { - try { - await finished(context.stream); - } catch { - // Stream might have encountered an error or been closed, which is fine - } - } - } else { - context.stream.destroy(); - } - } } /** diff --git a/packages/http-crawler/src/internals/http-crawler.ts b/packages/http-crawler/src/internals/http-crawler.ts index a6722ecd0ca8..a03c3087e39d 100644 --- a/packages/http-crawler/src/internals/http-crawler.ts +++ b/packages/http-crawler/src/internals/http-crawler.ts @@ -1,5 +1,5 @@ -import type { IncomingHttpHeaders, IncomingMessage } from 'node:http'; -import type { Readable } from 'node:stream'; +import type { IncomingMessage } from 'node:http'; +import { Readable } from 'node:stream'; import util from 'node:util'; import type { @@ -21,10 +21,11 @@ import { mergeCookies, processHttpRequestOptions, RequestState, + ResponseWithUrl, Router, SessionError, } from '@crawlee/basic'; -import type { HttpResponse, LoadedRequest, StreamingHttpResponse } from '@crawlee/core'; +import type { HttpResponse, LoadedRequest } from '@crawlee/core'; import type { Awaitable, Dictionary } from '@crawlee/types'; import { type CheerioRoot, RETRY_CSS_SELECTORS } from '@crawlee/utils'; import * as cheerio from 'cheerio'; @@ -36,7 +37,6 @@ import ow from 'ow'; import type { JsonValue } from 'type-fest'; import { addTimeoutToPromise, tryCancel } from '@apify/timeout'; -import { concatStreamToBuffer, readStreamToString } from '@apify/utilities'; import { parseContentTypeFromResponse } from './utils.js'; @@ -190,7 +190,7 @@ interface CrawlingContextWithReponse< /** * The HTTP response object containing status code, headers, and other response metadata. 
*/ - response: PlainResponse; + response: Response; } /** @@ -481,7 +481,7 @@ export class HttpCrawler< ); tryCancel(); - request.loadedUrl = httpResponse.url; + request.loadedUrl = httpResponse?.url; request.state = RequestState.AFTER_NAV; return { request: request as LoadedRequest, response: httpResponse }; @@ -538,7 +538,7 @@ export class HttpCrawler< }; if (this.useSessionPool) { - this._throwOnBlockedRequest(crawlingContext.session!, response.statusCode!); + this._throwOnBlockedRequest(crawlingContext.session!, response.status!); } if (this.persistCookiesPerSession) { @@ -554,7 +554,7 @@ export class HttpCrawler< waitForSelector, parseWithCheerio, contentType, - body: parsed.body!, + body: parsed.body, }; } @@ -646,7 +646,7 @@ export class HttpCrawler< session, proxyUrl, gotOptions, - }: RequestFunctionOptions): Promise { + }: RequestFunctionOptions): Promise { if (!TimeoutError) { // @ts-ignore ({ TimeoutError } = await import('got-scraping')); @@ -659,7 +659,7 @@ export class HttpCrawler< } catch (e) { if (e instanceof TimeoutError) { this._handleRequestTimeout(session); - return undefined as unknown as PlainResponse; + return new Response(); // this will never happen, as _handleRequestTimeout always throws } if (this.isProxyError(e as Error)) { @@ -673,21 +673,21 @@ export class HttpCrawler< /** * Encodes and parses response according to the provided content type */ - private async _parseResponse(request: Request, responseStream: IncomingMessage) { - const { statusCode } = responseStream; - const { type, charset } = parseContentTypeFromResponse(responseStream); - const { response, encoding } = this._encodeResponse(request, responseStream, charset); + protected async _parseResponse(request: Request, response: Response) { + const { status } = response; + const { type, charset } = parseContentTypeFromResponse(response); + const { response: reencodedResponse, encoding } = this._encodeResponse(request, response, charset); const contentType = { type, encoding }; - if (statusCode! >= 400 && statusCode! <= 599) { - this.stats.registerStatusCode(statusCode!); + if (status >= 400 && status <= 599) { + this.stats.registerStatusCode(status); } - const excludeError = this.ignoreHttpErrorStatusCodes.has(statusCode!); - const includeError = this.additionalHttpErrorStatusCodes.has(statusCode!); + const excludeError = this.ignoreHttpErrorStatusCodes.has(status); + const includeError = this.additionalHttpErrorStatusCodes.has(status); - if ((statusCode! >= 500 && !excludeError) || includeError) { - const body = await readStreamToString(response, encoding); + if ((status >= 500 && !excludeError) || includeError) { + const body = await reencodedResponse.text(); // TODO - this always uses UTF-8 (see https://developer.mozilla.org/en-US/docs/Web/API/Request/text) // Errors are often sent as JSON, so attempt to parse them, // despite Accept header being set to text/html. @@ -695,19 +695,19 @@ export class HttpCrawler< const errorResponse = JSON.parse(body); let { message } = errorResponse; if (!message) message = util.inspect(errorResponse, { depth: 1, maxArrayLength: 10 }); - throw new Error(`${statusCode} - ${message}`); + throw new Error(`${status} - ${message}`); } if (includeError) { - throw new Error(`${statusCode} - Error status code was set by user.`); + throw new Error(`${status} - Error status code was set by user.`); } // It's not a JSON, so it's probably some text. Get the first 100 chars of it. 
- throw new Error(`${statusCode} - Internal Server Error: ${body.slice(0, 100)}`); + throw new Error(`${status} - Internal Server Error: ${body.slice(0, 100)}`); } else if (HTML_AND_XML_MIME_TYPES.includes(type)) { - return { response, contentType, body: await readStreamToString(response) }; + return { response, contentType, body: await response.text() }; } else { - const body = await concatStreamToBuffer(response); + const body = Buffer.from(await response.bytes()); return { body, response, @@ -753,11 +753,11 @@ export class HttpCrawler< protected _encodeResponse( request: Request, - response: IncomingMessage, + response: Response, encoding: BufferEncoding, ): { encoding: BufferEncoding; - response: IncomingMessage; + response: Response; } { if (this.forceResponseEncoding) { encoding = this.forceResponseEncoding as BufferEncoding; @@ -777,17 +777,18 @@ export class HttpCrawler< if (iconv.encodingExists(encoding)) { const encodeStream = iconv.encodeStream(utf8); const decodeStream = iconv.decodeStream(encoding).on('error', (err) => encodeStream.emit('error', err)); - response.on('error', (err: Error) => decodeStream.emit('error', err)); - const encodedResponse = response.pipe(decodeStream).pipe(encodeStream) as NodeJS.ReadWriteStream & { - statusCode?: number; - headers: IncomingHttpHeaders; - url?: string; - }; - encodedResponse.statusCode = response.statusCode; - encodedResponse.headers = response.headers; - encodedResponse.url = response.url; + const reencodedBody = response.body + ? Readable.toWeb( + Readable.from( + Readable.fromWeb(response.body as any) + .pipe(decodeStream) + .pipe(encodeStream), + ), + ) + : null; + return { - response: encodedResponse as any, + response: new ResponseWithUrl(reencodedBody as any, response), encoding: utf8, }; } @@ -822,14 +823,14 @@ export class HttpCrawler< throw new Error(`request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`); } - private _abortDownloadOfBody(request: Request, response: IncomingMessage) { - const { statusCode } = response; + private _abortDownloadOfBody(request: Request, response: Response) { + const { status } = response; const { type } = parseContentTypeFromResponse(response); // eslint-disable-next-line dot-notation -- accessing private property const blockedStatusCodes = this.sessionPool ? this.sessionPool['blockedStatusCodes'] : []; // if we retry the request, can the Content-Type change? - const isTransientContentType = statusCode! >= 500 || blockedStatusCodes.includes(statusCode!); + const isTransientContentType = status >= 500 || blockedStatusCodes.includes(status); if (!this.supportedMimeTypes.has(type) && !this.supportedMimeTypes.has('*/*') && !isTransientContentType) { request.noRetry = true; @@ -865,7 +866,7 @@ export class HttpCrawler< }, ); - return addResponsePropertiesToStream(response.stream, response); + return response; }; } @@ -876,48 +877,6 @@ interface RequestFunctionOptions { gotOptions: OptionsInit; } -/** - * The stream object returned from got does not have the below properties. - * At the same time, you can't read data directly from the response stream, - * because they won't get emitted unless you also read from the primary - * got stream. To be able to work with only one stream, we move the expected props - * from the response stream to the got stream. 
- * @internal - */ -function addResponsePropertiesToStream(stream: Readable, response: StreamingHttpResponse) { - const properties: (keyof PlainResponse)[] = [ - 'statusCode', - 'statusMessage', - 'headers', - 'complete', - 'httpVersion', - 'rawHeaders', - 'rawTrailers', - 'trailers', - 'url', - 'request', - ]; - - stream.on('end', () => { - // @ts-expect-error - if (stream.rawTrailers) stream.rawTrailers = response.rawTrailers; // TODO BC with got - remove in 4.0 - - // @ts-expect-error - if (stream.trailers) stream.trailers = response.trailers; - - // @ts-expect-error - stream.complete = response.complete; - }); - - for (const prop of properties) { - if (!(prop in stream)) { - (stream as any)[prop] = (response as any)[prop]; - } - } - - return stream as unknown as PlainResponse; -} - /** * Creates new {@apilink Router} instance that works based on request labels. * This instance can then serve as a `requestHandler` of your {@apilink HttpCrawler}. diff --git a/packages/http-crawler/src/internals/utils.ts b/packages/http-crawler/src/internals/utils.ts index 0dcfe707d206..380b58b60ad3 100644 --- a/packages/http-crawler/src/internals/utils.ts +++ b/packages/http-crawler/src/internals/utils.ts @@ -8,7 +8,7 @@ import ow, { ObjectPredicate } from 'ow'; * Gets parsed content type from response object * @param response HTTP response object */ -export function parseContentTypeFromResponse(response: unknown): { type: string; charset: BufferEncoding } { +export function parseContentTypeFromResponse(response: Response): { type: string; charset: BufferEncoding } { ow( response, ow.object.partialShape({ @@ -20,9 +20,9 @@ export function parseContentTypeFromResponse(response: unknown): { type: string; const { url, headers } = response; let parsedContentType; - if (headers['content-type']) { + if (headers.get('content-type')) { try { - parsedContentType = contentTypeParser.parse(headers['content-type'] as string); + parsedContentType = contentTypeParser.parse(headers.get('content-type') as string); } catch { // Can not parse content type from Content-Type header. Try to parse it from file extension. 
} diff --git a/packages/impit-client/src/index.ts b/packages/impit-client/src/index.ts index 0e1da299ece2..a0b6c5c919b4 100644 --- a/packages/impit-client/src/index.ts +++ b/packages/impit-client/src/index.ts @@ -1,8 +1,8 @@ import { Readable } from 'node:stream'; -import { type ReadableStream } from 'node:stream/web'; +import type { ReadableStream } from 'node:stream/web'; import { isGeneratorObject } from 'node:util/types'; -import type { BaseHttpClient, HttpRequest, HttpResponse, ResponseTypes, StreamingHttpResponse } from '@crawlee/core'; +import { type BaseHttpClient, type HttpRequest, type ResponseTypes, ResponseWithUrl } from '@crawlee/core'; import type { HttpMethod, ImpitOptions, ImpitResponse, RequestInit } from 'impit'; import { Impit } from 'impit'; @@ -155,35 +155,11 @@ export class ImpitHttpClient implements BaseHttpClient { */ async sendRequest( request: HttpRequest, - ): Promise> { - const { response, redirectUrls } = await this.getResponse(request); - - let responseBody; - - switch (request.responseType) { - case 'text': - responseBody = await response.text(); - break; - case 'json': - responseBody = await response.json(); - break; - case 'buffer': - responseBody = await response.bytes(); - break; - default: - throw new Error('Unsupported response type.'); - } + ): Promise { + const { response } = await this.getResponse(request); - return { - headers: Object.fromEntries(response.headers.entries()), - statusCode: response.status, - url: response.url, - request, - redirectUrls, - trailers: {}, - body: responseBody, - complete: true, - }; + // todo - cast shouldn't be needed here, impit returns `Uint8Array` + return new ResponseWithUrl((await response.bytes()) as any, response); } private getStreamWithProgress( @@ -210,23 +186,11 @@ export class ImpitHttpClient implements BaseHttpClient { /** * @inheritDoc */ - async stream(request: HttpRequest): Promise { - const { response, redirectUrls } = await this.getResponse(request); - const [stream, getDownloadProgress] = this.getStreamWithProgress(response); + async stream(request: HttpRequest): Promise { + const { response } = await this.getResponse(request); + const [stream] = this.getStreamWithProgress(response); - return { - request, - url: response.url, - statusCode: response.status, - stream, - complete: true, - get downloadProgress() { - return getDownloadProgress(); - }, - uploadProgress: { percent: 100, transferred: 0 }, - redirectUrls, - headers: Object.fromEntries(response.headers.entries()), - trailers: {}, - }; + // Cast shouldn't be needed here, undici might have a slightly different `ReadableStream` type + return new ResponseWithUrl(Readable.toWeb(stream) as any, response); } } diff --git a/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts b/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts index c5345302e03c..481f3e8d03ac 100644 --- a/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts +++ b/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts @@ -6,7 +6,6 @@ import { extractUrlsFromPage } from '@crawlee/browser'; import type { CheerioCrawlingContext } from '@crawlee/cheerio'; import { CheerioCrawler } from '@crawlee/cheerio'; import type { - BaseHttpResponseData, ContextPipeline, CrawlingContext, EnqueueLinksOptions, @@ -105,7 +104,7 @@ export interface AdaptivePlaywrightCrawlerContext = new (...args: any[]) => T; /** @ignore */ export type Awaitable = T | PromiseLike; -export type AllowedHttpMethods = 'GET' | 'HEAD' | 'POST' | 
'PUT' | 'DELETE' | 'TRACE' | 'OPTIONS' | 'CONNECT' | 'PATCH'; +export type AllowedHttpMethods = + | 'GET' + | 'HEAD' + | 'POST' + | 'PUT' + | 'DELETE' + | 'TRACE' + | 'OPTIONS' + | 'CONNECT' + | 'PATCH' + | 'get' + | 'head' + | 'post' + | 'put' + | 'delete' + | 'trace' + | 'options' + | 'connect' + | 'patch'; diff --git a/test/core/crawlers/basic_crawler.test.ts b/test/core/crawlers/basic_crawler.test.ts index 8edbb682ca30..656ce27eeff5 100644 --- a/test/core/crawlers/basic_crawler.test.ts +++ b/test/core/crawlers/basic_crawler.test.ts @@ -1310,8 +1310,8 @@ describe('BasicCrawler', () => { const response = await sendRequest(); responses.push({ - statusCode: response.statusCode, - body: response.body, + statusCode: response.status, + body: await response.text(), }); }, }); @@ -1338,8 +1338,8 @@ describe('BasicCrawler', () => { const response = await sendRequest(); responses.push({ - statusCode: response.statusCode, - body: response.body, + statusCode: response.status, + body: await response.text(), }); }, }); diff --git a/test/core/crawlers/cheerio_crawler.test.ts b/test/core/crawlers/cheerio_crawler.test.ts index edb2ace48cb5..bd35566ac6a0 100644 --- a/test/core/crawlers/cheerio_crawler.test.ts +++ b/test/core/crawlers/cheerio_crawler.test.ts @@ -46,12 +46,12 @@ async function getRequestListForMock(mockData: Dictionary, pathName = 'special/m return requestList; } -async function getRequestListForMirror() { +async function getExampleRequestList(pathname = '/special/mirror') { const sources = [ - { url: `${serverAddress}/special/mirror?a=12` }, - { url: `${serverAddress}/special/mirror?a=23` }, - { url: `${serverAddress}/special/mirror?a=33` }, - { url: `${serverAddress}/special/mirror?a=43` }, + { url: `${serverAddress}${pathname}?a=12` }, + { url: `${serverAddress}${pathname}?a=23` }, + { url: `${serverAddress}${pathname}?a=33` }, + { url: `${serverAddress}${pathname}?a=43` }, ]; const requestList = await RequestList.open(null, sources); return requestList; @@ -92,7 +92,7 @@ describe('CheerioCrawler', () => { }); test('should work', async () => { - const requestList = await getRequestListForMirror(); + const requestList = await getExampleRequestList(); const processed: Request[] = []; const failed: Request[] = []; const requestHandler: CheerioRequestHandler = ({ $, body, request }) => { @@ -125,7 +125,7 @@ describe('CheerioCrawler', () => { }); test('should work with implicit router', async () => { - const requestList = await getRequestListForMirror(); + const requestList = await getExampleRequestList(); const processed: Request[] = []; const failed: Request[] = []; @@ -158,7 +158,7 @@ describe('CheerioCrawler', () => { }); test('should work with explicit router', async () => { - const requestList = await getRequestListForMirror(); + const requestList = await getExampleRequestList(); const processed: Request[] = []; const failed: Request[] = []; @@ -194,7 +194,7 @@ describe('CheerioCrawler', () => { }); test('should throw when no requestHandler nor default route provided', async () => { - const requestList = await getRequestListForMirror(); + const requestList = await getExampleRequestList(); const cheerioCrawler = new CheerioCrawler({ requestList, @@ -341,7 +341,7 @@ describe('CheerioCrawler', () => { test('after requestHandlerTimeoutSecs', async () => { const failed: Request[] = []; - const requestList = await getRequestListForMirror(); + const requestList = await getExampleRequestList(); const requestHandler = vi.fn(async () => { await sleep(2000); }); @@ -407,19 +407,19 @@ 
describe('CheerioCrawler', () => { describe('should ensure text/html Content-Type', () => { test('by setting a correct Accept header', async () => { - const headers: IncomingHttpHeaders[] = []; - const requestList = await getRequestListForMirror(); + const headersPerRequests: Headers[] = []; + const requestList = await getExampleRequestList('/special/headers'); const crawler = new CheerioCrawler({ requestList, - requestHandler: ({ response }) => { - headers.push(response.request.options.headers); + requestHandler: async ({ json }) => { + headersPerRequests.push(new Headers(json.headers)); }, }); await crawler.run(); - expect(headers).toHaveLength(4); - headers.forEach((h) => { - const acceptHeader = h.accept || h.Accept; + expect(headersPerRequests).toHaveLength(4); + headersPerRequests.forEach((headerset) => { + const acceptHeader = headerset.get('accept'); expect(acceptHeader!.includes('text/html')).toBe(true); expect(acceptHeader!.includes('application/xhtml+xml')).toBe(true); }); @@ -544,7 +544,7 @@ describe('CheerioCrawler', () => { }); test('should throw an error on http error status codes set by user', async () => { - const requestList = await getRequestListForMirror(); + const requestList = await getExampleRequestList(); const failed: Request[] = []; const cheerioCrawler = new CheerioCrawler({ @@ -649,15 +649,10 @@ describe('CheerioCrawler', () => { suggestResponseEncoding, }); - const stream = Readable.from([buf]); - // @ts-expect-error Using private method - const { response, encoding } = crawler._encodeResponse({}, stream); + const { response, encoding } = crawler._encodeResponse({}, new Response(new Uint8Array(buf))); expect(encoding).toBe('utf8'); - for await (const chunk of response) { - const string = chunk.toString('utf8'); - expect(string).toBe(html); - } + expect(await response.text()).toBe(html); }); test('always when forced', async () => { @@ -675,15 +670,10 @@ describe('CheerioCrawler', () => { forceResponseEncoding, }); - const stream = Readable.from([buf]); - // @ts-expect-error Using private method - const { response, encoding } = crawler._encodeResponse({}, stream, 'ascii'); + const { response, encoding } = crawler._encodeResponse({}, new Response(new Uint8Array(buf)), 'ascii'); expect(encoding).toBe('utf8'); - for await (const chunk of response) { - const string = chunk.toString('utf8'); - expect(string).toBe(html); - } + expect(await response.text()).toBe(html); }); test('Cheerio decodes html entities', async () => { @@ -716,7 +706,7 @@ describe('CheerioCrawler', () => { proxyUrls: [proxyUrl], }); - const requestList = await getRequestListForMirror(); + const requestList = await getExampleRequestList(); const proxies: string[] = []; const crawler = new CheerioCrawler({ @@ -748,7 +738,7 @@ describe('CheerioCrawler', () => { sessions.push(session!); }; - const requestList = await getRequestListForMirror(); + const requestList = await getExampleRequestList(); const crawler = new CheerioCrawler({ requestList, diff --git a/test/core/crawlers/file_download.test.ts b/test/core/crawlers/file_download.test.ts index 9f9cfc934d97..83ce50a1ddb7 100644 --- a/test/core/crawlers/file_download.test.ts +++ b/test/core/crawlers/file_download.test.ts @@ -1,6 +1,6 @@ import type { Server } from 'node:http'; import type { AddressInfo } from 'node:net'; -import { Duplex, pipeline as pipelineWithCallbacks } from 'node:stream'; +import { Duplex, finished, pipeline as pipelineWithCallbacks, Readable } from 'node:stream'; import { pipeline } from 'node:stream/promises'; import { 
ReadableStream } from 'node:stream/web'; import { setTimeout } from 'node:timers/promises'; @@ -11,15 +11,15 @@ import { startExpressAppPromise } from 'test/shared/_helper.js'; import { afterAll, beforeAll, expect, test } from 'vitest'; class ReadableStreamGenerator { - private static async generateRandomData(size: number, seed: number) { + private static async generateRandomData(size: number, seed: number): Promise { const chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'; - const buffer = Buffer.alloc(size); + const array = new Uint8Array(size); for (let i = 0; i < size; i++) { // eslint-disable-next-line no-bitwise seed = Math.imul(48271, seed) | (0 % 2147483647); - buffer[i] = chars.charCodeAt(seed % chars.length); + array[i] = chars.charCodeAt(seed % chars.length); } - return buffer; + return array; } static getReadableStream(size: number, seed: number, throttle = 0): ReadableStream { @@ -43,13 +43,15 @@ class ReadableStreamGenerator { return stream; } - static async getBuffer(size: number, seed: number) { + static async getUint8Array(size: number, seed: number) { const stream = this.getReadableStream(size, seed); - const chunks: string[] = []; + const chunks: Uint8Array = new Uint8Array(size); + let offset = 0; for await (const chunk of stream) { - chunks.push(chunk); + chunks.set(chunk, offset); + offset += chunk.length; } - return Buffer.from(chunks.join('')); + return chunks; } } @@ -81,13 +83,13 @@ afterAll(async () => { server.close(); }); -test('requestHandler - `body` property works', async () => { - const results: Buffer[] = []; +test('requestHandler - reading bytes synchronously', async () => { + const results: Uint8Array[] = []; const crawler = new FileDownload({ maxRequestRetries: 0, - requestHandler: async ({ body }) => { - results.push(await body); + requestHandler: async ({ response }) => { + results.push(await response.bytes()); }, }); @@ -97,17 +99,17 @@ test('requestHandler - `body` property works', async () => { expect(results).toHaveLength(1); expect(results[0].length).toBe(1024); - expect(results[0]).toEqual(await ReadableStreamGenerator.getBuffer(1024, 123)); + expect(results[0]).toEqual(await ReadableStreamGenerator.getUint8Array(1024, 123)); }); -test('requestHandler - `stream` property works', async () => { - let result: Buffer = Buffer.alloc(0); +test('requestHandler - streaming response body', async () => { + let result: Uint8Array = new Uint8Array(); const crawler = new FileDownload({ maxRequestRetries: 0, - requestHandler: async ({ stream }) => { - for await (const chunk of stream) { - result = Buffer.concat([result, chunk]); + requestHandler: async ({ response }) => { + for await (const chunk of response.body ?? 
[]) { + result = new Uint8Array([...result, ...chunk]); } }, }); @@ -117,16 +119,16 @@ test('requestHandler - `stream` property works', async () => { await crawler.run([fileUrl]); expect(result.length).toBe(1024); - expect(result).toEqual(await ReadableStreamGenerator.getBuffer(1024, 456)); + expect(result).toEqual(await ReadableStreamGenerator.getUint8Array(1024, 456)); }); test('requestHandler receives response', async () => { const crawler = new FileDownload({ maxRequestRetries: 0, requestHandler: async ({ response }) => { - expect(response.headers['content-type']).toBe('application/octet-stream'); - expect(response.statusCode).toBe(200); - expect(response.statusMessage).toBe('OK'); + expect(response?.headers.get('content-type')).toBe('application/octet-stream'); + expect(response?.status).toBe(200); + expect(response?.statusText).toBe('OK'); }, }); @@ -146,8 +148,8 @@ test('crawler waits for the stream to be consumed', async () => { const crawler = new FileDownload({ maxRequestRetries: 0, - requestHandler: ({ stream }) => { - pipelineWithCallbacks(stream, bufferingStream, (err) => { + requestHandler: async ({ response }) => { + pipelineWithCallbacks(response.body ?? ReadableStream.from([]), bufferingStream, (err) => { if (!err) { bufferingStream.push(null); bufferingStream.end(); @@ -165,12 +167,13 @@ test('crawler waits for the stream to be consumed', async () => { // the stream should be finished once the crawler finishes. expect(bufferingStream.writableFinished).toBe(true); - const bufferedData: Buffer[] = []; + const bufferedData = new Uint8Array(5 * 1024); + let offset = 0; for await (const chunk of bufferingStream) { - bufferedData.push(chunk); + bufferedData.set(chunk, offset); + offset += chunk.length; } - const result = Buffer.concat(bufferedData); - expect(result.length).toBe(5 * 1024); - expect(result).toEqual(await ReadableStreamGenerator.getBuffer(5 * 1024, 789)); + expect(bufferedData.length).toBe(5 * 1024); + expect(bufferedData).toEqual(await ReadableStreamGenerator.getUint8Array(5 * 1024, 789)); }); diff --git a/test/core/crawlers/http_crawler.test.ts b/test/core/crawlers/http_crawler.test.ts index 820b4e417f5e..00e2b754501f 100644 --- a/test/core/crawlers/http_crawler.test.ts +++ b/test/core/crawlers/http_crawler.test.ts @@ -2,7 +2,7 @@ import http from 'node:http'; import type { AddressInfo } from 'node:net'; import { Readable } from 'node:stream'; -import { HttpCrawler } from '@crawlee/http'; +import { HttpCrawler, ResponseWithUrl } from '@crawlee/http'; import { MemoryStorageEmulator } from 'test/shared/MemoryStorageEmulator.js'; const router = new Map(); @@ -376,32 +376,6 @@ test('should retry on 403 even with disallowed content-type', async () => { expect(succeeded[0].retryCount).toBe(1); }); -test('should work with cacheable-request', async () => { - const isFromCache: Record = {}; - const cache = new Map(); - const crawler = new HttpCrawler({ - maxConcurrency: 1, - preNavigationHooks: [ - async (_, gotOptions) => { - gotOptions.cache = cache; - gotOptions.headers = { - ...gotOptions.headers, - // to force cache - 'cache-control': 'max-stale', - }; - }, - ], - requestHandler: async ({ request, response }) => { - isFromCache[request.uniqueKey] = response.isFromCache; - }, - }); - await crawler.run([ - { url, uniqueKey: 'first' }, - { url, uniqueKey: 'second' }, - ]); - expect(isFromCache).toEqual({ first: false, second: true }); -}); - test('works with a custom HttpClient', async () => { const results: string[] = []; @@ -410,42 +384,26 @@ test('works with a 
custom HttpClient', async () => { requestHandler: async ({ body, sendRequest }) => { results.push(body as string); - results.push((await sendRequest()).body); + results.push(await (await sendRequest()).text()); }, httpClient: { async sendRequest(request) { - if (request.responseType !== 'text') { - throw new Error('Not implemented'); - } - - return { - body: 'Hello from sendRequest()' as any, - request, - url, - redirectUrls: [], - statusCode: 200, + return new ResponseWithUrl('Hello from sendRequest()', { + url: request.url.toString(), + status: 200, headers: {}, - trailers: {}, - complete: true, - }; + }); }, async stream(request) { const stream = new Readable(); stream.push('Schmexample Domain'); stream.push(null); - return { - stream, - downloadProgress: { percent: 100, transferred: 0 }, - uploadProgress: { percent: 100, transferred: 0 }, - request, - url, - redirectUrls: [], - statusCode: 200, + return new ResponseWithUrl(Readable.toWeb(stream) as any, { + url: request.url.toString(), + status: 200, headers: { 'content-type': 'text/html; charset=utf-8' }, - trailers: {}, - complete: true, - }; + }); }, }, }); diff --git a/test/core/session_pool/session.test.ts b/test/core/session_pool/session.test.ts index af2641801fca..188422e090ec 100644 --- a/test/core/session_pool/session.test.ts +++ b/test/core/session_pool/session.test.ts @@ -1,5 +1,4 @@ -import { EVENT_SESSION_RETIRED, Session, SessionPool } from '@crawlee/core'; -import type { Dictionary } from '@crawlee/utils'; +import { EVENT_SESSION_RETIRED, ResponseWithUrl, Session, SessionPool } from '@crawlee/core'; import { entries, sleep } from '@crawlee/utils'; import { CookieJar } from 'tough-cookie'; @@ -61,10 +60,12 @@ describe('Session - testing session behaviour ', () => { let error; try { - session.setCookiesFromResponse({ - headers: { Cookie: 'invaldi*{*{*{*-----***@s' }, - url: 'http://localhost:1337', - }); + session.setCookiesFromResponse( + new ResponseWithUrl('', { + headers: { Cookie: 'invaldi*{*{*{*-----***@s' }, + url: 'http://localhost:1337', + }), + ); } catch (e) { error = e; } @@ -280,36 +281,34 @@ describe('Session - testing session behaviour ', () => { describe('.putResponse & .getCookieString', () => { test('should set and update cookies from "set-cookie" header', () => { - const headers: Dictionary = {}; + const headers = new Headers(); + + headers.append('set-cookie', 'CSRF=e8b667; Domain=example.com; Secure '); + headers.append('set-cookie', 'id=a3fWa; Expires=Wed, Domain=example.com; 21 Oct 2015 07:28:00 GMT'); - headers['set-cookie'] = [ - 'CSRF=e8b667; Domain=example.com; Secure ', - 'id=a3fWa; Expires=Wed, Domain=example.com; 21 Oct 2015 07:28:00 GMT', - ]; const newSession = new Session({ sessionPool: new SessionPool() }); const url = 'https://example.com'; - newSession.setCookiesFromResponse({ headers, url }); + newSession.setCookiesFromResponse(new ResponseWithUrl('', { headers, url })); let cookies = newSession.getCookieString(url); expect(cookies).toEqual('CSRF=e8b667; id=a3fWa'); const newCookie = 'ABCD=1231231213; Domain=example.com; Secure'; - newSession.setCookiesFromResponse({ headers: { 'set-cookie': newCookie }, url }); + newSession.setCookiesFromResponse(new ResponseWithUrl('', { headers: { 'set-cookie': newCookie }, url })); cookies = newSession.getCookieString(url); expect(cookies).toEqual('CSRF=e8b667; id=a3fWa; ABCD=1231231213'); }); }); test('should correctly persist and init cookieJar', () => { - const headers: Dictionary = {}; + const headers = new Headers(); + + 
headers.append('set-cookie', 'CSRF=e8b667; Domain=example.com; Secure '); + headers.append('set-cookie', 'id=a3fWa; Expires=Wed, Domain=example.com; 21 Oct 2015 07:28:00 GMT'); - headers['set-cookie'] = [ - 'CSRF=e8b667; Domain=example.com; Secure ', - 'id=a3fWa; Expires=Wed, Domain=example.com; 21 Oct 2015 07:28:00 GMT', - ]; const newSession = new Session({ sessionPool: new SessionPool() }); const url = 'https://example.com'; - newSession.setCookiesFromResponse({ headers, url }); + newSession.setCookiesFromResponse(new ResponseWithUrl('', { headers, url })); const old = newSession.getState(); diff --git a/test/core/session_pool/session_utils.test.ts b/test/core/session_pool/session_utils.test.ts index aab3f1a98a44..d021c161b00a 100644 --- a/test/core/session_pool/session_utils.test.ts +++ b/test/core/session_pool/session_utils.test.ts @@ -1,41 +1,39 @@ import { getCookiesFromResponse } from '@crawlee/core'; -import type { Dictionary } from '@crawlee/utils'; import { Cookie } from 'tough-cookie'; describe('getCookiesFromResponse', () => { test('should parse cookies if set-cookie is array', () => { - const headers: Dictionary = {}; - const dummyCookies = [ - 'CSRF=e8b667; Domain=example.com; Secure', - 'id=a3fWa; Expires=Wed, 21 Oct 2015 07:28:00 GMT', - ]; - headers['set-cookie'] = dummyCookies; - const cookies = getCookiesFromResponse({ headers }); + const headers = new Headers(); + + headers.append('set-cookie', 'CSRF=e8b667; Domain=example.com; Secure '); + headers.append('set-cookie', 'id=a3fWa; Expires=Wed, 21 Oct 2015 07:28:00 GMT'); + + const cookies = getCookiesFromResponse(new Response('', { headers })); cookies.forEach((cookie) => { expect(cookie).toBeInstanceOf(Cookie); }); - expect(dummyCookies[0]).toEqual(cookies[0].toString()); - expect(dummyCookies[1]).toEqual(cookies[1].toString()); + expect(cookies[0].toString()).toEqual('CSRF=e8b667; Domain=example.com; Secure'); + expect(cookies[1].toString()).toEqual('id=a3fWa; Expires=Wed, 21 Oct 2015 07:28:00 GMT'); }); test('should parse cookies if set-cookie is string', () => { - const headers: Dictionary = {}; - const dummyCookie = 'CSRF=e8b667; Domain=example.com; Secure'; - headers['set-cookie'] = dummyCookie; - const cookies = getCookiesFromResponse({ headers }); + const headers = new Headers(); + headers.append('set-cookie', 'CSRF=e8b667; Domain=example.com; Secure '); + + const cookies = getCookiesFromResponse(new Response('', { headers })); expect(cookies).toHaveLength(1); - expect(dummyCookie).toEqual(cookies[0].toString()); + expect(cookies[0].toString()).toEqual('CSRF=e8b667; Domain=example.com; Secure'); expect(cookies[0]).toBeInstanceOf(Cookie); }); test('should not throw error on parsing invalid cookie', () => { - const headers: Dictionary = {}; - const dummyCookie = 'totally Invalid Cookie $@$@#$**'; - headers['set-cookie'] = dummyCookie; - const cookies = getCookiesFromResponse({ headers }); + const headers = new Headers(); + headers.append('set-cookie', 'totally Invalid Cookie $@$@#$**'); + + const cookies = getCookiesFromResponse(new Response('', { headers })); expect(cookies).toHaveLength(1); expect(cookies[0]).toBeUndefined(); diff --git a/tsconfig.build.json b/tsconfig.build.json index 2ea673f57b08..95710e031b70 100644 --- a/tsconfig.build.json +++ b/tsconfig.build.json @@ -4,7 +4,7 @@ "module": "NodeNext", "moduleResolution": "NodeNext", "target": "ESNext", - "lib": ["DOM", "ES2023"], + "lib": ["DOM", "ES2023", "ES2024", "DOM.AsyncIterable"], "baseUrl": ".", "allowJs": true, "skipLibCheck": true,