Commits (25)
b22aaed  feat: add `Request.intoFetchAPIRequest` method (barjin, Sep 12, 2025)
aac26de  feat: use native `Response` in `sendRequest` (barjin, Sep 12, 2025)
ee5f2b2  chore: phase out `got-scraping`'s Response in favour of `fetch` API `… (barjin, Sep 12, 2025)
450a393  chore: run linter (barjin, Sep 15, 2025)
47711c6  feat: use `fetch` API's `Response` in `HttpCrawler` (barjin, Sep 16, 2025)
e68e8fc  chore: fix build / linter errors (barjin, Sep 16, 2025)
709814c  fix: try fixing broken tests (barjin, Sep 16, 2025)
20db2c8  chore: run linter (barjin, Sep 16, 2025)
f55b0d3  fix: align with `HttpCrawler` tested behaviour (barjin, Sep 16, 2025)
414b3bb  fix: align types in `redirectHandler`, `sendRequest` (barjin, Sep 16, 2025)
f539905  feat: use native `Response` API in `BrowserCrawler` (barjin, Sep 17, 2025)
34a0833  fix: correctly typed tests (barjin, Sep 17, 2025)
de8855e  chore: fix failing `CheerioCrawler` tests (barjin, Sep 17, 2025)
9675723  fix: waiting for stream to finish in `FileDownloader` (barjin, Sep 17, 2025)
e2b338b  chore: fix `CheerioCrawler` tests (barjin, Sep 17, 2025)
6635213  chore: apply PR suggestions (barjin, Oct 1, 2025)
7f86947  Merge branch 'feat/request-response' into v4 (barjin, Nov 25, 2025)
f6a52a2  chore: fix build errors (barjin, Nov 25, 2025)
afe8f7b  fix: wait for streaming response in `FileDownload` (barjin, Nov 25, 2025)
56ddc76  chore: fix linter errors (barjin, Nov 25, 2025)
6b6960c  fix: correct `content-type` parsing from `Headers` object (barjin, Nov 25, 2025)
e6ce6b9  chore: fix file download tests (barjin, Nov 26, 2025)
2eb1062  Merge branch 'v4' into feat/request-response (barjin, Nov 26, 2025)
55bc416  chore: remove commented-out `got-scraping`-related test (barjin, Nov 26, 2025)
3947109  chore: add `AsyncIterable`-compatible DOM type definitions (barjin, Nov 26, 2025)
9 changes: 3 additions & 6 deletions packages/basic-crawler/src/internals/send-request.ts

@@ -5,7 +5,7 @@ import {
     type Request,
     type Session,
 } from '@crawlee/core';
-import type { Method, Response as GotResponse } from 'got-scraping';
+import type { Method } from 'got-scraping';
 
 /**
  * Prepares a function to be used as the `sendRequest` context helper.
@@ -22,10 +22,7 @@ export function createSendRequest(
     session: Session | undefined,
     getProxyUrl: () => string | undefined,
 ) {
-    return async <Response = string>(
-        // TODO the type information here (and in crawler_commons) is outright wrong... for BC - replace this with generic HttpResponse in v4
-        overrideOptions: Partial<HttpRequestOptions> = {},
-    ): Promise<GotResponse<Response>> => {
+    return async (overrideOptions: Partial<HttpRequestOptions> = {}): Promise<Response> => {
         const cookieJar = session
             ? {
                   getCookieString: async (url: string) => session.getCookieString(url),
@@ -48,6 +45,6 @@
         // Fill in body as the last step - `processHttpRequestOptions` may use either `body`, `json` or `form` so we cannot override it beforehand
         requestOptions.body ??= originRequest.payload;
 
-        return httpClient.sendRequest<any>(requestOptions) as unknown as GotResponse<Response>;
+        return httpClient.sendRequest(requestOptions);
     };
 }
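With `sendRequest` now resolving to a web-standard `Response`, handler code reads bodies through the standard helpers instead of got-scraping's `body` property. A minimal sketch of the new call site (the handler and URL are illustrative, not part of this diff):

```ts
import { BasicCrawler } from '@crawlee/basic';

const crawler = new BasicCrawler({
    async requestHandler({ sendRequest, log }) {
        // Resolves to a native fetch API Response now, not a got-scraping response.
        const response = await sendRequest();

        // Standard body helpers replace got-scraping's `body` property.
        const text = await response.text();

        log.info(`${response.status} ${response.url}: ${text.length} chars`);
    },
});

await crawler.run(['https://example.com']);
```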
11 changes: 2 additions & 9 deletions packages/cheerio-crawler/src/internals/cheerio-crawler.ts

@@ -1,6 +1,3 @@
-import type { IncomingMessage } from 'node:http';
-import { text as readStreamToString } from 'node:stream/consumers';
-
 import type {
     Configuration,
     EnqueueLinksOptions,
@@ -168,12 +165,8 @@ export class CheerioCrawler extends HttpCrawler<CheerioCrawlingContext> {
         super(options, config);
     }
 
-    protected override async _parseHTML(
-        response: IncomingMessage,
-        isXml: boolean,
-        crawlingContext: CheerioCrawlingContext,
-    ) {
-        const body = await readStreamToString(response);
+    protected override async _parseHTML(response: Response, isXml: boolean, crawlingContext: CheerioCrawlingContext) {
+        const body = await response.text();
         const dom = parseDocument(body, { decodeEntities: true, xmlMode: isXml });
         const $ = cheerio.load(dom, {
             xml: { decodeEntities: true, xmlMode: isXml },
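The `_parseHTML` change is the migration pattern in miniature: a Node `IncomingMessage` had to be drained through `node:stream/consumers`, while the web `Response` buffers and decodes in one call. A standalone before/after sketch (the URL is a placeholder):

```ts
import { get } from 'node:http';
import { text as readStreamToString } from 'node:stream/consumers';

// Before: the response is a Node stream that must be drained explicitly.
get('http://example.com/', async (incomingMessage) => {
    const oldBody = await readStreamToString(incomingMessage);
    console.log(oldBody.length);
});

// After: the web Response exposes text() directly.
const response = await fetch('http://example.com/');
const newBody = await response.text();
console.log(newBody.length);
```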
12 changes: 5 additions & 7 deletions packages/core/src/cookie_utils.ts

@@ -12,16 +12,14 @@ export interface ResponseLike {
 /**
  * @internal
  */
-export function getCookiesFromResponse(response: ResponseLike): Cookie[] {
-    const headers = typeof response.headers === 'function' ? response.headers() : response.headers;
-    const cookieHeader = headers?.['set-cookie'] || '';
+export function getCookiesFromResponse(response: Response): Cookie[] {
+    const headers = response.headers;
+    const cookieHeaders = headers.getSetCookie();
 
     try {
-        return Array.isArray(cookieHeader)
-            ? cookieHeader.map((cookie) => Cookie.parse(cookie)!)
-            : [Cookie.parse(cookieHeader)!];
+        return cookieHeaders.map((cookie) => Cookie.parse(cookie)!);
     } catch (e) {
-        throw new CookieParseError(cookieHeader);
+        throw new CookieParseError(cookieHeaders);
     }
 }
 
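`Headers.getSetCookie()` is the right tool here because `Set-Cookie` is the one header that must not be comma-joined: the generic `headers.get()` folds repeated values into a single string, and cookie attributes such as `Expires` contain commas themselves. A quick illustration with made-up cookie values:

```ts
const headers = new Headers();
headers.append('set-cookie', 'a=1; Path=/');
headers.append('set-cookie', 'b=2; HttpOnly');

// get() comma-joins repeated headers, which breaks cookie parsing:
console.log(headers.get('set-cookie')); // "a=1; Path=/, b=2; HttpOnly"

// getSetCookie() preserves each cookie as a separate string:
console.log(headers.getSetCookie()); // ["a=1; Path=/", "b=2; HttpOnly"]
```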
4 changes: 2 additions & 2 deletions packages/core/src/crawlers/crawler_commons.ts

@@ -1,5 +1,5 @@
 import type { BatchAddRequestsResult, Dictionary } from '@crawlee/types';
-import type { OptionsInit, Response as GotResponse } from 'got-scraping';
+import type { OptionsInit } from 'got-scraping';
 import type { ReadonlyDeep } from 'type-fest';
 
 import type { Configuration } from '../configuration.js';

Review thread (on the remaining `got-scraping` import):

Contributor: I would love to see this go as well.

Member Author (barjin): I'd prefer to do that as part of a separate PR. Removing got-scraping (and all the type TODOs) is no small feat, which would make it hard to review if done all in one.

@@ -163,7 +163,7 @@ export interface CrawlingContext<Crawler = unknown, UserData extends Dictionary
      * },
      * ```
      */
-    sendRequest<Response = string>(overrideOptions?: Partial<OptionsInit>): Promise<GotResponse<Response>>;
+    sendRequest(overrideOptions?: Partial<OptionsInit>): Promise<Response>;
 }
 
 /**
14 changes: 11 additions & 3 deletions packages/core/src/http_clients/base-http-client.ts

@@ -146,6 +146,14 @@ interface HttpResponseWithoutBody<TResponseType extends keyof ResponseTypes = ke
     request: HttpRequest<TResponseType>;
 }
 
+export class ResponseWithUrl extends Response {
+    override url: string;
+    constructor(body: BodyInit | null, init: ResponseInit & { url?: string }) {
+        super(body, init);
+        this.url = init.url ?? '';
+    }
+}
+
 /**
  * HTTP response data as returned by the {@apilink BaseHttpClient.sendRequest} method.
  */
@@ -169,7 +177,7 @@ export interface StreamingHttpResponse extends HttpResponseWithoutBody {
  * Type of a function called when an HTTP redirect takes place. It is allowed to mutate the `updatedRequest` argument.
  */
 export type RedirectHandler = (
-    redirectResponse: BaseHttpResponseData,
+    redirectResponse: Response,
     updatedRequest: { url?: string | URL; headers: SimpleHeaders },
 ) => void;
 
@@ -182,12 +190,12 @@ export interface BaseHttpClient {
      */
     sendRequest<TResponseType extends keyof ResponseTypes = 'text'>(
         request: HttpRequest<TResponseType>,
-    ): Promise<HttpResponse<TResponseType>>;
+    ): Promise<Response>;
 
     /**
      * Perform an HTTP Request and return after the response headers are received. The body may be read from a stream contained in the response.
      */
-    stream(request: HttpRequest, onRedirect?: RedirectHandler): Promise<StreamingHttpResponse>;
+    stream(request: HttpRequest, onRedirect?: RedirectHandler): Promise<Response>;
 }
 
 /**

Review thread (on the `stream()` signature):

Contributor: Isn't the `stream` method obsolete? The web `Response` class can be streamed using `response.body` when the caller chooses to do so.

Member Author (barjin): Good point, it actually is 👍 I'd prefer to do this in a separate PR, for the same reasons as the total got-scraping phase-out.
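To illustrate the reviewer's point: once `stream()` returns a web `Response`, callers can do the streaming themselves off `response.body`, which is a `ReadableStream`. A sketch of piping a body to disk under that assumption (URL and file name are illustrative):

```ts
import { createWriteStream } from 'node:fs';
import { Writable } from 'node:stream';

const response = await fetch('https://example.com/large-file.bin');
if (!response.body) throw new Error('Response has no body to stream');

// Response.body is a web ReadableStream; bridge it to a Node write stream.
await response.body.pipeTo(Writable.toWeb(createWriteStream('download.bin')));
```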
92 changes: 45 additions & 47 deletions packages/core/src/http_clients/got-scraping-http-client.ts

@@ -1,13 +1,14 @@
+import { Readable } from 'node:stream';
+
 import type { Options, PlainResponse } from 'got-scraping';
 import { gotScraping } from 'got-scraping';
 
-import type {
-    BaseHttpClient,
-    HttpRequest,
-    HttpResponse,
-    RedirectHandler,
-    ResponseTypes,
-    StreamingHttpResponse,
+import {
+    type BaseHttpClient,
+    type HttpRequest,
+    type RedirectHandler,
+    type ResponseTypes,
+    ResponseWithUrl,
 } from './base-http-client.js';
 
 /**
@@ -19,7 +20,7 @@ export class GotScrapingHttpClient implements BaseHttpClient {
      */
     async sendRequest<TResponseType extends keyof ResponseTypes>(
         request: HttpRequest<TResponseType>,
-    ): Promise<HttpResponse<TResponseType>> {
+    ): Promise<Response> {
         const gotResult = await gotScraping({
             ...request,
             retry: {
@@ -28,23 +29,42 @@
             },
         });
 
-        return {
-            ...gotResult,
-            body: gotResult.body as ResponseTypes[TResponseType],
-            request: { url: request.url, ...gotResult.request },
-        };
+        const parsedHeaders = Object.entries(gotResult.headers)
+            .map(([key, value]) => {
+                if (value === undefined) return [];
+
+                if (Array.isArray(value)) {
+                    return value.map((v) => [key, v]);
+                }
+
+                return [[key, value]];
+            })
+            .flat() as [string, string][];
+
+        return new ResponseWithUrl(new Uint8Array(gotResult.rawBody), {
+            headers: new Headers(parsedHeaders),
+            status: gotResult.statusCode,
+            statusText: gotResult.statusMessage ?? '',
+            url: gotResult.url,
+        });
     }
 
     /**
      * @inheritDoc
     */
-    async stream(request: HttpRequest, handleRedirect?: RedirectHandler): Promise<StreamingHttpResponse> {
+    async stream(request: HttpRequest, handleRedirect?: RedirectHandler): Promise<Response> {
        // eslint-disable-next-line no-async-promise-executor
         return new Promise(async (resolve, reject) => {
             const stream = gotScraping({ ...request, isStream: true });
 
-            stream.on('redirect', (updatedOptions: Options, redirectResponse: PlainResponse) => {
-                handleRedirect?.(redirectResponse, updatedOptions);
+            stream.on('redirect', (updatedOptions: Options, redirectResponse: any) => {
+                const nativeRedirectResponse = new ResponseWithUrl(redirectResponse.rawBody, {
+                    headers: redirectResponse.headers,
+                    status: redirectResponse.statusCode,
+                    statusText: redirectResponse.statusMessage,
+                    url: redirectResponse.url,
+                });
+                handleRedirect?.(nativeRedirectResponse, updatedOptions);
             });
 
             // We need to end the stream for DELETE requests, otherwise it will hang.
@@ -55,37 +75,15 @@
             stream.on('error', reject);
 
             stream.on('response', (response: PlainResponse) => {
-                const result: StreamingHttpResponse = {
-                    stream,
-                    request,
-                    redirectUrls: response.redirectUrls,
-                    url: response.url,
-                    ip: response.ip,
-                    statusCode: response.statusCode,
-                    headers: response.headers,
-                    trailers: response.trailers,
-                    complete: response.complete,
-                    get downloadProgress() {
-                        return stream.downloadProgress;
-                    },
-                    get uploadProgress() {
-                        return stream.uploadProgress;
-                    },
-                };
-
-                Object.assign(result, response); // TODO BC - remove in 4.0
-
-                resolve(result);
-
-                stream.on('end', () => {
-                    result.complete = response.complete;
-
-                    result.trailers ??= {};
-                    Object.assign(result.trailers, response.trailers);
-
-                    (result as any).rawTrailers ??= []; // TODO BC - remove in 4.0
-                    Object.assign((result as any).rawTrailers, response.rawTrailers);
-                });
+                // Cast shouldn't be needed here, undici might have a different `ReadableStream` type
+                resolve(
+                    new ResponseWithUrl(Readable.toWeb(stream) as any, {
+                        status: response.statusCode,
+                        statusText: response.statusMessage ?? '',
+                        headers: response.headers as HeadersInit,
+                        url: response.url,
+                    }),
+                );
             });
         });
     }
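The `parsedHeaders` mapping in `sendRequest` exists because Node-style headers are `Record<string, string | string[] | undefined>` (repeated headers arrive as arrays), while the `Headers` constructor wants flat name/value pairs. The same transformation in isolation, with made-up header values:

```ts
const nodeHeaders: Record<string, string | string[] | undefined> = {
    'content-type': 'text/html',
    'set-cookie': ['a=1', 'b=2'], // repeated headers arrive as an array
    'x-missing': undefined, // must be dropped entirely
};

const pairs = Object.entries(nodeHeaders).flatMap(([key, value]): [string, string][] => {
    if (value === undefined) return [];
    return Array.isArray(value) ? value.map((v) => [key, v] as [string, string]) : [[key, value]];
});

const headers = new Headers(pairs);
console.log(headers.getSetCookie()); // ["a=1", "b=2"]
```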
23 changes: 19 additions & 4 deletions packages/core/src/request.ts

@@ -79,7 +79,7 @@ export enum RequestState {
  * ```
  * @category Sources
  */
-export class Request<UserData extends Dictionary = Dictionary> {
+class CrawleeRequest<UserData extends Dictionary = Dictionary> {
     /** Request ID */
     id?: string;
 
@@ -193,7 +193,8 @@ export class Request<UserData extends Dictionary = Dictionary> {
         this.url = url;
         this.loadedUrl = loadedUrl;
         this.uniqueKey =
-            uniqueKey || Request.computeUniqueKey({ url, method, payload, keepUrlFragment, useExtendedUniqueKey });
+            uniqueKey ||
+            CrawleeRequest.computeUniqueKey({ url, method, payload, keepUrlFragment, useExtendedUniqueKey });
         this.method = method;
         this.payload = payload;
         this.noRetry = noRetry;
@@ -255,6 +256,18 @@
         }
     }
 
+    /**
+     * Converts the Crawlee Request object to a `fetch` API Request object.
+     * @returns The native `fetch` API Request object.
+     */
+    public intoFetchAPIRequest(): Request {
+        return new Request(this.url, {
+            method: this.method,
+            headers: this.headers,
+            body: this.payload,
+        });
+    }
+
     /** Tells the crawler processing this request to skip the navigation and process the request directly. */
     get skipNavigation(): boolean {
         return this.userData.__crawlee?.skipNavigation ?? false;
@@ -398,7 +411,7 @@
             }
             return normalizedUrl;
         }
-        const payloadHash = payload ? Request.hashPayload(payload) : '';
+        const payloadHash = payload ? CrawleeRequest.hashPayload(payload) : '';
         return `${normalizedMethod}(${payloadHash}):${normalizedUrl}`;
     }
 
@@ -526,10 +539,12 @@ interface ComputeUniqueKeyOptions {
     useExtendedUniqueKey?: boolean;
 }
 
-export type Source = (Partial<RequestOptions> & { requestsFromUrl?: string; regex?: RegExp }) | Request;
+export type Source = (Partial<RequestOptions> & { requestsFromUrl?: string; regex?: RegExp }) | CrawleeRequest;
 
 /** @internal */
 export interface InternalSource {
     requestsFromUrl: string;
     regex?: RegExp;
 }
+
+export { CrawleeRequest as Request };
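With the class renamed internally and re-exported as `Request`, `intoFetchAPIRequest()` becomes the escape hatch from a Crawlee request to the web platform type. A sketch of handing a queued request to plain `fetch` (URL and payload are illustrative):

```ts
import { Request as CrawleeRequest } from '@crawlee/core';

const crawleeRequest = new CrawleeRequest({
    url: 'https://example.com/api',
    method: 'POST',
    payload: JSON.stringify({ hello: 'world' }),
    headers: { 'content-type': 'application/json' },
});

// Convert to the web platform Request and hand it to any fetch-compatible API.
const fetchRequest = crawleeRequest.intoFetchAPIRequest();
const response = await fetch(fetchRequest);
console.log(response.status);
```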
5 changes: 2 additions & 3 deletions packages/core/src/session_pool/session.ts

@@ -8,7 +8,6 @@ import { CookieJar } from 'tough-cookie';
 import type { Log } from '@apify/log';
 import { cryptoRandomObjectId } from '@apify/utilities';
 
-import type { ResponseLike } from '../cookie_utils.js';
 import {
     browserPoolCookieToToughCookie,
     getCookiesFromResponse,
@@ -319,10 +318,10 @@
      *
      * It then parses and saves the cookies from the `set-cookie` header, if available.
      */
-    setCookiesFromResponse(response: ResponseLike) {
+    setCookiesFromResponse(response: Response) {
         try {
             const cookies = getCookiesFromResponse(response).filter((c) => c);
-            this._setCookies(cookies, typeof response.url === 'function' ? response.url() : response.url!);
+            this._setCookies(cookies, response.url);
         } catch (e) {
             const err = e as Error;
             // if invalid Cookie header is provided just log the exception.
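`setCookiesFromResponse` now trusts `response.url`, which is also why the PR introduces `ResponseWithUrl`: a manually constructed `Response` always reports an empty, read-only `url`, so the HTTP clients have to restore it themselves. A small demonstration of the problem and the workaround (class body copied from the diff above; values are illustrative):

```ts
const plain = new Response('ok', { status: 200 });
console.log(plain.url); // "" - url cannot be set through ResponseInit

class ResponseWithUrl extends Response {
    override url: string;
    constructor(body: BodyInit | null, init: ResponseInit & { url?: string }) {
        super(body, init);
        this.url = init.url ?? '';
    }
}

const withUrl = new ResponseWithUrl('ok', { status: 200, url: 'https://example.com/' });
console.log(withUrl.url); // "https://example.com/"
```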