diff --git a/apps/worker/src/utils/fetchAsBot.test.ts b/apps/worker/src/utils/fetchAsBot.test.ts index 6e0460e..b6303d3 100644 --- a/apps/worker/src/utils/fetchAsBot.test.ts +++ b/apps/worker/src/utils/fetchAsBot.test.ts @@ -1,88 +1,1457 @@ +import { createWriteStream } from "fs"; +import { devNull } from "os"; +import { Writable } from "stream"; +import { request } from "undici"; import { mockEndpoint } from "../../test-utils/server.ts"; -import { RobotDeniedError, fetchAsBot, resetCache } from "./fetchAsBot.ts"; +import { + RobotDeniedError, + fetchAsBot, + fetchAsBotStream, + resetCache, +} from "./fetchAsBot.ts"; beforeEach(() => resetCache()); -test("Should throw error when a 400 status code is returned", async () => { - const robotsUrl = new URL("https://example.com/robots.txt"); - const url = new URL("https://example.com/test"); +describe("fetchAsBot", () => { + test("Should throw error when a 400 status code is returned", async () => { + const robotsUrl = new URL("https://example.com/robots.txt"); + const url = new URL("https://example.com/test"); - mockEndpoint({ - path: robotsUrl, - body: "User-agent: *\nDisallow:\n", - headers: { - "content-type": "text/plain", - }, + mockEndpoint({ + path: robotsUrl, + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + mockEndpoint({ + path: url, + body: "", + status: 400, + }); + + const response = fetchAsBot({ url, method: "GET" }); + await expect(response).rejects.toThrow(); }); - mockEndpoint({ - path: url, - body: "", - status: 400, + + test("Should return successful data for a URL with no robots.txt", async () => { + const robotsUrl = new URL("https://example.com/robots.txt"); + const url = new URL("https://example.com/test"); + + mockEndpoint({ + path: robotsUrl, + status: 404, + body: "", + }); + mockEndpoint({ + path: url, + body: "Hello!", + }); + + const response = await fetchAsBot({ url, method: "GET" }); + const body = await response.body.text(); + expect(body).toBe("Hello!"); }); - const response = fetchAsBot({ url, method: "GET" }); - await expect(response).rejects.toThrow(); -}); + test("Should return successful data for a URL with a valid robots.txt", async () => { + const robotsUrl = new URL("https://example.com/robots.txt"); + const url = new URL("https://example.com/test"); -test("Should return successful data for a URL with no robots.txt", async () => { - const robotsUrl = new URL("https://example.com/robots.txt"); - const url = new URL("https://example.com/test"); + mockEndpoint({ + path: robotsUrl, + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + mockEndpoint({ + path: url, + body: "Hello!", + }); - mockEndpoint({ - path: robotsUrl, - status: 404, - body: "", + const response = await fetchAsBot({ url, method: "GET" }); + const body = await response.body.text(); + expect(body).toBe("Hello!"); }); - mockEndpoint({ - path: url, - body: "Hello!", + + test("Should not return data for a URL disallowed by robots.txt", async () => { + const robotsUrl = new URL("https://example.com/robots.txt"); + const url = new URL("https://example.com/test"); + + mockEndpoint({ + path: robotsUrl, + body: "User-agent: *\nDisallow: /test\n", + headers: { + "content-type": "text/plain", + }, + }); + + const response = await fetchAsBot({ url, method: "GET" }).catch((e) => e); + expect(response).to.be.instanceOf(RobotDeniedError); + expect((response as Error).message).toBe( + "playful-programming/1.0 is disallowed from example.com!", + ); }); - const response = await fetchAsBot({ url, method: "GET" }); - const body = await response.body.text(); - expect(body).toBe("Hello!"); -}); + test("Should return data for a URL disallowed by robots.txt when skipRobotsCheck is true", async () => { + const robotsUrl = new URL("https://example.com/robots.txt"); + mockEndpoint({ + path: robotsUrl, + body: "User-agent: *\nDisallow: /test\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("https://example.com/test"); + const mockedBody = "Hello!"; + mockEndpoint({ + path: url, + body: mockedBody, + }); + + const response = await fetchAsBot({ + url, + method: "GET", + skipRobotsCheck: true, + }); + + expect(await response.body.text()).toBe(mockedBody); + + // Consume the remaining robots.txt mock so afterEach has no pending interceptors + await request(robotsUrl); + }); + + test("Should not follow redirects when followRedirects is 0", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectPath = "/another-test"; + const redirectStatus = 301; + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectPath, + }, + status: redirectStatus, + }); + + const redirectedUrl = new URL(redirectPath, baseUrl); + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectedUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + const response = await fetchAsBot({ + url, + method: "GET", + followRedirects: 0, + }).catch((e) => e as Error); + + expect.assert( + response instanceof Error === true, + "Expected an error to be thrown", + ); + expect(response.message).toBe(`Request ${url} returned ${redirectStatus}`); + + // Consume the remaining redirect mock so afterEach has no pending interceptors + await request(redirectedUrl); + }); + + test("Should handle single redirect", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectPath = "/another-test"; + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectPath, + }, + status: 301, + }); + + const redirectedUrl = new URL(redirectPath, baseUrl); + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectedUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + const response = await fetchAsBot({ + url, + method: "GET", + }); -test("Should return successful data for a URL with a valid robots.txt", async () => { - const robotsUrl = new URL("https://example.com/robots.txt"); - const url = new URL("https://example.com/test"); + expect(await response.body.text()).toBe(redirectionBody); + }); + + test("Should handle chain of redirects", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const redirectionBody = "This is the redirection result."; + for (let i = 1; i <= 3; i++) { + if (i === 3) { + mockEndpoint({ + path: new URL(`/${i}`, baseUrl), + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + continue; + } - mockEndpoint({ - path: robotsUrl, - body: "User-agent: *\nDisallow:\n", - headers: { - "content-type": "text/plain", - }, + mockEndpoint({ + path: new URL(`/${i}`, baseUrl), + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: new URL(`/${i + 1}`, baseUrl).toString(), + }, + status: 301, + }); + } + + const response = await fetchAsBot({ + url: new URL("/1", baseUrl), + method: "GET", + }); + + expect(await response.body.text()).toBe(redirectionBody); }); - mockEndpoint({ - path: url, - body: "Hello!", + + test("Should throw error when redirect limit is exceeded", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + for (let i = 1; i <= 12; i++) { + if (i === 12) { + mockEndpoint({ + path: new URL(`/${i}`, baseUrl), + body: "This is the redirection result.", + headers: { + "content-type": "text/plain", + }, + }); + continue; + } + + mockEndpoint({ + path: new URL(`/${i}`, baseUrl), + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: `/${i + 1}`, + }, + status: 302, + }); + } + + const response = await fetchAsBot({ + url: new URL("/1", baseUrl), + method: "GET", + }).catch((e) => e as Error); + + expect(response).toBeInstanceOf(Error); + + // Consume the remaining redirect mocks so afterEach has no pending interceptors + await request(new URL("/12", baseUrl)); }); - const response = await fetchAsBot({ url, method: "GET" }); - const body = await response.body.text(); - expect(body).toBe("Hello!"); + test("Should throw error when redirecting to invalid location", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const invalidLocation = "http://[invalid-url"; + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: invalidLocation, + }, + status: 303, + }); + + const response = await fetchAsBot({ + url: new URL("/test", baseUrl), + method: "GET", + }).catch((e) => e as Error); + + expect.assert( + response instanceof Error === true, + "Expected an error to be thrown", + ); + expect(response.message).toBe( + `The redirect location ${invalidLocation} couldn't be parsed as a URL for ${url}`, + ); + }); + + test("Should throw error when location header is missing on redirect", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + }, + status: 307, + }); + + const response = await fetchAsBot({ + url: new URL("/test", baseUrl), + method: "GET", + }).catch((e) => e as Error); + + expect.assert( + response instanceof Error === true, + "Expected an error to be thrown", + ); + expect(response.message).toBe( + `The redirect location undefined couldn't be parsed as a URL for ${url}`, + ); + }); + + test("Should throw error when redirecting to unsupported protocol", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: "ftp://example.com/file", + }, + status: 308, + }); + + const response = await fetchAsBot({ + url: new URL("/test", baseUrl), + method: "GET", + }).catch((e) => e as Error); + + expect.assert( + response instanceof Error === true, + "Expected an error to be thrown", + ); + expect(response.message).toBe(`Invalid redirect protocol for ${url}`); + }); + + test("Should handle absolute redirects", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectPath = "/another-test"; + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectPath, + }, + status: 301, + }); + + const redirectedUrl = new URL(redirectPath, baseUrl); + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectedUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + const response = await fetchAsBot({ + url, + method: "GET", + }); + + expect(await response.body.text()).toBe(redirectionBody); + }); + + test("Should handle relative redirects", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test/", baseUrl); + const redirectPath = "another-test"; + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectPath, + }, + status: 301, + }); + + const redirectedUrl = new URL(redirectPath, url); + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectedUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + const response = await fetchAsBot({ + url, + method: "GET", + }); + + expect(await response.body.text()).toBe(redirectionBody); + }); + + test("Should handle full URL redirects", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectUrl = new URL("/another-test", baseUrl); + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectUrl.toString(), + }, + status: 301, + }); + + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + const response = await fetchAsBot({ + url, + method: "GET", + }); + + expect(await response.body.text()).toBe(redirectionBody); + }); + + test("Should handle cross-domain redirects", async () => { + const baseUrl = "https://example.com"; + const redirectBaseUrl = "https://example.net"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + mockEndpoint({ + path: new URL("/robots.txt", redirectBaseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectUrl = new URL("/test", redirectBaseUrl); + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectUrl.toString(), + }, + status: 301, + }); + + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + const response = await fetchAsBot({ + url, + method: "GET", + }); + + expect(await response.body.text()).toBe(redirectionBody); + }); + + test("Should throw error when redirected location in same domain is disallowed by robots.txt", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow: /another-test\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectPath = "/another-test"; + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectPath, + }, + status: 301, + }); + + const redirectedUrl = new URL(redirectPath, url); + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectedUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + const response = await fetchAsBot({ + url, + method: "GET", + }).catch((e) => e as Error); + + expect.assert( + response instanceof Error === true, + "Expected an error to be thrown", + ); + expect(response.message).toBe( + `playful-programming/1.0 is disallowed from ${url.hostname}!`, + ); + + // Consume the remaining redirect mock so afterEach has no pending interceptors + await request(redirectedUrl); + }); + + test("Should throw error when redirected location in different domain is disallowed by robots.txt", async () => { + const baseUrl = "https://example.com"; + const redirectBaseUrl = "https://example.net"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + mockEndpoint({ + path: new URL("/robots.txt", redirectBaseUrl), + body: "User-agent: *\nDisallow: /test\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectUrl = new URL("/test", redirectBaseUrl); + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectUrl.toString(), + }, + status: 301, + }); + + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + const response = await fetchAsBot({ + url, + method: "GET", + }).catch((e) => e as Error); + + expect.assert( + response instanceof Error === true, + "Expected an error to be thrown", + ); + expect(response.message).toBe( + `playful-programming/1.0 is disallowed from ${redirectUrl.hostname}!`, + ); + + // Consume the remaining redirect mock so afterEach has no pending interceptors + await request(redirectUrl); + }); }); -test("Should not return data for a URL disallowed by robots.txt", async () => { - const robotsUrl = new URL("https://example.com/robots.txt"); - const url = new URL("https://example.com/test"); +describe("fetchAsBotStream", () => { + test("Should throw error when a 400 status code is returned", async () => { + const robotsUrl = new URL("https://example.com/robots.txt"); + const url = new URL("https://example.com/test"); - mockEndpoint({ - path: robotsUrl, - body: "User-agent: *\nDisallow: /test\n", - headers: { - "content-type": "text/plain", - }, + mockEndpoint({ + path: robotsUrl, + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + mockEndpoint({ + path: url, + body: "", + status: 400, + }); + + const botFetchStream = fetchAsBotStream({ + url, + method: "GET", + writable: createWriteStream(devNull), + }); + + await expect(botFetchStream).rejects.toThrow(); }); - mockEndpoint({ - path: url, - body: "Hello!", + + test("Should return successful data for a URL with no robots.txt", async () => { + const robotsUrl = new URL("https://example.com/robots.txt"); + const url = new URL("https://example.com/test"); + + mockEndpoint({ + path: robotsUrl, + status: 404, + body: "", + }); + mockEndpoint({ + path: url, + body: "Hello!", + }); + + let body = ""; + await fetchAsBotStream({ + url, + method: "GET", + writable: new Writable({ + write(chunk, _encoding, next) { + body += chunk; + next(); + }, + }), + }); + expect(body).toBe("Hello!"); }); - const response = await fetchAsBot({ url, method: "GET" }).catch((e) => e); - expect(response).to.be.instanceOf(RobotDeniedError); - expect((response as Error).message).toBe( - "playful-programming/1.0 is disallowed from example.com!", - ); + test("Should return successful data for a URL with a valid robots.txt", async () => { + const robotsUrl = new URL("https://example.com/robots.txt"); + const url = new URL("https://example.com/test"); + + mockEndpoint({ + path: robotsUrl, + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + mockEndpoint({ + path: url, + body: "Hello!", + }); + + let body = ""; + await fetchAsBotStream({ + url, + method: "GET", + writable: new Writable({ + write(chunk, _encoding, next) { + body += chunk; + next(); + }, + }), + }); + + expect(body).toBe("Hello!"); + }); + + test("Should not return data for a URL disallowed by robots.txt", async () => { + const robotsUrl = new URL("https://example.com/robots.txt"); + const url = new URL("https://example.com/test"); + + mockEndpoint({ + path: robotsUrl, + body: "User-agent: *\nDisallow: /test\n", + headers: { + "content-type": "text/plain", + }, + }); + + const error = await fetchAsBotStream({ + url, + method: "GET", + writable: createWriteStream(devNull), + }).catch((e) => e as Error); + + expect(error).toBeInstanceOf(RobotDeniedError); + expect(error?.message).toBe( + `playful-programming/1.0 is disallowed from ${url.hostname}!`, + ); + }); + + test("Should return data for a URL disallowed by robots.txt when skipRobotsCheck is true", async () => { + const robotsUrl = new URL("https://example.com/robots.txt"); + mockEndpoint({ + path: robotsUrl, + body: "User-agent: *\nDisallow: /test\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("https://example.com/test"); + const mockedBody = "Hello!"; + mockEndpoint({ + path: url, + body: mockedBody, + }); + + let body = ""; + await fetchAsBotStream({ + url, + method: "GET", + writable: new Writable({ + write(chunk, _encoding, next) { + body += chunk; + next(); + }, + }), + skipRobotsCheck: true, + }); + + expect(body).toBe(mockedBody); + + // Consume the remaining robots.txt mock so afterEach has no pending interceptors + await request(robotsUrl); + }); + + test("Should not follow redirects when followRedirects is 0", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectPath = "/another-test"; + const redirectStatus = 301; + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectPath, + }, + status: redirectStatus, + }); + + const redirectedUrl = new URL(redirectPath, baseUrl); + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectedUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + let body = ""; + const error = await fetchAsBotStream({ + url, + method: "GET", + writable: new Writable({ + write(chunk, _encoding, next) { + body += chunk; + next(); + }, + }), + followRedirects: 0, + }).catch((e) => e as Error); + + expect.assert( + error instanceof Error === true, + "Expected an error to be thrown", + ); + expect(error.message).toBe(`Request ${url} returned ${redirectStatus}`); + expect(body).toBe(""); + + // Consume the remaining redirect mock so afterEach has no pending interceptors + await request(redirectedUrl); + }); + + test("Should handle single redirect", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectPath = "/another-test"; + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectPath, + }, + status: 301, + }); + + const redirectedUrl = new URL(redirectPath, baseUrl); + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectedUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + let body = ""; + await fetchAsBotStream({ + url, + method: "GET", + writable: new Writable({ + write(chunk, _encoding, next) { + body += chunk; + next(); + }, + }), + }); + + expect(body).toBe(redirectionBody); + }); + + test("Should handle chain of redirects", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const redirectionBody = "This is the redirection result."; + for (let i = 1; i <= 3; i++) { + if (i === 3) { + mockEndpoint({ + path: new URL(`/${i}`, baseUrl), + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + continue; + } + + mockEndpoint({ + path: new URL(`/${i}`, baseUrl), + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: new URL(`/${i + 1}`, baseUrl).toString(), + }, + status: 301, + }); + } + + let body = ""; + await fetchAsBotStream({ + url: new URL("/1", baseUrl), + method: "GET", + writable: new Writable({ + write(chunk, _encoding, next) { + body += chunk; + next(); + }, + }), + }); + + expect(body).toBe(redirectionBody); + }); + + test("Should throw error when redirect limit is exceeded", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + for (let i = 1; i <= 12; i++) { + if (i === 12) { + mockEndpoint({ + path: new URL(`/${i}`, baseUrl), + body: "This is the redirection result.", + headers: { + "content-type": "text/plain", + }, + }); + continue; + } + + mockEndpoint({ + path: new URL(`/${i}`, baseUrl), + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: `/${i + 1}`, + }, + status: 302, + }); + } + + let body = ""; + const error = await fetchAsBotStream({ + url: new URL("/1", baseUrl), + method: "GET", + writable: new Writable({ + write(chunk, _encoding, next) { + body += chunk; + next(); + }, + }), + }).catch((e) => e as Error); + + expect(body).toBe(""); + expect(error).toBeInstanceOf(Error); + + // Consume the remaining redirect mocks so afterEach has no pending interceptors + await request(new URL("/12", baseUrl)); + }); + + test("Should throw error when redirecting to invalid location", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const invalidLocation = "http://[invalid-url"; + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: invalidLocation, + }, + status: 303, + }); + + const error = await fetchAsBotStream({ + url: new URL("/test", baseUrl), + method: "GET", + writable: createWriteStream(devNull), + }).catch((e) => e as Error); + + expect(error).toBeInstanceOf(Error); + expect(error?.message).toBe( + `The redirect location ${invalidLocation} couldn't be parsed as a URL for ${url}`, + ); + }); + + test("Should throw error when location header is missing on redirect", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + }, + status: 307, + }); + + const error = await fetchAsBotStream({ + url: new URL("/test", baseUrl), + method: "GET", + writable: createWriteStream(devNull), + }).catch((e) => e as Error); + + expect(error).toBeInstanceOf(Error); + expect(error?.message).toBe( + `The redirect location undefined couldn't be parsed as a URL for ${url}`, + ); + }); + + test("Should throw error when redirecting to unsupported protocol", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: "ftp://example.com/file", + }, + status: 308, + }); + + const error = await fetchAsBotStream({ + url: new URL("/test", baseUrl), + method: "GET", + writable: createWriteStream(devNull), + }).catch((e) => e as Error); + + expect(error).toBeInstanceOf(Error); + expect(error?.message).toBe(`Invalid redirect protocol for ${url}`); + }); + + test("Should handle absolute redirects", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectPath = "/another-test"; + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectPath, + }, + status: 301, + }); + + const redirectedUrl = new URL(redirectPath, baseUrl); + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectedUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + let body = ""; + await fetchAsBotStream({ + url, + method: "GET", + writable: new Writable({ + write(chunk, _encoding, next) { + body += chunk; + next(); + }, + }), + }); + + expect(body).toBe(redirectionBody); + }); + + test("Should handle relative redirects", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test/", baseUrl); + const redirectPath = "another-test"; + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectPath, + }, + status: 301, + }); + + const redirectedUrl = new URL(redirectPath, url); + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectedUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + let body = ""; + await fetchAsBotStream({ + url, + method: "GET", + writable: new Writable({ + write(chunk, _encoding, next) { + body += chunk; + next(); + }, + }), + }); + + expect(body).toBe(redirectionBody); + }); + + test("Should handle full URL redirects", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectUrl = new URL("/another-test", baseUrl); + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectUrl.toString(), + }, + status: 301, + }); + + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + let body = ""; + await fetchAsBotStream({ + url, + method: "GET", + writable: new Writable({ + write(chunk, _encoding, next) { + body += chunk; + next(); + }, + }), + }); + + expect(body).toBe(redirectionBody); + }); + + test("Should handle cross-domain redirects", async () => { + const baseUrl = "https://example.com"; + const redirectBaseUrl = "https://example.net"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + mockEndpoint({ + path: new URL("/robots.txt", redirectBaseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectUrl = new URL("/test", redirectBaseUrl); + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectUrl.toString(), + }, + status: 301, + }); + + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + let body = ""; + await fetchAsBotStream({ + url, + method: "GET", + writable: new Writable({ + write(chunk, _encoding, next) { + body += chunk; + next(); + }, + }), + }); + + expect(body).toBe(redirectionBody); + }); + + test("Should throw error when redirected location in same domain is disallowed by robots.txt", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow: /another-test\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectPath = "/another-test"; + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectPath, + }, + status: 301, + }); + + const redirectedUrl = new URL(redirectPath, url); + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectedUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + let body = ""; + const error = await fetchAsBotStream({ + url, + method: "GET", + writable: new Writable({ + write(chunk, _encoding, next) { + body += chunk; + next(); + }, + }), + }).catch((e) => e as Error); + + expect(body).toBe(""); + expect(error).toBeInstanceOf(Error); + expect(error?.message).toBe( + `playful-programming/1.0 is disallowed from ${url.hostname}!`, + ); + + // Consume the remaining redirect mock so afterEach has no pending interceptors + await request(redirectedUrl); + }); + + test("Should throw error when redirected location in different domain is disallowed by robots.txt", async () => { + const baseUrl = "https://example.com"; + const redirectBaseUrl = "https://example.net"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + mockEndpoint({ + path: new URL("/robots.txt", redirectBaseUrl), + body: "User-agent: *\nDisallow: /test\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectUrl = new URL("/test", redirectBaseUrl); + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectUrl.toString(), + }, + status: 301, + }); + + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + let body = ""; + const error = await fetchAsBotStream({ + url, + method: "GET", + writable: new Writable({ + write(chunk, _encoding, next) { + body += chunk; + next(); + }, + }), + }).catch((e) => e as Error); + + expect(body).toBe(""); + expect(error).toBeInstanceOf(Error); + expect(error?.message).toBe( + `playful-programming/1.0 is disallowed from ${redirectUrl.hostname}!`, + ); + + // Consume the remaining redirect mock so afterEach has no pending interceptors + await request(redirectUrl); + }); }); diff --git a/apps/worker/src/utils/fetchAsBot.ts b/apps/worker/src/utils/fetchAsBot.ts index 5effabe..5ec6009 100644 --- a/apps/worker/src/utils/fetchAsBot.ts +++ b/apps/worker/src/utils/fetchAsBot.ts @@ -1,7 +1,9 @@ -import { request, stream, type Dispatcher } from "undici"; +import { createWriteStream } from "fs"; import { LRUCache } from "lru-cache"; +import { devNull } from "os"; import robotsParserDefault, { type Robot } from "robots-parser"; -import type { Writable } from "stream"; +import { type Writable } from "stream"; +import { request, stream, type Dispatcher } from "undici"; const robotsParser = robotsParserDefault as never as typeof robotsParserDefault.default; @@ -42,11 +44,22 @@ async function getRobots(input: URL): Promise { return robotsParser(robotsUrl.toString(), robots); } +async function checkRobotsAccess(url: URL) { + const robots = await getRobots(url); + + if (robots && robots.isDisallowed(url.toString(), userAgent)) { + throw new RobotDeniedError( + `${userAgent} is disallowed from ${url.hostname}!`, + ); + } +} + type FetchAsBotInit = Omit< Dispatcher.RequestOptions, - "origin" | "path" + "origin" | "path" | "method" > & { url: string | URL; + method: "GET" | "POST" | "PUT" | "DELETE" | "HEAD"; skipRobotsCheck?: boolean; maxLength?: number; followRedirects?: number; @@ -68,18 +81,12 @@ export async function fetchAsBot(options: FetchAsBotInit) { } = options; const parsedUrl = url instanceof URL ? url : new URL(url); if (!skipRobotsCheck) { - const robots = await getRobots(parsedUrl); - - if (robots && robots.isDisallowed(url.toString(), userAgent)) { - throw new RobotDeniedError( - `${userAgent} is disallowed from ${parsedUrl.hostname}!`, - ); - } + await checkRobotsAccess(parsedUrl); } - console.debug(init.method ?? "GET", parsedUrl.href); + console.log(init.method ?? "GET", parsedUrl.href); - const response = await request(url, { + const response = await request(parsedUrl, { ...init, headers: { "User-Agent": userAgent, @@ -89,61 +96,147 @@ export async function fetchAsBot(options: FetchAsBotInit) { signal: init?.signal ?? AbortSignal.timeout(10 * 1000), }); - if (response.statusCode == 301 || response.statusCode == 302) { + if ( + [301, 302, 303, 307, 308].includes(response.statusCode) && + followRedirects > 0 + ) { await response.body.dump(); + const newLocation = response.headers["location"]?.toString(); - console.log(`redirect (${response.statusCode})`); + const newLocationUrl = newLocation + ? URL.parse(newLocation, parsedUrl) + : null; + if (newLocationUrl) { + console.log( + `redirect (${response.statusCode}) [${parsedUrl} -> ${newLocationUrl}]`, + ); + + if (!["https:", "http:"].includes(newLocationUrl.protocol)) { + throw new Error(`Invalid redirect protocol for ${parsedUrl}`); + } - if (followRedirects > 0 && newLocation && URL.canParse(newLocation)) { - const newUrl = new URL(newLocation); return await fetchAsBot({ ...options, - url: newUrl, + url: newLocationUrl, followRedirects: followRedirects - 1, }); + } else { + throw new Error( + `The redirect location ${newLocation} couldn't be parsed as a URL for ${parsedUrl}`, + ); } } if (response.statusCode < 200 || response.statusCode > 299) { await response.body.dump(); - throw new Error(`Request ${url} returned ${response.statusCode}`); + throw new Error(`Request ${parsedUrl} returned ${response.statusCode}`); } return response; } +interface FetchAsBotStreamFactoryOpaque { + writable: Writable; + followRedirects: number; + currentUrl: URL; + redirect: boolean; + error: Error | null; +} + +const fetchAsBotStreamFactory: Dispatcher.StreamFactory< + FetchAsBotStreamFactoryOpaque +> = ({ opaque, statusCode, headers }) => { + opaque.redirect = false; + + if ( + [301, 302, 303, 307, 308].includes(statusCode) && + opaque.followRedirects > 0 + ) { + const newLocation = headers["location"]?.toString(); + const newLocationUrl = newLocation + ? URL.parse(newLocation, opaque.currentUrl) + : null; + if (newLocationUrl) { + console.log( + `redirect (${statusCode}) [${opaque.currentUrl} -> ${newLocationUrl}]`, + ); + + if (!["https:", "http:"].includes(newLocationUrl.protocol)) { + opaque.error = new Error( + `Invalid redirect protocol for ${opaque.currentUrl}`, + ); + return createWriteStream(devNull); + } + + opaque.currentUrl = newLocationUrl; + opaque.followRedirects -= 1; + opaque.redirect = true; + } else { + opaque.error = new Error( + `The redirect location ${newLocation} couldn't be parsed as a URL for ${opaque.currentUrl}`, + ); + } + + return createWriteStream(devNull); + } + + if (statusCode < 200 || statusCode > 299) { + opaque.error = new Error( + `Request ${opaque.currentUrl} returned ${statusCode}`, + ); + + return createWriteStream(devNull); + } + + return opaque.writable; +}; + export async function fetchAsBotStream({ url, skipRobotsCheck, maxLength, writable, + followRedirects = 10, ...init }: FetchAsBotInit & { writable: Writable }) { const parsedUrl = url instanceof URL ? url : new URL(url); - if (!skipRobotsCheck) { - const robots = await getRobots(parsedUrl); - if (robots && robots.isDisallowed(url.toString(), userAgent)) { - throw new RobotDeniedError( - `${userAgent} is disallowed from ${parsedUrl.hostname}!`, - ); - } - } + console.log(init.method ?? "GET", parsedUrl.href); - console.debug(init.method ?? "GET", parsedUrl.href); + const opaque: FetchAsBotStreamFactoryOpaque = { + writable, + followRedirects, + currentUrl: parsedUrl, + redirect: false, + error: null, + }; - await stream( - url, - { - ...init, - headers: { - "User-Agent": userAgent, - "Accept-Language": "en", - ...init?.headers, + while (true) { + if (!skipRobotsCheck) { + await checkRobotsAccess(opaque.currentUrl); + } + + await stream( + opaque.currentUrl, + { + ...init, + headers: { + "User-Agent": userAgent, + "Accept-Language": "en", + ...init?.headers, + }, + signal: init?.signal ?? AbortSignal.timeout(10 * 1000), + opaque, }, - signal: init?.signal ?? AbortSignal.timeout(10 * 1000), - opaque: writable, - }, - ({ opaque }) => opaque, - ); + fetchAsBotStreamFactory, + ); + + if (opaque.error) { + throw opaque.error; + } + + if (!opaque.redirect) { + break; + } + } } diff --git a/apps/worker/test-utils/server.ts b/apps/worker/test-utils/server.ts index 423453c..57c6486 100644 --- a/apps/worker/test-utils/server.ts +++ b/apps/worker/test-utils/server.ts @@ -1,12 +1,20 @@ import { MockAgent, setGlobalDispatcher } from "undici"; -const mockAgent = new MockAgent({ - connections: 1, - bodyTimeout: 10, - connectTimeout: 10, - headersTimeout: 10, +let mockAgent: MockAgent; +beforeEach(() => { + mockAgent = new MockAgent({ + connections: 1, + bodyTimeout: 10, + connectTimeout: 10, + headersTimeout: 10, + }); + mockAgent.disableNetConnect(); + setGlobalDispatcher(mockAgent); +}); +afterEach(async () => { + mockAgent.assertNoPendingInterceptors(); + await mockAgent.close(); }); -setGlobalDispatcher(mockAgent); interface MockEndpointProps { path: string | URL; @@ -14,6 +22,7 @@ interface MockEndpointProps { headers?: Record; method?: "get" | "post" | "put" | "delete"; status?: number; + repeatTimes?: number; } export function mockEndpoint({ @@ -22,6 +31,7 @@ export function mockEndpoint({ headers, method = "get", status = 200, + repeatTimes = 1, }: MockEndpointProps) { const url = path instanceof URL ? path : new URL(path); mockAgent @@ -32,5 +42,6 @@ export function mockEndpoint({ }) .reply(status, body, { headers, - }); + }) + .times(repeatTimes); }