Skip to content

Repo sync #39988

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 24 additions & 29 deletions src/frame/tests/robots-txt.ts
Original file line number Diff line number Diff line change
@@ -1,53 +1,48 @@
import type { Response } from 'got'
import { beforeAll, describe, expect, test, vi } from 'vitest'
import robotsParser, { type Robot } from 'robots-parser'
import { describe, expect, test, vi } from 'vitest'

import {
SURROGATE_ENUMS,
makeLanguageSurrogateKey,
} from '@/frame/middleware/set-fastly-surrogate-key'
import { get } from '@/tests/helpers/e2etest'

/**
 * Shape of the value the e2etest `get` helper resolves to, as consumed by
 * the assertions in this file.
 */
type TestResponse = {
  /** HTTP status code of the response. */
  statusCode: number
  /** Success flag reported for the response. */
  ok: boolean
  /** URL reported for the response. */
  url: string
  /** Response headers as a name-to-value record. */
  headers: Record<string, string>
  /** Response payload as text. */
  body: string
}

// Test suite for the /robots.txt route, exercised through the e2etest `get` helper.
// NOTE(review): this span is a diff rendered without +/- markers, so lines from
// before and after the change appear side by side (see the duplicated `test(`
// headers below). Presumably the `beforeAll` / `robotsParser` / `Response<string>`
// lines are the removed side — confirm against the PR before treating this as
// compilable code.
describe('robots.txt', () => {
// Sets vitest's testTimeout to 60 * 1000 for the tests in this suite.
vi.setConfig({ testTimeout: 60 * 1000 })

let res: Response<string>, robots: Robot
beforeAll(async () => {
// Requests /robots.txt with the Host header set to docs.github.com.
res = await get('/robots.txt', {
headers: {
Host: 'docs.github.com',
},
})
// Requests /robots.txt with no extra request options.
test('returns disallow all for localhost (default behavior)', async () => {
const res: TestResponse = await get('/robots.txt')
expect(res.statusCode).toBe(200)
// Parses the response body so the `isAllowed` checks below can query it.
robots = robotsParser('https://docs.github.com/robots.txt', res.body)
})

// Checks that the site root, /en, and one English article URL are allowed by the parsed rules.
test('allows indexing of the homepage and English content', async () => {
expect(robots.isAllowed('https://docs.github.com/')).toBe(true)
expect(robots.isAllowed('https://docs.github.com/en')).toBe(true)
expect(
robots.isAllowed('https://docs.github.com/en/articles/verifying-your-email-address'),
).toBe(true)
})

// Sends the request with host docs-internal.github.com and expects the body to be
// exactly a wildcard user-agent followed by a blanket Disallow rule.
test('disallows indexing of internal domains', async () => {
const res = await get('/robots.txt', {
headers: {
host: 'docs-internal.github.com',
},
})
expect(res.body).toEqual('User-agent: *\nDisallow: /')
})

// Compares the number of newline-separated lines with the size of a Set built
// from those lines; they are equal only when no line repeats.
test('does not have duplicate lines', () => {
test('does not have duplicate lines', async () => {
const res: TestResponse = await get('/robots.txt')
expect(res.body.split('\n').length).toBe(new Set(res.body.split('\n')).size)
})

// Checks the cache-control header and that the surrogate-key header contains
// both the default key and the language key for 'en'.
test('is cached by headers', () => {
test('is cached by headers', async () => {
const res: TestResponse = await get('/robots.txt')
expect(res.headers['cache-control']).toMatch(/public, max-age=/)

// The surrogate-key header is split on whitespace into individual keys.
const surrogateKeySplit = (res.headers['surrogate-key'] as string).split(/\s/g)
expect(surrogateKeySplit.includes(SURROGATE_ENUMS.DEFAULT)).toBeTruthy()
expect(surrogateKeySplit.includes(makeLanguageSurrogateKey('en'))).toBeTruthy()
})

// Checks that the body begins with a wildcard User-agent line, the status is 200,
// and the content-type header matches text/plain.
test('validates robots.txt format', async () => {
const res: TestResponse = await get('/robots.txt')
// Should be valid robots.txt format
expect(res.body).toMatch(/^User-agent: \*/)
expect(res.statusCode).toBe(200)
expect(res.headers['content-type']).toMatch(/text\/plain/)
})
})
77 changes: 50 additions & 27 deletions src/tests/helpers/e2etest.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import cheerio from 'cheerio'
import got, { Response, OptionsOfTextResponseBody, Method } from 'got'
import { fetchWithRetry } from '@/frame/lib/fetch-utils'
import { omitBy, isUndefined } from 'lodash-es'

type ResponseTypes = 'buffer' | 'json' | 'text'
Expand All @@ -9,8 +9,8 @@ type ResponseTypeMap = {
text: string
}

interface GetOptions<ResponseType extends ResponseTypes = 'text', M extends Method = 'get'> {
method?: M
interface GetOptions<ResponseType extends ResponseTypes = 'text'> {
method?: string
body?: any
followRedirects?: boolean
followAllRedirects?: boolean
Expand All @@ -26,12 +26,16 @@ interface GetDOMOptions {
retries?: number
}

// Response shape used as the return type of the `get`, `head` and `post` helpers
// in this file; `T` is the type of the response body.
// NOTE(review): two header lines appear because this is a diff rendered without
// +/- markers; presumably the `extends Response<T>` form is the removed one — confirm.
interface ResponseWithHeaders<T> extends Response<T> {
interface ResponseWithHeaders<T> {
body: T
statusCode: number
headers: Record<string, string>
url: string
ok: boolean
}

// Type alias for cached DOM results to improve maintainability.
// A `cheerio.Root` intersected with two extra properties: `res` (a response
// object) and `$` (a `cheerio.Root`).
// NOTE(review): both alias lines are shown because this is a diff rendered
// without +/- markers; presumably the `res: Response` form is the removed one — confirm.
type CachedDOMResult = cheerio.Root & { res: Response; $: cheerio.Root }
type CachedDOMResult = cheerio.Root & { res: ResponseWithHeaders<string>; $: cheerio.Root }

// Cache to store DOM objects, keyed by string.
// NOTE(review): how keys are derived and when entries are added is not visible in this view.
const getDOMCache = new Map<string, CachedDOMResult>()
Expand All @@ -43,43 +47,62 @@ const getDOMCache = new Map<string, CachedDOMResult>()
* @param options - Configuration options for the request.
* @returns A promise that resolves to the HTTP response.
*/
export async function get<T extends ResponseTypes = 'text', M extends Method = 'get'>(
export async function get<T extends ResponseTypes = 'text'>(
route: string,
options: GetOptions<T, M> = {},
options: GetOptions<T> = {},
): Promise<ResponseWithHeaders<ResponseTypeMap[T]>> {
// NOTE(review): this span is a diff rendered without +/- markers. The got-based
// statements (`got[method]`, `xopts`, `fn(...)`) and the fetch-based statements
// (`fetchOptions`, `fetchWithRetry`) are presumably the removed and added sides
// respectively — confirm against the PR before treating this as compilable code.
// Unpack the options, applying defaults: method 'get', no redirect following,
// empty headers, zero retries.
const {
method = 'get',
body,
body: requestBody,
followRedirects = false,
followAllRedirects = false,
headers = {},
responseType,
retries = 0,
} = options

// Ensure the method is a valid function on `got`
const fn = got[method as 'get']
if (!fn || typeof fn !== 'function') {
throw new Error(`No method function for '${method}'`)
}

// Construct the options for the `got` request, omitting undefined values
const xopts: OptionsOfTextResponseBody = omitBy(
// Construct the options for the fetch request
const fetchOptions: RequestInit = omitBy(
{
body,
headers,
retry: { limit: retries },
throwHttpErrors: false,
followRedirect: followAllRedirects || followRedirects,
responseType: responseType || undefined,
// The method name is upper-cased before being passed on.
method: method.toUpperCase(),
body: requestBody,
headers: headers as HeadersInit,
// 'follow' when either redirect flag is set, otherwise 'manual'.
redirect: followAllRedirects || followRedirects ? 'follow' : 'manual',
},
isUndefined,
)

// Perform the HTTP request
// Every request targets http://localhost:4000 with `route` appended.
return (await fn(`http://localhost:4000${route}`, xopts)) as ResponseWithHeaders<
ResponseTypeMap[T]
>
const response = await fetchWithRetry(`http://localhost:4000${route}`, fetchOptions, {
retries,
throwHttpErrors: false,
})

// Get response body based on responseType
// 'json' and 'buffer' are handled explicitly; any other value (including
// undefined) falls through to reading the body as text.
let responseBody: ResponseTypeMap[T]
if (responseType === 'json') {
responseBody = (await response.json()) as ResponseTypeMap[T]
} else if (responseType === 'buffer') {
const arrayBuffer = await response.arrayBuffer()
// NOTE(review): the ArrayBuffer is cast, not converted. The `buffer` entry of
// ResponseTypeMap is not visible here — verify it is ArrayBuffer-compatible.
responseBody = arrayBuffer as ResponseTypeMap[T]
} else {
responseBody = (await response.text()) as ResponseTypeMap[T]
}

// Convert headers to record format
const headersRecord: Record<string, string> = {}
response.headers.forEach((value, key) => {
headersRecord[key] = value
})

// Return response in got-compatible format
return {
body: responseBody,
statusCode: response.status,
headers: headersRecord,
url: response.url,
ok: response.ok,
} as ResponseWithHeaders<ResponseTypeMap[T]>
}

/**
Expand All @@ -92,7 +115,7 @@ export async function get<T extends ResponseTypes = 'text', M extends Method = '
export async function head(
route: string,
opts: { followRedirects?: boolean } = { followRedirects: false },
): Promise<Response<string>> {
): Promise<ResponseWithHeaders<string>> {
// NOTE(review): two return-type lines appear because this is a diff rendered
// without +/- markers; presumably `Promise<Response<string>>` is the removed one — confirm.
// Delegates to `get` with method 'head', forwarding only `followRedirects`.
const res = await get(route, { method: 'head', followRedirects: opts.followRedirects })
return res
}
Expand All @@ -107,7 +130,7 @@ export async function head(
export function post(
route: string,
opts: Omit<GetOptions, 'method'> = {},
): Promise<Response<string>> {
): Promise<ResponseWithHeaders<string>> {
// NOTE(review): two return-type lines appear because this is a diff rendered
// without +/- markers; presumably `Promise<Response<string>>` is the removed one — confirm.
// Delegates to `get`, passing the caller's options through and setting method to 'post'.
return get(route, { ...opts, method: 'post' })
}

Expand Down
Loading