-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprerender.ts
343 lines (303 loc) · 14 KB
/
prerender.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
import type { NextApiRequest, NextApiResponse } from "next";
import prerendercloud from "prerendercloud";
// TL;DR: rewrites defined in next.config.js forward bot traffic to this function,
// which pre-renders the page using Headless-Render-API and returns that pre-rendered page
// to the user, caching it in Vercel's CDN until the next deploy or until CACHE_CONTROL_MAX_AGE_SECONDS elapses.
// With default settings, this middleware can be ignored during development, since it's
// only active for certain user-agents (twitterbot, slackbot, etc.) defined in next.config.js.
// To test in development mode:
// `DEBUG=prerendercloud PORT=3000 npm run dev`
// in another terminal launch ngrok (or ssh tunnel) to expose the port
// `ngrok http 3000`
// now that you have a public URL, you can test (after replacing ngrok URL):
// `curl -XGET -i -Atwitterbot 'https://$NGROK_SUB_DOMAIN.ngrok.io/react-router-page-1'`
// and verify pre-rendered content by searching for: "prerender.cloud processed at"
// Read the 5 steps below to understand how it works and configure options:
// STEP 1: Specify pages to be pre-rendered in the `afterFiles` field of next.config.js's 'rewrites()' function
//
// Note 1: `afterFiles` is used rather than `beforeFiles` so requests for static files like index.js, style.css are
// handled first by the vercel router checking the disk
// Note 2: the `has` field is used to restrict this rewrite rule to specific user-agents we want to be
// served pre-rendered content. This same user-agent restrictive filter is also built-in to the
// prerendercloud package, but is much faster when done at the Vercel router layer via their rewrites config.
/*
```
async rewrites() {
return {
afterFiles: [
{
source: "/:path*",
destination: "/api/prerender",
has: [
{
type: "header",
key: "user-agent",
value: "(slackbot|twitterbot).*",
},
],
},
],
```
*/
// STEP 2: make sure your project root includes an .env file with
// the `PRERENDER_TOKEN` environment variable set to your Headless-Render-API API token
// OR you can hardcode it: prerendercloud.set('prerenderToken', 'mySecretToken')
// OR you can configure it from Vercel's web console for env vars in project settings.
// STEP 3: consider configuring how long pre-rendered responses are cached.
// This is the number of seconds a response from the pre-rendering API (Headless-Render-API.com)
// may live in Vercel's "cache-control" cache. The cache key comes from the requested URL.
const CACHE_CONTROL_MAX_AGE_SECONDS = 24 * 60 * 60; // one day
// Vercel docs claim this "cache-control" cache clears on each deploy (if your site is using a custom domain).
// https://vercel.com/docs/concepts/functions/serverless-functions/edge-caching#cache-invalidation-and-purging
// Sidebar: I'm calling it "cache-control" cache, because that's how we're allowed to control it. Not sure
// what else it's called, possibly CDN cache or "edge-cache".
// STEP 4: place this file (prerender.ts) in the pages/api directory
// and run `vercel deploy --prod`
// STEP 5: testing
// curl -XGET -i -Atwitterbot https://messari.io/asset/bitcoin
// confirm existence of open graph tags
// make same request again and notice it came back instantly from cache
// (see cache-control header)
// STEP 5: done --------------------------------------------------------------
// Configure the prerendercloud library as a top-level statement of this module.
// See the configurePrenderLib function for opinions on how each option is set.
configurePrenderLib(prerendercloud);
// This is a Vercel serverless function, not an edge function. The difference
// is this takes a request and a response whereas the edge functions only take a request
// and require an additional export: `export const config { runtime: 'experimental-edge'}`
//
// Note: the prerendercloud library is expressjs middleware, so its interface
// is (req, res, next). Thus, this Vercel serverless function merely
// converts NextJS's NextApiRequest and NextApiResponse to the expressjs interface
// and passes to the prerendercloud library. The same approach is used for
// Amazon CloudFront here: https://github.com/sanfrancesco/prerendercloud-lambda-edge/tree/master/lib
/**
 * Vercel serverless entry point: adapts the Next.js request/response pair to
 * the express-style (req, res, next) interface and hands it to the
 * prerendercloud middleware.
 *
 * @param nextApiReq - incoming Next.js API request
 * @param nextApiResponse - Next.js API response the result is written to
 * @returns a promise that resolves once the shim has finished responding
 */
export default function handler(
  nextApiReq: NextApiRequest,
  nextApiResponse: NextApiResponse
) {
  // For more verbose logging while debugging, log nextApiReq.url and
  // nextApiReq.headers["user-agent"] here.

  // A direct request to example.com/api/prerender would hit this function and
  // the prerendercloud middleware would then try to pre-render this very
  // endpoint, which will most likely cause an infinite loop. Refuse it.
  const requestedPath = nextApiReq.url;
  if (requestedPath?.startsWith("/api/prerender")) {
    nextApiResponse.status(404).send("not found");
    return Promise.resolve();
  }

  // Build the mock req/res/next objects expected by the prerendercloud
  // library, which typically runs as express middleware.
  const shim = convertNextJsToReqResNextHandler(nextApiReq, nextApiResponse);

  // Run the middleware against the mocked objects.
  prerendercloud(shim.req, shim.res, shim.next);

  // Resolve only after the shim signals that the response has been sent.
  return shim.waitForPromise.then(() => {});
}
/**
 * Builds the express-style `req` / `res` / `next` trio that the prerendercloud
 * middleware expects, backed by a Next.js API request/response pair.
 *
 * @param nextApiReq - incoming Next.js API request (url, method and headers are read)
 * @param nextApiRes - Next.js API response that all output is forwarded to
 * @returns the mocked `req`, `res` and `next`, plus `waitForPromise`, which
 *          resolves once `res.end` or the `next` fallback has sent a response
 * @throws Error from `res.end` when a 301 was written without a location header
 */
function convertNextJsToReqResNextHandler(
  nextApiReq: NextApiRequest,
  nextApiRes: NextApiResponse
): fakeHandler {
  const protocol = nextApiReq.headers.host?.includes("localhost")
    ? "http"
    : "https";

  // note: nextApiReq.url is the pathname (excludes host) of the request before the rewrite
  // also note: in vercel's production environment nextApiReq.url is still just the path, but
  // also has a query parameter ?path appended with the originally requested path
  // for example:
  // twitterbot user-agent requests: /asset/bitcoin
  // vercel dev rewrites to: /api/prerender/asset/bitcoin
  // vercel prod rewrites to: /api/prerender/asset/bitcoin?path=/asset/bitcoin
  const originallyRequestedUrlWithProtocolAndHost = `${protocol}://${nextApiReq.headers.host}${nextApiReq.url}`;

  // TODO:
  // * figure out whether to strip the :path query from the url
  // * update prerendercloud to try/catch around call to process (so it works with edge functions)
  // * update prerendercloud to expose got function and/or fetch index.html
  // * upload API key
  // * test it in prod to ensure caching isn't caching the wrong URLs (maybe ok to append :path to the url)

  // Local mirror of the response headers, keyed by lower-cased name so that
  // lookups are case-insensitive (HTTP header names are case-insensitive).
  const headers: NodeJS.Dict<string | undefined> = {};
  // Status most recently passed to writeHead; undefined until then.
  let status: number | undefined;

  const url = nextApiReq.url;
  console.log({
    originallyRequestedUrlWithProtocolAndHost,
    url,
    userAgent: nextApiReq.headers["user-agent"],
  });

  const req = {
    // NOTE(review): always reported as encrypted, even when `protocol` above
    // resolves to "http" for localhost - confirm this is intended.
    connection: { encrypted: true },
    method: nextApiReq.method,
    originalUrl: url,
    url,
    headers: {
      host: nextApiReq.headers.host,
      "user-agent": nextApiReq.headers["user-agent"],
    },
  };

  let callWhenDone: (value: unknown) => void;
  const waitForPromise = new Promise((resolve) => {
    callWhenDone = resolve;
  });

  // Records the header locally and forwards it to the real response.
  // Defined as a standalone closure so writeHead does not depend on `this`.
  const setHeader = (key: string, val: string): void => {
    headers[key.toLowerCase()] = val;
    nextApiRes.setHeader(key, val);
  };

  const res = {
    // the vary package in prerendercloud needs getHeader and setHeader
    getHeader(key: string): string | undefined {
      return headers[key.toLowerCase()];
    },
    setHeader,
    // The headers argument is optional so that writeHead(status) alone does not crash.
    writeHead(
      _status: number,
      _headers: NodeJS.Dict<string | undefined> = {}
    ): void {
      status = _status;
      nextApiRes.status(_status);
      for (const [headerKey, header] of Object.entries(_headers)) {
        // skip headers without a usable value
        if (!header) {
          continue;
        }
        setHeader(headerKey, header);
      }
    },
    end(body: string): void {
      // special handler for 301 to use nextJS's redirect method
      if (status === 301) {
        const location = headers["location"];
        if (!location) {
          throw new Error("no location header");
        }
        // Pass the status explicitly: per the Next.js docs, redirect(url)
        // without a status responds with 307, which would turn the
        // permanent redirect into a temporary one.
        nextApiRes.redirect(301, location);
        callWhenDone(null);
        return;
      }

      // vercel will cache the response to this function, using the requested URL as a cache key,
      // if a cache-control header is set on the response.
      // their recommendation is here (note the first max-age=0 is for the user's browser, the 2nd is for vercel):
      // https://vercel.com/docs/concepts/functions/serverless-functions/edge-caching#recommended-cache-control
      nextApiRes.setHeader(
        "Cache-Control",
        `max-age=0, s-maxage=${CACHE_CONTROL_MAX_AGE_SECONDS}, stale-while-revalidate`
      );
      nextApiRes.send(body);
      callWhenDone(null);
    },
  };

  // Fallback used when the middleware decides not to pre-render: fetch the
  // originally requested URL and relay its body.
  function next(): void {
    // Thoughts on implementing this next interface:
    // First of all, not critical: since we're already restricting the traffic driven to this function
    // via afterFiles and bot filtering rewrite rule, this will rarely be called. But if we did want to call it
    // (for example: after repeated 500 errors from upstream Headless-Render-API)
    // 1. it seems like there's no `rewrite` method on nextApiRes (to rewrite to /index.html)
    // 2. and calling next (assuming it was even possible) doesn't make sense since we've already
    //    used the `rewrites` feature to rewrite the URL to /api/prerender/:path*
    // 3. but we could simply fetch the /index.html and return the response
    prerendercloud
      ._got(originallyRequestedUrlWithProtocolAndHost, {
        encoding: null,
        retries: 3,
        headers: {
          "user-agent": "prerender-vercel-middleware-nextfn",
        },
      })
      .then((upstream: { body: unknown }) => {
        nextApiRes.send(upstream.body);
        callWhenDone(null);
      })
      .catch((err: unknown) => {
        // String() also copes with null/undefined rejections, where
        // err.toString() would itself throw.
        nextApiRes.status(500).send(String(err));
        callWhenDone(null);
      });
  }

  return { req, res, next, waitForPromise };
}
/**
 * Applies this project's opinionated configuration to a prerendercloud
 * library instance by calling its `set(key, value)` method.
 *
 * Every option is set on the instance passed in, never on the module-level
 * import, so the function honors its parameter.
 *
 * @param pcloudlib - the prerendercloud library (anything exposing `set(key, value)`)
 */
function configurePrenderLib(pcloudlib: any) {
  // Otto recommendation: leave commented out for default config of 5 minute server cache
  //
  // Headless-Render-API's responses, by default, are cached for 5 minutes
  // at service.headless-render-api.com's servers
  // but since we're using Vercel's Cache-Control, we probably don't care
  // about caching data in Headless-Render-API since we're unlikely
  // to benefit from a 2nd, Nth response coming from cache
  // UNLESS the first request times out in this handler, but finishes behind
  // the scenes at Headless-Render-API (and also gets cached) so then our 2nd request
  // comes back immediately from that cache
  // pcloudlib.set("disableServerCache", true);

  // Otto recommendation: keep this enabled since the primary use case is open graph and/or meta
  // tags; by only including the pre-rendered meta tags we avoid any possible
  // bugs caused by rehydrating pre-rendered content.
  // With this on, the original index.html is returned with the pre-rendered content between
  // the <head> and </head> tags only. Comment it out to serve the full pre-rendered page.
  pcloudlib.set("metaOnly", () => true);

  // Otto recommendation: keep this enabled to sanitize the URLs as much as possible and prevent
  // abusive bots trolling random URLs like /assets/bitcoin?wpadmin=1234
  // if you need some query params to be respected for the pre-render, add them
  // explicitly to this array, else they will be removed from the URL
  pcloudlib.set("whitelistQueryParams", () => []);

  // options below unlikely to change ----------------------------------------

  // disabling because we're not using Headless-Render-API's mocked/stubbed XHR calls feature
  pcloudlib.set("disableAjaxBypass", true);
  // disabling because we're not using Headless-Render-API's mocked/stubbed XHR calls feature
  pcloudlib.set("disableAjaxPreload", true);

  // comment this out (enable feature) if it becomes obvious that we're getting duplicate
  // meta tags from the pre-rendering + client hydration and/or
  // it's causing problems for our use case (pre-rendering twitterbot, telegram, etc.)
  pcloudlib.set("disableHeadDedupe", true);

  // we use vercel's cache-control cache so we don't need this middleware local cache
  // (nor does an in-memory cache for a serverless function make much sense)
  // pcloudlib.set('enableMiddlewareCache', true);
  // pcloudlib.set('middlewareCacheMaxBytes', 1000000000); // 1GB

  // we are already filtering the user-agent (for bots only) in vercel's rewrite section
  // of next.config.js, so we don't need to do it again here via botsOnly or lists.
  //
  // pcloudlib.set("botsOnly", true);
  //
  // likewise: whitelistPaths/blacklistPaths.
  //
  // pcloudlib.set("whitelistPaths", req => [
  //   "/docs",
  //   "/docs/",
  //   /\/users\/\d{1,6}\/profile$/, // without the ending $, this is equivalent to startsWith
  //   /\/users\/\d{1,6}\/profile\/?$/, // note the optional ending slash (\/?) and $
  //   "/google-domain-verification.html",
  //   "/google-domain-verification/",
  // ]);
  // pcloudlib.set("blacklistPaths", req => [
  //   "/google-domain-verification",
  //   "/google-domain-verification.html",
  //   "/google-domain-verification/",
  //   "/image-gallery/*",
  // ]);
}
// Types describing the express-style shim (req / res / next) that
// convertNextJsToReqResNextHandler returns for the prerendercloud middleware.

// Minimal request object handed to the middleware.
interface FakeExpressRequest {
  connection: any;
  method: string | undefined;
  originalUrl: string | undefined;
  url: string | undefined;
  headers: {
    host: string | undefined;
    "user-agent": string | undefined;
  };
}

// Minimal response object handed to the middleware.
interface FakeExpressResponse {
  getHeader: (key: string) => string | undefined;
  setHeader: (key: string, val: string) => void;
  writeHead: (status: number, headers: Record<string, string>) => void;
  end: (body: string) => void;
}

// The full shim: mocked req/res/next plus a promise the caller can wait on.
interface fakeHandler {
  req: FakeExpressRequest;
  res: FakeExpressResponse;
  next: () => void;
  waitForPromise: Promise<unknown>;
}