Skip to content

Commit c2c609e

Browse files
authored
Merge pull request #30 from storacha/feat/optimization
feat(fetcher): first step at optimization
2 parents 4566f96 + 6d95105 commit c2c609e

File tree

8 files changed

+333
-74
lines changed

8 files changed

+333
-74
lines changed

package.json

+6-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@web3-storage/blob-fetcher",
3-
"version": "2.4.3",
3+
"version": "2.4.4-rc.0",
44
"description": "A blob fetcher that batches requests and reads multipart byterange responses.",
55
"main": "src/index.js",
66
"type": "module",
@@ -50,11 +50,16 @@
5050
"./locator/indexing-service": {
5151
"import": "./src/locator/indexing-service/index.js",
5252
"types": "./dist/src/locator/indexing-service/index.d.ts"
53+
},
54+
"./tracing/tracing": {
55+
"import": "./src/tracing/tracing.js",
56+
"types": "./dist/src/tracing/tracing.d.ts"
5357
}
5458
},
5559
"dependencies": {
5660
"@cloudflare/workers-types": "^4.20241022.0",
5761
"@ipld/dag-ucan": "^3.4.0",
62+
"@opentelemetry/api": "^1.9.0",
5863
"@storacha/indexing-service-client": "^2.0.0",
5964
"@ucanto/interface": "^10.0.1",
6065
"@web3-storage/blob-index": "^1.0.2",

pnpm-lock.yaml

+9
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/api.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import { ByteView, MultihashDigest } from 'multiformats'
22
import { Failure, Result, URI, DID } from '@ucanto/interface'
3-
import { Range } from 'multipart-byte-range'
43
import { QueryError } from '@storacha/indexing-service-client/api'
4+
import { Range } from 'multipart-byte-range'
55

66
export { ByteView, MultihashDigest } from 'multiformats'
77
export { Failure, Result, URI, DID, Principal } from '@ucanto/interface'

src/fetcher/batching.js

+125-42
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@
22
import * as API from '../api.js'
33
import { DigestMap } from '@web3-storage/blob-index'
44
import defer from 'p-defer'
5-
import { MultipartByteRangeDecoder, getBoundary } from 'multipart-byte-range/decoder'
65
import { NetworkError, NotFoundError } from '../lib.js'
76
import { fetchBlob } from './simple.js'
87
import { resolveRange } from './lib.js'
8+
import { withAsyncGeneratorSpan, withResultSpan } from '../tracing/tracing.js'
9+
import { MultipartByteRangeDecoder, getBoundary } from 'multipart-byte-range'
910

1011
/**
1112
* @typedef {'*'|`${number},${number}`|`${number}`} RangeKey
@@ -18,6 +19,7 @@ const MAX_BATCH_SIZE = 16
1819
/** @implements {API.Fetcher} */
1920
class BatchingFetcher {
2021
#locator
22+
#fetch
2123

2224
/** @type {DigestMap<API.MultihashDigest, RangedRequests>} */
2325
#pendingReqs = new DigestMap()
@@ -30,9 +32,13 @@ class BatchingFetcher {
3032
/** @type {Promise<void>|null} */
3133
#processing = null
3234

33-
/** @param {API.Locator} locator */
34-
constructor (locator) {
35+
/**
36+
* @param {API.Locator} locator
37+
* @param {typeof globalThis.fetch} [fetch]
38+
*/
39+
constructor (locator, fetch = globalThis.fetch.bind(globalThis)) {
3540
this.#locator = locator
41+
this.#fetch = fetch
3642
}
3743

3844
#scheduleBatchProcessing () {
@@ -66,6 +72,14 @@ class BatchingFetcher {
6672
const pendingReqs = this.#pendingReqs
6773
this.#pendingReqs = new DigestMap()
6874

75+
// Basic algorithm
76+
// 1. assemble each http request
77+
// 2. fire off request
78+
// 3. once first byte received, begin processing the response async in background
79+
// 4. immediately go to next http request, but after first iteration, wait so that we're never processing the body
80+
// of more than one response at a time
81+
/** @type {Promise<API.Result<true, API.NotFound|API.Aborted|API.NetworkError>> | undefined } */
82+
let lastResolveBlobs
6983
while (true) {
7084
const first = queue.shift()
7185
if (!first) break
@@ -84,16 +98,25 @@ class BatchingFetcher {
8498
if (locs.length >= MAX_BATCH_SIZE) break
8599
}
86100

87-
const res = await fetchBlobs(siteURL, locs)
88-
if (res.error) break
89-
for (const [i, blob] of res.ok.entries()) {
90-
const rangeReqs = pendingReqs.get(blob.digest)
91-
const key = rangeKey(locs[i].range)
92-
const reqs = rangeReqs?.get(key)
93-
reqs?.[0].resolve({ ok: blob })
94-
reqs?.slice(1).forEach(r => r.resolve({ ok: blob.clone() }))
95-
rangeReqs?.delete(key)
101+
const fetchRes = await fetchBlobs(siteURL, locs, this.#fetch)
102+
// if we have an error, stop
103+
if (fetchRes.error) {
104+
break
105+
}
106+
// if we are still processing the previous response, we should wait before we process this response
107+
if (lastResolveBlobs !== undefined) {
108+
const resolveRes = await lastResolveBlobs
109+
lastResolveBlobs = undefined
110+
if (resolveRes.error) {
111+
break
112+
}
96113
}
114+
lastResolveBlobs = resolveRequests(fetchRes.ok, pendingReqs)
115+
}
116+
117+
// await the last call to resolve blobs
118+
if (lastResolveBlobs !== undefined) {
119+
await lastResolveBlobs
97120
}
98121

99122
// resolve `undefined` for any remaining requests
@@ -135,83 +158,143 @@ class BatchingFetcher {
135158
/**
136159
* Create a new batching blob fetcher.
137160
* @param {API.Locator} locator
161+
* @param {typeof globalThis.fetch} [fetch]
138162
* @returns {API.Fetcher}
139163
*/
140-
export const create = (locator) => new BatchingFetcher(locator)
164+
export const create = (locator, fetch = globalThis.fetch.bind(globalThis)) => new BatchingFetcher(locator, fetch)
165+
166+
/** @typedef {{range: API.AbsoluteRange, digest: API.MultihashDigest, orig: API.Range | undefined}} ResolvedBlobs */
141167

142168
/**
143169
* Fetch blobs from the passed locations. The locations MUST share a common
144170
* site to fetch from.
145-
*
171+
*/
172+
export const fetchBlobs = withResultSpan('fetchBlobs', _fetchBlobs)
173+
174+
/**
146175
* @param {URL} url Desired URL to fetch blobs from.
147176
* @param {Array<{ location: API.Location, range?: API.Range }>} locations
148-
* @returns {Promise<API.Result<API.Blob[], API.NotFound|API.Aborted|API.NetworkError>>}
177+
* @param {typeof globalThis.fetch} [fetch]
178+
* @returns {Promise<API.Result<AsyncGenerator<BlobResult, API.Result<true, API.NotFound|API.Aborted|API.NetworkError>>, API.NotFound|API.Aborted|API.NetworkError>>}
149179
*/
150-
export const fetchBlobs = async (url, locations) => {
180+
async function _fetchBlobs (url, locations, fetch = globalThis.fetch.bind(globalThis)) {
151181
if (locations.length === 1) {
152-
const res = await fetchBlob(locations[0].location, locations[0].range)
182+
const res = await fetchBlob(locations[0].location, locations[0].range, fetch)
153183
if (res.error) return res
154-
return { ok: [res.ok] }
184+
return {
185+
ok: (async function * () {
186+
yield { blob: res.ok, range: locations[0].range }
187+
return { ok: true }
188+
}())
189+
}
155190
}
156191

157-
const ranges = []
192+
// resolve ranges for blobs
193+
194+
/** @type {ResolvedBlobs[]} */
195+
const resolvedBlobs = []
158196
for (const { location, range } of locations) {
159197
for (const s of location.site) {
160198
let found = false
161199
for (const l of s.location) {
162200
if (l.toString() === url.toString()) {
163-
/** @type {import('multipart-byte-range').AbsoluteRange} */
201+
/** @type {API.AbsoluteRange} */
164202
let resolvedRange = [s.range.offset, s.range.offset + s.range.length - 1]
165203
if (range) {
166204
const relRange = resolveRange(range, s.range.length)
167205
resolvedRange = [s.range.offset + relRange[0], s.range.offset + relRange[1]]
168206
}
169-
ranges.push(resolvedRange)
207+
resolvedBlobs.push({
208+
digest: location.digest,
209+
range: resolvedRange,
210+
orig: range
211+
})
170212
found = true
171213
break
172214
}
173215
}
174216
if (found) break
175217
}
176218
}
177-
if (ranges.length !== locations.length) {
219+
if (resolvedBlobs.length !== locations.length) {
178220
throw new Error('no common site')
179221
}
180222

181-
const headers = { Range: `bytes=${ranges.map(r => `${r[0]}-${r[1]}`).join(',')}` }
223+
const headers = { Range: `bytes=${resolvedBlobs.map(r => `${r.range[0]}-${r.range[1]}`).join(',')}` }
182224
try {
183225
const res = await fetch(url, { headers })
184226
if (!res.ok) {
185227
return { error: new NetworkError(url, { cause: new Error(`unexpected HTTP status: ${res.status}`) }) }
186228
}
229+
return { ok: consumeBatchResponse(url, resolvedBlobs, res) }
230+
} catch (err) {
231+
return { error: new NetworkError(url, { cause: err }) }
232+
}
233+
}
187234

188-
if (!res.body) {
189-
return { error: new NetworkError(url, { cause: new Error('missing repsonse body') }) }
190-
}
235+
/** @typedef {{blob: API.Blob, range: API.Range | undefined}} BlobResult */
191236

192-
const boundary = getBoundary(res.headers)
193-
if (!boundary) {
194-
return { error: new NetworkError(url, { cause: new Error('missing multipart boundary') }) }
195-
}
237+
/**
238+
* Consumes a batch response to create multiple blobs. Will break up
239+
* a byte range going from the first byte of the first blob to the last byte of the last blob
240+
* into appropriate ranges for each blob
241+
*/
242+
const consumeBatchResponse = withAsyncGeneratorSpan('consumeBatchResponse', _consumeBatchResponse)
196243

197-
/** @type {API.Blob[]} */
198-
const blobs = []
199-
let i = 0
200-
await res.body
201-
.pipeThrough(new MultipartByteRangeDecoder(boundary))
202-
.pipeTo(new WritableStream({
203-
write (part) {
204-
blobs.push(new Blob(locations[i].location.digest, part.content))
205-
i++
206-
}
207-
}))
244+
/**
245+
* @param {URL} url
246+
* @param {ResolvedBlobs[]} resolvedBlobs
247+
* @param {Response} res
248+
* @returns {AsyncGenerator<BlobResult, API.Result<true, API.NotFound|API.Aborted|API.NetworkError>>}
249+
*/
250+
async function * _consumeBatchResponse (url, resolvedBlobs, res) {
251+
if (!res.body) {
252+
return { error: new NetworkError(url, { cause: new Error('missing repsonse body') }) }
253+
}
254+
255+
const boundary = getBoundary(res.headers)
256+
if (!boundary) {
257+
return { error: new NetworkError(url, { cause: new Error('missing multipart boundary') }) }
258+
}
259+
260+
let i = 0
208261

209-
return { ok: blobs }
262+
try {
263+
for await (const chunk of res.body.pipeThrough(new MultipartByteRangeDecoder(boundary))) {
264+
// generate blob out of the current buffer
265+
const blob = new Blob(resolvedBlobs[i].digest, chunk.content)
266+
yield ({ blob, range: resolvedBlobs[i].orig })
267+
i++
268+
}
269+
return { ok: true }
210270
} catch (err) {
211271
return { error: new NetworkError(url, { cause: err }) }
212272
}
213273
}
214274

275+
/**
276+
* Resolve pending requests from blobs generated out of the last fetch
277+
*
278+
* @param {AsyncGenerator<BlobResult, API.Result<true, API.NotFound|API.Aborted|API.NetworkError>>} blobResults
279+
* @param {DigestMap<API.MultihashDigest, RangedRequests>} pendingReqs
280+
* @returns {Promise<API.Result<true, API.NotFound|API.Aborted|API.NetworkError>>}
281+
*/
282+
const resolveRequests = async (blobResults, pendingReqs) => {
283+
for (;;) {
284+
const { value: result, done } = await blobResults.next()
285+
if (done) {
286+
return result
287+
}
288+
const { blob, range } = result
289+
const rangeReqs = pendingReqs.get(blob.digest)
290+
const key = rangeKey(range)
291+
const reqs = rangeReqs?.get(key)
292+
reqs?.[0].resolve({ ok: blob })
293+
reqs?.slice(1).forEach(r => r.resolve({ ok: blob.clone() }))
294+
rangeReqs?.delete(key)
295+
}
296+
}
297+
215298
/** @implements {API.Blob} */
216299
class Blob {
217300
#digest

src/fetcher/lib.js

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/**
2-
* @param {import('multipart-byte-range').Range} range
2+
* @param {import('../api.js').Range} range
33
* @param {number} totalSize
4-
* @returns {import('multipart-byte-range').AbsoluteRange}
4+
* @returns {import('../api.js').AbsoluteRange}
55
*/
66
export const resolveRange = (range, totalSize) => {
77
let last = range[1]

0 commit comments

Comments
 (0)