Skip to content

Commit e482ee9

Browse files
authored
fix: Use new HAR datasets to compute data (#248)
* feat: use new HAR crawl dataset to fetch HTTP Archive data * fix: use the right dataset name
1 parent b7ec692 commit e482ee9

File tree

4 files changed

+26
-17
lines changed

4 files changed

+26
-17
lines changed

bin/automated-update.js

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@ const {getEntity, entities} = require('../lib/')
1010

1111
const bigQuery = new BigQuery()
1212

13-
const HA_REQUESTS_TABLE_REGEX = /`httparchive\.requests\.\w+`/g
13+
const HA_REQUESTS_TABLE_REGEX = /`httparchive\.crawl\.requests`/g
1414
const HA_LH_TABLE_REGEX = /`httparchive\.lighthouse\.\w+`/g
1515
const LH_3P_TABLE_REGEX = /`lighthouse-infrastructure\.third_party_web\.\w+`/g
16-
const DATE_UNDERSCORE_REGEX = /\d{4}_\d{2}_\d{2}/g
16+
const DATE_DASH_REGEX = /\d{4}-\d{2}-\d{2}/g
1717
const LH_PROJECT_REGEX = /lighthouse-infrastructure/g
1818

1919
const TABLE_REPLACEMENTS = process.env.USE_SAMPLE_DATA
@@ -29,9 +29,9 @@ const TABLE_REPLACEMENTS = process.env.USE_SAMPLE_DATA
2929
[process.env.OVERRIDE_LH_PROJECT, LH_PROJECT_REGEX],
3030
].filter(([override]) => override)
3131

32-
function getQueryForTable(filename, dateUnderscore) {
32+
function getQueryForTable(filename, dateDash) {
3333
const text = fs.readFileSync(filename, 'utf-8')
34-
let query = text.replace(DATE_UNDERSCORE_REGEX, dateUnderscore)
34+
let query = text.replace(DATE_DASH_REGEX, dateDash)
3535
for (const [override, regex] of TABLE_REPLACEMENTS) {
3636
query = query.replace(regex, override)
3737
}
@@ -155,12 +155,9 @@ async function main() {
155155
exitFn: () => process.exit(1),
156156
})
157157

158-
const mostObservedDomainsQuery = getQueryForTable(
159-
mostObservedDomainsFilename,
160-
dateStringUnderscore
161-
)
162-
const allObservedDomainsQuery = getQueryForTable(allObservedDomainsFilename, dateStringUnderscore)
163-
const entityPerPageQuery = getQueryForTable(entityPerPageFilename, dateStringUnderscore)
158+
const mostObservedDomainsQuery = getQueryForTable(mostObservedDomainsFilename, dateStringHypens)
159+
const allObservedDomainsQuery = getQueryForTable(allObservedDomainsFilename, dateStringHypens)
160+
const entityPerPageQuery = getQueryForTable(entityPerPageFilename, dateStringHypens)
164161

165162
// 1. Get and write in 'observed-domains' json file domains observed more than 50 times
166163
await withExistenceCheck(observedDomainsFilename, {
@@ -206,7 +203,7 @@ async function main() {
206203
const domainEntityMapping = entities.reduce((array, {name, domains}) => {
207204
return array.concat(domains.map(domain => ({name, domain})))
208205
}, [])
209-
const thirdPartyWebTableWriterStream = await getThirdPartyWebTable(dateStringUnderscore).then(
206+
const thirdPartyWebTableWriterStream = await getThirdPartyWebTable(dateStringHypens).then(
210207
table =>
211208
table.createWriteStream({
212209
sourceFormat: 'NEWLINE_DELIMITED_JSON',

sql/all-observed-domains-query.sql

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,11 @@ FROM
2525
page,
2626
NET.HOST(url) AS domain
2727
FROM
28-
`httparchive.requests.2022_01_01_mobile`
28+
`httparchive.crawl.requests`
29+
WHERE
30+
date = "2022-01-01"
31+
AND
32+
client = "mobile"
2933
GROUP BY
3034
page,
3135
domain

sql/entity-per-page.sql

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,19 @@ FROM
2828
FROM
2929
(
3030
SELECT
31-
url AS page,
32-
report
31+
page,
32+
lighthouse as report
3333
FROM
34-
`httparchive.lighthouse.2022_01_01_mobile`
34+
`httparchive.crawl.pages`
35+
WHERE
36+
date = "2022-01-01"
37+
AND
38+
client = "mobile"
3539
),
3640
UNNEST (
3741
JSON_QUERY_ARRAY(report, '$.audits.bootup-time.details.items')
3842
) AS bootupTimeItems
39-
INNER JOIN `lighthouse-infrastructure.third_party_web.2022_01_01` ON NET.HOST(JSON_VALUE(bootupTimeItems, "$.url")) = domain
43+
INNER JOIN `lighthouse-infrastructure.third_party_web.2022-01-01` ON NET.HOST(JSON_VALUE(bootupTimeItems, "$.url")) = domain
4044
)
4145
WHERE
4246
canonicalDomain IS NOT NULL

sql/most-observed-domains-query.sql

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,11 @@ FROM
88
NET.HOST(url) AS domain,
99
COUNT(0) AS totalOccurrences
1010
FROM
11-
`httparchive.requests.2022_01_01_mobile`
11+
`httparchive.crawl.requests`
12+
WHERE
13+
date = "2022-01-01"
14+
AND
15+
client = "mobile"
1216
GROUP BY
1317
page,
1418
domain

0 commit comments

Comments
 (0)