Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 8 additions & 11 deletions bin/automated-update.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ const {getEntity, entities} = require('../lib/')

const bigQuery = new BigQuery()

const HA_REQUESTS_TABLE_REGEX = /`httparchive\.requests\.\w+`/g
const HA_REQUESTS_TABLE_REGEX = /`httparchive\.crawl\.requests`/g
const HA_LH_TABLE_REGEX = /`httparchive\.lighthouse\.\w+`/g
const LH_3P_TABLE_REGEX = /`lighthouse-infrastructure\.third_party_web\.\w+`/g
const DATE_UNDERSCORE_REGEX = /\d{4}_\d{2}_\d{2}/g
const DATE_DASH_REGEX = /\d{4}-\d{2}-\d{2}/g
const LH_PROJECT_REGEX = /lighthouse-infrastructure/g

const TABLE_REPLACEMENTS = process.env.USE_SAMPLE_DATA
Expand All @@ -29,9 +29,9 @@ const TABLE_REPLACEMENTS = process.env.USE_SAMPLE_DATA
[process.env.OVERRIDE_LH_PROJECT, LH_PROJECT_REGEX],
].filter(([override]) => override)

function getQueryForTable(filename, dateUnderscore) {
function getQueryForTable(filename, dateDash) {
const text = fs.readFileSync(filename, 'utf-8')
let query = text.replace(DATE_UNDERSCORE_REGEX, dateUnderscore)
let query = text.replace(DATE_DASH_REGEX, dateDash)
for (const [override, regex] of TABLE_REPLACEMENTS) {
query = query.replace(regex, override)
}
Expand Down Expand Up @@ -155,12 +155,9 @@ async function main() {
exitFn: () => process.exit(1),
})

const mostObservedDomainsQuery = getQueryForTable(
mostObservedDomainsFilename,
dateStringUnderscore
)
const allObservedDomainsQuery = getQueryForTable(allObservedDomainsFilename, dateStringUnderscore)
const entityPerPageQuery = getQueryForTable(entityPerPageFilename, dateStringUnderscore)
const mostObservedDomainsQuery = getQueryForTable(mostObservedDomainsFilename, dateStringHypens)
const allObservedDomainsQuery = getQueryForTable(allObservedDomainsFilename, dateStringHypens)
const entityPerPageQuery = getQueryForTable(entityPerPageFilename, dateStringHypens)

// 1. Get and write in 'observed-domains' json file domains observed more than 50 times
await withExistenceCheck(observedDomainsFilename, {
Expand Down Expand Up @@ -206,7 +203,7 @@ async function main() {
const domainEntityMapping = entities.reduce((array, {name, domains}) => {
return array.concat(domains.map(domain => ({name, domain})))
}, [])
const thirdPartyWebTableWriterStream = await getThirdPartyWebTable(dateStringUnderscore).then(
const thirdPartyWebTableWriterStream = await getThirdPartyWebTable(dateStringHypens).then(
table =>
table.createWriteStream({
sourceFormat: 'NEWLINE_DELIMITED_JSON',
Expand Down
6 changes: 5 additions & 1 deletion sql/all-observed-domains-query.sql
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,11 @@ FROM
page,
NET.HOST(url) AS domain
FROM
`httparchive.requests.2022_01_01_mobile`
`httparchive.crawl.requests`
WHERE
date = "2022-01-01"
AND
client = "mobile"
GROUP BY
page,
domain
Expand Down
12 changes: 8 additions & 4 deletions sql/entity-per-page.sql
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,19 @@ FROM
FROM
(
SELECT
url AS page,
report
page,
lighthouse as report
FROM
`httparchive.lighthouse.2022_01_01_mobile`
`httparchive.crawl.pages`
WHERE
date = "2022-01-01"
AND
client = "mobile"
),
UNNEST (
JSON_QUERY_ARRAY(report, '$.audits.bootup-time.details.items')
) AS bootupTimeItems
INNER JOIN `lighthouse-infrastructure.third_party_web.2022_01_01` ON NET.HOST(JSON_VALUE(bootupTimeItems, "$.url")) = domain
INNER JOIN `lighthouse-infrastructure.third_party_web.2022-01-01` ON NET.HOST(JSON_VALUE(bootupTimeItems, "$.url")) = domain
)
WHERE
canonicalDomain IS NOT NULL
Expand Down
6 changes: 5 additions & 1 deletion sql/most-observed-domains-query.sql
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,11 @@ FROM
NET.HOST(url) AS domain,
COUNT(0) AS totalOccurrences
FROM
`httparchive.requests.2022_01_01_mobile`
`httparchive.crawl.requests`
WHERE
date = "2022-01-01"
AND
client = "mobile"
GROUP BY
page,
domain
Expand Down