Skip to content

Commit

Permalink
Update (almost) all existing queries to use latest httparchive.crawl …
Browse files Browse the repository at this point in the history
…dataset.
  • Loading branch information
felixarntz committed Jan 8, 2025
1 parent b79d4c3 commit a55b577
Show file tree
Hide file tree
Showing 36 changed files with 403 additions and 376 deletions.
28 changes: 10 additions & 18 deletions sql/2022/11/performance-lab-version-distribution.sql
Original file line number Diff line number Diff line change
Expand Up @@ -15,26 +15,18 @@
# limitations under the License.

SELECT
info as version,
COUNT(DISTINCT url) AS sites,
COUNT(DISTINCT url) / total AS pct_sites
version,
COUNT(DISTINCT page) AS sites,
COUNT(DISTINCT page) / SUM(COUNT(DISTINCT page)) OVER () AS pct_sites
FROM
`httparchive.technologies.2022_10_01_*`
JOIN (
SELECT
app,
COUNT(DISTINCT url) AS total
FROM
`httparchive.technologies.2022_10_01_*`
GROUP BY
app
)
USING
(app)
`httparchive.crawl.pages`,
UNNEST(technologies) AS technology,
UNNEST(technology.info) AS version
WHERE
app = 'Performance Lab'
date = '2022-10-01'
AND is_root_page
AND technology.technology = 'Performance Lab'
GROUP BY
version,
total
version
ORDER BY
sites DESC
23 changes: 9 additions & 14 deletions sql/2022/12/inaccurate-sizes-attribute-impact.sql
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

# See query results here: https://github.com/GoogleChromeLabs/wpp-research/pull/19
CREATE TEMPORARY FUNCTION
getSrcsetSizesAccuracy(payload STRING)
getSrcsetSizesAccuracy(responsive_images JSON)
RETURNS ARRAY<STRUCT<sizesAbsoluteError INT64,
sizesRelativeError FLOAT64,
wDescriptorAbsoluteError INT64,
Expand All @@ -26,9 +26,7 @@ CREATE TEMPORARY FUNCTION
wastedLoadedPercent FLOAT64>>
LANGUAGE js AS '''
try {
var $ = JSON.parse(payload);
var responsiveImages = JSON.parse($._responsive_images);
responsiveImages = responsiveImages['responsive-images'];
var responsiveImages = responsive_images['responsive-images'];
return responsiveImages.map(({
sizesAbsoluteError,
sizesRelativeError,
Expand Down Expand Up @@ -71,19 +69,16 @@ SELECT
APPROX_QUANTILES(image.wastedLoadedPercent, 1000)[OFFSET(percentile * 10)] AS wastedLoadedPercent
FROM (
SELECT
tpages._TABLE_SUFFIX AS client,
client,
image
FROM
`httparchive.pages.2022_10_01_*` AS tpages,
UNNEST(getSrcsetSizesAccuracy(payload)) AS image
JOIN
`httparchive.technologies.2022_10_01_*` AS tech
ON
tech.url = tpages.url
`httparchive.crawl.pages`,
UNNEST(technologies) AS technology,
UNNEST(getSrcsetSizesAccuracy(custom_metrics.responsive_images)) AS image
WHERE
tpages._TABLE_SUFFIX = tech._TABLE_SUFFIX
AND app = 'WordPress'
AND category = 'CMS' ),
date = '2022-10-01'
AND is_root_page
AND technology.technology = 'WordPress' ),
UNNEST([10, 25, 50, 75, 90]) AS percentile
GROUP BY
percentile,
Expand Down
12 changes: 6 additions & 6 deletions sql/2022/12/lcp-image-without-fetchpriority-high-opportunity.sql
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@

# See query results here: https://github.com/GoogleChromeLabs/wpp-research/pull/15
CREATE TEMP FUNCTION
getFetchPriorityAttr(attributes STRING)
getFetchPriorityAttr(performance JSON)
RETURNS STRING
LANGUAGE js AS '''
try {
const data = JSON.parse(attributes);
const data = performance.lcp_elem_stats.attributes;
const fetchpriorityAttr = data.find(attr => attr["name"] === "fetchpriority")
return fetchpriorityAttr.value;
} catch (e) {
Expand All @@ -38,12 +38,12 @@ SELECT
FROM (
SELECT
client,
COUNTIF( getFetchPriorityAttr(JSON_EXTRACT(payload, '$._performance.lcp_elem_stats.attributes')) = "high"
AND JSON_EXTRACT_SCALAR(payload, '$._performance.lcp_elem_stats.nodeName') = "IMG" ) AS `with_fetchpriority_on_lcp`,
COUNTIF(JSON_EXTRACT_SCALAR(payload, '$._performance.lcp_elem_stats.nodeName') = "IMG") AS `total_with_lcp`,
COUNTIF( getFetchPriorityAttr(custom_metrics.performance) = "high"
AND JSON_VALUE(custom_metrics.performance.lcp_elem_stats.nodeName) = "IMG" ) AS `with_fetchpriority_on_lcp`,
COUNTIF(JSON_VALUE(custom_metrics.performance.lcp_elem_stats.nodeName) = "IMG") AS `total_with_lcp`,
COUNT(page) AS `total_wp_sites`,
FROM
`httparchive.all.pages`,
`httparchive.crawl.pages`,
UNNEST(technologies) AS technologies,
UNNEST(technologies.categories) AS category
WHERE
Expand Down
30 changes: 18 additions & 12 deletions sql/2022/12/usage-of-core-themes-with-jquery.sql
Original file line number Diff line number Diff line change
Expand Up @@ -16,27 +16,33 @@

# See query results here: https://github.com/GoogleChromeLabs/wpp-research/pull/13
SELECT
_TABLE_SUFFIX AS client,
app,
COUNT(DISTINCT url) AS sites,
COUNT(DISTINCT url) / total AS pct_sites
client,
technology.technology AS app,
COUNT(DISTINCT page) AS sites,
COUNT(DISTINCT page) / total AS pct_sites
FROM
`httparchive.technologies.2022_10_01_*`
`httparchive.crawl.pages`,
UNNEST(technologies) AS technology
JOIN (
SELECT
_TABLE_SUFFIX,
COUNT(DISTINCT url) AS total
client,
COUNT(DISTINCT page) AS total
FROM
`httparchive.technologies.2022_10_01_*`
`httparchive.crawl.pages`,
UNNEST(technologies) AS technology
WHERE
app = "WordPress"
date = '2022-10-01'
AND is_root_page
AND technology.technology = "WordPress"
GROUP BY
_TABLE_SUFFIX
client
)
USING
(_TABLE_SUFFIX)
(client)
WHERE
app IN (
date = '2022-10-01'
AND is_root_page
AND technology.technology IN (
"Twenty Eleven",
"Twenty Twelve",
"Twenty Thirteen",
Expand Down
6 changes: 3 additions & 3 deletions sql/2023/01/alloptions-query-time-distribution.sql
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@ WITH relevantServerTimings AS (
SELECT
client,
url,
EXTRACT_SERVER_TIMING_METRIC(httparchive.all.PARSE_SERVER_TIMING_HEADER(response_header.value), 'wp-load-alloptions-query') AS alloptions_query_time,
EXTRACT_SERVER_TIMING_METRIC(httparchive.all.PARSE_SERVER_TIMING_HEADER(response_header.value), 'wp-before-template') AS before_template_time
EXTRACT_SERVER_TIMING_METRIC(httparchive.fn.PARSE_SERVER_TIMING_HEADER(response_header.value), 'wp-load-alloptions-query') AS alloptions_query_time,
EXTRACT_SERVER_TIMING_METRIC(httparchive.fn.PARSE_SERVER_TIMING_HEADER(response_header.value), 'wp-before-template') AS before_template_time
FROM
`httparchive.all.requests`,
`httparchive.crawl.requests`,
UNNEST(response_headers) AS response_header
WHERE
date = '2023-01-01'
Expand Down
38 changes: 23 additions & 15 deletions sql/2023/01/block-theme-usage.sql
Original file line number Diff line number Diff line change
Expand Up @@ -16,35 +16,43 @@

# See query results here: https://github.com/GoogleChromeLabs/wpp-research/pull/32
SELECT
_TABLE_SUFFIX AS client,
COUNT(DISTINCT url) AS with_block_theme,
client,
COUNT(DISTINCT page) AS with_block_theme,
total_wp_sites,
COUNT(DISTINCT url) / total_wp_sites AS pct_total,
# For reference, include number of sites greater than or equal to WP 5.9, since only then block theme support was launched.
wp_gte_59
FROM
`httparchive.technologies.2022_11_01_*`
`httparchive.crawl.pages`,
UNNEST(technologies) AS technology,
UNNEST(technology.info) AS version
JOIN
`httparchive.response_bodies.2022_11_01_*`
`httparchive.crawl.requests`
USING
(_TABLE_SUFFIX, url)
(date, client, page, is_root_page)
JOIN (
SELECT
_TABLE_SUFFIX,
COUNT(DISTINCT IF (info = '' OR CAST(REGEXP_EXTRACT(info, r'^(\d+\.\d+)') AS FLOAT64) >= 5.9, url, NULL)) AS wp_gte_59,
COUNT(DISTINCT url) AS total_wp_sites
client,
COUNT(DISTINCT IF (version = '' OR CAST(REGEXP_EXTRACT(version, r'^(\d+\.\d+)') AS FLOAT64) >= 5.9, page, NULL)) AS wp_gte_59,
COUNT(DISTINCT page) AS total_wp_sites
FROM
`httparchive.technologies.2022_11_01_*`
`httparchive.crawl.pages`,
UNNEST(technologies) AS technology,
UNNEST(technology.info) AS version
WHERE
app = "WordPress"
date = '2022-11-01'
AND is_root_page
AND technology.technology = "WordPress"
GROUP BY
_TABLE_SUFFIX )
client )
USING
(_TABLE_SUFFIX)
(client)
WHERE
app = "WordPress"
AND (info = '' OR CAST(REGEXP_EXTRACT(info, r'^(\d+\.\d+)') AS FLOAT64) >= 5.9)
AND body LIKE '%<div class="wp-site-blocks">%'
date = '2022-11-01'
AND is_root_page
AND technology.technology = "WordPress"
AND (version = '' OR CAST(REGEXP_EXTRACT(version, r'^(\d+\.\d+)') AS FLOAT64) >= 5.9)
AND response_body LIKE '%<div class="wp-site-blocks">%'
GROUP BY
client,
wp_gte_59,
Expand Down
17 changes: 10 additions & 7 deletions sql/2023/01/critical-css-opportunity.sql
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,16 @@ SELECT
(total_wp_sites - sites_with_critical_css) / total_wp_sites AS opportunity
FROM (
SELECT
pages._TABLE_SUFFIX AS client,
COUNT(pages.url) AS total_wp_sites,
COUNTIF( CAST(JSON_EXTRACT_SCALAR(payload, '$._renderBlockingCSS') AS INT64) = 0
AND CAST(JSON_EXTRACT_SCALAR(payload, '$._inline_style_bytes') AS INT64) > 0 ) AS sites_with_critical_css
client,
COUNT(page) AS total_wp_sites,
COUNTIF( CAST(JSON_VALUE(payload._renderBlockingCSS) AS INT64) = 0
AND CAST(JSON_VALUE(custom_metrics.other.inline_style_bytes) AS INT64) > 0 ) AS sites_with_critical_css
FROM
`httparchive.pages.2022_10_01_*` AS pages
`httparchive.crawl.pages`,
UNNEST(technologies) AS technology
WHERE
JSON_EXTRACT(pages.payload, '$._detected_apps.WordPress') IS NOT NULL
date = '2022-10-01'
AND is_root_page
AND technology.technology = 'WordPress'
GROUP BY
pages._TABLE_SUFFIX )
client )
25 changes: 15 additions & 10 deletions sql/2023/01/cwvs-by-wordpress-version.sql
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ SELECT
AVG( pct_eligible_origins_with_good_cwv ) AS pct_eligible_origins_with_good_cwv
FROM (
SELECT
REGEXP_EXTRACT(info, '(\\d.\\d).*') AS major_version,
REGEXP_EXTRACT(version, '(\\d.\\d).*') AS major_version,
client,
COUNT(DISTINCT url) AS origins,
COUNT(DISTINCT IF (good_fid, url, NULL)) AS origins_with_good_fid,
Expand Down Expand Up @@ -79,18 +79,23 @@ FROM (
'phone') )
JOIN (
SELECT
DISTINCT CAST('2022-10-01' AS DATE) AS date,
date,
category,
app,
info,
_TABLE_SUFFIX AS client,
url
technology.technology AS app,
version,
client,
page AS url
FROM
`httparchive.technologies.2022_10_01_*`
`httparchive.crawl.pages`,
UNNEST(technologies) AS technology,
UNNEST(technology.categories) AS category,
UNNEST(technology.info) AS version
WHERE
app = 'WordPress'
date = '2022-10-01'
AND is_root_page
AND technology.technology = 'WordPress'
AND category = 'CMS'
AND info != '' )
AND version != '' )
USING
(date,
url,
Expand All @@ -99,7 +104,7 @@ FROM (
date,
major_version,
app,
info,
version,
client )
WHERE
origins > 100
Expand Down
15 changes: 9 additions & 6 deletions sql/2023/01/external-deferred-scripts-distribution.sql
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,19 @@

# See query results here: https://github.com/GoogleChromeLabs/wpp-research/pull/29
SELECT
_TABLE_SUFFIX AS client,
client,
percentile,
APPROX_QUANTILES(CAST(JSON_EXTRACT(JSON_EXTRACT_SCALAR(payload, '$._javascript'), '$.script_tags.src') AS INT64), 1000)[OFFSET(percentile * 10)] AS external_scripts,
APPROX_QUANTILES(CAST(JSON_EXTRACT(JSON_EXTRACT_SCALAR(payload, '$._javascript'), '$.script_tags.defer') AS INT64) / CAST(JSON_EXTRACT(JSON_EXTRACT_SCALAR(payload, '$._javascript'), '$.script_tags.src') AS INT64), 1000)[OFFSET(percentile * 10)] AS pct_deferred
APPROX_QUANTILES(CAST(JSON_VALUE(custom_metrics.javascript.script_tags.src) AS INT64), 1000)[OFFSET(percentile * 10)] AS external_scripts,
APPROX_QUANTILES(CAST(JSON_VALUE(custom_metrics.javascript.script_tags.defer) AS INT64) / CAST(JSON_VALUE(custom_metrics.javascript.script_tags.src) AS INT64), 1000)[OFFSET(percentile * 10)] AS pct_deferred
FROM
`httparchive.pages.2022_10_01_*`,
`httparchive.crawl.pages`,
UNNEST(technologies) AS technology,
UNNEST([10, 25, 50, 75, 90, 100]) AS percentile
WHERE
JSON_EXTRACT(payload, '$._detected_apps.WordPress') IS NOT NULL
AND CAST(JSON_EXTRACT(JSON_EXTRACT_SCALAR(payload, '$._javascript'), '$.script_tags.src') AS INT64) > 0
date = '2022-10-01'
AND is_root_page
AND technology.technology = 'WordPress'
AND CAST(JSON_VALUE(custom_metrics.javascript.script_tags.src) AS INT64) > 0
GROUP BY
client,
percentile
Expand Down
32 changes: 19 additions & 13 deletions sql/2023/01/lazyloaded-lcp-opportunity.sql
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@

# See query results here: https://github.com/GoogleChromeLabs/wpp-research/pull/28
CREATE TEMP FUNCTION
getLoadingAttr(attributes STRING)
getLoadingAttr(performance JSON)
RETURNS STRING
LANGUAGE js AS '''
try {
const data = JSON.parse(attributes);
const data = performance.lcp_elem_stats.attributes;
const loadingAttr = data.find(attr => attr["name"] === "loading")
return loadingAttr.value
} catch (e) {
Expand All @@ -30,16 +30,22 @@ CREATE TEMP FUNCTION

WITH
lcp_stats AS (
SELECT
_TABLE_SUFFIX AS client,
url,
JSON_EXTRACT_SCALAR(payload, '$._performance.lcp_elem_stats.nodeName') AS nodeName,
JSON_EXTRACT_SCALAR(payload, '$._performance.lcp_elem_stats.url') AS elementUrl,
JSON_EXTRACT(payload, '$._performance.lcp_elem_stats.attributes') AS attributes,
JSON_EXTRACT(payload, '$._detected_apps.WordPress') AS wpVersion,
getLoadingAttr(JSON_EXTRACT(payload, '$._performance.lcp_elem_stats.attributes')) AS loading,
# The `DISTINCT` is necessary to strip duplicate entries due to `UNNEST(technology.info)` which will result in 2 entries for each actual record.
SELECT DISTINCT
client,
page AS url,
JSON_VALUE(custom_metrics.performance.lcp_elem_stats.nodeName) AS nodeName,
JSON_VALUE(custom_metrics.performance.lcp_elem_stats.url) AS elementUrl,
wpVersion,
getLoadingAttr(custom_metrics.performance) AS loading,
FROM
`httparchive.pages.2022_10_01_*`
`httparchive.crawl.pages`,
UNNEST(technologies) AS technology,
UNNEST(technology.info) AS wpVersion
WHERE
date = '2022-10-01'
AND is_root_page
AND technology.technology = 'WordPress'
)

SELECT
Expand All @@ -54,8 +60,8 @@ SELECT
AND nodeName = "IMG") / COUNT(0) AS pct_total_opportunity,
# For reference, include number of sites greater than or equal to WP 5.5. Sites without a known version can be considered
# part of this since at this point they are most likely on a more recent version.
COUNTIF(wpVersion = '""'
OR CAST(REGEXP_EXTRACT(wpVersion, r'^"(\d+\.\d+)') AS FLOAT64) >= 5.5) AS wp_gte_55
COUNTIF(wpVersion = ''
OR CAST(REGEXP_EXTRACT(wpVersion, r'^(\d+\.\d+)') AS FLOAT64) >= 5.5) AS wp_gte_55
FROM
lcp_stats
WHERE
Expand Down
Loading

0 comments on commit a55b577

Please sign in to comment.