From a55b5779c7f605f32cf9be825c3bb2663647511d Mon Sep 17 00:00:00 2001 From: Felix Arntz Date: Wed, 8 Jan 2025 15:15:35 -0800 Subject: [PATCH] Update (almost) all existing queries to use latest httparchive.crawl dataset. --- .../performance-lab-version-distribution.sql | 28 +++--- .../12/inaccurate-sizes-attribute-impact.sql | 23 ++--- ...without-fetchpriority-high-opportunity.sql | 12 +-- .../12/usage-of-core-themes-with-jquery.sql | 30 ++++--- .../01/alloptions-query-time-distribution.sql | 6 +- sql/2023/01/block-theme-usage.sql | 38 ++++---- sql/2023/01/critical-css-opportunity.sql | 17 ++-- sql/2023/01/cwvs-by-wordpress-version.sql | 25 +++--- ...external-deferred-scripts-distribution.sql | 15 ++-- sql/2023/01/lazyloaded-lcp-opportunity.sql | 32 ++++--- ...iority-high-opportunity-more-efficient.sql | 28 +++--- sql/2023/01/sites-with-deferred-scripts.sql | 15 ++-- .../01/sites-with-slow-alloptions-queries.sql | 6 +- .../01/webp-adoption-by-wordpress-version.sql | 86 +++++++++---------- ...opportunity-custom-metrics-alternative.sql | 19 ++-- ...ritical-css-opportunity-custom-metrics.sql | 19 ++-- sql/2023/03/top-lazy-lcp-class-names.sql | 14 +-- sql/2023/04/image-lazy-loading-usage.sql | 29 ++++--- .../blocking-head-scripts-count-by-plugin.sql | 20 ++--- ...n-head-scripts-from-themes-and-plugins.sql | 20 ++--- .../08/theme-plugin-script-placements.sql | 20 ++--- sql/2023/10/bfcache-failure-reasons.sql | 10 +-- sql/2023/10/bfcache-score-counts.sql | 4 +- .../10/cross-device-matching-lcp-image.sql | 10 +-- sql/2023/10/heartbeat-script-presence.sql | 20 ++--- sql/2023/10/page-content-types.sql | 4 +- ...mbed-blocks-on-root-and-non-root-pages.sql | 4 +- sql/2024/01/ttfb-localized-sites.sql | 5 +- sql/2024/04/image-formats.sql | 20 ++--- .../04/inaccurate-sizes-attribute-impact.sql | 32 +++---- ...f-for-sites-enabling-speculation-rules.sql | 16 ++-- .../home-page-template-types-popularity.sql | 12 +-- .../08/performance-lab-plugins-adoption.sql | 16 ++-- .../10/auto-sizes-impact-before-after.sql | 58 ++++++------- .../11/images-using-width-fit-content.sql | 24 +++--- .../auto-sizes-wp67-impact-before-after.sql | 42 ++++----- 36 files changed, 403 insertions(+), 376 deletions(-) diff --git a/sql/2022/11/performance-lab-version-distribution.sql b/sql/2022/11/performance-lab-version-distribution.sql index 028c981..a3fec9a 100644 --- a/sql/2022/11/performance-lab-version-distribution.sql +++ b/sql/2022/11/performance-lab-version-distribution.sql @@ -15,26 +15,18 @@ # limitations under the License. SELECT - info as version, - COUNT(DISTINCT url) AS sites, - COUNT(DISTINCT url) / total AS pct_sites + version, + COUNT(DISTINCT page) AS sites, + COUNT(DISTINCT page) / SUM(COUNT(DISTINCT page)) OVER () AS pct_sites FROM - `httparchive.technologies.2022_10_01_*` -JOIN ( - SELECT - app, - COUNT(DISTINCT url) AS total - FROM - `httparchive.technologies.2022_10_01_*` - GROUP BY - app -) -USING - (app) + `httparchive.crawl.pages`, + UNNEST(technologies) AS technology, + UNNEST(technology.info) AS version WHERE - app = 'Performance Lab' + date = '2022-10-01' + AND is_root_page + AND technology.technology = 'Performance Lab' GROUP BY - version, - total + version ORDER BY sites DESC diff --git a/sql/2022/12/inaccurate-sizes-attribute-impact.sql b/sql/2022/12/inaccurate-sizes-attribute-impact.sql index 655bb28..902edc9 100644 --- a/sql/2022/12/inaccurate-sizes-attribute-impact.sql +++ b/sql/2022/12/inaccurate-sizes-attribute-impact.sql @@ -16,7 +16,7 @@ # See query results here: https://github.com/GoogleChromeLabs/wpp-research/pull/19 CREATE TEMPORARY FUNCTION - getSrcsetSizesAccuracy(payload STRING) + getSrcsetSizesAccuracy(responsive_images JSON) RETURNS ARRAY> LANGUAGE js AS ''' try { - var $ = JSON.parse(payload); - var responsiveImages = JSON.parse($._responsive_images); - responsiveImages = responsiveImages['responsive-images']; + var responsiveImages = responsive_images['responsive-images']; return responsiveImages.map(({ sizesAbsoluteError, sizesRelativeError, @@ -71,19 +69,16 @@ SELECT APPROX_QUANTILES(image.wastedLoadedPercent, 1000)[OFFSET(percentile * 10)] AS wastedLoadedPercent FROM ( SELECT - tpages._TABLE_SUFFIX AS client, + client, image FROM - `httparchive.pages.2022_10_01_*` AS tpages, - UNNEST(getSrcsetSizesAccuracy(payload)) AS image - JOIN - `httparchive.technologies.2022_10_01_*` AS tech - ON - tech.url = tpages.url + `httparchive.crawl.pages`, + UNNEST(technologies) AS technology, + UNNEST(getSrcsetSizesAccuracy(custom_metrics.responsive_images)) AS image WHERE - tpages._TABLE_SUFFIX = tech._TABLE_SUFFIX - AND app = 'WordPress' - AND category = 'CMS' ), + date = '2022-10-01' + AND is_root_page + AND technology.technology = 'WordPress' ), UNNEST([10, 25, 50, 75, 90]) AS percentile GROUP BY percentile, diff --git a/sql/2022/12/lcp-image-without-fetchpriority-high-opportunity.sql b/sql/2022/12/lcp-image-without-fetchpriority-high-opportunity.sql index f99f7af..f5ccf43 100644 --- a/sql/2022/12/lcp-image-without-fetchpriority-high-opportunity.sql +++ b/sql/2022/12/lcp-image-without-fetchpriority-high-opportunity.sql @@ -16,11 +16,11 @@ # See query results here: https://github.com/GoogleChromeLabs/wpp-research/pull/15 CREATE TEMP FUNCTION - getFetchPriorityAttr(attributes STRING) + getFetchPriorityAttr(performance JSON) RETURNS STRING LANGUAGE js AS ''' try { - const data = JSON.parse(attributes); + const data = performance.lcp_elem_stats.attributes; const fetchpriorityAttr = data.find(attr => attr["name"] === "fetchpriority") return fetchpriorityAttr.value; } catch (e) { @@ -38,12 +38,12 @@ SELECT FROM ( SELECT client, - COUNTIF( getFetchPriorityAttr(JSON_EXTRACT(payload, '$._performance.lcp_elem_stats.attributes')) = "high" - AND JSON_EXTRACT_SCALAR(payload, '$._performance.lcp_elem_stats.nodeName') = "IMG" ) AS `with_fetchpriority_on_lcp`, - COUNTIF(JSON_EXTRACT_SCALAR(payload, '$._performance.lcp_elem_stats.nodeName') = "IMG") AS `total_with_lcp`, + COUNTIF( getFetchPriorityAttr(custom_metrics.performance) = "high" + AND JSON_VALUE(custom_metrics.performance.lcp_elem_stats.nodeName) = "IMG" ) AS `with_fetchpriority_on_lcp`, + COUNTIF(JSON_VALUE(custom_metrics.performance.lcp_elem_stats.nodeName) = "IMG") AS `total_with_lcp`, COUNT(page) AS `total_wp_sites`, FROM - `httparchive.all.pages`, + `httparchive.crawl.pages`, UNNEST(technologies) AS technologies, UNNEST(technologies.categories) AS category WHERE diff --git a/sql/2022/12/usage-of-core-themes-with-jquery.sql b/sql/2022/12/usage-of-core-themes-with-jquery.sql index 820d609..6fe75ed 100644 --- a/sql/2022/12/usage-of-core-themes-with-jquery.sql +++ b/sql/2022/12/usage-of-core-themes-with-jquery.sql @@ -16,27 +16,33 @@ # See query results here: https://github.com/GoogleChromeLabs/wpp-research/pull/13 SELECT - _TABLE_SUFFIX AS client, - app, - COUNT(DISTINCT url) AS sites, - COUNT(DISTINCT url) / total AS pct_sites + client, + technology.technology AS app, + COUNT(DISTINCT page) AS sites, + COUNT(DISTINCT page) / total AS pct_sites FROM - `httparchive.technologies.2022_10_01_*` + `httparchive.crawl.pages`, + UNNEST(technologies) AS technology JOIN ( SELECT - _TABLE_SUFFIX, - COUNT(DISTINCT url) AS total + client, + COUNT(DISTINCT page) AS total FROM - `httparchive.technologies.2022_10_01_*` + `httparchive.crawl.pages`, + UNNEST(technologies) AS technology WHERE - app = "WordPress" + date = '2022-10-01' + AND is_root_page + AND technology.technology = "WordPress" GROUP BY - _TABLE_SUFFIX + client ) USING - (_TABLE_SUFFIX) + (client) WHERE - app IN ( + date = '2022-10-01' + AND is_root_page + AND technology.technology IN ( "Twenty Eleven", "Twenty Twelve", "Twenty Thirteen", diff --git a/sql/2023/01/alloptions-query-time-distribution.sql b/sql/2023/01/alloptions-query-time-distribution.sql index f5cddb5..2629d93 100644 --- a/sql/2023/01/alloptions-query-time-distribution.sql +++ b/sql/2023/01/alloptions-query-time-distribution.sql @@ -27,10 +27,10 @@ WITH relevantServerTimings AS ( SELECT client, url, - EXTRACT_SERVER_TIMING_METRIC(httparchive.all.PARSE_SERVER_TIMING_HEADER(response_header.value), 'wp-load-alloptions-query') AS alloptions_query_time, - EXTRACT_SERVER_TIMING_METRIC(httparchive.all.PARSE_SERVER_TIMING_HEADER(response_header.value), 'wp-before-template') AS before_template_time + EXTRACT_SERVER_TIMING_METRIC(httparchive.fn.PARSE_SERVER_TIMING_HEADER(response_header.value), 'wp-load-alloptions-query') AS alloptions_query_time, + EXTRACT_SERVER_TIMING_METRIC(httparchive.fn.PARSE_SERVER_TIMING_HEADER(response_header.value), 'wp-before-template') AS before_template_time FROM - `httparchive.all.requests`, + `httparchive.crawl.requests`, UNNEST(response_headers) AS response_header WHERE date = '2023-01-01' diff --git a/sql/2023/01/block-theme-usage.sql b/sql/2023/01/block-theme-usage.sql index 805085a..0599846 100644 --- a/sql/2023/01/block-theme-usage.sql +++ b/sql/2023/01/block-theme-usage.sql @@ -16,35 +16,43 @@ # See query results here: https://github.com/GoogleChromeLabs/wpp-research/pull/32 SELECT - _TABLE_SUFFIX AS client, - COUNT(DISTINCT url) AS with_block_theme, + client, + COUNT(DISTINCT page) AS with_block_theme, total_wp_sites, COUNT(DISTINCT url) / total_wp_sites AS pct_total, # For reference, include number of sites greater than or equal to WP 5.9, since only then block theme support was launched. wp_gte_59 FROM - `httparchive.technologies.2022_11_01_*` + `httparchive.crawl.pages`, + UNNEST(technologies) AS technology, + UNNEST(technology.info) AS version JOIN - `httparchive.response_bodies.2022_11_01_*` + `httparchive.crawl.requests` USING - (_TABLE_SUFFIX, url) + (date, client, page, is_root_page) JOIN ( SELECT - _TABLE_SUFFIX, - COUNT(DISTINCT IF (info = '' OR CAST(REGEXP_EXTRACT(info, r'^(\d+\.\d+)') AS FLOAT64) >= 5.9, url, NULL)) AS wp_gte_59, - COUNT(DISTINCT url) AS total_wp_sites + client, + COUNT(DISTINCT IF (version = '' OR CAST(REGEXP_EXTRACT(version, r'^(\d+\.\d+)') AS FLOAT64) >= 5.9, page, NULL)) AS wp_gte_59, + COUNT(DISTINCT page) AS total_wp_sites FROM - `httparchive.technologies.2022_11_01_*` + `httparchive.crawl.pages`, + UNNEST(technologies) AS technology, + UNNEST(technology.info) AS version WHERE - app = "WordPress" + date = '2022-11-01' + AND is_root_page + AND technology.technology = "WordPress" GROUP BY - _TABLE_SUFFIX ) + client ) USING - (_TABLE_SUFFIX) + (client) WHERE - app = "WordPress" - AND (info = '' OR CAST(REGEXP_EXTRACT(info, r'^(\d+\.\d+)') AS FLOAT64) >= 5.9) - AND body LIKE '%
%' + date = '2022-11-01' + AND is_root_page + AND technology.technology = "WordPress" + AND (version = '' OR CAST(REGEXP_EXTRACT(version, r'^(\d+\.\d+)') AS FLOAT64) >= 5.9) + AND response_body LIKE '%
%' GROUP BY client, wp_gte_59, diff --git a/sql/2023/01/critical-css-opportunity.sql b/sql/2023/01/critical-css-opportunity.sql index 882837a..47db4cf 100644 --- a/sql/2023/01/critical-css-opportunity.sql +++ b/sql/2023/01/critical-css-opportunity.sql @@ -23,13 +23,16 @@ SELECT (total_wp_sites - sites_with_critical_css) / total_wp_sites AS opportunity FROM ( SELECT - pages._TABLE_SUFFIX AS client, - COUNT(pages.url) AS total_wp_sites, - COUNTIF( CAST(JSON_EXTRACT_SCALAR(payload, '$._renderBlockingCSS') AS INT64) = 0 - AND CAST(JSON_EXTRACT_SCALAR(payload, '$._inline_style_bytes') AS INT64) > 0 ) AS sites_with_critical_css + client, + COUNT(page) AS total_wp_sites, + COUNTIF( CAST(JSON_VALUE(payload._renderBlockingCSS) AS INT64) = 0 + AND CAST(JSON_VALUE(custom_metrics.other.inline_style_bytes) AS INT64) > 0 ) AS sites_with_critical_css FROM - `httparchive.pages.2022_10_01_*` AS pages + `httparchive.crawl.pages`, + UNNEST(technologies) AS technology WHERE - JSON_EXTRACT(pages.payload, '$._detected_apps.WordPress') IS NOT NULL + date = '2022-10-01' + AND is_root_page + AND technology.technology = 'WordPress' GROUP BY - pages._TABLE_SUFFIX ) + client ) diff --git a/sql/2023/01/cwvs-by-wordpress-version.sql b/sql/2023/01/cwvs-by-wordpress-version.sql index 79acbe9..78b7fab 100644 --- a/sql/2023/01/cwvs-by-wordpress-version.sql +++ b/sql/2023/01/cwvs-by-wordpress-version.sql @@ -43,7 +43,7 @@ SELECT AVG( pct_eligible_origins_with_good_cwv ) AS pct_eligible_origins_with_good_cwv FROM ( SELECT - REGEXP_EXTRACT(info, '(\\d.\\d).*') AS major_version, + REGEXP_EXTRACT(version, '(\\d.\\d).*') AS major_version, client, COUNT(DISTINCT url) AS origins, COUNT(DISTINCT IF (good_fid, url, NULL)) AS origins_with_good_fid, @@ -79,18 +79,23 @@ FROM ( 'phone') ) JOIN ( SELECT - DISTINCT CAST('2022-10-01' AS DATE) AS date, + date, category, - app, - info, - _TABLE_SUFFIX AS client, - url + technology.technology AS app, + version, + client, + page AS url FROM - `httparchive.technologies.2022_10_01_*` + `httparchive.crawl.pages`, + UNNEST(technologies) AS technology, + UNNEST(technology.categories) AS category, + UNNEST(technology.info) AS version WHERE - app = 'WordPress' + date = '2022-10-01' + AND is_root_page + AND technology.technology = 'WordPress' AND category = 'CMS' - AND info != '' ) + AND version != '' ) USING (date, url, @@ -99,7 +104,7 @@ FROM ( date, major_version, app, - info, + version, client ) WHERE origins > 100 diff --git a/sql/2023/01/external-deferred-scripts-distribution.sql b/sql/2023/01/external-deferred-scripts-distribution.sql index 0658312..3b2c2c3 100644 --- a/sql/2023/01/external-deferred-scripts-distribution.sql +++ b/sql/2023/01/external-deferred-scripts-distribution.sql @@ -16,16 +16,19 @@ # See query results here: https://github.com/GoogleChromeLabs/wpp-research/pull/29 SELECT - _TABLE_SUFFIX AS client, + client, percentile, - APPROX_QUANTILES(CAST(JSON_EXTRACT(JSON_EXTRACT_SCALAR(payload, '$._javascript'), '$.script_tags.src') AS INT64), 1000)[OFFSET(percentile * 10)] AS external_scripts, - APPROX_QUANTILES(CAST(JSON_EXTRACT(JSON_EXTRACT_SCALAR(payload, '$._javascript'), '$.script_tags.defer') AS INT64) / CAST(JSON_EXTRACT(JSON_EXTRACT_SCALAR(payload, '$._javascript'), '$.script_tags.src') AS INT64), 1000)[OFFSET(percentile * 10)] AS pct_deferred + APPROX_QUANTILES(CAST(JSON_VALUE(custom_metrics.javascript.script_tags.src) AS INT64), 1000)[OFFSET(percentile * 10)] AS external_scripts, + APPROX_QUANTILES(CAST(JSON_VALUE(custom_metrics.javascript.script_tags.defer) AS INT64) / CAST(JSON_VALUE(custom_metrics.javascript.script_tags.src) AS INT64), 1000)[OFFSET(percentile * 10)] AS pct_deferred FROM - `httparchive.pages.2022_10_01_*`, + `httparchive.crawl.pages`, + UNNEST(technologies) AS technology, UNNEST([10, 25, 50, 75, 90, 100]) AS percentile WHERE - JSON_EXTRACT(payload, '$._detected_apps.WordPress') IS NOT NULL - AND CAST(JSON_EXTRACT(JSON_EXTRACT_SCALAR(payload, '$._javascript'), '$.script_tags.src') AS INT64) > 0 + date = '2022-10-01' + AND is_root_page + AND technology.technology = 'WordPress' + AND CAST(JSON_VALUE(custom_metrics.javascript.script_tags.src) AS INT64) > 0 GROUP BY client, percentile diff --git a/sql/2023/01/lazyloaded-lcp-opportunity.sql b/sql/2023/01/lazyloaded-lcp-opportunity.sql index f79ef16..78a71b2 100644 --- a/sql/2023/01/lazyloaded-lcp-opportunity.sql +++ b/sql/2023/01/lazyloaded-lcp-opportunity.sql @@ -16,11 +16,11 @@ # See query results here: https://github.com/GoogleChromeLabs/wpp-research/pull/28 CREATE TEMP FUNCTION - getLoadingAttr(attributes STRING) + getLoadingAttr(performance JSON) RETURNS STRING LANGUAGE js AS ''' try { - const data = JSON.parse(attributes); + const data = performance.lcp_elem_stats.attributes; const loadingAttr = data.find(attr => attr["name"] === "loading") return loadingAttr.value } catch (e) { @@ -30,16 +30,22 @@ CREATE TEMP FUNCTION WITH lcp_stats AS ( - SELECT - _TABLE_SUFFIX AS client, - url, - JSON_EXTRACT_SCALAR(payload, '$._performance.lcp_elem_stats.nodeName') AS nodeName, - JSON_EXTRACT_SCALAR(payload, '$._performance.lcp_elem_stats.url') AS elementUrl, - JSON_EXTRACT(payload, '$._performance.lcp_elem_stats.attributes') AS attributes, - JSON_EXTRACT(payload, '$._detected_apps.WordPress') AS wpVersion, - getLoadingAttr(JSON_EXTRACT(payload, '$._performance.lcp_elem_stats.attributes')) AS loading, + # The `DISTINCT` is necessary to strip duplicate entries due to `UNNEST(technology.info)` which will result in 2 entries for each actual record. + SELECT DISTINCT + client, + page AS url, + JSON_VALUE(custom_metrics.performance.lcp_elem_stats.nodeName) AS nodeName, + JSON_VALUE(custom_metrics.performance.lcp_elem_stats.url) AS elementUrl, + wpVersion, + getLoadingAttr(custom_metrics.performance) AS loading, FROM - `httparchive.pages.2022_10_01_*` + `httparchive.crawl.pages`, + UNNEST(technologies) AS technology, + UNNEST(technology.info) AS wpVersion + WHERE + date = '2022-10-01' + AND is_root_page + AND technology.technology = 'WordPress' ) SELECT @@ -54,8 +60,8 @@ SELECT AND nodeName = "IMG") / COUNT(0) AS pct_total_opportunity, # For reference, include number of sites greater than or equal to WP 5.5. Sites without a known version can be considered # part of this since at this point they are most likely on a more recent version. - COUNTIF(wpVersion = '""' - OR CAST(REGEXP_EXTRACT(wpVersion, r'^"(\d+\.\d+)') AS FLOAT64) >= 5.5) AS wp_gte_55 + COUNTIF(wpVersion = '' + OR CAST(REGEXP_EXTRACT(wpVersion, r'^(\d+\.\d+)') AS FLOAT64) >= 5.5) AS wp_gte_55 FROM lcp_stats WHERE diff --git a/sql/2023/01/lcp-image-without-fetchpriority-high-opportunity-more-efficient.sql b/sql/2023/01/lcp-image-without-fetchpriority-high-opportunity-more-efficient.sql index 95f7912..c726871 100644 --- a/sql/2023/01/lcp-image-without-fetchpriority-high-opportunity-more-efficient.sql +++ b/sql/2023/01/lcp-image-without-fetchpriority-high-opportunity-more-efficient.sql @@ -16,11 +16,11 @@ # See query results here: https://github.com/GoogleChromeLabs/wpp-research/pull/27 CREATE TEMP FUNCTION - getFetchPriorityAttr(attributes STRING) + getFetchPriorityAttr(performance JSON) RETURNS STRING LANGUAGE js AS ''' try { - const data = JSON.parse(attributes); + const data = performance.lcp_elem_stats.attributes; const fetchpriorityAttr = data.find(attr => attr["name"] === "fetchpriority") return fetchpriorityAttr.value; } catch (e) { @@ -30,16 +30,22 @@ try { WITH lcp_stats AS ( - SELECT - _TABLE_SUFFIX AS client, - url, - JSON_EXTRACT_SCALAR(payload, '$._performance.lcp_elem_stats.nodeName') AS nodeName, - JSON_EXTRACT_SCALAR(payload, '$._performance.lcp_elem_stats.url') AS elementUrl, - JSON_EXTRACT(payload, '$._performance.lcp_elem_stats.attributes') AS attributes, - JSON_EXTRACT(payload, '$._detected_apps.WordPress') AS wpVersion, - getFetchPriorityAttr(JSON_EXTRACT(payload, '$._performance.lcp_elem_stats.attributes')) AS fetchpriority, + # The `DISTINCT` is necessary to strip duplicate entries due to `UNNEST(technology.info)` which will result in 2 entries for each actual record. + SELECT DISTINCT + client, + page AS url, + JSON_VALUE(custom_metrics.performance.lcp_elem_stats.nodeName) AS nodeName, + JSON_VALUE(custom_metrics.performance.lcp_elem_stats.url) AS elementUrl, + wpVersion, + getFetchPriorityAttr(custom_metrics.performance) AS fetchpriority, FROM - `httparchive.pages.2022_10_01_*` + `httparchive.crawl.pages`, + UNNEST(technologies) AS technology, + UNNEST(technology.info) AS wpVersion + WHERE + date = '2022-10-01' + AND is_root_page + AND technology.technology = 'WordPress' ) SELECT diff --git a/sql/2023/01/sites-with-deferred-scripts.sql b/sql/2023/01/sites-with-deferred-scripts.sql index 96f1731..75a63a3 100644 --- a/sql/2023/01/sites-with-deferred-scripts.sql +++ b/sql/2023/01/sites-with-deferred-scripts.sql @@ -16,14 +16,17 @@ # See query results here: https://github.com/GoogleChromeLabs/wpp-research/pull/29 SELECT - _TABLE_SUFFIX AS client, - COUNTIF(CAST(JSON_EXTRACT(JSON_EXTRACT_SCALAR(payload, '$._javascript'), '$.script_tags.defer') AS INT64) > 0) AS with_deferred_scripts, - COUNTIF(CAST(JSON_EXTRACT(JSON_EXTRACT_SCALAR(payload, '$._javascript'), '$.script_tags.src') AS INT64) > 0) AS with_any_external_scripts, + client, + COUNTIF(CAST(JSON_VALUE(custom_metrics.javascript.script_tags.defer) AS INT64) > 0) AS with_deferred_scripts, + COUNTIF(CAST(JSON_VALUE(custom_metrics.javascript.script_tags.src) AS INT64) > 0) AS with_any_external_scripts, COUNT(0) AS total_wp_sites, - COUNTIF(CAST(JSON_EXTRACT(JSON_EXTRACT_SCALAR(payload, '$._javascript'), '$.script_tags.defer') AS INT64) > 0) / COUNT(0) AS defer_pct, + COUNTIF(CAST(JSON_VALUE(custom_metrics.javascript.script_tags.defer) AS INT64) > 0) / COUNT(0) AS defer_pct, FROM - `httparchive.pages.2022_10_01_*` + `httparchive.crawl.pages`, + UNNEST(technologies) AS technology WHERE - JSON_EXTRACT(payload, '$._detected_apps.WordPress') IS NOT NULL + date = '2022-10-01' + AND is_root_page + AND technology.technology = 'WordPress' GROUP BY client diff --git a/sql/2023/01/sites-with-slow-alloptions-queries.sql b/sql/2023/01/sites-with-slow-alloptions-queries.sql index 50beed2..a175c5f 100644 --- a/sql/2023/01/sites-with-slow-alloptions-queries.sql +++ b/sql/2023/01/sites-with-slow-alloptions-queries.sql @@ -27,10 +27,10 @@ WITH relevantServerTimings AS ( SELECT client, url, - EXTRACT_SERVER_TIMING_METRIC(httparchive.all.PARSE_SERVER_TIMING_HEADER(response_header.value), 'wp-load-alloptions-query') AS alloptions_query_time, - EXTRACT_SERVER_TIMING_METRIC(httparchive.all.PARSE_SERVER_TIMING_HEADER(response_header.value), 'wp-before-template') AS before_template_time + EXTRACT_SERVER_TIMING_METRIC(httparchive.fn.PARSE_SERVER_TIMING_HEADER(response_header.value), 'wp-load-alloptions-query') AS alloptions_query_time, + EXTRACT_SERVER_TIMING_METRIC(httparchive.fn.PARSE_SERVER_TIMING_HEADER(response_header.value), 'wp-before-template') AS before_template_time FROM - `httparchive.all.requests`, + `httparchive.crawl.requests`, UNNEST(response_headers) AS response_header WHERE date = '2023-01-01' diff --git a/sql/2023/01/webp-adoption-by-wordpress-version.sql b/sql/2023/01/webp-adoption-by-wordpress-version.sql index f690145..99bc548 100644 --- a/sql/2023/01/webp-adoption-by-wordpress-version.sql +++ b/sql/2023/01/webp-adoption-by-wordpress-version.sql @@ -31,33 +31,30 @@ FROM ( COUNT(0) AS pages, COUNTIF(has_webp) / COUNT(0) AS pct_webp FROM ( - SELECT - DISTINCT url, - REGEXP_EXTRACT(info, r'(\d\.\d+)') AS version + SELECT DISTINCT + page AS url, + REGEXP_EXTRACT(version, r'(\d\.\d+)') AS version FROM - `httparchive.technologies.2022_10_01_mobile` + `httparchive.crawl.pages`, + UNNEST(technologies) AS technology, + UNNEST(info) AS version WHERE - app = 'WordPress' ) + date = '2022-10-01' + AND client = 'mobile' + AND is_root_page + AND technology.technology = 'WordPress' ) JOIN ( SELECT - url, - has_webp - FROM ( - SELECT - pageid, - COUNTIF(ext = 'webp') > 0 AS has_webp - FROM - `httparchive.summary_requests.2022_10_01_mobile` - GROUP BY - pageid ) - JOIN ( - SELECT - pageid, - url - FROM - `httparchive.summary_pages.2022_10_01_mobile` ) - USING - (pageid) ) + page AS url, + COUNTIF(JSON_VALUE(summary.ext) = 'webp') > 0 AS has_webp + FROM + `httparchive.crawl.requests` + WHERE + date = '2022-10-01' + AND client = 'mobile' + AND is_root_page + GROUP BY + page ) USING (url) WHERE @@ -79,33 +76,30 @@ JOIN ( COUNT(0) AS pages, COUNTIF(has_webp) / COUNT(0) AS pct_webp FROM ( - SELECT - DISTINCT url, - REGEXP_EXTRACT(info, r'(\d\.\d+)') AS version + SELECT DISTINCT + page AS url, + REGEXP_EXTRACT(version, r'(\d\.\d+)') AS version FROM - `httparchive.technologies.2022_10_01_desktop` + `httparchive.crawl.pages`, + UNNEST(technologies) AS technology, + UNNEST(info) AS version WHERE - app = 'WordPress' ) + date = '2022-10-01' + AND client = 'desktop' + AND is_root_page + AND technology.technology = 'WordPress' ) JOIN ( SELECT - url, - has_webp - FROM ( - SELECT - pageid, - COUNTIF(ext = 'webp') > 0 AS has_webp - FROM - `httparchive.summary_requests.2022_10_01_desktop` - GROUP BY - pageid ) - JOIN ( - SELECT - pageid, - url - FROM - `httparchive.summary_pages.2022_10_01_desktop` ) - USING - (pageid) ) + page AS url, + COUNTIF(JSON_VALUE(summary.ext) = 'webp') > 0 AS has_webp + FROM + `httparchive.crawl.requests` + WHERE + date = '2022-10-01' + AND client = 'desktop' + AND is_root_page + GROUP BY + page ) USING (url) WHERE diff --git a/sql/2023/03/critical-css-opportunity-custom-metrics-alternative.sql b/sql/2023/03/critical-css-opportunity-custom-metrics-alternative.sql index 2d01650..7f606c7 100644 --- a/sql/2023/03/critical-css-opportunity-custom-metrics-alternative.sql +++ b/sql/2023/03/critical-css-opportunity-custom-metrics-alternative.sql @@ -23,16 +23,19 @@ SELECT (total_wp_sites - sites_with_critical_css) / total_wp_sites AS opportunity FROM ( SELECT - pages._TABLE_SUFFIX AS client, - COUNT(pages.url) AS total_wp_sites, + client, + COUNT(page) AS total_wp_sites, COUNTIF( - CAST(JSON_EXTRACT_SCALAR(payload, '$._renderBlockingCSS') AS INT64) = 0 - AND CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._css'), '$.inlineCssInHead') AS INT64) > 0 - AND CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._css'), '$.externalCssInBody') AS INT64) > 0 + CAST(JSON_VALUE(payload._renderBlockingCSS) AS INT64) = 0 + AND CAST(JSON_VALUE(custom_metrics.other.css.inlineCssInHead) AS INT64) > 0 + AND CAST(JSON_VALUE(custom_metrics.other.css.externalCssInBody) AS INT64) > 0 ) AS sites_with_critical_css FROM - `httparchive.pages.2022_03_01_*` AS pages + `httparchive.crawl.pages`, + UNNEST(technologies) AS technology WHERE - JSON_EXTRACT(pages.payload, '$._detected_apps.WordPress') IS NOT NULL + date = '2023-03-01' + AND is_root_page + AND technology.technology = 'WordPress' GROUP BY - pages._TABLE_SUFFIX ) + client ) diff --git a/sql/2023/03/critical-css-opportunity-custom-metrics.sql b/sql/2023/03/critical-css-opportunity-custom-metrics.sql index 8d4e61d..624f0a2 100644 --- a/sql/2023/03/critical-css-opportunity-custom-metrics.sql +++ b/sql/2023/03/critical-css-opportunity-custom-metrics.sql @@ -23,16 +23,19 @@ SELECT (total_wp_sites - sites_with_critical_css) / total_wp_sites AS opportunity FROM ( SELECT - pages._TABLE_SUFFIX AS client, - COUNT(pages.url) AS total_wp_sites, + client, + COUNT(page) AS total_wp_sites, COUNTIF( - CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._css'), '$.externalCssInHead') AS INT64) = 0 - AND CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._css'), '$.inlineCssInHead') AS INT64) > 0 - AND CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._css'), '$.externalCssInBody') AS INT64) > 0 + CAST(JSON_VALUE(custom_metrics.other.css.externalCssInHead) AS INT64) = 0 + AND CAST(JSON_VALUE(custom_metrics.other.css.inlineCssInHead) AS INT64) > 0 + AND CAST(JSON_VALUE(custom_metrics.other.css.externalCssInBody) AS INT64) > 0 ) AS sites_with_critical_css FROM - `httparchive.pages.2022_03_01_*` AS pages + `httparchive.crawl.pages`, + UNNEST(technologies) AS technology WHERE - JSON_EXTRACT(pages.payload, '$._detected_apps.WordPress') IS NOT NULL + date = '2023-03-01' + AND is_root_page + AND technology.technology = 'WordPress' GROUP BY - pages._TABLE_SUFFIX ) + client ) diff --git a/sql/2023/03/top-lazy-lcp-class-names.sql b/sql/2023/03/top-lazy-lcp-class-names.sql index a8e5f1c..1da03da 100644 --- a/sql/2023/03/top-lazy-lcp-class-names.sql +++ b/sql/2023/03/top-lazy-lcp-class-names.sql @@ -15,9 +15,9 @@ # limitations under the License. # See query results here: https://github.com/GoogleChromeLabs/wpp-research/pull/45 -CREATE TEMP FUNCTION getAttr(attributes STRING, attribute STRING) RETURNS STRING LANGUAGE js AS ''' +CREATE TEMP FUNCTION getAttr(performance JSON, attribute STRING) RETURNS STRING LANGUAGE js AS ''' try { - const data = JSON.parse(attributes); + const data = performance.lcp_elem_stats.attributes; const attr = data.find(attr => attr["name"] === attribute) return attr.value } catch (e) { @@ -28,15 +28,15 @@ CREATE TEMP FUNCTION getAttr(attributes STRING, attribute STRING) RETURNS STRING WITH lazypress AS ( SELECT page, - getAttr(JSON_EXTRACT(payload, '$._performance.lcp_elem_stats.attributes'), 'loading') = 'lazy' AS native_lazy, - getAttr(JSON_EXTRACT(payload, '$._performance.lcp_elem_stats.attributes'), 'class') AS class, - JSON_EXTRACT_SCALAR(payload, '$._performance.lcp_elem_stats.nodeName') = 'IMG' AS img_lcp + getAttr(custom_metrics.performance, 'loading') = 'lazy' AS native_lazy, + getAttr(custom_metrics.performance, 'class') AS class, + JSON_VALUE(custom_metrics.performance.lcp_elem_stats.nodeName) = 'IMG' AS img_lcp FROM - `httparchive.all.pages`, + `httparchive.crawl.pages`, UNNEST(technologies) AS t WHERE date = '2023-02-01' AND - client = 'desktop' AND + client = 'desktop' AND is_root_page AND t.technology = 'WordPress' ), diff --git a/sql/2023/04/image-lazy-loading-usage.sql b/sql/2023/04/image-lazy-loading-usage.sql index c2370d2..4bf110c 100644 --- a/sql/2023/04/image-lazy-loading-usage.sql +++ b/sql/2023/04/image-lazy-loading-usage.sql @@ -15,12 +15,11 @@ # limitations under the License. # See query results here: https://github.com/GoogleChromeLabs/wpp-research/pull/51 -CREATE TEMPORARY FUNCTION get_image_loading_attributes(images_string STRING) +CREATE TEMPORARY FUNCTION get_image_loading_attributes(images JSON) RETURNS ARRAY> LANGUAGE js AS ''' var result = []; try { - var images = JSON.parse(images_string); for (const img of images){ result.push({ loading: img.loading @@ -31,16 +30,26 @@ return result; '''; SELECT - _TABLE_SUFFIX AS client, - COUNT(DISTINCT url) AS urls_with_images, - COUNT(DISTINCT IF (loading = 'lazy', url, NULL)) AS urls_with_loading_lazy, - COUNT(DISTINCT IF (loading = 'lazy', url, NULL)) / COUNT(DISTINCT url) AS pct_with_loading_lazy + client, + COUNT(DISTINCT page) AS urls_with_images, + COUNT(DISTINCT IF (loading = 'lazy', page, NULL)) AS urls_with_loading_lazy, + COUNT(DISTINCT IF (loading = 'lazy', page, NULL)) / COUNT(DISTINCT page) AS pct_with_loading_lazy FROM - `httparchive.pages.2023_03_01_*`, - UNNEST(get_image_loading_attributes(JSON_EXTRACT_SCALAR(payload, '$._Images'))) AS image_loading + `httparchive.crawl.pages`, + UNNEST(technologies) AS technology, + UNNEST(get_image_loading_attributes(custom_metrics.other.Images)) AS image_loading WHERE - JSON_EXTRACT(payload, '$._detected_apps.WordPress') IS NOT NULL - AND (CAST(REGEXP_EXTRACT(JSON_EXTRACT(payload, '$._detected_apps.WordPress'), r'^"(\d+\.\d+)') AS FLOAT64) >= 5.5) + date = '2023-03-01' + AND is_root_page + AND technology.technology = 'WordPress' + AND EXISTS( + SELECT + version + FROM + UNNEST(technology.info) AS version + WHERE + CAST(REGEXP_EXTRACT(version, r'^(\d+\.\d+)') AS FLOAT64) >= 5.5 + ) GROUP BY client ORDER BY diff --git a/sql/2023/08/blocking-head-scripts-count-by-plugin.sql b/sql/2023/08/blocking-head-scripts-count-by-plugin.sql index 64fca7c..5adcbf1 100644 --- a/sql/2023/08/blocking-head-scripts-count-by-plugin.sql +++ b/sql/2023/08/blocking-head-scripts-count-by-plugin.sql @@ -16,7 +16,7 @@ # Based on query here: https://github.com/GoogleChromeLabs/wpp-research/pull/63 -CREATE TEMP FUNCTION GET_BLOCKING_HEAD_SOURCES(custom_metrics STRING) +CREATE TEMP FUNCTION GET_BLOCKING_HEAD_SOURCES(cms JSON) RETURNS ARRAY LANGUAGE js AS ''' @@ -40,15 +40,14 @@ function getSource(src) { /** * Get script sources for scripts in the head which are blocking (not async nor defer). * - * @param {object} data - * @param {object} data.cms - * @param {object} data.cms.wordpress - * @param {Array<{src: string, intended_strategy: string, async: boolean, defer: boolean, in_footer: boolean}>} data.cms.wordpress.scripts + * @param {object} cms + * @param {object} cms.wordpress + * @param {Array<{src: string, intended_strategy: string, async: boolean, defer: boolean, in_footer: boolean}>} cms.wordpress.scripts * @return {Array} Sources. */ -function getBlockingHeadScriptSources(data) { +function getBlockingHeadScriptSources(cms) { const sources = []; - for ( const script of data.cms.wordpress.scripts ) { + for ( const script of cms.wordpress.scripts ) { const source = getSource(script.src); if (!source) { @@ -65,8 +64,7 @@ function getBlockingHeadScriptSources(data) { const sources = []; try { - const data = JSON.parse(custom_metrics); - sources.push(...getBlockingHeadScriptSources(data)); + sources.push(...getBlockingHeadScriptSources(cms)); } catch (e) {} return sources; '''; @@ -74,9 +72,9 @@ return sources; WITH all_sources AS ( SELECT - GET_BLOCKING_HEAD_SOURCES(custom_metrics) AS sources, + GET_BLOCKING_HEAD_SOURCES(custom_metrics.cms) AS sources, FROM - `httparchive.all.pages`, + `httparchive.crawl.pages`, UNNEST(technologies) AS technology WHERE date = CAST('2023-07-01' AS DATE) diff --git a/sql/2023/08/blocking-in-head-scripts-from-themes-and-plugins.sql b/sql/2023/08/blocking-in-head-scripts-from-themes-and-plugins.sql index 5883f95..4a71c56 100644 --- a/sql/2023/08/blocking-in-head-scripts-from-themes-and-plugins.sql +++ b/sql/2023/08/blocking-in-head-scripts-from-themes-and-plugins.sql @@ -15,7 +15,7 @@ # limitations under the License. # See query results here: https://github.com/GoogleChromeLabs/wpp-research/pull/63 -CREATE TEMP FUNCTION GET_BLOCKING_HEAD_SOURCES (custom_metrics STRING) RETURNS ARRAY LANGUAGE js AS ''' +CREATE TEMP FUNCTION GET_BLOCKING_HEAD_SOURCES (cms JSON) RETURNS ARRAY LANGUAGE js AS ''' const sourceRegExp = new RegExp( '/wp-content/(?plugin|theme)s/(?[^/]+)/(?[^\?]+)' ); @@ -36,15 +36,14 @@ function getSource(src) { /** * Get script sources for scripts in the head which are blocking (not async nor defer). * - * @param {object} data - * @param {object} data.cms - * @param {object} data.cms.wordpress - * @param {Array<{src: string, intended_strategy: string, async: boolean, defer: boolean, in_footer: boolean}>} data.cms.wordpress.scripts + * @param {object} cms + * @param {object} cms.wordpress + * @param {Array<{src: string, intended_strategy: string, async: boolean, defer: boolean, in_footer: boolean}>} cms.wordpress.scripts * @return {Array} Sources. */ -function getBlockingHeadScriptSources(data) { +function getBlockingHeadScriptSources(cms) { const sources = []; - for ( const script of data.cms.wordpress.scripts ) { + for ( const script of cms.wordpress.scripts ) { const source = getSource(script.src); if (!source) { @@ -62,17 +61,16 @@ function getBlockingHeadScriptSources(data) { const sources = []; try { - const data = JSON.parse(custom_metrics); - sources.push(...getBlockingHeadScriptSources(data)); + sources.push(...getBlockingHeadScriptSources(cms)); } catch (e) {} return sources; '''; WITH all_sources AS ( SELECT - GET_BLOCKING_HEAD_SOURCES(custom_metrics) AS sources, + GET_BLOCKING_HEAD_SOURCES(custom_metrics.cms) AS sources, FROM - `httparchive.all.pages`, + `httparchive.crawl.pages`, UNNEST(technologies) AS technology WHERE date = CAST("2023-07-01" AS DATE) diff --git a/sql/2023/08/theme-plugin-script-placements.sql b/sql/2023/08/theme-plugin-script-placements.sql index 04dabec..e875ee0 100644 --- a/sql/2023/08/theme-plugin-script-placements.sql +++ b/sql/2023/08/theme-plugin-script-placements.sql @@ -15,22 +15,21 @@ # limitations under the License. # See query results here: https://github.com/GoogleChromeLabs/wpp-research/pull/63 -CREATE TEMP FUNCTION GET_SCRIPT_PLACEMENTS (custom_metrics STRING) RETURNS ARRAY LANGUAGE js AS ''' +CREATE TEMP FUNCTION GET_SCRIPT_PLACEMENTS (cms JSON) RETURNS ARRAY LANGUAGE js AS ''' const sourceRegExp = new RegExp('/wp-content/(plugin|theme)s/([^/]+)/'); /** * Get script placements. * - * @param {object} data - * @param {object} data.cms - * @param {object} data.cms.wordpress - * @param {Array<{src: string, intended_strategy: string, async: boolean, defer: boolean, in_footer: boolean}>} data.cms.wordpress.scripts + * @param {object} cms + * @param {object} cms.wordpress + * @param {Array<{src: string, intended_strategy: string, async: boolean, defer: boolean, in_footer: boolean}>} cms.wordpress.scripts * @return {Array} Placements. */ -function getScriptPlacements(data) { +function getScriptPlacements(cms) { const placements = []; - for (const script of data.cms.wordpress.scripts) { + for (const script of cms.wordpress.scripts) { if (!sourceRegExp.test(script.src)) { continue; } @@ -50,17 +49,16 @@ function getScriptPlacements(data) { const placements = []; try { - const data = JSON.parse(custom_metrics); - placements.push(...getScriptPlacements(data)); + placements.push(...getScriptPlacements(cms)); } catch (e) {} return placements; '''; WITH all_placements AS ( SELECT - GET_SCRIPT_PLACEMENTS(custom_metrics) AS placements, + GET_SCRIPT_PLACEMENTS(custom_metrics.cms) AS placements, FROM - `httparchive.all.pages`, + `httparchive.crawl.pages`, UNNEST(technologies) AS technology WHERE date = CAST("2023-07-01" AS DATE) diff --git a/sql/2023/10/bfcache-failure-reasons.sql b/sql/2023/10/bfcache-failure-reasons.sql index 3791610..0a6e502 100644 --- a/sql/2023/10/bfcache-failure-reasons.sql +++ b/sql/2023/10/bfcache-failure-reasons.sql @@ -16,16 +16,15 @@ # See query results here: https://github.com/GoogleChromeLabs/wpp-research/pull/75 -CREATE TEMP FUNCTION getItemReasons(items STRING) RETURNS ARRAY LANGUAGE js AS +CREATE TEMP FUNCTION getItemReasons(items JSON) RETURNS ARRAY LANGUAGE js AS # language=javascript ''' try { if ( ! items ) { return []; } - const parsedItems = JSON.parse(items); const reasons = []; - for ( const item of parsedItems ) { + for ( const item of items ) { reasons.push( item.reason ); } return reasons; @@ -35,13 +34,12 @@ CREATE TEMP FUNCTION getItemReasons(items STRING) RETURNS ARRAY LANGUAGE '''; WITH - wordPressPages AS ( SELECT page as url, - getItemReasons( JSON_EXTRACT(lighthouse, '$.audits.bf-cache.details.items')) AS reasons + getItemReasons(lighthouse.audits['bf-cache'].details.items) AS reasons FROM - `httparchive.all.pages`, + `httparchive.crawl.pages`, UNNEST(technologies) AS t WHERE date = '2023-08-01' AND diff --git a/sql/2023/10/bfcache-score-counts.sql b/sql/2023/10/bfcache-score-counts.sql index 7fb17b3..991b639 100644 --- a/sql/2023/10/bfcache-score-counts.sql +++ b/sql/2023/10/bfcache-score-counts.sql @@ -21,9 +21,9 @@ WITH wordPressPages AS ( SELECT page as url, - JSON_EXTRACT(lighthouse, '$.audits.bf-cache.score') AS bfCacheScore + JSON_VALUE(lighthouse.audits['bf-cache'].score) AS bfCacheScore FROM - `httparchive.all.pages`, + `httparchive.crawl.pages`, UNNEST(technologies) AS t WHERE date = '2023-08-01' AND diff --git a/sql/2023/10/cross-device-matching-lcp-image.sql b/sql/2023/10/cross-device-matching-lcp-image.sql index 7f8e80e..d2550d6 100644 --- a/sql/2023/10/cross-device-matching-lcp-image.sql +++ b/sql/2023/10/cross-device-matching-lcp-image.sql @@ -17,9 +17,9 @@ # See query results here: https://github.com/GoogleChromeLabs/wpp-research/pull/73 # h/t https://github.com/GoogleChromeLabs/wpp-research/blob/0b6c2ca8ddc2c68d4eddcb3d4e069c5e75a2ca16/sql/2023/03/top-lazy-lcp-class-names.sql#L18-L26 -CREATE TEMP FUNCTION getAttr(attributes STRING, attribute STRING) RETURNS STRING LANGUAGE js AS ''' +CREATE TEMP FUNCTION getAttr(performance JSON, attribute STRING) RETURNS STRING LANGUAGE js AS ''' try { - const data = JSON.parse(attributes); + const data = performance.lcp_elem_stats.attributes; const attr = data.find(attr => attr["name"] === attribute) return attr.value } catch (e) { @@ -44,10 +44,10 @@ WITH all_device_wordpress_lcp AS ( SELECT page, IF(client = "mobile", "phone", "desktop") AS device, - IF(getAttr(JSON_EXTRACT(payload, '$._performance.lcp_elem_stats.attributes'), 'fetchpriority') = 'high', true, false) AS has_fetchpriority, - JSON_EXTRACT_SCALAR(payload, '$._performance.lcp_elem_stats.nodeName') AS lcp_element, + IF(getAttr(custom_metrics.performance, 'fetchpriority') = 'high', true, false) AS has_fetchpriority, + JSON_VALUE(custom_metrics.performance.lcp_elem_stats.nodeName) AS lcp_element, FROM - `httparchive.all.pages` + `httparchive.crawl.pages` WHERE date = '2023-08-01' AND is_root_page AND diff --git a/sql/2023/10/heartbeat-script-presence.sql b/sql/2023/10/heartbeat-script-presence.sql index b47b4fb..3df7256 100644 --- a/sql/2023/10/heartbeat-script-presence.sql +++ b/sql/2023/10/heartbeat-script-presence.sql @@ -16,7 +16,7 @@ # See query results here: https://github.com/GoogleChromeLabs/wpp-research/pull/75 -CREATE TEMP FUNCTION HAS_SCRIPT(handle STRING, custom_metrics STRING) RETURNS BOOL LANGUAGE js AS +CREATE TEMP FUNCTION HAS_SCRIPT(handle STRING, cms JSON) RETURNS BOOL LANGUAGE js AS # language=javascript ''' @@ -24,14 +24,13 @@ CREATE TEMP FUNCTION HAS_SCRIPT(handle STRING, custom_metrics STRING) RETURNS BO * Get whether a script is present. * * @param {string} handle - * @param {object} data - * @param {object} data.cms - * @param {object} data.cms.wordpress - * @param {Array<{handle: string}>} data.cms.wordpress.scripts + * @param {object} cms + * @param {object} cms.wordpress + * @param {Array<{handle: string}>} cms.wordpress.scripts * @return {boolean} */ -function hasScript(handle, data) { - for (const script of data.cms.wordpress.scripts) { +function hasScript(handle, cms) { + for (const script of cms.wordpress.scripts) { if (script.handle === handle) { return true; } @@ -40,8 +39,7 @@ function hasScript(handle, data) { } try { - const data = JSON.parse(custom_metrics); - return hasScript(handle, data); + return hasScript(handle, cms); } catch (e) {} return false; @@ -49,9 +47,9 @@ return false; WITH script_presence AS ( SELECT - HAS_SCRIPT("heartbeat", custom_metrics) AS has_script, + HAS_SCRIPT("heartbeat", custom_metrics.cms) AS has_script, FROM - `httparchive.all.pages`, + `httparchive.crawl.pages`, UNNEST(technologies) AS technology WHERE date = CAST("2023-08-01" AS DATE) AND diff --git a/sql/2023/10/page-content-types.sql b/sql/2023/10/page-content-types.sql index abbce71..06e992d 100644 --- a/sql/2023/10/page-content-types.sql +++ b/sql/2023/10/page-content-types.sql @@ -20,7 +20,7 @@ WITH pages AS ( client, page AS url FROM - `httparchive.all.pages`, + `httparchive.crawl.pages`, UNNEST(technologies) AS t WHERE date = '2023-08-01' AND @@ -35,7 +35,7 @@ requests AS ( url, REGEXP_REPLACE( resp_headers.value, ' *;.*$', '' ) AS content_type FROM - `httparchive.all.requests`, + `httparchive.crawl.requests`, UNNEST(response_headers) as resp_headers WHERE date = "2023-08-01" AND diff --git a/sql/2024/01/embed-blocks-on-root-and-non-root-pages.sql b/sql/2024/01/embed-blocks-on-root-and-non-root-pages.sql index 029939f..b04838b 100644 --- a/sql/2024/01/embed-blocks-on-root-and-non-root-pages.sql +++ b/sql/2024/01/embed-blocks-on-root-and-non-root-pages.sql @@ -18,11 +18,11 @@ SELECT is_root_page, - JSON_EXTRACT(custom_metrics, '$.cms.wordpress.has_embed_block') AS has_embed_block, + CAST(JSON_VALUE(custom_metrics.cms.wordpress.has_embed_block) AS BOOL) AS has_embed_block, COUNT(*) AS page_count, ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER (), 1) AS percentage_of_total FROM - `httparchive.all.pages`, + `httparchive.crawl.pages`, UNNEST(technologies) AS technology WHERE date = CAST('2023-12-01' AS DATE) diff --git a/sql/2024/01/ttfb-localized-sites.sql b/sql/2024/01/ttfb-localized-sites.sql index 34f261b..6fd5d5f 100644 --- a/sql/2024/01/ttfb-localized-sites.sql +++ b/sql/2024/01/ttfb-localized-sites.sql @@ -13,6 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# # See https://github.com/GoogleChromeLabs/wpp-research/pull/54 CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, @@ -33,10 +34,10 @@ WITH pages AS ( SELECT client, - IS_LOCALIZED(REPLACE(TRIM(LOWER(JSON_VALUE(JSON_VALUE(payload, '$._almanac'), '$.html_node.lang'))), '_', '-' )) AS is_localized, + IS_LOCALIZED(REPLACE(TRIM(LOWER(JSON_VALUE(custom_metrics.other.almanac.html_node.lang))), '_', '-' )) AS is_localized, page AS url, FROM - `httparchive.all.pages`, + `httparchive.crawl.pages`, UNNEST(technologies) AS t WHERE date = '2023-12-01' diff --git a/sql/2024/04/image-formats.sql b/sql/2024/04/image-formats.sql index 3a50802..ba57920 100644 --- a/sql/2024/04/image-formats.sql +++ b/sql/2024/04/image-formats.sql @@ -40,8 +40,8 @@ CREATE TEMPORARY FUNCTION OR info = version ) ) ); CREATE TEMPORARY FUNCTION - IS_IMAGE (summary STRING) - RETURNS BOOLEAN AS (STARTS_WITH(LOWER(CAST(JSON_EXTRACT_SCALAR(summary, "$.mimeType") AS STRING)), 'image/')); + IS_IMAGE (summary JSON) + RETURNS BOOLEAN AS (STARTS_WITH(LOWER(CAST(JSON_VALUE(summary.mimeType) AS STRING)), 'image/')); WITH pagesWithLcpImages AS ( @@ -49,16 +49,16 @@ WITH date, client, page, - JSON_EXTRACT_SCALAR(custom_metrics, '$.performance.lcp_elem_stats.url') AS url, - JSON_EXTRACT_SCALAR(custom_metrics, '$.performance.lcp_elem_stats.naturalWidth') AS image_width, - JSON_EXTRACT_SCALAR(custom_metrics, '$.performance.lcp_elem_stats.naturalHeight') AS image_height, + JSON_VALUE(custom_metrics.performance.lcp_elem_stats.url) AS url, + JSON_VALUE(custom_metrics.performance.lcp_elem_stats.naturalWidth) AS image_width, + JSON_VALUE(custom_metrics.performance.lcp_elem_stats.naturalHeight) AS image_height, FROM - `httparchive.all.pages` + `httparchive.crawl.pages` WHERE IS_CMS(technologies, 'WordPress', '') - AND LOWER(JSON_EXTRACT_SCALAR(custom_metrics, '$.performance.lcp_elem_stats.nodeName')) = 'img' + AND LOWER(JSON_VALUE(custom_metrics.performance.lcp_elem_stats.nodeName)) = 'img' AND date = DATE_TO_QUERY ), imageRequests AS ( @@ -67,10 +67,10 @@ WITH client, page, url, - LOWER(CAST(JSON_EXTRACT_SCALAR(summary, "$.mimeType") AS STRING)) AS mime_type, - CAST( JSON_EXTRACT_SCALAR(summary, "$.respSize") AS NUMERIC) AS resp_size, + LOWER(CAST(JSON_VALUE(summary.mimeType) AS STRING)) AS mime_type, + CAST(JSON_VALUE(summary.respSize) AS NUMERIC) AS resp_size, FROM - `httparchive.all.requests` + `httparchive.crawl.requests` WHERE IS_IMAGE(summary) AND date = DATE_TO_QUERY ) diff --git a/sql/2024/04/inaccurate-sizes-attribute-impact.sql b/sql/2024/04/inaccurate-sizes-attribute-impact.sql index f68c880..20a2027 100644 --- a/sql/2024/04/inaccurate-sizes-attribute-impact.sql +++ b/sql/2024/04/inaccurate-sizes-attribute-impact.sql @@ -18,7 +18,7 @@ DECLARE DATE_TO_QUERY DATE DEFAULT '2024-03-01'; -CREATE TEMPORARY FUNCTION GET_IMG_SIZES_ACCURACY(custom_metrics STRING) RETURNS +CREATE TEMPORARY FUNCTION GET_IMG_SIZES_ACCURACY(responsive_images JSON) RETURNS ARRAY AS ( +CREATE TEMPORARY FUNCTION GET_CONTENT_TYPE(cms JSON) RETURNS STRUCT