Skip to content

Conversation

@chr156r33n
Copy link

@chr156r33n chr156r33n commented Nov 14, 2025

Merging the 2025 SEO Chapter SQL into the main project

Makes progress on #4086

@chr156r33n chr156r33n marked this pull request as ready for review November 14, 2025 19:38
@tunetheweb tunetheweb added the analysis Querying the dataset label Nov 20, 2025
Copy link
Member

@tunetheweb tunetheweb left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Most of the queries look good, but they can be simplified somewhat now that the columns are JSON. Where that is the case, I've rewritten the query and added it as a suggestion.

Comment on lines +5 to +22
CREATE TEMPORARY FUNCTION getRelStatsWptBodies(wpt_bodies_json JSON)
RETURNS STRUCT<
rel ARRAY<STRING>
> LANGUAGE js AS '''
var result = {rel: []};
// Function to retrieve only keys if value is >0
function getKey(dict){
const arr = [],
obj = Object.keys(dict);
for (var x in obj){
if(dict[obj[x]] > 0){
arr.push(obj[x]);
}
}
return arr;
}
try {
var wpt_bodies = wpt_bodies_json;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we need a second variable here; the parameter itself can be named `wpt_bodies`.

Suggested change
CREATE TEMPORARY FUNCTION getRelStatsWptBodies(wpt_bodies_json JSON)
RETURNS STRUCT<
rel ARRAY<STRING>
> LANGUAGE js AS '''
var result = {rel: []};
// Function to retrieve only keys if value is >0
function getKey(dict){
const arr = [],
obj = Object.keys(dict);
for (var x in obj){
if(dict[obj[x]] > 0){
arr.push(obj[x]);
}
}
return arr;
}
try {
var wpt_bodies = wpt_bodies_json;
CREATE TEMPORARY FUNCTION getRelStatsWptBodies(wpt_bodies JSON)
RETURNS STRUCT<
rel ARRAY<STRING>
> LANGUAGE js AS '''
var result = {rel: []};
// Function to retrieve only keys if value is >0
function getKey(dict){
const arr = [],
obj = Object.keys(dict);
for (var x in obj){
if(dict[obj[x]] > 0){
arr.push(obj[x]);
}
}
return arr;
}
try {

ELSE 'No Assigned Page'
END
AS is_root_page,
getRelStatsWptBodies(TO_JSON(custom_metrics.wpt_bodies)) AS wpt_bodies_info
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's already JSON, so the unnecessary `TO_JSON` cast can be removed:

Suggested change
getRelStatsWptBodies(TO_JSON(custom_metrics.wpt_bodies)) AS wpt_bodies_info
getRelStatsWptBodies(custom_metrics.wpt_bodies) AS wpt_bodies_info

Comment on lines +4 to +9
CREATE TEMPORARY FUNCTION getContentLanguagesAlmanac(almanac_json JSON)
RETURNS ARRAY<STRING>
LANGUAGE js AS '''
var result = [];
try {
var almanac = almanac_json;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
CREATE TEMPORARY FUNCTION getContentLanguagesAlmanac(almanac_json JSON)
RETURNS ARRAY<STRING>
LANGUAGE js AS '''
var result = [];
try {
var almanac = almanac_json;
CREATE TEMPORARY FUNCTION getContentLanguagesAlmanac(almanac JSON)
RETURNS ARRAY<STRING>
LANGUAGE js AS '''
var result = [];
try {

WHEN is_root_page = TRUE THEN 'Homepage'
ELSE 'No Assigned Page'
END AS is_root_page,
getContentLanguagesAlmanac(TO_JSON(custom_metrics.other.almanac)) AS content_languages
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
getContentLanguagesAlmanac(TO_JSON(custom_metrics.other.almanac)) AS content_languages
getContentLanguagesAlmanac(custom_metrics.other.almanac) AS content_languages

Comment on lines +5 to +24
CREATE TEMPORARY FUNCTION getLinkDesciptionsWptBodies(wpt_bodies_json JSON)
RETURNS STRUCT<
links_same_site INT64,
links_window_location INT64,
links_window_open INT64,
links_href_javascript INT64
> LANGUAGE js AS '''
var result = {
links_same_site: 0,
links_window_location: 0,
links_window_open: 0,
links_href_javascript: 0
};
try {
var wpt_bodies = wpt_bodies_json;

if (Array.isArray(wpt_bodies) || typeof wpt_bodies != 'object') return result;

if (wpt_bodies.anchors && wpt_bodies.anchors.rendered) {
var anchors_rendered = wpt_bodies.anchors.rendered;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
CREATE TEMPORARY FUNCTION getLinkDesciptionsWptBodies(wpt_bodies_json JSON)
RETURNS STRUCT<
links_same_site INT64,
links_window_location INT64,
links_window_open INT64,
links_href_javascript INT64
> LANGUAGE js AS '''
var result = {
links_same_site: 0,
links_window_location: 0,
links_window_open: 0,
links_href_javascript: 0
};
try {
var wpt_bodies = wpt_bodies_json;
if (Array.isArray(wpt_bodies) || typeof wpt_bodies != 'object') return result;
if (wpt_bodies.anchors && wpt_bodies.anchors.rendered) {
var anchors_rendered = wpt_bodies.anchors.rendered;
CREATE TEMPORARY FUNCTION getLinkDesciptionsWptBodies(anchors JSON)
RETURNS STRUCT<
links_same_site INT64,
links_window_location INT64,
links_window_open INT64,
links_href_javascript INT64
> LANGUAGE js AS '''
var result = {
links_same_site: 0,
links_window_location: 0,
links_window_open: 0,
links_href_javascript: 0
};
try {
if (Array.isArray(anchors) || typeof anchors != 'object') return result;
if (anchors && anchors.rendered) {
var anchors_rendered = anchors.rendered;

Comment on lines +5 to +19
SELECT
client,
COUNT(DISTINCT page) AS total_pages,
COUNTIF(JSON_QUERY(TO_JSON_STRING(payload), '$._valid-head.invalidHead') = 'true') AS pages_with_invalid_head,
COUNTIF(ARRAY_LENGTH(JSON_EXTRACT_ARRAY(TO_JSON_STRING(payload), '$._valid-head.invalidElements')) > 0) AS pages_with_invalid_elements,
SAFE_DIVIDE(COUNTIF(JSON_QUERY(TO_JSON_STRING(payload), '$._valid-head.invalidHead') = 'true'), COUNT(DISTINCT page)) AS pct_invalid_head
FROM
`httparchive.crawl.pages`
WHERE
date = '2025-07-01'
AND is_root_page = TRUE
GROUP BY
client
ORDER BY
client No newline at end of file
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
SELECT
client,
COUNT(DISTINCT page) AS total_pages,
COUNTIF(JSON_QUERY(TO_JSON_STRING(payload), '$._valid-head.invalidHead') = 'true') AS pages_with_invalid_head,
COUNTIF(ARRAY_LENGTH(JSON_EXTRACT_ARRAY(TO_JSON_STRING(payload), '$._valid-head.invalidElements')) > 0) AS pages_with_invalid_elements,
SAFE_DIVIDE(COUNTIF(JSON_QUERY(TO_JSON_STRING(payload), '$._valid-head.invalidHead') = 'true'), COUNT(DISTINCT page)) AS pct_invalid_head
FROM
`httparchive.crawl.pages`
WHERE
date = '2025-07-01'
AND is_root_page = TRUE
GROUP BY
client
ORDER BY
client
# Per client: how many root pages the `valid-head` custom metric flags as
# having an invalid <head>, how many list at least one invalid element, and
# the share of pages with an invalid head.
WITH head_flags AS (
  # Evaluate each JSON predicate once per page so the aggregates below can
  # reuse it. A missing value yields NULL, which COUNTIF treats as "not true".
  SELECT
    client,
    page,
    JSON_VALUE(custom_metrics.other.`valid-head`.invalidHead) = 'true' AS has_invalid_head,
    ARRAY_LENGTH(JSON_QUERY_ARRAY(custom_metrics.other.`valid-head`.invalidElements)) > 0 AS has_invalid_elements
  FROM
    `httparchive.crawl.pages`
  WHERE
    date = '2025-07-01' AND
    is_root_page = TRUE
)

SELECT
  client,
  COUNT(DISTINCT page) AS total_pages,
  COUNTIF(has_invalid_head) AS pages_with_invalid_head,
  COUNTIF(has_invalid_elements) AS pages_with_invalid_elements,
  SAFE_DIVIDE(COUNTIF(has_invalid_head), COUNT(DISTINCT page)) AS pct_invalid_head
FROM
  head_flags
GROUP BY
  client
ORDER BY
  client

Comment on lines +1 to +42
#standardSQL
# Unused CSS and JS

SELECT
client,
rank_grouping,
CASE
WHEN rank_grouping = 100000000 THEN 'all'
ELSE FORMAT("%'d", rank_grouping)
END AS ranking,
COUNT(DISTINCT page) AS pages,
SUM(unused_javascript) / COUNT(DISTINCT page) AS unused_javascript_kib_avg,
SUM(unused_css_rules) / COUNT(DISTINCT page) AS unused_css_rules_kib_avg
FROM (
SELECT
client,
page,
rank
FROM `httparchive.crawl.pages`
WHERE date = '2025-07-01'
)
LEFT JOIN (
SELECT
client,
page,
SAFE_DIVIDE(
CAST(JSON_EXTRACT_SCALAR(TO_JSON_STRING(lighthouse),
'$.audits["unused-javascript"].details.overallSavingsBytes') AS INT64),
1024
) AS unused_javascript,
SAFE_DIVIDE(
CAST(JSON_EXTRACT_SCALAR(TO_JSON_STRING(lighthouse),
'$.audits["unused-css-rules"].details.overallSavingsBytes') AS INT64),
1024
) AS unused_css_rules
FROM `httparchive.crawl.pages`
WHERE date = '2025-07-01'
)
USING (client, page),
UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
WHERE rank <= rank_grouping
GROUP BY client, rank_grouping
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
#standardSQL
# Unused CSS and JS
SELECT
client,
rank_grouping,
CASE
WHEN rank_grouping = 100000000 THEN 'all'
ELSE FORMAT("%'d", rank_grouping)
END AS ranking,
COUNT(DISTINCT page) AS pages,
SUM(unused_javascript) / COUNT(DISTINCT page) AS unused_javascript_kib_avg,
SUM(unused_css_rules) / COUNT(DISTINCT page) AS unused_css_rules_kib_avg
FROM (
SELECT
client,
page,
rank
FROM `httparchive.crawl.pages`
WHERE date = '2025-07-01'
)
LEFT JOIN (
SELECT
client,
page,
SAFE_DIVIDE(
CAST(JSON_EXTRACT_SCALAR(TO_JSON_STRING(lighthouse),
'$.audits["unused-javascript"].details.overallSavingsBytes') AS INT64),
1024
) AS unused_javascript,
SAFE_DIVIDE(
CAST(JSON_EXTRACT_SCALAR(TO_JSON_STRING(lighthouse),
'$.audits["unused-css-rules"].details.overallSavingsBytes') AS INT64),
1024
) AS unused_css_rules
FROM `httparchive.crawl.pages`
WHERE date = '2025-07-01'
)
USING (client, page),
UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
WHERE rank <= rank_grouping
GROUP BY client, rank_grouping
#standardSQL
# Unused CSS and JS
# Average unused JavaScript and unused CSS per page, in KiB, by client and
# rank bucket. Values come from `details.overallSavingsBytes` of the
# Lighthouse `unused-javascript` and `unused-css-rules` audits.
WITH page_savings AS (
  # Read the crawl once. The previous version joined the table to itself on
  # (client, page) only to fetch columns that live on the same row.
  # NOTE(review): assumes one row per (client, page) for the crawl date, which
  # the self-join also relied on to avoid double counting -- confirm.
  SELECT
    client,
    page,
    rank,
    # SAFE.INT64 yields NULL rather than an error when the value is missing or
    # is not an integer; dividing by 1024 converts bytes to KiB.
    SAFE_DIVIDE(SAFE.INT64(lighthouse.audits.`unused-javascript`.details.overallSavingsBytes), 1024) AS unused_javascript,
    SAFE_DIVIDE(SAFE.INT64(lighthouse.audits.`unused-css-rules`.details.overallSavingsBytes), 1024) AS unused_css_rules
  FROM
    `httparchive.crawl.pages`
  WHERE
    date = '2025-07-01'
)

SELECT
  client,
  rank_grouping,
  CASE
    WHEN rank_grouping = 100000000 THEN 'all'
    ELSE FORMAT("%'d", rank_grouping)
  END AS ranking,
  COUNT(DISTINCT page) AS pages,
  # Pages without an audit value add nothing to the SUM but are still part of
  # the denominator, exactly as before.
  SUM(unused_javascript) / COUNT(DISTINCT page) AS unused_javascript_kib_avg,
  SUM(unused_css_rules) / COUNT(DISTINCT page) AS unused_css_rules_kib_avg
FROM
  page_savings
CROSS JOIN
  # Each page is counted in every rank bucket it falls into (cumulative buckets).
  UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
WHERE
  rank <= rank_grouping
GROUP BY
  client,
  rank_grouping
ORDER BY
  # Include client so the row order is deterministic within a rank bucket.
  rank_grouping,
  client;

Comment on lines +1 to +3
#standardSQL
# Validation query to check what's in the payload for invalid head elements
# This will help us understand the actual structure
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this validation query is needed anymore. Can we remove it from this PR?

Comment on lines +1 to +50
#standardSQL
# Videos per page
# returns all the data we need from _almanac

CREATE TEMPORARY FUNCTION getVideosAlmanacInfo(almanac_json JSON)
RETURNS STRUCT<
videos_total INT64
> LANGUAGE js AS '''
var result = {
videos_total: 0
};
try {
var almanac = almanac_json;

if (Array.isArray(almanac) || typeof almanac != 'object') return result;

if (almanac.videos && almanac.videos.total) {
result.videos_total = almanac.videos.total;
}
} catch (e) {}
return result;
''';

SELECT
percentile,
client,
COUNT(DISTINCT page) AS total,

# videos per page
APPROX_QUANTILES(video_almanac_info.videos_total, 1000)[OFFSET(percentile * 10)] AS videos_count

FROM (
SELECT
client AS client,
percentile,
page,
getVideosAlmanacInfo(TO_JSON(custom_metrics.other.almanac)) AS video_almanac_info
FROM
`httparchive.crawl.pages`,
UNNEST([10, 25, 50, 75, 90]) AS percentile
WHERE date = '2025-07-01'
)
WHERE
video_almanac_info.videos_total > 0
GROUP BY
percentile,
client
ORDER BY
percentile,
client
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
#standardSQL
# Videos per page
# returns all the data we need from _almanac
CREATE TEMPORARY FUNCTION getVideosAlmanacInfo(almanac_json JSON)
RETURNS STRUCT<
videos_total INT64
> LANGUAGE js AS '''
var result = {
videos_total: 0
};
try {
var almanac = almanac_json;
if (Array.isArray(almanac) || typeof almanac != 'object') return result;
if (almanac.videos && almanac.videos.total) {
result.videos_total = almanac.videos.total;
}
} catch (e) {}
return result;
''';
SELECT
percentile,
client,
COUNT(DISTINCT page) AS total,
# videos per page
APPROX_QUANTILES(video_almanac_info.videos_total, 1000)[OFFSET(percentile * 10)] AS videos_count
FROM (
SELECT
client AS client,
percentile,
page,
getVideosAlmanacInfo(TO_JSON(custom_metrics.other.almanac)) AS video_almanac_info
FROM
`httparchive.crawl.pages`,
UNNEST([10, 25, 50, 75, 90]) AS percentile
WHERE date = '2025-07-01'
)
WHERE
video_almanac_info.videos_total > 0
GROUP BY
percentile,
client
ORDER BY
percentile,
client
#standardSQL
# Videos per page
# Percentiles of the number of videos per page, per client, restricted to
# pages that report at least one video in the `almanac.videos` custom metric.

# Extracts `videos.total` from the almanac `videos` object.
# Returns videos_total = 0 when the input is not a plain object, when `total`
# is missing or falsy, or when anything throws while reading it.
CREATE TEMPORARY FUNCTION getVideosInfo(videos JSON)
RETURNS STRUCT<
  videos_total INT64
> LANGUAGE js AS '''
var result = {
  videos_total: 0
};
try {
  // Arrays and primitives cannot carry a `total` property we care about.
  if (Array.isArray(videos) || typeof videos != 'object') return result;
  // `typeof null` is 'object', so null is screened out here instead.
  if (videos && videos.total) {
    result.videos_total = videos.total;
  }
} catch (e) {
  // Best effort: malformed input falls back to the zero default.
}
return result;
''';

SELECT
  percentile,
  client,
  COUNT(DISTINCT page) AS total,
  # videos per page
  APPROX_QUANTILES(video_almanac_info.videos_total, 1000)[OFFSET(percentile * 10)] AS videos_count
FROM (
  SELECT
    client AS client,
    percentile,
    page,
    # The field is already JSON, so it is passed straight through; wrapping it
    # in TO_JSON was a redundant cast.
    getVideosInfo(custom_metrics.other.almanac.videos) AS video_almanac_info
  FROM
    `httparchive.crawl.pages`,
    UNNEST([10, 25, 50, 75, 90]) AS percentile
  WHERE
    date = '2025-07-01'
)
WHERE
  video_almanac_info.videos_total > 0
GROUP BY
  percentile,
  client
ORDER BY
  percentile,
  client

Comment on lines +4 to +60
WITH plugin_counts AS (
SELECT
t.technology AS plugin,
p.is_root_page,
p.client,
COUNT(DISTINCT p.page) AS site_count
FROM `httparchive.crawl.pages` AS p,
UNNEST(p.technologies) AS t,
UNNEST(t.categories) AS cat
WHERE
p.date = '2025-07-01'
AND cat IN ('seo', 'SEO')
AND is_root_page = TRUE
GROUP BY
t.technology,
p.is_root_page,
p.client
),
filtered_total_counts AS (
SELECT
is_root_page,
client,
SUM(site_count) AS total_filtered_sites
FROM plugin_counts
GROUP BY
is_root_page,
client
),
overall_total_counts AS (
SELECT
client,
COUNT(DISTINCT page) AS total_all_sites
FROM `httparchive.crawl.pages`
WHERE
date = '2025-07-01'
AND is_root_page = TRUE
GROUP BY
client
)
SELECT
pc.plugin,
pc.client,
pc.site_count,
ftc.total_filtered_sites AS total_seo_sites,
otc.total_all_sites,
ROUND(SAFE_DIVIDE(pc.site_count, ftc.total_filtered_sites), 4) AS pct_of_seo_sites,
ROUND(SAFE_DIVIDE(pc.site_count, otc.total_all_sites), 4) AS pct_of_all_sites
FROM plugin_counts pc
JOIN filtered_total_counts ftc
ON pc.is_root_page = ftc.is_root_page
AND pc.client = ftc.client
JOIN overall_total_counts otc
ON pc.client = otc.client
ORDER BY
pc.is_root_page,
pc.client,
pc.site_count DESC;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
WITH plugin_counts AS (
SELECT
t.technology AS plugin,
p.is_root_page,
p.client,
COUNT(DISTINCT p.page) AS site_count
FROM `httparchive.crawl.pages` AS p,
UNNEST(p.technologies) AS t,
UNNEST(t.categories) AS cat
WHERE
p.date = '2025-07-01'
AND cat IN ('seo', 'SEO')
AND is_root_page = TRUE
GROUP BY
t.technology,
p.is_root_page,
p.client
),
filtered_total_counts AS (
SELECT
is_root_page,
client,
SUM(site_count) AS total_filtered_sites
FROM plugin_counts
GROUP BY
is_root_page,
client
),
overall_total_counts AS (
SELECT
client,
COUNT(DISTINCT page) AS total_all_sites
FROM `httparchive.crawl.pages`
WHERE
date = '2025-07-01'
AND is_root_page = TRUE
GROUP BY
client
)
SELECT
pc.plugin,
pc.client,
pc.site_count,
ftc.total_filtered_sites AS total_seo_sites,
otc.total_all_sites,
ROUND(SAFE_DIVIDE(pc.site_count, ftc.total_filtered_sites), 4) AS pct_of_seo_sites,
ROUND(SAFE_DIVIDE(pc.site_count, otc.total_all_sites), 4) AS pct_of_all_sites
FROM plugin_counts pc
JOIN filtered_total_counts ftc
ON pc.is_root_page = ftc.is_root_page
AND pc.client = ftc.client
JOIN overall_total_counts otc
ON pc.client = otc.client
ORDER BY
pc.is_root_page,
pc.client,
pc.site_count DESC;
# SEO technology adoption on root pages, per client.
# For each technology in the `seo` category this reports the number of pages
# using it, and that number as a share of (a) pages using any SEO technology
# and (b) all root pages.
WITH seo_pages AS (
  # One row per (client, page, SEO technology). DISTINCT collapses duplicates
  # produced by unnesting several categories of the same technology.
  SELECT DISTINCT
    p.client,
    p.is_root_page,
    p.page,
    t.technology AS plugin
  FROM
    `httparchive.crawl.pages` AS p
  CROSS JOIN
    UNNEST(p.technologies) AS t
  CROSS JOIN
    UNNEST(t.categories) AS cat
  WHERE
    p.date = '2025-07-01' AND
    LOWER(cat) = 'seo' AND
    p.is_root_page = TRUE
),

plugin_counts AS (
  SELECT
    plugin,
    is_root_page,
    client,
    COUNT(DISTINCT page) AS site_count
  FROM
    seo_pages
  GROUP BY
    plugin,
    is_root_page,
    client
),

filtered_total_counts AS (
  # Count each page once, however many SEO technologies it runs. Summing
  # plugin_counts.site_count would count a page with two SEO tools twice and
  # overstate the number of "SEO sites".
  SELECT
    is_root_page,
    client,
    COUNT(DISTINCT page) AS total_filtered_sites
  FROM
    seo_pages
  GROUP BY
    is_root_page,
    client
),

overall_total_counts AS (
  SELECT
    client,
    COUNT(DISTINCT page) AS total_all_sites
  FROM
    `httparchive.crawl.pages`
  WHERE
    date = '2025-07-01' AND
    is_root_page = TRUE
  GROUP BY
    client
)

SELECT
  pc.plugin,
  pc.client,
  pc.site_count,
  ftc.total_filtered_sites AS total_seo_sites,
  otc.total_all_sites,
  # Shares can add up to more than 1 per client because a page may use
  # several SEO technologies.
  ROUND(SAFE_DIVIDE(pc.site_count, ftc.total_filtered_sites), 4) AS pct_of_seo_sites,
  ROUND(SAFE_DIVIDE(pc.site_count, otc.total_all_sites), 4) AS pct_of_all_sites
FROM
  plugin_counts AS pc
INNER JOIN
  filtered_total_counts AS ftc
ON
  pc.is_root_page = ftc.is_root_page AND
  pc.client = ftc.client
INNER JOIN
  overall_total_counts AS otc
ON
  pc.client = otc.client
ORDER BY
  pc.is_root_page,
  pc.client,
  pc.site_count DESC;

Copy link
Member

@tunetheweb tunetheweb left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Most of the queries look good, but they can be simplified somewhat now that the columns are JSON. Where that is the case, I've rewritten the query and added it as a suggestion.

@tunetheweb tunetheweb mentioned this pull request Nov 27, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

analysis Querying the dataset

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants