-
-
Notifications
You must be signed in to change notification settings - Fork 200
SEO 2025 SQL #4272
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
SEO 2025 SQL #4272
Conversation
tunetheweb
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Most of the queries look good, but they can be simplified somewhat now that the columns are JSON. Where that is the case, I've rewritten the query and left it as a suggestion.
| CREATE TEMPORARY FUNCTION getRelStatsWptBodies(wpt_bodies_json JSON) | ||
| RETURNS STRUCT< | ||
| rel ARRAY<STRING> | ||
| > LANGUAGE js AS ''' | ||
| var result = {rel: []}; | ||
| // Function to retrieve only keys if value is >0 | ||
| function getKey(dict){ | ||
| const arr = [], | ||
| obj = Object.keys(dict); | ||
| for (var x in obj){ | ||
| if(dict[obj[x]] > 0){ | ||
| arr.push(obj[x]); | ||
| } | ||
| } | ||
| return arr; | ||
| } | ||
| try { | ||
| var wpt_bodies = wpt_bodies_json; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think we need a second variable here; the parameter itself can be named `wpt_bodies`.
| CREATE TEMPORARY FUNCTION getRelStatsWptBodies(wpt_bodies_json JSON) | |
| RETURNS STRUCT< | |
| rel ARRAY<STRING> | |
| > LANGUAGE js AS ''' | |
| var result = {rel: []}; | |
| // Function to retrieve only keys if value is >0 | |
| function getKey(dict){ | |
| const arr = [], | |
| obj = Object.keys(dict); | |
| for (var x in obj){ | |
| if(dict[obj[x]] > 0){ | |
| arr.push(obj[x]); | |
| } | |
| } | |
| return arr; | |
| } | |
| try { | |
| var wpt_bodies = wpt_bodies_json; | |
| CREATE TEMPORARY FUNCTION getRelStatsWptBodies(wpt_bodies JSON) | |
| RETURNS STRUCT< | |
| rel ARRAY<STRING> | |
| > LANGUAGE js AS ''' | |
| var result = {rel: []}; | |
| // Function to retrieve only keys if value is >0 | |
| function getKey(dict){ | |
| const arr = [], | |
| obj = Object.keys(dict); | |
| for (var x in obj){ | |
| if(dict[obj[x]] > 0){ | |
| arr.push(obj[x]); | |
| } | |
| } | |
| return arr; | |
| } | |
| try { |
| ELSE 'No Assigned Page' | ||
| END | ||
| AS is_root_page, | ||
| getRelStatsWptBodies(TO_JSON(custom_metrics.wpt_bodies)) AS wpt_bodies_info |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's already JSON, so we can remove the unnecessary `TO_JSON` cast:
| getRelStatsWptBodies(TO_JSON(custom_metrics.wpt_bodies)) AS wpt_bodies_info | |
| getRelStatsWptBodies(custom_metrics.wpt_bodies) AS wpt_bodies_info |
| CREATE TEMPORARY FUNCTION getContentLanguagesAlmanac(almanac_json JSON) | ||
| RETURNS ARRAY<STRING> | ||
| LANGUAGE js AS ''' | ||
| var result = []; | ||
| try { | ||
| var almanac = almanac_json; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| CREATE TEMPORARY FUNCTION getContentLanguagesAlmanac(almanac_json JSON) | |
| RETURNS ARRAY<STRING> | |
| LANGUAGE js AS ''' | |
| var result = []; | |
| try { | |
| var almanac = almanac_json; | |
| CREATE TEMPORARY FUNCTION getContentLanguagesAlmanac(almanac JSON) | |
| RETURNS ARRAY<STRING> | |
| LANGUAGE js AS ''' | |
| var result = []; | |
| try { |
| WHEN is_root_page = TRUE THEN 'Homepage' | ||
| ELSE 'No Assigned Page' | ||
| END AS is_root_page, | ||
| getContentLanguagesAlmanac(TO_JSON(custom_metrics.other.almanac)) AS content_languages |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| getContentLanguagesAlmanac(TO_JSON(custom_metrics.other.almanac)) AS content_languages | |
| getContentLanguagesAlmanac(custom_metrics.other.almanac) AS content_languages |
| CREATE TEMPORARY FUNCTION getLinkDesciptionsWptBodies(wpt_bodies_json JSON) | ||
| RETURNS STRUCT< | ||
| links_same_site INT64, | ||
| links_window_location INT64, | ||
| links_window_open INT64, | ||
| links_href_javascript INT64 | ||
| > LANGUAGE js AS ''' | ||
| var result = { | ||
| links_same_site: 0, | ||
| links_window_location: 0, | ||
| links_window_open: 0, | ||
| links_href_javascript: 0 | ||
| }; | ||
| try { | ||
| var wpt_bodies = wpt_bodies_json; | ||
|
|
||
| if (Array.isArray(wpt_bodies) || typeof wpt_bodies != 'object') return result; | ||
|
|
||
| if (wpt_bodies.anchors && wpt_bodies.anchors.rendered) { | ||
| var anchors_rendered = wpt_bodies.anchors.rendered; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| CREATE TEMPORARY FUNCTION getLinkDesciptionsWptBodies(wpt_bodies_json JSON) | |
| RETURNS STRUCT< | |
| links_same_site INT64, | |
| links_window_location INT64, | |
| links_window_open INT64, | |
| links_href_javascript INT64 | |
| > LANGUAGE js AS ''' | |
| var result = { | |
| links_same_site: 0, | |
| links_window_location: 0, | |
| links_window_open: 0, | |
| links_href_javascript: 0 | |
| }; | |
| try { | |
| var wpt_bodies = wpt_bodies_json; | |
| if (Array.isArray(wpt_bodies) || typeof wpt_bodies != 'object') return result; | |
| if (wpt_bodies.anchors && wpt_bodies.anchors.rendered) { | |
| var anchors_rendered = wpt_bodies.anchors.rendered; | |
| CREATE TEMPORARY FUNCTION getLinkDesciptionsWptBodies(anchors JSON) | |
| RETURNS STRUCT< | |
| links_same_site INT64, | |
| links_window_location INT64, | |
| links_window_open INT64, | |
| links_href_javascript INT64 | |
| > LANGUAGE js AS ''' | |
| var result = { | |
| links_same_site: 0, | |
| links_window_location: 0, | |
| links_window_open: 0, | |
| links_href_javascript: 0 | |
| }; | |
| try { | |
| if (Array.isArray(anchors) || typeof anchors != 'object') return result; | |
| if (anchors && anchors.rendered) { | |
| var anchors_rendered = anchors.rendered; |
| SELECT | ||
| client, | ||
| COUNT(DISTINCT page) AS total_pages, | ||
| COUNTIF(JSON_QUERY(TO_JSON_STRING(payload), '$._valid-head.invalidHead') = 'true') AS pages_with_invalid_head, | ||
| COUNTIF(ARRAY_LENGTH(JSON_EXTRACT_ARRAY(TO_JSON_STRING(payload), '$._valid-head.invalidElements')) > 0) AS pages_with_invalid_elements, | ||
| SAFE_DIVIDE(COUNTIF(JSON_QUERY(TO_JSON_STRING(payload), '$._valid-head.invalidHead') = 'true'), COUNT(DISTINCT page)) AS pct_invalid_head | ||
| FROM | ||
| `httparchive.crawl.pages` | ||
| WHERE | ||
| date = '2025-07-01' | ||
| AND is_root_page = TRUE | ||
| GROUP BY | ||
| client | ||
| ORDER BY | ||
| client No newline at end of file |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| SELECT | |
| client, | |
| COUNT(DISTINCT page) AS total_pages, | |
| COUNTIF(JSON_QUERY(TO_JSON_STRING(payload), '$._valid-head.invalidHead') = 'true') AS pages_with_invalid_head, | |
| COUNTIF(ARRAY_LENGTH(JSON_EXTRACT_ARRAY(TO_JSON_STRING(payload), '$._valid-head.invalidElements')) > 0) AS pages_with_invalid_elements, | |
| SAFE_DIVIDE(COUNTIF(JSON_QUERY(TO_JSON_STRING(payload), '$._valid-head.invalidHead') = 'true'), COUNT(DISTINCT page)) AS pct_invalid_head | |
| FROM | |
| `httparchive.crawl.pages` | |
| WHERE | |
| date = '2025-07-01' | |
| AND is_root_page = TRUE | |
| GROUP BY | |
| client | |
| ORDER BY | |
| client | |
| # Per client: root pages whose `valid-head` custom metric reports an invalid <head> or invalid elements. | |
| WITH head_checks AS ( | |
| SELECT | |
| client, | |
| page, | |
| JSON_VALUE(custom_metrics.other.`valid-head`.invalidHead) = 'true' AS has_invalid_head, | |
| ARRAY_LENGTH(JSON_QUERY_ARRAY(custom_metrics.other.`valid-head`.invalidElements)) > 0 AS has_invalid_elements | |
| FROM | |
| `httparchive.crawl.pages` | |
| WHERE | |
| date = '2025-07-01' | |
| AND is_root_page = TRUE | |
| ) | |
| SELECT | |
| client, | |
| COUNT(DISTINCT page) AS total_pages, | |
| COUNTIF(has_invalid_head) AS pages_with_invalid_head, | |
| COUNTIF(has_invalid_elements) AS pages_with_invalid_elements, | |
| SAFE_DIVIDE(COUNTIF(has_invalid_head), COUNT(DISTINCT page)) AS pct_invalid_head | |
| FROM | |
| head_checks | |
| GROUP BY | |
| client | |
| ORDER BY | |
| client | |
| #standardSQL | ||
| # Unused CSS and JS | ||
|
|
||
| SELECT | ||
| client, | ||
| rank_grouping, | ||
| CASE | ||
| WHEN rank_grouping = 100000000 THEN 'all' | ||
| ELSE FORMAT("%'d", rank_grouping) | ||
| END AS ranking, | ||
| COUNT(DISTINCT page) AS pages, | ||
| SUM(unused_javascript) / COUNT(DISTINCT page) AS unused_javascript_kib_avg, | ||
| SUM(unused_css_rules) / COUNT(DISTINCT page) AS unused_css_rules_kib_avg | ||
| FROM ( | ||
| SELECT | ||
| client, | ||
| page, | ||
| rank | ||
| FROM `httparchive.crawl.pages` | ||
| WHERE date = '2025-07-01' | ||
| ) | ||
| LEFT JOIN ( | ||
| SELECT | ||
| client, | ||
| page, | ||
| SAFE_DIVIDE( | ||
| CAST(JSON_EXTRACT_SCALAR(TO_JSON_STRING(lighthouse), | ||
| '$.audits["unused-javascript"].details.overallSavingsBytes') AS INT64), | ||
| 1024 | ||
| ) AS unused_javascript, | ||
| SAFE_DIVIDE( | ||
| CAST(JSON_EXTRACT_SCALAR(TO_JSON_STRING(lighthouse), | ||
| '$.audits["unused-css-rules"].details.overallSavingsBytes') AS INT64), | ||
| 1024 | ||
| ) AS unused_css_rules | ||
| FROM `httparchive.crawl.pages` | ||
| WHERE date = '2025-07-01' | ||
| ) | ||
| USING (client, page), | ||
| UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping | ||
| WHERE rank <= rank_grouping | ||
| GROUP BY client, rank_grouping |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| #standardSQL | |
| # Unused CSS and JS | |
| SELECT | |
| client, | |
| rank_grouping, | |
| CASE | |
| WHEN rank_grouping = 100000000 THEN 'all' | |
| ELSE FORMAT("%'d", rank_grouping) | |
| END AS ranking, | |
| COUNT(DISTINCT page) AS pages, | |
| SUM(unused_javascript) / COUNT(DISTINCT page) AS unused_javascript_kib_avg, | |
| SUM(unused_css_rules) / COUNT(DISTINCT page) AS unused_css_rules_kib_avg | |
| FROM ( | |
| SELECT | |
| client, | |
| page, | |
| rank | |
| FROM `httparchive.crawl.pages` | |
| WHERE date = '2025-07-01' | |
| ) | |
| LEFT JOIN ( | |
| SELECT | |
| client, | |
| page, | |
| SAFE_DIVIDE( | |
| CAST(JSON_EXTRACT_SCALAR(TO_JSON_STRING(lighthouse), | |
| '$.audits["unused-javascript"].details.overallSavingsBytes') AS INT64), | |
| 1024 | |
| ) AS unused_javascript, | |
| SAFE_DIVIDE( | |
| CAST(JSON_EXTRACT_SCALAR(TO_JSON_STRING(lighthouse), | |
| '$.audits["unused-css-rules"].details.overallSavingsBytes') AS INT64), | |
| 1024 | |
| ) AS unused_css_rules | |
| FROM `httparchive.crawl.pages` | |
| WHERE date = '2025-07-01' | |
| ) | |
| USING (client, page), | |
| UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping | |
| WHERE rank <= rank_grouping | |
| GROUP BY client, rank_grouping | |
| #standardSQL | |
| # Unused CSS and JS | |
| # Average KiB of unused JavaScript / CSS per page (Lighthouse overallSavingsBytes / 1024), | |
| # by client and cumulative rank bucket. | |
| SELECT | |
| client, | |
| rank_grouping, | |
| CASE | |
| WHEN rank_grouping = 100000000 THEN 'all' | |
| ELSE FORMAT("%'d", rank_grouping) | |
| END AS ranking, | |
| COUNT(DISTINCT page) AS pages, | |
| SUM(unused_javascript) / COUNT(DISTINCT page) AS unused_javascript_kib_avg, | |
| SUM(unused_css_rules) / COUNT(DISTINCT page) AS unused_css_rules_kib_avg | |
| FROM | |
| ( | |
| # rank and both Lighthouse audits live on the same row, so a single scan of the | |
| # table suffices; no self-join on (client, page) is needed. | |
| SELECT | |
| client, | |
| page, | |
| rank, | |
| # SAFE.INT64 yields NULL (instead of failing the query) when the audit value is absent or not an integer. | |
| SAFE_DIVIDE(SAFE.INT64(lighthouse.audits.`unused-javascript`.details.overallSavingsBytes), 1024) AS unused_javascript, | |
| SAFE_DIVIDE(SAFE.INT64(lighthouse.audits.`unused-css-rules`.details.overallSavingsBytes), 1024) AS unused_css_rules | |
| FROM | |
| `httparchive.crawl.pages` | |
| WHERE | |
| date = '2025-07-01' | |
| ), | |
| # Each page is counted in every bucket whose limit is >= its rank (buckets are cumulative). | |
| UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping | |
| WHERE | |
| rank <= rank_grouping | |
| GROUP BY | |
| client, | |
| rank_grouping | |
| ORDER BY | |
| rank_grouping, | |
| client; |
| #standardSQL | ||
| # Validation query to check what's in the payload for invalid head elements | ||
| # This will help us understand the actual structure |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think this validation query is needed anymore. Can we remove it from this PR?
| #standardSQL | ||
| # Videos per page | ||
| # returns all the data we need from _almanac | ||
|
|
||
| CREATE TEMPORARY FUNCTION getVideosAlmanacInfo(almanac_json JSON) | ||
| RETURNS STRUCT< | ||
| videos_total INT64 | ||
| > LANGUAGE js AS ''' | ||
| var result = { | ||
| videos_total: 0 | ||
| }; | ||
| try { | ||
| var almanac = almanac_json; | ||
|
|
||
| if (Array.isArray(almanac) || typeof almanac != 'object') return result; | ||
|
|
||
| if (almanac.videos && almanac.videos.total) { | ||
| result.videos_total = almanac.videos.total; | ||
| } | ||
| } catch (e) {} | ||
| return result; | ||
| '''; | ||
|
|
||
| SELECT | ||
| percentile, | ||
| client, | ||
| COUNT(DISTINCT page) AS total, | ||
|
|
||
| # videos per page | ||
| APPROX_QUANTILES(video_almanac_info.videos_total, 1000)[OFFSET(percentile * 10)] AS videos_count | ||
|
|
||
| FROM ( | ||
| SELECT | ||
| client AS client, | ||
| percentile, | ||
| page, | ||
| getVideosAlmanacInfo(TO_JSON(custom_metrics.other.almanac)) AS video_almanac_info | ||
| FROM | ||
| `httparchive.crawl.pages`, | ||
| UNNEST([10, 25, 50, 75, 90]) AS percentile | ||
| WHERE date = '2025-07-01' | ||
| ) | ||
| WHERE | ||
| video_almanac_info.videos_total > 0 | ||
| GROUP BY | ||
| percentile, | ||
| client | ||
| ORDER BY | ||
| percentile, | ||
| client |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| #standardSQL | |
| # Videos per page | |
| # returns all the data we need from _almanac | |
| CREATE TEMPORARY FUNCTION getVideosAlmanacInfo(almanac_json JSON) | |
| RETURNS STRUCT< | |
| videos_total INT64 | |
| > LANGUAGE js AS ''' | |
| var result = { | |
| videos_total: 0 | |
| }; | |
| try { | |
| var almanac = almanac_json; | |
| if (Array.isArray(almanac) || typeof almanac != 'object') return result; | |
| if (almanac.videos && almanac.videos.total) { | |
| result.videos_total = almanac.videos.total; | |
| } | |
| } catch (e) {} | |
| return result; | |
| '''; | |
| SELECT | |
| percentile, | |
| client, | |
| COUNT(DISTINCT page) AS total, | |
| # videos per page | |
| APPROX_QUANTILES(video_almanac_info.videos_total, 1000)[OFFSET(percentile * 10)] AS videos_count | |
| FROM ( | |
| SELECT | |
| client AS client, | |
| percentile, | |
| page, | |
| getVideosAlmanacInfo(TO_JSON(custom_metrics.other.almanac)) AS video_almanac_info | |
| FROM | |
| `httparchive.crawl.pages`, | |
| UNNEST([10, 25, 50, 75, 90]) AS percentile | |
| WHERE date = '2025-07-01' | |
| ) | |
| WHERE | |
| video_almanac_info.videos_total > 0 | |
| GROUP BY | |
| percentile, | |
| client | |
| ORDER BY | |
| percentile, | |
| client | |
| #standardSQL | |
| # Videos per page | |
| # getVideosInfo: copies `total` from the videos JSON object into videos_total. | |
| # Returns videos_total = 0 when the input is null, an array, not an object, or `total` is missing/falsy. | |
| CREATE TEMPORARY FUNCTION getVideosInfo(videos JSON) | |
| RETURNS STRUCT< | |
| videos_total INT64 | |
| > LANGUAGE js AS ''' | |
| const info = {videos_total: 0}; | |
| try { | |
| const isPlainObject = videos !== null && typeof videos === 'object' && !Array.isArray(videos); | |
| if (isPlainObject && videos.total) { | |
| info.videos_total = videos.total; | |
| } | |
| } catch (e) { | |
| // Best effort: on any error fall through and return the zero default. | |
| } | |
| return info; | |
| '''; | |
| # Distribution (10th-90th percentile) of videos per page, per client, | |
| # restricted to pages that report at least one video. | |
| SELECT | |
| percentile, | |
| client, | |
| COUNT(DISTINCT page) AS total, | |
| # videos per page | |
| APPROX_QUANTILES(videos_info.videos_total, 1000)[OFFSET(percentile * 10)] AS videos_count | |
| FROM ( | |
| SELECT | |
| client, | |
| percentile, | |
| page, | |
| # custom_metrics.other is already JSON, so the sub-field is passed straight through (no TO_JSON cast). | |
| getVideosInfo(custom_metrics.other.almanac.videos) AS videos_info | |
| FROM | |
| `httparchive.crawl.pages`, | |
| # Cross join so every page row is repeated once per requested percentile. | |
| UNNEST([10, 25, 50, 75, 90]) AS percentile | |
| WHERE | |
| date = '2025-07-01' | |
| ) | |
| WHERE | |
| videos_info.videos_total > 0 | |
| GROUP BY | |
| percentile, | |
| client | |
| ORDER BY | |
| percentile, | |
| client |
| WITH plugin_counts AS ( | ||
| SELECT | ||
| t.technology AS plugin, | ||
| p.is_root_page, | ||
| p.client, | ||
| COUNT(DISTINCT p.page) AS site_count | ||
| FROM `httparchive.crawl.pages` AS p, | ||
| UNNEST(p.technologies) AS t, | ||
| UNNEST(t.categories) AS cat | ||
| WHERE | ||
| p.date = '2025-07-01' | ||
| AND cat IN ('seo', 'SEO') | ||
| AND is_root_page = TRUE | ||
| GROUP BY | ||
| t.technology, | ||
| p.is_root_page, | ||
| p.client | ||
| ), | ||
| filtered_total_counts AS ( | ||
| SELECT | ||
| is_root_page, | ||
| client, | ||
| SUM(site_count) AS total_filtered_sites | ||
| FROM plugin_counts | ||
| GROUP BY | ||
| is_root_page, | ||
| client | ||
| ), | ||
| overall_total_counts AS ( | ||
| SELECT | ||
| client, | ||
| COUNT(DISTINCT page) AS total_all_sites | ||
| FROM `httparchive.crawl.pages` | ||
| WHERE | ||
| date = '2025-07-01' | ||
| AND is_root_page = TRUE | ||
| GROUP BY | ||
| client | ||
| ) | ||
| SELECT | ||
| pc.plugin, | ||
| pc.client, | ||
| pc.site_count, | ||
| ftc.total_filtered_sites AS total_seo_sites, | ||
| otc.total_all_sites, | ||
| ROUND(SAFE_DIVIDE(pc.site_count, ftc.total_filtered_sites), 4) AS pct_of_seo_sites, | ||
| ROUND(SAFE_DIVIDE(pc.site_count, otc.total_all_sites), 4) AS pct_of_all_sites | ||
| FROM plugin_counts pc | ||
| JOIN filtered_total_counts ftc | ||
| ON pc.is_root_page = ftc.is_root_page | ||
| AND pc.client = ftc.client | ||
| JOIN overall_total_counts otc | ||
| ON pc.client = otc.client | ||
| ORDER BY | ||
| pc.is_root_page, | ||
| pc.client, | ||
| pc.site_count DESC; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| WITH plugin_counts AS ( | |
| SELECT | |
| t.technology AS plugin, | |
| p.is_root_page, | |
| p.client, | |
| COUNT(DISTINCT p.page) AS site_count | |
| FROM `httparchive.crawl.pages` AS p, | |
| UNNEST(p.technologies) AS t, | |
| UNNEST(t.categories) AS cat | |
| WHERE | |
| p.date = '2025-07-01' | |
| AND cat IN ('seo', 'SEO') | |
| AND is_root_page = TRUE | |
| GROUP BY | |
| t.technology, | |
| p.is_root_page, | |
| p.client | |
| ), | |
| filtered_total_counts AS ( | |
| SELECT | |
| is_root_page, | |
| client, | |
| SUM(site_count) AS total_filtered_sites | |
| FROM plugin_counts | |
| GROUP BY | |
| is_root_page, | |
| client | |
| ), | |
| overall_total_counts AS ( | |
| SELECT | |
| client, | |
| COUNT(DISTINCT page) AS total_all_sites | |
| FROM `httparchive.crawl.pages` | |
| WHERE | |
| date = '2025-07-01' | |
| AND is_root_page = TRUE | |
| GROUP BY | |
| client | |
| ) | |
| SELECT | |
| pc.plugin, | |
| pc.client, | |
| pc.site_count, | |
| ftc.total_filtered_sites AS total_seo_sites, | |
| otc.total_all_sites, | |
| ROUND(SAFE_DIVIDE(pc.site_count, ftc.total_filtered_sites), 4) AS pct_of_seo_sites, | |
| ROUND(SAFE_DIVIDE(pc.site_count, otc.total_all_sites), 4) AS pct_of_all_sites | |
| FROM plugin_counts pc | |
| JOIN filtered_total_counts ftc | |
| ON pc.is_root_page = ftc.is_root_page | |
| AND pc.client = ftc.client | |
| JOIN overall_total_counts otc | |
| ON pc.client = otc.client | |
| ORDER BY | |
| pc.is_root_page, | |
| pc.client, | |
| pc.site_count DESC; | |
| # Share of root pages using each SEO technology, per client: | |
| # pct_of_seo_sites = pages with this technology / distinct pages with any SEO technology | |
| # pct_of_all_sites = pages with this technology / all root pages | |
| WITH seo_pages AS ( | |
| # One row per (page, SEO technology) detection on root pages. | |
| SELECT | |
| t.technology AS plugin, | |
| p.is_root_page, | |
| p.client, | |
| p.page | |
| FROM | |
| `httparchive.crawl.pages` AS p, | |
| UNNEST(p.technologies) AS t, | |
| UNNEST(t.categories) AS cat | |
| WHERE | |
| p.date = '2025-07-01' AND | |
| LOWER(cat) = 'seo' AND | |
| p.is_root_page = TRUE | |
| ), | |
| plugin_counts AS ( | |
| SELECT | |
| plugin, | |
| is_root_page, | |
| client, | |
| COUNT(DISTINCT page) AS site_count | |
| FROM | |
| seo_pages | |
| GROUP BY | |
| plugin, | |
| is_root_page, | |
| client | |
| ), | |
| filtered_total_counts AS ( | |
| # Count distinct pages rather than summing per-plugin counts: a page running | |
| # several SEO technologies must contribute only once to the SEO-site total. | |
| SELECT | |
| is_root_page, | |
| client, | |
| COUNT(DISTINCT page) AS total_filtered_sites | |
| FROM | |
| seo_pages | |
| GROUP BY | |
| is_root_page, | |
| client | |
| ), | |
| overall_total_counts AS ( | |
| SELECT | |
| client, | |
| COUNT(DISTINCT page) AS total_all_sites | |
| FROM | |
| `httparchive.crawl.pages` | |
| WHERE | |
| date = '2025-07-01' AND | |
| is_root_page = TRUE | |
| GROUP BY | |
| client | |
| ) | |
| SELECT | |
| pc.plugin, | |
| pc.client, | |
| pc.site_count, | |
| ftc.total_filtered_sites AS total_seo_sites, | |
| otc.total_all_sites, | |
| ROUND(SAFE_DIVIDE(pc.site_count, ftc.total_filtered_sites), 4) AS pct_of_seo_sites, | |
| ROUND(SAFE_DIVIDE(pc.site_count, otc.total_all_sites), 4) AS pct_of_all_sites | |
| FROM | |
| plugin_counts pc | |
| JOIN filtered_total_counts ftc ON pc.is_root_page = ftc.is_root_page AND pc.client = ftc.client | |
| JOIN overall_total_counts otc ON pc.client = otc.client | |
| ORDER BY | |
| pc.is_root_page, | |
| pc.client, | |
| pc.site_count DESC; |
tunetheweb
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Most of the queries look good, but they can be simplified somewhat now that the columns are JSON. Where that is the case, I've rewritten the query and left it as a suggestion.
Merging the 2025 SEO Chapter SQL into the main project
Makes progress on #4086