#standardSQL
-- For every .ipynb file in the GitHub public dataset, pick one representative
-- (repo_name, path) per content id, then attach the file contents.
-- ANY_VALUE collapses the many repos/paths that share the same blob id.
SELECT
  notebooks.id AS id,
  contents.size,
  contents.content,
  contents.binary,
  contents.copies,
  notebooks.sample_repo_name,
  notebooks.sample_path
FROM (
  SELECT
    id,
    ANY_VALUE(repo_name) AS sample_repo_name,
    ANY_VALUE(path) AS sample_path
  FROM `bigquery-public-data.github_repos.files`
  WHERE ENDS_WITH(path, '.ipynb')
  GROUP BY id
) AS notebooks
JOIN `bigquery-public-data.github_repos.contents` AS contents
  ON notebooks.id = contents.id
#standardSQL
-- Top 50 Python packages imported across extracted notebook contents.
-- Notebook JSON stores each source line as a quoted string; SPLIT on the
-- raw delimiter '\n \"' recovers individual code lines.
SELECT
  -- Strip a trailing '\n",' (4 characters of JSON residue) when present.
  -- NOTE: SUBSTR in BigQuery is 1-based; the original used start position 0,
  -- which only works because BigQuery clamps 0 to 1.
  CASE
    WHEN package LIKE '%\\n",' THEN SUBSTR(package, 1, LENGTH(package) - 4)
    ELSE package
  END AS package,
  n
FROM (
  SELECT
    -- Second whitespace-delimited token: the module name in both
    -- `import X` and `from X import ...` statements.
    REGEXP_EXTRACT(line, r'(?:\S+\s+)(\S+)') AS package,
    COUNT(*) AS n
  FROM (
    SELECT line
    FROM (
      SELECT SPLIT(content, '\n \"') AS lines
      FROM `fh-bigquery.github_extracts.contents_ipynb`
    ), UNNEST(lines) AS line
    WHERE STARTS_WITH(line, 'import ') OR STARTS_WITH(line, 'from ')
  )
  GROUP BY package
)
ORDER BY n DESC
LIMIT 50;
Graduate student in statistics at Duke University. Former dev.to employee. I like to blog about data science on my Medium publication, perplex.city, and on dev.to
Thanks, Felipe. Still honing my SQL skills so all ears for better solutions. I've got some more BigQuery posts in the pipeline both here on dev.to and on Medium
For further actions, you may consider blocking this person and/or reporting abuse
We're a place where coders share, stay up-to-date and grow their careers.
Thanks Walker!
A better query to extract all the ipynb files (I left the results at bigquery.cloud.google.com/table/fh...
And more #standardSQL for the imports query:
Thanks, Felipe. Still honing my SQL skills so all ears for better solutions. I've got some more BigQuery posts in the pipeline both here on dev.to and on Medium