Polars
Polars
import requests

# Query the datasets server's /parquet endpoint for the dataset's Parquet files.
r = requests.get(
    "https://datasets-server.boincai.com/parquet?dataset=blog_authorship_corpus",
    timeout=30,  # never hang forever on an unresponsive server
)
# Fail loudly on an HTTP error instead of hitting a confusing KeyError below.
r.raise_for_status()
j = r.json()
# Keep only the URLs of the files that belong to the "train" split.
urls = [file_info['url'] for file_info in j['parquet_files'] if file_info['split'] == 'train']
urls
['https://boincai.com/datasets/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/train/0000.parquet',
 'https://boincai.com/datasets/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/train/0001.parquet']

import polars as pl
# Read the train split's 0000.parquet file straight from its URL, then list the
# five horoscope values with the longest average blog text (in characters).
df = (
    pl.read_parquet("https://boincai.com/datasets/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/train/0000.parquet")
    # `groupby` was renamed to `group_by`; the old name no longer exists in Polars 1.x.
    .group_by("horoscope")
    .agg(
        [
            # Rows per group. `pl.len()` replaces the deprecated bare `pl.count()`;
            # the alias keeps the output column named "count".
            pl.len().alias("count"),
            # `str.n_chars` was renamed to `str.len_chars`.
            pl.col("text").str.len_chars().mean().alias("avg_blog_length"),
        ]
    )
    .sort("avg_blog_length", descending=True)
    .limit(5)
)
print(df)
shape: (5, 3)
┌───────────┬───────┬─────────────────┐
│ horoscope ┆ count ┆ avg_blog_length │
│ ---       ┆ ---   ┆ ---             │
│ str       ┆ u32   ┆ f64             │
╞═══════════╪═══════╪═════════════════╡
│ Aquarius  ┆ 34062 ┆ 1129.218836     │
│ Cancer    ┆ 41509 ┆ 1098.366812     │
│ Capricorn ┆ 33961 ┆ 1073.2002       │
│ Libra     ┆ 40302 ┆ 1072.071833     │
│ Leo       ┆ 40587 ┆ 1064.053687     │
└───────────┴───────┴─────────────────┘

Lazy API
Last updated