A Coding Hands-On on FineWeb for Streaming, Filtering, Deduplication, Tokenization, and Large-Scale Web Corpus Analytics

A Coding Hands-On on FineWeb for Streaming, Filtering, Deduplication, Tokenization, and Large-Scale Web Corpus Analytics


# --- Domain statistics, diagnostic plots, and a textual summary of the sample ---
# Relies on names defined earlier in the file: `df` (with the columns "url",
# "token_count", "language_score", "chars_per_token"), `dup_pairs`, `outcomes`,
# plus the `urlparse`, `plt` and `pd` imports.


def _domain_of(url):
    """Return the host of *url* without a leading "www.", or "?" if unusable."""
    if not isinstance(url, str):
        return "?"
    try:
        host = urlparse(url).netloc
    except ValueError:
        # urlparse rejects some malformed netlocs (e.g. unbalanced IPv6 brackets).
        return "?"
    # Strip only a leading "www." so hosts that merely contain it stay intact.
    return host[4:] if host.startswith("www.") else host


df["domain"] = df["url"].apply(_domain_of)
top_domains = df["domain"].value_counts().head(15)
print("\n--- Top 15 domains in sample ---")
print(top_domains)

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Clip the long tail so a handful of huge documents do not flatten the histogram.
axes[0, 0].hist(df["token_count"].clip(upper=4000), bins=50, color="#7b2d26")
axes[0, 0].set_title("Token count per doc (gpt2)")
axes[0, 0].set_xlabel("tokens")
axes[0, 0].set_ylabel("docs")

axes[0, 1].hist(df["language_score"], bins=40, color="#2d5d7b")
axes[0, 1].axvline(0.65, color="pink", ls="--", label="FineWeb cutoff 0.65")
axes[0, 1].set_title("fastText English language score")
axes[0, 1].set_xlabel("score")
axes[0, 1].legend()

axes[1, 0].hist(df["chars_per_token"].clip(upper=8), bins=40, color="#3f7b2d")
axes[1, 0].set_title("Characters per token (compression)")
axes[1, 0].set_xlabel("chars / token")

# Reverse the order so the most frequent domain is drawn at the top of the chart.
top_domains.iloc[::-1].plot(kind="barh", ax=axes[1, 1], color="#7b5d2d")
axes[1, 1].set_title("Top domains")

plt.tight_layout()
plt.show()

print("\n" + "=" * 70)
print("SUMMARY")
print("=" * 70)
print(f"Docs streamed           : {len(df):,}")
print(f"Total gpt2 tokens       : {df['token_count'].sum():,}")
print(f"Median tokens/doc       : {int(df['token_count'].median())}")
print(f"Unique domains          : {df['domain'].nunique():,}")
print(f"Mean language_score     : {df['language_score'].mean():.3f}")
print(f"Near-duplicate pairs    : {len(dup_pairs)}")
# NOTE(review): 'stored' must match the pass label written into `outcomes`
# earlier in the file (not visible here) - confirm it is not meant to be 'kept'.
print(f"Docs flagged by filters : {(pd.Series(outcomes) != 'stored').sum()} / {len(outcomes)}")
print("\nNext steps:")
# Single-quoted literal so the embedded double quotes do not end the string.
print('  • Swap name="sample-10BT" for an actual crawl, e.g. name="CC-MAIN-2024-10"')
print("  • Increase N_DOCS for stronger statistics")
print("  • Use the complete datatrove pipeline to reproduce FineWeb end-to-end")



Source link

Leave a Reply

Your email address will not be published. Required fields are marked *