"""Merge text columns from parquet shards into a single plain-text corpus.

Recursively walks every directory in INPUT_DIRS for ``*.parquet`` files,
pulls the most likely text column out of each, filters out short or
non-alphabetic rows, and appends everything to OUT_FILE separated by
blank lines.
"""

import os
from glob import glob

import pyarrow.parquet as pq
from tqdm import tqdm

# Directories scanned recursively for parquet shards.
INPUT_DIRS = [
    "books",
    "fineweb",
    "wikipedia",
]
OUTPUT_DIR = "merged_text"
OUT_FILE = os.path.join(OUTPUT_DIR, "corpus.txt")

# Column names tried, in order, when looking for the document text.
TEXT_COLUMN_CANDIDATES = ("text", "content", "document", "article", "source")
# A stripped document shorter than this many characters is discarded.
MIN_DOC_CHARS = 50


def extract_text_from_parquet(path):
    """Return the list of text documents stored in one parquet file.

    Tries the well-known text column names first, then falls back to the
    first object-dtype (string-like) column.  Returns ``[]`` when no
    suitable column exists or the file cannot be read — the error is
    printed, not raised, so one corrupt shard does not abort the merge.
    """
    try:
        # Keep the try body minimal: only the read can realistically fail.
        df = pq.read_table(path).to_pandas()
    except Exception as e:  # broad on purpose: best-effort per shard
        print(f"Error reading {path}: {e}")
        return []

    for col in TEXT_COLUMN_CANDIDATES:
        if col in df.columns:
            return df[col].astype(str).tolist()

    # Fallback: take the first string-like column.
    for col in df.columns:
        if df[col].dtype == object:
            return df[col].astype(str).tolist()
    return []


def find_parquet_files(dirs):
    """Return every *.parquet path found recursively under *dirs*."""
    files = []
    for d in dirs:
        files.extend(glob(f"{d}/**/*.parquet", recursive=True))
    return files


def keep_document(doc):
    """True when a stripped document is long enough and contains letters."""
    return len(doc) >= MIN_DOC_CHARS and any(c.isalpha() for c in doc)


def main():
    """Glob the shards, extract and filter their text, write the corpus."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    all_parquet_files = find_parquet_files(INPUT_DIRS)
    print("Total parquet files found:", len(all_parquet_files))

    with open(OUT_FILE, "w", encoding="utf-8") as fout:
        for file in tqdm(all_parquet_files, desc="Extracting text"):
            for t in extract_text_from_parquet(file):
                t = t.strip()
                if keep_document(t):
                    # Blank line between documents so downstream tokenizers
                    # can split on paragraph boundaries.
                    fout.write(t + "\n\n")

    print("DONE! Saved merged corpus →", OUT_FILE)


if __name__ == "__main__":
    main()