| import os |
| import pandas as pd |
| from pathlib import Path |
| from tqdm import tqdm |
|
|
| |
|
|
|
|
def process_csvs(folder_path, new_folder_name, chunk_size=10000):
    """Merge a folder's CSV files, sort them by ``last_edit``, and re-split them.

    Every ``*.csv`` file directly inside ``folder_path`` is read and
    concatenated into a single frame, sorted by its ``last_edit`` column, and
    written back out in consecutive slices of at most ``chunk_size`` rows.
    Each slice is saved in ``new_folder_name`` as
    ``BitcoinForum_<board>_<start>_to_<end>.csv``, where ``<board>`` is the
    last component of ``folder_path`` and ``<start>``/``<end>`` are the
    ``last_edit`` values of the slice's first and last rows formatted as
    ``ddmmyy``.

    Args:
        folder_path: Directory containing the CSV files to merge.
        new_folder_name: Output directory; created (with parents) if missing.
        chunk_size: Maximum number of rows per output file. Defaults to 10000.

    Returns:
        None. The results are the files written to ``new_folder_name``.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
        KeyError: If the combined data has no ``last_edit`` column.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")

    # normpath strips a trailing separator, which would otherwise make
    # basename() return an empty board name for inputs such as "boards/x/".
    board = os.path.basename(os.path.normpath(folder_path))

    sorted_folder = Path(new_folder_name)
    sorted_folder.mkdir(parents=True, exist_ok=True)

    # os.listdir() order is arbitrary; sorting the names keeps the read order
    # (and therefore the order of rows with equal last_edit) reproducible.
    csv_paths = [
        os.path.join(folder_path, name)
        for name in sorted(os.listdir(folder_path))
        if name.endswith(".csv")
    ]
    if not csv_paths:
        # pd.concat() raises ValueError on an empty list; a folder without
        # CSV files simply has nothing to process.
        return

    combined_df = pd.concat(
        [pd.read_csv(path) for path in csv_paths], ignore_index=True
    )

    # NOTE(review): the sort is on the raw column values as read from the CSV
    # files; this is chronological only if those values order chronologically
    # (e.g. ISO-formatted timestamps) -- confirm against the data.
    # mergesort is stable, so rows with equal last_edit keep their read order.
    combined_df = combined_df.sort_values(by="last_edit", kind="mergesort")

    # Dates have day granularity, so two slices can cover the same date range.
    # Track the names written by this call so a later slice never overwrites
    # an earlier one.
    used_names = set()

    # Iterating a range (which has a length) lets tqdm show a real total.
    for start in tqdm(range(0, len(combined_df), chunk_size)):
        chunk = combined_df.iloc[start : start + chunk_size]
        start_date = pd.to_datetime(chunk["last_edit"].iloc[0]).strftime("%d%m%y")
        end_date = pd.to_datetime(chunk["last_edit"].iloc[-1]).strftime("%d%m%y")

        stem = f"BitcoinForum_{board}_{start_date}_to_{end_date}"
        filename = f"{stem}.csv"
        part = 2
        while filename in used_names:
            # The first slice for a date range keeps the plain name; later
            # ones get a numbered suffix.
            filename = f"{stem}_part{part}.csv"
            part += 1
        used_names.add(filename)

        chunk.to_csv(sorted_folder / filename, index=False)
|
|
|
|
# Top-level data directories (relative to the working directory) whose
# immediate subfolders are each passed to process_csvs.
folder_paths = [
    "./raw-data",
    "./preprocessed-data",
]

# Output for every subfolder of a given top-level directory goes into one
# shared directory named "sorted-<top-level directory name>".
for folder_path in folder_paths:
    folder_name = os.path.basename(folder_path)
    new_folder_name = "sorted-" + folder_name
    for folder in tqdm(os.listdir(folder_path)):
        board_dir = os.path.join(folder_path, folder)
        if not os.path.isdir(board_dir):
            # Plain files sitting next to the subfolders are ignored.
            continue
        process_csvs(board_dir, new_folder_name)
|
|