"""
Utility script (intended for Amazon SageMaker Studio Lab) that converts the
arXiv metadata JSON-lines snapshot into a partitioned Parquet dataset,
flushing records to disk in fixed-size batches to bound memory use.
"""
| |
|
import json
import os

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm
| |
|
| | |
# Input: JSON-lines arXiv metadata snapshot (one JSON object per line).
json_file_path = "/home/studio-lab-user/arxiv-paper-recommender-system/arxiv-metadata-oai-snapshot.json"
# Output: Parquet dataset directory (write_to_dataset appends one file per flush).
parquet_file_path = "/home/studio-lab-user/arxiv-paper-recommender-system/data/processed/arxiv_papers_raw.parquet.gzip"

# Number of records buffered in memory before each flush to Parquet.
batch_size = 10000

# Ensure the output directory exists before the first write.
parent_dir = os.path.dirname(parquet_file_path)
os.makedirs(parent_dir, exist_ok=True)

with open(json_file_path, "r", encoding="utf-8") as file:
    arxiv_data = []      # buffer of parsed records for the current batch
    processed_count = 0  # total lines parsed so far

    for line in tqdm(file):
        arxiv_data.append(json.loads(line))
        processed_count += 1

        # Flush a full batch to the Parquet dataset and reset the buffer,
        # keeping peak memory bounded by roughly one batch of records.
        if processed_count % batch_size == 0:
            df = pd.DataFrame.from_records(arxiv_data)
            table = pa.Table.from_pandas(df)
            pq.write_to_dataset(table, root_path=parquet_file_path)
            arxiv_data = []

    # Flush the final partial batch (records after the last full batch).
    # BUG FIX: the original passed the output-path string as the table
    # argument to write_to_dataset (and never built a pa.Table from the
    # leftover df), which raises at runtime and drops the tail records.
    if arxiv_data:
        df = pd.DataFrame.from_records(arxiv_data)
        table = pa.Table.from_pandas(df)
        pq.write_to_dataset(table, root_path=parquet_file_path)
| |
|