import argparse
import json
import os

from tqdm import tqdm
|
|
def process_line(line: str, old_text: str, new_text: str) -> str:
    """Replace ``old_text`` with ``new_text`` in every string value of a JSON document.

    The document is parsed, walked recursively through dicts and lists, and
    re-serialized. Only string *values* are rewritten; dictionary keys,
    numbers, booleans and ``null`` are left untouched.

    Args:
        line: One JSON document as text (typically a single JSONL record).
        old_text: Substring to search for. An empty string is treated as
            "nothing to replace" and leaves every value unchanged.
        new_text: Replacement substring.

    Returns:
        The re-serialized JSON text, with non-ASCII characters written
        verbatim (``ensure_ascii=False``) rather than as ``\\uXXXX`` escapes.

    Raises:
        json.JSONDecodeError: If ``line`` is not valid JSON.
    """
    data = json.loads(line)

    def replace_text(obj):
        # Containers are rebuilt so the parsed input is never mutated in place.
        if isinstance(obj, dict):
            return {k: replace_text(v) for k, v in obj.items()}
        if isinstance(obj, list):
            return [replace_text(item) for item in obj]
        # str.replace("", x) would insert x between every character, so an
        # empty search string must be a no-op rather than corrupt the value.
        if isinstance(obj, str) and old_text:
            return obj.replace(old_text, new_text)
        return obj

    return json.dumps(replace_text(data), ensure_ascii=False)
|
|
def main(input_file, output_file, old_text, new_text):
    """Rewrite a JSONL file, replacing ``old_text`` with ``new_text`` in each record.

    Each non-blank line of ``input_file`` is passed to ``process_line`` and
    the result is written to ``output_file`` followed by a newline. Both
    files are handled as UTF-8. Blank or whitespace-only lines are skipped
    and do not appear in the output.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the file to write; it is created or overwritten.
        old_text: Substring to search for.
        new_text: Replacement substring.

    Raises:
        ValueError: If ``output_file`` refers to the same file as
            ``input_file``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Opening the output in 'w' mode truncates it immediately, so writing onto
    # the input would wipe the data before a single line has been read.
    if os.path.exists(output_file) and os.path.samefile(input_file, output_file):
        raise ValueError("output_file must be different from input_file")

    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        # First pass only counts lines so the progress bar has a total;
        # then rewind for the real processing pass.
        total_lines = sum(1 for _ in infile)
        infile.seek(0)

        for line in tqdm(infile, total=total_lines, desc="Processing"):
            record = line.strip()
            # A blank line (e.g. a trailing newline at end of file) is not a
            # JSON document; skip it instead of failing in json.loads.
            if not record:
                continue
            outfile.write(process_line(record, old_text, new_text) + '\n')
|
|
if __name__ == "__main__":
    # Command-line entry point: two positional paths plus optional
    # search/replacement strings, forwarded unchanged to main().
    cli = argparse.ArgumentParser(description="Replace text in a JSONL file.")
    argument_specs = (
        ("input_file", {"help": "Input JSONL file to process"}),
        ("output_file", {"help": "Output file for processed JSONL"}),
        ("--old_text", {"default": "尖米", "help": "Text to be replaced"}),
        ("--new_text", {"default": "FAYO", "help": "Text to replace with"}),
    )
    for flag, settings in argument_specs:
        cli.add_argument(flag, **settings)
    parsed = cli.parse_args()

    main(
        parsed.input_file,
        parsed.output_file,
        parsed.old_text,
        parsed.new_text,
    )