import argparse
import json


def _parse_jsonl(lines):
    """Yield one decoded JSON value for every non-blank line in *lines*.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of a JSONL file) are skipped, because ``json.loads`` raises
    ``json.JSONDecodeError`` on them.
    """
    for line in lines:
        if line.strip():
            yield json.loads(line)


def _load_mapping(mapping_paths):
    """Build a ``question -> {"context", "answers"}`` lookup table.

    Args:
        mapping_paths: Iterable of paths to UTF-8 JSONL mapping files.

    Returns:
        A dict keyed by each record's ``"question"`` value. Records without a
        ``"question"`` key (or with a null one) are ignored. When the same
        question appears more than once, the last record read wins.
    """
    mapping = {}
    for path in mapping_paths:
        with open(path, 'r', encoding='utf-8') as f_map:
            for obj in _parse_jsonl(f_map):
                question = obj.get("question")
                if question is None:
                    continue

                # Accept either the plural "answers" key or the singular
                # "answer" key, and always store the result as a list.
                raw_ans = obj.get("answers", obj.get("answer", []))
                answers = raw_ans if isinstance(raw_ans, list) else [raw_ans]

                mapping[question] = {
                    "context": obj.get("context", ""),
                    "answers": answers,
                }
    return mapping


def main(argv=None):
    """Merge context/answers from mapping JSONL files into an original JSONL file.

    Every record of the original file whose ``"input"`` value matches a
    ``"question"`` found in the mapping files gets its ``"context"`` and
    ``"answers"`` fields set from that mapping entry. All other records are
    written through unchanged. Blank lines in any input file are skipped.

    Args:
        argv: Optional list of command-line arguments
            (``orig_path out_path mapping_path [mapping_path ...]``).
            When ``None`` (the default), ``sys.argv[1:]`` is parsed, which is
            the behaviour of the script entry point.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If one of the files cannot be opened.
    """
    parser = argparse.ArgumentParser(description='Convert and merge JSONL files with question-answer mappings')
    parser.add_argument('orig_path', help='Path to the original JSONL file')
    parser.add_argument('out_path', help='Path to the output JSONL file')
    parser.add_argument('mapping_paths', nargs='+', help='Path(s) to mapping JSONL file(s)')
    args = parser.parse_args(argv)

    out_path = args.out_path
    mapping = _load_mapping(args.mapping_paths)

    # The input file is opened first so that a missing input never truncates
    # or creates the output file.
    with open(args.orig_path, 'r', encoding='utf-8') as f_in, \
            open(out_path, 'w', encoding='utf-8') as f_out:
        for item in _parse_jsonl(f_in):
            inp = item.get("input")
            if inp in mapping:
                item["context"] = mapping[inp]["context"]
                item["answers"] = mapping[inp]["answers"]
            f_out.write(json.dumps(item, ensure_ascii=False) + "\n")

    print(f"Merge completed, output file: {out_path}")


# Run the merge only when this file is executed as a script, so importing the
# module has no side effects.
if __name__ == "__main__":
    main()