| |
|
|
| import re |
|
|
def merge_documents(main_dict, additional_json, limit=1000):
    """
    Adds a subset of documents from an additional JSON file to the main dictionary.

    Documents are visited in order. A document whose ``wikipedia_id`` is already
    a key of ``main_dict`` is skipped and does not count towards ``limit``;
    existing entries are never overwritten. Iteration stops as soon as
    ``limit`` new documents have been added. A one-line summary with the
    number of added documents is printed before returning.

    Args:
        main_dict (dict): The main dictionary where processed documents are stored.
            It is modified in place.
        additional_json (list): The additional JSON data containing documents.
            Each document is read through ``.get("wikipedia_id")`` and
            ``.get("text")``; ``text`` may be a list of strings, a single
            string, or missing/``None``.
        limit (int): The maximum number of documents to add to the main dictionary.

    Returns:
        dict: The updated main dictionary with additional documents added
        (the same object as ``main_dict``).
    """
    count = 0

    for doc in additional_json:
        if count >= limit:
            break

        # NOTE(review): a document without "wikipedia_id" is stored under the
        # key None (at most once, because later ones hit the membership test).
        wikipedia_id = doc.get("wikipedia_id")
        if wikipedia_id in main_dict:
            continue

        # A default passed to .get() only covers a *missing* key; an explicit
        # null would reach str.join and raise TypeError, so normalise it here.
        text = doc.get("text") or []
        # Joining a bare string would interleave spaces between its characters.
        joined_text = text if isinstance(text, str) else " ".join(text)

        main_dict[wikipedia_id] = sanitize_text(joined_text)
        count += 1

    print(f"{count} documents added to the main dictionary.")
    return main_dict
|
|
def sanitize_text(text):
    """
    Strip everything except ASCII letters, digits and whitespace from a string,
    then collapse whitespace runs to single spaces and trim both ends.

    Args:
        text (str): Text to sanitize. Values that are not ``str`` are returned
            unchanged.

    Returns:
        str: Sanitized text.
    """
    # Guard clause: non-string input passes straight through untouched.
    if not isinstance(text, str):
        return text

    only_alnum_and_space = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return re.sub(r'\s+', ' ', only_alnum_and_space).strip()
|
|
|
|
def process_json_data(json_data):
    """
    Builds a dictionary of sanitized document texts keyed by Wikipedia id.

    Every document's ``text`` is joined with single spaces, passed through
    ``sanitize_text`` and stored under its ``wikipedia_id``. When several
    documents share an id, the last one wins.

    Args:
        json_data (list): Documents to process. Each document is read through
            ``.get("wikipedia_id")`` and ``.get("text")``; ``text`` may be a
            list of strings, a single string, or missing/``None``.

    Returns:
        dict: A dictionary with wikipedia_id as the key and the sanitized
        text as the value.
    """
    result_dict = {}

    for doc in json_data:
        # NOTE(review): a document without "wikipedia_id" is stored under None.
        wikipedia_id = doc.get("wikipedia_id")

        # A default passed to .get() only covers a *missing* key; an explicit
        # null would reach str.join and raise TypeError, so normalise it here.
        text = doc.get("text") or []
        # Joining a bare string would interleave spaces between its characters.
        joined_text = text if isinstance(text, str) else " ".join(text)

        result_dict[wikipedia_id] = sanitize_text(joined_text)

    return result_dict
|
|
def process_queries(json_data):
    """
    Maps each query id to its query text.

    Args:
        json_data (dict): The input JSON data, keyed by query id. Each value
            is read through ``.get("input", "")``.

    Returns:
        dict: A dictionary with query_id as the key and query text as the
        value; entries without an "input" field map to an empty string.
    """
    return {
        qid: info.get("input", "")
        for qid, info in json_data.items()
    }
|
|
| |
| |
| |
|
|
|
|