# BERT extractive question answering over text extracted from a local PDF.
"""Ask a question about a PDF resume using a SQuAD-fine-tuned BERT model."""

import sys

import PyPDF2
import torch
from transformers import BertForQuestionAnswering, BertTokenizer

# Load the model & tokenizer from a local checkpoint directory.
model_loc = "D://code/bert_easy/bert-large-uncased-whole-word-masking-finetuned-squad"
model = BertForQuestionAnswering.from_pretrained(model_loc)
tokenizer = BertTokenizer.from_pretrained(model_loc)

# Extract text from the PDF, page by page.
pdf_path = "C://Users/HP/Downloads/Resumes/Akshat_Thakkar_2022BTCS008.pdf"
text = ""
try:
    pdf_reader = PyPDF2.PdfReader(pdf_path)
    for page in pdf_reader.pages:
        page_text = page.extract_text()
        if page_text:  # extract_text() may return None/"" for image-only pages
            text += page_text + "\n"
except Exception as e:
    print(f"Error reading PDF: {e}")
    text = ""

# Abort if nothing usable was extracted.
if not text.strip():
    print("No valid text extracted from PDF.")
    sys.exit(1)

# Define question.
question = "What is my name?"

# Tokenize question + context together. The tokenizer handles truncation
# itself; "only_second" truncates only the context, never the question.
# (The original pre-truncated to tokens[:1512] while claiming 512 — both
# wrong and redundant given max_length below.)
inputs = tokenizer(
    question,
    text,
    return_tensors="pt",
    truncation="only_second",
    max_length=512,
)

# Get model predictions (inference only; no gradients needed).
with torch.no_grad():
    outputs = model(**inputs)

# Pick the most likely answer span [start, end).
answer_start = torch.argmax(outputs.start_logits)
answer_end = torch.argmax(outputs.end_logits) + 1

if answer_end <= answer_start:
    # Degenerate span: the model found no plausible answer in the context.
    answer = ""
else:
    answer = tokenizer.decode(
        inputs["input_ids"][0][answer_start:answer_end],
        skip_special_tokens=True,
    )

print("Answer:", answer)