SpacyModelCreator

Paused

App Files Files Community

SpacyModelCreator / utils /model.py

WebashalarForML

Update utils/model.py

83f32c2 verified over 1 year ago

raw

history blame contribute delete

3.38 kB

	import spacy
	from spacy.training import Example
	from spacy.util import minibatch, compounding
	from pathlib import Path
	from spacy.tokens import DocBin
	import random
	import shutil
	import os

	def load_data_from_spacy_file(file_path):
	    """Read a serialized ``DocBin`` from disk and return its docs as a list.

	    The docs are rebuilt against the vocabulary of a blank English pipeline.

	    Args:
	        file_path: Location of the ``.spacy`` file to read.

	    Returns:
	        A list with every document stored in the file. If anything goes
	        wrong while reading or decoding, the error is printed and an empty
	        list is returned instead of raising.
	    """
	    blank_pipeline = spacy.blank("en")

	    try:
	        stored = DocBin().from_disk(file_path)
	        loaded_docs = [*stored.get_docs(blank_pipeline.vocab)]
	        print(f"Loaded {len(loaded_docs)} documents from {file_path}.")
	    except Exception as err:
	        # Best-effort loader: report the problem and hand back "no data".
	        print(f"Error loading data from .spacy file: {err}")
	        return []
	    else:
	        return loaded_docs

	def _docs_to_examples(nlp, docs):
	    """Turn annotated docs into ``Example`` objects keyed on character-offset entities."""
	    return [
	        Example.from_dict(
	            nlp.make_doc(doc.text),
	            {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]},
	        )
	        for doc in docs
	    ]


	def _save_model(nlp, model_path):
	    """Write ``nlp`` to ``model_path`` by staging it in a sibling ``*_temp`` directory."""
	    # normpath drops a trailing separator so the staging directory is a
	    # sibling of the target, never a child that the rmtree below would delete.
	    # fspath lets callers pass either a str or a pathlib.Path.
	    final_dir = os.path.normpath(os.fspath(model_path))
	    staging_dir = final_dir + "_temp"

	    # Clear leftovers from an interrupted earlier run before writing.
	    if os.path.exists(staging_dir):
	        shutil.rmtree(staging_dir)
	    nlp.to_disk(staging_dir)

	    # Only remove the previous model once the new one is fully on disk.
	    if os.path.exists(final_dir):
	        shutil.rmtree(final_dir)
	    shutil.move(staging_dir, final_dir)


	def train_model(epochs, model_path, data_path="./data/Spacy_data.spacy"):
	    """Train a blank English NER pipeline and save the lowest-loss model.

	    A fresh ``spacy.blank("en")`` pipeline gets an ``ner`` component (with the
	    resume-related labels listed below) and a ``sentencizer``. Training data
	    is loaded with ``load_data_from_spacy_file`` and the pipeline is updated
	    for ``epochs`` passes over shuffled, compounding-size minibatches.

	    Whenever an epoch's NER loss is lower than every previous epoch's, the
	    pipeline is written to ``model_path``. That checkpoint is what remains on
	    disk when training ends; it is no longer overwritten by the last epoch's
	    weights. If no epoch ever produced a checkpoint (for example
	    ``epochs == 0``), the pipeline is saved as it stands so that
	    ``model_path`` always exists after a completed run.

	    Args:
	        epochs: Number of passes over the training data.
	        model_path: Output directory for the model (``str`` or path-like).
	            A sibling directory named ``<model_path>_temp`` is used briefly
	            while saving.
	        data_path: ``.spacy`` file holding the training docs. Defaults to
	            the location the function previously had hard-coded.

	    Returns:
	        A list with one NER loss value per epoch, or ``None`` when no
	        training data could be loaded (nothing is trained or saved then).
	    """
	    nlp = spacy.blank("en")

	    # A blank pipeline has no components, so "ner" is always added here;
	    # binding it unconditionally avoids a possibly-undefined ``ner`` below.
	    ner = nlp.add_pipe("ner")
	    nlp.add_pipe("sentencizer")  # Optional component to split sentences

	    # Entity labels the NER component must be able to predict.
	    labels = (
	        "PERSON", "CONTACT", "EMAIL", "ABOUT", "EXPERIENCE", "YEARS_EXPERIENCE",
	        "UNIVERSITY", "SOFT_SKILL", "INSTITUTE", "LAST_QUALIFICATION_YEAR", "JOB_TITLE",
	        "COMPANY", "COURSE", "DOB", "HOBBIES", "LINK", "SCHOOL", "QUALIFICATION",
	        "LANGUAGE", "LOCATION", "PROJECTS", "SKILL", "CERTIFICATE",
	    )
	    for label in labels:
	        ner.add_label(label)

	    train_data = load_data_from_spacy_file(data_path)
	    if not train_data:
	        print("No training data found. Exiting training.")
	        return

	    # ``initialize`` is the spaCy v3 replacement for the deprecated
	    # ``begin_training`` alias; it returns the optimizer the same way.
	    optimizer = nlp.initialize()
	    epoch_losses = []
	    best_loss = float("inf")
	    checkpoint_saved = False

	    for epoch in range(epochs):
	        losses = {}
	        random.shuffle(train_data)

	        for batch in minibatch(train_data, size=compounding(4.0, 32.0, 1.001)):
	            examples = _docs_to_examples(nlp, batch)
	            if not examples:
	                # Nothing to learn from an empty batch.
	                continue
	            nlp.update(examples, sgd=optimizer, drop=0.35, losses=losses)

	        # If the component reported no loss, treat the epoch as "infinitely
	        # bad" so it can never be chosen as the best checkpoint.
	        current_loss = losses.get("ner", float("inf"))
	        epoch_losses.append(current_loss)
	        print(f"Losses at epoch {epoch + 1}: {losses}")

	        if current_loss < best_loss:
	            best_loss = current_loss
	            _save_model(nlp, model_path)
	            checkpoint_saved = True

	    # Keep the best checkpoint if there is one; otherwise save the current
	    # state so the caller still finds a model at ``model_path``.
	    if not checkpoint_saved:
	        _save_model(nlp, model_path)
	    print(f"Training completed. Final model saved at: {model_path}")

	    return epoch_losses