Spaces:

iBrokeTheCode
/

Multimodal_Product_Classification

Sleeping

App Files Files Community

Multimodal_Product_Classification / tests / test_nlp_models.py

iBrokeTheCode

chore: Add tests cases

43fe501 6 months ago

raw

history blame contribute delete

2.98 kB

	import numpy as np
	import pandas as pd
	import pytest
	from transformers import AutoModel, AutoTokenizer

	from src.nlp_models import HuggingFaceEmbeddings

	# import torch
	# import os

	####################################################################################################
	################################## Test the Text Embeddings Model ##################################
	####################################################################################################


	@pytest.fixture
	def mock_text_data(tmp_path):
	    """
	    Write a small CSV of product descriptions and return its location.

	    The file is called ``test_text_data.csv`` and is created inside the
	    ``tmp_path`` directory supplied by pytest. It holds a single
	    ``description`` column with two rows and is written without an index
	    column. The path is returned as a plain string.
	    """
	    csv_path = tmp_path / "test_text_data.csv"
	    descriptions = ["Product 1 description", "Product 2 description"]
	    pd.DataFrame({"description": descriptions}).to_csv(csv_path, index=False)
	    return str(csv_path)


	@pytest.mark.parametrize(
	    "model_name, expected_hidden_size",
	    [
	        ("sentence-transformers/all-MiniLM-L6-v2", 384),  # MiniLM with 384 hidden units
	        # ('bert-base-uncased', 768), # BERT base with 768 hidden units
	    ],
	)
	def test_huggingface_embeddings_generic(
	    model_name, expected_hidden_size, mock_text_data
	):
	    """
	    Generic test for loading a Hugging Face model and generating a text embedding.

	    This test ensures that:
	    - ``HuggingFaceEmbeddings`` exposes a tokenizer and a model of the same
	      concrete classes that ``AutoTokenizer`` / ``AutoModel`` produce for the
	      given checkpoint.
	    - ``get_embedding`` returns a one-dimensional NumPy array whose length is
	      the expected hidden size.

	    Writing embeddings to a CSV file is NOT exercised here; the mock CSV is
	    only passed to the constructor as its ``path`` argument.

	    Parameters:
	    ----------
	    model_name : str
	        The name of the Hugging Face model to test.
	    expected_hidden_size : int
	        The expected hidden size (dimensionality) of the embeddings generated by the model.
	    mock_text_data : str
	        Path to the mock CSV file containing text descriptions.
	    """
	    # Build the object under test on CPU with the parametrized checkpoint.
	    embedder = HuggingFaceEmbeddings(
	        model_name=model_name, path=mock_text_data, device="cpu"
	    )

	    # Resolve each expected class once and reuse it in the failure message,
	    # so a failing assertion does not load the checkpoint a second time just
	    # to format its text.
	    expected_tokenizer_type = type(AutoTokenizer.from_pretrained(model_name))
	    assert isinstance(embedder.tokenizer, expected_tokenizer_type), (
	        f"Tokenizer should be an instance of {expected_tokenizer_type}"
	    )

	    expected_model_type = type(AutoModel.from_pretrained(model_name))
	    assert isinstance(embedder.model, expected_model_type), (
	        f"Model should be an instance of {expected_model_type}"
	    )

	    # Generate embeddings for a sample text
	    sample_text = "This is a test description."
	    embeddings = embedder.get_embedding(sample_text)

	    # Check that the embeddings are a NumPy array with the expected shape
	    assert isinstance(embeddings, np.ndarray), "Embeddings should be a NumPy array"
	    assert embeddings.shape == (expected_hidden_size,), (
	        f"Embeddings shape should be ({expected_hidden_size},), got {embeddings.shape}"
	    )


	if __name__ == "__main__":
	    # pytest.main() returns the exit code of the test run. Raise it so that
	    # running this file as a script exits non-zero when tests fail, instead
	    # of discarding the result and always exiting with status 0.
	    raise SystemExit(pytest.main())