| | import numpy as np |
| | import pandas as pd |
| | import pytest |
| | from transformers import AutoModel, AutoTokenizer |
| |
|
| | from src.nlp_models import HuggingFaceEmbeddings |
| |
|
| | |
| | |
| |
|
| | |
| | |
| | |
| |
|
| |
|
@pytest.fixture
def mock_text_data(tmp_path):
    """
    Write a small CSV of product descriptions and return its location.

    The file is named ``test_text_data.csv`` and is created inside the
    ``tmp_path`` directory passed in. It has a single ``description`` column
    with two rows and is written without the DataFrame index.

    Returns
    -------
    str
        Path of the CSV file, converted to a plain string.
    """
    csv_path = tmp_path / "test_text_data.csv"
    descriptions = ["Product 1 description", "Product 2 description"]
    pd.DataFrame({"description": descriptions}).to_csv(csv_path, index=False)
    return str(csv_path)
| |
|
| |
|
@pytest.mark.parametrize(
    "model_name, expected_hidden_size",
    [
        ("sentence-transformers/all-MiniLM-L6-v2", 384),
    ],
)
def test_huggingface_embeddings_generic(
    model_name, expected_hidden_size, mock_text_data
):
    """
    Check that HuggingFaceEmbeddings loads a model and embeds a single text.

    This test ensures that:
    - ``model.tokenizer`` has the same type as the tokenizer returned by
      ``AutoTokenizer.from_pretrained(model_name)``.
    - ``model.model`` has the same type as the model returned by
      ``AutoModel.from_pretrained(model_name)``.
    - ``model.get_embedding`` returns a NumPy array of shape
      ``(expected_hidden_size,)`` for one sample sentence.

    Writing embeddings back to a CSV file is not exercised here.

    Parameters:
    ----------
    model_name : str
        The name of the Hugging Face model to test.
    expected_hidden_size : int
        The expected length of the embedding vector produced for one text.
    mock_text_data : str
        Path to the mock CSV file containing text descriptions; it is passed
        to ``HuggingFaceEmbeddings`` as ``path``.
    """
    model = HuggingFaceEmbeddings(
        model_name=model_name, path=mock_text_data, device="cpu"
    )

    # Resolve each reference type once and reuse it in the failure message.
    # Calling from_pretrained again inside the message would repeat an
    # expensive load on failure, and an error raised there would hide the
    # actual assertion failure.
    expected_tokenizer_type = type(AutoTokenizer.from_pretrained(model_name))
    assert isinstance(model.tokenizer, expected_tokenizer_type), (
        f"Tokenizer should be an instance of {expected_tokenizer_type}"
    )

    expected_model_type = type(AutoModel.from_pretrained(model_name))
    assert isinstance(model.model, expected_model_type), (
        f"Model should be an instance of {expected_model_type}"
    )

    # Embed a single sentence and validate the container type and shape.
    sample_text = "This is a test description."
    embeddings = model.get_embedding(sample_text)

    assert isinstance(embeddings, np.ndarray), "Embeddings should be a NumPy array"
    assert embeddings.shape == (expected_hidden_size,), (
        f"Embeddings shape should be ({expected_hidden_size},), got {embeddings.shape}"
    )
| |
|
| |
|
if __name__ == "__main__":
    # pytest.main() returns the run's exit code; raise it as SystemExit so
    # that running this file as a script exits non-zero when tests fail.
    raise SystemExit(pytest.main())
| |
|