Spaces:
Running
Running
| """ | |
| ========================================================= | |
| Polyline & Vector Utility Functions | |
| ========================================================= | |
| This file collects shared utility functions in one place, in accordance with the DRY principle: | |
| - Handling polylines (multi-dimensional paths) used in courses/resources. | |
| - Computing geometric and vector operations (centroids, distances, cosine similarity). | |
| - Finding nearest resources for a learner based on polyline similarity. | |
| - Converting NumPy arrays to standard Python lists for JSON serialization. | |
| (more to be added with time...) | |
| ========================================================= | |
| """ | |
| import numpy as np | |
| import math | |
| import heapq | |
| import re | |
| from bs4 import BeautifulSoup | |
| import nltk | |
| from nltk.stem import WordNetLemmatizer, PorterStemmer | |
| from nltk.corpus import stopwords | |
# Make sure the NLTK corpora used by this module are available locally;
# any corpus that cannot be found is downloaded on import.
for _lookup_path, _package in (
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
):
    try:
        nltk.data.find(_lookup_path)
    except LookupError:
        nltk.download(_package)
# Do not leave the loop variables behind in the module namespace.
del _lookup_path, _package
| # =========================== | |
| # utils_preprocess_text | |
| # =========================== | |
def utils_preprocess_text(text: str, flg_stemm: bool = False, flg_lemm: bool = True, lst_stopwords: list = None) -> str:
    """
    Preprocess text by removing HTML tags, punctuation, numbers, isolated
    single letters and stopwords, then applying stemming and/or lemmatization.

    Parameters:
        text (str): The text to preprocess.
        flg_stemm (bool): Flag to apply Porter stemming. Default is False.
        flg_lemm (bool): Flag to apply WordNet lemmatization. Default is True.
        lst_stopwords (list): Stopwords to remove. Matching is case-sensitive
            because the text is not lower-cased. Default is None (no removal).

    Returns:
        str: The preprocessed text, tokens joined by single spaces. An empty
        string is returned for empty/falsy input.
    """
    if not text:
        return ""

    # Remove HTML markup, keeping only the visible text.
    text = BeautifulSoup(text, 'lxml').get_text()

    # Replace everything that is not an ASCII letter (punctuation, digits, ...)
    # with a space.
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    # Single character removal. Lookarounds are used so the surrounding
    # whitespace is not consumed by the match: with a consuming pattern,
    # adjacent single letters ("U S A") share a separator and every other
    # one would survive. A single letter at the very start or end of the
    # text (no whitespace on one side) is still kept.
    text = re.sub(r'(?<=\s)[a-zA-Z](?=\s)', ' ', text)

    # Tokenize; str.split() already collapses runs of whitespace.
    tokens = text.split()

    # Remove stopwords (set lookup instead of scanning the list per token).
    if lst_stopwords is not None:
        stopword_set = set(lst_stopwords)
        tokens = [word for word in tokens if word not in stopword_set]

    # Apply stemming
    if flg_stemm:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]

    # Apply lemmatization
    if flg_lemm:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return " ".join(tokens)
| # =========================== | |
| # convert_to_lists | |
| # =========================== | |
def convert_to_lists(data):
    """
    Recursively convert NumPy arrays and NumPy scalars to standard Python
    objects so the result can be JSON-serialized.

    Parameters:
        data (np.ndarray | np.generic | list | tuple | dict | other):
            Input data structure.

    Returns:
        list | tuple | dict | original type:
            - np.ndarray -> (nested) list
            - NumPy scalar (e.g. np.int64, np.float32) -> Python scalar
            - list / tuple / dict -> same container kind with converted items
              (dict keys are left untouched)
            - anything else -> returned unchanged
    """
    if isinstance(data, np.ndarray):
        converted = data.tolist()
        # tolist() leaves the elements of object-dtype arrays as they are,
        # so nested arrays / NumPy scalars inside need another pass.
        if data.dtype == object:
            return convert_to_lists(converted)
        return converted
    if isinstance(data, np.generic):
        # NumPy scalars such as np.int64 are not JSON serializable.
        return data.item()
    if isinstance(data, list):
        return [convert_to_lists(item) for item in data]
    if isinstance(data, tuple):
        return tuple(convert_to_lists(item) for item in data)
    if isinstance(data, dict):
        return {key: convert_to_lists(value) for key, value in data.items()}
    return data
| # =========================== | |
| # get_lowline_of_polylines | |
| # =========================== | |
def get_lowline_of_polylines(polylines):
    """
    Compute the per-dimension minimum over a collection of polylines.

    Parameters:
        polylines (list of lists): Each inner list is a polyline vector. The
            number of dimensions is taken from the first polyline.

    Returns:
        list: Smallest value found in each dimension (the "lowline"). A
        12-dimension zero vector is returned when no polylines are given.
    """
    if not polylines:
        # Default 12-dimension zero vector
        return [0] * 12

    dimension_count = len(polylines[0])
    return [
        min([row[dim] for row in polylines])
        for dim in range(dimension_count)
    ]
| # =========================== | |
| # get_highline_of_polylines | |
| # =========================== | |
def get_highline_of_polylines(polylines):
    """
    Get the maximum value along each dimension across all polylines.

    Parameters:
        polylines (list of lists): Each inner list is a polyline vector. The
            number of dimensions is taken from the first polyline.

    Returns:
        list: Maximum values for each dimension (highline). A 12-dimension
        zero vector is returned when no polylines are given, mirroring
        get_lowline_of_polylines instead of raising IndexError.
    """
    # len() rather than truthiness so that NumPy array input keeps working.
    if len(polylines) == 0:
        return [0] * 12  # Default 12-dimension zero vector

    dimension_count = len(polylines[0])
    return [
        max(row[dim] for row in polylines)
        for dim in range(dimension_count)
    ]
| # =========================== | |
| # get_cos_sim | |
| # =========================== | |
def get_cos_sim(a: np.ndarray, b: np.ndarray) -> float:
    """
    Calculate the cosine similarity between two vectors.

    Cosine similarity measures how similar two vectors are in direction
    regardless of their magnitude.

    Parameters:
        a (np.ndarray): First vector.
        b (np.ndarray): Second vector.

    Returns:
        float: Cosine similarity (1 = identical direction, -1 = opposite).
        If either vector has zero length the direction is undefined and 0.0
        is returned instead of NaN.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Avoid a division by zero (NaN + RuntimeWarning) for zero vectors.
        return 0.0
    return np.dot(a, b) / denominator
| # =========================== | |
| # calculate_centroid | |
| # =========================== | |
def calculate_centroid(polylines):
    """
    Compute the mean point (centroid) of a collection of polylines.

    Parameters:
        polylines (list of lists): List of N-dimension polyline vectors.

    Returns:
        list: Mean value along each dimension.
    """
    # Stack the polylines as rows and average down the rows (axis 0).
    return np.asarray(polylines).mean(axis=0).tolist()
| # =========================== | |
| # two_polyline_distance | |
| # =========================== | |
def two_polyline_distance(point1, point2):
    """
    Euclidean distance between two polylines, each treated as a single point
    in N-dimensional space.

    Parameters:
        point1 (list | np.ndarray): First polyline coordinates.
        point2 (list | np.ndarray): Second polyline coordinates.

    Returns:
        float: Euclidean distance between the two polylines.

    Raises:
        ValueError: If the two polylines do not have the same length.
    """
    if len(point1) != len(point2):
        raise ValueError("Points must have the same dimensions")

    # Accumulate the squared per-dimension differences, then take the root.
    squared_total = 0
    for coord_a, coord_b in zip(point1, point2):
        squared_total = squared_total + (coord_b - coord_a) ** 2
    return math.sqrt(squared_total)
| # =========================== | |
| # nearest_seven | |
| # =========================== | |
def nearest_seven(learner_polyline, resources_id_polylines, k=7):
    """
    Find the nearest resources to the learner based on polyline distance.

    Parameters:
        learner_polyline (list): Learner's current polyline coordinates.
        resources_id_polylines (list of tuples): Each tuple is
            (resource_id, polyline).
        k (int): How many resources to return. Default is 7.

    Returns:
        list: IDs of the (at most) k nearest resources, ordered from nearest
        to farthest. Fewer IDs are returned when fewer resources are given.

    Raises:
        ValueError: Propagated from two_polyline_distance when a resource
            polyline has a different number of dimensions than the learner's.
    """
    # Pair every resource id with its distance to the learner.
    scored = (
        (two_polyline_distance(learner_polyline, entry[1]), entry[0])
        for entry in resources_id_polylines
    )
    # Select on the distance only, so resource ids are never compared with
    # each other (ids of mixed or non-orderable types would raise TypeError
    # on a distance tie otherwise).
    nearest = heapq.nsmallest(k, scored, key=lambda pair: pair[0])
    return [resource_id for _, resource_id in nearest]
| # =========================== | |
| # calculate_distance | |
| # =========================== | |
def calculate_distance(pos1, pos2):
    """
    Euclidean distance between two points in the plane.

    Only the first two coordinates of each point are used.

    Parameters:
        pos1 (list | tuple): [x, y] of first point.
        pos2 (list | tuple): [x, y] of second point.

    Returns:
        float: Euclidean distance between pos1 and pos2.
    """
    delta_x = pos1[0] - pos2[0]
    delta_y = pos1[1] - pos2[1]
    return np.sqrt(delta_x ** 2 + delta_y ** 2)
| # =========================== | |
| # is_valid_id | |
| # =========================== | |
def is_valid_id(id):
    """
    Check if a given ID is valid (convertible to integer).

    Parameters:
        id (any): ID to validate.

    Returns:
        bool: True if int(id) succeeds, False otherwise.
    """
    try:
        int(id)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: bad literal or
        # NaN; OverflowError: infinity. Anything else (KeyboardInterrupt,
        # SystemExit, ...) is deliberately not swallowed.
        return False
    return True