| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| |
|
| | import re |
| | from gensim.models import Word2Vec |
| |
|
| |
|
| | |
def my_simple_tokenizer(text):
    """Lowercase *text* and split it into word and punctuation tokens.

    Punctuation runs (``. , ; : ¡ ! ¿ ?``) always become their own tokens,
    including sentence-final punctuation not followed by whitespace — the
    previous whitespace-split approach left a trailing ``"word."`` fused.

    Parameters
    ----------
    text : str
        Raw input text (e.g. a Yucatec Maya sentence).

    Returns
    -------
    list[str]
        Lowercased tokens; empty list for empty/whitespace-only input.
    """
    # Match either a run of punctuation, or a run of anything that is
    # neither whitespace nor punctuation (i.e. a word).
    return [tok.lower() for tok in re.findall(r"[.,;:¡!¿?]+|[^\s.,;:¡!¿?]+", text)]
| |
|
| |
|
| | |
| | |
# Load the pretrained Maya word-embedding model ("maya2vec") from disk.
# The filename encodes training hyperparameters (vector size 512, etc.).
maya2vec_path = "./model_512_60_5_-0.25_0.7308_3.35E-05"
model = Word2Vec.load(maya2vec_path)
print("Model loaded successfully.", type(model))
| |
|
| |
|
| | |
| | |
# Encode a single word into its embedding vector, EAFP-style:
# KeyedVectors.__getitem__ raises KeyError for out-of-vocabulary words.
word = "meyaj"
try:
    vector = model.wv[word]
except KeyError:
    print(f"The word '{word}' is out-of-vocabulary (OOV).")
else:
    print(f"Semantic encoded word '{word}' in", type(vector), vector.shape)
| |
|
| |
|
| | |
| | |
# Encode a whole sentence as the mean of its token embeddings.
text = "Bix a bel Táan in bin ich kool Tene' ooxolen"
tokens = my_simple_tokenizer(text)
try:
    # Only the gensim call can raise here; keep the try body minimal.
    vector = model.wv.get_mean_vector(tokens)
except KeyError:
    print("Some words in the input text are OOV, affecting the embedding computation.")
else:
    print("Semantic encoded text in", type(vector), vector.shape)
| |
|
| |
|
| | |
| | |
# Cosine similarity between two in-vocabulary words ("dog" vs. "domestic animal").
word1, word2 = "peek'", "waalak'"
both_in_vocab = word1 in model.wv and word2 in model.wv
if not both_in_vocab:
    print(f"One or both words ('{word1}', '{word2}') are OOV.")
else:
    similarity = model.wv.similarity(word1, word2)
    print(f"Similarity between '{word1}' and '{word2}': {similarity:.4f}")
| |
|
| |
|
| | |
# Probe a word expected to be absent from the Maya vocabulary:
# attempt the lookup and report OOV on KeyError.
unknown_word = "furnance"
try:
    vector = model.wv[unknown_word]
except KeyError:
    print(f"The word '{unknown_word}' is OOV.")
| |
|
| |
|