API-Journaling / utils /text_utils.py
nabilaalt's picture
initial
780a315
import re, string, json, requests
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences
def get_slang_word_list(url):
response = requests.get(url)
lines = response.text.strip().split('\n')
slang_dict = {}
for line in lines:
if '=' in line:
key, val = line.split('=', 1)
slang_dict[key.strip()] = val.strip()
return slang_dict
def get_stopwords(urls):
all_stopwords = set()
for url in urls:
response = requests.get(url)
words = response.text.strip().split('\n')
if words and 'stopword' in words[0].lower():
words.pop(0)
all_stopwords.update(w.strip().lower() for w in words)
return all_stopwords
def build_slang_dictionary():
slang_url = 'https://raw.githubusercontent.com/King-srt/Indonesia-Slang-Dictionary/refs/heads/main/dictionary_indonesia.txt'
slangwords = get_slang_word_list(slang_url)
with open('utils/custom_slang.json', 'r', encoding='utf-8') as f:
custom_slang = json.load(f)
slangwords.update(custom_slang)
return slangwords
def build_stopwords():
urls = [
'https://raw.githubusercontent.com/yasirutomo/python-sentianalysis-id/master/data/feature_list/stopwordsID.txt',
'https://raw.githubusercontent.com/Braincore-id/IndoTWEEST/main/stopwords_twitter.csv'
]
stop_words = get_stopwords(urls)
with open('utils/add_stopwords.json', 'r', encoding='utf-8') as f:
add_stopwords = json.load(f)
stop_words.update(add_stopwords)
return stop_words
def processing_text_id(text, slangwords, stop_words):
if pd.isnull(text):
return []
text = text.lower()
text = re.sub(r'@[A-Za-z0-9_]+', '', text)
text = re.sub(r'#[A-Za-z0-9_]+', '', text)
text = re.sub(r'RT[\s]+', '', text)
text = re.sub(r'http\S+', '', text)
text = re.sub(r'\d+', '', text)
text = text.translate(str.maketrans('', '', string.punctuation))
text = text.strip()
tokens = word_tokenize(text)
tokens = [slangwords.get(token, token) for token in tokens]
tokens = [token for token in tokens if token not in stop_words and token.strip()]
return tokens
def predict_sentiment_per_sentence(text, model, tokenizer, label_encoder, max_len, slangwords, stop_words):
sentences = sent_tokenize(text)
print(sentences)
results = []
for sentence in sentences:
cleaned_tokens = processing_text_id(sentence, slangwords, stop_words)
processed_text = ' '.join(cleaned_tokens)
if not processed_text.strip():
results.append("unknown")
continue
print(sentence)
sequence = tokenizer.texts_to_sequences([processed_text])
padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post')
prediction = model.predict(padded_sequence, verbose=0)
predicted_class_index = np.argmax(prediction, axis=1)[0]
predicted_label = label_encoder.inverse_transform([predicted_class_index])[0]
results.append(predicted_label)
return results