import re, string, json, requests
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences

# sent_tokenize/word_tokenize need the NLTK Punkt models; fetch them once.
# ('punkt_tab' is required by newer NLTK releases, 'punkt' by older ones.)
for pkg in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{pkg}')
    except LookupError:
        nltk.download(pkg, quiet=True)
def get_slang_word_list(url):
    """Fetch a slang dictionary where each line is a 'slang=standard' pair."""
    response = requests.get(url)
    lines = response.text.strip().split('\n')
    slang_dict = {}
    for line in lines:
        if '=' in line:
            key, val = line.split('=', 1)  # split on the first '=' only
            slang_dict[key.strip()] = val.strip()
    return slang_dict
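# Assumed input format for get_slang_word_list (entries illustrative, not
# taken from the actual dictionary file): one 'slang=standard' pair per line,
#   gk=tidak
#   yg=yang
#   bgt=banget
# so get_slang_word_list(url) yields {'gk': 'tidak', 'yg': 'yang', 'bgt': 'banget'}.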
def get_stopwords(urls):
    """Fetch stopword lists from several URLs and merge them, lowercased."""
    all_stopwords = set()
    for url in urls:
        response = requests.get(url)
        words = response.text.strip().split('\n')
        # Skip a CSV-style header row (e.g. 'stopword') if one is present.
        if words and 'stopword' in words[0].lower():
            words.pop(0)
        all_stopwords.update(w.strip().lower() for w in words)
    return all_stopwords
def build_slang_dictionary():
    """Combine the remote slang dictionary with local custom entries."""
    slang_url = 'https://raw.githubusercontent.com/King-srt/Indonesia-Slang-Dictionary/refs/heads/main/dictionary_indonesia.txt'
    slangwords = get_slang_word_list(slang_url)
    with open('utils/custom_slang.json', 'r', encoding='utf-8') as f:
        custom_slang = json.load(f)
    slangwords.update(custom_slang)  # local entries override remote ones
    return slangwords
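# Assumed shape of utils/custom_slang.json (illustrative): a flat JSON object
# mapping slang to its standard form, merged over the remote dictionary, e.g.
#   {"mantul": "mantap betul", "gercep": "gerak cepat"}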
def build_stopwords():
    """Combine remote stopword lists with local additions."""
    urls = [
        'https://raw.githubusercontent.com/yasirutomo/python-sentianalysis-id/master/data/feature_list/stopwordsID.txt',
        'https://raw.githubusercontent.com/Braincore-id/IndoTWEEST/main/stopwords_twitter.csv'
    ]
    stop_words = get_stopwords(urls)
    with open('utils/add_stopwords.json', 'r', encoding='utf-8') as f:
        add_stopwords = json.load(f)
    stop_words.update(add_stopwords)
    return stop_words
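# Assumed shape of utils/add_stopwords.json (illustrative): a flat JSON array
# of extra stopwords, e.g.
#   ["nya", "deh", "dong"]
# set.update() then adds each word individually.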
def processing_text_id(text, slangwords, stop_words):
    """Clean and tokenize an Indonesian text for the sentiment model."""
    if pd.isnull(text):
        return []
    text = text.lower()
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # strip @mentions
    text = re.sub(r'#[A-Za-z0-9_]+', '', text)  # strip hashtags
    # The text is already lowercased here, so match 'rt', not 'RT':
    # a pattern like r'RT[\s]+' could never fire after lower().
    text = re.sub(r'\brt\s+', '', text)
    text = re.sub(r'http\S+', '', text)         # strip URLs
    text = re.sub(r'\d+', '', text)             # strip digits
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip()
    tokens = word_tokenize(text)
    # Normalize slang first, then drop stopwords and empty tokens.
    tokens = [slangwords.get(token, token) for token in tokens]
    tokens = [token for token in tokens if token not in stop_words and token.strip()]
    return tokens
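# Hedged sketch of the cleaning pipeline on a toy input. The slang and
# stopword resources below are tiny illustrative stand-ins, not the real
# dictionaries fetched by build_slang_dictionary()/build_stopwords().
def _demo_processing_text_id():
    toy_slang = {'gk': 'tidak', 'bgt': 'banget'}  # illustrative entries
    toy_stopwords = {'sama', 'ini'}               # illustrative entries
    tokens = processing_text_id(
        'RT @user gk suka bgt sama produk ini http://t.co/xyz',
        toy_slang, toy_stopwords,
    )
    # The mention, RT marker, URL, and stopwords are stripped and slang is
    # normalized, leaving ['tidak', 'suka', 'banget', 'produk'].
    return tokens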
def predict_sentiment_per_sentence(text, model, tokenizer, label_encoder, max_len, slangwords, stop_words):
    """Predict a sentiment label for each sentence in `text`."""
    sentences = sent_tokenize(text)
    results = []
    for sentence in sentences:
        cleaned_tokens = processing_text_id(sentence, slangwords, stop_words)
        processed_text = ' '.join(cleaned_tokens)
        # A sentence that is empty after cleaning cannot be classified.
        if not processed_text.strip():
            results.append("unknown")
            continue
        sequence = tokenizer.texts_to_sequences([processed_text])
        padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post')
        prediction = model.predict(padded_sequence, verbose=0)
        predicted_class_index = np.argmax(prediction, axis=1)[0]
        predicted_label = label_encoder.inverse_transform([predicted_class_index])[0]
        results.append(predicted_label)
    return results
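# Hedged end-to-end sketch. The artifact paths and max_len below are
# assumptions for illustration; substitute whatever your training pipeline
# actually saved.
if __name__ == '__main__':
    import pickle
    from tensorflow.keras.models import load_model

    model = load_model('model/sentiment_model.h5')    # assumed path
    with open('model/tokenizer.pkl', 'rb') as f:      # assumed path
        tokenizer = pickle.load(f)
    with open('model/label_encoder.pkl', 'rb') as f:  # assumed path
        label_encoder = pickle.load(f)

    slangwords = build_slang_dictionary()
    stop_words = build_stopwords()
    labels = predict_sentiment_per_sentence(
        'Produknya bagus banget. Tapi pengirimannya lambat sekali.',
        model, tokenizer, label_encoder,
        max_len=50,                                   # assumed training value
        slangwords=slangwords, stop_words=stop_words,
    )
    # One label per sentence, e.g. ['positive', 'negative'], depending on
    # the labels the encoder was fitted with.
    print(labels)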