Spaces:

nabilaalt
/

API-Journaling

Sleeping

App Files Files Community

API-Journaling / utils /text_utils.py

nabilaalt

initial

780a315 9 months ago

raw

history blame contribute delete

3.16 kB

	import re, string, json, requests
	import numpy as np
	import pandas as pd
	from nltk.tokenize import sent_tokenize, word_tokenize
	from tensorflow.keras.preprocessing.sequence import pad_sequences

	def get_slang_word_list(url, timeout=10):
	    """Download a ``key=value`` slang list and return it as a dict.

	    Every line of the response body that contains ``=`` is split on the
	    first ``=``; both halves are stripped of surrounding whitespace. Lines
	    without ``=`` are ignored. When a key occurs more than once, the last
	    occurrence wins.

	    Args:
	        url: Address of the plain-text dictionary to fetch.
	        timeout: Seconds to wait for the server. Without a timeout
	            ``requests`` may block indefinitely on an unresponsive host.

	    Returns:
	        dict mapping each key (text left of ``=``) to its value (text right
	        of ``=``).

	    Raises:
	        requests.RequestException: on connection failure, timeout, or a
	            4xx/5xx HTTP status.
	    """
	    response = requests.get(url, timeout=timeout)
	    # Fail loudly on an HTTP error instead of parsing the error page as if
	    # it were the dictionary (which would silently yield an empty mapping).
	    response.raise_for_status()
	    slang_dict = {}
	    for line in response.text.strip().split('\n'):
	        key, separator, val = line.partition('=')
	        if separator:
	            slang_dict[key.strip()] = val.strip()
	    return slang_dict

	def get_stopwords(urls, timeout=10):
	    """Download one-word-per-line stopword lists and merge them into a set.

	    For each URL the response body is split into lines. If the first line
	    contains the text ``stopword`` (case-insensitive) it is treated as a
	    header and skipped. Remaining entries are stripped and lowercased;
	    blank entries are discarded so the result never contains ``''``.

	    Args:
	        urls: Iterable of addresses to fetch. An empty iterable yields an
	            empty set without any network access.
	        timeout: Seconds to wait for each server. Without a timeout
	            ``requests`` may block indefinitely on an unresponsive host.

	    Returns:
	        set of lowercased stopword strings collected from all URLs.

	    Raises:
	        requests.RequestException: on connection failure, timeout, or a
	            4xx/5xx HTTP status for any URL.
	    """
	    all_stopwords = set()
	    for url in urls:
	        response = requests.get(url, timeout=timeout)
	        # Do not ingest the text of an HTTP error page as stopwords.
	        response.raise_for_status()
	        words = response.text.strip().split('\n')
	        # Some sources begin with a column header such as "stopword".
	        if words and 'stopword' in words[0].lower():
	            words = words[1:]
	        normalised = (word.strip().lower() for word in words)
	        all_stopwords.update(word for word in normalised if word)
	    return all_stopwords

	def build_slang_dictionary():
	    """Assemble the slang lookup table from a remote list plus local entries.

	    The remote ``key=value`` list is fetched with ``get_slang_word_list``
	    and then updated with the contents of ``utils/custom_slang.json``, so a
	    local entry replaces a downloaded one that has the same key. The JSON
	    path is relative and is therefore resolved against the current working
	    directory.

	    Returns:
	        The merged slang mapping.
	    """
	    merged = get_slang_word_list(
	        'https://raw.githubusercontent.com/King-srt/Indonesia-Slang-Dictionary/refs/heads/main/dictionary_indonesia.txt'
	    )
	    with open('utils/custom_slang.json', 'r', encoding='utf-8') as handle:
	        local_entries = json.load(handle)
	    # Local entries are applied last so they take precedence.
	    merged.update(local_entries)
	    return merged

	def build_stopwords():
	    """Assemble the stopword set from two remote lists plus local additions.

	    Both remote lists are fetched and merged by ``get_stopwords``; the
	    result is then updated with the contents of
	    ``utils/add_stopwords.json``. The JSON path is relative and is therefore
	    resolved against the current working directory.

	    Returns:
	        The combined stopword set.
	    """
	    combined = get_stopwords([
	        'https://raw.githubusercontent.com/yasirutomo/python-sentianalysis-id/master/data/feature_list/stopwordsID.txt',
	        'https://raw.githubusercontent.com/Braincore-id/IndoTWEEST/main/stopwords_twitter.csv',
	    ])
	    with open('utils/add_stopwords.json', 'r', encoding='utf-8') as handle:
	        extra_words = json.load(handle)
	    combined.update(extra_words)
	    return combined

	def processing_text_id(text, slangwords, stop_words):
	    """Clean one piece of text and return its normalised tokens.

	    Cleaning steps, in order: remove ``@mentions`` and ``#hashtags``, remove
	    the upper-case retweet marker ``RT``, lowercase, remove anything starting
	    with ``http`` up to the next whitespace, remove digit runs, remove ASCII
	    punctuation, and strip surrounding whitespace. The remainder is split
	    with ``word_tokenize``; each token found in ``slangwords`` is replaced
	    by its mapped value, and tokens that are in ``stop_words`` or blank are
	    dropped.

	    Args:
	        text: The string to clean. A null value (as judged by
	            ``pd.isnull``) yields an empty list.
	        slangwords: Mapping used to replace tokens (``token -> value``).
	        stop_words: Container of tokens to discard.

	    Returns:
	        list of the remaining tokens; empty when ``text`` is null or nothing
	        is left after cleaning.
	    """
	    if pd.isnull(text):
	        return []
	    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
	    text = re.sub(r'#[A-Za-z0-9_]+', '', text)
	    # The marker is upper-case, so it has to be removed before lowercasing;
	    # matching it against lowercased text can never succeed. The word
	    # boundary keeps words that merely end in "RT" intact.
	    # NOTE(review): if a model was fitted on text cleaned with the earlier
	    # ordering, confirm the vocabulary does not rely on a stray "rt" token.
	    text = re.sub(r'\bRT\s+', '', text)
	    text = text.lower()
	    text = re.sub(r'http\S+', '', text)
	    text = re.sub(r'\d+', '', text)
	    text = text.translate(str.maketrans('', '', string.punctuation))
	    text = text.strip()
	    if not text:
	        # Nothing left to tokenize.
	        return []
	    tokens = [slangwords.get(token, token) for token in word_tokenize(text)]
	    return [token for token in tokens if token not in stop_words and token.strip()]

	def predict_sentiment_per_sentence(text, model, tokenizer, label_encoder, max_len, slangwords, stop_words):
	    """Predict one label for every sentence in ``text``.

	    The text is split with ``sent_tokenize`` and each sentence is cleaned
	    with ``processing_text_id``. Sentences that have no tokens left after
	    cleaning get the label ``"unknown"``. All remaining sentences are
	    converted with ``tokenizer.texts_to_sequences``, post-padded to
	    ``max_len`` and scored in a single ``model.predict`` call; the index of
	    the highest score per row is mapped back to a label with
	    ``label_encoder.inverse_transform``.

	    Args:
	        text: Input text. A null or empty value yields an empty list.
	        model: Object whose ``predict(batch, verbose=0)`` returns one row of
	            per-class scores per input row.
	        tokenizer: Object providing ``texts_to_sequences``.
	        label_encoder: Object providing ``inverse_transform`` for class
	            indices.
	        max_len: Length passed to ``pad_sequences`` as ``maxlen``.
	        slangwords: Forwarded to ``processing_text_id``.
	        stop_words: Forwarded to ``processing_text_id``.

	    Returns:
	        list with one entry per sentence, in sentence order: the predicted
	        label, or ``"unknown"`` for a sentence that was empty after cleaning.
	    """
	    if pd.isnull(text) or not text:
	        return []
	    sentences = sent_tokenize(text)
	    # Start from "unknown" everywhere; predicted labels overwrite by position.
	    results = ["unknown"] * len(sentences)
	    positions = []
	    cleaned_texts = []
	    for position, sentence in enumerate(sentences):
	        cleaned_tokens = processing_text_id(sentence, slangwords, stop_words)
	        processed_text = ' '.join(cleaned_tokens)
	        if processed_text.strip():
	            positions.append(position)
	            cleaned_texts.append(processed_text)
	    if not cleaned_texts:
	        return results
	    # One batched forward pass instead of one predict call per sentence.
	    sequences = tokenizer.texts_to_sequences(cleaned_texts)
	    padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
	    predictions = model.predict(padded_sequences, verbose=0)
	    class_indices = np.argmax(predictions, axis=1)
	    labels = label_encoder.inverse_transform(class_indices)
	    for position, label in zip(positions, labels):
	        results[position] = label
	    return results