| |
| """Survey_Analysis_v_3_2_86.ipynb |
| |
| Automatically generated by Colaboratory. |
| |
| Original file is located at |
| https://colab.research.google.com/drive/1VOlSQ6kva-BiGfJc7b3BwlKBegP13tdS |
| """ |
|
|
| |
| |
|
|
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import pygal as py
import seaborn as sns
import squarify as sq
import streamlit
# Global matplotlib defaults used by every chart below: a large canvas,
# small tick labels and a small bold font.
plt.rcParams["figure.figsize"] = (20, 15)
matplotlib.rc('xtick', labelsize=7)
matplotlib.rc('ytick', labelsize=7)

# 'normal' is not a font family name; matplotlib cannot resolve it and falls
# back to its default with a "findfont" warning on every draw. Request the
# generic sans-serif family explicitly instead.
font = {
    'family': 'sans-serif',
    'weight': 'bold',
    'size': 5,
}

matplotlib.rc('font', **font)
| from sklearn.feature_extraction.text import CountVectorizer |
| import warnings |
| warnings.filterwarnings("ignore", category=FutureWarning) |
| |
|
|
# Load the raw two-column CSV (sentiment label, news text).
df = pd.read_csv("/content/gen-data.csv", engine="python", encoding="ISO-8859-1")

# The parsed column names are treated as a data record here (presumably the
# CSV has no header row - confirm against the file), so they are appended
# back as the last row before the columns get their real names.
col1, col2 = df.columns[0], df.columns[1]
df2 = pd.DataFrame([[col1, col2]], columns=[col1, col2])

# DataFrame.append and the `inplace` argument of set_axis were removed in
# pandas 2.0; pd.concat + set_axis produce the same frame on old and new pandas.
df = pd.concat([df, df2], ignore_index=True).set_axis(['sentiment', 'news'], axis=1)

# Class balance of the sentiment labels.
sns.countplot(y="sentiment", data=df)

# Missing values per column (printed so the result is visible outside a notebook).
print(df.isnull().sum())
|
|
| from textblob import TextBlob |
|
|
def preprocess(ReviewText):
    """Remove markup remnants from a pandas Series of strings.

    Applies, in order: removal of ``<br/>``, removal of ``<a ...>...</a>``
    spans, removal of ``&``, ``>`` and ``<`` characters, and replacement of
    non-breaking spaces with a regular space.

    Args:
        ReviewText: pandas Series of strings (uses the ``.str`` accessor).

    Returns:
        A new Series with the substitutions applied.
    """
    substitutions = (
        ("(<br/>)", ""),
        ('(<a).*(>).*(</a>)', ''),
        ('(&)', ''),
        ('(>)', ''),
        ('(<)', ''),
        ('(\xa0)', ' '),
    )
    for pattern, replacement in substitutions:
        # The patterns are regular expressions. Since pandas 2.0
        # Series.str.replace defaults to regex=False, which would treat
        # "(<br/>)" etc. as literal text and silently clean nothing.
        ReviewText = ReviewText.str.replace(pattern, replacement, regex=True)
    return ReviewText
# Cleaned copy of the text, used by the n-gram / topic analysis further down.
df['Review Text'] = preprocess(df['news'])

# Per-row features computed from the raw 'news' column:
# TextBlob polarity, character length and whitespace-separated word count.
news = df['news']
df['polarity'] = news.map(lambda text: TextBlob(text).sentiment.polarity)
df['news_len'] = news.astype(str).apply(len)
df['word_count'] = news.apply(lambda value: len(str(value).split()))
|
|
def _print_random_reviews(frame, mask, count):
    """Print up to `count` randomly chosen 'Review Text' values from the rows of `frame` selected by `mask`."""
    matches = frame.loc[mask, 'Review Text']
    # sample() raises ValueError when asked for more rows than exist, which
    # happens whenever a polarity bucket has fewer than `count` rows.
    for text in matches.sample(min(count, len(matches))).values:
        print(text)


print('top 4 random reviews with the highest positive sentiment polarity: \n')
# Only this bucket is drawn from the de-duplicated frame.
df1 = df.drop_duplicates(subset=['Review Text'])
_print_random_reviews(df1, df1.polarity == 1, 4)

print('5 random reviews with the most neutral sentiment(zero) polarity: \n')
_print_random_reviews(df, df.polarity == 0, 5)

print('5 reviews with the most negative polarity having polarity lesser than -0.80: \n')
_print_random_reviews(df, df.polarity <= -0.80, 5)
|
|
# Box plot of the polarity scores. The column is passed by keyword: recent
# seaborn releases take `data` as the first positional parameter, so passing
# the Series positionally together with data=df is rejected.
sns.boxplot(x="polarity", data=df, palette="rainbow")

# Histogram of the polarity scores.
df['polarity'].plot(
    kind='hist',
    bins=50,
    color="peru",
    title='Sentiment Polarity Distribution',
)
plt.show()
|
|
# Number of rows (with a non-null sentiment) whose TextBlob polarity is
# positive, exactly zero, or negative.
polarity = df["polarity"]
p_s = df[polarity > 0].count()["sentiment"]
neu_s = df[polarity == 0].count()["sentiment"]
neg_s = df[polarity < 0].count()["sentiment"]

sentiment = ['positive_sentiment', "neutral_sentiment", "negative_sentiment"]
values = [p_s, neu_s, neg_s]
colors = ['#FF0000', 'olive', '#FFFF00']
explode = (0.05, 0.05, 0.05)

# Pie chart with slightly separated wedges ...
plt.pie(
    values,
    labels=sentiment,
    colors=colors,
    explode=explode,
    autopct='%1.1f%%',
    pctdistance=0.85,
)

# ... turned into a donut by covering the centre with a white disc.
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.title('count of polarity as per sentiment')
plt.show()
|
|
# Word-count distribution: box plot and histogram.
df.plot.box(y=["word_count"], color="hotpink")

df['word_count'].plot(
    kind='hist',
    bins=100,
    color="orange",
    title='Review Text Word Count Distribution',
)
plt.show()

# Character-length distribution: boxen plot and histogram.
sns.boxenplot(x="news_len", data=df)
plt.show()

# This histogram shows news_len (characters), so it gets its own title
# rather than the word-count title it previously reused.
df['news_len'].plot(
    kind='hist',
    bins=50,
    color="lightblue",
    title='Review Text Length Distribution',
)
plt.show()

# Length vs. word count, coloured by sentiment, with marginal distributions.
fig = px.scatter(
    df,
    x="news_len",
    y="word_count",
    color="sentiment",
    marginal_x="box",
    marginal_y="violin",
    title="Click on the legend items!",
)
fig.show()
|
|
def _top_ngrams(corpus, n, ngram_range, stop_words):
    """Return the `n` most frequent n-grams in `corpus` as (term, count) pairs, most frequent first."""
    vec = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words)
    # fit_transform learns the vocabulary and counts in a single pass.
    bag_of_words = vec.fit_transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]


def get_top_n_words(corpus, n=None, stop_words='english'):
    """Return the top `n` single words of `corpus` as (word, count) pairs.

    `stop_words` is forwarded to CountVectorizer; pass None to keep stop
    words. The default matches the last definition this name had before the
    three duplicated pairs of functions were merged.
    """
    return _top_ngrams(corpus, n, (1, 1), stop_words)


def get_top_n_bigram(corpus, n=None, stop_words='english'):
    """Return the top `n` bigrams of `corpus` as (bigram, count) pairs; see get_top_n_words for `stop_words`."""
    return _top_ngrams(corpus, n, (2, 2), stop_words)


def get_top_n_trigram(corpus, n=None, stop_words='english'):
    """Return the top `n` trigrams of `corpus` as (trigram, count) pairs; see get_top_n_words for `stop_words`."""
    return _top_ngrams(corpus, n, (3, 3), stop_words)


def _report_top_ngrams(common_words, title):
    """Print the (term, count) pairs, draw them as a bar chart titled `title`, and return them as a DataFrame."""
    for word, freq in common_words:
        print(word, freq)
    frame = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
    frame.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
        kind='bar', title=title)
    # Flush the figure so consecutive charts do not pile up on the same axes
    # when this runs as a script.
    plt.show()
    return frame


common_words = get_top_n_words(df['Review Text'], 20, stop_words=None)
df1 = _report_top_ngrams(common_words, 'Top 20 words in review before removing stop words')

common_words = get_top_n_words(df['Review Text'], 20)
df2 = _report_top_ngrams(common_words, 'Top 20 words in review after removing stop words')

common_words = get_top_n_bigram(df['Review Text'], 20, stop_words=None)
df3 = _report_top_ngrams(common_words, 'Top 20 bigrams in review before removing stop words')

common_words = get_top_n_bigram(df['Review Text'], 20)
df4 = _report_top_ngrams(common_words, 'Top 20 bigrams in review after removing stop words')

common_words = get_top_n_trigram(df['Review Text'], 20, stop_words=None)
df5 = _report_top_ngrams(common_words, 'Top 20 trigrams in review before removing stop words')

common_words = get_top_n_trigram(df['Review Text'], 20)
df6 = _report_top_ngrams(common_words, 'Top 20 trigrams in review after removing stop words')
|
|
import nltk

# Corpora / models fetched before TextBlob tagging is used below.
for resource in ('punkt', 'wordnet', 'omw-1.4', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Tag the actual text of every review. str(Series) would only give the
# truncated console repr (index numbers, "...", dtype footer), so the counts
# would describe a handful of clipped rows instead of the corpus.
blob = TextBlob(" ".join(df['Review Text'].astype(str)))
pos_df = pd.DataFrame(blob.tags, columns=['word', 'pos'])
pos_df = pos_df.pos.value_counts()[:20]
pos_df.plot(
    kind='bar',
    title='Top 20 Part-of-speech tagging for review corpus')
|
|
# (sentiment label, marker colour) in the order the box traces are drawn.
_SENTIMENT_BOX_COLORS = (
    ('positive', 'rgb(214, 12, 140)'),
    ('negative', 'rgb(0, 128, 128)'),
    ('neutral', 'rgb(10, 140, 208)'),
)


def _sentiment_box_figure(column, title):
    """Build a plotly figure with one box trace of df[column] per sentiment label."""
    traces = [
        go.Box(
            y=df.loc[df['sentiment'] == label, column],
            name=label,
            marker=dict(color=color),
        )
        for label, color in _SENTIMENT_BOX_COLORS
    ]
    return go.Figure(data=traces, layout=go.Layout(title=title))


# .show() renders the figures when run as a script; a bare go.Figure(...)
# expression is only displayed by a notebook.
_sentiment_box_figure('polarity', "Polarity Boxplot according to sentiment").show()
_sentiment_box_figure('news_len', "news length Boxplot by sentiment").show()

# Overlaid polarity histograms, one per sentiment label.
data = [
    go.Histogram(
        x=df.loc[df['sentiment'] == label, 'polarity'],
        name=label,
        opacity=0.75,
    )
    for label in ('positive', 'neutral', 'negative')
]
layout = go.Layout(barmode='overlay', title='Distribution of Sentiment polarity')
go.Figure(data=data, layout=layout).show()
|
|
def _polarity_density_figure(y_column, y_hist_name, ncontours, y_hist_color, hovermode):
    """Build a polarity-vs-df[y_column] figure: scatter + 2D density contours with marginal histograms.

    The main panel occupies the lower-left 85% of the canvas; the polarity
    histogram is drawn on the secondary y axis (top strip) and the
    `y_column` histogram on the secondary x axis (right strip).
    """
    traces = [
        go.Scatter(
            x=df['polarity'], y=df[y_column], mode='markers', name='points',
            marker=dict(color='rgb(102,0,0)', size=2, opacity=0.4),
        ),
        go.Histogram2dContour(
            x=df['polarity'], y=df[y_column], name='density', ncontours=ncontours,
            colorscale='Hot', reversescale=True, showscale=False,
        ),
        go.Histogram(
            x=df['polarity'], name='Sentiment polarity density',
            marker=dict(color='rgb(102,0,0)'),
            yaxis='y2',
        ),
        go.Histogram(
            y=df[y_column], name=y_hist_name, marker=dict(color=y_hist_color),
            xaxis='x2',
        ),
    ]
    main_axis = dict(domain=[0, 0.85], showgrid=False, zeroline=False)
    margin_axis = dict(domain=[0.85, 1], showgrid=False, zeroline=False)
    figure_layout = go.Layout(
        showlegend=False,
        autosize=False,
        width=600,
        height=550,
        xaxis=dict(main_axis),
        yaxis=dict(main_axis),
        margin=dict(t=50),
        hovermode=hovermode,
        bargap=0,
        xaxis2=dict(margin_axis),
        yaxis2=dict(margin_axis),
    )
    return go.Figure(data=traces, layout=figure_layout)


# .show() renders the figures when run as a script; a bare go.Figure(...)
# expression is only displayed by a notebook. The per-figure settings
# (contour count, histogram colour, hover mode) are kept as they were.
_polarity_density_figure('news_len', 'news length density', 50, 'rgb(102,0,0)', 'x unified').show()
_polarity_density_figure('word_count', 'word count density', 20, 'rgb(112,0,0)', 'closest').show()
|
|
| |
|
|
| |
|
|
import scattertext as st
import spacy

# Blank English pipeline with only a sentence splitter.
nlp = spacy.blank("en")
nlp.add_pipe('sentencizer')

# Corpus keyed by the sentiment label, built from the cleaned text.
corpus = st.CorpusFromPandas(df, category_col='sentiment', text_col='Review Text', nlp=nlp).build()
print(list(corpus.get_scaled_f_scores_vs_background().index[:20]))

# Add one scaled-F-score column per sentiment and print its 20 best-scoring
# terms. The lists used to be bare expressions, which are discarded when the
# file runs as a script.
term_freq_df = corpus.get_term_freq_df()
for category in ('positive', 'neutral', 'negative'):
    column = category + '_sentiment'
    term_freq_df[column] = corpus.get_scaled_f_scores(category)
    print(list(term_freq_df.sort_values(by=column, ascending=False).index[:20]))
|
|
| from sklearn.feature_extraction.text import TfidfVectorizer |
| from sklearn.decomposition import TruncatedSVD |
| from collections import Counter |
|
|
# Number of latent topics extracted by the truncated SVD below.
n_topics = 10

# TF-IDF document-term matrix of the cleaned text (English stop words removed).
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    use_idf=True,
    smooth_idf=True,
)
reindexed_data = df['Review Text'].values
document_term_matrix = tfidf_vectorizer.fit_transform(reindexed_data)

# LSA: project every document onto n_topics components.
lsa_model = TruncatedSVD(n_components=n_topics)
lsa_topic_matrix = lsa_model.fit_transform(document_term_matrix)
|
|
def get_keys(topic_matrix):
    '''
    Return, for every row of topic_matrix, the column index holding
    the row's largest value, as a plain Python list.
    '''
    strongest_topic = topic_matrix.argmax(axis=1)
    return strongest_topic.tolist()
|
|
def keys_to_counts(keys):
    '''
    Tally the entries of keys and return a (categories, counts) tuple of
    two parallel lists, ordered by first appearance of each category.
    '''
    tally = Counter(keys)
    categories = list(tally.keys())
    counts = list(tally.values())
    return (categories, counts)
| |
# Dominant LSA component of every document, then how many documents each
# component received.
lsa_keys = get_keys(lsa_topic_matrix)
_lsa_tally = keys_to_counts(lsa_keys)
lsa_categories, lsa_counts = _lsa_tally
|
|
def get_top_n_words(n, keys, document_term_matrix, tfidf_vectorizer):
    '''
    returns a list of n_topics strings, where each string contains the n
    highest-weighted words of one predicted category, in descending order

    n: number of words to keep per topic
    keys: per-document topic index (one entry per row of document_term_matrix)
    document_term_matrix: documents x terms matrix built by tfidf_vectorizer
    tfidf_vectorizer: fitted vectorizer, used to map term indices to words

    Reads the module-level n_topics for the number of topics. A topic that
    no document was assigned to yields an empty string, so the result always
    has n_topics entries. This definition replaces the earlier
    CountVectorizer-based function of the same name defined above in this file.
    '''
    topic_of_doc = np.asarray(keys)
    n_terms = document_term_matrix.shape[1]
    top_words = []
    for topic in range(n_topics):
        doc_rows = np.flatnonzero(topic_of_doc == topic)
        if doc_rows.size == 0:
            # Previously the accumulator stayed the integer 0 here and
            # calling .toarray() on it raised AttributeError.
            top_words.append("")
            continue
        # Sum the rows of this topic in one call instead of adding them one
        # document at a time.
        term_weights = np.asarray(document_term_matrix[doc_rows].sum(axis=0)).ravel()
        top_n_word_indices = np.argsort(term_weights)[-n:][::-1]
        topic_words = []
        for index in top_n_word_indices:
            # A one-hot row lets inverse_transform hand back the single term
            # stored at this column index.
            temp_word_vector = np.zeros((1, n_terms))
            temp_word_vector[:, index] = 1
            # Keep the token as-is: forcing it through encode('ascii')
            # raised UnicodeEncodeError for any non-ASCII token.
            topic_words.append(tfidf_vectorizer.inverse_transform(temp_word_vector)[0][0])
        top_words.append(" ".join(topic_words))
    return top_words
| |
# Three highest-weighted words per LSA topic.
top_lsa = get_top_n_words(3, lsa_keys, document_term_matrix, tfidf_vectorizer)

for topic_number, words in enumerate(top_lsa, start=1):
    print("Topic {}: ".format(topic_number), words)

# Same arguments as above, so reuse the result instead of recomputing it.
top_3_words = top_lsa
labels = ['Topic {}: \n'.format(i + 1) + top_3_words[i] for i in lsa_categories]

# Bar chart: number of documents assigned to each topic.
fig, ax = plt.subplots(figsize=(16, 8))
ax.bar(lsa_categories, lsa_counts, color="skyblue")
ax.set_xticks(lsa_categories)
ax.set_xticklabels(labels, rotation=45, rotation_mode='default', color="olive")
ax.set_ylabel('Number of review text on topics')
ax.set_title('Count of LSA topics')
plt.show()
|
|
| """#---2----""" |
|
|
# Label distribution (only displayed when run in a notebook cell).
df['sentiment'].value_counts()

# 80 / 20 split into training and evaluation frames.
from sklearn.model_selection import train_test_split
train, eva = train_test_split(df, test_size=0.2)

from simpletransformers.classification import ClassificationModel

# Three-class classifier on top of bert-base-cased, configured for CPU.
model_args = {'reprocess_input_data': True, 'overwrite_output_dir': True}
model = ClassificationModel(
    'bert',
    'bert-base-cased',
    num_labels=3,
    args=model_args,
    use_cuda=False,
)
|
|
| |
def making_label(st):
    """Encode a sentiment string as an integer class id.

    'positive' -> 0, 'neutral' -> 2, anything else -> 1.
    """
    if st == 'positive':
        return 0
    return 2 if st == 'neutral' else 1
| |
# Add the integer label column. assign() builds new frames instead of writing
# into the objects returned by train_test_split, which can trigger pandas'
# SettingWithCopyWarning.
train = train.assign(label=train['sentiment'].apply(making_label))
eva = eva.assign(label=eva['sentiment'].apply(making_label))
print(train.shape)

# Only the first 1500 training rows and the last 400 evaluation rows are
# used; newlines inside the text are replaced by spaces.
train_df = pd.DataFrame({
    'text': train['news'][:1500].replace(r'\n', ' ', regex=True),
    'label': train['label'][:1500],
})

eval_df = pd.DataFrame({
    'text': eva['news'][-400:].replace(r'\n', ' ', regex=True),
    'label': eva['label'][-400:],
})

model.train_model(train_df)

result, model_outputs, wrong_predictions = model.eval_model(eval_df)

# Printed explicitly: bare expressions are discarded when run as a script.
print(result)
print(len(wrong_predictions))
|
|
# Predicted class id = index of the largest model output in each row.
lst = np.argmax(model_outputs, axis=1).tolist()

true = eval_df['label'].tolist()
predicted = lst

import sklearn.metrics  # also binds the top-level `sklearn` name

# Class ids follow making_label(): 0 = positive, 1 = negative, 2 = neutral.
# The report previously named them positive / neutral / negative, which
# swapped the neutral and negative rows.
class_ids = [0, 1, 2]
class_names = ['positive', 'negative', 'neutral']

# labels= pins the matrix to 3x3 even if a class is absent from this sample.
mat = sklearn.metrics.confusion_matrix(true, predicted, labels=class_ids)

df_cm = pd.DataFrame(mat, index=class_names, columns=class_names)

sns.heatmap(df_cm, annot=True)
plt.show()

print(sklearn.metrics.classification_report(
    true, predicted, labels=class_ids, target_names=class_names))

print(sklearn.metrics.accuracy_score(true, predicted))
|
|
| |
def get_result(statement):
    """Print the sentiment the module-level `model` predicts for one sentence.

    Args:
        statement: the text to classify.

    Returns:
        None; the label ('positive', 'negative' or 'neutral') is printed.
    """
    result = model.predict([statement])
    # result[1][0] holds the raw outputs for the single input. argmax gives
    # one index even when two outputs tie, whereas int(np.where(...)[0])
    # raised on ties and relies on deprecated array-to-scalar conversion.
    pos = int(np.argmax(result[1][0]))
    sentiment_dict = {0: 'positive', 1: 'negative', 2: 'neutral'}
    print(sentiment_dict[pos])
    return
|
|
| |
# Ad-hoc sentences run through the fine-tuned classifier, in the original order.
_DEMO_STATEMENTS = (
    "According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",
    "According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .",
    'Sales in Finland decreased by 2.0 % , and international sales decreased by 9.3 % in terms of euros , and by 15.1 % in terms of local currencies .',
    "This company is growing like anything with 23% profit every year",
    "This company is not able to make any profit but make very less profit in last quarter",
    "The doctor treated well and the patient was very healthy",
    "the act of politicians is to serve and help needy and not to create ruck suck",
    "American burger is too good. Can't resisit to go and have one",
    "GDP per capita increased to double in India from 2013",
    "Indian economy is doing very good and will become super power one day.",
    "Indian economy is doing very good and will create millions of jobs in coming years",
    "Indian economy is not doing very good and need urgent reforms but we are pretty sure it will be very good in coming years",
    "Indian economy is doing very good.Indian economy is not doing very good ",
    "Indian economy is not doing very good. Indian economy will bounce back to become leading economy",
    "Indian economy is not doing very good. Urgent reforms is required to create new jobs and improve export",
    "The stock market of Indian economy is dangling too much",
)

for _statement in _DEMO_STATEMENTS:
    get_result(_statement)
|
|
| """#VADER""" |
|
|
| |
|
|
| from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer |
|
|
obj = SentimentIntensityAnalyzer()

# First three examples keep the sentence / score variables of the original.
for sentence in ("Ram is really good ", "Ram is better ", "Rahul is really bad"):
    sentiment_dict = obj.polarity_scores(sentence)
    print(sentiment_dict)

# Remaining examples, grouped by what they vary: punctuation, capitalisation,
# degree words, a contrasting clause, slang, and emoticons.
_VADER_EXAMPLES = (
    'Ram is good boy',
    'Ram is good boy!',
    'Ram is good boy!!',

    'Ram is good',
    'Ram is GOOD',

    'Ram is good',
    'Ram is better',
    'Ram is best',

    'Ram is bad',
    'Ram is worse',
    'Ram is worst',

    'Ram is good',
    'Ram is good, but he is also naughty sometimes',

    "That Hotel",
    "That Hotel SUX",
    "That Hotel SUCKS",

    "Your :) is the most beautiful thing I have ever seen",
    "Your smile is the most beautiful thing I have ever seen",

    "Your :( is the worst thing I have ever seen",
    "Your smile is the worst thing I have ever seen",
)

for _example in _VADER_EXAMPLES:
    print(obj.polarity_scores(_example))
|
|
| |
| |
|
|
| """#3.a Using FINBERT Model""" |
|
|
| |
| |
|
|
| from transformers import BertTokenizer, BertForSequenceClassification, pipeline |
|
|
| |
import transformers
print(transformers.__version__)

# (checkpoint, number of labels, example sentences) for the three FinBERT
# demos: tone, ESG and forward-looking-statement classification.
_FINBERT_DEMOS = (
    ('yiyanghkust/finbert-tone', 3, [
        'growth is strong and we have plenty of liquidity.',
        'there is a shortage of capital, and we need extra financing.',
        'formulation patents might protect Vasotec to a limited extent.',
    ]),
    ('yiyanghkust/finbert-esg', 4, [
        'Managing and working to mitigate the impact our operations have on the environment is a core element of our business.',
        'Rhonda has been volunteering for several years for a variety of charitable community programs.',
        'Cabot\'s annual statements are audited annually by an independent registered public accounting firm.',
        'As of December 31, 2012, the 2011 Term Loan had a principal balance of $492.5 million.',
    ]),
    ('yiyanghkust/finbert-fls', 3, [
        'we expect the age of our fleet to enhance availability and reliability due to reduced downtime for repairs.',
        'on an equivalent unit of production basis, general and administrative expenses declined 24 percent from 1994 to $.67 per boe.',
        'we will continue to assess the need for a valuation allowance against deferred tax assets considering all available evidence obtained in',
    ]),
)

for _checkpoint, _num_labels, _sentences in _FINBERT_DEMOS:
    # finbert / tokenizer / nlp / results stay module-level names, so after
    # the loop they refer to the last (FLS) demo as before. Note that `nlp`
    # rebinds the spaCy pipeline created earlier in the file.
    finbert = BertForSequenceClassification.from_pretrained(_checkpoint, num_labels=_num_labels)
    tokenizer = BertTokenizer.from_pretrained(_checkpoint)
    nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
    results = nlp(_sentences)
    # Printed explicitly: a bare `results` expression is discarded in a script.
    print(_checkpoint, results)
|
|
from sklearn.metrics import accuracy_score
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
)

X = df['Review Text'].to_list()
y = df['sentiment'].to_list()

# Output index -> sentiment string, shared by the three runs below.
labels = {0: 'neutral', 1: 'positive', 2: 'negative'}


def _predict_sentiments(texts, tokenizer, classifier, id_to_label):
    """Classify each text with `classifier`, print it next to its label, and return the list of labels."""
    predictions = []
    for x in texts:
        # truncation=True clips inputs to the model's maximum length instead
        # of failing on an over-long text.
        inputs = tokenizer(x, return_tensors="pt", padding=True, truncation=True)
        outputs = classifier(**inputs)[0]

        val = id_to_label[int(np.argmax(outputs.detach().numpy()))]
        print(x, '---->', val)
        print('#######################################################')
        predictions.append(val)
    return predictions


# --- FinBERT (tone) -------------------------------------------------------
finbert_whole = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
tokenizer_whole = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

sent_val = _predict_sentiments(X, tokenizer_whole, finbert_whole, labels)
print(accuracy_score(y, sent_val))

# --- DistilBERT -------------------------------------------------------------
# NOTE(review): a base checkpoint has no trained classification head, so this
# accuracy is presumably only a baseline - confirm that is the intent.
# num_labels=3 makes the head match the three-entry `labels` map.
tokenizer_distilbert = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model_distilbert = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=3)

sent_val_bert = _predict_sentiments(X, tokenizer_distilbert, model_distilbert, labels)
# Score this model's own predictions (previously sent_val, i.e. the FinBERT
# predictions, was scored again).
print(accuracy_score(y, sent_val_bert))

# --- BERT -------------------------------------------------------------------
# bert-base-uncased is a BERT checkpoint, so it is loaded with the Bert
# classes rather than the DistilBert ones. Same baseline caveat as above.
tokenizer_bert = BertTokenizer.from_pretrained("bert-base-uncased")
model_bert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

sent_val_bert1 = _predict_sentiments(X, tokenizer_bert, model_bert, labels)
# Same fix: score sent_val_bert1, not the FinBERT predictions.
print(accuracy_score(y, sent_val_bert1))