import warnings warnings.simplefilter("ignore", UserWarning) from matplotlib import pyplot as plt %matplotlib inline
import pandas as pd pd.options.mode.chained_assignment = None import numpy as np from string import punctuation
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score, auc, roc_auc_score from sklearn.externals import joblib
# Load the tweet sentiment dataset (label + text only), rename the
# columns, and shuffle the rows with a fixed seed for reproducibility.
data = pd.read_csv(
    './data/tweets.csv',
    encoding='latin1',
    usecols=['Sentiment', 'SentimentText'],
)
data.columns = ['sentiment', 'text']
data = data.sample(frac=1, random_state=42)
print(data.shape)
1
(1578614, 2)
1 2
# Preview the first ten shuffled examples: label followed by tweet text.
for _, record in data.head(10).iterrows():
    print(record['sentiment'], record['text'])
1 http://www.popsugar.com/2999655 keep voting for robert pattinson in the popsugar100 as well!!
1 @GamrothTaylor I am starting to worry about you, only I have Navy Seal type sleep hours.
0 sunburned...no sunbaked! ow. it hurts to sit.
1 Celebrating my 50th birthday by doing exactly the same as I do every other day - working on our websites. It's just another day.
1 Leah and Aiden Gosselin are the cutest kids on the face of the Earth
1 @MissHell23 Oh. I didn't even notice.
0 WTF is wrong with me?!!! I'm completely miserable. I need to snap out of this
0 Was having the best time in the gym until I got to the car and had messages waiting for me... back to the down stage!
1 @JENTSYY oh what happened??
0 @catawu Ghod forbid he should feel responsible for anything!
from keras.preprocessing.text import Tokenizer from keras.preprocessing.text import text_to_word_sequence from keras.preprocessing.sequence import pad_sequences
from keras.models import Model from keras.models import Sequential
from keras.layers import Input, Dense, Embedding, Conv1D, Conv2D, MaxPooling1D, MaxPool2D from keras.layers import Reshape, Flatten, Dropout, Concatenate from keras.layers import SpatialDropout1D, concatenate from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.callbacks import Callback from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping from keras.models import load_model from keras.utils.vis_utils import plot_model
4 - Recurrent Neural Network with pre-trained GloVe embeddings
def get_coefs(word, *arr):
    """Parse one GloVe line into ``(token, float32 vector)``.

    Returns ``(None, None)`` for lines whose trailing fields are not all
    numeric (some 840B tokens contain whitespace and break the split),
    so callers can filter the sentinel out afterwards.
    """
    try:
        return word, np.asarray(arr, dtype='float32')
    except ValueError:  # narrowed from bare except: only parse failures
        return None, None


# Build the token -> vector map from the GloVe 840B/300d text file.
# The context manager closes the file deterministically (the original
# left the handle open).
with open('./embeddings/glove.840B.300d.txt') as glove_file:
    embeddings_index = dict(
        get_coefs(*line.strip().split()) for line in tqdm_notebook(glove_file)
    )
# Remove entries whose vector is not exactly 300-d (malformed GloVe
# rows), then drop the sentinel None key produced by unparseable lines.
embed_size = 300
for token in tqdm_notebook(list(embeddings_index.keys())):
    vector = embeddings_index[token]
    try:
        if vector.shape != (embed_size,):
            embeddings_index.pop(token)
    except AttributeError:
        # Value has no .shape (e.g. None from a failed parse); the
        # sentinel key is removed in one step below.
        pass
# BUG FIX: pop(None) raised KeyError when every line parsed cleanly;
# the default makes the cleanup a no-op in that case.
embeddings_index.pop(None, None)
# Copy each in-vocabulary word's pre-trained vector into the embedding
# matrix; count the words that have no GloVe vector (out-of-vocabulary).
oov = 0
for word, idx in tqdm_notebook(word_index.items()):
    if idx >= MAX_NB_WORDS:
        continue
    vector = embeddings_index.get(word)
    if vector is None:
        oov += 1
    else:
        embedding_matrix[idx] = vector
print(oov)
def get_rnn_model_with_glove_embeddings():
    """Build a bidirectional-GRU tweet classifier over GloVe vectors.

    Architecture: trainable GloVe embedding -> spatial dropout ->
    BiGRU(100, all timesteps) -> concat(average pool, max pool) ->
    sigmoid unit.  Relies on module-level MAX_LENGTH, MAX_NB_WORDS and
    embedding_matrix being defined before the call.
    """
    embedding_dim = 300
    inp = Input(shape=(MAX_LENGTH,))
    embedded = Embedding(
        MAX_NB_WORDS,
        embedding_dim,
        weights=[embedding_matrix],
        input_length=MAX_LENGTH,
        trainable=True,
    )(inp)
    embedded = SpatialDropout1D(0.3)(embedded)
    hidden = Bidirectional(GRU(100, return_sequences=True))(embedded)
    pooled = concatenate([
        GlobalAveragePooling1D()(hidden),
        GlobalMaxPooling1D()(hidden),
    ])
    outp = Dense(1, activation="sigmoid")(pooled)
    model = Model(inputs=inp, outputs=outp)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model
# Plot styling for the model-comparison charts below.
import seaborn as sns
from sklearn.metrics import roc_auc_score

sns.set_style("whitegrid")
sns.set_palette("pastel")
# Gather every model's saved predictions into one frame, one column per
# model, named after the file it came from.
predictions_files = os.listdir('./predictions/')

predictions_dfs = []
for filename in predictions_files:
    frame = pd.read_csv('./predictions/{0}'.format(filename))
    # BUG FIX: filename.strip('.csv') strips any of the characters
    # {., c, s, v} from BOTH ends (e.g. 'svc.csv' -> ''); drop only the
    # extension instead.
    frame.columns = [os.path.splitext(filename)[0]]
    predictions_dfs.append(frame)

predictions = pd.concat(predictions_dfs, axis=1)
# Accuracy of each model column, measured against the shared y_true column.
scores = {}
for name in tqdm_notebook(predictions.columns, leave=False):
    if name == 'y_true':
        continue
    scores[name] = accuracy_score(
        predictions['y_true'].values, predictions[name].values
    )
import warnings warnings.simplefilter("ignore", UserWarning) from matplotlib import pyplot as plt %matplotlib inline
import pandas as pd pd.options.mode.chained_assignment = None import numpy as np from string import punctuation
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score, auc, roc_auc_score from sklearn.externals import joblib
# Load the tweet sentiment dataset (label + text only), rename the
# columns, and shuffle the rows with a fixed seed for reproducibility.
data = pd.read_csv(
    './data/tweets.csv',
    encoding='latin1',
    usecols=['Sentiment', 'SentimentText'],
)
data.columns = ['sentiment', 'text']
data = data.sample(frac=1, random_state=42)
print(data.shape)
1
(1578614, 2)
1 2
# Preview the first ten shuffled examples: label followed by tweet text.
for _, record in data.head(10).iterrows():
    print(record['sentiment'], record['text'])
1 http://www.popsugar.com/2999655 keep voting for robert pattinson in the popsugar100 as well!!
1 @GamrothTaylor I am starting to worry about you, only I have Navy Seal type sleep hours.
0 sunburned...no sunbaked! ow. it hurts to sit.
1 Celebrating my 50th birthday by doing exactly the same as I do every other day - working on our websites. It's just another day.
1 Leah and Aiden Gosselin are the cutest kids on the face of the Earth
1 @MissHell23 Oh. I didn't even notice.
0 WTF is wrong with me?!!! I'm completely miserable. I need to snap out of this
0 Was having the best time in the gym until I got to the car and had messages waiting for me... back to the down stage!
1 @JENTSYY oh what happened??
0 @catawu Ghod forbid he should feel responsible for anything!
from keras.preprocessing.text import Tokenizer from keras.preprocessing.text import text_to_word_sequence from keras.preprocessing.sequence import pad_sequences
from keras.models import Model from keras.models import Sequential
from keras.layers import Input, Dense, Embedding, Conv1D, Conv2D, MaxPooling1D, MaxPool2D from keras.layers import Reshape, Flatten, Dropout, Concatenate from keras.layers import SpatialDropout1D, concatenate from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.callbacks import Callback from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping from keras.models import load_model from keras.utils.vis_utils import plot_model
4 - Recurrent Neural Network with pre-trained GloVe embeddings
def get_coefs(word, *arr):
    """Parse one GloVe line into ``(token, float32 vector)``.

    Returns ``(None, None)`` for lines whose trailing fields are not all
    numeric (some 840B tokens contain whitespace and break the split),
    so callers can filter the sentinel out afterwards.
    """
    try:
        return word, np.asarray(arr, dtype='float32')
    except ValueError:  # narrowed from bare except: only parse failures
        return None, None


# Build the token -> vector map from the GloVe 840B/300d text file.
# The context manager closes the file deterministically (the original
# left the handle open).
with open('./embeddings/glove.840B.300d.txt') as glove_file:
    embeddings_index = dict(
        get_coefs(*line.strip().split()) for line in tqdm_notebook(glove_file)
    )
# Remove entries whose vector is not exactly 300-d (malformed GloVe
# rows), then drop the sentinel None key produced by unparseable lines.
embed_size = 300
for token in tqdm_notebook(list(embeddings_index.keys())):
    vector = embeddings_index[token]
    try:
        if vector.shape != (embed_size,):
            embeddings_index.pop(token)
    except AttributeError:
        # Value has no .shape (e.g. None from a failed parse); the
        # sentinel key is removed in one step below.
        pass
# BUG FIX: pop(None) raised KeyError when every line parsed cleanly;
# the default makes the cleanup a no-op in that case.
embeddings_index.pop(None, None)
# Copy each in-vocabulary word's pre-trained vector into the embedding
# matrix; count the words that have no GloVe vector (out-of-vocabulary).
oov = 0
for word, idx in tqdm_notebook(word_index.items()):
    if idx >= MAX_NB_WORDS:
        continue
    vector = embeddings_index.get(word)
    if vector is None:
        oov += 1
    else:
        embedding_matrix[idx] = vector
print(oov)
def get_rnn_model_with_glove_embeddings():
    """Build a bidirectional-GRU tweet classifier over GloVe vectors.

    Architecture: trainable GloVe embedding -> spatial dropout ->
    BiGRU(100, all timesteps) -> concat(average pool, max pool) ->
    sigmoid unit.  Relies on module-level MAX_LENGTH, MAX_NB_WORDS and
    embedding_matrix being defined before the call.
    """
    embedding_dim = 300
    inp = Input(shape=(MAX_LENGTH,))
    embedded = Embedding(
        MAX_NB_WORDS,
        embedding_dim,
        weights=[embedding_matrix],
        input_length=MAX_LENGTH,
        trainable=True,
    )(inp)
    embedded = SpatialDropout1D(0.3)(embedded)
    hidden = Bidirectional(GRU(100, return_sequences=True))(embedded)
    pooled = concatenate([
        GlobalAveragePooling1D()(hidden),
        GlobalMaxPooling1D()(hidden),
    ])
    outp = Dense(1, activation="sigmoid")(pooled)
    model = Model(inputs=inp, outputs=outp)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model
# Plot styling for the model-comparison charts below.
import seaborn as sns
from sklearn.metrics import roc_auc_score

sns.set_style("whitegrid")
sns.set_palette("pastel")
# Gather every model's saved predictions into one frame, one column per
# model, named after the file it came from.
predictions_files = os.listdir('./predictions/')

predictions_dfs = []
for filename in predictions_files:
    frame = pd.read_csv('./predictions/{0}'.format(filename))
    # BUG FIX: filename.strip('.csv') strips any of the characters
    # {., c, s, v} from BOTH ends (e.g. 'svc.csv' -> ''); drop only the
    # extension instead.
    frame.columns = [os.path.splitext(filename)[0]]
    predictions_dfs.append(frame)

predictions = pd.concat(predictions_dfs, axis=1)
# Accuracy of each model column, measured against the shared y_true column.
scores = {}
for name in tqdm_notebook(predictions.columns, leave=False):
    if name == 'y_true':
        continue
    scores[name] = accuracy_score(
        predictions['y_true'].values, predictions[name].values
    )