Autoencoders

In this part of the tutorial, we learn how to autoencode images from the MNIST dataset. We then show how we can use autoencoded representations of these images as features for classification with logistic regression (as opposed to using raw pixels as features).

Steps:

1a. Load the MNIST dataset (train and test); we will not use the labels in this part of the tutorial
1b. Create a noisy version of the dataset by injecting noise into the MNIST images

2a. Train (using Keras) a simple autoencoder with clean input
2b. Train (using Keras) a denoising autoencoder with noisy input

3a. Train and evaluate a logistic regression classifier using raw input features (pixels)
3b. Train and evaluate a logistic regression classifier using latent representations of images obtained with the clean autoencoder
3c. Train and evaluate a logistic regression classifier using latent representations of images obtained with the denoising autoencoder

In [ ]:
from keras.datasets import mnist
import numpy as np
import matplotlib.pyplot as plt

# Loading the MNIST data
(X_autoenc, _), (X_classification, y_classification) = mnist.load_data() 

X_autoenc = X_autoenc.astype('float32') / 255.
X_classification = X_classification.astype('float32') / 255.

ind = 12
plt.imshow(X_autoenc[ind])
print(X_autoenc.shape)
print(X_classification.shape)
In [ ]:
# Creating the noisy version of the MNIST data
noise_factor = 1.0
noise = noise_factor * np.random.random(X_autoenc.shape)
X_autoenc_noisy = X_autoenc + noise  # note: noisy pixel values can exceed 1.0 (they could also be clipped back to [0, 1])

plt.imshow(X_autoenc_noisy[ind])

Keras (https://keras.io/) is a high-level deep learning framework for Python. It abstracts away the implementation details of DL models by providing readily available components and architectures.

There are two main ways of defining a model in Keras:

(1) by chaining individual layers -- the output of one layer is the input to the next

(2) by defining the whole model with all of its layers at once (from keras.models import Sequential)

We first use (1); a tiny illustration of both styles on a toy model is sketched below.
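
As a minimal illustration (a sketch on a tiny toy model, not the autoencoder we build next), the same one-layer network can be written in both styles:

In [ ]:
# toy model written in both styles (illustration only)
from keras.layers import Input, Dense
from keras.models import Model, Sequential

# (1) chaining individual layers (the functional API)
toy_input = Input(shape=(4,))
toy_output = Dense(2, activation='softmax')(toy_input)
toy_model_functional = Model(toy_input, toy_output)

# (2) declaring the whole stack of layers at once
toy_model_sequential = Sequential()
toy_model_sequential.add(Dense(2, activation='softmax', input_shape=(4,)))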

In [ ]:
# defining an autoencoder

from keras.layers import Input, Dense
from keras.models import Model

encoding_dim = 64 # original dim is 784 (28*28) 
input_img = Input(shape=(784, ))

# defining the encoder network (just a single 784x64 feed-forward layer)
encoded = Dense(encoding_dim, activation='relu')(input_img)

# "decoded" is the lossy reconstruction of the input
decoded = Dense(784, activation='sigmoid')(encoded)

# ============================================================

# encoder, for obtaining encoded representations after training
encoder = Model(input_img, encoded)

# full autoencoder for training
autoencoder = Model(input_img, decoded)

Now that the model is defined, we can train it on some images (data points)

In [ ]:
# training the autoencoder
autoencoder.compile(optimizer="adam", loss = "mean_squared_error")

# just converting 28x28 image matrices into 784 dimensional vectors
X_autoenc_input = X_autoenc.reshape((len(X_autoenc), np.prod(X_autoenc.shape[1:]))) 
X_autoenc_noisy_input = X_autoenc_noisy.reshape((len(X_autoenc_noisy), np.prod(X_autoenc_noisy.shape[1:])))

print(X_autoenc_input.shape)
print(X_autoenc_noisy_input.shape)
In [ ]:
# creating a dev (validation) set for monitoring training (e.g., for early stopping)
# the first 5000 images will be used as the validation set, the rest as the training set
X_dev_input = X_autoenc_input[:5000]
X_train_input = X_autoenc_input[5000:]

X_dev_noisy_input = X_autoenc_noisy_input[:5000]
X_train_noisy_input = X_autoenc_noisy_input[5000:]


print(X_dev_input.shape)
print(X_train_input.shape)
In [ ]:
#training the regular autoencoder
autoencoder.fit(X_train_input, X_train_input,
                epochs=15,
                batch_size=64,
                shuffle=True,
                validation_data=(X_dev_input, X_dev_input))

# training the denoising autoencoder
#autoencoder.fit(X_train_noisy_input, X_train_input,
#                epochs=15,
#                batch_size=64,
#                shuffle=True,
#                validation_data=(X_dev_noisy_input, X_dev_input))
In [ ]:
# preprocessing the data for classification
# just converting 28x28 image matrices into 784 dimensional vectors
X_classification_input = X_classification.reshape((len(X_classification), np.prod(X_classification.shape[1:]))) 
print(X_classification_input.shape)

X_classification_reconstr = autoencoder.predict(X_classification_input)
print(X_classification_reconstr.shape)

X_classification_encoded = encoder.predict(X_classification_input)
print(X_classification_encoded.shape)
In [ ]:
ind = 17
img_orig = X_classification[ind].reshape(28, 28)
img_rec = X_classification_reconstr[ind].reshape(28, 28)
In [ ]:
plt.imshow(img_orig)
In [ ]:
plt.imshow(img_rec)
In [ ]:
# let's train logistic regression to classify images
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# let's split the images for classification into two sets: training and testing
X_class_train = X_classification_input[:2000]
y_class_train = y_classification[:2000]
X_class_test = X_classification_input[9000:]
y_class_test = y_classification[9000:]

# ==========================================================================

# 1. using full original features (784 pixel values)
print(X_class_train.shape, X_class_test.shape)
print(y_class_train.shape, y_class_test.shape)

class_model = LogisticRegression(C = 1)
class_model.fit(X_class_train, y_class_train)
preds = class_model.predict(X_class_test)
print(preds.shape)

acc = accuracy_score(y_class_test, preds)
print("Classification accuracy, raw pixel features: " + str(acc))
In [ ]:
# and 2. using the encoded features, obtained from each image from the autoencoder's encoder (64 values)
X_class_train = X_classification_encoded[:2000]
X_class_test = X_classification_encoded[9000:]
y_class_train = y_classification[:2000]
y_class_test = y_classification[9000:]

print(X_class_train.shape, X_class_test.shape)

class_model = LogisticRegression(C = 1)
class_model.fit(X_class_train, y_class_train)
preds = class_model.predict(X_class_test)
print(preds.shape)

acc = accuracy_score(y_class_test, preds)
print("Classification accuracy, from autoencoded representations: " + str(acc))

Convolutional Neural Networks

In this part of the tutorial we'll classify the MNIST images using a classification model with a convolutional neural network (CNN) as the encoder.

In [ ]:
from keras.utils import to_categorical

# let's just load MNIST from scratch

(X_train, y_train), (X_test, y_test) = mnist.load_data()

# reshape the data so it fits the CNN model of Keras
# first dim: number of instances; second and third dims: size of the image matrix;
# last dim: number of channels (there can be several image channels, e.g., one per color for RGB);
# here we use only one channel (our images are greyscale)
X_train = X_train.reshape(60000,28,28,1)
X_test = X_test.reshape(10000,28,28,1)

# normalizing the pixel values to [0.0, 1.0] interval
X_train = X_train.astype('float32') / 255.
X_test = X_test.astype('float32') / 255.

# also, we need to one-hot-encode the label for each image because we will use the categorical cross-entropy loss
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

print(X_train[0])
print(y_train[0])
plt.imshow(X_train[0].reshape(28, 28))  # reshape back to 28x28 so matplotlib can display it

Let's now define our CNN model in Keras. We do it the same way we did for the autoencoder, by explicitly chaining each "layer" of the model. We define a model with 2 convolutional (and 2 pooling) layers.

  1. The image is the input to a 2-D convolutional layer (Conv2D)
  2. the output of the first Conv2D layer is the input to the max pooling layer (MaxPooling2D)
  3. the output of the first pooling layer is the input to the second convolutional layer
  4. the output of the second conv. layer is the input to the second max pooling layer
  5. finally, we flatten (Flatten) the "reduced" image matrix, the output of the second pooling layer -- these are the final features that go into the score prediction for each class
  6. a final Dense layer projects the final features (output of Flatten) into 10 scores, one for each class (activation="softmax" converts these scores into a probability distribution); a quick sanity check of the intermediate feature-map sizes follows below
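
Assuming the Keras defaults for Conv2D (padding='valid', stride 1) and non-overlapping 2x2 pooling windows, we can verify by hand how the feature maps shrink through the network (a quick sanity check, not part of the model itself):

In [ ]:
# feature-map sizes through the CNN, assuming Keras defaults (padding='valid', stride 1)
size = 28
size = size - 5 + 1      # after Conv2D(64, kernel_size=5): (24, 24, 64)
size = size // 2         # after MaxPooling2D((2, 2)):      (12, 12, 64)
size = size - 3 + 1      # after Conv2D(32, kernel_size=3): (10, 10, 32)
size = size // 2         # after MaxPooling2D((2, 2)):      (5, 5, 32)
print(size * size * 32)  # Flatten() yields 5 * 5 * 32 = 800 features per image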
In [ ]:
from keras.layers import Input, Dense, Conv2D, Flatten
from keras.layers import MaxPooling2D
from keras.models import Model

input_img = Input(shape=(28, 28, 1, ))

# first convolution layer
first_conv = Conv2D(64, kernel_size=5, activation='relu', input_shape=(28,28,1))(input_img)

# first pooling layer
first_pool = MaxPooling2D(pool_size=(2, 2))(first_conv)

# second convolution layer
second_conv = Conv2D(32, kernel_size=3, activation='relu')(first_pool)

# second pooling layer
second_pool = MaxPooling2D(pool_size=(2, 2))(second_conv)

# flattening the remaining 2D matrix into a vector
representation = Flatten()(second_pool)

cl_layer = Dense(10, activation='softmax')(representation)

# full classifier for training
classifier = Model(input_img, cl_layer)

# OPTIONAL
# intermediate models, for obtaining intermediate representations
conv2d_1 = Model(input_img, first_conv)
pool_1 = Model(input_img, first_pool)
conv2d_2 = Model(input_img, second_conv)
pool_2 = Model(input_img, second_pool)
reps = Model(input_img, representation)

Now that we have defined the model, we can actually train it (i.e., optimize the model's parameters) on our training data. We will use "Adam" (https://machinelearningmastery.com/adam-optimization-algorithm-for-deep-learning/) as the optimization algorithm (you can think of it as a more elaborate version of gradient descent). We will minimize the cross-entropy loss (the standard loss function for multi-class classification) and report the performance in terms of accuracy as the evaluation metric. We will train for 3 epochs (the number of times we let the model "see" all training instances) in batches of 32 examples (one update of the model's parameters is based on the sum of prediction losses over these 32 examples).

In [ ]:
classifier.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
classifier.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=32)

We can also check what the "output" of each layer looks like (we make predictions for a single test image, X_test[i])

In [ ]:
i = 352

preds = conv2d_1.predict(np.array([X_test[i]]))
print("After first convolution: " + str(preds[0].shape))

preds = pool_1.predict(np.array([X_test[i]]))
print("After first pooling: " + str(preds[0].shape))

preds = conv2d_2.predict(np.array([X_test[i]]))
print("After second convolution: " + str(preds[0].shape))

preds = pool_2.predict(np.array([X_test[i]]))
print("After second pooling: " + str(preds[0].shape))

preds = reps.predict(np.array([X_test[i]]))
print("After flattening: " + str(preds[0].shape))

preds = classifier.predict(np.array([X_test[i]]))
print("Classification result: " + str(preds[0]))

print("Real class:")
print(y_test[i])

plt.imshow(X_test[i].reshape(28, 28))  # reshape back to 28x28 for display

As mentioned in the beginning, we can also define Keras models more elegantly (so we don't have to explicitly chain the layers ourselves) using the class "Sequential" -- it defines which layers are stacked and in which order.

In [ ]:
from keras.models import Sequential

#create model: Sequential model is just a linear stack of layers: 
#added layers will be executed in sequence, one after the other
model = Sequential()

#add layers to the model
model.add(Conv2D(64, kernel_size=5, activation='relu', input_shape=(28,28,1))) 
model.add(MaxPooling2D(pool_size=(2, 2)))  # same pool size as in the functional-API model above, so the architectures match
model.add(Conv2D(32, kernel_size=3, activation='relu')) 
model.add(MaxPooling2D(pool_size=(2, 2)))           
model.add(Flatten())
model.add(Dense(10, activation='softmax'))
In [ ]:
# with Sequential, we have defined the very same CNN model architecture as before, only in a more elegant and concise way
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3)

Recurrent neural networks

RNNs are designed for modelling sequences. In this part of the tutorial we will see how to use RNNs for part-of-speech (POS) tagging. Let's first load some POS data for training and evaluating the model.

In [1]:
#!pip install nltk
import numpy as np
import nltk
#nltk.download('treebank')
In [2]:
tagged_sents = nltk.corpus.treebank.tagged_sents()
print(tagged_sents[0])
print(len(tagged_sents))
[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
3914
In [3]:
# preprocessing, splitting tokens from tags
sentences, tag_seqs = [], []
for tag_sent in tagged_sents:
  sent, tags = zip(*tag_sent)
  sentences.append([t.lower() for t in sent])
  tag_seqs.append(list(tags))

ind = 31
print(sentences[ind])
print(tag_seqs[ind])
['``', 'but', 'you', 'have', '*-1', 'to', 'recognize', 'that', 'these', 'events', 'took', 'place', '35', 'years', 'ago', '.']
['``', 'CC', 'PRP', 'VBP', '-NONE-', 'TO', 'VB', 'IN', 'DT', 'NNS', 'VBD', 'NN', 'CD', 'NNS', 'IN', '.']

Our inputs are sequences of words and our labels are sequences of POS tags. Since NNs take numeric input, we first need to convert both words and tags to IDs (i.e., build the vocabularies).

In [4]:
def build_vocabulary(sequences):
  vocab = {}
  for seq in sequences:
    for item in seq:
      if item not in vocab:
        vocab[item] = len(vocab)
  return vocab

vocab_tokens = build_vocabulary(sentences)
vocab_tags = build_vocabulary(tag_seqs)

# adding a special padding token
vocab_tokens["<PAD>"] = len(vocab_tokens)
vocab_tags["<PAD>"] = len(vocab_tags)

print(len(vocab_tokens))
print(len(vocab_tags))
print(vocab_tags)
11388
47
{'NNP': 0, ',': 1, 'CD': 2, 'NNS': 3, 'JJ': 4, 'MD': 5, 'VB': 6, 'DT': 7, 'NN': 8, 'IN': 9, '.': 10, 'VBZ': 11, 'VBG': 12, 'CC': 13, 'VBD': 14, 'VBN': 15, '-NONE-': 16, 'RB': 17, 'TO': 18, 'PRP': 19, 'RBR': 20, 'WDT': 21, 'VBP': 22, 'RP': 23, 'PRP$': 24, 'JJS': 25, 'POS': 26, '``': 27, 'EX': 28, "''": 29, 'WP': 30, ':': 31, 'JJR': 32, 'WRB': 33, '$': 34, 'NNPS': 35, 'WP$': 36, '-LRB-': 37, '-RRB-': 38, 'PDT': 39, 'RBS': 40, 'FW': 41, 'UH': 42, 'SYM': 43, 'LS': 44, '#': 45, '<PAD>': 46}
In [5]:
# padding all sentences to the length of the longest 
max_len = max([len(x) for x in sentences])
print(max_len)

padded_sents = [sent + ["<PAD>"] * (max_len - len(sent)) for sent in sentences]
padded_tagseqs = [ts + ["<PAD>"] * (max_len - len(ts)) for ts in tag_seqs]

print(padded_sents[0])
print(padded_tagseqs[0])

# HINT: you can also use Keras to pad sequences
# See: from keras.preprocessing.sequence import pad_sequences
271
['pierre', 'vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', '29', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', ',', 'MD', 'VB', 'DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NNP', 'CD', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
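
As hinted in the cell above, Keras's pad_sequences utility can also do the padding for us; since it pads with a numeric value, it is typically applied after the tokens have been mapped to IDs. A minimal sketch, assuming the vocabularies built above:

In [ ]:
from keras.preprocessing.sequence import pad_sequences

# map tokens to IDs first, then post-pad each sequence with the ID of <PAD>
unpadded_ids = [[vocab_tokens[t] for t in s] for s in sentences]
sents_ids_alt = pad_sequences(unpadded_ids, maxlen=max_len,
                              padding='post', value=vocab_tokens["<PAD>"])
print(sents_ids_alt.shape)  # (3914, 271)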
In [6]:
# replacing the strings (words and tags) with IDs from the vocabulary

sents_ids = [[vocab_tokens[t] for t in s] for s in padded_sents]
print(sents_ids[0])

tags_ids = [[vocab_tags[t] for t in ts] for ts in padded_tagseqs]
print(tags_ids[0])

# we additionally need to convert the labels into one-hot-encoding vectors (to be able to use categorical_crossentropy loss in the model)
tags_labels = []
for t_seq in tags_ids:
  cat_labs = []
  for tag in t_seq:
    lab_vec = np.zeros(len(vocab_tags))
    lab_vec[tag] = 1.0
    cat_labs.append(lab_vec)
  tags_labels.append(cat_labs)

tags_labels = np.array(tags_labels)
#print(tags_labels[0])
    
[0, 1, 2, 3, 4, 5, 2, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387, 11387]
[0, 0, 1, 2, 3, 4, 1, 5, 6, 7, 8, 9, 7, 4, 8, 0, 2, 10, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46]
In [7]:
# Finally, let's split our data into train and test sets
ind_train = int(0.8 * len(sents_ids))
x_train = np.array(sents_ids[:ind_train])
y_train = np.array(tags_labels[:ind_train])
x_test = np.array(sents_ids[ind_train:])
y_test = np.array(tags_labels[ind_train:])

print(len(x_train), len(y_train))
print(len(x_test), len(y_test))
3131 3131
783 783

Now that we have loaded and prepared the data, it's time to build our model, which consists of:

  1. Embedding layer: fetches a word embedding for each of our words. It is possible to plug in pre-trained word embeddings; in our case, we simply initialize the numeric vectors for our words randomly and learn them as parameters of the model.

  2. Recurrent layer: we will use a bidirectional RNN (Bidirectional modifier) with an LSTM cell in each of the two unidirectional nets (LSTM modifier). Setting the return_sequences parameter to "True" makes sure the LSTM outputs a sequence of representations (one for each word), not only one final state for the whole sequence.

  3. Dense layer (or fully-connected layer) is a classifier that transforms each representation and predicts the POS tag (i.e., the label). Since we need to apply this classifier to every word, that is, to every vector of the sequence output by the LSTM, we wrap it in the TimeDistributed modifier.

In [8]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam

model = Sequential()
model.add(InputLayer(input_shape=(max_len, )))
model.add(Embedding(len(vocab_tokens), 32)) # the number specifies the dimension for word embedding vectors
model.add(Bidirectional(LSTM(64, return_sequences=True))) # the number specifies the dimension/size of the LSTM state vector
model.add(TimeDistributed(Dense(len(vocab_tags))))
model.add(Activation('softmax'))
 
model.summary()
 
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding (Embedding)       (None, 271, 32)           364416    
                                                                 
 bidirectional (Bidirectiona  (None, 271, 128)         49664     
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 271, 47)          6063      
 ibuted)                                                         
                                                                 
 activation (Activation)     (None, 271, 47)           0         
                                                                 
=================================================================
Total params: 420,143
Trainable params: 420,143
Non-trainable params: 0
_________________________________________________________________
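
The parameter counts in the summary can be verified by hand (a quick sanity check; the LSTM figure uses the standard 4-gate LSTM parameter formula, counted once per direction):

In [ ]:
# where the parameter counts in the summary above come from
emb_params   = 11388 * 32                       # vocabulary size x embedding dim = 364,416
lstm_params  = 2 * 4 * ((32 + 64) * 64 + 64)    # 2 directions x 4 gates x ((input + state) x state + biases) = 49,664
dense_params = (2 * 64) * 47 + 47               # BiLSTM output dim (128) x number of tags + biases = 6,063
print(emb_params + lstm_params + dense_params)  # 420,143 in total, matching the summary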
In [9]:
model.compile(loss='categorical_crossentropy', optimizer=Adam(0.001), metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=256, epochs=3, validation_split=0.2)
Epoch 1/3
10/10 [==============================] - 83s 8s/step - loss: 3.7077 - accuracy: 0.7194 - val_loss: 3.3914 - val_accuracy: 0.9092
Epoch 2/3
10/10 [==============================] - 79s 8s/step - loss: 2.1717 - accuracy: 0.9039 - val_loss: 0.5742 - val_accuracy: 0.9092
Epoch 3/3
10/10 [==============================] - 79s 8s/step - loss: 0.5788 - accuracy: 0.9039 - val_loss: 0.5455 - val_accuracy: 0.9092
Out[9]:
<keras.callbacks.History at 0x2e4c1eb5850>

Now that we have trained the model, let's test its performance on the test portion of our sentences (not used in training).

In [10]:
# let's predict and evaluate on the test set
scores = model.evaluate(x_test, y_test)
print("Avg test loss: " + str(scores[0]))
print("Accuracy: " + str(scores[1] * 100))
25/25 [==============================] - 3s 129ms/step - loss: 0.5683 - accuracy: 0.9056
Avg test loss: 0.5682629346847534
Accuracy: 90.55624008178711