I am trying to get into Multi-class text classification I have the following code:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from tqdm import tqdm
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('german'))
vocab_size = 2000
embedding_dim = 64
max_length = 100
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
tickets = []
labels = []
df = pd.read_csv('tensorflow/Incidents_20210728.csv', sep=';')
df['Symptom'] = df['Symptom'].str.lower()
def getLemmText(text):
tokens = word_tokenize(text)
lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(word) for word in tokens]
return ' '.join(tokens)
def getStemmText(text):
tokens = word_tokenize(text)
ps = PorterStemmer()
tokens = [ps.stem(word) for word in tokens]
return ' '.join(tokens)
df['Symptom'] = list(map(getLemmText, df['Symptom']))
df['Symptom'] = list(map(getStemmText, df['Symptom']))
xtrain, xtest, ytrain, ytest = train_test_split(
df['Symptom'], df['_qResolvedByGroup'],
test_size=0.33,
random_state=53)
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(list(xtrain) + list(xtest))
xtrain_seq = tokenizer.texts_to_sequences(xtrain)
xtest_seq = tokenizer.texts_to_sequences(xtest)
word_index = tokenizer.word_index
xtrain_pad = sequence.pad_sequences(
xtrain_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)
xtest_pad = sequence.pad_sequences(
xtest_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(list(ytrain))
train_label_seq = np.array(
label_tokenizer.texts_to_sequences(ytrain), dtype=object)
test_label_seq = np.array(
label_tokenizer.texts_to_sequences(ytest), dtype=object)
model = Sequential()
model.add(Embedding(len(word_index) + 1,
embedding_dim))
model.add(SpatialDropout1D(0.3))
model.add(Bidirectional(LSTM(embedding_dim,
dropout=0.3, recurrent_dropout=0.3)))
model.add(Dense(embedding_dim, activation='relu'))
model.add(Dropout(0.8))
model.add(Dense(embedding_dim, activation='relu'))
model.add(Dropout(0.8))
model.add(Dense(68))
model.add(Activation('softmax'))
model.compile(loss='sparse_categorical_crossentropy',
optimizer='adam', metrics=['accuracy'])
num_epochs = 10
history = model.fit(xtrain_pad, train_label_seq, epochs=num_epochs,
validation_data=(xtest_pad, test_label_seq), verbose=2)
But it throw the following error:
ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).
I tried to change np.array
to np.hstack
and searched a bit but couldn’t resolve the error.
I hope someone can help me here.