I’m trying to create a neural network (NN) model with TensorFlow using a few different text files. My code is as follows:
import collections
import os
import pathlib

import tensorflow as tf
import tensorflow_text as tf_text
from tensorflow.keras import layers, losses

DIRECTORY_URL = "https://raw.githubusercontent.com/ake700/Python/main/Data_Science/Text_NeuralNetwork/text_files/"
FILE_NAMES = ['AK.txt', 'AZ.txt', 'SH.txt', 'DK.txt']

# Download each file into the Keras cache directory.
for name in FILE_NAMES:
    text_dir = tf.keras.utils.get_file(name, origin=DIRECTORY_URL + name)

parent_dir = pathlib.Path(text_dir).parent
list(parent_dir.iterdir())
def labeler(example, index):
    return example, tf.cast(index, tf.int64)

labeled_data_sets = []

# Label each line with the index of the file it came from.
for i, file_name in enumerate(FILE_NAMES):
    lines_dataset = tf.data.TextLineDataset(os.path.join(parent_dir, file_name))
    labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))
    labeled_data_sets.append(labeled_dataset)
BUFFER_SIZE = 50000
BATCH_SIZE = 64
VALIDATION_SIZE = 5000

all_labeled_data = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
    all_labeled_data = all_labeled_data.concatenate(labeled_dataset)

all_labeled_data = all_labeled_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)

for text, label in all_labeled_data.take(10):
    print("Sentence: ", text.numpy())
    print("Label:", label.numpy())

for ex in all_labeled_data.take(5):
    print(ex)
Output so far:
Sentence: b'Molecules in milk absorb light to various degrees depending on the wavelength of light. Milk has a weak incident of visible light absorption mainly contributed by riboflavin in milk serum and ?carotene in milk fat . The ultraviolet absorption in milk is strongly affected by the aromatic rings of tyrosine and tryptophan and double bonds of the milk fat. In contrast strong water vapor absorption bands and other bands such as aliphatic ester bonds and CH of lipids peptide bonds of proteins OH groups of lactose and undissociated COOH groups contribute to the infrared light absorption in milk. '
Label: 2
(<tf.Tensor: shape=(), dtype=string, numpy=b'Molecules in milk absorb light to various degrees depending on the wavelength of light. Milk has a weak incident of visible light absorption mainly contributed by riboflavin in milk serum and ?carotene in milk fat . The ultraviolet absorption in milk is strongly affected by the aromatic rings of tyrosine and tryptophan and double bonds of the milk fat.'>, <tf.Tensor: shape=(), dtype=int64, numpy=2>)
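As an extra sanity check (not part of the script above, just something I can run while writing this question), this is how I can count how many lines ended up under each label, since a TextLineDataset has unknown cardinality and the only way to get a count is to iterate:

# Count how many labeled lines each source file contributed.
label_counts = collections.Counter(
    int(label.numpy()) for _, label in all_labeled_data)
print("Examples per label:", dict(label_counts))
print("Total examples:    ", sum(label_counts.values()))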
Preparing dataset for training
tokenizer = tf_text.UnicodeScriptTokenizer()

def tokenize(text, unused_label):
    lower_case = tf_text.case_fold_utf8(text)
    return tokenizer.tokenize(lower_case)

tokenized_ds = all_labeled_data.map(tokenize)

for text_batch in tokenized_ds.take(5):
    print("Tokens: ", text_batch.numpy())
# Tokens: [b'molecules' b'in' b'milk' b'absorb' b'light' b'to' b'various' b'degrees'
# b'depending' b'on' b'the' b'wavelength' b'of' b'light' b'.' b'milk'
# b'has' b'a' b'weak' b'incident' b'of' b'visible' b'light' b'absorption'
AUTOTUNE = tf.data.AUTOTUNE

def configure_dataset(dataset):
    return dataset.cache().prefetch(buffer_size=AUTOTUNE)

VOCAB_SIZE = 10000

tokenized_ds = configure_dataset(tokenized_ds)

# Build a frequency-sorted vocabulary from the tokens.
vocab_dict = collections.defaultdict(lambda: 0)
for toks in tokenized_ds.as_numpy_iterator():
    for tok in toks:
        vocab_dict[tok] += 1

vocab = sorted(vocab_dict.items(), key=lambda x: x[1], reverse=True)
vocab = [token for token, count in vocab]
vocab = vocab[:VOCAB_SIZE]
vocab_size = len(vocab)
print("Vocab size: ", vocab_size)
print("First five vocab entries:", vocab[:5])
# Vocab size: 5678
# First five vocab entries: [b'the', b'.', b'of', b'and', b'in']
keys = vocab
values = range(2, len(vocab) + 2)  # Reserve `0` for padding, `1` for OOV tokens.

init = tf.lookup.KeyValueTensorInitializer(
    keys, values, key_dtype=tf.string, value_dtype=tf.int64)
num_oov_buckets = 1
vocab_table = tf.lookup.StaticVocabularyTable(init, num_oov_buckets)

def preprocess_text(text, label):
    standardized = tf_text.case_fold_utf8(text)
    tokenized = tokenizer.tokenize(standardized)
    vectorized = vocab_table.lookup(tokenized)
    return vectorized, label

example_text, example_label = next(iter(all_labeled_data))
print("Sentence: ", example_text.numpy())
vectorized_text, example_label = preprocess_text(example_text, example_label)
print("Vectorized sentence: ", vectorized_text.numpy())
# Sentence: b'Molecules in milk absorb light to various degrees depending on the wavelength of light. Milk has a weak incident of visible light absorption mainly contributed by riboflavin in milk serum and ?carotene in milk fat ... '
#Vectorized sentence: [1330 6 9 1478 174 7 182 1127 1028 23 2 853 4 174
# 3 9 67 8 1029 817 4 1128 174 193 695 1331 19 3575
# 6 9 1660 5 287 3576 6 9 99 3 2 1479 193 6
# 9 22 818 455 19 2 2697 2698 4 2699 5 2700 5 636
# 1661 4 2 9 99 3 6 390 1209 76 3577 193 1662 5
# 61 1662 74 17 3578 3579 1661 5 3580 4 1210 3581 1661 4
# 663 3582 484 4 913 5 3583 3584 484 561 7 2 2701 174
# 193 6 9 3]
all_encoded_data = all_labeled_data.map(preprocess_text)
train_data = all_encoded_data.skip(VALIDATION_SIZE).shuffle(BUFFER_SIZE)
validation_data = all_encoded_data.take(VALIDATION_SIZE)
train_data = train_data.padded_batch(BATCH_SIZE)
validation_data = validation_data.padded_batch(BATCH_SIZE)
sample_text, sample_labels = next(iter(validation_data))
print("Text batch shape: ", sample_text.shape)
print("Label batch shape: ", sample_labels.shape)
print("First text example: ", sample_text[0])
print("First label example: ", sample_labels[0])
# Text batch shape: (64, 213)
# Label batch shape: (64,)
# First text example: tf.Tensor( [array] , shape=(213,), dtype=int64)
# First label example: tf.Tensor(2, shape=(), dtype=int64)
vocab_size += 2  # Account for the reserved padding and OOV tokens.
train_data = configure_dataset(train_data)
validation_data = configure_dataset(validation_data)

def create_model(vocab_size, num_labels):
    model = tf.keras.Sequential([
        layers.Embedding(vocab_size, 64, mask_zero=True),
        layers.Conv1D(64, 5, padding="valid", activation="relu", strides=2),
        layers.GlobalMaxPooling1D(),
        layers.Dense(num_labels)
    ])
    return model

model = create_model(vocab_size=vocab_size, num_labels=3)
model.compile(
    optimizer='adam',
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
    run_eagerly=True)

history = model.fit(train_data, validation_data=validation_data, epochs=3)
When I try to fit the compiled model, I get this error:
ValueError: Unexpected result of `train_function` (Empty logs). Please use `Model.compile(..., run_eagerly=True)`, or `tf.config.run_functions_eagerly(True)` for more information of where went wrong, or file a issue/bug to `tf.keras`.
According to a previous post, it seems this can be fixed by using an encoded array? I’m not quite sure what that means or how it applies to my dataset/model. I also tried adding run_eagerly=True to my model.compile call to see more details, but it did not seem to do anything.
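For reference, here is a rough sketch of the extra debugging I have tried or plan to try. The global tf.config.run_functions_eagerly(True) flag comes straight from the error message’s own suggestion; the batch counts are my own addition:

# Run everything eagerly, as the error message suggests, so any failure
# points at a concrete Python line rather than a traced train_function.
tf.config.run_functions_eagerly(True)

# Check how many batches each split actually yields. If train_data produces
# zero batches (for example, if skip(VALIDATION_SIZE) skipped more lines than
# the files contain), model.fit would have nothing to log.
print("Training batches:  ", sum(1 for _ in train_data))
print("Validation batches:", sum(1 for _ in validation_data))

history = model.fit(train_data, validation_data=validation_data, epochs=3)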