I'm trying to fine-tune a pre-trained DistilBERT model from Hugging Face using TensorFlow. Everything runs smoothly: the model builds and trains without error. But when I try to save the model, it stops with the error "IndexError: list index out of range". I'm using PyCharm with a TPU.
Any help would be much appreciated!
import h5py
import numpy as np
import pandas as pd
import pydot
import simplejson as simplejson
import tensorflow as tf
import os
from transformers import pipeline
from tensorflow import keras
# Load the training data from the working directory. Later code reads its
# `premise`, `hypothesis` and `label` columns.
train = pd.read_csv("train.csv")
print("Training_dataset_shape:", train.shape)
from transformers import BertTokenizer, TFBertModel, TFAutoModel,AutoTokenizer
# Earlier alternative, kept for reference: multilingual BERT with its own tokenizer.
#model_name = "bert-base-multilingual-cased"
#tokenizer = BertTokenizer.from_pretrained(model_name) # FC: this is the tokenizer we will use on our text data to tokenize it
# Checkpoint identifier; build_model() loads the encoder from this same name.
model_name = "huggingface/distilbert-base-uncased-finetuned-mnli"
# Tokenizer loaded for the same checkpoint, so its ids match the encoder's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(model_name)
def encode_sentence(s):
    """Convert one sentence into a list of token ids ending with ``[SEP]``.

    Uses the module-level ``tokenizer``: the sentence is split into word or
    sub-word tokens, the ``[SEP]`` separator token is appended to mark the end
    of the sentence, and the tokens are then mapped to their ids.
    """
    pieces = [*tokenizer.tokenize(s), '[SEP]']
    return tokenizer.convert_tokens_to_ids(pieces)
def bert_encode(hypotheses, premises, tokenizer):
    """Encode sentence pairs as padded token-id and mask tensors.

    Every pair is laid out as ``[CLS] first [SEP] second [SEP]``.

    Args:
        hypotheses: Sequence of strings used as the FIRST segment of each
            pair. NOTE(review): the visible call site passes the premises
            here, so the parameter names are swapped relative to that
            caller; the positional order is what determines the layout.
        premises: Sequence of strings used as the SECOND segment.
        tokenizer: Tokenizer used to look up the ``[CLS]`` id. The sentences
            themselves are tokenized by ``encode_sentence``, which uses the
            module-level ``tokenizer``, so both should be the same object.

    Returns:
        dict with two dense tensors of identical shape:
        ``'input_word_ids'`` (token ids, rows padded at the end) and
        ``'input_mask'`` (1 over real tokens, padding elsewhere).
    """
    # One ragged row of token ids per sentence; rows differ in length.
    sentence1 = tf.ragged.constant(
        [encode_sentence(s) for s in np.array(hypotheses)])
    sentence2 = tf.ragged.constant(
        [encode_sentence(s) for s in np.array(premises)])

    # One [CLS] id per example, marking the start of every pair.
    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])] * sentence1.shape[0]

    # Still ragged after concatenation: every row keeps its own length.
    input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)

    # Build the mask from the ragged ids BEFORE padding, so that to_tensor()
    # fills only the padded tail of each row.
    input_mask = tf.ones_like(input_word_ids).to_tensor()

    # Segment (token-type) ids are deliberately not produced: they were
    # previously computed and then discarded, because the returned inputs
    # contain only the ids and the mask.
    return {
        'input_word_ids': input_word_ids.to_tensor(),
        'input_mask': input_mask,
    }
# Encode the training pairs. The premises are passed first, so they become the
# first segment of each encoded pair and the hypotheses the second.
train_input = bert_encode(train.premise.values, train.hypothesis.values, tokenizer)
# total_train_input = bert_encode(total_train.premise.values, total_train.hypothesis.values, tokenizer)
# Sequence length used for the model's input layers and for truncating the encoded inputs.
max_len = 136 # FC: the initial tutorial used 50
def build_model():
    """Build and compile a 3-class classifier on top of the pretrained encoder.

    Reads the module-level ``model_name`` (checkpoint to load) and ``max_len``
    (fixed sequence length of both inputs).

    Returns:
        A compiled ``tf.keras.Model`` taking ``input_word_ids`` and
        ``input_mask`` (both int32, shape ``(max_len,)`` per example) and
        producing softmax probabilities over 3 classes.
    """
    encoder = TFAutoModel.from_pretrained(model_name)

    # The layer names match the keys of the dict returned by bert_encode(),
    # so that dict can be fed to the model directly.
    # `shape` must be a keyword argument; `shape(max_len,)` was a call to an
    # undefined name.
    input_word_ids = tf.keras.Input(
        shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(
        shape=(max_len,), dtype=tf.int32, name="input_mask")

    # Take the encoder's first output; it is indexed below as
    # (batch, position, features).
    embedding = encoder([input_word_ids, input_mask])[0]

    # Classify from position 0 of every sequence (the [CLS] slot).
    output = tf.keras.layers.Dense(3, activation='softmax')(embedding[:, 0, :])

    model = tf.keras.Model(inputs=[input_word_ids, input_mask], outputs=output)
    model.compile(
        # 1e-5, not 1e5: a learning rate of 100000 makes training diverge.
        tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model
# NOTE(review): `strategy` is not defined in the visible code; presumably a
# tf.distribute strategy (e.g. for the TPU) created elsewhere - confirm.
with strategy.scope():
    model = build_model()

# Truncate every encoded input to the sequence length the model was built for.
for key in train_input:
    train_input[key] = train_input[key][:, :max_len]

print("train the model now")
early_stop = tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)
# NOTE(review): the fit call was garbled in the original paste (only its
# argument tail survived). It is reconstructed here; passing `early_stop` via
# `callbacks` is an assumption - confirm against the real script.
model.fit(
    train_input,
    train.label.values,
    epochs=3,
    verbose=1,
    validation_split=0.01,
    callbacks=[early_stop],
)
print("Training is completeted")
# NOTE(review): the save call was also garbled (only the path survived);
# `model.save` is assumed - confirm.
model.save("saved_model/trackers/1")