@markdaoust I am trying the transformers tokenizer but I'm getting a new error. The full code is below:
import tensorflow_datasets as tfds
import numpy as np
import tensorflow as tf
import logging
import tensorflow_text as text
from transformers import BertTokenizer
config = tfds.translate.wmt.WmtConfig(
    description="WMT 2019 translation task dataset.",
    version="0.0.3",
    language_pair=("zh", "en"),
    subsets={
        tfds.Split.TRAIN: ["newscommentary_v13"],
        tfds.Split.VALIDATION: ["newsdev2017"],
    }
)
builder = tfds.builder("wmt_translate", config=config)
print(builder.info.splits)
builder.download_and_prepare()
datasets = builder.as_dataset(as_supervised=True)
print('datasets is {}'.format(datasets))
train_examples = datasets["train"]
val_examples = datasets["validation"]
train_examples = train_examples.take(128)
# 1. Get train, validation and test text data
for zh_examples, en_examples in train_examples.batch(3).take(1):
    for zh in zh_examples.numpy():
        print(zh.decode('utf-8'))
    print()
    for en in en_examples.numpy():
        print(en.decode('utf-8'))
print('Start building tokenizer ...')
tokenizer_en = BertTokenizer.from_pretrained("bert-base-cased")
tokenizer_zh = BertTokenizer.from_pretrained("bert-base-chinese")
print('End building tokenizer ...')
# Wrap tokenize_pairs so it can run inside the tf.data pipeline;
# tf.numpy_function calls it with numpy arrays and expects the declared dtypes.
def py_wrap_tokenize_pairs(zh, en):
    return tf.numpy_function(tokenize_pairs, [zh, en], [tf.int64, tf.int64])
def tokenize_pairs(zh, en):
    zh = tokenizer_zh.tokenize(zh)
    zh = zh.to_tensor()
    en = tokenizer_en.tokenize(en)
    en = en.to_tensor()
    return zh, en
# 4. Make batches
BUFFER_SIZE = 20000
BATCH_SIZE = 64
def make_batches(ds):
    return (
        ds
        .cache()
        .shuffle(BUFFER_SIZE)
        .batch(BATCH_SIZE)
        .map(py_wrap_tokenize_pairs, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        .prefetch(tf.data.experimental.AUTOTUNE))
train_batches = make_batches(train_examples)
val_batches = make_batches(val_examples)
# Error occurs in this function
for (batch, (inp, tar)) in enumerate(train_batches):
    print(batch, inp, tar)
And the error is:
File "/Users/cong/nlp/study/transformer/data_zh.py", line 31, in tokenize_pairs
  zh = tokenizer_zh.tokenize(zh)
File "/Users/cong/.venv/tf2/lib/python3.8/site-packages/transformers/tokenization_utils.py", line 362, in tokenize
  tokenized_text = split_on_tokens(no_split_token, text)
File "/Users/cong/.venv/tf2/lib/python3.8/site-packages/transformers/tokenization_utils.py", line 336, in split_on_tokens
  if not text.strip():
AttributeError: 'numpy.ndarray' object has no attribute 'strip'
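For what it's worth, I can reproduce the same AttributeError outside of tf.data: tf.numpy_function hands the wrapped function numpy arrays of byte strings, while the transformers tokenizer expects a plain Python str. A minimal sketch (the sample sentences are made up):

    import numpy as np
    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

    # This mimics what tf.numpy_function passes in: an ndarray of bytes,
    # not a Python str, so the tokenizer's internal text.strip() call fails.
    batch = np.array([b"hello world", b"another sentence"])
    tokenizer.tokenize(batch)  # AttributeError: 'numpy.ndarray' object has no attribute 'strip'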
It looks like the problem is still in the tokenize_pairs function. I first load the pretrained English and Chinese tokenizers, define the py_wrap_tokenize_pairs wrapper, modify the tokenize_pairs function, and make the batches. I think I have some misunderstanding of your comments above.
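In case it helps to pin down where I went wrong: as far as I can tell, transformers' BertTokenizer.tokenize works on Python strings and returns a list of token strings, so it also has no .to_tensor(); that method comes from the RaggedTensor returned by tensorflow_text's BertTokenizer. Below is a minimal sketch of the workaround I am experimenting with, assuming a transformers version where the tokenizer can be called directly with padding=True (the decoding and .astype steps are my own additions):

    def tokenize_pairs(zh, en):
        # tf.numpy_function passes numpy arrays of bytes, so decode each
        # element to a Python str before handing it to the tokenizer.
        zh = [s.decode('utf-8') for s in zh]
        en = [s.decode('utf-8') for s in en]
        # Calling the tokenizer on the whole batch pads it and returns
        # numpy int arrays; cast to int64 to match the dtypes declared
        # in tf.numpy_function.
        zh = tokenizer_zh(zh, padding=True, return_tensors='np')['input_ids']
        en = tokenizer_en(en, padding=True, return_tensors='np')['input_ids']
        return zh.astype(np.int64), en.astype(np.int64)

Is that decode step what your earlier comments were pointing at, or should I switch to tensorflow_text's BertTokenizer so the tokenization runs on tensors directly?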