Tensorflow dataset has () shape

Liam_Adams · May 12, 2022, 3:34am

I’m trying to follow the “Classify text with BERT | Text | TensorFlow” tutorial to use BERT, but with a different dataset, for multilabel text classification.

This is how the model is created from the tutorial

import tensorflow_hub as hub
import tensorflow_text

#bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8'
# TF Hub handle of the BERT encoder loaded by build_classifier_model.
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'
# TF Hub handle of the preprocessing model that turns the raw text input into
# the encoder's inputs.
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

def build_classifier_model(input_shape, output_dim):
  """Build a BERT-based multi-label text classifier.

  Raw strings go through the TF Hub preprocessing model, are encoded by the
  trainable BERT encoder, and the encoder's pooled output is mapped to
  ``output_dim`` independent per-label probabilities.

  Args:
    input_shape: Unused; kept only so existing callers keep working. The
      text input is always declared as scalar strings (``shape=()`` per
      example, i.e. a 1-D batch of strings).
    output_dim: Number of output units, one per label.

  Returns:
    A ``tf.keras.Model`` mapping the string input named ``'text'`` to one
    sigmoid probability per label.
  """
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)
  net = outputs['pooled_output']
  net = tf.keras.layers.Dropout(0.1)(net)
  # Emit probabilities rather than raw logits: the model is compiled with the
  # string loss "binary_crossentropy" (from_logits=False by default) and a
  # Recall metric, both of which expect predictions in [0, 1].
  net = tf.keras.layers.Dense(output_dim, activation='sigmoid', name='classifier')(net)
  return tf.keras.Model(text_input, net)

# input_shape is not used by build_classifier_model, so None is passed; the
# output layer gets one unit per entry in the label lookup's vocabulary.
classifier_model = build_classifier_model(None, lookup.vocabulary_size())

This is how I create my datasets

padding_token = "<pad>"
# Let tf.data choose buffer sizes dynamically. The value is bound under both
# names: `auto` is the original spelling, while the cache/prefetch pipeline
# code refers to `AUTOTUNE`, which was otherwise never defined.
auto = tf.data.AUTOTUNE
AUTOTUNE = auto

def make_dataset(dataframe, lookup, is_train=True):
    """Pair each row's text with its multi-hot label vector in a tf.data.Dataset.

    Args:
        dataframe: Table providing the "TEXT" and "ATTRIBUTE_VALUE" columns.
        lookup: Callable applied to the ragged label tensor; its result must
            expose ``.numpy()``.
        is_train: Accepted but not used by this function.

    Returns:
        A ``tf.data.Dataset`` of ``(text, label_vector)`` pairs. No batching
        is applied here.
    """
    # Rows carry differing numbers of labels, hence the ragged tensor.
    ragged_labels = tf.ragged.constant(dataframe["ATTRIBUTE_VALUE"].values)
    multi_hot = lookup(ragged_labels).numpy()
    texts = dataframe["TEXT"].values
    # Slice texts and label vectors row-wise into one dataset.
    return tf.data.Dataset.from_tensor_slices((texts, multi_hot))

# Build the label vocabulary from the training split only.
terms = tf.ragged.constant(train_df["ATTRIBUTE_VALUE"].values)
lookup = tf.keras.layers.StringLookup(output_mode="multi_hot") # maps attribute values to multi hot encoding
lookup.adapt(terms)

# The same adapted lookup encodes the labels of all three splits.
train_dataset = make_dataset(train_df, lookup, is_train=True)
validation_dataset = make_dataset(val_df, lookup, is_train=False)
test_dataset = make_dataset(test_df, lookup, is_train=False)

# Cache and prefetch each split.
# NOTE: no .batch() is applied anywhere in this pipeline, so every element is
# a single (text, labels) pair and the text component is a scalar string
# (shape=()).
train_dataset = train_dataset.cache().prefetch(buffer_size=AUTOTUNE)
validation_dataset = validation_dataset.cache().prefetch(buffer_size=AUTOTUNE)
test_dataset = test_dataset.cache().prefetch(buffer_size=AUTOTUNE)

After creating my datasets, their elements have shape=(). When I try to train the model:

from official.nlp import optimization

# Number of elements in train_dataset, used as the optimizer step count per epoch.
steps_per_epoch = tf.data.experimental.cardinality(train_dataset).numpy()
num_train_steps = steps_per_epoch * epochs
# Warm up over the first 10% of the training steps.
num_warmup_steps = int(0.1*num_train_steps)

# `epochs` and `init_lr` are expected to be defined before this snippet runs.
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

# NOTE(review): the string loss "binary_crossentropy" uses the Keras default
# from_logits=False, so the model is expected to output probabilities —
# confirm against the classifier head's activation.
classifier_model.compile(optimizer=optimizer,
                         loss="binary_crossentropy",
                         metrics=[tf.keras.metrics.Recall()])

# Train on the training split, evaluating on the validation split each epoch.
history = classifier_model.fit(x=train_dataset,
                               validation_data=validation_dataset,
                               epochs=epochs)

I get the following error, which I believe is saying that my dataset elements need to have shape=(None,) instead of shape=().

ValueError: Exception encountered when calling layer “preprocessing”
(type KerasLayer).

in user code:

    File "/opt/anaconda3/envs/mv_prodmatch/lib/python3.10/site-packages/tensorflow_hub/keras_layer.py", line 237, in call  *
        result = smart_cond.smart_cond(training,

    ValueError: Could not find matching concrete function to call loaded from the SavedModel. Got:
      Positional arguments (3 total):
        * Tensor("inputs:0", shape=(), dtype=string)
        * False
        * None
      Keyword arguments: {}
    
     Expected these arguments to match one of the following 4 option(s):
    
    Option 1:
      Positional arguments (3 total):
        * TensorSpec(shape=(None,), dtype=tf.string, name='sentences')
        * False
        * None
      Keyword arguments: {}
    
    Option 2:
      Positional arguments (3 total):
        * TensorSpec(shape=(None,), dtype=tf.string, name='sentences')
        * True
        * None
      Keyword arguments: {}
    
    Option 3:
      Positional arguments (3 total):
        * TensorSpec(shape=(None,), dtype=tf.string, name='inputs')
        * False
        * None
      Keyword arguments: {}
    
    Option 4:
      Positional arguments (3 total):
        * TensorSpec(shape=(None,), dtype=tf.string, name='inputs')
        * True
        * None
      Keyword arguments: {}


Call arguments received:
  • inputs=tf.Tensor(shape=(), dtype=string)
  • training=True

How can I make my dataset have shape=(None,) instead of shape=()?

Liam_Adams · May 12, 2022, 1:35pm

Found the answer here: “python - Tensorflow dataset has () shape” on Stack Overflow.

# Batching groups the scalar-string elements so the text component gains a
# leading dimension (shape=(None,)), which is the signature the preprocessing
# layer reports as expected in the error above.
train_dataset = train_dataset.batch(batch_size).cache().prefetch(buffer_size=AUTOTUNE)