Lower-than-expected accuracies of pretrained MobileNet models

I tried to validate the pretrained MobileNet V2 and V3 models available at keras.applications.MobileNetV2() and keras.applications.MobileNetV3(). To my surprise, both yielded lower-than-expected Top-1 accuracies on ImageNet 2012.

  • MobileNet V2: expected = 71.8%, measured = 61.6%
  • MobileNet V3: expected = 75.6%, measured = 71.0%

import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
import numpy as np
import os
import time

from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input, decode_predictions
from tensorflow.keras.preprocessing import image

print(tf.__version__) #2.7.1
print(keras.__version__) #2.7.0

Prepare ImageNet 2012 validation

labels_path = tf.keras.utils.get_file('ImageNetLabels.txt','https://storage.googleapis.com/download.tensorflow.org/data/ImageNetLabels.txt')
imagenet_labels = np.array(open(labels_path).read().splitlines())

data_dir_val  = '/home/le_user/imagenet_dataset/'
write_dir_val = '/home/le_user/imagenet_dataset_tfds'

# Construct a tf.data.Dataset
download_config_val = tfds.download.DownloadConfig(
    extract_dir=os.path.join(write_dir_val, 'extracted'),
    manual_dir=data_dir_val)

download_and_prepare_kwargs_val = {
    'download_dir': os.path.join(write_dir_val, 'downloaded'),
    'download_config': download_config_val,
}

def resize_with_crop(image, label):
    i = image
    i = tf.cast(i, tf.float32)
    i = tf.image.resize_with_crop_or_pad(i, 224, 224)
    i = tf.keras.applications.mobilenet_v2.preprocess_input(i)
    return (i, label)

def resize_with_crop_v3(image, label):
    i = image
    i = tf.cast(i, tf.float32)
    i = tf.image.resize_with_crop_or_pad(i, 224, 224)
    i = tf.keras.applications.mobilenet_v3.preprocess_input(i)
    return (i, label)

ds = tfds.load('imagenet2012', 
               data_dir=os.path.join(write_dir_val, 'data'),         
               split='validation', 
               shuffle_files=False, 
               download=False, 
               as_supervised=True,
               download_and_prepare_kwargs=download_and_prepare_kwargs_val)

strategy = tf.distribute.MirroredStrategy()

AUTOTUNE = tf.data.AUTOTUNE
BATCH_SIZE_PER_REPLICA = 128
NUM_GPUS = strategy.num_replicas_in_sync

ds_single   = ds.map(resize_with_crop)
ds_single   = ds_single.batch(batch_size=BATCH_SIZE_PER_REPLICA)
ds_single   = ds_single.cache().prefetch(buffer_size=AUTOTUNE)

Use pre-trained weights to validate accuracy

mbv2_eval = keras.applications.MobileNetV2(include_top=True, 
                                           weights='imagenet')
mbv2_eval.trainable = False
mbv2_eval.compile(optimizer='adam',
             loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
             metrics=['accuracy'])

start_time = time.time()
result = mbv2_eval.evaluate(ds_single)
print(f"--- Single-GPU eval took {(time.time() - start_time)} seconds ---")

print(dict(zip(mbv2_eval.metrics_names, result)))

Output is

391/391 [==============================] - 49s 118ms/step - loss: 1.7855 - accuracy: 0.6155
--- Single-GPU eval took 48.85072922706604 seconds ---
{'loss': 1.7854770421981812, 'accuracy': 0.6154599785804749}

To reproduce the measured MobileNet V3 results:

ds_mbv3_parallel = ds.map(resize_with_crop_v3)
ds_mbv3_parallel = ds_mbv3_parallel.batch(batch_size=BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync)
ds_mbv3_parallel = ds_mbv3_parallel.cache().prefetch(buffer_size=AUTOTUNE)

with strategy.scope():
    mbv3_eval_parallel = keras.applications.MobileNetV3Large()
    mbv3_eval_parallel.trainable = False
    mbv3_eval_parallel.compile(optimizer='adam',
                               loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                               metrics=['accuracy'])

start_time = time.time()
result_parallel = mbv3_eval_parallel.evaluate(ds_mbv3_parallel)
print(f"--- {strategy.num_replicas_in_sync}-GPU eval took {(time.time() - start_time)} seconds ---")

print(dict(zip(mbv3_eval_parallel.metrics_names, result_parallel)))

Output is:

98/98 [==============================] - 60s 459ms/step - loss: 1.2824 - accuracy: 0.7104
--- 4-GPU eval took 60.1328125 seconds ---
{'loss': 1.2823766469955444, 'accuracy': 0.7104200124740601}