Why is `tf.keras.applications` so slow?

Hello everyone! I am trying to implement an inference function with TensorFlow. I find that `tf.keras.applications` is particularly slow in my case.

This is my main file.

import argparse

import tensorflow as tf
import tensorflow.keras as keras

from util import load_data, Timer

def main(use_cuda):
    """Run one timed ResNet152 inference pass over the loaded images.

    Each stage (dummy tensor creation, model construction, data loading,
    prediction) is wrapped in its own ``Timer`` so its duration is reported
    separately.

    Args:
        use_cuda: Forwarded unchanged to every ``Timer``.

    Returns:
        A dict mapping each entry of the file list returned by ``load_data``
        to the argmax (along axis 1) of the model output for that entry.
    """
    model_name = 'resnet152'

    # Timed separately from the later steps; the tensor itself is never used.
    with Timer("step 0: dummy input", use_cuda):
        dummy_input = tf.zeros([1])

    # weights=None: the network is built without loading pretrained weights.
    with Timer('step 1: load model', use_cuda):
        net = keras.applications.ResNet152(weights=None)

    with Timer('step 2: preprocess data', use_cuda):
        img_files, batch = load_data(use_tf=True)

    with Timer('step 3: predict', use_cuda):
        scores = model_out = net(batch)
        labels = tf.argmax(model_out, axis=1).numpy()

    return {name: label for name, label in zip(img_files, labels)}

if __name__ == '__main__':
    # Script entry point: parse the command line, then run the timed pipeline.
    parser = argparse.ArgumentParser()
    # Integer flag passed straight through to main(); the default is a real
    # int rather than a string that argparse would have to convert.
    parser.add_argument("--use_cuda", default=0, type=int)
    args = parser.parse_args()

    # Without this call the script parses its arguments and exits silently.
    main(args.use_cuda)


This is the util file.

import os
import time
from contextlib import ContextDecorator

import numpy as np
from PIL import Image

def load_data(root='./data', use_tf=False):

class Timer(ContextDecorator):
    """Wall-clock timer usable as a context manager or as a decorator.

    On exit it prints the elapsed time of the wrapped code in milliseconds,
    prefixed with ``description``.

    Args:
        description: Label printed in front of the measured time.
        cuda: Stored as ``self.cuda``; the timing code in this class does not
            read it.

    Attributes set while timing:
        start_time: ``time.perf_counter()`` value taken on entry.
        end_time: ``time.perf_counter()`` value taken on exit.
    """

    def __init__(self, description, cuda):
        self.description = description
        self.cuda = cuda

    def __enter__(self):
        """Start the clock and return the timer itself."""
        self.start_time = time.perf_counter()
        # Returning self lets ``with Timer(...) as t`` bind the instance, so
        # callers can read start_time / end_time afterwards.
        return self

    def __exit__(self, *args):
        """Stop the clock and print the elapsed time in milliseconds."""
        self.end_time = time.perf_counter()
        print(f'>>>>>>>>>> {self.description:25s}---{(self.end_time - self.start_time) * 1000:8.2f} ms <<<<<<<<<<')
        # Never suppress an exception raised inside the timed block.
        return False

This is the output. I cannot believe it takes around 2 seconds in the load model step.

2021-07-07 18:29:31.453675: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
>>>>>>>>>> step 0: dummy input      ---    1.34 ms <<<<<<<<<<
>>>>>>>>>> step 1: load model       --- 1994.82 ms <<<<<<<<<<
>>>>>>>>>> step 2: preprocess data  ---   29.40 ms <<<<<<<<<<
>>>>>>>>>> step 3: predict          --- 2787.95 ms <<<<<<<<<<


Can you check if it is running on the GPU with:

