tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)

CUDA - 11.4
TF - 2.6.2
Docker Container Runtime

Logs and Python file below

Python File

import argparse
import io
import os
import subprocess

import ray
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
#import tensorflow as tf
from PIL import Image
from psutil import cpu_count
from utils import *
from object_detection.utils import dataset_util, label_map_util

label_map = label_map_util.load_labelmap('./label_map.pbtxt')
label_map_dict = label_map_util.get_label_map_dict(label_map)
# invert the map: class id -> class name
t2idict = {y: x for x, y in label_map_dict.items()}

def class_text_to_int(text):
    # despite the name, this maps a Waymo class id to its label text
    return t2idict[text]

def create_tf_example(filename, encoded_jpeg, annotations):
    """
    This function creates a tf.train.Example from the Waymo frame.

    args:
        - filename [str]: name of the image
        - encoded_jpeg [bytes]: jpeg encoded image
        - annotations [protobuf object]: bboxes and classes

    returns:
        - tf_example [tf.train.Example]: tf example in the object detection API format.
    """
    # TODO: Implement function to convert the data
    encoded_jpg_io = io.BytesIO(encoded_jpeg)
    image = Image.open(encoded_jpg_io)
    width, height = image.size

    image_format = b'jpeg'

    xmins = []
    xmaxs = []
    ymins = []
    ymaxs = []
    classes_text = []
    classes = []

    for index, row in enumerate(annotations):
        # convert center/size boxes to corner coordinates
        xmin = row.box.center_x - row.box.length / 2.0
        xmax = row.box.center_x + row.box.length / 2.0
        ymin = row.box.center_y - row.box.width / 2.0
        ymax = row.box.center_y + row.box.width / 2.0

        # normalize to [0, 1] as the object detection API expects
        xmins.append(xmin / width)
        xmaxs.append(xmax / width)
        ymins.append(ymin / height)
        ymaxs.append(ymax / height)
        classes_text.append(class_text_to_int(row.type).encode('utf8'))
        classes.append(row.type)

    filename = filename.encode('utf8')
    tf_example = tf.train.Example(features=tf.train.Features(feature={
        'image/height': int64_feature(height),
        'image/width': int64_feature(width),
        'image/filename': bytes_feature(filename),
        'image/source_id': bytes_feature(filename),
        'image/encoded': bytes_feature(encoded_jpeg),
        'image/format': bytes_feature(image_format),
        'image/object/bbox/xmin': float_list_feature(xmins),
        'image/object/bbox/xmax': float_list_feature(xmaxs),
        'image/object/bbox/ymin': float_list_feature(ymins),
        'image/object/bbox/ymax': float_list_feature(ymaxs),
        'image/object/class/text': bytes_list_feature(classes_text),
        'image/object/class/label': int64_list_feature(classes),
    }))
    return tf_example

def download_tfr(filepath, temp_dir):
    """
    download a single tf record

    args:
        - filepath [str]: path to the tf record file
        - temp_dir [str]: path to the directory where the raw data will be saved

    returns:
        - local_path [str]: path where the file is saved
    """
    # create data dir
    dest = os.path.join(temp_dir, 'raw')
    os.makedirs(dest, exist_ok=True)
    filename = os.path.basename(filepath)
    local_path = os.path.join(dest, filename)
    if os.path.exists(local_path):
        return local_path
    print("start downloading {}".format(local_path))
    # download the tf record file
    #cmd = ['gsutil', 'cp', filepath, f'{dest}']
    #logger.info(f'Downloading {filepath}')
    #res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    #if res.returncode != 0:
    #    logger.error(f'Could not download file {filepath}')

    #print("complete downloading {}".format(local_path))
    return local_path

def process_tfr(filepath, data_dir):
    """
    process a Waymo tf record into a tf api tf record

    args:
        - filepath [str]: path to the Waymo tf record file
        - data_dir [str]: path to the destination directory
    """
    # create processed data dir
    dest = os.path.join(data_dir, 'processed')
    os.makedirs(dest, exist_ok=True)
    file_name = os.path.basename(filepath)

    if os.path.exists(f'{dest}/{file_name}'):
        return

    logger.info(f'Processing {filepath}')
    writer = tf.python_io.TFRecordWriter(f'{dest}/{file_name}')
    dataset = tf.data.TFRecordDataset(filepath, compression_type='')
    for idx, data in enumerate(dataset):
        frame = open_dataset.Frame()
        frame.ParseFromString(bytearray(data.numpy()))
        encoded_jpeg, annotations = parse_frame(frame)
        filename = file_name.replace('.tfrecord', f'_{idx}.tfrecord')
        tf_example = create_tf_example(filename, encoded_jpeg, annotations)
        writer.write(tf_example.SerializeToString())
    writer.close()
    return

@ray.remote
def download_and_process(filename, temp_dir, data_dir):
    # need to re-import the logger because of multiprocessing
    dest = os.path.join(data_dir, 'processed')
    os.makedirs(dest, exist_ok=True)
    file_name = os.path.basename(filename)

    if os.path.exists(f'{dest}/{file_name}'):
        print("processed file {} exists, skip".format(file_name))
        return
    logger = get_module_logger(__name__)
    local_path = download_tfr(filename, temp_dir)
    #local_path = "/app/project/training_0000"
    process_tfr(local_path, data_dir)
    # remove the original tf record to save space
    #if os.path.exists(local_path):
    #    logger.info(f'Deleting {local_path}')
    #    os.remove(local_path)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Download and process tf files')
    parser.add_argument('--data_dir', required=False, default="./data",
                        help='processed data directory')
    parser.add_argument('--temp_dir', required=False, default="/app/project/training_0000",
                        help='raw data directory')
    args = parser.parse_args()
    logger = get_module_logger(__name__)
    # open the filenames file
    with open('filenames1.txt', 'r') as f:
        filenames = f.read().splitlines()
    logger.info(f'Download {len(filenames)} files. Be patient, this will take a long time.')

    data_dir = args.data_dir
    temp_dir = args.temp_dir

    # init ray
    ray.init(num_cpus=cpu_count())

    # process the first file synchronously as a smoke test
    # (a @ray.remote function has to be invoked with .remote())
    _ = ray.get(download_and_process.remote(filenames[0], temp_dir, data_dir))

    workers = [download_and_process.remote(fn, temp_dir, data_dir) for fn in filenames[:100]]
    _ = ray.get(workers)
    print("Done with downloading")

Logs

2021-11-05 05:52:19,569 WARNING services.py:1559 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 67108864 bytes available. This may slow down performance! You may be able to free up space by deleting files in /dev/shm or terminating any running plasma_store_server processes. If you are inside a Docker container, you may need to pass an argument with the flag '--shm-size' to 'docker run'.
2021-11-05 05:52:19,584 WARNING services.py:1559 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 67108864 bytes available. This may slow down performance! You may be able to free up space by deleting files in /dev/shm or terminating any running plasma_store_server processes. If you are inside a Docker container, you may need to pass an argument with the flag '--shm-size' to 'docker run'.
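
(Side note: the /dev/shm warning above is separate from the CUDA error. If Ray's object-store performance matters, it can be addressed by giving the container more shared memory at startup; the size below is only an example, not a recommendation:

docker run --shm-size=8g ... tensorflow/tensorflow:latest-gpu
)
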
(pid=8218) 2021-11-05 05:52:25,785 INFO Processing /app/project/training_0000/raw/segment-10444454289801298640_4360_000_4380_000_with_camera_labels.tfrecord
(pid=8218) 2021-11-05 05:52:25.788420: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8214) 2021-11-05 05:52:25,822 INFO Processing /app/project/training_0000/raw/segment-10212406498497081993_5300_000_5320_000_with_camera_labels.tfrecord
(pid=8214) 2021-11-05 05:52:25.823758: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8281) 2021-11-05 05:52:25,833 INFO Processing /app/project/training_0000/raw/segment-10023947602400723454_1120_000_1140_000_with_camera_labels.tfrecord
(pid=8281) 2021-11-05 05:52:25.835462: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8278) 2021-11-05 05:52:25,833 INFO Processing /app/project/training_0000/raw/segment-10107710434105775874_760_000_780_000_with_camera_labels.tfrecord
(pid=8278) 2021-11-05 05:52:25.836066: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8208) 2021-11-05 05:52:25,914 INFO Processing /app/project/training_0000/raw/segment-10226164909075980558_180_000_200_000_with_camera_labels.tfrecord
(pid=8208) 2021-11-05 05:52:25.916432: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8240) 2021-11-05 05:52:25,943 INFO Processing /app/project/training_0000/raw/segment-10082223140073588526_6140_000_6160_000_with_camera_labels.tfrecord
(pid=8240) 2021-11-05 05:52:25.945284: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8259) 2021-11-05 05:52:25,965 INFO Processing /app/project/training_0000/raw/segment-1005081002024129653_5313_150_5333_150_with_camera_labels.tfrecord
(pid=8259) 2021-11-05 05:52:25.968303: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8253) 2021-11-05 05:52:25,973 INFO Processing /app/project/training_0000/raw/segment-10061305430875486848_1080_000_1100_000_with_camera_labels.tfrecord
(pid=8253) 2021-11-05 05:52:25.976335: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8260) 2021-11-05 05:52:25,980 INFO Processing /app/project/training_0000/raw/segment-10017090168044687777_6380_000_6400_000_with_camera_labels.tfrecord
(pid=8260) 2021-11-05 05:52:25.983356: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8251) 2021-11-05 05:52:25,996 INFO Processing /app/project/training_0000/raw/segment-10075870402459732738_1060_000_1080_000_with_camera_labels.tfrecord
(pid=8251) 2021-11-05 05:52:25.998741: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8248) 2021-11-05 05:52:26,006 INFO Processing /app/project/training_0000/raw/segment-10072140764565668044_4060_000_4080_000_with_camera_labels.tfrecord
(pid=8248) 2021-11-05 05:52:26.010197: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8244) 2021-11-05 05:52:26,013 INFO Processing /app/project/training_0000/raw/segment-10094743350625019937_3420_000_3440_000_with_camera_labels.tfrecord
(pid=8244) 2021-11-05 05:52:26.015093: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8205) 2021-11-05 05:52:26,026 INFO Processing /app/project/training_0000/raw/segment-10231929575853664160_1160_000_1180_000_with_camera_labels.tfrecord
(pid=8205) 2021-11-05 05:52:26.029786: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8201) 2021-11-05 05:52:26,032 INFO Processing /app/project/training_0000/raw/segment-10327752107000040525_1120_000_1140_000_with_camera_labels.tfrecord
(pid=8201) 2021-11-05 05:52:26.034096: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8195) 2021-11-05 05:52:26,101 INFO Processing /app/project/training_0000/raw/segment-10455472356147194054_1560_000_1580_000_with_camera_labels.tfrecord
(pid=8195) 2021-11-05 05:52:26.104592: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8230) 2021-11-05 05:52:26,447 INFO Processing /app/project/training_0000/raw/segment-10275144660749673822_5755_561_5775_561_with_camera_labels.tfrecord
(pid=8230) 2021-11-05 05:52:26.450848: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8241) 2021-11-05 05:52:26,477 INFO Processing /app/project/training_0000/raw/segment-10206293520369375008_2796_800_2816_800_with_camera_labels.tfrecord
(pid=8241) 2021-11-05 05:52:26.483426: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8216) 2021-11-05 05:52:26,786 INFO Processing /app/project/training_0000/raw/segment-10391312872392849784_4099_400_4119_400_with_camera_labels.tfrecord
(pid=8216) 2021-11-05 05:52:26.791352: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8198) 2021-11-05 05:52:26,908 INFO Processing /app/project/training_0000/raw/segment-10235335145367115211_5420_000_5440_000_with_camera_labels.tfrecord
(pid=8198) 2021-11-05 05:52:26.912763: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8274) 2021-11-05 05:52:26,922 INFO Processing /app/project/training_0000/raw/segment-1022527355599519580_4866_960_4886_960_with_camera_labels.tfrecord
(pid=8274) 2021-11-05 05:52:26.926734: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8189) 2021-11-05 05:52:27,069 INFO Processing /app/project/training_0000/raw/segment-10241508783381919015_2889_360_2909_360_with_camera_labels.tfrecord
(pid=8189) 2021-11-05 05:52:27.072531: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8282) 2021-11-05 05:52:27,160 INFO Processing /app/project/training_0000/raw/segment-10096619443888687526_2820_000_2840_000_with_camera_labels.tfrecord
(pid=8282) 2021-11-05 05:52:27.164348: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8263) 2021-11-05 05:52:27,190 INFO Processing /app/project/training_0000/raw/segment-10072231702153043603_5725_000_5745_000_with_camera_labels.tfrecord
(pid=8263) 2021-11-05 05:52:27.194641: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8238) 2021-11-05 05:52:27,203 INFO Processing /app/project/training_0000/raw/segment-10153695247769592104_787_000_807_000_with_camera_labels.tfrecord
(pid=8238) 2021-11-05 05:52:27.207482: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
Done with downloading

I don't think it is currently supported; please upvote and subscribe to:

I used the latest TensorFlow Docker image; does it support CUDA 11.4?

tensorflow/tensorflow:latest-gpu

What are the steps to compile TensorFlow with CUDA 11.4 and cuDNN 8.2 using the Docker image "tensorflow/tensorflow:latest-devel-gpu"?

It was updated in the Docker image used by CI for nightly/master, not in tensorflow/tensorflow:latest-devel-gpu.

See my comment at
https://tensorflow-prod.ospodiscourse.com/t/building-tensorflow-from-source-with-docker-how-to-change-cuda-version/5542/3?u=bhack

If you have updated the CUDA version in that Docker image yourself, you can try to compile from source with:
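
(The command itself did not survive in this quote. Purely as an illustrative sketch of the usual from-source flow inside the devel image; every flag here is an assumption, not the poster's original command:

# hypothetical sketch of a standard CUDA-enabled TF build
./configure   # answer yes to CUDA; point it at CUDA 11.4 / cuDNN 8.2 when prompted
bazel build --config=cuda //tensorflow/tools/pip_package:build_pip_package
./bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg
pip install /tmp/tensorflow_pkg/tensorflow-*.whl
)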

TF does not support CUDA 11.4. That requires moving to the manylinux2014 standard, something we didn't know before starting the upgrade to CUDA 11.4.

Unfortunately, the changes made to switch to CUDA 11.4 were also not properly rolled back.

I still see the effect of the PR I’ve mentioned for the CI on master:

Is this not orchestrated currently?

I rolled back to CUDA 11.2 on the host machine, but when starting the latest TensorFlow container it still fails in the same way.

(pid=1829) 2021-11-08 11:34:42,477 INFO Processing /app/project/training_0000/raw/segment-10212406498497081993_5300_000_5320_000_with_camera_labels.tfrecord
(pid=1868) 2021-11-08 11:34:43.089321: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
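
Before suspecting the TF/CUDA version mix, it is worth checking whether the container sees the device at all. Two quick checks (the CUDA image tag is only an example):

docker run --rm --gpus all nvidia/cuda:11.2.2-base nvidia-smi

import tensorflow as tf
print(tf.config.list_physical_devices('GPU'))   # expect a non-empty list

CUDA_ERROR_NO_DEVICE usually means the driver is not exposed to the container (e.g. a missing --gpus all flag or NVIDIA Container Toolkit) rather than a problem with the CUDA version inside the image.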

It's strange: now I am getting another error, even though I didn't change anything.
Please let me know why this exception is raised by the code below.

def process_tfr(filepath, data_dir):
    """
    process a Waymo tf record into a tf api tf record

    args:
        - filepath [str]: path to the Waymo tf record file
        - data_dir [str]: path to the destination directory
    """
    # create processed data dir
    dest = os.path.join(data_dir, 'processed')
    os.makedirs(dest, exist_ok=True)
    file_name = os.path.basename(filepath)
    if os.path.exists(f'{dest}/{file_name}'):
        return
    logger.info(f'Processing {filepath}')
    writer = tf.python_io.TFRecordWriter(f'{dest}/{file_name}')
    dataset = tf.data.TFRecordDataset(filepath, compression_type='')
    for idx, data in enumerate(dataset):

(pid=138) Instructions for updating:
(pid=138) non-resource variables are not supported in the long term
(pid=100) WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/compat/v2_compat.py:101: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.
(pid=100) Instructions for updating:
(pid=100) non-resource variables are not supported in the long term
(pid=250) 2021-11-10 07:39:15,977 INFO Processing /app/project/training_0000/raw/segment-10017090168044687777_6380_000_6400_000_with_camera_labels.tfrecord
Traceback (most recent call last):
  File "download_process.py", line 204, in <module>
    _ = ray.get(workers)
  File "/usr/local/lib/python3.8/dist-packages/ray/worker.py", line 1538, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RuntimeError): ray::__main__.download_and_process() (pid=250, ip=172.17.0.2)
  File "python/ray/_raylet.pyx", line 479, in ray._raylet.execute_task
  File "download_process.py", line 176, in download_and_process
    process_tfr(local_path, data_dir)
  File "download_process.py", line 137, in process_tfr
    iterator = iter(dataset)
  File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 3445, in __iter__
    return iter(self._dataset)
  File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 413, in __iter__
    raise RuntimeError("__iter__() is only supported inside of tf.function "
RuntimeError: __iter__() is only supported inside of tf.function or when eager execution is enabled.
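
This last error is unrelated to CUDA: tf.disable_v2_behavior() at the top of the script turns off eager execution, and iterating a tf.data.Dataset with a for loop (or iter()) only works in eager mode or inside a tf.function, exactly as the message says. A minimal sketch of a graph-mode (v1) iteration that avoids the error, assuming the same imports as the script above:

# v1-style iteration over the TFRecordDataset (works with eager disabled)
dataset = tf.data.TFRecordDataset(filepath, compression_type='')
iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
next_record = iterator.get_next()
with tf.compat.v1.Session() as sess:
    while True:
        try:
            data = sess.run(next_record)       # raw bytes of one record
        except tf.errors.OutOfRangeError:      # end of the file
            break
        frame = open_dataset.Frame()
        frame.ParseFromString(bytearray(data))
        # ... same per-frame processing as in process_tfr above ...

Alternatively, simply removing the tf.disable_v2_behavior() call should restore eager iteration; nothing else in the posted script obviously depends on graph mode.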

For what it's worth, I get this error every time I restore my Ubuntu system from hibernation.

My current workaround is either to avoid hibernation or to reboot the system.
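
(A commonly reported middle ground, not from this thread: after resume, reloading the NVIDIA UVM kernel module often clears the cuInit failure without a full reboot, provided nothing is still using the module:

sudo rmmod nvidia_uvm && sudo modprobe nvidia_uvm
)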