tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
CUDA: 11.4
TensorFlow: 2.6.2
Runtime: Docker container
The Python file and the logs are shown below.
Python File
import argparse
import io
import os
import subprocess

import ray
import tensorflow.compat.v1 as tf
#import tensorflow as tf
from PIL import Image
from psutil import cpu_count

tf.disable_v2_behavior()

from utils import *
from object_detection.utils import dataset_util, label_map_util

# Load the label map that sits next to this script and build two lookup tables.
label_map = label_map_util.load_labelmap('./label_map.pbtxt')
label_map_dict = label_map_util.get_label_map_dict(label_map)
# Inverse of label_map_dict: its values become the keys and its keys the values.
# NOTE(review): presumably label_map_dict maps class name -> id, which would
# make this id -> name; confirm against the object_detection API docs.
t2idict = {y: x for x, y in label_map_dict.items()}
def class_text_to_int(text):
    """
    Look up `text` in the inverted label map `t2idict`.

    args:
        - text: key to look up; create_tf_example passes `row.type` here

    returns:
        - the value stored in `t2idict` for that key

    raises:
        - KeyError: if `text` is not a key of `t2idict`
    """
    # NOTE(review): despite the function name, the result is `.encode()`d by the
    # caller, so this presumably maps a class id to its text name - confirm.
    return t2idict[text]


def create_tf_example(filename, encoded_jpeg, annotations):
    """
    This function creates a tf.train.Example from the Waymo frame.

    args:
        - filename [str]: name of the image
        - encoded_jpeg [bytes]: jpeg encoded image
        - annotations [protobuf object]: bboxes and classes

    returns:
        - tf_example [tf.Train.Example]: tf example in the objection detection api format.
    """
    # The image is decoded only to read its size; the stored payload stays the
    # original encoded bytes.
    encoded_jpg_io = io.BytesIO(encoded_jpeg)
    image = Image.open(encoded_jpg_io)
    width, height = image.size

    image_format = b'jpeg'

    xmins = []
    xmaxs = []
    ymins = []
    ymaxs = []
    classes_text = []
    classes = []

    for row in annotations:
        # Boxes arrive as centre + extent; convert them to corner coordinates.
        xmin = row.box.center_x - row.box.length / 2.0
        xmax = row.box.center_x + row.box.length / 2.0
        ymin = row.box.center_y - row.box.width / 2.0
        ymax = row.box.center_y + row.box.width / 2.0

        # Corners are stored as fractions of the image size.
        xmins.append(xmin / width)
        xmaxs.append(xmax / width)
        ymins.append(ymin / height)
        ymaxs.append(ymax / height)
        classes_text.append(class_text_to_int(row.type).encode('utf8'))
        classes.append(row.type)

    filename = filename.encode('utf8')
    tf_example = tf.train.Example(features=tf.train.Features(feature={
        'image/height': int64_feature(height),
        'image/width': int64_feature(width),
        'image/filename': bytes_feature(filename),
        'image/source_id': bytes_feature(filename),
        'image/encoded': bytes_feature(encoded_jpeg),
        'image/format': bytes_feature(image_format),
        'image/object/bbox/xmin': float_list_feature(xmins),
        'image/object/bbox/xmax': float_list_feature(xmaxs),
        'image/object/bbox/ymin': float_list_feature(ymins),
        'image/object/bbox/ymax': float_list_feature(ymaxs),
        'image/object/class/text': bytes_list_feature(classes_text),
        'image/object/class/label': int64_list_feature(classes),
    }))
    return tf_example
def download_tfr(filepath, temp_dir):
    """
    Resolve the local path of a single tf record under `<temp_dir>/raw`.

    The actual `gsutil cp` download is currently commented out, so this
    function only creates the `raw` directory and returns the path where the
    file is expected to be; it does not fetch anything.

    args:
        - filepath [str]: path to the tf record file
        - temp_dir [str]: path to the directory where the raw data will be saved

    returns:
        - local_path [str]: path where the file is saved
    """
    # create data dir
    dest = os.path.join(temp_dir, 'raw')
    os.makedirs(dest, exist_ok=True)
    filename = os.path.basename(filepath)
    local_path = os.path.join(dest, filename)
    if os.path.exists(local_path):
        return local_path
    print("start downloading {}".format(local_path))

    # download the tf record file (disabled)
    #cmd = ['gsutil', 'cp', filepath, f'{dest}']
    #logger.info(f'Downloading {filepath}')
    #res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    #if res.returncode != 0:
    #    logger.error(f'Could not download file {filepath}')
    #print("complete downloading {}".format(local_path))
    return local_path
def process_tfr(filepath, data_dir):
    """
    process a Waymo tf record into a tf api tf record

    The output is written to `<data_dir>/processed/<basename of filepath>`.
    If that file already exists the function returns without doing anything.

    args:
        - filepath [str]: path to the Waymo tf record file
        - data_dir [str]: path to the destination directory
    """
    # create processed data dir
    dest = os.path.join(data_dir, 'processed')
    os.makedirs(dest, exist_ok=True)
    file_name = os.path.basename(filepath)
    out_path = f'{dest}/{file_name}'

    if os.path.exists(out_path):
        return

    logger.info(f'Processing {filepath}')
    dataset = tf.data.TFRecordDataset(filepath, compression_type='')
    # The context manager closes the writer even if a frame fails to parse.
    # NOTE(review): iterating the dataset and calling `.numpy()` look like they
    # need eager execution, while this file calls tf.disable_v2_behavior() -
    # confirm which mode is active where this function actually runs.
    with tf.python_io.TFRecordWriter(out_path) as writer:
        for idx, data in enumerate(dataset):
            frame = open_dataset.Frame()
            frame.ParseFromString(bytearray(data.numpy()))
            encoded_jpeg, annotations = parse_frame(frame)
            # Each frame gets its own name: the index is inserted before the suffix.
            filename = file_name.replace('.tfrecord', f'_{idx}.tfrecord')
            tf_example = create_tf_example(filename, encoded_jpeg, annotations)
            writer.write(tf_example.SerializeToString())
    return
@ray.remote
def download_and_process(filename, temp_dir, data_dir):
    """
    Ray task: locate one tf record via download_tfr and convert it with process_tfr.

    Returns early (after printing a message) when
    `<data_dir>/processed/<basename of filename>` already exists.

    args:
        - filename [str]: path to the tf record file
        - temp_dir [str]: directory under which the raw file is stored
        - data_dir [str]: directory under which the processed file is written
    """
    dest = os.path.join(data_dir, 'processed')
    os.makedirs(dest, exist_ok=True)
    file_name = os.path.basename(filename)

    if os.path.exists(f'{dest}/{file_name}'):
        print("processed file {} exists, skip".format(file_name))
        return

    # need to re-import the logger because of multiprocesing
    # NOTE(review): the local is not read below; presumably the call is kept for
    # whatever setup get_module_logger performs - confirm.
    logger = get_module_logger(__name__)

    local_path = download_tfr(filename, temp_dir)
    #local_path = "/app/project/training_0000"
    process_tfr(local_path, data_dir)

    # remove the original tf record to save space
    #if os.path.exists(local_path):
    #    logger.info(f'Deleting {local_path}')
    #    os.remove(local_path)
# Script entry point: read the list of record paths, then process up to the
# first 100 of them in parallel as Ray tasks.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Download and process tf files')
    parser.add_argument('--data_dir', required=False, default="./data",
                        help='processed data directory')
    parser.add_argument('--temp_dir', required=False, default="/app/project/training_0000",
                        help='raw data directory')
    args = parser.parse_args()
    logger = get_module_logger(__name__)

    # open the filenames file
    with open('filenames1.txt', 'r') as f:
        filenames = f.read().splitlines()
    logger.info(f'Download {len(filenames)} files. Be patient, this will take a long time.')

    data_dir = args.data_dir
    temp_dir = args.temp_dir

    # A function decorated with @ray.remote cannot be invoked as a plain call,
    # so every file - including filenames[0] - is dispatched through .remote().
    # init ray
    ray.init(num_cpus=cpu_count())
    workers = [download_and_process.remote(fn, temp_dir, data_dir) for fn in filenames[:100]]
    # Block until every task has finished.
    _ = ray.get(workers)
    print("Done with downloading")
Logs
2021-11-05 05:52:19,569 WARNING services.py:1559 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 67108864 bytes available. This may slow down performance! You may be able to free up space by deleting files in /dev/shm or terminating any running plasma_store_server processes. If you are inside a Docker container, you may need to pass an argument with the flag '--shm-size' to 'docker run'.
2021-11-05 05:52:19,584 WARNING services.py:1559 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 67108864 bytes available. This may slow down performance! You may be able to free up space by deleting files in /dev/shm or terminating any running plasma_store_server processes. If you are inside a Docker container, you may need to pass an argument with the flag '--shm-size' to 'docker run'.
(pid=8218) 2021-11-05 05:52:25,785 INFO Processing /app/project/training_0000/raw/segment-10444454289801298640_4360_000_4380_000_with_camera_labels.tfrecord
(pid=8218) 2021-11-05 05:52:25.788420: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8214) 2021-11-05 05:52:25,822 INFO Processing /app/project/training_0000/raw/segment-10212406498497081993_5300_000_5320_000_with_camera_labels.tfrecord
(pid=8214) 2021-11-05 05:52:25.823758: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8281) 2021-11-05 05:52:25,833 INFO Processing /app/project/training_0000/raw/segment-10023947602400723454_1120_000_1140_000_with_camera_labels.tfrecord
(pid=8281) 2021-11-05 05:52:25.835462: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8278) 2021-11-05 05:52:25,833 INFO Processing /app/project/training_0000/raw/segment-10107710434105775874_760_000_780_000_with_camera_labels.tfrecord
(pid=8278) 2021-11-05 05:52:25.836066: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8208) 2021-11-05 05:52:25,914 INFO Processing /app/project/training_0000/raw/segment-10226164909075980558_180_000_200_000_with_camera_labels.tfrecord
(pid=8208) 2021-11-05 05:52:25.916432: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8240) 2021-11-05 05:52:25,943 INFO Processing /app/project/training_0000/raw/segment-10082223140073588526_6140_000_6160_000_with_camera_labels.tfrecord
(pid=8240) 2021-11-05 05:52:25.945284: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8259) 2021-11-05 05:52:25,965 INFO Processing /app/project/training_0000/raw/segment-1005081002024129653_5313_150_5333_150_with_camera_labels.tfrecord
(pid=8259) 2021-11-05 05:52:25.968303: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8253) 2021-11-05 05:52:25,973 INFO Processing /app/project/training_0000/raw/segment-10061305430875486848_1080_000_1100_000_with_camera_labels.tfrecord
(pid=8253) 2021-11-05 05:52:25.976335: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8260) 2021-11-05 05:52:25,980 INFO Processing /app/project/training_0000/raw/segment-10017090168044687777_6380_000_6400_000_with_camera_labels.tfrecord
(pid=8260) 2021-11-05 05:52:25.983356: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8251) 2021-11-05 05:52:25,996 INFO Processing /app/project/training_0000/raw/segment-10075870402459732738_1060_000_1080_000_with_camera_labels.tfrecord
(pid=8251) 2021-11-05 05:52:25.998741: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8248) 2021-11-05 05:52:26,006 INFO Processing /app/project/training_0000/raw/segment-10072140764565668044_4060_000_4080_000_with_camera_labels.tfrecord
(pid=8248) 2021-11-05 05:52:26.010197: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8244) 2021-11-05 05:52:26,013 INFO Processing /app/project/training_0000/raw/segment-10094743350625019937_3420_000_3440_000_with_camera_labels.tfrecord
(pid=8244) 2021-11-05 05:52:26.015093: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8205) 2021-11-05 05:52:26,026 INFO Processing /app/project/training_0000/raw/segment-10231929575853664160_1160_000_1180_000_with_camera_labels.tfrecord
(pid=8205) 2021-11-05 05:52:26.029786: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8201) 2021-11-05 05:52:26,032 INFO Processing /app/project/training_0000/raw/segment-10327752107000040525_1120_000_1140_000_with_camera_labels.tfrecord
(pid=8201) 2021-11-05 05:52:26.034096: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8195) 2021-11-05 05:52:26,101 INFO Processing /app/project/training_0000/raw/segment-10455472356147194054_1560_000_1580_000_with_camera_labels.tfrecord
(pid=8195) 2021-11-05 05:52:26.104592: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8230) 2021-11-05 05:52:26,447 INFO Processing /app/project/training_0000/raw/segment-10275144660749673822_5755_561_5775_561_with_camera_labels.tfrecord
(pid=8230) 2021-11-05 05:52:26.450848: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8241) 2021-11-05 05:52:26,477 INFO Processing /app/project/training_0000/raw/segment-10206293520369375008_2796_800_2816_800_with_camera_labels.tfrecord
(pid=8241) 2021-11-05 05:52:26.483426: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8216) 2021-11-05 05:52:26,786 INFO Processing /app/project/training_0000/raw/segment-10391312872392849784_4099_400_4119_400_with_camera_labels.tfrecord
(pid=8216) 2021-11-05 05:52:26.791352: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8198) 2021-11-05 05:52:26,908 INFO Processing /app/project/training_0000/raw/segment-10235335145367115211_5420_000_5440_000_with_camera_labels.tfrecord
(pid=8198) 2021-11-05 05:52:26.912763: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8274) 2021-11-05 05:52:26,922 INFO Processing /app/project/training_0000/raw/segment-1022527355599519580_4866_960_4886_960_with_camera_labels.tfrecord
(pid=8274) 2021-11-05 05:52:26.926734: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8189) 2021-11-05 05:52:27,069 INFO Processing /app/project/training_0000/raw/segment-10241508783381919015_2889_360_2909_360_with_camera_labels.tfrecord
(pid=8189) 2021-11-05 05:52:27.072531: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8282) 2021-11-05 05:52:27,160 INFO Processing /app/project/training_0000/raw/segment-10096619443888687526_2820_000_2840_000_with_camera_labels.tfrecord
(pid=8282) 2021-11-05 05:52:27.164348: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8263) 2021-11-05 05:52:27,190 INFO Processing /app/project/training_0000/raw/segment-10072231702153043603_5725_000_5745_000_with_camera_labels.tfrecord
(pid=8263) 2021-11-05 05:52:27.194641: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
(pid=8238) 2021-11-05 05:52:27,203 INFO Processing /app/project/training_0000/raw/segment-10153695247769592104_787_000_807_000_with_camera_labels.tfrecord
(pid=8238) 2021-11-05 05:52:27.207482: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (34)
Done with downloading