We have a Slurm batch job that fails under TensorFlow 2 with Keras; the same Python script also fails when run directly on a node that has a GPU. Here is the script:
from datetime import date
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation, SimpleRNN, LSTM
from keras.optimizers import adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.callbacks import Callback
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score
from datetime import datetime, timedelta
import warnings
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3"
warnings.filterwarnings('ignore')
import tensorflow as tf
import logging
logging.getLogger('tensorflow').setLevel(logging.FATAL)
delay = 252
window = 60
factor = 15
K = 8.4
sbo = 1.25
sso = 1.25
sbc = 0.75
ssc = 0.5
r = 0.02
tran_cost = 0.0002
leverage = 1.0
start_val = 100
bo = 1
so = -1
X_pd = pd.read_pickle('./data/X_pd.pkl')
X = pd.DataFrame(columns=range(0, window))
Y = []
for tag in X_pd.columns[:1]:
    # i = 0 ... len(X_pd.index) - window
    for i in range(0, len(X_pd.index) - window):
        X_example = X_pd.loc[i:i + window - 1][tag].values
        X = X.append(pd.Series(X_example), ignore_index=True)
        Y.append(X_pd.loc[i + window][tag])
    print('done %s stocks' % (tag))
Y = pd.DataFrame(Y)
# normalization
SS = StandardScaler()
features = SS.fit_transform(X.values)
X = features
X = pd.DataFrame(X)
# LSTM model
def trainLSTMModel(layers, neurons, d):
    model = Sequential()
    model.add(LSTM(neurons[0], input_shape=(layers[1], layers[2]), return_sequences=False, activation='relu'))
    # model.add(Dropout(d))
    # model.add(LSTM(neurons[1], input_shape=(layers[1], layers[2]), return_sequences=False))
    # model.add(Dropout(d))
    # model.add(Dense(neurons[2], kernel_initializer="uniform", activation='relu'))
    model.add(Dense(neurons[3], kernel_initializer="uniform", activation='relu'))
    optimizer = adam(learning_rate=0.001)
    # adam = Adam(decay=0.2)
    # predict up and down
    # model.compile(optimizer="adam", loss="binary_crossentropy", metrics=['accuracy'])
    model.compile(loss='mse', optimizer=optimizer)
    model.summary()
    return model
length = X.shape[0]
X = np.array(X)
Y = np.array(Y)
time_step = 60
d = 0.3
output = 1
shape = [length, time_step, output]  # feature, window, output
neurons = [64, 64, 32, 1]
epochs = 100
batch_size = 10000
model = trainLSTMModel(shape, neurons, d)
#shape from [samples, timesteps] into [samples, timesteps, features]
n_features = 1
X = X.reshape((X.shape[0], X.shape[1], n_features))
gpu_no = 0
with tf.device('/gpu:' + str(gpu_no)):
    # sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True))
    # keras.backend.set_session(sess)
    print('model_manager: running tensorflow version: ' + tf.__version__)
    print('model_manager: will attempt to run on ' + '/gpu:' + str(gpu_no))
    model.fit(X, Y, epochs=epochs, verbose=2, batch_size=batch_size)
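As far as I can tell, the only GPU-specific part of the script is the tf.device block at the end. A stripped-down sketch like the one below (illustrative only, I have not run this exact snippet on the node) goes through the same placement path that fails in the traceback further down, i.e. building a tensor under '/gpu:0' while the optimizer allocates its state:

import tensorflow as tf

print(tf.__version__)
with tf.device('/gpu:0'):
    # the traceback below fails inside a tf.zeros call like this one,
    # made when the optimizer creates its state tensors
    x = tf.zeros((8, 4))
    print(x.device)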
Here is the log from the job:
Loading requirement: cuda10.1/toolkit/10.1.243
Loading cm-ml-python3deps/3.3.0
Loading requirement: gcc5/5.5.0 python36
Loading tensorflow2-py36-cuda10.1-gcc/2.0.0
Loading requirement: ml-pythondeps-py36-cuda10.1-gcc/3.3.0
openblas/dynamic/0.2.20 hdf5_18/1.8.20 keras-py36-cuda10.1-gcc/2.3.1
protobuf3-gcc/3.8.0 nccl2-cuda10.1-gcc/2.7.8
Loading openmpi/cuda/64/3.1.4
Loading requirement: hpcx/2.4.0
2021-08-18 11:11:43.064175: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2021-08-18 11:18:08.026219: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2021-08-18 11:18:08.031771: E tensorflow/stream_executor/cuda/cuda_driver.cc:318] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error
2021-08-18 11:18:08.031811: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: node001
2021-08-18 11:18:08.031819: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: node001
2021-08-18 11:18:08.031921: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 460.73.1
2021-08-18 11:18:08.031958: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 460.73.1
2021-08-18 11:18:08.031966: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 460.73.1
2021-08-18 11:18:08.032266: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX512F
Using TensorFlow backend.
done A stocks
Model: "sequential_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm_1 (LSTM) (None, 64) 16896
_________________________________________________________________
dense_1 (Dense) (None, 1) 65
=================================================================
Total params: 16,961
Trainable params: 16,961
Non-trainable params: 0
_________________________________________________________________
model_manager: running tensorflow version: 2.0.0
model_manager: will attempt to run on /gpu:0
Traceback (most recent call last):
File "stocks.py", line 99, in <module>
model.fit(X, Y, epochs=epochs, verbose=2,batch_size=batch_size)
File "/cm/shared/apps/keras-py36-cuda10.1-gcc/2.3.1/lib/python3.6/site-packages/keras/engine/training.py", line 1213, in fit
self._make_train_function()
File "/cm/shared/apps/keras-py36-cuda10.1-gcc/2.3.1/lib/python3.6/site-packages/keras/engine/training.py", line 316, in _make_train_function
loss=self.total_loss)
File "/cm/shared/apps/keras-py36-cuda10.1-gcc/2.3.1/lib/python3.6/site-packages/keras/legacy/interfaces.py", line 91, in wrapper
return func(*args, **kwargs)
File "/cm/shared/apps/keras-py36-cuda10.1-gcc/2.3.1/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 75, in symbolic_fn_wrapper
return func(*args, **kwargs)
File "/cm/shared/apps/keras-py36-cuda10.1-gcc/2.3.1/lib/python3.6/site-packages/keras/optimizers.py", line 519, in get_updates
for (i, p) in enumerate(params)]
File "/cm/shared/apps/keras-py36-cuda10.1-gcc/2.3.1/lib/python3.6/site-packages/keras/optimizers.py", line 519, in <listcomp>
for (i, p) in enumerate(params)]
File "/cm/shared/apps/keras-py36-cuda10.1-gcc/2.3.1/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 963, in zeros
v = tf.zeros(shape=shape, dtype=dtype, name=name)
File "/cm/shared/apps/tensorflow2-py36-cuda10.1-gcc/2.0.0/lib/python3.6/site-packages/tensorflow_core/python/ops/array_ops.py", line 2349, in zeros
output = _constant_if_small(zero, shape, dtype, name)
File "/cm/shared/apps/tensorflow2-py36-cuda10.1-gcc/2.0.0/lib/python3.6/site-packages/tensorflow_core/python/ops/array_ops.py", line 2307, in _constant_if_small
return constant(value, shape=shape, dtype=dtype, name=name)
File "/cm/shared/apps/tensorflow2-py36-cuda10.1-gcc/2.0.0/lib/python3.6/site-packages/tensorflow_core/python/framework/constant_op.py", line 227, in constant
allow_broadcast=True)
File "/cm/shared/apps/tensorflow2-py36-cuda10.1-gcc/2.0.0/lib/python3.6/site-packages/tensorflow_core/python/framework/constant_op.py", line 235, in _constant_impl
t = convert_to_eager_tensor(value, ctx, dtype)
File "/cm/shared/apps/tensorflow2-py36-cuda10.1-gcc/2.0.0/lib/python3.6/site-packages/tensorflow_core/python/framework/constant_op.py", line 96, in convert_to_eager_tensor
return ops.EagerTensor(value, ctx.device_name, dtype)
RuntimeError: /job:localhost/replica:0/task:0/device:GPU:0 unknown device.
Why is the script not seeing the GPU, given that libcuda loads and reports driver version 460.73.1 but cuInit fails with CUDA_ERROR_UNKNOWN?
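Is there a quick check I should run on the node to see whether TensorFlow itself can enumerate the device, independent of Keras? Something along these lines (a sketch against the TF 2.0 API, not output from the failing job) is what I would try:

import tensorflow as tf

# GPUs TensorFlow can enumerate through the CUDA driver
print(tf.config.experimental.list_physical_devices('GPU'))

# name of the first visible GPU device, or an empty string if none
print(tf.test.gpu_device_name())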