Модель оценки Tensorflow расходится с потерей = NaN

Я использую оценщик тензорного потока, настроенный как CNN, и каждый раз, когда я запускаю свой код, я получаю эту ошибку:

ERROR:tensorflow:Model diverged with loss = NaN.
Traceback (most recent call last):
  File "cnn_training_v3.py", line 108, in <module>
    classifier.train(input_fn=train_input_fn, steps=200, hooks=[logging_hook])
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 363, in train
    loss = self._train_model(input_fn, hooks, saving_listeners)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 843, in _train_model
    return self._train_model_default(input_fn, hooks, saving_listeners)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 859, in _train_model_default
    saving_listeners)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1059, in _train_with_estimator_spec
    _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 567, in run
    run_metadata=run_metadata)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1043, in run
    run_metadata=run_metadata)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1134, in run
    raise six.reraise(*original_exc_info)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1119, in run
    return self._sess.run(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1199, in run
    run_metadata=run_metadata))
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/basic_session_run_hooks.py", line 623, in after_run
    raise NanLossDuringTrainingError
tensorflow.python.training.basic_session_run_hooks.NanLossDuringTrainingError: NaN loss during training.

Я знаю, что подобные вопросы уже задавались на этом сайте, но их ответы мне не помогли. Я попытался уменьшить скорость обучения, добавить эпсилон к моим логит-вероятностям и изменить функцию потерь, но все равно получил ошибки.

Вот моя функция CNN:

# CNN function
def cnn_model_fn(features, labels, mode):

    # Define the layers of the cnn
    input_layer = tf.reshape(features["images"], [-1, 200, 200, 3])
    conv_layer = tf.layers.conv2d(inputs=input_layer, filters=32, kernel_size=[5, 5], padding="same", activation=tf.nn.relu)
    pool_layer = tf.layers.max_pooling2d(inputs=conv_layer, pool_size=[2, 2], strides=2)
    conv_layer_two = tf.layers.conv2d(inputs=pool_layer, filters=64, kernel_size=[5, 5], padding="same", activation=tf.nn.relu)
    pool_layer_two = tf.layers.max_pooling2d(inputs=conv_layer_two, pool_size=[2, 2], strides=2)
    flat_pool_two = tf.reshape(pool_layer_two, [-1, 50 * 50 * 64])
    dense_layer = tf.layers.dense(inputs=flat_pool_two, units=1024, activation=tf.nn.relu)
    logits = tf.layers.dense(inputs=dense_layer, units=4)

    # Add epsilon to logits
    epsilon = tf.constant(value=0.00001, shape=(1,4))
    logits = logits + epsilon

    # Generate predictions (for PREDICT and EVAL mode)
    blocknum_prediction = tf.argmax(input=logits, axis=1)
    blocknum_probabilities = tf.nn.softmax(logits, name="softmax_tensor")
    predictions = {"blocknum_classes": blocknum_prediction}

    # Return predictions when in PREDICT mode
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Calculate Loss (for both TRAIN and EVAL modes)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    # Configure the Training Operation (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.0001)
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # Add evaluation metrics (for EVAL mode)
    eval_metric_ops = {"blocknum_accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions["blocknum_classes"])}
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

Вот мой основной код. Моя цель — научить CNN смотреть на изображение башни из блоков и предсказывать, сколько блоков на изображении.

# Load and process dataset

image_files = []
text_files = []
images = []
labels = []

# load files from folder
for root, dirs, files in os.walk("images"):  
    for filename in files:
        if 'before' in filename:
            image_files.append(filename)
        elif 'text' in filename:
            text_files.append(filename)

# for each pair of files, append relevant data to image and label lists
# note to self: label 0 means 2 blocks, label 1 means 3 blocks, label 2 means 4 blocks, label 3 means 5 blocks
for imagename in image_files:
    images.append(cv2.imread('images/'+filename))
    num = imagename[7:len(imagename)-4]
    for textname in text_files:
        if ('_'+num+'.') in textname:
            textfile = open('images/'+textname, 'r')
            for line in textfile:
                if 'Number of blocks' in line:
                    nblocks = int(line[18:].strip('\n'))
                    if nblocks == 2:
                        label = 0
                    elif nblocks == 3:
                        label = 1
                    elif nblocks == 4:
                        label = 2
                    elif nblocks == 5:
                        label = 3
            labels.append(label)

# separate images and labels into train and test sets - 50% train, 50% evaluate
train_images = images[0:len(images)/2]
train_labels = labels[0:len(labels)/2]
test_images = images[len(images)/2:]
test_labels = labels[len(labels)/2:]

# convert dataset into numpy arrays
train_data_numpy = np.array(train_images, np.float32)
train_labels_numpy = np.array(train_labels, np.int32)
test_data_numpy = np.array(test_images, np.float32)
test_labels_numpy = np.array(test_labels, np.int32)



# Put images through CNN

# Create the Estimator
classifier = tf.estimator.Estimator(model_fn=cnn_model_fn, model_dir="models/cnn")

# Set up logging for predictions
tensors_to_log = {"probabilities": "softmax_tensor"}
logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=1)

# Train the model
train_input_fn = tf.estimator.inputs.numpy_input_fn(x={"images":train_data_numpy}, y=train_labels_numpy, batch_size=1, num_epochs=None, shuffle=True)
classifier.train(input_fn=train_input_fn, steps=200, hooks=[logging_hook])

# Evaluate the model and print results
eval_input_fn = tf.estimator.inputs.numpy_input_fn(x={"images":test_data_numpy}, y=test_labels_numpy, num_epochs=1, shuffle=False)
eval_results = classifier.evaluate(input_fn=eval_input_fn)
print(eval_results)

Я использую Python 2.7.12 в Ubuntu 16.04. Мы будем очень признательны за любое понимание того, почему происходит эта потеря NaN.


person The Impossible Squish    schedule 18.05.2018    source источник


Ответы (1)


Решение найдено! Оказывается, предыдущие контрольные точки модели конфликтовали с текущим сеансом обучения, поэтому я удалил все в папке, в которой моя модель сохраняет контрольные точки, и теперь она тренируется без ошибок потери NaN.

person The Impossible Squish    schedule 20.05.2018