TensorFlow: problem with placeholders and summaries

I modified the existing cifar10 example so that it works as a siamese network, but I am running into some difficulties while training it.

The changes I made:

  • a placeholder instead of the input queue
  • a custom loss function (a contrastive-style loss; see the formula right after this list)
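
For reference, the custom loss defined in cifar10.loss below is a contrastive-style objective on the squared Euclidean distance between the two embeddings. Written out (assuming the pair label y takes the values 0 and 1), it is roughly

$$
L \;=\; \frac{1}{2N}\sum_{i=1}^{N}\Big[(1 - y_i)^2\, d_i^2 \;+\; y_i\,\max(1 - d_i^2,\ 0)\Big],
\qquad d_i^2 = \lVert f^{(1)}_i - f^{(2)}_i \rVert_2^2
$$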

Here is my modified cifar10_train.py:

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from datetime import datetime
import os.path
import time
import input_data
import numpy as np
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

import cifar10

FLAGS = tf.app.flags.FLAGS

tf.app.flags.DEFINE_string('train_dir', 'tmp/cifar10_train',
                           """Directory where to write event logs """
                           """and checkpoint.""")
tf.app.flags.DEFINE_integer('max_steps', 1000000,
                            """Number of batches to run.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
                            """Whether to log device placement.""")


def train():
  """Train CIFAR-10 for a number of steps."""
  dataset = input_data.read()
  image, image_p, label = dataset.train_dataset
  image_size  = dataset.image_size
  batch_size = 28
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    images = tf.placeholder(tf.float32, shape=(batch_size, image_size[0], image_size[1], image_size[2]))
    images2 = tf.placeholder(tf.float32, shape=(batch_size, image_size[0], image_size[1], image_size[2]))
    labels = tf.placeholder(tf.float32, shape=(batch_size))
    tf.image_summary('images', images)
    tf.image_summary('images2', images2)
    # Build a Graph that computes the logits predictions from the
    # inference model.
    with tf.variable_scope('inference') as scope:
      logits = cifar10.inference(images)
      scope.reuse_variables()
      logits2 = cifar10.inference(images2)

    # Calculate loss.
    loss = cifar10.loss(logits, logits2, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                            graph_def=sess.graph_def)

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      offset = (step * batch_size) % (dataset.train_samples - batch_size)
      _, loss_value = sess.run(
          [train_op, loss],
          feed_dict={images: image[offset:(offset + batch_size)],
                     images2: image_p[offset:(offset + batch_size)],
                     labels: 1.0 * label[offset:(offset + batch_size)]})
      duration = time.time() - start_time

      print(loss_value)
      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)


def main(argv=None):  # pylint: disable=unused-argument
  train()


if __name__ == '__main__':
  tf.app.run()

The modified cifar10.py:

"""Builds the CIFAR-10 network.

Summary of available functions:

 # Compute input images and labels for training. If you would like to run
 # evaluations, use inputs() instead.
 inputs, labels = distorted_inputs()

 # Compute inference on the model inputs to make a prediction.
 predictions = inference(inputs)

 # Compute the total loss of the prediction with respect to the labels.
 loss = loss(predictions, labels)

 # Create a graph to run one step of training with respect to the loss.
 train_op = train(loss, global_step)
"""
# pylint: disable=missing-docstring
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import gzip
import os
import re
import sys
import tarfile

from six.moves import urllib
import tensorflow as tf

import input_data

FLAGS = tf.app.flags.FLAGS

# Basic model parameters.
tf.app.flags.DEFINE_integer('batch_size', 28,
                            """Number of images to process in a batch.""")
tf.app.flags.DEFINE_string('data_dir_p', '/tmp/cifar10_data',
                           """Path to the CIFAR-10 data directory.""")

# Global constants describing the CIFAR-10 data set.
# IMAGE_SIZE = cifar10_input.IMAGE_SIZE
# NUM_CLASSES = cifar10_input.NUM_CLASSES
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = input_data.train_samples
# NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL


# Constants describing the training process.
MOVING_AVERAGE_DECAY = 0.9999     # The decay to use for the moving average.
NUM_EPOCHS_PER_DECAY = 350.0      # Epochs after which learning rate decays.
LEARNING_RATE_DECAY_FACTOR = 0.1  # Learning rate decay factor.
INITIAL_LEARNING_RATE = 0.001       # Initial learning rate.
Q = 360.6244
# If a model is trained with multiple GPU's prefix all Op names with tower_name
# to differentiate the operations. Note that this prefix is removed from the
# names of the summaries when visualizing a model.
TOWER_NAME = 'tower'

DATA_URL = 'http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz'


def _activation_summary(x):
  # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
  # session. This helps the clarity of presentation on tensorboard.
  tensor_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', x.op.name)
  tf.histogram_summary(tensor_name + '/activations', x)
  tf.scalar_summary(tensor_name + '/sparsity', tf.nn.zero_fraction(x))


def _variable_on_cpu(name, shape, initializer):

  with tf.device('/cpu:0'):
    var = tf.get_variable(name, shape, initializer=initializer)
  return var


def _variable_with_weight_decay(name, shape, stddev, wd):

  var = _variable_on_cpu(name, shape, tf.truncated_normal_initializer(stddev=stddev))
  if wd:
    weight_decay = tf.mul(tf.nn.l2_loss(var), wd, name='weight_loss')
    tf.add_to_collection('losses', weight_decay)
  return var

def inference(data):

  # We instantiate all variables using tf.get_variable() instead of
  # tf.Variable() in order to share variables across multiple GPU training runs.
  # If we only ran this model on a single GPU, we could simplify this function
  # by replacing all instances of tf.get_variable() with tf.Variable().
  #
  # conv1
  with tf.variable_scope('conv1') as scope:
    kernel = _variable_with_weight_decay('weights', shape=[5, 5, 1, 20],
                                         stddev=0.1, wd=0.0)

    conv = tf.nn.conv2d(data, kernel, [1, 1, 1, 1], padding='VALID')
    biases = _variable_on_cpu('biases', [20], tf.constant_initializer(0.0))
    conv1 = tf.nn.bias_add(conv, biases)
    _activation_summary(conv1)

  # pool1
  pool1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                         padding='VALID', name='pool1')

  # conv2
  with tf.variable_scope('conv2') as scope:
    kernel = _variable_with_weight_decay('weights', shape=[5, 5, 20, 50],
                                         stddev=0.1, wd=0.0)
    conv = tf.nn.conv2d(pool1, kernel, [1, 1, 1, 1], padding='VALID')
    biases = _variable_on_cpu('biases', [50], tf.constant_initializer(0.0))
    conv2 = tf.nn.bias_add(conv, biases)
    _activation_summary(conv2)

  # pool2
  pool2 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1],
                         strides=[1, 2, 2, 1], padding='VALID', name='pool2')

  # local3
  with tf.variable_scope('local3') as scope:
    # Move everything into depth so we can perform a single matrix multiply.
    dim = 1
    for d in pool2.get_shape()[1:].as_list():
      dim *= d
    reshape = tf.reshape(pool2, [pool2.get_shape()[0:].as_list()[0], dim])

    weights = _variable_with_weight_decay('weights', shape=[dim, 500],
                                          stddev=0.1, wd=0.0)
    biases = _variable_on_cpu('biases', [500], tf.constant_initializer(0.10))
    local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases, name=scope.name)
    _activation_summary(local3)

  # local4
  with tf.variable_scope('local4') as scope:
    weights = _variable_with_weight_decay('weights', shape=[500, 10],
                                          stddev=0.1, wd=0.0)
    biases = _variable_on_cpu('biases', [10], tf.constant_initializer(0.0))
    local4 = tf.add(tf.matmul(local3, weights), biases, name=scope.name)
    _activation_summary(local4)

  #local5
  with tf.variable_scope('local5') as scope:
    weights = _variable_with_weight_decay('weights', [10, 10],
                                          stddev=0.1, wd=0.0)
    biases = _variable_on_cpu('biases', [10],
                              tf.constant_initializer(0.0))
    local5 = tf.add(tf.matmul(local4, weights), biases, name=scope.name)
    _activation_summary(local5)

  return local5


def loss(features1, features2, labels):
  # Contrastive-style loss on the squared Euclidean distance between the two
  # embeddings. Given how the two terms are weighted, the labels appear to be
  # 0 for similar pairs and 1 for dissimilar pairs.
  energy_square = tf.reduce_sum(tf.pow(tf.sub(features1, features2), 2), 1)
  loss = tf.add(tf.mul(tf.pow(tf.sub(labels, 1), 2), energy_square),
                tf.mul(labels, tf.maximum(tf.sub(1.0, energy_square), 0)))
  loss = tf.reduce_sum(loss) / features1.get_shape()[0:].as_list()[0] / 2
  # Calculate the average cross entropy loss across the batch.
  # labels = tf.cast(labels, tf.int64)
  # cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
      # logits, labels, name='cross_entropy_per_example')
  # cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
  tf.add_to_collection('losses', loss)

  # The total loss is defined as the cross entropy loss plus all of the weight
  # decay terms (L2 loss).
  return tf.add_n(tf.get_collection('losses'), name='total_loss')


def _add_loss_summaries(total_loss):

  # Compute the moving average of all individual losses and the total loss.
  loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
  losses = tf.get_collection('losses')
  loss_averages_op = loss_averages.apply(losses + [total_loss])

  # Attach a scalar summary to all individual losses and the total loss; do the
  # same for the averaged version of the losses.
  for l in losses + [total_loss]:
    # Name each loss as '(raw)' and name the moving average version of the loss
    # as the original loss name.
    tf.scalar_summary(l.op.name +' (raw)', l)
    tf.scalar_summary(l.op.name, loss_averages.average(l))

  return loss_averages_op


def train(total_loss, global_step):

  num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size
  decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

  # Decay the learning rate exponentially based on the number of steps.
  lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                  global_step,
                                  decay_steps,
                                  LEARNING_RATE_DECAY_FACTOR,
                                  staircase=True)
  tf.scalar_summary('learning_rate', lr)

  # Generate moving averages of all losses and associated summaries.
  loss_averages_op = _add_loss_summaries(total_loss)

  # Compute gradients.
  with tf.control_dependencies([loss_averages_op]):
    opt = tf.train.GradientDescentOptimizer(lr)
    grads = opt.compute_gradients(total_loss)


  # Apply gradients.
  apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

  # Add histograms for trainable variables.
  for var in tf.trainable_variables():
    tf.histogram_summary(var.op.name, var)

  # Add histograms for gradients.
  for grad, var in grads:
    if grad is not None:
      tf.histogram_summary(var.op.name + '/gradients', grad)

  # Track the moving averages of all trainable variables.
  variable_averages = tf.train.ExponentialMovingAverage(
      MOVING_AVERAGE_DECAY, global_step)
  variables_averages_op = variable_averages.apply(tf.trainable_variables())

  with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
    train_op = tf.no_op(name='train')

  return train_op

The error I get:

2016-03-01 15:56:59.483682: step 0, loss = 0.22 (9.7 examples/sec; 2.896 sec/batch)
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Invalid argument: You must feed a value for placeholder tensor 'Placeholder' with dtype float and shape [28,112,92,1]
     [[Node: Placeholder = Placeholder[dtype=DT_FLOAT, shape=[28,112,92,1], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary
     [[Node: HistogramSummary = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary/tag, inference/conv1/weights/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_1
     [[Node: HistogramSummary_1 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_1/tag, inference/conv1/biases/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Invalid argument: You must feed a value for placeholder tensor 'Placeholder_2' with dtype float and shape [28]
     [[Node: Placeholder_2 = Placeholder[dtype=DT_FLOAT, shape=[28], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_3
     [[Node: HistogramSummary_3 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_3/tag, inference/conv2/biases/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_2
     [[Node: HistogramSummary_2 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_2/tag, inference/conv2/weights/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_4
     [[Node: HistogramSummary_4 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_4/tag, inference/local3/weights/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_5
     [[Node: HistogramSummary_5 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_5/tag, inference/local3/biases/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_6
     [[Node: HistogramSummary_6 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_6/tag, inference/local4/weights/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_7
     [[Node: HistogramSummary_7 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_7/tag, inference/local4/biases/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_8
     [[Node: HistogramSummary_8 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_8/tag, inference/local5/weights/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_9
     [[Node: HistogramSummary_9 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_9/tag, inference/local5/biases/read)]]
Traceback (most recent call last):
  File "cifar10_train.py", line 110, in <module>
    tf.app.run()
  File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/platform/default/_app.py", line 30, in run
    sys.exit(main(sys.argv))
  File "cifar10_train.py", line 106, in main
    train()
  File "cifar10_train.py", line 95, in train
    summary_str = sess.run(summary_op)
  File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 315, in run
    return self._run(None, fetches, feed_dict)
  File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 511, in _run
    feed_dict_string)
  File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 564, in _do_run
    target_list)
  File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 586, in _do_call
    e.code)
tensorflow.python.framework.errors.InvalidArgumentError: You must feed a value for placeholder tensor 'Placeholder' with dtype float and shape [28,112,92,1]
     [[Node: Placeholder = Placeholder[dtype=DT_FLOAT, shape=[28,112,92,1], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
Caused by op u'Placeholder', defined at:
  File "cifar10_train.py", line 110, in <module>
    tf.app.run()
  File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/platform/default/_app.py", line 30, in run
    sys.exit(main(sys.argv))
  File "cifar10_train.py", line 106, in main
    train()
  File "cifar10_train.py", line 36, in train
    images = tf.placeholder(tf.float32, shape=(batch_size, image_size[0], image_size[1], image_size[2]))
  File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/ops/array_ops.py", line 742, in placeholder
    name=name)
  File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/ops/gen_array_ops.py", line 583, in _placeholder
    name=name)
  File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/ops/op_def_library.py", line 655, in apply_op
    op_def=op_def)
  File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2040, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1087, in __init__
    self._traceback = _extract_stack()

Also, when I comment out merge_all_summaries(), the model diverges with loss = NaN.


asked by user3733814 on 01.03.2016


Answers (1)


The problem here is that some of the summaries in your graph, collected by tf.merge_all_summaries(), depend on your placeholders. For example, the code in cifar10.py creates summaries for the various activations on each step, and those depend on the training example being used.

The solution is to feed the same training batch when you evaluate summary_op:

if step % 100 == 0:
  summary_str = sess.run(summary_op, feed_dict={
      images: image[offset:(offset + batch_size)],
      images2: image_p[offset:(offset + batch_size)],
      labels: 1.0 * label[offset:(offset + batch_size)]})

While this would be the smallest modification to your original code, it is somewhat inefficient, because it will re-execute the training step every 100 steps. The best way to address this (although it would require some restructuring of your training loop) is to fetch the summaries in the same call to sess.run() that performs a training step:

if step % 100 == 0:
  _, loss_value, summary_str = sess.run([train_op, loss, summary_op], feed_dict={
      images: image[offset:(offset + batch_size)],
      images2: image_p[offset:(offset + batch_size)],
      labels: 1.0 * label[offset:(offset + batch_size)]})
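
For completeness, a minimal sketch of the restructured loop (reusing the same variables as in cifar10_train.py above, and building the feed_dict once per step) might look like this:

for step in xrange(FLAGS.max_steps):
  start_time = time.time()
  offset = (step * batch_size) % (dataset.train_samples - batch_size)
  feed = {images: image[offset:(offset + batch_size)],
          images2: image_p[offset:(offset + batch_size)],
          labels: 1.0 * label[offset:(offset + batch_size)]}
  if step % 100 == 0:
    # Run the training step and fetch the summaries in a single call.
    _, loss_value, summary_str = sess.run([train_op, loss, summary_op],
                                          feed_dict=feed)
    summary_writer.add_summary(summary_str, step)
  else:
    _, loss_value = sess.run([train_op, loss], feed_dict=feed)
  duration = time.time() - start_time
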
answered by mrry on 01.03.2016