Mask RCNN training on a custom dataset hangs

I'm trying to train a Mask RCNN model on a custom dataset. The dataset I'm using for testing is the kangaroo dataset from https://github.com/experiencor/kangaroo, and I'm essentially following this example: https://github.com/matterport/Mask_RCNN/tree/master/samples/balloon.

I rewrote parts of the code to match the dataset (which is annotated with bounding boxes rather than polygons). I renamed balloon.py to kangaroo.py; only the changes I made are shown below:

# additional imports needed for these changes (balloon.py already imports os, numpy, skimage.draw, Config and utils)
import xml.etree.ElementTree as ET
from glob import glob

from sklearn.model_selection import train_test_split


def xml_annotation_to_dict(src):
    """Takes a source file (xml) and returns a dictionary

    Parameters
    ----------
    src : string
        Path to xml file

    Returns
    -------
    dict

    Example
    -------
    >>> xml_annotation_to_dict('path/to/annotation/file.xml')
    """
    tree = ET.parse(src)
    root = tree.getroot()

    annotation_data = {}
    annotation_data['source'] = src
    annotation_data['image_filename'] = list(root.iterfind('filename'))[0].text
    annotation_data['image'] = {
        'width': int(list(root.iterfind('size/width'))[0].text),
        'height': int(list(root.iterfind('size/height'))[0].text),
        'depth': int(list(root.iterfind('size/depth'))[0].text)
    }
    annotation_data['annotations'] = []
    for annotation in root.iterfind('object'):
        annotation_data['annotations'].append({
            'name': list(annotation.iterfind('name'))[0].text,
            'bbox': {
                'xmin': int(list(annotation.iterfind('bndbox/xmin'))[0].text),
                'ymin': int(list(annotation.iterfind('bndbox/ymin'))[0].text),
                'width': int(list(annotation.iterfind('bndbox/xmax'))[0].text) - int(list(annotation.iterfind('bndbox/xmin'))[0].text),
                'height': int(list(annotation.iterfind('bndbox/ymax'))[0].text) - int(list(annotation.iterfind('bndbox/ymin'))[0].text)
            }
        })
    return annotation_data
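
# For reference, the kangaroo annotations are Pascal VOC style XML; a shortened,
# illustrative file looks like this (values are made up):
#
#   <annotation>
#     <filename>00001.jpg</filename>
#     <size><width>450</width><height>319</height><depth>3</depth></size>
#     <object>
#       <name>kangaroo</name>
#       <bndbox><xmin>233</xmin><ymin>89</ymin><xmax>386</xmax><ymax>262</ymax></bndbox>
#     </object>
#   </annotation>
#
# which xml_annotation_to_dict turns into:
#
#   {'source': 'path/to/annotation/file.xml',
#    'image_filename': '00001.jpg',
#    'image': {'width': 450, 'height': 319, 'depth': 3},
#    'annotations': [{'name': 'kangaroo',
#                     'bbox': {'xmin': 233, 'ymin': 89, 'width': 153, 'height': 173}}]}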

class CustomConfig(Config):
    """Configuration for training on the toy  dataset.
    Derives from the base Config class and overrides some values.
    """
    # Give the configuration a recognizable name
    NAME = 'Custom'

    # We use a GPU with 12GB memory, which can fit two images.
    # GPU_COUNT = 0
    # I've set this to 1 and that doesn't seem to work

    # Adjust down if you use a smaller GPU.
    # I've set this to 2 as well, didn't change much
    IMAGES_PER_GPU = 1

    # Number of classes (including background)
    NUM_CLASSES = 1 + 1  # Background + kangaroo

    # Number of training steps per epoch. I also tried lowering this from 100 to 20 to shorten the test; it didn't change much
    STEPS_PER_EPOCH = 100

    # Skip detections with < 80% confidence
    DETECTION_MIN_CONFIDENCE = 0.8


class KangarooDataset(utils.Dataset):
    def load_kangaroo(self, dataset_dir, subset):
        # Add classes. We have only one class to add.
        self.add_class("kangaroo", 1, "kangaroo")

        # Train or validation dataset?
        assert subset in ["train", "val"]

        # load all filepaths and split deterministically
        # using sklearn's train_test_split method
        # assume files to be split over "annots" and "images"
        # directories
        # sorted so image/annotation pairs line up positionally and the split is reproducible
        images_paths = sorted(glob(os.path.join(dataset_dir, 'images', '*.jpg')))
        annotations_paths = sorted(glob(os.path.join(dataset_dir, 'annots', '*.xml')))

        images_train_paths, \
        images_test_paths, \
        annotations_train_paths, \
        annotations_test_paths = train_test_split(
            images_paths,
            annotations_paths,
            train_size=0.7,
            test_size=0.3,
            random_state=0)

        if subset == 'train':
            paths = annotations_train_paths
        else:
            # subset == 'val', meaning the test split
            paths = annotations_test_paths

        # add a dictionary per image containing image info and annotations
        for annotation_path in paths:
            annotation_data = xml_annotation_to_dict(annotation_path)

            # create a list of polygons per annotation file (one rectangle per bounding box)
            polygons = []
            for annotation in annotation_data['annotations']:
                ymin = annotation['bbox']['ymin']
                ymax = ymin + annotation['bbox']['height']
                xmin = annotation['bbox']['xmin']
                xmax = xmin + annotation['bbox']['width']
                # corners in order: top-left, top-right, bottom-right, bottom-left
                ys = [ymin, ymin, ymax, ymax]
                xs = [xmin, xmax, xmax, xmin]
                polygons.append({
                    'ys': ys,
                    'xs': xs
                })

            self.add_image(
                'kangaroo',
                image_id=annotation_data['image_filename'],
                path=os.path.join(dataset_dir, 'images', annotation_data['image_filename']),  # use dataset_dir rather than a hard-coded 'data'
                width=annotation_data['image']['width'],
                height=annotation_data['image']['height'],
                polygons=polygons
            )


    def load_mask(self, image_id):
        """Generate instance masks for an image.
    Returns:
        masks: A bool array of shape [height, width, instance count] with
            one mask per instance.
        class_ids: a 1D array of class IDs of the instance masks.
        """
        # If not a kangaroo dataset image, delegate to parent class.
        image_info = self.image_info[image_id]
        if image_info["source"] != "kangaroo":
            return super(self.__class__, self).load_mask(image_id)

        # Convert polygons to a bitmap mask of shape
        # [height, width, instance_count]
        # info = self.image_info[image_id]
        mask = np.zeros([
            image_info["height"],
            image_info["width"],
            len(image_info["polygons"])],
                        dtype=np.uint8)
        for i, p in enumerate(image_info["polygons"]):
            # Get indexes of pixels inside the polygon and set them to 1
            rr, cc = skimage.draw.polygon(p['ys'], p['xs'])
            mask[rr, cc, i] = 1

        # Return mask, and array of class IDs of each instance. Since we have
        # one class ID only, we return an array of 1s
        return mask.astype(np.bool), np.ones([mask.shape[-1]], dtype=np.int32)

    def image_reference(self, image_id):
        """Return the path of the image."""
        info = self.image_info[image_id]
        if info["source"] == "kangaroo":
            return info["path"]
        else:
            return super(self.__class__, self).image_reference(image_id)
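
For reference, a minimal sanity check of the dataset class could look like the sketch below (it assumes kangaroo.py is importable and is run from the project root so the relative data paths resolve):

from kangaroo import KangarooDataset

dataset = KangarooDataset()
dataset.load_kangaroo('data', 'train')
dataset.prepare()

# make sure an image and a mask can be produced for every registered id
for image_id in dataset.image_ids:
    image = dataset.load_image(image_id)
    mask, class_ids = dataset.load_mask(image_id)
    print(image_id, image.shape, mask.shape, class_ids)

If this loop completes quickly, the dataset plumbing itself is unlikely to be what is blocking training.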

I tried to run this with:

python kangaroo.py train --dataset data/ --weights coco --logs logs/

My folder structure:

.
├── data
│   ├── annots
│   ├── coco
│   └── images
├── kangaroo.py
├── logs
└── requirements.txt

I tried running this on my Mac's CPU (2018 15-inch model with a 6-core i7), and it appears to run, but although the CPU is working hard, with the fans sounding like the machine is about to take off, the terminal stays stuck on the first epoch. I've let it run for about an hour to 90 minutes and the terminal remains at:

Epoch 1/100

I thought it might be a hardware issue, so I moved to a cloud VM (an Azure NC6 with an Nvidia Tesla K80 GPU and 12 GB of memory). When I run the model there, it still gets stuck on the first epoch (after, I'd say, at least 20 to 30 minutes).
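
To rule out TensorFlow silently falling back to the CPU, a quick check like the following can be run in the same environment (a sketch using the TF 1.x API that matches the log below):

import tensorflow as tf
from tensorflow.python.client import device_lib

print(tf.test.is_gpu_available())                          # True if CUDA/cuDNN are picked up
print([d.name for d in device_lib.list_local_devices()])   # should include '/device:GPU:0'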

At first it looked like it couldn't use the GPU because the NVIDIA driver stack provided CUDA 10.1 while TensorFlow was looking for 10.0. I downgraded to 10.0 and the GPU now seems to be picked up, but it still hangs at

Epoch 1/100

The output I get is:

Using TensorFlow backend.
Weights:  coco
Dataset:  data/
Logs:  logs/

Configurations:
BACKBONE                       resnet101
BACKBONE_STRIDES               [4, 8, 16, 32, 64]
BATCH_SIZE                     2
BBOX_STD_DEV                   [0.1 0.1 0.2 0.2]
COMPUTE_BACKBONE_SHAPE         None
DETECTION_MAX_INSTANCES        100
DETECTION_MIN_CONFIDENCE       0.8
DETECTION_NMS_THRESHOLD        0.3
FPN_CLASSIF_FC_LAYERS_SIZE     1024
GPU_COUNT                      1
GRADIENT_CLIP_NORM             5.0
IMAGES_PER_GPU                 2
IMAGE_CHANNEL_COUNT            3
IMAGE_MAX_DIM                  1024
IMAGE_META_SIZE                14
IMAGE_MIN_DIM                  800
IMAGE_MIN_SCALE                0
IMAGE_RESIZE_MODE              square
IMAGE_SHAPE                    [1024 1024    3]
LEARNING_MOMENTUM              0.9
LEARNING_RATE                  0.001
LOSS_WEIGHTS                   {'rpn_class_loss': 1.0, 'rpn_bbox_loss': 1.0, 'mrcnn_class_loss': 1.0, 'mrcnn_bbox_loss': 1.0, 'mrcnn_mask_loss': 1.0}
MASK_POOL_SIZE                 14
MASK_SHAPE                     [28, 28]
MAX_GT_INSTANCES               100
MEAN_PIXEL                     [123.7 116.8 103.9]
MINI_MASK_SHAPE                (56, 56)
NAME                           Custom
NUM_CLASSES                    2
POOL_SIZE                      7
POST_NMS_ROIS_INFERENCE        1000
POST_NMS_ROIS_TRAINING         2000
PRE_NMS_LIMIT                  6000
ROI_POSITIVE_RATIO             0.33
RPN_ANCHOR_RATIOS              [0.5, 1, 2]
RPN_ANCHOR_SCALES              (32, 64, 128, 256, 512)
RPN_ANCHOR_STRIDE              1
RPN_BBOX_STD_DEV               [0.1 0.1 0.2 0.2]
RPN_NMS_THRESHOLD              0.7
RPN_TRAIN_ANCHORS_PER_IMAGE    256
STEPS_PER_EPOCH                100
TOP_DOWN_PYRAMID_SIZE          256
TRAIN_BN                       False
TRAIN_ROIS_PER_IMAGE           200
USE_MINI_MASK                  True
USE_RPN_ROIS                   True
VALIDATION_STEPS               50
WEIGHT_DECAY                   0.0001


WARNING:tensorflow:From /data/anaconda/envs/mrcnn/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:442: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

WARNING:tensorflow:From /data/anaconda/envs/mrcnn/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:58: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

WARNING:tensorflow:From /data/anaconda/envs/mrcnn/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3543: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

WARNING:tensorflow:From /data/anaconda/envs/mrcnn/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3386: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.

WARNING:tensorflow:From /data/anaconda/envs/mrcnn/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:1768: The name tf.image.resize_nearest_neighbor is deprecated. Please use tf.compat.v1.image.resize_nearest_neighbor instead.

WARNING:tensorflow:From /data/anaconda/envs/mrcnn/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:1154: calling reduce_max_v1 (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
WARNING:tensorflow:From /data/anaconda/envs/mrcnn/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:1188: calling reduce_sum_v1 (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
WARNING:tensorflow:From /data/anaconda/envs/mrcnn/lib/python3.7/site-packages/tensorflow_core/python/ops/array_ops.py:1475: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
WARNING:tensorflow:From /data/anaconda/envs/mrcnn/lib/python3.7/site-packages/mask_rcnn-2.1-py3.7.egg/mrcnn/model.py:553: The name tf.random_shuffle is deprecated. Please use tf.random.shuffle instead.

WARNING:tensorflow:From /data/anaconda/envs/mrcnn/lib/python3.7/site-packages/mask_rcnn-2.1-py3.7.egg/mrcnn/utils.py:202: The name tf.log is deprecated. Please use tf.math.log instead.

WARNING:tensorflow:From /data/anaconda/envs/mrcnn/lib/python3.7/site-packages/mask_rcnn-2.1-py3.7.egg/mrcnn/model.py:600: calling crop_and_resize_v1 (from tensorflow.python.ops.image_ops_impl) with box_ind is deprecated and will be removed in a future version.
Instructions for updating:
box_ind is deprecated, use box_indices instead
Loading weights  data/coco/mask_rcnn_coco.h5
WARNING:tensorflow:From /data/anaconda/envs/mrcnn/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:153: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

WARNING:tensorflow:From /data/anaconda/envs/mrcnn/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:158: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.

WARNING:tensorflow:From /data/anaconda/envs/mrcnn/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:163: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.

2019-12-30 15:31:50.050404: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2019-12-30 15:31:50.057727: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2596990000 Hz
2019-12-30 15:31:50.058553: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55d9bb43b7f0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2019-12-30 15:31:50.058580: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2019-12-30 15:31:50.061043: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2019-12-30 15:31:54.571132: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55d9bb50c1d0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2019-12-30 15:31:54.571172: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Tesla K80, Compute Capability 3.7
2019-12-30 15:31:54.571963: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1618] Found device 0 with properties:
name: Tesla K80 major: 3 minor: 7 memoryClockRate(GHz): 0.8235
pciBusID: f923:00:00.0
2019-12-30 15:31:54.572242: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2019-12-30 15:31:54.573457: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10.0
2019-12-30 15:31:54.574598: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10.0
2019-12-30 15:31:54.574896: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10.0
2019-12-30 15:31:54.576375: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10.0
2019-12-30 15:31:54.577490: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10.0
2019-12-30 15:31:54.579974: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2019-12-30 15:31:54.581303: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1746] Adding visible gpu devices: 0
2019-12-30 15:31:54.581360: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2019-12-30 15:31:54.584499: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1159] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-12-30 15:31:54.584536: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1165]      0
2019-12-30 15:31:54.584550: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1178] 0:   N
2019-12-30 15:31:54.585941: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1304] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10805 MB memory) -> physical GPU (device: 0, name: Tesla K80, pci bus id: f923:00:00.0, compute capability: 3.7)
WARNING:tensorflow:From /data/anaconda/envs/mrcnn/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:333: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.

WARNING:tensorflow:From /data/anaconda/envs/mrcnn/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:341: The name tf.variables_initializer is deprecated. Please use tf.compat.v1.variables_initializer instead.

Training network heads

Starting at epoch 0. LR=0.001

Checkpoint Path: logs/custom20191230T1531/mask_rcnn_custom_{epoch:04d}.h5
Selecting layers to train
fpn_c5p5               (Conv2D)
fpn_c4p4               (Conv2D)
fpn_c3p3               (Conv2D)
fpn_c2p2               (Conv2D)
fpn_p5                 (Conv2D)
fpn_p2                 (Conv2D)
fpn_p3                 (Conv2D)
fpn_p4                 (Conv2D)
In model:  rpn_model
    rpn_conv_shared        (Conv2D)
    rpn_class_raw          (Conv2D)
    rpn_bbox_pred          (Conv2D)
mrcnn_mask_conv1       (TimeDistributed)
mrcnn_mask_bn1         (TimeDistributed)
mrcnn_mask_conv2       (TimeDistributed)
mrcnn_mask_bn2         (TimeDistributed)
mrcnn_class_conv1      (TimeDistributed)
mrcnn_class_bn1        (TimeDistributed)
mrcnn_mask_conv3       (TimeDistributed)
mrcnn_mask_bn3         (TimeDistributed)
mrcnn_class_conv2      (TimeDistributed)
mrcnn_class_bn2        (TimeDistributed)
mrcnn_mask_conv4       (TimeDistributed)
mrcnn_mask_bn4         (TimeDistributed)
mrcnn_bbox_fc          (TimeDistributed)
mrcnn_mask_deconv      (TimeDistributed)
mrcnn_class_logits     (TimeDistributed)
mrcnn_mask             (TimeDistributed)
WARNING:tensorflow:From /data/anaconda/envs/mrcnn/lib/python3.7/site-packages/keras/optimizers.py:711: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

/data/anaconda/envs/mrcnn/lib/python3.7/site-packages/tensorflow_core/python/framework/indexed_slices.py:424: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
/data/anaconda/envs/mrcnn/lib/python3.7/site-packages/tensorflow_core/python/framework/indexed_slices.py:424: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
/data/anaconda/envs/mrcnn/lib/python3.7/site-packages/tensorflow_core/python/framework/indexed_slices.py:424: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
WARNING:tensorflow:From /data/anaconda/envs/mrcnn/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:899: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.

WARNING:tensorflow:From /data/anaconda/envs/mrcnn/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:625: calling Constant.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
WARNING:tensorflow:From /data/anaconda/envs/mrcnn/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:886: The name tf.assign is deprecated. Please use tf.compat.v1.assign instead.

WARNING:tensorflow:From /data/anaconda/envs/mrcnn/lib/python3.7/site-packages/keras/callbacks.py:705: The name tf.summary.merge_all is deprecated. Please use tf.compat.v1.summary.merge_all instead.

WARNING:tensorflow:From /data/anaconda/envs/mrcnn/lib/python3.7/site-packages/keras/callbacks.py:708: The name tf.summary.FileWriter is deprecated. Please use tf.compat.v1.summary.FileWriter instead.

/data/anaconda/envs/mrcnn/lib/python3.7/site-packages/keras/engine/training.py:1987: UserWarning: Using a generator with `use_multiprocessing=True` and multiple workers may duplicate your data. Please consider using the`keras.utils.Sequence class.
  UserWarning('Using a generator with `use_multiprocessing=True`'
Epoch 1/100

I looked at the GPU stats (nvidia-smi), and it seems the GPU has allocated almost all of its memory (say 95% of the 12 GB), but utilization stays at 0%. This leads me to believe the model isn't actually running and has somehow hung.
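
One thing that stands out in the log above is the UserWarning about using a generator with use_multiprocessing=True and multiple workers; on some setups the worker processes Keras spawns for fit_generator can deadlock before the first batch is produced, which would match the picture of memory being allocated but utilization staying at 0%. A way to separate "the generator hangs" from "training is just slow" is to pull one batch from the data generator directly, roughly like this (a sketch; it assumes the installed Mask_RCNN exposes the module-level data_generator in mrcnn/model.py, as the 2.1 egg in the log should):

import mrcnn.model as modellib
from kangaroo import CustomConfig, KangarooDataset

config = CustomConfig()
dataset = KangarooDataset()
dataset.load_kangaroo('data', 'train')
dataset.prepare()

# if next() returns promptly, the data pipeline is fine and the hang is elsewhere
gen = modellib.data_generator(dataset, config, shuffle=True, batch_size=config.BATCH_SIZE)
inputs, outputs = next(gen)
print([x.shape for x in inputs])

A workaround often suggested for this symptom is to disable multiprocessing in the fit_generator call inside mrcnn/model.py (use_multiprocessing=False, workers=1), though whether that applies here is not confirmed.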

Am I just being impatient, and does a single epoch really take more than 30 minutes on a GPU (and if so, why is GPU utilization so low?), or is it actually hanging?

Any help is much appreciated!


person Yorian    schedule 31.12.2019
comment
God, I have exactly the same problem. I'm new to ML, so I have no idea what's going on. Any luck after 10 months?   -  person Neekey    schedule 22.11.2020
comment
Hi @Neekey, we got it working in the end, but I can't remember what we changed. What does your code look like?   -  person Yorian    schedule 23.11.2020


Answers (1)


You can use Detectron2 to train your model.

Here is a link to the Colab tutorial. Note that your annotations need to be in COCO format to train with Detectron2.
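
For example, a minimal Detectron2 training setup could look like the sketch below. The dataset names, file paths and solver settings are placeholders, and it assumes the kangaroo annotations have already been converted to a COCO JSON file:

from detectron2 import model_zoo
from detectron2.config import get_cfg
from detectron2.data.datasets import register_coco_instances
from detectron2.engine import DefaultTrainer

# register the converted COCO-format annotations (paths are placeholders)
register_coco_instances("kangaroo_train", {}, "data/kangaroo_train.json", "data/images")
register_coco_instances("kangaroo_val", {}, "data/kangaroo_val.json", "data/images")

cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
cfg.DATASETS.TRAIN = ("kangaroo_train",)
cfg.DATASETS.TEST = ()
cfg.DATALOADER.NUM_WORKERS = 2
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1   # kangaroo only; background is handled implicitly
cfg.SOLVER.IMS_PER_BATCH = 2
cfg.SOLVER.BASE_LR = 0.00025
cfg.SOLVER.MAX_ITER = 1000

trainer = DefaultTrainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()

Note that mask training needs segmentation data in the COCO file; if only bounding boxes are available, the boxes can be written out as rectangular polygon segmentations, or mask prediction can be disabled via cfg.MODEL.MASK_ON = False.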

person Arun Sg    schedule 03.01.2020