Source code for tensorlayer.layers.convolution.deformable_conv

#! /usr/bin/python
# -*- coding: utf-8 -*-

import tensorflow as tf

import tensorlayer as tl
from tensorlayer import logging
from tensorlayer.decorators import deprecated_alias, private_method
from tensorlayer.layers.core import Layer

__all__ = [
    'DeformableConv2d',
]
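
# Implementation outline (a summary of build() and forward() below): an external
# convolution (``offset_layer``) predicts a (dy, dx) pair for each of the
# n = kh * kw kernel elements at every spatial location; the input is then
# bilinearly sampled at the regular grid shifted by these offsets, giving a
# (b, h, w, n, c) tensor; finally a (1, 1, n) 3D convolution collapses the n
# sampled values into the (b, h, w, n_filter) output.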


class DeformableConv2d(Layer):
    """The :class:`DeformableConv2d` class is a 2D deformable convolution layer from
    `Deformable Convolutional Networks <https://arxiv.org/abs/1703.06211>`__.

    Parameters
    ----------
    offset_layer : tf.Tensor
        To predict the offsets of the convolution operations.
        The shape is (batch size, input height, input width, 2 * (number of elements in the convolution kernel)),
        e.g. for a 3*3 kernel, the last dimension should be 18 (2*3*3).
    n_filter : int
        The number of filters.
    filter_size : tuple of int
        The filter size (height, width).
    act : activation function
        The activation function of this layer.
    padding : str
        The padding algorithm type: "SAME" or "VALID".
    W_init : initializer
        The initializer for the weight matrix.
    b_init : initializer or None
        The initializer for the bias vector. If None, skip biases.
    in_channels : int
        The number of in channels.
    name : str
        A unique layer name.

    Examples
    --------
    With TensorLayer

    >>> net = tl.layers.Input([5, 10, 10, 16], name='input')
    >>> offset1 = tl.layers.Conv2d(
    ...     n_filter=18, filter_size=(3, 3), strides=(1, 1), padding='SAME', name='offset1'
    ... )(net)
    >>> deformconv1 = tl.layers.DeformableConv2d(
    ...     offset_layer=offset1, n_filter=32, filter_size=(3, 3), name='deformable1'
    ... )(net)
    >>> offset2 = tl.layers.Conv2d(
    ...     n_filter=18, filter_size=(3, 3), strides=(1, 1), padding='SAME', name='offset2'
    ... )(deformconv1)
    >>> deformconv2 = tl.layers.DeformableConv2d(
    ...     offset_layer=offset2, n_filter=64, filter_size=(3, 3), name='deformable2'
    ... )(deformconv1)

    References
    ----------
    - The deformation operation was adapted from the implementation
      `here <https://github.com/kastnerkyle/deform-conv>`__.

    Notes
    -----
    - The padding is fixed to 'SAME'.
    - The current implementation is not optimized for memory usage. Please use it carefully.

    """
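
    # Shape-check example: for a (3, 3) kernel, kernel_n = 9, so ``offset_layer``
    # must carry 2 * 9 = 18 channels (one (dy, dx) pair per kernel element);
    # __init__ raises an AssertionError otherwise.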
""" # @deprecated_alias(layer='prev_layer', end_support_version=1.9) # TODO remove this line for the 1.9 release def __init__( self, offset_layer=None, # shape=(3, 3, 1, 100), n_filter=32, filter_size=(3, 3), act=None, padding='SAME', W_init=tl.initializers.truncated_normal(stddev=0.02), b_init=tl.initializers.constant(value=0.0), in_channels=None, name=None # 'deformable_conv_2d', ): super().__init__(name, act=act) self.offset_layer = offset_layer self.n_filter = n_filter self.filter_size = filter_size self.padding = padding self.W_init = W_init self.b_init = b_init self.in_channels = in_channels self.kernel_n = filter_size[0] * filter_size[1] if self.offset_layer.get_shape()[-1] != 2 * self.kernel_n: raise AssertionError("offset.get_shape()[-1] is not equal to: %d" % 2 * self.kernel_n) logging.info( "DeformableConv2d %s: n_filter: %d, filter_size: %s act: %s" % ( self.name, self.n_filter, str(self.filter_size ), self.act.__name__ if self.act is not None else 'No Activation' ) ) # try: # pre_channel = int(prev_layer.outputs.get_shape()[-1]) # except Exception: # if pre_channel is ?, it happens when using Spatial Transformer Net # pre_channel = 1 # logging.info("[warnings] unknow input channels, set to 1") # shape = (filter_size[0], filter_size[1], pre_channel, n_filter) # with tf.compat.v1.variable_scope(name): # offset = self.offset_layer # .outputs # # # if offset.get_shape()[-1] != 2 * shape[0] * shape[1]: # # raise AssertionError("offset.get_shape()[-1] is not equal to: %d" % 2 * shape[0] * shape[1]) # # # Grid initialisation # input_h = int(self.inputs.get_shape()[1]) # input_w = int(self.inputs.get_shape()[2]) # # kernel_n = shape[0] * shape[1] # initial_offsets = tf.stack( # tf.meshgrid(tf.range(shape[0]), tf.range(shape[1]), indexing='ij') # ) # initial_offsets --> (kh, kw, 2) # initial_offsets = tf.reshape(initial_offsets, (-1, 2)) # initial_offsets --> (n, 2) # initial_offsets = tf.expand_dims(initial_offsets, 0) # initial_offsets --> (1, n, 2) # initial_offsets = tf.expand_dims(initial_offsets, 0) # initial_offsets --> (1, 1, n, 2) # initial_offsets = tf.tile(initial_offsets, [input_h, input_w, 1, 1]) # initial_offsets --> (h, w, n, 2) # initial_offsets = tf.cast(initial_offsets, 'float32') # grid = tf.meshgrid( # tf.range(-int((shape[0] - 1) / 2.0), int(input_h - int((shape[0] - 1) / 2.0)), 1), # tf.range(-int((shape[1] - 1) / 2.0), int(input_w - int((shape[1] - 1) / 2.0)), 1), indexing='ij' # ) # # grid = tf.stack(grid, axis=-1) # grid = tf.cast(grid, 'float32') # grid --> (h, w, 2) # grid = tf.expand_dims(grid, 2) # grid --> (h, w, 1, 2) # grid = tf.tile(grid, [1, 1, self.kernel_n, 1]) # grid --> (h, w, n, 2) # grid_offset = grid + initial_offsets # grid_offset --> (h, w, n, 2) # # input_deform = self._tf_batch_map_offsets(self.inputs, offset, grid_offset) # # # W = tf.compat.v1.get_variable( # # name='W_deformableconv2d', shape=[1, 1, shape[0] * shape[1], shape[-2], shape[-1]], initializer=W_init, # # dtype=LayersConfig.tf_dtype, # # ) # # # _tensor = tf.nn.conv3d(input_deform, W, strides=[1, 1, 1, 1, 1], padding='VALID', name=None) # # _tensor = tf.nn.conv3d( # # input=input_deform, # # filters=W, # # strides=[1, 1, 1, 1, 1], # # padding='VALID', # # name=None # # ) # # # if b_init: # # b = tf.compat.v1.get_variable( # # name='b_deformableconv2d', shape=(shape[-1]), initializer=b_init, # dtype=LayersConfig.tf_dtype, # # ) # # # # _tensor = tf.nn.bias_add(_tensor, b, name='bias_add') # # # self.outputs = tf.reshape( # # tensor=self._apply_activation(_tensor), # # 
    def __repr__(self):
        actstr = self.act.__name__ if self.act is not None else 'No Activation'
        s = (
            '{classname}(in_channels={in_channels}, out_channels={n_filter}, kernel_size={filter_size}'
            ', padding={padding}'
        )
        if self.b_init is None:
            s += ', bias=False'
        s += (', ' + actstr)
        if self.name is not None:
            s += ', name=\'{name}\''
        s += ')'
        return s.format(classname=self.__class__.__name__, **self.__dict__)

    def build(self, inputs_shape):
        self.in_channels = inputs_shape[-1]
        self.input_h = int(inputs_shape[1])
        self.input_w = int(inputs_shape[2])

        # Relative position of each kernel element, tiled over every output location.
        initial_offsets = tf.stack(
            tf.meshgrid(tf.range(self.filter_size[0]), tf.range(self.filter_size[1]), indexing='ij')
        )  # initial_offsets --> (kh, kw, 2)
        initial_offsets = tf.reshape(initial_offsets, (-1, 2))  # initial_offsets --> (n, 2)
        initial_offsets = tf.expand_dims(initial_offsets, 0)  # initial_offsets --> (1, n, 2)
        initial_offsets = tf.expand_dims(initial_offsets, 0)  # initial_offsets --> (1, 1, n, 2)
        initial_offsets = tf.tile(
            initial_offsets, [self.input_h, self.input_w, 1, 1]
        )  # initial_offsets --> (h, w, n, 2)
        initial_offsets = tf.cast(initial_offsets, 'float32')

        # Regular sampling grid, one centre per output location.
        grid = tf.meshgrid(
            tf.range(
                -int((self.filter_size[0] - 1) / 2.0), int(self.input_h - int((self.filter_size[0] - 1) / 2.0)), 1
            ),
            tf.range(
                -int((self.filter_size[1] - 1) / 2.0), int(self.input_w - int((self.filter_size[1] - 1) / 2.0)), 1
            ), indexing='ij'
        )
        grid = tf.stack(grid, axis=-1)
        grid = tf.cast(grid, 'float32')  # grid --> (h, w, 2)
        grid = tf.expand_dims(grid, 2)  # grid --> (h, w, 1, 2)
        grid = tf.tile(grid, [1, 1, self.kernel_n, 1])  # grid --> (h, w, n, 2)
        self.grid_offset = grid + initial_offsets  # grid_offset --> (h, w, n, 2)

        self.filter_shape = (1, 1, self.kernel_n, self.in_channels, self.n_filter)

        self.W = self._get_weights("W_deformableconv2d", shape=self.filter_shape, init=self.W_init)

        if self.b_init:
            self.b = self._get_weights("b_deformableconv2d", shape=(self.n_filter, ), init=self.b_init)

    def forward(self, inputs):
        offset = self.offset_layer
        grid_offset = self.grid_offset

        # Sample the input at the offset positions: (b, h, w, n, c).
        input_deform = self._tf_batch_map_offsets(inputs, offset, grid_offset)
        # A (1, 1, n) 3D convolution over the n sampled values per location
        # implements the deformable 2D convolution.
        outputs = tf.nn.conv3d(input=input_deform, filters=self.W, strides=[1, 1, 1, 1, 1], padding='VALID', name=None)
        # Use the dynamic batch size; the static first dimension may be None.
        outputs = tf.reshape(
            tensor=outputs, shape=[tf.shape(input=outputs)[0], self.input_h, self.input_w, self.n_filter]
        )
        if self.b_init:
            outputs = tf.nn.bias_add(outputs, self.b, name='bias_add')
        if self.act:
            outputs = self.act(outputs)
        return outputs

    def _to_bc_h_w(self, x, x_shape):
        """(b, h, w, c) -> (b*c, h, w)"""
        x = tf.transpose(a=x, perm=[0, 3, 1, 2])
        x = tf.reshape(x, (-1, x_shape[1], x_shape[2]))
        return x

    def _to_b_h_w_n_c(self, x, x_shape):
        """(b*c, h, w, n) -> (b, h, w, n, c)"""
        x = tf.reshape(x, (-1, x_shape[4], x_shape[1], x_shape[2], x_shape[3]))
        x = tf.transpose(a=x, perm=[0, 2, 3, 4, 1])
        return x

    def tf_flatten(self, a):
        """Flatten tensor"""
        return tf.reshape(a, [-1])

    def _get_vals_by_coords(self, inputs, coords, idx, out_shape):
        indices = tf.stack(
            [idx, self.tf_flatten(coords[:, :, :, :, 0]),
             self.tf_flatten(coords[:, :, :, :, 1])], axis=-1
        )
        vals = tf.gather_nd(inputs, indices)
        vals = tf.reshape(vals, out_shape)
        return vals

    def _tf_repeat(self, a, repeats):
        """TensorFlow version of np.repeat for 1D"""
        # https://github.com/tensorflow/tensorflow/issues/8521
        if len(a.get_shape()) != 1:
            raise AssertionError("This is not a 1D Tensor")
        # e.g. _tf_repeat(tf.range(3), 2) --> [0, 0, 1, 1, 2, 2]
        a = tf.expand_dims(a, -1)
        a = tf.tile(a, [1, repeats])
        a = self.tf_flatten(a)
        return a
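
    # Bilinear interpolation sketch (used by _tf_batch_map_coordinates below):
    # a fractional coordinate p is resolved from its four integer neighbours,
    # with f = p - floor(p):
    #     v_a = v[floor0, floor1] + (v[ceil0, floor1] - v[floor0, floor1]) * f[0]
    #     v_b = v[floor0, ceil1]  + (v[ceil0, ceil1]  - v[floor0, ceil1])  * f[0]
    #     val = v_a + (v_b - v_a) * f[1]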

    def _tf_batch_map_coordinates(self, inputs, coords):
        """Batch version of tf_map_coordinates

        Only supports 2D feature maps

        Parameters
        ----------
        inputs : ``tf.Tensor``
            shape = (b*c, h, w)
        coords : ``tf.Tensor``
            shape = (b*c, h, w, n, 2)

        Returns
        -------
        ``tf.Tensor``
            A Tensor with the shape as (b*c, h, w, n)

        """
        inputs_shape = inputs.get_shape()
        coords_shape = coords.get_shape()
        batch_channel = tf.shape(input=inputs)[0]
        input_h = int(inputs_shape[1])
        input_w = int(inputs_shape[2])
        kernel_n = int(coords_shape[3])
        n_coords = input_h * input_w * kernel_n

        # The four integer neighbours surrounding each fractional coordinate.
        coords_lt = tf.cast(tf.floor(coords), 'int32')
        coords_rb = tf.cast(tf.math.ceil(coords), 'int32')
        coords_lb = tf.stack([coords_lt[:, :, :, :, 0], coords_rb[:, :, :, :, 1]], axis=-1)
        coords_rt = tf.stack([coords_rb[:, :, :, :, 0], coords_lt[:, :, :, :, 1]], axis=-1)

        idx = self._tf_repeat(tf.range(batch_channel), n_coords)

        vals_lt = self._get_vals_by_coords(inputs, coords_lt, idx, (batch_channel, input_h, input_w, kernel_n))
        vals_rb = self._get_vals_by_coords(inputs, coords_rb, idx, (batch_channel, input_h, input_w, kernel_n))
        vals_lb = self._get_vals_by_coords(inputs, coords_lb, idx, (batch_channel, input_h, input_w, kernel_n))
        vals_rt = self._get_vals_by_coords(inputs, coords_rt, idx, (batch_channel, input_h, input_w, kernel_n))

        # Bilinear interpolation between the four neighbour values.
        coords_offset_lt = coords - tf.cast(coords_lt, 'float32')

        vals_t = vals_lt + (vals_rt - vals_lt) * coords_offset_lt[:, :, :, :, 0]
        vals_b = vals_lb + (vals_rb - vals_lb) * coords_offset_lt[:, :, :, :, 0]
        mapped_vals = vals_t + (vals_b - vals_t) * coords_offset_lt[:, :, :, :, 1]

        return mapped_vals

    def _tf_batch_map_offsets(self, inputs, offsets, grid_offset):
        """Batch map offsets into input

        Parameters
        ----------
        inputs : ``tf.Tensor``
            shape = (b, h, w, c)
        offsets : ``tf.Tensor``
            shape = (b, h, w, 2*n)
        grid_offset : ``tf.Tensor``
            Offset grids, shape = (h, w, n, 2)

        Returns
        -------
        ``tf.Tensor``
            A Tensor with the shape as (b, h, w, n, c)

        """
        inputs_shape = inputs.get_shape()
        batch_size = tf.shape(input=inputs)[0]
        kernel_n = int(int(offsets.get_shape()[3]) / 2)
        input_h = inputs_shape[1]
        input_w = inputs_shape[2]
        channel = inputs_shape[3]

        # inputs (b, h, w, c) --> (b*c, h, w)
        inputs = self._to_bc_h_w(inputs, inputs_shape)

        # offsets (b, h, w, 2*n) --> (b, h, w, n, 2)
        offsets = tf.reshape(offsets, (batch_size, input_h, input_w, kernel_n, 2))
        # offsets (b, h, w, n, 2) --> (b*c, h, w, n, 2)
        # offsets = tf.tile(offsets, [channel, 1, 1, 1, 1])

        coords = tf.expand_dims(grid_offset, 0)  # grid_offset --> (1, h, w, n, 2)
        coords = tf.tile(coords, [batch_size, 1, 1, 1, 1]) + offsets  # grid_offset --> (b, h, w, n, 2)

        # clip out-of-bound coordinates
        coords = tf.stack(
            [
                tf.clip_by_value(coords[:, :, :, :, 0], 0.0, tf.cast(input_h - 1, 'float32')),
                tf.clip_by_value(coords[:, :, :, :, 1], 0.0, tf.cast(input_w - 1, 'float32'))
            ], axis=-1
        )
        coords = tf.tile(coords, [channel, 1, 1, 1, 1])

        mapped_vals = self._tf_batch_map_coordinates(inputs, coords)

        # (b*c, h, w, n) --> (b, h, w, n, c)
        mapped_vals = self._to_b_h_w_n_c(mapped_vals, [batch_size, input_h, input_w, kernel_n, channel])

        return mapped_vals
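
# A minimal eager-mode usage sketch, mirroring the docstring example above.
# It assumes TensorLayer 2.x on TensorFlow 2.x; the tensor and layer names
# here are illustrative only.
if __name__ == "__main__":
    ni = tl.layers.Input([5, 10, 10, 16], name='input')
    # For a (3, 3) kernel the offset branch must output 2 * 3 * 3 = 18 channels.
    offset1 = tl.layers.Conv2d(
        n_filter=18, filter_size=(3, 3), strides=(1, 1), padding='SAME', name='offset1'
    )(ni)
    deformconv1 = tl.layers.DeformableConv2d(
        offset_layer=offset1, n_filter=32, filter_size=(3, 3), name='deformable1'
    )(ni)
    print(deformconv1.shape)  # expected: (5, 10, 10, 32)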