Source code for paddlenlp.transformers.layoutxlm.visual_backbone

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2021 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import os
from abc import abstractmethod
from collections import namedtuple

import numpy as np
import paddle
from paddle import ParamAttr
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn import Layer
from paddle.utils import try_import


def read_config(fp=None):
    if fp is None:
        dir_name = os.path.dirname(os.path.abspath(__file__))
        fp = os.path.join(dir_name, "visual_backbone.yaml")
    with open(fp, "r") as fin:
        yacs_config = try_import("yacs.config")
        cfg = yacs_config.CfgNode().load_cfg(fin)
    cfg.freeze()
    return cfg
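
# Illustrative usage sketch (not part of the original module): loading the backbone
# config. This assumes the bundled `visual_backbone.yaml` sits next to this file and
# that the `yacs` package is installed (it is imported lazily via `try_import`).
#
#     cfg = read_config()                          # load the default detectron2-style config
#     print(cfg.MODEL.RESNETS.OUT_FEATURES)        # e.g. the stages exposed to the FPN
#     cfg2 = read_config("/path/to/custom.yaml")   # or load an explicit config file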


class Conv2d(nn.Conv2D):
    def __init__(self, *args, **kwargs):
        norm = kwargs.pop("norm", None)
        activation = kwargs.pop("activation", None)
        super(Conv2d, self).__init__(*args, **kwargs)

        self.norm = norm
        self.activation = activation

    def forward(self, x):
        x = super(Conv2d, self).forward(x)
        if self.norm is not None:
            x = self.norm(x)
        if self.activation is not None:
            x = self.activation(x)
        return x


class CNNBlockBase(Layer):
    def __init__(self, in_channels, out_channels, stride):
        """
        The `__init__` method of any subclass should also contain these arguments.

        Args:
            in_channels (int):
            out_channels (int):
            stride (int):
        """
        super(CNNBlockBase, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.stride = stride

    def freeze(self):
        for p in self.parameters():
            p.stop_gradient = True


ResNetBlockBase = CNNBlockBase


class ShapeSpec(namedtuple("_ShapeSpec", ["channels", "height", "width", "stride"])):
    def __new__(cls, channels=None, height=None, width=None, stride=None):
        return super().__new__(cls, channels, height, width, stride)


def get_norm(norm, out_channels):
    """
    Args:
        norm (str or callable): one of "BN", "SyncBN", "FrozenBN"; or a callable that
            takes a channel number and returns the normalization layer as an nn.Layer.
        out_channels (int): number of channels to normalize.

    Returns:
        nn.Layer or None: the normalization layer
    """
    if norm is None:
        return None
    if isinstance(norm, str):
        if len(norm) == 0:
            return None
        norm = {
            "BN": nn.BatchNorm,
            "SyncBN": nn.SyncBatchNorm,
            "FrozenBN": FrozenBatchNorm,
        }[norm]
    return norm(out_channels)
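
# Brief usage sketch (illustrative, not in the original source): `get_norm` maps a
# config string to a norm layer constructed for the given channel count.
#
#     bn = get_norm("BN", 64)            # paddle.nn.BatchNorm over 64 channels
#     frozen = get_norm("FrozenBN", 64)  # running stats frozen, affine params not trainable
#     none = get_norm("", 64)            # the empty string means "no normalization" -> None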


class FrozenBatchNorm(nn.BatchNorm):
    def __init__(self, num_channels):
        param_attr = ParamAttr(learning_rate=0.0, trainable=False)
        bias_attr = ParamAttr(learning_rate=0.0, trainable=False)
        super(FrozenBatchNorm, self).__init__(
            num_channels, param_attr=param_attr, bias_attr=bias_attr, use_global_stats=True
        )


class Backbone(nn.Layer):
    def __init__(self):
        super(Backbone, self).__init__()

    @abstractmethod
    def forward(self, *args):
        pass

    @property
    def size_divisibility(self) -> int:
        return 0

    def output_shape(self):
        # this is a backward-compatible default
        return {
            name: ShapeSpec(channels=self._out_feature_channels[name], stride=self._out_feature_strides[name])
            for name in self._out_features
        }


class BasicBlock(CNNBlockBase):
    """
    The basic residual block for ResNet-18 and ResNet-34 defined in :paper:`ResNet`,
    with two 3x3 conv layers and a projection shortcut if needed.
    """

    def __init__(self, in_channels, out_channels, *, stride=1, norm="BN"):
        raise NotImplementedError


class BottleneckBlock(CNNBlockBase):
    """
    The standard bottleneck residual block used by ResNet-50, 101 and 152
    defined in :paper:`ResNet`. It contains 3 conv layers with kernels
    1x1, 3x3, 1x1, and a projection shortcut if needed.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        *,
        bottleneck_channels,
        stride=1,
        num_groups=1,
        norm="BN",
        stride_in_1x1=False,
        dilation=1,
    ):
        super(BottleneckBlock, self).__init__(in_channels, out_channels, stride)

        if in_channels != out_channels:
            self.shortcut = Conv2d(
                in_channels,
                out_channels,
                kernel_size=1,
                stride=stride,
                bias_attr=False,
                norm=get_norm(norm, out_channels),
            )
        else:
            self.shortcut = None

        stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)

        self.conv1 = Conv2d(
            in_channels,
            bottleneck_channels,
            kernel_size=1,
            stride=stride_1x1,
            bias_attr=False,
            norm=get_norm(norm, bottleneck_channels),
        )

        self.conv2 = Conv2d(
            bottleneck_channels,
            bottleneck_channels,
            kernel_size=3,
            stride=stride_3x3,
            padding=1 * dilation,
            bias_attr=False,
            groups=num_groups,
            dilation=dilation,
            norm=get_norm(norm, bottleneck_channels),
        )

        self.conv3 = Conv2d(
            bottleneck_channels,
            out_channels,
            kernel_size=1,
            bias_attr=False,
            norm=get_norm(norm, out_channels),
        )
        # Weight initialization is omitted here because pretrained weights will be loaded.

    def forward(self, x):
        out = self.conv1(x)
        out = F.relu(out)

        out = self.conv2(out)
        out = F.relu(out)

        out = self.conv3(out)

        if self.shortcut is not None:
            shortcut = self.shortcut(x)
        else:
            shortcut = x

        out += shortcut
        out = F.relu(out)
        return out


class DeformBottleneckBlock(CNNBlockBase):
    """
    Similar to :class:`BottleneckBlock`, but with :paper:`deformable conv <deformconv>`
    in the 3x3 convolution.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        *,
        bottleneck_channels,
        stride=1,
        num_groups=1,
        norm="BN",
        stride_in_1x1=False,
        dilation=1,
        deform_modulated=False,
        deform_num_groups=1,
    ):
        raise NotImplementedError


class BasicStem(CNNBlockBase):
    """
    The standard ResNet stem (layers before the first residual block),
    with a conv, relu and max_pool.
    """

    def __init__(self, in_channels=3, out_channels=64, norm="BN"):
        """
        Args:
            norm (str or callable): norm after the first conv layer.
                See :func:`layers.get_norm` for supported format.
        """
        super(BasicStem, self).__init__(in_channels, out_channels, 4)
        self.in_channels = in_channels
        self.conv1 = Conv2d(
            in_channels,
            out_channels,
            kernel_size=7,
            stride=2,
            padding=3,
            bias_attr=False,
            norm=get_norm(norm, out_channels),
        )

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
        return x


class ResNet(Backbone):
    def __init__(self, stem, stages, num_classes=None, out_features=None, freeze_at=0):
        super(ResNet, self).__init__()
        self.stem = stem
        self.num_classes = num_classes

        current_stride = self.stem.stride
        self._out_feature_strides = {"stem": current_stride}
        self._out_feature_channels = {"stem": self.stem.out_channels}

        self.stage_names, self.stages = [], []

        if out_features is not None:
            # Only keep the stages that are needed to produce the requested output features.
            num_stages = max([{"res2": 1, "res3": 2, "res4": 3, "res5": 4}.get(f, 0) for f in out_features])
            stages = stages[:num_stages]
        for i, blocks in enumerate(stages):
            assert len(blocks) > 0, len(blocks)
            for block in blocks:
                assert isinstance(block, CNNBlockBase), block

            name = "res" + str(i + 2)
            stage = nn.Sequential(*blocks)

            self.add_sublayer(name, stage)
            self.stage_names.append(name)
            self.stages.append(stage)

            self._out_feature_strides[name] = current_stride = int(
                current_stride * np.prod([k.stride for k in blocks])
            )
            self._out_feature_channels[name] = curr_channels = blocks[-1].out_channels
        self.stage_names = tuple(self.stage_names)

        if num_classes is not None:
            self.avgpool = nn.AdaptiveAvgPool2D(1)
            self.linear = nn.Linear(curr_channels, num_classes)
            name = "linear"

        if out_features is None:
            out_features = [name]
        self._out_features = out_features
        assert len(self._out_features)
        children = [x[0] for x in self.named_children()]
        for out_feature in self._out_features:
            assert out_feature in children, "Available children: {}".format(", ".join(children))
        self.freeze(freeze_at)

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.

        Returns:
            dict[str->Tensor]: names and the corresponding features
        """
        assert x.dim() == 4, f"ResNet takes an input of shape (N, C, H, W). Got {x.shape} instead!"
        outputs = {}
        x = self.stem(x)
        if "stem" in self._out_features:
            outputs["stem"] = x
        for name, stage in zip(self.stage_names, self.stages):
            x = stage(x)
            if name in self._out_features:
                outputs[name] = x
        if self.num_classes is not None:
            x = self.avgpool(x)
            x = paddle.flatten(x, 1)
            x = self.linear(x)
            if "linear" in self._out_features:
                outputs["linear"] = x
        return outputs

    def output_shape(self):
        return {
            name: ShapeSpec(channels=self._out_feature_channels[name], stride=self._out_feature_strides[name])
            for name in self._out_features
        }

    @staticmethod
    def make_stage(block_class, num_blocks, *, in_channels, out_channels, **kwargs):
        """
        Create a list of blocks of the same type that forms one ResNet stage.

        Args:
            block_class (type): a subclass of CNNBlockBase that's used to create all blocks in this
                stage. A module of this type must not change spatial resolution of inputs unless its
                stride != 1.
            num_blocks (int): number of blocks in this stage
            in_channels (int): input channels of the entire stage.
            out_channels (int): output channels of **every block** in the stage.
            kwargs: other arguments passed to the constructor of `block_class`. If the argument
                name is "xx_per_block", the argument is a list of values to be passed to each
                block in the stage. Otherwise, the same argument is passed to every block in
                the stage.

        Returns:
            list[CNNBlockBase]: a list of block modules.

        Examples:
        ::
            stage = ResNet.make_stage(
                BottleneckBlock, 3, in_channels=16, out_channels=64,
                bottleneck_channels=16, num_groups=1,
                stride_per_block=[2, 1, 1],
                dilations_per_block=[1, 1, 2]
            )

        Usually, layers that produce the same feature map spatial size are defined as one
        "stage" (in :paper:`FPN`). Under such definition, ``stride_per_block[1:]`` should
        all be 1.
        """
        blocks = []
        for i in range(num_blocks):
            curr_kwargs = {}
            for k, v in kwargs.items():
                if k.endswith("_per_block"):
                    assert len(v) == num_blocks, (
                        f"Argument '{k}' of make_stage should have the same length as num_blocks={num_blocks}."
                    )
                    newk = k[: -len("_per_block")]
                    assert newk not in kwargs, f"Cannot call make_stage with both {k} and {newk}!"
                    curr_kwargs[newk] = v[i]
                else:
                    curr_kwargs[k] = v

            blocks.append(block_class(in_channels=in_channels, out_channels=out_channels, **curr_kwargs))
            in_channels = out_channels
        return blocks

    @staticmethod
    def make_default_stages(depth, block_class=None, **kwargs):
        """
        Create a list of ResNet stages from a pre-defined depth (one of 18, 34, 50, 101, 152).
        If it doesn't create the ResNet variant you need, please use :meth:`make_stage`
        instead for fine-grained customization.

        Args:
            depth (int): depth of ResNet
            block_class (type): the CNN block class. Has to accept `bottleneck_channels`
                argument for depth > 50. By default it is BasicBlock or BottleneckBlock,
                based on the depth.
            kwargs: other arguments to pass to `make_stage`. Should not contain stride
                and channels, as they are predefined for each depth.

        Returns:
            list[list[CNNBlockBase]]: modules in all stages; see arguments of
                :class:`ResNet.__init__`.
        """
        num_blocks_per_stage = {
            18: [2, 2, 2, 2],
            34: [3, 4, 6, 3],
            50: [3, 4, 6, 3],
            101: [3, 4, 23, 3],
            152: [3, 8, 36, 3],
        }[depth]
        if block_class is None:
            block_class = BasicBlock if depth < 50 else BottleneckBlock
        if depth < 50:
            in_channels = [64, 64, 128, 256]
            out_channels = [64, 128, 256, 512]
        else:
            in_channels = [64, 256, 512, 1024]
            out_channels = [256, 512, 1024, 2048]
        ret = []
        for (n, s, i, o) in zip(num_blocks_per_stage, [1, 2, 2, 2], in_channels, out_channels):
            if depth >= 50:
                kwargs["bottleneck_channels"] = o // 4
            ret.append(
                ResNet.make_stage(
                    block_class=block_class,
                    num_blocks=n,
                    stride_per_block=[s] + [1] * (n - 1),
                    in_channels=i,
                    out_channels=o,
                    **kwargs,
                )
            )
        return ret

    def freeze(self, freeze_at=0):
        if freeze_at >= 1:
            self.stem.freeze()
        for idx, stage in enumerate(self.stages, start=2):
            if freeze_at >= idx:
                for block in stage.children():
                    block.freeze()
        return self
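
# Illustrative sketch (not part of the original file): assembling a small ResNet by hand
# with `BasicStem` and `ResNet.make_stage`, mirroring what `build_resnet_backbone` does
# from a config. Channel sizes here are arbitrary example values.
#
#     stem = BasicStem(in_channels=3, out_channels=64, norm="BN")
#     stages = [
#         ResNet.make_stage(
#             BottleneckBlock, num_blocks=3, in_channels=64, out_channels=256,
#             bottleneck_channels=64, stride_per_block=[1, 1, 1], norm="BN",
#         ),
#         ResNet.make_stage(
#             BottleneckBlock, num_blocks=4, in_channels=256, out_channels=512,
#             bottleneck_channels=128, stride_per_block=[2, 1, 1, 1], norm="BN",
#         ),
#     ]
#     model = ResNet(stem, stages, out_features=["res2", "res3"])
#     feats = model(paddle.randn([1, 3, 224, 224]))   # {"res2": ..., "res3": ...}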


class LastLevelMaxPool(nn.Layer):
    """
    This module is used in the original FPN to generate a downsampled
    P6 feature from P5.
    """

    def __init__(self):
        super(LastLevelMaxPool, self).__init__()
        self.num_levels = 1
        self.in_feature = "p5"

    def forward(self, x):
        return [F.max_pool2d(x, kernel_size=1, stride=2, padding=0)]


def _assert_strides_are_log2_contiguous(strides):
    """
    Assert that each stride is 2x its preceding stride, i.e. "contiguous in log2".
    """
    for i, stride in enumerate(strides[1:], 1):
        assert stride == 2 * strides[i - 1], "Strides {} {} are not log2 contiguous".format(stride, strides[i - 1])


class FPN(Backbone):
    def __init__(self, bottom_up, in_features, out_channels, norm="", top_block=None, fuse_type="sum"):
        super(FPN, self).__init__()
        assert isinstance(bottom_up, Backbone)
        assert in_features, in_features

        # Feature map strides and channels from the bottom up network (e.g. ResNet)
        input_shapes = bottom_up.output_shape()
        strides = [input_shapes[f].stride for f in in_features]
        in_channels_per_feature = [input_shapes[f].channels for f in in_features]

        _assert_strides_are_log2_contiguous(strides)
        lateral_convs = []
        output_convs = []

        use_bias = norm == ""
        for idx, in_channels in enumerate(in_channels_per_feature):
            lateral_norm = get_norm(norm, out_channels)
            output_norm = get_norm(norm, out_channels)

            lateral_conv = Conv2d(in_channels, out_channels, kernel_size=1, bias_attr=use_bias, norm=lateral_norm)
            output_conv = Conv2d(
                out_channels,
                out_channels,
                kernel_size=3,
                stride=1,
                padding=1,
                bias_attr=use_bias,
                norm=output_norm,
            )
            stage = int(math.log2(strides[idx]))
            self.add_sublayer("fpn_lateral{}".format(stage), lateral_conv)
            self.add_sublayer("fpn_output{}".format(stage), output_conv)

            lateral_convs.append(lateral_conv)
            output_convs.append(output_conv)
        # Place convs into top-down order (from low to high resolution)
        # to make the top-down computation in forward clearer.
        self.lateral_convs = lateral_convs[::-1]
        self.output_convs = output_convs[::-1]
        self.top_block = top_block
        self.in_features = tuple(in_features)
        self.bottom_up = bottom_up
        # Return feature names are "p<stage>", like ["p2", "p3", ..., "p6"]
        self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in strides}
        # top block output feature maps.
        if self.top_block is not None:
            for s in range(stage, stage + self.top_block.num_levels):
                self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1)

        self._out_features = list(self._out_feature_strides.keys())
        self._out_feature_channels = {k: out_channels for k in self._out_features}
        self._size_divisibility = strides[-1]
        assert fuse_type in {"avg", "sum"}
        self._fuse_type = fuse_type

    @property
    def size_divisibility(self):
        return self._size_divisibility

    def forward(self, x):
        """
        Args:
            x (Tensor): input image tensor of shape (N, C, H, W); it is passed to the
                bottom-up backbone, whose named feature maps feed the FPN.

        Returns:
            dict[str->Tensor]:
                mapping from feature map name to FPN feature map tensor
                in high to low resolution order. Returned feature names follow the FPN
                paper convention: "p<stage>", where stage has stride = 2 ** stage e.g.,
                ["p2", "p3", ..., "p6"].
        """
        bottom_up_features = self.bottom_up(x)
        results = []
        prev_features = self.lateral_convs[0](bottom_up_features[self.in_features[-1]])
        results.append(self.output_convs[0](prev_features))

        # Reverse feature maps into top-down order (from low to high resolution)
        for idx, (lateral_conv, output_conv) in enumerate(zip(self.lateral_convs, self.output_convs)):
            if idx > 0:
                features = self.in_features[-idx - 1]
                features = bottom_up_features[features]
                top_down_features = F.interpolate(prev_features, scale_factor=2.0, mode="nearest")
                lateral_features = lateral_conv(features)
                prev_features = lateral_features + top_down_features
                if self._fuse_type == "avg":
                    prev_features /= 2
                results.insert(0, output_conv(prev_features))

        if self.top_block is not None:
            if self.top_block.in_feature in bottom_up_features:
                top_block_in_feature = bottom_up_features[self.top_block.in_feature]
            else:
                top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)]
            results.extend(self.top_block(top_block_in_feature))
        assert len(self._out_features) == len(results)
        return {f: res for f, res in zip(self._out_features, results)}

    def output_shape(self):
        return {
            name: ShapeSpec(channels=self._out_feature_channels[name], stride=self._out_feature_strides[name])
            for name in self._out_features
        }


def make_stage(*args, **kwargs):
    """
    Deprecated alias for backward compatibility.
    """
    return ResNet.make_stage(*args, **kwargs)


def build_resnet_backbone(cfg, input_shape=None):
    """
    Create a ResNet instance from config.

    Returns:
        ResNet: a :class:`ResNet` instance.
    """
    # need registration of new blocks/stems?
    if input_shape is None:
        ch = 3
    else:
        ch = input_shape.channels
    norm = cfg.MODEL.RESNETS.NORM
    stem = BasicStem(
        in_channels=ch,
        out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS,
        norm=norm,
    )

    # fmt: off
    freeze_at           = cfg.MODEL.BACKBONE.FREEZE_AT  # default as 2
    out_features        = cfg.MODEL.RESNETS.OUT_FEATURES
    depth               = cfg.MODEL.RESNETS.DEPTH
    num_groups          = cfg.MODEL.RESNETS.NUM_GROUPS
    width_per_group     = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
    bottleneck_channels = num_groups * width_per_group
    in_channels         = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS
    out_channels        = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS
    stride_in_1x1       = cfg.MODEL.RESNETS.STRIDE_IN_1X1
    res5_dilation       = cfg.MODEL.RESNETS.RES5_DILATION
    deform_on_per_stage = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE
    deform_modulated    = cfg.MODEL.RESNETS.DEFORM_MODULATED
    deform_num_groups   = cfg.MODEL.RESNETS.DEFORM_NUM_GROUPS
    # fmt: on
    assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation)

    num_blocks_per_stage = {
        18: [2, 2, 2, 2],
        34: [3, 4, 6, 3],
        50: [3, 4, 6, 3],
        101: [3, 4, 23, 3],
        152: [3, 8, 36, 3],
    }[depth]

    if depth in [18, 34]:
        assert out_channels == 64, "Must set MODEL.RESNETS.RES2_OUT_CHANNELS = 64 for R18/R34"
        assert not any(deform_on_per_stage), "MODEL.RESNETS.DEFORM_ON_PER_STAGE unsupported for R18/R34"
        assert res5_dilation == 1, "Must set MODEL.RESNETS.RES5_DILATION = 1 for R18/R34"
        assert num_groups == 1, "Must set MODEL.RESNETS.NUM_GROUPS = 1 for R18/R34"

    stages = []

    for idx, stage_idx in enumerate(range(2, 6)):
        # res5_dilation is used this way as a convention in R-FCN & Deformable Conv paper
        dilation = res5_dilation if stage_idx == 5 else 1
        first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2
        stage_kargs = {
            "num_blocks": num_blocks_per_stage[idx],
            "stride_per_block": [first_stride] + [1] * (num_blocks_per_stage[idx] - 1),
            "in_channels": in_channels,
            "out_channels": out_channels,
            "norm": norm,
        }
        # Use BasicBlock for R18 and R34.
        if depth in [18, 34]:
            stage_kargs["block_class"] = BasicBlock
        else:
            stage_kargs["bottleneck_channels"] = bottleneck_channels
            stage_kargs["stride_in_1x1"] = stride_in_1x1
            stage_kargs["dilation"] = dilation
            stage_kargs["num_groups"] = num_groups
            if deform_on_per_stage[idx]:
                stage_kargs["block_class"] = DeformBottleneckBlock
                stage_kargs["deform_modulated"] = deform_modulated
                stage_kargs["deform_num_groups"] = deform_num_groups
            else:
                stage_kargs["block_class"] = BottleneckBlock
        blocks = ResNet.make_stage(**stage_kargs)
        in_channels = out_channels
        out_channels *= 2
        bottleneck_channels *= 2
        stages.append(blocks)
    return ResNet(stem, stages, out_features=out_features, freeze_at=freeze_at)


def build_resnet_fpn_backbone(cfg, input_shape=None):
    bottom_up = build_resnet_backbone(cfg, input_shape)
    in_features = cfg.MODEL.FPN.IN_FEATURES
    out_channels = cfg.MODEL.FPN.OUT_CHANNELS
    backbone = FPN(
        bottom_up=bottom_up,
        in_features=in_features,
        out_channels=out_channels,
        norm=cfg.MODEL.FPN.NORM,
        top_block=LastLevelMaxPool(),
        fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
    )
    return backbone
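
# Illustrative sketch (not part of the original file): building the ResNet-FPN backbone
# from the bundled config and inspecting its output feature maps. Assumes `yacs` is
# installed and the default `visual_backbone.yaml` is present.
#
#     cfg = read_config()
#     backbone = build_resnet_fpn_backbone(cfg)
#     for name, spec in backbone.output_shape().items():
#         print(name, spec.channels, spec.stride)        # e.g. "p2" .. "p6" with strides 4 .. 64
#     feats = backbone(paddle.randn([1, 3, 224, 224]))   # dict of FPN feature maps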


class VisualBackbone(Layer):
    def __init__(self, config):
        super(VisualBackbone, self).__init__()
        self.cfg = read_config()
        self.backbone = build_resnet_fpn_backbone(self.cfg)
        # SyncBatchNorm conversion is removed here because it would require importing torch.

        assert len(self.cfg.MODEL.PIXEL_MEAN) == len(self.cfg.MODEL.PIXEL_STD)
        num_channels = len(self.cfg.MODEL.PIXEL_MEAN)
        self.register_buffer(
            "pixel_mean", paddle.to_tensor(self.cfg.MODEL.PIXEL_MEAN).reshape([num_channels, 1, 1])
        )
        self.register_buffer("pixel_std", paddle.to_tensor(self.cfg.MODEL.PIXEL_STD).reshape([num_channels, 1, 1]))
        self.out_feature_key = "p2"
        # is_deterministic is disabled here.
        self.pool = nn.AdaptiveAvgPool2D(config["image_feature_pool_shape"][:2])
        if len(config["image_feature_pool_shape"]) == 2:
            config["image_feature_pool_shape"].append(self.backbone.output_shape()[self.out_feature_key].channels)
        assert self.backbone.output_shape()[self.out_feature_key].channels == config["image_feature_pool_shape"][2]

    def forward(self, images):
        images_input = (paddle.to_tensor(images) - self.pixel_mean) / self.pixel_std
        features = self.backbone(images_input)
        features = features[self.out_feature_key]
        features = self.pool(features).flatten(start_axis=2).transpose([0, 2, 1])
        return features
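
# Illustrative usage sketch (not part of the original module): the visual backbone takes
# a batch of images and returns pooled patch features for the LayoutXLM encoder. The
# pool shape below is an assumed example value; LayoutXLM configs typically carry it as
# `image_feature_pool_shape`, and its channel count must match the FPN's "p2" output.
#
#     config = {"image_feature_pool_shape": [7, 7, 256]}
#     visual = VisualBackbone(config)
#     images = paddle.randn([2, 3, 224, 224])   # (N, C, H, W), already resized
#     feats = visual(images)                    # shape: [2, 7 * 7, 256]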