
MASTER: Multi-Aspect Non-local Network for Scene Text Recognition




  1. 提出了Global Context(GC)block,Multi-Aspect GCAttention两个模块。
  2. 推理阶段提出基于缓存(memory-cache)的解码策略,对解码过程进行加速。
  3. 在正常文本,弯曲文本上都取得了最好的效果。


(a)Global Context(GC)block

(b)Multi-Aspect GCAttention



transformer的解码与LSTM的解码类似:利用encoder部分的输出计算注意力权重,对特征重新加权,获得当前时刻的输入。在解码阶段,首先用multi-head attention对当前时刻的输入进行编码,得到tmp_feature,相当于LSTM中将时刻t-1的输出输入到模型中;随后以tmp_feature作为Query,以encoder部分的输出分别作为Key和Value,计算获得注意力权重,并利用该权重对encoder输出(Value)加权求和,得到当前时刻的特征结果。


本文训练时使用66个字符,包括10个数字、52个大小写字母以及4个特殊字符:start of the sequence<SOS>

end of the sequence<EOS>

padding symbol<PAD>

unknown characters<UNK>


存储 $K_t = x_t W_K$ 与 $V_t = x_t W_V$。由于解码阶段依旧是step-by-step的计算方式,每个时刻都需要计算 $(XW_K)(XW_Q)^{T}(XW_V)$,因此使用memory-cache手段存储每个时刻已经计算的结果,可加快推理速度。


$t=1$:$KQV = [x_{t_0}W_K;\ x_{t_1}W_K]\,[y_{t_1}W_Q]\,[x_{t_0}W_V;\ x_{t_1}W_V]$ …







# YOLOv5 by Ultralytics, GPL-3.0 license
# Image augmentation functions

import math
import random

import cv2
import numpy as np

class Albumentations:
    """Optional image-augmentation wrapper (YOLOv5-style).

    On construction it tries to build an ``albumentations`` pipeline. If the
    package is not installed the pipeline is silently skipped
    (``self.transform`` stays ``None``); any other construction error is
    printed and the pipeline is likewise left disabled.

    Calling the instance applies the pipeline (when available) and then,
    independently, HSV jitter, histogram equalisation and a random affine warp.
    """

    def __init__(self):
        self.transform = None
        try:
            # written against albumentations 1.0.3
            import albumentations as A

            T = [
                A.OneOf([
                    A.IAAAdditiveGaussianNoise(),  # add Gaussian noise to the input image
                    A.GaussNoise(),  # apply Gaussian noise to the input image
                ], p=0.2),  # probability of applying the selected transform
                A.OneOf([
                    A.MotionBlur(p=0.2),  # motion blur with a randomly sized kernel
                    A.MedianBlur(blur_limit=3, p=0.01),  # median filter
                    A.Blur(blur_limit=3, p=0.01),  # box blur with a randomly sized kernel
                ], p=0.2),
                # random affine transform: shift, scale and rotate the input
                A.ShiftScaleRotate(shift_limit=0.0125, scale_limit=0.1, rotate_limit=5, p=0.1),
                A.RandomBrightnessContrast(p=0.2),  # random brightness / contrast
                A.ImageCompression(quality_lower=75, p=0.0)]  # transforms
            self.transform = A.Compose(T)

            print('albumentations: ' + ', '.join(f'{x}' for x in self.transform.transforms if x.p))
        except ImportError:  # package not installed, skip
            pass
        except Exception as e:
            # Deliberately best-effort: report the problem and continue without the pipeline.
            print('albumentations: '+ f'{e}')

    def __call__(self, im, p=0.8):
        """Augment one image and return the result.

        Args:
            im: Image array in BGR channel order (the helpers below are called
                with ``bgr=True`` / ``COLOR_BGR2HSV``).
            p: The albumentations pipeline runs with probability ``p``; each of
                the three follow-up augmentations runs when a fresh random draw
                exceeds ``p`` (i.e. with probability ``1 - p``).

        Returns:
            The augmented image.
        """
        if self.transform and random.random() < p:
            im = self.transform(image=im)['image']  # transformed

        if random.random() > p:
            # augment_hsv works in place on `im`; its return value must not be
            # assigned back (it returns nothing in its original form).
            augment_hsv(im, hgain=0.5, sgain=0.5, vgain=0.5)
        if random.random() > p:
            im = hist_equalize(im, clahe=True, bgr=True)
        if random.random() > p:
            im = random_perspective(im, degrees=5, translate=.1, scale=.1, shear=5, perspective=0.0, border=(0, 0))

        return im

def augment_hsv(im, hgain=0.5, sgain=0.5, vgain=0.5):
    """Apply random HSV colour-space jitter to a BGR image, in place.

    Each of hue, saturation and value is scaled by a random factor drawn
    uniformly from ``[1 - gain, 1 + gain]``. The result is written back into
    ``im`` (via ``dst=im``), and ``im`` is also returned so the function can be
    used in assignment form like the other augmentation helpers.

    Args:
        im: BGR image array (converted with ``cv2.COLOR_BGR2HSV``); modified in place.
        hgain: Maximum relative hue change.
        sgain: Maximum relative saturation change.
        vgain: Maximum relative value change.

    Returns:
        The same ``im`` object that was passed in.
    """
    if hgain or sgain or vgain:
        r = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] + 1  # random gains
        hue, sat, val = cv2.split(cv2.cvtColor(im, cv2.COLOR_BGR2HSV))
        dtype = im.dtype  # uint8

        # Build one 256-entry lookup table per channel instead of scaling every pixel.
        x = np.arange(0, 256, dtype=r.dtype)
        lut_hue = ((x * r[0]) % 180).astype(dtype)  # hue wraps around at 180
        lut_sat = np.clip(x * r[1], 0, 255).astype(dtype)
        lut_val = np.clip(x * r[2], 0, 255).astype(dtype)

        im_hsv = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val)))
        cv2.cvtColor(im_hsv, cv2.COLOR_HSV2BGR, dst=im)  # writes the result into `im`
    return im

def hist_equalize(im, clahe=True, bgr=True):
    """Equalise the luminance histogram of a 3-channel image with values in 0-255.

    The image is converted to YUV, the Y channel is equalised, and the image is
    converted back to its original channel order.

    Args:
        im: Image array of shape (n, m, 3).
        clahe: If true, use contrast-limited adaptive equalisation (CLAHE);
            otherwise use plain global histogram equalisation.
        bgr: If true, ``im`` is treated as BGR, otherwise as RGB.

    Returns:
        A new equalised image in the same channel order as the input.
    """
    yuv = cv2.cvtColor(im, cv2.COLOR_BGR2YUV if bgr else cv2.COLOR_RGB2YUV)
    if clahe:
        c = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        yuv[:, :, 0] = c.apply(yuv[:, :, 0])
    else:
        # The two modes are alternatives: without this branch CLAHE output was
        # equalised a second time and clahe=False did nothing at all.
        yuv[:, :, 0] = cv2.equalizeHist(yuv[:, :, 0])  # equalize Y channel histogram
    return cv2.cvtColor(yuv, cv2.COLOR_YUV2BGR if bgr else cv2.COLOR_YUV2RGB)  # back to BGR or RGB

def random_perspective(im,
                       degrees=10,
                       translate=.1,
                       scale=.1,
                       shear=10,
                       perspective=0.0,
                       border=(0, 0)):
    """Apply a random affine (or perspective) warp to an image.

    Comparable to torchvision.transforms.RandomAffine(degrees=(-10, 10),
    translate=(0.1, 0.1), scale=(0.9, 1.1), shear=(-10, 10)).

    NOTE(review): the five range parameters are used by the body and passed by
    keyword from ``Albumentations.__call__`` but were missing from the
    signature; they are restored here, with defaults taken from the equivalent
    torchvision call quoted above — confirm against the project file.

    Args:
        im: Image array of shape (h, w, c).
        degrees: Rotation is drawn uniformly from ``[-degrees, degrees]``.
        translate: Translation is drawn from ``[0.5 - translate, 0.5 + translate]``
            times the output width / height.
        scale: Scale factor is drawn from ``[1 - scale, 1 + scale]``.
        shear: Shear angle (degrees) is drawn from ``[-shear, shear]`` per axis.
        perspective: Perspective coefficients are drawn from
            ``[-perspective, perspective]``; ``0`` selects a pure affine warp.
        border: ``(y, x)`` pixels added on each side of the output canvas.

    Returns:
        The warped image; uncovered areas are filled with grey (114, 114, 114).
    """
    height = im.shape[0] + border[0] * 2  # shape(h,w,c)
    width = im.shape[1] + border[1] * 2

    # Center: move the image centre to the origin so rotation/scale act about it
    C = np.eye(3)
    C[0, 2] = -im.shape[1] / 2  # x translation (pixels)
    C[1, 2] = -im.shape[0] / 2  # y translation (pixels)

    # Perspective
    P = np.eye(3)
    P[2, 0] = random.uniform(-perspective, perspective)  # x perspective (about y)
    P[2, 1] = random.uniform(-perspective, perspective)  # y perspective (about x)

    # Rotation and Scale
    R = np.eye(3)
    a = random.uniform(-degrees, degrees)
    s = random.uniform(1 - scale, 1 + scale)
    R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s)

    # Shear
    S = np.eye(3)
    S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # x shear (deg)
    S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # y shear (deg)

    # Translation: moves the origin back to (roughly) the centre of the output canvas
    T = np.eye(3)
    T[0, 2] = random.uniform(0.5 - translate, 0.5 + translate) * width  # x translation (pixels)
    T[1, 2] = random.uniform(0.5 - translate, 0.5 + translate) * height  # y translation (pixels)

    # Combined transform matrix
    M = T @ S @ R @ P @ C  # order of operations (right to left) is IMPORTANT
    if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any():  # image changed
        if perspective:
            im = cv2.warpPerspective(im, M, dsize=(width, height), borderValue=(114, 114, 114))
        else:  # affine
            im = cv2.warpAffine(im, M[:2], dsize=(width, height), borderValue=(114, 114, 114))

    return im


# Usage excerpt (fragments of a dataset class whose surrounding code is not shown).
from data_utils.augmentations import Albumentations

# Build the augmentation wrapper once (presumably in the dataset constructor — confirm).
self.albumentations = Albumentations() 
            # Augment only when a transform is configured for this dataset.
            if self.transform is not None:
                # Convert to a BGR array for the OpenCV-based augmenter
                # (assumes img is an RGB PIL image at this point — TODO confirm).
                img = cv2.cvtColor(np.asarray(img),cv2.COLOR_RGB2BGR)  
                img = self.albumentations(img)
                # Convert the augmented BGR array back to an RGB PIL image.
                img = Image.fromarray(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))


# Random contrast/brightness jitter: multiply by a factor in [0.90, 1.10] and add an offset in [-0.10, 0.10].
# NOTE(review): the offset size suggests img is already scaled to roughly [0, 1] here — confirm.
img = img * random.randint(90,110)/100 + random.randint(-10,10)/100




权重向量的前4个值为1.0,是因为前4个类别索引对应特殊符号:<PAD>: 0, <EOS>: 1, <SOS>: 2, <UNK>: 3

# Per-class loss weights, one entry per class index (72 entries). The first four
# entries (1.0) belong to the special symbols <PAD>: 0, <EOS>: 1, <SOS>: 2, <UNK>: 3.
# NOTE(review): F.cross_entropy documents `weight` as a tensor of size C; confirm that the
# (1, C) view below is accepted by the PyTorch version in use.
weight_ratio = torch.tensor([
    1.0, 1.0, 1.0, 1.0,
    0.03710795446111829, 0.7482002642086357, 0.736956554006917, 0.7322586869721394,
    0.7562415579403601, 0.7484859954579863, 0.7486492704576153, 0.7323737216309688,
    0.7457028988734025, 0.7531690193109795, 0.7358470261685295, 0.7448345727390123,
    0.7471241335292633, 0.7606685369075715, 0.7746471033530747, 0.7560040670318089,
    0.7463114693265649, 0.7556329874871978, 0.7619042317911268, 0.7645722937168812,
    0.7485453681851242, 0.7455618886464502, 0.7405226284306304, 0.7426118062667914,
    0.5395422362737676, 0.554452212376245, 0.5693325021151534, 0.5977757492096005,
    0.7496252096599427, 0.5732548129016936, 0.5380282317317541, 0.5781716168677917,
    0.5244801175579997, 0.5379391726410473, 0.286599575485001, 0.9958550414866931,
    0.9989498448887504, 0.9991465170473943, 0.998441465912633, 0.9990834335248103,
    0.9998589897730478, 0.9992986596606849, 0.9998960977275089, 0.9996140772736044,
    0.981182556292767, 0.9907378545665049, 0.9973913108013834, 0.9981520238678363,
    0.998074097163468, 0.9971315551201556, 0.9963856852354871, 0.9973282272787994,
    0.998374671594603, 0.9966565733030532, 0.9998589897730478, 0.9999369164774161,
    0.9989016045479508, 0.9998255926140327, 0.9996586068189577, 0.9999888676136617,
    0.9986975107984147, 0.9997625090914488, 0.9999554704546466, 0.9999703136364311,
    0.9998218818185867, 1.0, 0.9999369164774161, 1.0,
]).view(1,-1).cuda()

# Class-weighted cross-entropy over all decoding steps: the model outputs are flattened to
# (-1, last_dim) and the targets, with their first column dropped (target[:, 1:]), are
# flattened to match. Positions equal to LabelTransformer.PAD are ignored and every class is
# rescaled by weight_ratio. The dropped first column is presumably the <SOS> token — confirm.
loss = F.cross_entropy(outputs.contiguous().view(-1, outputs.shape[-1]),target[:, 1:].contiguous().view(-1),  ignore_index=LabelTransformer.PAD, weight = weight_ratio)


(6)网络结构的修改,主要将传统卷积替换为深度可分离卷积,减少网络各个block数目,将max pooling使用stride=2的卷积替换,整个网络中去掉前2个block里面的GC block,以及代码中一些不友好的实现的更改。这样的修改主要是基于速度和精度的考虑,同时结合自己车牌识别的任务,不需要特别复杂的resnet50这样的网络。

# -*- coding: utf-8 -*-
# @Author: Wenwen Yu
# @Created Time: 10/4/2020 14:19

from torch import nn
from model.context_block import MultiAspectGCAttention

def conv_3x3_bn(inp, oup, stride, padding=1):
    """Build a depthwise-separable replacement for a 3x3 convolution.

    The stack is: 1x1 pointwise conv -> BN -> 3x3 depthwise conv -> BN ->
    1x1 pointwise (linear) conv -> BN. All convolutions are bias-free.

    Args:
        inp: Number of input channels.
        oup: Number of output channels.
        stride: Stride of the 3x3 depthwise convolution (int or tuple).
        padding: Padding of the 3x3 depthwise convolution.

    Returns:
        An ``nn.Sequential`` with the six layers described above.
    """
    # NOTE(review): `expand_ratio` is read as a free name and is not defined in the
    # visible source; it must exist at module level before this function is called.
    # NOTE(review): as written there is no activation in this stack even though callers
    # store it under "..._relu" attribute names — confirm this is intended.
    # NOTE(review): PyTorch's BatchNorm momentum weights the *new* batch statistic, so 0.9
    # makes running stats follow the latest batch closely — confirm this is intended.
    hidden_dim = round(inp * expand_ratio)
    return nn.Sequential(
        # pw
        nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
        nn.BatchNorm2d(hidden_dim, momentum=0.9),
        # dw
        nn.Conv2d(hidden_dim, hidden_dim, 3, stride, padding, groups=hidden_dim, bias=False),
        nn.BatchNorm2d(hidden_dim, momentum=0.9),
        # pw-linear
        nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
        nn.BatchNorm2d(oup, momentum=0.9),
    )

class BasicBlock(nn.Module):
    """Residual block built from depthwise-separable convolutions.

    The residual branch is a pointwise/depthwise/pointwise stack followed by a
    depthwise/pointwise stack, optionally followed by a Multi-Aspect GC
    attention block. The input is added back unchanged, so there is no
    down-sampling shortcut: the block only works when the branch output has
    the same shape as the input (``inplanes == planes`` and ``stride == 1``).

    Args:
        inplanes: Number of input channels.
        planes: Number of output channels.
        stride: Stride of the first depthwise convolution.
        use_gcb: Whether to append a ``MultiAspectGCAttention`` block.
        gcb_config: Mapping with keys ``'ratio'``, ``'headers'``, ``'att_scale'``
            and ``'fusion_type'``; only read when ``use_gcb`` is true.
    """
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, use_gcb=False, gcb_config=None):
        super(BasicBlock, self).__init__()
        self.relu = nn.ReLU(inplace=True)

        # NOTE(review): `expand_ratio` is read as a free name and is not defined in the
        # visible source; it must exist at module level before the block is constructed.
        hidden_dim = round(inplanes * expand_ratio)
        self.conv1_bn1_relu = nn.Sequential(
            # pw
            nn.Conv2d(inplanes, hidden_dim, 1, 1, 0, bias=False),
            nn.BatchNorm2d(hidden_dim, momentum=0.9),
            # dw
            nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
            nn.BatchNorm2d(hidden_dim, momentum=0.9),
            # pw-linear
            nn.Conv2d(hidden_dim, planes, 1, 1, 0, bias=False),
            nn.BatchNorm2d(planes, momentum=0.9),
        )

        self.conv2_bn2 = nn.Sequential(
            # dw
            nn.Conv2d(planes, planes, 3, 1, 1, groups=planes, bias=False),
            nn.BatchNorm2d(planes, momentum=0.9),
            # pw-linear
            nn.Conv2d(planes, planes, 1, 1, 0, bias=False),
            nn.BatchNorm2d(planes, momentum=0.9),
        )

        self.stride = stride
        self.use_gcb = use_gcb

        if self.use_gcb:
            gcb_ratio = gcb_config['ratio']
            gcb_headers = gcb_config['headers']
            att_scale = gcb_config['att_scale']
            fusion_type = gcb_config['fusion_type']
            # NOTE(review): the argument lines of this call were missing; the keyword names
            # below are reconstructed from the four locals above — confirm against
            # MultiAspectGCAttention's signature.
            self.context_block = MultiAspectGCAttention(inplanes=planes,
                                                        ratio=gcb_ratio,
                                                        headers=gcb_headers,
                                                        att_scale=att_scale,
                                                        fusion_type=fusion_type)

    def forward(self, x):
        """Return ``relu(branch(x) + x)``; the output has the same shape as ``x``."""
        residual = x

        out = self.conv1_bn1_relu(x)
        out = self.conv2_bn2(out)

        if self.use_gcb:
            out = self.context_block(out)

        out += residual  # identity shortcut; requires matching shapes
        out = self.relu(out)

        return out

class ResNet(nn.Module):
    """Lightweight convolutional backbone made of depthwise-separable stages.

    Four ``conv_3x3_bn`` stages (strides 1, 2, 2 and (2, 1)) are followed by two
    residual stages (``layer3``, ``layer4``), each followed by another
    ``conv_3x3_bn``. The overall down-sampling is 8x in height and 4x in width.

    Args:
        block: Residual block class used by ``_make_layer``.
        layers: Per-stage block counts; only ``layers[2]`` and ``layers[3]`` are read.
        zero_init_residual: If true, zero the scale of the last BatchNorm in
            each residual branch so every block starts as an identity mapping.
        gcb: Global-context configuration forwarded to the blocks, or ``None``.
        in_channels: Number of channels of the input image.
    """

    def __init__(self, block, layers, zero_init_residual=False, gcb=None, in_channels=1):
        super(ResNet, self).__init__()
        gcb_config = gcb

        # NOTE(review): the continuation lines of the two `_make_layer` calls below were
        # missing. `use_gcb` is assumed to come from a per-stage list under
        # gcb_config['layers'] (the accompanying text says the GC blocks of the first two
        # stages were removed) — confirm. With no config, GC is disabled.
        gcb_layers = gcb_config['layers'] if gcb_config else (False, False, False, False)

        # NOTE(review): `ratio` (channel-width divisor) is read as a free name and is not
        # defined in the visible source; it must exist at module level.
        self.conv1_bn1_relu1 = conv_3x3_bn(in_channels, 64//ratio, 1, 1)

        self.conv2_bn2_relu2_maxpool = conv_3x3_bn(64//ratio, 128//ratio, 2, 1)

        self.conv3_bn3_relu3_maxpool = conv_3x3_bn(128//ratio, 256//ratio, 2, 1)

        self.conv4_bn4_relu4_maxpool = conv_3x3_bn(256//ratio, 512//ratio, (2,1), 1)

        self.layer3 = self._make_layer(block, 512//ratio, 512//ratio, layers[2], stride=1, gcb_config=gcb_config,
                                       use_gcb=gcb_layers[2])

        self.conv5_bn5_relu5 = conv_3x3_bn(512//ratio, 512//ratio, 1)

        self.layer4 = self._make_layer(block, 512//ratio, 512//ratio, layers[3], stride=1, gcb_config=gcb_config,
                                       use_gcb=gcb_layers[3])

        self.conv6_bn6_relu6 = conv_3x3_bn(512//ratio, 512//ratio, 1)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        if zero_init_residual:
            # `Bottleneck` is not defined in the visible source; look it up defensively so
            # this option no longer raises NameError when only BasicBlock exists.
            bottleneck_cls = globals().get('Bottleneck')
            for m in self.modules():
                if isinstance(m, BasicBlock):
                    # BasicBlock has no `bn2` attribute; the last BatchNorm of its residual
                    # branch is the final layer of `conv2_bn2`.
                    nn.init.constant_(m.conv2_bn2[-1].weight, 0)
                elif bottleneck_cls is not None and isinstance(m, bottleneck_cls):
                    nn.init.constant_(m.bn3.weight, 0)

    def _make_layer(self, block, inplanes, outplanes, blocks, stride=1, use_gcb=False, gcb_config=None):
        """Stack ``blocks`` residual blocks.

        Only the first block receives ``stride``, ``use_gcb`` and ``gcb_config``;
        the remaining blocks are built with the block's defaults.
        """
        layers = []
        layers.append(block(inplanes, outplanes, stride, use_gcb=use_gcb, gcb_config=gcb_config))
        for _ in range(1, blocks):
            layers.append(block(outplanes, outplanes))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Run the stages in order and return the final feature map."""
        x = self.conv1_bn1_relu1(x)

        x = self.conv2_bn2_relu2_maxpool(x)

        x = self.conv3_bn3_relu3_maxpool(x)

        x = self.conv4_bn4_relu4_maxpool(x)
        x = self.layer3(x)

        x = self.conv5_bn5_relu5(x) 
        x = self.layer4(x)

        x = self.conv6_bn6_relu6(x)
        return x

def resnet50(gcb_kwargs, in_channels=1):
    """Build the backbone: a ``ResNet`` of ``BasicBlock`` with stage depths [1, 1, 2, 1].

    Args:
        gcb_kwargs: Global-context configuration, passed through as ``gcb``.
        in_channels: Number of channels of the input image.

    Returns:
        The constructed ``ResNet`` instance.
    """
    stage_depths = [1, 1, 2, 1]
    return ResNet(BasicBlock, stage_depths, gcb=gcb_kwargs, in_channels=in_channels)

class ConvEmbeddingGC(nn.Module):
    """Convolutional feature extractor that emits a sequence of feature vectors.

    Args:
        gcb_kwargs: Global-context configuration forwarded to the backbone.
        in_channels: Number of channels of the input image.
    """

    def __init__(self, gcb_kwargs, in_channels=1):
        # nn.Module must be initialised before any submodule is assigned;
        # without this call the assignment below raises AttributeError.
        super(ConvEmbeddingGC, self).__init__()
        self.backbone = resnet50(gcb_kwargs, in_channels=in_channels)

    def forward(self, x):
        """Return backbone features flattened to shape ``(B, H*W, C)``."""
        feature = self.backbone(x)
        b, c, h, w = feature.shape  # (B, C, H/8, W/4)
        feature = feature.view(b, c, h * w)  # merge the two spatial axes
        feature = feature.permute((0, 2, 1))  # channels last: one vector per position
        return feature




            # Random flip augmentation, applied only when a transform is configured.
            if self.transform is not None:
                # Draws 0, 1 or 2 with equal probability; 0 leaves img unchanged.
                random_num = random.randint(0,2)
                if random_num ==1:
                    # Flip along dim 2 (left-right, assuming img is a (C, H, W) tensor — TODO confirm).
                    left_right_flip = torch.flip(img, [2])
                    img = left_right_flip
                elif random_num==2:
                    # Flip along dims 2 and 1 (left-right plus up-down under the same assumption).
                    left_right_up_down_flip = torch.flip(img, [2,1])
                    img = left_right_up_down_flip




  1. 基于自回归方式的识别,速度比ctc慢,效果比ctc好,尤其对于不规则文本效果更好。
  2. 模型训练的不好会出现一些字母重复预测的情况,一般出现在句子中间和句子末尾。
  3. 对于车牌识别来说,由于文本较短,速度影响较小,同时可以支持单行、双行车牌。