数据处理与数据集制备

目录


一、图片批量重命名

文件夹结构:

  • source
    • animals
      • cat
      • cattle
      • dog
      • horse
      • pig
    • fruits
      • apple
      • banana
      • durian
      • grape
      • orange
    • vehicles
      • bus
      • car
      • plane
      • ship
      • train
  • 实现代码:
    import os
    import cv2 as cv
    
    sourcePath  = '../../../DataSet/source/'

    animalPath  = sourcePath + 'animals/'
    fruitPath   = sourcePath + 'fruits/'
    vehiclePath = sourcePath + 'vehicles/'


    def _rename_class_images(category_path):
        """Rename every file below ``category_path`` to ``<class>_<n>.jpg``.

        ``category_path`` is a directory path ending in ``/`` whose
        sub-directories are the class folders (e.g. ``apple``, ``banana``).
        Files inside each class folder are numbered from 1 in sorted name
        order, so the result is reproducible across runs and file systems.

        The rename is done in two passes. Renaming straight to the final name
        can overwrite a file that has not been processed yet (for instance
        when the script is re-run on already renamed folders), which silently
        destroys images on POSIX and raises on Windows.
        """
        for class_name in sorted(os.listdir(category_path)):
            subfolder = category_path + class_name + '/'
            if not os.path.isdir(subfolder):
                # Stray files next to the class folders are not class folders.
                continue

            file_names = sorted(
                name for name in os.listdir(subfolder)
                if os.path.isfile(subfolder + name)
            )

            # Pass 1: park every file under a temporary name that cannot
            # collide with any "<class>_<n>.jpg" target.
            staged = []
            for count, file_name in enumerate(file_names, start=1):
                staging_name = subfolder + "__rename_tmp_%d_%d" % (os.getpid(), count)
                os.rename(subfolder + file_name, staging_name)
                staged.append((subfolder + file_name, staging_name))

            # Pass 2: move the parked files to their final sequential names.
            for count, (old_name, staging_name) in enumerate(staged, start=1):
                new_name = subfolder + "%s_%d.jpg" % (class_name, count)
                print(old_name, "====>", new_name)
                os.rename(staging_name, new_name)


    # Same processing order as before: fruits, animals, vehicles.
    for category_path in (fruitPath, animalPath, vehiclePath):
        _rename_class_images(category_path)
    
    
    
    

实现效果:


二、数据增强+高斯加噪

比赛时采用摄像头拍摄图像进行识别,拍摄得到的图像包含很多噪点,而原数据集是无噪声的原图。例如,一张狗的照片在拍摄时受室内光线影响,出现很多红色噪点,就容易被误识别为苹果。去年出现过这种情况,因此今年有针对性地对数据集进行加噪处理。

  • 实现代码:
    # 图像加噪
    # 实际拍摄的图片会存在很多噪声
    # 与计算机原图差距很大
    # 因此需要对原图进行加噪处理
    
    import numpy as np
    import cv2   as cv
    import os
    from keras.preprocessing.image import ImageDataGenerator
    from keras.preprocessing.image import img_to_array
    from keras.preprocessing.image import load_img
    
    
    # 加噪函数
    def Gasuss_Noise(image, mean=0, var=0.001):
        '''
            Add Gaussian noise to an image.

            image: path of the image file to read with OpenCV, or an
                   already-loaded numpy array with pixel values in 0-255
            mean : mean of the noise (in normalised 0-1 pixel units)
            var  : variance of the noise; larger values give stronger noise

            Returns [noise, out]: the noise field scaled to the 0-255 range
            (float array) and the noisy image as a uint8 array of the same
            shape as the input.

            Raises ValueError when the image file cannot be read.
        '''
        if isinstance(image, np.ndarray):
            pixels = image
        else:
            pixels = cv.imread(image)
            if pixels is None:
                # cv.imread reports a missing/corrupt file by returning None.
                raise ValueError("cannot read image: %r" % (image,))

        # Normalise to 0-1 so that mean/var are independent of the bit depth.
        pixels = np.asarray(pixels, dtype=float) / 255
        noise = np.random.normal(mean, var ** 0.5, pixels.shape)

        # Always clamp to [0, 1]. Letting negative values through would make
        # the uint8 conversion below wrap them round to bright pixels.
        out = np.clip(pixels + noise, 0.0, 1.0)
        out = np.uint8(out * 255)

        return [noise * 255, out]
    
    
    # 图像增强函数
    def Date_Enhancement(img_input_path, img_output_path):
        """Write one randomly augmented, darkened copy of an image to disk.

        img_input_path : path of the image to augment
        img_output_path: directory that receives the generated ``image_*.jpg``
                         file; it is created when missing

        The image is first darkened to 60 % brightness and then passed once
        through a Keras ``ImageDataGenerator`` (small rotation, shift, shear
        and brightness jitter). Nothing is returned; the generator saves the
        sample itself because ``save_to_dir`` is set.
        """
        # Keras saves straight into save_to_dir and does not create it.
        if img_output_path:
            os.makedirs(img_output_path, exist_ok=True)

        image = load_img(img_input_path)

        # Darken every channel value to 60 % of its original brightness.
        darkened = image.point(lambda p: p * 0.6)

        # flow() expects a batch, so add a leading batch axis.
        batch = np.expand_dims(img_to_array(darkened), axis=0)

        img_dag = ImageDataGenerator(
            rotation_range=10,
            width_shift_range=0.001,   # horizontal offset
            height_shift_range=0.001,  # vertical offset
            shear_range=0.02,
            # zoom_range=[0.6, 0.9],
            brightness_range=[0.9, 1.1],
            horizontal_flip=False,     # horizontal flipping stays disabled
            fill_mode="constant", cval=40
        )

        img_generator = img_dag.flow(batch,
                                     batch_size=1,
                                     save_to_dir=img_output_path,
                                     save_prefix="image",
                                     save_format="jpg")

        # Pulling a single batch generates (and saves) exactly one sample.
        next(img_generator)
    
    
    ####################################################################
    
    # Source directories (original images)
    sourcePath  = '../../../DataSet/source/'
    animalPath  = sourcePath + 'animals/'
    fruitPath   = sourcePath + 'fruits/'
    vehiclePath = sourcePath + 'vehicles/'

    # Intermediate directories
    # They hold the images produced by the first step (augmentation)
    middlePath        = '../../../DataSet/middle/'
    animalMiddlePath  = middlePath + 'animals/'
    fruitMiddlePath   = middlePath + 'fruits/'
    vehicleMiddlePath = middlePath + 'vehicles/'

    # Target directories
    # They hold the images produced by the second step (Gaussian noise)
    processPath        = '../../../DataSet/process/'
    animalProcessPath  = processPath + 'animals/'
    fruitProcessPath   = processPath + 'fruits/'
    vehicleProcessPath = processPath + 'vehicles/'


    def _augment_and_add_noise(source_dir, middle_dir, process_dir):
        """Augment and then add noise to every class folder below ``source_dir``.

        All three arguments are directory paths ending in ``/``. For each
        class folder, every source image is augmented 10 times into the
        matching folder below ``middle_dir``. Every file found there is then
        passed through ``Gasuss_Noise`` and written to the matching folder
        below ``process_dir`` as ``<class>_<n>.jpg``, numbered from 1.
        """
        for folder_name in sorted(os.listdir(source_dir)):
            sub_folder = source_dir + folder_name + '/'
            if not os.path.isdir(sub_folder):
                continue

            middle_save_path = middle_dir + folder_name + '/'
            final_save_path = process_dir + folder_name + '/'

            # cv.imwrite does not create missing directories, so make sure
            # both output folders exist before anything is written.
            os.makedirs(middle_save_path, exist_ok=True)
            os.makedirs(final_save_path, exist_ok=True)

            # Step 1: data augmentation, 10 variants per source image
            for raw_pic in sorted(os.listdir(sub_folder)):
                raw_pic_path = sub_folder + raw_pic
                for _ in range(10):
                    Date_Enhancement(raw_pic_path, middle_save_path)

            print(folder_name, "Enhance Done")

            # Step 2: Gaussian noise on every augmented image
            rename_count = 1
            for middle_pic in sorted(os.listdir(middle_save_path)):
                middle_save_name = middle_save_path + middle_pic
                final_save_name = final_save_path + "%s_%d.jpg" % (folder_name, rename_count)

                noise, out = Gasuss_Noise(middle_save_name, mean=0, var=0.002)

                if not cv.imwrite(final_save_name, out):
                    # Report the failure and leave the counter untouched so
                    # the numbering has no gaps.
                    print("Failed to write", final_save_name)
                    continue
                rename_count = rename_count + 1

            print(folder_name, "Process Done")


    # Same processing order as before: fruits, animals, vehicles.
    for _source_dir, _middle_dir, _process_dir in (
            (fruitPath, fruitMiddlePath, fruitProcessPath),
            (animalPath, animalMiddlePath, animalProcessPath),
            (vehiclePath, vehicleMiddlePath, vehicleProcessPath)):
        _augment_and_add_noise(_source_dir, _middle_dir, _process_dir)
    

三、数据集制备

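【说明】Python 的 `os.listdir` 并不保证返回顺序(取决于文件系统),需要先用 `sorted()` 排序,才是按名称的字母顺序遍历。按字母顺序遍历时,标签的顺序为:0–4 为 animals(cat、cattle、dog、horse、pig),5–9 为 fruits(apple、banana、durian、grape、orange),10–14 为 vehicles(bus、car、plane、ship、train)。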

  • 实现代码:
    import os
    import cv2 as cv
    import numpy as np
    
    make = True    # build pic.npy / label.npy from the processed images
    check = True   # display every stored sample together with its label

    # Root directory of the processed images
    picPath        = '../../../DataSet/process/'

    if __name__ == "__main__":

        if make:
            all_data = []
            all_label = []

            # Class index. It is incremented once per class folder, visiting
            # categories and classes in sorted order. os.listdir gives no
            # ordering guarantee, and the label list used by the check step
            # below relies on alphabetical order.
            i = -1
            for fruit_animal_vehicle in sorted(os.listdir(picPath)):
                category_dir = picPath + fruit_animal_vehicle + '/'
                if not os.path.isdir(category_dir):
                    continue

                for apple_cat_bus in sorted(os.listdir(category_dir)):
                    class_dir = category_dir + apple_cat_bus + '/'
                    if not os.path.isdir(class_dir):
                        # A stray file must not consume a label index.
                        continue

                    i = i + 1
                    for pic in sorted(os.listdir(class_dir)):
                        if os.path.splitext(pic)[-1] != '.jpg':
                            continue

                        # cv.imread returns BGR data, or None when the file
                        # cannot be decoded; skip such files.
                        img = cv.imread(class_dir + pic)
                        if img is None:
                            continue

                        img = cv.resize(img, (32, 32))[..., (2, 1, 0)]  # BGR -> RGB
                        all_data.append(img)
                        all_label.append(i)

            all_data  = np.asarray(all_data)
            all_label = np.asarray(all_label)

            # np.save does not create missing directories.
            os.makedirs("../../../DataSet/dataSet", exist_ok=True)
            np.save("../../../DataSet/dataSet/pic", all_data)
            np.save("../../../DataSet/dataSet/label", all_label)

        if check:
            x = np.load("../../../DataSet/dataSet/pic.npy")
            y = np.load("../../../DataSet/dataSet/label.npy")

            # Must match the sorted traversal order used when building the
            # dataset: animals, fruits, vehicles, each in alphabetical order.
            label = ["cat", "cattle", "dog", "horse", "pig"] \
                  + ["apple", "banana", "durian", "grape", "orange"] \
                  + ["bus", "car", "plane", "ship", "train"]

            for count, (d, idx) in enumerate(zip(x, y)):
                print("Class %s %d" % (label[idx], count))
                # Enlarge for viewing and convert RGB back to BGR for imshow.
                d = cv.resize(d, (395, 395))[..., (2, 1, 0)]
                cv.imshow("img", d)
                cv.waitKey(0)  # wait for a key press before the next sample

            cv.destroyAllWindows()