RDD2022 数据格式转换与清洗

您所在的位置：网站首页 › 数据格式转化英文 › RDD2022 数据格式转换与清洗

RDD2022 数据格式转换与清洗

2024-07-16 22:42:07| 来源: 网络整理| 查看: 265

RDD2022 数据集是一个关于道路损伤的数据集，与 RDD2020 相比增加了两万多张图片。但是它的标注格式不能被 YOLO 直接使用，且其中有大量图片没有符合要求的标注，特写此文章记录数据清洗与格式化的过程。

数据集下载

在开始前需要自己下载zip格式的RDD2022数据集，大小为12.4G 点击此处下载

之后，在桌面上新建一个名为my_file 的文件夹，将上面下载的压缩包放进去，将我的main.py放进去，文件夹结构如下

（在这里插入图片描述）

在 PyCharm 中运行 main 文件即可。运行完成后 my_file 结构如下，其中的 my_data 就是你要的文件夹，其他的都没用：

注意事项

注意：

- 如果运行过程中出现任何 bug 使得程序没有进行到底，需要删除所有自动生成的文件，回到最开始的目录结构，重新开始运行 main 文件。
- 如果报错说缺了什么包，自己去安装即可。
- 注意设置工作目录为你自己新建的那个 my_file 文件夹，一般情况下默认就是这个；如果报错找不到目录啥的，就看下是不是这个问题。
- 按照个人需求，以国家为单位对数据集进行了 train:val=7:3 的切割。
- 特别注意：代码中将没有任何标注的图片直接剔除了，这可能会对你的训练产生影响。

源代码

main.py的代码如下:

import argparse
import os
import random
import shutil
import xml.etree.ElementTree as ET
import zipfile
from collections import defaultdict
from pathlib import Path
from shutil import copyfile

# All paths below are resolved against the directory the script is started in.
work_dir = os.getcwd()
# Per-country sub-folders of RDD2022_all_countries that are processed.
countries = [
    "China_Drone",
    "China_MotorBike",
    "Czech",
    "India",
    "Japan",
    "Norway",
    "United_States",
]
# Damage classes that are kept; objects with any other name are dropped.
labels = ["D00", "D10", "D20", "D40"]


def unzip_rdd2022() -> None:
    """Extract ``RDD2022.zip`` from the working directory into the working directory.

    According to the original author's notes the archive unpacks into a folder
    named ``RDD2022_all_countries``.

    Raises:
        FileNotFoundError: if ``RDD2022.zip`` is not present in ``work_dir``.
        zipfile.BadZipFile: if the file is not a valid zip archive.
    """
    path = os.path.join(work_dir, 'RDD2022.zip')
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(path) as zip_file:
        zip_file.extractall(work_dir)


def unzip_RDD2022_all_countries() -> None:
    """Extract every zip archive found in ``RDD2022_all_countries`` in place.

    Each archive is extracted into ``RDD2022_all_countries`` itself. Entries
    that are not zip archives (for example folders left over from an earlier
    run) are skipped instead of aborting the whole run.
    """
    dir_path = os.path.join(work_dir, 'RDD2022_all_countries')
    # The listing is taken once, before anything is extracted, so freshly
    # extracted folders are never visited.
    for name in os.listdir(dir_path):
        archive_path = os.path.join(dir_path, name)
        if not zipfile.is_zipfile(archive_path):
            continue
        print('正在解压{}'.format(name))
        with zipfile.ZipFile(archive_path) as zip_file:
            zip_file.extractall(dir_path)
        print('{}已解压完成'.format(name))


def remove_useless_file() -> None:
    """Copy every usefully-labelled training image and its annotation aside.

    For each entry of ``countries`` the XML files under
    ``<country>/train/annotations/xmls`` are parsed, ``<object>`` elements whose
    ``<name>`` is not in ``labels`` (or is missing) are removed, and files that
    still contain at least one object are written to
    ``new_<country>/Annotations`` with the matching ``.jpg`` copied to
    ``new_<country>/JPEGImages``. Images left without any object are thereby
    excluded from all later steps.

    Raises:
        FileNotFoundError: if an expected folder or the ``.jpg`` belonging to a
            kept annotation does not exist.
    """
    all_countries_path = os.path.join(work_dir, 'RDD2022_all_countries')
    for country in countries:
        print("开始对 {} 的标签与图片进行操作".format(country))
        xml_dir = os.path.join(all_countries_path, country, 'train', 'annotations', 'xmls')
        image_dir = os.path.join(all_countries_path, country, 'train', 'images')

        # Output folders: new_<country>/Annotations and new_<country>/JPEGImages.
        new_country_path = os.path.join(all_countries_path, 'new_' + country)
        annotations_dir = os.path.join(new_country_path, 'Annotations')
        jpegimages_dir = os.path.join(new_country_path, 'JPEGImages')
        os.makedirs(annotations_dir, exist_ok=True)
        os.makedirs(jpegimages_dir, exist_ok=True)

        for anno_file in os.listdir(xml_dir):
            tree = ET.parse(os.path.join(xml_dir, anno_file))
            root = tree.getroot()
            # findall() returns a list, so removing children while looping is safe.
            for obj in root.findall("object"):
                name_element = obj.find("name")
                if name_element is None or name_element.text not in labels:
                    root.remove(obj)
            if not root.findall("object"):
                # No usable label left: drop the annotation and its image.
                continue
            # splitext keeps dots inside the base name ("a.b.xml" -> "a.b").
            image_name = os.path.splitext(anno_file)[0] + ".jpg"
            tree.write(os.path.join(annotations_dir, anno_file))
            copyfile(os.path.join(image_dir, image_name),
                     os.path.join(jpegimages_dir, image_name))
        print("{} 的标签与图片操作完毕".format(country))


def copy_file_2_new_train_dir() -> None:
    """Merge all per-country ``new_<country>`` folders into ``new_train``.

    Images are copied to ``<work_dir>/new_train/JPEGImages`` and annotations to
    ``<work_dir>/new_train/Annotations``. The capitalised folder names match
    what the later steps of this script read (``new_train/Annotations/`` and
    the ``Annotations`` -> ``JPEGImages`` substitution), so the pipeline also
    works on case-sensitive filesystems.

    Raises:
        FileNotFoundError: if a ``new_<country>`` folder is missing, i.e.
            ``remove_useless_file`` has not been run.
    """
    jpeg_path = os.path.join(work_dir, 'new_train', 'JPEGImages')
    annotation_path = os.path.join(work_dir, 'new_train', 'Annotations')
    # makedirs creates the intermediate new_train folder as well.
    os.makedirs(jpeg_path, exist_ok=True)
    os.makedirs(annotation_path, exist_ok=True)

    all_countries_path = os.path.join(work_dir, 'RDD2022_all_countries')
    for country in countries:
        print("{}正在复制".format(country))
        jpeg_dir_path = os.path.join(all_countries_path, 'new_' + country, 'JPEGImages')
        annotation_dir_path = os.path.join(all_countries_path, 'new_' + country, 'Annotations')
        # List both folders before copying so a missing folder fails early.
        all_jpeg_names = os.listdir(jpeg_dir_path)
        all_anno_names = os.listdir(annotation_dir_path)
        for name in all_jpeg_names:
            shutil.copy(os.path.join(jpeg_dir_path, name), jpeg_path)
        for name in all_anno_names:
            shutil.copy(os.path.join(annotation_dir_path, name), annotation_path)
        print("{}复制完毕".format(country))


# 
生成一个包含所有xml文件路径的txt文件以便 xml2yolo文件调用 def generate_txt_file(): annoFiles = os.listdir(os.path.join(work_dir, "new_train/Annotations/")) yoloFile = open("./xml2yolo_damage.txt", "w") for i in range(len(annoFiles)): yoloFile.writelines(work_dir + "/new_train/Annotations/" + annoFiles[i] + "\n") yoloFile.close() def xml2yolo(): import argparse import os import xml.etree.ElementTree as ET from PIL import Image from collections import defaultdict # Type of image in Dataset imageType = ["jpeg", "png", "jpg", "JPEG", "JPG", "PNG"] # dictionary to store list of image paths in each class imageListDict = defaultdict(set) def convert(size, box): dw = 1. / size[0] dh = 1. / size[1] x = (box[0] + box[1]) / 2.0 y = (box[2] + box[3]) / 2.0 w = box[1] - box[0] h = box[3] - box[2] x = x * dw w = w * dw y = y * dh h = h * dh return [x, y, w, h] # convert minX,minY,maxX,maxY to normalized numbers required by Yolo def getYoloNumbers(imagePath, minX, minY, maxX, maxY): image = Image.open(imagePath) w = int(image.size[0]) h = int(image.size[1]) b = (minX, maxX, minY, maxY) bb = convert((w, h), b) image.close() return bb def getFileList3(filePath): xmlFiles = [] with open(filePath, "r") as f: xmlFiles = f.readlines() for i in range(len(xmlFiles)): temp = xmlFiles[i].strip().rsplit('.', 1)[0] xmlFiles[i] = os.path.abspath(temp.replace("JPEGImages", "Annotations") + ".xml") labels_path = os.path.dirname(xmlFiles[i]).replace("Annotations", "labels") if not os.path.exists(labels_path): os.mkdir(labels_path) assert (os.path.exists(xmlFiles[i])) return xmlFiles def main(): parser = argparse.ArgumentParser(description='run phase2.') parser.add_argument('--input-file', type=str, help='location to the list of images/xml files(absolute path). 
sample file at "./xml2yolo_damagee.txt"', default='./xml2yolo_damage.txt') args = parser.parse_args() # assign each class of dataset to a number outputCtoId = {'D00': 0, 'D10': 1, 'D20': 2, 'D40': 3} # read the path of the directory where XML and images are present xmlFiles = getFileList3(args.input_file) print("total files:", len(xmlFiles)) print('正在转换......') # loop over each file under dirPath for file in xmlFiles: filePath = file # print(filePath) tree = ET.parse(filePath) root = tree.getroot() i = 0 imageFile = filePath[:-4].replace("Annotations", "JPEGImages") + "." + imageType[i] while (not os.path.isfile(imageFile) and i

【本文地址】

公司简介

联系我们

今日新闻

点击排行

实验室常用的仪器、试剂和: 说到实验室常用到的东西，主要就分为仪器、试剂和耗

不用再找了，全球10大实验: 01、赛默飞世尔科技（热电）Thermo Fisher Scientif

三代水柜的量产巅峰T-72坦: 作者：寞寒最近，西边闹腾挺大，本来小寞以为忙完这

通风柜跟实验室通风系统有: 说到通风柜跟实验室通风，不少人都纠结二者到底是不

集消毒杀菌、烘干收纳为一: 厨房是家里细菌较多的地方，潮湿的环境、没有完全密

实验室设备之全钢实验台如: 全钢实验台是实验室家具中较为重要的家具之一，很多

图片新闻

实验室药品柜的特性有哪些: 实验室药品柜是实验室家具的重要组成部分之一，主要

小学科学实验中有哪些教学: 计算机计算器一般打孔器打气筒仪器车显微镜

实验室各种仪器原理动图讲: 1.紫外分光光谱UV分析原理：吸收紫外光能量，引起分

高中化学常见仪器及实验装: 1、可加热仪器：2、计量仪器：（1）仪器A的名称：量

微生物操作主要设备和器具: 今天盘点一下微生物操作主要设备和器具，别嫌我啰嗦

浅谈通风柜使用基本常识: 　众所周知，通风柜功能中最主要的就是排气功能。在

RDD2022 数据格式转换与清洗

RDD2022 数据格式转换与清洗

今日新闻

点击排行

推荐新闻

图片新闻

专题文章