RDD2022 数据格式转换与清洗 |
您所在的位置:网站首页 › 数据格式转化英文 › RDD2022 数据格式转换与清洗 |
RDD2022数据集是关于道路损伤的数据集,与RDD2020相比增加了两万多张图片。但是由于其格式不能被YOLO直接使用,且其中有大量图片没有符合要求的标注,特写此文章记录数据清洗与格式转换的过程。 数据集下载:在开始前需要自己下载zip格式的RDD2022数据集,大小为12.4G(点击此处下载)。之后,在桌面上新建一个名为my_file的文件夹,将上面下载的压缩包(代码按文件名RDD2022.zip读取)和我的main.py都放进去,文件夹结构如下:
注意: 如果运行过程中出现任何bug使得程序没有进行到底,需要删除所有自动生成的文件,回到最开始的目录结构,重新开始运行main文件。如果报错说缺了什么包,自己去安装即可。注意设置工作目录为你自己新建的那个my_file文件夹,一般情况下默认就是这个,如果报错找不到目录啥的就看下是不是这个问题。按照个人需求,以国家为单位对数据集进行了train:val=7:3的切割。特别注意:代码中将没有任何标注的图片直接剔除了,这可能会对你的训练产生影响。 源代码main.py的代码如下: import zipfile import os import os import xml.etree.ElementTree as ET from shutil import copyfile import shutil import argparse from pathlib import Path import random from collections import defaultdict import random work_dir = os.getcwd() countries = ["China_Drone", "China_MotorBike", "Czech", "India", "Japan", "Norway", "United_States"] labels = ["D00", "D10", "D20", "D40"] # 解压最开始的12.4G的压缩包到工作目录 # 解压之后是一个名为 RDD2022_all_countries 的文件夹 def unzip_rdd2022(): path = os.path.join(work_dir, 'RDD2022.zip') zip_file = zipfile.ZipFile(path) zip_list = zip_file.namelist() for f in zip_list: zip_file.extract(f, work_dir) zip_file.close() # RDD2022_all_countries文件夹里面有6个以国家名称命名的压缩包 # 进入这个文件夹里面继续解压,注意是解压到了RDD2022_all_countries # 这个文件夹里面,至此所有的压缩文件解压完毕 def unzip_RDD2022_all_countries(): dir_path = os.path.join(work_dir, 'RDD2022_all_countries') all_countries_zip_file_name = os.listdir(dir_path) for name in all_countries_zip_file_name: print('正在解压{}'.format(name)) all_countries_zip_file_path = os.path.join(dir_path, name) zip_file = zipfile.ZipFile(all_countries_zip_file_path) zip_list = zip_file.namelist() for f in zip_list: zip_file.extract(f, dir_path) zip_file.close() print('{}已解压完成'.format(name)) # 将所有有标签的图片以及对应的标注移动到一个新的文件夹中 # 然后后续操作都是针对这些有标签的图片进行的,其实就是变相去除了 # 没有标签的图片 def remove_useless_file(): # 一共6个国家,一个国家一个国家的操作 RDD2022_all_countries_path = os.path.join(work_dir, 'RDD2022_all_countries') for country in countries: print("开始对 {} 的标签与图片进行操作".format(country)) annoFiles = os.listdir(os.path.join(RDD2022_all_countries_path, country + "/train/annotations/xmls/")) jpgFiles = os.listdir(os.path.join(RDD2022_all_countries_path, country + "/train/images/")) newCountry = "new_" + country # 在RDD2022_all_countries文件夹下面新建文件夹,new_countryname/Annotations 
# new_countryname/JPEGImages annotations_dir = os.path.join(RDD2022_all_countries_path, newCountry, 'Annotations/') jpegimages_dir = os.path.join(RDD2022_all_countries_path, newCountry, 'JPEGImages/') os.makedirs(annotations_dir, exist_ok=True) os.makedirs(jpegimages_dir, exist_ok=True) for annoFile in annoFiles: tree = ET.parse( os.path.join(RDD2022_all_countries_path + "/" + country + "/train/annotations/xmls/", annoFile)) root = tree.getroot() for obj in root.findall("object"): a = obj.find("name").text if a not in labels: root.remove(obj) if len(root.findall("object")) > 0: country_path = os.path.join(RDD2022_all_countries_path, country) newCountry_path = os.path.join(RDD2022_all_countries_path, newCountry) tree.write(newCountry_path + "/Annotations/" + annoFile) copyfile(os.path.join(country_path + "/train/images/", annoFile.split(".")[0]) + ".jpg", newCountry_path + "/JPEGImages/" + annoFile.split(".")[0] + ".jpg") else: # print(f'{annoFile} 没有标签文件') continue print("{} 的标签与图片操作完毕".format(country)) # 将所有的图片复制到工作目录下的new_train/jpegimages 文件夹下 # 将所有的标签复制到工作目录下的new_train/annotations 文件夹下 def copy_file_2_new_train_dir(): # 首先创建new_train文件夹 os.makedirs(work_dir + "new_train/", exist_ok=True) # 创建new_train文件夹下面的两个文件夹 jpeg_path = os.path.join(work_dir, 'new_train', 'jpegimages/') annotation_path = os.path.join(work_dir, 'new_train', 'annotations/') os.makedirs(jpeg_path, exist_ok=True) os.makedirs(annotation_path, exist_ok=True) RDD2022_all_countries_path = os.path.join(work_dir, 'RDD2022_all_countries') for country in countries: print("{}正在复制".format(country)) jpeg_dir_path = os.path.join(RDD2022_all_countries_path, 'new_' + country, 'JPEGImages') all_jpeg_names = os.listdir(jpeg_dir_path) annotation_dir_path = os.path.join(RDD2022_all_countries_path, 'new_' + country, 'Annotations') all_anno_names = os.listdir(annotation_dir_path) for name in all_jpeg_names: source = os.path.join(jpeg_dir_path, name) target = os.path.join(work_dir, 'new_train', 'jpegimages') 
shutil.copy(source, target) for name in all_anno_names: source = os.path.join(annotation_dir_path, name) target = os.path.join(work_dir, 'new_train', 'annotations') shutil.copy(source, target) print("{}复制完毕".format(country)) # 生成一个包含所有xml文件路径的txt文件以便 xml2yolo文件调用 def generate_txt_file(): annoFiles = os.listdir(os.path.join(work_dir, "new_train/Annotations/")) yoloFile = open("./xml2yolo_damage.txt", "w") for i in range(len(annoFiles)): yoloFile.writelines(work_dir + "/new_train/Annotations/" + annoFiles[i] + "\n") yoloFile.close() def xml2yolo(): import argparse import os import xml.etree.ElementTree as ET from PIL import Image from collections import defaultdict # Type of image in Dataset imageType = ["jpeg", "png", "jpg", "JPEG", "JPG", "PNG"] # dictionary to store list of image paths in each class imageListDict = defaultdict(set) def convert(size, box): dw = 1. / size[0] dh = 1. / size[1] x = (box[0] + box[1]) / 2.0 y = (box[2] + box[3]) / 2.0 w = box[1] - box[0] h = box[3] - box[2] x = x * dw w = w * dw y = y * dh h = h * dh return [x, y, w, h] # convert minX,minY,maxX,maxY to normalized numbers required by Yolo def getYoloNumbers(imagePath, minX, minY, maxX, maxY): image = Image.open(imagePath) w = int(image.size[0]) h = int(image.size[1]) b = (minX, maxX, minY, maxY) bb = convert((w, h), b) image.close() return bb def getFileList3(filePath): xmlFiles = [] with open(filePath, "r") as f: xmlFiles = f.readlines() for i in range(len(xmlFiles)): temp = xmlFiles[i].strip().rsplit('.', 1)[0] xmlFiles[i] = os.path.abspath(temp.replace("JPEGImages", "Annotations") + ".xml") labels_path = os.path.dirname(xmlFiles[i]).replace("Annotations", "labels") if not os.path.exists(labels_path): os.mkdir(labels_path) assert (os.path.exists(xmlFiles[i])) return xmlFiles def main(): parser = argparse.ArgumentParser(description='run phase2.') parser.add_argument('--input-file', type=str, help='location to the list of images/xml files(absolute path). 
sample file at "./xml2yolo_damagee.txt"', default='./xml2yolo_damage.txt') args = parser.parse_args() # assign each class of dataset to a number outputCtoId = {'D00': 0, 'D10': 1, 'D20': 2, 'D40': 3} # read the path of the directory where XML and images are present xmlFiles = getFileList3(args.input_file) print("total files:", len(xmlFiles)) print('正在转换......') # loop over each file under dirPath for file in xmlFiles: filePath = file # print(filePath) tree = ET.parse(filePath) root = tree.getroot() i = 0 imageFile = filePath[:-4].replace("Annotations", "JPEGImages") + "." + imageType[i] while (not os.path.isfile(imageFile) and i |
今日新闻 |
点击排行 |
|
推荐新闻 |
图片新闻 |
|
专题文章 |
CopyRight 2018-2019 实验室设备网 版权所有 win10的实时保护怎么永久关闭 |