网站首页 > 博客 > 正文

【数据集分类】使用python按照JSON标签对数据集进行分类

桂花香博客 2024-02-05 1 0

我们所使用的数据集为IStego100K，训练集中包含100000张图像，其中包含33404张用nsf5嵌密的图片，需要根据数据集提供的标签数据将其提取出来。

标签大致如下：

parameters={
    "000001.jpg":{ # parameters for stego-file
        "quality": 95, # quality factor
        "rate": 0.4, # embedding rate (payload)
        "steg_algorithm": "nsf5" # steganographic algorithm
    },
    "000002.jpg":{ # parameters for cover-file
        "quality": 90 # quality factor
    }
}

首先我们需要借助工具将json数据转化为csv文件，代码如下：

import sys

import json

import csv

import xlsxwriter

# Module-level names kept from the original script; nothing in the visible
# code reads `out`, `arg` or `i`.
out = []
arg = ""
i = 0
verbose = False


def print_usage():
    """Print the command-line help text to stdout."""
    print("Correct usage is : python jsontoexcel.py [-v] input.json [output]")
    print("Example : python jsontoexcel.py myfile.json")
    print("will output two files: myfile.csv myfile.xlsx\n")
    print("Whereas : python jsontoexcel.py myfile.json output")
    print("will output two files: output.csv output.xlsx")
    print("you can use the -v flag for verbose output")


def parse_cli_args(argv):
    """Parse the command line into ``(verbose, input_path, output_base)``.

    Args:
        argv: Full argument vector, including the program name at index 0.
            The list is not modified.

    Returns:
        A tuple ``(verbose, input_path, output_base)``. ``output_base`` is the
        output file name without extension; when it is not given on the
        command line it is derived from ``input_path`` by dropping the final
        extension.

    Raises:
        SystemExit: With code -1, after printing the usage text, when the
            number of positional arguments is not 1 or 2.
    """
    import os  # local import: the script's import block does not include os

    # Skip argv[0] and match the flag exactly, so a script path or a file
    # name that merely contains "-v" is not mistaken for the flag.
    options = argv[1:]
    verbose_flag = "-v" in options
    positional = [item for item in options if item != "-v"]

    if len(positional) not in (1, 2):
        print_usage()
        sys.exit(-1)

    input_path = positional[0]
    if len(positional) == 2:
        output_base = positional[1]
    else:
        # splitext() drops only the last extension and copes with paths such
        # as "./x.json", where splitting on the first "." gives an empty stem.
        output_base = os.path.splitext(input_path)[0]
        if not os.path.dirname(output_base):
            output_base = "./" + output_base
    return verbose_flag, input_path, output_base


# Parse the input parameters only when executed as a script.
if __name__ == "__main__":
    verbose, fileDir, outFile = parse_cli_args(sys.argv)

#flattens a tree object consisted of dictionaries and lists

def flatten_json(y):
    """Flatten each entry of a JSON root object into one row of a table.

    Every value of a dict root, or every item of a list root, is flattened
    with ``flatten`` into a list of leaf values. The keys of a dict root are
    not written to the table; only their values are.

    Args:
        y: The decoded JSON document.

    Returns:
        A list of rows. The first row is the header: the longest list of
        leaf paths found among the entries. The remaining rows hold the leaf
        values of each entry, in input order. An empty list is returned when
        the root has no entries or is neither a dict nor a list.
    """
    print("flattening json file recursivelly")

    if isinstance(y, dict):
        entries = list(y.values())
    elif isinstance(y, list):
        entries = y
    else:
        entries = []

    rows = []
    headers = []
    # Flatten each row of the root container.
    for entry in entries:
        values, paths, _leaf_count = flatten(entry, ' ')
        if verbose:
            print("Sub tree:" + str(values))
        headers.append(paths)
        rows.append(values)

    # max() raises ValueError on an empty sequence, so stop early.
    if not rows:
        return []

    # The entry with the most leaves supplies the header row.
    header = max(headers, key=len)
    if verbose:
        print("labels:" + str([header]))

    table = [header] + rows
    if verbose:
        print(table)
    return table

labels = []


# Explore a tree recursively and flatten it into parallel lists.
def flatten(x, name):
    """Collect the leaves of a nested dict/list structure.

    Args:
        x: The node to walk: a dict, a list, or any other value (a leaf).
        name: Path prefix accumulated so far; each dict key or list index
            visited on the way down is appended, followed by '/'.

    Returns:
        A tuple ``(values, paths, leaf_count)`` where ``values`` are the leaf
        values in traversal order, ``paths`` are the matching path strings
        and ``leaf_count`` is the number of leaves found.
    """
    if type(x) is dict:
        children = [(key, x[key]) for key in x]
    elif isinstance(x, list):
        children = [(str(position), item) for position, item in enumerate(x)]
    else:
        # Anything that is not a container is a leaf.
        return [x], [name], 1

    values = []
    paths = []
    leaf_count = 0
    for segment, child in children:
        child_values, child_paths, child_leaves = flatten(child, name + segment + '/')
        values.extend(child_values)
        paths.extend(child_paths)
        leaf_count += child_leaves
    return values, paths, leaf_count

# Load the JSON file. json.load() accepts newlines between tokens, so the
# text does not need to be stripped first.
print("Loading json file")
with open(fileDir, encoding='utf-8') as file:
    all_data = json.load(file)

# Dumping the whole document is only useful for debugging.
if verbose:
    print(all_data)

# Flatten the data into a header row followed by one row per entry.
flat = flatten_json(all_data)

# Create the csv file; the with-block closes it even if a write fails.
print("Saving data as "+outFile+".csv")
with open(outFile + ".csv", 'w', newline='') as data_csv:
    # The "SEP=," line is written ahead of the header row.
    data_csv.write('SEP=,\n')
    csv.writer(data_csv).writerows(flat)

# Save the same table as xlsx; the with-block closes the workbook.
print("Saving data as "+outFile +'.xlsx')
with xlsxwriter.Workbook(outFile + '.xlsx') as workbook:
    worksheet = workbook.add_worksheet()
    bold = workbook.add_format({'bold': True})
    for r, row in enumerate(flat):
        for c, col in enumerate(row):
            if r == 0:
                # Header row is written in bold.
                worksheet.write(r, c, col, bold)
            else:
                worksheet.write_string(r, c, str(col))

print("Successfully created files:"+outFile +'.xlsx , ' + outFile+".csv" )

运行命令 python JsonToExcel.py 标签文件.json train（第一个参数为JSON标签文件，第二个参数为输出文件名前缀，可省略）

此时生成如下csv文件

再通过关键字匹配找到steg_algorithm=“nsf5”的图像id，将其对应图像提取出来，代码如下：

import os

import shutil

import pandas as pd

import random

def move_images_by_algorithm(csv_path, image_dir, dest_dir, algorithm="nsf5"):
    """Move the images labelled with *algorithm* from image_dir into dest_dir.

    Args:
        csv_path: Path of the label table; it must have the columns
            ``id`` and ``steg_algorithm``.
        image_dir: Directory holding the images, named as the six-digit,
            zero-padded id followed by ``.jpg``.
        dest_dir: Directory that receives the selected images; it is created
            when it does not exist.
        algorithm: Value of ``steg_algorithm`` to select.

    Returns:
        The file names that were moved, in table order.
    """
    table = pd.read_csv(csv_path)
    selected_ids = table.loc[table["steg_algorithm"] == algorithm, "id"].tolist()

    # Without an existing directory shutil.move() would rename the first
    # image to dest_dir itself instead of moving it into a folder.
    os.makedirs(dest_dir, exist_ok=True)

    moved = []
    for image_id in selected_ids:
        # Six digits, zero padded, to match the image file names.
        stem = '{:06d}'.format(int(image_id))
        print(stem)
        file_name = stem + '.jpg'
        source = os.path.join(image_dir, file_name)
        if not os.path.isfile(source):
            # Lets the script be re-run after a partial move.
            print("skipped (not found): " + source)
            continue
        shutil.move(source, dest_dir)
        moved.append(file_name)
    return moved


if __name__ == "__main__":
    move_images_by_algorithm(
        "C:/Users/hp/PycharmProjects/pythonProject2/train.csv",
        'D:/实验资源/IStego100K/',
        'D:/实验资源/IStego100K/' + '1',
    )
    print("完成")

任务完成！

好文链接

评论可见，请评论后查看内容，谢谢！！！评论后请刷新页面。

本文由用户于 2024-02-05 发布在夸智网，如有疑问，请联系我们。
本文链接：https://www.kuazhi.com/post/713091128.html

夸智网

【数据集分类】使用python按照JSON标签对数据集进行分类

git switch 命令详解

javascript gpt支持json格式的数据返回（response

发表评论取消回复

夸智网

【数据集分类】使用python按照JSON标签对数据集进行分类

git switch 命令详解

javascript gpt支持json格式的数据返回（response

相关文章

发表评论取消回复