开发者社区> 问答> 正文

如何替换JSON文件?(关闭)

我有一个JSON文件,如

[
  {
    "id": 1118690,
    "special": "<p>a。</p><p>bbb。</p><p>"cccc"</p>"
  },
  {
    "id": 2,
    "special": "<a>aabbcc,"ddb"</a>"
  }
]

我想替换为JSON对象。但在文件中,要有双引号。 问题来源StackOverflow 地址:/questions/59467174/how-to-replace-json-file

展开
收起
kun坤 2019-12-25 09:31:51 510 0
1 条回答
写回答
取消 提交回答
  • 以下是项目过程中遇到的一些数据清洗,预处理和统计等常规操作,有需要的可以自行借鉴下相关模块代码,转载或引用请注明。

    ###################################################################################

    # -*- coding:utf-8 -*-

    author = 'Jack'

    date = 2019/07/09

    目录下所需文件:

    1- menu.xlsx: 存放菜单的Id、Name、Price三个属性

    2- origin.json: 存放原始标注图片的标注信息

    3- modify.json: 待检查的json文件

    操作文档说明:

    1- 拿到文件先抽样检查,确保整体无误,并更新 menu.xlsx

    2- 将新增图片放入到not_rename_file文件夹下

    3- 将相应的json文件命名为modify.json放到与data_process.py文件同级的目录下

    4- 修改相应的超参数设置(INDEX_START),运行以下程序:

    modify = Modify()

    modify.rename_key() # 修改json文件相应属性名

    modify.rename_file_name() # 修改图片文件名

    # modify.remark_label() # 如需修改标签值则调用此函数

    Check().check_label()

    5- 运行结束后检查origin.json是否有新增标注数据信息,若出现错误,则根据提示进行修正再次重新运行

    6- 重复步骤1-5,直至将所有批次的新增数据处理完毕,跳转到7-

    7- 在主程序中将第4-调用到的所有程序注释掉,确保不被运行到

    8- 修改相应的超参数(MORE_THAN_NUM),并运行以下程序:

    operate = Operate()

    operate.shuffler_data()

    label_name, _ = operate.feature_map()

    number_per_food_train, number_per_food_val, number_per_food_test, nv, nt = operate.counts(label_name)

    choose_label = operate.visualize(number_per_food_train, number_per_food_val, number_per_food_test, label_name, nv, nt)

    train_list, val_list, test_list = operate.get_label(choose_label)

    operate.delete_photo('./train', train_list)

    operate.delete_photo('./val', val_list)

    operate.delete_photo('./test', test_list)

    执行完以上程序后,需记录“样本数超过?的标签值有?种,即[...]”这两个数据(外加中文标签),并更新 train/val/test 三个文件夹

    训练程序配置

    (1)在caipinshibie.py文件搜索changes修改相应参数

    (2)在mrcnn文件夹下的model.py文件定位到1711行修改相应的augmentation为None或True

    (3)运行训练文件

    ###################################################################################
    import json
    import xmltodict
    import pandas as pd
    from collections import OrderedDict
    import matplotlib.pyplot as plt
    import os
    import shutil
    import datetime
    import warnings

    # System settings
    plt.rcParams['font.sans-serif'] = ['SimHei']  # font setting so Chinese text is not garbled in plots
    plt.rcParams['axes.unicode_minus'] = False  # make the minus sign display normally
    warnings.filterwarnings("ignore")  # ignore warnings

    # Hyper-parameters
    MODIFY_JSON_NAME = 'modify.json'  # JSON file waiting to be checked
    ORIGIN_JSON_NAME = 'origin.json'  # JSON file that already exists
    OUTPUT_JSON_NAME = 'output.json'  # JSON file obtained after filtering
    TRAIN_VAL_DIVIDE_JSON_NAME = 'via_region_data.json'  # JSON file produced by the train/test split
    MENU_EXCEL_NAME = 'menu.xlsx'  # menu spreadsheet; must be kept up to date
    MODIFY_FILE_PATH = './not_rename_file/'  # folder of images whose names still have to be changed
    MOVE_FILE_PATH = './json_file/'  # folder holding the images that are already processed
    OUTPUT_FILE_PATH = './output_file/'  # folder holding the images kept after filtering

    INDEX_START = 955  # first sequence number given to newly added images
    MORE_THAN_NUM = 70  # occurrence threshold for keeping a dish (visualize() compares with >=)

    class Modify(object): """Helpers for renaming files and annotation attributes."""

    ##############################################################################
    # rename_file_name(): batch-rename the image files
    # rename_key(): rename the keys and the `filename` values in the JSON file
    # remark_label(): rewrite label values
    ##############################################################################
    
    @staticmethod
    def rename_file_name():
        """Rename every entry of MODIFY_FILE_PATH to ``IMG_<nnnn>.jpg``.

        Numbering starts at INDEX_START and grows by one per directory entry,
        in the order ``os.listdir`` returns them. Each rename is echoed to
        stdout.

        NOTE(review): ``os.listdir`` order is arbitrary, and a target name that
        already exists in the folder would collide with the rename; confirm
        the folder only holds images that have not been renamed yet.
        """
        for offset, current in enumerate(os.listdir(MODIFY_FILE_PATH)):
            target = 'IMG_' + str(INDEX_START + offset).zfill(4) + '.jpg'
            os.rename(
                os.path.join(MODIFY_FILE_PATH, current),
                os.path.join(MODIFY_FILE_PATH, target),
            )
            print(current, "========>", target)
        print("Successful to rename filename!")
    
    @staticmethod
    def rename_key():
        """Re-key MODIFY_JSON_NAME as ``IMG_<nnnn>.jpg`` and rewrite it in place.

        Entries keep their file order; the first one gets INDEX_START and each
        following one the next number. The entry's ``filename`` field is set
        to the same value as its new key.
        """
        with open(MODIFY_JSON_NAME, 'r') as source:
            entries = json.load(source)

        renamed = {}
        for number, entry in enumerate(entries.values(), start=INDEX_START):
            new_key = "IMG_" + str(number).zfill(4) + '.jpg'
            entry['filename'] = new_key
            renamed[new_key] = entry

        with open(MODIFY_JSON_NAME, 'w') as target:
            target.write(json.dumps(renamed))
    
    @staticmethod
    def remark_label():
        """Replace label '1315' with '0812' throughout ORIGIN_JSON_NAME.

        Regions are looked up by the string keys "0".."n-1". For every region
        that is changed, characters 4-7 of the image key are printed. The key
        order is printed before and after, and the file is rewritten in place.
        """
        with open(ORIGIN_JSON_NAME, 'r') as source:
            annotations = json.load(source)
        print("修改前的字典顺序:\n", annotations.keys())

        for image_key, record in annotations.items():
            regions = record['regions']
            for position in range(len(regions)):
                attributes = regions[str(position)]['region_attributes']
                if attributes['label'] == '1315':
                    attributes['label'] = '0812'
                    print(image_key[4: 8])

        with open(ORIGIN_JSON_NAME, 'w') as target:
            print("修改后的字典顺序:\n", annotations.keys())
            target.write(json.dumps(annotations))
    

    class Check(object): """Validate annotations and, when they are correct, merge them into the original JSON file."""

    #######################################################################################
    # move_file(): move the accepted images into the main image folder
    # check_label(): check that every label has the three-character format and is in range
    #######################################################################################
    
    @staticmethod
    def move_file():
        """Move every entry of MODIFY_FILE_PATH into MOVE_FILE_PATH."""
        for entry in os.listdir(MODIFY_FILE_PATH):
            source = MODIFY_FILE_PATH + entry
            shutil.move(source, MOVE_FILE_PATH)
    
    @staticmethod
    def check_label():
        """Validate the labels in MODIFY_JSON_NAME and merge it into ORIGIN_JSON_NAME.

        A region label is valid when it is a 3-character string whose integer
        value lies between 1 and the largest ``Id`` in the menu spreadsheet.

        If every region is valid, the entries are merged into ORIGIN_JSON_NAME
        (an existing key is overwritten) and the images are moved with
        ``Check.move_file()``. Otherwise the offending regions are printed as
        ``<image number>_<region index>`` and nothing is written.
        """
        menu = pd.read_excel(MENU_EXCEL_NAME, skiprows=1)
        max_label = max(menu['Id'])

        # Parse the file once; the same dict is reused for the merge below.
        with open(MODIFY_JSON_NAME, 'r') as f:
            json_modify = json.load(f)

        modifies = []
        for k, val in json_modify.items():
            regions = val["regions"]
            for l in range(len(regions)):
                try:
                    label = regions[str(l)]['region_attributes']['label']
                    valid = len(label) == 3 and 1 <= int(label) <= max_label
                except (KeyError, TypeError, ValueError):
                    # Missing, non-string or non-numeric label: report it
                    # for correction instead of aborting the whole check.
                    valid = False
                if not valid:
                    # Always separate image number and region index, so that
                    # e.g. image 0955 / region 10 cannot be misread.
                    modifies.append(k[4: 8] + '_' + str(l))

        if modifies:
            print("需要修正的图片序号为:\n", modifies)
            return

        print("所有图片均符合要求!")
        with open(ORIGIN_JSON_NAME, 'r') as f:
            json_origin = json.load(f)
        json_origin.update(json_modify)

        with open(ORIGIN_JSON_NAME, 'w') as f:
            f.write(json.dumps(json_origin))
        Check.move_file()
    
    @staticmethod
    def look_index(idx):
        """Print the image numbers in ORIGIN_JSON_NAME that carry label ``idx``.

        An image number (characters 4-7 of the key) is listed once per
        matching region, so it can appear more than once.
        """
        matches = []
        with open(ORIGIN_JSON_NAME, 'r') as source:
            annotations = json.load(source)
            for image_key, record in annotations.items():
                regions = record["regions"]
                matches.extend(
                    image_key[4: 8]
                    for position in range(len(regions))
                    if regions[str(position)]['region_attributes']['label'] == idx
                )

        print("包含标签值为{}的图片编号为:".format(idx))
        print(matches)
    

    class Operate(object): """Operations on the data set."""

    #######################################################################################
    # shuffler_data(): split the original data into train / val / test sets (8:1:1)
    # feature_map(): return two dicts: {label: name} and {label: price}
    # counts(name_dict): count the dish categories present and how often each one occurs
    # visualize(train_dict, val_dict, test_dict, name_dict, nv, nt): plot the statistics and
    #     return the selected dish labels
    # get_label(lists): keep only the given labels
    # delete_photo(file_path, file_name_list): delete the listed images
    # choose_photo(file_name_list): copy the images that are not listed
    # divide_train_and_val(start, end): write the entries numbered start..end to a separate JSON file
    # print_name(l, name_dict): print the Chinese names for the given labels
    #######################################################################################
    
    @staticmethod
    def shuffler_data():
        """Split ORIGIN_JSON_NAME and its images into train / val / test (8:1:1).

        Entries are numbered from 1 in file order. Within every run of ten,
        number 3 goes to ``./val/``, number 7 to ``./test/`` and the rest to
        ``./train/``. Each split's annotations are written to
        ``<split>/via_region_data.json``.

        Every image in MOVE_FILE_PATH is then copied into the split that holds
        its annotation, matched on the entry's ``filename`` field (or its key
        when that field is absent). Images without an annotation are skipped
        with a notice, because their split cannot be determined.
        """
        splits = {'./train/': {}, './val/': {}, './test/': {}}
        destination = {}  # image file name -> split folder
        with open(ORIGIN_JSON_NAME, 'r', encoding='utf-8') as f:
            json_dict = json.load(f)

        for cnt, (k, val) in enumerate(json_dict.items(), start=1):
            if cnt % 10 == 3:
                folder = './val/'
            elif cnt % 10 == 7:
                folder = './test/'
            else:
                folder = './train/'
            splits[folder][k] = val
            destination[val.get('filename', k)] = folder

        for folder, data in splits.items():
            with open(folder + 'via_region_data.json', 'w', encoding='utf-8') as f:
                f.write(json.dumps(data, ensure_ascii=False))

        # os.listdir order is unrelated to the JSON order, so the split is
        # looked up per file rather than recomputed from a second counter.
        for file in os.listdir(MOVE_FILE_PATH):
            folder = destination.get(file)
            if folder is None:
                print("Skip {}: no annotation entry found".format(file))
                continue
            shutil.copy(MOVE_FILE_PATH + file, folder)
    
    @staticmethod
    def feature_map():
        """Build label lookups from the menu spreadsheet MENU_EXCEL_NAME.

        The first spreadsheet row is skipped. ``Id`` values are converted to
        strings zero-padded to three characters and used as keys. The labels,
        names and prices are printed before returning.

        :return: ``(names_by_label, prices_by_label)``
        """
        menu = pd.read_excel(MENU_EXCEL_NAME, skiprows=1)
        labels = [str(raw).zfill(3) for raw in list(menu['Id'])]  # label values
        names = list(menu['Name'])  # dish names
        prices = list(menu['Price'])  # prices

        print("总标签值:\n", labels)
        print("总菜品名:\n", names)
        print("价格:\n", prices)
        print("-" * 200)

        na_dict = dict(zip(labels, names))  # dish name for each label
        pri_map = dict(zip(labels, prices))  # dish price for each label
        return na_dict, pri_map
    
    @staticmethod
    def counts(name_dict):
        """Count how often each label occurs in the train, val and test splits.

        Reads ``./train``, ``./val`` and ``./test`` ``via_region_data.json``
        as UTF-8, the encoding shuffler_data() writes them in, and prints the
        statistics of each split.

        :param name_dict: label -> dish name mapping from feature_map()
        :return: ``(train_counts, val_counts, test_counts, names_val, names_test)``;
            each ``*_counts`` is a list of ``(label, occurrences)`` pairs
            sorted by ascending occurrences, and ``names_*`` lists the dish
            names present in that split.
        :raises KeyError: if a label found in a split is missing from ``name_dict``
        """

        def load_labels(json_path):
            # Collect the label of every region of one split, in file order.
            with open(json_path, 'r', encoding='utf-8') as f:
                json_dict = json.load(f, object_pairs_hook=OrderedDict)
            found = []
            for record in json_dict.values():
                regions = record['regions']
                for l in range(len(regions)):
                    found.append(regions[str(l)]['region_attributes']['label'])
            return found

        def tally(found):
            # Occurrences per label, keyed in order of first appearance.
            occurrences = {}
            for label in found:
                occurrences[label] = occurrences.get(label, 0) + 1
            return occurrences

        def ascending(occurrences):
            return sorted(occurrences.items(), key=lambda v: v[1], reverse=False)

        total_train = load_labels('./train/via_region_data.json')
        total_val = load_labels('./val/via_region_data.json')
        total_test = load_labels('./test/via_region_data.json')

        value_cut_train = tally(total_train)
        value_cut_val = tally(total_val)
        value_cut_test = tally(total_test)

        num_per_food_train = ascending(value_cut_train)
        num_per_food_val = ascending(value_cut_val)
        num_per_food_test = ascending(value_cut_test)

        print('num_per_food_train:\n', num_per_food_train)  # occurrences per dish, training set
        print('num_per_food_val:\n', num_per_food_val)  # occurrences per dish, validation set
        print('num_per_food_test:\n', num_per_food_test)  # occurrences per dish, test set
        print('-' * 100)

        exist_labels_train = list(value_cut_train)
        exist_labels_val = list(value_cut_val)
        exist_labels_test = list(value_cut_test)

        print("exist_labels_train:\n", exist_labels_train)
        print("exist_labels_val:\n", exist_labels_val)
        print("exist_labels_test:\n", exist_labels_test)
        print('-' * 100)

        # Only the training list gets the extra background entry.
        names_train = ['背景'] + [name_dict[label] for label in exist_labels_train]
        print("训练数据集中已有的菜品种类(含背景):\n", names_train)
        print("训练数据集中总的菜品数量为: ", len(exist_labels_train))
        print('-' * 100)

        names_val = [name_dict[label] for label in exist_labels_val]
        print("验证数据集中已有的菜品种类:\n", names_val)
        print("验证数据集中总的菜品数量为: ", len(exist_labels_val))

        names_test = [name_dict[label] for label in exist_labels_test]
        print("测试数据集中已有的菜品种类:\n", names_test)
        print("测试数据集中总的菜品数量为: ", len(exist_labels_test))

        return num_per_food_train, num_per_food_val, num_per_food_test, names_val, names_test
    
    @staticmethod
    def visualize(train_dict, val_dict, test_dict, name_dict, nv, nt):
        """Plot the per-dish counts of each split and select the frequent labels.

        A training label is selected when its count is at least MORE_THAN_NUM.
        The selection and any selected dish missing from ``nv`` / ``nt`` are
        printed. One horizontal bar chart per split is saved under
        ``./statistics_graph/`` and shown.

        :param train_dict: ``(label, count)`` pairs of the training split, from counts()
        :param val_dict: ``(label, count)`` pairs of the validation split
        :param test_dict: ``(label, count)`` pairs of the test split
        :param name_dict: label -> dish name mapping from feature_map()
        :param nv: dish names present in the validation split
        :param nt: dish names present in the test split
        :return: list of the selected training labels
        """

        def draw_bar_chart(bar_names, bar_sizes, caption):
            # One dated horizontal bar chart, saved to disk and then displayed.
            plt.figure(figsize=(12, 12))
            plt.barh(bar_names, bar_sizes, color='steelblue', alpha=0.8)
            plt.yticks(fontsize=5)
            plt.title(str(datetime.date.today()) + caption)
            plt.ylabel(u'菜品种类')
            plt.xlabel(u'菜品数量')
            plt.tight_layout()
            for row, size in enumerate(bar_sizes):
                plt.text(size + 0.3, row - 0.3, '%s' % size, fontsize=8)
            plt.savefig('./statistics_graph/' + str(datetime.date.today()) + caption + '.png')
            plt.show()

        # train
        train_names, train_sizes, labels, names = [], [], [], ['背景']
        for pair in train_dict:
            if pair[1] >= MORE_THAN_NUM:
                labels.append(pair[0])
                names.append(name_dict[pair[0]])
            train_names.append(name_dict[pair[0]])
            train_sizes.append(pair[1])
        print('训练数据集中样本数超过' + str(MORE_THAN_NUM) + '的标签值有{}种,即\n{}'.format(len(labels), labels))
        print("对应的中文标签为:", names)

        # Selected dishes (background excluded) absent from the other splits.
        absent_val = [n for n in names if n != '背景' and n not in nv]
        absent_test = [n for n in names if n != '背景' and n not in nt]
        if absent_val:
            print("验证集中缺少目标菜品有:", absent_val)
        else:
            print("验证集中覆盖了所有的目标")
        if absent_test:
            print("测试集中缺少目标菜品有:", absent_test)
        else:
            print("测试集中覆盖了所有的目标")

        draw_bar_chart(train_names, train_sizes, u"训练数据菜品数量分布图")

        # val
        val_names, val_sizes = [], []
        for pair in val_dict:
            val_names.append(name_dict[pair[0]])
            val_sizes.append(pair[1])
        draw_bar_chart(val_names, val_sizes, u"验证数据菜品数量分布图")

        # test
        test_names, test_sizes = [], []
        for pair in test_dict:
            test_names.append(name_dict[pair[0]])
            test_sizes.append(pair[1])
        draw_bar_chart(test_names, test_sizes, u"测试数据菜品数量分布图")

        return labels
    
    @staticmethod
    def filter_data(json_path, filter_list, types):
        """Keep only the regions whose label is in ``filter_list``; rewrite the file.

        Images left without any region are dropped from the file and their
        keys are returned. Surviving regions are re-keyed "0".."n-1", because
        the rest of this file looks regions up with ``str(i)`` for
        ``i in range(len(regions))``.

        :param json_path: annotation file to filter in place
        :param filter_list: labels to keep
        :param types: split name, used only in the printed summary
        :return: keys of the images that no longer hold a wanted region
        """
        with open(json_path, 'r', encoding='utf-8') as f:
            json_dict = json.load(f)

        choice_dict = {}
        idx = []
        for k, val in json_dict.items():
            regions = val["regions"]
            kept = [
                regions[str(l)]
                for l in range(len(regions))
                if regions[str(l)]["region_attributes"]["label"] in filter_list
            ]
            if kept:
                # Renumber from "0" so no gaps are left by the removed regions.
                val["regions"] = {str(i): region for i, region in enumerate(kept)}
                choice_dict[k] = val
            else:
                idx.append(k)

        print("共有{}数据{}张".format(types, (len(json_dict) - len(idx))))
        print("需要删除的图片序号:\n", idx)
        print("-" * 100)

        with open(json_path, 'w', encoding='utf-8') as f:
            f.write(json.dumps(choice_dict))

        return idx
    
    @staticmethod
    def get_label(lists):
        """
        提取特定的标签值
        :param lists 从visualize()处获得,保存筛选出的菜品labels
        :return idx_ 返回待删除的图片文件名
        """
        idx_trian = Operate.filter_data(json_path='./train/via_region_data.json', filter_list=lists, types="训练")
        idx_val = Operate.filter_data(json_path='./val/via_region_data.json', filter_list=lists, types="验证")
        idx_test = Operate.filter_data(json_path='./test/via_region_data.json', filter_list=lists, types="测试")
    
        return idx_trian, idx_val, idx_test
    
    @staticmethod
    def delete_photo(file_path, file_name_list):
        """Delete the listed images from ``file_path``.

        A file is deleted when its first eight characters (``IMG_nnnn``) or its
        full name appears in ``file_name_list``. The full-name match covers the
        ``IMG_nnnn.jpg`` keys that filter_data() returns.

        :param file_path: folder to clean, with or without a trailing separator
        :param file_name_list: names to delete, as returned by get_label()
        """
        for f in os.listdir(file_path):
            if f[0: 8] in file_name_list or f in file_name_list:
                # os.path.join: './train' + f would produce './trainIMG_...'.
                os.remove(os.path.join(file_path, f))
                print("Success to delete picture {} !".format(f))
    
    @staticmethod
    def choose_photo(file_name_list):
        """Copy the images that are not listed from MOVE_FILE_PATH to OUTPUT_FILE_PATH.

        Candidates are the entries of ``./train``. An entry is skipped when its
        first eight characters appear in ``file_name_list``.

        :param file_name_list: names to leave out, as returned by get_label()
        """
        for name in os.listdir('./train'):
            prefix = name[0: 8]
            if prefix in file_name_list:
                continue
            shutil.copyfile(MOVE_FILE_PATH + name, OUTPUT_FILE_PATH + name)
            print("Success to copy picture {} to output_file directory!".format(prefix))
    
    @staticmethod
    def divide_train_and_val(start, end):
        """Write the entries numbered ``start``..``end`` of OUTPUT_JSON_NAME to a new file.

        Entries are numbered from 1 in file order; both bounds are inclusive.
        The selection is written to TRAIN_VAL_DIVIDE_JSON_NAME as UTF-8.

        :param start: first entry number to keep
        :param end: last entry number to keep
        """
        with open(OUTPUT_JSON_NAME, 'r', encoding='utf-8') as source:
            annotations = json.load(source)

        via = {
            key: entry
            for position, (key, entry) in enumerate(annotations.items(), start=1)
            if start <= position <= end
        }

        with open(TRAIN_VAL_DIVIDE_JSON_NAME, 'w', encoding='utf-8') as target:
            target.write(json.dumps(via, ensure_ascii=False))
    
    @staticmethod
    def print_name(l, name_dict):
        """Print the names that ``name_dict`` holds for the labels in ``l``.

        :param l: sequence of labels
        :param name_dict: label -> name mapping
        :raises KeyError: if a label is missing from ``name_dict``
        """
        resolved = [name_dict[code] for code in l]
        print("**30**:\n", resolved)
    
    @staticmethod
    def turn_chinese(write_path):
        """Replace every label in OUTPUT_JSON_NAME with its dish name.

        The mapping comes from MENU_EXCEL_NAME. The ``Id`` / ``Name`` columns
        used by feature_map() and check_label() are preferred; the ``序号`` /
        ``菜名`` headers are still accepted when those are absent.

        :param write_path: destination file, written as UTF-8
        :raises KeyError: if a label has no entry in the menu
        """
        data = pd.read_excel(MENU_EXCEL_NAME, skiprows=1)
        id_column = 'Id' if 'Id' in data.columns else '序号'
        name_column = 'Name' if 'Name' in data.columns else '菜名'
        labels = [str(label).zfill(3) for label in list(data[id_column])]
        names = list(data[name_column])
        name_dict = dict(zip(labels, names))

        # Same encoding as divide_train_and_val() uses for this file.
        with open(OUTPUT_JSON_NAME, 'r', encoding='utf-8') as f:
            json_dict = json.load(f)

        for val in json_dict.values():
            regions = val['regions']
            for l in range(len(regions)):
                attributes = regions[str(l)]['region_attributes']
                attributes['label'] = name_dict[attributes['label']]

        with open(write_path, 'w', encoding="utf-8") as f:
            print(json_dict.keys())
            f.write(json.dumps(json_dict, ensure_ascii=False))
    

    def main():
        """Entry point: uncomment the stage to run, then execute the script.

        With both stages commented out it only prints a completion message.
        """
        # Stage 1 - check and merge one new batch of annotations:
        # modify = Modify()
        # modify.rename_key()
        # modify.rename_file_name()
        # Check().check_label()

        # Stage 2 - split, count, plot and filter the merged data set:
        # operate = Operate()
        # operate.shuffler_data()
        # label_name, _ = operate.feature_map()
        # number_per_food_train, number_per_food_val, number_per_food_test, nv, nt = operate.counts(label_name)
        # choose_label = \
        #     operate.visualize(number_per_food_train, number_per_food_val, number_per_food_test, label_name, nv, nt)
        # train_list, val_list, test_list = operate.get_label(choose_label)
        # operate.delete_photo('./train', train_list)
        # operate.delete_photo('./val', val_list)
        # operate.delete_photo('./test', test_list)

        print('Success Running!')
    
    2021-02-08 00:20:55
    赞同 展开评论 打赏
问答分类:
问答标签:
问答地址:
问答排行榜
最热
最新

相关电子书

更多
低代码开发师(初级)实战教程 立即下载
冬季实战营第三期:MySQL数据库进阶实战 立即下载
阿里巴巴DevOps 最佳实践手册 立即下载