创建TensorFlow数据集
下一步就是设置函数读入数据时所需的必要信息。我没有写这段代码,只是把它改编自FreeMusicArchive。这一部分很可能在您自己的项目中发生变化,这取决于您使用的数据集。
# Function to load metadata.
# Adapted from https://github.com/mdeff/fma/blob/master/utils.py
def metadata_load(filepath):
    """Load one of the FMA metadata CSVs, dispatching on the file name.

    Parameters
    ----------
    filepath : str
        Path to one of the FMA metadata files (features.csv,
        echonest.csv, genres.csv or tracks.csv).

    Returns
    -------
    pandas.DataFrame
        The parsed metadata. For tracks.csv the tag/genre columns are
        evaluated back into Python lists, the date columns are parsed,
        and the subset/genre columns are converted to categoricals.
    """
    filename = os.path.basename(filepath)

    if 'features' in filename:
        return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])

    if 'echonest' in filename:
        return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])

    if 'genres' in filename:
        return pd.read_csv(filepath, index_col=0)

    if 'tracks' in filename:
        tracks = pd.read_csv(filepath, index_col=0, header=[0, 1])

        # These columns hold Python-literal strings (lists of tags /
        # genre ids); turn them back into real Python objects.
        COLUMNS = [('track', 'tags'), ('album', 'tags'), ('artist', 'tags'),
                   ('track', 'genres'), ('track', 'genres_all')]
        for column in COLUMNS:
            tracks[column] = tracks[column].map(ast.literal_eval)

        COLUMNS = [('track', 'date_created'), ('track', 'date_recorded'),
                   ('album', 'date_created'), ('album', 'date_released'),
                   ('artist', 'date_created'), ('artist', 'active_year_begin'),
                   ('artist', 'active_year_end')]
        for column in COLUMNS:
            tracks[column] = pd.to_datetime(tracks[column])

        # BUG FIX: the original wrapped this in try/except ValueError but
        # ran the *exact same statement* in both branches (the
        # `categories`/`ordered` keyword form of .astype was removed in
        # pandas 0.25, so only the CategoricalDtype form remains valid).
        # The redundant exception handler is dropped.  The ordered
        # categorical is what makes `subset <= 'small'`-style comparisons
        # work below.
        SUBSETS = ('small', 'medium', 'large')
        tracks['set', 'subset'] = tracks['set', 'subset'].astype(
            pd.CategoricalDtype(categories=SUBSETS, ordered=True))

        COLUMNS = [('track', 'genre_top'), ('track', 'license'),
                   ('album', 'type'), ('album', 'information'),
                   ('artist', 'bio')]
        for column in COLUMNS:
            tracks[column] = tracks[column].astype('category')

        return tracks


# Function to get genre information for each track ID.
def track_genre_information(GENRE_PATH, TRACKS_PATH, subset):
    """Build a track_id -> genre lookup table for one FMA subset.

    Parameters
    ----------
    GENRE_PATH : str
        Path to the csv with the genre metadata.
    TRACKS_PATH : str
        Path to the csv with the track metadata.
    subset : str
        The subset of the data desired ('small', 'medium' or 'large').

    Returns
    -------
    pandas.DataFrame
        Columns: 'track_id', 'genre' (top-level genre name) and
        'genre_nb' (integer label produced by a LabelEncoder).
    """
    # Get the genre information (loaded for parity with the original
    # pipeline; only the track metadata feeds the returned frame).
    genres = pd.read_csv(GENRE_PATH)
    # Load metadata on all the tracks.
    tracks = metadata_load(TRACKS_PATH)
    # Focus on the tracks of the requested subset; the ordered
    # categorical makes `<= subset` select that subset and every
    # smaller one.
    subset_tracks = tracks[tracks['set', 'subset'] <= subset]

    # Extract track ID and top genre for each track.
    subset_tracks_genre = np.array(
        [np.array(subset_tracks.index),
         np.array(subset_tracks['track', 'genre_top'])]).T

    # Combine the information in a dataframe.
    tracks_genre_df = pd.DataFrame({'track_id': subset_tracks_genre[:, 0],
                                    'genre': subset_tracks_genre[:, 1]})

    # Label classes with numbers.
    encoder = LabelEncoder()
    tracks_genre_df['genre_nb'] = encoder.fit_transform(tracks_genre_df.genre)

    return tracks_genre_df


# Get genre information for all tracks from the small subset.
GENRE_PATH = "fma_metadata/genres.csv"
TRACKS_PATH = "fma_metadata/tracks.csv"
subset = 'small'
small_tracks_genre = track_genre_information(GENRE_PATH, TRACKS_PATH, subset)
然后我们需要函数来创建一个TensorFlow数据集。其思想是在文件名列表上循环,在管道中应用一系列操作,这些操作返回批处理数据集,其中包含一个特征张量和一个标签张量。我们混合使用TensorFlow内置函数和Python函数(通过 tf.py_function 包装,它对于在数据管道中使用Python函数非常有用)。这里我只包含从原始音频数据创建数据集的函数,但过程与以频谱图作为特征创建数据集的过程极为相似。
# Check the number of songs stored in GCS and derive the shard size.
nb_songs = len(tf.io.gfile.glob(GCS_PATTERN))
shard_size = math.ceil(1.0 * nb_songs / SHARDS)
print("Pattern matches {} songs which will be rewritten as {} .tfrec files containing {} songs each.".format(nb_songs, SHARDS, shard_size))


# --- Functions to create the dataset from raw audio ---

# Look up the label associated with a file path.
def get_label(file_path, genre_df=small_tracks_genre):
    """Return the integer genre label for the track at `file_path`.

    Runs eagerly (invoked through tf.py_function), so `.numpy()` is
    available on the path tensor.
    """
    decoded = file_path.numpy().decode("utf-8")
    # Paths end in .../<zero-padded id>.<ext>; strip the directories,
    # the extension and the leading zeros to recover the track id.
    track_id = int(decoded.split('/')[-1].split('.')[0].lstrip('0'))
    matches = genre_df.loc[genre_df.track_id == track_id, 'genre_nb']
    # NOTE(review): tf.constant inherits the pandas integer dtype here
    # (typically int64) — confirm this matches the tf.int32 declared in
    # parser() below.
    return tf.constant([matches.values[0]])


# Extract the desired features from a file path.
def get_audio(file_path, window_size=window_size):
    """Decode a wav to mono and keep only the first `window_size` samples."""
    raw = tf.io.read_file(file_path)
    waveform = tf.audio.decode_wav(raw, desired_channels=1).audio
    return waveform[:window_size, :]


# Process one path into a (features, label) pair.
def process_path(file_path, window_size=window_size):
    """Eager helper: look up the label, then extract the audio window."""
    genre_label = get_label(file_path)
    audio_window = get_audio(file_path, window_size)
    return audio_window, genre_label


# Parser: wrap the eager processing function and pin the output shapes.
def parser(file_path, window_size=window_size):
    """Graph-compatible wrapper around process_path."""
    audio, label = tf.py_function(process_path, [file_path],
                                  (tf.float32, tf.int32))
    audio.set_shape((window_size, 1))
    label.set_shape((1,))
    return audio, label


# list_files also shuffles the dataset of file names.
filenames = tf.data.Dataset.list_files(GCS_PATTERN, seed=35155)
dataset_1d = filenames.map(parser, num_parallel_calls=AUTO).batch(shard_size)
在GCS上使用TFRecord格式
现在我们有了数据集,我们使用TFRecord格式将其存储在GCS上。这是GPU和TPU推荐使用的格式,因为并行化带来了快速的I/O。其主要思想是 tf.train.Feature 和 tf.train.Example:我们把数据集写成这些 Example,存储在GCS上。这部分代码应该只需要对其他项目进行最少的编辑,除了更改特征类型之外。如果数据已经以TFRecord格式上传过一次,则可以跳过此部分。本节中的大部分代码都改编自TensorFlow官方文档以及本教程中有关音频管道的内容。
# Write to TFRecord.
# TFRecord greatly speeds up the I/O process, previously a bottleneck.
# Functions to create various features, adapted from
# https://codelabs.developers.google.com/codelabs/keras-flowers-data/#4
# and https://www.tensorflow.org/tutorials/load_data/tfrecord
def _bytestring_feature(list_of_bytestrings):
    """Wrap a list of byte strings as a tf.train.Feature."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=list_of_bytestrings))


def _int_feature(list_of_ints):
    """Wrap a list of ints as an int64 tf.train.Feature."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list_of_ints))


def _float_feature(list_of_floats):
    """Wrap a list of floats as a float32 tf.train.Feature."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=list_of_floats))


# Writer function.
def to_tfrecord(tfrec_filewriter, song, label):
    """Serialize one (song, label) pair into a tf.train.Example.

    Parameters
    ----------
    tfrec_filewriter : tf.io.TFRecordWriter
        Kept for signature compatibility with the original tutorial;
        not used inside this function (the caller does the writing).
    song : numpy.ndarray
        The raw audio window for one track.
    label : numpy array of shape (1,)
        The integer class label.
    """
    one_hot_class = np.eye(N_CLASSES)[label][0]
    feature = {
        "song": _float_feature(song.flatten().tolist()),          # one song in the list
        "class": _int_feature([label]),                           # one class in the list
        "one_hot_class": _float_feature(one_hot_class.tolist()),  # variable-length list of floats, n=len(CLASSES)
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))


def write_tfrecord(dataset, GCS_OUTPUT):
    """Write a batched (song, label) dataset to sharded .tfrec files.

    Each batch of `dataset` becomes one shard; the shard index and the
    number of records are encoded in the file name.
    """
    print("Writing TFRecords")
    for shard, (song, label) in enumerate(dataset):
        # Hoisted out of the inner loop: .numpy() materializes the whole
        # batch, so call it once per shard instead of once per record.
        songs_np = song.numpy()
        labels_np = label.numpy()
        # Batch size used as shard size here.
        shard_size = songs_np.shape[0]
        # Good practice to have the number of records in the file name.
        filename = GCS_OUTPUT + "{:02d}-{}.tfrec".format(shard, shard_size)
        with tf.io.TFRecordWriter(filename) as out_file:
            for i in range(shard_size):
                example = to_tfrecord(out_file, songs_np[i], labels_np[i])
                out_file.write(example.SerializeToString())
            # BUG FIX: the original ended this print statement with a stray
            # trailing 's' — a syntax error — which has been removed.
            print("Wrote file {} containing {} records".format(filename, shard_size))
一旦这些记录被存储,我们需要其他函数来读取它们。依次处理每个Example,从TFRecord中提取相关信息并重新构造 tf.data.Dataset。这看起来像是一个循环过程(创建一个 tf.data.Dataset → 作为TFRecord上传到GCS → 将TFRecord读回 tf.data.Dataset),但这实际上通过简化I/O过程提供了巨大的速度效率。如果I/O是瓶颈,使用GPU或TPU是没有帮助的,而这种方法允许我们通过优化数据加载来充分利用它们在训练期间的速度增益。
# Function to parse an example and return the song feature and the one-hot class.
# Adapted from https://codelabs.developers.google.com/codelabs/keras-flowers-data/#4
# and https://www.tensorflow.org/tutorials/load_data/tfrecord
def read_tfrecord_1d(example):
    """Parse one serialized tf.train.Example into (song, one_hot_class).

    Returns
    -------
    song : tf.Tensor, float32, shape [window_size, 1]
    one_hot_class : tf.Tensor, float32, shape [N_CLASSES]
    """
    features = {
        "song": tf.io.FixedLenFeature([window_size], tf.float32),  # fixed-length float32 vector
        "class": tf.io.FixedLenFeature([1], tf.int64),             # single int64 label, shape [1]
        "one_hot_class": tf.io.VarLenFeature(tf.float32),          # variable length -> parsed as sparse
    }
    example = tf.io.parse_single_example(example, features)
    # FIX: the original assigned example['song'] to `song` and then
    # immediately overwrote it with this cast (a dead store), kept a stale
    # commented-out decode_wav line, computed an unused `label` local, and
    # carried comments copied from a bytestring-based pipeline that did not
    # match these float32/int64 features.  All of that is cleaned up; the
    # parsed tensors and the returned values are unchanged.
    song = tf.cast(example['song'], tf.float32)
    song = tf.reshape(song, [window_size, 1])
    one_hot_class = tf.sparse.to_dense(example['one_hot_class'])
    one_hot_class = tf.reshape(one_hot_class, [N_CLASSES])
    return song, one_hot_class


# Function to load the dataset from TFRecords.
def load_dataset_1d(filenames):
    """Build a tf.data.Dataset of (song, one_hot_class) from .tfrec files."""
    # Read from TFRecords. For optimal performance, read from multiple
    # TFRecord files at once and set experimental_deterministic=False
    # to allow order-altering optimizations.
    option_no_order = tf.data.Options()
    option_no_order.experimental_deterministic = False
    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
    dataset = dataset.with_options(option_no_order)
    dataset = dataset.map(read_tfrecord_1d, num_parallel_calls=AUTO)
    # Ignore potentially corrupted records.
    dataset = dataset.apply(tf.data.experimental.ignore_errors())
    return dataset