梅尔滤波器组:是梅尔刻度上的一组三角滤波器组的归一化形式。实现步骤如下:
(1)获取梅尔刻度的最小值和最大值 (m_min, m_max):将最低频率 f_min 和最高频率 f_max 从 Hz 转换到梅尔刻度。
(2)获取梅尔刻度序列。在 [m_min, m_max] 区间内等间隔取点,序列长度为:滤波器个数+2(每个三角滤波器需要左、中、右三个点,相邻滤波器共用这些点),随后将序列转换回 Hz。
(3)构建三角滤波器组。根据全量频率区间和梅尔刻度序列,构建三角滤波器组。
(4)滤波器组归一化。将每个滤波器乘以 2 / (右端点频率 - 左端点频率),使每个三角形的面积为 1(即 slaney 归一化)。
梅尔刻度
梅尔刻度(Mel scale)是一种基于人类听觉感知特性的频率尺度,用于表示音频信号的频率。
在处理音频数据时,常常需要将频率从赫兹(Hz)转换为梅尔刻度,或者反过来操作。
import numpy as np
import matplotlib.pyplot as plt


# Hz -> mel scale
def hz_to_mel(f):
    """Convert a frequency ``f`` in Hz to the mel scale.

    Uses ``mel = 2595 * log10(1 + f / 700)``. Because the computation goes
    through ``np.log10``, ``f`` may be a scalar or a NumPy array.
    """
    ratio = f / 700.0
    return 2595.0 * np.log10(1.0 + ratio)


# mel scale -> Hz
def mel_to_hz(m):
    """Convert a mel value ``m`` back to Hz (inverse of ``hz_to_mel``).

    Uses ``f = 700 * (10 ** (m / 2595) - 1)``.
    """
    exponent = m / 2595.0
    return 700.0 * (10.0 ** exponent - 1.0)


# Relationship between Hz and the mel scale
sample_rate = 16000

# Frequencies in Hz, from 0 up to half the sample rate
f = np.linspace(0, sample_rate // 2, num=16000)

# The same frequencies expressed on the mel scale
mel = hz_to_mel(f)

# Plot mel as a function of Hz
plt.figure(figsize=(5, 3))
plt.plot(f, mel)
plt.xlabel('Hz')
plt.ylabel('Mel')
plt.grid(True)
plt.show()
梅尔滤波器组
# Basic parameters
n_fft = 512
n_mels = 64
f_min = 0.0
f_max = sample_rate / 2.0
# Number of frequency bins for an FFT of size n_fft
n_freqs = n_fft // 2 + 1

# Every frequency-bin position, evenly spaced from 0 to sample_rate // 2
all_freqs = np.linspace(0, sample_rate // 2, num=n_freqs)

# Lowest and highest frequency expressed on the mel scale
m_min, m_max = hz_to_mel(f_min), hz_to_mel(f_max)

# n_mels + 2 evenly spaced points on the mel scale
m_pts = np.linspace(m_min, m_max, num=n_mels + 2)

# The same points converted back to Hz
f_pts = mel_to_hz(m_pts)
# Plot the mel points (left) next to the same points in Hz (right)
_, axs = plt.subplots(1, 2)

axs[0].plot(m_pts)
axs[1].plot(f_pts)

axs[0].grid(True)
axs[1].grid(True)

plt.tight_layout()
plt.show()
# Hand-written triangular mel filter bank
def create_triangular_fbank(all_freqs, m_pts, norm=None):
    """Build a bank of triangular filters evaluated at ``all_freqs``.

    Args:
        all_freqs: 1-D sequence of frequency-bin positions.
        m_pts: 1-D sequence of band-edge points, in the same units as
            ``all_freqs`` (the call below passes points in Hz). Every
            consecutive triple ``(left, center, right)`` defines one
            triangle, so ``len(m_pts) - 2`` filters are produced.
        norm: ``None`` leaves the triangles unscaled (height 1 at the center
            point). ``"slaney"`` multiplies each triangle by
            ``2 / (right - left)`` so that its area is 1. Any other value
            is treated like ``None``.

    Returns:
        ``numpy.ndarray`` of shape ``(len(all_freqs), len(m_pts) - 2)``;
        column ``i`` holds the response of filter ``i`` at every position
        in ``all_freqs``.
    """
    all_freqs = np.asarray(all_freqs)
    m_pts = np.asarray(m_pts)

    # Take the filter count from the points that were passed in, not from a
    # module-level variable, so the function works for any number of bands.
    n_filters = len(m_pts) - 2
    filter_fbank = np.zeros((len(all_freqs), n_filters))

    for i in range(n_filters):
        left, center, right = m_pts[i], m_pts[i + 1], m_pts[i + 2]

        # Rising edge: 0 at `left`, approaching 1 towards `center`
        up = (all_freqs >= left) & (all_freqs < center)
        filter_fbank[up, i] = (all_freqs[up] - left) / (center - left)

        # Falling edge: 1 at `center`, back to 0 at `right`
        down = (all_freqs >= center) & (all_freqs <= right)
        filter_fbank[down, i] = (right - all_freqs[down]) / (right - center)

        if norm == "slaney":
            # Triangle area = base * height / 2, so height = 2 * area / base;
            # with area = 1 the scale factor is 2 / (right - left).
            filter_fbank[:, i] *= 2.0 / (right - left)

    return filter_fbank


# Create a triangular mel filter bank
triangular_fbank = create_triangular_fbank(all_freqs, f_pts)
print(triangular_fbank.shape)
# Build the same mel filter bank with torchaudio for reference
import torchaudio.functional as F

mel_fbanks = F.melscale_fbanks(
    n_freqs=n_freqs,
    f_min=f_min,
    f_max=f_max,
    n_mels=n_mels,
    sample_rate=sample_rate,
)
mel_fbanks.shape
# Compare the two filter banks: mean squared difference between the
# hand-written bank and the torchaudio one
mse = np.mean(np.square(triangular_fbank - mel_fbanks.numpy()))
mse
1.703962094077431e-13
# Show both filter banks as images, side by side:
# hand-written on the left, torchaudio on the right
_, axs = plt.subplots(nrows=1, ncols=2)
axs[0].imshow(triangular_fbank, aspect='auto')
axs[1].imshow(mel_fbanks, aspect='auto')
plt.show()
滤波器组归一化
# Manual normalization: scale filter i by 2 / (right edge - left edge),
# where the edges are f_pts[i + 2] and f_pts[i]
enorm = 2.0 / (f_pts[2:n_mels + 2] - f_pts[:n_mels])
# enorm has one entry per filter; add a leading axis so it broadcasts
# across the rows of triangular_fbank
triangular_fbank_ser = triangular_fbank * enorm[np.newaxis, :]

# The same normalization as done by torchaudio
mel_fbanks_ser = F.melscale_fbanks(
    n_freqs=n_freqs,
    f_min=f_min,
    f_max=f_max,
    n_mels=n_mels,
    sample_rate=sample_rate,
    norm='slaney',
)

# Mean squared difference between the two normalized banks
mse = np.mean(np.square(triangular_fbank_ser - mel_fbanks_ser.numpy()))
mse
6.676824653494738e-18
# Show both normalized filter banks as images, side by side:
# hand-written on the left, torchaudio on the right
_, axs = plt.subplots(nrows=1, ncols=2)
axs[0].imshow(triangular_fbank_ser, aspect='auto')
axs[1].imshow(mel_fbanks_ser, aspect='auto')
plt.show()
# Draw the normalized filter bank as a set of 3-D line plots
filter_fbank = triangular_fbank_ser.T

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# One line per column of filter_fbank: x is the constant column index,
# y is the row index, z is the value stored at (row, column)
for i, z in enumerate(filter_fbank.T):
    x = np.full(len(z), i)
    y = np.arange(len(z))
    # Draw the line
    ax.plot(x, y, z)

ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')
ax.view_init(elev=50, azim=-30)
plt.show()