Audio Classification Task
The UrbanSound8K dataset is fairly small, which makes it an ideal starter exercise for getting into audio work. This article walks through an audio classification method built on a CRNN.
1. Dataset Overview
The dataset contains 8,732 sound files of urban sounds across 10 classes, each file at most 4 seconds long. Environmental sound classification is a growing research area, for example in studying the sounds behind urban noise pollution. Despite the variety of research on classifying environmental (and especially urban) sounds, there is no clear consensus on which taxonomy to use or whether it meets a given study's needs. By building an accurate audio classification model on this dataset, I hope to offer some ideas for, and improvements to, audio classification methods.
The dataset can be downloaded here:
https://www.kaggle.com/chrisfilo/urbansound8k
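Once the dataset is in place, a quick look at the metadata shows the columns the extraction script below relies on: column 0 is slice_file_name, column 5 is fold, and column 6 is classID. A minimal peek, assuming pandas is installed:

import pandas as pd

meta = pd.read_csv("UrbanSound8K/metadata/UrbanSound8K.csv")
print(meta[["slice_file_name", "fold", "classID"]].head())
print(meta["class"].value_counts())  # 10 classes, 8732 rows in total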
2. Feature Extraction
Taking a Fourier transform over an entire clip would discard the temporal information and would not reflect what the human ear actually hears, so we use mel spectrograms as the input features. The audio is loaded at a sampling rate of 16,000 Hz, at roughly 32 frames per second; each spectrogram stacks 128 consecutive frames, and each frame holds 128 mel features, so each input is a single 128×128 spectrogram.
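As a quick sanity check on those numbers (a sketch assuming librosa's default hop_length of 512, which the extraction code below does not override):

sr = 16000
hop_length = 512                              # librosa.feature.melspectrogram default
frames_per_second = sr / hop_length           # 31.25, i.e. roughly 32 frames per second
seconds_per_chunk = 128 / frames_per_second   # ~4.1 s, on the order of the <=4 s clips
print(frames_per_second, seconds_per_chunk)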
The extraction code is as follows:
import csv

import librosa
import numpy as np
import torch

# Class IDs to extract (all 10 classes here)
got_class = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# Print tensors in full so they can be dumped to text and eval()-ed back later
torch.set_printoptions(threshold=np.inf)
torch.set_printoptions(precision=20)

# Path to the metadata file
csv_path = "UrbanSound8K\\metadata\\UrbanSound8K.csv"

for t_f in range(1, 11):  # iterate over folds 1-10
    test_fold = str(t_f)
    tmp_in_data = []
    label = []
    with open(csv_path) as csvfile:
        csv_reader = csv.reader(csvfile)
        for row in csv_reader:
            # row[0] = file name, row[5] = fold, row[6] = class ID
            if row[5] == test_fold and int(row[6]) in got_class:
                # Path to the audio files of the current fold
                data_path = "UrbanSound8K\\audio\\fold" + test_fold + "\\"
                now_data = data_path + str(row[0])
                print(now_data)
                wav, sr = librosa.load(now_data, sr=16000)
                print("sampling rate: " + str(sr))
                # Frame the signal and compute the mel spectrogram
                mel = librosa.feature.melspectrogram(y=wav, sr=sr, n_mels=128)
                torch_data = torch.from_numpy(mel).t()  # (frames, n_mels)
                print(torch_data.size())
                # Cut into non-overlapping 128-frame chunks of shape (1, 128, 128)
                for i in range(len(torch_data) // 128):
                    tmp_in_data.append(
                        torch_data[i * 128:(i + 1) * 128].reshape(1, 128, 128))
                    label.append(row[6])
    for tmp in tmp_in_data:
        print(tmp.size())
    # Dump this fold's features and labels as text; the training script
    # reads them back with eval()
    with open("feature" + test_fold + ".txt", "w") as fd:
        fd.write(str(tmp_in_data))
    with open("label" + test_fold + ".txt", "w") as fd:
        fd.write(str(label))
print("feature extraction done")
3. Building the Network
Since the input feature is a single-channel spectrogram, which is essentially an image, we can naturally use common CNN architectures from computer vision as the backbone, such as LeNet, ResNet, or MobileNet. However, audio is inherently a sequence with temporal structure, so we also extract features with an RNN and concatenate them with the CNN features for the final classification.
The network structure is as follows:
# CNN branch: MobileNetV2
# RNN branch: bidirectional LSTM
import torch
import torch.nn as nn


def _make_divisible(ch, divisor=8, min_ch=None):
    # Round the channel count to the nearest multiple of `divisor`
    if min_ch is None:
        min_ch = divisor
    new_ch = max(min_ch, int(ch + divisor / 2) // divisor * divisor)
    # Make sure rounding down does not reduce channels by more than 10%
    if new_ch < 0.9 * ch:
        new_ch += divisor
    return new_ch


class ConvBNReLU(nn.Sequential):
    def __init__(self, in_channel, out_channel, kernel_size=3, stride=1, groups=1):
        padding = (kernel_size - 1) // 2
        super(ConvBNReLU, self).__init__(
            nn.Conv2d(in_channel, out_channel, kernel_size, stride, padding,
                      groups=groups, bias=False),
            nn.BatchNorm2d(out_channel),
            nn.ReLU6(inplace=True)
        )


class InvertedResidual(nn.Module):
    def __init__(self, in_channel, out_channel, stride, expand_ratio):
        super(InvertedResidual, self).__init__()
        hidden_channel = in_channel * expand_ratio
        self.use_shortcut = stride == 1 and in_channel == out_channel

        layers = []
        if expand_ratio != 1:
            # 1x1 pointwise conv
            layers.append(ConvBNReLU(in_channel, hidden_channel, kernel_size=1))
        layers.extend([
            # 3x3 depthwise conv
            ConvBNReLU(hidden_channel, hidden_channel, stride=stride, groups=hidden_channel),
            # 1x1 pointwise conv (linear)
            nn.Conv2d(hidden_channel, out_channel, kernel_size=1, bias=False),
            nn.BatchNorm2d(out_channel),
        ])
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        if self.use_shortcut:
            return x + self.conv(x)
        else:
            return self.conv(x)


class CRNN(nn.Module):
    def __init__(self, num_classes=10, alpha=1.0, round_nearest=8):
        # alpha: width-multiplier hyperparameter
        super(CRNN, self).__init__()
        block = InvertedResidual
        input_channel = _make_divisible(32 * alpha, round_nearest)
        last_channel = _make_divisible(1280 * alpha, round_nearest)

        inverted_residual_setting = [
            # t, c, n, s
            [1, 16, 1, 1],
            [6, 24, 2, 2],
            [6, 32, 3, 2],
            [6, 64, 4, 2],
            [6, 96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1],
        ]

        features = []
        # The input is a single-channel spectrogram, hence in_channels=1
        features.append(ConvBNReLU(1, input_channel, stride=2))
        for t, c, n, s in inverted_residual_setting:
            output_channel = _make_divisible(c * alpha, round_nearest)
            for i in range(n):
                stride = s if i == 0 else 1
                features.append(block(input_channel, output_channel, stride, expand_ratio=t))
                input_channel = output_channel
        features.append(ConvBNReLU(input_channel, last_channel, 1))
        self.features = nn.Sequential(*features)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        # 1280 CNN features + 2 * 640 bidirectional LSTM features = 2560
        self.classifier = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(2560, num_classes)
        )
        self.rnn = nn.LSTM(input_size=128, hidden_size=640, num_layers=2,
                           bidirectional=True, dropout=0.5)

        # Weight initialization
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.zeros_(m.bias)

    def forward(self, x):
        # RNN branch: (B, 1, 128, 128) -> (128, B, 128),
        # i.e. (seq_len, batch, input_size) for the LSTM
        x2 = x.squeeze(dim=1)
        x2 = x2.permute(2, 0, 1)
        out2, _ = self.rnn(x2)
        out2 = out2[-1, :, :]  # last time step: (B, 1280)
        # CNN branch
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)  # (B, 1280)
        # Concatenate both branches and classify
        xf = torch.cat((x, out2), 1)  # (B, 2560)
        x = self.classifier(xf)
        return x
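A quick shape check with a dummy batch confirms how the two branches combine; the sizes follow directly from the code above:

import torch

model = CRNN(num_classes=10)
dummy = torch.randn(4, 1, 128, 128)  # a batch of 4 spectrogram chunks
out = model(dummy)
print(out.shape)  # torch.Size([4, 10]): 1280 CNN + 1280 BiLSTM features -> 2560 -> 10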
4. Training
Training uses lr=0.001, weight_decay=0.0005, batch_size=64, and runs for 100 epochs. The data is randomly shuffled, and 5% of it is held out from training as a validation set. The code is shown below.
import copy
import random

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import MultiStepLR

from CRNNmodel import CRNN

input = []
label = []
tx_x = []    # epoch index
tx_y = []    # training accuracy per epoch
va_y = []    # validation accuracy per epoch
loss_y = []  # training loss per epoch
train_num = 2  # folds 1 .. train_num-1 are used
BS = 64        # batch size

# Load the dumped features back; str(tensor) becomes torch.tensor(...) for eval()
for t_f in range(1, train_num):
    test_fold = str(t_f)
    with open("feature" + test_fold + ".txt", "r") as fd:
        tmp_input = eval(fd.read().replace("tensor", "torch.tensor"))
    for tmp_tensor in tmp_input:
        input.append(tmp_tensor)
    print(tmp_input[0].size())
print(len(input))

# Load the labels; stripping the quotes turns "['3', ...]" into [3, ...]
for t_f in range(1, train_num):
    test_fold = str(t_f)
    with open("label" + test_fold + ".txt", "r") as fd:
        tmp_label = eval(fd.read().replace("'", ""))
    for tmp_int in tmp_label:
        label.append(tmp_int)

# Shuffle, then hold out 5% of the data as the validation set
cc = list(zip(input, label))
random.shuffle(cc)
input[:], label[:] = zip(*cc)
train_fg = len(input) * 19 // 20
eval_input = torch.stack(input[train_fg:])
eval_label = torch.tensor(label[train_fg:])
input = torch.stack(input[:train_fg])
label = torch.tensor(label[:train_fg])
print(input.size())
print(label.size())
print(eval_input.size())
print(eval_label.size())

# Normalize both splits with the training split's statistics
mean, std = torch.mean(input), torch.std(input)
input = (input - mean) / std
eval_input = (eval_input - mean) / std

train_set = torch.utils.data.TensorDataset(input, label)
dataloader_train = torch.utils.data.DataLoader(train_set, batch_size=BS, shuffle=True)
eval_set = torch.utils.data.TensorDataset(eval_input, eval_label)
dataloader_eval = torch.utils.data.DataLoader(eval_set, batch_size=BS, shuffle=True)

model = CRNN(num_classes=10).cuda()
ansnet = model
mark_suc = 0
loss_func = nn.CrossEntropyLoss()
learning_rate = 0.001
opt = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.0005)
scheduler = MultiStepLR(opt, milestones=[50, 80], gamma=0.1)
al_epoch = 100
al_sum = len(dataloader_train) * al_epoch
cnt = 0

for epoch in range(al_epoch):
    print("training...")
    model.train()
    running_corrects = 0
    running_loss = 0.0
    len_dataset = 0
    for step, (t_x, t_y) in enumerate(dataloader_train):
        cnt += 1
        t_x = t_x.cuda()
        t_y = t_y.cuda()
        opt.zero_grad()
        outputs = model(t_x)
        loss = loss_func(outputs, t_y)
        _, preds = torch.max(outputs, 1)
        loss.backward()
        opt.step()
        print(str(cnt) + "/" + str(al_sum))
        running_corrects += torch.sum(preds == t_y.data)
        running_loss += loss.item()
        len_dataset += len(t_y.data)
    scheduler.step()  # step once per epoch so the milestones [50, 80] are epochs
    tx_x.append(epoch)
    tx_y.append(running_corrects / len_dataset)
    loss_y.append(running_loss / len_dataset)
    print("acc: " + str(running_corrects / len_dataset))
    print("loss: " + str(running_loss / len_dataset))
    # Keep a copy of the best model seen so far
    if running_corrects / len_dataset > mark_suc:
        mark_suc = running_corrects / len_dataset
        ansnet = copy.deepcopy(model)

    # Validation
    model.eval()
    running_corrects = 0
    len_dataset = 0
    with torch.no_grad():
        for step, (t_x, t_y) in enumerate(dataloader_eval):
            t_x = t_x.cuda()
            t_y = t_y.cuda()
            out = model(t_x)
            _, preds = torch.max(out, 1)
            running_corrects += torch.sum(preds == t_y.data)
            len_dataset += len(t_y.data)
    va_y.append(running_corrects / len_dataset)
    print("val_acc: " + str(running_corrects / len_dataset))

# Move the accuracy curves to the CPU for plotting
ty_y = [tmp.cpu() for tmp in tx_y]
ty_yy = [tmp.cpu() for tmp in va_y]

torch.save(ansnet, "mm.pt")
print("best epoch acc: " + str(mark_suc))

plt.plot(tx_x, ty_y, label='train accuracy')
plt.plot(tx_x, ty_yy, label='val accuracy')
plt.xlabel('epoch')
plt.legend()
plt.show()
The training accuracy (acc) and validation accuracy (val) curves are shown in the figure below.
The best weights are saved to disk at the end. Training accuracy stabilizes at around 98%, and the best validation accuracy is 94.7%, which is a reasonably good result with plenty of tuning headroom: I only tuned the hyperparameters twice, and I suspect a few more rounds of tuning could lift validation accuracy by several points. Feel free to try it yourself.
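For completeness, here is a minimal inference sketch for the saved weights. dog_bark.wav is a hypothetical test clip, the normalization reuses the chunk's own statistics as a rough stand-in for the training-set statistics, and loading mm.pt requires the CRNN class to be importable:

import librosa
import torch

from CRNNmodel import CRNN  # needed so torch.load can unpickle the full model

model = torch.load("mm.pt").cuda()
model.eval()

wav, sr = librosa.load("dog_bark.wav", sr=16000)  # hypothetical test clip
mel = torch.from_numpy(
    librosa.feature.melspectrogram(y=wav, sr=sr, n_mels=128)).t()
if mel.size(0) < 128:  # pad short clips to 128 frames
    mel = torch.cat([mel, torch.zeros(128 - mel.size(0), 128)], dim=0)
chunk = mel[:128].reshape(1, 1, 128, 128)
# Rough stand-in for the training-set normalization statistics
chunk = (chunk - chunk.mean()) / chunk.std()
with torch.no_grad():
    pred = model(chunk.cuda()).argmax(1)
print(pred.item())  # predicted class ID (0-9)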
The GitHub repository is here:
CRNNforUrbanSound8K