
CRNN for Audio Classification (on the UrbanSound8K Dataset)

July 2, 2022

The Audio Classification Task


The UrbanSound8K dataset is fairly small, which makes it an ideal first exercise for getting into audio work. This post walks through a CRNN-based classification network for audio classification.

1. Dataset Overview


The dataset collects 8,732 urban sound files across 10 sound classes (each file is at most 4 seconds long). Environmental sound classification is a growing research area, for example in studying the sounds responsible for urban noise pollution. Across the many studies that classify environmental, and especially urban, sounds, there is no settled conclusion on which taxonomy to use or whether it meets a given study's needs. Building an accurate audio classification model on this task will hopefully spark more thinking about, and breakthroughs in, audio classification methods.

To download the dataset, use this link:
https://www.kaggle.com/chrisfilo/urbansound8k
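
For orientation, here is a minimal sketch (assuming the Kaggle archive layout above) that inspects the metadata CSV; the extraction code later indexes these columns by position (0: file name, 5: fold, 6: class id):

import pandas as pd

# peek at the per-clip metadata shipped with the dataset
meta = pd.read_csv("UrbanSound8K/metadata/UrbanSound8K.csv")
print(meta.columns.tolist())
# expected: ['slice_file_name', 'fsID', 'start', 'end', 'salience', 'fold', 'classID', 'class']
print(meta["class"].value_counts())  # clips per class (10 classes, 8732 clips total)
print(meta["fold"].value_counts())   # clips per predefined fold (1-10)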

2. Feature Extraction


Taking a Fourier transform over the whole clip would throw away the time-domain information, and a linear-frequency representation also does not match what the human ear actually perceives, so we use the mel spectrogram as the input feature. The audio is resampled to 16,000 Hz at 32 frames per second; each spectrogram stacks 128 consecutive frames, with 128 mel features per frame, so the input is a 128×128 spectrogram.
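
As a sanity check on those numbers (a minimal sketch; note that 32 frames per second implies a hop length of 16000 / 32 = 500 samples, which I pass explicitly since librosa's default of 512 would give 31.25 frames per second):

import numpy as np
import librosa

wav = np.zeros(4 * 16000, dtype=np.float32)  # a silent 4-second clip at 16 kHz
# hop_length = 16000 / 32 = 500 samples gives the 32 frames per second above
mel = librosa.feature.melspectrogram(y=wav, sr=16000, n_mels=128, hop_length=500)
print(mel.shape)  # (128, 129): 128 mel bins x ~129 frames, i.e. one 128-frame chunk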

The extraction code is as follows:

import csv
import numpy as np
import torch
import librosa

got_class = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]  # class ids to keep (all 10)
# print tensors in full so the str()/eval() round-trip below is lossless
torch.set_printoptions(threshold=np.inf)
torch.set_printoptions(precision=20)
# path to the metadata CSV
csv_path = "UrbanSound8K/metadata/UrbanSound8K.csv"

for t_f in range(1, 11):  # the dataset ships pre-split into folds 1-10
    test_fold = str(t_f)
    with open(csv_path) as csvfile:
        csv_reader = csv.reader(csvfile)
        tmp_in_data = []
        label = []
        for row in csv_reader:
            # column 0: file name, column 5: fold, column 6: class id
            # (the header row never matches a fold number, so it is skipped)
            if row[5] == test_fold and int(row[6]) in got_class:
                # path to this fold's audio
                now_data = "UrbanSound8K/audio/fold" + test_fold + "/" + row[0]
                print(now_data)
                wav, sr = librosa.load(now_data, sr=16000)
                print("sample rate: " + str(sr))
                # frame the signal and compute the mel spectrogram;
                # hop_length = 16000 / 32 = 500 gives the 32 frames/s described above
                tmp_www = librosa.feature.melspectrogram(y=wav, sr=sr, n_mels=128, hop_length=500)
                torch_data = torch.from_numpy(tmp_www).t()  # (time, n_mels)
                print(torch_data.size())
                # cut into non-overlapping 128-frame chunks
                # (the original slice torch_data[i:i+128] started at i instead of i*128)
                for i in range(len(torch_data) // 128):
                    tmp_in_data.append(torch_data[i * 128:(i + 1) * 128].reshape(1, 128, 128))
                    label.append(row[6])
        # dump the chunks and labels as printed text, one file per fold
        with open("feature" + test_fold + ".txt", "w") as fd:
            fd.write(str(tmp_in_data))
        with open("label" + test_fold + ".txt", "w") as fd:
            fd.write(str(label))
print("feature extraction finished")

3. Building the Network


Since the input feature is a single-channel spectrogram, i.e. effectively an image, we can of course use CNNs common in computer vision as the classification backbone, such as LeNet, ResNet, MobileNet, and so on. But audio is inherently a temporal sequence, so we also extract features with an RNN and concatenate them with the CNN features before the final classification.

The network structure: a MobileNetV2 CNN branch in parallel with a bidirectional-LSTM RNN branch, with their features concatenated for classification.

The code is as follows; the MobileNetV2 structure follows [1]:

# CNN branch: MobileNetV2
# RNN branch: bidirectional LSTM
import torch
import torch.nn as nn

def _make_divisible(ch, divisor=8, min_ch=None):
    # round the channel count to the nearest multiple of `divisor` (MobileNet convention)
    if min_ch is None:
        min_ch = divisor
    new_ch = max(min_ch, int(ch + divisor / 2) // divisor * divisor)
    # make sure rounding down does not go more than 10% below the original
    if new_ch < 0.9 * ch:
        new_ch += divisor
    return new_ch

class ConvBNReLU(nn.Sequential):
    def __init__(self, in_channel, out_channel, kernel_size=3, stride=1, groups=1):
        padding = (kernel_size - 1) // 2
        super(ConvBNReLU, self).__init__(
            nn.Conv2d(in_channel, out_channel, kernel_size, stride, padding, groups=groups, bias=False),
            nn.BatchNorm2d(out_channel),
            nn.ReLU6(inplace=True)
        )

class InvertedResidual(nn.Module):
    def __init__(self, in_channel, out_channel, stride, expand_ratio):
        super(InvertedResidual, self).__init__()
        hidden_channel = in_channel * expand_ratio
        self.use_shortcut = stride == 1 and in_channel == out_channel

        layers = []
        if expand_ratio != 1:
            # 1x1 pointwise conv
            layers.append(ConvBNReLU(in_channel, hidden_channel, kernel_size=1))
        layers.extend([
            # 3x3 depthwise conv
            ConvBNReLU(hidden_channel, hidden_channel, stride=stride, groups=hidden_channel),
            # 1x1 pointwise conv(linear)
            nn.Conv2d(hidden_channel, out_channel, kernel_size=1, bias=False),
            nn.BatchNorm2d(out_channel),
        ])

        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        if self.use_shortcut:
            return x + self.conv(x)
        else:
            return self.conv(x)

class CRNN(nn.Module):
    def __init__(self, num_classes=10, alpha=1.0, round_nearest=8):  # alpha: width multiplier
        super(CRNN, self).__init__()
        block = InvertedResidual
        input_channel = _make_divisible(32 * alpha, round_nearest)
        last_channel = _make_divisible(1280 * alpha, round_nearest)

        inverted_residual_setting = [
            # t: expansion factor, c: output channels, n: block repeats, s: first stride
            [1, 16, 1, 1],
            [6, 24, 2, 2],
            [6, 32, 3, 2],
            [6, 64, 4, 2],
            [6, 96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1],
        ]

        features = []
        features.append(ConvBNReLU(1, input_channel, stride=2))
        for t, c, n, s in inverted_residual_setting:
            output_channel = _make_divisible(c * alpha, round_nearest)
            for i in range(n):
                stride = s if i == 0 else 1
                features.append(block(input_channel, output_channel, stride, expand_ratio=t))
                input_channel = output_channel
        features.append(ConvBNReLU(input_channel, last_channel, 1))
        self.features = nn.Sequential(*features)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.classifier = nn.Sequential(
            nn.Dropout(0.3),
            # 2560 = 1280 (CNN branch) + 2 * 640 (bidirectional LSTM branch)
            nn.Linear(2560, num_classes)
        )
        self.rnn = nn.LSTM(input_size=128, hidden_size=640, num_layers=2, bidirectional=True, dropout=0.5)
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.zeros_(m.bias)

    def forward(self, x):
        # RNN branch: (B, 1, T, n_mels) -> (T, B, n_mels), so the LSTM steps over time
        # (the original permute(2, 0, 1) stepped over mel bins instead of frames;
        # both axes are 128 here, so either runs, but time is the intended sequence)
        x2 = x.squeeze(dim=1).permute(1, 0, 2)
        out2, _ = self.rnn(x2)
        out2 = out2[-1, :, :]  # output at the last time step
        # CNN branch
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        # concatenate both branches and classify
        xf = torch.cat((x, out2), 1)
        x = self.classifier(xf)
        return x
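
As a quick smoke test (a minimal sketch; the batch of two random inputs is arbitrary, and CRNN is the class defined above), the model maps a (B, 1, 128, 128) spectrogram batch to (B, 10) logits:

import torch

model = CRNN(num_classes=10)
dummy = torch.randn(2, 1, 128, 128)  # two fake 128x128 mel-spectrogram chunks
logits = model(dummy)
print(logits.size())  # torch.Size([2, 10])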

4. Training


Training uses the hyperparameters lr=0.001, weight_decay=0.0005, batch_size=64 for 100 epochs; the data is shuffled, and 5% is held out from training as a validation set. The code is shown below.

import copy
import random
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import MultiStepLR
from CRNNmodel import CRNN

input = []       # training feature chunks
label = []       # training labels
tx_x = []        # epoch indices for plotting
tx_y = []        # training accuracy per epoch
va_y = []        # validation accuracy per epoch
loss_y = []      # training loss per epoch
train_num = 2    # use folds 1 .. train_num-1 for training (here: fold 1 only)
BS = 64          # batch size
for t_f in range(1, train_num):
    test_fold = str(t_f)
    # read back the printed tensors dumped by the extraction script
    with open("feature" + test_fold + ".txt", "r") as fd:
        tmp_input = eval(fd.read().replace("tensor", "torch.tensor"))
        for tmp_tensor in tmp_input:
            input.append(tmp_tensor)
        print(tmp_input[0].size())
print(len(input))
for t_f in range(1, train_num):
    test_fold = str(t_f)
    with open("label" + test_fold + ".txt", "r") as fd:
        tmp_label = eval(fd.read().replace("'", ""))  # "['3', ...]" -> [3, ...]
        for tmp_int in tmp_label:
            label.append(tmp_int)

# shuffle features and labels jointly, then hold out the last 5% for validation
cc = list(zip(input, label))
random.shuffle(cc)
input[:], label[:] = zip(*cc)
train_fg = len(input) * 19 // 20
eval_input = input[train_fg:]   # (the original sliced from train_fg+1, dropping one sample)
eval_label = label[train_fg:]
input = input[:train_fg]
label = label[:train_fg]

eval_input = torch.stack(eval_input)   # (N, 1, 128, 128)
eval_label = torch.tensor(eval_label)
print(eval_input.size())
print(eval_label.size())
# standardize to zero mean, unit variance
eval_input = (eval_input - torch.mean(eval_input)) / torch.std(eval_input)
eval_set = torch.utils.data.TensorDataset(eval_input, eval_label)
dataloader_eval = torch.utils.data.DataLoader(eval_set, batch_size=BS, shuffle=True)
input = torch.stack(input)
label = torch.tensor(label)
print(input.size())
print(label.size())
input = (input - torch.mean(input)) / torch.std(input)
train = torch.utils.data.TensorDataset(input, label)
dataloader_train = torch.utils.data.DataLoader(train, batch_size=BS, shuffle=True)
model = CRNN(num_classes=10).cuda()
ansnet = copy.deepcopy(model)   # snapshot of the best model so far
mark_suc = 0
loss_func = nn.CrossEntropyLoss()
learning_rate = 0.001
opt = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.0005)
# decay the learning rate by 10x at epochs 50 and 80
scheduler = MultiStepLR(opt, milestones=[50, 80], gamma=0.1)
al_epoch = 100
al_sum = len(dataloader_train) * al_epoch
cnt = 0
for epoch in range(al_epoch):
    print("training...")
    model.train()
    running_corrects = 0
    len_dataset = 0
    running_loss = 0.0
    for step, (t_x, t_y) in enumerate(dataloader_train):
        cnt += 1
        t_x = t_x.cuda()
        t_y = t_y.cuda()
        opt.zero_grad()
        outputs = model(t_x)
        loss = loss_func(outputs, t_y)
        _, preds = torch.max(outputs, 1)
        loss.backward()
        opt.step()
        print(str(cnt) + "/" + str(al_sum))
        running_corrects += torch.sum(preds == t_y.data)
        running_loss += loss.item()
        len_dataset += len(t_y.data)
    # MultiStepLR milestones are in epochs, so step once per epoch
    # (the original stepped per batch, which decayed the LR far too early)
    scheduler.step()
    tx_y.append(running_corrects / len_dataset)
    tx_x.append(epoch)
    loss_y.append(running_loss / len_dataset)
    print("suc:" + str(running_corrects / len_dataset))
    print("loss:" + str(running_loss / len_dataset))
    if running_corrects / len_dataset > mark_suc:
        mark_suc = running_corrects / len_dataset
        ansnet = copy.deepcopy(model)  # keep a real copy, not a reference
    # validation pass
    model.eval()
    running_corrects = 0
    len_dataset = 0
    running_loss = 0.0
    with torch.no_grad():
        for step, (t_x, t_y) in enumerate(dataloader_eval):
            t_x = t_x.cuda()
            t_y = t_y.cuda()
            out = model(t_x)
            running_loss += loss_func(out, t_y).item()
            _, preds = torch.max(out, 1)
            running_corrects += torch.sum(preds == t_y.data)
            len_dataset += len(t_y.data)
    va_y.append(running_corrects / len_dataset)
    print("val_suc:" + str(running_corrects / len_dataset))
    print("val_loss:" + str(running_loss / len_dataset))
# move the accuracy tensors to the CPU for plotting
ty_y = []
ty_yy = []
print(tx_x)
print(tx_y)
for tmp in tx_y:
    ty_y.append(tmp.cpu())
for tmp in va_y:
    ty_yy.append(tmp.cpu())
print(ty_y)
torch.save(ansnet, "mm.pt")
print("best epoch suc:" + str(mark_suc))
plt.plot(tx_x, ty_y, label='train accuracy')
plt.plot(tx_x, ty_yy, label='val accuracy')
plt.xlabel('epoch')
plt.legend()
plt.show()


The training accuracy (acc) and validation accuracy (val) over the course of training are shown in the figure below.



The best weights are saved at the end. Training accuracy stabilizes around 98% and validation accuracy peaks at 94.7%, a reasonably good result with plenty of room left for hyperparameter tuning; I only tuned twice, and I believe a bit more tuning could push the validation accuracy up a few more points. Feel free to try it yourself.

The GitHub repository link follows:

CRNNforUrbanSound8K

This work is licensed under a Creative Commons Attribution 4.0 International License.