import fastaudio
from fastaudio.core.all import *
from fastai.vision.all import *
%config InlineBackend.figure_format = 'retina'
Path()
Path('.')
 
 
path = Path()/'cleanAudio'
 
(#1) [Path('labels_no_duplicates.csv')]
 
data = pd.read_csv('labels_no_duplicates.csv', low_memory=False)
data
Unnamed: 0 path pattern kana syl drop
0 0 ある.yomi000142BB_0596.mp3 頭高 アル 2 1
1 2 思う.yomi0006C617_043A.mp3 中高 オモウ 3 2
2 3 など.yomi000240B7_0028.mp3 頭高 ナド 2 1
3 4 私.yomi00092F63_0072.mp3 平板 ワタくシ 4 0
4 5 見る.yomi000A41BD_001E.mp3 頭高 ミル 2 1
... ... ... ... ... ... ...
66489 74183 捨てがな_捨て仮名.yomi00072538_06BE.mp3 平板 すテカ゚ナ 5 0
66490 74185 くも膜下出血_蜘蛛膜下出血.yomi0001AAD1_0622.mp3 中高 クモマッカしュッケツ 10 6
66491 74187 捜す.yomi00072507_0088.mp3 平板 サカ゚ス 4 0
66492 74188 捜し物.yomi000724FD_0424.mp3 平板 サカ゚シモノ 6 0
66493 74189 あこや貝_阿古屋貝.yomi00013767_0114.mp3 中高 アコヤカ゚イ 6 3

66494 rows × 6 columns
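Before modeling, it is worth checking how evenly the three accent patterns (頭高, 中高, 平板) are represented; the Balanced Dataset section below downsamples them to equal sizes. A quick check (added here for illustration; its output was not in the original run):

data.pattern.value_counts()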

cnfg = AudioConfig.Voice()  # speech preset: 16 kHz sample rate, 50–8000 Hz mel range
at = AudioTensor.create(path/'捨てがな_捨て仮名.yomi00072538_06BE.mp3')
aud2spec = AudioToSpec.from_cfg(cnfg)  # waveform -> mel spectrogram transform
aud2spec(at).show()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/tmp/ipykernel_71/2151244571.py in <module>
----> 1 aud2spec(at).show()

~/.conda/envs/default/lib/python3.9/site-packages/fastaudio/core/spectrogram.py in show(self, ctx, ax, title, **kwargs)
     75     def show(self, ctx=None, ax=None, title="", **kwargs):
     76         "Show spectrogram using librosa"
---> 77         return show_spectrogram(self, ctx=ctx, ax=ax, title=title, **kwargs)
     78 
     79 

~/.conda/envs/default/lib/python3.9/site-packages/fastaudio/core/spectrogram.py in show_spectrogram(sg, title, ax, ctx, **kwargs)
     86         # x_start, y_start, x_lenght, y_lenght, all in percent
     87         ia = ax.inset_axes((i / sg.nchannels, 0.2, 1 / sg.nchannels, 0.7))
---> 88         z = specshow(
     89             channel.cpu().numpy(), ax=ia, **sg._all_show_args(show_y=i == 0), **kwargs
     90         )

~/.conda/envs/default/lib/python3.9/site-packages/librosa/display.py in specshow(data, x_coords, y_coords, x_axis, y_axis, sr, hop_length, fmin, fmax, tuning, bins_per_octave, key, Sa, mela, thaat, ax, **kwargs)
    854     # Set up axis scaling
    855     __scale_axes(axes, x_axis, "x")
--> 856     __scale_axes(axes, y_axis, "y")
    857 
    858     # Construct tickers and locators

~/.conda/envs/default/lib/python3.9/site-packages/librosa/display.py in __scale_axes(axes, ax_type, which)
    972         return
    973 
--> 974     scaler(mode, **kwargs)
    975 
    976 

~/.conda/envs/default/lib/python3.9/site-packages/matplotlib/axes/_base.py in set_yscale(self, value, **kwargs)
   4098         g = self.get_shared_y_axes()
   4099         for ax in g.get_siblings(self):
-> 4100             ax.yaxis._set_scale(value, **kwargs)
   4101             ax._update_transScale()
   4102             ax.stale = True

~/.conda/envs/default/lib/python3.9/site-packages/matplotlib/axis.py in _set_scale(self, value, **kwargs)
    759     def _set_scale(self, value, **kwargs):
    760         if not isinstance(value, mscale.ScaleBase):
--> 761             self._scale = mscale.scale_factory(value, self, **kwargs)
    762         else:
    763             self._scale = value

~/.conda/envs/default/lib/python3.9/site-packages/matplotlib/scale.py in scale_factory(scale, axis, **kwargs)
    595         scale = scale.lower()
    596     scale_cls = _api.check_getitem(_scale_mapping, scale=scale)
--> 597     return scale_cls(axis, **kwargs)
    598 
    599 

TypeError: __init__() got an unexpected keyword argument 'linthreshy'
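show() fails here because this librosa version still passes the linthreshy keyword, which newer matplotlib releases removed; upgrading librosa (or pinning an older matplotlib) should resolve it. As a workaround, the spectrogram tensor can be rendered directly with matplotlib — a minimal sketch, assuming a mono clip:

import matplotlib.pyplot as plt
sg = aud2spec(at)  # shape: (channels, n_mels, frames)
plt.imshow(sg[0].cpu().numpy(), origin='lower', aspect='auto')
plt.xlabel('frame'); plt.ylabel('mel bin')
plt.show()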
at.show()
crop2s = ResizeSignal(2000)  # crop or pad every clip to 2000 ms
data = data.iloc[:,1:].set_index('path')
data
pattern kana syl drop
path
ある.yomi000142BB_0596.mp3 頭高 アル 2 1
思う.yomi0006C617_043A.mp3 中高 オモウ 3 2
など.yomi000240B7_0028.mp3 頭高 ナド 2 1
私.yomi00092F63_0072.mp3 平板 ワタくシ 4 0
見る.yomi000A41BD_001E.mp3 頭高 ミル 2 1
... ... ... ... ...
捨てがな_捨て仮名.yomi00072538_06BE.mp3 平板 すテカ゚ナ 5 0
くも膜下出血_蜘蛛膜下出血.yomi0001AAD1_0622.mp3 中高 クモマッカしュッケツ 10 6
捜す.yomi00072507_0088.mp3 平板 サカ゚ス 4 0
捜し物.yomi000724FD_0424.mp3 平板 サカ゚シモノ 6 0
あこや貝_阿古屋貝.yomi00013767_0114.mp3 中高 アコヤカ゚イ 6 3

66494 rows × 4 columns

data.loc['ある.yomi000142BB_0596.mp3', 'pattern']
'頭高'
def get_label(fname):
    return data.loc[fname, 'pattern']  # accent pattern for a given file name
dblock = DataBlock(blocks=[AudioBlock, CategoryBlock],
                   get_items=get_audio_files,
                   item_tfms=[crop2s, aud2spec],
                   get_y=using_attr(get_label, 'name'),
                   splitter=RandomSubsetSplitter(0.1, 0.02))
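fastai's DataBlock.summary can trace a single sample through the pipeline before building the dataloaders, which is handy for catching transform errors early (a debugging step added here, not in the original run):

dblock.summary(path)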
dls = dblock.dataloaders(path, bs=32, shuffle=True)
len(dls.get_idxs())
dls.one_batch()[0].shape
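With these settings the batch should come out as (32, 1, 128, 251), assuming mono clips: batch size 32, one channel, 128 mel bins, and 2000 ms of 16 kHz audio at hop length 128 gives 32000 / 128 = 250 hops, i.e. 251 centered frames.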
learn = Learner(dls, xresnet50(pretrained=True), metrics=[accuracy, F1Score(average='weighted')])
learn.loss_func
FlattenedLoss of CrossEntropyLoss()
learn = learn.to_fp16()  # train in mixed precision
learn.model[0][0].in_channels
def alter_learner(learn):
    # The pretrained stem expects 3-channel (RGB) images; spectrograms here have 1 channel.
    # Keep a single slice of the pretrained weights so the first conv accepts 1 channel.
    layer = learn.model[0][0]
    layer.in_channels = 1
    layer.weight = nn.Parameter(layer.weight[:, 1, :, :].unsqueeze(1))
    learn.model[0][0] = layer
alter_learner(learn)
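Slicing keeps only one channel's pretrained filters; a common alternative is to average the weights over all three input channels so none of the pretrained signal is discarded — a minimal sketch, not what is used in this run:

def alter_learner_mean(learn):
    layer = learn.model[0][0]
    layer.in_channels = 1
    # average the RGB filter weights instead of picking a single channel
    layer.weight = nn.Parameter(layer.weight.mean(dim=1, keepdim=True))
    learn.model[0][0] = layer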
learn.fine_tune(5, 3e-3)
epoch train_loss valid_loss accuracy f1_score time
0 0.424598 4.219926 0.439428 0.277089 00:34
epoch train_loss valid_loss accuracy f1_score time
0 0.205065 0.845803 0.760722 0.740047 00:33
1 0.203014 0.758320 0.784048 0.767731 00:33
2 0.143726 0.127169 0.960873 0.956737 00:33
3 0.111206 0.120738 0.963130 0.956488 00:34
4 0.076544 0.093413 0.968397 0.964488 00:34
learn.lr_find()
SuggestedLRs(lr_min=6.309573450380412e-08, lr_steep=9.12010818865383e-07)
# a few more epochs at the tiny rates lr_find now suggests
learn.fit(3, slice(3e-7, 3e-6), wd=0.05)
epoch train_loss valid_loss accuracy f1_score time
0 0.078476 0.092806 0.966892 0.962685 00:33
1 0.080976 0.093654 0.967645 0.963156 00:33
2 0.083820 0.091037 0.968397 0.964490 00:33

Balanced Dataset

import random
random.seed(42)
# downsample each accent pattern to 2400 examples for a balanced dataset
nakadaka = random.sample(data[data.pattern == '中高'].index.values.tolist(), 2400)
atamadaka = random.sample(data[data.pattern == '頭高'].index.values.tolist(), 2400)
heiban = random.sample(data[data.pattern == '平板'].index.values.tolist(), 2400)
ds = [(i, 'nakadaka') for i in nakadaka] + [(i, 'atamadaka') for i in atamadaka] + [(i, 'heiban') for i in heiban]
df = pd.DataFrame(ds)
df[0] = path / df[0]
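A quick sanity check that the three classes are now equally represented (added for illustration; its output was not in the original run):

df[1].value_counts()  # expect 2400 per class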
def get_x(row):
    return row[0]  # column 0: audio file path
def get_y(row):
    return row[1]  # column 1: accent pattern label
dblock = DataBlock(blocks=[AudioBlock, CategoryBlock],
                   get_x=get_x,
                   item_tfms=[crop2s, aud2spec],
                   get_y=get_y,
                   splitter=RandomSplitter(valid_pct=0.2))
dls2 = dblock.dataloaders(df)
learn2 = Learner(dls2, xresnet50(pretrained=True), CrossEntropyLossFlat(),
                 metrics=[accuracy, F1Score(average='weighted')], wd=0.05).to_fp16()
alter_learner(learn2)
learn2.lr_find()
SuggestedLRs(lr_min=0.07585775852203369, lr_steep=0.015848932787775993)
learn2.fine_tune(7, 9e-3)
epoch train_loss valid_loss accuracy f1_score time
0 1.734659 0.703426 0.713889 0.700657 00:26
epoch train_loss valid_loss accuracy f1_score time
0 0.363018 0.286336 0.909028 0.908712 00:26
1 0.229606 0.172058 0.947917 0.948306 00:26
2 0.133098 0.254065 0.905556 0.902066 00:26
3 0.088526 0.105291 0.961111 0.961012 00:26
4 0.060891 0.073923 0.967361 0.967326 00:27
5 0.043505 0.065347 0.975694 0.975679 00:27
6 0.033957 0.058501 0.975694 0.975704 00:27
learn2.recorder.plot_loss()

Data Augmentation

from fastaudio.augment.all import *
DBMelSpec = SpectrogramTransformer()  # factory for spectrogram transforms (fastaudio defaults: mel=True, to_db=True)
aud2spec.settings
{'mel': 'True',
 'to_db': 'False',
 'sample_rate': 16000,
 'n_fft': 1024,
 'win_length': 1024,
 'hop_length': 128,
 'f_min': 50.0,
 'f_max': 8000.0,
 'pad': 0,
 'n_mels': 128,
 'window_fn': <function _VariableFunctionsClass.hann_window>,
 'power': 2.0,
 'normalized': False,
 'wkwargs': None,
 'center': True,
 'pad_mode': 'reflect',
 'onesided': True,
 'norm': None,
 'stype': 'power',
 'top_db': None}
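Note that the Voice config's transform is a mel power spectrogram without dB scaling ('to_db': 'False' above). The DBMelSpec factory defined earlier can build a dB-scaled variant with matching parameters — a minimal sketch, hedged on fastaudio's keyword handling:

aud2spec_db = DBMelSpec(sample_rate=16000, n_mels=128, f_min=50.0, f_max=8000.0, hop_length=128)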
 
aud2spec(at).show()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
[... same librosa/matplotlib traceback as above ...]
TypeError: __init__() got an unexpected keyword argument 'linthreshy'
item_tfms = [RemoveSilence(), ResizeSignal(2000), aud2spec, MaskTime(1, size=4), MaskFreq(1, size=3)]  # SpecAugment-style: one time mask (width 4) and one frequency mask (width 3)
show_image(Pipeline(item_tfms)(at))  # the masks are random, so each call looks different
<AxesSubplot:>
show_image(Pipeline(item_tfms)(at))
<AxesSubplot:>
dblock = DataBlock(blocks=[AudioBlock, CategoryBlock],
                  get_x=get_x,
                   item_tfms=item_tfms,
                   get_y=get_y,
                   splitter=RandomSplitter(valid_pct=0.2)
                  )
dls3 = dblock.dataloaders(df)
learn3 = Learner(dls3, xresnet50(pretrained=True), CrossEntropyLossFlat(),
                 wd=0.05, metrics=[accuracy, F1Score(average='weighted')]).to_fp16()
alter_learner(learn3)
learn3.lr_find()
learn3.fine_tune(10, 7e-3)