%% Tutorial on tone recognition for isolated characters
% This tutorial explains the basics of Mandarin tone recognition for isolated characters.
% The dataset is available upon request.
%% Preprocessing
% Before we start, let's add necessary toolboxes to the search path of MATLAB:
% Put the three toolboxes used by this script on the search path. One call
% per folder keeps the resulting path order the same as before.
addpath('d:/users/jang/matlab/toolbox/utility');
addpath('d:/users/jang/matlab/toolbox/sap');
addpath('d:/users/jang/matlab/toolbox/machineLearning');
%% Dataset collection
% First of all, we shall collect all the recording data from the corpus directory.
audioDir='D:\dataSet\mandarinTone\msar-2013';
fileCount=100;		% Upper bound on the number of recordings used in this tutorial
auSet=recursiveFileList(audioDir, 'wav');
if isempty(auSet)
    % Stop here with a clear message instead of failing later on an empty list.
    error('No wav files found under %s', audioDir);
end
%auSet=auSet(1:length(auSet)/fileCount:end);	% Alternative: evenly spaced subset (step must be an integer)
% Clamp to the number of files actually found, so that a corpus with fewer
% than fileCount recordings does not trigger an index-out-of-range error.
auSet=auSet(1:min(fileCount, length(auSet)));
fprintf('Collected %d recordings...\n', length(auSet));
%%
% Since each recording contains 4 tones, we need to perform endpoint
% detection in order to have 4 segments corresponding to these 4 tones:
trOpt=trOptSet;
fprintf('Perform endpoint detection...\n');
fs=16000;		% Not referenced in the visible part of this script
% NOTE: the result could be cached, e.g.
%   if exist('auSet.mat', 'file'), load auSet.mat; else <run the loop below>; end
% The caching branch is disabled, so detection always runs.
tic
epdOpt=trOpt.epdOpt;
auNum=length(auSet);
for i=1:auNum
    auPath=auSet(i).path;
    fprintf('%d/%d, file=%s\n', i, auNum, auPath);
    au=myAudioRead(auPath);
    % Only the third output (the segment list) is needed here.
    % The trailing 1 is passed through unchanged (presumably a plot flag; confirm in epdByVol).
    [~, ~, segment]=epdByVol(au, epdOpt, 1);
    % Debug aid (disabled): pause whenever length(segment)~=4.
    auSet(i).segment=segment;
    auSet(i).segmentCount=length(segment);
    auSet(i).au=au;		% Keep the audio object for the later pitch tracking
end
toc
fprintf('Saving auSet.mat...\n');
save('auSet', 'auSet');
%%
% Since our endpoint detection cannot always successfully find these 4 segments, we can simply
% remove those recordings which cannot be correctly segmented:
segmentCountVec=[auSet.segmentCount];	% One segment count per recording
keepIndex=(segmentCountVec==4);		% True for recordings split into exactly 4 segments
auSet=auSet(keepIndex);
fprintf('Keep %d recordings for further analysis\n', length(auSet));
%%
% After this step, each recording should have 4 segments corresponding to 4
% tones. Then we can perform pitch tracking on these segments:
fprintf('Pitch tracking...\n');
ptOpt=trOpt.ptOpt;
ptOpt.pitchDiffMax=2;
for i=1:length(auSet)
    fprintf('%d/%d, file=%s\n', i, length(auSet), auSet(i).path);
    for j=1:length(auSet(i).segment)
        % Cut the j-th segment out of the full recording before tracking its pitch.
        au=auSet(i).au;
        au.signal=au.signal(auSet(i).segment(j).beginSample:auSet(i).segment(j).endSample);
        % Alternative (disabled): auSet(i).segment(j).pitch=pitchTrackForcedSmooth(au, ptOpt);
        auSet(i).segment(j).pitchObj=pitchTrack(au, ptOpt);
    end
end
fprintf('Saving auSet.mat after pitch tracking...\n');
save auSet auSet
%%
% After pitch tracking, we need to extract features. This is accomplished
% in the following 3 steps:
%
% * Subtract the mean of each segment's pitch, such that its average value is 0.
% * Fit a polynomial of order trOpt.feaOpt.polyOrder to the mean-subtracted pitch, and use the returned coefficients as the features for tone recognition (the original notes describe a 3rd-order fit giving 4 coefficients).
% * Interpolate the mean-subtracted pitch to a fixed length of 100 (kept for plotting only).
fprintf('Feature extraction...\n');
fprintf('Order for polynomial fitting=%d\n', trOpt.feaOpt.polyOrder);
for i=1:length(auSet)
    fprintf('%d/%d, file=%s\n', i, length(auSet), auSet(i).path);
    for j=1:length(auSet(i).segment)
        pitchObj=auSet(i).segment(j).pitchObj;
        pitchNorm=pitchObj.pitch-mean(pitchObj.pitch);
        x=linspace(-1, 1, length(pitchNorm));
        % Alternative (disabled): coef=polyfit(x, pitchNorm, trOpt.feaOpt.polyOrder);	% Common polynomial fitting
        coef=polyFitChebyshev(pitchNorm, trOpt.feaOpt.polyOrder);	% Chebyshev polynomial fitting. Why does pitchNorm not give better performance?
        auSet(i).segment(j).coef=coef(:);
        temp=interp1(x, pitchNorm, linspace(-1,1));	% For plotting only (linspace defaults to 100 points)
        auSet(i).segment(j).pitchNorm=temp(:);	% For plotting only
    end
end
%%
% Once we have all the features for the recordings, we can create the dataset
% for further exploration.
% Flatten all segments into one array. Every kept recording contributes
% exactly 4 segments, so elements i, i+4, i+8, ... are taken as tone i
% (this assumes the segments of each recording appear in tone order 1..4).
segment=[auSet.segment];
ds.input=[];
ds.output=[];
for i=1:4
    toneData(i).segment=segment(i:4:end);
end
% Stack the data tone by tone: one column of coefficients per segment, with
% the tone index as the class label.
for i=1:4
    ds.input=[ds.input, [toneData(i).segment.coef]];
    ds.output=[ds.output, i*ones(1, length(toneData(i).segment))];
end
ds.outputName={'tone1', 'tone2', 'tone3', 'tone4'};
inputNum=size(ds.input, 1);
for i=1:inputNum
    ds.inputName{i}=sprintf('c%d', i-1);	% c0, c1, c2, etc
end
fprintf('Saving ds.mat...\n');
save ds ds
%% Dataset visualization
% Once we have every piece of necessary information stored in "ds",
% we can invoke many different functions in Machine Learning Toolbox for
% data visualization and classification.
%%
% For instance, we can display the size of each class:
figure;
[classSize, classLabel]=dsClassSize(ds, 1);
%%
% We can plot the distribution of each feature within each class:
figure;
dsBoxPlot(ds);
%%
% The box plots indicate that the ranges of the features vary a lot. To verify,
% we can simply plot the range of features of the dataset:
figure;
dsRangePlot(ds);
%%
% Big range differences cause problems in distance-based classification. To
% avoid this, we can simply normalize the features:
ds2=ds;					% Same labels and names as ds
ds2.input=inputNormalize(ds.input);	% Only the inputs are replaced by their normalized version
%%
% We can plot the feature vectors within each class:
figure();
dsFeaVecPlot(ds);
figEnlarge;
%%
% We can draw scatter plots for every pair of features:
figure();
dsProjPlot2(ds);
figEnlarge;
%%
% The above plots are hard to read because the ranges of the features differ a lot.
% We can try the same plot with the normalized inputs:
figure();
dsProjPlot2(ds2);
figEnlarge;
%%
% We can also draw the scatter plots in 3D space:
figure();
dsProjPlot3(ds2);
figEnlarge;
%%
% In order to visualize the distribution of the dataset,
% we can project the original dataset into 2-D space.
% This can be achieved by LDA (linear discriminant analysis):
ds2d=lda(ds);
ds2d.input=ds2d.input([1 2], :);	% Keep only the first two rows of the projected inputs
figure();
dsScatterPlot(ds2d);
xlabel('Input 1');
ylabel('Input 2');
title('Features projected on the first 2 lda vectors');
%% Classification
% We can try the most straightforward KNNC (k-nearest neighbor classifier):
rr=knncLoo(ds);
fprintf('rr=%g%% for ds\n', 100*rr);
%%
% With the normalized dataset we can usually obtain a better accuracy:
[rr, computed]=knncLoo(ds2);	% "computed" is reused below for the confusion matrix
fprintf('rr=%g%% for ds2 of normalized inputs\n', 100*rr);
%%
% We can plot the confusion matrix:
% Start from the plotting defaults, then set the class names and display mode.
opt=confMatPlot('defaultOpt');
opt.className=ds.outputName;
opt.mode='both';
confMat=confMatGet(ds2.output, computed);	% Desired labels vs. KNNC outputs
figure();
confMatPlot(confMat, opt);
%%
% We can perform input selection to find the best features:
myTic=tic;	% Private timer handle, so other tic/toc pairs do not interfere
figure();
bestInputIndex=inputSelectSequential(ds2);
figEnlarge;
elapsedSec=toc(myTic);
fprintf('time=%g sec\n', elapsedSec);
%%
% We can even perform an exhaustive search on the classifiers and the way
% of input normalization:
opt=perfCv4classifier('defaultOpt');
opt.foldNum=10;		% Override the fold count in the default options
figure();
tic;
[perfData, bestId]=perfCv4classifier(ds, opt, 1);
toc
htmlTitle='Performance of various classifiers via cross validation';
structDispInHtml(perfData, htmlTitle);
%%
% We can then display the confusion matrix of the best classifier:
computedClass=perfData(bestId).bestComputedClass;	% Outputs of the best entry; also used in the error analysis
opt=confMatPlot('defaultOpt');
opt.className=ds.outputName;
confMat=confMatGet(ds.output, computedClass);
figure();
confMatPlot(confMat, opt);
%% Error analysis
% We can dispatch each classification result to each segment,
% and label the correctness of the classification:
% Walk through computedClass with a running index k, visiting tone 1's
% segments first, then tone 2's, and so on.
k=1;
for i=1:4
    segNum=length(toneData(i).segment);
    for j=1:segNum
        predictedTone=computedClass(k);
        toneData(i).segment(j).predicted=predictedTone;
        toneData(i).segment(j).correct=isequal(i, predictedTone);	% Correct when the prediction equals the tone index
        k=k+1;
    end
end
%%
% First of all, we can plot the normalized pitch curves for each tone:
figure();
for i=1:4
    subplot(2,2,i);
    correctFlag=[toneData(i).segment.correct];
    index0=correctFlag==0;	% Misclassified segments
    index1=correctFlag==1;	% Correctly classified segments
    pitchMat=[toneData(i).segment.pitchNorm];	% One column per segment
    pitchLen=size(pitchMat, 1);
    frameIndex=(1:pitchLen)';
    plot(frameIndex, pitchMat(:,index0), 'r', frameIndex, pitchMat(:, index1), 'b');
    title(sprintf('Tone %d', i));
end
axisLimitSame;
figEnlarge
%%
% In the above plots of normalized pitch vectors, we used "red" and "blue"
% to indicate the misclassified and correctly classified cases,
% respectively.
% As can be seen, some of the misclassified pitch curves are not smooth enough.
% Therefore, if we can derive smoother pitch curves, the overall accuracy
% of tone recognition may be improved.
%% Summary
% This is a brief tutorial on tone recognition in Mandarin Chinese, based on the features
% derived from pitch and volume.
% There are several directions for further improvement:
%
% * Explore other features, such as timbre.
% * Try the classification problem using the whole dataset.