%% Tutorial on tone recognition for isolated characters
% This tutorial explains the basics of Mandarin tone recognition for isolated characters.
% The dataset is available upon request.

%% Preprocessing
% Before we start, let's add the necessary toolboxes to the search path of MATLAB:
addpath d:/users/jang/matlab/toolbox/utility
addpath d:/users/jang/matlab/toolbox/sap
addpath d:/users/jang/matlab/toolbox/machineLearning

%%
% All the above toolboxes can be downloaded from the author's website.
% Make sure you are using the latest toolboxes to work with this script.

%%
% For compatibility, here we list the platform and MATLAB version that we used to run this script:
fprintf('Platform: %s\n', computer);
fprintf('MATLAB version: %s\n', version);
fprintf('Script starts at %s\n', char(datetime));
scriptStartTime=tic;	% Timing for the whole script

%% Dataset collection
% First of all, we shall collect all the recordings from the corpus directory:
audioDir='D:\dataSet\mandarinTone\msar-2013';
fileCount=100;
auSet=recursiveFileList(audioDir, 'wav');
%auSet=auSet(1:length(auSet)/fileCount:end);	% Use only a subset for simplicity
auSet=auSet(1:fileCount);
fprintf('Collected %d recordings...\n', length(auSet));

%%
% Since each recording contains 4 tones, we need to perform endpoint
% detection in order to obtain 4 segments corresponding to these 4 tones:
trOpt=trOptSet;
fprintf('Perform endpoint detection...\n');
fs=16000;
%if ~exist('auSet.mat', 'file')
	tic
	epdOpt=trOpt.epdOpt;
	for i=1:length(auSet)
		fprintf('%d/%d, file=%s\n', i, length(auSet), auSet(i).path);
		au=myAudioRead(auSet(i).path);
		[~, ~, segment]=epdByVol(au, epdOpt, 1);	% Volume-based endpoint detection
	%	if length(segment)~=4, fprintf('Press...'); pause; fprintf('\n'); end
		auSet(i).segment=segment;
		auSet(i).segmentCount=length(segment);
		auSet(i).au=au;
	end
	toc
	fprintf('Saving auSet.mat...\n');
	save auSet auSet
%else
%	fprintf('Loading auSet.mat...\n');
%	load auSet.mat
%end

%%
% Since our endpoint detection cannot always find these 4 segments successfully, we simply
% remove the recordings that cannot be correctly segmented:
keepIndex=[auSet.segmentCount]==4;
auSet=auSet(keepIndex);
fprintf('Keep %d recordings for further analysis\n', length(auSet));

%%
% After this step, each recording should have 4 segments corresponding to the 4
% tones. Then we can perform pitch tracking on these segments:
fprintf('Pitch tracking...\n');
ptOpt=trOpt.ptOpt;
ptOpt.pitchDiffMax=2;
for i=1:length(auSet)
	fprintf('%d/%d, file=%s\n', i, length(auSet), auSet(i).path);
	for j=1:length(auSet(i).segment)
		au=auSet(i).au;
		au.signal=au.signal(auSet(i).segment(j).beginSample:auSet(i).segment(j).endSample);
	%	auSet(i).segment(j).pitch=pitchTrackForcedSmooth(au, ptOpt);
		auSet(i).segment(j).pitchObj=pitchTrack(au, ptOpt);
	end
end
fprintf('Saving auSet.mat after pitch tracking...\n');
save auSet auSet

%%
% After pitch tracking, we need to extract features. This is accomplished
% in the following steps:
%
% * Interpolate the original pitch to have a fixed length of 100.
% * Subtract the mean of the interpolated pitch, such that its average value is 0.
% * Fit the interpolated pitch with a third-order polynomial, and use the returned 4 coefficients as the features for tone recognition.
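%%
% Before running the actual extraction, here is a minimal sketch of these
% steps using only built-in functions. The pitch vector pv below is
% synthetic and serves for illustration only:
pv=60+5*sin(linspace(0, pi, 37));	% Hypothetical raw pitch vector, in semitones
pvInterp=interp1(linspace(-1, 1, length(pv)), pv, linspace(-1, 1, 100));	% Step 1: interpolate to a fixed length of 100
pvNorm=pvInterp-mean(pvInterp);	% Step 2: subtract the mean
coefDemo=polyfit(linspace(-1, 1, 100), pvNorm, 3);	% Step 3: third-order polynomial fit, giving 4 coefficients
fprintf('Demo feature vector: %s\n', mat2str(coefDemo, 4));
%%
% Note that the actual extraction below uses the toolbox function
% polyFitChebyshev instead of the ordinary polyfit shown in this sketch: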
fprintf('Feature extraction...\n');
fprintf('Order for polynomial fitting=%d\n', trOpt.feaOpt.polyOrder);
for i=1:length(auSet)
	fprintf('%d/%d, file=%s\n', i, length(auSet), auSet(i).path);
	for j=1:length(auSet(i).segment)
		pitchObj=auSet(i).segment(j).pitchObj;
		pitchNorm=pitchObj.pitch-mean(pitchObj.pitch);
		x=linspace(-1, 1, length(pitchNorm));
	%	coef=polyfit(x, pitchNorm, trOpt.feaOpt.polyOrder);	% Ordinary polynomial fitting
		coef=polyFitChebyshev(pitchNorm, trOpt.feaOpt.polyOrder);	% Chebyshev polynomial fitting. (Why doesn't pitchNorm give better performance?)
		auSet(i).segment(j).coef=coef(:);
		temp=interp1(x, pitchNorm, linspace(-1,1));	% For plotting only
		auSet(i).segment(j).pitchNorm=temp(:);	% For plotting only
	end
end

%%
% Once we have all the features for the recordings, we can create the dataset
% for further exploration:
segment=[auSet.segment];
ds.input=[];
ds.output=[];
for i=1:4
	toneData(i).segment=segment(i:4:end);
end
for i=1:4
	ds.input=[ds.input, [toneData(i).segment.coef]];
	ds.output=[ds.output, i*ones(1, length(toneData(i).segment))];
end
ds.outputName={'tone1', 'tone2', 'tone3', 'tone4'};
inputNum=size(ds.input, 1);
for i=1:inputNum
	ds.inputName{i}=sprintf('c%d', i-1);	% c0, c1, c2, etc.
end
fprintf('Saving ds.mat...\n');
save ds ds

%% Dataset visualization
% Once we have every piece of necessary information stored in "ds",
% we can invoke many functions in the Machine Learning Toolbox for
% data visualization and classification.

%%
% For instance, we can display the size of each class:
figure; [classSize, classLabel]=dsClassSize(ds, 1);

%%
% We can plot the distribution of each feature within each class:
figure; dsBoxPlot(ds);

%%
% The box plots indicate that the ranges of the features vary a lot. To verify,
% we can simply plot the ranges of the features in the dataset:
figure; dsRangePlot(ds);

%%
% Large range differences cause problems in distance-based classification. To
% avoid this, we can simply normalize the features:
ds2=ds;
ds2.input=inputNormalize(ds2.input);
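%%
% inputNormalize is a toolbox function; assuming it performs z-score
% normalization (zero mean and unit variance for each feature), an
% equivalent computation using only built-ins would be:
mu=mean(ds.input, 2);	% Mean of each feature (row) across all samples
sigma=std(ds.input, 0, 2);	% Standard deviation of each feature
zScored=bsxfun(@rdivide, bsxfun(@minus, ds.input, mu), sigma);	% Each feature now has zero mean and unit variance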
%%
% We can plot the feature vectors within each class:
figure; dsFeaVecPlot(ds); figEnlarge;

%%
% We can create scatter plots for every pair of features:
figure; dsProjPlot2(ds); figEnlarge;

%%
% The above plots are hard to read due to the large range differences among the features.
% We can try the same plot with normalized inputs:
figure; dsProjPlot2(ds2); figEnlarge;

%%
% We can also create scatter plots in the 3D space:
figure; dsProjPlot3(ds2); figEnlarge;

%%
% In order to visualize the distribution of the dataset,
% we can project the original dataset onto a 2-D space.
% This can be achieved by LDA (linear discriminant analysis):
ds2d=lda(ds);
ds2d.input=ds2d.input(1:2, :);
figure; dsScatterPlot(ds2d); xlabel('Input 1'); ylabel('Input 2');
title('Features projected on the first 2 LDA vectors');

%% Classification
% We can try the most straightforward classifier, KNNC (k-nearest-neighbor classifier):
rr=knncLoo(ds);
fprintf('rr=%g%% for ds\n', rr*100);

%%
% With the normalized dataset, we can usually obtain better accuracy:
[rr, computed]=knncLoo(ds2);
fprintf('rr=%g%% for ds2 of normalized inputs\n', rr*100);

%%
% We can plot the confusion matrix:
confMat=confMatGet(ds2.output, computed);
opt=confMatPlot('defaultOpt');
opt.className=ds.outputName;
opt.mode='both';
figure; confMatPlot(confMat, opt);

%%
% We can perform input selection to find the best features:
myTic=tic;
figure; bestInputIndex=inputSelectSequential(ds2); figEnlarge;
fprintf('time=%g sec\n', toc(myTic));

%%
% We can even perform an exhaustive search over the classifiers and the
% methods of input normalization:
opt=perfCv4classifier('defaultOpt');
opt.foldNum=10;
figure; tic; [perfData, bestId]=perfCv4classifier(ds, opt, 1); toc
structDispInHtml(perfData, 'Performance of various classifiers via cross validation');

%%
% We can then display the confusion matrix of the best classifier:
computedClass=perfData(bestId).bestComputedClass;
confMat=confMatGet(ds.output, computedClass);
opt=confMatPlot('defaultOpt');
opt.className=ds.outputName;
figure; confMatPlot(confMat, opt);

%% Error analysis
% We can dispatch each classification result to its corresponding segment,
% and label the correctness of the classification:
k=1;
for i=1:4
	for j=1:length(toneData(i).segment)
		toneData(i).segment(j).predicted=computedClass(k);
		toneData(i).segment(j).correct=isequal(i, computedClass(k));
		k=k+1;
	end
end

%%
% First of all, we can plot the normalized pitch curves for each tone:
figure;
for i=1:4
	subplot(2,2,i);
	index0=[toneData(i).segment.correct]==0;
	index1=[toneData(i).segment.correct]==1;
	pitchMat=[toneData(i).segment.pitchNorm];
	pitchLen=size(pitchMat, 1);
	plot((1:pitchLen)', pitchMat(:,index0), 'r', (1:pitchLen)', pitchMat(:, index1), 'b');
	title(sprintf('Tone %d', i));
end
axisLimitSame; figEnlarge

%%
% In the above plots of normalized pitch vectors, we use red and blue
% to indicate the misclassified and correctly classified cases, respectively.
% As can be seen, some of the misclassified pitch curves are not smooth enough.
% Therefore, if we can derive smoother pitch curves, the overall accuracy
% of tone recognition may be improved.

%% Summary
% This is a brief tutorial on tone recognition in Mandarin Chinese, based on
% features derived from pitch, with volume used for endpoint detection.
% There are several directions for further improvement:
%
% * Explore other features, such as timbre (see the sketch at the end of this script).
% * Try the classification problem using the whole dataset.
%

%% Appendix
% List of functions and datasets used in this script:
%
% * <../list.asp List of files in this folder>
%

%%
% Date and time when finishing this script:
fprintf('%s\n', char(datetime));

%%
% Overall elapsed time:
toc(scriptStartTime)
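%%
% As a possible starting point for the "other features" direction mentioned
% in the summary, MFCCs are a common timbre-related representation. This is
% only a sketch (not executed here); it assumes the Audio Toolbox function
% mfcc is available and that each au structure carries its sample rate in au.fs:
%
%   au=myAudioRead(auSet(1).path);
%   frameMfcc=mfcc(au.signal, au.fs);   % One row of MFCCs per frame
%   timbreFea=mean(frameMfcc, 1)';      % Hypothetical segment-level feature: average over frames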