%% Tutorial on tone recognition for isolated characters
% In this tutorial, we shall explain the basics of Mandarin tone
% recognition for isolated characters.
% The dataset is available at the course website. (TODO(review): the
% original comment left the URL blank — fill it in.)
%% Preprocessing
% Before we start, let's add the necessary toolboxes to the MATLAB search path:
addpath d:/users/jang/matlab/toolbox/utility
addpath d:/users/jang/matlab/toolbox/machineLearning
%%
% For compatibility, here we list the platform and MATLAB version that we
% used to run this script:
fprintf('Platform: %s\n', computer);
fprintf('MATLAB version: %s\n', version);
scriptStartTime=tic;
%% Dataset construction
% First of all, we shall collect all the recording data from the corpus directory.
audioDir='D:\dataSet\mandarinTone\2013-msar';
auData=recursiveFileList(audioDir, 'wav');
% Use only the first 100 recordings for simplicity. Guard with min() so a
% corpus with fewer than 100 files does not trigger an out-of-range index.
recordingCount=min(100, length(auData));
auData=auData(1:recordingCount);
fprintf('Collected %d recordings...\n', length(auData));
%%
% Since each recording contains 4 tones, we need to perform endpoint
% detection in order to have 4 segments corresponding to these 4 tones.
% The result is cached in auData.mat so that repeated runs of this script
% can skip the (slow) endpoint-detection pass.
fs=16000;	% sampling rate of the corpus recordings, in Hz
if ~exist('auData.mat', 'file')
	tic
	epdPrm=epdPrmSet(fs);
	for i=1:length(auData)
		fprintf('%d/%d, file=%s\n', i, length(auData), auData(i).path);
		aObj=myAudioRead(auData(i).path);
		% Volume-based endpoint detection; we only keep the segment list.
		[~, ~, auData(i).segment]=epdByVol(aObj, epdPrm);
		auData(i).segmentCount=length(auData(i).segment);
		auData(i).obj=aObj;
	end
	toc
	fprintf('Saving auData.mat...\n');
	save auData auData
else
	fprintf('Loading auData.mat...\n');
	load auData.mat
end
%%
% Since our endpoint detection cannot always successfully find these 4
% segments, we can simply remove those recordings which cannot be
% correctly segmented:
keepIndex=[auData.segmentCount]==4;
auData=auData(keepIndex);
fprintf('Keep %d recordings for further analysis\n', length(auData));
%%
% After this step, each recording should have 4 segments corresponding to 4
% tones.
% Then we can perform pitch tracking on these segments:
fprintf('Pitch tracking...\n');
nbits=16;
pfType=1;	% 0 for AMDF, 1 for ACF
ptOpt=ptOptSet(fs, nbits, pfType);
% frameSize is twice that of HTK; frame rate = 100, for use together with
% speech recognition. (Translated from the original Chinese comment.)
ptOpt.frameSize=640;
ptOpt.overlap=640-160;
ptOpt.useVolThreshold=0;
ptOpt.useClarityThreshold=0;
%ptOpt.mainFun='maxPickingOverPf';
for i=1:length(auData)
	fprintf('%d/%d, file=%s\n', i, length(auData), auData(i).path);
	for j=1:length(auData(i).segment)
		% Extract the j-th tone segment from the full recording and track
		% its pitch curve.
		tempObj=auData(i).obj;
		tempObj.signal=tempObj.signal(auData(i).segment(j).beginSample:auData(i).segment(j).endSample);
		auData(i).segment(j).pitch=pitchTrackingForcedSmooth(tempObj, ptOpt);
	end
end
%%
% After pitch tracking, we need to extract features. This is accomplished
% in the following 3 steps:
%
% * Interpolate the original pitch to have a fixed length of 100.
% * Subtract the mean of the interpolated pitch, such that its average value is 0.
% * Use a 3-order polynomial to fit the interpolated pitch, and use the
%   returned 4 coefficients as the features for tone recognition.
fprintf('Feature extraction...\n');
targetLen=100;	% fixed length of the interpolated pitch vector
polyOrder=3;	% polynomial order; yields polyOrder+1 feature coefficients
for i=1:length(auData)
	fprintf('%d/%d, file=%s\n', i, length(auData), auData(i).path);
	for j=1:length(auData(i).segment)
		pitch=auData(i).segment(j).pitch;
		% Resample the pitch curve to targetLen points (linspace without an
		% explicit count defaults to 100, so this matches the original code).
		pitch2=interp1(1:length(pitch), pitch, linspace(1, length(pitch), targetLen));
		pitchNorm=pitch2-mean(pitch2);	% zero-mean so only the contour shape matters
		coef=polyfit(linspace(1, length(pitch), targetLen), pitchNorm, polyOrder);
		auData(i).segment(j).coef=coef(:);
		auData(i).segment(j).pitchNorm=pitchNorm(:);
	end
end
%%
% Once we have all the features for the recordings, we can create the dataset
% for further exploration.
% Pool all segments; each recording contributes its 4 tone segments in
% order, so tone k occupies every 4th entry starting at index k.
segment=[auData.segment];
ds.input=[];
ds.output=[];
for toneId=1:4
	toneData(toneId).segment=segment(toneId:4:end);
	toneCount=length(toneData(toneId).segment);
	ds.input=[ds.input, [toneData(toneId).segment.coef]];
	ds.output=[ds.output, toneId*ones(1, toneCount)];
end
ds.outputName={'tone1', 'tone2', 'tone3', 'tone4'};
ds.inputName={'c1', 'c2', 'c3', 'c4'};
%% Dataset visualization
% With everything stored in the dataset structure "ds", we can invoke many
% functions in the Machine Learning Toolbox for visualization and
% classification.
%%
% For instance, we can display the size of each class:
figure; [classSize, classLabel]=dsClassSize(ds, 1);
%%
% We can plot the distribution of each features within each class:
figure; dsBoxPlot(ds);
%%
% The box plots indicate the ranges of the features vary a lot. To verify,
% we can simply plot the range of features of the dataset:
figure; dsRangePlot(ds);
%%
% Big range difference cause problems in distance-based classification. To
% avoid this, we can simply normalize the features:
ds2=ds;
ds2.input=inputNormalize(ds2.input);
%%
% We can plot the feature vectors within each class:
figure; dsFeaVecPlot(ds); figEnlarge;
%%
% We can do the scatter plots on every 2 features:
figure; dsProjPlot2(ds); figEnlarge;
%%
% It is hard to see the above plots due to a large difference in the range
% of each features. We can try the same plot with normalized inputs:
figure; dsProjPlot2(ds2); figEnlarge;
%%
% We can also do the scatter plots in the 3D space:
figure; dsProjPlot3(ds2); figEnlarge;
%%
% In order to visualize the distribution of the dataset,
% we can project the original dataset into 2-D space.
% This can be achieved by LDA (linear discriminant analysis):
ds2d=lda(ds);
% Keep only the first 2 LDA dimensions for a 2-D scatter plot.
ds2d.input=ds2d.input(1:2, :);
figure; dsScatterPlot(ds2d); xlabel('Input 1'); ylabel('Input 2');
title('Features projected on the first 2 lda vectors');
%% Classification
% We can try the most straightforward KNNC (k-nearest neighbor classifier):
rr=knncLoo(ds);
fprintf('rr=%g%% for ds\n', rr*100);
%%
% For normalized dataset, usually we can obtain a better accuracy:
[rr, computed]=knncLoo(ds2);
fprintf('rr=%g%% for ds2 of normalized inputs\n', rr*100);
%%
% We can plot the confusion matrix:
confMat=confMatGet(ds2.output, computed);
opt=confMatPlot('defaultOpt');
opt.className=ds.outputName;
opt.mode='both';
figure; confMatPlot(confMat, opt); figEnlarge;
%%
% We can perform input selection to find the best features:
figure; tic; inputSelectSequential(ds2, inf, 'knnc', 1); toc
%%
% We can even perform an exhaustive search on the classifiers and the way
% of input normalization:
opt=perfCv4classifier('defaultOpt');
opt.foldNum=10;
tic; [perfData, bestId]=perfCv4classifier(ds, opt, 1); toc
structDispInHtml(perfData, 'Performance of various classifiers via cross validation');
%%
% We can then display the confusion matrix of the best classifier:
confMat=confMatGet(ds.output, perfData(bestId).bestComputedClass);
opt=confMatPlot('defaultOpt');
opt.className=ds.outputName;
figure; confMatPlot(confMat, opt); figEnlarge;
%% Summary
% This is a brief tutorial on Mandarin tone recognition for isolated
% characters based on polynomial features of the pitch contour.
% There are several directions for further improvement:
%
% * Explore other features, such as the normalized pitch contour itself or
%   its statistics
% * Try the classification problem using the whole dataset instead of only
%   the first 100 recordings
% * Use template matching (e.g., DTW on pitch contours) as an alternative
%   to improve the performance
%
%%
% Overall elapsed time:
toc(scriptStartTime)
%%
% NOTE(review): the original footer was an unfilled auto-generated
% placeholder ("created on date"); author and date unknown.