%% Tutorial on tone recognition for isolated characters
% This tutorial explains the basics of Mandarin tone recognition for isolated characters.
% The dataset is available upon request.

%% Preprocessing
% Before we start, let's add the necessary toolboxes to the search path of MATLAB:
addpath d:/users/jang/matlab/toolbox/utility
addpath d:/users/jang/matlab/toolbox/sap
addpath d:/users/jang/matlab/toolbox/machineLearning

%%
% All the above toolboxes can be downloaded from the author's website.
% Make sure you are using the latest toolboxes to work with this script.

%%
% For compatibility, here we list the platform and MATLAB version that we used to run this script:
fprintf('Platform: %s\n', computer);
fprintf('MATLAB version: %s\n', version);
fprintf('Script starts at %s\n', char(datetime));
scriptStartTime=tic;	% Timing for the whole script

%% Dataset collection
% First of all, we shall collect all the recordings from the corpus directory:
audioDir='D:\dataSet\mandarinTone\msar-2013';
fileCount=100;
auSet=recursiveFileList(audioDir, 'wav');
%auSet=auSet(1:length(auSet)/fileCount:end);	% Use only a subset for simplicity
auSet=auSet(1:fileCount);
fprintf('Collected %d recordings...\n', length(auSet));

%%
% Since each recording contains 4 tones, we need to perform endpoint
% detection in order to obtain 4 segments corresponding to these 4 tones:
trOpt=trOptSet;
fprintf('Perform endpoint detection...\n');
fs=16000;
%if ~exist('auSet.mat', 'file')
	tic
	epdOpt=trOpt.epdOpt;
	for i=1:length(auSet)
		fprintf('%d/%d, file=%s\n', i, length(auSet), auSet(i).path);
		au=myAudioRead(auSet(i).path);
		[~, ~, segment]=epdByVol(au, epdOpt, 1);	% Volume-based endpoint detection
	%	if length(segment)~=4, fprintf('Press...'); pause; fprintf('\n'); end
		auSet(i).segment=segment;
		auSet(i).segmentCount=length(segment);
		auSet(i).au=au;
	end
	toc
	fprintf('Saving auSet.mat...\n');
	save auSet auSet
%else
%	fprintf('Loading auSet.mat...\n');
%	load auSet.mat
%end

%%
% Since our endpoint detection cannot always find these 4 segments successfully, we simply
% remove the recordings that cannot be correctly segmented:
keepIndex=[auSet.segmentCount]==4;
auSet=auSet(keepIndex);
fprintf('Keep %d recordings for further analysis\n', length(auSet));

%%
% After this step, each recording should have 4 segments corresponding to the 4
% tones. Then we can perform pitch tracking on these segments:
fprintf('Pitch tracking...\n');
ptOpt=trOpt.ptOpt;
ptOpt.pitchDiffMax=2;
for i=1:length(auSet)
	fprintf('%d/%d, file=%s\n', i, length(auSet), auSet(i).path);
	for j=1:length(auSet(i).segment)
		au=auSet(i).au;
		au.signal=au.signal(auSet(i).segment(j).beginSample:auSet(i).segment(j).endSample);
	%	auSet(i).segment(j).pitch=pitchTrackForcedSmooth(au, ptOpt);
		auSet(i).segment(j).pitchObj=pitchTrack(au, ptOpt);
	end
end
fprintf('Saving auSet.mat after pitch tracking...\n');
save auSet auSet

%%
% After pitch tracking, we need to extract features. This is accomplished
% in the following steps:
%
% * Interpolate the original pitch to have a fixed length of 100.
% * Subtract the mean of the interpolated pitch, such that its average value is 0.
% * Fit the interpolated pitch with a third-order polynomial, and use the returned 4 coefficients as the features for tone recognition.
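%%
% Before running the actual extraction, here is a minimal sketch of these
% steps using only built-in functions. The pitch vector pv below is
% synthetic and serves for illustration only:
pv=60+5*sin(linspace(0, pi, 37));	% Hypothetical raw pitch vector, in semitones
pvInterp=interp1(linspace(-1, 1, length(pv)), pv, linspace(-1, 1, 100));	% Step 1: interpolate to a fixed length of 100
pvNorm=pvInterp-mean(pvInterp);	% Step 2: subtract the mean
coefDemo=polyfit(linspace(-1, 1, 100), pvNorm, 3);	% Step 3: third-order polynomial fit, giving 4 coefficients
fprintf('Demo feature vector: %s\n', mat2str(coefDemo, 4));
%%
% Note that the actual extraction below uses the toolbox function
% polyFitChebyshev instead of the ordinary polyfit shown in this sketch: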
fprintf('Feature extraction...\n');
fprintf('Order for polynomial fitting=%d\n', trOpt.feaOpt.polyOrder);
for i=1:length(auSet)
	fprintf('%d/%d, file=%s\n', i, length(auSet), auSet(i).path);
	for j=1:length(auSet(i).segment)
		pitchObj=auSet(i).segment(j).pitchObj;
		pitchNorm=pitchObj.pitch-mean(pitchObj.pitch);
		x=linspace(-1, 1, length(pitchNorm));
	%	coef=polyfit(x, pitchNorm, trOpt.feaOpt.polyOrder);	% Ordinary polynomial fitting
		coef=polyFitChebyshev(pitchNorm, trOpt.feaOpt.polyOrder);	% Chebyshev polynomial fitting. (Why doesn't pitchNorm give better performance?)
		auSet(i).segment(j).coef=coef(:);
		temp=interp1(x, pitchNorm, linspace(-1,1));	% For plotting only
		auSet(i).segment(j).pitchNorm=temp(:);	% For plotting only
	end
end

%%
% Once we have all the features for the recordings, we can create the dataset
% for further exploration:
segment=[auSet.segment];
ds.input=[];
ds.output=[];
for i=1:4
	toneData(i).segment=segment(i:4:end);
end
for i=1:4
	ds.input=[ds.input, [toneData(i).segment.coef]];
	ds.output=[ds.output, i*ones(1, length(toneData(i).segment))];
end
ds.outputName={'tone1', 'tone2', 'tone3', 'tone4'};
inputNum=size(ds.input, 1);
for i=1:inputNum
	ds.inputName{i}=sprintf('c%d', i-1);	% c0, c1, c2, etc.
end
fprintf('Saving ds.mat...\n');
save ds ds

%% Dataset visualization
% Once we have every piece of necessary information stored in "ds",
% we can invoke many functions in the Machine Learning Toolbox for
% data visualization and classification.

%%
% For instance, we can display the size of each class:
figure; [classSize, classLabel]=dsClassSize(ds, 1);

%%
% We can plot the distribution of each feature within each class:
figure; dsBoxPlot(ds);

%%
% The box plots indicate that the ranges of the features vary a lot. To verify,
% we can simply plot the ranges of the features in the dataset:
figure; dsRangePlot(ds);

%%
% Large range differences cause problems in distance-based classification. To
% avoid this, we can simply normalize the features:
ds2=ds;
ds2.input=inputNormalize(ds2.input);
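%%
% inputNormalize is a toolbox function; assuming it performs z-score
% normalization (zero mean and unit variance for each feature), an
% equivalent computation using only built-ins would be:
mu=mean(ds.input, 2);	% Mean of each feature (row) across all samples
sigma=std(ds.input, 0, 2);	% Standard deviation of each feature
zScored=bsxfun(@rdivide, bsxfun(@minus, ds.input, mu), sigma);	% Each feature now has zero mean and unit variance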
%%
% We can plot the feature vectors within each class:
figure; dsFeaVecPlot(ds); figEnlarge;

%%
% We can create scatter plots for every pair of features:
figure; dsProjPlot2(ds); figEnlarge;

%%
% The above plots are hard to read due to the large range differences among the features.
% We can try the same plot with normalized inputs:
figure; dsProjPlot2(ds2); figEnlarge;

%%
% We can also create scatter plots in the 3D space:
figure; dsProjPlot3(ds2); figEnlarge;

%%
% In order to visualize the distribution of the dataset,
% we can project the original dataset onto a 2-D space.
% This can be achieved by LDA (linear discriminant analysis):
ds2d=lda(ds);
ds2d.input=ds2d.input(1:2, :);
figure; dsScatterPlot(ds2d); xlabel('Input 1'); ylabel('Input 2');
title('Features projected on the first 2 LDA vectors');

%% Classification
% We can try the most straightforward classifier, KNNC (k-nearest-neighbor classifier):
rr=knncLoo(ds);
fprintf('rr=%g%% for ds\n', rr*100);

%%
% With the normalized dataset, we can usually obtain better accuracy:
[rr, computed]=knncLoo(ds2);
fprintf('rr=%g%% for ds2 of normalized inputs\n', rr*100);

%%
% We can plot the confusion matrix:
confMat=confMatGet(ds2.output, computed);
opt=confMatPlot('defaultOpt');
opt.className=ds.outputName;
opt.mode='both';
figure; confMatPlot(confMat, opt);

%%
% We can perform input selection to find the best features:
myTic=tic;
figure; bestInputIndex=inputSelectSequential(ds2); figEnlarge;
fprintf('time=%g sec\n', toc(myTic));

%%
% We can even perform an exhaustive search over the classifiers and the
% methods of input normalization:
opt=perfCv4classifier('defaultOpt');
opt.foldNum=10;
figure; tic; [perfData, bestId]=perfCv4classifier(ds, opt, 1); toc
structDispInHtml(perfData, 'Performance of various classifiers via cross validation');

%%
% We can then display the confusion matrix of the best classifier:
computedClass=perfData(bestId).bestComputedClass;
confMat=confMatGet(ds.output, computedClass);
opt=confMatPlot('defaultOpt');
opt.className=ds.outputName;
figure; confMatPlot(confMat, opt);

%% Error analysis
% We can dispatch each classification result to its corresponding segment,
% and label the correctness of the classification:
k=1;
for i=1:4
	for j=1:length(toneData(i).segment)
		toneData(i).segment(j).predicted=computedClass(k);
		toneData(i).segment(j).correct=isequal(i, computedClass(k));
		k=k+1;
	end
end

%%
% First of all, we can plot the normalized pitch curves for each tone:
figure;
for i=1:4
	subplot(2,2,i);
	index0=[toneData(i).segment.correct]==0;
	index1=[toneData(i).segment.correct]==1;
	pitchMat=[toneData(i).segment.pitchNorm];
	pitchLen=size(pitchMat, 1);
	plot((1:pitchLen)', pitchMat(:,index0), 'r', (1:pitchLen)', pitchMat(:, index1), 'b');
	title(sprintf('Tone %d', i));
end
axisLimitSame; figEnlarge

%%
% In the above plots of normalized pitch vectors, we use red and blue
% to indicate the misclassified and correctly classified cases, respectively.
% As can be seen, some of the misclassified pitch curves are not smooth enough.
% Therefore, if we can derive smoother pitch curves, the overall accuracy
% of tone recognition may be improved.

%% Summary
% This is a brief tutorial on tone recognition in Mandarin Chinese, based on
% features derived from pitch, with volume used for endpoint detection.
% There are several directions for further improvement:
%
% * Explore other features, such as timbre (see the sketch at the end of this script).
% * Try the classification problem using the whole dataset.
%

%% Appendix
% List of functions and datasets used in this script:
%
% * <../list.asp List of files in this folder>
%

%%
% Date and time when finishing this script:
fprintf('%s\n', char(datetime));

%%
% Overall elapsed time:
toc(scriptStartTime)
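%%
% As a possible starting point for the "other features" direction mentioned
% in the summary, MFCCs are a common timbre-related representation. This is
% only a sketch (not executed here); it assumes the Audio Toolbox function
% mfcc is available and that each au structure carries its sample rate in au.fs:
%
%   au=myAudioRead(auSet(1).path);
%   frameMfcc=mfcc(au.signal, au.fs);   % One row of MFCCs per frame
%   timbreFea=mean(frameMfcc, 1)';      % Hypothetical segment-level feature: average over frames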