%% Tutorial on Voiced Sound Detection for Polyphonic Audio Music
% This tutorial describes the basics of voiced sound detection (VSD).
% In particular, the audio files used for our analysis are singing/humming
% recordings with human-labeled frame-based pitch as the groundtruth.
% Here the voiced sounds refer to the frames of the recordings which have pitch.
% In other words, this is a binary classification problem given the mixture of vocal and background music.
% We shall try to tackle this problem with several common methods.
%% Preprocessing
% Before we start, let's add necessary toolboxes to the search path of MATLAB:
addpath d:/users/jang/matlab/toolbox/utility
addpath d:/users/jang/matlab/toolbox/sap
addpath d:/users/jang/matlab/toolbox/machineLearning
%%
% All the above toolboxes can be downloaded from the author's website.
% (NOTE(review): the original hyperlink was lost during publishing — restore it.)
% Make sure you are using the latest toolboxes to work with this script.
%%
% For compatibility, here we list the platform and MATLAB version that we used to run this script:
fprintf('Platform: %s\n', computer);
fprintf('MATLAB version: %s\n', version);
fprintf('Script starts at %s\n', char(datetime));
scriptStartTime=tic; % Timing for the whole script; reported via toc(scriptStartTime) at the end
%%
% For this script, most of the modifiable options are set in vsdOptSet.m:
type vsdOptSet
%%
% You can change options in this file to try other settings for VSD.
% In particular, the following changes are mandatory if you want to run this script on your machine:
%
% * Make "vsdOpt.audioDir" point to the audio folder of the MIR-1K corpus.
% * Make "vsdOpt.pitchDir" point to the pitch folder of the MIR-1K corpus.
%
% The MIR-1K corpus is available online.
% (NOTE(review): the original download hyperlink was lost during publishing — restore it.)
%
%% Dataset collection
% We can now read all the audio files (that have been manually labeled with frame-based pitch) to perform feature extraction.
% Note that:
% * The result is stored in a big structure variable auSet for further analysis.
% * We also save auSet to vsdAuSet.mat so that we don't need to do it again next time.
% * If the program finds vsdAuSet.mat, it'll load the file directly to restore auSet.
% * If you prefer to redo the feature extraction, you need to delete "vsdAuSet.mat" in advance.
%
% Here it goes:
vsdOpt=vsdOptSet;
auFileNum=20; % Use only this number of audio files
if ~exist('vsdAuSet.mat', 'file')
	myTic=tic;
	fprintf('Collecting audio files and their features from "%s"...\n', vsdOpt.audioDir);
	auSet=recursiveFileList(vsdOpt.audioDir, 'wav'); % Collect all wav files
	% Guard against folders with fewer than auFileNum wav files, which would
	% otherwise make the 1:auFileNum indexing error out.
	auSet=auSet(1:min(auFileNum, length(auSet)));
	auSet=auSetFeaExtract(auSet, vsdOpt);
	fprintf('Time for feature extraction over %d files = %g sec\n', length(auSet), toc(myTic));
	fprintf('Saving auSet to vsdAuSet.mat...\n');
	save vsdAuSet auSet
else
	fprintf('Loading auSet from vsdAuSet.mat...\n');
	load vsdAuSet.mat
end
%%
% Now we can create a variable ds for data visualization and classification:
ds.input=[auSet.feature];      % Feature matrix: one column per frame
ds.output=[auSet.tOutput];     % Target output (voiced/unvoiced label) per frame
ds.inputName=auSet(1).other.inputName;
ds.outputName=vsdOpt.outputName;
%%
% We can obtain the feature dimensions and data count:
[dim, count]=size(ds.input);
fprintf('Feature dimensions = %d, data count = %d\n', dim, count);
%%
% Since the dataset is big, we reduce the dataset for easy plotting:
downSampleRatio=2;
ds.input=ds.input(:, 1:downSampleRatio:end);
ds.output=ds.output(:, 1:downSampleRatio:end);
fprintf('Data count = %d after 1/%d down sampling.\n', size(ds.input, 2), downSampleRatio);
%% Data analysis and visualization
% Display data sizes for each class:
[classSize, classLabel]=dsClassSize(ds, 1);
%%
% Display feature distribution for each class:
figure; dsBoxPlot(ds);
%%
% Plot the feature vectors within each class:
figure; dsFeaVecPlot(ds);
%% Input selection
% We can select more important inputs based on leave-one-out (LOO) criterion of KNNC:
myTic=tic;
figure; bestInputIndex=inputSelectSequential(ds); figEnlarge;
fprintf('time=%g sec\n', toc(myTic));
%% Input transformation
% We can also perform LDA and evaluation of its performance based on LOO criterion.
% To get a quick result, here we find the transformation matrix using all data points and then evaluate KNNC performance based on LOO.
% This is only an approximate result and it tends to be on the optimistic side:
myTic=tic;
ds2=ds; ds2.input=inputNormalize(ds2.input); % Input normalization (zero mean, unit variance per feature)
opt=ldaPerfViaKnncLoo('defaultOpt');
opt.mode='approximate';
recogRate1=ldaPerfViaKnncLoo(ds, opt);
recogRate2=ldaPerfViaKnncLoo(ds2, opt);
figure; plot(1:length(recogRate1), 100*recogRate1, 'o-', 1:length(recogRate2), 100*recogRate2, '^-'); grid on
legend('Raw data', 'Normalized data', 'location', 'northOutside', 'orientation', 'horizontal');
xlabel('No. of projected features based on LDA');
ylabel('LOO recognition rates using KNNC (%)');
fprintf('time=%g sec\n', toc(myTic));
%%
% It would be too time consuming to perform the LDA evaluation of exact
% LOO. But if you do want to try it, uncomment the following code:
%myTic=tic;
%ds2=ds; ds2.input=inputNormalize(ds2.input); % input normalization
%opt=ldaPerfViaKnncLoo('defaultOpt');
%opt.mode='exact';
%recogRate1=ldaPerfViaKnncLoo(ds, opt);
%recogRate2=ldaPerfViaKnncLoo(ds2, opt);
%figure; plot(1:length(recogRate1), 100*recogRate1, 'o-', 1:length(recogRate2), 100*recogRate2, '^-'); grid on
%legend('Raw data', 'Normalized data', 'location', 'northOutside', 'orientation', 'horizontal');
%xlabel('No. of projected features based on LDA');
%ylabel('LOO recognition rates using KNNC (%)');
%fprintf('time=%g sec\n', toc(myTic));
%% HMM training
% Using the collected auSet, we can start HMM training for voiced sound
% detection:
myTic=tic;
[vsdHmmModel, rr]=hmmTrain4audio(auSet, vsdOpt, 1); figEnlarge;
fprintf('Time for HMM training=%g sec\n', toc(myTic));
%% HMM test
% After the training, we can test the HMM using a single audio file:
myTic=tic;
auFile=auSet(1).path;
au=myAudioRead(auFile);
au=vsdOpt.feaExtractFcn(au, vsdOpt, 1); % Attach feature and tOutput
wObj=hmmEval4audio(au, vsdOpt, vsdHmmModel, 1); figEnlarge;
fprintf('time=%g sec\n', toc(myTic));
%% Performance evaluation of HMM via LOO
% To evaluate the performance more objectively, we can test the LOO accuracy by
% using "leave-one-file-out". (Note that it's very time consuming.)
myTic=tic;
showPlot=1;
[outsideRr, cvData]=hmmPerfLoo4audio(auSet, vsdOpt, showPlot); figEnlarge;
fprintf('time=%g sec\n', toc(myTic));
%%
% We can then detect the best number of Gaussian components in each state
% of the HMM using such leave-one-file-out cross validation.
% This part is also time consuming:
myTic=tic;
maxExponent=6;
% Preallocate both accuracy vectors. (outsideRr previously held a scalar from
% the section above; preallocating avoids growing it inside the loop.)
insideRr=zeros(1, maxExponent);
outsideRr=zeros(1, maxExponent);
for i=1:maxExponent
	vsdOpt.gaussianNum=2^i;
	fprintf('%d/%d: No. of Gaussian component=%d, ', i, maxExponent, vsdOpt.gaussianNum);
	[outsideRr(i), cvData]=hmmPerfLoo4audio(auSet, vsdOpt);
	insideRr(i)=mean([cvData.insideRr]);
	fprintf('insideRR=%g%%, outsideRr=%g%%\n', insideRr(i)*100, outsideRr(i)*100);
end
plot(1:maxExponent, insideRr*100, '.-', 1:maxExponent, outsideRr*100, '.-'); grid on; figEnlarge;
set(gca, 'xTick', 1:maxExponent);
% Relabel the x ticks as powers of 2 (the actual Gaussian counts)
label=get(gca, 'xticklabel');
for i=1:length(label), label{i}=sprintf('2^%s', label{i}); end;
set(gca, 'xticklabel', label);
legend('Inside-test RR', 'Outside-test RR', 'location', 'northOutside', 'orientation', 'horizontal');
xlabel('No. of Gaussian mixtures');
ylabel('Recognition rate (%)');
fprintf('time=%g sec\n', toc(myTic));
%% Summary
% This is a brief tutorial on using HMM for voiced sound detection for polyphonic audio music.
% There are several directions for further improvement:
%
% * Explore other preprocessing, such as HPSS.
% * Investigate new features for VSD.
% * Change the configuration of the GMM used in HMM.
% * Use of other classifiers for VSD.
%
%% Appendix
% List of functions and datasets used in this script
%
% *
% * <../list.asp List of files in this folder>
%
%%
% Date and time when finishing this script:
fprintf('Date & time: %s\n', char(datetime));
%%
% Overall elapsed time:
toc(scriptStartTime)
%%
% If you are interested in the original MATLAB code for this page, you can
% type "grabcode(URL)" under MATLAB, where URL is the web address of this
% page.
%%
%