%% Tutorial on Voiced Sound Detection for Polyphonic Audio Music
% This tutorial describes the basics of voiced sound detection (VSD).
% In particular, the audio files used for our analysis are singing/humming
% recordings with human-labeled frame-based pitch as the groundtruth.
% Here the voiced sounds refer to the frames of the recordings which have pitch.
% In other words, this is a binary classification problem given the mixture of vocal and background music.
% We shall try to tackle this problem with several common methods.
%% Preprocessing
% Before we start, let's add necessary toolboxes to the search path of MATLAB:
addpath d:/users/jang/matlab/toolbox/utility
addpath d:/users/jang/matlab/toolbox/sap
addpath d:/users/jang/matlab/toolbox/machineLearning
%%
% All the above toolboxes can be downloaded from the author's website.
% (NOTE(review): the original hyperlink was lost during publishing — restore it.)
% Make sure you are using the latest toolboxes to work with this script.
%%
% For compatibility, here we list the platform and MATLAB version that we used to run this script:
fprintf('Platform: %s\n', computer);
fprintf('MATLAB version: %s\n', version);
fprintf('Script starts at %s\n', char(datetime));
scriptStartTime=tic; % Timing for the whole script; reported via toc(scriptStartTime) at the end
%%
% For this script, most of the modifiable options are set in vsdOptSet.m:
type vsdOptSet
%%
% You can change options in this file to try other settings for VSD.
% In particular, the following changes are mandatory if you want to run this script on your machine:
%
% * Make "vsdOpt.audioDir" point to the audio folder of the MIR-1K corpus.
% * Make "vsdOpt.pitchDir" point to the pitch folder of the MIR-1K corpus.
%
% The MIR-1K corpus is available online.
% (NOTE(review): the original download hyperlink was lost during publishing — restore it.)
%
%% Dataset collection
% We can now read all the audio files (that have been manually labeled with frame-based pitch) to perform feature extraction.
% Note that:
% * The result is stored in a big structure variable auSet for further analysis.
% * We also save auSet to vsdAuSet.mat so that we don't need to do it again next time.
% * If the program finds vsdAuSet.mat, it'll load the file directly to restore auSet.
% * If you prefer to redo the feature extraction, you need to delete "vsdAuSet.mat" in advance.
%
% Here it goes:
vsdOpt=vsdOptSet;
auFileNum=20; % Use only this number of audio files
if ~exist('vsdAuSet.mat', 'file')
	myTic=tic;
	fprintf('Collecting audio files and their features from "%s"...\n', vsdOpt.audioDir);
	auSet=recursiveFileList(vsdOpt.audioDir, 'wav'); % Collect all wav files
	% Guard against folders with fewer than auFileNum wav files, which would
	% otherwise make the 1:auFileNum indexing error out.
	auSet=auSet(1:min(auFileNum, length(auSet)));
	auSet=auSetFeaExtract(auSet, vsdOpt);
	fprintf('Time for feature extraction over %d files = %g sec\n', length(auSet), toc(myTic));
	fprintf('Saving auSet to vsdAuSet.mat...\n');
	save vsdAuSet auSet
else
	fprintf('Loading auSet from vsdAuSet.mat...\n');
	load vsdAuSet.mat
end
%%
% Now we can create a variable ds for data visualization and classification:
ds.input=[auSet.feature];      % Feature matrix: one column per frame
ds.output=[auSet.tOutput];     % Target output (voiced/unvoiced label) per frame
ds.inputName=auSet(1).other.inputName;
ds.outputName=vsdOpt.outputName;
%%
% We can obtain the feature dimensions and data count:
[dim, count]=size(ds.input);
fprintf('Feature dimensions = %d, data count = %d\n', dim, count);
%%
% Since the dataset is big, we reduce the dataset for easy plotting:
downSampleRatio=2;
ds.input=ds.input(:, 1:downSampleRatio:end);
ds.output=ds.output(:, 1:downSampleRatio:end);
fprintf('Data count = %d after 1/%d down sampling.\n', size(ds.input, 2), downSampleRatio);
%% Data analysis and visualization
% Display data sizes for each class:
[classSize, classLabel]=dsClassSize(ds, 1);
%%
% Display feature distribution for each class:
figure; dsBoxPlot(ds);
%%
% Plot the feature vectors within each class:
figure; dsFeaVecPlot(ds);
%% Input selection
% We can select more important inputs based on leave-one-out (LOO) criterion of KNNC:
myTic=tic;
figure; bestInputIndex=inputSelectSequential(ds); figEnlarge;
fprintf('time=%g sec\n', toc(myTic));
%% Input transformation
% We can also perform LDA and evaluation of its performance based on LOO criterion.
% To get a quick result, here we find the transformation matrix using all data points and then evaluate KNNC performance based on LOO.
% This is only an approximate result and it tends to be on the optimistic side:
myTic=tic;
ds2=ds; ds2.input=inputNormalize(ds2.input); % Input normalization (zero mean, unit variance per feature)
opt=ldaPerfViaKnncLoo('defaultOpt');
opt.mode='approximate';
recogRate1=ldaPerfViaKnncLoo(ds, opt);
recogRate2=ldaPerfViaKnncLoo(ds2, opt);
figure; plot(1:length(recogRate1), 100*recogRate1, 'o-', 1:length(recogRate2), 100*recogRate2, '^-'); grid on
legend('Raw data', 'Normalized data', 'location', 'northOutside', 'orientation', 'horizontal');
xlabel('No. of projected features based on LDA');
ylabel('LOO recognition rates using KNNC (%)');
fprintf('time=%g sec\n', toc(myTic));
%%
% It would be too time consuming to perform the LDA evaluation of exact
% LOO. But if you do want to try it, uncomment the following code:
%myTic=tic;
%ds2=ds; ds2.input=inputNormalize(ds2.input); % input normalization
%opt=ldaPerfViaKnncLoo('defaultOpt');
%opt.mode='exact';
%recogRate1=ldaPerfViaKnncLoo(ds, opt);
%recogRate2=ldaPerfViaKnncLoo(ds2, opt);
%figure; plot(1:length(recogRate1), 100*recogRate1, 'o-', 1:length(recogRate2), 100*recogRate2, '^-'); grid on
%legend('Raw data', 'Normalized data', 'location', 'northOutside', 'orientation', 'horizontal');
%xlabel('No. of projected features based on LDA');
%ylabel('LOO recognition rates using KNNC (%)');
%fprintf('time=%g sec\n', toc(myTic));
%% HMM training
% Using the collected auSet, we can start HMM training for voiced sound
% detection:
myTic=tic;
[vsdHmmModel, rr]=hmmTrain4audio(auSet, vsdOpt, 1); figEnlarge;
fprintf('Time for HMM training=%g sec\n', toc(myTic));
%% HMM test
% After the training, we can test the HMM using a single audio file:
myTic=tic;
auFile=auSet(1).path;
au=myAudioRead(auFile);
au=vsdOpt.feaExtractFcn(au, vsdOpt, 1); % Attach feature and tOutput
wObj=hmmEval4audio(au, vsdOpt, vsdHmmModel, 1); figEnlarge;
fprintf('time=%g sec\n', toc(myTic));
%% Performance evaluation of HMM via LOO
% To evaluate the performance more objectively, we can test the LOO accuracy by
% using "leave-one-file-out". (Note that it's very time consuming.)
myTic=tic;
showPlot=1;
[outsideRr, cvData]=hmmPerfLoo4audio(auSet, vsdOpt, showPlot); figEnlarge;
fprintf('time=%g sec\n', toc(myTic));
%%
% We can then detect the best number of Gaussian components in each state
% of the HMM using such leave-one-file-out cross validation.
% This part is also time consuming:
myTic=tic;
maxExponent=6;
% Preallocate both accuracy vectors. (outsideRr previously held a scalar from
% the section above; preallocating avoids growing it inside the loop.)
insideRr=zeros(1, maxExponent);
outsideRr=zeros(1, maxExponent);
for i=1:maxExponent
	vsdOpt.gaussianNum=2^i;
	fprintf('%d/%d: No. of Gaussian component=%d, ', i, maxExponent, vsdOpt.gaussianNum);
	[outsideRr(i), cvData]=hmmPerfLoo4audio(auSet, vsdOpt);
	insideRr(i)=mean([cvData.insideRr]);
	fprintf('insideRR=%g%%, outsideRr=%g%%\n', insideRr(i)*100, outsideRr(i)*100);
end
plot(1:maxExponent, insideRr*100, '.-', 1:maxExponent, outsideRr*100, '.-'); grid on; figEnlarge;
set(gca, 'xTick', 1:maxExponent);
% Relabel the x ticks as powers of 2 (the actual Gaussian counts)
label=get(gca, 'xticklabel');
for i=1:length(label), label{i}=sprintf('2^%s', label{i}); end;
set(gca, 'xticklabel', label);
legend('Inside-test RR', 'Outside-test RR', 'location', 'northOutside', 'orientation', 'horizontal');
xlabel('No. of Gaussian mixtures');
ylabel('Recognition rate (%)');
fprintf('time=%g sec\n', toc(myTic));
%% Summary
% This is a brief tutorial on using HMM for voiced sound detection for polyphonic audio music.
% There are several directions for further improvement:
%
% * Explore other preprocessing, such as HPSS.
% * Investigate new features for VSD.
% * Change the configuration of the GMM used in HMM.
% * Use of other classifiers for VSD.
%
%% Appendix
% List of functions and datasets used in this script
%
% *
% * <../list.asp List of files in this folder>
%
%%
% Date and time when finishing this script:
fprintf('Date & time: %s\n', char(datetime));
%%
% Overall elapsed time:
toc(scriptStartTime)
%%
% If you are interested in the original MATLAB code for this page, you can
% type "grabcode(URL)" under MATLAB, where URL is the web address of this
% page.
%%
%