%% Tutorial on leaf recognition
% In this tutorial, we shall explain the basics of leaf recognition based
% on shape and color statistics.
% The dataset is available at .
%% Preprocessing
% Before we start, let's add the necessary toolboxes to the MATLAB search path:
addpath d:/users/jang/matlab/toolbox/utility
addpath d:/users/jang/matlab/toolbox/machineLearning

%%
% For compatibility, here we list the platform and the MATLAB version used to run this script:
fprintf('Platform: %s\n', computer);
fprintf('MATLAB version: %s\n', version);
scriptStartTime=tic;

%% Dataset construction
% First of all, we shall collect all the image data from the image directory:
imDir='D:\users\jang\books\dcpr\appNote\leafId\Leaves';
opt=mmDataCollect('defaultOpt');
opt.extName='jpg';
imageData=mmDataCollect(imDir, opt);
for i=1:length(imageData)
  [~, imageData(i).mainName]=fileparts(imageData(i).name);
  imageData(i).mainName=eval(imageData(i).mainName);   % Convert the numeric file name to a number
end

%%
% Now we can read the class information from the file class.txt, which is
% collected from the website of the dataset.
classFile='class.txt';
classData=tableRead(classFile, 2);
for i=1:length(classData)
  items=split(classData(i).filename, '-');
  classData(i).range=[eval(items{1}), eval(items{2})];   % Range of file indices for this class
end
fprintf('%d classes are collected from %s.\n', length(classData), classFile);

%%
% For simplicity, we shall only keep 5 classes of leaves for further
% investigation:
validClassIndex=[2, 7, 9, 25, 29];   % Hard ones
%validClassIndex=[1, 4, 6, 18, 28, 32];   % Easy ones
classData=classData(validClassIndex);
fprintf('%d classes are retained for further analysis.\n', length(classData));

%%
% We can print the common names of these plants (or classes of leaves):
for i=1:length(classData)
  fprintf('id=%d, common name=%s\n', i, classData(i).commonName);
end

%%
% (Of course, the above 3 statements can be omitted if we want to use all
% classes of leaves for further exploration.)

%%
% Based on the class information, we need to assign the class label to each leaf (or image):
allMainName=[imageData.mainName];
for i=1:length(imageData)
  imageData(i).classId=0;   % Default value, indicating no class being assigned yet
end
for i=1:length(classData)
  range=classData(i).range;
  for j=range(1):range(2)
    index=find(allMainName==j);
    if length(index)~=1, keyboard; end   % Pause for inspection if an image is missing or duplicated
    imageData(index).classId=i;
    imageData(index).class=classData(i).commonName;
  end
end
imageData([imageData.classId]==0)=[];   % Delete images with no class info
fprintf('%d images are retained for further analysis.\n', length(imageData));

%%
% We can plot the leaves of each class:
for i=1:length(classData)
  index=find([imageData.classId]==i);
  figure;
  myMontage(imageData(index), struct('montageSize', [nan, 10]));
  title(sprintf('%d files of class %d ("%s")', length(index), i, classData(i).commonName));
end

%% Feature extraction
% For each image (or leaf), we need to extract the corresponding feature
% vector for classification. The process of leaf feature extraction is
% already packed into the function leafFeaExtract.m, which will be detailed
% later. For now, let's extract the features of each leaf:
myTic=tic;
for i=1:length(imageData)
  % fprintf('%d/%d: imFile=%s\n', i, length(imageData), imageData(i).path);
  im=imread(imageData(i).path);
  imageData(i).fea=leafFeaExtract(im);
end
fprintf('Time for feature extraction over %d images = %g sec\n', length(imageData), toc(myTic));
ds.input=[imageData.fea];
ds.output=[imageData.classId];
ds.inputName={'a/p', 'eccentricity', 'major/minor', 'a/ca', 'mean', 'variance'};
ds.outputName={classData.commonName};
save ds ds   % Save it for future use

%%
% Basically, the extracted features are based on the regions separated by
% Otsu's method. We only consider the region with the maximum area, and
% compute its region properties and color statistics as features.
% You can type "leafFeaExtract" to run a self-demo of the function:
figure; leafFeaExtract;
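%%
% (The actual implementation is in leafFeaExtract.m; the following is only
% a minimal sketch of how such a function might work, assuming the six
% features in ds.inputName denote area/perimeter, eccentricity, the
% major-to-minor axis ratio, area/convex-area, and the mean and variance of
% the gray levels within the leaf region. The function name and the
% assumption that the leaf is darker than the background are hypothetical.)
%
%   function fea=leafFeaSketch(im)
%   gim=rgb2gray(im);                      % Convert to grayscale
%   bw=~imbinarize(gim, graythresh(gim));  % Otsu's thresholding; assume the leaf is darker
%   rp=regionprops(bw, 'Area', 'Perimeter', 'Eccentricity', ...
%     'MajorAxisLength', 'MinorAxisLength', 'ConvexArea', 'PixelIdxList');
%   [~, k]=max([rp.Area]);                 % Keep only the region with the maximum area
%   r=rp(k);
%   pix=double(gim(r.PixelIdxList));       % Gray levels within the region
%   fea=[r.Area/r.Perimeter; r.Eccentricity; ...
%     r.MajorAxisLength/r.MinorAxisLength; r.Area/r.ConvexArea; ...
%     mean(pix); var(pix)];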
%%
% Note that since feature extraction is a lengthy process, we have saved the
% resulting variable "ds" into "ds.mat".
% If needed, you can simply load the file to restore the dataset variable
% "ds" and play around with it.
%% Dataset visualization
% Once we have every piece of necessary information stored in "ds",
% we can invoke many different functions in the Machine Learning Toolbox for
% data visualization and classification.

%%
% For instance, we can display the size of each class:
figure;
[classSize, classLabel]=dsClassSize(ds, 1);

%%
% We can plot the distribution of each feature within each class:
figure; dsBoxPlot(ds);

%%
% The box plots indicate that the ranges of the features vary a lot. To
% verify, we can simply plot the ranges of the features of the dataset:
figure; dsRangePlot(ds);

%%
% Big range differences cause problems in distance-based classification. To
% avoid this, we can simply normalize the features:
ds2=ds;
ds2.input=inputNormalize(ds2.input);

%%
% We can plot the feature vectors within each class:
figure; dsFeaVecPlot(ds);

%%
% We can generate scatter plots of every pair of features:
figure; dsProjPlot2(ds);
set(gcf, 'units', 'normalized', 'outerposition', [0 0 1 1]);   % Maximize figure window

%%
% The above plots are hard to read due to the large differences in the
% feature ranges. We can try the same plots with normalized inputs:
figure; dsProjPlot2(ds2);
set(gcf, 'units', 'normalized', 'outerposition', [0 0 1 1]);   % Maximize figure window

%%
% We can also generate scatter plots in the 3D space:
figure; dsProjPlot3(ds2);
set(gcf, 'units', 'normalized', 'outerposition', [0 0 1 1]);   % Maximize figure window

%%
% In order to visualize the distribution of the dataset,
% we can project the original dataset into the 2-D space.
% This can be achieved by LDA (linear discriminant analysis):
ds2d=lda(ds);
ds2d.input=ds2d.input(1:2, :);
figure; dsScatterPlot(ds2d);
xlabel('Input 1'); ylabel('Input 2');
title('Features projected onto the first 2 LDA vectors');

%% Classification
% We can try the most straightforward KNNC (k-nearest-neighbor classifier)
% with leave-one-out (LOO) cross validation:
rr=knncLoo(ds);
fprintf('rr=%g%% for ds\n', rr*100);
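%%
% (As a point of reference, the following plain-MATLAB loop is a minimal
% sketch of what a leave-one-out evaluation of a nearest-neighbor classifier
% computes. It is not the internals of knncLoo; it simply assumes k=1 and
% the Euclidean distance.)
x=ds.input; y=ds.output; n=size(x, 2);
predicted=zeros(1, n);
for i=1:n
  dist=sum((x-x(:,i)).^2, 1);   % Squared Euclidean distances to sample i
  dist(i)=inf;                  % Exclude sample i itself (leave-one-out)
  [~, nearest]=min(dist);
  predicted(i)=y(nearest);      % Take the class of the nearest remaining sample
end
fprintf('LOO recognition rate of plain 1-NN = %g%%\n', 100*sum(predicted==y)/n);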
%%
% For the normalized dataset, we can usually obtain a better accuracy:
[rr, computed]=knncLoo(ds2);
fprintf('rr=%g%% for ds2 of normalized inputs\n', rr*100);

%%
% We can plot the confusion matrix:
confMat=confMatGet(ds2.output, computed);
opt=confMatPlot('defaultOpt');
opt.className=ds.outputName;
opt.mode='both';
figure; confMatPlot(confMat, opt);

%%
% We can perform input selection to find the best features:
figure; inputSelectSequential(ds2, inf, 'knnc');

%%
% We can even perform an exhaustive search over the classifiers and the
% ways of input normalization:
opt=perfCv4classifier('defaultOpt');
opt.foldNum=10;
tic; [perfData, bestId]=perfCv4classifier(ds, opt, 1); toc
structDispInHtml(perfData, 'Performance of various classifiers via cross validation');

%%
% We can then display the confusion matrix of the best classifier:
confMat=confMatGet(ds.output, perfData(bestId).bestComputedClass);
opt=confMatPlot('defaultOpt');
opt.className=ds.outputName;
figure; confMatPlot(confMat, opt);

%%
% We can also list all the misclassified images in a table:
for i=1:length(imageData)
  imageData(i).classIdPredicted=perfData(bestId).bestComputedClass(i);
  imageData(i).classPredicted=ds.outputName{imageData(i).classIdPredicted};
end
listOpt=mmDataList('defaultOpt');
mmDataList(imageData, listOpt);

%% Summary
% This is a brief tutorial on leaf recognition based on shape and color statistics.
% There are several directions for further improvement:
%
% * Explore other features, such as vein distribution
% * Try the classification problem using the whole dataset
% * Use template matching as an alternative to improve the performance
%
%%
% Overall elapsed time:
toc(scriptStartTime)

%%
% , created on date