%% Tutorial on leaf recognition
% In this tutorial, we shall explain the basics of leaf recognition based
% on shape and color statistics.
% The dataset is available at .
%% Preprocessing
% Before we start, let's add the necessary toolboxes to the MATLAB search path:
addpath d:/users/jang/matlab/toolbox/utility
addpath d:/users/jang/matlab/toolbox/machineLearning

%%
% For compatibility, here we list the platform and the MATLAB version used to run this script:
fprintf('Platform: %s\n', computer);
fprintf('MATLAB version: %s\n', version);
scriptStartTime=tic;

%% Dataset construction
% First of all, we shall collect all the image data from the image directory:
imDir='D:\users\jang\books\dcpr\appNote\leafId\Leaves';
opt=mmDataCollect('defaultOpt');
opt.extName='jpg';
imageData=mmDataCollect(imDir, opt);
for i=1:length(imageData)
  [~, imageData(i).mainName]=fileparts(imageData(i).name);
  imageData(i).mainName=eval(imageData(i).mainName);   % Convert the numeric file name to a number
end

%%
% Now we can read the class information from the file class.txt, which is
% collected from the website of the dataset.
classFile='class.txt';
classData=tableRead(classFile, 2);
for i=1:length(classData)
  items=split(classData(i).filename, '-');
  classData(i).range=[eval(items{1}), eval(items{2})];   % Range of file indices for this class
end
fprintf('%d classes are collected from %s.\n', length(classData), classFile);

%%
% For simplicity, we shall only keep 5 classes of leaves for further
% investigation:
validClassIndex=[2, 7, 9, 25, 29];   % Hard ones
%validClassIndex=[1, 4, 6, 18, 28, 32];   % Easy ones
classData=classData(validClassIndex);
fprintf('%d classes are retained for further analysis.\n', length(classData));

%%
% We can print the common names of these plants (or classes of leaves):
for i=1:length(classData)
  fprintf('id=%d, common name=%s\n', i, classData(i).commonName);
end

%%
% (Of course, the above 3 statements can be omitted if we want to use all
% classes of leaves for further exploration.)

%%
% Based on the class information, we need to assign the class label to each leaf (or image):
allMainName=[imageData.mainName];
for i=1:length(imageData)
  imageData(i).classId=0;   % Default value, indicating no class being assigned yet
end
for i=1:length(classData)
  range=classData(i).range;
  for j=range(1):range(2)
    index=find(allMainName==j);
    if length(index)~=1, keyboard; end   % Pause for inspection if an image is missing or duplicated
    imageData(index).classId=i;
    imageData(index).class=classData(i).commonName;
  end
end
imageData([imageData.classId]==0)=[];   % Delete images with no class info
fprintf('%d images are retained for further analysis.\n', length(imageData));

%%
% We can plot the leaves of each class:
for i=1:length(classData)
  index=find([imageData.classId]==i);
  figure;
  myMontage(imageData(index), struct('montageSize', [nan, 10]));
  title(sprintf('%d files of class %d ("%s")', length(index), i, classData(i).commonName));
end

%% Feature extraction
% For each image (or leaf), we need to extract the corresponding feature
% vector for classification. The process of leaf feature extraction is
% already packed into the function leafFeaExtract.m, which will be detailed
% later. For now, let's extract the features of each leaf:
myTic=tic;
for i=1:length(imageData)
  % fprintf('%d/%d: imFile=%s\n', i, length(imageData), imageData(i).path);
  im=imread(imageData(i).path);
  imageData(i).fea=leafFeaExtract(im);
end
fprintf('Time for feature extraction over %d images = %g sec\n', length(imageData), toc(myTic));
ds.input=[imageData.fea];
ds.output=[imageData.classId];
ds.inputName={'a/p', 'eccentricity', 'major/minor', 'a/ca', 'mean', 'variance'};
ds.outputName={classData.commonName};
save ds ds   % Save it for future use

%%
% Basically, the extracted features are based on the regions separated by
% Otsu's method. We only consider the region with the maximum area, and
% compute its region properties and color statistics as features.
% You can type "leafFeaExtract" to run a self-demo of the function:
figure; leafFeaExtract;
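%%
% (The actual implementation is in leafFeaExtract.m; the following is only
% a minimal sketch of how such a function might work, assuming the six
% features in ds.inputName denote area/perimeter, eccentricity, the
% major-to-minor axis ratio, area/convex-area, and the mean and variance of
% the gray levels within the leaf region. The function name and the
% assumption that the leaf is darker than the background are hypothetical.)
%
%   function fea=leafFeaSketch(im)
%   gim=rgb2gray(im);                      % Convert to grayscale
%   bw=~imbinarize(gim, graythresh(gim));  % Otsu's thresholding; assume the leaf is darker
%   rp=regionprops(bw, 'Area', 'Perimeter', 'Eccentricity', ...
%     'MajorAxisLength', 'MinorAxisLength', 'ConvexArea', 'PixelIdxList');
%   [~, k]=max([rp.Area]);                 % Keep only the region with the maximum area
%   r=rp(k);
%   pix=double(gim(r.PixelIdxList));       % Gray levels within the region
%   fea=[r.Area/r.Perimeter; r.Eccentricity; ...
%     r.MajorAxisLength/r.MinorAxisLength; r.Area/r.ConvexArea; ...
%     mean(pix); var(pix)];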
%%
% Note that since feature extraction is a lengthy process, we have saved the
% resulting variable "ds" into "ds.mat".
% If needed, you can simply load the file to restore the dataset variable
% "ds" and play around with it.
%% Dataset visualization
% Once we have every piece of necessary information stored in "ds",
% we can invoke many different functions in the Machine Learning Toolbox for
% data visualization and classification.

%%
% For instance, we can display the size of each class:
figure;
[classSize, classLabel]=dsClassSize(ds, 1);

%%
% We can plot the distribution of each feature within each class:
figure; dsBoxPlot(ds);

%%
% The box plots indicate that the ranges of the features vary a lot. To
% verify, we can simply plot the ranges of the features of the dataset:
figure; dsRangePlot(ds);

%%
% Big range differences cause problems in distance-based classification. To
% avoid this, we can simply normalize the features:
ds2=ds;
ds2.input=inputNormalize(ds2.input);

%%
% We can plot the feature vectors within each class:
figure; dsFeaVecPlot(ds);

%%
% We can generate scatter plots of every pair of features:
figure; dsProjPlot2(ds);
set(gcf, 'units', 'normalized', 'outerposition', [0 0 1 1]);   % Maximize figure window

%%
% The above plots are hard to read due to the large differences in the
% feature ranges. We can try the same plots with normalized inputs:
figure; dsProjPlot2(ds2);
set(gcf, 'units', 'normalized', 'outerposition', [0 0 1 1]);   % Maximize figure window

%%
% We can also generate scatter plots in the 3D space:
figure; dsProjPlot3(ds2);
set(gcf, 'units', 'normalized', 'outerposition', [0 0 1 1]);   % Maximize figure window

%%
% In order to visualize the distribution of the dataset,
% we can project the original dataset into the 2-D space.
% This can be achieved by LDA (linear discriminant analysis):
ds2d=lda(ds);
ds2d.input=ds2d.input(1:2, :);
figure; dsScatterPlot(ds2d);
xlabel('Input 1'); ylabel('Input 2');
title('Features projected onto the first 2 LDA vectors');

%% Classification
% We can try the most straightforward KNNC (k-nearest-neighbor classifier)
% with leave-one-out (LOO) cross validation:
rr=knncLoo(ds);
fprintf('rr=%g%% for ds\n', rr*100);
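%%
% (As a point of reference, the following plain-MATLAB loop is a minimal
% sketch of what a leave-one-out evaluation of a nearest-neighbor classifier
% computes. It is not the internals of knncLoo; it simply assumes k=1 and
% the Euclidean distance.)
x=ds.input; y=ds.output; n=size(x, 2);
predicted=zeros(1, n);
for i=1:n
  dist=sum((x-x(:,i)).^2, 1);   % Squared Euclidean distances to sample i
  dist(i)=inf;                  % Exclude sample i itself (leave-one-out)
  [~, nearest]=min(dist);
  predicted(i)=y(nearest);      % Take the class of the nearest remaining sample
end
fprintf('LOO recognition rate of plain 1-NN = %g%%\n', 100*sum(predicted==y)/n);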
%%
% For the normalized dataset, we can usually obtain a better accuracy:
[rr, computed]=knncLoo(ds2);
fprintf('rr=%g%% for ds2 of normalized inputs\n', rr*100);

%%
% We can plot the confusion matrix:
confMat=confMatGet(ds2.output, computed);
opt=confMatPlot('defaultOpt');
opt.className=ds.outputName;
opt.mode='both';
figure; confMatPlot(confMat, opt);

%%
% We can perform input selection to find the best features:
figure; inputSelectSequential(ds2, inf, 'knnc');

%%
% We can even perform an exhaustive search over the classifiers and the
% ways of input normalization:
opt=perfCv4classifier('defaultOpt');
opt.foldNum=10;
tic; [perfData, bestId]=perfCv4classifier(ds, opt, 1); toc
structDispInHtml(perfData, 'Performance of various classifiers via cross validation');

%%
% We can then display the confusion matrix of the best classifier:
confMat=confMatGet(ds.output, perfData(bestId).bestComputedClass);
opt=confMatPlot('defaultOpt');
opt.className=ds.outputName;
figure; confMatPlot(confMat, opt);

%%
% We can also list all the misclassified images in a table:
for i=1:length(imageData)
  imageData(i).classIdPredicted=perfData(bestId).bestComputedClass(i);
  imageData(i).classPredicted=ds.outputName{imageData(i).classIdPredicted};
end
listOpt=mmDataList('defaultOpt');
mmDataList(imageData, listOpt);

%% Summary
% This is a brief tutorial on leaf recognition based on shape and color statistics.
% There are several directions for further improvement:
%
% * Explore other features, such as vein distribution
% * Try the classification problem using the whole dataset
% * Use template matching as an alternative to improve the performance
%
%%
% Overall elapsed time:
toc(scriptStartTime)

%%
% , created on date