%% Tutorial on Fatty Liver Recognition (by <http://mirlab.org/jang Roger Jang>)
%%
% This tutorial explains the basics of fatty liver recognition based on patients' data. 
%% Preprocessing
% Before we start, let's add necessary toolboxes to the search path of MATLAB:
toolboxDirs={'d:/users/jang/matlab/toolbox/utility', 'd:/users/jang/matlab/toolbox/sap', 'd:/users/jang/matlab/toolbox/machineLearning'};
for k=1:numel(toolboxDirs)
	addpath(toolboxDirs{k});	% Added one at a time, in the listed order
end
%%
% All the above toolboxes can be downloaded from the author's <http://mirlab.org/jang/matlab/toolbox toolbox page>.
% Make sure you are using the latest toolboxes to work with this script.
%%
% For compatibility, here we list the platform and MATLAB version that we used to run this script:
platformName=computer;
matlabVersion=version;
startStamp=char(datetime);
fprintf('Platform: %s\n', platformName);
fprintf('MATLAB version: %s\n', matlabVersion);
fprintf('Date & time: %s\n', startStamp);
scriptStartTime=tic;	% Timer handle; read back at the end of the script to report the overall elapsed time
%% Data preparation
% First of all, we shall read the data from a data file:
file='D:\dataSet\mj\liver\fliver.csv';
fprintf('Reading %s...\n', file);
loadTimer=tic;		% Dedicated timer handle for the file reading step
data=xlsFile2struct(file, 'fliver');
time=toc(loadTimer);
fprintf('Time=%g sec\n', time);
%%
% We can get rid of some of the fields, as follows.
%
% * Get rid of derived fields
derivedFields={'bmi', 'ms1', 'ms2', 'ms3', 'ms4', 'ms5', 'ms'};
data=rmfield(data, derivedFields);
%%
% * Get rid of useless fields
uselessFields={'id', 'n', 'fliver', 'father', 'gfather_f', 'gfather_m', 'mother', 'gmother_f', 'gmother_m', 'marriage_98', 'marriage_14'};
data=rmfield(data, uselessFields);
%%
% * Get rid of fields with too many NaN
sparseFields={'fincome', 'pincome'};
data=rmfield(data, sparseFields);
%%
%data=rmfield(data, {'g_wei', 'g_hei'});
%%
% Since there are still numerous NaN in the dataset, we can eliminate data with NaN.
fprintf('Original data size = %g\n', length(data));
fprintf('List of no. of NaN at each field:\n');
fieldNames=fieldnames(data);
hasNan=false(1, numel(data));	% hasNan(k) is true when record k has NaN in at least one field
for i=1:length(fieldNames)
	fieldName=fieldNames{i};
	fieldValue=[data.(fieldName)];
	if ~isnumeric(fieldValue), continue; end	% Only numeric fields are checked for NaN
	if numel(fieldValue)~=numel(data)
		% The concatenated values do not map one-to-one onto the records, so
		% positions in fieldValue cannot be used as record indices.
		warning('Field "%s" does not hold one scalar per record; skipped in NaN check.', fieldName);
		continue;
	end
	nanFlag=isnan(fieldValue);
	% Count on the original (not yet reduced) data, so that the printed
	% expression matches what the reader gets on the data as loaded.
	count=sum(nanFlag);
	if count>0, fprintf('sum(isnan([data.%s]))=%d\n', fieldName, count); end
	hasNan=hasNan | reshape(nanFlag, 1, []);
end
data(hasNan)=[];	% Remove every record with at least one NaN, in a single deletion
fprintf('After removing NaN, data size = %g\n', length(data));
%fprintf('Saving data.mat...\n');
%save data data
%%
% Then we can create the basic dataset ds.
outputFieldName='flivero';
classId=[data.(outputFieldName)]+1;	% Shift the labels by one so that they start from 1
data=rmfield(data, outputFieldName);	% Whatever is left in data is used as input
featureNames=fieldnames(data);
dataCount=length(data);
featureMat=zeros(length(featureNames), dataCount);	% One row per input field, one column per record
for k=1:length(featureNames)
	featureMat(k,:)=[data.(featureNames{k})];
end
% Fields are assigned in this order to keep the displayed layout of ds
ds.output=classId;
ds.inputName=featureNames;
ds.input=featureMat;
ds.outputName={'no', 'slightly', 'yes', 'alot'};
ds
%fprintf('Saving ds.mat...\n');
%save ds2 ds
%%
% We partition the whole dataset into training and test sets.
cvOpt=cvDataGen('defaultOpt');
cvOpt.foldNum=2;
cvOpt.cvDataType='full';
cvData=cvDataGen(ds, cvOpt);
dsTrain=cvData.TS;		% Training set
dsTest=cvData.VS;		% Test set
%%
% For more detailed analysis, we partition the data based on gender.
genderOpt=cvOpt;		% Start from the same option structure and add the selection fields to it
genderOpt.inputName='gender';
genderOpt.inputValue=1;
dsFemale=dsSubset(dsTrain, genderOpt);
genderOpt.inputValue=2;
dsMale=dsSubset(dsTrain, genderOpt);
%%
% We can plot the patient counts based on gender and years.
yrRow=strcmp(dsFemale.inputName, 'yr');	% Logical mask of the row holding 'yr'; reused for the male data below
figure;
% Upper panel: female
[yearValue, yearCount]=elementCount(dsFemale.input(yrRow, :));
subplot(211);
bar(yearValue, yearCount);
xlabel('Years');
ylabel('Patient counts');
title(sprintf('Female total=%d', size(dsFemale.input,2)));
% Lower panel: male
[yearValue, yearCount]=elementCount(dsMale.input(yrRow, :));
subplot(212);
bar(yearValue, yearCount);
xlabel('Years');
ylabel('Patient counts');
title(sprintf('Male total=%d', size(dsMale.input,2)));
axisLimitSame;
%%
% For simplicity, we shall use 2016 male data for further analysis.
ds=dsSubset(dsMale, 'yr', 2016);
ds
%% Dataset visualization
% Size of the classes:
figure;
[classSize, classLabel]=dsClassSize(ds, 1)
%%
% Box plot of the dataset:
figure;
dsBoxPlot(ds);
%%
% Range plot of the dataset:
figure;
dsRangePlot(ds);
%%
% Range plot of the normalized dataset ds2:
ds2=ds;
ds2.input=inputNormalize(ds.input);	% ds2 differs from ds only in its normalized input
figure;
dsRangePlot(ds2);
%%
% Scatter plots of ds2:
figure;
dsProjPlot2(ds2);
figEnlarge;
%%
% Correlation plot:
figure;
corrplot(ds.input', 'varNames', ds.inputName);	% ds.input is transposed before being passed to corrplot
figEnlarge
%% Classification
% KNN results on ds and ds2:
rrRaw=knncLoo(ds);
fprintf('rr=%g%% for ds\n', rrRaw*100);
rrNormalized=knncLoo(ds2);
fprintf('rr=%g%% for ds2\n', rrNormalized*100);
%%
% Sequential forward selection on the features of ds:
figure;
tic;
inputSelectSequential(ds, inf, 'knnc');
toc;
figEnlarge
%%
% Sequential forward selection on the features of ds2:
figure;
tic;
inputSelectSequential(ds2, inf, 'knnc');
toc;
figEnlarge
%%
% Dimensionality reduction based on LDA:
ldaOpt=ldaPerfViaKnncLoo('defaultOpt');
%ldaOpt.mode='exact';		% This option causes error, why?
ldaTimer=tic;		% Covers both runs below (raw and normalized)
recogRate1=ldaPerfViaKnncLoo(ds, ldaOpt);
ds2=ds;
ds2.input=inputNormalize(ds.input);	% input normalization
recogRate2=ldaPerfViaKnncLoo(ds2, ldaOpt);
fprintf('Time=%g sec\n', toc(ldaTimer));
featureNum=size(ds.input, 1);
dataNum=size(ds.input, 2);
dimIndex=1:featureNum;	% Shared x-axis for both curves
figure;
plot(dimIndex, 100*recogRate1, 'o-', dimIndex, 100*recogRate2, '^-');
grid on
legend({'Raw data', 'Normalized data'}, 'location', 'northOutside', 'orientation', 'horizontal');
xlabel('No. of projected features based on LDA');
ylabel('LOO recognition rates using KNNC (%)');
%%
% Let's perform AutoML with ds:
cvOpt=perfCv4classifier('defaultOpt');
cvOpt.foldNum=5;
isSrc=strcmp(cvOpt.classifiers, 'src');
cvOpt.classifiers(isSrc)=[];	% Get rid of 'src' since it's extremely slow
tic;
[perfData, bestId]=perfCv4classifier(ds, cvOpt, 1);
toc
%structDispInHtml(perfData, 'Performance of various classifiers via cross validation');
%%
% Plot of confusion matrix
confMat=confMatGet(ds.output, perfData(bestId).bestComputedClass);	% Desired output vs. the output computed by the best entry
plotOpt=confMatPlot('defaultOpt');
plotOpt.className=ds.outputName;
figure;
confMatPlot(confMat, plotOpt);
%%
% Since NBC is the best classifier, let's use it for feature selection:
figure;
tic;
[inputId, bestRr]=inputSelectSequential(ds, inf, 'nbc');	% inputId is reused later to restrict the inputs
toc;
figEnlarge
%%
% Now we can list the training and test performance of the dataset based on NBC, with breakdowns into genders and years:
figure;
% Upper panel: female (gender=1)
fprintf('Holdout test on female data:\n');
dsFemaleTrain=dsSubset(dsTrain, 'gender', 1);
dsFemaleTest=dsSubset(dsTest, 'gender', 1);
subplot(211);
rrFemale=perfByField(dsFemaleTrain, dsFemaleTest, 'yr', 1);
femaleTitle=sprintf('gender=%d, training rr=%g%%, test rr=%g%%', 1, 100*mean(rrFemale(:,1)), 100*mean(rrFemale(:,2)));
title(femaleTitle);
% Lower panel: male (gender=2)
fprintf('Holdout test on male data:\n');
dsMaleTrain=dsSubset(dsTrain, 'gender', 2);
dsMaleTest=dsSubset(dsTest, 'gender', 2);
subplot(212);
rrMale=perfByField(dsMaleTrain, dsMaleTest, 'yr', 1);
maleTitle=sprintf('gender=%d, training rr=%g%%, test rr=%g%%', 2, 100*mean(rrMale(:,1)), 100*mean(rrMale(:,2)));
title(maleTitle);
%%
% We can use the selected inputs for the above plot:
figure;
% Both genders are restricted to the inputs picked by the sequential
% selection (inputId), so that the two panels are comparable.
% NOTE(review): perfByField is called with 'yr' below; if inputId does not
% contain the 'yr' row, confirm that perfByField still works on the reduced datasets.
% Upper panel: female (gender=1)
fprintf('Holdout test on female data:\n');
dsFemaleTrain=dsSubset(dsTrain, 'gender', 1); dsFemaleTrain.input=dsFemaleTrain.input(inputId,:); dsFemaleTrain.inputName=dsFemaleTrain.inputName(inputId);
dsFemaleTest= dsSubset(dsTest,  'gender', 1); dsFemaleTest.input= dsFemaleTest.input(inputId,:);  dsFemaleTest.inputName=  dsFemaleTest.inputName(inputId);
subplot(211); rrFemale=perfByField(dsFemaleTrain, dsFemaleTest, 'yr', 1);
title(sprintf('gender=%d, training rr=%g%%, test rr=%g%%', 1, 100*mean(rrFemale(:,1)), 100*mean(rrFemale(:,2))));
% Lower panel: male (gender=2), with the same input selection applied
fprintf('Holdout test on male data:\n');
dsMaleTrain=dsSubset(dsTrain, 'gender', 2); dsMaleTrain.input=dsMaleTrain.input(inputId,:); dsMaleTrain.inputName=dsMaleTrain.inputName(inputId);
dsMaleTest= dsSubset(dsTest,  'gender', 2); dsMaleTest.input= dsMaleTest.input(inputId,:);  dsMaleTest.inputName=  dsMaleTest.inputName(inputId);
subplot(212); rrMale=perfByField(dsMaleTrain, dsMaleTest, 'yr', 1);
title(sprintf('gender=%d, training rr=%g%%, test rr=%g%%', 2, 100*mean(rrMale(:,1)), 100*mean(rrMale(:,2))));
%% Summary
% This is a brief tutorial on fatty liver recognition based on patients' data.
% There are several directions for further improvement:
%
% * Try the classification problem using the whole dataset
% * Use template matching as an alternative to improve the performance
%
%% Appendix
% List of functions, scripts, and datasets used in this script:
%
% * <../list.asp List of files in this folder>
%
%%
% Date and time when finishing this script:
finishStamp=char(datetime);
fprintf('Date & time: %s\n', finishStamp);
%%
% Overall elapsed time:
toc(scriptStartTime)
%%
% <http://mirlab.org/jang Jyh-Shing Roger Jang>, created on
datetime