%% Tutorial on Fatty Liver Recognition (by )
% This tutorial explains the basics of fatty liver recognition based on patients' data.
%% Preprocessing
% Before we start, let's add the necessary toolboxes to the MATLAB search path:
addpath d:/users/jang/matlab/toolbox/utility
addpath d:/users/jang/matlab/toolbox/sap
addpath d:/users/jang/matlab/toolbox/machineLearning
% Use "clear" rather than "clear all": "clear all" also removes breakpoints
% and cached functions and slows the script down; clearing workspace
% variables is all that is needed here.
clear; close all;
%%
% All the above toolboxes can be downloaded from the author's website
% (the original hyperlink was lost in this copy).
% Make sure you are using the latest toolboxes to work with this script.
%%
% For compatibility, here we list the platform and MATLAB version that we used to run this script:
fprintf('Platform: %s\n', computer);
fprintf('MATLAB version: %s\n', version);
fprintf('Date & time: %s\n', char(datetime));
scriptStartTime=tic; % Timing for the whole script
%% Data preparation
% First of all, we shall read the data from a data file:
file='D:\dataSet\mj\liver\small\fliver.csv';
fprintf('Reading %s...\n', file);
tic;
data=xlsFile2struct(file, 'fliver');	% Returns a struct array, one element per patient record
time=toc;
fprintf('Time=%g sec\n', time);
fprintf('Data size=%d\n', length(data));
%%
% We delete some of the fields, as follows.
%
% * Delete derived fields. See "新創變項codebook.xlsx" for details.
fieldToDelete={'bmi', 'ms1', 'ms2', 'ms3', 'ms4', 'ms5', 'ms'};
data=rmfield(data, fieldToDelete);
%%
% * Delete apparently useless fields
fieldToDelete={'id', 'n', 'flivero', 'father', 'gfather_f', 'gfather_m', 'mother', 'gmother_f', 'gmother_m', 'marriage_98', 'marriage_14'};
data=rmfield(data, fieldToDelete);
%%
% * Delete fields with too many NaN
fieldToDelete={'fincome', 'pincome'}; % Family and personal income
data=rmfield(data, fieldToDelete);
%%
% Since there are still numerous NaN in the dataset, we eliminate data entries with NaN.
fprintf('Original data count = %g\n', length(data));
fprintf('List of no. of NaN at each field:\n');
fieldNames=fieldnames(data);
% BUG FIX: count NaN per field on the ORIGINAL data before removing any
% rows. The previous version counted inside the removal loop, so fields
% processed later had their NaN counts taken AFTER rows were already
% dropped for earlier fields, understating the per-field totals.
for i=1:length(fieldNames)
	count=sum(isnan([data.(fieldNames{i})]));
	if count>0, fprintf('sum(isnan([data.%s]))=%d\n', fieldNames{i}, count); end
end
% Now remove every entry that has NaN in any field. (Sequential per-field
% removal yields the same final set as removing the union in one pass.)
for i=1:length(fieldNames)
	data(isnan([data.(fieldNames{i})]))=[];
end
fprintf('After removing NaN, data size = %g\n', length(data));
fprintf('Saving data.mat...\n');
save data data
%%
% Then we can create the basic dataset ds.
outputFieldName='fliver';
ds.output=[data.(outputFieldName)]+1; % Class labels start from 1 (1=no, 2=yes)
data=rmfield(data, outputFieldName);
ds.inputName=fieldnames(data);
dataCount=length(data);
% ds.input is feature-by-instance: one row per field, one column per patient
ds.input=zeros(length(ds.inputName), dataCount);
for i=1:length(ds.inputName)
	ds.input(i,:)=[data.(ds.inputName{i})];
end
ds.outputName={'no', 'yes'};
ds	% No semicolon: display the dataset structure
fprintf('Saving ds.mat...\n');
save ds ds
%%
% We partition the whole dataset into training and test sets.
opt=cvDataGen('defaultOpt');
opt.foldNum=2;		% 2-fold: half for training, half for test
opt.cvDataType='full';
cvData=cvDataGen(ds, opt);
dsTrain=cvData.TS; % Training set
dsTest=cvData.VS; % Test set
%%
% For more detailed analysis, we partition the data based on gender.
% NOTE(review): assumes gender coding 1=female, 2=male — confirm against codebook.
opt=struct('inputName', 'gender', 'inputValue', 1);
dsTrainFemale=dsSubset(dsTrain, opt);
opt.inputValue=2;
dsTrainMale=dsSubset(dsTrain, opt);
opt.inputValue=1;
dsTestFemale=dsSubset(dsTest, opt);
opt.inputValue=2;
dsTestMale=dsSubset(dsTest, opt);
%%
% We can plot the patient counts of the training data based on gender and years.
id=find(strcmp(dsTrainFemale.inputName, 'yr'));	% Row index of the 'yr' feature
year=dsTrainFemale.input(id, :);
figure;
[a, b]=elementCount(year);
subplot(211); bar(a, b); xlabel('Years'); ylabel('Patient counts');
title(sprintf('Female total=%d', size(dsTrainFemale.input,2)));
year=dsTrainMale.input(id, :);
[a, b]=elementCount(year);
subplot(212); bar(a, b); xlabel('Years'); ylabel('Patient counts');
title(sprintf('Male total=%d', size(dsTrainMale.input,2)));
axisLimitSame;	% Same axis limits on both subplots for fair visual comparison
%%
% For simplicity, we shall use 2016 male data for further analysis.
% Restrict to year 2016. NOTE: ds is OVERWRITTEN here — from this point on
% it denotes the 2016 male TRAINING subset, not the full dataset saved earlier.
opt=struct('inputName', 'yr', 'inputValue', 2016);
ds=dsSubset(dsTrainMale, opt); % Training set
ds
dsSmallTest=dsSubset(dsTestMale, opt); % Test set
dsSmallTest
%% Dataset visualization
% Size of the classes:
figure; [classSize, classLabel]=dsClassSize(ds, 1)
%%
% Box plot for the two classes:
figure; dsBoxPlot(ds);
%%
% Range plot of the dataset:
figure; dsRangePlot(ds);
%%
% Range plot of the normalized dataset ds2:
ds2=ds;
ds2.input=inputNormalize(ds2.input);
figure; dsRangePlot(ds2);
%%
% Scatter plots of ds2:
figure; dsProjPlot2(ds2); figEnlarge;
%%
% Correlation plot:
% NOTE(review): corrplot is presumably the Econometrics Toolbox function —
% confirm the toolbox is installed before running this cell.
figure; corrplot(ds.input', 'varNames', strPurify(ds.inputName)); figEnlarge
%% Classification
% KNN leave-one-out recognition rates on raw (ds) and normalized (ds2) data:
tic; rr=knncLoo(ds); fprintf('rr=%g%% for ds, time=%g sec\n', rr*100, toc);
tic; rr=knncLoo(ds2); fprintf('rr=%g%% for ds2, time=%g sec\n', rr*100, toc);
%%
% Sequential forward selection on the features of ds:
figure; tic; inputSelectSequential(ds, inf, 'knnc'); toc; figEnlarge
%%
% Sequential forward selection on the features of ds2:
figure; tic; inputSelectSequential(ds2, inf, 'knnc'); toc; figEnlarge
%%
% Dimensionality reduction based on LDA:
opt=ldaPerfViaKnncLoo('defaultOpt');
% NOTE(review): the original author flagged that mode='exact' causes an
% error — verify before relying on the results of this cell.
opt.mode='exact'; % This option causes error, why?
tic
recogRate1=ldaPerfViaKnncLoo(ds, opt);
ds2=ds; ds2.input=inputNormalize(ds2.input); % input normalization (recomputed; ds2 above is identical)
recogRate2=ldaPerfViaKnncLoo(ds2, opt);
fprintf('Time=%g sec\n', toc);
[featureNum, dataNum] = size(ds.input);
figure;
plot(1:featureNum, 100*recogRate1, 'o-', 1:featureNum, 100*recogRate2, '^-'); grid on
legend({'Raw data', 'Normalized data'}, 'location', 'northOutside', 'orientation', 'horizontal');
xlabel('No. of projected features based on LDA');
ylabel('LOO recognition rates using KNNC (%)');
%%
% Let's perform AutoML with ds (cross-validated comparison of classifiers):
opt=perfCv4classifier('defaultOpt');
opt.foldNum=5;
opt.classifiers(find(strcmp(opt.classifiers, 'src')))=[]; % Get rid of 'src' since it's extremely slow
tic; [perfData, bestId]=perfCv4classifier(ds, opt, 1); toc
%structDispInHtml(perfData, 'Performance of various classifiers via cross validation');
%%
% Plot of the confusion matrix for the best classifier:
confMat=confMatGet(ds.output, perfData(bestId).bestComputedClass);
opt=confMatPlot('defaultOpt');
opt.className=ds.outputName;
figure; confMatPlot(confMat, opt);
%%
% Since NBC is the best classifier, we'll use it for more detailed
% analysis. We can plot the DET curve:
[cPrm, logLike1, rr1]=nbcTrain(ds);
[computedClass, logLike2, rr2, hitIndex]=nbcEval(dsSmallTest, cPrm);
% Per-instance score: difference between the two class log-likelihoods
logLike=diff(logLike2);
id01=find(dsSmallTest.output==1);	% Indices of class "no"
id02=find(dsSmallTest.output==2);	% Indices of class "yes"
data01=logLike(id01);
data02=logLike(id02);
opt=detPlot('defaultOpt');
opt.scaleMode='linear';
figure; detPlot(data01, data02, opt);
%%
% We can plot the density curves for each class:
opt=detDataGet('defaultOpt');
figure; [th, fp, fn]=detDataGet(data01, data02, opt, 1);
%%
% We can also plot the lift chart:
gt=dsSmallTest.output;	% Ground-truth labels of the test set
opt=liftChart('defaultOpt');
figure;
opt.type='binwisePrecision'; subplot(121); liftChart(logLike, gt, opt, 1);
opt.type='cumulatedRecall'; subplot(122); liftChart(logLike, gt, opt, 1);
figEnlarge
%%
% Since the selection of the best threshold depends on the cost of FP
% (false positive) and FN (false negative), we shall fix the cost of FP to
% be 1, and change the cost of FN to see how the best threshold varies, as
% follows:
figure;
opt=costVsTh4binClassifier('defaultOpt');
opt.fnCost=1:6;	% FN costs to sweep (FP cost fixed at 1)
[costMin, thBest, costVec, thVec, misclassifiedCount]=costVsTh4binClassifier(gt, logLike, opt, 1);
%%
% For better visualization, we can take the log of the (shifted-positive) likelihood:
figure; [costMin, thBest, costVec, thVec, misclassifiedCount]=costVsTh4binClassifier(gt, log(logLike-min(logLike)+1), opt, 1);
%%
% The best threshold vs. the cost of FN is shown next:
figure; plot(opt.fnCost, thBest, 'o-');
for i=1:length(opt.fnCost)
	% Annotate each point with its [FP, FN] misclassification counts
	h=text(opt.fnCost(i), thBest(i), sprintf(' [FP, FN] = %s', mat2str(misclassifiedCount{i})));
	set(h, 'rotation', 30);
end
xlabel('FN costs'); ylabel('Thresholds');
title('Best thresholds vs. FN costs'); grid on
%%
% Since NBC is the best classifier, let's use it for feature selection:
figure; tic; [inputId, bestRr]=inputSelectSequential(ds, inf, 'nbc'); toc; figEnlarge
%%
% Now we can list the training and test performance of the dataset based on NBC, with breakdowns into genders and years:
figure;
fprintf('Holdout test on female data:\n');
opt1=struct('inputName', 'gender', 'inputValue', 1);
dsFemaleTrain=dsSubset(dsTrain, opt1);
dsFemaleTest= dsSubset(dsTest, opt1);
optField=perfByField('defaultOpt');
optField.fieldName='yr';	% Break performance down by year
optField.classifier='nbc';
subplot(211);
rrFemale=perfByField(dsFemaleTrain, dsFemaleTest, optField, 1);
title(sprintf('gender=%d, training rr=%g%%, test rr=%g%%', 1, 100*mean(rrFemale(:,1)), 100*mean(rrFemale(:,2))));
fprintf('Holdout test on male data:\n');
opt2=struct('inputName', 'gender', 'inputValue', 2);
dsMaleTrain=dsSubset(dsTrain, opt2);
dsMaleTest= dsSubset(dsTest, opt2);
subplot(212);
rrMale=perfByField(dsMaleTrain, dsMaleTest, optField, 1);
title(sprintf('gender=%d, training rr=%g%%, test rr=%g%%', 2, 100*mean(rrMale(:,1)), 100*mean(rrMale(:,2))));
%%
% We can use only the selected inputs (from the NBC feature selection above) for the above plot:
figure;
fprintf('Holdout test on female data:\n');
opt1=struct('inputName', 'gender', 'inputValue', 1);
dsFemaleTrain=dsSubset(dsTrain, opt1);
dsFemaleTrain.input=dsFemaleTrain.input(inputId,:);
dsFemaleTrain.inputName=dsFemaleTrain.inputName(inputId);
dsFemaleTest= dsSubset(dsTest, opt1);
dsFemaleTest.input= dsFemaleTest.input(inputId,:);
dsFemaleTest.inputName= dsFemaleTest.inputName(inputId);
subplot(211);
rrFemale=perfByField(dsFemaleTrain, dsFemaleTest, optField, 1);
title(sprintf('gender=%d, training rr=%g%%, test rr=%g%%', 1, 100*mean(rrFemale(:,1)), 100*mean(rrFemale(:,2))));
fprintf('Holdout test on male data:\n');
opt2=struct('inputName', 'gender', 'inputValue', 2);
dsMaleTrain=dsSubset(dsTrain, opt2);
dsMaleTest= dsSubset(dsTest, opt2);
% BUG FIX: this cell is supposed to use only the selected inputs, but the
% male datasets were previously left with ALL inputs while the female ones
% were restricted to inputId — making the gender comparison inconsistent.
% Restrict the male train/test sets the same way as the female ones.
dsMaleTrain.input=dsMaleTrain.input(inputId,:);
dsMaleTrain.inputName=dsMaleTrain.inputName(inputId);
dsMaleTest.input= dsMaleTest.input(inputId,:);
dsMaleTest.inputName= dsMaleTest.inputName(inputId);
subplot(212);
rrMale=perfByField(dsMaleTrain, dsMaleTest, optField, 1);
title(sprintf('gender=%d, training rr=%g%%, test rr=%g%%', 2, 100*mean(rrMale(:,1)), 100*mean(rrMale(:,2))));
%% Summary
% This is a brief tutorial on fatty liver recognition based on patients' data.
% There are several directions for further improvement:
%
% * Try the classification problem using the whole dataset
% * Use template matching as an alternative to improve the performance
%
%% Appendix
% List of functions, scripts, and datasets used in this script:
%
% * <../list.asp List of files in this folder>
%
%%
% Date and time when finishing this script:
fprintf('Date & time: %s\n', char(datetime));
%%
% Overall elapsed time:
toc(scriptStartTime)
%%
% NOTE(review): the author/date footer was garbled in the original copy
% ("  , created on datetime"); the publish template presumably filled it in.