%% Tutorial on Fatty Liver Recognition (by )
% This tutorial explains the basics of fatty liver recognition based on patients' data.
%% Preprocessing
% Before we start, let's add the necessary toolboxes to the MATLAB search path:
addpath d:/users/jang/matlab/toolbox/utility
addpath d:/users/jang/matlab/toolbox/sap
addpath d:/users/jang/matlab/toolbox/machineLearning
% Use "clear" rather than "clear all": "clear all" also removes breakpoints
% and cached functions and slows the script down; clearing workspace
% variables is all that is needed here.
clear; close all;
%%
% All the above toolboxes can be downloaded from the author's website
% (the original hyperlink was lost in this copy).
% Make sure you are using the latest toolboxes to work with this script.
%%
% For compatibility, here we list the platform and MATLAB version that we used to run this script:
fprintf('Platform: %s\n', computer);
fprintf('MATLAB version: %s\n', version);
fprintf('Date & time: %s\n', char(datetime));
scriptStartTime=tic; % Timing for the whole script
%% Data preparation
% First of all, we shall read the data from a data file:
file='D:\dataSet\mj\liver\small\fliver.csv';
fprintf('Reading %s...\n', file);
tic;
data=xlsFile2struct(file, 'fliver');	% Returns a struct array, one element per patient record
time=toc;
fprintf('Time=%g sec\n', time);
fprintf('Data size=%d\n', length(data));
%%
% We delete some of the fields, as follows.
%
% * Delete derived fields. See "新創變項codebook.xlsx" for details.
fieldToDelete={'bmi', 'ms1', 'ms2', 'ms3', 'ms4', 'ms5', 'ms'};
data=rmfield(data, fieldToDelete);
%%
% * Delete apparently useless fields
fieldToDelete={'id', 'n', 'flivero', 'father', 'gfather_f', 'gfather_m', 'mother', 'gmother_f', 'gmother_m', 'marriage_98', 'marriage_14'};
data=rmfield(data, fieldToDelete);
%%
% * Delete fields with too many NaN
fieldToDelete={'fincome', 'pincome'}; % Family and personal income
data=rmfield(data, fieldToDelete);
%%
% Since there are still numerous NaN in the dataset, we eliminate data entries with NaN.
fprintf('Original data count = %g\n', length(data));
fprintf('List of no. of NaN at each field:\n');
fieldNames=fieldnames(data);
% BUG FIX: count NaN per field on the ORIGINAL data before removing any
% rows. The previous version counted inside the removal loop, so fields
% processed later had their NaN counts taken AFTER rows were already
% dropped for earlier fields, understating the per-field totals.
for i=1:length(fieldNames)
	count=sum(isnan([data.(fieldNames{i})]));
	if count>0, fprintf('sum(isnan([data.%s]))=%d\n', fieldNames{i}, count); end
end
% Now remove every entry that has NaN in any field. (Sequential per-field
% removal yields the same final set as removing the union in one pass.)
for i=1:length(fieldNames)
	data(isnan([data.(fieldNames{i})]))=[];
end
fprintf('After removing NaN, data size = %g\n', length(data));
fprintf('Saving data.mat...\n');
save data data
%%
% Then we can create the basic dataset ds.
outputFieldName='fliver';
ds.output=[data.(outputFieldName)]+1; % Class labels start from 1 (1=no, 2=yes)
data=rmfield(data, outputFieldName);
ds.inputName=fieldnames(data);
dataCount=length(data);
% ds.input is feature-by-instance: one row per field, one column per patient
ds.input=zeros(length(ds.inputName), dataCount);
for i=1:length(ds.inputName)
	ds.input(i,:)=[data.(ds.inputName{i})];
end
ds.outputName={'no', 'yes'};
ds	% No semicolon: display the dataset structure
fprintf('Saving ds.mat...\n');
save ds ds
%%
% We partition the whole dataset into training and test sets.
opt=cvDataGen('defaultOpt');
opt.foldNum=2;		% 2-fold: half for training, half for test
opt.cvDataType='full';
cvData=cvDataGen(ds, opt);
dsTrain=cvData.TS; % Training set
dsTest=cvData.VS; % Test set
%%
% For more detailed analysis, we partition the data based on gender.
% NOTE(review): assumes gender coding 1=female, 2=male — confirm against codebook.
opt=struct('inputName', 'gender', 'inputValue', 1);
dsTrainFemale=dsSubset(dsTrain, opt);
opt.inputValue=2;
dsTrainMale=dsSubset(dsTrain, opt);
opt.inputValue=1;
dsTestFemale=dsSubset(dsTest, opt);
opt.inputValue=2;
dsTestMale=dsSubset(dsTest, opt);
%%
% We can plot the patient counts of the training data based on gender and years.
id=find(strcmp(dsTrainFemale.inputName, 'yr'));	% Row index of the 'yr' feature
year=dsTrainFemale.input(id, :);
figure;
[a, b]=elementCount(year);
subplot(211); bar(a, b); xlabel('Years'); ylabel('Patient counts');
title(sprintf('Female total=%d', size(dsTrainFemale.input,2)));
year=dsTrainMale.input(id, :);
[a, b]=elementCount(year);
subplot(212); bar(a, b); xlabel('Years'); ylabel('Patient counts');
title(sprintf('Male total=%d', size(dsTrainMale.input,2)));
axisLimitSame;	% Same axis limits on both subplots for fair visual comparison
%%
% For simplicity, we shall use 2016 male data for further analysis.
% Restrict to year 2016. NOTE: ds is OVERWRITTEN here — from this point on
% it denotes the 2016 male TRAINING subset, not the full dataset saved earlier.
opt=struct('inputName', 'yr', 'inputValue', 2016);
ds=dsSubset(dsTrainMale, opt); % Training set
ds
dsSmallTest=dsSubset(dsTestMale, opt); % Test set
dsSmallTest
%% Dataset visualization
% Size of the classes:
figure; [classSize, classLabel]=dsClassSize(ds, 1)
%%
% Box plot for the two classes:
figure; dsBoxPlot(ds);
%%
% Range plot of the dataset:
figure; dsRangePlot(ds);
%%
% Range plot of the normalized dataset ds2:
ds2=ds;
ds2.input=inputNormalize(ds2.input);
figure; dsRangePlot(ds2);
%%
% Scatter plots of ds2:
figure; dsProjPlot2(ds2); figEnlarge;
%%
% Correlation plot:
% NOTE(review): corrplot is presumably the Econometrics Toolbox function —
% confirm the toolbox is installed before running this cell.
figure; corrplot(ds.input', 'varNames', strPurify(ds.inputName)); figEnlarge
%% Classification
% KNN leave-one-out recognition rates on raw (ds) and normalized (ds2) data:
tic; rr=knncLoo(ds); fprintf('rr=%g%% for ds, time=%g sec\n', rr*100, toc);
tic; rr=knncLoo(ds2); fprintf('rr=%g%% for ds2, time=%g sec\n', rr*100, toc);
%%
% Sequential forward selection on the features of ds:
figure; tic; inputSelectSequential(ds, inf, 'knnc'); toc; figEnlarge
%%
% Sequential forward selection on the features of ds2:
figure; tic; inputSelectSequential(ds2, inf, 'knnc'); toc; figEnlarge
%%
% Dimensionality reduction based on LDA:
opt=ldaPerfViaKnncLoo('defaultOpt');
% NOTE(review): the original author flagged that mode='exact' causes an
% error — verify before relying on the results of this cell.
opt.mode='exact'; % This option causes error, why?
tic
recogRate1=ldaPerfViaKnncLoo(ds, opt);
ds2=ds; ds2.input=inputNormalize(ds2.input); % input normalization (recomputed; ds2 above is identical)
recogRate2=ldaPerfViaKnncLoo(ds2, opt);
fprintf('Time=%g sec\n', toc);
[featureNum, dataNum] = size(ds.input);
figure;
plot(1:featureNum, 100*recogRate1, 'o-', 1:featureNum, 100*recogRate2, '^-'); grid on
legend({'Raw data', 'Normalized data'}, 'location', 'northOutside', 'orientation', 'horizontal');
xlabel('No. of projected features based on LDA');
ylabel('LOO recognition rates using KNNC (%)');
%%
% Let's perform AutoML with ds (cross-validated comparison of classifiers):
opt=perfCv4classifier('defaultOpt');
opt.foldNum=5;
opt.classifiers(find(strcmp(opt.classifiers, 'src')))=[]; % Get rid of 'src' since it's extremely slow
tic; [perfData, bestId]=perfCv4classifier(ds, opt, 1); toc
%structDispInHtml(perfData, 'Performance of various classifiers via cross validation');
%%
% Plot of the confusion matrix for the best classifier:
confMat=confMatGet(ds.output, perfData(bestId).bestComputedClass);
opt=confMatPlot('defaultOpt');
opt.className=ds.outputName;
figure; confMatPlot(confMat, opt);
%%
% Since NBC is the best classifier, we'll use it for more detailed
% analysis. We can plot the DET curve:
[cPrm, logLike1, rr1]=nbcTrain(ds);
[computedClass, logLike2, rr2, hitIndex]=nbcEval(dsSmallTest, cPrm);
% Per-instance score: difference between the two class log-likelihoods
logLike=diff(logLike2);
id01=find(dsSmallTest.output==1);	% Indices of class "no"
id02=find(dsSmallTest.output==2);	% Indices of class "yes"
data01=logLike(id01);
data02=logLike(id02);
opt=detPlot('defaultOpt');
opt.scaleMode='linear';
figure; detPlot(data01, data02, opt);
%%
% We can plot the density curves for each class:
opt=detDataGet('defaultOpt');
figure; [th, fp, fn]=detDataGet(data01, data02, opt, 1);
%%
% We can also plot the lift chart:
gt=dsSmallTest.output;	% Ground-truth labels of the test set
opt=liftChart('defaultOpt');
figure;
opt.type='binwisePrecision'; subplot(121); liftChart(logLike, gt, opt, 1);
opt.type='cumulatedRecall'; subplot(122); liftChart(logLike, gt, opt, 1);
figEnlarge
%%
% Since the selection of the best threshold depends on the cost of FP
% (false positive) and FN (false negative), we shall fix the cost of FP to
% be 1, and change the cost of FN to see how the best threshold varies, as
% follows:
figure;
opt=costVsTh4binClassifier('defaultOpt');
opt.fnCost=1:6;	% FN costs to sweep (FP cost fixed at 1)
[costMin, thBest, costVec, thVec, misclassifiedCount]=costVsTh4binClassifier(gt, logLike, opt, 1);
%%
% For better visualization, we can take the log of the (shifted-positive) likelihood:
figure; [costMin, thBest, costVec, thVec, misclassifiedCount]=costVsTh4binClassifier(gt, log(logLike-min(logLike)+1), opt, 1);
%%
% The best threshold vs. the cost of FN is shown next:
figure; plot(opt.fnCost, thBest, 'o-');
for i=1:length(opt.fnCost)
	% Annotate each point with its [FP, FN] misclassification counts
	h=text(opt.fnCost(i), thBest(i), sprintf(' [FP, FN] = %s', mat2str(misclassifiedCount{i})));
	set(h, 'rotation', 30);
end
xlabel('FN costs'); ylabel('Thresholds');
title('Best thresholds vs. FN costs'); grid on
%%
% Since NBC is the best classifier, let's use it for feature selection:
figure; tic; [inputId, bestRr]=inputSelectSequential(ds, inf, 'nbc'); toc; figEnlarge
%%
% Now we can list the training and test performance of the dataset based on NBC, with breakdowns into genders and years:
figure;
fprintf('Holdout test on female data:\n');
opt1=struct('inputName', 'gender', 'inputValue', 1);
dsFemaleTrain=dsSubset(dsTrain, opt1);
dsFemaleTest= dsSubset(dsTest, opt1);
optField=perfByField('defaultOpt');
optField.fieldName='yr';	% Break performance down by year
optField.classifier='nbc';
subplot(211);
rrFemale=perfByField(dsFemaleTrain, dsFemaleTest, optField, 1);
title(sprintf('gender=%d, training rr=%g%%, test rr=%g%%', 1, 100*mean(rrFemale(:,1)), 100*mean(rrFemale(:,2))));
fprintf('Holdout test on male data:\n');
opt2=struct('inputName', 'gender', 'inputValue', 2);
dsMaleTrain=dsSubset(dsTrain, opt2);
dsMaleTest= dsSubset(dsTest, opt2);
subplot(212);
rrMale=perfByField(dsMaleTrain, dsMaleTest, optField, 1);
title(sprintf('gender=%d, training rr=%g%%, test rr=%g%%', 2, 100*mean(rrMale(:,1)), 100*mean(rrMale(:,2))));
%%
% We can use only the selected inputs (from the NBC feature selection above) for the above plot:
figure;
fprintf('Holdout test on female data:\n');
opt1=struct('inputName', 'gender', 'inputValue', 1);
dsFemaleTrain=dsSubset(dsTrain, opt1);
dsFemaleTrain.input=dsFemaleTrain.input(inputId,:);
dsFemaleTrain.inputName=dsFemaleTrain.inputName(inputId);
dsFemaleTest= dsSubset(dsTest, opt1);
dsFemaleTest.input= dsFemaleTest.input(inputId,:);
dsFemaleTest.inputName= dsFemaleTest.inputName(inputId);
subplot(211);
rrFemale=perfByField(dsFemaleTrain, dsFemaleTest, optField, 1);
title(sprintf('gender=%d, training rr=%g%%, test rr=%g%%', 1, 100*mean(rrFemale(:,1)), 100*mean(rrFemale(:,2))));
fprintf('Holdout test on male data:\n');
opt2=struct('inputName', 'gender', 'inputValue', 2);
dsMaleTrain=dsSubset(dsTrain, opt2);
dsMaleTest= dsSubset(dsTest, opt2);
% BUG FIX: this cell is supposed to use only the selected inputs, but the
% male datasets were previously left with ALL inputs while the female ones
% were restricted to inputId — making the gender comparison inconsistent.
% Restrict the male train/test sets the same way as the female ones.
dsMaleTrain.input=dsMaleTrain.input(inputId,:);
dsMaleTrain.inputName=dsMaleTrain.inputName(inputId);
dsMaleTest.input= dsMaleTest.input(inputId,:);
dsMaleTest.inputName= dsMaleTest.inputName(inputId);
subplot(212);
rrMale=perfByField(dsMaleTrain, dsMaleTest, optField, 1);
title(sprintf('gender=%d, training rr=%g%%, test rr=%g%%', 2, 100*mean(rrMale(:,1)), 100*mean(rrMale(:,2))));
%% Summary
% This is a brief tutorial on fatty liver recognition based on patients' data.
% There are several directions for further improvement:
%
% * Try the classification problem using the whole dataset
% * Use template matching as an alternative to improve the performance
%
%% Appendix
% List of functions, scripts, and datasets used in this script:
%
% * <../list.asp List of files in this folder>
%
%%
% Date and time when finishing this script:
fprintf('Date & time: %s\n', char(datetime));
%%
% Overall elapsed time:
toc(scriptStartTime)
%%
% NOTE(review): the author/date footer was garbled in the original copy
% ("  , created on datetime"); the publish template presumably filled it in.