%% Tutorial on Fatty Liver Recognition
% NOTE(review): the author credit in the original title is blank - fill it in.
%%
% This tutorial walks through the basics of fatty liver recognition from patients' data.

%% Preprocessing
% Put the required toolboxes on the MATLAB search path before anything else:
addpath('d:/users/jang/matlab/toolbox/utility');
addpath('d:/users/jang/matlab/toolbox/sap');
addpath('d:/users/jang/matlab/toolbox/machineLearning');
%%
% All of the toolboxes above can be downloaded from the author's site
% (NOTE(review): the download link is missing from this text - restore it).
% Use the latest versions of the toolboxes when running this script.
%%
% For reproducibility, report the platform and MATLAB version used for this run:
fprintf('Platform: %s\n', computer);
fprintf('MATLAB version: %s\n', version);
fprintf('Date & time: %s\n', char(datetime));
scriptStartTime=tic;	% Handle-based timer covering the whole script; read back at the very end

%% Data preparation
% Load the raw records from the data file and report how long the load took:
file='D:\dataSet\mj\liver\fliver.csv';
fprintf('Reading %s...\n', file);
tic;
data=xlsFile2struct(file, 'fliver');
loadTime=toc;
fprintf('Time=%g sec\n', loadTime);
%%
% Several fields are not needed and are dropped in three groups.
%
% * Derived fields
derivedFields={'bmi', 'ms1', 'ms2', 'ms3', 'ms4', 'ms5', 'ms'};
data=rmfield(data, derivedFields);
%%
% * Useless fields
uselessFields={'id', 'n', 'fliver', 'father', 'gfather_f', 'gfather_m', 'mother', 'gmother_f', 'gmother_m', 'marriage_98', 'marriage_14'};
data=rmfield(data, uselessFields);
%%
% * Fields with too many NaN
sparseFields={'fincome', 'pincome'};
data=rmfield(data, sparseFields);
%%
%data=rmfield(data, {'g_wei', 'g_hei'});
%%
% Many NaN entries remain, so every record that has a NaN in any field is removed.
% Records are deleted field by field, so each printed count is taken on the
% records that survived the fields processed before it.
fprintf('Original data size = %g\n', length(data));
fprintf('List of no. of NaN at each field:\n');
allFields=fieldnames(data);
for k=1:numel(allFields)
	thisField=allFields{k};
	nanMask=isnan([data.(thisField)]);	% One flag per record for this field
	nanCount=nnz(nanMask);
	if nanCount>0
		fprintf('sum(isnan([data.%s]))=%d\n', thisField, nanCount);
	end
	data(nanMask)=[];	% Drop the flagged records before examining the next field
end
fprintf('After removing NaN, data size = %g\n', length(data));
%fprintf('Saving data.mat...\n');
%save data data
%%
% Then we can create the basic dataset ds.
% The fields of ds are created in the order output, inputName, input, outputName.
outputFieldName='flivero';
ds.output=[data.(outputFieldName)]+1;	% Shift the labels so that they start from 1
data=rmfield(data, outputFieldName);	% Every remaining field is treated as an input
ds.inputName=fieldnames(data);
inputCount=length(ds.inputName);
recordCount=length(data);
ds.input=zeros(inputCount, recordCount);	% One row per input, one column per record
for row=1:inputCount
	ds.input(row,:)=[data.(ds.inputName{row})];
end
ds.outputName={'no', 'slightly', 'yes', 'alot'};
ds	% No semicolon on purpose: display the dataset
%fprintf('Saving ds.mat...\n');
%save ds2 ds
%%
% Partition the whole dataset into a training set and a test set.
opt=cvDataGen('defaultOpt');
opt.foldNum=2;
opt.cvDataType='full';
cvData=cvDataGen(ds, opt);
dsTrain=cvData.TS;	% Training set
dsTest=cvData.VS;	% Test set
%%
% For a more detailed analysis, split the training data by gender.
% The same opt struct is reused; only inputValue changes between the two calls.
opt.inputName='gender';
opt.inputValue=1;
dsFemale=dsSubset(dsTrain, opt);
opt.inputValue=2;
dsMale=dsSubset(dsTrain, opt);
%%
% Plot the patient counts per year, one panel for each gender.
yrRow=strcmp(dsFemale.inputName, 'yr');	% Selects the 'yr' input row; reused for the male data below
femaleYears=dsFemale.input(yrRow, :);
figure;
[barX, barY]=elementCount(femaleYears);
subplot(211);
bar(barX, barY);
xlabel('Years');
ylabel('Patient counts');
title(sprintf('Female total=%d', size(dsFemale.input,2)));
maleYears=dsMale.input(yrRow, :);
[barX, barY]=elementCount(maleYears);
subplot(212);
bar(barX, barY);
xlabel('Years');
ylabel('Patient counts');
title(sprintf('Male total=%d', size(dsMale.input,2)));
axisLimitSame;
%%
% For simplicity, we shall use 2016 male data for further analysis.
ds=dsSubset(dsMale, 'yr', 2016);
ds	% No semicolon on purpose: display the reduced dataset

%% Dataset visualization
% Size of the classes:
figure;
[classSize, classLabel]=dsClassSize(ds, 1)	% No semicolon: the sizes and labels are displayed as well
%%
% Box plot of the dataset:
figure;
dsBoxPlot(ds);
%%
% Range plot of the dataset:
figure;
dsRangePlot(ds);
%%
% Range plot of the normalized dataset ds2:
ds2=ds;
ds2.input=inputNormalize(ds2.input);
figure;
dsRangePlot(ds2);
%%
% Scatter plots of ds2:
figure;
dsProjPlot2(ds2);
figEnlarge;
%%
% Correlation plot (corrplot receives the inputs transposed, i.e. one column per input):
figure;
corrplot(ds.input', 'varNames', ds.inputName);
figEnlarge

%% Classification
% KNN results on the raw dataset ds and the normalized dataset ds2:
rrRaw=knncLoo(ds);
fprintf('rr=%g%% for ds\n', rrRaw*100);
rrNorm=knncLoo(ds2);
fprintf('rr=%g%% for ds2\n', rrNorm*100);
%%
% Sequential forward selection on the features of ds:
figure;
tic;
inputSelectSequential(ds, inf, 'knnc');
toc;
figEnlarge
%%
% Sequential forward selection on the features of ds2:
figure;
tic;
inputSelectSequential(ds2, inf, 'knnc');
toc;
figEnlarge
%%
% Dimensionality reduction based on LDA, evaluated on raw and on normalized data:
opt=ldaPerfViaKnncLoo('defaultOpt');
%opt.mode='exact';	% This option causes error, why?
tic
recogRate1=ldaPerfViaKnncLoo(ds, opt);
ds2=ds;
ds2.input=inputNormalize(ds2.input);	% Input normalization
recogRate2=ldaPerfViaKnncLoo(ds2, opt);
fprintf('Time=%g sec\n', toc);
[featureNum, dataNum]=size(ds.input);
dimAxis=1:featureNum;	% Shared x-axis for both curves
figure;
plot(dimAxis, 100*recogRate1, 'o-', dimAxis, 100*recogRate2, '^-');
grid on
legend({'Raw data', 'Normalized data'}, 'location', 'northOutside', 'orientation', 'horizontal');
xlabel('No. of projected features based on LDA');
ylabel('LOO recognition rates using KNNC (%)');
%%
% Let's perform AutoML with ds:
opt=perfCv4classifier('defaultOpt');
opt.foldNum=5;
srcMask=strcmp(opt.classifiers, 'src');
opt.classifiers(srcMask)=[];	% Get rid of 'src' since it's extremely slow
tic;
[perfData, bestId]=perfCv4classifier(ds, opt, 1);
toc
%structDispInHtml(perfData, 'Performance of various classifiers via cross validation');
%%
% Plot of the confusion matrix for the classifier indexed by bestId:
confMat=confMatGet(ds.output, perfData(bestId).bestComputedClass);
opt=confMatPlot('defaultOpt');
opt.className=ds.outputName;
figure;
confMatPlot(confMat, opt);
%%
% Since NBC is the best classifier, let's use it for feature selection:
figure;
tic;
[inputId, bestRr]=inputSelectSequential(ds, inf, 'nbc');
toc;
figEnlarge
%%
% Now we can list the training and test performance of the dataset based on NBC, with breakdowns into genders and years:
% NOTE(review): the last argument (1) of perfByField is presumably a plot flag - confirm against the toolbox.
figure;
fprintf('Holdout test on female data:\n');
dsFemaleTrain=dsSubset(dsTrain, 'gender', 1);
dsFemaleTest=dsSubset(dsTest, 'gender', 1);
subplot(211);
rrFemale=perfByField(dsFemaleTrain, dsFemaleTest, 'yr', 1);
femaleTrainPct=100*mean(rrFemale(:,1));	% Column 1 is reported as the training rate
femaleTestPct=100*mean(rrFemale(:,2));	% Column 2 is reported as the test rate
title(sprintf('gender=%d, training rr=%g%%, test rr=%g%%', 1, femaleTrainPct, femaleTestPct));
fprintf('Holdout test on male data:\n');
dsMaleTrain=dsSubset(dsTrain, 'gender', 2);
dsMaleTest=dsSubset(dsTest, 'gender', 2);
subplot(212);
rrMale=perfByField(dsMaleTrain, dsMaleTest, 'yr', 1);
maleTrainPct=100*mean(rrMale(:,1));
maleTestPct=100*mean(rrMale(:,2));
title(sprintf('gender=%d, training rr=%g%%, test rr=%g%%', 2, maleTrainPct, maleTestPct));
%%
% We can use the selected inputs for the above plot:
figure;
fprintf('Holdout test on female data:\n');
% Keep only the input rows (and matching names) chosen by the feature selection.
dsFemaleTrain=dsSubset(dsTrain, 'gender', 1);
dsFemaleTrain.input=dsFemaleTrain.input(inputId,:);
dsFemaleTrain.inputName=dsFemaleTrain.inputName(inputId);
dsFemaleTest=dsSubset(dsTest, 'gender', 1);
dsFemaleTest.input=dsFemaleTest.input(inputId,:);
dsFemaleTest.inputName=dsFemaleTest.inputName(inputId);
subplot(211);
% Female panel of the selected-input holdout plot. dsFemaleTrain and dsFemaleTest
% were already restricted to the inputs in inputId before this point.
rrFemale=perfByField(dsFemaleTrain, dsFemaleTest, 'yr', 1);
title(sprintf('gender=%d, training rr=%g%%, test rr=%g%%', 1, 100*mean(rrFemale(:,1)), 100*mean(rrFemale(:,2))));

% Male panel. The male train/test sets must be restricted to the same selected
% inputs as the female sets; otherwise this panel is evaluated on all inputs,
% merely repeats the previous figure, and cannot be compared with the panel above.
% NOTE(review): perfByField breaks the results down by 'yr'; confirm that it still
% works when 'yr' is not among the inputs kept by inputId.
fprintf('Holdout test on male data:\n');
dsMaleTrain=dsSubset(dsTrain, 'gender', 2);
dsMaleTrain.input=dsMaleTrain.input(inputId,:);
dsMaleTrain.inputName=dsMaleTrain.inputName(inputId);
dsMaleTest=dsSubset(dsTest, 'gender', 2);
dsMaleTest.input=dsMaleTest.input(inputId,:);
dsMaleTest.inputName=dsMaleTest.inputName(inputId);
subplot(212);
rrMale=perfByField(dsMaleTrain, dsMaleTest, 'yr', 1);
title(sprintf('gender=%d, training rr=%g%%, test rr=%g%%', 2, 100*mean(rrMale(:,1)), 100*mean(rrMale(:,2))));

%% Summary
% This is a brief tutorial on fatty liver recognition based on patients' data.
% There are several directions for further improvement:
%
% * Try the classification problem using the whole dataset
% * Use template matching as an alternative to improve the performance
%
%% Appendix
% List of functions, scripts, and datasets used in this script:
%
% * <../list.asp List of files in this folder>
%
%%
% Date and time when finishing this script:
fprintf('Date & time: %s\n', char(datetime));
%%
% Overall elapsed time:
toc(scriptStartTime)	% No semicolon: display the time elapsed since the timer started at the top of the script
%%
% NOTE(review): the author link and creation timestamp that belong in this footer
% are missing from the text ("created on datetime") - restore them if available.