function [data, wavDS] = stressFea(wavDS)
% STRESSFEA Extract per-vowel stress features from a set of utterances.
%
%   [data, wavDS] = stressFea(wavDS)
%
%   Input
%     wavDS : struct array. Fields read here: path, asraOutput (with name,
%             interval, pitch and phone(k).name / phone(k).interval),
%             stressNum, stressPos and text.
%
%   Output
%     data  : struct collecting the features of all utterances
%             (input, output, id, name, stressNum, stressPos, ...).
%     wavDS : the input with a stressFea matrix added per utterance and
%             rmsEnergy / specEmphasis / pitch / slopePitch added per vowel.
%
%   Rows of stressFea (one column per vowel):
%      1  max  of normalized RMS energy
%      2  mean of normalized RMS energy
%      3  max  of normalized spectral emphasis
%      4  mean of normalized spectral emphasis
%      5  normalized duration
%      6  max    of normalized pitch
%      7  median of normalized pitch
%      8  pseudo-slope of the (un-normalized) pitch
%      9            .. 9+fitOrder     coefficients from legendre_Approx, pitch
%      10+fitOrder  .. 10+2*fitOrder  coefficients from legendre_Approx, RMS energy
%      11+2*fitOrder.. 11+3*fitOrder  coefficients from legendre_Approx, spectral emphasis
%
%   Side effects: writes mat/stressFea_<fitOrder>.mat and mat/dataSyl.mat.
%
%   The script PRM is expected to define sampleRate, frameSize, overlap,
%   frameRate and fitOrder; none of them is assigned in this file.

%% == Parameter Setting
prm;
% The only statement that used to assign nbits (wavReadInt) is disabled, so
% remember whether prm supplied it before relying on it below.
hasPrmBits = (exist('nbits', 'var') == 1);
vowelList = textread('data/vowelphone.txt', '%s', 'delimiter', '\n', 'whitespace', '');
label = [];

% Range of Spectral emphasis
lowerBound = 300;
upperBound = 2200;
% 5th order butterworth band-pass filter. It depends only on sampleRate, so
% it is designed once instead of once per utterance.
[coefB, coefA] = butter(5, [lowerBound*2/sampleRate, upperBound*2/sampleRate]);

%% == Feature Extraction
for i = 1:length(wavDS)
    fprintf('utterance %d/%d feature extraction...\n', i, length(wavDS));
    wavDS(i).stressFea = [];    % clear the old feature

    % [y, fs, nbits] = wavReadInt(wavDS(i).path(31:end));
    au = myAudioRead(wavDS(i).path);
    fs = au.fs;

    if hasPrmBits
        bitDepth = nbits;
    elseif isfield(au, 'nbits')
        % NOTE(review): assumes myAudioRead reports the sample width in a
        % field called nbits -- confirm against the helper.
        bitDepth = au.nbits;
    else
        error('stressFea:missingBitDepth', ...
            'Bit depth is unknown: prm does not define nbits and myAudioRead returned no nbits field.');
    end
    y = au.signal * 2^bitDepth / 2;

    % Bring every signal to the rate used by the filter design above.
    if fs ~= sampleRate
        y = waveResample(y, fs, sampleRate);
        fs = sampleRate;
    end

    y2 = filter(coefB, coefA, y);   % spectral emphasis filtered by butterworth filter

    % frame blocking
    frame  = buffer(y,  frameSize, overlap, 'nodelay');
    frame2 = buffer(y2, frameSize, overlap, 'nodelay');
    rmsEnergy    = sqrt(sum(frame.^2,  1) / frameSize);
    specEmphasis = sqrt(sum(frame2.^2, 1) / frameSize);

    for j = 1:length(wavDS(i).asraOutput)
        if strcmp(wavDS(i).asraOutput(j).name, 'sil')
            continue;
        end

        % NOTE(review): frame indices below are relative to the start of this
        % entry, yet rmsEnergy/specEmphasis are computed over the whole
        % utterance. Verify that this is intended when the entry does not
        % start at time 0.
        initialTime = wavDS(i).asraOutput(j).interval(1);

        % Normalization initialization
        meanEnergy = 0;
        meanSpecEmphasis = 0;
        meanDuration = 0;
        meanPitch = 0;
        tmpPitch = wavDS(i).asraOutput(j).pitch;

        % ---- Pass 1: raw per-vowel contours and the sums used for normalization
        for k = 1:length(wavDS(i).asraOutput(j).phone)
            if ~isVowelPhone(wavDS(i).asraOutput(j).phone(k).name, vowelList)
                continue;
            end

            phoneSpan = wavDS(i).asraOutput(j).phone(k).interval;
            startFrameIdx = round((phoneSpan(1) - initialTime)*frameRate + 1);
            endFrameIdx   = round((phoneSpan(2) - initialTime)*frameRate);
            % A very short vowel can round to end < start; keep at least one
            % frame so the statistics below are defined.
            endFrameIdx = max(endFrameIdx, startFrameIdx);
            frameIdx = startFrameIdx:endFrameIdx;

            % Root Mean Square energy
            wavDS(i).asraOutput(j).phone(k).rmsEnergy = rmsEnergy(frameIdx);
            meanEnergy = meanEnergy + mean(rmsEnergy(frameIdx));

            % Spectral Emphasis Root Mean Square energy
            wavDS(i).asraOutput(j).phone(k).specEmphasis = specEmphasis(frameIdx);
            meanSpecEmphasis = meanSpecEmphasis + mean(specEmphasis(frameIdx));

            % Duration
            vowelDuration = phoneSpan(2) - phoneSpan(1);
            meanDuration = meanDuration + vowelDuration;

            % Pitch
            vowelPitch = tmpPitch(frameIdx);
            wavDS(i).asraOutput(j).phone(k).pitch = vowelPitch;
            meanPitch = meanPitch + mean(vowelPitch);

            % Pseudo-Slope of Pitch. A single frame has no slope; use 0
            % rather than the 0/0 = NaN the plain formula would give.
            if numel(vowelPitch) > 1
                wavDS(i).asraOutput(j).phone(k).slopePitch = ...
                    (vowelPitch(end) - vowelPitch(1)) / (numel(vowelPitch) - 1);
            else
                wavDS(i).asraOutput(j).phone(k).slopePitch = 0;
            end
        end

        % The sums are averaged over stressNum (the count stored with the
        % utterance), not over the number of vowels actually found.
        meanDuration     = meanDuration     / wavDS(i).stressNum;
        meanEnergy       = meanEnergy       / wavDS(i).stressNum;
        meanPitch        = meanPitch        / wavDS(i).stressNum;
        meanSpecEmphasis = meanSpecEmphasis / wavDS(i).stressNum;

        % ---- Pass 2: normalize and fill one feature column per vowel
        featureDimIdx = 0;
        for k = 1:length(wavDS(i).asraOutput(j).phone)
            if ~isVowelPhone(wavDS(i).asraOutput(j).phone(k).name, vowelList)
                continue;
            end
            featureDimIdx = featureDimIdx + 1;

            % rms energy
            normEnergy = wavDS(i).asraOutput(j).phone(k).rmsEnergy / meanEnergy;    % Normalization
            wavDS(i).asraOutput(j).phone(k).rmsEnergy = normEnergy;
            wavDS(i).stressFea(1, featureDimIdx) = max(normEnergy);
            wavDS(i).stressFea(2, featureDimIdx) = mean(normEnergy);

            % spectral emphasis
            normSpec = wavDS(i).asraOutput(j).phone(k).specEmphasis / meanSpecEmphasis;    % Normalization
            wavDS(i).asraOutput(j).phone(k).specEmphasis = normSpec;
            wavDS(i).stressFea(3, featureDimIdx) = max(normSpec);
            wavDS(i).stressFea(4, featureDimIdx) = mean(normSpec);

            % duration
            vowelDuration = wavDS(i).asraOutput(j).phone(k).interval(2) - wavDS(i).asraOutput(j).phone(k).interval(1);
            wavDS(i).stressFea(5, featureDimIdx) = vowelDuration / meanDuration;    % Normalization

            % pitch
            normPitch = wavDS(i).asraOutput(j).phone(k).pitch / meanPitch;    % Normalization
            wavDS(i).asraOutput(j).phone(k).pitch = normPitch;
            wavDS(i).stressFea(6, featureDimIdx) = max(normPitch);
            wavDS(i).stressFea(7, featureDimIdx) = median(normPitch);

            % Pseudo-Slope of Pitch
            wavDS(i).stressFea(8, featureDimIdx) = wavDS(i).asraOutput(j).phone(k).slopePitch;

            % Polynomial coefficient of pitch
            coeffPitch = contourCoeff(normPitch, fitOrder);
            wavDS(i).stressFea(9:9+fitOrder, featureDimIdx) = coeffPitch';

            % Polynomial coefficient of Energy
            coeffEng = contourCoeff(normEnergy, fitOrder);
            wavDS(i).stressFea(10+fitOrder:10+2*fitOrder, featureDimIdx) = coeffEng';

            % Polynomial coefficient of Spectral Emphasis
            coeffSpecEng = contourCoeff(normSpec, fitOrder);
            wavDS(i).stressFea(11+2*fitOrder:11+3*fitOrder, featureDimIdx) = coeffSpecEng';

            %==== Create Groundtruth label vector
            if featureDimIdx == wavDS(i).stressPos
                label = [label; 1];    % 1 for Stressed
            else
                label = [label; 2];    % 2 for Unstressed
            end
        end
    end
end

%% == Create the feature DS
feaDim = size(wavDS(1).stressFea, 1);
fprintf('Dimension of Feature = %d\n', feaDim);
data.type = 'allUtt';
data.input = [wavDS.stressFea];         % feature vector
data.output = label';                   % label
data.id = ones(size(data.output));      % word id
data.name = {wavDS.text};               % word
data.stressNum = [wavDS.stressNum];
data.stressPos = [wavDS.stressPos];
data.totalSylNum = sum([wavDS.stressNum]);
data.totalStressNum = length(wavDS);
data.totalUnstressNum = data.totalSylNum - data.totalStressNum;
%data.inputName = {'MaxRmsEnergy', 'MeanRmsEnergy', 'MaxSpecEmphasis', 'MeanSpecEmphasis', 'Duration', 'MaxPitch', 'MedianPitch', 'MeanPitch', 'VarPitch', 'PitchRange', 'PitchSlope', 'PolyPitch1', 'PolyPitch2', 'PolyPitch3', 'PolyPitch4', 'PolyRmsEnergy1', 'PolyRmsEnergy2', 'PolyRmsEnergy3', 'PolyRmsEnergy4', 'PolySpecEmphasis1', 'PolySpecEmphasis2', 'PolySpecEmphasis3', 'PolySpecEmphasis4', 'PolyVol1', 'PolyVol2', 'PolyVol3', 'PolyVol4', 'MaxVol', 'MeanVol'};

% Tag every feature column with the index of the utterance it came from.
startIdx = 0;
endIdx = 0;
for i = 1:length(wavDS)
    startIdx = endIdx + 1;
    endIdx = endIdx + size(wavDS(i).stressFea, 2);
    data.id(startIdx:endIdx) = i;
end

%% == Create Syllable Number-dependent feature DS
% Only 2~5 syllables are considered.
for i = 2:5
    idx = find(data.stressNum==i);
    dataSyl{i}.input = [];
    dataSyl{i}.output = [];
    dataSyl{i}.id = [];
    dataSyl{i}.trueId = [];
    for j = 1:length(idx)
        idx2 = find(data.id==idx(j));
        % Rows 8..10 of the full feature matrix are left out of this subset.
        dataSyl{i}.input  = [dataSyl{i}.input,  data.input([1:7,11:end], idx2)];
        dataSyl{i}.output = [dataSyl{i}.output, data.output(:, idx2)];
        % id counts utterances inside this subset; trueId keeps the global index.
        dataSyl{i}.id     = [dataSyl{i}.id,     repmat(j, 1, length(idx2))];
        dataSyl{i}.trueId = [dataSyl{i}.trueId, data.id(:, idx2)];
    end
    dataSyl{i}.name = data.name(idx);
    dataSyl{i}.stressNum = data.stressNum(idx);
    dataSyl{i}.stressPos = data.stressPos(idx);
    dataSyl{i}.totalSylNum = sum([dataSyl{i}.stressNum]);
    dataSyl{i}.totalStressNum = length(dataSyl{i}.name);
    dataSyl{i}.totalUnstressNum = dataSyl{i}.totalSylNum - dataSyl{i}.totalStressNum;
    dataSyl{i}.type = [num2str(i) 'SylUtt'];
end

%% ================= Save Data
%save mat/wavDS.mat wavDS;
file_name = sprintf('mat/stressFea_%d.mat', fitOrder);
save(file_name ,'data');
save mat/dataSyl.mat dataSyl;


function tf = isVowelPhone(phoneName, vowelList)
% True when the part of phoneName before its first '+' is listed in vowelList.
% NOTE(review): a name without '+' yields an empty prefix, so such a phone is
% only matched if vowelList itself contains an empty entry.
plusPos = strfind(phoneName, '+');
monophone = phoneName(1:plusPos-1);
tf = sum(strcmp(monophone, vowelList)) > 0;


function coeff = contourCoeff(contour, fitOrder)
% Fit a polynomial of degree fitOrder to contour over [-1, 1] and pass the
% result through legendre_Approx.
n = length(contour);
if n < 2
    % Same abscissa the expression -1:2/(n-1):1 produces for n = 0 or 1.
    x = -ones(1, n);
else
    % linspace guarantees exactly n points including the endpoint, which a
    % colon range with a fractional step does not.
    x = linspace(-1, 1, n);
end
coeff = legendre_Approx(fitOrder, polyfit(x, contour, fitOrder));