Question: I need a clear, detailed explanation of the following MATLAB code.
function outClass = knnclassify(sample, TRAIN, group, K, distance,rule,base)
% KNNCLASSIFY classifies each row of SAMPLE by a K-nearest-neighbor vote
% over the rows of TRAIN, whose class labels are given in GROUP.
%
%   sample   - m-by-d matrix of points to classify
%   TRAIN    - n-by-d matrix of labeled training points
%   group    - label for each row of TRAIN (numeric, logical, char matrix,
%              cell array of strings, or categorical)
%   K        - number of neighbors to consult (default 1)
%   distance - 'euclidean' (default), 'cityblock', 'cosine',
%              'correlation', or 'hamming'; unambiguous prefixes accepted
%   rule     - tie-breaking rule: 'nearest' (default), 'random',
%              'farthest', or 'consensus'
%   base     - path of a file to which the neighbor distances and indices
%              are written.  NOTE(review): BASE is used unconditionally in
%              the file-writing section below, so calling this function
%              with fewer than 7 arguments will fail at fopen — confirm
%              all callers pass BASE.
%
% Returns OUTCLASS, the predicted label for each row of SAMPLE, converted
% back to the same type as GROUP.
bioinfochecknargin(nargin,3,mfilename)
% Map arbitrary labels onto integer class indices 1..ngroups; GROUPS holds
% the label text for each index.
[gindex,groups] = grp2idx(group);
% Discard training rows whose label is missing (grp2idx returns NaN).
nans = find(isnan(gindex));
if ~isempty(nans)
TRAIN(nans,:) = [];
gindex(nans) = [];
end
ngroups = length(groups);
[n,d] = size(TRAIN);
% GROUP must label every training row, and SAMPLE must live in the same
% d-dimensional space as TRAIN.
if size(gindex,1) ~= n
error('Bioinfo:knnclassify:BadGroupLength',...
'The length of GROUP must equal the number of rows in TRAINING.');
elseif size(sample,2) ~= d
error('Bioinfo:knnclassify:SampleTrainingSizeMismatch',...
'SAMPLE and TRAINING must have the same number of columns.');
end
m = size(sample,1);
% --- Validate K: numeric scalar, >= 1, not NaN; default is 1. ---
if nargin < 4
K = 1;
elseif ~isnumeric(K)
error('Bioinfo:knnclassify:KNotNumeric',...
'K must be numeric.');
end
if ~isscalar(K)
error('Bioinfo:knnclassify:KNotScalar',...
'K must be a scalar.');
end
if K<1
error('Bioinfo:knnclassify:KLessThanOne',...
'K must be greater than or equal to 1.');
end
if isnan(K)
error('Bioinfo:knnclassify:KNaN',...
'K cannot be NaN.');
end
% --- Resolve the distance name, allowing unambiguous prefixes. ---
if nargin < 5 || isempty(distance)
distance = 'euclidean';
end
if ischar(distance)
distNames = {'euclidean','cityblock','cosine','correlation','hamming'};
i = find(strncmpi(distance, distNames,numel(distance)));
if length(i) > 1
error('Bioinfo:knnclassify:AmbiguousDistance', ...
'Ambiguous ''distance'' parameter value: %s.', distance);
elseif isempty(i)
error('Bioinfo:knnclassify:UnknownDistance', ...
'Unknown ''distance'' parameter value: %s.', distance);
end
distance = distNames{i};
else
error('Bioinfo:knnclassify:InvalidDistance', ...
'The ''distance'' parameter value must be a string.');
end
% --- Resolve the tie-breaking rule, also by prefix. ---
if nargin < 6
rule = 'nearest';
elseif ischar(rule)
% Accept the common misspelling 'concensus' by patching it to
% 'consensus' before the prefix match.
if strncmpi(rule,'conc',4)
rule(4) = 's';
end
ruleNames = {'random','nearest','farthest','consensus'};
i = find(strncmpi(rule, ruleNames,numel(rule)));
if isempty(i)
error('Bioinfo:knnclassify:UnknownRule', ...
'Unknown ''Rule'' parameter value: %s.', rule);
end
rule = ruleNames{i};
else
error('Bioinfo:knnclassify:InvalidRule', ...
'The ''rule'' parameter value must be a string.');
end
% Find, for each sample row, its K nearest training rows (distances
% ascending) and their indices into TRAIN.
[dSorted,dIndex] = distfun(sample,TRAIN,distance,K);
if K >1
% Convert neighbor indices to class indices; force a row vector when
% there is a single sample so the (outer,inner) indexing below works.
classes = gindex(dIndex);
if size(classes,2) == 1
classes = classes';
end
% counts(i,g) = number of the K neighbors of sample i in class g.
counts = zeros(m,ngroups);
for outer = 1:m
for inner = 1:K
counts(outer,classes(outer,inner)) = counts(outer,classes(outer,inner)) + 1;
end
end
% L is the winning vote count; outClass the winning class index
% (max breaks ties by taking the lowest class index).
[L,outClass] = max(counts,[],2);
if strcmp(rule,'consensus')
% Consensus requires all K votes to agree; otherwise assign a
% synthetic "no consensus" class ngroups+1.
noconsensus = (L~=K);
if any(noconsensus)
outClass(noconsensus) = ngroups+1;
if isnumeric(group) || islogical(group)
groups(end+1) = {'NaN'};
else
groups(end+1) = {''};
end
end
else % we need to check case where L <= K/2 for possible ties
checkRows = find(L<=(K/2));
for i = 1:numel(checkRows)
% Classes whose vote count equals the winner's are tied.
ties = counts(checkRows(i),:) == L(checkRows(i));
numTies = sum(ties);
if numTies > 1
choice = find(ties);
switch rule
case 'random'
% Pick one of the tied classes uniformly at random.
tb = randsample(numTies,1);
outClass(checkRows(i)) = choice(tb);
case 'nearest'
% Walk the neighbors nearest-first; the first one
% belonging to a tied class wins.
for inner = 1:K
if ismember(classes(checkRows(i),inner),choice)
outClass(checkRows(i)) = classes(checkRows(i),inner);
break
end
end
case 'farthest'
% Walk the neighbors farthest-first; the first one
% belonging to a tied class wins.
for inner = K:-1:1
if ismember(classes(checkRows(i),inner),choice)
outClass(checkRows(i)) = classes(checkRows(i),inner);
break
end
end
end
end
end
end
else
% K == 1: the single nearest neighbor decides directly.
outClass = gindex(dIndex);
end
% --- Convert class indices back to the caller's label type. ---
if isa(group,'categorical')
labels = getlabels(group);
if isa(group,'nominal')
groups = nominal(groups,[],labels);
else
groups = ordinal(groups,[],getlabels(group));
end
outClass = groups(outClass);
elseif isnumeric(group) || islogical(group)
% grp2idx stored numeric labels as strings; convert back.
groups = str2num(char(groups));
outClass = groups(outClass);
elseif ischar(group)
groups = char(groups);
outClass = groups(outClass,:);
else
outClass = groups(outClass);
end
% --- Persist the neighbor distances and indices to the file BASE. ---
% NOTE(review): num2str on a matrix yields a char matrix, and fprintf with
% '%s ' flattens it column-wise; the round-trip through dlmread and the
% second write then reformats the data tab-separated.  Confirm this is the
% intended file format — the vertcat(MM) call is a no-op on a single input.
fid = fopen(base, 'w');
fprintf(fid,'%s ', num2str(dSorted));
fprintf(fid,'%s ', num2str(dIndex));
fclose(fid);
MM = dlmread(base);
MN = vertcat(MM);
fid = fopen(base, 'w');
fprintf(fid,'%d\t%d ', MN);
fclose(fid);
function [dSorted,dIndex] = distfun(Sample, Train, dist,K)
% For every row of SAMPLE, find its K nearest rows of TRAIN under the
% distance measure DIST.
%   Sample  - m-by-d matrix of query points
%   Train   - n-by-d matrix of training points
%   dist    - 'euclidean','cityblock','cosine','correlation', or 'hamming'
%   K       - number of neighbors to keep per query row
% Returns dSorted (m-by-K distances, ascending per row) and dIndex
% (m-by-K row indices into Train).
numSample = size(Sample,1);
dSorted = zeros(numSample,K);
dIndex = zeros(numSample,K);
switch dist
case 'euclidean'
% Squared Euclidean distance: squaring preserves neighbor order.
for row = 1:numSample
diffs = bsxfun(@minus,Train,Sample(row,:));
[dSorted(row,:),dIndex(row,:)] = getBestK(sum(diffs.^2, 2),K);
end
case 'cityblock'
for row = 1:numSample
diffs = bsxfun(@minus,Train,Sample(row,:));
[dSorted(row,:),dIndex(row,:)] = getBestK(sum(abs(diffs), 2),K);
end
case {'cosine'}
normSample = sqrt(sum(Sample.^2, 2));
normTrain = sqrt(sum(Train.^2, 2));
% Near-zero rows make the cosine ill-defined; warn the caller.
if any(min(normTrain) <= eps(max(normTrain))) || any(min(normSample) <= eps(max(normSample)))
warning('Bioinfo:knnclassify:ConstantDataForCos', ...
['Some points have small relative magnitudes, making them ', ...
'effectively zero. Either remove those points, or choose a ', ...
'distance other than ''cosine''.']);
end
% Normalize the training rows once; each query row is normalized
% on use via normSample.
Train = bsxfun(@rdivide,Train,normTrain);
for row = 1:numSample
cosDist = 1 - (Train * Sample(row,:)') ./ normSample(row);
[dSorted(row,:),dIndex(row,:)] = getBestK(cosDist,K);
end
case {'correlation'}
% Center every row, then proceed exactly as for cosine distance.
Sample = bsxfun(@minus,Sample,mean(Sample,2));
Train = bsxfun(@minus,Train,mean(Train,2));
normSample = sqrt(sum(Sample.^2, 2));
normTrain = sqrt(sum(Train.^2, 2));
% Rows with near-zero spread have no meaningful correlation.
if any(min(normTrain) <= eps(max(normTrain))) || any(min(normSample) <= eps(max(normSample)))
warning('Bioinfo:knnclassify:ConstantDataForCorr', ...
['Some points have small relative standard deviations, making them ', ...
'effectively constant. Either remove those points, or choose a ', ...
'distance other than ''correlation''.']);
end
Train = bsxfun(@rdivide,Train,normTrain);
for row = 1:numSample
corrDist = 1 - (Train * Sample(row,:)') ./ normSample(row);
[dSorted(row,:),dIndex(row,:)] = getBestK(corrDist,K);
end
case 'hamming'
% Hamming distance is defined here only for 0/1 data.
if ~all(ismember(Sample(:),[0 1]))||~all(ismember(Train(:),[0 1]))
error('Bioinfo:knnclassify:HammingNonBinary',...
'Non-binary data cannot be classified using Hamming distance.');
end
p = size(Sample,2);
for row = 1:numSample
hamDist = sum(abs(bsxfun(@minus,Train,Sample(row,:))), 2) / p;
[dSorted(row,:),dIndex(row,:)] = getBestK(hamDist,K);
end
end
function [sorted,index] = getBestK(Dk,K)
% Return the K smallest entries of the distance vector Dk, ascending,
% together with their positions in Dk.
%   Dk     - column vector of distances to every training point
%   K      - number of neighbors requested
if K>1
% Full sort, then keep the leading K entries.
[allSorted,allIndex] = sort(Dk);
sorted = allSorted(1:K);
index = allIndex(1:K);
else
% Single neighbor: min avoids the cost of a full sort.
[sorted,index] = min(Dk);
end
Step-by-Step Solution
There are 3 steps involved.
Get step-by-step solutions from verified subject-matter experts.
