#purpose: to test if some observed data can be said to come from a common distribution
#signature: chisq_test_distribution(observed, distribution, bins)
#arguments: observed is the observed data in vector. distribution is the octave distribution function name in string(poiss for Poisson, norm for normal etc). bins is a vector for grouping observations, use cell array {} when a bin is a range of values.
#descriptions: the test is based on "sum over bins i (Observed i - Expected i).^2 / Expected i ~ chisquare(bins number-1-df lost through estimation) "
#assumptions: the observed data are i.i.d.
#output: chi-square, p-value, degree of freedom in octave structure format (equivalent to dictionary)
#reference: Sheldon Ross's Introduction to Statistics for Engineers chapter 11
function output= chisq_test_distribution(observed, distribution, bins)
#only below distributions are accepted for now, will add gradually
distribution_supported = {"norm", "poiss"};
#checking arguments entered correctly and converting formats
assert(ismatrix(observed) && rows(observed) == 1, "observed data should be a 1xn matrix");
assert(ismatrix(bins) && rows(bins) == 1, "bins should be a 1xn matrix or cell array");
assert(ischar(distribution), "distribution name should be entered as string");
assert(any(strcmpi(distribution, distribution_supported)),"distribution entered isn't recognized or not yet supported");
if !iscell(bins) #if bins is a vector, convert it to a cell format
bins = num2cell(bins); #it could be a vector, just this function is coded to loop a cell
endif
#set some variables based on distribution entered, for use in calling its probability density function
switch (distribution)
case "norm"
pdf = "normpdf";
mu = mean(observed);
var = var(observed, 0); #compute variance with (n-1) in denominator
pdf_args = [mu, var];
case "poiss"
pdf = "poisspdf";
lambda = mean(observed);
pdf_args = [lambda];
otherwise
disp("something is fucked up");
return;
endswitch
#variables declaration
k = length(bins); #number of bin groups
pdf_args_number = length(pdf_args); #number of parameter estimated
df = k - 1 - pdf_args_number; #chi-square degree of freedom
n = length(observed); #number of observations
probability = [];
observations = [];
#looping through bins to count realised observations in each and to calculate probability under null_hypothesis in each
for i=1:k
inner_bins = bins{1, i}; #get this bin entry
inner_k = length(inner_bins); #number of values in this bin, more than 1 if a range
inner_probability = [];
inner_observations = [];
for j=1:inner_k
switch (pdf_args_number) #pdf function call differed by number of arguments to pass
case 1
inner_probability(1,j) = feval(pdf, inner_bins(1, j), pdf_args(1));
case 2
inner_probability(1,j) = feval(pdf, inner_bins(1, j), pdf_args(1), pdf_args(2));
case 3
inner_probability(1,j) = feval(pdf, inner_bins(1, j), pdf_args(1), pdf_args(2), pdf_args(3));
case 4
inner_probability(1,j) = feval(pdf, inner_bins(1, j), pdf_args(1), pdf_args(2), pdf_args(3), pdf_args(4));
otherwise
disp("Not aware of any distribution having more than 4 parameters.");
return;
endswitch
inner_observations(1, j) = histc(observed, inner_bins(1, j)); #count how many observations fall into this bin
endfor
probability(1, i) = sum(inner_probability, 2); #assign over the sum of all inner probabilities
observations(1, i) = sum(inner_observations, 2); #assign over the sum of all inner observation counts
endfor
#calculate the test statistics and return results
expected = n .* probability; #expected observations if this distribution hypothesis is true
chisq = sum((observations - expected).^2 ./ expected, 2); #Pearson chi-square statistics
pvalue = 1 - chi2cdf(chisq, df);
output = struct("actual", observations, "expected", expected, "chisq", chisq, "pvalue", pvalue, "df", df);
return;
endfunction