unit MtxParseStatistics;
interface
(*Introduces routines for several linearizable regression models.
Introduces several ready-to-be-used linearizable regression models:
- Power model :
- simple linear model :
- simple exponential model :
- multiple linear model :
- Rational function :
- Generalized logistic function :
- Natural logarithm model:
If your data cannot be described with a linearizable equation, you should use the
component or routine to
retrieve regression coefficients from the fitted equation.
*)
unit RegModels;
{$I bdsppdefs.inc}
interface
uses MtxVec, Math387, Regress
,SysUtils
,Types
;
(*Identifies the available linearizable regression models.*)
Type TRegressionModel = (
(*Linear function Y = B[0]+B[1]*X.*)rmLine,
(*Power function Y = B[0]*X^B[1] (the form evaluated by PowerEval).*)rmPower,
(*Polynomial function.*)rmPoly,
(*Simple exponential function Y = B[0]*Exp(B[1]*X).*)rmExp,
(*Multiple linear function Y = B[0] + B[1]*X1 + B[2]*X2 + ...*)rmMulLi,
(*Generalized logistic function: Y = B[0] + (B[1]-B[0])/(1+Exp(-B[2]*x + B[3])).*)rmLogistic,
(*Rational function: Y = (B[0] + B[1]*x + ... + B[n]*x^n)/(1 + B[n+1]*x + B[n+2]*x^2 + ... + B[n+d]*x^d), where n and d are the numerator and denominator degrees.*)rmFrac,
(*Natural logarithm function: y = B[0] + B[1]*Ln(X).*)rmLn
);
(*Evaluates power function.
b[0]*X^b[1] for given value X and parameters in B array.
Evaluate power function for single x.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
y = RegModels.PowerEval(new double[] {-5,3.5},2.0);
}
}
*)
function PowerEval(Const B: Array of double; X: double): double; overload;
(*Evaluates b[0]*X^b[1] for given values in X vector, parameters in B array,
and returns the results in XHat vector.
Size and Complex properties of XHat vector are adjusted automatically.
Note
Use this version if you want to evaluate power function for multiple values
at the same time. This is a lot faster than calling single value version for each x value.
Evaluate power function for multiple values at the same time.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector X = new Vector(100,false);
Vector Y = new Vector(0);
X.Ramp(-5.0, 0.1);
RegModels.PowerEval(new double[] {1.0, 3.0}, X, Y);
}
}
*)
procedure PowerEval(Const B: Array of double; const X, YHat: TVec); overload;
(*Evaluate power function model (vectorized version).*)
procedure PowerEval(const B, X, YHat: TVec); overload;
(*Derivatives of power function.
Calculates the derivatives of the power function at point (X,Y)
with respect to the regression parameters Pars. The results are returned in
Grad vector. Grad.Values[I] contains the derivative with respect to the I-th
parameter. The Length and Complex properties of the Grad vector are adjusted
automatically.
*)
procedure PowerDeriv(RegressFun: TRegressFun; X,Y : double; Const Pars: Array of double; const Grad: TVec);
(*Fits the power equation to data.
Vector of independent variable.
Vector of dependent variable.
Weights (optional). Weights are used only if they are set.
Returns regression coefficients for power function.
The routine fits equations to data by minimizing the sum of squared residuals.
The observed values obey the following equation:
In the following example we generate some data. Then we fit the
power function to this data and retrieve its regression coefficients.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example(Steema.TeeChart.Styles.Point series1,
Steema.TeeChart.Styles.Point series2)
{
Vector X = new Vector(100,false);
Vector Y = new Vector(0);
Vector YHat = new Vector(0);
Vector B = new Vector(0);
X.Ramp(-5.0, 0.1); // x= -5.0, -4.9, ..., +4.9
Y.Size(X);
Y.RandGauss(3.5,0.1); // sample data
RegModels.PowerFit(B,X,Y,null); // calculate coefficients
// evaluate y
RegModels.PowerEval(B, X, YHat);
MtxVecTee.DrawValues(X,Y,series1,false); // original data
MtxVecTee.DrawValues(X,YHat,series2,false); // fitted data
}
}
*)
procedure PowerFit(const B: TVec; const X,Y: TVec; const Weights: TVec = nil);
(*Evaluates linear function.
b[0] + b[1]*X for given value X and parameters in B array.
Evaluate linear function for single x.
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
RegModels.LineEval(new double[] {1.0, 3.0}, 2.5);
}
}
*)
function LineEval(Const B: Array of double; X: double): double; overload;
(*Evaluates b[0] + b[1]*X for given values in X vector, parameters in B array,
and returns the results in XHat vector.
Size and Complex properties of XHat vector are adjusted automatically.
Note
Use this version if you want to evaluate linear function for multiple values
at the same time. This is a lot faster than calling single value version for each x value.
Evaluate linear function for multiple values at the same time.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector X = new Vector(100,false);
Vector Y = new Vector(0);
X.Ramp(-5.0, 0.1);
RegModels.LineEval(new double[] {1.0, 3.0}, X, Y);
}
}
*)
procedure LineEval(Const B: Array of double; const X, YHat: TVec); overload;
(*Evaluates linear function (fully vectorized version).*)
procedure LineEval(const B, X, YHat: TVec); overload;
(*Derivatives of linear function.
Calculates the derivatives of linear function at point (X,Y)
with respect to the regression parameters Pars. The results are returned in
Grad vector. Grad.Values[I] contains the derivative with respect to the I-th
parameter. The Length and Complex properties of the Grad vector are adjusted
automatically.
*)
procedure LineDeriv(RegressFun: TRegressFun; X,Y : double; Const Pars: Array of double; const Grad: TVec; const fp: TMtxFloatPrecision);
(*Fits linear equation to data.
Vector of independent variable.
Vector of dependent variable.
Weights (optional). Weights are used only if they are set.
Returns regression coefficients for linear function.
The routine fits equations to data by minimizing the sum of squared residuals.
The observed values obey the following equation:
In the following example we generate some data. Then we fit a
simple linear function to this data and retrieve its regression
coefficients.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example(Steema.TeeChart.Styles.Line line1,
Steema.TeeChart.Styles.Line line2)
{
Vector X = new Vector(100,false);
Vector Y = new Vector(100,false);
Vector YHat = new Vector(0);
Vector B = new Vector(0);
X.Ramp(-5.0, 0.1); // x= -5, -4.95, ...-0.05
Y.RandGauss(3.5, 0.12); // sample data
RegModels.LineFit(B,X,Y,null); // calculate coefficients
// evaluate y by using calculated coefficients
LineEval(B,X, YHat);
MtxVecTee.DrawValues(X,Y,line1,false); // draw original data
MtxVecTee.DrawValues(X,YHat,line2,false); // draw fitted data
}
}
*)
procedure LineFit(const B: TVec; const X,Y: TVec; const Weights: TVec = nil);
(*Evaluates exponential function.
b[0]*Exp(b[1]*X) for given value X and parameters in B array.
Evaluate exponential function for single x.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
double y = RegModels.ExpEval(new double[] {3.0, -0.5}, 1.5);
// y = 3*Exp(-0.5*1.5) = 1.4170996582
}
}
*)
function ExpEval(Const B: Array of double; X: double): double; overload;
(*Evaluates b[0]*Exp(b[1]*X) for given values in X vector, parameters in B array,
and returns the results in XHat vector.
Size and Complex properties of XHat vector are adjusted automatically.
Note
Use this version if you want to evaluate exponential function for multiple values
at the same time. This is a lot faster than calling single value version for each x value.
Evaluate exponential function for multiple values at the same time.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example(Steema.TeeChart.Styles.Line line1)
{
Vector X = new Vector(100);
Vector Y = new Vector(0);
X.Ramp(-5.0, 0.05); // x= -5.0, -4.95, ... -0.05
// Y = b[0] + b[1]*X
RegModels.ExpEval(new double[] {1,3},X,Y);
MtxVecTee.DrawValues(X,Y,line1,false);
}
}
*)
procedure ExpEval(Const B: Array of double; const X, YHat: TVec) ; overload;
(*Evaluates exponential function (fully vectorized version).*)
procedure ExpEval(const B, X, YHat: TVec); overload;
(*Derivatives of simple exponential function.
Calculates the derivatives of the simple exponential function at point (X,Y)
with respect to the regression parameters Pars. The results are returned in
Grad vector. Grad.Values[I] contains the derivative with respect to the I-th
parameter. The Length and Complex properties of the Grad vector are adjusted
automatically.
*)
procedure ExpDeriv(RegressFun: TRegressFun; X,Y : double; Const Pars: Array of double; const Grad: TVec; const fp: TMtxFloatPrecision);
(*Fits simple exponential equation to data.
Vector of independent variable.
Vector of dependent variable.
Weights (optional). Weights are used only if they are set.
Returns regression coefficients for simple exponent function.
The routine fits equations to data by minimizing the sum of squared residuals.
The observed values obey the following equation:
In the following example we generate some data. Then we fit a
simple exponential function to this data and retrieve its regression coefficients.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example(Steema.TeeChart.Styles.Line line1,
Steema.TeeChart.Styles.Line line2)
{
Vector X = new Vector(100);
Vector Y = new Vector(100);
Vector B = new Vector(0);
Vector YHat = new Vector(0);
X.Ramp(-5.0, 0.05); // x= -5.0, -4.95, ... -0.05
Y.RandGauss(3.5, 0.12); // sample data
// calculate coefficients
RegModels.ExpFit(B,X,Y,null);
// evaluate y by using calculated coefficients
RegModels.ExpEval(B,X, YHat);
MtxVecTee.DrawValues(X,Y,line1,false);
MtxVecTee.DrawValues(X,YHat,line2,false);
}
}
*)
procedure ExpFit(const B: TVec; const X,Y: TVec; const Weights: TVec = nil);
(*Evaluates multiple linear function.
b[0] + b[1]*x[0] + b[2]*x[1] + ... for given value X and parameters in B array.*)
function MulLinEval(Const B: Array of double; const X: TVec; Constant: boolean = false): double; overload;
(*Evaluates b[0] + b[1]*x[0] + b[2]*x[1] + ... for given values in X vector, parameters in B array,
and returns the results in XHat vector.
Size and Complex properties of XHat vector are adjusted automatically.
Note
Use this version if you want to evaluate multiple linear function for multiple values
at the same time. This is a lot faster than calling single value version for each x value.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Matrix X = new Matrix(0,0);
Vector YHat = new Vector(0);
X.SetIt(3,3,false,new double[]{2, -3, 5,
1, 6, -4,
8, 7, 9});
// Evaluate, no constant term!
RegModels.MulLinEval(new double[] {1.0, 0.5, -2.0},X,YHat, false);
// YHat = (-9.5, 12, -6.5)
}
}
*)
procedure MulLinEval(Const B: Array of double; const X: TMtx; const YHat: TVec; Constant: boolean = false); overload;
(*Fits multiple linear equations to data.
Vector of independent variable.
Vector of dependent variable.
If true then intercept term b(0) will be included in calculations. If false, set intercept term b(0) to 0.0.
Weights (optional). Weights are used only if they are set.
Returns regression coefficients for multiple linear function.
The routine fits equations to data by minimizing the sum of squared residuals.
The observed values obey the following equation:
where X is matrix, Y, B are vectors.
In the following example we generate some data. Then we fit a
multiple linear function to this data and retrieve its regression coefficients.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Matrix X = new Matrix(0,0);
Vector Y = new Vector(0);
Vector B = new Vector(0);
X.SetIt(3,2,false,new double[]{1.0, 2.0,
-3.2, 2.5,
8.0, -0.5});
Y.SetIt(3,new double[] {-3.0, 0.25, 9.0});
RegModels.MulLinFit(B,X,Y,true,null);
// B = (18.646428571, -1.9464285714, -9.85 )
}
}
*)
procedure MulLinFit(const B: TVec; const X: TMtx; const Y: TVec; Constant: boolean = false; const Weights: TVec = nil);
(*Evaluates rational fraction model.*)
function FracEval(Const B: array of double; X: double; DegNom: Integer; Constant: boolean = false): double; overload;
function FracEval(Const B: TVec; X: double; DegNom: Integer; Constant: boolean): double; overload;
(*Evaluates rational fraction model (parameters are not vectorized).
In this case only the independent and dependent variables are vectorized.
*)
procedure FracEval(Const B: Array of double; const X: TVec; const YHat: TVec; DegNom: Integer; Constant: boolean = false); overload;
(*Evaluate rational fraction (fully vectorized version).
In this case all parameter are vectors.
*)
procedure FracEval(const B, X, YHat: TVec; DegNom: Integer; Constant: boolean = false); overload;
(*Fits rational fraction equation to data.
Vector of independent variable.
Vector of dependent variable.
Numerator degree.
Denominator degree.
If false, B[0] i.e. the constant term in the numerator is set to 0.0.
Weights (optional). Weights are used only if they are set.
Returns regression coefficients for rational function.
The routine fits equations to data by minimizing the sum of squared residuals.
The observed values obey the following equation:
where n and d are the numerator and denominator polynomial degrees.
In the following example we generate some data. Then we fit a
rational function to this data and retrieve its regression coefficients.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example(Steema.TeeChart.Styles.Line line1, Steema.TeeChart.Styles.Line line2)
{
Vector X = new Vector(100);
Vector Y = new Vector(100);
Vector B = new Vector(0);
Vector YHat = new Vector(0);
X.Ramp(-5.0, 0.05); // x= -5.0, -4.95, ... -0.05
Y.RandGauss(3.5, 0.12); // sample data
// calculate coefficients
RegModels.FracFit(B,X,Y,2,4,false,null);
// evaluate y by using calculated coefficients
RegModels.FracEval(B,X, YHat,2,false);
MtxVecTee.DrawValues(X,Y,line1,false);
MtxVecTee.DrawValues(X,YHat,line2,false);
}
}
*)
procedure FracFit(const B: TVec; const X,Y: TVec; DegNom,DegDenom: Integer; Constant: boolean = false; const Weights: TVec = nil);
(*Evaluate logistic model.
The observed values obey the following equation:
*)
function LogisticEval(Const B: Array of double; X: double; Constant: boolean): double; overload;
(*Evaluate logistic model (partially vectorized).*)
procedure LogisticEval(Const B: Array of double; const X,YHat: TVec; Constant: boolean); overload;
(*Evaluate logistic function (fully vectorized).
*)
procedure LogisticEval(const B,X,YHat: TVec; Constant: boolean); overload;
(*Fits logistic equation to data.
Vector of independent variable.
Vector of dependent variable.
If false, B[0] i.e. the constant term in the numerator is set to 0.0.
Weights (optional). Weights are used only if they are set.
Returns regression coefficients for logistic function.
The routine fits equations to data by minimizing the sum of squared residuals.
The observed values obey the following equation:
*)
procedure LogisticFit(const B: TVec; const X,Y: TVec; Constant: boolean = false; const Weights: TVec = nil);
(*Evaluate the logarithm y(x)=b[0] + b[1]*ln(x) model.
The observed values obey the following equation:
*)
function LnEval(Const B: Array of double; X: double): double; overload;
(*Evaluate the logarithm model (partially vectorized).*)
procedure LnEval(Const B: Array of double; const X, YHat: TVec); overload;
(*Evaluate logarithm function (fully vectorized).
*)
procedure LnEval(const B, X,YHat: TVec); overload;
(*Fits simple logarithm equation y(x)=b[0] + b[1]*ln(x) to data.
Vector of independent variable.
Vector of dependent variable.
Weights (optional). Weights are used only if they are set.
Returns regression coefficients for natural logarithm function.
The routine fits equations to data by minimizing the sum of squared residuals.
The observed values obey the following equation:
In the following example we generate some data. Then we fit the
natural logarithm function to this data and retrieve its regression coefficients.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example(Steema.TeeChart.Styles.Line line1,
Steema.TeeChart.Styles.Line line2)
{
Vector X = new Vector(100);
Vector Y = new Vector(100);
Vector B = new Vector(0);
Vector YHat = new Vector(0);
X.Ramp(0.5, 0.05); // x= 0.5, 0.55, ...
Y.RandGauss(3.5, 0.12); // sample data
// calculate coefficients
RegModels.LnFit(B,X,Y,null);
// evaluate y by using calculated coefficients
RegModels.LnEval(B,X, YHat);
MtxVecTee.DrawValues(X,Y,line1,false);
MtxVecTee.DrawValues(X,YHat,line2,false);
}
}
*)
procedure LnFit(const B: TVec; const X,Y: TVec; const Weights: TVec = nil);
(*Regression routines.
Unit introduces several regression routines:
- Simple Linear Regression,
- Multiple Linear Regression,
- Ridge Regression,
- Logistic Regression,
- Poisson regression,
- Principal Component Regression
- Analysis of Variance routines.
*)
unit Regress;
{$I bdsppdefs.inc}
interface
uses MtxVec, MtxVecInt, Math387, Statistics, Optimization, MtxIntDiff
,SysUtils, Classes
;
type
(*Stepwise regression optimization algorithm.*)
TStepwiseMethod = (
(*Additional variables will be included into the existing model, if the quality criterion is improved.*)
swForward,
(*Variables will be excluded from the existing model, if the quality criterion is improved.*)
swBackward,
(*All variable combinations will be computed and the set with the best result (according to the criterion) will be selected.
This method allows a maximum of 15 variables. A variable count of more than 15 can quickly result in unreasonably large memory
and CPU requirements, which increase by 2x for each additional variable.*)
swExhaustive,
(*Perform a single regression step, show the result with statistics and allow the user to modify the list of included variables
before the next run.*)
swStepwise
);
(*Defines a regression function.
The type describes a regression function of several regression
parameters (B array) and a single independent variable X. TRegressFun
is used by several regression procedures. For higher (vectorized)
performance see TMultiRegressFun.
"Eckerle4" function from the NIST files:
private double Eckerle4(TVec B, double x)
{
double sqrterm = ((x-B[2])/B[1])*((x-B[2])/B[1]);
return B[0]/B[1] * Math.Exp(-0.5*(sqrterm));
}
*)
TRegressFun = function(const B: TVec; X: double): double;
(*Defines a multiple (vectorized) regression function.
The type describes the vectorized regression function of several regression
parameters (B array) and multiple independent variables X. TMultiRegressFun
is used by several regression procedures. The X parameter holds a list of independent variables
and the result of the function is stored into the Y parameter. The const modifier of "Y" applies to
the pointer and not to the contents of the object. When performance is not an issue, TRegressFun can be easier to use.
"Eckerle4" function from the NIST files:
private void Eckerle4(TVec B, TVecList x, TVec y)
{
//B[0]/B[1] * Exp(-0.5*Sqr((x-B[2])/B[1]));
y.Normalize(x[0], B[2], B[1]);
y.Sqr();
y.Scale(-0.5);
y.Exp();
y.Scale(B[0]/B[1]);
}
*)
TMultiRegressFun = procedure(const B: TVec; const X: TVecList; const Y: TVec);
(*Defines the (vectorized) derivatives of a regression function.
The type describes the procedure for calculating the derivatives
of a regression function TMultiRegressFun with respect to the regression
parameters (B array), evaluated at (X,Y). The procedure accepts all pairs of (x,y) values at which
the function is to be evaluated, allowing for vectorized (up to 20x faster) computation. Each independent
variable X is stored as a separate vector in the list x. The result is returned in the
Grad parameter. The "const" modifier of "Grad" applies to the pointer and not to the contents of the object.
An example of a regression function and its derivatives with respect to the regression parameters:
private void SimplexParabolaDeriv(Dew.Stats.TMultiRegressFun RegressFun, TVecList x, TVec y, TVec Pars, TVecList Grad);
{
Grad[0].Sqr(x[0]); // Grad.Values[0] = x*x;
Grad[1].Copy(x[0]); // Grad.Values[1] = x;
Grad[2].Size(x[0]);
Grad[2].SetVal(1); // Grad.Values[2] = 1.0;
}
*)
TMultiDeriveProc = procedure(const RegressFun: TMultiRegressFun; const X: TVecList; const Y: TVec; const Pars: TVec;const Grad: TVecList);
(*Defines the derivatives of a regression function (scalar version).
The type describes the procedure for calculating the derivatives
of a regression function TRegressFun with respect to the regression
parameters (B array), evaluated at (X,Y).
An example of a regression function and its derivatives with respect
to the regression parameters:
private void SimplexParabolaDeriv(Dew.Stats.TRegressFun RegressFun, double x, double y,
TVec Pars, TVec Grad);
{
Grad.Values[0] = x*x;
Grad.Values[1] = x;
Grad.Values[2] = 1.0;
}
*)
TDeriveProc=procedure(RegressFun: TRegressFun; const X, Y: double; const Pars, Grad: TVec);
(*Defines F statistics parameters for the regression test.*)
TFStats = packed record
(* Residual (error) sum of squares. *)
SSE: double;
(* Regression sum of squares. *)
SSR: double;
(* dFE = Number_Of_Observations - Number_Of_Parameters *)
dFE: Integer;
(* dFR = Number_Of_Parameters - 1 *)
dFR: Integer;
(* Value of the F statistic. *)
F: double;
(* Significance probability of the F statistic. *)
Signif: double;
end;
(*Defines one-way ANOVA statistics parameters.
This structured type defines the following statistical parameters:
* SS1 Sum of squares (source of variation between groups)
* SS2 Sum of squares (source of variation within groups)
* SSTotal Sum of squares (total)
* Deg1 Degrees of freedom (source of variation between groups)
* Deg2 Degrees of freedom (source of variation within groups)
* DegTotal Degrees of freedom (total)
* MS1 Mean squares (source of variation between groups), where MS1 = SS1/Deg1
* MS2 Mean squares (source of variation within groups), where MS2 = SS2/Deg2
* FDist F statistic (the ratio of MS1 and MS2)
* FCrit Critical F value
*)
TANOVA1Result = packed record
SS1,SS2, SSTotal : double;
Deg1,Deg2,DegTotal : double;
MS1,MS2 : double;
FDist,FCrit : double;
end;
(*Defines two-way ANOVA (with or without replications) statistics parameters.
This structured type describes the following statistical parameters:
* SS1 Sum of squares (source of variation rows)
* SS2 Sum of squares (source of variation columns)
* SS3 Sum of squares (source of variation interactions between rows and columns). The SS3 term is
only valid for two-way ANOVA with replications.
* SS4 Sum of squares (source of variation errors)
* SSTotal Sum of squares (total)
* Deg1 Degrees of freedom (rows)
* Deg2 Degrees of freedom (columns)
* Deg3 Degrees of freedom (interactions). The Deg3 term is only valid for two-way ANOVA with
replications.
* Deg4 Degrees of freedom (errors)
* DegTotal Degrees of freedom (total)
* MS1 Mean squares (rows)
* MS2 Mean squares (columns)
* MS3 Mean squares (interactions). The MS3 term is only valid for two-way ANOVA with replications.
* MS4 Mean squares (errors)
* FDist1 F statistic (rows)
* FDist2 F statistic (columns)
* FDist3 F statistic (interactions). The FDist3 term is only valid for two-way ANOVA with replications.
* FCrit1 Critical F value (rows)
* FCrit2 Critical F value (columns)
* FCrit3 Critical F value (interactions). The FCrit3 term is only valid if you're performing two-way ANOVA with
replications.
*)
TANOVA2Result = packed record
SS1,SS2,SS3,SS4,SSTotal : double;
Deg1,Deg2,Deg3,Deg4,DegTotal : double;
MS1,MS2,MS3,MS4 : double;
FDist1,FDist2,FDist3,
FCrit1,FCrit2,FCrit3 : double;
end;
(*Regression statistical parameters.
This structured type describes the following regression statistical parameters:
* SSE - Residual sum of squares.
* SST - Total sum of squares, SST = SSE + SSR.
* SSR - Regression sum of squares.
* ResidualVar - Residual variance (aka MSE).
* R2 - Coefficient of determination.
* AdjustedR2 - Adjusted coefficient of determination.
* StdError = sqrt(SSE/dFE) = sqrt(MSE) = sqrt(ResidualVar)
* dFE = Number_Of_Observations - Number_Of_Parameters
* dFT = Number_Of_Observations - 1
* FStats - Additional F-statistics parameters.
*)
TRegStats = packed record
SSE,
SST,
SSR,
ResidualVar,
R2,
AdjustedR2: double;
dfE,
dfT: Integer;
StdError: double;
FStats: TFStats;
end;
(*Callback function for a custom quality criterion of the stepwise regression optimization algorithm.
The Stat parameter contains the details of the regression results, b contains the coefficients and t the corresponding t-values for each item in b.
ParamMask.Length equals the initial total variable count; b and t can and will have a length less than this.
Which variables are included is determined from ParamMask, which has 1 at the corresponding included variable index and 0
otherwise. aOwner is a custom optional object and will be nil if not specified as a parameter to the optimization.
*)
TStepwiseQualityCriteria = function(const Stat: TRegStats; const b, t: TVec; const ParamMask: TVecInt; const aOwner: TObject): double;
(*Defines the regression equation solve method.
Defines the method used for solving the regression equation A*x=b. Depending on A,
several different solving methods can be used.
*)
TRegSolveMethod = (
(*Use QR decomposition to solve the system of equations.*)regSolveLQR,
(*Use SVD to solve the system of equations.*)regSolveSVD,
(*Use LU decomposition to solve the system of equations.*)regSolveLU
);
(*Performs a one-way (single-factor) analysis of variance (ANOVA).
Data matrix, each column is separate group.
Desired significance level.
Returns the significance probability of null hypothesis that two means are equal.
If p is less than desired significance alpha then the result suggests the null hypothesis
(two means are equal) can be rejected.
Performs a one-way (single-factor) analysis of variance.
The following example will perform ANOVA1 on 4 variables.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Matrix Data = new Matrix(0,0);
double signprob;
TANOVA1Result ar;
Data.SetIt(4,4,false, new double[] {3.2, 2.5, 5.5, 7.0,
6.2, 12.0, 3.0, 4.1,
1.5, 5.7, 4.9, 3.5,
2.5, 3.1, 3.2, 3.4});
ar = Regress.ANOVA1(Data,out signprob,0.05);
// ar = (SS1:12.771875; SS2:82.1475; SSTotal:94.919375;
// Deg1:3; Deg2:12; DegTotal:15; MS1:4.2572916667;
// MS2:6.845625; FDist:0.62189963176; FCrit:3.4902948195
// signprob = 0.614256442
}
}
*)
function ANOVA1(const Data: TMtx; out p: double; Alpha: double = 0.05): TANOVA1Result;overload;
(*Perform the one-way analysis of variance where Data vector (one variable) is indexed by a
second grouping GroupIndex vector variable.
Data vector of single variable.
Desired significance level.
Stores group indexes.
Returns the significance probability of null hypothesis that two means are equal.
If p is less than desired significance alpha then the result suggests the null hypothesis
(two means are equal) can be rejected.
The following example will perform ANOVA on single variable with three
groups (0,1,2 - stored in GroupIndex vector).
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector Data = new Vector(0);
Vector GroupIndex = new Vector(0);
double signprob;
TANOVA1Result ar;
Data.SetIt(false, new double[] {1,2,3,4,5,6,7,8,9,10,11,12});
GroupIndex.SetIt(false, new double[] {0,0,0,0,1,1,1,1,2,2,2,2});
ar = Regress.ANOVA1(Data,GroupIndex,out signprob,0.05);
// ANOVARes = (SS1:128; SS2:15; SSTotal:143;
// Deg1:2; Deg2:9; DegTotal:11; MS1:64;
// MS2:1.6666666667; FDist:38.4; FCrit:4.2564947291)
// signprob = 3.9210149408e-05
}
}
*)
function ANOVA1(const Data: TVec; const GroupIndex: TVecInt; out p: double; Alpha: double = 0.05): TANOVA1Result;overload;
(*Performs balanced two-way analysis of variance (ANOVA).
Stores the data to be analyzed.
Defines the number of rows/cols replications.
Desired significance level.
Return the significance probability for the null hypothesis that rows, cols or interacted
terms means are equal. These values have some meaning if Replications is more than 1.
If any p is less than desired significance alpha then the result suggests the null hypothesis (rows mean, columns
mean or interaction mean is not equal) can be rejected.
Return the significance probability for the null hypothesis that rows, cols or interacted
terms means are equal. These values have some meaning if Replications is more than 1.
If any p is less than desired significance alpha then the result suggests the null hypothesis (rows mean, columns
mean or interaction mean is not equal) can be rejected.
Return the significance probability for the null hypothesis that rows, cols or interacted
terms means are equal. These values have some meaning if Replications is more than 1.
If any p is less than desired significance alpha then the result suggests the null hypothesis (rows mean, columns
mean or interaction mean is not equal) can be rejected.
Performs balanced two-way analysis of variance on ranks of data contained in Data matrix. The two-way analysis
of variance compares the means of two or more rows and two or more columns. The data in different columns
can be interpreted as change of one factor and data in different rows can be interpreted as changes in other factor.
If there is more than one observation per row/column then you can set the number of row/column replications by changing
Replications value to appropriate factor. An exception will be raised if (Data.Rows mod Replications) is not zero.
Example layout for replication factor 2:
Group1 Group2 Group3 Group4
Trial1 xx xx xx xx
xx xx xx xx
xx xx xx xx
Trial2 xx xx xx xx
xx xx xx xx
xx xx xx xx
Both trials have 3 samples and data is in 4 groups.
This example shows the ANOVA on data with Replications set to 2
meaning there are two rows/cols per "cell".
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
double prows, pcols, pinteract;
Matrix Data = new Matrix(0, 0);
Data.SetIt(4, 4, false, new double[] {2.5, 3.2, 4.2, 3.9,
1.9, 3.5, 3.6, 3.7,
3.4, 3.5, 3.3, 3.4,
4.2, 5.0, 3.1, 2.4});
TANOVA2Result ar = Regress.ANOVA2(Data, out prows, out pcols, out pinteract, 2, 0.05);
// ar =(SS1:0.2025; SS2:1.37; SS3:4.4675; SS4:2.39; SSTotal:8.43 ;
// Deg1: 1; Deg2:3 ; Deg3:3; Deg4:8 ; DegTotal:15 ;
// MS1:0.2025 ; MS2:0.45666666667 ; MS3:1.4891666667 ; MS4:0.29875 ;
// FDist1:0.060496875 ; FDist2:0.13642916667 ; FDist3:0.44488854167 ;
// FCrit1:5.3176550716 ; FCrit2:4.0661805514 ; FCrit3:4.0661805514 ;
// pRows: 0.81190529881
// pCols: 0.93549639694
// pInteract: 0.72753486979
}
}
*)
function ANOVA2(const Data: TMtx; out pRows,pCols,pInt: double; Replications: Integer = 1; Alpha: double = 0.05): TANOVA2Result;
(*Multivariate linear regression.*)
procedure MulLinRegress(const Y: TVec; const A: TMtx; const b: TVec; Constant: boolean = true; const YCalc: TVec = nil; const ATA: TMtx = nil; Method: TRegSolveMethod = regSolveLQR); overload;
(*Multivariate linear regression.
Defines vector of dependent variable.
Defines matrix of independent (also X) variables.
Returns calculated regression coefficients.
Use QR, SVD, or LU solver. Typically QR will yield best compromise between stability and performance.
Defines weights (optional).
If true then intercept term b(0) will be included in calculations. If false, set intercept term b(0) to 0.0.
Returns vector of calculated dependent variable, where YCalc = A*b.
Returns inverse matrix of normal equations i.e [A(T)*A]^-1.
Routine fits equations to data by minimizing the sum of squared residuals:
SS = Sum [y(k) - ycalc(k)]^2 ,
where y(k) and ycalc(k) are respectively the observed and calculated value of the dependent variable for observation k.
ycalc(k) is a function of the regression parameters b(0), b(1) ... Here the observed values obey the following equation:
y(k) = b(0) + b(1) * x(1,k) + b(2) * x(2,k) + ...
i.e
y = A * b.
To calculate additional regression statistical values, use routine.
The following example performs multiple linear regression.
using Dew.Math;
using Dew.Stats.Units;
using Dew.Stats;
namespace Dew.Examples
{
private void Example()
{
Matrix A = new Matrix(0, 0);
Matrix ATA = new Matrix(0, 0);
Vector y = new Vector(0);
Vector b = new Vector(0);
Vector w = new Vector(0);
Vector yhat = new Vector(0);
Vector residuals = new Vector(0);
Vector BStdDev = new Vector(0);
TRegStats rs;
// independent variables
A.SetIt(4, 2, false, new double[] {1.0, 2.0,
-3.2, 2.5,
8.0, -0.5,
-2.2, 1.8});
w.SetIt(false, new double[] { 1, 2, 2, 1 }); // weights
y.SetIt(false, new double[] { -3.0, 0.25, 8.0, 5.5 }); // dependent variables
Regress.MulLinRegress(y, A, b, w, true, yhat, ATA, TRegSolveMethod.regSolveLQR); //do regression
// b=(19.093757944, -2.0141843616, -10.082487055)
Regress.RegressTest(y, yhat, ATA, out rs, residuals, BStdDev, true, w); // do basic regression stats
// RegStat = (ResidualVar:0.037230395108; R2:0.99965713428;
// AdjustedR2:0.99897140285; F:1457.7968725; SignifProb: 0.01851663347)
}
}
*)
procedure MulLinRegress(const Y: TVec; const A: TMtx; const b: TVec; const Weights: TVec; Constant: boolean = true; const YCalc: TVec = nil; const ATA: TMtx = nil;Method: TRegSolveMethod = regSolveLQR );overload;
(*Logistic regression.
response vector containing binomial counts.
number of trials for each count. Y is assumed to be binomial(p,N).
matrix of covariates, including the constant vector if required.
offset if required.
regression parameter estimates.
fitted values.
Regression parameter estimates errors.This is an estimate of the precision
of the B estimates.
Default precision for reweighted LQR.
Fit logistic regression model.
The following example calculates coefficients for simple logistic regression.
The counts are out of 10 in each case and there is one covariate[1].
using Dew.Math;
using Dew.Stats;
using Dew.Math.Units;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector y = new Vector(0);
Vector n = new Vector(0);
Vector B = new Vector(0);
y.SetIt(false, new double[] { 2, 0, 3, 1, 5, 5, 6, 9, 5, 9 });
n.Size(y);
n.SetVal(10.0);
Regress.LogisticRegress(y, n, B, null,0,null,null, Math387.EPS);
}
}
*)
procedure LogisticRegress(const y, n: TVec; const b: TVec; const A: TMtx = nil; Offset: double = 0.0; const YCalc: TVec = nil; const BStd: TVec = nil; Tolerance: double = EPS);overload;
(*Ordinal logistic regression.
Response levels.
Matrix of independent variables. It is assumed to have full column rank.
Set it to define initial estimates for B. After the call to LogisticRegress
returns regression parameter estimates for B.
Set it to define initial estimates for Theta. After the call to LogisticRegress
returns regression parameter estimates for Theta.
Returns logistic log-likelihood function, evaluated at minimum.
Returns why the internal Marquardt optimization method stopped.
Maximum number of allowed iterations in main optimisation loop.
Desired tolerance for optimisation minimum.
Returns Theta and B coefficients standard error. This is an estimate of the precision
of the Theta and B estimates. The covariance matrix is obtained by inverting the observed information matrix evaluated
at the maximum likelihood estimates. The standard errors are the square roots of the diagonal elements of this covariance matrix.
If true then B and Theta initial estimates will be calculated. If false then you must
specify initial values for B and Theta.
number of iterations needed to converge to solution with Tolerance precision.
Performs logistic or ordinal logistic regression. Suppose y takes values in k ordered categories, and let p_ij
be the cumulative probability that y(i) falls in the j'th category
or higher. The ordinal logistic regression model is defined as:
logit(p_ij) = theta(j) + A_i'B , i = 1,..,length(Y), j = 1,..,k-1,
where A_i is the i'th row of A . The number of ordinal
categories k is taken to be the number of distinct values of (int)y. If k is 2 the model is ordinary logistic regression[1].
The following example performs ordinal logistic regression. There are three levels of
response and two intercepts in the output to distinguish the three levels.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector y = new Vector(0);
Vector b = new Vector(0);
Vector theta = new Vector(0);
Vector se = new Vector(0);
Matrix A = new Matrix(0,0,false);
double min;
TOptStopReason stopreason;
y.SetIt(false,new double[] {1,1,2,1,3,2,3,2,3,3});
A.SetIt(false,10,1,new double[] {1,2,3,4,5,6,7,8,9,10});
Regression.LogisticRegress(y,A,b,theta,se,out min, out stopreason,
100,1.0e-8,true);
// b = (0.801), theta=(2.779,5.366)
}
}
*)
function LogisticRegress(const y: TVec; const A: TMtx; const B, Theta: TVec; const StdErr: TVec; out FMin: double; out StopReason: TOptStopReason;
MaxIter: Integer = 100; Tolerance: double = SQRTEPS; AutoInitEstimates: boolean = true):integer; overload;
(*Simple linear regression.
The routine fits equations to data by minimizing the sum of squared residuals:
SS = Sum [y(k) - ycalc(k)]^2 ,
where y(k) and ycalc(k) are respectively the observed and calculated value of the dependent variable for observation k.
ycalc(k) is a function of the regression parameters b(0) and b(1). In case constant term is used, the observed values
obey the following equation:
y(k) = b(0) + b(1) * x
i.e
Y = b(0) + b(1)*X.
or if constant term is NOT used:
y(k) = b(0) * x
To calculate additional regression statistical parameters, use routine.
The following example loads data for simple linear regression (line equation) and
then extracts line parameters b(0)=n and b(1)=k:
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
using Steema.TeeChart;
namespace Dew.Examples
{
private void Example(Styles.Line series1, Styles.Line series2)
{
Vector x = new Vector(0);
Vector y = new Vector(0);
Vector b = new Vector(0);
Vector yhat = new Vector(0);
x.SetIt(false,new double[] {1.0, 1.5, 2.3, 3.8, 4.2, 5.0, 5.3, 5.9});
y.SetIt(false,new double[] {11, 12, 12.5, 14, 14.3, 15.2, 15.3, 17});
LinRegress(x,y,b,true,null,yhat,null);
MtxVecTee.DrawValues(x,y,series1,false); // draw original data
MtxVecTee.DrawValues(x,yhat,series2,false); // draw fitted data
}
}
*)
procedure LinRegress(const X,Y: TVec; const B: TVec; Constant: Boolean = True; const Weights: TVec = nil; const YCalc: TVec = nil; const ATA: TMtx = nil);
(*Regression by using "ridge" regression method.
Calculates regression coefficients b by using "ridge regression" method. When data suffers from multicollinearity one
of the available methods which can still be used is ridge regression. In this case least squares estimates are unbiased,
but their variances are large so they may be far from the true value. By adding a degree of bias to the regression estimates,
ridge regression reduces the standard errors. It is hoped that the total effect will be to give more reliable estimates.
Ridge regression uses following model:
y = A*b ,
where y is vector of observations, A matrix of independent variables and b are regression coefficients. Additional k
parameter is the so called "ridge parameter". Regression coefficients are then calculated from the following formula:
b = inv(AT*A + k*I) * (AT*y),
where I is the identity matrix, A matrix of independent variables and AT=Transp(A).
Note
The routine does NOT calculate optimal k value for ridge regression.
An example of ridge regression method.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector y = new Vector(0);
Vector b = new Vector(0);
Matrix A = new Matrix(0,0);
y.SetIt(false,new double[] {-2.5, 0.1, 6.1});
A.SetIt(3,2,false, new double[] {1.0, 2.5,
3.2, -1.5,
0.4, 0.7});
Regress.RidgeRegress(y,A,0.0,b);
// b = (-6.8889789853, -6.450976395)
}
}
*)
procedure RidgeRegress(const Y: TVec; const A: TMtx; k: double; const b: TVec; Normalize: Boolean = true);
(*Regression tests.
Dependent variables.
Estimated (calculated) dependent variables.
Model weights (optional).
Number of variables (parameters) in ML model A*b=y (number of columns in A matrix or number of rows in b).
If true then include intercept term b(0) in calculations. If false, set intercept term b(0) to 0.0.
Returns regression statistics parameters.
Use regression results to calculate basic regression statistics for model:
A*b=Y
*)
procedure RegressTest(const Y, YCalc: TVec; NumPars: Integer; out RegStat: TRegStats; Constant: boolean = True; const Weights: TVec = nil); overload;
(*Regression tests.
Dependent variables.
Estimated (calculated) dependent variables.
Inverse matrix of normal equations i.e [A(T)*A]^-1.
Model weights (optional).
If true then include intercept term b(0) in calculations. If false, set intercept term b(0) to 0.0.
Returns regression statistics parameters.
Returns residual errors.
Returns standard deviation.
Using regression results the routine calculates additional regression statistical parameters, together with model
coefficients standard errors and model errors.
Perform MLR and use results to calculate additional statistical parameters.
using Dew.Math;
using Dew.Stats.Units;
using Dew.Stats;
namespace Dew.Examples
{
private void Example()
{
Matrix A = new Matrix(0, 0);
Matrix ATA = new Matrix(0, 0);
Vector y = new Vector(0);
Vector b = new Vector(0);
Vector w = new Vector(0);
Vector yhat = new Vector(0);
Vector res = new Vector(0);
Vector bse = new Vector(0);
TRegStats rs;
// independent variables
A.SetIt(4, 2, false, new double[] {1.0, 2.0,
-3.2, 2.5,
8.0, -0.5,
-2.2, 1.8});
w.SetIt(false, new double[] { 1, 2, 2, 1 }); // weights
y.SetIt(false, new double[] { -3.0, 0.25, 8.0, 5.5 }); // dependent variables
Regress.MulLinRegress(y, A, b, w, true, yhat, ATA, TRegSolveMethod.regSolveLQR); //do regression
// b=(19.093757944, -2.0141843616, -10.082487055)
Regress.RegressTest(y, yhat, ATA, out rs, res, bse, true, w); // do basic regression stats
// RegStat = (ResidualVar:0.037230395108; R2:0.99965713428;
// AdjustedR2:0.99897140285; F:1457.7968725; SignifProb: 0.01851663347)
}
}
*)
procedure RegressTest(const Y, YCalc: TVec; const ATA: TMtx; out RegStat: TRegStats; const Residuals: TVec; const BStdDev: TVec; Constant: boolean = True; const Weights: TVec = nil); overload;
(*General non-linear regression.
Vector of the independent variable.
Vector of dependent variable.
Regression function.
Procedure to calculate the derivatives of RegFun. You can define the exact derivative or use routine as
numerical approximation.
Holds initial estimate for regression parameters. After the call to NLinRegress b returns calculated regression parameters.
Defines which optimization method will be used to find regression parameters (see MtxVec.hlp TOptMethod type to learn more about this).
Returns why regression parameters search stopped (see MtxVec.hlp TOptStopReason type to learn more about different stop reasons).
Weights (optional).
Returns calculated values (optional).
If true, internal line search algorithm will use soft line search method. Set this parameter to true if you're using numerical
approximation for derivative. If this parameter is set to false, internal line search algorithm will use exact line search method. Set this parameter
to false if you're using *exact* derivative.
Maximum allowed number of iterations.
Desired regression parameters tolerance.
Minimum allowed gradient C-Norm.
If assigned, stores Fun, evaluated at each iteration step.
Optionally, you can also pass object to the Verbose parameter. This allows the optimization procedure
to be interrupted from another thread and optionally also allows logging and iteration count monitoring.
Number of iterations needed to calculate regression parameters with specified tolerance.
The routine fits equations to data by minimizing the sum of squared residuals :
SS = Sum [y(k) - ycalc(k)]^2 ,
where y(k) and ycalc(k) are respectively the observed and calculated value of the dependent variable for observation k. ycalc(k) is a function of the regression
parameters b(0), b(1) ... Here the observed values obey the following (non-linear) equation:
y(k) = RegFun[x(k), b(0), b(1), ... ]
Y = RegFun[X,b(0),b(1), ...]
where RegFun is the regression function and b(0),..b(i) are the regression parameters.
The following example uses data from NIST study involving circular interference transmittance.
The response variable is transmittance, and the predictor variable is wavelength. First we setup the
regression function Eckerle4 with three regression parameters (b0,b1,b2). Then we setup data
and specify initial estimate for regression parameters (see below):
using Dew.Math;
using Dew.Math.Tee;
using Dew.Stats.Units;
using Dew.Stats;
namespace Dew.Examples
{
// function definition
private double Eckerle4(TVec B, double x)
{
double sqrterm = ((x-B[2])/B[1])*((x-B[2])/B[1]);
return B[0]/B[1] * Exp(-0.5*(sqrterm));
}
private void Example()
{
Vector x = new Vector(0);
Vector y = new Vector(0);
Vector b = new Vector(0);
Vector yhat = new Vector(0);
TOptStopReason StopReason;
x.SetIt(false,new double[] {400.0, 405.0, 410.0, 415.0,
420.0, 425.0, 430.0, 435.0,
436.5, 438.0, 439.5, 441.0,
442.5, 444.0, 445.5, 447.0,
448.5, 450.0, 451.5, 453.0,
454.5, 456.0, 457.5, 459.0,
460.5, 462.0, 463.5, 465.0,
470.0, 475.0, 480.0, 485.0,
490.0, 495.0, 500.0});
y.SetIt(false,new double[] {0.0001575, 0.0001699, 0.0002350, 0.0003102,
0.0004917, 0.0008710, 0.0017418, 0.0046400,
0.0065895, 0.0097302, 0.0149002, 0.0237310,
0.0401683, 0.0712559, 0.1264458, 0.2073413,
0.2902366, 0.3445623, 0.3698049, 0.3668534,
0.3106727, 0.2078154, 0.1164354, 0.0616764,
0.0337200, 0.0194023, 0.0117831, 0.0074357,
0.0022732, 0.0008800, 0.0004579, 0.0002345,
0.0001586, 0.0001143, 0.0000710});
b.SetIt(false,new double[] {1.0, 10.0, 500.0}); // initial estimates
Regress.NLinRegress(x,y,Eckerle4,null,b,optMarquardt, out StopReason,
null,yhat,false,300,1e-8,1e-10);
MtxVecTee.DrawValues(x,y,Series1,false); // draw data
MtxVecTee.DrawValues(x,yhat,Series2,false); // draw fitted value
}
}
*)
function NLinRegress(const X, Y: TVec; RegFun: TRegressFun; DeriveProc: TDeriveProc; const B: TVec; Method: TOptMethod;
out StopReason: TOptStopReason; const Weights: TVec = nil; const YCalc: TVec = nil; SoftSearch: boolean = false ; MaxIter : Integer = 500;
Tol : double = 0.00000001;
GradTol : double = 0.00000001 ; const Verbose: TStrings = nil): Integer; overload;
(*Non-linear regression with lower and upper bounds.
Vector of the independent variable.
Vector of the dependent variable.
Regression function.
Procedure to calculate the derivatives of RegFun. You can define the exact derivative or use routine as
numerical approximation.
Holds initial estimate for regression parameters. After the call to NLinRegress b returns calculated regression parameters.
Holds lower bounds for regression parameters. If there are no lower bounds, set BLowerB values to -INF.
Holds upper bounds for regression parameters. If there are no upper bounds, set BUpperB values to +INF.
Defines which optimization method will be used to find regression parameters (see MtxVec.hlp TOptMethod type to learn more about this).
Returns why regression parameters search stopped (see MtxVec.hlp TOptStopReason type to learn more about different stop reasons).
Weights (optional).
Returns calculated values (optional).
If true, internal line search algorithm will use soft line search method. Set this parameter to true if you're using numerical
approximation for derivative. If this parameter is set to false, internal line search algorithm will use exact line search method. Set this parameter
to false if you're using *exact* derivative.
Maximum allowed number of iterations.
Desired regression parameters tolerance.
Minimum allowed gradient C-Norm.
If assigned, stores Fun, evaluated at each iteration step.
Optionally, you can also pass object to the Verbose parameter. This allows the optimization procedure
to be interrupted from another thread and optionally also allows logging and iteration count monitoring.
Number of iterations needed to calculate regression parameters with specified tolerance.
General non-linear regression with lower and upper bounds for regression coefficients.
*)
function NLinRegress(const X, Y: TVec; RegFun: TRegressFun; DeriveProc: TDeriveProc; const B, BLowerB, BUpperB: TVec; Method: TOptMethod;
out StopReason: TOptStopReason; const Weights: TVec = nil; const YCalc: TVec = nil; SoftSearch: boolean = false ; MaxIter : Integer = 500;
Tol : double = 0.00000001;
GradTol : double = 0.00000001 ; const Verbose: TStrings = nil): Integer; overload;
(*Numerical derivative for regress function.
Routine calculates the derivatives of the regression function RegressFun at (X,Y) by numerical differentiation. The results of differentiation
are returned in Grad vector. The Length and Complex properties of Grad vector are NOT adjusted automatically.
*)
procedure NumericDerive(RegressFun: TRegressFun; const X,Y : double; const Pars: TVec; const Grad: TVec);
(*Vectorized computation of the numerical derivative for the regress function.
Routine calculates the derivatives of the regression function RegressFun at (X,Y) by numerical differentiation.
The results of differentiation are returned in Grad vectors, one vector for each parameter stored in "pars".
The "X" stores the independent variables and "Y" the dependent variable. RegressFun is the function, which
computes the value of the function of which we are looking for derivatives.
The Length and Complex properties of Grad vector are NOT adjusted automatically.
Grad.Count should typically be equal to Pars.Length and Grad[i].Length equal to Y.Length. The function returns derivatives separately
for each parameter and pair of (X0...Xi, Y).
Vectorized computation of the derivatives can be 10-20x faster than non-vectorized computation.
*)
procedure MultiNumericDerive(const RegressFun: TMultiRegressFun; const X: TVecList; const Y : TVec; const Pars: TVec; const Grad: TVecList);
(*Poisson generalized linear model with log-link.
Defines Response vector (frequency count).
Defines covariates, including the constant vector (if needed).
returns regression parameter estimates.
returns regression parameter estimates standard errors.
returns residual deviance.
returns residual degrees of freedom.
Offset (optional).
Returns calculated values (optional).
Desired precision in calculations.
Maximum number of iteratively reweighted iterations in algorithm.
number of evaluations needed for desired precision.
Performs Poisson generalized linear model with log-link. Poisson regression is often used to analyze count data. It can be used to model the number of occurrences of
an event of interest or the rate of occurrence of an event of interest, as a function of some independent variables.
In Poisson regression it is assumed that the dependent variable Y, number of occurrences of an event, has a Poisson distribution given the independent variables
X1, X2, ...., Xn,
P(Y=k| x1, x2, ..., xn) = Exp[-m] m^k / k!, k=0, 1, 2, ......,
where the log of the mean m is assumed to be a linear function of the independent variables. That is,
log(m) = intercept + b1*X1 +b2*X2 + ....+ b3*Xn,
which implies that m is the exponential function of independent variables,
m = exp(intercept + b1*X1 +b2*X2 + ....+ b3*Xn).
In many situations the rate or incidence of an event needs to be modeled instead of the number of occurrences.
For example, suppose that we know the number of occurrences of certain disease by county and we want to find out
if frequency of occurrence depends on certain demographic variables and health policy programs also recorded by
county. Since more at risk subjects result in more occurrences of the disease, we need to adjust for the number
of subjects at risk in each county. For such data, we can write a Poisson regression model in the following form:
log(m) = log(N) + intercept + b1*X1 +b2*X2 + ....+ b3*Xn,
where N is the total number of subjects at risk by county. The logarithm of variable N is used as an offset, that is,
a regression variable with a constant coefficient of 1 for each observation. The log of the incidence, log (m / N),
is modeled now as a linear function of independent variables.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector y = new Vector(0);
Vector b = new Vector(0);
Vector berr = new Vector(0);
Matrix A = new Matrix(0,0);
y.SetIt(false,new double[] {1,2,3,4});
A.SetIt(4,2,false, new double[] {1,5,2,6,3,7,4,8});
Regression.PoissonRegress(y,A,b,berr,out dev,out df,0.0,null,1.0e-8,300);
}
}
*)
function PoissonRegress(const y: TVec; const A: TMtx; const Beta, BStdErr: TVec; out Deviance: double; out DF: Integer;
Offset: double = 0.0; const YCalc: TVec = nil;
Tol : double = 0.00000001 ;
MaxIter: Integer = 500): Integer;
(*Principal Component Regression.
Defines vector of dependent variable.
Defines matrix of independent variables.
Defines the number of variables to omit from initial model.
Returns calculated regression coefficients.
Returns vector of calculated dependent variable, where YCalc = A*b + constant term.
Returns principal component b coefficient standard error.
Performs unweighted Principal Component Regression (PCR). PCR is a technique for analyzing multiple regression data
that suffer from multicollinearity. When multicollinearity occurs, least squares estimates are unbiased, but their
variances are large so they may be far from the true value. By adding a degree of bias to the regression estimates,
principal components regression reduces the standard errors.
The algorithm first standardizes A matrix and performs PC regression on standardized matrix.
using Dew.Math;
using Dew.Stats.Units;
using Dew.Stats;
namespace Dew.Examples
{
private void Example()
{
Matrix A = new Matrix(0,0);
Matrix ATA = new Matrix(0,0);
Vector y = new Vector(0);
Vector ycalc = new Vector(0);
Vector b = new Vector(0);
Vector error = new Vector(0);
double mse;
// Load data
A.SetIt(18,3,false, new double[] {1, 2, 1,
2, 4, 2,
3, 6, 4,
4, 7, 3,
5, 7, 2,
6, 7, 1,
7, 8, 1,
8, 10, 2,
9, 12, 4,
10, 13, 3,
11, 13, 2,
12, 13, 1,
13, 14, 1,
14, 16, 2,
15, 18, 4,
16, 19, 3,
17, 19, 2,
18, 19, 1});
Y.SetIt(false, new double[] {3,9,11,15,13,13,17,21,25,27,25,27,29,33,35,37,37,39});
// Perform Principal Component Regression
Regress.PCRegress(y,A,b,ycalc,null,1);
// Errors
error.Sub(ycalc,y);
}
}
*)
procedure PCRegress(const Y: TVec; const A: TMtx; const b: TVec; const YCalc: TVec = nil; const Bse: TVec = nil ; NumOmmit: Integer = 1); overload;
(*Weighted PC regression.
Defines vector of dependent variable.
Defines matrix of independent variables.
Returns calculated regression coefficients.
Defines weights for PC regression.
Returns vector of calculated dependent variable, where YCalc = A*b + constant term.
Returns principal component b coefficient standard error.
Defines the number of variables to omit from initial model.*)
procedure PCRegress(const Y: TVec; const A: TMtx; const b: TVec; const Weights: TVec; const YCalc: TVec = nil; const Bse: TVec = nil ; NumOmmit: Integer = 1); overload;
(*R2 value.
The R2 value, in this case defined by the following equation:
R2 = 1.0 - SumOfSquares(Y-YCalc)/SumOfSquares(Y-Mean(Y)).
Defines vector of dependent variable.
Defines vector of estimated dependent variable values.
There are several ways to calculate R2, all equivalent for a linear model where model
includes a constant term, but not equivalent otherwise.
*)
function R2(const Y, YCalc: TVec): double;
(*Prediction error sum of squares (PRESS).
Defines vector of dependent variable.
Defines matrix of independent variables.
calculated PRESS value.
PRESS is an acronym for prediction sum of squares. It was developed for use in variable selection to validate a regression model.
To calculate PRESS, each observation is individually omitted. The remaining N - 1 observations are used to calculate a regression
and estimate the value of the omitted observation. This is done N times, once for each observation. The difference between the
actual Y value and the predicted Y with the observation deleted is called the prediction error or PRESS residual. The sum of
the squared prediction errors is the PRESS value. The smaller PRESS is, the better the predictability of the model.
*)
function PRESS(const A: TMtx; const Y: TVec): double;
(*Optimal k value for ridge regression.
Finds optimal k value for ridge regression, such that k minimizes ridge
model MSE.
*)
procedure RidgeOptimalk(const y: TVec; const A: TMtx; out k: double);
(*Stepwise regression is an optimization algorithm aiming to improve the quality of the multiple linear regression
by excluding noisy variables.
Contains all independent variables and the dependent variable as the last item in the list.
Holds standard deviation of all aList items on input.
Specifies the stepwise regression method.
Length must be equal to number of independent variables (aList.Count-1).
This vector needs to be allocated in "bit" mode:
VariableMask.BitCount := NumberOfIndependentVars;
Matrix size of IterCount x (VarCount + 7). Each row starts with Step number followed by selection list of variables in columns followed by Standard Error (quality criteria).
We strive to reduce standard error and the model with the smallest standard error is considered best. Additional columns are as follows:
- [0] Standard error = sqrt(SSE/dFE), or custom quality criteria
- [1] SSE = Residual sum of squares.
- [2] SSR = Regression sum of squares.
- [3] SST = Total sum of squares = SSE + SSR
- [4] R2 = Coefficient of determination
- [5] Adjusted R2 = Adjusted coefficient of determination
- [6] MSE = Residual variance
Matrix size of (IterCount*VarCount) x 5. Each iteration adds the independent variable count rows.
The columns are as follows:
- [0] Iteration Step
- [1] Variable index
- [2] variable selection where 0 means excluded and 1 means included.
- [3] normalized coefficients
- [4] corresponding t-values for each coefficient
- [5] two-tailed p-values. Bigger p-values suggest that the model would be better if the variable were excluded.
Limits the maximum number of iterations. The function will raise an exception if this limit is reached.
If True, the VariableMask will be initialized to all vars excluded for Forward search and all vars included for Backward search.
If False, the search can start with preselected variables within VariableMask.
VariableMask.BitCount := NumberOfIndependentVars;
VariableMask.Bits[0] := false;
VariableMask.Bits[1] := true;
...
For the step by step method this parameter must be false (user initialization on each step is required).
Optional extra callback function to use quality criteria other than the default "Standard Error"
An optional object parameter to be passed to the CriteriaFun
Optimal result is possible only when using the "exhaustive" search method, which will check all possibilities.
After the final variable selection has been obtained, run the regression routine followed by the regression statistics routine, if detailed statistics data is required.
There are many methods to solve this problem. This function implements four approaches: exhaustive, forward, backward and stepwise.
For models with less than 15 variables, the exhaustive search is the recommended method. Alternatively it is possible to perform "backward search"
by starting with all and removing one by one variable or "forward search" by starting with none and adding one by one variable.
Both backward and forward search can have selected variables already pre-included (or pre-excluded).
Single step mode allows the user to manually include or exclude individual variables from the model
after each step.
To use quality criteria other than default "Standard Error", you can pass extra callback with the CriteriaFun. The return value
will be used to determine, if the result is better or worse and a smaller value will be considered better.
We are looking for best fit of:
b0 + b1*x1 + b2*x2 + b3*x3 + b4*x4 + b5*x5 = y
where b1..b5 can be either zero or not, but would like to know which are best set to zero.
Last column of aSrc in the code example below is the dependent variable (y).
Uses MtxExpr, Regress, StatTools, Math387;
procedure TForm78.RunButtonClick(Sender: TObject);
var aSrc, reportCoeff, reportSSE: Matrix;
stdDevA: Vector;
aList: TVecList;
i: integer;
bi: VectorInt;
sMethod: TStepwiseMethod;
begin
Memo.Lines.BeginUpdate;
Memo.Lines.Clear;
aList := TVecList.Create;
try
aSrc.SetIt(15,6,false, [83,34, 65, 63, 64, 106,
73, 19, 73, 48, 82, 92,
54, 81, 82, 65, 73, 102,
96, 72, 91, 88, 94, 121,
84, 53, 72, 68, 82, 102,
86, 72, 63, 79, 57, 105,
76, 62, 64, 69, 64, 97,
54, 49, 43, 52, 84, 92,
37, 43, 92, 39, 72, 94,
42, 54, 96, 48, 83, 112,
71, 63, 52, 69, 42, 130,
63, 74, 74, 71, 91, 115,
69, 81, 82, 75, 54, 98,
81, 89, 64, 85, 62, 96,
50, 75, 72, 64, 45, 103]);
aList.DecomposeColumnMatrix(aSrc);
stdDevA.Size(aList.Count);
for i := 0 to aList.Count-1 do stdDevA[i] := aList[i].StdDev;
bi.BitCount := aList.Count-1;
sMethod := swBackward;
StepwiseRegression(aList, stdDevA, sMethod, bi, reportSSE, reportCoeff);
Memo.Lines.Add('');
reportSSE.ValuesToStrings(Memo.Lines, '', ftaRightAlign, '0.###', '0.###', true);
Memo.Lines.Add('');
reportCoeff.ValuesToStrings(Memo.Lines, '', ftaRightAlign, '0.###', '0.###', true);
Memo.Lines.Add('');
finally
Memo.Lines.EndUpdate;
aList.Free;
end;
end;
*)
procedure StepwiseRegression(const aList: TVecList; const stdDevA: TVec; const sMethod: TStepwiseMethod; const VariableMask: TVecInt;
const reportSSE, reportCoeff: TMtx; const MaxIter: integer = 1000; const InitMask: boolean = true;
const CriteriaFun: TStepwiseQualityCriteria = nil; const CriteriaOwner: TObject = nil);
(*General vectorized non-linear regression.
List of vectors of independent variable(s).
Vector of the dependent variable.
Regression function.
Procedure to calculate the derivatives of RegFun. You can define the exact derivative or use routine as
numerical approximation.
Holds initial estimate for regression parameters. After the call to NLinRegress b returns calculated regression parameters.
Defines which optimization method will be used to find regression parameters (see MtxVec.hlp TOptMethod type to learn more about this).
Returns why regression parameters search stopped (see MtxVec.hlp TOptStopReason type to learn more about different stop reasons).
Weights of X values (optional).
Returns calculated values (optional).
If true, internal line search algorithm will use soft line search method. Set this parameter to true if you're using numerical
approximation for derivative. If this parameter is set to false, internal line search algorithm will use exact line search method. Set this parameter
to false if you're using *exact* derivative.
Maximum allowed number of iterations.
Desired regression parameters tolerance.
Minimum allowed gradient C-Norm.
If assigned, stores Fun, evaluated at each iteration step.
Optionally, you can also pass object to the Verbose parameter. This allows the optimization procedure
to be interrupted from another thread and optionally also allows logging and iteration count monitoring.
Number of iterations needed to calculate regression parameters with specified tolerance.
The routine fits equations to data by minimizing the sum of squared residuals :
SS = Sum [y(k) - ycalc(k)]^2 ,
where y(k) and ycalc(k) are respectively the observed and calculated value of the dependent variable for observation k. ycalc(k) is a function of the regression
parameters b(0), b(1) ... Here the observed values obey the following (non-linear) equation:
y(k) = RegFun[x(k), b(0), b(1), ... ]
Y = RegFun[X,b(0),b(1), ...]
where RegFun is the regression function and b(0),..b(i) are the regression parameters.
The following example uses data from NIST study involving circular interference transmittance.
The response variable is transmittance, and the predictor variable is wavelength. First we setup the
regression function Eckerle4 with three regression parameters (b0,b1,b2). Then we setup data
and specify initial estimate for regression parameters (see below):
using Dew.Math;
using Dew.Math.Tee;
using Dew.Stats.Units;
using Dew.Stats;
namespace Dew.Examples
{
// function definition
private void Eckerle4(TVec B, TVecList x, TVec y)
{
// double a = (x-B[2])/B[1];
// result = B[0]/B[1] * Math.Exp(-0.5*a*a);
y.Normalize(x[0], B[2], B[1]);
y.Sqr();
y.Scale(-0.5);
y.Exp();
y.Scale(B[0]/B[1]);
}
private void Example()
{
TVecList x = new TVecList();
x.Add();
Vector y = new Vector(0);
Vector b = new Vector(0);
Vector yhat = new Vector(0);
TOptStopReason StopReason;
x[0].SetIt(false,new double[] {400.0, 405.0, 410.0, 415.0,
420.0, 425.0, 430.0, 435.0,
436.5, 438.0, 439.5, 441.0,
442.5, 444.0, 445.5, 447.0,
448.5, 450.0, 451.5, 453.0,
454.5, 456.0, 457.5, 459.0,
460.5, 462.0, 463.5, 465.0,
470.0, 475.0, 480.0, 485.0,
490.0, 495.0, 500.0});
y.SetIt(false,new double[] {0.0001575, 0.0001699, 0.0002350, 0.0003102,
0.0004917, 0.0008710, 0.0017418, 0.0046400,
0.0065895, 0.0097302, 0.0149002, 0.0237310,
0.0401683, 0.0712559, 0.1264458, 0.2073413,
0.2902366, 0.3445623, 0.3698049, 0.3668534,
0.3106727, 0.2078154, 0.1164354, 0.0616764,
0.0337200, 0.0194023, 0.0117831, 0.0074357,
0.0022732, 0.0008800, 0.0004579, 0.0002345,
0.0001586, 0.0001143, 0.0000710});
b.SetIt(false,new double[] {1.0, 10.0, 500.0}); // initial estimates
Regress.NLinRegress(x, y, Eckerle4, null, b, TOptMethod.optMarquardt, out StopReason, null, yhat, false, 300, 1e-8, 1e-10, null);
MtxVecTee.DrawValues(x[0],y,Series1,false); // draw data
MtxVecTee.DrawValues(x[0],yhat,Series2,false); // draw fitted value
}
}
*)
function NLinRegress(const X: TVecList; const Y: TVec; RegFun: TMultiRegressFun; DeriveProc: TMultiDeriveProc; const B: TVec; Method: TOptMethod;
out StopReason: TOptStopReason; const Weights: TVec = nil; const YCalc: TVec = nil; SoftSearch: boolean = false ; MaxIter : Integer = 500;
Tol : double = 0.00000001;
GradTol: double = 0.00000001 ; const Verbose: TStrings = nil): Integer; overload;
(*Non-linear regression with lower and upper bounds.
List of vectors of independent variable(s).
Vector of dependent variable.
Regression function.
Procedure to calculate the derivatives of RegFun. You can define the exact derivative or use routine as
numerical approximation.
Holds initial estimate for regression parameters. After the call to NLinRegress b returns calculated regression parameters.
Holds lower bounds for regression parameters. If there are no lower bounds, set BLowerB values to -INF.
Holds upper bounds for regression parameters. If there are no upper bounds, set BUpperB values to +INF.
Defines which optimization method will be used to find regression parameters (see MtxVec.hlp TOptMethod type to learn more about this).
Returns why regression parameters search stopped (see MtxVec.hlp TOptStopReason type to learn more about different stop reasons).
Weights (optional).
Returns calculated values (optional).
If true, internal line search algorithm will use soft line search method. Set this parameter to true if you're using numerical
approximation for derivative. If this parameter is set to false, internal line search algorithm will use exact line search method. Set this parameter
to false if you're using *exact* derivative.
Maximum allowed number of iterations.
Desired regression parameters tolerance.
Minimum allowed gradient C-Norm.
If assigned, stores Fun, evaluated at each iteration step.
Optionally, you can also pass object to the Verbose parameter. This allows the optimization procedure
to be interrupted from another thread and optionally also allows logging and iteration count monitoring.
Number of iterations needed to calculate regression parameters with specified tolerance.
General non-linear regression with lower and upper bounds for regression coefficients.
*)
function NLinRegress(const X: TVecList; const Y: TVec; RegFun: TMultiRegressFun; DeriveProc: TMultiDeriveProc; const B, BLowerB, BUpperB: TVec; Method: TOptMethod;
out StopReason: TOptStopReason; const Weights: TVec = nil; const YCalc: TVec = nil; SoftSearch: boolean = false ; MaxIter : Integer = 500;
Tol : double = 0.00000001;
GradTol : double = 0.00000001 ; const Verbose: TStrings = nil): Integer; overload;
(*Control charts.
Introduces several control charts. Control charts are used to routinely monitor quality. Depending on the number of process
characteristics to be monitored, there are two basic types of control charts. The first, referred to as a univariate control chart,
is a graphical display (chart) of one quality characteristic. The second, referred to as a multivariate control chart, is a
graphical display of a statistic that summarizes or represents more than one quality characteristic.
If a single quality characteristic has been measured or computed from a sample, the control chart shows the value of the
quality characteristic versus the sample number or versus time. In general, the chart contains a center line that represents
the mean value for the in-control process. Two other horizontal lines, called the upper control limit (UCL) and the lower control
limit (LCL), are also shown on the chart. These control limits are chosen so that almost all of the data points will fall within
these limits as long as the process remains in-control.
Check the following link to learn more
about control charts.
*)
unit StatControlCharts;
{$I BdsppDefs.inc}
interface
uses MtxVec, Math387, Statistics
,Classes
,SysUtils
,Types
;
(*X-Chart ("XBar Chart").
Each Data row contains replicated observation taken at specific time.
Confidence level for upper and lower control limit. Confidence must lie in the (0,1) interval.
Returns values to be drawn.
Returns control Chart centerline.
Returns control Chart upper control limit.
Returns control Chart lower control limit.
This routine calculates X-Chart ("XBar Chart") drawing values, center line, upper and lower control limit.
When dealing with a quality characteristic that can be expressed as a measurement, it is customary
to monitor both the mean value of the quality characteristic and its variability. Control
over the average quality is exercised by the control chart for averages, usually called the XBar chart.
For this chart type, the center line is defined by process grand mean and upper and lower control limits are
defined by process standard deviation. Similarly, for R and S charts the center line is defined by process range and
variability (standard deviation) respectively.
The following code loads data, stored in binary file and then creates a XBar chart.
using Dew.Math;
using Dew.Math.Units;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Matrix Data;
Vector DrawVec;
double cl,ucl,lcl;
Data.LoadFromFile("ewma_data.mtx");
StatControlCharts.QCXChart(Data,DrawVec,out cl, out ucl, out lcl, 0.025);
}
}
*)
procedure QCXChart(const Data: TMtx; const DrawVec: TVec; out CL, UCL, LCL: double; Confidence: double = 0.997);
(*S-Chart.
Each Data row contains replicated observation taken at specific time.
Confidence level for upper and lower control limit. Confidence must lie in the (0,1) interval.
Returns values to be drawn.
Returns control Chart centerline.
Returns control Chart upper control limit.
Returns control Chart lower control limit.
Calculates S-Chart drawing values, center line, upper and lower control limit.
Place a TChart component on the form and add a TQCSeries (Series1). The
following code creates necessary values for S chart:
using Dew.Math;
using Dew.Math.Units;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
double cl,ucl,lcl;
Matrix Data = new Matrix(0);
Vector DrawVec = new Vector(0);
Data.LoadFromFile("data.mtx"); // data is now initialized
StatControlCharts.QCSChart(Data,DrawVec,out cl, out ucl, out lcl, 0.025);
}
}
*)
procedure QCSChart(const Data: TMtx; const DrawVec: TVec; out CL, UCL, LCL: double; Confidence: double = 0.997);
(*R-Chart.
Each Data row contains replicated observation taken at specific time.
Confidence level for upper and lower control limit. Confidence must lie in the (0,1) interval.
Returns values to be drawn.
Returns control Chart centerline.
Returns control Chart upper control limit.
Returns control Chart lower control limit.
Calculates R-Chart drawing values, center line, upper and lower control limit.
The following code will create R chart:
using Dew.Math;
using Dew.Math.Units;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Matrix Data = new Matrix(0,0);
Vector DrawVec = new Vector(0);
double cl,ucl,lcl;
Data.LoadFromFile("data.mtx");
StatControlCharts.QCRChart(Data,DrawVec,out cl, out ucl, out lcl, 0.025);
}
}
*)
procedure QCRChart(const Data: TMtx; const DrawVec: TVec; out CL, UCL, LCL: double; Confidence: double = 0.997);
(*Process Capability indexes p, Cp and Cpk.
Process data.
Lower specification limit. LB and UB are boundaries within which measurements on a product characteristic must lie.
Upper specification limit. LB and UB are boundaries within which measurements on a product characteristic must lie.
Desired significance level.
Returns calculated significance.
Returns capability index.
Returns 100*(1-Alpha) Cp confidence interval.
Returns capability index for uncentred process.
Returns 100*(1-Alpha) CPK confidence interval.
Calculates the Process Capability indexes p, Cp(aka CPR) and Cpk(AKA CPRk).
In the following example capability indexes and their confidence intervals are calculated.
using Dew.Math;
using Dew.Math.Units;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
double p, lb, ub, cp, cpk;
double[] CPConfInt, CPKConfInt;
Vector data = new Vector(0);
lb = -0.1;
ub = 0.1;
data.LoadFromFile("PCDATA.vec");
StatControlCharts.QCCapIndexes(data, lb, ub, out p, out cp, out cpk, out CPConfInt, out CPKConfInt, 0.05);
}
}
*)
procedure QCCapIndexes(const Data: TVec; const LB, UB: double; out p, CP, CPK : double;
var CPConfInt, CPKConfInt: TTwoElmReal; Alpha: double = 0.05);
(*P-Chart.
Calculates the P-Chart (Control chart for proportions) drawing values, center line, upper and lower control limits.
Control limits are based on the normal approximation to the binomial distribution.
When p is small, the normal approximation may not always be adequate. In such cases, we may use control limits obtained directly
from a table of binomial probabilities. If P is small, the lower control limit obtained from the normal approximation may be a
negative number. If this should occur, it is customary to consider zero as the lower control limit.
Note
The assumption is all samples have the same SampleSize.
*)
procedure QCPChart(const Data: TVec; SampleSize : Integer; const DrawVec: TVec; out CL, UCL, LCL : double; Confidence: double = 0.997); overload;
(*In this case each sample can have different size. You must store sizes in SampleSize vector.
Data to be analyzed. Each value represents number of defects.
Confidence level for upper and lower control limit. Confidence must lie
in the (0,1) interval.
Sample size. Can be integer or vector.
Returns values to be drawn.
Returns control Chart centerline.
Returns control Chart upper control limit.
Returns control Chart lower control limit.
An exception is raised if Data and SampleSize length do not match.
The following code will create P chart:
using Dew.Math;
using Dew.Math.Units;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example(Dew.Stats.QCSeries QCSeries1)
{
Vector drawvec = new Vector(0);
Matrix data = new Matrix(0,0);
double cl,lcl,ucl;
StatControlCharts.QCPChart(data,drawvec,out cl,out ucl,out lcl,0.05);
// Setup series properties
QCSeries1.UCL = ucl;
QCSeries1.LCL = lcl;
QCSeries1.CL = cl;
MtxVecTee.DrawValues(drawvec,QCSeries1,0,1,false);
}
}
*)
procedure QCPChart(const Data: TVec; const SampleSize : TVec; const DrawVec: TVec; out CL: double; const UCL,LCL: TVec; Confidence: double = 0.997); overload;
(*Constructs nP-Chart.
Constructs nP-Chart drawing values, center line, upper and lower control limit.
The assumption is all samples have the same SampleSize.
*)
procedure QCNPChart(const Data: TVec; SampleSize : Integer; const DrawVec: TVec; out CL, UCL, LCL : double; Confidence: double = 0.997); overload;
(*In this case each sample can have different size. You must store sizes in SampleSize vector.
Data to be analyzed. Each value represents number of defects.
Confidence level for upper and lower control limit. Confidence must lie
in the (0,1) interval.
Sample size. Can be integer or vector.
Returns values to be drawn.
Returns control Chart centerline.
Returns control Chart upper control limit.
Returns control Chart lower control limit.
An exception is raised if Data and SampleSize length do not match.
*)
procedure QCNPChart(const Data: TVec; const SampleSize :TVec; const DrawVec: TVec; const CL, UCL, LCL: TVec; Confidence: double); overload;
(*U-Chart.
Constructs U-Chart drawing values, center line, upper and lower control limit.
The assumption is all samples have the same SampleSize.
*)
procedure QCUChart(const Data: TVec; SampleSize :Integer; const DrawVec: TVec; out CL, UCL, LCL : double; Confidence: double = 0.997); overload;
(*In this case each sample can have different size. You must store sizes in SampleSize vector.
An exception is raised if Data and SampleSize length do not match.
Data to be analyzed. Each value represents number of defects.
Confidence level for upper and lower control limit. Confidence must lie
in the (0,1) interval.
Sample size. Can be integer or vector.
Returns values to be drawn.
Returns control Chart centerline.
Returns control Chart upper control limit.
Returns control Chart lower control limit.
*)
procedure QCUChart(const Data: TVec; const SampleSize :TVec; const DrawVec: TVec; out CL: double; const UCL, LCL: TVec; Confidence: double = 0.997); overload;
(*Constructs C-Chart.
Data to be analyzed. Each value represents number of defects.
Confidence level for upper and lower control limit. Confidence must lie
in the (0,1) interval.
Returns values to be drawn.
Returns control Chart centerline.
Returns control Chart upper control limit.
Returns control Chart lower control limit.
Constructs C-Chart drawing values, center line, upper and lower control limit.
*)
procedure QCCChart(const Data: TVec; const DrawVec: TVec; out CL, UCL, LCL : double; Confidence: double);
(*Constructs EWMA Control Chart.
Calculates the Exponential Weighted Moving Average (EWMA) control chart. In this case UCL and LCL are constant (asymptote) limits.
This chart is also known as exponentially smoothed or geometric moving average chart. It evaluates the process level using an
exponentially smoothed moving average. Here, by the term exponentially, we mean the procedure by which individual observations
or subgroups are given progressively less importance or weight. When compared to the XChart, the EWMA chart is more sensitive
to smaller shifts in the process level.
The exponentially weighted moving average is defined as
XHat[t] = r*X[t] + (1-r)*XHat[t-1] ,
where r is a weighting constant (0 &lt; r &lt;= 1), X[t] is the sample mean at time t and XHat[t] are the EWMA chart points. The starting
value XHat[0], used for the first sample at time t = 1, is the grand mean value.
*)
procedure EWMAChart(const Data: TMtx; const DrawVec: TVec; out CL, UCL, LCL: double; r: double = 0.3; Confidence: double = 0.997); overload;
(*Calculates the Exponential Weighted Moving Average (EWMA) control chart.
Data of grouped responses. Each row contains a response at specific time.
It's assumed the rows are in time order.
Weighting constant that weights past and current information. If,
for example, r=0.3, 70% of the weight will be given to past information and
30% to current information. Typically a r between 0.1 and 0.4 provides a
reasonable balance between past and current information and 0.2 is very common in actual practice.
Returns the calculated EWMA chart points.
Returns EWMA chart center line.
Returns EWMA chart lower control limits. In this case UCL and LCL are constant (asymptote) limits.
Returns EWMA chart upper control limits. In this case UCL and LCL are constant (asymptote) limits.
In this case UCL and LCL are not constant, but use an exact formula to calculate control limits for each point. It's worth noting
that UCL and LCL values rapidly approach the asymptote value.
The following code will load data from file and create EWMA chart with
r=0.25 and significance 95%:
using Dew.Math;
using Dew.Math.Units;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example(Steema.TeeChart.Styles.Line Series1, Steema.TeeChart.Styles.Line Series2,
Steema.TeeChart.Styles.Line Series3)
{
Matrix data = new Matrix(0,0);
Vector drawvec = new Vector(0);
Vector lcl = new Vector(0);
Vector ucl = new Vector(0);
double cl;
data.LoadFromFile("ewma_data.vec");
StatControlCharts.EWMAChart(data,drawvec,out cl,ucl,lcl,0.25,0.95);
MtxVecTee.DrawValues(drawvec,Series1,0,1,false);
// Series2 and Series3 are used for displaying control limits.
MtxVecTee.DrawValues(lcl,Series2,0,1,false);
MtxVecTee.DrawValues(ucl,Series3,0,1,false);
}
}
*)
procedure EWMAChart(const Data: TMtx; const DrawVec: TVec; out CL: double; UCL, LCL: TVec; r: double = 0.3; Confidence: double = 0.997); overload;
(*Compares values against Westgard rules.
Data to-be-checked with Westgard rules.
Vector storing each point status. If point(i) is out-of-control, then
OutControl.Values[i] will be greater than zero (see explanation above).
In most cases equal to data mean, but you can specify any value.
In most cases equal to data standard deviation, but you can specify any value.
Data individual values are tested to determine if they are in, or out, of control using a set of five rules called the Westgard rules. These rules indicate
which points are out-of-control. The Westgard rules are (see www.westgard.com/mltirule.htm for details):
Description Rule
- 1S3 One value beyond three sigma from the mean.
- 1S2 One value beyond two sigma from the mean.
- 2S2 Two consecutive values either greater than, or less than, two sigma from the mean.
- RS4 A difference between consecutive values greater than four sigma.
- 4S1 Four consecutive values greater than, or less than, one sigma from the mean.
- 10X Ten consecutive values all greater than, or less than, the mean.
For each point several rules can be violated at the same time. Each Westgard rule violation has different value:
Rule violated Value
- 1S3 1=2^0
- 1S2 2=2^1
- 2S2 * 4=2^2
- RS4 8=2^3
- 4S1 16=2^4
- 10X * 32=2^5
For example, if rules 2S2 and 10X were violated, then the sum of violations for point would be 4 + 32 = 36 = 100100.
Load process data, then check if any points are out-of-control.
using Dew.Math;
using Dew.Math.Units;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example(Dew.Stats.QCSeries QCSeries1)
{
Vector data = new Vector(0);
Vector outofcontrol = new Vector(0);
VectorInt indexes = new VectorInt(0);
data.LoadFromFile("process_data.vec");
StatControlCharts.QCWestgardRules(data,outofcontrol,data.Mean(),data.StdDev());
// Now find indexes of out-of-control points
indexes.FindIndexes(outofcontrol,">",0);
// indexes.IValues now stores the indices of out-of-control points
}
}
*)
procedure QCWestgardRules(const Data: TVec; const OutControl: TVec; dataMean, dataSigma: double);
(*Moving range chart.
Data to be analyzed. Each value represents number of defects.
Confidence level for upper and lower control limit. Confidence must lie
in the (0,1) interval.
Returns control Chart centerline.
Returns control Chart upper control limit.
Returns control Chart lower control limit.
Uses Data values to construct a moving-range quality control chart. This QC chart type can be
used for individual measurements (sample size = 1).
*)
procedure QCMRChart(const Data: TVec; out CL, UCL, LCL: double; Confidence: double = 0.997);
(*CUMSum chart.
Each Data row contains replicated observation taken at specific time.
Confidence level for upper and lower control limit. Confidence must lie in the (0,1) interval.
Returns values to be drawn.
Defines the estimate of the in-control mean.
Defines known (or estimated) standard deviation of the sample means.
Design parameter of the V-mask. k is the rise in the V-arm corresponding to one sampling unit.
Design parameter of the V-mask. h defines the rise in the arm corresponding to the distance
from the origin to point vertex.
If set, returns high values for cumsum plot. When SHigh exceeds value h, the process is said to be out-of-control.
Compare SHigh values to h to find if a process is out-of-control.
If set, returns low values for cumsum plot. When SLow exceeds value h, the process is said to be out-of-control.
Compare SLow values to h to find if a process is out-of-control.
Calculates CumSum Chart drawing values and additional values, needed for deciding if some samples are
out-of-control.
See www.itl.nist.gov/div898/handbook/pmc/section3/pmc323.htm to
learn more about Cumsum QC charts.
Perform CumSum QC to determine if process is out-of-control.
using Dew.Math;
using Dew.Math.Units;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Matrix data = new Matrix(0,0);
data.LoadFromFile("process_data.mtx");
Vector sh = new Vector(0);
Vector sl = new Vector(0);
Vector s = new Vector(0);
VectorInt outofcontrol = new VectorInt(0);
// estimate k=0.32, h = 4.7
double k = 0.32;
double h = 4.7;
// estimate process mean and sigma
double m = 230.3;
double sig = 5.2;
StatControlCharts.QCCumSumChart(data,s,k,h,m,sig,sh,sl,0.997);
// find point indexes which exceed h (out-of-control points)
outofcontrol.FindIndexes(sh,">",h);
}
}
*)
procedure QCCumSumChart(const Data: TMtx; const DrawVec: TVec; k, h: double; mean: double; sigma: double = 1.0;
const SHigh: TVec = nil; const SLow: TVec = nil; Confidence: double = 0.997);
(*Basic statistical routines.
Introduces basic statistical routines, including hypothesis testing, distribution
parameter estimation, descriptive statistics, ...
*)
unit Statistics;
{$I bdsppdefs.inc}
interface
uses AbstractMtxVec, MtxVec, Math387, Probabilities, Optimization, MtxVecInt
;
type
(*Defines the array of two double elements.
*)
TTwoElmReal = array [0..1] of double;
(*Defines one or two sided hypothesis testing.
Defines if hypothesis will be tested on closed interval (two sided) or
open interval (one sided).
*)
THypothesisType=(
(*Perform two sided test. Acceptance region is [p0-p0(signif),p0+p0(signif)).*)
htTwoTailed,
(*Perform one sided right tailed test. Acceptance region is [p0+p0(signif),INF).*)
htRightTailed,
(*Perform one sided left tailed test. Acceptance region is [-INF,p0+p0(signif)].*)
htLeftTailed
);
(*Defines result of the hypothesis test.
After test, the result is one of three possibilities.
*)
THypothesisResult=(
(*Do not reject the null hypothesis H0.*)hrNotReject,
(*Reject the null hypothesis H0.*)hrReject,
(*Could not calculate the null hypothesis result. This usually means the test is NOT
suitable for your data.*)hrNan
);
(*Defines type of Principal Component Analysis (PCA).
Defines whether the PCA analysis is to be run on a correlation or covariance matrix.
Normally, the analysis is run on the scale-invariant correlation matrix since the
scale of the variables changes the analysis when the covariance matrix is used.
*)
TPCAMode = (
(*Analysis is run on the correlation matrix (supported by PCA).*)PCACorrMat,
(*Analysis is run on the covariance matrix (supported by PCA and FA).*)PCACovMat,
(*Analysis is run on the raw data matrix (supported by PCA and FA).*)PCARawData
);
(*Defines methods for calculating percentile.*)
TPercentileMethod = (
(*The 100pth percentile is computed as Zp = (1-d)X[k] + dX[k+1]
where k+1 equals the integer part of P(n+1), d is the fractional part of p(n+1), and X[k-1] is the kth observation when the data are sorted from lowest to highest.*)
pctMethodNPlus=0,
(*The 100pth percentile is computed as Zp = (1-d)X[k] + dX[k+1]
where k+1 equals the integer part of P(n-1)+1, d is the fractional part of p(n+1), and X[k-1] is the kth observation when the data are sorted from lowest to highest.*)
pctMethodNMinus=1,
(*The 100pth percentile is computed as Zp = X[k]
where k+1 equals the integer that is closest to np and X[k-1] is the kth observation when the data are sorted from lowest to highest.*)
pctMethodClosestN=2,
(*The 100pth percentile is computed as Zp = X[k]
where k+1 equals the integer part of np if np is exactly an integer or the integer part of np+1 if np is not exactly an integer.
X[k-1] is the kth observation when the data are sorted from lowest to highest. Note that EDF stands for empirical distribution function.*)
pctMethodEDF=3,
(*The 100pth percentile is computed as Zp = 0.5*(X[k1] + X[k2])
where k1 and k2 are defined as follows: If np is an integer, k1=k2=np. If np is not exactly an integer, k1 equals the integer part of np and k2 = k1+1.
X[k-1] is the kth observation when the data are sorted from lowest to highest. Note that EDF stands for empirical distribution function.*)
pctMethodEDFAve=4
);
(*Defines Hotelling T2 test type.*)
THotellingT2Type = (
(*One sample Hotelling T2 test.*)htT2OneSample,
(*Two sample paired Hotelling T2 test.*)htT2Paired,
(*Two sample unpaired Hotelling T2 test (test homoskedasticity).*)htT2Homoskedasticity,
(*Two sample unpaired Hotelling T2 test (test heteroskedasticity).*)htT2Heteroskedasticity
);
(*Defines method for pairwise distance calculation.*)
TPWDistMethod = (
(*Euclidean distance (L2 norm).*)pwdistEuclidian,
(*The "City block" distance (L1 norm).*)pwdistCityBlock,
(*The Chebychev distance.*)pwdistChebychev,
(*Custom pairwise distance calculation.*)pwdistCustom
);
(*Defines the criterion for measuring the improvement of Latin Hypercube DOE.*)
TLHCImprove = (
(*No improvement.*)lhcImproveNone,
(*Base criteria on maximized minimum distance between points.*)lhcImproveMinDist
);
(*Defines the rotation method.*)
TMatrixRotation = (
(*No rotation.*)rotNone,
(*Varimax rotation.*)rotVarimax,
(*Orthomax rotation.*)rotOrtomax,
(*Quartimax rotation.*)rotQuartimax,
(*Equamax rotation.*)rotEquamax
);
(*Calculate parameters for Beta distributed values using MLE.*)
procedure BetaFit(const X: TVec; out A, B: double; MaxIter: Integer = 500; Tolerance: double = 1e-8); overload;
(*Calculate parameters for Beta distributed values.
Stores data which is assumed to be Beta distributed.
Return Beta distribution parameter estimator a.
Return Beta distribution parameter estimator b.
Maximum number of iterations needed for deriving a and b.
Defines the acceptable tolerance for calculating a and b.
a (1-Alpha)*100 percent confidence interval.
b (1-Alpha)*100 percent confidence interval.
Confidence interval percentage.
The following example generates 100 random Beta distributed
values and then uses BetaFit routine to extract used a and b parameters:
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples;
{
private void Example()
{
Vector v = new Vector(1000, false);
// first, generate 1000 randomly beta distributed
// numbers with parameters a = 3 and b = 2
Random rnd = new Random();
StatRandom.RandomBeta(3.0, 2.0, v, rnd.Next());
double esta, estb;
double[] cia = new double[2];
double[] cib = new double[2];
// Now extract the a,b and their 95% confidence intervals.
//Use at max 300 iterations and tolerance 0.001
Statistics.BetaFit(v, out esta, out estb, ref cia, ref cib, 300, 1.0e-3, 0.05);
*)
procedure BetaFit(const X: TVec; out A, B: double; var PCIA, PCIB: TTwoElmReal; MaxIter: Integer = 500; Tolerance: double = 1e-8; Alpha: double = 0.05 ); overload;
(*Calculate parameters for Cauchy distributed values.*)
procedure CauchyFit(const X: TVec; out m, b: double;MaxIter: Integer = 500; Tolerance: double=1e-8); overload;
(*Calculate the parameters and their (1-Alpha) confidence invervals for Cauchy distributed values.*)
procedure CauchyFit(const X: TVec; out m, b: double; var mCI, bCI: TTwoElmReal; MaxIter: Integer = 500;
Tolerance: double=1e-8; Alpha: double = 0.05 ); overload;
(*Calculate parameters for Chi-Squared distributed values.*)
procedure ChiSquareFit(const X: TVec; out nu: Integer);
(*Calculate parameters for Erlang distributed values.*)
procedure ErlangFit(const X: TVec; out k: Integer; out lambda: double);
(*Calculate parameters for exponentialy distributed values.*)
procedure ExponentFit(const X: TVec; out mu: double);overload;
(*Calculate parameters for exponentially distributed values.
Stores data which is assumed to be exponentially distributed.
Returns exponential distribution parameter estimator.
Mu (1-Alpha)*100 percent confidence interval.
Confidence interval percentage.
The following example generates 100 random standard exponentially distributed
values and then uses ExponentFit routine to extract used Mu parameter:
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples;
{
private void Example()
{
Vector vec1 = new Vector(1000,false);
// first, generate 1000 randomly exponentially distributed
// numbers with parameter mu=4.13, and default seed
StatRandom.RandomExponent(4.13,vec1,-1);
double estmu;
double[] cimu = new double[2];
// Now extract the mu and its 95% confidence intervals.
Statistics.ExponentFit(vec1,out estmu,out cimu, 0.05);
*)
procedure ExponentFit(const X: TVec; out mu: double; var PCIMu: TTwoElmReal; Alpha: double = 0.05);overload;
(*Calculate parameter for Fisher-F distributed values.
Stores data which is assumed to follow Fisher-F distribution.
Return Fisher-F distribution parameter estimator Nu1.
Return Fisher-F distribution parameter estimator Nu2.
Maximum number of iterations needed for deriving Nu1 and Nu2.
Defines the acceptable tolerance for calculating Nu1 and Nu2.
*)
procedure FFit(const X: TVec; out Nu1, Nu2: Integer; MaxIter: Integer = 500; Tolerance: double=1e-8);
(*Calculate parameters for Gamma distributed values using MLE.*)
procedure GammaFit(const X: TVec; out A, B: double; MaxIter: Integer = 500; Tolerance: double=1e-8); overload;
(*Calculate parameters for Gamma distributed values.
Stores data which is assumed to be Gamma distributed.
Return Gamma distribution parameter estimator A.
Return Gamma distribution parameter estimator B.
Maximum number of iterations needed for deriving a and b.
Defines the acceptable tolerance for calculating a and b.
A (1-Alpha)*100 percent confidence interval.
B (1-Alpha)*100 percent confidence interval.
Confidence interval percentage.
The following example generates 100 random Gamma distributed
values and then uses GammaFit routine to extract used a and b parameters
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples;
{
private void Example()
{
Vector vec1 = new Vector(0);
// first, generate 1000 randomly gamma distributed
// numbers with parameters a=0.5 and b =1.2
vec1.Size(1000, false);
StatRandom.RandomGamma(0.5, 1.2, vec1, -1);
// Now extract the a,b and their 95% confidence intervals.
// Use at max 400 iterations and tolerance 0.0001
double resA, resB;
double[] CIA = new double[2];
double[] CIB = new double[2];
Statistics.GammaFit(vec1, out resA, out resB, out CIA, out CIB, 400, 1e-4, 0.05);
}
}
*)
procedure GammaFit(const X: TVec; out A, B: double; var PCIA, PCIB: TTwoElmReal; MaxIter: Integer = 500; Tolerance: double=1e-8; Alpha: double = 0.05 );overload;
(*Calculate parameters for inverse Gaussian distributed values.*)
procedure InverseGaussianFit(const X: TVec; out mu, lambda: double); overload;
(*Mu and lambda for inverse Gaussian distributed values.*)
procedure InverseGaussianFit(const X: TVec; out mu, lambda: double; var muConfInt, lambdaConfInt: TTwoElmReal; Alpha: double = 0.05 );overload;
(*Calculate parameters for Laplace distributed values.
Stores data which is assumed to be Laplace distributed.
Return Laplace distribution parameter estimator M.
Return Laplace distribution parameter estimator b.
The following example generates 100 random Laplace distributed
values and then uses LaplaceFit routine to extract used Mu and b parameters
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples;
{
private void Example()
{
Vector Data = new Vector(100);
StatRandom.RandomLaplace(3, 0.2, Data,-1);
double mu, b;
Statistics.LaplaceFit(Data, out mu, out b);
// mu approx 3.0
// b approx 0.2
}
}
*)
procedure LaplaceFit(const X: TVec; out mu, b: double);overload;
(*Calculate parameters for Logistic distributed values.
Stores data which is assumed to follow logistic distribution.
Return Logistic distribution parameter estimator M.
Return Logistic distribution parameter estimator b.
The following example generates 100 random logistic distributed
values and then uses LogisticFit routine to extract used m and b parameters
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples;
{
private void Example()
{
Vector Data = new Vector(100);
StatRandom.RandomLogistic(3,0.2,Data, -1);
double m, b;
Statistics.LogisticFit(Data, out m, out b);
// m approx 3.0
// b approx 0.2
}
}
*)
procedure LogisticFit(const X: TVec; out m, b: double);
(*Calculate parameters for log-normally distributed values.*)
procedure LogNormalFit(const X: TVec; out mu, sigma: double);overload;
(*Calculate parameters for log-normally distributed values.
Stores data which is assumed to be log-normally distributed.
Return log-normal distribution parameter estimator Mu.
Return log-normal distribution parameter estimator Sigma.
Mu (1-Alpha)*100 percent confidence interval.
Sigma (1-Alpha)*100 percent confidence interval.
Confidence interval percentage.
The following example generates 100 random log-normally distributed
values and then uses LogNormalFit routine to extract used Mu and Sigma parameters
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples;
{
private void Example()
{
Vector Data = new Vector(0);
Data.Size(100, false);
StatRandom.RandomLogNormal(3, 0.2, Data, -1);
double mu, sigma;
double[] MuCI = new double[2];
double[] SigmaCI = new double[2];
Statistics.LogNormalFit(Data, out mu, out sigma, out MuCI, out SigmaCI, 0.05);
// mu approx 3.0
// sigma approx 0.2
}
}
*)
procedure LogNormalFit(const X: TVec; out mu, sigma:double; var PCIMu, PCISigma: TTwoElmReal; Alpha: double = 0.05 );overload;
(*Calculate parameters for Maxwell distributed values.
Stores data which is assumed to follow the Maxwell distribution.
Return Maxwell distribution parameter estimator a.
*)
procedure MaxwellFit(const X: TVec; out a: double); overload;
(*Calculate parameters for normally distributed values.*)
procedure NormalFit(const X: TVec; out mu, sigma: double);overload;
(*Calculate parameters for normally distributed values.
Stores data which is assumed to be normally distributed.
Return normal distribution parameter estimator Mu.
Return normal distribution parameter estimator Sigma.
Mu (1-Alpha)*100 percent confidence interval.
Sigma (1-Alpha)*100 percent confidence interval.
Confidence interval percentage.
The following example generates 100 random standard normally distributed
values and then uses NormalFit routine to extract used Mu and Sigma parameters
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples;
{
private void Example()
{
Vector vec1 = new Vector(0);
// first, generate 1000 normally distributed
// numbers with Mu = 0.0 and Sigma = 1.0
vec1.Size(1000, false);
StatRandom.RandomNormal(0.0, 1.0, vec1, -1);
double resMu, resSigma;
double[] CIMu = new double[2];
double[] CISigma = new double[2];
// Now extract the Mu,Sigma and their 95% confidence intervals.
// Use at max 400 iterations and tolerance 0.0001
Statistics.NormalFit(vec1, out resMu, out resSigma, out CIMu, out CISigma, 0.05);
}
}
*)
procedure NormalFit(const X: TVec; out mu, sigma:double; var PCIMu, PCISigma: TTwoElmReal; Alpha: double = 0.05 );overload;
(*Calculate parameter for Student-T distributed values.
Stores data which is assumed to follow Student-T distribution.
Return Student-T distribution parameter estimator Nu.
Maximum number of iterations used when deriving Nu.
Defines the acceptable tolerance for calculating Nu.*)
procedure StudentFit(const X: TVec; out Nu: Integer; MaxIter: Integer = 500; Tolerance: double=1e-8);
(*Calculate parameters for Rayleigh distributed values.*)
procedure RayleighFit(const X: TVec; out b: double); overload;
(*Calculate parameters for Rayleigh distributed values.
Stores data which is assumed to be Rayleigh distributed.
Returns Rayleigh distribution parameter estimator.
b (1-Alpha)*100 percent confidence interval.
Confidence interval percentage.
The following example generates 100 random Rayleigh distributed
values and then uses RayleighFit routine to extract used b parameter
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples;
{
private void Example()
{
Vector vec1 = new Vector(0);
// first, generate 1000 randomly Rayleigh distributed
// numbers with b=1.3
vec1.Size(1000,false);
StatRandom.RandomRayleigh(1.3,vec1,-1);
// Now extract the b and its 95% confidence interval.
double resb;
double[] CIb = new double[2];
Statistics.RayleighFit(vec1,out resb,out CIb,0.05);
}
}
*)
procedure RayleighFit(const X: TVec; out b: double; var PCIb: TTwoElmReal; Alpha: double = 0.05 );overload;
(*Calculate parameters for Triangular distributed values.
Stores data which is assumed to follow triangular distribution.
Return Triangular distribution parameter estimator a.
Return Triangular distribution parameter estimator b.
Return Triangular distribution parameter estimator c.
Maximum number of iterations needed for deriving a, b and c.
Defines the acceptable tolerance for calculating a, b and c.*)
procedure TriangularFit(const X: TVec; out a, b,c: double;MaxIter: Integer = 500; Tolerance: double=1e-8);
(*Calculate parameters for uniformly distributed values.*)
procedure UniformFit(const X: TVec; out low, high: double);overload;
(*Calculate parameters for uniformly distributed values.
Stores data which is assumed to be uniformly distributed.
Return uniform distribution parameter estimator Low.
Return uniform distribution parameter estimator High.
Low (1-Alpha)*100 percent confidence interval.
High (1-Alpha)*100 percent confidence interval.
Confidence interval percentage.
The following example generates 100 random uniformly distributed
values and then uses UniformFit routine to extract used Low and High parameters
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples;
{
private void Example()
{
Vector vec1 = new Vector(1000,false);
// first, generate 1000 randomly uniformly distributed
// numbers with parameters Low=1.5 and High=2.2
StatRandom.RandomUniform(1.5,2.2,vec1,-1);
// Now extract the Low,High and their 95% confidence intervals.
double resLow,resHigh;
double[] CILow = new double[2];
double[] CIHigh = new double[2];
Statistics.UniformFit(vec1,out resLow,out resHigh,out CILow,out CIHigh, 0.05);
}
}
*)
procedure UniformFit(const X: TVec; out low, high : double; var PCILow, PCIHigh: TTwoElmReal; Alpha: double = 0.05 );overload;
(*Calculate parameters for Weibull distributed values using MLE.*)
procedure WeibullFit(const X: TVec; out A, B: double; MaxIter: Integer = 500; Tolerance: double=1e-8); overload;
(*Calculate parameters for Weibull distributed values.
Stores data which is assumed to be Weibull distributed.
Return Weibull distribution parameter estimator A.
Return Weibull distribution parameter estimator B.
Maximum number of iterations needed for deriving a and b.
Defines the acceptable tolerance for calculating a and b.
A (1-Alpha)*100 percent confidence interval.
B (1-Alpha)*100 percent confidence interval.
Confidence interval percentage.
The following example generates 1000 random Weibull distributed
values and then uses WeibullFit routine to extract used a and b parameters
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples;
{
private void Example()
{
Vector vec1 = new Vector(1000,false);
// first, generate 1000 randomly Weibull distributed
// numbers with parameters a=0.5 and b=1.2
StatRandom.RandomWeibull(0.5,1.2,vec1,-1);
// Now extract the a,b and their 95% confidence intervals.
// Use at max 400 iterations and tolerance 0.0001
double resA, resB;
double[] CIA = new double[2];
double[] CIB = new double[2];
Statistics.WeibullFit(vec1,out resA,out resB,out CIA,out CIB,400,1e-4,0.05);
}
}
*)
procedure WeibullFit(const X: TVec; out A, B: double; var PCIA, PCIB: TTwoElmReal; MaxIter: Integer = 500; Tolerance: double=1e-8; Alpha: double = 0.05 ); overload;
(*Calculate parameters for binomial distributed values.*)
procedure BinomFit(const X: TVec; N: double; const p: TVec);overload;
procedure BinomFit(const X: TVec; const N: TVec; const p: TVec);overload;
(*Calculate parameters for binomial distributed values.
Stores data which is assumed to be binomial distributed.
Defines binomial distribution n parameter.
Returns binomial distribution p parameter, estimated at each X value.
p (1-Alpha)*100 percent confidence intervals. Each pair of Low
and High values defines lower and upper CI for corresponding p value.
Confidence interval percentage.
The following example generates 100 binomial distributed values and then
uses BinomFit routine to extract the p parameter
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples;
{
private void Example()
{
Vector vec1 = new Vector(1000, false);
Vector LowInt = new Vector(0);
Vector HighInt = new Vector(0);
Vector p = new Vector(0);
StatRandom.RandomBinom(35, 0.25, vec1, -1);
// Now extract p-s and its 100*(1-Alpha)
// confidence intervals.
Statistics.BinomFit(vec1, 35, p, LowInt, HighInt, 0.05);
// vector p holds the p for each element in vec1,
// vectors LowInt and HighInt hold the lower and upper conf. int. limit
// for each element in p
}
}
*)
procedure BinomFit(const X: TVec; N: double; const p, Low, High: TVec; Alpha: double = 0.05 );overload;
procedure BinomFit(const X: TVec; out N: Integer; out p: double); overload;
(*Calculate parameters for geometrically distributed values.*)
procedure GeometricFit(const X: TVec; out P: double); overload;
(*Calculate parameters for geometrically distributed values.
Stores data which is assumed to be geometrically distributed.
Returns geometric distribution parameter estimator.
P (1-Alpha)*100 percent confidence interval.
Confidence interval percentage.
The following example generates 1000 random Geometric distributed
values and then uses GeometricFit routine to extract used P parameter
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples;
{
private void Example()
{
Vector vec1 = new Vector(1000, false);
// first, generate 1000 randomly geometr. distributed
// numbers with parameter p=0.713
StatRandom.RandomGeometric(0.713, vec1, -1);
// Now, extract the p and its 100*(1-0.05) = 95%
// confidence interval
double resp;
double[] CIp = new double[2];
Statistics.GeometricFit(vec1, out resp, out CIp, 0.05);
}
}
*)
procedure GeometricFit(const X: TVec; out P: double; var PCI: TTwoElmReal; Alpha: double = 0.05); overload;
(*Calculate parameters for negative binomial distributed values.*)
procedure NegBinomFit(const X: TVec; out r, p: double);
(*Calculate parameters for Poisson distributed values.*)
procedure PoissonFit(const X: TVec; out lambda: double);overload;
(*Calculate parameters for Poisson distributed values.
Stores data which is assumed to be Poisson distributed.
Returns Poisson distribution parameter estimator.
Lambda (1-Alpha)*100 percent confidence interval.
Confidence interval percentage.
The following example generates 500 random Poisson distributed
values and then uses PoissonFit routine to extract used Lambda parameter
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples;
{
private void Example()
{
Vector vec1 = new Vector(500,false);
// first, generate 500 randomly Poiss. distributed
// numbers with parameter lambda=1.17
StatRandom.RandomPoisson(1.17,vec1,-1);
// Now, extract the lambda and its 95%
// confidence interval
double resLambda;
double[] CILambda = new double[2];
Statistics.PoissonFit(vec1,out resLambda,out CILambda,0.05);
}
}
*)
procedure PoissonFit(const X: TVec; out lambda: double; var lambdaConfInt: TTwoElmReal; Alpha: double = 0.05 );overload;
(*Calculate parameters for discrete uniformly distributed values.*)
procedure UniformDFit(const X: TVec; out N: Integer); overload;
(*Calculate parameters for discrete uniformly distributed values.
Stores data which is assumed to be discrete uniformly distributed.
Returns discrete uniform distribution parameter estimator.
N (1-Alpha)*100 percent confidence interval.
Confidence interval percentage.
The following example generates 1000 random discrete uniform distributed
values and then uses UniformDFit routine to extract used N parameter:
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples;
{
private void Example()
{
Vector vec1 = new Vector(1000, false);
// first, generate 1000 randomly disc.uniformly distributed
// numbers with parameter N = 12
StatRandom.RandomUniformD(12, vec1, -1);
// Now, extract N and its 95%
// confidence interval
int resN;
double[] CIN = new double[2];
Statistics.UniformDFit(vec1, out resN, out CIN, 0.05);
}
}
*)
procedure UniformDFit(const X: TVec; out N: Integer; var NConfInt: TTwoElmReal; Alpha: double = 0.05);overload;
(*Covariance/variance.
Defines sample (variable) values (observables). In this
case X is treated as row and not (as normally) column vector.
Returns the covariance (in this case equal to variance) for X vector elements.
Because in this case X is represented as a row vector, the result is simply the scalar value
E(X(T)*X)-E(X(T))E(X) = Var(X).
If true (default value), the result will be normalized with number of observations
(N), otherwise it will be normalized with N-1.
The covariance between two real-valued random variables x and y,with expected
values E(x)=mu and E(y)=nu is defined as:
where E(x), E(y) are x and y expected values.
For more info about covariance definition and properties check the following links:
1. http://mathworld.wolfram.com/Covariance.html
2. http://en.wikipedia.org/wiki/Covariance
*)
procedure Covariance(const X: TVec; out aResult: double; NormN: boolean = true ); overload;
(*Calculate the variance-covariance matrix (Result), assuming vectors X and Y are two variable and their elements are the observations.
X and Y can be two vectors or matrices of equal size. In the first case the two vectors are treated as two variables, X values as first
variable observables, Y vector values as second variable observables. In the second case the two matrices are treated as two
variables X and Y, all X values as X variable observables and
all Y values as Y variable observables.
For column-vector valued random variables X and Y with respective expected values mu and nu,
and respective scalar components m and n, the covariance is defined to be the m×n matrix called the
covariance matrix:
Calculate the covariance matrix from two vectors representing two variables.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector data1 = new Vector(0);
Vector data2 = new Vector(0);
Matrix cov = new Matrix(0,0);
data1.SetIt(false,new double[] {1.2,3});
data2.SetIt(false,new double[] {5,5.5});
Statistics.Covariance(data1,data2,cov,false);
// cov = [1.62, 0.45,
// 0.45, 0.125]
}
}
*)
procedure Covariance(const X,Y: TDenseMtxVec; const aResult: TMtx; NormN: boolean = true); overload;
(*Calculate the covariance matrix (Result), assuming matrix X columns are variables and its rows are observations.
By definition the covariance matrix is a matrix of covariances between elements of a vector. It is the natural
generalization to higher dimensions of the concept of the variance of a scalar-valued random variable.
If X columns represent observation samples (variables), its rows sample values (observables), muj
Xj j-th column average value, then the covariance matrix is defined as:
or in matrix form:
where E is the expected value. The inverse of this matrix, is called the inverse covariance matrix or the
precision matrix.
Note
This version does all necessary calculations to calculate covariance matrix.
*)
procedure Covariance(const X: TMtx; const aResult: TMtx; NormN: boolean = true); overload;
(*Pearson correlation coefficients.
Defines first sample (variable) values (observables).
Defines second sample (variable) values (observables).
Returns Pearson correlation coefficients between samples
X and Y. Size of Result is adjusted automatically.
Calculates Pearson correlation coefficients between X and Y representing two samples/variables
with X and Y Values being treated as observables.
If X and Y are two matrices, the routine calculates Pearson correlation coefficients rx,y between X and Y matrix.
The algorithm assumes X and Y are two samples (variables) and all X values are treated as its observables (same goes for Y).
Correlation coefficient
The correlation coefficient rho between two random variables is defined by the following equation:
where x,y are two variables, Cov covariance betwen x and y, sigma(s) their expected standard
deviations and E-s their expected values. If the variables are independent then the correlation is 0, but the converse is not true
because the correlation coefficient detects only linear dependencies between two variables.
Sample correlation coefficients.
If we have a series of n measurements of X and Y, then the Pearson product-moment
correlation coefficient can be used to estimate the correlation of X and Y. The Pearson coefficient is also
known as the "sample correlation coefficient". The Pearson correlation coefficient is then the best estimate
of the correlation of X and Y .
Note
The correlation is defined only if both of the standard deviations are finite and both of them are nonzero.
Calculate correlation coefficients from Data1 and Data2 representing two variables.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector data1 = new Vector(0);
Vector data2 = new Vector(0);
Matrix corr = new Matrix(0,0);
data1.SetIt(false, new double[] {1,2,3});
data2.SetIt(false, new double[] {5, 5.5, 1.0} );
Statistics.CorrCoef(data1,data2,corr);
// corr = [1.00000000, -0.81088485,
// -0.81088485, 1.00000000]
}
}
*)
procedure CorrCoef(const X,Y: TDenseMtxVec; const aResult: TMtx); overload;
(*Pearson correlation coefficients.
Additionally returns also the Students t-distribution value as an
indicator of statistical significance.
*)
procedure CorrCoef(const X,Y: TDenseMtxVec; const aResult: TMtx; var tValue: double); overload;
(*Pearson correlation coefficients between matrix rows and cols.
This version calculates Pearson correlation coefficients rx,y between X matrix
rows and cols. X columns are treated as samples (variables) and rows as values (observables).
*)
procedure CorrCoef(const X: TMtx; const aResult: TMtx); overload;
(*Histogram.
Divide the Data vector elements into intervals, specified by the Bins vector. The Bins
elements define the center points for the intervals. The Bins elements must be sorted
in ascending order. The number of elements falling in each interval is counted and the
result for each interval (frequency distribution) is written to the Results vector.
The Length and Complex properties of the Results vector are adjusted automatically.
Note
Use this version if you need non-equidistant histogram.
This example constructs 4 unequal bins and counts the frequencies.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector data = new Vector(0);
Vector bins = new Vector(0);
Vector freq = new Vector(0);
data.SetIt(false, new double[] {1,2,3,4,5,6,7,8,9,10});
// Centerpoints, note that values are sorted!
bins.SetIt(false, new double[] {1.5, 2.0, 6.0, 9.0});
Statistics.Histogram(data,bins,freq);
// Freq holds the count of elements in each bin
}
}
*)
procedure Histogram(const Data, Bins: TVec; const Results: TVec);overload;
(*Divide the Data vector elements into NumBins equal intervals.
The number of elements falling in each interval is counted and the result for each interval
(frequency distribution) is written to the Results vector. if CenterBins is true then
Bins will store bins center points. If CenterBins is false, Bins will store bins
edges.The Length and Complex properties of the Results and Bins vectors are
adjusted automatically.
Note
Use this version if you need equidistant histogram.
Equal bins -> faster algorithm.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector Data = new Vector(1000);
Vector Bins = new Vector(0);
Vector Freq = new Vector(0);
StatRandom.RandomNormal(-3,0.2,Data,-1); // generate some normally distributed data
// Now, divide the data into 15 equal intervals
Statistics.Histogram(Data,15,Freq,Bins,true);
// Bins holds the 15 intervals centerpoints
// and Freq holds the count of elements in each bin
}
}
*)
procedure Histogram(const Data: TVec; const NumBins: Integer; const Results: TVec; const Bins: TVec; CenterBins: boolean = false); overload;
(*Same as above, but here only values between Min and Max parameters will be counted.
*)
procedure Histogram(const Data: TVec; const NumBins: Integer; const Results: TVec; const Bins: TVec; CenterBins: boolean; Max,Min: double); overload;
(*Divide the Data vector elements into NumBins equal intervals.
The number of elements falling in each interval is counted and the result for each interval
(frequency distribution) is written to the Results vector. if CenterBins is true then
Bins will store bins center points. If CenterBins is false, Bins will store bins
edges.The Length and Complex properties of the Results and Bins vectors are
adjusted automatically.
*)
procedure Histogram(const Data: TVec; const NumBins: Integer; const Results: TVecInt; const Bins: TVec; CenterBins: boolean = false); overload;
(*Same as above, but here only values between Min and Max parameters will be counted.
*)
procedure Histogram(const Data: TVec; const NumBins: Integer; const Results: TVecInt; const Bins: TVec; CenterBins: boolean; Max,Min: double); overload;
(*InterQuartile range.
the interquartile range of all X vector elements. The interquartile range is defined as
the difference between the 75th percentile and the 25th percentile of given values.
Data. An exception is raised if X is complex.
Defines one of the available methods for calculating percentile. Software packages use
different methods to calculate percentiles. For example, Excel uses pctMethodNMinus, while on the other hand,
NCSS's default method is pctMethodNPlus.
Calculate the IQR for given values.
using Dew.Math;
using Dew.Stats.Units;
using Dew.Stats;
namespace Dew.Examples
{
private void Example()
{
Vector a = new Vector(0);
a.SetIt(false, new double[] {2,0.1,3,4});
double res = Statistics.InterQuartile(a,TPercentileMethod.pctMethodNMinus); // Res = 1.725 //Excel convention
}
}
*)
function InterQuartile(const X: TVec; Method: TPercentileMethod= pctMethodNPlus): double;
(*Geometric mean.
the geometric mean of given values.
Data. An exception is raised if Data is complex.
Calculate the geometric mean for given values.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector a = new Vector(0);
a.SetIt(false, new double[] {2,0.1,3,4});
double MG = Statistics.MeanG(a);
// MG = 1.24466595457696
}
}
*)
function MeanG(const Data: TVec): double;
(*Harmonic mean.
the harmonic mean of given values.
Data. An exception is raised if Data is complex.
Calculate the harmonic mean for given values.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector a = new Vector(0);
a.SetIt(false, new double[] {2,3,5,4});
double MH = Statistics.MeanH(a);
// MH = 3.11688311688312
}
}
*)
function MeanH(const Data: TVec): double;
(*Trimmed mean.
the mean of all Data vector elements, excluding highest and lowest 100*Alpha/2 percents of value.
Data. An exception is raised if Data vector is complex.
Defines the percent of value excluded from mean calculation. If Alpha parameter is omitted,
the default value 0.05 is used (meaning highest 2.5% and lowest 2.5% of data will
be excluded). The Alpha parameter must lie on the interval [0,1) otherwise an exception is raised.
*)
function MeanT(const Data: TVec; Alpha: double = 0.05): double;
(*Mode of all Src vector elements.
The mode of a data sample is the element that occurs most often in the collection.
For example, the mode of the sample [1, 3, 6, 6, 6, 6, 7, 7, 12, 12, 17] is 6.
Given the list of data [1, 1, 2, 4, 4] the mode is not unique, unlike the
arithmetic mean.
Calculate the mode of sample data [1,5,2,11,5,3].
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector a = new Vector(0);
a.SetIt(false, new double[] {1,5,2,11,5,3});
double md = Statistics.Mode(a); // md = 5
}
}
*)
function Mode(const Src: TVec): double; overload;
(*Mode of Src vector elements [SrcIndex..SrcIndex+Len].*)
function Mode(const Src: TVec; SrcIndex: Integer; Len: Integer): double; overload;
(*Moments.
the AOrder-th central moment of all X vector elements.
Data. An exception is raised if X vector is complex.
A mean value of all X vector elements.
Defines the order of central moment.
Calculate the fourth moment (related to Kurtosis)
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector a = new Vector(0);
a.SetIt(false, new double[] {1, 2, 7, 8});
double amean = a.Mean();
double m4 = Statistics.Moment(a,amean,4);
// m4 = 94.5625
}
}
*)
function Moment(const X: TVec; AMean : double; const AOrder: Integer): double;
(*Cumulative histogram.
Divide the Data vector elements into intervals, specified by the Bins vector. The Bins elements
define the center points for the individual intervals. The Bins elements must be sorted in
ascending order. The number of elements falling in each interval is counted and the relative
cumulative frequency for each interval is written to the Results vector. The Length and
Complex properties of the Results vector are adjusted automatically.
Note
Use this version if you need non-equidistant histogram.
Unequal bins -> slower than the equidistant-bins algorithm.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector Data = new Vector(0);
Vector Bins = new Vector(0);
Vector CumRelFreq = new Vector(0);
Data.SetIt(false, new double[] {1,2,3,4,5,6,7,8,9,10});
// define centerpoints, note that values are sorted!
Bins.SetIt(false, new double[] {1.5, 2, 6, 9});
Statistics.CumulativeHist(Data,Bins, CumRelFreq);
// CumRelFreq holds the relative cumulative count of
// elements in each bin.
}
}
*)
procedure CumulativeHist(const Data, Bins: TVec; const Results: TVec);overload;
(*Divide the Data vector elements into NumBins equal intervals.
The number of elements falling in each interval is counted and the relative cumulative frequency for each interval
is written to the Results vector. if CenterBins is true then Bins will store bins center points.
If CenterBins is false, Bins will store bins edges. The Length and Complex properties of the
Results and Bins vectors are adjusted automatically.
Note
Use this version if you need equidistant cumulative histogram.
Equal bins -> faster algorithm.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector Data = new Vector(1000, false);
Vector Bins = new Vector(0);
Vector CumRelFreq = new Vector(0);
// generate some normally distributed data
StatRandom.RandomNormal(-3.0, 0.2, Data, -1);
Statistics.CumulativeHist(Data, 15, CumRelFreq, Bins, false);
// Bins holds the 15 intervals centerpoints
// and CumRelFreq holds the relative cumulative count of
// elements in each bin.
}
}
*)
procedure CumulativeHist(const Data: TVec; const NumBins: Integer; const Results: TVec; const Bins: TVec; CenterBins: boolean = false);overload;
(*Percentile.
the 100*P th percentile of all X vector elements. The 100*P -th percentile is
defined as the value that is greater than 100*P percent of the values in X.
Data. An exception is raised if X is complex.
Defines the 100*p percentile. The parameter P must lie in the interval (0,1), otherwise an exception is raised.
Defines one of the available methods for calculating percentile. Software packages use
different methods to calculate percentiles. For example, Excel uses pctMethodNMinus, while on the other hand,
NCSS's default method is pctMethodNPlus.
Calculate the 2nd quantile location = median position.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector a = new Vector(0);
double pctile;
a.SetIt(false, new double[] {1,2,3,10,15,21});
pctile = Statistics.Percentile(a,0.75,TPercentileMethod.pctMethodNPlus); // = 16.5
pctile = Statistics.Percentile(a,0.75, TPercentileMethod.pctMethodNMinus); // = 13.75
pctile = Statistics.Percentile(a,0.75,TPercentileMethod.pctMethodClosestN); // = 10
}
}
*)
function Percentile(const X: TVec; P : double; Method: TPercentileMethod = pctMethodNPlus): double; overload;
(*Percentile (several percentiles in one go).
This overload calculates percentiles for all P elements in one pass. The advantage of this overload is X sample values only
have to be sorted once.
procedure Example;
var Data: Vector;
P, Pct: Array of double;
begin
Data.LoadFromFile('c:\temp\Examples\Data\Percentile.vec');
SetLength(P,4);
SetLength(Pct,4);
Percentile(Data,P,Pct);
end;
*)
procedure Percentile(const X: TVec; const P: Array of double; var Percentiles: Array of double; Method: TPercentileMethod = pctMethodNPlus); overload;
(*Range for matrix rows.
Calculates the range for each Src matrix row and stores the results in Result vector.
Length and Complex properties of Result vector are adjusted automatically.
*)
procedure RangeRows(const Src: TMtx; const aResult: TVec);
(*Range for matrix columns.
Calculates the range for each Src matrix column and stores the results in Result vector.
Length and Complex properties of Result vector are adjusted automatically.
*)
procedure RangeCols(const Src: TMtx; const aResult: TVec);
(*Tied ranks for vector elements.
Defines data for rank calculation.
Returns data ranks for all elements. If some data elements are tied, the average ranks will be returned.
The adjustment for ties.
Calculates the ranks of elements in Data vector elements. If some data elements are
tied, routine calculates their average rank.
*)
procedure TiedRanks(const Data: TVec; const rks: TVec; out Adjustment: double);
(*Rank vector elements (no tie adjustment).
Defines data for rank calculation.
Returns data ranks for all elements.
Ranks all Data vector elements with no adjustment for ties.
*)
procedure Ranks(const Data: TVec; const rks: TVec);
(*Finds the unique elements in vector/matrix.
Defines data unique elements search.
Returns unique elements. Size and properties
of Dst are adjusted automatically.
If specified, stores the count of each unique element in the dataset.
Finds and sorts the unique elements in Src vector/matrix array. Sorted values are returned
in Dst vector. An exception is raised if Src is Complex. Length and Complex properties of
Dst vector are adjusted automatically.
Collect unique elements from matrix.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Matrix m1 = new Matrix(0,0);
Vector uvec = new Vector(0);
m1.SetIt(4,3,false, new double[]
{ 1,2,3,
4,2,1,
3,2,3,
3,3,1});
Statistics.Unique(m1,uvec,null);
// uvec elements are [1,2,3,4]
}
}
*)
procedure Unique(const Src: TMtxVec; const Dst: TVec; const Counts: TVecInt = nil);
(*Two sample Mann-Whitney test.
First sample dataset.
Second sample dataset.
Returns the result of the null hypothesis (default assumption is that there is no difference between samples).
Returns the 100*(1-Alpha) percent confidence interval for Mann-Whitney test. Valid only if normal approximation
is used to calculate significance level.
Defines the type of the null hypothesis (left, right and two - tailed).
Defines the desired significance level. If the significance probability (Signif)
is below the desired significance (Alpha), the null hypothesis is rejected.
If either of the sample sizes is greater than 8, a z-value can be used to approximate the significance level
for the test. In this case, the calculated z is compared to the standard normal significance levels. Set NormalApprox to true
if you want normal approximation for the significance level.
(Significance level) returns the probability of observing the given result by chance given that the null hypothesis is true.
The Mann-Whitney statistics (U, the smallest value of U1 and U2).
Performs two sample Mann-Whitney test. This is a method for the comparison of two independent random samples (Data1 and Data2).
The Mann Whitney U statistic is defined as:
U1 = n1*n2 + 0.5*n2*(n2+1) - sum (i=n1+1, n1+n2) Ri
where samples of size n1 and n2 are pooled and Ri are the ranks. Actually, there are two versions of the U statistic calculated,
where U2 = n1*n2 - U1, where n1 and n2 are the sample sizes of the two groups. The smallest of U1 or U2 is compared to the critical
value for the purpose of the test.
U can be resolved as the number of times observations in one sample precede observations in the other sample in the ranking.
Wilcoxon rank sum, Kendall's S and the Mann-Whitney U test are exactly equivalent tests. In the presence of ties the Mann-Whitney test
is also equivalent to a chi-square test for trend.
In most circumstances a two sided test is required; here the alternative hypothesis is that Data1 values tend to be distributed
differently to Data2 values. For a lower side test the alternative hypothesis is that Data1 values tend to be smaller than Data2 values.
For an upper side test the alternative hypothesis is that Data1 values tend to be larger than Data2 values.
Assumptions of the Mann-Whitney test:
- random samples from populations
- independence within samples and mutual independence between samples
- measurement scale is at least ordinal
A confidence interval for the difference between two measures of location is provided with the sample medians. The assumptions of this
method are slightly different from the assumptions of the Mann-Whitney test:
- random samples from populations
- independence within samples and mutual independence between samples
- two population distribution functions are identical apart from a possible difference in location parameters
At the end of the experimental treatment period, the subjects are individually placed in a series of claustrophobia test
situations, knowing that their reactions to these situations are being recorded on videotape. Subsequently three clinical experts,
uninvolved in the experimental treatment and not knowing which subject received which treatment, independently view the videotapes
and rate each subject according to the degree of claustrophobic tendency shown in the test situations. Each judge's rating takes place
along a 10-point scale, with 1="very low" and 10="very high"; and the final measure for each subject is the simple average of the
ratings of the three judges for that subject. In this example data for first and second group is as follows:
Group 1: 4.6, 4.7, 4.9, 5.1, 5.2, 5.5, 5.8, 6.1, 6.5, 6.5, 7.2
Group 2: 5.2, 5.3, 5.4, 5.6, 6.2, 6.3, 6.8, 7.7, 8.0, 8.1
The investigators expected first group to prove the more effective, and sure enough it is first group that appears to show
the lower mean level of claustrophobic tendency in the test situations. You might suppose that all they need to do now is plug their
data into an independent-samples t-test to see whether the observed mean difference is significant. We will use Mann-Whitney U test
to check, if this hypothesis can be rejected.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector v1 = new Vector(0);
Vector v2 = new Vector(0);
v1.SetIt(false, new double[] {4.6, 4.7, 4.9, 5.1, 5.2, 5.5, 5.8, 6.1, 6.5, 6.5, 7.2});
v2.SetIt(false, new double[] {5.2, 5.3, 5.4, 5.6, 6.2, 6.3, 6.8, 7.7, 8.0, 8.1});
double signif, UStat;
double[] ci = new double[2];
THypothesisResult hres;
// don't use normal approximation
UStat = Statistics.MannWhitneyTest(v1,v2,out hres,out signif,out ci,THypothesisType.htTwoTailed,0.05, false);
}
}
*)
function MannWhitneyTest(const Data1, Data2 : TVec; out hRes: THypothesisResult; out Signif: double;
var ConfInt: TTwoElmReal; const hType: THypothesisType = htTwoTailed; const Alpha: double = 0.05;
const NormalApprox: boolean = true): double;
(*One or two-sample Sign test.
Performs one-sample Sign test by comparing Data median value with given Median.
The null hypothesis is that Data median is equal to Median. Additional assumption
is that underlying population is continuous.
*)
function SignTest(const Data: TVec; Median: double; out hRes: THypothesisResult; out Signif: double; var ConfInt: TTwoElmReal;
const hType: THypothesisType = htTwoTailed; const Alpha: double = 0.05):double; overload;
(*Performs paired Sign test. The routine tests the null hypothesis that Data1 and Data2 have equal median value.
First set of data from which to compute the median.
Second set of data from which to compute the median.
Returns the result of the null hypothesis (default assumption is that the means are equal).
Defines the type of the null hypothesis (left, right and two - tailed).
(Significance level) returns the probability of observing the given result by chance
given that the null hypothesis is true.
Returns the 100*(1-Alpha) percent confidence interval for the mean.
Defines the desired significance level. If the significance probability (Signif)
is below the desired significance (Alpha), the null hypothesis is rejected.
The Sign test statistics.
In this example we'll use paired sign test to verify if both sample medians are equal.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector d1 = new Vector(0);
Vector d2 = new Vector(0);
d1.SetIt(false, new double[] {1,4,5,6,7,12});
d2.SetIt(false, new double[] {3,3.5,2,3.1,10,9});
double signif, SStat;
double[] ci = new double[2];
THypothesisResult hres;
// perform the two-sample (paired) sign test
SStat = Statistics.SignTest(d1,d2,out hres,out signif,out ci,THypothesisType.htTwoTailed,0.05);
}
}
*)
function SignTest(const Data1, Data2 : TVec; out hRes: THypothesisResult; out Signif: double; var ConfInt: TTwoElmReal;
const hType: THypothesisType = htTwoTailed; const Alpha: double = 0.05):double; overload;
(*One or two-sample Wilcoxon signed rank test.
Performs one-sample Wilcoxon Signed-Rank test by comparing Data median value with given Median.
The null hypothesis is that Data median is equal to Median.
*)
function WilcoxonSign(const Data: TVec; const Median: double; out hRes: THypothesisResult;
out Signif : double; var ConfInt: TTwoElmReal; const hType: THypothesisType = htTwoTailed; const Alpha: double = 0.05):double; overload;
(*Performs two-sample Wilcoxon Signed-Rank test by comparing Data1 and Data2 medians.
First set of data from which to compute the median.
Second set of data from which to compute the median.
Returns the result of the null hypothesis (default assumption is that the means are equal).
Defines the type of the null hypothesis (left, right and two - tailed).
(Significance level) returns the probability of observing the given result by chance
given that the null hypothesis is true.
Returns the 100*(1-Alpha) percent confidence interval for the mean.
Defines the desired significance level. If the significance probability (Signif)
is below the desired significance (Alpha), the null hypothesis is rejected.
The Wilcoxon W statistics.
The routine tests the null hypothesis that Data1 and Data2 have equal median value.
*)
function WilcoxonSign(const Data1, Data2: TVec; out hRes: THypothesisResult;
out Signif : double; var ConfInt: TTwoElmReal;
const hType: THypothesisType = htTwoTailed; const Alpha: double = 0.05):double; overload;
(*One or two sample paired or pooled T-test.
Performs the one sample T-test. It compares Data mean value with the Mean.
The null hypothesis is that Data mean value is equal to the Mean.
*)
function tTest(const Data: TVec; Mean: double; out hRes: THypothesisResult;
out Signif : double; var ConfInt: TTwoElmReal ;
const hType: THypothesisType = htTwoTailed; const Alpha: double = 0.05 ):double; overload;
(*Performs the two sample pooled or paired t-test.
First set of data from which to compute the mean.
Second set of data from which to compute the mean.
Returns the result of the null hypothesis (default assumption is that the means are equal).
Defines the type of the null hypothesis (left, right and two - tailed).
If true, tTest routine will perform Two-Sample Paired t-Test. If false, tTest will perform
Two-Sample Pooled (unpaired) t-Test.
(Significance level) returns the probability of observing the given result by chance
given that the null hypothesis is true.
Returns the 100*(1-Alpha) percent confidence interval for the mean.
Defines the desired significance level. If the significance probability (Signif)
is below the desired significance (Alpha), the null hypothesis is rejected.
The T-test statistics.
It compares Data1 mean value with Data2 mean value. The assumption is Data1
and Data2 variances are equal, but unknown. The null hypothesis is that Data1 mean is equal to Data2 mean.
In this example we'll try to confirm hypothesis that first sample
mean is smaller from second sample mean.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector d1 = new Vector(0);
Vector d2 = new Vector(0);
d1.SetIt(false, new double[] {2, 3, 5, 2.5}); // mean = 3.125
d2.SetIt(false, new double[] {5, 7, 8}); // mean = 6.6666666667
double signif, TStat;
double[] ci = new double[2];
THypothesisResult hres;
// Paired = false -> pooled (unpaired) two-sample t-test
TStat = Statistics.tTest(d1,d2,out hres,out signif,out ci,false,THypothesisType.htLeftTailed,0.05);
// signif = 0.01070092300; hres = hrReject;
// ci=[ -5.702239060933, +INF ]
// Comment : Since the signif is smaller than Alpha (0.05), the
// null hypothesis (H0) that data means are equal is rejected and
// the alternative (Ha) that d1 mean is smaller than d2 mean
// CANNOT be rejected
}
}
*)
function tTest(const Data1, Data2: TVec; out hRes: THypothesisResult;
out Signif : double; var ConfInt: TTwoElmReal;
Paired: boolean = false; hType: THypothesisType = htTwoTailed; Alpha: double = 0.05):double; overload;
(*Z Test.
The data samples to be analyzed.
The standard deviation to compare the data to.
The mean value to compare the data to.
Returns the result of the null hypothesis (default assumption is that the means are equal).
Defines the type of the null hypothesis (left, right and two - tailed).
(Significance level) returns the probability of observing the given result by chance
given that the null hypothesis is true.
Returns the 100*(1-Alpha) percent confidence interval for the mean.
Defines the desired significance level. If the significance probability (Signif)
is below the desired significance (Alpha), the null hypothesis is rejected.
Z statistics.
Compares the normally distributed Data elements mean value with known standard deviation Sigma, to a mean value Mean.
We'll use Z-Test to determine if data mean and variance are equal
to specific values.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector d1 = new Vector(0);
d1.SetIt(false, new double[] { 3, 3.5, 2, 3.1, 10, 9 });
double signif, ZStat;
double[] ci = new double[2];
THypothesisResult hres;
// test H0: data mean = 5.0, with known sigma = 1.29
ZStat = Statistics.ZTest(d1, 5.0, 1.29, out hres, out signif, out ci, THypothesisType.htTwoTailed, 0.05);
// signif = 0.84940087229; hres = hrNotReject;
// ci=[4.06780398958, 6.13219601041]
// Comment : Since signif is greater than Alpha (0.05), the
// null hypothesis (H0) that the data mean is equal to 5.0 cannot be rejected;
// the alternative (Ha) that the data mean is NOT equal to 5.0 can therefore
// be rejected
}
}
*)
function ZTest(const Data: TVec; const Mean, Sigma: double; out hRes: THypothesisResult;
out Signif : double; var ConfInt: TTwoElmReal;
const hType: THypothesisType = htTwoTailed; const Alpha: double = 0.05): double;
(*Chi-Squared Test.
Data to be tested.
Standard deviation which is compared to data standard deviation.
Returns the result of the null hypothesis (default assumption is that the means are equal).
Defines the type of the null hypothesis (left, right and two - tailed).
(Significance level) returns the probability of observing the given result by chance
given that the null hypothesis is true.
Returns the 100*(1-Alpha) percent confidence interval for the mean.
Defines the desired significance level. If the significance probability (Signif)
is below the desired significance (Alpha), the null hypothesis is rejected.
Performs Chi-Squared test by comparing sample standard deviation with Sigma.
*)
procedure ChiSquareTest(const Data: TVec; Sigma: double; out hRes: THypothesisResult;
out Signif : double; var ConfInt: TTwoElmReal;
hType: THypothesisType = htTwoTailed; Alpha: double = 0.05);
(*F Test.
First data set from which to compute standard deviation.
Second data set from which to compute standard deviation.
Returns the result of the null hypothesis (default assumption is that the means are equal).
Defines the type of the null hypothesis (left, right and two - tailed).
(Significance level) returns the probability of observing the given result by chance
given that the null hypothesis is true.
Returns the 100*(1-Alpha) percent confidence interval for the mean.
Defines the desired significance level. If the significance probability (Signif)
is below the desired significance (Alpha), the null hypothesis is rejected.
The F-test statistics.
Performs F-Test by comparing Data1 and Data2 standard deviations.
*)
function FTest(const Data1, Data2: TVec; out hRes: THypothesisResult;
out Signif : double; var ConfInt: TTwoElmReal;
hType: THypothesisType = htTwoTailed; Alpha: double = 0.05):double;
(*One sample Kolmogorov-Smirnov GOF test.
Samples to be tested.
Defines set of possible x values.
Defines set of hypothesized CDF values, evaluated at CDFx.
Returns the result of the null hypothesis (default assumption is that data comes from specific distribution).
Defines the type of the null hypothesis (left, right and two - tailed).
(Significance level) returns the probability of observing the given result by chance
given that the null hypothesis is true.
Defines the desired significance level. If the significance probability (Signif)
is below the desired significance (Alpha), the null hypothesis is rejected.
K-S statistics.
Performs one-sample Kolmogorov-Smirnov (KS) goodness of fit test.
The KS test is used to decide if a sample comes from a population with a specific distribution.
Test is based on the empirical distribution function (ECDF). An attractive feature of this test is that the
distribution of the K-S test statistic itself does not depend on the underlying cumulative distribution
function being tested. Another advantage is that it is an exact test (the chi-square goodness-of-fit test
depends on an adequate sample size for the approximations to be valid). Despite these advantages, the K-S test
has several important limitations:
- It only applies to continuous distributions.
- It tends to be more sensitive near the center of the distribution than at the tails.
- Perhaps the most serious limitation is that the distribution must be fully specified. That is,
if location, scale, and shape parameters are estimated from the data, the critical region of the K-S
test is no longer valid. It typically must be determined by simulation.
If CDFx and CDFy vectors are not defined, Data values are compared with standard normal distribution.
If defined, CDFx and CDFy vectors represent hypothesized distribution x and CDF(x) values. In this case all
Data values must lie within the [Min(CDFx),Max(CDFx)] interval. The KS test assumes CDFx and CDFy are predefined -
KS test is not very accurate if CDFx and CDFy values are calculated from Data values.
In this example sample is generated using Normal (mu=2,sigma=1) distribution.
Then a KS test is used to determine if sample comes from normal distribution.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector d = new Vector(300);
StatRandom.RandomNormal(2, 1, d, -1);
THypothesisResult hRes;
double Signif;
double KS = Statistics.GOFKolmogorov(d, out hRes, out Signif, null, null, THypothesisType.htTwoTailed, 0.05);
// Result should be a significance below 0.05, meaning the d values
// do not come from the standard normal distribution => H0 is therefore rejected!
}
}
*)
function GOFKolmogorov(const Data: TVec; out hRes: THypothesisResult; out Signif : double;
const CDFx: TVec = nil; const CDFy: TVec = nil;
hType: THypothesisType = htTwoTailed; Alpha: double = 0.05):double; overload;
(*Two sample Kolmogorov-Smirnov GOF test.
First dataset.
Second dataset.
Returns the result of the null hypothesis (default assumption is that data comes from specific distribution).
Defines the type of the null hypothesis (left, right and two - tailed).
(Significance level) returns the probability of observing the given result by chance
given that the null hypothesis is true.
Defines the desired significance level. If the significance probability (Signif)
is below the desired significance (Alpha), the null hypothesis is rejected.
K-S statistics.
Performs two-sample Kolmogorov-Smirnov goodness of fit test on independent random samples Data1 and Data2.
Test determines if Data1 and Data2 samples are drawn from the same continuous population.
In this example two differently sized samples are generated using different Weibull(a=2,b=3) and Normal(mu=2,sigma=1)
distributions. Then a KS test is used to determine if both samples come from the same distribution.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example1()
{
Vector d1 = new Vector(30);
Vector d2 = new Vector(22);
StatRandom.RandomWeibull(2, 3, d1, -1);
StatRandom.RandomNormal(2, 1, d2, -1);
THypothesisResult hRes;
double Signif;
double KS = Statistics.GOFKolmogorov(d1, d2, out hRes, out Signif, THypothesisType.htTwoTailed, 0.05);
// Result should be a significance below 0.05, meaning the d1 and d2 values
// do not come from the same distribution => H0 is therefore rejected!
}
}
*)
function GOFKolmogorov(const Data1, Data2: TVec; out hRes: THypothesisResult; out Signif : double;
hType: THypothesisType = htTwoTailed; Alpha: double = 0.05): double; overload;
(*The Chi-Squared goodness of fit test.
Observed frequencies.
Estimated (theoretical) frequencies.
Number of samples in original data vector.
Degrees of freedom for Chi2 statistics.
Returns the result of the null hypothesis (default assumption is that data comes from specific distribution).
Defines the type of the null hypothesis (left, right and two - tailed).
(Significance level) returns the probability of observing the given result by chance
given that the null hypothesis is true.
Defines the desired significance level. If the significance probability (Signif)
is below the desired significance (Alpha), the null hypothesis is rejected.
Performs the Chi-squared goodness of fit test to test if data is coming from specific distribution.
*)
procedure GOFChi2Test(const O,E: TVec; NumSamples, DegF: Integer; out hRes: THypothesisResult; out Signif : double;
hType: THypothesisType = htTwoTailed; Alpha: double = 0.05); overload;
(*The Chi-Squared goodness of fit test.
Samples to be tested.
Number of frequency histogram bins.
Distribution name (string). At the moment the following string values are supported : 'beta', 'exponential',
'gamma', 'normal', 'rayleigh' and 'weibull'.
Returns the result of the null hypothesis (default assumption is that data comes from specific distribution).
Defines the type of the null hypothesis (left, right and two - tailed).
(Significance level) returns the probability of observing the given result by chance
given that the null hypothesis is true.
Defines the desired significance level. If the significance probability (Signif)
is below the desired significance (Alpha), the null hypothesis is rejected.
Performs the Chi-squared goodness of fit test to test if data is coming from specific distribution.
This overload version does not require expected and actual frequencies, but only data vector.
*)
procedure GOFChi2Test(const Data: TVec; Distribution: TDistribution; NumBins: Integer; out hRes: THypothesisResult; out Signif : double;
hType: THypothesisType = htTwoTailed; Alpha: double = 0.05); overload;
(*The Bera-Jarque GOF test to a normal distribution.
Samples to be tested.
Returns the result of the null hypothesis (default assumption is that data composite normality is true).
(Significance level) returns the probability of observing the given result by chance
given that the null hypothesis is true.
Defines the desired significance level. If the significance probability (Signif)
is below the desired significance (Alpha), the null hypothesis is rejected.
Bera-Jarque Statistics.
Performs the Bera-Jarque test to a normal distribution.
*)
function GOFBeraJarqueTest(const Data:TVec; out hRes: THypothesisResult; out Signif : double; Alpha: double = 0.05): double;
(*The Lilliefors GOF test to a normal distribution.
KS test statistics value.
Vector, storing sample values to-be-tested.
Returns the result of the null hypothesis (default assumption is that data comes from normal distribution).
(Significance level) returns the probability of observing the given result by chance
given that the null hypothesis is true.
Defines the desired significance level. If the significance probability (Signif)
is below the desired significance (Alpha), the null hypothesis is rejected.
Defines the default tolerance for the Monte Carlo algorithm (0.0001).
Performs the Lilliefors goodness of fit test to a normal distribution with unknown parameters mu and sigma.
The test proceeds as follows:
- First estimate the population mean and population variance based on the data.
- Then find the maximum discrepancy between the empirical distribution function and the cumulative distribution function (CDF) of
the normal distribution with the estimated mean and estimated variance. Just as in the Kolmogorov-Smirnov test, this will
be the test statistic.
- Finally, we confront the question of whether the maximum discrepancy is large enough to be statistically significant, thus requiring
rejection of the null hypothesis. This is where this test becomes more complicated than the Kolmogorov-Smirnov test. Since the
hypothesized CDF has been moved closer to the data by estimation based on those data, the maximum discrepancy has been
made smaller than it would have been if the null hypothesis had singled out just one normal distribution. Thus we need the "null distribution" of the test statistic, i.e. its probability distribution assuming the null hypothesis is true. This is the Lilliefors distribution. To date, tables for
this distribution have been computed only by Monte Carlo methods.
Note:
The test is relatively weak and a large amount of data is typically required to reject the normality hypothesis. A more sensitive test is the Jarque-Bera test which is based on a combination of the estimates of skewness and kurtosis. The Jarque-Bera test is therefore highly attentive to
outliers, which the Lilliefors is not.
*)
function GOFLillieTest(const Data:TVec; out hRes: THypothesisResult; out Signif : double;
Alpha: double = 0.05; MCTol2: double = 1E-4): double;
(*The Shapiro-Wilks test for normality of data.
Data vector, containing ordered and unique deviates from an unknown distribution. Data Length must be between 3 and 5000, otherwise
an exception is raised.
Returns the result of the null hypothesis (default assumption is that data comes from normal distribution).
Defines the type of the null hypothesis (left, right and two - tailed).
(Significance level) returns the probability of observing the given result by chance
given that the null hypothesis is true.
Defines the desired significance level. If the significance probability (Signif)
is below the desired significance (Alpha), the null hypothesis is rejected.
the Shapiro-Wilks statistics.
Performs the Shapiro-Wilks test for normality of data.
Note
The basic assumption is that Data values, i.e. deviates from an unknown distribution, are ordered and unique.
In this example we'll use ShapiroWilks test to determine if data is coming from normal distribution.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector a = new Vector(0);
Vector b = new Vector(0);
a.Size(1000, false);
StatRandom.RandomWeibull(2.5, 1.0, a, -1);
// filter original data, retain only sorted unique values
Statistics.Unique(a, b, null);
THypothesisResult hres;
double sign;
Statistics.ShapiroWilks(b, out hres, out sign, THypothesisType.htTwoTailed, 0.05);
// Significance approx. 0
// hres = hrReject -> data is not coming from normal distribution
}
}
*)
function ShapiroWilks(const Data: TVec; out hRes: THypothesisResult; out Signif: double;
hType: THypothesisType = htTwoTailed; Alpha: double = 0.05): double;
(*The Shapiro-Francia test for normality of data.
Data vector, containing ordered and unique deviates from unknown distribution.
Returns the result of the null hypothesis (default assumption is that data comes from normal distribution).
Defines the type of the null hypothesis (left, right and two - tailed).
(Significance level) returns the probability of observing the given result by chance
given that the null hypothesis is true.
Defines the desired significance level. If the significance probability (Signif)
is below the desired significance (Alpha), the null hypothesis is rejected.
the Shapiro-Francia statistics.
Performs the Shapiro-Francia test for normality of data.
Note
The basic assumption is that Data values, i.e. deviates from an unknown distribution, are ordered and unique.
In this example we'll use Shapiro-Francia test to determine if data is coming from normal distribution.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector a = new Vector(0);
Vector b = new Vector(0);
a.Size(1000,false);
StatRandom.RandomWeibull(2.5,1.0,a,-1);
// filter original data, retain only sorted unique values
Statistics.Unique(a,b, null);
THypothesisResult hres;
double sign;
ShapiroFrancia(b,out hres, out sign,THypothesisType.htTwoTailed,0.05);
// Significance approx. 0
// hres = hrReject -> data is not coming from normal distribution
}
}
*)
function ShapiroFrancia(const Data: TVec; out hRes: THypothesisResult; out Signif: double;
hType: THypothesisType = htTwoTailed; Alpha: double = 0.05): double;
(*Anderson-Darling GOF test.
Stores ordered data.
Perform test for this distribution. Supported distributions : exponential, log-normal, normal and weibull.
Returns the result of the null hypothesis.
(Significance level) returns the probability of observing the given result by chance
given that the null hypothesis is true.
Defines the desired significance level. If the significance probability (Signif)
is below the desired significance (Alpha), the null hypothesis is rejected.
Anderson-Darling test statistics, adjusted with small sample size factor.
The Anderson-Darling test (Stephens, 1974) is used to test if a sample of data came from a population with
a specific distribution. It is a modification of the Kolmogorov-Smirnov (K-S) test and gives more weight to
the tails than does the K-S test. The K-S test is distribution free in the sense that the critical values
do not depend on the specific distribution being tested. The Anderson-Darling test makes use of the specific
distribution in calculating critical values. This has the advantage of allowing a more sensitive test and
the disadvantage that critical values must be calculated for each distribution.
The Anderson-Darling test is defined as:
- H0: The data follow a specified distribution.
- Ha: The data do not follow the specified distribution.
The test statistic is defined as:
A2 = -N - (1/N) * Sum(i=1..N) (2i-1) * ( ln(F(Y(i))) + ln(1 - F(Y(N+1-i))) ),
where F is the cumulative distribution function of the distribution being tested and Y(i) are the ordered data.
To learn more about A-D test, check the following links:
- http://www.itl.nist.gov/div898/handbook/eda/section3/eda35e.htm
- http://src.alionscience.com/pdf/A_DTest.pdf
Note
The basic assumption is that the data values are sorted in ascending order.
*)
function AndersonDarling(const Data: TVec; Distribution: TDistribution; out hRes: THypothesisResult;
out Signif : double; Alpha: double = 0.05): double;
(*Spearman rank correlation test.
X dataset.
Y dataset.
Returns Spearman rank correlation coefficient.
Returns the result of the null hypothesis (default assumption is there is no monotonic relation between the variables => Rs=0).
(Significance level) returns the probability of observing the given result by chance
given that the null hypothesis is true.
Returns the 100*(1-Alpha) percent confidence interval for the Rs coefficient.
Defines the type of the null hypothesis (one or two - tailed, default value two-tailed).
Defines the desired significance level. If the significance probability (Signif)
is below the desired significance (Alpha), the null hypothesis is rejected.
Performs the Spearman rank correlation test. Spearman rank correlation is a distribution-free
analog of correlation analysis mentioned. Like regression, it can be applied to compare
two independent random variables, each at several levels (which may be discrete or continuous).
Unlike regression, Spearman's rank correlation works on ranked (relative) data, rather than
directly on the data itself. Like the R2 value produced by regression, the Spearman's Rs
coefficient indicates agreement. A value of rs near one indicates good agreement; a value
near zero, poor agreement. Of course, as a distribution-free method, the Spearman rank
correlation does not make any assumptions about the distribution of the underlying data.
Spearman test is a distribution free test that determines whether there is a monotonic relation
between two variables (X , Y). A monotonic relation exists when any increase in one variable
is invariably associated with either an increase or a decrease in the other variable.
*)
procedure SpearmanRankCorr(const X, Y: TVec; out Rs: double; out hRes: THypothesisResult;
out Signif : double; var ConfInt: TTwoElmReal; hType: THypothesisType = htTwoTailed; Alpha: double = 0.05);
(*Empirical CDF.
Data, sorted in ascending order, with NANs and INFs removed.
Values at which CDF increases.
Calculated CDF.
Calculates empirical cumulative distribution function. Given N ordered data points Y1, Y2, ..., YN,
the empirical CDF (ECDF) is defined as:
E(n)=n(i)/N ,
where n(i) is the number of points less than Yi and the Yi are ordered from smallest to largest value.
This is a step function that increases by 1/N at the value of each ordered data point.
*)
procedure EmpiricalCDF(const Data: TVec; const xCDF,yCDF: TVec);
(*Grubb's test for outliers.
dataset.
Returns the result of the null hypothesis (default assumption is there are no outliers).
(Significance level) returns the probability of observing the given result by chance
given that the null hypothesis is true.
Returns the confidence interval to determine the outliers.
Defines the type of the null hypothesis (one or two - tailed, default value two-tailed).
Defines the desired significance level. If the significance probability (Signif)
is below the desired significance (Alpha), the null hypothesis is rejected.
Grubb's (G) statistic.
Performs the Grubbs test for outliers. Test is used to detect outliers in a univariate data set. It is based on the assumption
of normality. That is, you should first verify that your data can be reasonably approximated by a normal distribution before
applying the Grubbs' test.
Grubbs' test detects one outlier at a time. This outlier is expunged from the dataset and the test is iterated until no outliers
are detected. However, multiple iterations change the probabilities of detection, and the test should NOT be used for sample
sizes of six or less since it frequently tags most of the points as outliers. Grubbs' test is also known as the maximum normed
residual test.
Grubbs' test is defined for the hypothesis:
* H0: There are no outliers in the data set.
* Ha: There is at least one outlier in the data set.
More about the test can be found here.*)
function GrubbsTest(const Data: TVec; out hRes: THypothesisResult; out Signif: double; var ConfInt: TTwoElmReal;
hType: THypothesisType = htTwoTailed; Alpha: double = 0.05): double;
(*Performs a principal component analysis (PCA).
Perform a PCA by using the original data covariance matrix CovMat. Return the principal
components in PC matrix, eigenvalues of the covariance matrix (variances) in vector
EigenVec and (optional) the percentage of total variance in vector VarPct. The PC,
EigenVec and VarPct dimensions are adjusted automatically.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example
{
Matrix data = new Matrix(0, 0);
Matrix PC = new Matrix(0, 0);
Matrix covMat = new Matrix(0, 0);
Vector Z = new Vector(0);
Vector variances = new Vector(0);
data.SetIt(2,4, false, new double[] { 1,3,5,2,2,5,7,9});
Statistics.Covariance(data,covMat,false);
Statistics.PCA(covMat, PC, Z, variances); //requires cov matrix
// Z = [29 , 0, 0 ,0 ]
//variance = [100, 0, 0 ,0 ]
}
}
*)
procedure PCA(const CovMat: TMtx; const PC: TMtx; const EigenVec: TVec; const VarPct: TVec = nil);overload;
(*Perform a PCA on Data matrix, where Data columns are variables and rows are the observables.
The (optional) PCAMode parameter defines whether the analysis should be run on
correlation or covariance matrix. PCA procedure returns the principal components in matrix
PC, the Z-scores (data, transformed in the PC space) in ZScores, the eigenvalues of the
covariance matrix (variances) in the EigenVec vector and (optional) the percentage of
total variance in VarPct vector. The PC, ZScores, EigenVec and VarPct dimensions are
adjusted automatically.
In this example we derive the covariance matrix from original data
and get the same results as in first example.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Matrix data = new Matrix(0, 0);
Matrix PC = new Matrix(0, 0);
Matrix Z = new Matrix(0, 0);
Vector variances = new Vector(0);
Vector varPercent = new Vector(0);
data.SetIt(2, 4, false, new double[]
{1,3,5,2,
2,5,7,9});
Statistics.PCA(data, PC, Z, variances, varPercent, TPCAMode.PCACovMat); //works on raw data
// ... variances = [29,0,0,0]
// varPercent = [100,0,0,0]
}
}
*)
procedure PCA(const Data: TMtx; const PC, ZScores:TMtx; const EigenVec: TVec; const VarPct: TVec = nil; const PCAMode: TPCAMode = PCACorrMat);overload;
(*PC residuals.
Constructs the residuals, copied from NumPC principal components of Data, where Data columns are variables
and rows are the observables. An exception is raised if NumPC is more or equal than the number of Data
columns (number of variables). The Rows, Cols and Complex properties of the Residuals matrix are
adjusted automatically.
*)
procedure PCAResiduals(const Data: TMtx; const NumPC: Integer; const Residuals: TMtx);
(*Performs the Bartlett test for dimensionality of data.
Data to be analyzed.
Returns the number of dimensions needed to explain the non random
variation in data. If NumDim is less than zero, the test cannot be applied to this data.
Returns calculated significance probabilities for one,
two ... variables (dimensions).
Desired significance.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
int numdim;
Matrix data = new Matrix(0,0);
Vector signifVec = new Vector(0);
data.LoadFromFile("c:\\temp\\data.mtx"); // load test data
Statistics.BartlettTest(data, out numdim, signifVec, 0.05); // Alpha = 5%
// returns numdim and signifVec
}
}
*)
procedure BartlettTest(const Data:TMtx; out NumDim: Integer; const SignifVec: TVec; const Alpha: double = 0.05);
(*Creates the factors for full factorial design.
Performs Two-Level Full Factorial design. The results (factors) are stored in Result matrix. Parameter
n defines number of Result matrix columns. Optional parameter MulVec (default value nil) is used only
by the Wilcoxon Signed Rank test procedure. Size and Complex properties of Result matrix are adjusted
automatically.
Matrix m = new Matrix(0,0);
Statistics.FullFactDesign(2,m,null);
*)
procedure FullFactDesign(const n: Integer; const aResult: TMtx; const fp: TMtxFloatPrecision; const MulVec: TVec = nil);overload;
(*Performs Mixed-Level Full Factorial Design.
The UniqueLevels vector stores the number of unique settings for each column.
UniqueLevels values must be integers, greater than 1. The results (factors)
are stored in Result matrix. Size and Complex properties of Result matrix are adjusted automatically.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector tmpVec = new Vector(0);
Matrix m = new Matrix(0,0);
tmpVec.SetIt(false, new double[] {2,3,2});
Statistics.FullFactDesign(tmpVec,m);
}
}
*)
procedure FullFactDesign(const UniqueLevels: TVec; const aResult: TMtx);overload;
(*One-sample Hotelling T2 test.
Stores test values, each row representing different case and each column representing different response variable.
The assumption is data is approximately multivariate normal.
Stores estimated mean for each variable. An exception is raised if Means Length is not equal to Data columns. If
Means is nil, the assumption is means are equal.
(Significance level) returns the probability of observing the given result by chance
given that the null hypothesis is true.
Returns the result of the null hypothesis (default assumption is variable means are equal to Means vector).
Defines the desired significance level. If the significance probability (Signif)
is below the desired significance (Alpha), the null hypothesis is rejected.
Hotelling T2 Statistics for one-sample test.
Performs one-sample Hotelling T2 test. The one-sample T2 is used to test hypotheses about a set of means simultaneously.
The null hypothesis is that sample means are equal to Means vector values. The following assumptions are made when using T2:
* The population follows the multivariate normal distribution.
* The members of the sample are independent.
The one-sample T2 test may also be applied to the situation in which two samples are to be compared that had a natural pairing
between two observation vectors. In this case the differences between the first and second measurements are formed and then used
as data in unpaired Hotelling T2 test.
*)
function HotellingT2One(const X: TMtx; const Means: TVec; out Signif: double; out hRes: THypothesisResult; const Alpha: double = 0.05): double; overload;
(*Two-sample Hotelling T2 test.
First test data.
Second test data.
Stores estimated mean for each variable. An exception is raised if Means Length is not equal to X1 or X2 columns.
Returns the result of the null hypothesis (default assumption is both tests variable means are equal to Means vector).
Returns n1+n2-2 for equal covariances, v for unequal covariance matrices.
Defines the desired significance level. If the significance probability (Signif)
is below the desired significance (Alpha), the null hypothesis is rejected.
Assume both test have equal variances. If this is not the case, set EqualCovariances to false.
(Significance level) returns the probability of observing the given result by chance
given that the null hypothesis is true.
Hotelling T2 Statistics for two-sample test.
Performs two-sample Hotelling T2 test on samples X1 and X2. For this test the assumptions of equal variances and normally distributed residuals are used.
Use the routine to check if both tests have equal variances.
*)
function HotellingT2Two(const X1, X2: TMtx; out Signif: double; out hRes: THypothesisResult; out v: double;
const Means: TVec = nil; const EqualCovariances: boolean = true; const Alpha: double = 0.05): double; overload;
(*M-Box test for equal covariances.
First matrix. The number of columns for X1 and X2 must be equal, otherwise an exception is raised.
Second matrix. The number of columns for X1 and X2 must be equal, otherwise an exception is raised.
Numerator degrees of freedom.
Denominator degrees of freedom.
Defines the desired significance level.
Returns the result of the null hypothesis.
(Significance level) returns the probability of observing the given result
Performs M-Box test for equal covariances. In this case the null hypothesis is that X1 and X2 covariances are equal and the alternative hypothesis is
that X1 and X2 covariances are not equal.
Suppose we have two matrices, representing two tests with 5 samples x 3 variables. We want to test if
two test matrices have the same covariances. Performing M-Box test with default significance level 5% will give us an answer.
using Dew.Math;
using Dew.Stats.Units;
using Dew.Stats;
namespace Dew.Examples
{
private void Example()
{
int df1, df2;
Matrix X1 = new Matrix(0, 0);
Matrix X2 = new Matrix(0, 0);
X1.SetIt(5, 3, false, new double[] { 23, 45, 15, 40, 85, 18, 215, 307, 60, 110, 110, 50, 65, 105, 24 });
X2.SetIt(5, 3, false, new double[] { 277, 230, 63, 153, 80, 29, 306, 440, 105, 252, 350, 175, 143, 205, 42 });
THypothesisResult hres;
double sign;
double MB = Statistics.MBoxTest(X1, X2, out sign, out hres, out df1, out df2, 0.05);
// MB : 27,16221062
// Sign : 0,01619810
// Sign < Alpha meaning hres = hrReject i.e. covariance matrices are significantly different.
}
}
*)
function MBoxTest(const X1,X2: TMtx; out Signif: double; out hRes: THypothesisResult;
out df1, df2: Integer; const Alpha: double = 0.05): double;
(*The stress factor for multidimensional scaling.
Data distance matrix.
Estimated distance matrix.
The stress factor.
Calculates the GOF statistics for multidimensional scaling. The stress factor is defined as:
where hat(d(i,j)) is the predicted distance based on the MDS model. In his original paper on multi dimensional scaling, Kruskal (1964)
gave the following advice about stress values based on his experience:
Stress Goodness-of-fit
- 0.2 poor
- 0.1 fair
- 0.05 good
- 0.025 excellent
- 0.000 perfect
More recent articles caution against using a table like this since acceptable values of stress depend
on the quality of the distance matrix and the number of objects in that matrix.
D stores data distance matrix, DHat stores reduced dimension estimated distance matrix.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Matrix D = new Matrix(2, 2);
Matrix DHat = new Matrix(2, 2);
D.SetIt(2, 2, false, new double[4] { 0.3, 3, 2, 0.1 });
DHat.SetIt(2, 2, false, new double[4] { 0.1, 3.1, 2.2, 0.3 });
// Calculate stress value - measure for GOF
// Smaller stress value means better GOF.
double stress = Statistics.MDScaleStress(D, DHat);
}
}
*)
function MDScaleStress(const D, DHat: TMtx): double;
(*Classical multidimensional scaling.
Distance matrix.
Returns the coordinates of objects in reduced space.
Defines number of dimensions/variables to use in classical
MD scaling algorithm.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Matrix X = new Matrix(0,0);
Matrix Y = new Matrix(0,0);
Matrix D = new Matrix(0,0);
Matrix DHat = new Matrix(0,0);
Vector eigen = new Vector(0);
X.SetIt(5,2,false,new double[] {1,2,3,4,3,11,7,8,9,4});
// Use all dimensions i.e. 2
Statistics.PairwiseDistance(X,D,2,TPWDistMethod.pwdistEuclidian);
// Reduce to just one variable (1d subspace)
Statistics.MDScaleMetric(D,Y,eigen,1);
// Calculate estimated distance matrix
Statistics.PairwiseDistance(Y,DHat,1,TPWDistMethod.pwdistEuclidian);
// Calculate stress - measure of GOF
double stress = Statistics.MDScaleStress(D,DHat);
// if stress > 0.2, the GOF is poor.
}
}
*)
procedure MDScaleMetric(const D: TMtx; const Y: TMtx; const EigenValues: TVec; const NumDim: Integer);
(*Pairwise distance.
Calculates the pairwise distance for variables, stored in X.
Suppose we have collected 5 data points with 3 variables. The data matrix will look like:
X=[2,1,3,
2,3,1,
4,5,6,
6,6,6,
7,5,3]
Generated distance matrix has zero diagonal and is always symmetric.
To calculate Euclidean distance between each pair of points, we use the following code:
Statistics.PairWiseDistance(X,D,3); // 3 dimensions for 3 variables
*)
procedure PairwiseDistance(const X: TMtx; const D: TMtx; p: Integer; const Method: TPWDistMethod=pwdistEuclidian); overload;
(*The Cronbach Alpha coefficient.
the Cronbach Alpha coefficient, defined by the following relation:
where K defines number of items (Data columns) and sigma(i,j) estimated covariance between item i and j.
Note
Cronbach's Alpha measures how well a set of items (or variables) measures a single unidimensional latent construct.
When data have a multidimensional structure, Cronbach's Alpha will usually be low. Technically speaking, Cronbach's
Alpha is not a statistical test - it is a coefficient of reliability (or consistency).
*)
function CronbachAlpha(const Data: TMtx): double;
(*Remove matrix row.
Removes index-th row from Src matrix and stores the result in Dst matrix. Size and complex properties of Dst matrix are
adjusted automatically.
*)
procedure RemoveRow(const Src,Dst: TMtx; const Index: Integer);
(*Remove matrix column.
Removes index-th column from Src matrix and stores the result in Dst matrix. Size and complex properties of Dst matrix are
adjusted automatically.
*)
procedure RemoveColumn(const Src,Dst: TMtx; const Index: Integer);
(*Orthogonal rotation of matrix.
Defines original matrix (to be rotated).
Stores rotated matrix (X*R, where R is calculated rotation matrix).
Defines different types of Varimax rotation (see above).
If true, X matrix is normalized (by rows) prior to rotation. After the rotation the result is
then renormalized.
Convergence tolerance in iteration algorithm.
Maximum number of rotations. Together with Tolerance parameter it defines
convergence criteria.
Returns the rotation matrix, used for calculating Y=X*R.
True, if the number of rotations did not exceed the maximum number of iterations.
Performs orthogonal rotation of X matrix. Based on the Gamma parameter value, the following rotations can be performed:
- Gamma = 1 => Varimax rotation
- Gamma <> 1 => Orthomax rotation
- Gamma = X.Cols div 2 => Equimax rotation
- Gamma = 0 => Quartimax rotation
*)
function OrthogonalRotation(const X,Y: TMtx; const R: TMtx; const Gamma: double = 1.0; const Normalize: boolean = true; const Tolerance: double = SQRTEPS;
const MaxIter: Integer = 200): boolean;
(*Latin Hyper-Cube design.
Stores experimental design values (rows representing values, columns variables).
Defines the number of values for specific variable.
Defines the number of variables.
If true, n random (permuted) values are generated, one point per interval, distributed
over the intervals (0,1/n),(1/n,2/n), ..., ((n-1)/n,1). If false, produce the points in the
middle of the defined intervals, i.e. 0.5/n, 1.5/n, ... .
The criteria for design improvement.
Maximum number of iterations used for improving the design.
Floating point precision to be used for the design.
Generates n x p samples for Latin Hypercube experimental design.
using Dew.Math;
using Dew.Stats.Units;
using Dew.Stats;
namespace Dew.Examples
{
private void Example()
{
Matrix e = new Matrix(0,0);
Statistics.LatinHyperCubeDesign(e,50,3,mvDouble,true,TLHCImprove.lhcImproveMinDist, 4);
}
}
*)
procedure LatinHyperCubeDesign(const X: TMtx; const n,p: Integer; const fp: TMtxFloatPrecision; const smooth: boolean = true; const IterCriteria: TLHCImprove = lhcImproveMinDist; const MaxIter: Integer = 4);
function BetaLike(const Pars: TVec; Const Con: TVec; Const PConst: Array of TObject; var Sigma: TTwoElmReal): double; overload;
function BetaLike(const Pars: TVec; Const Con: TVec; Const OConst: Array of TObject): double; overload;
function MannWCDF1(size1, size2: Integer; x: double): double; overload;
const hhStatVersion = 'Stats Master v6.3.5';
StatAuthor = '(c) 1999-2025 by Dew Research';
(*Probabilities plots.
The probability plot (Chambers 1983) is a graphical technique for assessing whether or not a data set
follows a given distribution such as the normal or Weibull. The data are plotted against a theoretical distribution
in such a way that the points should form approximately a straight line. Departures from this straight line indicate
departures from the specified distribution.
The correlation coefficient associated with the linear fit to the data in the probability plot is a measure of the
goodness of the fit. Estimates of the location and scale parameters of the distribution are given by the intercept
and slope. Probability plots can be generated for several competing distributions to see which provides the best fit,
and the probability plot generating the highest correlation coefficient is the best choice since it generates the
straightest probability plot.
For distributions with shape parameters (not counting location and scale parameters), the shape parameters must
be known in order to generate the probability plot. For distributions with a single shape parameter, the probability
plot correlation coefficient (PPCC) plot provides an excellent method for estimating the shape parameter.
Check the following link to learn more
about probability plots.
*)
unit StatProbPlots;
{$I BdsppDefs.inc}
interface
uses MtxVec, Math387
,Classes
,SysUtils
;
(*Constructs the Normal Probability Chart.
Data to be drawn. The assumption is data values are sorted.
If true, algorithm assumes Data is already sorted in ascending order.
If Data is not sorted, you must set this parameter to false so that internal algorithm will
automatically do the sorting.
Returns normal probability plot horizontal values to be drawn - > estimated quantiles
from Data vector or in this case sorted Data values.
Returns normal probability plot vertical values to be drawn - >
Values are generated from theoretical standard normal distribution with
parameters mu=0, sigma^2=1.
Returns slope line start X point, XDrawVec 25th percentile.
These values are used by series.
Returns slope line start Y point, YDrawVec 25th percentile.
These values are used by series.
Returns slope line end X point, XDrawVec 75th percentile.
These values are used by series.
Returns slope line end Y point, YDrawVec 75th percentile.
These values are used by series.
Standard error
How to construct Normal distribution probability plot?
- If needed, Data values are sorted (DataSorted parameter must be set to false).
- Abscissa drawing values are formed by estimated data quantiles - ordered Data values. After
calculation they are copied to XDrawVec. and
properties of XDrawVec are adjusted automatically.
- Ordinate drawing values are formed by using theoretical probability p, i.e. the so-called
Z values. After calculation they are copied to YDrawVec. and
properties of YDrawVec are adjusted automatically.
- XDrawVec and YDrawVec 25th and 75th percentile points are used to construct a reference line.
Drawing points departures from this straight line indicate departures from normality.
The normal probability plot is used to answer the following questions:
- Are the data normally distributed?
- What is the nature of the departure from normality (data skewed, shorter than expected tails, longer than expected tails)?
The following code will create probability plot and then plot calculated values.
using Dew.Math;
using Dew.Math.Units;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example(ProbabilityPlot Series1)
{
double x1,x2;
double y1,y2;
Vector data = new Vector(100,false);
Vector xvec = new Vector(0);
Vector yvec = new Vector(0);
data.RandGauss(0.0, 1.0); // standard distribution
StatProbPlots.StatNormalPlot(data, xvec, yvec, out x1, out x2, out y1, out y2, null,false);
Series1.MinX = x1;
Series1.MaxX = x2;
Series1.MinY = y1;
Series1.MaxY = y2;
MtxVecTee.DrawValues(xvec,yvec,Series1,false);
}
}
*)
procedure StatNormalPlot(const Data: TVec; const XDrawVec, YDrawVec: TVec; out MinX, MaxX, MinY, MaxY: double;
const StdErrs: TVec = nil; const DataSorted: boolean = true);
(*Constructs the Quantile-Quantile probability plot.
X Data (first dataset).
Y Data (second dataset).
If true, algorithm assumes XData is already sorted in ascending order.
If XData is not sorted, you must set this parameter to false so that internal algorithm will
automatically do the sorting.
If true, algorithm assumes YData is already sorted in ascending order.
If YData is not sorted, you must set this parameter to false so that internal algorithm will
automatically do the sorting.
Returns q-q plot horizontal values to be drawn - > estimated quantiles
from XData vector or in this case sorted XData values.
Returns q-q plot vertical values to be drawn - > estimated quantiles
from YData vector or in this case sorted YData values.
Returns slope line start X point, XDrawVec 25th percentile.
These values are used by series.
Returns slope line start Y point, YDrawVec 25th percentile.
These values are used by series.
Returns slope line end X point, XDrawVec 75th percentile.
These values are used by series.
Returns slope line end Y point, YDrawVec 75th percentile.
These values are used by series.
Constructs the Quantile-Quantile Chart. Use to visualize/plot constructed values.
The QQ chart is a graphical technique for determining if two data sets come from populations with a common distribution.
Specifically, QQ chart is a plot of the quantiles of the first data set against the quantiles of the second data set.
By a quantile, we mean the fraction (or percent) of points below the given value. That is, the 0.3 (or 30%) quantile is the point
at which 30 percent of the data fall below and 70 percent fall above that value. A 45-degree reference line is also plotted. If the two
sets come from a population with the same distribution, the points should fall approximately along reference line. The greater
the departure from this reference line, the greater the evidence for the conclusion that the two data sets have come from populations
with different distributions.
The advantages of the q-q plot are:
- The sample sizes do not need to be equal.
- Many distributional aspects can be simultaneously tested. For example, shifts in location, shifts in scale, changes in symmetry,
and the presence of outliers can all be detected from this plot. For example, if the two data sets come from populations whose distributions
differ only by a shift in location, the points should lie along a straight line that is displaced either up or down from the 45-degree
reference line.
The QQ Chart is similar to a probability plot. For a probability plot, the quantiles for one of the data samples are replaced with the
quantiles of a theoretical distribution. The QQ chart is used to answer the following questions:
- Do two data sets come from populations with a common distribution?
- Do two data sets have common location and scale?
- Do two data sets have similar distributional shapes?
- Do two data sets have similar tail behavior?
When there are two data samples, it is often desirable to know if the assumption of a common distribution is justified. If so, then
location and scale estimators can pool both data sets to obtain estimates of the common location and scale. If two samples do differ,
it is also useful to gain some understanding of the differences. The QQ Chart can provide more insight into the nature of the difference
than analytical methods such as the two-sample tests.
If the data sets have the same size, the q-q plot is essentially a plot of sorted X against sorted Y. If the data sets are not of equal
size, the quantiles are usually picked to correspond to the sorted values from the smaller data set and then the quantiles for the
larger data set are interpolated.
How to construct two datasets Q-Q plot?
- If needed, XData values or YData are sorted (DataSorted parameter set to false).
- Abscissa drawing values are formed by estimated XData quantiles - ordered XData values. After
calculation they are copied to XDrawVec. and
properties of XDrawVec are adjusted automatically.
- Ordinate drawing values are formed by estimated YData quantiles - ordered YData values. After
calculation they are copied to YDrawVec. and
properties of YDrawVec are adjusted automatically.
- XDrawVec and YDrawVec 25th and 75th percentile points are used to construct a reference line.
Drawing points departures from this straight line indicate XData and YData do not come from
the same distribution.
The following code will create probability plot and then plot calculated values.
using Dew.Math;
using Dew.Math.Units;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example(ProbabilityPlot Series1)
{
double x1,x2;
double y1,y2;
Vector xdata = new Vector(100,false);
Vector ydata = new Vector(100,false);
Vector xvec = new Vector(0);
Vector yvec = new Vector(0);
xdata.RandGauss(0.0,1.0); // standard distribution
ydata.RandGauss(1.0,2.3); // standard distribution
StatProbPlots.StatQQPlot(xdata, ydata, xvec, yvec, out x1, out x2, out y1, out y2, false,false);
Series1.MinX = x1;
Series1.MaxX = x2;
Series1.MinY = y1;
Series1.MaxY = y2;
MtxVecTee.DrawValues(xvec,yvec,Series1,false);
}
}
*)
procedure StatQQPlot(const XData, YData: TVec; const XDrawVec, YDrawVec: TVec; out MinX, MaxX, MinY, MaxY: double;
const XDataSorted: boolean = true; const YDataSorted: boolean = true); overload;
(*Constructs an Q-Q plot of the quantiles of the data set Data againsts the quantiles of a
standard Normal distribution.
How to construct Q-Q normal distribution probability plot?
- If needed, Data values are sorted (DataSorted parameter set to false).
- Abscissa drawing values are formed by estimated data quantiles - ordered Data values. After
calculation they are copied to XDrawVec. and
properties of XDrawVec are adjusted automatically.
- Ordinate drawing values are formed by using theoretical probability p, i.e. the so-called
Z values. After calculation they are copied to YDrawVec. and
properties of YDrawVec are adjusted automatically.
- XDrawVec and YDrawVec 25th and 75th percentile points are used to construct a reference line.
Drawing points departures from this straight line indicate departures from normality.
Note:
Use this overload if you want to check if specific data comes from standard Normal distribution
(mean = 0, sigma^2 = 1.0)
*)
procedure StatQQPlot(const Data: TVec; const XDrawVec, YDrawVec: TVec; out MinX, MaxX, MinY, MaxY: double; const DataSorted: boolean = true); overload;
(*Constructs the Weibull Probability Chart.
Data to be drawn.
If true, algorithm assumes Data is already sorted in ascending order.
If Data is not sorted, you must set this parameter to false so that internal algorithm will
automatically do the sorting.
Returns vector of X values to be drawn - >
Data estimated quantiles or in this case ordered data values.
Returns vector of Y values to be drawn - > theoretical Weibull probability values
or in this case ln(ln(1/(1-p))), where p are predefined theoretical probabilities.
Returns slope line start X point, XDrawVec 25th percentile.
These values are used by series.
Returns slope line start Y point, YDrawVec 25th percentile.
These values are used by series.
Returns slope line end X point, XDrawVec 75th percentile.
These values are used by series.
Returns slope line end Y point, YDrawVec 75th percentile.
These values are used by series.
Constructs the Weibull Probability Chart. Use
to visualize/plot constructed values. The Weibull plot is a graphical technique for determining if a data set comes from a population
that would logically be fitted by a 2-parameter Weibull distribution (the location is assumed to be zero).
The Weibull plot has special scales that are designed so that if the data do in fact follow a Weibull distribution,
the points will be linear (or nearly linear). The least squares fit of this line yields estimates for the shape and scale
parameters of the Weibull distribution (the location is assumed to be zero).
How to construct Weibull distribution probability plot?
- If needed, Data values are sorted (DataSorted parameter set to false).
- Abscissa drawing values are formed by estimated data quantiles - ordered Data values. After
calculation they are copied to XDrawVec. and
properties of XDrawVec are adjusted automatically.
- Ordinate drawing values are formed by using theoretical probability p
to p - > ln[ln[1/(1-p)]]. After calculation they are copied to YDrawVec. and
properties of YDrawVec are adjusted automatically.
- XDrawVec and YDrawVec 25th and 75th percentile points are used to construct a reference line.
Drawing points departures from this straight line indicate departures from Weibull distribution.
The Weibull plot can be used to answer the following questions:
- Do the data follow a 2-parameter Weibull distribution?
- What is the best estimate of the shape parameter for the 2-parameter Weibull distribution?
- What is the best estimate of the scale (= variation) parameter for the 2-parameter Weibull distribution?
Example: The following code will create probability plot and then plot calculated values.
using Dew.Math;
using Dew.Math.Units;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example(ProbabilityPlot Series1)
{
double x1,x2;
double y1,y2;
Vector data = new Vector(100,false);
Vector xvec = new Vector(0);
Vector yvec = new Vector(0);
StatRandom.RandomWeibull(3.0, 1.2,data,-1); // some random values
StatProbPlots.StatWeibullPlot(data, xvec, yvec, out x1, out x2, out y1, out y2, false);
Series1.MinX = x1;
Series1.MaxX = x2;
Series1.MinY = y1;
Series1.MaxY = y2;
MtxVecTee.DrawValues(xvec,yvec,Series1,false);
}
}
*)
procedure StatWeibullPlot(const Data: TVec; const XDrawVec, YDrawVec: TVec; out MinX, MaxX, MinY, MaxY: double; const DataSorted: boolean = true);
(*Exponential distribution probability plot.
Data to be drawn.
If true, algorithm assumes Data is already sorted in ascending order.
If Data is not sorted, you must set this parameter to false so that internal algorithm will
automatically do the sorting.
Returns vector of X values to be drawn - >
Data estimated quantiles or in this case ordered data values.
Returns vector of Y values to be drawn - > theoretical Exponential probability values
or in this case -ln(1-p), where p are predefined theoretical probabilities.
Returns slope line start X point, XDrawVec 25th percentile.
These values are used by series.
Returns slope line start Y point, YDrawVec 25th percentile.
These values are used by series.
Returns slope line end X point, XDrawVec 75th percentile.
These values are used by series.
Returns slope line end Y point, YDrawVec 75th percentile.
These values are used by series.
How to construct Exponential distribution probability plot?
- If needed, Data values are sorted (DataSorted parameter set to false).
- Abscissa drawing values are formed by estimated data quantiles - ordered Data values. After
calculation they are copied to XDrawVec. and
properties of XDrawVec are adjusted automatically.
- Ordinate drawing values are formed by using theoretical probability p
0 to p -> -ln(1-p). After calculation they are copied to YDrawVec. and
properties of YDrawVec are adjusted automatically.
- XDrawVec and YDrawVec 25th and 75th percentile points are used to construct a reference line.
Drawing points departures from this straight line indicate departures from Exponential distribution.
*)
procedure StatExpPlot(const Data: TVec; const XDrawVec, YDrawVec: TVec; out MinX, MaxX, MinY, MaxY: double; const DataSorted: boolean = true);
(*Random number generators.
Introduces several pseudo random number generators.
*)
unit StatRandom;
{$I bdsppdefs.inc}
interface
uses MtxVec, Math387, AbstractMtxVec, AbstractMtxVecInt
,RndGenerators
,SysUtils
;
(*Beta distribution random generator.
This routine generates Result.Length pseudo random numbers, using the beta
distribution with parameters a and b. The random numbers are stored in the Results vector.
An exception is raised if Result vector is complex.
Generate 2000 random numbers by using the beta
distribution with parameters a=2.5 and b=3.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector r = new Vector(2000,false);
StatRandom.RandomBeta(2.5, 3,r,-1);
}
}
*)
procedure RandomBeta(const A, B: double; const Results: TDenseMtxVec; Seed: Integer = -1);
(*Binomial distribution random generator.
This routine generates Result.Length pseudo random numbers, using the binomial distribution
with parameter p. The random numbers are stored in the Results vector. An exception is raised
if Result vector is complex.
Generate 2000 random numbers by using the binomial
distribution with parameters N=4 and p=0.2.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
VectorInt r = new VectorInt(2000);
StatRandom.RandomBinom(4, 0.2, r,-1);
}
}
*)
procedure RandomBinom(const AN: Integer; const P: double; const Results: TMtxVecInt; Seed: Integer = -1);
(*ChiSquared distribution random generator.
This routine generates Result.Length pseudo random numbers, using the chi-squared
distribution with parameter ANu. The random numbers are stored in the Results vector. An
exception is raised if Result vector is complex.
Generate 2000 random numbers by using the chi-squared
distribution with parameter Nu=7.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector r = new Vector(2000,false);
StatRandom.RandomChiSquare(7, r,-1);
}
}
*)
procedure RandomChiSquare(const ANu: Integer; const Results: TDenseMtxVec; const Seed: Integer = -1);
(*Cauchy distribution random generator.
This routine generates Result.Length pseudo random numbers, using the Cauchy
distribution with parameters b and m. The random numbers are stored in the Results vector. An
exception is raised if Result vector is complex.
Generate 2000 random numbers by using the Cauchy distribution with
parameters m=1.2, b=0.5.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector r = new Vector(2000,false);
StatRandom.RandomCauchy(1.2, 0.5, r, -1);
}
}
*)
procedure RandomCauchy(const m,b : double; const Results: TDenseMtxVec; Seed: Integer = -1);
(*Erlang distribution random generator.
Generates Result.Length pseudo random numbers, using the Erlang distribution
with parameters k and lambda. The random numbers are stored in the Results vector. An exception is raised
if Result vector is complex.
Generate 2000 random numbers by using the Erlang distribution with
parameters k = 2, lambda=1.23.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector r = new Vector(2000,false);
StatRandom.RandomErlang(2,1.23, r,-1);
}
}
*)
procedure RandomErlang(const k: Integer; const lambda: double; const Results: TDenseMtxVec; Seed: Integer = -1);
(*Exponent distribution random generator.
This routine generates Result.Length pseudo random numbers, using the exponential distribution
with parameter mu. The random numbers are stored in the Results vector. An exception is raised
if Result vector is complex.
Generate 2000 random numbers by using the exponential
distribution with parameter mu=1.23.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector r = new Vector(2000,false);
StatRandom.RandomExponent(1.23, r,-1);
}
}
*)
procedure RandomExponent(const mu: double; const Results: TDenseMtxVec; Seed: Integer = -1);
(*F distribution random generator.
This routine generates Result.Length pseudo random numbers, using the F
distribution with parameters ANu1 and ANu2. The random numbers are stored
in the Results vector. An exception is raised if Result vector is complex.
Generate 2000 random numbers by using the F distribution with parameters Nu1=2 and Nu2=5.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector r = new Vector(2000,false);
StatRandom.RandomF(2, 5, r, -1);
}
}
*)
procedure RandomF(const ANu1, ANu2: Integer; const Results: TDenseMtxVec; const Seed: Integer = -1);
(*Gamma distribution random generator.
This routine generates Result.Length pseudo random numbers, using the gamma distribution
with parameters a and b. The random numbers are stored in the Results vector. An
exception is raised if Result vector is complex.
Generate 2000 random numbers by using the gamma
distribution with parameters a=4 and b=0.25.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector r = new Vector(2000,false);
StatRandom.RandomGamma(4, 0.25, r, -1);
}
}
*)
procedure RandomGamma(const A, B: double; const Results: TDenseMtxVec; Seed: Integer = -1);
(*Generalized Pareto distribution random generator.
This routine generates Result.Length pseudo random numbers, using the Generalized Pareto
distribution with parameters k, mu and sigma. The random numbers are stored in the Results vector.
An exception is raised if Result vector is complex.
Generate 2000 random numbers by using the Generalized Pareto distribution with
parameters k=1.2, mu=0 and sigma=0.5.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector r = new Vector(2000,false);
StatRandom.RandomGenPareto(1.2, 0, 0.5, r, -1);
}
}
*)
procedure RandomGenPareto(const k, mu, sigma: double; const Results: TDenseMtxVec; Seed: Integer = -1);
(*Geometric distribution random generator.
This routine generates Result.Length pseudo random numbers, using the geometric
distribution with parameter P. The random numbers are stored in the Results vector.
An exception is raised if Result vector is complex.
Generate 2000 random numbers by using the geometric distribution with parameters P=0.6.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector r = new Vector(2000,false);
StatRandom.RandomGeometric(0.6, r, -1);
}
}
*)
procedure RandomGeometric(const P: double; const Results: TDenseMtxVec; Seed: Integer = -1);
(*Gumbel distribution random generator.
This routine generates Result.Length pseudo random numbers, using the Gumbel
distribution with parameters mu and beta. The random numbers are stored in the Results vector.
An exception is raised if Result vector is complex or beta is less or equal to zero.
Generate 2000 random numbers by using the Gumbel
distribution with parameters mu=1, beta = 0.55.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector r = new Vector(2000,false);
StatRandom.RandomGumbel(1.0, 0.55, r, -1);
}
}
*)
procedure RandomGumbel(const mu, beta: double; const Results: TDenseMtxVec; Seed: Integer = -1);
(*Hypergeometric distribution random generator.
This routine generates Result.Length pseudo random numbers, using the hypergeometric distribution
with parameters M (lot size), K (size of sampling) and N (marked elements in the lot).
The random numbers are stored in the Results vector. An exception
is raised if Result vector is complex.
Generate 2000 random numbers by using the hypergeometric
distribution with parameters M=100, K=50 and N=30.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
VectorInt r = new VectorInt(2000);
StatRandom.RandomHypGeometric(100,50,30,r, -1);
}
}
*)
procedure RandomHypGeometric(const M, K, N: Integer; const Results: TMtxVecInt; Seed: Integer = -1);
(*Laplace distribution random generator.
This routine generates Result.Length pseudo random numbers, using the Laplace
distribution with parameters m and b. The random numbers are stored in the Results vector. An
exception is raised if Result vector is complex.
Generate 2000 random numbers by using the Laplace
distribution with parameters m=4.2, b=2.4.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector r = new Vector(2000,false);
StatRandom.RandomLaplace(4.2, 2.4,r, -1);
}
}
*)
procedure RandomLaplace(const m,b : double; const Results: TDenseMtxVec; Seed: Integer = -1);
(*Logistic distribution random generator.
This routine generates Result.Length pseudo random numbers, using the logistic
distribution with parameters m and b. The random numbers are stored in the
Results vector. An exception is raised if Result vector is complex.
Generate 2000 random numbers by using the logistic
distribution with parameters m=3.0 and b=2.4.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector r = new Vector(2000,false);
StatRandom.RandomLogistic(3.0, 2.4, r, -1);
}
}
*)
procedure RandomLogistic(const m, b: double; const Results: TDenseMtxVec; Seed: Integer = -1);
(*Lognormal distribution random generator.
This routine generates Result.Length pseudo random numbers, using the lognormal
distribution with parameters mu and sigma. The random numbers are stored in the
Results vector. An exception is raised if Result vector is complex.
Generate 2000 random numbers by using the lognormal
distribution with parameters Mu=3.0 and Sigma=2.4.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector r = new Vector(2000,false);
StatRandom.RandomLogNormal(3.0, 2.4, r, -1);
}
}
*)
procedure RandomLogNormal(const mu, sigma: double; const Results: TDenseMtxVec; Seed: Integer = -1);
(*Maxwell distribution random generator.
This routine generates Result.Length pseudo random numbers, using the Maxwell
distribution with parameter a. The random numbers are stored in the
Results vector. An exception is raised if Result vector is complex.
Generate 2000 random numbers by using the Maxwell distribution with parameter a=3.0.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector r = new Vector(2000,false);
StatRandom.RandomMaxwell(3.0, r, -1);
}
}
*)
procedure RandomMaxwell(const a: double; const Results: TDenseMtxVec; Seed: Integer = -1);
(*Negative binomial distribution random generator.
This routine generates Result.Length pseudo random numbers, using the negative binomial
distribution with parameters R and p. The random numbers are stored in the
Results vector. An exception is raised if Result vector is complex.
Generate 2000 random numbers by using the negative binomial
distribution with parameters R=10 and p=0.2.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
VectorInt r = new VectorInt(2000);
StatRandom.RandomNegBinom(10,0.2,r, -1);
}
}
*)
procedure RandomNegBinom(const R,p: double; const Results: TMtxVecInt; Seed: Integer = -1);
(*Normal distribution random generator.
This routine generates Result.Length pseudo random numbers, using the normal
distribution with parameters mu and sigma. The random numbers are stored in the
Results vector. An exception is raised if Result vector is complex.
Generate 2000 random numbers by using the normal
distribution with parameters mu=0.0 and sigma=1.0.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector r = new Vector(2000,false);
StatRandom.RandomNormal(0.0, 1.0 ,r, -1);
}
}
*)
procedure RandomNormal(const mu, sigma: double; const Results: TDenseMtxVec; Seed: Integer = -1);
(*Poisson distribution random generator.
This routine generates Result.Length pseudo random numbers, using the Poisson
distribution with parameter lambda. The random numbers are stored in the
Results vector. An exception is raised if Result vector is complex.
Generate 2000 random numbers by using the Poisson
distribution with parameter lambda=2.31.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
VectorInt r = new VectorInt(2000);
StatRandom.RandomPoisson(2.31,r,-1);
}
}
*)
procedure RandomPoisson(const lambda: double; const Results: TMtxVecInt; Seed: Integer = -1);
(*Rayleigh distribution random generator.
This routine generates Result.Length pseudo random numbers, using the Rayleigh
distribution with parameter b. The random numbers are stored in the Results
vector. An exception is raised if Result vector is complex.
Generate 2000 random numbers by using the Rayleigh distribution
with parameter b=0.6.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector r = new Vector(2000,false);
StatRandom.RandomRayleigh(0.6,r,-1);
}
}
*)
procedure RandomRayleigh(const B: double; const Results: TDenseMtxVec; Seed: Integer = -1);
(*Student(T) distribution random generator.
This routine generates Result.Length pseudo random numbers, using the Student (t)
distribution with parameter V. The random numbers are stored in the Results
vector. An exception is raised if Result vector is complex.
Generate 2000 random numbers by using the Student (t)
distribution with parameter V = 5.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector r = new Vector(2000,false);
StatRandom.RandomStudent(5,r,-1);
}
}
*)
procedure RandomStudent(const V: Integer; const Results: TDenseMtxVec; Seed: Integer = -1);
(*Triangular distribution random generator.
This routine generates Result.Length pseudo random numbers, using the triangular
distribution with parameters a,b and c. The random numbers are stored in
the Results vector. An exception is raised if Result vector is complex.
Generate 2000 random numbers by using the triangular distribution with parameters
a=2.2, b=8 and c=5.5.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector r = new Vector(2000,false);
StatRandom.RandomTriangular(2.2, 8.0, 5.5, r, -1);
}
}
*)
procedure RandomTriangular(const a, b,c : double; const Results: TDenseMtxVec; Seed: Integer = -1);
(*Uniform distribution random generator.
This routine generates Result.Length pseudo random numbers, using the continuous
uniform distribution with parameters a and b. The random numbers are stored in
the Results vector. An exception is raised if Result vector is complex.
Generate 2000 random numbers by using the continuous
uniform distribution with parameters a=-1 and b=1.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector r = new Vector(2000,false);
StatRandom.RandomUniform(-1,1,r,-1);
}
}
*)
procedure RandomUniform(const a, b: double; const Results: TDenseMtxVec; Seed: Integer = -1);
(*Discrete uniform distribution random generator.
This routine generates Result.Length pseudo random numbers, using the discrete
uniform distribution with parameter N. The random numbers are stored in the
Results vector. An exception is raised if Result vector is complex.
Generate 2000 random numbers by using the discrete
uniform distribution with parameter N=30.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector r = new Vector(2000,false);
StatRandom.RandomUniformD(30, r,-1);
}
}
*)
procedure RandomUniformD(const N: Integer; const Results: TDenseMtxVec; Seed: Integer = -1);
(*Weibull distribution random generator.
This routine generates Result.Length pseudo random numbers, using the Weibull distribution
with parameters a and b. The random numbers are stored in the Results vector. An
exception is raised if Result vector is complex.
Generate 2000 random numbers by using the Weibull
distribution with parameters a=3 and b=1.2.
using Dew.Math;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector r = new Vector(2000,false);
StatRandom.RandomWeibull(3, 1.2, r,-1);
}
}
*)
procedure RandomWeibull(const a, b: double; const Results: TDenseMtxVec; Seed: Integer = -1);
(*Time series analysis.
Introduces several routines for handling/analyzing univariante time series. Includes ARMA, ARIMA and
exponential smoothing routines.
As stated at NIST pages, time series is an ordered sequence of values of a variable at
equally spaced time intervals. The usage of time series models is twofold:
- Obtain an understanding of the underlying forces and structure that produced the observed data.
- Fit a model and proceed to forecasting, monitoring or even feedback and feedforward control.
The fitting of time series models can be an ambitious undertaking. This unit utilizes the following:
- average smoothing,
- Holt-Winters single, double and triple exponential smoothing,
- ARMA and ARIMA models,
- ARAR model.
Literature used
- Brockwell, P.J. and Davis, R.A. : Introduction to Time Series and Forecasting - second edition, Springer Verlag, New York, 2002.
- Brockwell, P.J. and Davis, R.A. : Time Series: Theory and Methods - second edition, Springer Verlag, New York, 1991.
- Shumway, R.H. and Stoffer, D.S. : Time Series Analysis and Its Applications, Springer Verlag, New York, 2000.
- www.stat.unc.edu/faculty/hurd/progs
- www.itl.nist.gov/div898/handbook/pmc/section4/pmc43.htm.
- www.it.iitb.ac.in/~praj/acads/seminar/04329008_ExponentialSmoothing.pdf.
*)
unit StatTimeSerAnalysis;
{$I BdsppDefs.inc}
interface
uses Probabilities, MtxVec, AbstractMtxVec, Math387, Statistics, Optimization
,Types
;
(*ARMA/ARIMA coefficients initial estimate method.*)
type TcfInitMethod=(
(*User supplied values for phi and theta.*)cfInitFixed,
(*Yule-Walker estimates for pure AR(p) process.*)cfInitYW,
(*Burg estimates for pure AR(p) process.*)cfInitBurg,
(*Innovations estimates for ARMA(p,q) process.*)cfInitInno,
(*Hannah-Rissanen estimates for ARMA(p,q) process.*)cfInitHannah
);
(*Autocovariance function.
Sample data set.
Stores sample autocovariance (ACVF) function. Size of Dst is adjusted automatically.
Number of lags to calculate. If Lags is -1 the number of lags is computed as
Lags := Ceil(10*Log10(Src.Length))
*)
procedure AutoCov(const Src, Dst: TVec; Lags: integer = -1);
(*Autocorrelation/autocovariance function.
Sample data set.
Stores sample autocorrelation (ACF) function. Size of Dst is adjusted automatically.
Number of lags to calculate. If Lags is -1 the number of lags is computed as
Lags := Ceil(10*Log10(Src.Length))
Calculates sample autocorrelation function at lags Lags.
*)
procedure ACF(const Src,Dst: TVec; Lags: integer = -1);
(*Partial autocorrelation function.
Autocorrelation function.
autoregressive model parameters.
partial autocorrelation (-reflection coefficients)
Error variance.
Calculates sample partial autocorrelation function (PACF).
*)
procedure PACF(const Acf, Arp, ParCorr, Variance: TVec); overload;
(*Partial autocorelation function.
Stores the sample autocorrelation function (ACF).
Returns the sample partial autocorrelation function (PACF) on output.
*)
procedure PACF(const Src, Dst: TVec); overload;
(*Setup initial values for integrating ARMA series.
Setup initial values for integrating ARMA series.
*)
procedure TimeSeriesIntInit(const Data: TVec; const Init: TVec; const d: Integer; const Future: boolean = true);
(*Estimates autocorrelation/autocovariance function for the ARMA model.
Stores Phi values for ARMA process, without the initial 1.
Stores Theta values for ARMA process, without the initial 1.
Number of lags to calculate.
If true, ACF values are normalized by ACF[0]. If false, no normalization is performed and the ResultACF
stores ACVF (gamma[0], gamma[1], ...gamma[n]) values.
Returns autocovariance (gamma[0], gamma[1], ...) or
autocorrelation (rho[0], rho[1], ...) function for the ARMA model.
Estimates autocorrelation/autocovariance function for the ARMA model.
*)
procedure ARMAAcf(const Phi,Theta: TVec; const n: integer; const ResultACF: TVec; const Normalize: boolean = True);
(*ARMA process covariances.
Time series ACVF.
The ACVF of a MA part of the model.
Stores Phi values for ARMA process.
Stores Theta values for ARMA process.
Calculates ARMA (p,q) process covariances. For ARMA process, covariances are defined as:
where gamma is time series autocovariance function, sigma^2 is estimated white noise, m=max(p,q) and phi, theta are
AR and MA coefficients.
*)
function ARMAKappa(const gamma, maacvf: TVec; const i,j: Integer; const Phi, Theta: TVec): double; overload;
(*Calculate necessary covariances for ARMA(p,q) process up to kappa(KappaSize,KappaSize)*)
procedure ARMAKappa(const Data: TVec; const Phi, Theta: TVec; const Cov: TMtx; const KappaSize: Integer); overload;
(*Simulate the ARMA (p,q) process.
stores the AR coefficients. Length of the p vector defines AR(p) order.
stores the MA coefficients. Length of the t vector defines MA(q) order.
defines number of points to simulate.
returns ARMA (p,q) time series. Size of Result vector is adjusted automatically.
Simulate ARMA(1,1) process with Phi=[1.0], Theta=[-0.25].
Uses MtxExpr, StatTimeSerAnalysis;
procedure Example;
var phi,theta,ts: Vector;
begin
phi.SetIt(false,[1.0]);
theta.SetIt(false,[-0.25]);
ARMASimulate(phi,theta,100,ts);
// ts now stores 100 points from ARMA(1,1) process.
end;
#include "MtxExpr.hpp"
#include "Math387.hpp"
#include "StatTimeSerAnalysis.hpp"
void __fastcall Example();
{
sVector phi,theta,ts;
phi.SetIt(false,OPENARRAY(double,(1.0)));
theta.SetIt(false,OPENARRAY(double,(-0.25)));
ARMASimulate(phi,theta,100,ts);
// ts now stores 100 points from ARMA(1,1) process.
C# Example
Simulate ARMA(1,1) process with Phi=[1.0], Theta=[-0.25].
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector phi = new Vector(0);
Vector theta = new Vector(0);
Vector ts = new Vector(0);
phi.SetIt(false, new double[] {1.0});
theta.SetIt(false,new double[] {-0.25});
StatTimeSerAnalysis.ARMASimulate(phi,theta,100,ts);
// ts now stores 100 points from ARMA(1,1) process.
}
}
*)
procedure ARMASimulate(const p,t: TVec; const n: integer; const aResult: TVec); overload;
(*Simulate the ARIMA process.
stores the AR coefficients. Length of the p vector defines AR(p) order.
stores the MA coefficients. Length of the t vector defines MA(q) order.
defines how many times time series is differentiated (d parameter in ARIMA).
defines initial values for integration: r[-d+1],Dr[-d+2],...,D^(d-1)r[0]. The length of ResInit must be
equal to d, otherwise an exception will be raised.
defines number of points to simulate.
returns ARIMA (p,d,q) time series. Size of Result vector is adjusted automatically.
Simulate the ARIMA (p,d,q) process.
Simulate ARIMA(1,2,1) process with Phi=[1.0], Theta=[-0.25], d=2.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector phi = new Vector(0);
Vector theta = new Vector(0);
Vector init = new Vector(0);
Vector ts = new Vector(0);
phi.SetIt(false, new double[] {1.0});
theta.SetIt(false,new double[] {-0.25});
init.SetIt(false,new double[] {0.0, 0.0});
StatTimeSerAnalysis.ARIMASimulate(phi,theta,2,init,100,ts);
// ts now stores 100 points from ARIMA(1,2,1) process.
}
}
*)
procedure ARIMASimulate(const p,t: TVec; const d: Integer; const ResInit: TVec; const n: integer; const aResult: TVec);
(*ARMA model one-step ahead predictors.
Calculate the ARMA (p,q) model one-step ahead predictors.
*)
procedure ARMAPredictors(const Data: TVec; const Phi,Theta: TVec; const Predictors: TVec; const r: TVec);
(*Forecast time series by using ARMA(p,q) model.
The original time series data set.
ARIMA Phi (AR) coefficients. Assumes P (AR model) to be without the leading 1.0.
ARIMA Theta (MA) coefficients.
Residuals as returned by ARMAMLE.
Number of samples to forecast.
This modified average value is included in to the optimization process of ARMAMLE, which returns optimal value for it.
Results of the forecasting (beyond the last index value of Data starting at Data.Length).
Returns standard deviation of residuals.
*)
procedure ARMAForecast(const Data, P, T, Residuals: TVec; const n: integer; const mu: double; const Forecast, fStdDev: TVec); overload;
(*Estimate ARMA process AR and MA coefficients.
Time series data set.
ARIMA Before call stores initial estimates for ARIMA Phi coefficients. After call returns MLE estimates for Phi coefficients without leading 1.0.
ARIMA Before call stores initial estimates for ARIMA Theta coefficients. After call returns MLE estimates for Theta coefficients.
Returns residuals between predicted (MLE) and actual time series values.
Returns -2 log likelihood of ARMA model.
Returns the estimated modified series average value (constant).
Number of evaluations needed to converge to MLE solution.
Estimate ARMA(p,t) process coefficients by using MLE.
*)
function ARMAMLE(const Data: TVec; const P, T: TVec; const Residuals: TVec; out MLE: double; out mu: double): Integer; overload;
(*Check AR(MA) coefficients.
Phi (AR) or Theta (MA) coefficients.
If true, Params are checked for causality. If false, Params are checked for invertibility.
true, if coefficients are causal/invertible.
Check if AR coefficients are causal or MA coefficients are invertible.
*)
function CheckARMACoeffs(const Pars: Array of double; const Causality: boolean = true): boolean; overload;
function CheckARMACoeffs(const Coeffs: TVec; const Causality: boolean = true): boolean; overload;
(*-2log likelihood.
Input data.
Optional trend line. Can be nil, if constant (average value) is assumed.
stores phi[0]..phi[p-1] coefficients. The order of AR(p) is defined by Phi vector length.
stores theta[0]..theta[q-1] coefficients. The order of MA(q) is defined by Theta vector length.
stores the "errors" left after the fitting process.
-2log likelihood for ARIMA(p,q,d) process.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector phi = new Vector(0);
Vector theta = new Vector(0);
Vector ts = new Vector(0);
phi.SetIt(false, new double[] {0.33,-0.24});
theta.SetIt(false,new double[] {0.9});
// ARMA(2,1,2) process -> evaluate -2log likelihood
double l = StatTimeSerAnalysis.ARMALogLike(ts,phi,theta);
}
}
*)
function ARMALogLike(const Data, Trend, Phi, Theta, Residuals: TVec): double;
(*The Durbin-Levinson algorithm.
Defines covariances for Durbin-Levinson algorithm.
Returns phi[n,1]...phi[n,n] coefficients.
Defines number of iterations of the Durbin-Levinson algorithm.
Returns estimated variance.
If not nil, it returns phi coefficients variances.
Uses the Durbin-Levinson algorithm to calculate phi[n,1]...phi[n,n] coefficients. Coefficients are
calculated recursively from the following relations:
where phi(1,1) = gamma(1)/gamma(0) and v(0)=gamma(0).
*)
procedure DurbinLevinson(const gamma: TVec; const Phi: TVec; out Sigma2: double; const NumEvals: Integer; const PhiVar: TVec=nil);
(*Yule-Walker AR estimation.
Time series.
Returns estimates for Phi coefficients. AR(p) order is determined by Phi length.
Returns estimate for Sigma^2 i.e. (AR) model variance.
If not nil, it returns estimated phi coefficients standard errors.
Performs Yule-Walker estimation for pure (AR) model.
Calculate initial estimates for AR(3) process by using Yule-Walker algorithm.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector ts = new Vector(0);
Vector phi = new Vector(0);
double s2;
ts.LoadFromFile("timeser.vec");
phi.Length = 3; // for AR(3) process
StatTimeSerAnalysis.ARYuleWalkerFit(ts,phi,out s2,null);
}
}
*)
procedure ARYuleWalkerFit(const Data: TVec; const Phi: TVec; out Sigma2: double; const StdErrs: TVec = nil);
(*Burg AR estimation.
Zero-mean time series. If this is not the case, subtract the mean from data.
Returns estimates for Phi coefficients. AR(p) order is determined by Phi length.
Returns Burg estimated variance for AR process.
Returns estimated phi coefficients standard errors.
Performs Burg estimation for pure (AR) model.
Calculate initial estimates for AR(3) process by using Burg's algorithm.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector ts = new Vector(0);
Vector phi = new Vector(0);
Vector stdErr = new Vector(0);
double s2;
ts.LoadFromFile("timeser.vec");
phi.Length = 3; // for AR(3) process
StatTimeSerAnalysis.ARBurgFit(ts,phi,out s2,stdErr);
}
}
*)
procedure ARBurgFit(const Data: TVec; const Phi: TVec; out Sigma2: double; const StdErrs: TVec);
(*The innovations algorithm.
Defines covariances for innovations algorithm.
Returns Theta[n,1]...Theta[n,n] coefficients.
Returns variance.
Defines number of iterations of the innovation algorithm.
If not nil, returns theta[n,1]..Theta[n,n] variances.
If not nil, returns sum of squares for each theta[n,i] element.
Uses the Innnovations algorithm to recursively calculate Theta[n,1]...Theta[n,n] coefficients.
*)
procedure Innovations(const kappa: TDenseMtxVec; const Theta: TVec; out Sigma2: double; const NumEvals: Integer; const ThetaVar: TVec = nil; const SumSqr: TVec = nil);overload;
(*Uses the Innnovations algorithm to recursively calculate Theta[1,1]...Theta[n,n] coefficients (all coefficients).
The recursion relations are defined by the following equations:
where kappa(i,j) are covariances.
Use this overloaded variant only when you need all theta[1,1]..theta[n,n] values, otherwise use vector version.
*)
procedure Innovations(const kappa: TDenseMtxVec; const q: Integer; const ThetaMtx: TMtx; const Variances: TVec; const NumEvals: Integer);overload;
(*Hannah-Rissanen ARMA estimation.
Time series.
Returns estimates for Phi coefficients. AR(p) order is determined by Phi length.
Returns estimates for Theta coefficients. MA(q) order is determined by Theta length.
Returns estimate for Sigma^2 i.e. ARMA model variance.
Performs Hannah-Rissanen estimation for ARMA(p,q) model.
*)
procedure ARMAHannahFit(const Data: TVec; const Phi, Theta: TVec; out Sigma2: double);
(*Innovations ARMA estimation.
Zero-mean time series. If this is not the case, subtract the mean from data.
Returns estimates for phi coefficients phi[1]..phi[p]. AR(p) order is determined by Phi length.
Returns estimates for theta coefficients theta[1]..theta[q]. MA(q) order is determined by Theta length.
If not nil, returns estimated phi coefficients standard errors.
If not nil, returns estimated theta coefficients standard errors.
Returns estimate for Sigma^2 i.e. MA model variance.
Defines maximum lag used in calculation of ACVF. If MaxLags is -1 then the following formula
will be used to automatically set lag number:Ceil(10*Log10(Data.Length)).
Uses innovations algorithm to predict ARMA(p,q) process coefficients.
*)
procedure ARMAInnovationsFit(const Data: TVec; const Phi,Theta: TVec; out Sigma2: double; const PhiSE :TVec = nil; const ThetaSE: TVec = nil; MaxLags: Integer = -1); overload;
(*Innovations ARMA estimation.*)
procedure ARMAInnovationsFit(const Data: TVec; const Theta: TVec; out Sigma2: double; const StdErrs: TVec = nil; MaxLags: Integer = -1); overload;
(*Single exponential smoothing.
Time series data set.
Smoothed values (see above equation). Size and complex properties of S are set automatically.
Defines initial estimate for Alpha, returns Alpha which minimizes MSE.
Defines how the initial values for S[0] are calculated.
MSE, evaluated at minimum.
Performs single exponential smoothing using the following equation:
This is the basic equation of exponential smoothing and the variable Alpha is called the smoothing constant.
This smoothing scheme begins by setting S[0] to Y[0], where S[i] stands for smoothed observation and Y stands for
the original observation. The subscripts refer to the time periods, 0, 1, ..., n. Note that there is no S[0];
the smoothed series starts with the smoothed version of the second observation. Also note that the internal algorithm
automatically accounts for this by resizing S vector to Y.Length-1.
Setting S[0] to Y[0] is not mandatory. There are numerous ways to initialize S[0]. Some of the choices are:
Different initialization methods are controlled by the InitMethod parameter. Default value (0) uses first equation,
setting it to (1) means the second equation will be used to initialize S[0].
The smoothing constant alpha determines how fast the weights of the series decays. The value may be chosen either subjectively or
objectively. Values near one put almost all weight on the most recent observations. Values of the smoothing constant near zero
allow the distant past observations to have a large influence. When selecting the smoothing constant subjectively, you use your
own experience with this, and similar, series. Also, specifying the smoothing constant yourself lets you tune the forecast to
your own beliefs about the future of the series. If you believe that the mechanism generating the series has recently gone through
some fundamental changes, use a smoothing constant value of 0.9 which will cause distant observations to be ignored. If, however,
you think the series is fairly stable and only going through random fluctuations, use a value of 0.1.
Note
To select the value of the smoothing constant objectively, internal algorithm searches for an Alpha that minimizes the mean squared
error (MSE)of the combined forecast errors of the currently available series.
Load data, perform smoothing and read Alpha + MSE.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector Data = new Vector(0);
Vector S = new Vector(0);
Data.LoadFromFile("aerosol_particles.vec");
// smooth data, initial alpha = 0.1
double alpha = 0.1;
double MSE = StatTimeSerAnalysis.SingleExpSmooth(Data,S,ref alpha,0);
// results: MSE and MLE estimate for Alpha
}
}
*)
function SingleExpSmooth(const Y, S: TVec; var Alpha: double; const InitMethod: Integer = 0): double; overload;
(*In this case a fixed smoothing constant Alpha is used in smoothing equations (no minimization is performed).
Returns MSE, evaluated for constant Alpha.
Time series data set.
Smoothed values (see above equation). Size and complex properties of S are set automatically.
Defines the fixed smoothing constant Alpha (no minimization is performed).
Defines how the initial values for S[0] are calculated.
Load data, perform smoothing with Alpha = 0.33.
Uses MtxExpr, StatTimeSerAnalysis, Math387;
procedure Example;
var Data,S: Vector;
MSE: double;
begin
Data.LoadFromFile('aerosol_particles.vec');
// smooth data with Alpha=0.33
SingleExpSmooth(Data,S,0.33,MSE,0);
// results: MSE
end;
#include "MtxExpr.hpp"
#include "Math387.hpp"
#include "StatTimeSerAnalysis.hpp"
void __fastcall Example();
{
sVector Data,S;
Data.LoadFromFile("aerosol_particles.vec");
// smooth data with Alpha=0.33
double MSE;
SingleExpSmooth(Data,S,0.33,MSE,0);
// results: MSE
}
C# Example
Load data, perform smoothing with Alpha = 0.33.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector Data = new Vector(0);
Vector S = new Vector(0);
Data.LoadFromFile("aerosol_particles.vec");
// smooth data with Alpha=0.33
double MSE;
StatTimeSerAnalysis.SingleExpSmooth(Data,S,0.33,out MSE,0);
// results: MSE
}
}
*)
procedure SingleExpSmooth(const Y, S: TVec; const Alpha: double; out MSE: double; const InitMethod: Integer = 0); overload;
(*Single exponential forecast.
Time series data set.
Time series forecasts. Size of the YHat vector are adjusted automatically.
Overall smoothing parameter used for forecast.
Forecast values up to T period.
Defines how the initial values for S[0] are calculated.
Forecasts time series values by using single exponential smoothing equations. For single exponential smoothing,
the h period ahead forecast is given by:
Load data, assume Alpha is 0.33, forecast 20 points past the last value.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector Data = new Vector(0);
Vector YHat = new Vector(0);
Vector Residuals = new Vector(0);
int NumPoints = 20;
Data.LoadFromFile("aerosol_particles.vec");
// last point period = Data.Length-1 + NumPoints
int T = Data.Length - 1 + NumPoints;
StatTimeSerAnalysis.SingleExpForecast(Data, YHat, 0.33, T, 0);
// YHat now stores estimates for YHat[1,...Length-1]
// so, if we need residuals, we have to subtract
// these values from y[1,...,Length-1)
Residuals.Size(YHat);
Residuals.Sub(Data, YHat, 1, 0, 0, YHat.Length);
}
}
*)
procedure SingleExpForecast(const Y: TVec; const YHat: TVec; const Alpha: double; const T: Integer; const InitMethod: Integer = 0); overload;
(*First estimate the Alpha parameter by single smoothing and then use the returned value to forecast up to T periods.
MSE, evaluated at minimum.
Time series data set.
Time series forecasts. Size of the YHat vector are adjusted automatically.
Overall smoothing parameter used for forecast.
Forecast values up to T period.
Defines how the initial values for S[0] are calculated.
Use this routine if you don't know the best estimates for Alpha.
Do estimation and forecasting in single routine call.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector Data = new Vector(0);
Vector YHat = new Vector(0);
int NumPoints = 20;
Data.LoadFromFile("aerosol_particles.vec");
// last point period = Data.Length-1 + NumPoints
int T = Data.Length-1+NumPoints;
// initial alpha estimate = 0.6
double alpha = 0.6;
double mse;
StatTimeSerAnalysis.SingleExpForecast(Data,YHat,ref alpha,T,out mse,0);
// returns mse and estimated Alpha (from MLE)
}
}
*)
procedure SingleExpForecast(const Y: TVec; const YHat: TVec; var Alpha: double; const T: Integer; out MSE: double; const InitMethod: Integer = 0); overload;
(*Double exponential smoothing.
Time series data set.
Smoothed values (see above equation). Size and complex properties of S are set automatically.
Trend values (see above equation). Size and complex properties of b are set automatically.
Defines initial estimate for Alpha, returns Alpha which minimizes MSE.
Defines initial estimate for Gamma, returns Gamma which minimizes MSE.
Defines how the initial values for b[0] are calculated.
MSE,evaluated at minimum.
Performs double exponential smoothing using the following equations:
Smoothing scheme begins by setting S[0] to Y[0] and b[0] to one of the following choices:
Different initialization methods are controlled by the InitMethod parameter. Default value (0) uses first equation,
setting it to (1) means the second equation will be used and setting it to (2) means the third equation will be used to initialize b[0].
The first smoothing equation adjusts S[i] directly for the trend of the previous period, b[i-1], by adding it to the last smoothed value, S[i-1].
This helps to eliminate the lag and brings S[i] to the appropriate base of the current value. The second smoothing equation then updates the trend,
which is expressed as the difference between the last two values. The equation is similar to the basic form of single smoothing, but here applied
to the updating of the trend.
Load data, perform smoothing and read Alpha,Gamma + MSE.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector Data = new Vector(0);
Vector S = new Vector(0);
Vector b = new Vector(0);
Data.LoadFromFile("aerosol_particles.vec");
// smooth data, initial alpha = 0.1, gamma = 0.3
double alpha = 0.1;
double gamma = 0.3;
double MSE = StatTimeSerAnalysis.DoubleExpSmooth(Data,S,b,ref alpha,ref gamma,1);
// results: MSE and MLE estimate for alpha,gamma
}
}
*)
function DoubleExpSmooth(const Y: TVec; const S, B: TVec; var Alpha, Gamma: double; const InitMethod: Integer = 0): double; overload;
(*In this case a fixed smoothing constants Alpha, Gamma are used in smoothing equations (no minimization is performed).
Returns MSE, evaluated for constant Alpha and Gamma.
Time series data set.
Smoothed values (see above equation). Size and complex properties of S are set automatically.
Trend values (see above equation). Size and complex properties of b are set automatically.
Defines the fixed smoothing constant Alpha (no minimization is performed).
Defines the fixed smoothing constant Gamma (no minimization is performed).
Defines how the initial values for b[0] are calculated.
Load data, perform smoothing with Alpha = 0.2, Gamma = 0.18
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector Data = new Vector(0);
Vector S = new Vector(0);
Vector b = new Vector(0);
Data.LoadFromFile("aerosol_particles.vec");
// smooth data with Alpha=0.2, Gamma=0.18
double MSE;
StatTimeSerAnalysis.DoubleExpSmooth(Data,S,b,0.2,0.18,out MSE,1);
}
}
*)
procedure DoubleExpSmooth(const Y: TVec; const S,B: TVec; const Alpha, Gamma: double; out MSE: double; const InitMethod: Integer = 0); overload;
(*Double exponential forecast.
Time series data set.
Time series forecasts. Size of the YHat vector are adjusted automatically.
Overall smoothing parameter used for forecast.
Trend smoothing parameter used for forecast.
Forecast values up to T period.
Defines how the initial values for b[0] are calculated.
Forecasts time series values by using double exponential smoothing equations. For double exponential smoothing,
the h period ahead forecast is given by:
*)
procedure DoubleExpForecast(const Y: TVec; const YHat: TVec; const Alpha, Gamma: double; const T: Integer; const InitMethod: Integer = 0); overload;
(*First estimate Alpha and Gamma parameters by double smoothing and then use returned values to forecast up to T periods.
Time series data set.
MSE, evaluated at minimum.
Time series forecasts. Size of the YHat vector are adjusted automatically.
Overall smoothing parameter used for forecast.
Trend smoothing parameter used for forecast.
Forecast values up to T period.
Defines how the initial values for b[0] are calculated.
Use this routine if you don't know the best estimates for Alpha and Gamma.
Do estimation and forecasting in single routine call.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector Data = new Vector(0);
Vector YHat = new Vector(0);
int NumPoints = 20;
Data.LoadFromFile("aerosol_particles.vec");
// last point period = Data.Length-1 + NumPoints
int T = Data.Length-1+NumPoints;
// initial estimates for alpha, gamma
double alpha = 0.1;
double gamma = 0.1;
double MSE;
StatTimeSerAnalysis.DoubleExpForecast(Data,YHat,ref alpha,ref gamma,T,out MSE,1);
// returns MSE and estimated Alpha, Gamma (from MLE)
}
}
*)
procedure DoubleExpForecast(const Y: TVec; const YHat: TVec; var Alpha,Gamma: double; const T: Integer; out MSE: double; const InitMethod: Integer = 0); overload;
(*Triple exponential smoothing.
Time series data set.
Smoothed values (see above equation). Size and complex properties of S are set automatically.
Trend values (see above equation). Size and complex properties of b are set automatically.
Seasonal indices (see above equation). Size and complex properties of L are set automatically.
Defines initial estimate for Alpha, returns Alpha which minimizes MSE.
Defines initial estimate for Beta, returns Beta which minimizes MSE.
Defines initial estimate for Gamma, returns Gamma which minimizes MSE.
Period length. An exception is raised if Y.Length mod Period is not 0.
MSE, evaluated at minimum.
Performs triple exponential smoothing (also known as Holt-Winters smoothing) using the following equations:
where Y are the observations, S are the smoothed observations, b trend factors, L the seasonal indices and P is the period length.
To initialize triple exponential smoothing method we need at least one complete season's data to determine initial estimates of the seasonal indices
L[0]..L[P-1]. Again, there are several ways to initialize L values. The algorithm uses approach, described at
www.itl.nist.gov/div898/handbook/pmc/section4/pmc435.htm page.
For initial estimate for S and b, the following equations are being used:
Note
There are no S[0]..S[P-2] values; the smoothed series starts with the smoothed version of the Y[P] observation. Also note that the internal
algorithm automatically accounts for this by resizing S,b vector to Y.Length-Period.
Generate 24 random values representing 4 quarters x 6 years = 24, perform smoothing and read
Alpha,Beta,Gamma + MSE.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector Data = new Vector(0);
Vector S = new Vector(0);
Vector b = new Vector(0);
Vector L = new Vector(0);
Data.Size(24,false);
Data.RandGauss();
// smooth data, initial alpha = 0.1, beta=0.1, gamma = 0.3
double alpha = 0.1;
double beta = 0.1;
double gamma = 0.3;
// Period = 4
double MSE = StatTimeSerAnalysis.TripleExpSmooth(Data,S, b, L, ref alpha,ref beta, ref gamma,4);
// results: MSE and MLE estimate for Alpha,Beta,Gamma
}
}
*)
function TripleExpSmooth(const Y: TVec; const S,B,L: TVec; var Alpha,Beta,Gamma: double; const Period: Integer): double; overload;
(*In this case a fixed smoothing constants Alpha, Beta and Gamma are used in smoothing equations (no minimization is performed).
Returns MSE, evaluated for constant Alpha, Beta and Gamma.
Time series data set.
Smoothed values (see above equation). Size and complex properties of S are set automatically.
Trend values (see above equation). Size and complex properties of b are set automatically.
Seasonal indices (see above equation). Size and complex properties of L are set automatically.
Defines the fixed smoothing constant Alpha (no minimization is performed).
Defines the fixed smoothing constant Beta (no minimization is performed).
Defines the fixed smoothing constant Gamma (no minimization is performed).
Period length. An exception is raised if Y.Length mod Period is not 0.*)
procedure TripleExpSmooth(const Y: TVec; const S,B,L: TVec; const Alpha,Beta,Gamma: double; out MSE: double; const Period: Integer); overload;
(*Triple exponential forecast.
Time series data set.
Time series forecasts. Size of the YHat vector are adjusted automatically.
Overall smoothing parameter used for forecast.
Trend smoothing parameter used for forecast.
Seasonal smoothing parameter used for forecast.
Forecast values up to T period.
Period length. An exception is raised if Y.Length mod Period is not 0.
The h period ahead forecast is given by:
where P is period length.
*)
procedure TripleExpForecast(const Y: TVec; const YHat: TVec; const Alpha,Beta,Gamma: double; T: Integer; const Period: Integer); overload;
(*First estimate Alpha, Beta and Gamma parameters by triple exponential smoothing and then use returned values to forecast up to T periods.
MSE, evaluated at minimum.
Time series data set.
Time series forecasts. Size of the YHat vector are adjusted automatically.
Overall smoothing parameter used for forecast.
Trend smoothing parameter used for forecast.
Seasonal smoothing parameter used for forecast.
Forecast values up to T period.
Period length. An exception is raised if Y.Length mod Period is not 0.
Use this routine if you don't know the best estimates for Alpha, Beta and Gamma.
*)
procedure TripleExpForecast(const Y: TVec; const YHat: TVec; var Alpha,Beta,Gamma: double; const T: Integer; out MSE: double; const Period: Integer); overload;
(*Single moving average.
Sample data.
Number of elements in period.
Smoothed data.
Index of first value in smoothed data.
If true, a centered moving average is performed.
MSE.
Performs single moving average smoothing on data Y. General equation for moving average smoothing is:
where N indicates number of points in period and X.Length data sample size. When using single moving average smoothing,
bear in mind that when used as forecasts for the next period, single moving average is not able to cope with a significant
trend.
Load sample time series, perform centered moving average smoothing with period 12 (yearly average) and finally plot
the results using TChart.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
using Steema.TeeChart;
namespace Dew.Examples
{
private void Example(Styles.Line line1, Styles.Line line2)
{
Vector ts = new Vector(0);
Vector Mv = new Vector(0);
int FirstIndex;
ts.LoadFromFile("testdata.vec");
MtxVecTee.DrawValues(ts,line1,0.0,1.0); // draw original time series
StatTimeSerAnalysis.MovingAverage(ts,12,Mv,out FirstIndex,true);
MtxVecTee.DrawValues(Mv,line2,FirstIndex,1.0); // draw MA over original time series
}
}
*)
function MovingAverage(const Y: TVec; const N: Integer; const M: TVec; out Index: Integer; const Centered: boolean = true): double;
(*Memory-shortening filter.
Original time series.
Returns transformed time series.
Returns optimal Tau shortening index.
If set, it returns coefficients of memory shortening filter. Size of Phi is adjusted automatically.
Maximum number of iterations in ERR minimization (default value 15).
Decide if time series is "long-memory", and if so, apply a memory-shortening transformation, as defined in
Brockwell, page 318.
Shorten time series and retrieve optimal tau lag and shortening filter polynomial coefficients.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector ts = new Vector(0);
Vector s = new Vector(0);
Vector phi = new Vector(0);
int tau;
ts.LoadFromFile("Deaths.vec");
StatTimeSerAnalysis.ShortenFilter(ts,s,out tau,phi,15);
}
}
*)
procedure ShortenFilter(const Data: TVec; const S: TVec; out Tau: Integer; const Filter: TVec = nil; const MaxTau: Integer = 15);
(*Fit ARAR algorithm.
Memory-shortened time series. If no memory-shortening was performed, then S defines the original unshortened time series.
Returns ARAR model phi coefficients (phi[1],phi[l1],phi[l2],phi[l3]). Size of Phi vector is adjusted automatically (4).
Returns ARAR model phil1 lag.
Returns ARAR model phil2 lag.
Returns ARAR model phil3 lag.
Returns ARAR model estimated WN variance.
Defines upper limit for l3, where 1 < l1 < l2 < l3 <= MaxLag.
Fit ARAR algorithm to (optionally) memory-shortened series. Let S[t] denote memory-shortened series, derived from Y[t] and let
avg(S) denote sample mean of S[t]. The ARAR algorithm tries to fit an autoregressive (AR) process to the mean-corrected series:
The fitted model then has the form:
where Z[t] is WN(0,sigma2).
Fit and then forecast time series values by using ARAR algorithm. Before applying the ARAR algorithm, use the
shortening filter on original series.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector timeseries = new Vector(0);
Vector s = new Vector(0);
Vector filter = new Vector(0);
Vector phi = new Vector(0);
Vector forecasts = new Vector(0);
Vector stderrs = new Vector(0);
int l1, l2, l3, tau;
double s2, rmse;
timeseries.LoadFromFile("deaths.vec");
// #1: shorten series
StatTimeSerAnalysis.ShortenFilter(timeseries, s, out tau, filter, 15);
// #2 : fit ARAR model on shortened series
StatTimeSerAnalysis.ARARFit(s, phi, out l1, out l2, out l3, out s2, 13);
// #3: forecast 100 values by using ARAR fit parameters
StatTimeSerAnalysis.ARARForecast(timeseries, phi, filter, tau, l1, l2, l3, s.Mean(), 100, forecasts, stderrs, out rmse);
}
}
*)
procedure ARARFit(const S: TVec; const Phi: TVec; out l1,l2,l3: Integer; out Sigma2: double; const MaxLag: Integer);
(*Forecast time series by ARAR.
Defines original time series.
Defines ARAR model Phi coefficients (phi[0],phi[1],phi[2],phi[3]).
Defines memory shortening filter, obtained from memory-shortening operation. In case no memory-shortening is performed, set filter to 1.0 by using Filter.SetIt([1.0]).
Defines memory-shortening optimal lag, obtained from memory-shortening operation. In case no memory-shortening is performed, set it to 1.
Defines optimal lag for phi[l1] (see equation above).
Defines optimal lag for phi[l2] (see equation above).
Defines optimal lag for phi[l3] (see equation above).
Defines memory-shortened series mean.
Defines number of forecasts.
Returns forecasts. Size and complex properties of Result are adjusted automatically.
Returns forecasts standard errors. Size and complex properties of StdErrs are adjusted automatically.
Returns fit root mean square error (RMSE).
Forecast time series values by using ARAR model, defined by the following relation:
Fit and then forecast time series values by using ARAR algorithm. Before applying the ARAR algorithm, use the
shortening filter on original series.
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Vector timeseries = new Vector(0);
Vector s = new Vector(0);
Vector filter = new Vector(0);
Vector phi = new Vector(0);
Vector forecasts = new Vector(0);
Vector stderrs = new Vector(0);
int l1, l2, l3, tau;
double s2, rmse;
timeseries.LoadFromFile("deaths.vec");
// #1: shorten series
StatTimeSerAnalysis.ShortenFilter(timeseries, s, out tau, filter, 15);
// #2 : fit ARAR model on shortened series
StatTimeSerAnalysis.ARARFit(s, phi, out l1, out l2, out l3, out s2, 13);
// #3: forecast 100 values by using ARAR fit parameters
StatTimeSerAnalysis.ARARForecast(timeseries, phi, filter, tau, l1, l2, l3, s.Mean(), 100, forecasts, stderrs, out rmse);
}
}
*)
procedure ARARForecast(const Data: TVec; const Phi, Filter: TVec; const tau,l1,l2,l3: Integer; const SMean: double; const N: Integer; const aResult, StdErrs: TVec; out RMSE: double);
(*Box-Cox transformation.
Time series.
Transformed time series. Size of Y is adjusted automatically.
Box-Cox transformation lambda value. An exception is raised if Lambda < 0 .
Performs Box-Cox transformation on time series. The Box-Cox transformation is defined by the following equation:
*)
procedure BoxCox(const U: TVec; const Y: TVec; const Lambda: double);
(*Inverse Box-Cox transformation.
Transformed time series.
Time series. Size of Data is adjusted automatically.
Box-Cox transformation lambda value. An exception is raised if Lambda < 0 .
Performs the inverse Box-Cox transformation on time series. The inverse Box-Cox transformation is defined by the following
equation:
*)
procedure BoxCoxInv(const Y: TVec; const U: TVec; const Lambda: double);
(*The Box-Ljung statistic.
Defines the residuals of predicted values.
Defines the number of lags used in statistics.
Returns the Box-Ljung statistic.
The Ljung-Box test is based on the autocorrelation plot. However, instead of testing randomness
at each distinct lag, it tests the "overall" randomness based on a number of lags. For this reason, it is often referred to
as a "portmanteau" test. The Ljung-Box test statistics can be defined as follows:
where n is the sample size, rho(j) is the autocorrelation at lag j, and h is the number of lags being tested.
Actually we are testing the hypothesis:
- H: The data are random.
- Ha: The data are not random.
The Ljung-Box test is commonly used in ARIMA modeling. Note that it is applied to the residuals of a fitted ARIMA model, not the original series.
*)
function BoxLjung(const X: TVec; const h: Integer): double;
(*Transforms model parameters in Src and stores the result in Dst (inverse of TransformParams).
  NOTE(review): undocumented in original source; k, p and q presumably denote the
  parameter count and the AR/MA orders of an ARMA model - confirm against the implementation.*)
procedure InvTransformParams(const Src, Dst: TVec; k, p, q: integer);
(*Transforms model parameters in Src and stores the result in Dst (inverse of InvTransformParams).
  NOTE(review): undocumented in original source; confirm semantics of k, p, q against the implementation.*)
procedure TransformParams(const Src, Dst: TVec; k, p, q: integer);
(*Calculates the Durbin-Watson statistic
The Durbin-Watson tests for presence of correlation between consecutive residuals. We are testing hypothesis:
- H0: The residuals are not correlated.
- HA: Residuals are autocorrelated.
The test statistic d ranges from 0 to 4, where a value of d means:
- 2 : autocorrelation not present.
- < 2: positive serial correlation.
- > 2: negative serial correlation.
A perfect result is a value equal to two. Value less than 1.5 or bigger than 2.5 indicates an autocorrelation problem.
*)
function DurbinWatson(const Residuals: TVec): double;
(*Interfaces several Stats Master routines and introduces new components.*)
unit StatTools;
{$I bdsppdefs.inc}
interface
Uses Math387, MtxVec, MtxVecInt, Optimization, Statistics, Regress, MtxBaseComp
,Classes
,SysUtils
,Grids
,Types
;
type
(*Calculates statistics from multiple linear regression parameters.
Calculates statistics from multiple linear regression parameters. This class is used
by the component to calculate regression statistics parameters.
The following example was taken from TMtxMulLinReg.Recalc method:
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Matrix A = new Matrix(0,0);
Matrix V = new Matrix(0,0);
TRegResultClass regres = new TRegResultClass();
TRegStatsClass regstat = new TRegStatsClass();
Regress.MulLinRegress(Y,A,regres.B,true,regres.YCalc,V);
regres.CalculateRegResults(regres.Y,V,regstat,false,null,0.03);
}
}
*)
TRegStatsClass = class
private
FResidualVar: double;
FSignifProb: double;
FAdjustedR2: double;
FR2: double;
FF: double;
FTSS: double;
FSSE: double;
FRSS: double;
public
(*Calculates statistics parameters from multiple linear regression.*)
procedure CalculateRegStats(Y,YCalc: TVec; V: TMtx; Constant: Boolean = true; Weights: TVec = nil);
(*Returns the residual variance.*)
property ResidualVar: double read FResidualVar;
(*Returns the coefficient of determination (R squared).
Returns the coefficient of determination (R squared).
R2 is a measure of the amount of reduction in the variability of Y obtained by using the regressor variables X.
If R lies in the [0,1] interval the regression model adequately describes relation
between Y and X. However, a large value of R2 does not necessariliy imply that regression model
is a good one. Adding variable to the model will always increase R2, regardless of whether or
not the additional variable is statistically significant. Thus, models that have large values
of R2 may yield poor predictions of new observations or estimates of the mean response.
*)
property R2: double read FR2;
(*Returns the adjusted coefficient of determination.
Returns the adjusted coefficient of determination. The adjustment seeks to remove
the distortion due to a small sample size.
*)
property AdjustedR2: double read FAdjustedR2;
(*Returns the variance ratio (explained/residual ratio).
Returns the variance ratio (explained/residual ratio). This is the F (Fisher) statistic for testing
the null hypothesis that all b[i] regression coefficients = 0.
*)
property F: double read FF;
(*Returns the probability of F.
Returns the probability of F. This is the p-value for the value. The p-value
is the probability that the test statistic will take on a value at least as extreme as the observed
value, assuming that the null hypothesis is true.
*)
property SignifProb: double read FSignifProb;
(*Returns total sum of squares.*)
property TSS: double read FTSS;
(*Returns regression sum of squares.*)
property RSS: double read FRSS;
(*Returns error sum of squares.*)
property SSE: double read FSSE;
Destructor Destroy; override;
end;
(*Calculates additional parameters from multiple linear regression.
Calculates additional parameters from multiple linear regression. Use this class to calculate
regression coefficients (B) confidence interval, regression coefficients standard deviations and
residuals.
How to use TRegResultClass class?
1. Create TRegResultClass: TestClass := TRegResultClass.Create;
2. Use TestClass.B and TestClass.YCalc vectors as parameters in routine call.
3. Calculate additional parameters by using the method.
The following example was taken from TMtxMulLinReg.Recalc method:
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example()
{
Matrix A = new Matrix(0, 0);
Matrix V = new Matrix(0, 0);
Vector Y = new Vector(100);
TRegResultClass regres = new TRegResultClass();
TRegStatsClass regstat = new TRegStatsClass();
Regress.MulLinRegress(Y, A, regres.B, true, regres.YCalc, V, TRegSolveMethod.regSolveLQR);
regres.CalculateRegResults(Y, V, regstat,false, null, 0.03);
}
}
*)
TRegResultClass = class
strict private
FBConfInt: TMtx;
FB: TVec;
FYCalc: TVec;
FResiduals: TVec;
FBStdDev: TVec;
tmpVec: TVec;
function TVal(Y: TVec; V: TMtx; Alpha: double): double;
procedure SetB(const Value: TVec);
public
(*Calculates additional parameters from multiple linear regression.*)
procedure CalculateRegResults(Y: TVec; V: TMtx; Constant: Boolean = True; Weights: TVec = nil; Alpha: double = 0.05); overload;
(*Calculates regression coefficient confidence intervals, standard deviation of regression coefficients, residuals and (optional) basic regression statistics.
Vector of dependent variable.
inverse matrix of normal equations.
If true then include intercept term b(0) in calculations.
If false - set intercept term (0) to 0.0.
Weights (optional).
Desired significance probability.
Returns regression statistics.
The results are stored to:
- : regression coefficients 100(1-Alpha) confidence intervals.
- : standard deviation of regression coefficients.
- : Residuals.
- RegStats : Regression statistics.
*)
procedure CalculateRegResults(Y: TVec; V: TMtx; RegStats: TRegStatsClass; Constant: Boolean = True; Weights: TVec = nil; Alpha: double = 0.05); overload;
procedure Assign(Source: TRegResultClass);
(*Regression coefficients.
Defines regression coefficients. Regression coefficients can be passed
from routine or from component.
*)
property B: TVec read FB write SetB;
(*Regression coefficients standard deviations.
Returns regression coefficients standard deviations.
*)
property BStdDev: TVec read FBStdDev;
(*Regression coefficients confidence intervals.
BConfInt matrix returns the 100*(1-Alpha) regression coefficients
confidence intervals. The first BConfInt column is lower confidence
interval, the second column is upper confidence interval.
*)
property BConfInt: TMtx read FBConfInt;
(*Dependent variable predicted values.
Defines dependent variable predicted (calculated) values. YCalc can be passed from
routine or from component.
*)
property YCalc: TVec read FYCalc;
(*Regression equation residuals.
Returns the residuals of the regression equation. The residuals are defined
as difference between predicted and actual values of the dependent variable:
Residual = YCalc - Y.
*)
property Residuals: TVec read FResiduals;
constructor Create;
Destructor Destroy; override;
end;
(*Defines hypothesis test/method.
Defines which hypothesis test/method will be used by component.
*)
THypothesisMethod = (
(*Sign test (comparing data median with value).*)
hmSignTest,
(*Sign test on paired data (comparing medians of two datasets).*)
hmSignTestPaired,
(*Wilcoxon Signed Rank test (comparing data median with value).*)
hmWilcoxonSign,
(*Wilcoxon Signed Rank test (comparing medians of two datasets).*)
hmWilcoxonSignPaired,
(*Z-Test (comparing dataset mean with value. Dataset variance is known).*)
hmZTest,
(*One sample t-Test (comparing dataset mean with value).*)
hmTTest1,
(*Two sample t-Test on pooled data(comparing means of two datasets. Dataset variances are equal, but unknown).*)
hmTTest2Pooled,
(*Two sample t-Test on paired data (comparing means of two datasets. Dataset variances are equal, but unknown).*)
hmTTest2Paired,
(*One sample Chi-Squared test (comparing data variance with value).*)
hmChiSquareTest,
(*Two sample F-Test (comparing variances of two datasets).*)
hmFTest,
(*Two sample Mann-Whitney U test.*)
hmMannWhitney,
(*One sample Shapiro-Francia test.*)
hmShapiroFrancia,
(*One sample Shapiro-Wilks test.*)
hmShapiroWilks,
(*One sample Anderson Darling normality test.*)
hmAndersonDarling
);
(*Encapsulates parametric and non-parametric hypothesis testing routines.
Encapsulates parametric and non-parametric hypothesis testing routines.
Many problems in engineering require that we decide whether to accept or
reject a statement about some parameter. The statement is called a hypothesis
and the decision-making procedure about the hypothesis is called hypothesis testing.
The following hypothesis tests are supported:
* one or two sample Sign test,
* one or two sample Wilcoxon Signed-Rank test,
* one or two sample (on pooled or paired data) T test,
* one sample Z test,
* one sample ChiSquared test,
* two sample F test.
How to use TMtxHypothesisTest component?
* Drop a TMtxHypothesisTest component on the form.
* Define the hypothesis test you want to perform (Sign, Wilcoxon, Z, T, ChiSquared, F).
* Define hypothesis type (the null hypothesis type) : two, left or right tailed.
* Depending on hypothesis method define one or two sample datasets. You can do this by
clicking on (and ) properties.
* Depending on hypothesis method you'll also have to define test mean (and in Z, F Test
case sigma) property.
* Define the desired significance level .
* Call the method to calculate hypothesis test results.
Results:
* : The hypothesis result (accept, reject null hypothesis).
* : actual significance level.
* , : 100*(1-Alpha) confidence
interval limits for mean (median, standard deviation).
How to setup and run two sample t-test.
using Dew.Stats.Units;
using Dew.Stats;
using Dew.Math;
namespace Dew.Examples
{
private void Example(TMtxHypothesis hyp)
{
hyp.Alpha = 0.03; // desired significance level at 3%
hyp.HypothesisType = THypothesisType.htTwoTailed; // two tailed => Ha= means are not equal
hyp.DataVec1.SetIt(false, new double[] { 1.0, 2.0, 3.0, 4.0 }); // first dataset
hyp.DataVec2.SetIt(false, new double[] { 1.5, 3.1, 4.2, 4.3 }); // second dataset
hyp.HypothesisMethod = THypothesisMethod.hmTTest2Pooled; // comparing means of two datasets
hyp.Recalc();
//Results ==> Significance = 0.43036618314 , Result = hrNotReject
}
}
*)
TMtxHypothesisTest=class(TMtxComponent)
strict private
FResult: THypothesisResult;
FHypothesisType: THypothesisType;
FAlpha: double;
FConfLower: double;
FConfUpper: double;
FSignificance: double;
FDataVec1: TVec;
FDataVec2: TVec;
FMean,FSigma,FTestStatistics : double;
FHypothesisMethod: THypothesisMethod;
FAutoUpdate: boolean;
FDirty: boolean;
procedure SetHypothesisType(const Value: THypothesisType);
procedure SetAlpha(const Value: double);
procedure SetMean(const Value: double);
procedure SetSigma(const Value: double);
procedure SetHypothesisMethod(const Value: THypothesisMethod);
procedure SetAutoUpdate(const Value: boolean);
procedure SetDirty(const Value: boolean);
procedure SetDataVec1(const Value: TVec);
procedure SetDataVec2(const Value: TVec);
strict protected
procedure Loaded; override;
public
class function EditorClass: string; override;
(*When any of the TMtxHypothesisTest properties changes, this property is automatically set to true.
If AutoUpdate and Dirty are both true then the Recalc
method will be called automatically.
*)
property Dirty : boolean read FDirty write SetDirty;
(*Returns calculated significance level.
Returns calculated significance level. If this value is below the desired significance
(Alpha), the null hypothesis is rejected.
*)
property Significance : double read FSignificance;
(*Returns test statistics.*)
property TestStatistics: double read FTestStatistics;
(*Returns lower confidence interval limit.
Returns the 100*(1-Alpha) confidence interval lower limit.
*)
property ConfLower: double read FConfLower;
(*Returns upper confidence interval limit.
Returns the 100*(1-Alpha) confidence interval upper limit.
*)
property ConfUpper: double read FConfUpper;
(*Returns hypothesis result.*)
property Result: THypothesisResult read FResult;
(*Triggers hypothesis test recalculation.
Triggers hypothesis test recalculation. After the Recalc call the Dirty property is set to false.
The results of hypothesis test are stored to:
* Result - Reject/do not reject hypothesis.
* ConfLower - Lower confidence interval for mean, standard deviation, median, ...
* ConfUpper - Upper confidence interval for mean, standard deviation, median, ...
* Significance - Calculated significance probability.
*)
procedure Recalc;
constructor Create(AOwner: TComponent); override;
destructor Destroy ; override;
published
(*If true then changing any of the TMtxHypothesisTest properties will trigger the
Recalc method.
*)
property AutoUpdate : boolean read FAutoUpdate write SetAutoUpdate default false;
(*Defines hypothesis type (left, right and two-tailed).*)
property HypothesisType : THypothesisType read FHypothesisType write SetHypothesisType default htTwoTailed;
(*Defines which hypothesis routine will be used for data testing.*)
property HypothesisMethod: THypothesisMethod read FHypothesisMethod write SetHypothesisMethod default hmZTest;
(*Desired significance level.
Desired significance level. If the Significance probability is below the
desired significance (Alpha), the null hypothesis is rejected.
*)
property Alpha : double read FAlpha write SetAlpha;
(*Defines mean/median value for one-sample test.
Defines the test value (mean, median) for a one-sample test.
If HypothesisMethod is hmTTest1 or hmZTest, Mean will be compared with the
DataVec1 mean value. If HypothesisMethod is hmSignTest or hmWilcoxonSign, Mean will be
compared with the DataVec1 median value.
Mean will be ignored for hmChiSquaredTest and all two-sample tests.
*)
property Mean : double read FMean write SetMean;
(*Defines standard deviation value for single test.
Defines standard deviation for one-sample test.
Note
Sigma will be used only if HypothesisMethod is hmZTest or
hmChiSquaredTest. Sigma will be ignored for all other tests.
*)
property Sigma : double read FSigma write SetSigma;
(*First sample values.
Stores first sample values. If the HypothesisMethod property is
hmTTest1, hmZTest then DataVec1 mean is compared with Mean (+ Sigma for Z-Test).
If HypothesisMethod property is hmSignTest or hmWilcoxonTest then DataVec1 median
is compared with Mean. If HypothesisMethod is hmChiSquared then
DataVec1 Standard Deviation is compared with Sigma. When HypothesisMethod is
something else then DataVec1 mean, median or standard deviation is compared with DataVec2 mean, median or
standard deviation.
*)
property DataVec1: TVec read FDataVec1 write SetDataVec1 stored AllowStreaming;
(*Second sample values.
Stores second sample values. DataVec2 is used only if HypothesisMethod is
hmSignTestPaired, hmWilcoxonSignPaired, hmTTest2Pooled, hmTTest2Paired or
hmFTest (all two-sample tests). If HypothesisMethod is something else then DataVec2 is ignored.
*)
property DataVec2: TVec read FDataVec2 write SetDataVec2 stored AllowStreaming;
end;
(*Visual representation of One and Two way ANOVA.
Encapsulates one and two way analysis of variance (ANOVA) routines. The assumptions of
the one and two-way analysis of variance are:
- The data are continuous (not discrete).
- The data follow the normal probability distribution. Each group is normally distributed about the group mean.
- The variances of the populations are equal.
- The groups are independent. There is no relationship among the individuals in one group as compared to another.
- Each group is a simple random sample from its population. Each individual in the population has an equal probability of being selected in the sample.
How to use TMtxAnova component?
- Drop a TMtxAnova component on the form.
- Define if you'll do a one-way or two-way ANOVA (the IsAnova1 property).
- If you'll be doing the two-way ANOVA, define the number of replications per "cell" (the Replications property).
- Define the data you wish to analyze. Depending on IsAnova1 and Replications properties Data matrix can be interpreted in different ways.
- Define the default number format for ANOVA table - the FmtString property. Default value is '0.0000'.
- Define (optional) the ResultDest TStringGrid. TMtxAnova results will be outputted to a string grid.
- Perform ANOVA by calling the Recalc method.
Results:
- If you specified ResultDest then the standard ANOVA table will be written to ResultDest string grid.
If ResultDest is nil, you can still use the WriteResultToGrid or WriteResultToStrings methods
to output ANOVA table to a TStringGrid or TStrings.
- P returns the significance probability of null hypothesis that means are equal. If P is less than desired
significance then the result suggests, the null hypothesis (means are equal) can be rejected.
How to set up and run a two-way ANOVA.
using Dew.Stats.Units;
using Dew.Stats;
using Dew.Math;
namespace Dew.Examples
{
private void Example()
{
TMtxAnova an1 = new TMtxAnova(this);
try
{
an1.Data.SetIt(6, 2, false, new double[] { 1, 2,
2, 3,
5, 7,
12, 1,
5, 8,
3, 8 });
an1.IsAnova1 = false; // do two-way ANOVA
an1.Replications = 3; // three rows per "group"
// output ANOVA table to grid1
an1.Recalc();
double aresult = an1.Anova2Res.FCrit;
}
finally
{
an1.Free();
}
}
}
*)
TMtxAnova = class(TMtxComponent)
strict private
FDirty: boolean;
FIsAnova1: boolean;
FAutoUpdate: boolean;
FReplications: Integer;
FFmtString: String;
FData: TMtx;
FAlpha: double;
FP: double;
FA1Res: TAnova1Result;
FA2Res: TAnova2Result;
pRows, pCols, pInt : double;
FResultDest: TStringGrid;
procedure SetAlpha(const Value: double);
procedure SetAutoUpdate(const Value: boolean);
procedure SetData(const Value: TMtx);
procedure SetDirty(const Value: boolean);
procedure SetFmtString(const Value: String);
procedure SetIsAnova1(const Value: boolean);
procedure SetReplications(const Value: Integer);
procedure SetResultDest(const Value: TStringGrid);
procedure SetupResultGrid(const DestGrid: TStringGrid);
strict protected
procedure Loaded; override;
public
(*Results of the one way ANOVA.*)
property Anova1Res: TAnova1Result read FA1Res;
(*Results of the two way ANOVA.*)
property Anova2Res: TAnova2Result read FA2Res;
(*Calculated significance probability of null hypothesis.
Returns the probability for the null hypothesis that the means of the groups
are equal.
*)
property P: double read FP;
(*When any of the TMtxANOVA properties changes, this property is automatically set to true.
If AutoUpdate and Dirty are both true then the Recalc method
will be called automatically.
*)
property Dirty: boolean read FDirty write SetDirty;
(*Outputs results to string grid.
Outputs standard ANOVA table to DestGrid string grid.
In this example we write the ANOVA table to
StringGrid1. This is the same as connecting the ResultDest
property to StringGrid1. DestGrid dimensions are adjusted automatically.
MtxANOVA1.Data.SetIt(6,2,false,[1,2,
2,3,
5,7,
12,1,
5,8,
3,8]);
MtxANOVA1.FmtString := '0.00'; // 2 digits
MtxANOVA1.IsAnova1 := true; // one-way ANOVA
MtxANOVA1.Recalc;
MtxANOVA1.WriteResultToGrid(StringGrid1);
MtxAnova1->Data->SetIt(6,2,false,OPENARRAY(double,(1,2,
2,3,
5,7,
12,1,
5,8,
3,8)));
MtxAnova1->FmtString = "0.00"; // 2 digits
MtxAnova1->IsAnova1 = true; // one-way ANOVA
MtxAnova1->Recalc();
MtxAnova1->WriteResultToGrid(StringGrid1);
*)
procedure WriteResultToGrid(DestGrid: TStringGrid);
(*Outputs results to string list.
Outputs standard ANOVA table to DestStrings. ANOVA table columns
are separated by Delimiter (default value kTab - meaning tab char is
the delimiter).
In this example we write ANOVA table to Memo1.
MtxANOVA1.Data.SetIt(6,2,false,[1,2,
2,3,
5,7,
12,1,
5,8,
3,8]);
MtxANOVA1.FmtString := '0.00'; // 2 digits
MtxANOVA1.IsAnova1 := true; // do one-way ANOVA
MtxANOVA1.Recalc;
Memo1.Lines.Clear;
MtxANOVA1.WriteResultToStrings(Memo1.Lines);
MtxAnova1->Data->SetIt(6,2,false,OPENARRAY(double,(1,2,
2,3,
5,7,
12,1,
5,8,
3,8)));
MtxAnova1->FmtString = "0.00"; // 2 digits
MtxAnova1->IsAnova1 = true; // one-way ANOVA
MtxAnova1->Recalc();
MtxAnova1->WriteResultToStrings(Memo1->Lines);
*)
procedure WriteResultToStrings(DestStrings: TStrings; Delimiter: string = kTab);
(*Triggers ANOVA recalculation.
Triggers ANOVA recalculation. After the Recalc call the Dirty property is set to false.
If the ResultDest property is not nil then the standard ANOVA table will be written to
the ResultDest string grid. To visualize ANOVA table you can also use the WriteResultToGrid or
WriteResultToStrings methods.
*)
procedure Recalc;
constructor Create(AOwner: TComponent); override;
Destructor Destroy; override;
published
(*If true then changing any of the TMtxAnova properties will trigger the
Recalc method.
*)
property AutoUpdate: boolean read FAutoUpdate write SetAutoUpdate default false;
(*Data for ANOVA.
Stores the data which is analyzed with analysis of variance (ANOVA).
Depending on the IsAnova1 and Replications
properties, Data storage format is:
* IsAnova1 = true : each column represents a separate group
* IsAnova1 = false : each column represents changes in one factor,
rows represent changes in other factor.
*)
property Data: TMtx read FData write SetData stored AllowStreaming;
(*One or two-way ANOVA.
If true then TMtxANOVA performs one-way ANOVA. If false, then TMtxAnova
performs two-way ANOVA.
*)
property IsAnova1: boolean read FIsAnova1 write SetIsAnova1 default true;
(*Number of observations per data matrix "cell".
Defines the number of observations per matrix "cell",
where each cell includes Replications number of rows. For example,
if Data has 6 rows and Replications is 2, then the "row" factor has 6/2 = 3 levels
(1st level row 0..1, 2nd level row 2..3, 3rd level row 4..5). An exception is raised
if A.Rows mod Replications is not zero (if A.Rows/Replications is not an integer).
The Replications factor is taken into the account ONLY if
IsAnova1 is false (two-way ANOVA).
Note that Data.Rows mod 3 = 0 !!
MtxAnova1.Data.SetIt(6,2,false,[1,2,
2,3,
5,7,
12,1,
5,8,
3,8]);
MtxAnova1.IsAnova1 := false; // do two-way ANOVA
MtxAnova1.Replications := 3; // three rows per "group"
MtxAnova1.ResultDest := StringGrid1;
MtxAnova1.Recalc;
MtxAnova1->Data->SetIt(6,2,false,OPENARRAY(double,(1,2,
2,3,
5,7,
12,1,
5,8,
3,8)));
MtxAnova1->IsAnova1 = false; // do two-way ANOVA
MtxAnova1->Replications = 3; // three rows per "group"
MtxAnova1->ResultDest = StringGrid1;
MtxAnova1->Recalc();
*)
property Replications: Integer read FReplications write SetReplications default 1;
(*Desired significance probability.
Desired significance probability, used to calculate the critical F value for
one-way and two-way ANOVA.
*)
property Alpha : double read FAlpha write SetAlpha;
(*Numeric format for ANOVA table cells.
Defines the number format used for all TMtxANOVA floating-point values.
In the following example we do two-way ANOVA with
two rows per group and set default format for numbers to '0.000000'
i.e. 6 digits.
MtxAnova1.Data.SetIt(4,4,false,[1,2,3,4,
3,4,5,6,
1,5,11,13,
4,7,9,10]);
MtxAnova1.FmtString := '0.000000'; // < - - 6 digits
MtxAnova1.IsAnova1 := false; //do two-way ANOVA
MtxAnova1.Replications := 2; //two rows per "group"
MtxAnova1.ResultDest := StringGrid1;
MtxAnova1.Recalc;
MtxAnova1->Data->SetIt(4,4,false,OPENARRAY(double,(1,2,3,4,
3,4,5,6,
1,5,11,13,
4,7,9,10)));
MtxAnova1->FmtString = "0.000000"; // < - - 6 digits
MtxAnova1->IsAnova1 = false; //do two-way ANOVA
MtxAnova1->Replications = 2; //two rows per "group"
MtxAnova1->ResultDest = StringGrid1;
MtxAnova1->Recalc();
*)
property FmtString: String read FFmtString write SetFmtString;
(*Outputs ANOVA table.
Use this property to output standard ANOVA table to ResultDest string grid.
ResultDest grid dimensions are adjusted automatically after the call to
Recalc.
*)
property ResultDest: TStringGrid read FResultDest write SetResultDest;
end;
(*Performs nonlinear regression.
How to use TMtxNonLinReg component?
- Drop a component on the form.
- Define the X vector of independent variables.
- Define the Y vector of dependent variables. Make sure that X and Y have the same length.
- Define initial estimates for regression parameters B.
- (Optionally) define Weights. If you'll use weights, set the UseWeights property to true.
- Define the RegressFunction regression function.
- (Optionally) define the DeriveProcedure derivative procedure. If you don't define derivative procedure then the numeric approximation will be used to calculate the
derivative at specific point.
- Define the OptMethod optimization method. Depending on your function different methods will give better/worse results.
- (Optionally) define desired Tolerance for result.
- Define the maximum number of steps for optimization method (MaxIteration). Internal optimization
method will stop when number of iterations exceeds MaxIteration or internal minimum precision is within Tolerance.
- Verbose property, if assigned, stores Fun, evaluated at each iteration step.
Optionally, you can also assign a specialized logging object to the Verbose property. This allows the optimization procedure
to be interrupted from another thread and optionally also allows logging and iteration count monitoring.
In case of performance problems consider using TMtxMultiNonLinReg. That component internally performs vectorized
optimization process. It can also be used to speed-up nonlinear regression with only one independent variable
by 10-20x.
Results:
- B : Regression coefficients estimates.
- YCalc : Fitted Y values.
In the following example we use the TMtxNonLinReg component to
fit generated data to non-linear function B[0]/Power((1.0 + Exp(B[1]-B[2]*X)),1/B[3]).
In this example an exact derivative procedure is not used - the algorithm uses numerical
derivatives:
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
using System;
namespace Dew.Examples
{
private double Rat43(TVec b, double x)
{
return b[0] / Math.Pow((1.0 + Math.Exp(b[1]-b[2]*x)),1/b[3]);
}
private void Example(TMtxNonLinReg nlr)
{
// Load data - independent variable
nlr.X.SetIt(false, new double[] {9.000, 14.000, 21.000, 28.000,
42.000, 57.000, 63.000, 70.000,
79.000});
// Load data - dependent variable
nlr.Y.SetIt(false, new double[] {8.930, 10.800, 18.590, 22.330,
39.350, 56.110, 61.730, 64.620,
67.080});
// Initial estimates for regression coefficients
nlr.B.SetIt(false, new double[] { 100, 10, 1, 1 });
// setup optimization parameters
nlr.Tolerance = 1.0e-6; // 6 digits should do the trick
nlr.GradTolerance = 1.0e-3; // 3 digits
nlr.MaxIteration = 400;
nlr.RegressFunction += Rat43; // regression function
// Marquardt method
nlr.OptMethod = TOptMethod.optMarquardt;
nlr.Recalc();
// nlr.B now stores calculated regression parameter estimates
}
}
*)
TMtxNonLinReg=class(TMtxComponent)
strict private
FVerbose: TStrings;
FOptMethod: TOptMethod;
FX: TVec;
FY: TVec;
FWeights: TVec;
FMaxIteration: Integer;
FB: TVec;
FGradTolerance: double;
FDirty: boolean;
FAutoUpdate: boolean;
FUseWeights: boolean;
FDeriveProcedure: TDeriveProc;
FRegressFunction: TRegressFun;
FYCalc: TVec;
FStopReason: TOptStopReason;
FSoftSearch: boolean;
FTolerance: double;
procedure SetVerbose(const Value: TStrings);
procedure SetOptMethod(const Value: TOptMethod);
procedure SetMaxIteration(const Value: Integer);
procedure SetGradTolerance(const Value: double);
procedure SetDirty(const Value: boolean);
procedure SetAutoUpdate(const Value: boolean);
procedure SetX(const Value: TVec);
procedure SetY(const Value: TVec);
procedure SetWeights(const Value: TVec);
procedure SetB(const Value: TVec);
procedure SetUseWeights(const Value: boolean);
procedure SetDeriveProcedure(const Value: TDeriveProc);
procedure SetRegressFunction(const Value: TRegressFun);
procedure SetSoftSearch(const Value: boolean);
procedure SetTolerance(const Value: double);
strict protected
procedure Loaded; override;
public
(*Number of iterations needed to converge to solution.
Returns number of iterations needed to converge to solution with
given precision (specified by the Tolerance property).
*)
Iterations: Integer;
(*If not nil then the optimization method uses it for logging each optimization step.
If not nil then the optimization method uses it for logging each optimization step.
By default the Verbose property is nil meaning no logging is done.
If assigned, stores Fun, evaluated at each iteration step.
Optionally, you can also assign a specialized logging object to the Verbose property. This allows the optimization procedure
to be interrupted from another thread and optionally also allows logging and iteration count monitoring.
Log to Memo.Lines:
TStringList log = new TStringList();
nlr.Verbose = log;
nlr.Recalc();
*)
property Verbose: TStrings read FVerbose write SetVerbose;
(*Triggers non-linear regression recalculation.
Triggers non-linear regression recalculation. After the Recalc call
the Dirty property is set to false.
The results are returned as:
* B : Regression coefficients estimates.
* YCalc : Fitted Y values.
*)
procedure Recalc;
constructor Create(AOwner: TComponent); override;
destructor Destroy; override;
(*Defines the regression function of several regression parameters.
Defines the regression function of several regression parameters B and independent variable X.
In the following example we define general polynomial of second degree (three parameters)
and also the procedure for derivatives:
// y=b0*x*x + b1*x + b2 i.e. parabola
double SimpleParabola(TVec b, double x)
{
return b[0]*x*x + b[1]*x + b[2];
}
// procedure for derivatives
void SimpleParabolaDeriv(TRegressFun RegressFun, double x, double y, double[] pars, TVec Grad)
{
Grad[0] = x*x;
Grad[1] = x;
Grad[2] = 1;
}
void Example();
{
// ...
Regress.MtxNonLinReg1.RegressFunction = SimpleParabola;
Regress.MtxNonLinReg1.DeriveProcedure = SimpleParabolaDeriv;
}
*)
property RegressFunction: TRegressFun read FRegressFunction write SetRegressFunction;
(*Defines derivatives of regression function.
Defines the procedure for calculating the derivatives of a regression function
with respect to the regression parameters B, evaluated
at specific set of (X,Y) points.
If DeriveProcedure is not declared (for example if it's nil) Recalc will
use the approximate numeric derivative. In this case the gradient step size is defined by
GradientStepSize (Optimization unit) global variable. Normally the optimal stepsize depends on
seventh partial derivatives of the function. Since they (in this case) are not available, the initial
value for GradientStepSize is Exp(Ln(EPS)/7)*0.25 (as suggested by Spellucci).
*)
property DeriveProcedure: TDeriveProc read FDeriveProcedure write SetDeriveProcedure;
(*Vector of calculated values.
Returns fitted dependent values (the regression function evaluated at X with
coefficients B). Use YCalc to calculate the residuals:
Residuals = YCalc - Y.
*)
property YCalc: TVec read FYCalc;
(*Returns the reason why regression parameters calculation stopped.
Returns the reason why regression parameters calculation stopped. All stop reasons are
declared and described in MtxVec Optimization.pas unit.
*)
property StopReason: TOptStopReason read FStopReason;
(*When any of the TMtxNonLinReg properties changes, this property is automatically set to true.
If AutoUpdate and Dirty are both true then the Recalc
method will be called automatically.
*)
property Dirty: boolean read FDirty write SetDirty;
published
(*If true then changing any of the TMtxNonLinReg properties will trigger the
Recalc method.
*)
property AutoUpdate : boolean read FAutoUpdate write SetAutoUpdate default false;
(*Stores the regression coefficients.
Stores the regression coefficients. Set B to define initial values for regression
coefficients. After the Recalc call B stores calculated regression coefficients.
*)
property B : TVec read FB write SetB stored AllowStreaming;
(*Vector of independent variables.
Stores independent variables.
*)
property X : TVec read FX write SetX stored AllowStreaming;
(*Vector of the dependent variable.
Stores dependent variables.
*)
property Y : TVec read FY write SetY stored AllowStreaming;
(*Weights.
Stores non-linear regression weights.
Note
Weights are used only if the UseWeights property is set to true.
*)
property Weights: TVec read FWeights write SetWeights stored AllowStreaming;
(*Optimization method used.
Defines which optimization method will be used to find
regression coefficients estimates. Several different optimization algorithms are supported
(see Dew Math Optimization assembly help for more on this topic):
* optSimplex : Nelder-Mead optimization method.
* optMarquardt : Marquardt optimization method.
* optBFGS : Quasi-Newton optimization method (Broyden-Fletcher-Goldfarb-Shanno Hessian update).
* optDFP: Quasi-Newton optimization method (Davidson-Fletcher-Power Hessian update).
* optConjGradFR: Conjugate Gradient optimization method (Fletcher-Reeves).
* optConjGradPR : Conjugate Gradient optimization method (Polak-Ribiere).
*)
property OptMethod : TOptMethod read FOptMethod write SetOptMethod default optMarquardt;
(*Defines one of the conditions for terminating the regression coefficient calculation.
Tolerance and MaxIteration parameters define the conditions for terminating the regression
coefficients calculation (number of iterations exceeds MaxIteration or internal minimum
precision is within Tolerance).
*)
property MaxIteration : Integer read FMaxIteration write SetMaxIteration default 500;
(*Defines one of the conditions for terminating the regression coefficient calculation.
Tolerance and MaxIteration parameters define the conditions for terminating the regression
coefficients calculation (number of iterations exceeds MaxIteration or internal minimum
precision is within Tolerance).
*)
property Tolerance: double read FTolerance write SetTolerance;
(*Defines minimal allowed gradient C-Norm.
Defines minimal allowed gradient C-Norm. If during the regression parameters calculation
gradient C-Norm falls below GradTolerance then calculation will stop and return
StopReason = OptResSmallGrad.
*)
property GradTolerance : double read FGradTolerance write SetGradTolerance;
(*Weighted non-linear regression.
If this property is true then internal algorithm will perform weighted
non-linear regression. In this case you must specify the weights (set the
Weights vector).
*)
property UseWeights : boolean read FUseWeights write SetUseWeights default false;
(*Defines line search algorithm for Quasi-Newton and Conjugate methods.
If true then Quasi-Newton and Conjugate gradient method internal line search
algorithm will use soft line search method. Set SoftSearch to true if
you're using numerical approximation for derivative. If SoftSearch is false, then the internal
line search algorithm will use exact line search method. Set SoftSearch to false if you're using
exact derivative procedure.
*)
property SoftSearch: boolean read FSoftSearch write SetSoftSearch default true;
end;
(*Performs multiple-nonlinear regression.
How to use TMtxMultiNonLinReg component?
1. Drop a TMtxMultiNonLinReg component on the form.
2. Define the X list of independent-variable vectors.
3. Define the Y vector of dependent variables. Make sure
that X and Y have the same length.
4. Define initial estimates for regression parameters B.
5. (Optionally) define Weights. If you'll use weights, set the
UseWeights property to true.
6. Define the RegressFunction regression function.
7. (Optionally) define the derivative procedure. If you don't define derivative
procedure then the numeric approximation will be used to calculate the
derivative at specific point.
8. Define the OptMethod optimization method. Depending on your function different
methods will give better/worse results.
9. (Optionally) define desired Tolerance for result.
10. Define the maximum number of steps for optimization method (MaxIteration). Internal optimization
method will stop when number of iterations exceeds MaxIteration or internal minimum
precision is within Tolerance.
11. Verbose property, if assigned, stores Fun, evaluated at each iteration step.
Optionally, you can also assign a specialized logging object to the Verbose property. This allows the optimization procedure
to be interrupted from another thread and optionally also allows logging and iteration count monitoring.
Results:
1. B : Regression coefficients estimates.
2. YCalc : Fitted Y values.
The component internally performs vectorized optimization process. It can also be used to speed-up
nonlinear regression with only one independent variable by 10-20x.
In the following example we use the TMtxMultiNonLinReg component to
fit generated data to non-linear function B[0]/Power((1.0 + Exp(B[1]-B[2]*X)),1/B[3]).
In this example an exact derivative procedure is not used - the algorithm uses numerical
derivatives:
using Dew.Math;
using Dew.Stats;
using Dew.Stats.Units;
using System;
namespace Dew.Examples
{
private void Rat43(TVec b, TVecList x, TVec y)
{
// return b[0] / Math.Pow((1.0 + Math.Exp(b[1]-b[2]*x)),1/b[3]);
y.Mul(x[0], -b[2]);
y.Add(b[1]);
y.Exp();
y.Add(1);
y.Power(1/b[3]);
y.DivideBy(b[0]);
}
private void Example(TMtxMultiNonLinReg nlr)
{
// Load data - independent variable
nlr.X.Add();
nlr.X[0].SetIt(false,new double[] {9.000, 14.000, 21.000, 28.000,
42.000, 57.000, 63.000, 70.000,
79.000});
// Load data - dependent variable
nlr.Y.SetIt(false,new double[] {8.930, 10.800, 18.590, 22.330,
39.350, 56.110, 61.730, 64.620,
67.080});
// Initial estimates for regression coefficients
nlr.B.SetIt(false,new double[] {100,10,1,1});
// setup optimization parameters
nlr.Tolerance = 1.0e-6; // 6 digits should do the trick
nlr.GradTolerance = 1.0e-3; // 3 digits
nlr.MaxIterations = 400;
nlr.RegressFunction += Rat43; // regression function
// Marquardt method
nlr.OptMethod = TOptMethod.optMarquardt;
nlr.Recalc();
// nlr.B now stores calculated regression parameter estimates
}
}
*)
TMtxMultiNonLinReg=class(TMtxComponent)
strict private
FVerbose: TStrings;
FOptMethod: TOptMethod;
FX: TVecList;
FY: TVec;
FWeights: TVec;
FMaxIteration: Integer;
FB: TVec;
FGradTolerance: double;
FDirty: boolean;
FAutoUpdate: boolean;
FUseWeights: boolean;
FDeriveProcedure: TMultiDeriveProc;
FRegressFunction: TMultiRegressFun;
FYCalc: TVec;
FStopReason: TOptStopReason;
FSoftSearch: boolean;
FTolerance: double;
procedure SetVerbose(const Value: TStrings);
procedure SetOptMethod(const Value: TOptMethod);
procedure SetMaxIteration(const Value: Integer);
procedure SetGradTolerance(const Value: double);
procedure SetDirty(const Value: boolean);
procedure SetAutoUpdate(const Value: boolean);
procedure SetX(const Value: TVecList);
procedure SetY(const Value: TVec);
procedure SetWeights(const Value: TVec);
procedure SetB(const Value: TVec);
procedure SetUseWeights(const Value: boolean);
procedure SetDeriveProcedure(const Value: TMultiDeriveProc);
procedure SetRegressFunction(const Value: TMultiRegressFun);
procedure SetSoftSearch(const Value: boolean);
procedure SetTolerance(const Value: double);
strict protected
procedure Loaded; override;
public
(*Number of iterations needed to converge to solution.
Returns number of iterations needed to converge to solution with
given precision (specified by property).
*)
Iterations: Integer;
(*If not nil then the optimization method uses it for logging each optimization step.
If not nil then the optimization method uses it for logging each optimization step.
By default the Verbose property is nil meaning no logging is done.
If assigned, stores Fun, evaluated at each iteration step.
Optionally, you can also assign object to the Verbose property. This allows the optimization procedure
to be interrupted from another thread and optionally also allows logging and iteration count monitoring.
Log to Memo.Lines:
TStringList log = new TStringList();
nlr.Verbose = log;
nlr.Recalc();
*)
property Verbose: TStrings read FVerbose write SetVerbose;
(*Triggers non-linear regression recalculation.
Triggers non-linear regression recalculation. After the Recalc call property is set to false.
The results are returned as:
* : Regression coefficients estimates.
* : Fitted Y values.
*)
procedure Recalc;
constructor Create(AOwner: TComponent); override;
destructor Destroy; override;
(*Defines the regression function of several regression parameters.
Defines the regression function of several regression parameters B and independent variable X.
In the following example we define general polynomial of second degree (three parameters)
and also the procedure for derivatives:
// y=b0*x*x + b1*x + b2 i.e. parabola
void SimpleParabola(TVec b, TVecList x, TVec y)
{
// return b[0]*x*x + b[1]*x + b[2];
y.Sqr(x[0]);
y.Scale(b[0]);
y.AddScaled(x[0], b[1]);
y.Add(b[2]);
}
// procedure for derivatives
void SimplePabolaDeriv(TRegressFun RegressFun, TVecList x, TVec y, double[] pars, TVecList Grad)
{
Grad[0].Sqr(x[0]); // Grad[0] := x*x;
Grad[1].Copy(x[0]); // Grad[1] := x;
Grad[2].Size(x[0]); // Grad[2] := 1;
Grad[2].SetVal(1);
}
void Example();
{
// ...
Regress.MtxNonLinReg1.RegressFunction = SimpleParabola;
Regress.MtxNonLinReg1.DeriveProcedure = SimpleParabolaDeriv;
}
*)
property RegressFunction: TMultiRegressFun read FRegressFunction write SetRegressFunction;
(*Defines derivatives of regression function.
Defines the procedure for calculating the derivatives of a regression function
with respect to the regression parameters , evaluated
at specific set of (,) points.
If DeriveProcedure is not declared (for example if it's nil) will
use the approximate numeric derivative. In this case the gradient step size is defined by
GradientStepSize (Optimization unit) global variable. Normally the optimal stepsize depends on
seventh partial derivatives of the function. Since they (in this case) are not available, the initial
value for GradientStepSize is Exp(Ln(EPS)/7)*0.25 (as suggested by Spellucci).
*)
property DeriveProcedure: TMultiDeriveProc read FDeriveProcedure write SetDeriveProcedure;
(*Vector of calculated values.
Returns fitted dependent (YCalc=B*X) values. Use YCalc to calculate the residuals:
Residuals = YCalc - Y.
*)
property YCalc: TVec read FYCalc;
(*Returns the reason why regression parameters calculation stopped.
Returns the reason why regression parameters calculation stopped. All stop reasons are
declared and described in MtxVec Optimization.pas unit.
*)
property StopReason: TOptStopReason read FStopReason;
(*When any of the TMtxNonLinReg properties changes, this property is automatically set to true.
If and Dirty are both true then the
method will be called automatically.
*)
property Dirty: boolean read FDirty write SetDirty;
published
(*If true then changing any of the TMtxMultiNonLinReg properties will trigger the
method.
*)
property AutoUpdate : boolean read FAutoUpdate write SetAutoUpdate default false;
(*Stores the regression coefficients.
Stores the regression coefficients. Set B to define initial values for regression
coefficients. After the call B stores calculated regression coefficients.
*)
property B : TVec read FB write SetB stored AllowStreaming;
(*Vector of independent variables.
Stores independent variables.
*)
property X : TVecList read FX write SetX stored AllowStreaming;
(*Vector of dependent variables.
Stores dependent variables.
*)
property Y : TVec read FY write SetY stored AllowStreaming;
(*Weights.
Stores non-linear regression weights.
Note
Weights are used only if property is set to true.
*)
property Weights: TVec read FWeights write SetWeights stored AllowStreaming;
(*Optimization method used.
Defines which optimization method will be used to find
regression coefficients estimates. Several different optimization algorithms are supported
(see Dew Math Optimization assembly help for more on this topic):
* optSimplex : Nelder-Mead optimization method.
* optMarquardt : Marquardt optimization method.
* optBFGS : Quasi-Newton optimization method (Broyden-Fletcher-Goldfarb-Shanno Hessian update).
* optDFP: Quasi-Newton optimization method (Davidson-Fletcher-Power Hessian update).
* optConjGradFR: Conjugate Gradient optimization method (Fletcher-Reeves).
* optConjGradPR : Conjugate Gradient optimization method (Polak-Ribiere).
*)
property OptMethod : TOptMethod read FOptMethod write SetOptMethod default optMarquardt;
(*Defines one of the conditions for terminating the regression coefficient calculation.
Tolerance and MaxIter parameters define the conditions for terminating the regression
coefficients calculation (number of iterations exceeds MaxIter or internal minimum
precision is within Tolerance).
*)
property MaxIteration : Integer read FMaxIteration write SetMaxIteration default 500;
(*Defines one of the conditions for terminating the regression coefficient calculation.
Tolerance and MaxIter parameters define the conditions for terminating the regression
coefficients calculation (number of iterations exceeds MaxIter or internal minimum
precision is within Tolerance).
*)
property Tolerance: double read FTolerance write SetTolerance;
(*Defines minimal allowed gradient C-Norm.
Defines minimal allowed gradient C-Norm. If during the regression parameters calculation
gradient C-Norm falls below GradTolerance then calculation will stop and return
StopReason = OptResSmallGrad.
*)
property GradTolerance : double read FGradTolerance write SetGradTolerance;
(*Weighted non-linear regression.
If this property is true then internal algorithm will perform weighted
non-linear regression. In this case you must specify the weights (set the
vector).
*)
property UseWeights : boolean read FUseWeights write SetUseWeights default false;
(*Defines line search algorithm for Quasi-Newton and Conjugate methods.
If true then Quasi-Newton and Conjugate gradient method internal line search
algorithm will use the soft line search method. Set SoftSearch to true if
you're using a numerical approximation for the derivative. If SoftSearch is false, then the internal
line search algorithm will use exact line search method. Set SoftSearch to false if you're using
exact derivative procedure.
*)
property SoftSearch: boolean read FSoftSearch write SetSoftSearch default true;
end;
(*Performs multiple linear regression.
Use the TMtxMulLinReg component to perform multiple linear regression on a dataset. The TMtxMulLinReg component fits
equations to data by minimizing the sum of squared residuals:
SS = Sum [y(k) - ycalc(k)]^2 ,
where y(k) and ycalc(k) are respectively the observed and calculated value of the dependent variable for
observation k. ycalc(k) is a function of the regression parameters b(0), b(1) ... Here the observed values obey
the following equation:
y(k) = b(0) + b(1) * x(1,k) + b(2) * x(2,k) + ...
i.e
y = A * b
How to use TMtxMulLinReg component?
1. Drop a TMtxMulLinReg component on the form.
2. Define , the matrix of independent variables.
3. Define , vector of dependent variables.
4. Define (optional) vector.
5. If you don't want to use the intercept term b(0) in your calculations,
set property to false,
6. Call the method to trigger calculation.
Results:
1. : regression coefficients (b),
regression coefficients confidence intervals (BConfInt) and standard deviation
(BStdDev), residuals (Residuals) and calculated dependent variables (YCalc).
2. : basic regression statistics (R2, Adjusted
R2, residuals variance, F statistics, significance probability).
How to setup and run multiple linear regression? In this example
we've also used the weights. This is done by specifying weights for each variable
and setting UseWeights property to true.
using Dew.Stats;
using Dew.Stats.Units;
namespace Dew.Examples
{
private void Example(StatTools.TMtxMulLinReg mlr)
{
// y = A*b
mlr.A.SetIt(3,2,false,new double[] {-5,2,
1,4,
8,0.5});
mlr.Y.SetIt(false,new double[] {-2,1,11});
mlr.Weights.SetIt(false, new double[] {2,6,1});
mlr.UseWeights = true;
mlr.Recalc();
// Result ==> b = ( 4.586, 0.871, -1.114 )
}
}
*)
TMtxMulLinReg = class(TMtxComponent)
strict private
FY: TVec;
FA: TMtx;
FWeights: TVec;
FConstant: boolean;
FDirty: boolean;
FAutoUpdate: boolean;
FUseWeights: boolean;
FRegressStatistics: TRegStatsClass;
IV : TMtx;
FRegressResult: TRegResultClass;
FAlpha: double;
FSolveMethod: TRegSolveMethod;
procedure SetConstant(const Value: boolean);
procedure SetDirty(const Value: boolean);
procedure SetAutoUpdate(const Value: boolean);
procedure SetA(const Value: TMtx);
procedure SetY(const Value: TVec);
procedure SetWeights(const Value: TVec);
procedure SetUseWeights(const Value: boolean);
procedure SetAlpha(const Value: double);
procedure SetSolveMethod(const Value: TRegSolveMethod);
strict protected
procedure Loaded; override;
public
class function EditorClass: string; override;
(*Triggers multiple linear regression recalculation.
Triggers multiple linear regression (MLR) recalculation. After the Recalc call the Dirty property is set to false.
The results of MLR are stored to:
* RegressResult : regression coefficients (b),
regression coefficients confidence intervals (BConfInt) and standard deviation
(BStdDev), residuals (Residuals) and calculated dependent variables (YCalc).
* RegressStatistics : basic regression statistics (R2, Adjusted
R2, residuals variance, F statistics, significance probability).
*)
procedure Recalc;
(*Checks if system is valid.
Checks if system is valid.
True if A is not a zero matrix and A.Rows
(number of observables) is equal to Y.Length.
Checks if defined system is valid:
// ...
// y = A*b
MtxMulLinReg1.A.SetIt(3,2,false,new double[]
{-5,2, 1,4, 8,0.5});
MtxMulLinReg1.Y.SetIt(false,new double[] {-2, 1,11});
MtxMulLinReg1.UseWeights = false;
if (MtxMulLinReg1.ValidSystem()) MtxMulLinReg1.Recalc();
// ...
*)
function ValidSystem: boolean;
(*Regression results statistics.
Returns regression coefficients statistics parameters: adjusted coefficient of determination,
variance ratio (explained/residual ratio), coefficient of determination (R squared),
residual variance and the probability of F.
*)
property RegressStatistics: TRegStatsClass read FRegressStatistics;
(*When any of the TMtxMulLinReg properties changes, this property is automatically set to true.
If AutoUpdate and Dirty are both true then the Recalc
method will be called automatically.
*)
property Dirty : boolean read FDirty write SetDirty;
(*Regression results.
Returns regression coefficients, their confidence intervals and standard deviations,
residuals and predicted dependent variable values.
*)
property RegressResult: TRegResultClass read FRegressResult;
constructor Create(AOwner: TComponent); override;
destructor Destroy; override;
published
(*If true then changing any of the TMtxMulLinReg properties will trigger the Recalc
method.
*)
property AutoUpdate : boolean read FAutoUpdate write SetAutoUpdate default false;
(*Desired significance value for the statistical tests.
Defines the Alpha level for the statistical tests.
*)
property Alpha: double read FAlpha write SetAlpha;
(*Matrix of independent variables.
Matrix of independent variables. Number of A rows is equal to number of
observables and number of A columns is equal to number of variables.
You can use the ValidSystem method to check whether defined
system of equations (A.Rows = Y.Length i.e. number of observables is
the same for A and Y) has been properly defined.
*)
property A : TMtx read FA write SetA stored AllowStreaming;
(*Vector of dependent variables.*)
property Y : TVec read FY write SetY stored AllowStreaming;
(*Weights for MLR.
Defines weights for multiple linear regression (MLR).
Weights are used only if the UseWeights property is
set to true.
*)
property Weights : TVec read FWeights write SetWeights stored AllowStreaming;
(*Intercept term.
Defines if intercept term is used. If true then all calculations will
include intercept term b(0). If false then intercept term b(0) is set to 0.0.
*)
property Constant : boolean read FConstant write SetConstant default true;
(*Weighted multiple linear regression.
If true the internal algorithm will perform weighted multiple linear regression.
In this case you must specify the weights (set the Weights vector).
*)
property UseWeights : boolean read FUseWeights write SetUseWeights default false;
(*Defines A*x=b solving method.*)
property SolveMethod: TRegSolveMethod read FSolveMethod write SetSolveMethod default regSolveLQR;
end;
(*Performs Principal Component Analysis (PCA).
Principal Components Analysis - PCA - is a data analysis tool that is usually used to reduce the dimensionality
(number of variables) of a large number of interrelated variables, while retaining as much of the information (variation)
as possible. PCA calculates an uncorrelated set of variables (factors or PCs). These factors are ordered so
that the first few retain most of the variation present in all of the original variables.
The PCA procedure is reduced to an eigenvalue-eigenvector problem. PCA routines perform a PCA on either a
correlation or a covariance matrix. Data matrix can be either "raw" data or pre-calculated
correlation/covariance matrix.
How to use TMtxPCA component?
- Drop a TMtxPCA component on the form.
- By setting the property define whether PCA will use correlation or covariance matrix to calculate PCA.
- Define the actual (by changing Data matrix values).
- Call the method to calculate PCA results.
Results:
- - Data eigenvalues
- - Data eigenvectors
- - the percentage of the total variation in the variables (columns)
- - Z-Scores (eigenvectors in PC space).
An example how to setup TMtxPCA component:
using Dew.Stats;
using Dew.Stats.Units;
using Dew.Math;
namespace Dew.Examples
{
private void Example(StatTools.TMtxPCA MtxPCA1)
{
// ...
MtxPCA1.Data.SetIt(4,3,false,new double[]
{1,2,3,
5,7,9,
1,11,13,
3,7,4});
MtxPCA1.PCAMode = TPCAMode.PCARawData; // using data matrix to evaluate PCA
MtxPCA1.Recalc(); // force recalculation
// ...
}
}
*)
TMtxPCA = class(TMtxComponent)
strict private
FZScores: TMtx;
FData: TMtx;
FPC: TMtx;
FPCAMode: TPCAMode;
FTotalVarPct: TVec;
FEigValues: TVec;
FDirty: boolean;
FAutoUpdate: boolean;
tmpMtx: TMtx;
procedure SetData(const Value: TMtx);
procedure SetPCAMode(const Value: TPCAMode);
procedure SetAutoUpdate(const Value: boolean);
procedure SetDirty(const Value: boolean);
strict protected
procedure Loaded; override;
public
(*Triggers PCA recalculation.
Triggers PCA recalculation. After the Recalc call the Dirty property is set to false. The results of PCA are stored to:
- EigValues - Data eigenvalues.
- PC - Data eigenvectors.
- TotalVarPct - the percentage of the total variation in the variables (columns).
- ZScores - Z-Scores (eigenvectors in PC space).
*)
procedure Recalc;
(*Returns the principal components (PC).
*)
property PC :TMtx read FPC;
(*Returns the Z-Scores.
Returns the so called "Z-scores" (data, transformed in the PC space).
*)
property ZScores: TMtx read FZScores;
(*Returns the eigenvalues.
Returns the matrix eigenvalues. Eigenvalues can
be used to determine how many factors to retain. When the PCA is run on
the correlations, one rule-of-thumb is to retain those factors whose eigenvalues
are greater than one. The sum of the eigenvalues is equal to the number of
variables. When the PCA is run on the covariances, the sum of the eigenvalues is
equal to the sum of the variances of the variables.
*)
property EigValues : TVec read FEigValues;
(*Returns the percentage of the total variation in the variables.
Returns the percentage of the total variation in the variables (columns).
*)
property TotalVarPct: TVec read FTotalVarPct;
(*When any of the TMtxPCA properties changes, this property is automatically set to true.
If AutoUpdate and Dirty are both true then the Recalc
method will be called automatically.
*)
property Dirty : boolean read FDirty write SetDirty;
constructor Create(AOwner: TComponent); override;
destructor Destroy; override;
published
(*If true then changing any of the TMtxPCA properties will trigger the Recalc
method.
*)
property AutoUpdate : boolean read FAutoUpdate write SetAutoUpdate default false;
(*Defines the data to be analyzed by PCA.
Defines the data to be analyzed by PCA. Depending on the PCAMode property, Data can be
interpreted as raw data or precalculated data correlation/covariance matrix. Generally, Data columns
are model variables and Data rows are model observables.
*)
property Data : TMtx read FData write SetData stored AllowStreaming;
(*PCA type.
Defines type of PCA. Normally, the analysis is run on the scale-invariant correlation matrix since
the scale of the variables changes the analysis when the covariance matrix is used. For example,
when a covariance matrix was used, a variable that was measured in Celsius deg. results in a
different analysis than if it were measured in Kelvin deg.
*)
property PCAMode: TPCAMode read FPCAMode write SetPCAMode default PCACorrMat;
end;
(*Performs logistic regression.
Use the TMtxLogistReg component to perform ordinary or ordinal logistic regression on a dataset. Suppose y takes values in k ordered categories, and let p_ij
be the cumulative probability that y(i) falls in the j'th category
or higher. The ordinal logistic regression model is defined as:
logit(p_ij) = theta(j) + A_i'B , i = 1,..,length(Y), j = 1,..,k-1,
where A_i is the i'th row of A . The number of ordinal categories k is taken to be the number of distinct values of int(y).
If k is 2 (two categories) the model is ordinary logistic regression.
How to use TMtxLogistReg component?
- Drop a TMtxLogistReg component on the form.
- Define , the matrix of independent variables.
- Define , grouping (categories) vector.
Note: All values have to be integers.
- Define initial estimates for and
coefficients. Alternatively you can also set to true
and let the algorithm calculate initial estimates for Theta and B.
- (Optionally) Set to define desired B, Theta estimates precision. Set
to define maximum number of iterations in main calculation loop.
- Call the method to trigger calculation.
Results:
- : B coefficients estimates.
- : Theta coefficients estimates.
- : Theta and B coefficients estimates standard errors.
How to setup and run logistic regression? In the following example we'll perform
ordinary (i.e. with two categories) logistic regression on data, taken from NCSS statistics program.
Note that TMtxLogistReg.Y vector values are binary integer values.
using Dew.Stats;
using Dew.Stats.Units;
using Dew.Math;
namespace Dew.Examples
{
private void Example(StatTools.TMtxLogistReg tc)
{
tc.A.SetIt(27,3,false, new double[]
{0.8, 1.9, 0.996,
0.9, 1.4, 0.992,
0.8, 0.8, 0.982,
1, 0.7, 0.986,
0.9, 1.3, 0.98,
1, 0.6, 0.982,
0.95, 1, 0.992,
0.95, 1.9, 1.02,
1, 0.8, 0.999,
0.95, 0.5, 1.038,
0.85, 0.7, 0.988,
0.7, 1.2, 0.982,
0.8, 0.4, 1.006,
0.2, 0.8, 0.99,
1, 1.1, 0.99,
1, 1.9, 1.02,
0.65, 0.5, 1.014,
1, 1, 1.004,
0.5, 0.6, 0.99,
1, 1.1, 0.986,
1, 0.4, 1.01,
0.9, 0.6, 1.02,
1, 1, 1.002,
0.95, 1.6, 0.988,
1, 1.7, 0.99,
1, 0.9, 0.986,
1, 0.7, 0.986});
tc.Y.SetIt(false,new double[] {1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,1,0});
tc.Recalc();
// Results =>
// B = (9.65215222458842, 3.86710032907408, -82.073774279211)
// Theta = (-67.6339061278272)
// TBStdErr = (56.8875416435276, 7.75107606604495, 1.77827769017976, 61.712376172072)
// meaning Theta StdErr = 56.8875416435276,
// Beta StdErrs = 7.75107606604495, 1.77827769017976, 61.712376172072
}
}
*)
TMtxLogistReg = class(TMtxComponent)
strict private
FDirty: boolean;
FAutoUpdate: boolean;
FAutoInitEstimates: boolean;
FMaxIteration: Integer;
FTolerance: double;
FA: TMtx;
FB: TVec;
FY: TVec;
FTheta: TVec;
FTBStdErr: TVec;
FAlpha: double;
procedure SetAutoUpdate(const Value: boolean);
procedure SetDirty(const Value: boolean);
procedure SetAutoInitEstimates(const Value: boolean);
procedure SetMaxIteration(const Value: Integer);
procedure SetTolerance(const Value: double);
procedure SetA(const Value: TMtx);
procedure SetB(const Value: TVec);
procedure SetTheta(const Value: TVec);
procedure SetY(const Value: TVec);
procedure SetAlpha(const Value: double);
strict protected
procedure Loaded; override;
public
(*Number of iterations performed. NOTE(review): presumably updated by Recalc - confirm.*)
Iterations: Integer;
(*Minimum value reached by the internal objective function. NOTE(review): presumably updated by Recalc - confirm.*)
FMin: double;
(*Reason why the coefficient calculation stopped (see TOptStopReason in the Optimization unit).*)
StopReason: TOptStopReason;
(*When any of the TMtxLogistReg properties changes, this property is automatically set to true.
If AutoUpdate and Dirty are both true then the Recalc
method will be called automatically.
*)
property Dirty: boolean read FDirty write SetDirty;
(*Standard errors of Theta, B estimates.
Returns Theta and B estimates standard errors.
*)
property TBStdErr: TVec read FTBStdErr;
(*Triggers logistic regression recalculation.
Triggers logistic regression (ordinary or ordinal) recalculation. After the Recalc call the Dirty property is set to false.
The results of logistic regression are stored to:
* B : B coefficients estimates.
* Theta : Theta coefficients estimates. For ordinary regression (two categories),
Theta is single value vector.
* TBStdErr : Theta and B coefficients estimates standard errors.
*)
procedure Recalc;
constructor Create(AOwner: TComponent); override;
Destructor Destroy; override;
published
(*Desired significance value for the statistical tests.
Defines the Alpha level for the statistical tests.
*)
property Alpha: double read FAlpha write SetAlpha;
(*If true then changing any of the TMtxLogistReg properties will trigger the Recalc
method.
*)
property AutoUpdate: boolean read FAutoUpdate write SetAutoUpdate default false;
(*Automatically calculate initial estimates for regression coefficients.
*)
property AutoInitEstimates: boolean read FAutoInitEstimates write SetAutoInitEstimates default true;
(*Defines one of the conditions for terminating the regression coefficient calculation.
Tolerance and MaxIteration parameters define the conditions for terminating logistic regression
coefficients calculation (number of iterations exceeds MaxIteration or internal minimum
precision is within Tolerance).
*)
property MaxIteration: Integer read FMaxIteration write SetMaxIteration default 100;
(*Defines one of the conditions for terminating the regression coefficient calculation.
Tolerance and MaxIteration parameters define the conditions for terminating the regression
coefficients calculation (number of iterations exceeds MaxIteration or internal minimum
precision is within Tolerance).
*)
property Tolerance: double read FTolerance write SetTolerance;
(*Grouping variable.
Defines the grouping, or classification, variable. Its values can be only integers. The internal
algorithm will use smallest value for first group, next for second group ..., and largest integer value for last
group. Values can be binaries in which case the smallest value will be the first group and the largest value
will be the second group.
*)
property Y: TVec read FY write SetY stored AllowStreaming;
(*B coefficients estimates.
Defines the logistic regression B coefficients. Suppose y takes values in k ordered categories, and let p_ij
be the cumulative probability that y(i) falls in the j'th category
or higher. Ordinal logistic regression model is defined as:
logit(p_ij) = theta(j) + A_i'B , i = 1,..,length(Y), j = 1,..,k-1,
where A_i is the i'th row of A . The number of ordinal categories k is taken to be the number of distinct values of int(y).
If k is 2 the model is ordinary logistic regression.
Set B values to define initial estimates for B coefficients. Setting B values is not mandatory. You can
also set AutoInitEstimates to true and force the algorithm to calculate initial
estimates for Theta and B. After the Recalc call B stores logistic regression model B coefficient estimates.
*)
property B: TVec read FB write SetB stored AllowStreaming;
(*Theta coefficients estimates.
Set Theta values to define initial estimates for Theta coefficients. Setting Theta values is not mandatory. You can
also set AutoInitEstimates to true and force the algorithm to calculate initial
estimates for B and Theta. After the Recalc call Theta stores logistic regression model Theta coefficient estimates.
*)
property Theta: TVec read FTheta write SetTheta stored AllowStreaming;
(*Independent variables.
Defines logistic regression matrix of independent variables.
Suppose y takes values in k ordered categories, and let p_ij
be the cumulative probability that y(i) falls in the j'th category
or higher. Ordinal logistic regression model is defined as:
logit(p_ij) = theta(j) + A_i'B , i = 1,..,length(Y), j = 1,..,k-1,
where A_i is the i'th row of A . The number of ordinal categories k is taken to be the number of distinct values of int(y).
If k is 2 the model is ordinary logistic regression.
*)
property A: TMtx read FA write SetA stored AllowStreaming;
end;
(*Defines the type of binary diagnostic test.*)
TBinaryTestType = (
(*One-sample binary test.*)btOneSample,
(*Two-sample binary test.*)btTwoSample
);
(*Defines binary test table structure.
Defines 2x2 binary test table structure. An important task in diagnostic is to measure the
accuracy of a diagnostic test. This can be done by comparing the test result with the true
condition status of a number of patients. The results of such a study can be displayed in a 2-by-2
table in which the true condition is shown as the rows and the diagnostic test result is shown
as the columns.
Class Diagnostic Test Results
--------------- ---------- -------- -------
True condition Positive Negative Total
Present(true) T1 T0 N1
Absent (false) F1 F0 N0
Total M1 M0 N
*)
TBinaryTestTable = class(TPersistent)
strict private
FF0: Integer;
FF1: Integer;
FT0: Integer;
FT1: Integer;
procedure SetF0(const Value: Integer);
procedure SetF1(const Value: Integer);
procedure SetT0(const Value: Integer);
procedure SetT1(const Value: Integer);
function GetN1: Integer;
function GetN0: Integer;
function GetM1: Integer;
function GetM0: Integer;
function GetN: Integer;
function GetSe1: double;
function GetSe2: double;
function GetSp1: double;
function GetSp2: double;
(*Internal helper: computes a confidence interval CI for proportion s out of n.
NOTE(review): presumably evaluated at the FAlpha level - confirm.*)
procedure CalcCI(s: double; n: Integer; var CI: TTwoElmReal);
protected
FPaired: Boolean;
FAlpha: double;
public
procedure Assign(Source: TPersistent); override;
(*Returns first row total (T1+T0).*)
property N1: Integer read GetN1;
(*Returns second row total (F1+F0).*)
property N0: Integer read GetN0;
(*Returns first column total (T1+F1).*)
property M1: Integer read GetM1;
(*Returns second column total (T0+F0).*)
property M0: Integer read GetM0;
(*Returns table grand total (T1+T0+F1+F0).*)
property N: Integer read GetN;
(*Returns test sensitivity.
Returns sensitivity, calculated from table values. For one and two sample
unpaired binary test, sensitivity is defined as:
Se = T1/N1.
For two-sample paired test, sensitivity is defined as:
Se = N1/N .
*)
property Se1: double read GetSe1;
(*Returns test sensitivity.
Returns sensitivity, calculated from table values. For two sample
unpaired binary test, sensitivity is defined as:
Se = T1/N1.
For two-sample paired test, sensitivity is defined as:
Se = M1/N .
*)
property Se2: double read GetSe2;
(*Returns specificity.
Returns specificity, defined as:
Sp = F0/N0.
Note
This value is valid only for unpaired test.
*)
property Sp1: double read GetSp1;
(*Returns specificity.
Returns specificity, defined as:
Sp = F0/M0 .
Note
This value is valid only for unpaired test.
*)
property Sp2: double read GetSp2;
(*Returns the confidence interval for Se1.*)
procedure GetSe1CI(out Bottom, Top: double);
(*Returns the confidence interval for Se2.*)
procedure GetSe2CI(out Bottom, Top: double);
(*Returns the confidence interval for Sp1.*)
procedure GetSp1CI(out Bottom, Top: double);
(*Returns the confidence interval for Sp2.*)
procedure GetSp2CI(out Bottom, Top: double);
published
(*Number of condition-present, test-positive cases (true positives; T1 cell of the 2x2 table).*)
property T1: Integer read FT1 write SetT1;
(*Number of condition-present, test-negative cases (T0 cell: "Present" row, "Negative" column of the 2x2 table).*)
property T0: Integer read FT0 write SetT0;
(*Number of condition-absent, test-positive cases (false positives; F1 cell: "Absent" row, "Positive" column).*)
property F1: Integer read FF1 write SetF1;
(*Number of condition-absent, test-negative cases (F0 cell: "Absent" row, "Negative" column; used by Sp = F0/N0).*)
property F0: Integer read FF0 write SetF0;
end;
(*Performs one and two test paired or unpaired binary test.
The component supports the following types of binary diagnostic tests:
* One-sample test where the results can be arranged in single 2x2 table,
* two-sample test where the results can be arranged in two 2x2 tables,
* two-sample paired test where the results can be combined and arranged in single 2x2 table.
How to use TMtxBinaryTest component?
* Drop a TMtxBinaryTest component on the form.
* Set to true if you're performing paired binary diagnostic test,
otherwise set it to false.
* Define the Sensitivity and Specificity confidence level by setting the
property.
* Define binary test type (one, two test) by setting property.
* Define binary diagnostic test values by setting and (only if you
use two-sample test) table values. See
class for more details.
Results:
Depending on test type, component returns the following results:
1. One-test : The returns test row totals, column totals, sensitivity and
its CIAlpha confidence interval, specificity and it's CIAlpha confidence interval.
2. Two-test unpaired: The returns first test row and column totals, first test sensitivity
and specificity, the returns second test row and column totals, second test sensitivity and specificity.
3. Two-test paired : The returns first and second test sensitivity i.e and
and their CIAlpha confidence intervals.
*)
TMtxBinaryTest = class(TMtxComponent)
strict private
FTestType: TBinaryTestType;
FTest2Table: TBinaryTestTable;
FTest1Table: TBinaryTestTable;
FCIAlpha: double;
FDirty: boolean;
FAutoUpdate: boolean;
FPaired: boolean;
procedure SetTestType(const Value: TBinaryTestType);
procedure SetTest1Table(const Value: TBinaryTestTable);
procedure SetTest2Table(const Value: TBinaryTestTable);
procedure SetCIAlpha(const Value: double);
procedure SetDirty(const Value: boolean);
procedure SetAutoUpdate(const Value: boolean);
procedure SetPaired(const Value: boolean);
public
(*Recalculates test results.*)
procedure Recalc;
(*When any of the TMtxBinaryTest properties changes,
this property is automatically set to true.*)
property Dirty: boolean read FDirty write SetDirty;
constructor Create(AOwner: TComponent); override;
Destructor Destroy; override;
published
(*If true then the table is paired.*)
property Paired: boolean read FPaired write SetPaired default false;
(*If true, any change will trigger the recalculation.*)
property AutoUpdate : boolean read FAutoUpdate write SetAutoUpdate default false;
(*Defines the desired confidence level for the Se and Sp confidence intervals.*)
property CIAlpha: double read FCIAlpha write SetCIAlpha;
(*Binary test type (one-sample or two-sample).*)
property TestType: TBinaryTestType read FTestType write SetTestType default btOneSample;
(*First test table.*)
property Test1Table: TBinaryTestTable read FTest1Table write SetTest1Table;
(*Second test (optional) table.*)
property Test2Table: TBinaryTestTable read FTest2Table write SetTest2Table;
end;
(*Defines the type of multidimensional scaling algorithm.*)
TMDScalingMethod = (
(*Classical (metric) multidimensional scaling algorithm.*)mdScalingMetric,
(*Non-metric multidimensional scaling algorithm.*)mdScalingNonMetric
);
(*MD matrix data type.
Defines how the data matrix passed to multidimensional scaling is interpreted.*)
TMDDataFormat = (
(*Dissimilarities represent the distance between two objects.
They may be measured directly or approximated. MDS algorithms use the dissimilarities directly.
A dissimilarity matrix is always symmetrical and with zero main diagonal.*)
mdFormatDissimilarities,
(*Similarities represent how close (in some sense) two objects are.
Similarities must obey the rule:
similarity(i,j) <= similarity(i,i) and similarity(j,j)
for all i and j. Similarity matrices are symmetrical. Similarities are converted to dissimilarities
by the following relation:
d(i,j) = Sqrt[s(i,i) + s(j,j) - 2*s(i,j)]
where d(i,j) represents a dissimilarity and s(i,j) represents a similarity.
In case your data consists of standard measures rather than dissimilarities or similarities, you can
create a similarity matrix by creating the correlation matrix.*)
mdFormatSimilarities,
(*Raw data: the dissimilarities matrix is calculated internally from the raw data
using the selected pairwise distance routine.*)
mdFormatRaw
);
(*Performs multidimensional scaling.
Use TMtxMDScaling component to perform multidimensional scaling on dataset.
How to use TMtxMDScaling component?
1. Drop a TMtxMDScaling component on the form.
2. Set property to define how data will be interpreted.
Data can be interpreted as dissimilarities matrix, similarities matrix or raw data matrix.
3. Define , the matrix representing dissimilarities matrix, similarities matrix
or raw data.
4. Set to define number of variables in reduced space.
5. Set to define how pairwise distance will be calculated.
6. Set to define scaling algorithm. Currently only metric (classical)
multidimensional scaling algorithm is supported.
7. Call the method to trigger calculation.
Results:
1. : Point coordinates in reduced space.
2. : Eigenvalues in reduced space, sorted in descending order.
3. : Estimated dissimilarities matrix.
4. : Calculated stress factor.
The "beauty" of MDS is that we can analyze any kind of distance or similarity matrix. These similarities can represent people's ratings of
similarities between objects, the percent agreement between judges, the number of times a subjects fails to discriminate between stimuli, etc.
For example, MDS methods used to be very popular in psychological research on person perception where similarities between trait descriptors were
analyzed to uncover the underlying dimensionality of people's perceptions of traits (see, for example Rosenberg, 1977).
In this example a 5x5 similarities matrix (extracted directly from a questionnaire correlation matrix) is used to perform classical MD scaling.
using Dew.Stats;
using Dew.Stats.Units;
using Dew.Math;
namespace Dew.Examples
{
private void Example(StatTools.TMtxMDScaling mds)
{
// similarities matrix (symmetric with 1.0 on diagonal)
mds.Data.SetIt(5,5,false, new double[] {
1.00, 0.3, 0.2, 0.25, 0.33,
0.30, 1.0, 0.11, 0.21, 0.8,
0.20, 0.11, 1.0, 0.40, 0.5,
0.25, 0.21, 0.40, 1.00, 0.05,
0.33, 0.80, 0.5, 0.05, 1.00});
mds.DataFormat = TMDDataFormat.mdFormatSimilarities;
// use "standard" Euclidian metric
mds.DistanceMethod = TPWDistMethod.pwdistEuclidian;
// define number of desired dimensions (1)
mds.Dimensions = 1;
// Do the math
mds.Recalc();
// check Stress, DHat, EigeValues to evaluate GOF if (1) dimension is used
}
}
*)
TMtxMDScaling = class(TMtxComponent)
strict private
FDirty: boolean;
FScalingMethod: TMDScalingMethod;
FAutoUpdate: boolean;
FDataFormat: TMDDataFormat;
FData: TMtx;
FY: TMtx;
FDHat: TMtx;
FEigenValues: TVec;
FinternalD: TMtx;
FStress: double;
FDimensions: Integer;
FDistanceMethod: TPWDistMethod;
procedure Similarity2Dissimilarity(S,D: TMtx);
procedure SetDirty(const Value: boolean);
procedure SetScalingMethod(const Value: TMDScalingMethod);
procedure SetAutoUpdate(const Value: boolean);
procedure SetDataFormat(const Value: TMDDataFormat);
procedure SetData(const Value: TMtx);
procedure SetDimensions(const Value: Integer);
procedure SetDistanceMethod(const Value: TPWDistMethod);
procedure SetDHat(const Value: TMtx);
public
(*Triggers MDScalling recalculation.
Triggers non-linear regression recalculation. After the Recalc call
property is set to false.
*)
procedure Recalc;
(*When any of the TMtxMDScaling properties changes, this property is automatically set to true.
If and Dirty are both true then the
method will be called automatically.
*)
property Dirty: boolean read FDirty write SetDirty;
(*Reduced space point coordinates.*)
property Y: TMtx read FY;
(*Reduced space eigenvalues.
Returns reduced space eigenvalues, sorted in descending order.
*)
property EigenValues: TVec read FEigenValues;
(*Returns multidimensional scaling stress value which is an estimate for GOF.*)
property Stress: double read FStress;
(*Estimated dissimilarities matrix.
Returns estimated (reduced dimension) dissimilarities matrix.
Note
Define DHat values ONLY if property is set to pwdistCustom.
If this is not the case, DHat values will be ignored and the internal algorithm will be used
to calculate pairwise distances.
*)
property DHat: TMtx read FDHat write SetDHat;
(*Used dissimilarities matrix.
Returns used dissimilarity matrix. If is set to dissimilarity
matrix, it returns matrix, otherwise it returns calculated or user-supplied
dissimilarity matrix.
*)
property D: TMtx read FinternalD;
constructor Create(AOwner: TComponent); override;
Destructor Destroy; override;
published
(*If true then changing any of the TMtxPCA properties will trigger the
method.
*)
property AutoUpdate : boolean read FAutoUpdate write SetAutoUpdate default false;
(*Defines multidimensional scaling algorithm.
Note
Currently only classical multidimensional scaling algorithm is supported.
*)
property ScalingMethod: TMDScalingMethod read FScalingMethod write SetScalingMethod default mdScalingMetric;
(*Defines data format used in calculations.*)
(*Data format.*)
property DataFormat: TMDDataFormat read FDataFormat write SetDataFormat default mdFormatDissimilarities;
(*Data to be analyzed by MDS.*)
property Data: TMtx read FData write SetData stored AllowStreaming;
(*Reduced dimension size.*)
property Dimensions: Integer read FDimensions write SetDimensions default 1;
(*Metric for distance calculation.*)
property DistanceMethod: TPWDistMethod read FDistanceMethod write SetDistanceMethod default pwdistEuclidian;
end;
(*Stepwise regression variable action: describes what happened to a variable in one
optimization step (left unchanged, removed from the model, or added to the model).*)
TStepwiseAction = (swaUnchanged, swaRemoved, swaAdded);
(*Callback function type for a custom quality criterion of the stepwise regression optimization algorithm.
The Stat parameter contains the details of the regression results, b contains the coefficients and t the corresponding t-values for each item in b.
ParamMask.Length equals the initial total variable count. b and t can and will have length less than this.
Which variables are included is determined from ParamMask, which has 1 at each included variable index and 0
otherwise. The returned value is used by the optimizer in place of the default Std. Error criterion.
*)
TOnStepwiseQualityCriteria = function(const Stat: TRegStats; const b, t: TVec; const ParamMask: TVecInt): double of object;
(*Stepwise regression is an optimization algorithm aiming to improve the quality of the multiple linear regression
by excluding noisy variables.
For models with less than 15 variables, the exhaustive search is the recommended
method. Alternatively it is possible to perform "backward search" by starting with all variables and removing them one by one, or
"forward search" by starting with none and adding variables one by one. Both backward and forward search can have
selected variables already pre-included (or pre-excluded). Single step mode allows the user to manually include or
exclude individual variables from the model after each step.
*)
TMtxStepwiseReg = class(TMtxComponent)
strict private
FMethod: TStepwiseMethod;
FA: TVecList;
FBitMask: TVecInt;
FReportCoeff: TMtx;
FReportSSE: TMtx;
FMaxIter: integer;
FAutoInitBitMask: boolean;
FOnQualityCriteria: TOnStepwiseQualityCriteria;
procedure SetMethod(const Value: TStepwiseMethod);
procedure SetA(const Value: TVecList);
procedure SetBitMask(const Value: TVecInt);
procedure SetReportCoeff(const Value: TMtx);
procedure SetReportSSE(const Value: TMtx);
procedure SetMaxIter(const Value: integer);
procedure SetAutoInitBitMask(const Value: boolean);
procedure SetOnQualityCriteria(const Value: TOnStepwiseQualityCriteria);
public
(*Triggers stepwise regression recalculation.
The result of this calculation is stored in the BitMask vector. The BitMask
defines the variables which are to be included into the regression by having BitMask.Bits[i] set to True.
For detailed statistics of the final configuration, perform a multiple linear
regression (see the Regress unit) using the variables selected by BitMask.
*)
procedure Recalc;
constructor Create(AOwner: TComponent); override;
Destructor Destroy; override;
(* Holds a list of vectors with the data of the variables.
Each item holds data for one variable. All items in the list need to have equal length and the last item is
the dependent variable (y) from the equation:
y = b0 + b1*x1 + b2*x2 + ... + bn*xn;
*)
property A: TVecList read FA write SetA;
(* Result matrix size of IterCount x (VarCount + 7).
Each row starts with the step number, followed by the selection flags of the variables in columns, followed by the Standard Error (quality criterion).
We strive to reduce the standard error and the model with the smallest standard error is considered best. The statistics columns are as follows:
[0] Standard error = sqrt(SSE/dFE)
[1] SSE = Residual sum of squares.
[2] SSR = Regression sum of squares.
[3] SST = Total sum of squares = SSE + SSR
[4] R2 = Coefficient of determination
[5] Adjusted R2 = Adjusted coefficient of determination
[6] MSE = Residual variance
NOTE(review): the layout described above implies VarCount + 8 columns (step + VarCount flags
+ 7 statistics), not VarCount + 7 - verify the exact column count against the implementation.
*)
property ReportSSE: TMtx read FReportSSE write SetReportSSE;
(* Result matrix with size of (IterCount*VarCount) x 5.
Each iteration adds independent variable count rows. The columns are as follows:
[0] Iteration step
[1] Variable index
[2] Variable selection, where 0 means excluded and 1 means included.
[3] Normalized coefficient of the variable.
[4] Corresponding t-value for the coefficient.
[5] Two tailed p-value. Bigger p-values suggest the probability that the model would be better if the variable were excluded.
NOTE(review): six columns are listed above but the declared width is 5 - verify the exact
column count against the implementation.
*)
property ReportCoeff: TMtx read FReportCoeff write SetReportCoeff;
(*Bit vector holding 1 for included and 0 for excluded variables.
Initialize the BitMask with BitMask.BitCount, then set BitMask.Bits[i] and
set the AutoInitBitMask property to false. When doing forward search, those variables already 1 will
not be excluded. When doing backward search, the variables already 0 will not be included.
*)
property BitMask: TVecInt read FBitMask write SetBitMask;
published
(*Defines the stepwise regression search method (e.g. backward, forward, exhaustive).*)
property Method: TStepwiseMethod read FMethod write SetMethod default swBackward;
(*Maximum iteration count before an exception will be raised.*)
property MaxIter: integer read FMaxIter write SetMaxIter default 10000;
(*Set this property to false, if you want to manually initialize the BitMask.
This value does not affect the "exhaustive" method.
*)
property AutoInitBitMask: boolean read FAutoInitBitMask write SetAutoInitBitMask default True;
(*Implement this event to return a custom quality criterion other than Std. Error to guide the optimization.*)
property OnQualityCriteria: TOnStepwiseQualityCriteria read FOnQualityCriteria write SetOnQualityCriteria;
end;