The following program computes a least squares straight-line fit to a simple dataset, and outputs the best-fit line and its associated one standard-deviation error bars.
#include <stdio.h>
#include <math.h>
#include <gsl/gsl_fit.h>

int
main (void)
{
  int i, n = 4;
  double x[4] = { 1970, 1980, 1990, 2000 };
  double y[4] = { 12, 11, 14, 13 };
  double w[4] = { 0.1, 0.2, 0.3, 0.4 };

  double c0, c1, cov00, cov01, cov11, chisq;

  gsl_fit_wlinear (x, 1, w, 1, y, 1, n,
                   &c0, &c1, &cov00, &cov01, &cov11,
                   &chisq);

  printf ("# best fit: Y = %g + %g X\n", c0, c1);
  printf ("# covariance matrix:\n");
  printf ("# [ %g, %g\n#   %g, %g]\n",
          cov00, cov01, cov01, cov11);
  printf ("# chisq = %g\n", chisq);

  for (i = 0; i < n; i++)
    printf ("data: %g %g %g\n",
            x[i], y[i], 1/sqrt(w[i]));

  printf ("\n");

  for (i = -30; i < 130; i++)
    {
      double xf = x[0] + (i/100.0) * (x[n-1] - x[0]);
      double yf, yf_err;

      gsl_fit_linear_est (xf, c0, c1,
                          cov00, cov01, cov11,
                          &yf, &yf_err);

      printf ("fit: %g %g\n", xf, yf);
      printf ("hi : %g %g\n", xf, yf + yf_err);
      printf ("lo : %g %g\n", xf, yf - yf_err);
    }

  return 0;
}
The following commands extract the data from the output of the program and display it using the GNU plotutils graph utility,
$ ./demo > tmp
$ more tmp
# best fit: Y = -106.6 + 0.06 X
# covariance matrix:
# [ 39602, -19.9
#   -19.9, 0.01]
# chisq = 0.8

$ for n in data fit hi lo ;
   do
     grep "^$n" tmp | cut -d: -f2 > $n ;
   done
$ graph -T X -X x -Y y -y 0 20 -m 0 -S 2 -Ie data
     -S 0 -I a -m 1 fit -m 2 hi -m 2 lo
The next program performs a quadratic fit y = c_0 + c_1 x + c_2 x^2 to a weighted dataset using the generalised linear fitting function gsl_multifit_wlinear. The model matrix X for a quadratic fit is given by,

     X = [ 1   , x_0 , x_0^2 ;
           1   , x_1 , x_1^2 ;
           1   , x_2 , x_2^2 ;
           ... , ... , ...   ]
where the column of ones corresponds to the constant term c_0. The two remaining columns correspond to the terms c_1 x and c_2 x^2.
The program reads n lines of data in the format (x, y, err) where err is the error (standard deviation) in the value y.
#include <stdio.h>
#include <stdlib.h>
#include <gsl/gsl_multifit.h>

int
main (int argc, char **argv)
{
  int i, n;
  double xi, yi, ei, chisq;
  gsl_matrix *X, *cov;
  gsl_vector *y, *w, *c;

  if (argc != 2)
    {
      fprintf (stderr,"usage: fit n < data\n");
      exit (-1);
    }

  n = atoi (argv[1]);

  X = gsl_matrix_alloc (n, 3);
  y = gsl_vector_alloc (n);
  w = gsl_vector_alloc (n);

  c = gsl_vector_alloc (3);
  cov = gsl_matrix_alloc (3, 3);

  for (i = 0; i < n; i++)
    {
      int count = fscanf (stdin, "%lg %lg %lg",
                          &xi, &yi, &ei);

      if (count != 3)
        {
          fprintf (stderr, "error reading file\n");
          exit (-1);
        }

      printf ("%g %g +/- %g\n", xi, yi, ei);

      gsl_matrix_set (X, i, 0, 1.0);
      gsl_matrix_set (X, i, 1, xi);
      gsl_matrix_set (X, i, 2, xi*xi);

      gsl_vector_set (y, i, yi);
      gsl_vector_set (w, i, 1.0/(ei*ei));
    }

  {
    gsl_multifit_linear_workspace * work
      = gsl_multifit_linear_alloc (n, 3);
    gsl_multifit_wlinear (X, w, y, c, cov,
                          &chisq, work);
    gsl_multifit_linear_free (work);
  }

#define C(i) (gsl_vector_get(c,(i)))
#define COV(i,j) (gsl_matrix_get(cov,(i),(j)))

  {
    printf ("# best fit: Y = %g + %g X + %g X^2\n",
            C(0), C(1), C(2));

    printf ("# covariance matrix:\n");
    printf ("[ %+.5e, %+.5e, %+.5e  \n",
               COV(0,0), COV(0,1), COV(0,2));
    printf ("  %+.5e, %+.5e, %+.5e  \n",
               COV(1,0), COV(1,1), COV(1,2));
    printf ("  %+.5e, %+.5e, %+.5e ]\n",
               COV(2,0), COV(2,1), COV(2,2));
    printf ("# chisq = %g\n", chisq);
  }

  gsl_matrix_free (X);
  gsl_vector_free (y);
  gsl_vector_free (w);
  gsl_vector_free (c);
  gsl_matrix_free (cov);

  return 0;
}
A suitable set of data for fitting can be generated using the following program. It outputs a set of points with gaussian errors from the curve y = e^x in the region 0 < x < 2.
#include <stdio.h>
#include <math.h>
#include <gsl/gsl_randist.h>

int
main (void)
{
  double x;
  const gsl_rng_type * T;
  gsl_rng * r;

  gsl_rng_env_setup ();

  T = gsl_rng_default;
  r = gsl_rng_alloc (T);

  for (x = 0.1; x < 2; x+= 0.1)
    {
      double y0 = exp (x);
      double sigma = 0.1 * y0;
      double dy = gsl_ran_gaussian (r, sigma);

      printf ("%g %g %g\n", x, y0 + dy, sigma);
    }

  gsl_rng_free (r);

  return 0;
}
The data can be prepared by running the resulting executable program,
$ GSL_RNG_TYPE=mt19937_1999 ./generate > exp.dat
$ more exp.dat
0.1 0.97935 0.110517
0.2 1.3359 0.12214
0.3 1.52573 0.134986
0.4 1.60318 0.149182
0.5 1.81731 0.164872
0.6 1.92475 0.182212
....
To fit the data use the previous program, with the number of data points given as the first argument. In this case there are 19 data points.
$ ./fit 19 < exp.dat
0.1 0.97935 +/- 0.110517
0.2 1.3359 +/- 0.12214
...
# best fit: Y = 1.02318 + 0.956201 X + 0.876796 X^2
# covariance matrix:
[ +1.25612e-02, -3.64387e-02, +1.94389e-02
  -3.64387e-02, +1.42339e-01, -8.48761e-02
  +1.94389e-02, -8.48761e-02, +5.60243e-02 ]
# chisq = 23.0987
The parameters of the quadratic fit match the coefficients of the expansion of e^x, taking into account the errors on the parameters and the O(x^3) difference between the exponential and quadratic functions for the larger values of x. The errors on the parameters are given by the square-root of the corresponding diagonal elements of the covariance matrix. The chi-squared per degree of freedom is 1.4, indicating a reasonable fit to the data.
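To make these last two statements concrete, the short standalone program below is a minimal sketch (not part of the GSL example programs) which recomputes the parameter errors and the reduced chi-squared from the diagonal covariance elements and chisq value quoted in the output above, for n = 19 data points and p = 3 fitted parameters.

#include <stdio.h>
#include <math.h>

int
main (void)
{
  /* diagonal covariance elements and chisq copied from the fit output above */
  double cov_diag[3] = { 1.25612e-02, 1.42339e-01, 5.60243e-02 };
  double chisq = 23.0987;
  int n = 19, p = 3, j;

  /* the error on parameter j is the square root of the j-th diagonal
     element of the covariance matrix */
  for (j = 0; j < p; j++)
    printf ("error on c_%d = %g\n", j, sqrt (cov_diag[j]));

  /* chi-squared per degree of freedom, with n - p degrees of freedom */
  printf ("chisq/dof = %g\n", chisq / (n - p));   /* approximately 1.4 */

  return 0;
}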
The next program demonstrates the difference between ordinary and regularized least squares when the design matrix is near-singular. In this program, we generate two random normally distributed variables u and v, with v = u + noise so that u and v are nearly collinear. We then set a third dependent variable y = u + v + noise and solve for the coefficients c_1, c_2 of the model Y(c_1, c_2) = c_1 u + c_2 v. Since u =~ v, the design matrix X is nearly singular, leading to unstable ordinary least squares solutions.
Here is the program output:
=== Unregularized fit ===
best fit: y = -259.999 u + 262.007 v
chisq/dof = 1.59069

=== Regularized fit ===
best fit: y = 1.01594 u + 1.02738 v
chisq/dof = 1.69568
We see that the regularized method with lambda = 1 finds the correct solution c_1 =~ c_2 =~ 1, while the ordinary least squares solution is unstable and has a large norm. The program is given below.
#include <stdio.h>
#include <gsl/gsl_math.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_rng.h>
#include <gsl/gsl_randist.h>
#include <gsl/gsl_multifit.h>

#define N 50 /* number of data */

int
main ()
{
  const size_t n = N;
  const size_t p = 2;
  size_t i;
  gsl_rng *r = gsl_rng_alloc(gsl_rng_default);
  gsl_matrix *X = gsl_matrix_alloc(n, p);
  gsl_vector *y = gsl_vector_alloc(n);

  for (i = 0; i < n; ++i)
    {
      /* generate first random variable u */
      double ui = gsl_ran_gaussian(r, 1.0);

      /* set v = u + noise */
      double vi = ui + gsl_ran_gaussian(r, 0.001);

      /* set y = u + v + noise */
      double yi = ui + vi + gsl_ran_gaussian(r, 1.0);

      /* since u =~ v, the matrix X is ill-conditioned */
      gsl_matrix_set(X, i, 0, ui);
      gsl_matrix_set(X, i, 1, vi);

      /* rhs vector */
      gsl_vector_set(y, i, yi);
    }

  {
    gsl_multifit_linear_workspace *w =
      gsl_multifit_linear_alloc(n, p);
    gsl_vector *c = gsl_vector_alloc(p);
    gsl_vector *c_ridge = gsl_vector_alloc(p);
    gsl_matrix *cov = gsl_matrix_alloc(p, p);
    double chisq;

    /* unregularized (standard) least squares fit, lambda = 0 */
    gsl_multifit_linear_ridge(0.0, X, y, c, cov, &chisq, w);

    fprintf(stderr, "=== Unregularized fit ===\n");
    fprintf(stderr, "best fit: y = %g u + %g v\n",
            gsl_vector_get(c, 0), gsl_vector_get(c, 1));
    fprintf(stderr, "chisq/dof = %g\n", chisq / (n - p));

    /* regularize with lambda = 1 */
    gsl_multifit_linear_ridge(1.0, X, y, c_ridge, cov, &chisq, w);

    fprintf(stderr, "=== Regularized fit ===\n");
    fprintf(stderr, "best fit: y = %g u + %g v\n",
            gsl_vector_get(c_ridge, 0), gsl_vector_get(c_ridge, 1));
    fprintf(stderr, "chisq/dof = %g\n", chisq / (n - p));

    gsl_multifit_linear_free(w);
    gsl_matrix_free(cov);
    gsl_vector_free(c);
    gsl_vector_free(c_ridge);
  }

  gsl_rng_free(r);
  gsl_matrix_free(X);
  gsl_vector_free(y);

  return 0;
}
The next program demonstrates the advantage of robust least squares on a dataset with outliers. The program generates linear (x,y) data pairs on the line y = 1.45 x + 3.88, adds some random noise, and inserts 3 outliers into the dataset. Both the robust and ordinary least squares (OLS) coefficients are computed for comparison.
#include <stdio.h>
#include <stdlib.h>
#include <gsl/gsl_multifit.h>
#include <gsl/gsl_randist.h>

int
dofit (const gsl_multifit_robust_type *T,
       const gsl_matrix *X, const gsl_vector *y,
       gsl_vector *c, gsl_matrix *cov)
{
  int s;
  gsl_multifit_robust_workspace * work
    = gsl_multifit_robust_alloc (T, X->size1, X->size2);

  s = gsl_multifit_robust (X, y, c, cov, work);
  gsl_multifit_robust_free (work);

  return s;
}

int
main (int argc, char **argv)
{
  int i;
  size_t n;
  const size_t p = 2; /* linear fit */
  gsl_matrix *X, *cov;
  gsl_vector *x, *y, *c, *c_ols;
  const double a = 1.45; /* slope */
  const double b = 3.88; /* intercept */
  gsl_rng *r;

  if (argc != 2)
    {
      fprintf (stderr,"usage: robfit n\n");
      exit (-1);
    }

  n = atoi (argv[1]);

  X = gsl_matrix_alloc (n, p);
  x = gsl_vector_alloc (n);
  y = gsl_vector_alloc (n);

  c = gsl_vector_alloc (p);
  c_ols = gsl_vector_alloc (p);
  cov = gsl_matrix_alloc (p, p);

  r = gsl_rng_alloc (gsl_rng_default);

  /* generate linear dataset */
  for (i = 0; i < n - 3; i++)
    {
      double dx = 10.0 / (n - 1.0);
      double ei = gsl_rng_uniform (r);
      double xi = -5.0 + i * dx;
      double yi = a * xi + b;

      gsl_vector_set (x, i, xi);
      gsl_vector_set (y, i, yi + ei);
    }

  /* add a few outliers */
  gsl_vector_set (x, n - 3, 4.7);
  gsl_vector_set (y, n - 3, -8.3);

  gsl_vector_set (x, n - 2, 3.5);
  gsl_vector_set (y, n - 2, -6.7);

  gsl_vector_set (x, n - 1, 4.1);
  gsl_vector_set (y, n - 1, -6.0);

  /* construct design matrix X for linear fit */
  for (i = 0; i < n; ++i)
    {
      double xi = gsl_vector_get (x, i);

      gsl_matrix_set (X, i, 0, 1.0);
      gsl_matrix_set (X, i, 1, xi);
    }

  /* perform robust and OLS fit */
  dofit (gsl_multifit_robust_ols, X, y, c_ols, cov);
  dofit (gsl_multifit_robust_bisquare, X, y, c, cov);

  /* output data and model */
  for (i = 0; i < n; ++i)
    {
      double xi = gsl_vector_get (x, i);
      double yi = gsl_vector_get (y, i);
      gsl_vector_view v = gsl_matrix_row (X, i);
      double y_ols, y_rob, y_err;

      gsl_multifit_robust_est (&v.vector, c, cov, &y_rob, &y_err);
      gsl_multifit_robust_est (&v.vector, c_ols, cov, &y_ols, &y_err);

      printf ("%g %g %g %g\n", xi, yi, y_rob, y_ols);
    }

#define C(i) (gsl_vector_get(c,(i)))
#define COV(i,j) (gsl_matrix_get(cov,(i),(j)))

  {
    printf ("# best fit: Y = %g + %g X\n", C(0), C(1));
    printf ("# covariance matrix:\n");
    printf ("# [ %+.5e, %+.5e\n", COV(0,0), COV(0,1));
    printf ("#   %+.5e, %+.5e\n", COV(1,0), COV(1,1));
  }

  gsl_matrix_free (X);
  gsl_vector_free (x);
  gsl_vector_free (y);
  gsl_vector_free (c);
  gsl_vector_free (c_ols);
  gsl_matrix_free (cov);
  gsl_rng_free (r);

  return 0;
}
The output from the program is shown in the following plot, which compares the robust and ordinary least-squares fits to the dataset containing the outliers.