SHOGUN v3.2.0
lbfgs.cpp
/*
 *      Limited memory BFGS (L-BFGS).
 *
 * Copyright (c) 1990, Jorge Nocedal
 * Copyright (c) 2007-2010 Naoaki Okazaki
 * All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* $Id$ */

/*
This library is a C port of the FORTRAN implementation of the limited-memory
Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) method written by Jorge Nocedal.
The original FORTRAN source code is available at:
http://www.ece.northwestern.edu/~nocedal/lbfgs.html

The L-BFGS algorithm is described in:
    - Jorge Nocedal.
      Updating Quasi-Newton Matrices with Limited Storage.
      <i>Mathematics of Computation</i>, Vol. 35, No. 151, pp. 773--782, 1980.
    - Dong C. Liu and Jorge Nocedal.
      On the limited memory BFGS method for large scale optimization.
      <i>Mathematical Programming</i> B, Vol. 45, No. 3, pp. 503--528, 1989.

The line search algorithms used in this implementation are described in:
    - John E. Dennis and Robert B. Schnabel.
      <i>Numerical Methods for Unconstrained Optimization and Nonlinear
      Equations</i>, Englewood Cliffs, 1983.
    - Jorge J. More and David J. Thuente.
      Line search algorithms with guaranteed sufficient decrease.
      <i>ACM Transactions on Mathematical Software (TOMS)</i>, Vol. 20, No. 3,
      pp. 286--307, 1994.

This library also implements the Orthant-Wise Limited-memory Quasi-Newton
(OWL-QN) method presented in:
    - Galen Andrew and Jianfeng Gao.
      Scalable training of L1-regularized log-linear models.
      In <i>Proceedings of the 24th International Conference on Machine
      Learning (ICML 2007)</i>, pp. 33--40, 2007.

I would like to thank the original author, Jorge Nocedal, who has been
distributing this efficient and well-documented implementation under an
open-source license.
*/
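
/*
Minimal usage sketch (illustrative only; see shogun/optimization/lbfgs/lbfgs.h
for the authoritative callback typedefs -- the signatures below are inferred
from the call sites in this file). The objective f(x) = sum_i (x[i] - 1)^2 and
the names `evaluate` and `run_lbfgs` are hypothetical:

    static float64_t evaluate(
        void *instance, const float64_t *x, float64_t *g,
        const int32_t n, const float64_t step)
    {
        float64_t fx = 0.;
        for (int32_t i = 0; i < n; ++i) {
            fx += (x[i] - 1.) * (x[i] - 1.);   // accumulate f(x)
            g[i] = 2. * (x[i] - 1.);           // gradient of f at x
        }
        return fx;
    }

    static int32_t run_lbfgs(float64_t *x, int32_t n)
    {
        lbfgs_parameter_t param;
        lbfgs_parameter_init(&param);   // load the _defparam defaults
        float64_t fx;
        // NULL progress callback and NULL user instance; on success x
        // holds the minimizer and fx the final objective value.
        return lbfgs(n, x, &fx, evaluate, NULL, NULL, &param);
    }
*/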

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>  /* memcpy */
#include <cmath>

#include <shogun/optimization/lbfgs/lbfgs.h>
#include <shogun/lib/SGVector.h>
#include <shogun/lib/common.h>

namespace shogun
{

#define min2(a, b)      ((a) <= (b) ? (a) : (b))
#define max2(a, b)      ((a) >= (b) ? (a) : (b))
#define max3(a, b, c)   max2(max2((a), (b)), (c))

struct tag_callback_data {
    int32_t n;
    void *instance;
    lbfgs_evaluate_t proc_evaluate;
    lbfgs_progress_t proc_progress;
};
typedef struct tag_callback_data callback_data_t;

struct tag_iteration_data {
    float64_t alpha;
    float64_t *s;     /* [n] */
    float64_t *y;     /* [n] */
    float64_t ys;     /* vecdot(y, s) */
};
typedef struct tag_iteration_data iteration_data_t;

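/* Default parameter values. A summary of the initializer order below,
   assuming the lbfgs_parameter_t declaration in lbfgs.h follows the upstream
   liblbfgs layout: m, epsilon, past, delta, max_iterations, linesearch,
   max_linesearch, min_step, max_step, ftol, wolfe, gtol, xtol,
   orthantwise_c, orthantwise_start, orthantwise_end. */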
static const lbfgs_parameter_t _defparam = {
    6, 1e-5, 0, 1e-5,
    0, LBFGS_LINESEARCH_DEFAULT, 40,
    1e-20, 1e20, 1e-4, 0.9, 0.9, 1.0e-16,
    0.0, 0, -1,
};

/* Forward function declarations. */

typedef int32_t (*line_search_proc)(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t line_search_backtracking(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t line_search_backtracking_owlqn(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wp,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t line_search_morethuente(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t update_trial_interval(
    float64_t *x,
    float64_t *fx,
    float64_t *dx,
    float64_t *y,
    float64_t *fy,
    float64_t *dy,
    float64_t *t,
    float64_t *ft,
    float64_t *dt,
    const float64_t tmin,
    const float64_t tmax,
    int32_t *brackt
    );

static float64_t owlqn_x1norm(
    const float64_t* x,
    const int32_t start,
    const int32_t n
    );

static void owlqn_pseudo_gradient(
    float64_t* pg,
    const float64_t* x,
    const float64_t* g,
    const int32_t n,
    const float64_t c,
    const int32_t start,
    const int32_t end
    );

static void owlqn_project(
    float64_t* d,
    const float64_t* sign,
    const int32_t start,
    const int32_t end
    );


void lbfgs_parameter_init(lbfgs_parameter_t *param)
{
    memcpy(param, &_defparam, sizeof(*param));
}

int32_t lbfgs(
    int32_t n,
    float64_t *x,
    float64_t *ptr_fx,
    lbfgs_evaluate_t proc_evaluate,
    lbfgs_progress_t proc_progress,
    void *instance,
    lbfgs_parameter_t *_param
    )
{
    int32_t ret;
    int32_t i, j, k, ls, end, bound;
    float64_t step;

    /* Constant parameters and their default values. */
    lbfgs_parameter_t param = (_param != NULL) ? (*_param) : _defparam;
    const int32_t m = param.m;

    float64_t *xp = NULL;
    float64_t *g = NULL, *gp = NULL, *pg = NULL;
    float64_t *d = NULL, *w = NULL, *pf = NULL;
    iteration_data_t *lm = NULL, *it = NULL;
    float64_t ys, yy;
    float64_t xnorm, gnorm, beta;
    float64_t fx = 0.;
    float64_t rate = 0.;
    line_search_proc linesearch = line_search_morethuente;

    /* Construct the callback data. */
    callback_data_t cd;
    cd.n = n;
    cd.instance = instance;
    cd.proc_evaluate = proc_evaluate;
    cd.proc_progress = proc_progress;

    /* Check the input parameters for errors. */
    if (n <= 0) {
        return LBFGSERR_INVALID_N;
    }
    if (param.epsilon < 0.) {
        return LBFGSERR_INVALID_EPSILON;
    }
    if (param.past < 0) {
        return LBFGSERR_INVALID_TESTPERIOD;
    }
    if (param.delta < 0.) {
        return LBFGSERR_INVALID_DELTA;
    }
    if (param.min_step < 0.) {
        return LBFGSERR_INVALID_MINSTEP;
    }
    if (param.max_step < param.min_step) {
        return LBFGSERR_INVALID_MAXSTEP;
    }
    if (param.ftol < 0.) {
        return LBFGSERR_INVALID_FTOL;
    }
    if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE ||
        param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
        if (param.wolfe <= param.ftol || 1. <= param.wolfe) {
            return LBFGSERR_INVALID_WOLFE;
        }
    }
    if (param.gtol < 0.) {
        return LBFGSERR_INVALID_GTOL;
    }
    if (param.xtol < 0.) {
        return LBFGSERR_INVALID_XTOL;
    }
    if (param.max_linesearch <= 0) {
        return LBFGSERR_INVALID_MAXLINESEARCH;
    }
    if (param.orthantwise_c < 0.) {
        return LBFGSERR_INVALID_ORTHANTWISE;
    }
    if (param.orthantwise_start < 0 || n < param.orthantwise_start) {
        return LBFGSERR_INVALID_ORTHANTWISE_START;
    }
    if (param.orthantwise_end < 0) {
        param.orthantwise_end = n;
    }
    if (n < param.orthantwise_end) {
        return LBFGSERR_INVALID_ORTHANTWISE_END;
    }
    if (param.orthantwise_c != 0.) {
        switch (param.linesearch) {
        case LBFGS_LINESEARCH_BACKTRACKING:
            linesearch = line_search_backtracking_owlqn;
            break;
        default:
            /* Only the backtracking method is available. */
            return LBFGSERR_INVALID_LINESEARCH;
        }
    } else {
        switch (param.linesearch) {
        case LBFGS_LINESEARCH_MORETHUENTE:
            linesearch = line_search_morethuente;
            break;
        case LBFGS_LINESEARCH_BACKTRACKING_ARMIJO:
        case LBFGS_LINESEARCH_BACKTRACKING_WOLFE:
        case LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE:
            linesearch = line_search_backtracking;
            break;
        default:
            return LBFGSERR_INVALID_LINESEARCH;
        }
    }

    /* Allocate working space. */
    xp = SG_CALLOC(float64_t, n);
    g = SG_CALLOC(float64_t, n);
    gp = SG_CALLOC(float64_t, n);
    d = SG_CALLOC(float64_t, n);
    w = SG_CALLOC(float64_t, n);
    if (xp == NULL || g == NULL || gp == NULL || d == NULL || w == NULL) {
        ret = LBFGSERR_OUTOFMEMORY;
        goto lbfgs_exit;
    }

    if (param.orthantwise_c != 0.) {
        /* Allocate working space for OWL-QN. */
        pg = SG_CALLOC(float64_t, n);
        if (pg == NULL) {
            ret = LBFGSERR_OUTOFMEMORY;
            goto lbfgs_exit;
        }
    }

    /* Allocate limited memory storage. */
    lm = SG_CALLOC(iteration_data_t, m);
    if (lm == NULL) {
        ret = LBFGSERR_OUTOFMEMORY;
        goto lbfgs_exit;
    }

    /* Initialize the limited memory. */
    for (i = 0;i < m;++i) {
        it = &lm[i];
        it->alpha = 0;
        it->ys = 0;
        it->s = SG_CALLOC(float64_t, n);
        it->y = SG_CALLOC(float64_t, n);
        if (it->s == NULL || it->y == NULL) {
            ret = LBFGSERR_OUTOFMEMORY;
            goto lbfgs_exit;
        }
    }

    /* Allocate an array for storing previous values of the objective function. */
    if (0 < param.past) {
        pf = SG_CALLOC(float64_t, param.past);
    }

    /* Evaluate the function value and its gradient. */
    fx = cd.proc_evaluate(cd.instance, x, g, cd.n, 0);
    if (0. != param.orthantwise_c) {
        /* Compute the L1 norm of the variables and add it to the objective value. */
        xnorm = owlqn_x1norm(x, param.orthantwise_start, param.orthantwise_end);
        fx += xnorm * param.orthantwise_c;
        owlqn_pseudo_gradient(
            pg, x, g, n,
            param.orthantwise_c, param.orthantwise_start, param.orthantwise_end
            );
    }

    /* Store the initial value of the objective function. */
    if (pf != NULL) {
        pf[0] = fx;
    }

    /*
        Compute the direction;
        we assume that the initial Hessian matrix H_0 is the identity matrix.
     */
    if (param.orthantwise_c == 0.) {
        std::copy(g,g+n,d);
        SGVector<float64_t>::scale_vector(-1, d, n);
    } else {
        std::copy(pg,pg+n,d);
        SGVector<float64_t>::scale_vector(-1, d, n);
    }

    /*
       Make sure that the initial variables are not already a minimizer.
     */
    xnorm = SGVector<float64_t>::twonorm(x, n);
    if (param.orthantwise_c == 0.) {
        gnorm = SGVector<float64_t>::twonorm(g, n);
    } else {
        gnorm = SGVector<float64_t>::twonorm(pg, n);
    }
    if (xnorm < 1.0) xnorm = 1.0;
    if (gnorm / xnorm <= param.epsilon) {
        ret = LBFGS_ALREADY_MINIMIZED;
        goto lbfgs_exit;
    }

    /* Compute the initial step:
        step = 1.0 / sqrt(vecdot(d, d, n))
     */
    step = 1.0 / SGVector<float64_t>::twonorm(d, n);

    k = 1;
    end = 0;
    for (;;) {
        /* Store the current position and gradient vectors. */
        std::copy(x,x+n,xp);
        std::copy(g,g+n,gp);

        /* Search for an optimal step. */
        if (param.orthantwise_c == 0.) {
            ls = linesearch(n, x, &fx, g, d, &step, xp, gp, w, &cd, &param);
        } else {
            ls = linesearch(n, x, &fx, g, d, &step, xp, pg, w, &cd, &param);
            owlqn_pseudo_gradient(
                pg, x, g, n,
                param.orthantwise_c, param.orthantwise_start, param.orthantwise_end
                );
        }
        if (ls < 0) {
            /* Revert to the previous point. */
            std::copy(xp,xp+n,x);
            std::copy(gp,gp+n,g);
            ret = ls;
            goto lbfgs_exit;
        }

        /* Compute x and g norms. */
        xnorm = SGVector<float64_t>::twonorm(x, n);
        if (param.orthantwise_c == 0.) {
            gnorm = SGVector<float64_t>::twonorm(g, n);
        } else {
            gnorm = SGVector<float64_t>::twonorm(pg, n);
        }

        /* Report the progress. */
        if (cd.proc_progress) {
            if ((ret = cd.proc_progress(cd.instance, x, g, fx, xnorm, gnorm, step, cd.n, k, ls))) {
                goto lbfgs_exit;
            }
        }

        /*
            Convergence test.
            The criterion is given by the following formula:
                |g(x)| / \max(1, |x|) < \epsilon
         */
        if (xnorm < 1.0) xnorm = 1.0;
        if (gnorm / xnorm <= param.epsilon) {
            /* Convergence. */
            ret = LBFGS_SUCCESS;
            break;
        }

        /*
            Test for the stopping criterion.
            The criterion is given by the following formula:
                (f(past_x) - f(x)) / f(x) < \delta
         */
        if (pf != NULL) {
            /* We don't test the stopping criterion while k < past. */
            if (param.past <= k) {
                /* Compute the relative improvement from the past. */
                rate = (pf[k % param.past] - fx) / fx;

                /* The stopping criterion. */
                if (rate < param.delta) {
                    ret = LBFGS_STOP;
                    break;
                }
            }

            /* Store the current value of the objective function. */
            pf[k % param.past] = fx;
        }

        if (param.max_iterations != 0 && param.max_iterations < k+1) {
            /* Maximum number of iterations. */
            ret = LBFGSERR_MAXIMUMITERATION;
            break;
        }

        /*
            Update vectors s and y:
                s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}.
                y_{k+1} = g_{k+1} - g_{k}.
         */
        it = &lm[end];
        SGVector<float64_t>::add(it->s, 1, x, -1, xp, n);
        SGVector<float64_t>::add(it->y, 1, g, -1, gp, n);

        /*
            Compute scalars ys and yy:
                ys = y^t \cdot s = 1 / \rho.
                yy = y^t \cdot y.
            Notice that yy is used for scaling the Hessian matrix H_0 (Cholesky factor).
         */
        ys = SGVector<float64_t>::dot(it->y, it->s, n);
        yy = SGVector<float64_t>::dot(it->y, it->y, n);
        it->ys = ys;

        /*
            Recursive formula to compute dir = -(H \cdot g).
            This is described on page 779 of:
                Jorge Nocedal.
                Updating Quasi-Newton Matrices with Limited Storage.
                Mathematics of Computation, Vol. 35, No. 151,
                pp. 773--782, 1980.
         */
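        /*
            A sketch of the two-loop recursion implemented below (notation
            from Nocedal 1980; for orientation only):
                q <- -g                              (steepest-descent seed)
                for i = k-1, ..., k-bound:
                    alpha_i = rho_i * s_i^T q;   q <- q - alpha_i * y_i
                q <- (y^T s / y^T y) * q             (scale by initial H_0)
                for i = k-bound, ..., k-1:
                    beta = rho_i * y_i^T q;      q <- q + (alpha_i - beta) * s_i
                d <- q
         */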
        bound = (m <= k) ? m : k;
        ++k;
        end = (end + 1) % m;

        /* Compute the steepest direction. */
        if (param.orthantwise_c == 0.) {
            /* Compute the negative of the gradient. */
            std::copy(g, g+n, d);
            SGVector<float64_t>::scale_vector(-1, d, n);
        } else {
            std::copy(pg, pg+n, d);
            SGVector<float64_t>::scale_vector(-1, d, n);
        }

        j = end;
        for (i = 0;i < bound;++i) {
            j = (j + m - 1) % m;    /* if (--j == -1) j = m-1; */
            it = &lm[j];
            /* \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}. */
            it->alpha = SGVector<float64_t>::dot(it->s, d, n);
            it->alpha /= it->ys;
            /* q_{i} = q_{i+1} - \alpha_{i} y_{i}. */
            SGVector<float64_t>::add(d, 1, d, -it->alpha, it->y, n);
        }

        SGVector<float64_t>::scale_vector(ys / yy, d, n);

        for (i = 0;i < bound;++i) {
            it = &lm[j];
            /* \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}. */
            beta = SGVector<float64_t>::dot(it->y, d, n);
            beta /= it->ys;
            /* \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}. */
            SGVector<float64_t>::add(d, 1, d, it->alpha-beta, it->s, n);
            j = (j + 1) % m;        /* if (++j == m) j = 0; */
        }

        /*
            Constrain the search direction for orthant-wise updates.
         */
        if (param.orthantwise_c != 0.) {
            for (i = param.orthantwise_start;i < param.orthantwise_end;++i) {
                if (d[i] * pg[i] >= 0) {
                    d[i] = 0;
                }
            }
        }

        /*
            Now the search direction d is ready. We try step = 1 first.
         */
        step = 1.0;
    }

lbfgs_exit:
    /* Return the final value of the objective function. */
    if (ptr_fx != NULL) {
        *ptr_fx = fx;
    }

    SG_FREE(pf);

    /* Free memory blocks used by this function. */
    if (lm != NULL) {
        for (i = 0;i < m;++i) {
            SG_FREE(lm[i].s);
            SG_FREE(lm[i].y);
        }
        SG_FREE(lm);
    }
    SG_FREE(pg);
    SG_FREE(w);
    SG_FREE(d);
    SG_FREE(gp);
    SG_FREE(g);
    SG_FREE(xp);

    return ret;
}


static int32_t line_search_backtracking(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wp,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    )
{
    int32_t count = 0;
    float64_t width, dg;
    float64_t finit, dginit = 0., dgtest;
    const float64_t dec = 0.5, inc = 2.1;

    /* Check the input parameters for errors. */
    if (*stp <= 0.) {
        return LBFGSERR_INVALIDPARAMETERS;
    }

    /* Compute the initial gradient in the search direction. */
    dginit = SGVector<float64_t>::dot(g, s, n);

    /* Make sure that s points to a descent direction. */
    if (0 < dginit) {
        return LBFGSERR_INCREASEGRADIENT;
    }

    /* The initial value of the objective function. */
    finit = *f;
    dgtest = param->ftol * dginit;

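    /*
        A summary of the conditions tested in the loop below (cf. More and
        Thuente, 1994), with dg0 = g^T s at the starting point:
            sufficient decrease (Armijo): f(x + stp*s) <= f(x) + ftol * stp * dg0
            Wolfe (curvature):            g(x + stp*s)^T s >= wolfe * dg0
            strong Wolfe:                 |g(x + stp*s)^T s| <= wolfe * |dg0|
        The step is shrunk by `dec` while the relevant condition fails and
        grown by `inc` when the curvature test indicates the step is too short.
     */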
    for (;;) {
        std::copy(xp,xp+n,x);
        SGVector<float64_t>::add(x, 1, x, *stp, s, n);

        /* Evaluate the function and gradient values. */
        *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp);

        ++count;

        if (*f > finit + *stp * dgtest) {
            width = dec;
        } else {
            /* The sufficient decrease condition (Armijo condition). */
            if (param->linesearch == LBFGS_LINESEARCH_BACKTRACKING_ARMIJO) {
                /* Exit with the Armijo condition. */
                return count;
            }

            /* Check the Wolfe condition. */
            dg = SGVector<float64_t>::dot(g, s, n);
            if (dg < param->wolfe * dginit) {
                width = inc;
            } else {
                if(param->linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE) {
                    /* Exit with the regular Wolfe condition. */
                    return count;
                }

                /* Check the strong Wolfe condition. */
                if(dg > -param->wolfe * dginit) {
                    width = dec;
                } else {
                    /* Exit with the strong Wolfe condition. */
                    return count;
                }
            }
        }

        if (*stp < param->min_step) {
            /* The step is the minimum value. */
            return LBFGSERR_MINIMUMSTEP;
        }
        if (*stp > param->max_step) {
            /* The step is the maximum value. */
            return LBFGSERR_MAXIMUMSTEP;
        }
        if (param->max_linesearch <= count) {
            /* Maximum number of line search iterations. */
            return LBFGSERR_MAXIMUMLINESEARCH;
        }

        (*stp) *= width;
    }
}



static int32_t line_search_backtracking_owlqn(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wp,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    )
{
    int32_t i, count = 0;
    float64_t width = 0.5, norm = 0.;
    float64_t finit = *f, dgtest;

    /* Check the input parameters for errors. */
    if (*stp <= 0.) {
        return LBFGSERR_INVALIDPARAMETERS;
    }

    /* Choose the orthant for the new point. */
    for (i = 0;i < n;++i) {
        wp[i] = (xp[i] == 0.) ? -gp[i] : xp[i];
    }
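
    /* Above, the target orthant is encoded as a sign pattern in wp: where
       xp[i] is nonzero its own sign is kept, and where xp[i] is zero the
       sign of -gp[i] is used (gp receives the pseudo-gradient when this
       function is invoked from lbfgs() with orthantwise_c != 0). */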

    for (;;) {
        /* Update the current point. */
        std::copy(xp,xp+n,x);
        SGVector<float64_t>::add(x, 1, x, *stp, s, n);

        /* The current point is projected onto the orthant. */
        owlqn_project(x, wp, param->orthantwise_start, param->orthantwise_end);

        /* Evaluate the function and gradient values. */
        *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp);

        /* Compute the L1 norm of the variables and add it to the objective value. */
        norm = owlqn_x1norm(x, param->orthantwise_start, param->orthantwise_end);
        *f += norm * param->orthantwise_c;

        ++count;

        dgtest = 0.;
        for (i = 0;i < n;++i) {
            dgtest += (x[i] - xp[i]) * gp[i];
        }

        if (*f <= finit + param->ftol * dgtest) {
            /* The sufficient decrease condition. */
            return count;
        }

        if (*stp < param->min_step) {
            /* The step is the minimum value. */
            return LBFGSERR_MINIMUMSTEP;
        }
        if (*stp > param->max_step) {
            /* The step is the maximum value. */
            return LBFGSERR_MAXIMUMSTEP;
        }
        if (param->max_linesearch <= count) {
            /* Maximum number of line search iterations. */
            return LBFGSERR_MAXIMUMLINESEARCH;
        }

        (*stp) *= width;
    }
}



static int32_t line_search_morethuente(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    )
{
    int32_t count = 0;
    int32_t brackt, stage1, uinfo = 0;
    float64_t dg;
    float64_t stx, fx, dgx;
    float64_t sty, fy, dgy;
    float64_t fxm, dgxm, fym, dgym, fm, dgm;
    float64_t finit, ftest1, dginit, dgtest;
    float64_t width, prev_width;
    float64_t stmin, stmax;

    /* Check the input parameters for errors. */
    if (*stp <= 0.) {
        return LBFGSERR_INVALIDPARAMETERS;
    }

    /* Compute the initial gradient in the search direction. */
    dginit = SGVector<float64_t>::dot(g, s, n);

    /* Make sure that s points to a descent direction. */
    if (0 < dginit) {
        return LBFGSERR_INCREASEGRADIENT;
    }

    /* Initialize local variables. */
    brackt = 0;
    stage1 = 1;
    finit = *f;
    dgtest = param->ftol * dginit;
    width = param->max_step - param->min_step;
    prev_width = 2.0 * width;

    /*
        The variables stx, fx, dgx contain the values of the step,
        function, and directional derivative at the best step.
        The variables sty, fy, dgy contain the value of the step,
        function, and derivative at the other endpoint of
        the interval of uncertainty.
        The variables stp, f, dg contain the values of the step,
        function, and derivative at the current step.
    */
    stx = sty = 0.;
    fx = fy = finit;
    dgx = dgy = dginit;

    for (;;) {
        /*
            Set the minimum and maximum steps to correspond to the
            present interval of uncertainty.
         */
        if (brackt) {
            stmin = min2(stx, sty);
            stmax = max2(stx, sty);
        } else {
            stmin = stx;
            stmax = *stp + 4.0 * (*stp - stx);
        }

        /* Clip the step into the range [min_step, max_step]. */
        if (*stp < param->min_step) *stp = param->min_step;
        if (param->max_step < *stp) *stp = param->max_step;

        /*
            If an unusual termination is to occur then let
            stp be the lowest point obtained so far.
         */
        if ((brackt && ((*stp <= stmin || stmax <= *stp) ||
             param->max_linesearch <= count + 1 || uinfo != 0)) ||
            (brackt && (stmax - stmin <= param->xtol * stmax))) {
            *stp = stx;
        }

        /*
            Compute the current value of x:
                x <- x + (*stp) * s.
         */
        std::copy(xp,xp+n,x);
        SGVector<float64_t>::add(x, 1, x, *stp, s, n);

        /* Evaluate the function and gradient values. */
        *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp);
        dg = SGVector<float64_t>::dot(g, s, n);

        ftest1 = finit + *stp * dgtest;
        ++count;

        /* Test for errors and convergence. */
        if (brackt && ((*stp <= stmin || stmax <= *stp) || uinfo != 0)) {
            /* Rounding errors prevent further progress. */
            return LBFGSERR_ROUNDING_ERROR;
        }
        if (*stp == param->max_step && *f <= ftest1 && dg <= dgtest) {
            /* The step is the maximum value. */
            return LBFGSERR_MAXIMUMSTEP;
        }
        if (*stp == param->min_step && (ftest1 < *f || dgtest <= dg)) {
            /* The step is the minimum value. */
            return LBFGSERR_MINIMUMSTEP;
        }
        if (brackt && (stmax - stmin) <= param->xtol * stmax) {
            /* The relative width of the interval of uncertainty is at most xtol. */
            return LBFGSERR_WIDTHTOOSMALL;
        }
        if (param->max_linesearch <= count) {
            /* Maximum number of line search iterations. */
            return LBFGSERR_MAXIMUMLINESEARCH;
        }
        if (*f <= ftest1 && fabs(dg) <= param->gtol * (-dginit)) {
            /* The sufficient decrease condition and the directional derivative condition hold. */
            return count;
        }

        /*
            In the first stage we seek a step for which the modified
            function has a nonpositive value and nonnegative derivative.
         */
        if (stage1 && *f <= ftest1 && min2(param->ftol, param->gtol) * dginit <= dg) {
            stage1 = 0;
        }

        /*
            A modified function is used to predict the step only if
            we have not obtained a step for which the modified
            function has a nonpositive function value and nonnegative
            derivative, and if a lower function value has been
            obtained but the decrease is not sufficient.
         */
        if (stage1 && ftest1 < *f && *f <= fx) {
            /* Define the modified function and derivative values. */
            fm = *f - *stp * dgtest;
            fxm = fx - stx * dgtest;
            fym = fy - sty * dgtest;
            dgm = dg - dgtest;
            dgxm = dgx - dgtest;
            dgym = dgy - dgtest;

            /*
                Call update_trial_interval() to update the interval of
                uncertainty and to compute the new step.
             */
            uinfo = update_trial_interval(
                &stx, &fxm, &dgxm,
                &sty, &fym, &dgym,
                stp, &fm, &dgm,
                stmin, stmax, &brackt
                );

            /* Reset the function and gradient values for f. */
            fx = fxm + stx * dgtest;
            fy = fym + sty * dgtest;
            dgx = dgxm + dgtest;
            dgy = dgym + dgtest;
        } else {
            /*
                Call update_trial_interval() to update the interval of
                uncertainty and to compute the new step.
             */
            uinfo = update_trial_interval(
                &stx, &fx, &dgx,
                &sty, &fy, &dgy,
                stp, f, &dg,
                stmin, stmax, &brackt
                );
        }

        /*
            Force a sufficient decrease in the interval of uncertainty.
         */
        if (brackt) {
            if (0.66 * prev_width <= fabs(sty - stx)) {
                *stp = stx + 0.5 * (sty - stx);
            }
            prev_width = width;
            width = fabs(sty - stx);
        }
    }

    return LBFGSERR_LOGICERROR;
}


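/* Declare the temporaries used by the CUBIC_MINIMIZER and QUARD_MINIMIZER
   macros below. */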
#define USES_MINIMIZER \
    float64_t a, d, gamma, theta, p, q, r, s;

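/*
 * Find a minimizer of the cubic interpolating f(u), f'(u), f(v), and f'(v).
 *  cm: receives the minimizer; u, fu, du: a point, its function value, and
 *  its derivative; v, fv, dv: another point, its function value, and its
 *  derivative.
 */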
#define CUBIC_MINIMIZER(cm, u, fu, du, v, fv, dv) \
    d = (v) - (u); \
    theta = ((fu) - (fv)) * 3 / d + (du) + (dv); \
    p = fabs(theta); \
    q = fabs(du); \
    r = fabs(dv); \
    s = max3(p, q, r); \
    /* gamma = s*sqrt((theta/s)**2 - (du/s) * (dv/s)) */ \
    a = theta / s; \
    gamma = s * sqrt(a * a - ((du) / s) * ((dv) / s)); \
    if ((v) < (u)) gamma = -gamma; \
    p = gamma - (du) + theta; \
    q = gamma - (du) + gamma + (dv); \
    r = p / q; \
    (cm) = (u) + r * d;

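/*
 * Find a minimizer of the cubic interpolating f(u), f'(u), f(v), and f'(v),
 * falling back to xmin or xmax when the cubic has no interior minimizer in
 * the given direction.
 */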
#define CUBIC_MINIMIZER2(cm, u, fu, du, v, fv, dv, xmin, xmax) \
    d = (v) - (u); \
    theta = ((fu) - (fv)) * 3 / d + (du) + (dv); \
    p = fabs(theta); \
    q = fabs(du); \
    r = fabs(dv); \
    s = max3(p, q, r); \
    /* gamma = s*sqrt((theta/s)**2 - (du/s) * (dv/s)) */ \
    a = theta / s; \
    gamma = s * sqrt(max2(0, a * a - ((du) / s) * ((dv) / s))); \
    if ((u) < (v)) gamma = -gamma; \
    p = gamma - (dv) + theta; \
    q = gamma - (dv) + gamma + (du); \
    r = p / q; \
    if (r < 0. && gamma != 0.) { \
        (cm) = (v) - r * d; \
    } else if (a < 0) { \
        (cm) = (xmax); \
    } else { \
        (cm) = (xmin); \
    }

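/*
 * Find the minimizer of the quadratic interpolating f(u), f'(u), and f(v).
 */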
#define QUARD_MINIMIZER(qm, u, fu, du, v, fv) \
    a = (v) - (u); \
    (qm) = (u) + (du) / (((fu) - (fv)) / a + (du)) / 2 * a;

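/*
 * Find the minimizer of the quadratic interpolating f'(u) and f'(v)
 * (the secant step).
 */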
#define QUARD_MINIMIZER2(qm, u, du, v, dv) \
    a = (u) - (v); \
    (qm) = (v) + (dv) / ((dv) - (du)) * a;

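/* Nonzero iff *x and *y have opposite signs. */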
#define fsigndiff(x, y) (*(x) * (*(y) / fabs(*(y))) < 0.)

static int32_t update_trial_interval(
    float64_t *x,
    float64_t *fx,
    float64_t *dx,
    float64_t *y,
    float64_t *fy,
    float64_t *dy,
    float64_t *t,
    float64_t *ft,
    float64_t *dt,
    const float64_t tmin,
    const float64_t tmax,
    int32_t *brackt
    )
{
    int32_t bound;
    int32_t dsign = fsigndiff(dt, dx);
    float64_t mc; /* minimizer of an interpolated cubic. */
    float64_t mq; /* minimizer of an interpolated quadratic. */
    float64_t newt;   /* new trial value. */
    USES_MINIMIZER;     /* for CUBIC_MINIMIZER and QUARD_MINIMIZER. */

    /* Check the input parameters for errors. */
    if (*brackt) {
        if (*t <= min2(*x, *y) || max2(*x, *y) <= *t) {
            /* The trial value t is out of the interval. */
            return LBFGSERR_OUTOFINTERVAL;
        }
        if (0. <= *dx * (*t - *x)) {
            /* The function must decrease from x. */
            return LBFGSERR_INCREASEGRADIENT;
        }
        if (tmax < tmin) {
            /* Incorrect tmin and tmax specified. */
            return LBFGSERR_INCORRECT_TMINMAX;
        }
    }

    /*
        Trial value selection.
     */
    if (*fx < *ft) {
        /*
            Case 1: a higher function value.
            The minimum is bracketed. If the cubic minimizer is closer
            to x than the quadratic one, the cubic one is taken, else
            the average of the minimizers is taken.
         */
        *brackt = 1;
        bound = 1;
        CUBIC_MINIMIZER(mc, *x, *fx, *dx, *t, *ft, *dt);
        QUARD_MINIMIZER(mq, *x, *fx, *dx, *t, *ft);
        if (fabs(mc - *x) < fabs(mq - *x)) {
            newt = mc;
        } else {
            newt = mc + 0.5 * (mq - mc);
        }
    } else if (dsign) {
        /*
            Case 2: a lower function value and derivatives of
            opposite sign. The minimum is bracketed. If the cubic
            minimizer is closer to x than the quadratic (secant) one,
            the cubic one is taken, else the quadratic one is taken.
         */
        *brackt = 1;
        bound = 0;
        CUBIC_MINIMIZER(mc, *x, *fx, *dx, *t, *ft, *dt);
        QUARD_MINIMIZER2(mq, *x, *dx, *t, *dt);
        if (fabs(mc - *t) > fabs(mq - *t)) {
            newt = mc;
        } else {
            newt = mq;
        }
    } else if (fabs(*dt) < fabs(*dx)) {
        /*
            Case 3: a lower function value, derivatives of the
            same sign, and the magnitude of the derivative decreases.
            The cubic minimizer is used only if the cubic tends to
            infinity in the direction of the minimizer or if the minimum
            of the cubic is beyond t. Otherwise the cubic minimizer is
            defined to be either tmin or tmax. The quadratic (secant)
            minimizer is also computed and if the minimum is bracketed
            then the minimizer closest to x is taken, else the one
            farthest away is taken.
         */
        bound = 1;
        CUBIC_MINIMIZER2(mc, *x, *fx, *dx, *t, *ft, *dt, tmin, tmax);
        QUARD_MINIMIZER2(mq, *x, *dx, *t, *dt);
        if (*brackt) {
            if (fabs(*t - mc) < fabs(*t - mq)) {
                newt = mc;
            } else {
                newt = mq;
            }
        } else {
            if (fabs(*t - mc) > fabs(*t - mq)) {
                newt = mc;
            } else {
                newt = mq;
            }
        }
    } else {
        /*
            Case 4: a lower function value, derivatives of the
            same sign, and the magnitude of the derivative does
            not decrease. If the minimum is not bracketed, the step
            is either tmin or tmax, else the cubic minimizer is taken.
         */
        bound = 0;
        if (*brackt) {
            CUBIC_MINIMIZER(newt, *t, *ft, *dt, *y, *fy, *dy);
        } else if (*x < *t) {
            newt = tmax;
        } else {
            newt = tmin;
        }
    }

    /*
        Update the interval of uncertainty. This update does not
        depend on the new step or the case analysis above.

        - Case a: if f(x) < f(t),
            x <- x, y <- t.
        - Case b: if f(t) <= f(x) && f'(t)*f'(x) > 0,
            x <- t, y <- y.
        - Case c: if f(t) <= f(x) && f'(t)*f'(x) < 0,
            x <- t, y <- x.
     */
    if (*fx < *ft) {
        /* Case a */
        *y = *t;
        *fy = *ft;
        *dy = *dt;
    } else {
        /* Case c */
        if (dsign) {
            *y = *x;
            *fy = *fx;
            *dy = *dx;
        }
        /* Cases b and c */
        *x = *t;
        *fx = *ft;
        *dx = *dt;
    }

    /* Clip the new trial value in [tmin, tmax]. */
    if (tmax < newt) newt = tmax;
    if (newt < tmin) newt = tmin;

    /*
        Redefine the new trial value if it is close to the upper bound
        of the interval.
     */
    if (*brackt && bound) {
        mq = *x + 0.66 * (*y - *x);
        if (*x < *y) {
            if (mq < newt) newt = mq;
        } else {
            if (newt < mq) newt = mq;
        }
    }

    /* Return the new trial value. */
    *t = newt;
    return 0;
}


static float64_t owlqn_x1norm(
    const float64_t* x,
    const int32_t start,
    const int32_t n
    )
{
    int32_t i;
    float64_t norm = 0.;

    for (i = start;i < n;++i) {
        norm += fabs(x[i]);
    }

    return norm;
}

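/*
   A summary of the pseudo-gradient of f(x) + c*|x|_1 computed below: where
   x[i] != 0 the L1 term is differentiable and contributes c * sign(x[i]);
   where x[i] == 0 the one-sided partial derivative of smaller magnitude is
   taken, or 0 when the subdifferential contains 0 (i.e. |g[i]| <= c).
 */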
static void owlqn_pseudo_gradient(
    float64_t* pg,
    const float64_t* x,
    const float64_t* g,
    const int32_t n,
    const float64_t c,
    const int32_t start,
    const int32_t end
    )
{
    int32_t i;

    /* Copy the gradient for the variables before the orthant-wise range. */
    for (i = 0;i < start;++i) {
        pg[i] = g[i];
    }

    /* Compute the pseudo-gradients. */
    for (i = start;i < end;++i) {
        if (x[i] < 0.) {
            /* Differentiable. */
            pg[i] = g[i] - c;
        } else if (0. < x[i]) {
            /* Differentiable. */
            pg[i] = g[i] + c;
        } else {
            if (g[i] < -c) {
                /* Take the right partial derivative. */
                pg[i] = g[i] + c;
            } else if (c < g[i]) {
                /* Take the left partial derivative. */
                pg[i] = g[i] - c;
            } else {
                pg[i] = 0.;
            }
        }
    }

    for (i = end;i < n;++i) {
        pg[i] = g[i];
    }
}

static void owlqn_project(
    float64_t* d,
    const float64_t* sign,
    const int32_t start,
    const int32_t end
    )
{
    int32_t i;

    for (i = start;i < end;++i) {
        if (d[i] * sign[i] <= 0) {
            d[i] = 0;
        }
    }
}

} /* namespace shogun */