/*
 * Limited memory BFGS (L-BFGS).
 *
 * Copyright (c) 1990, Jorge Nocedal
 * Copyright (c) 2007-2010 Naoaki Okazaki
 * All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* $Id$ */

/*
This library is a C port of the FORTRAN implementation of the Limited-memory
Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) method written by Jorge Nocedal.
The original FORTRAN source code is available at:
http://www.ece.northwestern.edu/~nocedal/lbfgs.html

The L-BFGS algorithm is described in:
    - Jorge Nocedal.
      Updating Quasi-Newton Matrices with Limited Storage.
      Mathematics of Computation, Vol. 35, No. 151, pp. 773--782, 1980.
    - Dong C. Liu and Jorge Nocedal.
      On the limited memory BFGS method for large scale optimization.
      Mathematical Programming B, Vol. 45, No. 3, pp. 503--528, 1989.

The line search algorithms used in this implementation are described in:
    - John E. Dennis and Robert B. Schnabel.
      Numerical Methods for Unconstrained Optimization and Nonlinear
      Equations, Englewood Cliffs, 1983.
    - Jorge J. More and David J. Thuente.
      Line search algorithm with guaranteed sufficient decrease.
      ACM Transactions on Mathematical Software (TOMS), Vol. 20, No. 3,
      pp. 286--307, 1994.

This library also implements the Orthant-Wise Limited-memory Quasi-Newton
(OWL-QN) method presented in:
    - Galen Andrew and Jianfeng Gao.
      Scalable training of L1-regularized log-linear models.
      In Proceedings of the 24th International Conference on Machine
      Learning (ICML 2007), pp. 33--40, 2007.

I would like to thank the original author, Jorge Nocedal, who has been
distributing the efficient and explanatory implementation under an open
source license.
*/
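/*
    An illustrative usage sketch (not part of the original source; the
    callback signature is abbreviated from the lbfgs_evaluate_t usage in
    this file -- see lbfgs.h for the exact declaration):

        // Minimize f(x) = sum_i (x_i - 1)^2 with its analytic gradient.
        static float64_t evaluate(
            void *instance, const float64_t *x, float64_t *g,
            const int n, const float64_t step)
        {
            float64_t fx = 0.;
            for (int i = 0;i < n;++i) {
                g[i] = 2. * (x[i] - 1.);
                fx += (x[i] - 1.) * (x[i] - 1.);
            }
            return fx;
        }

        // ... then, starting from the origin:
        int32_t n = 10;
        float64_t fx = 0.;
        float64_t *x = SG_CALLOC(float64_t, n);
        lbfgs_parameter_t param;
        lbfgs_parameter_init(&param);   // take the defaults shown below
        int32_t ret = lbfgs(n, x, &fx, evaluate, NULL, NULL, &param);
        SG_FREE(x);
*/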
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>  /* for memcpy() below */
#include <cmath>

#include <shogun/optimization/lbfgs/lbfgs.h>
#include <shogun/lib/SGVector.h>
#include <shogun/lib/common.h>

namespace shogun
{

#define min2(a, b)      ((a) <= (b) ? (a) : (b))
#define max2(a, b)      ((a) >= (b) ? (a) : (b))
#define max3(a, b, c)   max2(max2((a), (b)), (c))

struct tag_callback_data {
    int32_t n;
    void *instance;
    lbfgs_evaluate_t proc_evaluate;
    lbfgs_progress_t proc_progress;
};
typedef struct tag_callback_data callback_data_t;

struct tag_iteration_data {
    float64_t alpha;
    float64_t *s;   /* [n] */
    float64_t *y;   /* [n] */
    float64_t ys;   /* vecdot(y, s) */
};
typedef struct tag_iteration_data iteration_data_t;

/* Field order: m, epsilon, past, delta,
   max_iterations, linesearch, max_linesearch,
   min_step, max_step, ftol, wolfe, gtol, xtol,
   orthantwise_c, orthantwise_start, orthantwise_end. */
static const lbfgs_parameter_t _defparam = {
    6, 1e-5, 0, 1e-5,
    0, LBFGS_LINESEARCH_DEFAULT, 40,
    1e-20, 1e20, 1e-4, 0.9, 0.9, 1.0e-16,
    0.0, 0, -1,
};

/* Forward function declarations. */

typedef int32_t (*line_search_proc)(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t line_search_backtracking(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t line_search_backtracking_owlqn(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wp,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t line_search_morethuente(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t update_trial_interval(
    float64_t *x,
    float64_t *fx,
    float64_t *dx,
    float64_t *y,
    float64_t *fy,
    float64_t *dy,
    float64_t *t,
    float64_t *ft,
    float64_t *dt,
    const float64_t tmin,
    const float64_t tmax,
    int32_t *brackt
    );

static float64_t owlqn_x1norm(
    const float64_t* x,
    const int32_t start,
    const int32_t n
    );

static void owlqn_pseudo_gradient(
    float64_t* pg,
    const float64_t* x,
    const float64_t* g,
    const int32_t n,
    const float64_t c,
    const int32_t start,
    const int32_t end
    );

static void owlqn_project(
    float64_t* d,
    const float64_t* sign,
    const int32_t start,
    const int32_t end
    );


void lbfgs_parameter_init(lbfgs_parameter_t *param)
{
    memcpy(param, &_defparam, sizeof(*param));
}

int32_t lbfgs(
    int32_t n,
    float64_t *x,
    float64_t *ptr_fx,
    lbfgs_evaluate_t proc_evaluate,
    lbfgs_progress_t proc_progress,
    void *instance,
    lbfgs_parameter_t *_param
    )
{
    int32_t ret;
    int32_t i, j, k, ls, end, bound;
    float64_t step;

    /* Constant parameters and their default values. */
    lbfgs_parameter_t param = (_param != NULL) ? (*_param) : _defparam;
    const int32_t m = param.m;

    float64_t *xp = NULL;
    float64_t *g = NULL, *gp = NULL, *pg = NULL;
    float64_t *d = NULL, *w = NULL, *pf = NULL;
    iteration_data_t *lm = NULL, *it = NULL;
    float64_t ys, yy;
    float64_t xnorm, gnorm, beta;
    float64_t fx = 0.;
    float64_t rate = 0.;
    line_search_proc linesearch = line_search_morethuente;

    /* Construct the callback data. */
    callback_data_t cd;
    cd.n = n;
    cd.instance = instance;
    cd.proc_evaluate = proc_evaluate;
    cd.proc_progress = proc_progress;

    /* Check the input parameters for errors. */
    if (n <= 0) {
        return LBFGSERR_INVALID_N;
    }
    if (param.epsilon < 0.) {
        return LBFGSERR_INVALID_EPSILON;
    }
    if (param.past < 0) {
        return LBFGSERR_INVALID_TESTPERIOD;
    }
    if (param.delta < 0.) {
        return LBFGSERR_INVALID_DELTA;
    }
    if (param.min_step < 0.) {
        return LBFGSERR_INVALID_MINSTEP;
    }
    if (param.max_step < param.min_step) {
        return LBFGSERR_INVALID_MAXSTEP;
    }
    if (param.ftol < 0.) {
        return LBFGSERR_INVALID_FTOL;
    }
    if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE ||
        param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
        if (param.wolfe <= param.ftol || 1. <= param.wolfe) {
            return LBFGSERR_INVALID_WOLFE;
        }
    }
    if (param.gtol < 0.) {
        return LBFGSERR_INVALID_GTOL;
    }
    if (param.xtol < 0.) {
        return LBFGSERR_INVALID_XTOL;
    }
    if (param.max_linesearch <= 0) {
        return LBFGSERR_INVALID_MAXLINESEARCH;
    }
    if (param.orthantwise_c < 0.) {
        return LBFGSERR_INVALID_ORTHANTWISE;
    }
    if (param.orthantwise_start < 0 || n < param.orthantwise_start) {
        return LBFGSERR_INVALID_ORTHANTWISE_START;
    }
    if (param.orthantwise_end < 0) {
        param.orthantwise_end = n;
    }
    if (n < param.orthantwise_end) {
        return LBFGSERR_INVALID_ORTHANTWISE_END;
    }
    if (param.orthantwise_c != 0.) {
        switch (param.linesearch) {
        case LBFGS_LINESEARCH_BACKTRACKING:
            linesearch = line_search_backtracking_owlqn;
            break;
        default:
            /* Only the backtracking method is available. */
            return LBFGSERR_INVALID_LINESEARCH;
        }
    } else {
        switch (param.linesearch) {
        case LBFGS_LINESEARCH_MORETHUENTE:
            linesearch = line_search_morethuente;
            break;
        case LBFGS_LINESEARCH_BACKTRACKING_ARMIJO:
        case LBFGS_LINESEARCH_BACKTRACKING_WOLFE:
        case LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE:
            linesearch = line_search_backtracking;
            break;
        default:
            return LBFGSERR_INVALID_LINESEARCH;
        }
    }

    /* Allocate working space. */
    xp = SG_CALLOC(float64_t, n);
    g = SG_CALLOC(float64_t, n);
    gp = SG_CALLOC(float64_t, n);
    d = SG_CALLOC(float64_t, n);
    w = SG_CALLOC(float64_t, n);
    if (xp == NULL || g == NULL || gp == NULL || d == NULL || w == NULL) {
        ret = LBFGSERR_OUTOFMEMORY;
        goto lbfgs_exit;
    }

    if (param.orthantwise_c != 0.) {
        /* Allocate working space for OWL-QN. */
        pg = SG_CALLOC(float64_t, n);
        if (pg == NULL) {
            ret = LBFGSERR_OUTOFMEMORY;
            goto lbfgs_exit;
        }
    }

    /* Allocate limited memory storage. */
    lm = SG_CALLOC(iteration_data_t, m);
    if (lm == NULL) {
        ret = LBFGSERR_OUTOFMEMORY;
        goto lbfgs_exit;
    }

    /* Initialize the limited memory. */
    for (i = 0;i < m;++i) {
        it = &lm[i];
        it->alpha = 0;
        it->ys = 0;
        it->s = SG_CALLOC(float64_t, n);
        it->y = SG_CALLOC(float64_t, n);
        if (it->s == NULL || it->y == NULL) {
            ret = LBFGSERR_OUTOFMEMORY;
            goto lbfgs_exit;
        }
    }

    /* Allocate an array for storing previous values of the objective function. */
    if (0 < param.past) {
        pf = SG_CALLOC(float64_t, param.past);
    }

    /* Evaluate the function value and its gradient. */
    fx = cd.proc_evaluate(cd.instance, x, g, cd.n, 0);
    if (0. != param.orthantwise_c) {
        /* Compute the L1 norm of the variables and add it to the objective value. */
        xnorm = owlqn_x1norm(x, param.orthantwise_start, param.orthantwise_end);
        fx += xnorm * param.orthantwise_c;
        owlqn_pseudo_gradient(
            pg, x, g, n,
            param.orthantwise_c, param.orthantwise_start, param.orthantwise_end
            );
    }

    /* Store the initial value of the objective function. */
    if (pf != NULL) {
        pf[0] = fx;
    }

    /*
        Compute the direction;
        we assume the initial Hessian matrix H_0 is the identity matrix.
    */
    if (param.orthantwise_c == 0.) {
        std::copy(g, g+n, d);
        SGVector<float64_t>::scale_vector(-1, d, n);
    } else {
        std::copy(pg, pg+n, d);
        SGVector<float64_t>::scale_vector(-1, d, n);
    }

    /*
        Make sure that the initial variables are not a minimizer.
    */
    xnorm = SGVector<float64_t>::twonorm(x, n);
    if (param.orthantwise_c == 0.) {
        gnorm = SGVector<float64_t>::twonorm(g, n);
    } else {
        gnorm = SGVector<float64_t>::twonorm(pg, n);
    }
    if (xnorm < 1.0) xnorm = 1.0;
    if (gnorm / xnorm <= param.epsilon) {
        ret = LBFGS_ALREADY_MINIMIZED;
        goto lbfgs_exit;
    }

    /* Compute the initial step:
        step = 1.0 / sqrt(vecdot(d, d, n))
    */
    step = 1.0 / SGVector<float64_t>::twonorm(d, n);

    k = 1;
    end = 0;
    for (;;) {
        /* Store the current position and gradient vectors. */
        std::copy(x, x+n, xp);
        std::copy(g, g+n, gp);

        /* Search for an optimal step. */
        if (param.orthantwise_c == 0.) {
            ls = linesearch(n, x, &fx, g, d, &step, xp, gp, w, &cd, &param);
        } else {
            ls = linesearch(n, x, &fx, g, d, &step, xp, pg, w, &cd, &param);
            owlqn_pseudo_gradient(
                pg, x, g, n,
                param.orthantwise_c, param.orthantwise_start, param.orthantwise_end
                );
        }
        if (ls < 0) {
            /* Revert to the previous point. */
            std::copy(xp, xp+n, x);
            std::copy(gp, gp+n, g);
            ret = ls;
            goto lbfgs_exit;
        }

        /* Compute x and g norms. */
        xnorm = SGVector<float64_t>::twonorm(x, n);
        if (param.orthantwise_c == 0.) {
            gnorm = SGVector<float64_t>::twonorm(g, n);
        } else {
            gnorm = SGVector<float64_t>::twonorm(pg, n);
        }

        /* Report the progress. */
        if (cd.proc_progress) {
            if ((ret = cd.proc_progress(cd.instance, x, g, fx, xnorm, gnorm, step, cd.n, k, ls))) {
                goto lbfgs_exit;
            }
        }
        /*
            Convergence test.
            The criterion is given by the following formula:
                |g(x)| / max(1, |x|) < epsilon
        */
        if (xnorm < 1.0) xnorm = 1.0;
        if (gnorm / xnorm <= param.epsilon) {
            /* Convergence. */
            ret = LBFGS_SUCCESS;
            break;
        }

        /*
            Test for the stopping criterion.
            The criterion is given by the following formula:
                (f(past_x) - f(x)) / f(x) < delta
        */
        if (pf != NULL) {
            /* We don't test the stopping criterion while k < past. */
            if (param.past <= k) {
                /* Compute the relative improvement from the past. */
                rate = (pf[k % param.past] - fx) / fx;

                /* The stopping criterion. */
                if (rate < param.delta) {
                    ret = LBFGS_STOP;
                    break;
                }
            }

            /* Store the current value of the objective function. */
            pf[k % param.past] = fx;
        }

        if (param.max_iterations != 0 && param.max_iterations < k+1) {
            /* Maximum number of iterations. */
            ret = LBFGSERR_MAXIMUMITERATION;
            break;
        }

        /*
            Update vectors s and y:
                s_{k+1} = x_{k+1} - x_{k} = step * d_{k}.
                y_{k+1} = g_{k+1} - g_{k}.
        */
        it = &lm[end];
        SGVector<float64_t>::add(it->s, 1, x, -1, xp, n);
        SGVector<float64_t>::add(it->y, 1, g, -1, gp, n);

        /*
            Compute scalars ys and yy:
                ys = y^t \cdot s = 1 / \rho.
                yy = y^t \cdot y.
            Notice that yy is used for scaling the initial Hessian matrix H_0.
        */
        ys = SGVector<float64_t>::dot(it->y, it->s, n);
        yy = SGVector<float64_t>::dot(it->y, it->y, n);
        it->ys = ys;

        /*
            Recursive formula to compute dir = -(H \cdot g).
            This is described on page 779 of:
                Jorge Nocedal.
                Updating Quasi-Newton Matrices with Limited Storage.
                Mathematics of Computation, Vol. 35, No. 151,
                pp. 773--782, 1980.
        */
        bound = (m <= k) ? m : k;
        ++k;
        end = (end + 1) % m;

        /* Compute the steepest direction. */
        if (param.orthantwise_c == 0.) {
            /* Compute the negative of gradients. */
            std::copy(g, g+n, d);
            SGVector<float64_t>::scale_vector(-1, d, n);
        } else {
            std::copy(pg, pg+n, d);
            SGVector<float64_t>::scale_vector(-1, d, n);
        }

        j = end;
        for (i = 0;i < bound;++i) {
            j = (j + m - 1) % m;    /* if (--j == -1) j = m-1; */
            it = &lm[j];
            /* \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}. */
            it->alpha = SGVector<float64_t>::dot(it->s, d, n);
            it->alpha /= it->ys;
            /* q_{i} = q_{i+1} - \alpha_{i} y_{i}. */
            SGVector<float64_t>::add(d, 1, d, -it->alpha, it->y, n);
        }

        SGVector<float64_t>::scale_vector(ys / yy, d, n);

        for (i = 0;i < bound;++i) {
            it = &lm[j];
            /* \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}. */
            beta = SGVector<float64_t>::dot(it->y, d, n);
            beta /= it->ys;
            /* \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}. */
            SGVector<float64_t>::add(d, 1, d, it->alpha-beta, it->s, n);
            j = (j + 1) % m;        /* if (++j == m) j = 0; */
        }
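        /*
            At this point the two-loop recursion is complete: d holds
            -(H \cdot g), where H is the L-BFGS approximation of the
            inverse Hessian built from the last `bound` correction pairs
            (s_j, y_j), and H_0 = (ys / yy) I provides the scaling.
        */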
        /*
            Constrain the search direction for orthant-wise updates.
        */
        if (param.orthantwise_c != 0.) {
            for (i = param.orthantwise_start;i < param.orthantwise_end;++i) {
                if (d[i] * pg[i] >= 0) {
                    d[i] = 0;
                }
            }
        }

        /*
            Now the search direction d is ready. We try step = 1 first.
        */
        step = 1.0;
    }

lbfgs_exit:
    /* Return the final value of the objective function. */
    if (ptr_fx != NULL) {
        *ptr_fx = fx;
    }

    SG_FREE(pf);

    /* Free memory blocks used by this function. */
    if (lm != NULL) {
        for (i = 0;i < m;++i) {
            SG_FREE(lm[i].s);
            SG_FREE(lm[i].y);
        }
        SG_FREE(lm);
    }
    SG_FREE(pg);
    SG_FREE(w);
    SG_FREE(d);
    SG_FREE(gp);
    SG_FREE(g);
    SG_FREE(xp);

    return ret;
}



static int32_t line_search_backtracking(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wp,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    )
{
    int32_t count = 0;
    float64_t width, dg;
    float64_t finit, dginit = 0., dgtest;
    const float64_t dec = 0.5, inc = 2.1;

    /* Check the input parameters for errors. */
    if (*stp <= 0.) {
        return LBFGSERR_INVALIDPARAMETERS;
    }

    /* Compute the initial gradient in the search direction. */
    dginit = SGVector<float64_t>::dot(g, s, n);

    /* Make sure that s points to a descent direction. */
    if (0 < dginit) {
        return LBFGSERR_INCREASEGRADIENT;
    }

    /* The initial value of the objective function. */
    finit = *f;
    dgtest = param->ftol * dginit;

    for (;;) {
        std::copy(xp, xp+n, x);
        SGVector<float64_t>::add(x, 1, x, *stp, s, n);

        /* Evaluate the function and gradient values. */
        *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp);

        ++count;

        if (*f > finit + *stp * dgtest) {
            width = dec;
        } else {
            /* The sufficient decrease condition (Armijo condition). */
            if (param->linesearch == LBFGS_LINESEARCH_BACKTRACKING_ARMIJO) {
                /* Exit with the Armijo condition. */
                return count;
            }

            /* Check the Wolfe condition. */
            dg = SGVector<float64_t>::dot(g, s, n);
            if (dg < param->wolfe * dginit) {
                width = inc;
            } else {
                if(param->linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE) {
                    /* Exit with the regular Wolfe condition. */
                    return count;
                }

                /* Check the strong Wolfe condition. */
                if(dg > -param->wolfe * dginit) {
                    width = dec;
                } else {
                    /* Exit with the strong Wolfe condition. */
                    return count;
                }
            }
        }

        if (*stp < param->min_step) {
            /* The step is the minimum value. */
            return LBFGSERR_MINIMUMSTEP;
        }
        if (*stp > param->max_step) {
            /* The step is the maximum value. */
            return LBFGSERR_MAXIMUMSTEP;
        }
        if (param->max_linesearch <= count) {
            /* Maximum number of iterations. */
            return LBFGSERR_MAXIMUMLINESEARCH;
        }

        (*stp) *= width;
    }
}
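/*
    Summary of the exit conditions tested above, with dg0 = dginit = g^T s < 0:

        sufficient decrease (Armijo):  f(x + stp*s) <= f(x) + ftol * stp * dg0
        Wolfe (curvature):             g(x + stp*s)^T s >= wolfe * dg0
        strong Wolfe:                  |g(x + stp*s)^T s| <= wolfe * |dg0|

    The step is shrunk by dec = 0.5 while the sufficient decrease condition
    fails, and grown by inc = 2.1 while the curvature condition fails.
*/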
static int32_t line_search_backtracking_owlqn(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wp,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    )
{
    int32_t i, count = 0;
    float64_t width = 0.5, norm = 0.;
    float64_t finit = *f, dgtest;

    /* Check the input parameters for errors. */
    if (*stp <= 0.) {
        return LBFGSERR_INVALIDPARAMETERS;
    }

    /* Choose the orthant for the new point. */
    for (i = 0;i < n;++i) {
        wp[i] = (xp[i] == 0.) ? -gp[i] : xp[i];
    }

    for (;;) {
        /* Update the current point. */
        std::copy(xp, xp+n, x);
        SGVector<float64_t>::add(x, 1, x, *stp, s, n);

        /* The current point is projected onto the orthant. */
        owlqn_project(x, wp, param->orthantwise_start, param->orthantwise_end);

        /* Evaluate the function and gradient values. */
        *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp);

        /* Compute the L1 norm of the variables and add it to the objective value. */
        norm = owlqn_x1norm(x, param->orthantwise_start, param->orthantwise_end);
        *f += norm * param->orthantwise_c;

        ++count;

        dgtest = 0.;
        for (i = 0;i < n;++i) {
            dgtest += (x[i] - xp[i]) * gp[i];
        }

        if (*f <= finit + param->ftol * dgtest) {
            /* The sufficient decrease condition. */
            return count;
        }

        if (*stp < param->min_step) {
            /* The step is the minimum value. */
            return LBFGSERR_MINIMUMSTEP;
        }
        if (*stp > param->max_step) {
            /* The step is the maximum value. */
            return LBFGSERR_MAXIMUMSTEP;
        }
        if (param->max_linesearch <= count) {
            /* Maximum number of iterations. */
            return LBFGSERR_MAXIMUMLINESEARCH;
        }

        (*stp) *= width;
    }
}



static int32_t line_search_morethuente(
    int32_t n,
    float64_t *x,
    float64_t *f,
    float64_t *g,
    float64_t *s,
    float64_t *stp,
    const float64_t* xp,
    const float64_t* gp,
    float64_t *wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    )
{
    int32_t count = 0;
    int32_t brackt, stage1, uinfo = 0;
    float64_t dg;
    float64_t stx, fx, dgx;
    float64_t sty, fy, dgy;
    float64_t fxm, dgxm, fym, dgym, fm, dgm;
    float64_t finit, ftest1, dginit, dgtest;
    float64_t width, prev_width;
    float64_t stmin, stmax;

    /* Check the input parameters for errors. */
    if (*stp <= 0.) {
        return LBFGSERR_INVALIDPARAMETERS;
    }

    /* Compute the initial gradient in the search direction. */
    dginit = SGVector<float64_t>::dot(g, s, n);

    /* Make sure that s points to a descent direction. */
    if (0 < dginit) {
        return LBFGSERR_INCREASEGRADIENT;
    }

    /* Initialize local variables. */
    brackt = 0;
    stage1 = 1;
    finit = *f;
    dgtest = param->ftol * dginit;
    width = param->max_step - param->min_step;
    prev_width = 2.0 * width;

    /*
        The variables stx, fx, dgx contain the values of the step,
        function, and directional derivative at the best step.
        The variables sty, fy, dgy contain the values of the step,
        function, and derivative at the other endpoint of
        the interval of uncertainty.
        The variables stp, f, dg contain the values of the step,
        function, and derivative at the current step.
    */
    stx = sty = 0.;
    fx = fy = finit;
    dgx = dgy = dginit;

    for (;;) {
        /*
            Set the minimum and maximum steps to correspond to the
            present interval of uncertainty.
        */
        if (brackt) {
            stmin = min2(stx, sty);
            stmax = max2(stx, sty);
        } else {
            stmin = stx;
            stmax = *stp + 4.0 * (*stp - stx);
        }

        /* Clip the step to the range [min_step, max_step]. */
        if (*stp < param->min_step) *stp = param->min_step;
        if (param->max_step < *stp) *stp = param->max_step;

        /*
            If an unusual termination is to occur, then let
            stp be the lowest point obtained so far.
        */
        if ((brackt && ((*stp <= stmin || stmax <= *stp) || param->max_linesearch <= count + 1 || uinfo != 0)) || (brackt && (stmax - stmin <= param->xtol * stmax))) {
            *stp = stx;
        }

        /*
            Compute the current value of x:
                x <- x + (*stp) * s.
        */
        std::copy(xp, xp+n, x);
        SGVector<float64_t>::add(x, 1, x, *stp, s, n);

        /* Evaluate the function and gradient values. */
        *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp);
        dg = SGVector<float64_t>::dot(g, s, n);

        ftest1 = finit + *stp * dgtest;
        ++count;

        /* Test for errors and convergence. */
        if (brackt && ((*stp <= stmin || stmax <= *stp) || uinfo != 0)) {
            /* Rounding errors prevent further progress. */
            return LBFGSERR_ROUNDING_ERROR;
        }
        if (*stp == param->max_step && *f <= ftest1 && dg <= dgtest) {
            /* The step is the maximum value. */
            return LBFGSERR_MAXIMUMSTEP;
        }
        if (*stp == param->min_step && (ftest1 < *f || dgtest <= dg)) {
            /* The step is the minimum value. */
            return LBFGSERR_MINIMUMSTEP;
        }
        if (brackt && (stmax - stmin) <= param->xtol * stmax) {
            /* Relative width of the interval of uncertainty is at most xtol. */
            return LBFGSERR_WIDTHTOOSMALL;
        }
        if (param->max_linesearch <= count) {
            /* Maximum number of iterations. */
            return LBFGSERR_MAXIMUMLINESEARCH;
        }
        if (*f <= ftest1 && fabs(dg) <= param->gtol * (-dginit)) {
            /* The sufficient decrease condition and the directional derivative condition hold. */
            return count;
        }

        /*
            In the first stage we seek a step for which the modified
            function has a nonpositive value and nonnegative derivative.
        */
        if (stage1 && *f <= ftest1 && min2(param->ftol, param->gtol) * dginit <= dg) {
            stage1 = 0;
        }

        /*
            A modified function is used to predict the step only if
            we have not obtained a step for which the modified
            function has a nonpositive function value and nonnegative
            derivative, and if a lower function value has been
            obtained but the decrease is not sufficient.
        */
        if (stage1 && ftest1 < *f && *f <= fx) {
            /* Define the modified function and derivative values. */
            fm = *f - *stp * dgtest;
            fxm = fx - stx * dgtest;
            fym = fy - sty * dgtest;
            dgm = dg - dgtest;
            dgxm = dgx - dgtest;
            dgym = dgy - dgtest;

            /*
                Call update_trial_interval() to update the interval of
                uncertainty and to compute the new step.
            */
            uinfo = update_trial_interval(
                &stx, &fxm, &dgxm,
                &sty, &fym, &dgym,
                stp, &fm, &dgm,
                stmin, stmax, &brackt
                );

            /* Reset the function and gradient values for f. */
            fx = fxm + stx * dgtest;
            fy = fym + sty * dgtest;
            dgx = dgxm + dgtest;
            dgy = dgym + dgtest;
        } else {
            /*
                Call update_trial_interval() to update the interval of
                uncertainty and to compute the new step.
            */
            uinfo = update_trial_interval(
                &stx, &fx, &dgx,
                &sty, &fy, &dgy,
                stp, f, &dg,
                stmin, stmax, &brackt
                );
        }

        /*
            Force a sufficient decrease in the interval of uncertainty.
        */
        if (brackt) {
            if (0.66 * prev_width <= fabs(sty - stx)) {
                *stp = stx + 0.5 * (sty - stx);
            }
            prev_width = width;
            width = fabs(sty - stx);
        }
    }

    return LBFGSERR_LOGICERROR;
}



/* Local variables shared by the minimizer macros below. */
#define USES_MINIMIZER \
    float64_t a, d, gamma, theta, p, q, r, s;

/*
    Find a minimizer of an interpolated cubic function, given the function
    values fu, fv and the derivatives du, dv at the points u and v.
*/
#define CUBIC_MINIMIZER(cm, u, fu, du, v, fv, dv) \
    d = (v) - (u); \
    theta = ((fu) - (fv)) * 3 / d + (du) + (dv); \
    p = fabs(theta); \
    q = fabs(du); \
    r = fabs(dv); \
    s = max3(p, q, r); \
    /* gamma = s*sqrt((theta/s)**2 - (du/s) * (dv/s)) */ \
    a = theta / s; \
    gamma = s * sqrt(a * a - ((du) / s) * ((dv) / s)); \
    if ((v) < (u)) gamma = -gamma; \
    p = gamma - (du) + theta; \
    q = gamma - (du) + gamma + (dv); \
    r = p / q; \
    (cm) = (u) + r * d;

/*
    Find a minimizer of an interpolated cubic function, clipping the result
    to the interval [xmin, xmax].
*/
#define CUBIC_MINIMIZER2(cm, u, fu, du, v, fv, dv, xmin, xmax) \
    d = (v) - (u); \
    theta = ((fu) - (fv)) * 3 / d + (du) + (dv); \
    p = fabs(theta); \
    q = fabs(du); \
    r = fabs(dv); \
    s = max3(p, q, r); \
    /* gamma = s*sqrt((theta/s)**2 - (du/s) * (dv/s)) */ \
    a = theta / s; \
    gamma = s * sqrt(max2(0, a * a - ((du) / s) * ((dv) / s))); \
    if ((u) < (v)) gamma = -gamma; \
    p = gamma - (dv) + theta; \
    q = gamma - (dv) + gamma + (du); \
    r = p / q; \
    if (r < 0. && gamma != 0.) { \
        (cm) = (v) - r * d; \
    } else if (a < 0) { \
        (cm) = (xmax); \
    } else { \
        (cm) = (xmin); \
    }

/*
    Find a minimizer of an interpolated quadratic function, given the
    function values fu, fv and the derivative du.
*/
#define QUARD_MINIMIZER(qm, u, fu, du, v, fv) \
    a = (v) - (u); \
    (qm) = (u) + (du) / (((fu) - (fv)) / a + (du)) / 2 * a;

/*
    Find a minimizer of an interpolated quadratic function, given the
    derivatives du, dv (the secant step).
*/
#define QUARD_MINIMIZER2(qm, u, du, v, dv) \
    a = (u) - (v); \
    (qm) = (v) + (dv) / ((dv) - (du)) * a;

#define fsigndiff(x, y) (*(x) * (*(y) / fabs(*(y))) < 0.)
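/*
    A quick sanity check (illustrative, not in the original source):
    on f(t) = (t - 1)^2 with u = 0, fu = 1, du = -2, v = 2, fv = 1,
    QUARD_MINIMIZER gives
        a  = 2,
        qm = 0 + (-2) / ((1 - 1) / 2 + (-2)) / 2 * 2 = 1,
    the exact minimizer. CUBIC_MINIMIZER returns the same point here
    (theta = 0, gamma = 2, r = 1/2, cm = 1), as it must whenever the
    interpolated data come from a true quadratic.
*/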
static int32_t update_trial_interval(
    float64_t *x,
    float64_t *fx,
    float64_t *dx,
    float64_t *y,
    float64_t *fy,
    float64_t *dy,
    float64_t *t,
    float64_t *ft,
    float64_t *dt,
    const float64_t tmin,
    const float64_t tmax,
    int32_t *brackt
    )
{
    int32_t bound;
    int32_t dsign = fsigndiff(dt, dx);
    float64_t mc;   /* minimizer of an interpolated cubic. */
    float64_t mq;   /* minimizer of an interpolated quadratic. */
    float64_t newt; /* new trial value. */
    USES_MINIMIZER; /* for CUBIC_MINIMIZER and QUARD_MINIMIZER. */

    /* Check the input parameters for errors. */
    if (*brackt) {
        if (*t <= min2(*x, *y) || max2(*x, *y) <= *t) {
            /* The trial value t is out of the interval. */
            return LBFGSERR_OUTOFINTERVAL;
        }
        if (0. <= *dx * (*t - *x)) {
            /* The function must decrease from x. */
            return LBFGSERR_INCREASEGRADIENT;
        }
        if (tmax < tmin) {
            /* Incorrect tmin and tmax specified. */
            return LBFGSERR_INCORRECT_TMINMAX;
        }
    }

    /*
        Trial value selection.
    */
    if (*fx < *ft) {
        /*
            Case 1: a higher function value.
            The minimum is bracketed. If the cubic minimizer is closer
            to x than the quadratic one, the cubic one is taken, else
            the average of the minimizers is taken.
        */
        *brackt = 1;
        bound = 1;
        CUBIC_MINIMIZER(mc, *x, *fx, *dx, *t, *ft, *dt);
        QUARD_MINIMIZER(mq, *x, *fx, *dx, *t, *ft);
        if (fabs(mc - *x) < fabs(mq - *x)) {
            newt = mc;
        } else {
            newt = mc + 0.5 * (mq - mc);
        }
    } else if (dsign) {
        /*
            Case 2: a lower function value and derivatives of
            opposite sign. The minimum is bracketed. If the cubic
            minimizer is closer to x than the quadratic (secant) one,
            the cubic one is taken, else the quadratic one is taken.
        */
        *brackt = 1;
        bound = 0;
        CUBIC_MINIMIZER(mc, *x, *fx, *dx, *t, *ft, *dt);
        QUARD_MINIMIZER2(mq, *x, *dx, *t, *dt);
        if (fabs(mc - *t) > fabs(mq - *t)) {
            newt = mc;
        } else {
            newt = mq;
        }
    } else if (fabs(*dt) < fabs(*dx)) {
        /*
            Case 3: a lower function value, derivatives of the
            same sign, and the magnitude of the derivative decreases.
            The cubic minimizer is only used if the cubic tends to
            infinity in the direction of the minimizer or if the minimum
            of the cubic is beyond t. Otherwise the cubic minimizer is
            defined to be either tmin or tmax. The quadratic (secant)
            minimizer is also computed and if the minimum is bracketed
            then the minimizer closest to x is taken, else the one
            farthest away is taken.
        */
        bound = 1;
        CUBIC_MINIMIZER2(mc, *x, *fx, *dx, *t, *ft, *dt, tmin, tmax);
        QUARD_MINIMIZER2(mq, *x, *dx, *t, *dt);
        if (*brackt) {
            if (fabs(*t - mc) < fabs(*t - mq)) {
                newt = mc;
            } else {
                newt = mq;
            }
        } else {
            if (fabs(*t - mc) > fabs(*t - mq)) {
                newt = mc;
            } else {
                newt = mq;
            }
        }
    } else {
        /*
            Case 4: a lower function value, derivatives of the
            same sign, and the magnitude of the derivative does
            not decrease. If the minimum is not bracketed, the step
            is either tmin or tmax, else the cubic minimizer is taken.
        */
        bound = 0;
        if (*brackt) {
            CUBIC_MINIMIZER(newt, *t, *ft, *dt, *y, *fy, *dy);
        } else if (*x < *t) {
            newt = tmax;
        } else {
            newt = tmin;
        }
    }

    /*
        Update the interval of uncertainty. This update does not
        depend on the new step or the case analysis above.

        - Case a: if f(x) < f(t),
            x <- x, y <- t.
        - Case b: if f(t) <= f(x) && f'(t)*f'(x) > 0,
            x <- t, y <- y.
        - Case c: if f(t) <= f(x) && f'(t)*f'(x) < 0,
            x <- t, y <- x.
    */
    if (*fx < *ft) {
        /* Case a */
        *y = *t;
        *fy = *ft;
        *dy = *dt;
    } else {
        /* Case c */
        if (dsign) {
            *y = *x;
            *fy = *fx;
            *dy = *dx;
        }
        /* Cases b and c */
        *x = *t;
        *fx = *ft;
        *dx = *dt;
    }

    /* Clip the new trial value to [tmin, tmax]. */
    if (tmax < newt) newt = tmax;
    if (newt < tmin) newt = tmin;

    /*
        Redefine the new trial value if it is close to the upper bound
        of the interval.
    */
    if (*brackt && bound) {
        mq = *x + 0.66 * (*y - *x);
        if (*x < *y) {
            if (mq < newt) newt = mq;
        } else {
            if (newt < mq) newt = mq;
        }
    }

    /* Return the new trial value. */
    *t = newt;
    return 0;
}



static float64_t owlqn_x1norm(
    const float64_t* x,
    const int32_t start,
    const int32_t n
    )
{
    int32_t i;
    float64_t norm = 0.;

    for (i = start;i < n;++i) {
        norm += fabs(x[i]);
    }

    return norm;
}

static void owlqn_pseudo_gradient(
    float64_t* pg,
    const float64_t* x,
    const float64_t* g,
    const int32_t n,
    const float64_t c,
    const int32_t start,
    const int32_t end
    )
{
    int32_t i;

    /* Use the plain gradient for variables before the L1-regularized range. */
    for (i = 0;i < start;++i) {
        pg[i] = g[i];
    }

    /* Compute the pseudo-gradients. */
    for (i = start;i < end;++i) {
        if (x[i] < 0.) {
            /* Differentiable. */
            pg[i] = g[i] - c;
        } else if (0. < x[i]) {
            /* Differentiable. */
            pg[i] = g[i] + c;
        } else {
            if (g[i] < -c) {
                /* Take the right partial derivative. */
                pg[i] = g[i] + c;
            } else if (c < g[i]) {
                /* Take the left partial derivative. */
                pg[i] = g[i] - c;
            } else {
                pg[i] = 0.;
            }
        }
    }

    /* Use the plain gradient for variables after the L1-regularized range. */
    for (i = end;i < n;++i) {
        pg[i] = g[i];
    }
}

static void owlqn_project(
    float64_t* d,
    const float64_t* sign,
    const int32_t start,
    const int32_t end
    )
{
    int32_t i;

    for (i = start;i < end;++i) {
        if (d[i] * sign[i] <= 0) {
            d[i] = 0;
        }
    }
}

}
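/*
    An illustrative configuration sketch for the OWL-QN path (not part of
    the original library source; names are from this translation unit).
    L1 regularization is switched on by a positive orthantwise_c, and the
    checks in lbfgs() accept only the plain backtracking line search in
    that mode:

        lbfgs_parameter_t param;
        lbfgs_parameter_init(&param);
        param.orthantwise_c = 1.0;   // weight c of the L1 term c * |x|_1
        param.linesearch = LBFGS_LINESEARCH_BACKTRACKING;
        // orthantwise_start = 0 and orthantwise_end = -1 (the defaults)
        // regularize the whole vector; lbfgs() maps end = -1 to n.
*/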