optimizers.cc

Go to the documentation of this file.
00001 /* MLPACK 0.2
00002  *
00003  * Copyright (c) 2008, 2009 Alexander Gray,
00004  *                          Garry Boyer,
00005  *                          Ryan Riegel,
00006  *                          Nikolaos Vasiloglou,
00007  *                          Dongryeol Lee,
00008  *                          Chip Mappus, 
00009  *                          Nishant Mehta,
00010  *                          Hua Ouyang,
00011  *                          Parikshit Ram,
00012  *                          Long Tran,
00013  *                          Wee Chin Wong
00014  *
00015  * Copyright (c) 2008, 2009 Georgia Institute of Technology
00016  *
00017  * This program is free software; you can redistribute it and/or
00018  * modify it under the terms of the GNU General Public License as
00019  * published by the Free Software Foundation; either version 2 of the
00020  * License, or (at your option) any later version.
00021  *
00022  * This program is distributed in the hope that it will be useful, but
00023  * WITHOUT ANY WARRANTY; without even the implied warranty of
00024  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00025  * General Public License for more details.
00026  *
00027  * You should have received a copy of the GNU General Public License
00028  * along with this program; if not, write to the Free Software
00029  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
00030  * 02110-1301, USA.
00031  */
00040 #include "optimizers.h"
00041 
00042 void NelderMead::Eval(double **pts) {
00043 
00044   index_t dim = dimension(), num_func_eval;
00045   index_t i, j, ihi, ilo, inhi,mpts = dim + 1;
00046   double sum, swap, *psum;
00047   long double swap_y, rtol, ytry, ysave, TINY = 1.0e-10;
00048   long double *y;
00049   Vector param_passed;
00050   long double tol = fx_param_double(opt_module_,"tolerance", 1.0e-5);
00051   index_t NMAX = fx_param_int(opt_module_, "MAX_FUNC_EVAL", 50000);
00052 
00053   param_passed.Init(dim);
00054   psum = (double*)malloc(dim * sizeof(double));
00055   num_func_eval = 0;
00056   y = (long double*)malloc(mpts*sizeof(long double));
00057   for(i = 0; i < mpts; i++) {
00058     param_passed.CopyValues(pts[i]);
00059     y[i] = (*func_ptr_)(param_passed, data());
00060   }
00061   
00062   for(;;) {
00063     ilo = 0;
00064     ihi = y[0] > y[1] ? (inhi = 1,0) : (inhi = 0,1);
00065     for( i = 0; i < mpts; i++ ) {
00066       if(y[i] <= y[ilo]) ilo = i;
00067       if(y[i] > y[ihi]) {
00068         inhi = ihi;
00069         ihi = i;
00070       }
00071       else if((y[i] > y[inhi])&&(i != ihi)) inhi = i;
00072     }
00073                 
00074     rtol = 2.0 * fabs(y[ihi] - y[ilo]) / ( fabs(y[ihi]) + fabs(y[ilo]) + TINY ) ;
00075     if(rtol < tol) {
00076       swap_y = y[0];
00077       y[0] = y[ilo];
00078       y[ilo] = swap_y;
00079       for( i = 0; i < dim; i++ ) {
00080         swap = pts[0][i];
00081         pts[0][i] =  pts[ilo][i] ;
00082         pts[ilo][i] = swap;
00083       }
00084       break;
00085     }
00086     if(num_func_eval > NMAX){
00087       NOTIFY("Maximum number of function evaluations exceeded");
00088       break;
00089     }
00090     num_func_eval += 2;
00091                 
00092     // Beginning a new iteration. 
00093     // Extrapolating by a factor of -1.0 through the face of the simplex
00094     // across from the high point, i.e, reflect the simplex from the high point
00095     for( j = 0 ; j < dim ; j++ ){
00096       sum = 0.0;
00097       for( i = 0 ; i < mpts ; i++ ) 
00098         if (i != ihi)
00099           sum += pts[i][j];
00100       
00101       psum[j] = sum / dim;
00102     }
00103 
00104     ytry = ModSimplex_(pts, y, psum, ihi, -1.0);
00105     if( ytry <= y[ilo] ) {      
00106       // result better than best point 
00107       // so additional extrapolation by a factor of 2
00108       ytry = ModSimplex_(pts, y, psum, ihi, 2.0);
00109     }
00110     else if( ytry >= y[ihi] ) { 
00111       // result worse than the worst point 
00112       // so there is a lower intermediate point, 
00113       // i.e., do a one dimensional contraction
00114       ysave = y[ihi];
00115 
00116       ytry = ModSimplex_(pts, y, psum, ihi, 0.5);
00117       if( ytry > y[ihi] ) { 
00118         // Can't get rid of the high point, 
00119         // try to contract around the best point
00120         for( i = 0; i < mpts; i++ ) {
00121           if( i != ilo ) {
00122             for( j = 0; j < dim; j++ ) {
00123               pts[i][j] = psum[j] = 0.5 * ( pts[i][j] + pts[ilo][j] );
00124             }
00125             param_passed.CopyValues(psum);
00126             y[i] = (*func_ptr_)(param_passed, data());
00127           }
00128         }
00129         num_func_eval += dim;
00130         for( j = 0 ; j < dim ; j++ ){
00131           sum = 0.0;
00132           for( i = 0 ; i < mpts ; i++ )
00133             if (i != ihi)
00134               sum += pts[i][j];
00135           psum[j] = sum / dim;
00136         }
00137       }
00138     }
00139     else --num_func_eval;
00140   }
00141   fx_result_int(opt_module_, "func_evals", num_func_eval);
00142   return;
00143 }
00144 
00145 long double NelderMead::ModSimplex_(double **pts, long double *y,
00146                                     double *psum, index_t ihi,
00147                                     float fac) {
00148 
00149   index_t j, dim = dimension();
00150   long double ytry;
00151   double *ptry;
00152   Vector param_passed;
00153         
00154   param_passed.Init(dim);
00155   ptry = (double*) malloc (dim * sizeof(double));
00156   for (j = 0; j < dim; j++) {
00157     ptry[j] = psum[j] * (1 - fac) + pts[ihi][j] * fac;
00158   }
00159   param_passed.CopyValues(ptry);
00160   ytry = (*func_ptr_)(param_passed, data());
00161   if (ytry < y[ihi]) {
00162     y[ihi] = ytry;
00163     for (j = 0; j < dim; j++) {
00164       pts[ihi][j] = ptry[j];
00165     }
00166   }
00167   return ytry;
00168 }
00169 
00170 void QuasiNewton::Eval(double *pt) {
00171 
00172   index_t n = dimension(), iters;
00173   index_t i, its, MAXIMUM_ITERATIONS = fx_param_int(opt_module_,"MAX_ITERS",200);
00174   long double temp_1, temp_2, temp_3, temp_4, f_previous, f_min, 
00175     maximum_step_length, sum = 0.0, sumdg, sumxi, temp, test;
00176   Vector dgrad, grad, hdgrad, xi;
00177   Vector pold, pnew;
00178   Matrix hessian;
00179   double EPSILON = fx_param_double(opt_module_, "EPSILON", 3.0e-8);
00180   double TOLERANCE = fx_param_double(opt_module_, "TOLERANCE", 1.0e-5);
00181   double MAX_STEP_SIZE = fx_param_double(opt_module_, "MAX_STEP_SIZE", 100.0);
00182   double g_tol = fx_param_double(opt_module_, "gtol", 1.0e-7);
00183 
00184   dgrad.Init(n);
00185   grad.Init(n);
00186   hdgrad.Init(n);
00187   hessian.Init(n,n);
00188   pnew.Init(n);
00189   xi.Init(n);
00190   pold.Copy(pt,n);
00191   f_previous = (*func_ptr_)(pold, data(), &grad);
00192   Vector tmp;
00193   tmp.Init(n);
00194   tmp.SetAll(1.0);
00195   hessian.SetDiagonal(tmp);
00196   la::ScaleOverwrite(-1.0, grad, &xi);
00197   
00198   sum = la::Dot(pold, pold);
00199   double fmax;
00200   if( sqrt(sum) > (float)n ) {
00201     fmax = sqrt(sum);
00202   }
00203   else { 
00204     fmax = (float)n;
00205   }
00206   maximum_step_length = MAX_STEP_SIZE*fmax;
00207 
00208   for(its = 0; its < MAXIMUM_ITERATIONS; its++) {
00209       
00210     dgrad.CopyValues(grad);
00211     LineSearch_(pold, f_previous, &grad, &xi,
00212                 &pnew, &f_min, maximum_step_length);
00213     f_previous = f_min;
00214     la::SubOverwrite(pold, pnew, &xi);
00215     pold.CopyValues(pnew);
00216 
00217     for(i = 0; i < n; i++) {
00218       pt[i] = pold.get(i);
00219     }
00220 
00221     test = 0.0;
00222     for(i = 0; i < n; i++){
00223       if(fabs(pold.get(i)) > 1.0) fmax = fabs(pold.get(i));
00224       else{ fmax = 1.0; }
00225       temp = fabs(xi.get(i)) / fmax;
00226       if(temp > test) test = temp;
00227     }
00228     if(test < TOLERANCE) {
00229       iters = its;
00230       fx_result_int(opt_module_, "iters", iters);
00231       return;
00232     }
00233 
00234     test = 0.0;
00235     if(f_min > 1.0) temp_1 = f_min;
00236     else{ temp_1 = 1.0; }
00237 
00238     for(i = 0; i < n; i++) {
00239       if(fabs(pold.get(i)) > 1.0) fmax = pold.get(i);
00240       else{ fmax = 1.0; }
00241 
00242       temp = fabs(grad.get(i))*fmax / temp_1;
00243       if(temp > test) test = temp;
00244     }
00245     if(test < g_tol) {
00246       iters = its;
00247       fx_result_int(opt_module_, "iters", iters);
00248       return;
00249     }
00250 
00251     la::SubFrom(grad, &dgrad);
00252     la::Scale(-1.0, &dgrad);
00253     la::MulOverwrite(hessian,dgrad, &hdgrad);
00254 
00255     temp_2 = la::Dot(dgrad, xi);
00256     temp_4 = la::Dot(dgrad, hdgrad);
00257     sumdg = la::Dot(dgrad, dgrad);
00258     sumxi = la::Dot(xi, xi);
00259 
00260     if (temp_2 > sqrt(EPSILON*sumdg*sumxi)) {
00261       temp_2 = 1.0 / temp_2;
00262       temp_3 = 1.0 / temp_4;
00263 
00264       la::ScaleOverwrite(temp_2, xi, &dgrad);
00265       la::AddExpert((-1.0*temp_3), hdgrad, &dgrad);
00266 
00267       Matrix co, ro, tmp;
00268       co.AliasColVector(xi);
00269       ro.AliasRowVector(xi);
00270       la::MulInit(co, ro, &tmp);
00271       la::AddExpert(temp_2, tmp, &hessian);
00272 
00273       co.Destruct();
00274       ro.Destruct();
00275       tmp.Destruct();
00276       co.AliasColVector(hdgrad);
00277       ro.AliasRowVector(hdgrad);
00278       la::MulInit(co, ro, &tmp);
00279       la::AddExpert((-1.0*temp_3), tmp, &hessian);
00280 
00281       co.Destruct();
00282       ro.Destruct();
00283       tmp.Destruct();
00284       co.AliasColVector(dgrad);
00285       ro.AliasRowVector(dgrad);
00286       la::MulInit(co, ro, &tmp);
00287       la::AddExpert(temp_4, tmp, &hessian);
00288     }
00289     la::MulOverwrite(hessian, grad, &xi);
00290     la::Scale((-1.0), &xi);
00291   }
00292   NOTIFY("Too many iterations in Quasi Newton\n");
00293 }
00294 
00295 void QuasiNewton::LineSearch_(Vector pold, long double fold,
00296                               Vector *grad, Vector *xi,
00297                               Vector *pnew, long double *f_min,
00298                               long double maximum_step_length) {
00299 
00300   index_t i, n = dimension();
00301   long double a, step_length, previous_step_length = 0.0, 
00302     minimum_step_length, b, disc, previous_f_value = 0.0,
00303     rhs1, rhs2, slope, sum, temp, test, temp_step_length,
00304     MIN_DECREASE = 1.0e-4, TOLERANCE = 1.0e-7;
00305     
00306   sum = la::Dot(*xi, *xi);
00307   sum = sqrt(sum);
00308   if(sum > maximum_step_length) {
00309     la::Scale((maximum_step_length/sum), xi);
00310   }
00311   slope = la::Dot(*grad, *xi);
00312   if(slope >= 0.0){
00313     return;
00314   }
00315   test = 0.0;
00316   for(i = 0; i < n; i++) {
00317     double fmax;
00318     fmax = (fabs(pold.get(i)) > 1.0 ? fabs(pold.get(i)) : 1.0);
00319     temp = fabs((*xi).get(i)) / fmax;
00320     if(temp > test) test = temp;
00321   }
00322 
00323   minimum_step_length = TOLERANCE/test;
00324   step_length = 1.0;
00325   for(;;) {
00326 
00327     pnew->CopyValues(pold);
00328     la::AddExpert(step_length, *xi, pnew);
00329 
00330     *f_min = (*func_ptr_)((*pnew), data(), grad);
00331     if(step_length < minimum_step_length) {
00332       pnew->CopyValues(pold);
00333       return;
00334     }
00335     else if( *f_min <= fold + MIN_DECREASE*step_length*slope) {
00336       return;
00337     }
00338     else {
00339       if (step_length == 1.0) {
00340         temp_step_length = -slope/(2.0*(*f_min - fold - slope));
00341       }
00342       else {
00343         rhs1 = *f_min - fold - step_length*slope;
00344         rhs2 = previous_f_value - fold - previous_step_length*slope;
00345         a = (rhs1 / (step_length*step_length) 
00346              - rhs2/(previous_step_length*previous_step_length))
00347           / (step_length-previous_step_length);
00348         b = (-previous_step_length*rhs1/(step_length*step_length)
00349              +step_length*rhs2/(previous_step_length*previous_step_length)) 
00350           / (step_length - previous_step_length);
00351         if(a == 0.0) {
00352           temp_step_length = -slope / (2.0*b);
00353         }
00354         else {
00355           disc = b*b - 3.0*a*slope;
00356           if(disc < 0.0) {
00357             temp_step_length = 0.5*step_length;
00358           }
00359           else if (b <= 0.0) {
00360             temp_step_length = (-b+sqrt(disc))/(3.0*a);
00361           }
00362           else {
00363             temp_step_length = -slope / (b+sqrt(disc));
00364           }
00365         }
00366         if(temp_step_length > 0.5*step_length) {
00367           temp_step_length = 0.5*step_length;
00368         }
00369       }
00370     }
00371     previous_step_length = step_length;
00372     previous_f_value = *f_min;
00373     step_length = (temp_step_length > 0.1*step_length 
00374                    ? temp_step_length : 0.1*step_length);
00375   }
00376 }