/*----------------------------------------------------------------------------+
| |
| This program is a demonstration of Q-learning. To the learning |
| system, its environment is a black box from which it has several |
| signal lines and a reinforcement line. Its task is to learn to give |
| responses which maximize the scalar signals on its reinforcement line. |
| |
| The system must learn to correlate its state in the environment |
| with the future reinforcements it will see for each of its candidate |
| actions. Therefore, the system must determine its state from the |
| signal values it gets from the environment. For the current |
| demonstration, the system does no feature extraction; instead, it |
| determines its state according to a built-in look-up table. This |
| table is the "boxes" state representation of Barto, Sutton, and |
| Anderson, described in their paper, "Neuronlike Adaptive Elements That |
| Solve Difficult Learning Control Problems," IEEE Trans. Syst., Man, |
| Cybern., Vol. SMC-13, pp. 834-846, Sep.- Oct. 1983. |
| |
| The Q-value update is the following, if the system takes action a from |
| state s at time t, and arrives at state ss with feedback r at time t+1 : |
| |
| Q(t+1, s, a) = Q(t, s, a) |
| + \alpha (r + \gamma max_{b}Q(t,ss, b) - Q(t, s, a)) |
| |
| To apply this system to different problems: |
| 1. Change the number of input parameters |
| 2. Replace comparisons of Q(state, 0) and Q(state, 1) with expressions |
| for choosing the maximum over i of Q(state, i); a sketch of such an |
| argmax helper appears after get_action() below |
| 3. Rewrite the state identification routine. Ideally, this should not |
| be a set of pre-defined "boxes," but the controller should |
| do feature extraction on-line, as it learns. |
| |
+----------------------------------------------------------------------------*/
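/*
 * A quick numeric illustration of the update rule above (the numbers are
 * hypothetical, not taken from a run of this program): with ALPHA = 0.5 and
 * GAMMA = 0.999, suppose Q(s, a) = 0.2, the reinforcement on the next step
 * is r = 0.0, and the best value in the successor state is
 * max_b Q(ss, b) = 0.4.  The one-step target is 0.0 + 0.999 * 0.4 = 0.3996,
 * so
 *
 *     Q(s, a) <- 0.2 + 0.5 * (0.3996 - 0.2) = 0.2998
 *
 * i.e. Q(s, a) moves halfway toward the target, as ALPHA dictates.
 */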
/*
$Log: q.c,v $
* Revision 1.1.1.1 1995/02/10 21:49:24 finton
* Corrected bug in setting predicted_value for failure state,
* where cur_state = -1. The predicted value should be 0.0.
*
* Revision 1.1 1994/11/17 19:49:17 finton
* Initial revision
*
*/
#include <stdio.h>
#include <stdlib.h>   /* for RAND_MAX, random(), srandom() */
#define sqr(x) ( (x) * (x) )
#define W_INIT 0.0
#define NUM_BOXES 162
extern int RND_SEED;
static float ALPHA = 0.5; /* learning rate parameter */
static float BETA = 0.0; /* magnitude of noise added to choice */
static float GAMMA = 0.999; /* discount factor for future reinf */
static float q_val[NUM_BOXES][2]; /* state-action values */
static int first_time = 1;
static int cur_action, prev_action;
static int cur_state, prev_state;
static char rcs_controller_id[] = "$Id: q.c,v 1.1.1.1 1995/02/10 21:49:24 finton Exp $";
/*----------------------------------------------------------------------------+
| get_action : returns either 0 or 1 as action choice; |
| accepts five "black box" inputs, the first four of which are |
| system variables, and the last is a reinforcement signal. |
| Note that reinf is the result of the previous state and |
| action. |
+----------------------------------------------------------------------------*/
int get_action(float x, /* system variables == state information */
float x_dot,
float theta,
float theta_dot,
float reinf) /* reinforcement signal */
{
int i,j;
float predicted_value; /* max_{b} Q(t, ss, b) */
double rnd(double, double);
int get_box(float x, float x_dot, float theta, float theta_dot); /*state*/
void srandom(unsigned int);   /* seed the system random number generator */
void reset_controller(void); /* reset state/action before new trial */
if (first_time) {
first_time = 0;
reset_controller(); /* set state and action to null values */
for (i = 0; i < NUM_BOXES; i++)
for (j = 0; j < 2; j++)
q_val[i][j] = W_INIT;
printf("Controller: %s\n", rcs_controller_id);
printf("... setting learning parameter ALPHA to %.4f.\n", ALPHA);
printf("... setting noise parameter BETA to %.4f.\n", BETA);
printf("... setting discount parameter GAMMA to %.4f.\n", GAMMA);
printf("... random RND_SEED is %d.\n", RND_SEED);
srandom(RND_SEED); /* initialize random number generator */
}
prev_state = cur_state;
prev_action = cur_action;
cur_state = get_box(x, x_dot, theta, theta_dot);
if (prev_action != -1) /* Update, but not for first action in trial */
{
if (cur_state == -1)
/* failure state has Q-value of 0, since the value won't be updated */
predicted_value = 0.0;
else if (q_val[cur_state][0] <= q_val[cur_state][1])
predicted_value = q_val[cur_state][1];
else
predicted_value = q_val[cur_state][0];
q_val[prev_state][prev_action]
+= ALPHA * (reinf + GAMMA * predicted_value
- q_val[prev_state][prev_action]);
}
    /* Now determine the best action; in the failure state (cur_state == -1)
       the choice is irrelevant and indexing q_val would be out of bounds,
       so just return action 0. */
    if (cur_state == -1)
        cur_action = 0;
    else if (q_val[cur_state][0] + rnd(-BETA, BETA) <= q_val[cur_state][1])
        cur_action = 1;
    else
        cur_action = 0;
return cur_action;
}
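/* best_action: a minimal sketch of item 2 in the header comment, i.e. how the
 * two-way comparisons in get_action() could generalize to an argmax over an
 * arbitrary number of actions.  It is illustrative only: the controller does
 * not call it, and NUM_ACTIONS is a hypothetical constant (set to 2 here so
 * the sketch matches the existing q_val array).
 */
#define NUM_ACTIONS 2

int best_action(int state)
{
    int a, best = 0;

    for (a = 1; a < NUM_ACTIONS; a++)
        if (q_val[state][a] > q_val[state][best])
            best = a;
    return best;
}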
double rnd(double low_bound, double hi_bound)
/* rnd scales the output of the system random function to the
* range [low_bound, hi_bound].
*/
{
long random(void); /* system random number generator */
    double highest = (double) RAND_MAX;
    /* random() returns values in [0, 2^31 - 1]; on most systems RAND_MAX
       equals this, but substitute ((1UL << 31) - 1) if it does not. */
return (random() / highest) * (hi_bound - low_bound) + low_bound;
}
void reset_controller(void)
{
cur_state = prev_state = 0;
cur_action = prev_action = -1; /* "null" action value */
}
/* The following routine was written by Rich Sutton and Chuck Anderson,
with translation from FORTRAN to C by Claude Sammut */
/*----------------------------------------------------------------------
get_box: Given the current state, returns a number from 0 to 161
designating the region of the state space encompassing the current state.
Returns a value of -1 if a failure state is encountered.
----------------------------------------------------------------------*/
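/* For example (hypothetical state values): x = 0.0 m, x_dot = 0.0 m/s,
 * theta = 0.02 rad (about 1.1 degrees), theta_dot = 0.0 rad/s falls in
 * box = 1 + 3 + 36 + 54 = 94.
 */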
#define one_degree 0.0174532 /* 2pi/360 */
#define six_degrees 0.1047192
#define twelve_degrees 0.2094384
#define fifty_degrees 0.87266
int get_box(float x, float x_dot, float theta, float theta_dot)
{
int box=0;
if (x < -2.4 ||
x > 2.4 ||
theta < -twelve_degrees ||
theta > twelve_degrees) return(-1); /* to signal failure */
if (x < -0.8) box = 0;
else if (x < 0.8) box = 1;
else box = 2;
if (x_dot < -0.5) ;
else if (x_dot < 0.5) box += 3;
else box += 6;
if (theta < -six_degrees) ;
else if (theta < -one_degree) box += 9;
else if (theta < 0) box += 18;
else if (theta < one_degree) box += 27;
else if (theta < six_degrees) box += 36;
else box += 45;
if (theta_dot < -fifty_degrees) ;
else if (theta_dot < fifty_degrees) box += 54;
else box += 108;
return(box);
}
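/* A hedged sketch of how a simulation might drive this controller (not part
 * of the original program).  cart_pole_step() is a hypothetical stand-in for
 * the simulator's dynamics routine, and the reinforcement scheme (-1 on
 * failure, 0 otherwise) is one common choice for this task; the real driver
 * lives in a separate file.  The block is left disabled so this file still
 * compiles on its own.
 */
#if 0
extern void cart_pole_step(int action, float *x, float *x_dot,
                           float *theta, float *theta_dot);

void run_one_trial(void)
{
    float x = 0.0f, x_dot = 0.0f, theta = 0.0f, theta_dot = 0.0f;
    float reinf = 0.0f;      /* reports on the previous state and action */
    int action;

    while (get_box(x, x_dot, theta, theta_dot) != -1) {
        action = get_action(x, x_dot, theta, theta_dot, reinf);
        cart_pole_step(action, &x, &x_dot, &theta, &theta_dot);
        reinf = (get_box(x, x_dot, theta, theta_dot) == -1) ? -1.0f : 0.0f;
    }
    get_action(x, x_dot, theta, theta_dot, reinf);  /* deliver final reinf */
    reset_controller();
}
#endif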