00001 00002 /* 00003 DESCRIPTION: 00004 00005 00006 The class 'Sym' in this package provides a reference counted, hashed tree structure that can be used in genetic programming. 00007 The hash table behind the scenes makes sure that every subtree in the application is stored only once. 00008 This has a couple of advantages: 00009 00010 o Memory: all subtrees are stored only once 00011 o Comparison: comparison for equality for two subtrees boils down to a pointer comparison 00012 o Overview: by accessing the hashtable, you get an instant overview of the state of the population 00013 00014 00015 The disadvantage of this method is the constant time overhead for computing hashes. In practice, 00016 it seems to be fast enough. 00017 00018 00019 ===== How to Use this ========= 00020 00021 In essence, the Sym data structure contains two important pieces of data, 00022 the 'token' (of type token_t = int) and the children, a vector of Sym (called SymVec). 00023 The token should contain all information to be able to figure out which 00024 function/terminal is represented by the node in the tree. By retrieving this token value 00025 and the SymVec it is possible to write recursive traversal routines for evaluation, printing, 00026 etc. 00027 00028 */ 00029 00030 #include <iostream> 00031 #include "Sym.h" 00032 00033 using namespace std; 00034 00035 00036 /* 00037 * Suppose token value '0' designates our terminal, and token value '1' designates a binary function. 00038 * Later on a ternary function will be used as well, designated with token value '2' 00039 * The function below will create a tree of size three 00040 */ 00041 Sym test1() { 00042 00043 SymVec children; 00044 children.push_back( Sym(0) ); // push_back is a member from std::vector, SymVec is derived from std::vector 00045 children.push_back( Sym(0) ); 00046 00047 Sym tree = Sym(token_t(1), children); // creates the tree 00048 00049 /* Done, now print some information about the node */ 00050 00051 cout << "Size = " << tree.size() << endl; // prints 3 00052 cout << "Depth = " << tree.depth() << endl; // prints 2 00053 cout << "Refcount = " << tree.refcount() << endl; // prints 1 00054 00055 Sym tree2 = tree; // make a copy (this only changes refcount) 00056 00057 cout << "Refcount now = " << tree.refcount() << endl; // print 2 00058 00059 return tree; // tree2 will be deleted and reference count returns to 1 00060 } 00061 00062 /* To actually use the tree, evaluate it, the following simple recursive function 00063 * can be used 00064 */ 00065 00066 int eval(const Sym& sym) { 00067 if (sym.token() == 0) { // it's a terminal in this example 00068 return 1; 00069 } 00070 // else it's the function 00071 const SymVec& children = sym.args(); // get the children out, children.size() is the arity 00072 00073 // let's assume that we've also got a ternary function designated by token '2' 00074 00075 if (sym.token() == token_t(1)) 00076 return eval(children[0]) + eval(children[1]); // evaluate 00077 00078 return eval(children[0]) + eval(children[1]) * eval(children[2]); // a ternary function 00079 } 00080 00081 /* Note that you simply use the stored token that was defined above. Simply checking the size of SymVec in 00082 * this particular example could have sufficed, but it's instructive to use the tokens. 00083 * 00084 * And to test this: 00085 */ 00086 00087 void test_eval() { 00088 00089 Sym tree = test1(); 00090 00091 cout << "Evaluating tree1 returns " << eval(tree) << endl; 00092 } 00093 00094 /* Writing initialization functions. 00095 * 00096 * As the Sym class is recursive in nature, initialization can simply be done using 00097 * recursive routines as above. As an example, the following code does 'full' initialization. 00098 */ 00099 00100 Sym init_full(int depth_left) { 00101 if (depth_left == 0) return Sym(0); // create terminal 00102 // else create either a binary or a ternary function 00103 00104 depth_left--; 00105 00106 if (rand() % 2 == 0) { // create binary 00107 SymVec vec(2); 00108 vec[0] = init_full(depth_left); 00109 vec[1] = init_full(depth_left); 00110 00111 return Sym(token_t(1), vec); 00112 00113 } else { // create ternary tree 00114 SymVec vec(3); 00115 vec[0] = init_full(depth_left); 00116 vec[1] = init_full(depth_left); 00117 vec[2] = init_full(depth_left); 00118 00119 return Sym(token_t(2), vec); // token value 2 designates a ternary now, even though the arity can simply be read from the size of the 'SymVec' 00120 } 00121 00122 } 00123 00124 00125 /* Examining the hash table. 00126 * 00127 * The hash table is a static member of the Sym class, but can be obtained and inspected 00128 * at any point during the run. The hash table follows the SGI implementation of hashmap (and effectively 00129 * uses it in gcc). An example: 00130 */ 00131 00132 void inspect_hashtable() { 00133 SymMap& dag = Sym::get_dag(); // get the hashmap 00134 unsigned i = 0; 00135 for (SymMap::iterator it = dag.begin(); it != dag.end(); ++it) { 00136 Sym node(it); // initialize a 'sym' with the iterator 00137 00138 cout << "Node " << i++ << " size " << node.size(); 00139 cout << " refcount " << node.refcount()-1; // -1: note that by creating the Sym above the refcount is increased 00140 cout << " depth " << node.depth(); 00141 cout << '\n'; 00142 } 00143 00144 } 00145 00146 /* The above code effectively examines all distinct subtrees in use in the application and prints some stats for the node */ 00147 00148 /* Manipulating trees 00149 * 00150 * The Sym class is set up in such a way that you cannot change a Sym, so how do you perform crossover and mutation? 00151 * 00152 * Simple, you create new syms. The Sym class supports two functions to make this easier: 'get_subtree' and 'insert_subtree'. 00153 * These traverse the tree by index, where 0 designates the root and other values are indexed depth first. 00154 */ 00155 00156 Sym subtree_xover(Sym a, Sym b) { 00157 00158 Sym to_insert = get_subtree(a, rand() % a.size() ); // select random subtree, will crash if too high a value is given 00159 00160 /* 'insert' it into b. This will not really insert, it will however create a new sym, 00161 * equal to 'b' but with a's subtree inserted at the designated spot. */ 00162 return insert_subtree(b, rand() % b.size(), to_insert); 00163 00164 } 00165 00166 /* Tying it together, we can create a simple genetic programming system. Mutation is not implemented here, 00167 * but would be easy enough to add by using recursion and/or 'set'. */ 00168 00169 void run_gp() { 00170 00171 int ngens = 50; 00172 int popsize = 1000; 00173 00174 cout << "Starting running " << popsize << " individuals for " << ngens << " generations." << endl; 00175 00176 vector<Sym> pop(popsize); 00177 00178 // init population 00179 for (unsigned i = 0; i < pop.size(); ++i) { 00180 pop[i] = init_full(5); 00181 } 00182 00183 double best = 0.0; 00184 00185 // do a very simple steady state tournament 00186 for (unsigned gen = 0; gen < ngens * pop.size(); ++gen) { 00187 int sel1 = rand()% pop.size(); 00188 int sel2 = rand() % pop.size(); 00189 int sel3 = rand() % pop.size(); 00190 00191 double ev1 = eval(pop[sel1]); 00192 double ev3 = eval(pop[sel3]); 00193 00194 double bst = max(ev1,ev3); 00195 if (bst > best) { 00196 best = bst; 00197 } 00198 00199 if (ev3 > ev1) { 00200 sel1 = sel3; // selection pressure 00201 } 00202 00203 Sym child = subtree_xover(pop[sel1], pop[sel2]); 00204 00205 // Check for uniqueness 00206 if (child.refcount() == 1) pop[ rand() % pop.size() ] = child; 00207 } 00208 00209 // and at the end: 00210 00211 inspect_hashtable(); 00212 00213 // and also count number of nodes in the population 00214 int sz = 0; 00215 for (unsigned i = 0; i < pop.size(); ++i) { sz += pop[i].size(); } 00216 cout << "Number of distinct nodes " << Sym::get_dag().size() << endl; 00217 cout << "Nodes in population " << sz << endl; 00218 cout << "ratio " << double(Sym::get_dag().size())/sz << endl; 00219 cout << "Best fitness " << best << endl; 00220 00221 } 00222 00223 /* One extra mechanism is supported to add annotations to nodes. Something derived from 00224 * 'UniqueNodeStats' can be used to attach new information to nodes. For this to function, 00225 * we need to supply a 'factory' function that creates these node-stats; attach this function to the 00226 * Sym class, so that it gets called whenever a new node is created. The constructors of the Sym class 00227 * take care of this. 00228 * 00229 * IMPORTANT: 00230 * in a realistic application, the factory function needs to be set BEFORE any Syms are created 00231 * Mixing Syms creating with and without the factory can lead to unexpected results 00232 * 00233 * First we derive some structure from UniqueNodeStats: */ 00234 00235 struct MyNodeStats : public UniqueNodeStats { 00236 00237 int sumsize; 00238 00239 ~MyNodeStats() { cout << "MyNodeStats::~MyNodeStats, sumsize = " << sumsize << endl; } 00240 }; 00241 00242 /* then define the factory function. It will get a Sym, which is just created. */ 00243 UniqueNodeStats* create_stats(const Sym& sym) { 00244 MyNodeStats* stats = new MyNodeStats; // Sym will take care of memory management 00245 00246 int sumsize = sym.size(); 00247 for (unsigned i = 0; i < sym.args().size(); ++i) { 00248 // retrieve the extra node stats of the child 00249 UniqueNodeStats* unique_stats = sym.args()[i].extra_stats(); // extra_stats retrieves the stats 00250 MyNodeStats* child_stats = static_cast<MyNodeStats*>(unique_stats); // cast it to the right struct 00251 sumsize += child_stats->sumsize; 00252 } 00253 00254 stats->sumsize = sumsize; 00255 return stats; // now it will get attached to the node and deleted when its reference count goes to zero 00256 } 00257 00258 void test_node_stats() { 00259 00260 if (Sym::get_dag().size() != 0) { 00261 cerr << "Cannot mix nodes with and without factory functions" << endl; 00262 exit(1); 00263 } 00264 00265 /* Very Important: attach the factory function to the Sym class */ 00266 Sym::set_factory_function(create_stats); 00267 00268 Sym tree = init_full(5); // create a tree 00269 00270 // get extra node stats out 00271 MyNodeStats* stats = static_cast<MyNodeStats*>( tree.extra_stats() ); 00272 00273 cout << "Size = " << tree.size() << " SumSize = " << stats->sumsize << endl; 00274 00275 Sym::clear_factory_function(); // reset 00276 } 00277 00278 00279 /* And run the code above */ 00280 00281 int main() { 00282 srand(time(0)); 00283 cout << "********** TEST EVALUATION **************\n"; 00284 test_eval(); 00285 cout << "********** TEST ALGORITHM ***************\n"; 00286 run_gp(); 00287 00288 cout << "********** TEST FACTORY ****************\n"; 00289 test_node_stats(); // can work because there are no live nodes 00290 00291 } 00292 00293 /* ********** Member function reference: ******************** 00294 * 00295 * Sym() The default constructor will create an undefined node (no token and no children), check for empty() to see if a node is undefined 00296 * 00297 * Sym(token_t) Create a terminal 00298 * 00299 * Sym(token_t, const SymVec&) 00300 * Create a node with token and SymVec as the children 00301 * 00302 * Sym(SymIterator it) 00303 * Create a sym from an iterator (taken from the hashtable directly, or from Sym::iterator) 00304 * 00305 * dtor, copy-ctor and assignment 00306 * 00307 * UniqueNodeStats* extra_stats() 00308 * Returns an UniqueNodeStats pointer (= 0 if no factory is defined) 00309 * 00310 * 00311 * int hashcode() returns the hashcode for the node 00312 * 00313 * int refcount() returns the reference count for the node 00314 * 00315 * bool operator== checks for equality (note that this is a pointer compare, really really fast) 00316 * 00317 * bool empty() returns whether the node is undefined, i.e. created through the default ctor 00318 * 00319 * int arity() shorthand for sym.args().size() 00320 * 00321 * token_t token() return identifying token for the node 00322 * 00323 * const SymVec& args() 00324 * returns the children of the node (in a vector<Sym>) 00325 * 00326 * unsigned size() returns the size, i.e., number of nodes 00327 * 00328 * unsigned depth() returns the depth 00329 * 00330 * iterator() returns the pointer to the node in the hashtable 00331 * 00332 * 00333 ********** Static functions: ******************** 00334 * 00335 * 00336 * 00337 * SymMap& get_dag() returns the hash table containing all nodes. This should only be used for inspection, 00338 * even though the dag itself is not const. This to enable the use of the ctor Sym(SymIterator) to inspect 00339 * using the Sym interface (rather than the hash table interface). This does allow you to make destructive 00340 * changes to the class, so use with care 00341 * 00342 * set_factory_function( UniqueNodeStats (*)(const Sym&) ) 00343 * Set the factory function 00344 * 00345 * clear_factory_function() 00346 * Clears the factory function, allocated UniqueNodeStats will still be deleted, but no new ones will be created. 00347 * 00348 ********** Utility Functions ******************** 00349 * 00350 * Sym get_subtree(const Sym& org, int i) 00351 * Retreive the i-th subtree from the Sym. Standard depth first ordering, where root has index 0 and the 00352 * rightmost terminal has index sym.size()-1 00353 * 00354 * Sym insert_subtree(const Sym& org, int i, const Sym& subtree) 00355 * Returns a Sym that is equal to 'org', for which the i-th subtree (same ordering as get_subtree) is replaced 00356 * by the third argument subtree. 00357 * 00358 * Sym next(const Sym&) 00359 * Returns the successor of the argument sym from the hashtable with wrap around. This is implemented just because 00360 * it can be done. It may be an interesting way to mutate... 00361 * 00362 * */ 00363 00364