#| This code may be freely copied and used for educational or research purposes. All software written by Antoine CornuŽjols (using code from Kevin Knight (knight@cs.cmu.edu) and Rich Sutton). Comments, bugs, improvements to antoine@lri.fr ----------------------------------------------------------------------------|# #|---------------------------------------------------------------------------- Reinforcement Learning for game plays "RL-game.lisp" ----------------------------------------------------------------------------|# #|----------------------------------------------------------------------------- This file contains functions for doing game-playing with reinforcement learning. This program will play any game for which the following functions and variables are defined: (print-board b) (movegen pos player) (opposite player) (static pos player) (won? pos player) (drawn? pos) (deep-enough pos depth) (make-move pos player move) *start* These functions are implemented for tic-tac-toe in the file "tictactoe.lisp". ----------------------------------------------------------------------------|# #|----------------------------------------------------------------------------- Pour jouer : (init) ; rŽ-initialise les scores et utilitŽs V(s) (defparameter *max-depth* 1) ; ˆ modifier pour changer la profondeur ; d'exploration de alpha-beta (RL-game 0) ; jouera avec un adversaire alŽatoire (RL-game 1) ; jouera avec un adversaire humain (RL-game 2 t) ; jouera avec un alpha-beta sans affichage ; voir la fonction RL-game (dotimes (n 100) (RL-game 0 t)) ; Fait jouer 100 parties simulŽes au systme ; contre un adversaire alŽatoire et sans impression (get-score) ; permet de voir les score respectif des deux joueurs (setf score '(0 0)) ; rŽ-initialise les scores sans modifier le reste ----------------------------------------------------------------------------|# (defvar value-table) (defvar initial-state) (defvar alpha 0.1) (defparameter *epsilon* 100000) ; rgle la prob de coup exploratoire ; The value function will be implemented as a hash table. This has two advantages. ; First, I can acess it with states in the two-list format without converting to ; an array index. Second, I won't allocate space for all those states that never occur. (defun init () (setq initial-state '(nil nil)) (setf score '(0 0)) (setq value-table (make-hash-table :test #'equal))) (defun value (state) (or (gethash state value-table) (setf (gethash state value-table) (cond ((won? state 'X) 0) ((won? state 'O) 1) ((drawn? state) 0.5) (t 0.5))))) (defun set-value (state value) (setf (gethash state value-table) value)) (defun mise-a-jour-score (state) "Fonction de mise ˆ jour des scores des joueurs. Le joueur qui gagne augmente son score d'un. En cas de match nul, les scores des deux joueurs sont augmentŽs d'un demi." (cond ((won? state 'X) ; le joueur 1 (les X) a gagnŽ (setf (car score) (1+ (car score)))) ((won? state 'O) ; le joueur 2 (les 0) a gagnŽ (setf (cadr score) (1+ (cadr score)))) (t ; match nul (setf score (list (+ 0.5 (car score))(+ 0.5 (cadr score))))))) (defun terminal-state-p (state) (or (won? state 'X)(won? state 'O)(drawn? state))) (defun random-move (player state) "Retourne une nouvelle position aprs qu'un coup alŽatoire parmi les coups possibles ait ŽtŽ jouŽ." (let ((possible-moves (movegen state player))) (if (null possible-moves) nil (nth (random (length possible-moves)) possible-moves)))) (defun greedy-move (player state) "Retourne une nouvelle position aprs que le meilleur coup (selon la fonction d'Žvaluation V) ait ŽtŽ jouŽ." (let ((possible-moves (movegen state player))) (if (null possible-moves) nil (loop with best-value = -1 with best-move for move in possible-moves for move-value = (value move) do (when (> move-value best-value) (setf best-value move-value) (setf best-move move)) finally (return best-move))))) (defun attend-coup (player state) "Retourne une nouvelle position aprs que le coup choisi par le joueur humain ait ŽtŽ jouŽ." ; (print-board state) (format t "Your move: ") (let ((m (read))) (loop until (setq next (make-move state 'X m)) do (format t "~%~a Illegal move, Try again: " m) (setq m (read))) (setq state next))) (defun get-score () (format t "Le score est de : ~A pour (X O)" score)) ; ; Now here is the main function ; (defvar state) (defun RL-game (joueur &optional quiet) "La fonction principale. Les X jouent en premier. La table des valeurs V(state) est mise ˆ jour pour le jeu des O, c'est-ˆ-dire aprs chaque coup de O (sauf ˆ la fin). Si joueur vaut : - 0 : le coup de X est jouŽ au hasard - 1 : X est jouŽ par le joueur humain - 2 : X est jouŽ par un algorithme alpha-beta - 3 : X est jouŽ par un algorithme d'apprentissage jumeau (XXX a faire)." (setq state *start*) (unless quiet (show-state state)) (loop for new-state = (cond ((= joueur 0) (random-move 'X state)) ((= joueur 1) (attend-coup 'X state)) ((= joueur 2) (minimax-a-b state 0 'X))) ; D'ABORD L'ADVERSAIRE for exploratory-move? = (= 0 (random *epsilon*)) ; Un coup exploratoire sur *epsilon* do (when (terminal-state-p new-state) (unless quiet (show-state new-state)) (mise-a-jour-score new-state) (update state new-state quiet) (return (value new-state))) (unless quiet (show-state new-state)) (setf new-state (if exploratory-move? (random-move 'O new-state) (greedy-move 'O new-state))) ; ENSUITE LE JOUEUR O (unless exploratory-move? (update state new-state quiet)) (unless quiet (show-state new-state)) (when (terminal-state-p new-state) (mise-a-jour-score new-state) (return (value new-state))) (setq state new-state))) (defun update (state new-state &optional quiet) "Met ˆ jour V(state)" (set-value state (+ (value state) (* alpha (- (value new-state) (value state))))) (unless quiet (format t "~A ~,3F" "Updated state-value (2 states above)" (value state)))) (defun show-state (state) (format t "~%") (print-board state) (format t "state-value = ~,3F" (value state)) (format t "~%") (values)) #|-------------------------------------------------------------------------------- Fonctions pour le jeu par la machine utilisant un minimax ou alpha-beta (Antoine CornuŽjols. Le 17 juin 2001) --------------------------------------------------------------------------------|# (defparameter *max-depth* 4) (defvar *static-evaluations* 0) ;; Function MINIMAX performs minimax search from a given position (pos), ;; to a given search ply (depth), for a given player (player). It returns ;; a list of board positions representing what it sees as the best moves for ;; both players. The first element of the list is the value of the board ;; position after the proposed move. (defun minimax (pos depth player) (cond ((deep-enough pos depth) (setq *static-evaluations* (+ *static-evaluations* 1)) (list (static pos player))) (t (let ((successors (movegen pos player)) (best-score -99999) (best-path nil)) (cond ((null successors) (setq *static-evaluations* (+ *static-evaluations* 1)) (list (static pos player))) (t (loop for succ in successors do (let* ((result-succ (minimax succ (1+ depth) (opposite player))) (new-value (- (car result-succ)))) (when (> new-value best-score) (setq best-score new-value) (setq best-path succ )))) (cons best-score best-path))))))) ;; Function MINIMAX-A-B performs minimax search with alpha-beta pruning. ;; It is far more efficient than MINIMAX. (defun minimax-a-b (pos depth player) (minimax-a-b-1 pos depth player 99999 -99999 t)) (defun minimax-a-b-1 (pos depth player use-thresh pass-thresh return-move) (cond ((deep-enough pos depth) (setq *static-evaluations* (+ *static-evaluations* 1)) (unless return-move (static pos player))) (t (let ((successors (movegen pos player)) (best-move nil) (quit nil) (new-value nil)) (declare (dynamic-extent successors)) (cond ((null successors) (setq *static-evaluations* (+ *static-evaluations* 1)) (unless return-move (static pos player))) (t (loop for succ in successors until quit do (setq new-value (- (minimax-a-b-1 succ (1+ depth) (opposite player) (- pass-thresh) (- use-thresh) nil))) (when (> new-value pass-thresh) (setq pass-thresh new-value) (setq best-move succ)) (when (>= pass-thresh use-thresh) (setq quit t))) (if return-move best-move pass-thresh))))))) (defun deep-enough (pos depth) (or (won? pos 'x) (won? pos 'o) (drawn? pos) (>= depth *max-depth*))) #|-------------------------------------------------------------------------------- Fonctions dŽfinissant le jeu de Tic-Tac-Toe (Reprise de Kevin Knight (knight@cs.cmu.edu). Le 17 juin 2001) --------------------------------------------------------------------------------|# ;; Function PLAY allows you to play a game against the computer. Call (play) ;; if you want to move first, or (play t) to let the computer move first. (defun play (&optional machine-first?) (let ((b *start*) (next nil)) (setq *static-evaluations* 0) (when machine-first? (setq b (minimax-a-b b 0 'o))) (do () ((or (won? b 'x) (won? b 'o) (drawn? b)) (format t "Final position: ~%") (print-board b) (cond ((won? b 'o) (format t "I win.~%")) ((won? b 'x) (format t "You win.~%")) (t (format t "Drawn.~%"))) *static-evaluations*) (print-board b) (format t "~a Your move: " *static-evaluations*) (let ((m (read))) (loop until (setq next (make-move b 'x m)) do (format t "~%~a Illegal move, Try again: " m) (setq m (read))) (setq b next)) (when (and (not (drawn? b)) (not (won? b 'o)) (not (won? b 'x))) (print-board b) (setq b (minimax-a-b b 0 'o)) (if (and (not (drawn? b)) (not (won? b 'o))) (format t "My move: ~%")))))) (defun self-play () (let ((b *start*)) (setq *static-evaluations* 0) (do () ((or (won? b 'x) (won? b 'o) (drawn? b)) (format t "Final position: ~%") (print-board b) (cond ((won? b 'o) (format t "O win.~%")) ((won? b 'x) (format t "X win.~%")) (t (format t "Drawn.~%"))) *static-evaluations*) (print-board b) (format t "~a X move: " *static-evaluations*) (setq b (minimax-a-b b 0 'x)) (when (and (not (drawn? b)) (not (won? b 'o)) (not (won? b 'x))) (print-board b) (setq b (minimax-a-b b 0 'o)) (if (and (not (drawn? b)) (not (won? b 'o))) (format t "~a 0 move: " *static-evaluations*)))))) ;; Function NULL-BOARD creates an empty tic-tac-toe board. The board is ;; stored as a list of nine elements. Elements are either 'x, 'o, or nil ;; (empty). (defun null-board () (list nil nil nil nil nil nil nil nil nil)) ;; Variable *START* is the starting board position. (defvar *start* nil) (setq *start* (null-board)) ;; Function MAKE-MOVE takes a board position (pos), a player (player, which ;; is 'x or 'o), and a move (which is a number between 0 and 8). It returns ;; a new board position. (defun make-move (pos player move) (unless (nth move pos) (let ((b (copy-list pos))) (setf (nth move b) player) b))) ;; Function MOVEGEN takes a position and a player and generates all legal ;; successor positions, i.e., all possible moves a player could make. (defun movegen (pos player) (loop for m from 0 to 8 unless (nth m pos) collect (make-move pos player m))) ;; Function WON? returns t is pos is a winning position for player, ;; nil otherwise. (defun won? (pos player) (or (and (eq (nth 0 pos) player) ;top (eq (nth 1 pos) player) (eq (nth 2 pos) player)) (and (eq (nth 3 pos) player) ;middle row (eq (nth 4 pos) player) (eq (nth 5 pos) player)) (and (eq (nth 6 pos) player) ;bottom (eq (nth 7 pos) player) (eq (nth 8 pos) player)) (and (eq (nth 0 pos) player) ;left (eq (nth 3 pos) player) (eq (nth 6 pos) player)) (and (eq (nth 1 pos) player) ;middle column (eq (nth 4 pos) player) (eq (nth 7 pos) player)) (and (eq (nth 2 pos) player) ;right (eq (nth 5 pos) player) (eq (nth 8 pos) player)) (and (eq (nth 0 pos) player) ;diagonal down (eq (nth 4 pos) player) (eq (nth 8 pos) player)) (and (eq (nth 2 pos) player) ;diagnonal up (eq (nth 4 pos) player) (eq (nth 6 pos) player)))) ;; Function DRAWN? returns t if pos is a drawn position, i.e., if there are ;; no more moves to be made. (defun drawn? (pos) (not (member nil pos))) ;; Function OPPOSITE returns 'x when given 'o, and vice-versa. (defun opposite (player) (if (eq player 'x) 'o 'x)) ;; Function PRINT-BOARD prints a two-dimensional representation of the board. (defun print-board (b) (format t "~% ~d ~d ~d 0 1 2~% ~d ~d ~d 3 4 5~% ~d ~d ~d 6 7 8~%~%" (or (nth 0 b) ".") (or (nth 1 b) ".") (or (nth 2 b) ".") (or (nth 3 b) ".") (or (nth 4 b) ".") (or (nth 5 b) ".") (or (nth 6 b) ".") (or (nth 7 b) ".") (or (nth 8 b) "."))) #| STATIC EVALUATION FUNCTION TO BE USED IF NOT DOING FULL SEARCH 1. *ttt-lines* IS A LIST OF ALL WINNING LINES 2. IF PLAYER WINS, 10000 POINTS 3. IF OPPONENT WINS, -10000 POINTS 4. IF DRAWN, 0 5. OTHERWISE SUBTRACT "BOARD-VALUE" OF OPPONENT FROM SCORE OF PLAYER WHERE BOARD VALUE IS taken as the sum of line score for each line, where linescore is 0 if the opponent has a piece in the line, and otherwise is the number of pieces you have in the line. |# (defparameter *ttt-lines* '((0 1 2) (3 4 5) (6 7 8) (0 3 6) (1 4 7) (2 5 8) (0 4 8) (2 4 6))) (defun static (pos player) (cond ((won? pos player) 10000) ((won? pos (opposite player)) -10000) ((drawn? pos) 0) (- (board-value pos player) (board-value pos player)))) (defun board-value (pos player) (loop for line in *ttt-lines* summing (line-score pos player line))) (defun line-score (pos player line) (let* ((pieces (loop for point in line collect (nth point pos))) (good (count player pieces))) (if (member (opposite player) line) 0 good)))