#!/usr/bin/env python
# coding: utf-8

# $$\newcommand{\Rv}{\mathbf{R}}
# \newcommand{\rv}{\mathbf{r}}
# \newcommand{\Qv}{\mathbf{Q}}
# \newcommand{\Qnv}{\mathbf{Qn}}
# \newcommand{\Av}{\mathbf{A}}
# \newcommand{\Aiv}{\mathbf{Ai}}
# \newcommand{\av}{\mathbf{a}}
# \newcommand{\xv}{\mathbf{x}}
# \newcommand{\Xv}{\mathbf{X}}
# \newcommand{\yv}{\mathbf{y}}
# \newcommand{\Yv}{\mathbf{Y}}
# \newcommand{\zv}{\mathbf{z}}
# \newcommand{\Zv}{\mathbf{Z}}
# \newcommand{\Wv}{\mathbf{W}}
# \newcommand{\wv}{\mathbf{w}}
# \newcommand{\betav}{\mathbf{\beta}}
# \newcommand{\gv}{\mathbf{g}}
# \newcommand{\Hv}{\mathbf{H}}
# \newcommand{\dv}{\mathbf{d}}
# \newcommand{\Vv}{\mathbf{V}}
# \newcommand{\vv}{\mathbf{v}}
# \newcommand{\Uv}{\mathbf{U}}
# \newcommand{\uv}{\mathbf{u}}
# \newcommand{\tv}{\mathbf{t}}
# \newcommand{\Tv}{\mathbf{T}}
# \newcommand{\TDv}{\mathbf{TD}}
# \newcommand{\Tiv}{\mathbf{Ti}}
# \newcommand{\Sv}{\mathbf{S}}
# \newcommand{\Gv}{\mathbf{G}}
# \newcommand{\Norm}{\mathcal{N}}
# \newcommand{\muv}{\boldsymbol{\mu}}
# \newcommand{\sigmav}{\boldsymbol{\sigma}}
# \newcommand{\phiv}{\boldsymbol{\phi}}
# \newcommand{\Phiv}{\boldsymbol{\Phi}}
# \newcommand{\Sigmav}{\boldsymbol{\Sigma}}
# \newcommand{\Lambdav}{\boldsymbol{\Lambda}}
# \newcommand{\half}{\frac{1}{2}}
# \newcommand{\argmax}[1]{\underset{#1}{\operatorname{argmax}}}
# \newcommand{\argmin}[1]{\underset{#1}{\operatorname{argmin}}}
# \newcommand{\dimensionbar}[1]{\underset{#1}{\operatorname{|}}}
# \newcommand{\grad}{\mathbf{\nabla}}
# \newcommand{\ebx}[1]{e^{\betav_{#1}^T \xv_n}}
# \newcommand{\eby}[1]{e^{y_{n,#1}}}
# \newcommand{\Fv}{\mathbf{F}}
# \newcommand{\ones}[1]{\mathbf{1}_{#1}}
# $$

# # Reinforcement Learning for Two-Player Games

# How does Tic-Tac-Toe differ from the maze problem?
#
# * Different state and action sets.
# * Two players rather than one.
# * Reinforcement is 0 until the end of the game, when it is 1 for a win, 0 for a draw, and -1 for a loss.
# * Maximizing the sum of reinforcements rather than minimizing it.
# * Anything else?

# ## Representing the Q Table

# The state is the board configuration.  There are $3^9$ of them, though
# not all are reachable.  Is this too big?
#
# It is a bit less than 20,000.  Not bad.  Is this the full size of the Q table?
#
# No.  We must add the action dimension.  There are at most 9 actions,
# one for each cell on the board.  So the Q table will contain about
# $20,000 \cdot 9$ values, or roughly 180,000.  No worries.
#
# Instead of thinking about the Q table as a three-dimensional array, as
# we did last time, let's be more pythonic and use a dictionary.  We will
# use the current board and the move taken from it as the key, and the
# value associated with that key is the Q value for taking that move in
# that board state.
#
# We still need a way to represent a board.
#
# How about an array of characters?  So
#
#     X |   | O
#     ---------
#       | X | O
#     ---------
#     X |   |
#
# would be
#
#     board = np.array(['X',' ','O', ' ','X','O', 'X',' ',' '])
#
# The initial board would be
#
#     board = np.array([' ']*9)
#
# We can represent a move as an index, 0 to 8, into this array.
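# Before moving on, here is a quick optional check of the sizes discussed above and of the board representation.  The cell below is an added sketch, not part of the original notebook.

# In[ ]:


import numpy as np

# Upper bounds on the table size (not a count of reachable boards):
# each of the 9 cells holds 'X', 'O', or ' ', and there are at most 9 moves per board.
nBoards = 3 ** 9
print(nBoards, 'boards at most, and', nBoards * 9, '(board, move) pairs at most.')

# The board representation, reshaped to show the three rows of the Tic-Tac-Toe board.
board = np.array(['X',' ','O', ' ','X','O', 'X',' ',' '])
print(board.reshape(3, 3))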
# What should the reinforcement values be?
#
# How about 0 for every move, except a reinforcement of 1 when X wins
# and -1 when O wins?
#
# For the above board, let's say we, meaning Player X, prefer the move to
# index 3.  In fact, this move always results in a win.  So the Q value for
# the move to 3 should be 1.  What other Q values do you know?
#
# If we don't play a move to win, O could win in one move.  So the other
# moves might have Q values close to -1, depending on the skill of
# Player O.  In the following discussion we will be using a random
# player for O, so the Q value for a move other than 8 or 3 will be
# close to, but not exactly, -1.

# ## Agent-World Interaction Loop

# For our agent to interact with its world, we must implement the following loop.
#
# 1. Initialize Q.
# 1. Set the initial state to the empty board.
# 1. Repeat:
#    1. Agent chooses the next X move.
#    1. If X wins, set Q(board,move) to 1.
#    1. Else, if the board is full, set Q(board,move) to 0.
#    1. Else, let O take a move.
#       1. If O won, update Q(board,move) by $\rho \, (-1 - Q(board,move))$, where $\rho$ is the learning rate.
#    1. For all cases, update Q(oldboard,oldmove) by $\rho \, (Q(board,move) - Q(oldboard,oldmove))$.
#    1. Shift the current board and move to the old ones.

# ## Now in Python

# In[ ]:


import numpy as np
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
from copy import copy


# Let's write a function to print a board in the usual Tic-Tac-Toe style.

# In[ ]:


def printBoard(board):
    print('{}|{}|{}\n-----\n{}|{}|{}\n-----\n{}|{}|{}'.format(*board))

board = np.array(['X',' ','O', ' ','X','O', 'X',' ',' '])
printBoard(board)


# Let's write a function that returns *True* if the current board contains a winner, either X or O.  We will be Player X.  What does the value of *combos* represent?

# In[ ]:


def winner(board):
    # Each group of three indices is a row, column, or diagonal.
    combos = np.array((0,1,2, 3,4,5, 6,7,8, 0,3,6, 1,4,7, 2,5,8, 0,4,8, 2,4,6))
    return np.any(np.logical_or(np.all('X' == board[combos].reshape((-1,3)), axis=1),
                                np.all('O' == board[combos].reshape((-1,3)), axis=1)))


# In[ ]:


board = np.array(['X',' ','O', ' ','X','O', 'X',' ',' '])
printBoard(board)
winner(board)


# In[ ]:


board[3] = 'X'
printBoard(board)
winner(board)


# How can we find all valid moves from a board?  Just find all of the spaces in the board representation.

# In[ ]:


np.where(board == ' ')


# In[ ]:


np.where(board == ' ')[0]


# And how do we pick one at random and make that move?

# In[ ]:


board = np.array(['X',' ','O', ' ','X','O', 'X',' ',' '])
validMoves = np.where(board == ' ')[0]
move = np.random.choice(validMoves)
boardNew = copy(board)
boardNew[move] = 'X'
print('From this board')
printBoard(board)
print('\n Move', move)
print('\nresults in board')
printBoard(boardNew)


# If X just won, we want to set the Q value for the previous state (board) and action (move) to 1, because X will always win from that state and that action.
#
# First we must figure out how to implement the Q table.  We want to associate a value with each board and move.  We can use a python dictionary for this.  We know how to represent a board.  A move can be an integer from 0 to 8 that indexes into the board array for the location to place a marker.

# In[ ]:


Q = {}   # empty table
Q[(tuple(board),1)] = 0
Q


# In[ ]:


Q[(tuple(board),1)]


# What if we try to look up a Q value for a state and action we have not encountered yet?  It will not be in the dictionary.  We can use the dictionary's *get* method, which takes a second argument giving the value to return if the key does not exist.

# In[ ]:


board[1] = 'X'
Q[(tuple(board),1)]   # raises a KeyError, because this (board, move) pair is not in the dictionary


# In[ ]:


Q.get((tuple(board),1), 42)


# Now we can set the Q value for (board,move) to 1.

# In[ ]:


Q[(tuple(board),move)] = 1


# If the board is full and we have a draw, then the previous state and action should be assigned 0.

# In[ ]:


Q[(tuple(board),move)] = 0
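# How do we know the board is full?  One way, which we will use in the learning loop later, is to check that no spaces remain.  The cell below is an added illustration, not part of the original notebook; *fullBoard* is just an illustrative drawn board.

# In[ ]:


# A board is full when no cell contains a space.
fullBoard = np.array(['X','O','X', 'X','O','O', 'O','X','X'])
print(not np.any(board == ' '), not np.any(fullBoard == ' '))   # False True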
# If the board is not full, we had better check to see if O just won.
# If O did just win, then we should adjust the Q value of the previous board and X move to be closer to -1, because we just received a -1 reinforcement and the game is over.

# In[ ]:


rho = 0.1   # learning rate
Q[(tuple(board),move)] += rho * (-1 - Q[(tuple(board),move)])


# If nobody has won yet, let's calculate the temporal-difference error and use it to adjust the Q value of the previous board and move.  We do this only if we are not at the first move of a game.

# In[ ]:


step = 0
if step > 0:
    Q[(tuple(boardOld),moveOld)] += rho * (Q[(tuple(board),move)] - Q[(tuple(boardOld),moveOld)])


# Initially, taking random moves is a good strategy, because we know nothing about how to play Tic-Tac-Toe.  But once we have gained some experience, and our Q table has acquired some good predictions of the sum of future reinforcement, we should rely on our Q values to pick good moves.  For a given board, which move is predicted to lead to the best possible future according to the current Q table?

# In[ ]:


validMoves = np.where(board == ' ')[0]
print('Valid moves are', validMoves)
Qs = np.array([Q.get((tuple(board),m), 0) for m in validMoves])
print('Q values for validMoves are', Qs)
bestMove = validMoves[np.argmax(Qs)]
print('Best move is', bestMove)


# To transition slowly from taking random actions to taking the action currently believed to be best, called the *greedy* action, we decay a parameter $\epsilon$ from 1 down towards 0 and use it as the probability of selecting a random action.  This is called the $\epsilon$-greedy policy.

# In[ ]:


def epsilonGreedy(epsilon, Q, board):
    validMoves = np.where(board == ' ')[0]
    if np.random.uniform() < epsilon:
        # Random move
        return np.random.choice(validMoves)
    else:
        # Greedy move
        Qs = np.array([Q.get((tuple(board),m), 0) for m in validMoves])
        return validMoves[np.argmax(Qs)]

epsilonGreedy(0.8, Q, board)


# Now let's write a function to plot the results of some games.  Say the variable *outcomes* is a vector of 1's, 0's, and -1's for games in which X wins, draws, and loses, respectively.

# In[ ]:


outcomes = np.random.choice([-1,0,1], replace=True, size=(1000))
outcomes[:10]


# In[ ]:


def plotOutcomes(outcomes, epsilons, maxGames, nGames):
    if nGames == 0:
        return
    nBins = 100
    nPer = int(maxGames/nBins)
    outcomeRows = outcomes.reshape((-1, nPer))
    outcomeRows = outcomeRows[:int(nGames/float(nPer))+1, :]
    avgs = np.mean(outcomeRows, axis=1)

    plt.subplot(3,1,1)
    xs = np.linspace(nPer, nGames, len(avgs))
    plt.plot(xs, avgs)
    plt.xlabel('Games')
    plt.ylabel('Mean of Outcomes\n(0=draw, 1=X win, -1=O win)')
    plt.title('Bins of {:d} Games'.format(nPer))

    plt.subplot(3,1,2)
    plt.plot(xs, np.sum(outcomeRows==1, axis=1), 'g-', label='Wins')
    plt.plot(xs, np.sum(outcomeRows==-1, axis=1), 'r-', label='Losses')
    plt.plot(xs, np.sum(outcomeRows==0, axis=1), 'b-', label='Draws')
    plt.legend(loc='center')
    plt.ylabel('Number of Games\nin Bins of {:d}'.format(nPer))

    plt.subplot(3,1,3)
    plt.plot(epsilons[:nGames])
    plt.ylabel(r'$\epsilon$')


# In[ ]:


plt.figure(figsize=(8,8))
plotOutcomes(outcomes, np.zeros(1000), 1000, 1000)
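# It also helps to check how fast $\epsilon$ will decay before committing to a long run.  With multiplicative decay, after $n$ games $\epsilon = \epsilon_0 \, d^n$, where $d$ is the decay rate.  The cell below is an added sketch, not part of the original notebook; the values simply match the parameters used in the learning loop that follows.

# In[ ]:


# With epsilon multiplied by the decay rate once per game,
# its final value is epsilon0 * decayRate**maxGames.
epsilon0 = 1.0
epsilonDecayRate = 0.9999
maxGames = 50000
print('After {:d} games, epsilon is about {:.4f}'.format(maxGames, epsilon0 * epsilonDecayRate**maxGames))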
# Finally, let's write the whole Tic-Tac-Toe learning loop!

# In[ ]:


from IPython.display import display, clear_output


# In[ ]:


maxGames = 50000
rho = 0.2              # learning rate
epsilonDecayRate = 0.9999
epsilon = 1.0
graphics = True
showMoves = not graphics

outcomes = np.zeros(maxGames)
epsilons = np.zeros(maxGames)
Q = {}

if graphics:
    fig = plt.figure(figsize=(10,10))

for nGames in range(maxGames):
    epsilon *= epsilonDecayRate
    epsilons[nGames] = epsilon
    step = 0
    board = np.array([' '] * 9)   # empty board
    done = False

    while not done:
        step += 1

        # X's turn
        move = epsilonGreedy(epsilon, Q, board)
        boardNew = copy(board)
        boardNew[move] = 'X'
        if (tuple(board), move) not in Q:
            Q[(tuple(board), move)] = 0   # initial Q value for new board,move
        if showMoves:
            printBoard(boardNew)

        if winner(boardNew):
            # X won!
            if showMoves:
                print('        X Won!')
            Q[(tuple(board), move)] = 1
            done = True
            outcomes[nGames] = 1

        elif not np.any(boardNew == ' '):
            # Game over. No winner.
            if showMoves:
                print('        draw.')
            Q[(tuple(board), move)] = 0
            done = True
            outcomes[nGames] = 0

        else:
            # O's turn.  O is a random player!
            moveO = np.random.choice(np.where(boardNew == ' ')[0])
            boardNew[moveO] = 'O'
            if showMoves:
                printBoard(boardNew)
            if winner(boardNew):
                # O won!
                if showMoves:
                    print('        O Won!')
                Q[(tuple(board), move)] += rho * (-1 - Q[(tuple(board), move)])
                done = True
                outcomes[nGames] = -1

        if step > 1:
            Q[(tuple(boardOld), moveOld)] += rho * (Q[(tuple(board), move)] - Q[(tuple(boardOld), moveOld)])

        boardOld, moveOld = board, move   # remember board and move so Q(board,move) can be updated after the next step
        board = boardNew

    if graphics and (nGames % (maxGames // 10) == 0 or nGames == maxGames - 1):
        fig.clf()
        plotOutcomes(outcomes, epsilons, maxGames, nGames - 1)
        clear_output(wait=True)
        display(fig)

if graphics:
    clear_output(wait=True)

print('Outcomes: {:d} X wins  {:d} O wins  {:d} draws'.format(int(np.sum(outcomes==1)),
                                                              int(np.sum(outcomes==-1)),
                                                              int(np.sum(outcomes==0))))


# How can we examine the Q function that predicts the future for every board and move?

# In[ ]:


Q[(tuple([' ']*9), 0)]


# In[ ]:


Q[(tuple([' ']*9), 1)]


# In[ ]:


Q.get((tuple([' ']*9), 0), 0)


# In[ ]:


[Q.get((tuple([' ']*9), m), 0) for m in range(9)]


# In[ ]:


board = np.array([' ']*9)
Qs = [Q.get((tuple(board), m), 0) for m in range(9)]
printBoard(board)
print()
print('''{:5.2f} | {:5.2f} | {:5.2f}
---------------------
{:5.2f} | {:5.2f} | {:5.2f}
---------------------
{:5.2f} | {:5.2f} | {:5.2f}'''.format(*Qs))


# In[ ]:


def printBoardQs(board, Q):
    printBoard(board)
    Qs = [Q.get((tuple(board), m), 0) for m in range(9)]
    print()
    print('''{:5.2f} | {:5.2f} | {:5.2f}
---------------------
{:5.2f} | {:5.2f} | {:5.2f}
---------------------
{:5.2f} | {:5.2f} | {:5.2f}'''.format(*Qs))


# In[ ]:


board[0] = 'X'
board[1] = 'O'
printBoardQs(board, Q)


# In[ ]:


board[4] = 'X'
board[3] = 'O'
printBoardQs(board, Q)


# In[ ]:


board[0] = 'X'
board[4] = 'O'
printBoardQs(board, Q)


# In[ ]:


board[2] = 'X'
board[1] = 'O'
printBoardQs(board, Q)


# In[ ]:


board[7] = 'X'
board[3] = 'O'
printBoardQs(board, Q)


# In[ ]:


board[5] = 'X'
board[6] = 'O'
printBoardQs(board, Q)
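# With learning finished, we can also let the greedy policy play a complete game against the random O player.  This closing cell is an added sketch, not part of the original notebook; it only reuses *epsilonGreedy*, *winner*, *printBoard*, and the learned *Q* defined above.

# In[ ]:


# Play one game: X always takes the greedy move (epsilon = 0), O plays randomly.
board = np.array([' '] * 9)
result = 'Draw'
while True:
    board[epsilonGreedy(0, Q, board)] = 'X'                     # greedy X move
    if winner(board):
        result = 'X won'
        break
    if not np.any(board == ' '):
        break
    board[np.random.choice(np.where(board == ' ')[0])] = 'O'    # random O move
    if winner(board):
        result = 'O won'
        break

printBoard(board)
print(result)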