How to Teach a Computer to Play Tic-Tac-Toe

What makes people good game players? They read the rules, start practicing, and learn from their failures. How about a computer? Can we train it to play a game? Absolutely! And the process of teaching, or training, a computer to play a game is fascinating. This post guides you, step by step, through creating your own game AI that learns to play Tic-Tac-Toe without any hardcoded rules. Try training the computer yourself: after about a hundred thousand episodes, it becomes an expert that is hard to beat. If you would like to know how it actually works, keep reading to the end of the post.

1. How does it work?

The computer learns by interacting with the game environment: it takes an action, observes how the environment changes, and receives a reward. By playing many, many games, it gradually figures out how to act so that it collects as much reward as possible. This type of learning is called Reinforcement Learning.
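
In a nutshell, every algorithm in this post follows the same interaction loop, sketched here in the same pseudocode style used for the algorithms below:

Repeat (for each episode):
    Initialize the game state S
    Repeat (for each step of episode):
        The agent chooses an action A for state S
        The environment applies A, giving the next state S′ and a reward R
        The agent updates its knowledge from (S, A, R, S′)
        S ← S′
    until S is terminal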

If you want to learn more about the theory, there is a free ebook you should read: Reinforcement Learning: An Introduction by Richard S. Sutton and Andrew G. Barto.

Next, I'll walk you through three algorithms and how to implement them:

  • Monte Carlo
  • SARSA
  • Q-Learning

Before the algorithms can interact with the game environment, we need to implement the game environment itself.

/* game.js */
import { every } from 'lodash';

export const X = 'X';
export const O = 'O';
export const EMPTY = '-';


// The board is a flat array of 9 cells, indexed 0..8 row by row.
export const initGame = () => {
  return {
    turn: X,
    winner: null,
    gameover: false,
    board: [EMPTY, EMPTY, EMPTY, EMPTY, EMPTY, EMPTY, EMPTY, EMPTY, EMPTY]
  }
}

// Apply a move at `location` for the current player.
// Invalid moves leave the game unchanged.
export const move = (game, location) => {
  let { turn, board } = game;

  const validMoves = getValidMoves(board);
  if (!validMoves.includes(location)) {
    return game;
  }

  let nextState = [...board];
  nextState[location] = turn;
  turn = turn === X ? O : X;

  const winner = checkWinner(nextState);
  const gameover = checkGameover(nextState);

  return {
    turn,
    winner,
    gameover,
    board: nextState
  }
}

export const getValidMoves = (board) => {
  let validMoves = [];
  for (let i = 0; i < board.length; i++) {
    if (board[i] === EMPTY) validMoves.push(i);
  }
  return validMoves;
}

export const checkWinner = (board) => {
  const lines = [
    [0, 1, 2],
    [3, 4, 5],
    [6, 7, 8],
    [0, 3, 6],
    [1, 4, 7],
    [2, 5, 8],
    [0, 4, 8],
    [2, 4, 6],
  ];
  for (let i = 0; i < lines.length; i++) {
    const [a, b, c] = lines[i];
    if (board[a] !== EMPTY && board[a] === board[b] && board[a] === board[c]) {
      return board[a];
    }
  }
  return null;
}

export const checkGameover = (board) => {
  return every(board, it => it !== EMPTY) || !!checkWinner(board);
}

export const boardToString = (board) => {
  return board.join('')
}

// The reward from `player`'s point of view, given only at the end of the game.
export const calculateReward = (game, player) => {
  const { gameover, winner } = game;
  if (!gameover) {
    return 0;
  } else {
    if (!winner) {
      // Reward for a draw
      return 0;
    } else if (winner === player) {
      // Reward for a win
      return 10;
    } else {
      // Reward for a loss
      return -10;
    }
  }
}
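
To see the environment in action, here is a quick manual test. The file name playground.js is just my own choice for this example; any script that imports from game.js will do.

/* playground.js - a quick manual test of the game environment */
import { initGame, move, calculateReward, X, O } from './game';

let game = initGame();       // X always moves first
game = move(game, 4);        // X takes the center
game = move(game, 0);        // O takes the top-left corner
game = move(game, 0);        // invalid: cell 0 is occupied, so the state is unchanged
game = move(game, 2);        // X
game = move(game, 1);        // O
game = move(game, 6);        // X completes the 2-4-6 diagonal and wins

console.log(game.winner);               // 'X'
console.log(game.gameover);             // true
console.log(calculateReward(game, X));  // 10
console.log(calculateReward(game, O));  // -10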

2. Monte Carlo

The algorithm

Initialize, for all s ∈ S, a ∈ A(s):
    Q(s, a) ← arbitrary
    π(s) ← arbitrary
    Returns(s, a) ← empty list

Repeat forever:
    Choose S0 ∈ S and A0 ∈ A(S0) s.t. all pairs have probability > 0
    Generate an episode starting from S0, A0, following π
    For each pair s, a appearing in the episode:
        G ← return following the first occurrence of s, a
        Append G to Returns(s, a)
        Q(s, a) ← average(Returns(s, a))
    For each s in the episode:
        π(s) ← argmax_a Q(s, a)
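
To make the return G concrete: in our game the only nonzero reward arrives at the end of the episode, so the return seen from an earlier state is just the final reward scaled down by the discount factor for every extra step it lies in the future. A generic illustration (not part of the agent code below):

// Discounted return of a reward sequence [r1, r2, ..., rT] with discount factor gamma
const discountedReturn = (rewards, gamma = 0.9) =>
  rewards.reduce((G, r, t) => G + Math.pow(gamma, t) * r, 0);

console.log(discountedReturn([0, 0, 10]));  // 10 * 0.9^2 = 8.1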

Implementation

/* MonteCarloAgent.js */
import { range, clone, pullAt, max, sample } from 'lodash';
import { O, getValidMoves, boardToString, calculateReward } from './game';

export class MonteCarloAgent {
  constructor({ epsilon = 0.1, discount = 0.9, player = O }) {
    this.epsilon = epsilon;
    this.discount = discount;
    this.episode = [];
    this.player = player;

    // Q, returnSums and returnCounts are keyed by the board string.
    // The Proxy lazily initializes unseen states to an array of nine zeros
    // (lodash's range(0, 9, 0.0) with a step of 0 repeats the start value).
    this.Q = new Proxy(
      {},
      {
        get: (target, name) => {
          if (!(name in target)) {
            target[name] = range(0, 9, 0.0);
          }
          return target[name];
        },
      }
    );

    this.returnSums = new Proxy(
      {},
      {
        get: (target, name) => {
          if (!(name in target)) {
            target[name] = range(0, 9, 0.0);
          }
          return target[name];
        },
      }
    );

    this.returnCounts = new Proxy(
      {},
      {
        get: (target, name) => {
          if (!(name in target)) {
            target[name] = range(0, 9, 0.0);
          }
          return target[name];
        },
      }
    );
  }

  // ε-greedy policy: returns a probability distribution over the nine cells.
  // Each valid move gets ε / |validMoves|; the greedy move gets the extra 1 − ε.
  policy = (game) => {
    const { board } = game;
    const validMoves = getValidMoves(board);
    const boardString = boardToString(board);
    let A = range(0, 9, 0.0);
    const cloned = clone(this.Q[boardString]);
    const pulled = pullAt(cloned, validMoves);
    const bestScore = max(pulled);
    const bestAction = validMoves[pulled.indexOf(bestScore)];
    for (let i = 0; i < validMoves.length; i++) {
      const a = validMoves[i];
      A[a] = this.epsilon / validMoves.length;
    }
    if (validMoves.includes(bestAction)) {
      A[bestAction] += 1.0 - this.epsilon;
    } else {
      const someAction = sample(validMoves);
      A[someAction] += 1.0 - this.epsilon;
    }

    return A;
  };

  // Once the episode has reached a terminal state, compute the discounted
  // return G for every (state, action) pair and update Q as a running average.
  learn = () => {
    const memory = this.episode;

    if (memory.length > 0 && memory[memory.length - 1][0].gameover) {
      for (let i = 0; i < memory.length - 1; i++) {
        const record = memory[i];

        const state = record[0];
        const action = record[1];

        let G = 0;
        let discount = this.discount;
        const stateString = boardToString(state.board);

        for (let j = i + 1; j < memory.length; j++) {
          const nextRecord = memory[j];
          const nextState = nextRecord[0];
          const reward = calculateReward(nextState, this.player);
          G += reward * discount;
          discount = discount * this.discount;
        }
        this.returnSums[stateString][action] += G;
        this.returnCounts[stateString][action] += 1.0;
        this.Q[stateString][action] =
          this.returnSums[stateString][action] /
          this.returnCounts[stateString][action];
      }
    }
  };

  observe = (state, action) => {
    this.episode.push([state, action]);
  };

  newEpisode = () => {
    this.episode = [];
  };
}
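
The agent only exposes policy, observe, learn, and newEpisode, so we still need a loop that actually plays games. Below is a rough training sketch under a few assumptions of my own: the agent plays O against an opponent making random valid moves, actions are sampled from the distribution returned by policy, and learn runs once per episode after the terminal state has been observed. The sampleFrom helper and the train-mc.js file name are mine, not part of the agent above.

/* train-mc.js - a rough training sketch (helper names are my own) */
import { sample } from 'lodash';
import { initGame, move, getValidMoves, X, O } from './game';
import { MonteCarloAgent } from './MonteCarloAgent';

// Sample an index i with probability proportional to weights[i]
const sampleFrom = (weights) => {
  const r = Math.random() * weights.reduce((sum, w) => sum + w, 0);
  let acc = 0;
  for (let i = 0; i < weights.length; i++) {
    acc += weights[i];
    if (r <= acc) return i;
  }
  return weights.length - 1;
};

const agent = new MonteCarloAgent({ epsilon: 0.1, discount: 0.9, player: O });

for (let episode = 0; episode < 100000; episode++) {
  let game = initGame();
  agent.newEpisode();

  while (!game.gameover) {
    if (game.turn === X) {
      // Opponent: a random valid move
      game = move(game, sample(getValidMoves(game.board)));
    } else {
      // Agent: sample a move from the ε-greedy distribution
      const action = sampleFrom(agent.policy(game));
      agent.observe(game, action);
      game = move(game, action);
    }
  }
  // Record the terminal state so learn() can see the final reward
  agent.observe(game, null);
  agent.learn();
}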

3. SARSA

The algorithm

Initialize Q(s, a), ∀s ∈ S, a ∈ A(s), arbitrarily, and Q(terminal-state, ·) = 0
Repeat (for each episode):
    Initialize S
    Choose A from S using policy derived from Q (e.g., ε-greedy)
    Repeat (for each step of episode):
        Take action A, observe R, S′
        Choose A′ from S′ using policy derived from Q (e.g., ε-greedy)
        Q(S, A) ← Q(S, A) + α[R + γQ(S′, A′) − Q(S, A)]
        S ← S′; A ← A′
    until S is terminal

Implementation

/* SarsaAgent.js */
import { range, clone, pullAt, max, sample } from 'lodash';
import { O, getValidMoves, boardToString, calculateReward } from './game';

export class SarsaAgent {
  constructor({ epsilon = 0.1, discount = 0.9, alpha = 0.01, player = O }) {
    this.epsilon = epsilon;
    this.discount = discount;
    this.alpha = alpha;
    this.player = player;
    this.episode = [];

    // Q is keyed by the board string; unseen states default to nine zeros.
    this._Q = {};
    this.Q = new Proxy(this._Q, {
      get: (target, name) => {
        if (!(name in target)) {
          target[name] = range(0, 9, 0.0);
        }

        return target[name];
      },
    });
  }

  // Same ε-greedy policy as the Monte Carlo agent.
  policy = (game) => {
    const { board } = game;
    const validMoves = getValidMoves(board);
    const boardString = boardToString(board);
    let A = range(0, 9, 0.0);
    const cloned = clone(this.Q[boardString]);
    const pulled = pullAt(cloned, validMoves);
    const bestScore = max(pulled);
    const bestAction = validMoves[pulled.indexOf(bestScore)];
    for (let i = 0; i < validMoves.length; i++) {
      const a = validMoves[i];
      A[a] = this.epsilon / validMoves.length;
    }
    if (validMoves.includes(bestAction)) {
      A[bestAction] += 1.0 - this.epsilon;
    } else {
      const someAction = sample(validMoves);
      A[someAction] += 1.0 - this.epsilon;
    }

    return A;
  };

  // On-policy TD update: bootstrap from the action A′ actually chosen in S′.
  learn = () => {
    const memory = this.episode;
    if (memory.length >= 2) {
      const record1 = memory[memory.length - 2];
      const record2 = memory[memory.length - 1];
      const state = record1[0];
      const action = record1[1];
      const nextState = record2[0];
      const nextAction = record2[1];

      const reward = calculateReward(nextState, this.player);
      const stateString = boardToString(state.board);
      const nextStateString = boardToString(nextState.board);
      const nextStateValue = nextState.gameover
        ? 0
        : this.Q[nextStateString][nextAction];
      const tdTarget = reward + this.discount * nextStateValue;
      const tdDelta = tdTarget - this.Q[stateString][action];
      this.Q[stateString][action] += this.alpha * tdDelta;
    }
  };

  observe = (state, action) => {
    this.episode.push([state, action]);
  };

  newEpisode = () => {
    this.episode = [];
  };
}
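
Unlike the Monte Carlo agent, SARSA learns online: learn() only looks at the last two observations, so it should be called after every observe() rather than once per episode, and the terminal state needs one final observe/learn so the reward of the last move is applied. Here is a sketch of how I would drive it, reusing the same random opponent and the hypothetical sampleFrom helper from the Monte Carlo section.

/* train-sarsa.js - a rough training sketch (assumes sampleFrom from the
   Monte Carlo sketch is in scope or imported) */
import { sample } from 'lodash';
import { initGame, move, getValidMoves, X, O } from './game';
import { SarsaAgent } from './SarsaAgent';

const agent = new SarsaAgent({ epsilon: 0.1, discount: 0.9, alpha: 0.01, player: O });

for (let episode = 0; episode < 100000; episode++) {
  let game = initGame();
  agent.newEpisode();

  while (!game.gameover) {
    if (game.turn === X) {
      game = move(game, sample(getValidMoves(game.board)));
    } else {
      const action = sampleFrom(agent.policy(game));
      agent.observe(game, action);
      agent.learn();              // TD update for the previous (state, action) pair
      game = move(game, action);
    }
  }
  // One last observation and update so the final win/lose/draw reward is used
  agent.observe(game, null);
  agent.learn();
}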

4. Q-Learning

The algorithm

Initialize Q(s, a), ∀s ∈ S, a ∈ A(s), arbitrarily, and Q(terminal-state, ·) = 0
Repeat (for each episode):
    Initialize S
    Repeat (for each step of episode):
        Choose A from S using policy derived from Q (e.g., ε-greedy)
        Take action A, observe R, S′
        Q(S, A) ← Q(S, A) + α[R + γ max_a Q(S′, a) − Q(S, A)]
        S ← S′
    until S is terminal

Implementation

/* QLearningAgent.js */
import { range, clone, pullAt, max, sample } from 'lodash';
import { O, getValidMoves, boardToString, calculateReward } from './game';

export class QLearningAgent {
  constructor({ epsilon = 0.1, discount = 0.9, alpha = 0.01, player = O }) {
    this.epsilon = epsilon;
    this.discount = discount;
    this.alpha = alpha;
    this.player = player;
    this.episode = [];

    // Q is keyed by the board string; unseen states default to nine zeros.
    this._Q = {};
    this.Q = new Proxy(this._Q, {
      get: (target, name) => {
        if (!(name in target)) {
          target[name] = range(0, 9, 0.0);
        }

        return target[name];
      },
    });
  }

  // Same ε-greedy policy as the other agents.
  policy = (game) => {
    const { board } = game;
    const validMoves = getValidMoves(board);
    const boardString = boardToString(board);
    let A = range(0, 9, 0.0);
    const cloned = clone(this.Q[boardString]);
    const pulled = pullAt(cloned, validMoves);
    const bestScore = max(pulled);
    const bestAction = validMoves[pulled.indexOf(bestScore)];
    for (let i = 0; i < validMoves.length; i++) {
      const a = validMoves[i];
      A[a] = this.epsilon / validMoves.length;
    }
    if (validMoves.includes(bestAction)) {
      A[bestAction] += 1.0 - this.epsilon;
    } else {
      const someAction = sample(validMoves);
      A[someAction] += 1.0 - this.epsilon;
    }

    return A;
  };

  // Off-policy TD update: bootstrap from the best action in S′, max_a Q(S′, a).
  learn = () => {
    const memory = this.episode;
    if (memory.length >= 2) {
      const record1 = memory[memory.length - 2];
      const record2 = memory[memory.length - 1];
      const state = record1[0];
      const action = record1[1];
      const nextState = record2[0];

      const reward = calculateReward(nextState, this.player);
      const stateString = boardToString(state.board);
      const nextStateString = boardToString(nextState.board);
      const nextStateValue = nextState.gameover
        ? 0
        : max(this.Q[nextStateString]);
      const tdTarget = reward + this.discount * nextStateValue;
      const tdDelta = tdTarget - this.Q[stateString][action];
      this.Q[stateString][action] += this.alpha * tdDelta;
    }
  };

  observe = (state, action) => {
    this.episode.push([state, action]);
  };

  newEpisode = () => {
    this.episode = [];
  };
}
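
The only behavioral difference from SARSA is the bootstrap target: SARSA uses Q(S′, A′) for the action it actually chose (on-policy), while Q-Learning uses the maximum over Q(S′, a) (off-policy). Training can be driven exactly like the SARSA sketch above. Once training is done, you can let the agent play greedily by picking the valid move with the highest Q value; bestMove below is a hypothetical helper of mine, not part of the agent.

// Greedy play after training (bestMove is a hypothetical helper, not in the agent)
import { getValidMoves, boardToString } from './game';

const bestMove = (agent, game) => {
  const qValues = agent.Q[boardToString(game.board)];
  const valid = getValidMoves(game.board);
  return valid.reduce((best, a) => (qValues[a] > qValues[best] ? a : best), valid[0]);
};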