The 10-armed testbed: learn by implementing!
Author: Rodrigo Chang
I was reading Sutton and Barto's book (2nd ed.) on Reinforcement Learning, section 2.3, where they explain the k-armed bandit problem, and I realized that it might not be so hard to implement an interactive version of figure 2.2 from the book. I share my work here to show that sometimes you can implement something as you read, and that is a great way to learn and practice the concepts you are studying.
My workflow:
- I created the module Bandits and started writing functions and trying them out in the next cells.
- When I was happy with the result, I added the interactivity and plots. 
Have fun!
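Before the code, a quick recap of the testbed from section 2.3: the true action values q*(a) are drawn from a standard normal, the chosen arm pays a reward r ~ N(q*(a), 1), and each arm keeps a sample-average estimate via the incremental update Q(a) ← Q(a) + (1/N(a))(r − Q(a)). Here is a minimal sketch of a single ϵ-greedy step under those assumptions (the names are illustrative, not part of the notebook's module):

let
    ϵ_demo = 0.1
    q_star = randn(10)          # true action values, q*(a) ~ N(0, 1)
    Q = zeros(10)               # sample-average estimates
    N = zeros(Int, 10)          # selection counts

    # one ϵ-greedy step: explore with probability ϵ_demo, otherwise exploit
    a = rand() < ϵ_demo ? rand(1:10) : argmax(Q)
    r = q_star[a] + randn()     # reward ~ N(q*(a), 1)
    N[a] += 1
    Q[a] += (r - Q[a]) / N[a]   # incremental sample-average update
    (arm = a, reward = r, estimate = Q[a])
end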
begin
    import Pkg
    # activate a clean environment
    Pkg.activate(mktempdir())

    Pkg.add([
        Pkg.PackageSpec(name="Plots"),
        Pkg.PackageSpec(name="PlutoUI"),
    ])

    using Plots
    using PlutoUI
end
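A note on this setup cell: Pkg.activate(mktempdir()) activates a throwaway environment in a temporary directory, so installing Plots and PlutoUI here does not touch your global package environment.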
# Provides functions for the k-armed bandit example
module Bandits

    # Action selection algorithm: explore with probability epsilon, otherwise greedy
    function epsilon_greedy(Q_a, epsilon)
        if rand() < epsilon
            action_index = rand(1:length(Q_a))
        else
            action_index = argmax(Q_a)
        end
        action_index
    end

    function run_episode(k=10, steps=1000, epsilon=0)
        Q_star = randn(k)      # true action values, q*(a) ~ N(0, 1)
        Q_a = zeros(k)         # action-value estimates
        N_a = zeros(Int, k)    # number of times choosing every bandit
        rewards = zeros(steps)

        for i in 1:steps
            # select action
            a = epsilon_greedy(Q_a, epsilon)
            # get reward from bandit, r ~ N(Q_star[a], 1)
            r = Q_star[a] + randn()
            N_a[a] += 1
            # incremental implementation of the sample average
            Q_a[a] += (1/N_a[a]) * (r - Q_a[a])

            rewards[i] = r  # record the reward actually received at this step
        end

        rewards
    end

    # This runs several episodes with different bandits to measure the
    # performance of the ϵ-greedy algorithm
    function run_series(episodes=2000, steps=1000, k=10, epsilon=0)
        rewards = zeros(steps)
        for j in 1:episodes
            rewards .+= run_episode(k, steps, epsilon)
        end
        rewards / episodes
    end

    # For optimal action selection
    function run_episode_opt(k=10, steps=1000, epsilon=0)
        Q_star = randn(k)
        Q_a = zeros(k)
        N_a = zeros(Int, k)
        opt_choice_times = 0
        opt_choice = zeros(steps)

        for i in 1:steps
            # select action
            a = epsilon_greedy(Q_a, epsilon)
            # get random reward from bandit, r ~ Normal(Q_star[a], 1)
            r = Q_star[a] + randn()
            N_a[a] += 1
            Q_a[a] += (1/N_a[a]) * (r - Q_a[a])

            # running fraction of steps in which the optimal arm was chosen
            opt_choice_times += argmax(Q_star) == a
            opt_choice[i] = opt_choice_times / i
        end

        opt_choice
    end

    # This runs several episodes with different bandits to measure the
    # optimal-action-selection performance of the ϵ-greedy algorithm
    function run_series_choice(episodes=2000, steps=1000, k=10, epsilon=0)
        opt_choice = zeros(steps)
        for j in 1:episodes
            opt_choice .+= run_episode_opt(k, steps, epsilon)
        end
        opt_choice / episodes
    end

end
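Following the workflow above, a quick hypothetical cell like this one is enough to sanity-check the module before adding interactivity:

let
    # smoke test: a short series with a 10-armed bandit and ϵ = 0.1
    avg = Bandits.run_series(200, 500, 10, 0.1)
    length(avg), avg[end]
end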
[Interactive controls: sliders set ϵ (with the reactive caption "You like to explore a little!"), the number of armed bandits, the steps, and the episodes.]
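The notebook binds these parameters with PlutoUI sliders; here is a minimal sketch of such @bind cells. The defaults follow the book's figure 2.2 (2000 episodes, 1000 steps, 10 arms), while the slider ranges are my own assumption:

@bind ϵ Slider(0.0:0.01:0.5, default = 0.1, show_value = true)

@bind k Slider(2:20, default = 10, show_value = true)

@bind steps Slider(100:100:2000, default = 1000, show_value = true)

@bind episodes Slider(100:100:5000, default = 2000, show_value = true)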
begin
    # Average reward and % of time the optimal action is chosen
    performance_e1 = Bandits.run_series(episodes, steps, k, ϵ)
    opt_choice = 100 * Bandits.run_series_choice(episodes, steps, k, ϵ)

    p1 = plot(1:steps, performance_e1;
        linewidth = 2,
        label = "Average reward",
        legend = :bottomright)
    title!("Average reward over $episodes episodes with ϵ=$ϵ")
    p2 = plot(1:steps, opt_choice;
        linewidth = 2,
        label = "% optimal action",
        legend = :bottomright)
    plot(p1, p2, layout = (2, 1))
end
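To get the full comparison of figure 2.2, one can overlay several exploration rates in a single plot. A sketch under the same setup, with ϵ ∈ {0, 0.01, 0.1} as in the book:

let
    comparison = plot(xlabel = "Steps", ylabel = "Average reward", legend = :bottomright)
    for ϵ_val in (0.0, 0.01, 0.1)
        avg = Bandits.run_series(2000, 1000, 10, ϵ_val)
        plot!(comparison, 1:1000, avg; linewidth = 2, label = "ϵ = $ϵ_val")
    end
    comparison
end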