In this project we refactor one of the PyMDP examples to serve as a baseline for a future client project. Here are some of the changes:
add a lookup table _lup to look up a value/level:
the label, given the index
the index, given the label
use zero-based factors and modalities to align better with Python’s convention
plot variables against time
add get_policy_labels(policies) to provide text tick labels (rather than indexes) in distribution bar plots
A few coding conventions:
Global variable names usually start with an underscore (_), e.g. _lup, _A
This notebook follows a standardized framework of my own, developed over a number of years and informed by CRISP-DM, the work of Warren Powell (Princeton), Bert De Vries, and Karl Friston, as well as that of the larger Active Inference community.
Python 3.10.12
inferactively-pymdp 0.0.7.1
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pymdp
from pymdp import utils
from pymdp.agent import Agent
from matplotlib.gridspec import GridSpec
from matplotlib.lines import Line2D
from matplotlib.ticker import MaxNLocator
import math
Define some utilities
def plot_likelihood(
        matrix,  ## conditional distribution
        title_str="Likelihood distribution (A)",
        xlabels=None,
        ylabels=None,
        size=None):
    """ Plots a 2-D likelihood matrix as a heatmap """
    if not np.isclose(matrix.sum(axis=0), 1.0).all():
        raise ValueError("Distribution not column-normalized! Please normalize (ensure matrix.sum(axis=0) == 1.0 for all columns)")
    if size is None:
        fig = plt.figure(figsize=(5, 5))
    else:
        fig = plt.figure(figsize=size)
    if ylabels is None and xlabels is None:
        sns.heatmap(matrix, cmap='OrRd', linewidths=1, cbar=True, vmin=0.0, vmax=1.0)
    else:
        ax = sns.heatmap(
            matrix,
            xticklabels=xlabels,
            yticklabels=ylabels,
            cmap="OrRd",
            linewidths=1,
            cbar=True,
            square=True,
            vmin=0.0,
            vmax=1.0)
    plt.title(title_str)
    plt.show()

def plot_beliefs(
        vector,  ## belief distribution
        title_str="",
        xlabels=None,
        size=None):
    """ Plot a categorical distribution or belief distribution, stored in the 1-D numpy vector `vector` """
    if not np.isclose(vector.sum(), 1.0):
        raise ValueError("Distribution not normalized! Please normalize")
    if size is None:
        fig = plt.figure(figsize=(1, 1))
    else:
        fig = plt.figure(figsize=size)
    plt.grid(zorder=0)
    plt.bar(range(vector.shape[0]), vector, color='r', zorder=3)
    if xlabels is None:
        plt.xticks(range(vector.shape[0]))
    else:
        ##+ plt.xticks(range(len(xlabels)), xlabels, rotation='vertical', fontname='DejaVu Sans Mono') ##. fixed-width font
        plt.xticks(range(len(xlabels)), xlabels, rotation='vertical', fontname='Liberation Mono')  ##. fixed-width font
    plt.title(title_str)
    plt.show()

def plot_efe(
        vector,
        title_str="",
        xlabels=None,
        size=None):
    if size is None:
        fig = plt.figure(figsize=(1, 1))
    else:
        fig = plt.figure(figsize=size)
    plt.grid(zorder=0)
    plt.bar(range(vector.shape[0]), vector, color='r', zorder=3)
    if xlabels is None:
        plt.xticks(range(vector.shape[0]))
    else:
        plt.xticks(range(len(xlabels)), xlabels, rotation='vertical')
    plt.title(title_str)
    plt.show()

_lup = {  ## lookup between indexes & labels
    ## control/action factors
    "aNull_0": ['Null'],  ## 'Null'; s_0 uncontrollable
    "aChoice_1": ['Move-Start', 'Get-Hint', 'Play-Left', 'Play-Right'],
    ## state factors
    "sContext_0": [  ## uncontrollable (exog info)
        'Left-Better', 'Right-Better'],
    "sChoice_1": ['Start', 'Hint', 'Left', 'Right'],
    "s̆Context_0": [  ## uncontrollable (exog info)
        'Left-Better', 'Right-Better'],
    "s̆Choice_1": ['Start', 'Hint', 'Left', 'Right'],
    ## observation modalities
    "oHint_0": [
        'Null',        ## Hint not played
        'Hint-Left',   ## Hint said left
        'Hint-Right'   ## Hint said right
    ],
    "oRew_1": ['Null', 'Loss', 'Reward'],
    "oChoice_2": [
        'Start',   ## Move-Start played
        'Hint',    ## Get-Hint played
        'Left',    ## Play-Left played
        'Right'    ## Play-Right played
    ],
}
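As a quick check of the two lookup directions listed at the top of the notebook (label, given the index; index, given the label):

_lup['sChoice_1'][2]             ## label, given the index -> 'Left'
_lup['sChoice_1'].index('Left')  ## index, given the label -> 2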
1 BUSINESS UNDERSTANDING
The current problem involves an explore/exploit task with an epistemic two-armed bandit. The multi-armed bandit (or MAB) is a classic decision-making task that captures the core features of the “explore/exploit tradeoff”. The multi-armed bandit formulation is ubiquitous across problem spaces that require sequential decision-making under uncertainty – in disciplines ranging from economics, neuroscience, machine learning, and engineering all the way to advertising.
In the standard MAB problem formulation, an agent must choose between mutually-exclusive alternatives (also known as ‘arms’) in order to maximize reward over time. The probability of reward depends on which arm the agent chooses. A common real-world analogy is a special slot machine with three levers to pull (rather than the usual one), where each lever has a different probability of paying off (e.g. producing a winning combination of symbols or a bonus). In fact, the ‘standard’ slot machine, which usually has only one lever, was historically referred to as a ‘one-armed bandit’ – this is the direct ancestor of the name for the generic machine learning / decision-making problem class, the MAB.
Crucially, MAB problems are interesting and difficult because in general, the reward statistics of each arm are unknown or only partially known. In a probabilistic or Bayesian context, an agent must therefore act based on beliefs about the reward statistics, since they don’t have perfect access to this information.
The inherent partial observability of the task creates a conflict between exploitation (choosing the arm that is currently believed to be most rewarding) and exploration (gathering information about the remaining arms, in the hope of discovering a potentially more rewarding option).
The fact that expected reward or utility is contextualized by beliefs – i.e. which arm is currently thought to be the most rewarding – motivates the use of active inference in this context. The key objective function for action selection, the expected free energy \(\mathbf{G}\), depends on the agent’s beliefs about the world. Moreover, expected free energy balances the desire to maximize reward with the drive to resolve uncertainty about unknown parts of the agent’s model. The more accurate the agent’s beliefs are, the more faithfully decision-making can be guided by maximizing expected utility or reward.
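For reference, a common decomposition of the expected free energy of a policy \(\pi\) (the form used in active inference treatments such as pymdp’s; the symbols \(Q\), \(P\), \(\mathbf{C}\) follow the usual conventions and are spelled out here only for illustration) separates an instrumental term from an epistemic term:

\[
G(\pi) = \sum_{\tau} \Big( \underbrace{-\,\mathbb{E}_{Q(o_\tau \mid \pi)}\big[\ln P(o_\tau \mid \mathbf{C})\big]}_{\text{expected (negative) utility}} \;-\; \underbrace{\mathbb{E}_{Q(o_\tau \mid \pi)}\big[ D_{\mathrm{KL}}\big[\, Q(s_\tau \mid o_\tau, \pi) \,\big\|\, Q(s_\tau \mid \pi) \,\big] \big]}_{\text{expected information gain}} \Big)
\]

Policies with low \(G\) are those expected both to realize preferred (rewarding) observations and to reduce uncertainty about the hidden context.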
MAB with an epistemic twist
In the MAB formulation we’ll be exploring in this problem, the agent must choose which of two possible arms to play, each of which has unknown reward statistics. These reward statistics take the form of Bernoulli distributions over two possible reward outcomes: “Loss” and “Reward”. One of the arms has probability \(p\) of yielding “Reward” and probability \((1-p)\) of yielding “Loss”; the other arm has the swapped statistics. In this example, the agent knows that the bandit has this general reward structure, but it does not know which of the two arms is the rewarding one (the arm where the reward probability is \(p\), assuming \(p \in [0.5, 1]\)).
However, we introduce an additional feature into the environment that induces an explicit trade-off in decision-making between exploration (information-seeking) and exploitation (reward-seeking). In this special “epistemic bandit” problem, a third action is available to the agent besides playing the two arms. We call this action “Get-Hint”; it allows the agent to acquire information that reveals (potentially probabilistically) which arm is the more rewarding one. There is a trade-off here because, by choosing to acquire the hint, the agent forgoes the possibility of playing an arm and thus of getting a reward at that moment. The mutual exclusivity of hint-acquisition and arm-playing imbues the system with an explore/exploit trade-off, which active inference is particularly well equipped to handle compared to simple reinforcement learning schemes (e.g. epsilon-greedy reward maximization).
##. Visualize: 3 levers, button in the center (Move-Start)
##    Play-Left       Play-Right
##         Move-Start
##         Get-Hint
2 DATA UNDERSTANDING
There is no pre-existing data to be analyzed.
3 DATA PREPARATION
There is no pre-existing data to be prepared.
4 MODELING
4.1 Narrative
Please review the narrative in section 1.
4.2 Core Elements
This section attempts to answer three important questions:
What metrics are we going to track?
What decisions do we intend to make?
What are the sources of uncertainty?
For this problem, the main metric we are interested in is the probability of observing a Reward. In addition, we want to keep track of what the hint lever reveals as well as the choices played by the agent.
The only source of uncertainty is which lever is more profitable to play: left or right.
4.3 System-Under-Steer / Environment / Generative Process
4.3.1 State variables
The state at time \(t\) of the system-under-steer/environment (envir) is given by
\[
\begin{aligned}
\breve{\mathbf{s}}_t &= (\breve{s}^{\mathrm{Context}}_{0,t}, \breve{s}^{\mathrm{Choice}}_{1,t})
\end{aligned}
\] where

\(\breve{s}^{\mathrm{Context}}_{0,t} \in \{\text{Left-Better}, \text{Right-Better}\}\) is the game context (uncontrollable), and

\(\breve{s}^{\mathrm{Choice}}_{1,t} \in \{\text{Start}, \text{Hint}, \text{Left}, \text{Right}\}\) is the current choice location (controllable).
The environment is steered by decisions/actions \(\mathbf{a}_t\). Each component of this vector is called a control factor or control state factor. The action at time \(t\) is given by
\[
\begin{aligned}
\mathbf{a}_t &= (a^{\mathrm{Null}}_{0,t}, a^{\mathrm{Choice}}_{1,t})
\end{aligned}
\] where

\(a^{\mathrm{Null}}_{0,t} \in \{\text{Null}\}\) acts on the uncontrollable factor \(\breve{s}^{\mathrm{Context}}_{0}\), and

\(a^{\mathrm{Choice}}_{1,t} \in \{\text{Move-Start}, \text{Get-Hint}, \text{Play-Left}, \text{Play-Right}\}\) acts on the controllable factor \(\breve{s}^{\mathrm{Choice}}_{1}\).
The exogenous information is not controllable by the agent. It is captured in the state factor \[\breve{s}^{\mathrm{Context}}_{0,t} \in \{\text{Left-Better}, \text{Right-Better}\}\]
4.3.4 Next State
To find the next state at each time step, the generative process starts with the transition function \(f\). To this is added exogenous information and system noise to arrive at the next state.
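In symbols (with \(\breve{\mathbf{W}}_{t+1}\) standing in for the combined exogenous information and system noise; this symbol is introduced here only for illustration):

\[
\breve{\mathbf{s}}_{t+1} = f(\breve{\mathbf{s}}_t, \mathbf{a}_t, \breve{\mathbf{W}}_{t+1})
\]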
4.3.5 Observation
To generate the observation at each time step, the generative process starts with the generating function \(g\) applied to the external state. To this is added observation or measurement noise to arrive at the observation.
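In symbols (with \(\boldsymbol{\nu}_t\) standing in for the observation/measurement noise; again introduced only for illustration):

\[
\mathbf{o}_t = g(\breve{\mathbf{s}}_t, \boldsymbol{\nu}_t)
\]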
4.3.6 Implementation of the System-Under-Steer / Environment / Generative Process
We assume that the reality of the generative process is given by the following implementation:
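The implementation cell itself is not reproduced here. The following is a minimal sketch, consistent with how EpistemicMABEnvir is used later (its constructor arguments, the s̆Context_0 attribute, and the reset()/step() interface), rather than the original code:

## Sketch only: a minimal EpistemicMABEnvir consistent with its later usage
class EpistemicMABEnvir:
    """Epistemic two-armed bandit: a hidden context, a (possibly noisy) hint, and a payoff."""
    def __init__(self, context="Right-Better", p_hint=1.0, p_reward=0.8):
        self.s̆Context_0 = context   ## 'Left-Better' or 'Right-Better' (exogenous, fixed)
        self.p_hint = p_hint        ## probability the hint reveals the true context
        self.p_reward = p_reward    ## probability the better arm pays out

    def reset(self):
        self.s̆Choice_1 = 'Start'
        return ['Null', 'Null', 'Start']   ## [oHint_0, oRew_1, oChoice_2]

    def step(self, action_label):
        if action_label == 'Move-Start':
            self.s̆Choice_1 = 'Start'
            return ['Null', 'Null', 'Start']
        if action_label == 'Get-Hint':
            self.s̆Choice_1 = 'Hint'
            ## the hint is correct with probability p_hint
            correct = 'Hint-Left' if self.s̆Context_0 == 'Left-Better' else 'Hint-Right'
            wrong = 'Hint-Right' if correct == 'Hint-Left' else 'Hint-Left'
            hint = correct if np.random.rand() < self.p_hint else wrong
            return [hint, 'Null', 'Hint']
        ## otherwise the agent plays one of the two arms ('Play-Left' or 'Play-Right')
        arm = 'Left' if action_label == 'Play-Left' else 'Right'
        self.s̆Choice_1 = arm
        p_win = self.p_reward if self.s̆Context_0 == f'{arm}-Better' else 1.0 - self.p_reward
        reward = 'Reward' if np.random.rand() < p_win else 'Loss'
        return ['Null', reward, arm]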
According to the agent, the state of the system-under-steer/environment/generative process will be \(\mathbf{s}_t\), rather than \(\mathbf{\breve s}_t\). It is given by
\[
\begin{aligned}
\mathbf{s}_t &= (s^{\mathrm{Context}}_{0,t}, s^{\mathrm{Choice}}_{1,t})
\end{aligned}
\] where

\(s^{\mathrm{Context}}_{0,t} \in \{\text{Left-Better}, \text{Right-Better}\}\) and

\(s^{\mathrm{Choice}}_{1,t} \in \{\text{Start}, \text{Hint}, \text{Left}, \text{Right}\}\).
According to the agent, the action on the environment at time \(t\) will be represented by \(\mathbf{u}_t\), also known as the control state of the agent.
4.5.3 Implementation of the Agent / Generative Model / Internal Model
4.5.3.1 Observation likelihood matrix, \(\mathbf A\) or \(P(o_t\mid s_t)\)
print(_s_dims)
print(_o_dims)
[2, 4]
[3, 3, 4]
_A_shapes = [[o_dim] + _s_dims for o_dim in _o_dims]
_A_shapes
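The cell that allocates _A and fills the Hint modality (_A[0]) is not shown before the plot below. A minimal sketch follows; _p_hint is an assumed parameter name and value (analogous to _p_reward used for the Reward modality), not necessarily what the original notebook uses:

## Sketch only (assumed reconstruction of an unshown cell)
_A = utils.obj_array_zeros(_A_shapes)

_p_hint = 0.7  ## assumed hint accuracy in the generative model; the actual value is not shown
for s1_idx, s1_nam in enumerate(_lup['sChoice_1']):
    if s1_nam == 'Hint':
        ## at the Hint location, the hint reflects the context with accuracy _p_hint
        _A[0][1:, :, s1_idx] = np.array([
            [_p_hint, 1.0 - _p_hint],
            [1.0 - _p_hint, _p_hint]])
    else:
        ## at any other location, the Hint modality returns 'Null'
        _A[0][0, :, s1_idx] = 1.0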
plot_likelihood(
    _A[0][:,:,1],
    title_str="""
    Modality $o^{\mathrm{Hint}}_0$ vs Factor $s^{\mathrm{Context}}_0$ ($s^{\mathrm{Choice}}_1=\mathrm{Hint}$),
    Probability of the two hint types, for the two game contexts
    """,
    ylabels=_lup['oHint_0'],
    xlabels=_lup['sContext_0'])
## look at cuboid & see slice
## lift slice up and roll backwards
## look at slice from left
## now original cuboid rows increase left-to-right
## and original cuboid mats increase from top-to-bottom (depth)
4.5.3.1.2 Modality \(o^{\mathrm{Rew}}_1\)
_p_reward = 0.8  ## probability of getting a rewarding outcome, if you are sampling the more rewarding bandit

for s1_idx, s1_nam in enumerate(_lup['sChoice_1']):
    if s1_nam == 'Start':
        _A[1][0, :, s1_idx] = 1.0
    elif s1_nam == 'Hint':
        _A[1][0, :, s1_idx] = 1.0
    elif s1_nam == 'Left':
        _A[1][1:, :, s1_idx] = np.array([
            [1.0 - _p_reward, _p_reward],
            [_p_reward, 1.0 - _p_reward]])
    elif s1_nam == 'Right':
        _A[1][1:, :, s1_idx] = np.array([
            [_p_reward, 1.0 - _p_reward],
            [1.0 - _p_reward, _p_reward]])

_A[1]  ## 3 matrices of 2x4
plot_likelihood(
    _A[1][:,:,2],
    title_str="""
    Modality $o^{\mathrm{Rew}}_1$ vs Factor $s^{\mathrm{Context}}_0$ ($s^{\mathrm{Choice}}_1=\mathrm{Left}$),
    Payoff structure if playing the Left Arm, for the two contexts
    """,
    ylabels=_lup['oRew_1'],
    xlabels=_lup['sContext_0'])
4.5.3.1.3 Modality \(o^{\mathrm{Choice}}_2\)
for s1_idx in range(len(_lup['sChoice_1'])):
    _A[2][s1_idx, :, s1_idx] = 1.0

_A[2]  ## 4 matrices of 2x4

"""Condition on context (first hidden state factor) and display
   the remaining indices (outcome and choice state)
"""
plot_likelihood(
    _A[2][:,0,:],
    title_str="""
    Modality $o^{\mathrm{Choice}}_2$ vs $s^{\mathrm{Choice}}_1$ ($s^{\mathrm{Context}}_0=\mathrm{Left-Better}$),
    Mapping between sensed states and true states""",
    ylabels=_lup['oChoice_2'],
    xlabels=_lup['sChoice_1'],
)
utils.is_normalized(_A)
True
4.5.3.2 Transition likelihood matrix, \(\mathbf B\) or \(P(s_{t}\mid s_{t-1}, u_{t-1})\)
print(_a_dims)
print(_s_dims)
[1, 4]
[2, 4]
_B_shapes = [[s_dim, s_dim, _a_dims[f]] for f, s_dim in enumerate(_s_dims)]
_B_shapes
[[2, 2, 1], [4, 4, 4]]
_B = utils.obj_array_zeros(_B_shapes)
_B
## 2 matrices of 2x1
## 4 matrices of 4x4
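The cells that actually populate _B and that allocate _C and _D are not shown above. The following is a minimal sketch, consistent with the factor and action labels in _lup, with the preference cell that follows, and with the printed _D values further below; it is a reconstruction under stated assumptions, not necessarily the original code:

## Sketch only (assumed reconstruction of unshown cells)
## _B[0]: sContext_0 is uncontrollable and never changes -> identity under the single 'Null' action
_B[0][:, :, 0] = np.eye(len(_lup['sContext_0']))

## _B[1]: each aChoice_1 action deterministically moves sChoice_1 to the matching location,
## regardless of the previous location (action index k -> next-state index k)
for a_idx in range(len(_lup['aChoice_1'])):
    _B[1][a_idx, :, a_idx] = 1.0

## _C: flat (zero) log-preferences for every modality, to be shaped in the next cell
_C = utils.obj_array_zeros(_o_dims)

## _D: uniform prior over the context, certainty about starting at 'Start'
_D = utils.obj_array_uniform(_s_dims)
_D[1] = utils.onehot(_lup['sChoice_1'].index('Start'), _s_dims[1])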
from pymdp.maths import softmax

## leave _C[0]
_C[1][1] = -4.0  ##. do not want to see the oRew_1 == 'Loss' observation
_C[1][2] = 2.0   ##. want to see the oRew_1 == 'Reward' observation
## leave _C[2]
_C
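The softmax imported above is not used in this cell. Since _C holds unnormalized log-preferences, one optional way to use it is to view the reward preferences as a proper distribution (a usage sketch, not part of the original notebook):

print(softmax(_C[1]))  ## relative preference over ['Null', 'Loss', 'Reward']
## with _C[1] = [0., -4., 2.] this is roughly [0.12, 0.00, 0.88]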
print(f'Beliefs about which arm is better: {_D[0]}')
print(f'Beliefs about starting location: {_D[1]}')
Beliefs about which arm is better: [0.5 0.5]
Beliefs about starting location: [1. 0. 0. 0.]
4.6 Agent Evaluation
Now we’ll write a function that takes the agent, the environment, and a time length, and runs the active inference loop.
def get_policy_labels(policies):
    policy_len = len(policies[0])  ## use first policy to find policy_len
    labels = []
    for p in range(len(policies)):  ## for each action-sequence / policy
        lab = ''
        for a in range(policy_len):  ## for each action
            if a < policy_len - 1:
                lab = lab + f"{[_lup['aNull_0'][policies[p][a][0]], _lup['aChoice_1'][policies[p][a][1]]]} ---> "
            else:
                lab = lab + f"{[_lup['aNull_0'][policies[p][a][0]], _lup['aChoice_1'][policies[p][a][1]]]}"
        labels.append(lab)
    return labels

def run_active_inference_loop(my_agt, my_env, T=5):
    """ Initialize the first observation """
    ## agent observes itself seeing a `Null` hint, getting a `Null` reward,
    ## and seeing itself in the `Start` location
    obs_lab = my_env.reset()  ## reset the environment and get an initial observation
    obs_idx = [
        _lup['oHint_0'].index(obs_lab[0]),
        _lup['oRew_1'].index(obs_lab[1]),
        _lup['oChoice_2'].index(obs_lab[2])]
    print(f'Number of policies: {math.prod(_a_dims)**my_agt.policy_len=}')
    print(f'\tbecause {_a_dims=}, {my_agt.policy_len=}')
    for t in range(T):  ##. with a slide
        print(f"\n===================== Time {t}: =====================")
        qIsI = my_agt.infer_states(obs_idx)  ##. infer
        plot_beliefs(qIsI[0],
            title_str=r"$q(s^{\mathrm{Context}}_{0,t})$" + f" at t = {t}",
            xlabels=_lup['sContext_0'])
        plot_beliefs(qIsI[1],
            title_str=r"$q(s^{\mathrm{Choice}}_{1,t})$" + f" at t = {t}",
            xlabels=_lup['sChoice_1'])
        for sfi, sfn in enumerate(_s_fac_names):
            _s_facs[sfn].append(_lup[sfn][int(np.argmax(qIsI[sfi].round(3).T))])

        qIpiI, efe = my_agt.infer_policies()
        ## efe
        print(f'{len(efe)=}'); print(f'EFE: {np.round(efe, 2)}')
        print(f'{np.argmin(efe)=}'); print(f'{np.min(efe)=}')
        lowest_efe_idx = np.round(np.argmin(efe), 2)
        print(f'LOWEST efe in {lowest_efe_idx}: {efe[lowest_efe_idx]}')
        plot_efe(efe, title_str=r"$G$" + f" at t = {t}", size=(12,1))
        _GNegs.append(efe)
        ## qIpiI
        print(f'{len(qIpiI)=}'); print(f'qIpiI: {np.round(qIpiI, 2)}')
        print(f'{np.argmax(qIpiI)=}'); print(f'{np.max(qIpiI)=}')
        print(f'BEST qIpiI: {np.round(qIpiI[lowest_efe_idx], 2)}')
        ps = my_agt.policies
        plot_beliefs(qIpiI,
            title_str=r"$q(\pi_{t})$" + f" at t = {t}",
            xlabels=get_policy_labels(my_agt.policies),
            size=(12,2))
        _qIpiIs.append(qIpiI)

        act_idx = my_agt.sample_action()  ##. Act future &
        act_idx_controllable = int(act_idx[1])
        act_lab_controllable = _lup['aChoice_1'][act_idx_controllable]  ##.
        for afi, afn in enumerate(_a_fac_names):
            _a_facs[afn].append(_a_val_names[afi][int(act_idx[afi])])

        obs_lab = my_env.step(act_lab_controllable)  ##. next observe, howto
        obs_idx = [
            _lup['oHint_0'].index(obs_lab[0]),
            _lup['oRew_1'].index(obs_lab[1]),
            _lup['oChoice_2'].index(obs_lab[2])]
        for omi, omn in enumerate(_o_mod_names):
            _o_mods[omn].append(_o_val_names[omi][obs_idx[omi]])

        print(f'Action at time {t}: {act_lab_controllable}')
        print(f'Reward at time {t}: {obs_lab[1]}')
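A quick illustration of get_policy_labels: each pymdp policy is stored as an array of action indices with shape (policy_len, number of control factors), so a single hand-written two-step policy maps to a label as shown in the comment below.

toy_policy = [np.array([[0, 1], [0, 3]])]  ## step 1: (Null, Get-Hint); step 2: (Null, Play-Right)
print(get_policy_labels(toy_policy))
## ["['Null', 'Get-Hint'] ---> ['Null', 'Play-Right']"]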
_a_fac_names = ['aNull_0', 'aChoice_1']            ## control factor names
_s_fac_names = ['sContext_0', 'sChoice_1']         ## state factor names (generative model)
_s̆_fac_names = ['s̆Context_0', 's̆Choice_1']         ## state factor names (generative process)
_o_mod_names = ['oHint_0', 'oRew_1', 'oChoice_2']  ## observation modality names

_a_val_names = [_lup[cfn] for cfn in _a_fac_names]; print(f'{_a_val_names=}')
_s_val_names = [_lup[sfn] for sfn in _s_fac_names]; print(f'{_s_val_names=}')
_s̆_val_names = [_lup[sfn] for sfn in _s̆_fac_names]; print(f'{_s̆_val_names=}')
_o_val_names = [_lup[omn] for omn in _o_mod_names]; print(f'{_o_val_names=}')
Now all we have to do is define the bandit environment, choose the length of the simulation, and run the function we wrote above.
Try playing with the hint accuracy and/or reward statistics of the environment - remember these are different from the agent’s representation of the reward statistics (i.e. the agent’s generative model, e.g. the A or B matrices).
## Create an envir
## this is the "true" accuracy of the hint - i.e. how often does the hint actually
## signal which arm is better. REMEMBER: THIS IS INDEPENDENT OF HOW YOU PARAMETERIZE
## THE A MATRIX FOR THE HINT MODALITY
_p_hint_env = 1.0

## this is the "true" reward probability - i.e. how often does the better arm actually
## return a reward, as opposed to a loss. REMEMBER: THIS IS INDEPENDENT OF HOW YOU
## PARAMETERIZE THE A MATRIX FOR THE REWARD MODALITY
_p_reward_env = 0.7

_my_envir = EpistemicMABEnvir(
    context="Right-Better",
    p_hint=_p_hint_env,
    p_reward=_p_reward_env)
print(f'Context: {_my_envir.s̆Context_0}')
Context: Right-Better
## Create an agent
## list of the indices of the hidden state factors that are controllable
## sContext_0: uncontrollable; sChoice_1: controllable
_controllable_indices = [1]

_my_agent = Agent(
    A=_A, B=_B, C=_C, D=_D,
    ## number of action values (aChoice_1); aNull_0 has a single value, so 1*4=4
    # policy_len=1, ##. 4**1 = 4 ele
    policy_len=2,   ##. 4**2 = 16 ele
    # policy_len=3, ##. 4**3 = 64 ele
    # policy_len=4, ##. 4**4 = 256 ele
    control_fac_idx=_controllable_indices)
_my_agent
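The cell that actually runs the simulation is not shown. A usage sketch follows; the per-run history containers referenced inside run_active_inference_loop and the simulation length T=5 are assumptions:

## Sketch only: initialize the history containers used inside the loop, then run it
_s_facs = {sfn: [] for sfn in _s_fac_names}
_a_facs = {afn: [] for afn in _a_fac_names}
_o_mods = {omn: [] for omn in _o_mod_names}
_qIpiIs, _GNegs = [], []

run_active_inference_loop(_my_agent, _my_envir, T=5)  ## T=5 is an assumed simulation length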
Now let’s try manipulating the agent’s prior preferences over reward observations (\(\mathbf{C}[1]\)) in order to examine the tension between exploration and exploitation.
## Create an envir
## re-initialize the environment -- this time, the hint is not always accurate
## (`p_hint = 0.7`)
## _my_envir = ThreeArmedBanditEnvir(p_hint=0.8, p_reward=0.8)  ##. previously: p_hint=1.0 & p_reward=0.7
_my_envir = EpistemicMABEnvir(
    context="Right-Better",
    # context="Left-Better",
    p_hint=0.7,
    p_reward=0.8)  ##. previously: p_hint=1.0 & p_reward=0.7
print(f'Context: {_my_envir.s̆Context_0}')
Context: Right-Better
## Create an agent
## change the 'shape' of the agent's reward function
## makes the Loss "less aversive" than before (higher prior probability assigned
## to seeing the Loss outcome). This should make the agent less
## risk-averse / more willing to explore both arms, under uncertainty
_C[1][1] = 0.0  ##. previously -4.0 for oRew_1 == 'Loss' observation

## redefine the agent with the new preferences
_my_agent = Agent(
    A=_A, B=_B, C=_C, D=_D,
    ## number of action values (aChoice_1); aNull_0 has a single value, 1*4=4
    # policy_len=1, ##. 4**1 = 4 ele
    policy_len=2,   ##. 4**2 = 16 ele
    # policy_len=3, ##. 4**3 = 64 ele
    # policy_len=4, ##. 4**4 = 256 ele
    control_fac_idx=_controllable_indices  ##.
)