Hi all,
First time posting in the forum, so I want to start by saying thanks for the amazing tool! I’ve already used it for one project, where it enabled some really cool things we wouldn’t have been able to do otherwise.
I am now applying it to another project but am running into a few issues. I am implementing a multilevel cumulative link model whose simplest version would be something like `outcome ~ 1 + (1 | area_id/household_id)`, where the outcome is a categorical variable with 3 levels. I’m training the amortizer to handle a varying number of areas, households, and observations per household, and I’ve successfully set up a pipeline that simulates and trains this using a bayesflow workflow.
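Concretely, the generative model I’m simulating is a cumulative logit with two ordered cutpoints $c_1 < c_2$:

$$
\Pr(y_i \le k) = \operatorname{logit}^{-1}\!\left(c_k - \eta_i\right), \qquad \eta_i = u_{\mathrm{area}[i]} + u_{\mathrm{hh}[i]},
$$

with $u_{\mathrm{area}} \sim \mathcal{N}(0, \sigma_{\mathrm{area}}^2)$, $u_{\mathrm{hh}} \sim \mathcal{N}(0, \sigma_{\mathrm{hh}}^2)$, $\sigma_{\mathrm{area}}, \sigma_{\mathrm{hh}} \sim \mathrm{Exponential}(1)$, and the cutpoints drawn as two sorted $\mathcal{N}(0, 2^2)$ draws (see the prior and likelihood code below).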
But the problem is that the global loss / error remains quite high after training (~5), and the posterior samples I get don’t seem very good (compared to an equivalent model in brms). It’s possible my current version is simply undertrained and needs more training (the loss was perhaps still creeping down from ~5, but not very quickly). But I also have a hunch I may be missing something obvious that is causing issues.
One area where I was a bit unsure (and maybe this is the problem?) was how I handled masking / missing data. Following what I had seen recommended elsewhere, in my final configured summary conditions missing data is indicated both with an implausible fill value (-1) and with an ‘availability mask’ channel, where 1 indicates real data and 0 indicates missing (padded) data.
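To make that concrete, here is a toy version of the convention (hypothetical numbers, one feature column plus the mask column):

```python
import numpy as np

# Toy illustration of the padding convention: 4 observation slots,
# 2 real observations. Padded slots carry the implausible fill value
# -1 in the feature column and 0 in the availability-mask column.
MASK_VALUE = -1.0
slots = np.full((4, 2), MASK_VALUE, dtype=np.float32)  # (slot, [feature, mask])
slots[:, 1] = 0.0       # mask defaults to 0 = missing
slots[0] = [2.0, 1.0]   # real observation (category 2), mask = 1
slots[1] = [0.0, 1.0]   # real observation (category 0), mask = 1
print(slots)
# [[ 2.  1.]
#  [ 0.  1.]
#  [-1.  0.]
#  [-1.  0.]]
```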
Here’s the main setup of my model (I can also provide the amortizer / training code if helpful):
```python
import os
os.environ["NUMBA_THREADING_LAYER"] = "tbb"

import bayesflow.simulation as simulation
import numpy as np
from numba import njit, prange
from tensorflow.keras.utils import to_categorical

# ----------------- Config ----------------- #
MIN_AREAS = 100
MAX_AREAS = 600
MIN_HH = 1
MAX_HH = 10
MAX_N_HH = 8

rng = np.random.default_rng()

# ----------------- Context generator ----------------- #
def sample_trunc_geometric_cap(p, max_k):
    # Geometric draw truncated at max_k
    k = rng.geometric(p)
    return min(k, max_k)

def random_num_obs():
    # Draw a random panel structure: number of areas, households per
    # area, and observations per household, plus flat index vectors
    n_areas = rng.integers(MIN_AREAS, MAX_AREAS + 1)
    n_hhs_per_area = []
    n_obs_per_hh = []
    for area in range(n_areas):
        n_hhs = rng.integers(MIN_HH, MAX_HH + 1)
        n_hhs_per_area.append(n_hhs)
        area_obs_per_hh = [sample_trunc_geometric_cap(0.5, MAX_N_HH) for _ in range(n_hhs)]
        n_obs_per_hh.append(area_obs_per_hh)

    # selector_array[a, h, o] == 1 marks a real (non-padded) observation
    selector_array = np.zeros((MAX_AREAS, MAX_HH, MAX_N_HH), dtype=int)
    area_indices, household_indices = [], []
    global_household_index = 0
    for area in range(n_areas):
        n_hhs = n_hhs_per_area[area]
        for hh in range(n_hhs):
            n_obs = n_obs_per_hh[area][hh]
            selector_array[area, hh, :n_obs] = 1
            area_indices.extend([area] * n_obs)
            household_indices.extend([global_household_index] * n_obs)
            global_household_index += 1
    return selector_array, np.array(area_indices), np.array(household_indices)
```
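As a quick sanity check on the context generator (illustrative only):

```python
# One draw of the panel structure
selector, area_idx, hh_idx = random_num_obs()
print(selector.shape)            # (600, 10, 8): padded (area, household, slot) grid
print(int(selector.sum()))       # total number of real observations
print(len(np.unique(hh_idx)))    # number of realised households
```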
```python
# ----------------- Priors ----------------- #
def sample_hyper_params():
    # SDs of the area and household random effects
    sigma_area = rng.exponential(1.0)
    sigma_hh = rng.exponential(1.0)
    return np.array([sigma_area, sigma_hh], dtype=np.float32)

def sample_local_params(hyper_params):
    sigma_area, sigma_hh = hyper_params
    raw_area = rng.normal(0.0, sigma_area, size=MAX_AREAS)
    raw_hh = rng.normal(0.0, sigma_hh, size=MAX_AREAS * MAX_HH)
    # Area effects broadcast over households; household deviations per cell
    return np.tile(raw_area[:, None], (1, MAX_HH)), raw_hh.reshape(MAX_AREAS, MAX_HH)

def sample_shared_params(*args, **kwargs):
    # Two ordered cutpoints for the 3-category cumulative logit
    cut1, cut2 = sorted([rng.normal(0, 2), rng.normal(0, 2)])
    return np.array([cut1, cut2])

# ----------------- Likelihood ----------------- #
def sample_likelihood(theta, nonbatchable_context, *args):
    local_params, shared_params = theta
    u_area, u_hh_dev = local_params
    selector_array, _, _ = nonbatchable_context
    # Repeat each (area, household) effect across its observation slots,
    # then keep only the slots the selector marks as real
    u_area_rep = np.repeat(u_area[:, :, None], MAX_N_HH, axis=-1)
    u_hh_rep = np.repeat(u_hh_dev[:, :, None], MAX_N_HH, axis=-1)
    u_area_obs = u_area_rep.flatten()[selector_array.flatten() == 1]
    u_hh_obs = u_hh_rep.flatten()[selector_array.flatten() == 1]
    cut1, cut2 = shared_params
    eta = u_area_obs + u_hh_obs
    return _sample_likelihood_jit(eta.astype(np.float64), cut1, cut2)

@njit(parallel=True)
def _sample_likelihood_jit(eta, cut1, cut2):
    # Inverse-CDF draw from the ordered logistic: P(y <= k) = logistic(cut_k - eta)
    N = eta.shape[0]
    out = np.empty(N, dtype=np.int32)
    for i in prange(N):
        u = np.random.rand()
        p1 = 1.0 / (1.0 + np.exp(-(cut1 - eta[i])))
        p2 = 1.0 / (1.0 + np.exp(-(cut2 - eta[i])))
        if u < p1:
            out[i] = 0
        elif u < p2:
            out[i] = 1
        else:
            out[i] = 2
    return out
```
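In case it’s useful for checking the sampler, here is a vectorized NumPy equivalent (just a sketch for cross-checking, not part of the training pipeline; for fixed `eta` and cutpoints the category frequencies from both versions should agree):

```python
def sample_likelihood_np(eta, cut1, cut2):
    # Vectorized ordered-logistic draw: P(y <= k) = logistic(cut_k - eta)
    u = rng.random(eta.shape[0])
    p1 = 1.0 / (1.0 + np.exp(-(cut1 - eta)))
    p2 = 1.0 / (1.0 + np.exp(-(cut2 - eta)))
    return np.where(u < p1, 0, np.where(u < p2, 1, 2)).astype(np.int32)
```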
```python
# ----------------- Configurator ----------------- #
def configurator(forward_dict: dict):
    MASK_VALUE = -1.0
    sim_data = np.asarray(forward_dict["sim_data"])
    context_selector, area_idx_flat, hh_idx_flat = forward_dict["sim_non_batchable_context"]
    context_selector = np.asarray(context_selector).astype(bool)
    B = sim_data.shape[0]
    D_out = 6  # 3 one-hot + 2 indices + 1 mask

    # Map each occupied (area, household) cell to a dense group index g
    hh_exists = context_selector.any(axis=2)
    G = int(hh_exists.sum())
    ah_to_g, g_order = {}, []
    valid_flat_idx = np.flatnonzero(context_selector.ravel())
    for flat in valid_flat_idx:
        a, h, _o = np.unravel_index(flat, (MAX_AREAS, MAX_HH, MAX_N_HH))
        if (a, h) not in ah_to_g:
            ah_to_g[(a, h)] = len(ah_to_g)
            g_order.append((a, h))

    # Padded slots keep MASK_VALUE in the index columns, 0 elsewhere
    summary_conditions = np.full((B, G, MAX_N_HH, D_out), MASK_VALUE, dtype=np.float32)
    summary_conditions[..., :3] = 0.0
    summary_conditions[..., 5] = 0.0  # availability mask: 0 = missing

    area_col = np.asarray(area_idx_flat).reshape(-1, 1).astype(np.float32)
    hh_col = np.asarray(hh_idx_flat).reshape(-1, 1).astype(np.float32)
    for b in range(B):
        labels = sim_data[b].astype(int)
        one_hot = to_categorical(labels, num_classes=3).astype(np.float32)
        feats_stream = np.concatenate([one_hot, area_col, hh_col], axis=1)
        k = 0
        for flat in valid_flat_idx:
            a, h, o = np.unravel_index(flat, (MAX_AREAS, MAX_HH, MAX_N_HH))
            g = ah_to_g[(a, h)]
            summary_conditions[b, g, int(o), :5] = feats_stream[k]
            summary_conditions[b, g, int(o), 5] = 1.0  # 1 = real observation
            k += 1

    # Reorder local parameter draws to match the dense group ordering
    local_params = np.asarray(forward_dict["local_prior_draws"], dtype=np.float32)
    local_params = np.transpose(local_params, (0, 2, 3, 1)).copy()
    local_params = np.stack([local_params[:, a, h, :] for (a, h) in g_order], axis=1)

    hyper_params = np.asarray(forward_dict["hyper_prior_draws"], dtype=np.float32)
    shared_params = np.asarray(forward_dict["shared_prior_draws"], dtype=np.float32)

    # Global condition: fraction of the padded grid that is occupied
    n_households = int((context_selector.sum(axis=2) > 0).sum())
    frac_households = n_households / float(MAX_AREAS * MAX_HH)
    direct_global_conditions = np.ones((B, 1), dtype=np.float32) * frac_households

    # Local condition: sqrt of the number of observations per household
    filled_obs_per_hh = context_selector.sum(axis=2).astype(np.float32)
    per_hh_sqrt = np.array([np.sqrt(filled_obs_per_hh[a, h]) for (a, h) in g_order], dtype=np.float32)
    direct_local_conditions = np.broadcast_to(per_hh_sqrt[None, :, None], (B, G, 1))

    return {
        "summary_conditions": summary_conditions,
        "hyper_parameters": hyper_params,
        "local_parameters": local_params,
        "shared_parameters": shared_params,
        "direct_global_conditions": direct_global_conditions,
        "direct_local_conditions": direct_local_conditions,
    }
```
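For completeness, here is a minimal way to exercise the pipeline by hand (a hypothetical batch of size 2, just to confirm the configurator runs and produces the shapes listed below; during training the actual forward_dict comes from the bayesflow generative model):

```python
# Build one batch of size 2 by hand and push it through the configurator
ctx = random_num_obs()
hyper = [sample_hyper_params() for _ in range(2)]
local = [sample_local_params(h) for h in hyper]
shared = [sample_shared_params() for _ in range(2)]
sims = [sample_likelihood((local[b], shared[b]), ctx) for b in range(2)]

forward_dict = {
    "sim_data": np.stack(sims),
    "sim_non_batchable_context": ctx,
    "hyper_prior_draws": np.stack(hyper),
    "local_prior_draws": np.stack([np.stack(l) for l in local]),  # (B, 2, MAX_AREAS, MAX_HH)
    "shared_prior_draws": np.stack(shared),
}
out = configurator(forward_dict)
for key, val in out.items():
    print(key, val.shape)
```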
If it helps at all, here are the shapes of the final outputs from the configurator:

```
summary_conditions:        (batch, n_groups, MAX_N_HH, 6)  # 3 one-hot + 2 indices (area, hh) + 1 mask
hyper_parameters:          (batch, 2)
local_parameters:          (batch, n_groups, 2)
shared_parameters:         (batch, 2)
direct_global_conditions:  (batch, 1)
direct_local_conditions:   (batch, n_groups, 1)
```

where `n_groups` is the number of occupied (area, household) cells, i.e. the total number of realised households.
Thanks in advance!