1 Setup

2 Helper functions

2.1 Demographics

2.2 Earth Mover’s Distance (Wasserstein distance)

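For every trial, agent, and agent type, compute_emd() turns the cells visited on the return path into a probability distribution over grid locations (separately for the humans and for each model) and scores each model by the Earth Mover's Distance between its distribution and the human one, computed with wasserstein() on weighted point patterns (wpp()) from the transport package.
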
compute_emd = function(path_data) {
  models = setdiff(unique(path_data$model), 'human')
  data = expand.grid(trial = trials,
                     agent = agents,
                     agent_type = agent_types,
                     model = models,
                     emd = NA)
  
  for (this_trial in trials) {
    for (this_agent in agents) {
      for (this_agent_type in agent_types) {
        d = path_data %>% 
          filter(agent == this_agent,
                 trial == this_trial,
                 path_type == 'return path') %>%
          group_by(agent_type, model, x, y) %>%
          summarise(n = n()) %>%
          ungroup(x, y) %>% 
          mutate(p = n / sum(n)) %>% 
          ungroup()
        d.human = d %>%
          filter(model == 'human', agent_type == this_agent_type)
        for (this_model in models) {
          d.model = d %>%
            filter(model == this_model, agent_type == this_agent_type)
          if (length(d.model$p) > 0) {
            this_w = wasserstein(wpp(data.matrix(d.human %>% select(x, y)), d.human$p),
                                 wpp(data.matrix(d.model %>% select(x, y)), d.model$p))
            data = data %>%
              mutate(emd = ifelse((trial == this_trial) &
                                    (agent == this_agent) &
                                    (agent_type == this_agent_type) &
                                    (model == this_model),
                                  this_w,
                                  emd)
              )
          }
        }
      }
    }
  }
  
  return(data)
}
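
As a quick illustration of the quantity being computed (toy numbers, not experiment data): two point patterns whose mass all has to travel exactly one grid cell have an EMD of 1.

# Toy illustration only, unrelated to the experiment data.
toy.a = wpp(matrix(c(0, 0,
                     1, 0), ncol = 2, byrow = T), c(0.5, 0.5))
toy.b = wpp(matrix(c(0, 1,
                     1, 1), ncol = 2, byrow = T), c(0.5, 0.5))
wasserstein(toy.a, toy.b)  # = 1: all of the mass moves a distance of one cell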

2.3 Miscellaneous

3 Experiment 1 (Suspect)

3.1 Data

3.1.1 Trial images

images.suspect = data.frame(trial = trials) %>%
  mutate(grob = map(.x = trial,
                    .f = ~ readPNG(str_c(images_dir, 'exp1_suspect/images/',
                                          .x, '_A1.png'))))

3.1.2 Humans

data.suspect.human = read_csv(str_c(data_dir, 'exp1_suspect/humans/human_trials.csv'),
                              show_col_types = F)

data.suspect.feedback = read_csv(str_c(data_dir, 'exp1_suspect/humans/human_feedback.csv'),
                              show_col_types = F)

data.suspect.feedback %>%
  select(id, agent_type, prediction_factors) %>%
  datatable(rownames = F)

Demographics:

print_demographics(data.suspect.feedback)
## # A tibble: 3 × 2
##   gender         n
##   <chr>      <int>
## 1 Female        39
## 2 Male          56
## 3 Non-binary     5
## [1] "age: M = 37, SD = 12"
## # A tibble: 8 × 2
##   race                              n
##   <chr>                         <int>
## 1 American Indian/Alaska Native     2
## 2 Asian                            16
## 3 Biracial                          2
## 4 Black/African American           11
## 5 Hispanic                          1
## 6 Multiracial                       1
## 7 Puerto Rican                      1
## 8 White                            66
## # A tibble: 2 × 2
##   ethnicity        n
##   <chr>        <int>
## 1 Hispanic        16
## 2 Non-Hispanic    84
## [1] "time: 13.2 minutes (SD = 6.7)"

3.1.3 RSM

Best-fitting parameters from the grid search (a sketch of the procedure follows below):

best_w.suspect.naive = 0.7
best_temp.suspect.naive = 0.01
best_w.suspect.sophisticated = 0.7
best_temp.suspect.sophisticated = 0.05
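
The grid-search code itself lives in the folded helper sections. The following is only a rough sketch of the kind of search these values come from, under the assumption that each (w, temperature) pair is scored by the mean EMD between the model's and the humans' return paths, separately for naive and sophisticated agents; the parameter ranges shown are illustrative.

# Hypothetical sketch of the grid search (illustrative ranges; the scoring
# criterion is an assumption, not taken from the actual helper code).
grid.suspect = expand.grid(w = seq(0.1, 0.9, 0.1),
                           temp = c(0.01, 0.05, 0.1, 0.2))
scores.suspect = map2_dfr(
  grid.suspect$w, grid.suspect$temp,
  function(w, temp) {
    combine_suspect_data(list('human' = data.suspect.human,
                              'rsm' = import_suspect_model_data(w, temp))) %>%
      get_suspect_paths(0.1) %>%
      compute_emd() %>%
      group_by(agent_type) %>%
      summarise(emd = mean(emd, na.rm = T), .groups = 'drop') %>%
      mutate(w = w, temp = temp)
  })
scores.suspect %>%
  group_by(agent_type) %>%
  slice_min(emd, n = 1)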

data.suspect.rsm = import_suspect_model_data(best_w.suspect.naive,
                                             best_temp.suspect.naive) %>%
  filter(agent_type == 'naive') %>%
  rbind(import_suspect_model_data(best_w.suspect.sophisticated,
                                  best_temp.suspect.sophisticated) %>%
          filter(agent_type == 'sophisticated'))

3.1.4 Uniform simulation

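# The same uniform simulation runs are imported twice and labeled once as
# 'naive' and once as 'sophisticated', so that they can be compared against
# both human conditions.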
data.suspect.unif = import_suspect_data('uniform/') %>%
  mutate(agent_type = 'naive') %>%
  rbind(import_suspect_data('uniform/') %>%
          mutate(agent_type = 'sophisticated'))

3.1.5 GPT-4o

data.suspect.gpt = read_csv(str_c(data_dir, 'exp1_suspect/gpt4o/gpt4o.csv'),
                            show_col_types = F)
data.suspect.gpt %>%
  summarise(n = n(),
            n_response = sum(response_valid),
            p_response = n_response/n,
            n_path = sum(path_valid),
            p_path = n_path/n)
## # A tibble: 1 × 5
##       n n_response p_response n_path p_path
##   <int>      <int>      <dbl>  <int>  <dbl>
## 1  3600       3351      0.931    997  0.277
data.suspect.gpt = data.suspect.gpt %>%
  filter(path_valid) %>%
  mutate(is_return = (path_type == 2),
         path = gsub(" ", "", path)) %>%
  select(id=query, trial, agent_type, agent, path, is_return)

3.1.6 Combined

data.suspect = combine_suspect_data(list(
  'human' = data.suspect.human,
  'unif' = data.suspect.unif,
  'rsm' = data.suspect.rsm,
  'gpt4o' = data.suspect.gpt
))
data.suspect.paths = get_suspect_paths(data.suspect, 0.1)
data.suspect.emd = compute_emd(data.suspect.paths)

3.2 Analysis

3.2.1 Path length

fit.suspect.path_len.human = brm(
  formula = path_len ~ 1 + agent_type + (1 | id) + (1 | trial),
  data = data.suspect %>%
    filter(model == 'human'),
  iter = 2000,
  seed = 1,
  file = 'cache/fit.suspect.path_len.human'
)

summary(fit.suspect.path_len.human)
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: path_len ~ 1 + agent_type + (1 | id) + (1 | trial) 
##    Data: data.suspect %>% filter(model == "human") (Number of observations: 3600) 
##   Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
##          total post-warmup draws = 4000
## 
## Group-Level Effects: 
## ~id (Number of levels: 100) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.99      0.16     1.70     2.33 1.00      862     1693
## 
## ~trial (Number of levels: 9) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     2.14      0.59     1.32     3.61 1.01     1304     2037
## 
## Population-Level Effects: 
##                         Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept                  12.60      0.76    11.11    14.17 1.01      992
## agent_typesophisticated     2.15      0.41     1.36     2.95 1.00      741
##                         Tail_ESS
## Intercept                   1768
## agent_typesophisticated     1400
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma     3.93      0.05     3.84     4.02 1.00     6557     2831
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
fit.suspect.path_len.rsm = brm(
  formula = path_len ~ 1 + agent_type + (1 | trial),
  data = data.suspect %>%
    filter(model == 'sim'),
  iter = 2000,
  seed = 1,
  file = 'cache/fit.suspect.path_len.rsm'
)

summary(fit.suspect.path_len.rsm)
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: path_len ~ 1 + agent_type + (1 | trial) 
##    Data: data.suspect %>% filter(model == "sim") (Number of observations: 3600) 
##   Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
##          total post-warmup draws = 4000
## 
## Group-Level Effects: 
## ~trial (Number of levels: 9) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.68      0.49     1.01     2.94 1.01      791     1060
## 
## Population-Level Effects: 
##                         Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept                  12.59      0.57    11.45    13.66 1.01      840
## agent_typesophisticated     2.56      0.13     2.31     2.80 1.00     2843
##                         Tail_ESS
## Intercept                   1169
## agent_typesophisticated     2122
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma     3.76      0.04     3.68     3.85 1.00     2284     1928
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
fit.suspect.path_len.gpt = brm(
  formula = path_len ~ 1 + agent_type + (1 | trial),
  data = data.suspect %>%
    filter(model == 'gpt4o'),
  iter = 2000,
  seed = 1,
  file = 'cache/fit.suspect.path_len.gpt4o'
)

summary(fit.suspect.path_len.gpt)
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: path_len ~ 1 + agent_type + (1 | trial) 
##    Data: data.suspect %>% filter(model == "gpt4o") (Number of observations: 997) 
##   Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
##          total post-warmup draws = 4000
## 
## Group-Level Effects: 
## ~trial (Number of levels: 9) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.51      0.45     0.87     2.63 1.00     1059     1614
## 
## Population-Level Effects: 
##                         Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept                  11.80      0.54    10.68    12.88 1.00      826
## agent_typesophisticated    -0.16      0.17    -0.48     0.18 1.00     3332
##                         Tail_ESS
## Intercept                   1342
## agent_typesophisticated     2944
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma     2.75      0.06     2.63     2.88 1.00     3368     2776
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

3.2.2 EMD comparison

data.suspect.emd %>%
  group_by(model, agent_type) %>%
  summarise(mean = mean(emd, na.rm = T),
            low = smean.cl.boot(emd)[2],
            high = smean.cl.boot(emd)[3])
## # A tibble: 6 × 5
## # Groups:   model [3]
##   model agent_type     mean   low  high
##   <fct> <fct>         <dbl> <dbl> <dbl>
## 1 unif  naive         1.87  1.53  2.24 
## 2 unif  sophisticated 1.06  0.947 1.19 
## 3 rsm   naive         0.273 0.190 0.359
## 4 rsm   sophisticated 0.802 0.651 0.964
## 5 gpt4o naive         0.519 0.361 0.690
## 6 gpt4o sophisticated 1.32  1.08  1.54
fit.suspect.emd = brm(
  formula = emd ~ 1 + agent_type + model,
  data = data.suspect.emd,
  iter = 2000,
  seed = 1,
  file = 'cache/fit.suspect.emd'
)

summary(fit.suspect.emd)
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: emd ~ 1 + agent_type + model 
##    Data: data.suspect.emd (Number of observations: 105) 
##   Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
##          total post-warmup draws = 4000
## 
## Population-Level Effects: 
##                         Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept                   1.39      0.11     1.17     1.62 1.00     3930
## agent_typesophisticated     0.15      0.11    -0.07     0.37 1.00     4628
## modelrsm                   -0.93      0.14    -1.21    -0.66 1.00     3799
## modelgpt4o                 -0.56      0.14    -0.84    -0.29 1.00     3620
##                         Tail_ESS
## Intercept                   2562
## agent_typesophisticated     2935
## modelrsm                    2756
## modelgpt4o                  2618
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma     0.58      0.04     0.50     0.67 1.00     4074     3130
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
fit.suspect.emd %>%
  emmeans(pairwise ~ model)
## $emmeans
##  model emmean lower.HPD upper.HPD
##  unif   1.473     1.276     1.660
##  rsm    0.536     0.346     0.734
##  gpt4o  0.909     0.719     1.109
## 
## Results are averaged over the levels of: agent_type 
## Point estimate displayed: median 
## HPD interval probability: 0.95 
## 
## $contrasts
##  contrast     estimate lower.HPD upper.HPD
##  unif - rsm      0.934     0.671     1.219
##  unif - gpt4o    0.565     0.278     0.827
##  rsm - gpt4o    -0.370    -0.658    -0.118
## 
## Results are averaged over the levels of: agent_type 
## Point estimate displayed: median 
## HPD interval probability: 0.95

3.3 Figures

3.3.1 Path length

ggplot(data = data.suspect %>%
         mutate(model = factor(model,
                               levels = names(model_labels))) %>%
         group_by(model, trial, agent_type, path_type) %>%
         summarise(path_len = mean(path_len)),
       mapping = aes(x = agent_type,
                     y = path_len,
                     fill = agent_type,
                     group = path_type,
                     alpha = path_type)) +
  stat_summary(fun = "mean",
               geom = "bar",
               color = "black",
               linewidth = 0.1,
               position = position_dodge(width=dodge_width)) +
  geom_point(size = 2,
             shape = 21,
             color = 'gray40',
             alpha = 0.3,
             position = position_jitterdodge(dodge.width=dodge_width),
             show.legend = F) +
  stat_summary(fun.data = mean_cl_boot,
               geom = "linerange",
               color = "black",
               alpha = 1,
               linewidth = 1,
               position = position_dodge(width=dodge_width),
               show.legend = F) +
  scale_alpha_manual(values = path_type_alphas) +
  scale_y_continuous(name = 'Path length',
                     breaks = seq(0, 30, 10),
                     expand = expansion(mult = 0.01)) +
  scale_fill_manual(values = agent_type_colors) +
  coord_cartesian(clip = "off",
                  ylim = c(0, 33)) +
  facet_wrap(~ model,
             nrow = 1,
             strip.position = 'bottom',
             labeller = as_labeller(model_labels)) +
  theme(text = element_text(size = 12),
        legend.position = c(0.5, 0.97),
        legend.box = 'horizontal',
        legend.direction = 'horizontal',
        legend.text = element_text(size = 10),
        legend.title = element_blank(),
        legend.key.size = unit(8, "pt"),
        legend.margin = margin(0),
        axis.title.y = element_text(size = 12),
        axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.line.x = element_blank(),
        axis.ticks.x = element_blank(),
        plot.margin = margin(t = 0.1, r = 0, l = 0.1, b = -0.1, unit = 'cm'),
        strip.text = element_text(size = 12),
        strip.background = element_blank(),
        panel.grid.major.y = element_line())

# ggsave(str_c(figures_dir, 'suspect_path_len.pdf'), width = 5, height = 2)

3.3.2 Path traces

path_map = function(data, images_data, title, kitchen_access) {
  g = ggplot(data = data,
         mapping = aes(x = xx,
                       y = yy,
                       color = agent_type,
                       group = id)) +
    geom_custom(data = images_data,
                mapping = aes(data = grob,
                              x = -Inf, y = -Inf,
                              color = NULL, group = NULL),
                grob_fun = function(x) rasterGrob(x,
                                                  interpolate = T,
                                                  hjust = 0,
                                                  vjust = 0)) +
    annotate('rect',
             xmin = kitchen_access[1], xmax = kitchen_access[1] + 1,
             ymin = kitchen_access[2] - 1, ymax = kitchen_access[2],
             fill = 'green',
             alpha = 0.5) +
    geom_path(alpha = 0.2, linewidth = 0.2) +
    geom_point(alpha = 0.2, size = 0.5) +
    coord_fixed() +
    facet_wrap(~ agent_type, nrow = 1) +
    scale_x_continuous(limits = c(0, 15), expand = c(0, 0)) +
    scale_y_continuous(limits = c(0, 16), expand = c(0, 0)) +
    scale_color_manual(values = agent_type_colors) +
    ggtitle(title) +
    guides(color = guide_legend(override.aes = list(size = 2,
                                                    linewidth = 1,
                                                    alpha = 1))) +
    theme(plot.title = element_text(size = 16, hjust = 0.5),
          text = element_text(size = 16),
          legend.direction = 'horizontal',
          legend.title = element_blank(),
          legend.text = element_text(size = 14),
          axis.text = element_blank(),
          axis.ticks = element_blank(),
          axis.title = element_blank(),
          axis.line = element_blank(),
          strip.background = element_blank(),
          strip.text = element_blank(),
          plot.margin = margin(t = 0.1, r = 0, b = 0.1, l = 0, unit = 'cm'))
  
  return(g)
}
select_trial = 'snack2'    # example trial shown in the path-trace figure
select_agent = 'A'         # example agent shown in the path-trace figure
kitchen_access = c(5, 12)  # grid cell of the kitchen access point (highlighted in green)
path_map.human = path_map(
  data.suspect.paths %>%
    filter(trial == select_trial, model == 'human',
           agent == select_agent, path_type == 'return path'),
  images.suspect %>%
   filter(trial == select_trial),
  'Humans',
  kitchen_access
)
path_map.sim = path_map(
  data.suspect.paths %>%
    filter(trial == select_trial, model == 'rsm',
           agent == select_agent, path_type == 'return path'),
  images.suspect %>%
    filter(trial == select_trial),
  'RSM',
  kitchen_access
)
path_map.gpt = path_map(
  data.suspect.paths %>%
    filter(trial == select_trial, model == 'gpt4o',
           agent == select_agent, path_type == 'return path'),
  images.suspect %>%
    filter(trial == select_trial),
  'GPT-4o',
  kitchen_access
)
path_map.unif = path_map(
  data.suspect.paths %>%
    filter(trial == select_trial, model == 'unif',
           agent == select_agent, path_type == 'return path'),
  images.suspect %>%
    filter(trial == select_trial),
  'Uniform simulation',
  kitchen_access
)

path_map.human + path_map.sim + path_map.gpt + path_map.unif +
  plot_layout(guides = 'collect', nrow = 1) &
  theme(legend.position = 'bottom',
        legend.margin = margin(t = -0.5, b = 0, unit = 'cm'),
        )

# ggsave(str_c(figures_dir, 'suspect_paths.pdf'), width = 14, height = 2.5)

3.3.3 EMD

ggplot(data.suspect.emd %>%
         mutate(model = factor(model, levels = names(model_labels))),
       aes(x = model,
           y = emd,
           fill = agent_type)) +
  stat_summary(fun = "mean",
               geom = "bar",
               color = "black",
               linewidth = 0.1,
               alpha = 0.8,
               position = position_dodge(width=dodge_width)) +
  geom_point(size = 2,
             shape = 21,
             color = 'gray40',
             alpha = 0.3,
             position = position_jitterdodge(jitter.width = 0.2,
                                             dodge.width = dodge_width),
             show.legend = F) +
  stat_summary(fun.data = mean_cl_boot,
               geom = "linerange",
               color = "black",
               alpha = 1,
               linewidth = 1,
               position = position_dodge(width=dodge_width),
               show.legend = F) +
  scale_fill_manual(values = agent_type_colors) +
  scale_y_continuous(name = "Earth Mover's Distance",
                     breaks = seq(0, 3, 0.5),
                     expand = expansion(mult = 0.01)) +
  coord_cartesian(clip = "off",
                  ylim = c(0, 3.4)) +
  scale_x_discrete(labels = model_labels) +
  theme(text = element_text(size = 12),
        legend.position = c(0.5, 0.97),
        legend.direction = 'horizontal',
        legend.text = element_text(size = 10),
        legend.title = element_blank(),
        legend.key.size = unit(8, "pt"),
        axis.title.y = element_text(size = 12),
        axis.text.x = element_text(size = 10),
        axis.title.x = element_blank(),
        axis.line.x = element_blank(),
        axis.ticks.x = element_blank(),
        plot.margin = margin(t = 0.1, r = -0.2, l = 0.1, b = 0.1, unit = 'cm'),
        panel.grid.major.y = element_line())

# ggsave(str_c(figures_dir, 'suspect_emd.pdf'), width = 5, height = 2.25)

4 Experiment 2 (Detective)

4.1 Data

4.1.1 Human

data.detective.human = read_csv(str_c(data_dir, 'exp2_detective/humans/human_trials.csv'),
                                show_col_types = F)

data.detective.feedback = read_csv(str_c(data_dir, 'exp2_detective/humans/human_feedback.csv'),
                                show_col_types = F)

data.detective.feedback %>%
  select(id, agent_type, prediction_factors) %>%
  datatable(rownames = F)

Demographics:

print_demographics(data.detective.feedback)
## # A tibble: 3 × 2
##   gender         n
##   <chr>      <int>
## 1 Female        67
## 2 Male          32
## 3 Non-binary     1
## [1] "age: M = 37, SD = 12"
## # A tibble: 7 × 2
##   race                         n
##   <chr>                    <int>
## 1 Asian                       10
## 2 Biracial; Filipino/White     1
## 3 Black/African American      18
## 4 Hispanic                     1
## 5 White                       68
## 6 mixed race                   1
## 7 <NA>                         1
## # A tibble: 2 × 2
##   ethnicity        n
##   <chr>        <int>
## 1 Hispanic         9
## 2 Non-Hispanic    91
## [1] "time: 11.0 minutes (SD = 5.0)"

4.1.2 RSM

Best-fitting parameters from the grid search:

best_w.detective.naive = 0.8
best_temp.detective.naive = 0.05
best_w.detective.sophisticated = 0.5
best_temp.detective.sophisticated = 0.2

data.detective.rsm = import_detective_model_data(best_w.detective.naive,
                                                 best_temp.detective.naive) %>%
  filter(agent_type == 'naive') %>%
  rbind(import_detective_model_data(best_w.detective.sophisticated,
                                    best_temp.detective.sophisticated) %>%
          filter(agent_type == 'sophisticated'))

4.1.3 Uniform simulation

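# As in Experiment 1, the same uniform model predictions are duplicated across
# the two agent-type labels.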
data.detective.unif = read_csv(str_c(data_dir, 'exp2_detective/uniform/uniform_model.csv'),
                             show_col_types = F) %>%
  mutate(agent_type = 'naive') %>%
  rbind(read_csv(str_c(data_dir, 'exp2_detective/uniform/uniform_model.csv'),
                 show_col_types = F) %>%
          mutate(agent_type = 'sophisticated')) %>%
  select(-grid, -crumb) %>%
  mutate(id = NA)

4.1.4 Empirical model

data.detective.emp = read_csv(str_c(data_dir, 'exp2_detective/',
                                    'empirical/empirical_model.csv'),
                              show_col_types = F) %>%
  select(-grid, -crumb) %>%
  mutate(id = NA)

4.1.5 Empirical mismatched model

data.detective.emp_mis = read_csv(str_c(data_dir, 'exp2_detective/',
                                        'empirical_mismatched',
                                        '/empirical_mismatched.csv'),
                                  show_col_types = F) %>%
  select(-grid, -crumb) %>%
  mutate(id = NA)

4.1.6 GPT-4o

data.detective.gpt = read_csv(str_c(data_dir, 'exp2_detective/gpt4o/gpt4o.csv'),
                              show_col_types = F) %>%
  select(-ends_with('prompt'))

sum(data.detective.gpt$is_valid) / length(data.detective.gpt$is_valid)
## [1] 0.4653846
data.detective.gpt = data.detective.gpt %>%
  filter(is_valid) %>%
  select(id, trial, agent_type, response)

4.1.7 Heuristic model

data.detective.heu.modeling = read_csv(str_c(data_dir, 'exp2_detective/',
                                             'heuristic/features.csv'),
                                       show_col_types = F)

fit.detective.heu.naive = brm(
  formula = response ~ 1 + fridge_A + crumb_A + fridge_B + crumb_B + (1 | id) + (1 | trial),
  data = data.detective.human %>%
    filter(agent_type == 'naive') %>%
    left_join(data.detective.heu.modeling,
              by = c('trial', 'agent_type')),
  iter = 2000,
  seed = 1,
  file = 'cache/fit.detective.heu.naive'
)

fit.detective.heu.sophisticated = brm(
  formula = response ~ 1 + fridge_A + crumb_A + fridge_B + crumb_B + (1 | id) + (1 | trial),
  data = data.detective.human %>%
    filter(agent_type == 'sophisticated') %>%
    left_join(data.detective.heu.modeling,
              by = c('trial', 'agent_type')),
  iter = 2000,
  seed = 1,
  file = 'cache/fit.detective.heu.sophisticated'
)

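# Heuristic model predictions: population-level fitted values (re_formula = NA
# drops the group-level effects) from the two regressions above, reduced to one
# value per trial, relabeled as 'response', and given id = NA so that they can
# be combined with the other models below.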
data.detective.heu = data.detective.human %>%
  filter(agent_type == 'naive') %>%
  cbind(fit.detective.heu.naive %>%
          fitted(newdata = data.detective.human %>%
                   filter(agent_type == 'naive') %>%
                   left_join(data.detective.heu.modeling,
                             by = c('trial', 'agent_type')),
                 re_formula = NA) %>%
          as_tibble() %>%
          select(pred = Estimate)) %>%
  distinct(trial, pred) %>%
  mutate(agent_type = 'naive') %>%
  rbind(data.detective.human %>%
          filter(agent_type == 'sophisticated') %>%
          cbind(fit.detective.heu.sophisticated %>%
                  fitted(newdata = data.detective.human %>%
                           filter(agent_type == 'sophisticated') %>%
                           left_join(data.detective.heu.modeling,
                                     by = c('trial', 'agent_type')),
                         re_formula = NA) %>%
                  as_tibble() %>%
                  select(pred = Estimate)) %>%
          distinct(trial, pred) %>%
          mutate(agent_type = 'sophisticated')) %>%
  rename(response = pred) %>%
  mutate(id = NA)

4.1.8 Combined

data.detective = combine_detective_data(list(
  'human' = data.detective.human,
  'unif' = data.detective.unif,
  'rsm' = data.detective.rsm,
  'emp' = data.detective.emp,
  'emp_mis' = data.detective.emp_mis,
  'gpt4o' = data.detective.gpt,
  'heu' = data.detective.heu
))

data.detective.means = compute_detective_means(data.detective)

4.2 Analysis

4.2.1 Uncertainty

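# The dependent variable is the absolute slider response, i.e. its distance
# from the scale midpoint (0); larger values indicate more decisive judgments.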
fit.detective.human = brm(
  formula = abs_response ~ 1 + agent_type + (1 | id) + (1 | trial),
  data = data.detective %>%
    filter(model == 'human') %>%
    mutate(abs_response = abs(response)),
  iter = 2000,
  seed = 1,
  file = 'cache/fit.detective.human'
)

summary(fit.detective.human)
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: abs_response ~ 1 + agent_type + (1 | id) + (1 | trial) 
##    Data: data.detective %>% filter(model == "human") %>% mu (Number of observations: 2700) 
##   Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
##          total post-warmup draws = 4000
## 
## Group-Level Effects: 
## ~id (Number of levels: 100) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     7.94      0.61     6.86     9.25 1.00      476     1075
## 
## ~trial (Number of levels: 27) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)    10.67      1.68     7.93    14.39 1.02      441      710
## 
## Population-Level Effects: 
##                         Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept                  35.21      2.32    30.81    39.86 1.02      285
## agent_typesophisticated   -10.28      1.65   -13.57    -7.13 1.02      304
##                         Tail_ESS
## Intercept                    668
## agent_typesophisticated      655
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma    11.47      0.16    11.16    11.80 1.00     4396     2531
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

4.2.2 Model fit comparison

# one trial is excluded because the GPT-4o predictions are NA for it
n_trials = data.detective.means %>% 
  pull(trial) %>% 
  as.character() %>% 
  as.numeric() %>% 
  max() - 1

data.detective.means %>% 
  select(trial, agent_type, contains("mean"))  %>%
  na.omit() %>%
  mutate(residuals_rsm = mean_human - mean_rsm,
         residuals_emp = mean_human - mean_emp,
         residuals_emp_mis = mean_human - mean_emp_mis,
         residuals_gpt4o = mean_human - mean_gpt4o,
         residuals_unif = mean_human - mean_unif,
         residuals_heu = mean_human - mean_heu) %>% 
  select(trial, agent_type, contains("residuals")) %>% 
  pivot_longer(cols = -c(trial, agent_type)) %>% 
  group_by(agent_type, name) %>%
  nest() %>% 
  mutate(log_likelihood = map_dbl(.x = data,
                                  .f = ~ .x %>% 
                                na.omit() %>% 
                                pull(value) %>% 
                                fitdistr(densfun = "normal") %>% 
                                .[["loglik"]])) %>% 
  ungroup() %>%
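  # parameters fitted to human data: RSM = 2 (w, temperature),
  # heuristic = 5 (intercept + 4 evidence weights); the remaining models get 0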
  mutate(n_parameters = case_when(name == "residuals_rsm" ~ 2,
                                  name == "residuals_heu" ~ 5,
                                  TRUE ~ 0),
         AIC = -2 * log_likelihood + 2 * n_parameters,
         AIC = format(round(AIC, 2), nsmall = 2)) %>%
  select(agent_type, name, AIC) %>% 
  mutate(name = factor(name,
                       levels = str_c("residuals_",
                                      c("rsm", "emp", "emp_mis",
                                        "gpt4o", "unif", "heu")),
                       labels = model_labels[-1])) %>%
  pivot_wider(names_from = agent_type,
              values_from = AIC)
## # A tibble: 6 × 3
##   name                 naive  sophisticated
##   <fct>                <chr>  <chr>        
## 1 RSM                  206.22 203.02       
## 2 Empirical            239.43 241.10       
## 3 Mismatched empirical 247.85 245.34       
## 4 GPT-4o               232.45 212.51       
## 5 Uniform simulation   200.85 203.24       
## 6 Heuristic            207.16 184.87
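
For reference, a small stand-alone illustration (made-up residuals, not from the data) of the two ingredients of the AIC values above: fitdistr() returns the maximized normal log-likelihood of the residuals, and AIC = -2 * logLik + 2 * k, where k is the number of free parameters charged to the model.

# Toy example with made-up residuals, mirroring the AIC computation above.
toy_residuals = c(-2.1, 0.4, 1.3, -0.7, 0.2)
toy_fit = fitdistr(toy_residuals, densfun = "normal")  # ML estimates of mean and sd
-2 * toy_fit[["loglik"]] + 2 * 2  # AIC if 2 free parameters are charged (as for the RSM)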

4.3 Figures

detective.scatterplot = function(data, model, x_title = F, y_title = '', tagg = '') {
  r = cor(data[[str_c('mean_', model)]],
          data[['mean_human']],
          use = 'complete.obs') %>% round(2)
  rmse = rmse2(data[[str_c('mean_', model)]], data[['mean_human']]) %>% round(2)
  
  g = ggplot(data = data,
         aes(x = get(str_c('mean_', model)),
             y = mean_human,
             fill = agent_type)) +
    geom_abline(intercept = 0, slope = 1,
                linetype = 2, linewidth = 0.5) +
    # error bars
    geom_linerange(linewidth = 0.5,
                   mapping = aes(ymin = low_human,
                                 ymax = high_human),
                   color = 'gray50',
                   alpha = 0.4) +
    geom_errorbarh(mapping = aes(xmin = get(str_c('low_', model)),
                                 xmax = get(str_c('high_', model))),
                   color = 'gray50',
                   alpha = 0.4,
                   height = 0) +
    # means
    geom_point(shape = 21,
               size = 2) +
    geom_text(label = sprintf('RMSE = %.2f\nr = %s', rmse, r),
              hjust = 0,   # left align
              x = -50, y = 50,
              size = 4, check_overlap = T) +
    scale_fill_manual(values = agent_type_colors) +
    scale_x_continuous(name = ifelse(x_title, model_labels[model], ''),
                       limits = c(-50, 50)) +
    scale_y_continuous(name = y_title,
                       limits = c(-50, 55),
                       breaks = seq(-50, 50, 25)) +
    coord_cartesian(clip = 'off') +
    labs(tag = tagg) +
    theme(legend.position = 'bottom',
          legend.title = element_blank(),
          legend.text = element_text(size = 12),
          axis.title.x = element_text(size = 12),
          axis.title.y = element_text(size = 12),
          aspect.ratio = 1
          )
  
  return (g)
}
scatter.n1 = detective.scatterplot(
  data.detective.means %>%
    filter(agent_type == 'naive'),
  model = 'rsm',
  y_title = 'Naive humans',
  tagg = 'A'
)

scatter.n2 = detective.scatterplot(
  data.detective.means %>%
    filter(agent_type == 'naive'),
  model = 'emp',
  tagg = 'B'
)

scatter.n3 = detective.scatterplot(
  data.detective.means %>%
    filter(agent_type == 'naive'),
  model = 'emp_mis',
  tagg = 'C'
)

scatter.n4 = detective.scatterplot(
  data.detective.means %>%
    filter(agent_type == 'naive'),
  model = 'gpt4o',
  tagg = 'D'
)

scatter.n5 = detective.scatterplot(
  data.detective.means %>%
    filter(agent_type == 'naive'),
  model = 'unif',
  tagg = 'E'
)

scatter.n6 = detective.scatterplot(
  data.detective.means %>%
    filter(agent_type == 'naive'),
  model = 'heu',
  tagg = 'F'
)

scatter.s1 = detective.scatterplot(
  data.detective.means %>%
    filter(agent_type == 'sophisticated'),
  model = 'rsm',
  x_title = T,
  y_title = 'Sophisticated humans'
)

scatter.s2 = detective.scatterplot(
  data.detective.means %>%
    filter(agent_type == 'sophisticated'),
  model = 'emp',
  x_title = T
)

scatter.s3 = detective.scatterplot(
  data.detective.means %>%
    filter(agent_type == 'sophisticated'),
  model = 'emp_mis',
  x_title = T
)

scatter.s4 = detective.scatterplot(
  data.detective.means %>%
    filter(agent_type == 'sophisticated'),
  model = 'gpt4o',
  x_title = T
)

scatter.s5 = detective.scatterplot(
  data.detective.means %>%
    filter(agent_type == 'sophisticated'),
  model = 'unif',
  x_title = T
)

scatter.s6 = detective.scatterplot(
  data.detective.means %>%
    filter(agent_type == 'sophisticated'),
  model = 'heu',
  x_title = T
)

scatter.n1 + scatter.n2 + scatter.n3 + scatter.n4 + scatter.n5 + scatter.n6 + scatter.s1 + scatter.s2 + scatter.s3 + scatter.s4 + scatter.s5 + scatter.s6 +
  plot_layout(guides = 'collect',
              ncol = 6) &
  theme(legend.position = 'bottom',
        legend.direction = 'horizontal',
        legend.title = element_blank(),
        legend.spacing.y = unit(0, 'cm'),
        legend.spacing.x = unit(0, 'cm'),
        legend.margin = margin(t = 0, b = 0, unit = 'cm'),
        plot.tag.position = c(0.1, 1.05),
        plot.margin = margin(t = 0.1, r = 0.1, b = -0.1, l = 0.1, unit = 'cm')
        )

# ggsave(str_c(figures_dir, 'detective.pdf'), width = 13, height = 4.75)

5 Session info

sessionInfo()
## R version 4.1.3 (2022-03-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] lubridate_1.8.0   forcats_1.0.0     stringr_1.5.1     dplyr_1.1.2      
##  [5] purrr_0.3.4       readr_2.1.2       tidyr_1.2.0       tibble_3.2.1     
##  [9] tidyverse_2.0.0   MASS_7.3-55       transport_0.13-0  egg_0.4.5        
## [13] gridExtra_2.3     png_0.1-7         patchwork_1.3.0   scales_1.3.0     
## [17] Hmisc_4.6-0       ggplot2_3.4.4     Formula_1.2-4     survival_3.2-13  
## [21] lattice_0.20-45   emmeans_1.8.2     brms_2.17.0       Rcpp_1.0.8.3     
## [25] DT_0.22           Metrics_0.1.4     ggstatsplot_0.9.1 knitr_1.38       
## 
## loaded via a namespace (and not attached):
##   [1] backports_1.4.1        plyr_1.8.7             igraph_1.3.0          
##   [4] splines_4.1.3          crosstalk_1.2.0        TH.data_1.1-0         
##   [7] rstantools_2.2.0       inline_0.3.19          digest_0.6.29         
##  [10] htmltools_0.5.2        magrittr_2.0.3         checkmate_2.0.0       
##  [13] paletteer_1.4.0        cluster_2.1.2          tzdb_0.3.0            
##  [16] RcppParallel_5.1.5     matrixStats_0.61.0     vroom_1.5.7           
##  [19] xts_0.12.1             sandwich_3.0-1         prettyunits_1.1.1     
##  [22] jpeg_0.1-9             colorspace_2.0-3       xfun_0.30             
##  [25] callr_3.7.0            crayon_1.5.1           jsonlite_1.8.9        
##  [28] zeallot_0.1.0          zoo_1.8-9              glue_1.6.2            
##  [31] gtable_0.3.0           statsExpressions_1.3.1 distributional_0.3.0  
##  [34] pkgbuild_1.3.1         rstan_2.21.3           abind_1.4-5           
##  [37] mvtnorm_1.1-3          DBI_1.2.3              miniUI_0.1.1.1        
##  [40] xtable_1.8-4           performance_0.9.0      htmlTable_2.4.0       
##  [43] bit_4.0.4              foreign_0.8-82         stats4_4.1.3          
##  [46] StanHeaders_2.21.0-7   datawizard_0.4.0       htmlwidgets_1.5.4     
##  [49] threejs_0.3.3          RColorBrewer_1.1-3     posterior_1.2.1       
##  [52] ellipsis_0.3.2         pkgconfig_2.0.3        reshape_0.8.8         
##  [55] loo_2.5.1              farver_2.1.0           nnet_7.3-17           
##  [58] sass_0.4.1             utf8_1.2.2             labeling_0.4.2        
##  [61] tidyselect_1.2.1       rlang_1.1.1            reshape2_1.4.4        
##  [64] later_1.3.0            munsell_0.5.0          tools_4.1.3           
##  [67] cli_3.6.1              generics_0.1.2         ggridges_0.5.3        
##  [70] evaluate_0.15          fastmap_1.1.0          yaml_2.3.5            
##  [73] rematch2_2.1.2         bit64_4.0.5            processx_3.5.3        
##  [76] WRS2_1.1-3             nlme_3.1-155           mime_0.12             
##  [79] correlation_0.8.0      compiler_4.1.3         bayesplot_1.9.0       
##  [82] shinythemes_1.2.0      rstudioapi_0.17.1      bslib_0.3.1           
##  [85] stringi_1.7.6          highr_0.9              ps_1.6.0              
##  [88] parameters_0.17.0      Brobdingnag_1.2-9      Matrix_1.5-1          
##  [91] markdown_1.1           shinyjs_2.1.0          tensorA_0.36.2        
##  [94] vctrs_0.6.2            pillar_1.10.1          lifecycle_1.0.3       
##  [97] mc2d_0.1-21            jquerylib_0.1.4        bridgesampling_1.1-2  
## [100] estimability_1.4.1     data.table_1.14.2      insight_0.17.0        
## [103] httpuv_1.6.5           R6_2.5.1               latticeExtra_0.6-29   
## [106] bookdown_0.26          promises_1.2.0.1       codetools_0.2-18      
## [109] colourpicker_1.1.1     gtools_3.9.2           withr_2.5.0           
## [112] shinystan_2.6.0        multcomp_1.4-18        hms_1.1.3             
## [115] bayestestR_0.11.5      parallel_4.1.3         rpart_4.1.16          
## [118] coda_0.19-4            rmarkdown_2.13         shiny_1.7.1           
## [121] base64enc_0.1-3        dygraphs_1.1.1.6