---
title: "Robust Bayesian statistical modelling for reliable inference and prediction with uncertainty"
author: "Sean van der Merwe"
format: 
  revealjs: 
    theme: [default, UFS1.scss]
    logo: UFSlogo2022SvdM.svg
    footer: "2026/06/24 - NAS Conference"
    slide-number: "c/t"
    chalkboard: true
    parallax-background-image: "pastel_curves_giraffe.jpg"
    parallax-background-size: "2976px 1663px"
    parallax-background-vertical: 50
    margin: 0.05
execute:
  echo: false
---


```{r}
#| warning: false
#| message: false
#| results: hide
options(scipen = 12)
library(tidyverse)
theme_set(theme_bw())
library(echarts4r)
alpha <- 0.05
library(openxlsx)
library(DT)
```

# Introduction

Statistical regression modelling is used to relate one set of measurements to another set of measurements, or to predict a quantity based on observed quantities.

```{r}
#| warning: false
#| message: false
#| results: hide
options(scipen = 12)
library(knitr)
opts_chunk$set(fig.ext = 'svg', dev = 'svg', fig.align = 'center')

UFScolours <- c(Blue="#0F204BFF", Red="#A71930FF", LightGray="#A7A8AAFF", DarkGray="#8D8D8EFF", NAS="#0039A7FF", EDU="#00675AFF", THEO="#C69317FF", HUM="#EA8400FF", SA="#A40084FF", SC="#9E83B7FF", QWA="#00B140FF", LAW="#BB133EFF", HLTH="#490E6FFF", Black="#000000FF", VeryLight="#D7D8DDFF")
cols <- UFScolours[c(8,11,10,2)]

# Sean's functions that are useful for stuff
mapvalues <- function(x, from=unique(x), to=seq_along(from), warn_missing=FALSE) { 
    # Simplified replacement for mapvalues from plyr package, by Sean van der Merwe, UFS
    to <- unlist(to) # Make sure it's a vector. Also note that warn_missing is ignored.
    to[match(x, from)]
}
wrap_strings <- function(vector_of_strings, width){ as.character( sapply( vector_of_strings, \(x) { paste( strwrap(x, width=width), collapse="\n")}))}

fig_w <- 960
fig_h <- 400

library(plotly)
plotly_table_UFS <- function(datafrm, width = fig_w, height = fig_h, colors=UFScolours,
    useRowNames = FALSE, rowNamesHeading = " ", highlights = NULL, 
    header_font_size = 14, cell_font_size = 12) {
  # For the colour palette: the first two and last two colours are used
  datafrm <- as.data.frame(datafrm)
  nms <- names(datafrm)
  if (useRowNames) {
    datafrm <- data.frame(row.names(datafrm), datafrm, row.names = NULL)
    nms <- c(rowNamesHeading, nms)
    names(datafrm) <- nms
  }
  k <- ncol(datafrm)
  algn <- rep('center', k)
  cell_values <- rbind(t(as.matrix(unname(datafrm))))
  col_list <- rep(c("#FFFFFFFF", colors[length(colors)]), 
                  nrow(datafrm) %/% 2 + 2)[1:nrow(datafrm)] |> rep(k)
  if (!is.null(highlights)) {
    col_list[highlights] <- "#FFBBBBFF"
  }
  col_list <- unname(col_list)
  col_list <- col_list |> matrix(nrow(cell_values), ncol(cell_values), byrow = TRUE)
  fig <- plot_ly(
  type = 'table',
  height = height,
  width = wdith,
  header = list(
    values = nms,
    align = algn,
    line = list(width = 1, color = colors[2]),
    fill = list(color = colors[1]),
    font = list(family = "Arial", size = header_font_size, color = "#FFFFFFFF")
  ),
  cells = list(
    values = cell_values,
    align = algn,
    line = list(color = colors[2], width = 1),
    fill = list(color = col_list),
    font = list(family = "Arial", size = cell_font_size, color = colors[length(colors)-1])
  ))
fig
}
plotly_colours <- UFScolours |> unname()

shortestinterval <- function(postsims, width=0.95) { # Coded by Sean van der Merwe, UFS
  sort(postsims) -> sorted.postsims
  round(length(postsims)*width) -> gap
  which.min(diff(sorted.postsims, gap)) -> pos
  sorted.postsims[c(pos, pos + gap)]
}

```

```{r}
#| fig-height: 4
library(visNetwork)
outline_network <- \() {
  cols <- rev(viridisLite::turbo(9))[-1]
  outline_node_lables <- c('Dependent measurements', 
                           'Explanatory variables', 
                           'Relationships', 'Extra Variation')
  n_outline_nodes <- length(outline_node_lables)
  nodes <- data.frame(id = 1:n_outline_nodes, 
                      label = outline_node_lables,
                      shape = c(rep("box", 2), rep("ellipse", 2)),
                      color = cols[c(1, 2, 3, 4)],
                      x = c(-2,2, 0, -1)*100, 
                      y = c(2,2, 4, 0)*100, 
                      physics = rep(c(FALSE, TRUE), c(2, n_outline_nodes-2)),
                      font = "32px"
                      )
  edges <- data.frame(from = c(1:4),
                      to   = c(2:3, 1, 1),
                      arrows = "to;from", 
                      length = c(30, rep(280, n_outline_nodes-1))
                      )
visNetwork(nodes, edges, physics = TRUE)
}
library(htmlwidgets)
library(widgetframe)
frameWidget(outline_network())
```


## Abstract {.smaller}

Beginning with examples of statistical regression models fitted for the NAS faculty in recent years, the basic assumptions of standard statistical modelling are explained. Then the common measures of location (mean, median, mode) are discussed and illustrated. The presentation will dig into the relevance and uses of each, and then explain their role in parametric regression models. In particular, the connection or disconnection between the measure of location used in the model and the measure of location implied by the problem may help researchers avoid making misleading conclusions. Further, by first understanding and then changing the underlying assumptions of statistical models, one can obtain robust results that retain accurate predictive coverage. Examples will be drawn from a selection of recent papers focussing on robust mixed-effects regression models, including quantile regression models and joint modules. Robust here referring to models less influenced by outliers and skewness.

**Keywords: regression; parametric model; skewness; robust regression; quantile regression**


# Measures of location

Where are your measurements situated on the number line?

## Non-symmetry illustration

What would you consider to be the **typical value** for something like income? Is it any of these?

```{r}
lognormal_plot <- \() {
lognormal_x <- (1:400)/60
lognormal_data <- data.frame(x = lognormal_x, y = dlnorm(lognormal_x))
locations <- c(mean = 0.5, median = 0, mode = -1) |> exp()
loc_data <- data.frame(x = locations, Statistic = names(locations))
lognormal_data |> ggplot(aes(x = x, y = y)) + 
  labs(y = "lognormal(x,0,1)") + 
  geom_line(linewidth = 1, colour = "purple") + 
  scale_colour_viridis_d(option = "H") + 
  geom_vline(aes(xintercept = x, colour = Statistic, linetype = Statistic), data = loc_data)
}
lognormal_plot() |> ggplotly(width = 900, height = 400) |> 
  layout(legend = list(x = 100, y = 0.5))
```

> For the measurements you model, what is the explanatory structure actually related to?

## Non-symmetric conditional distributions

-   For symmetric distributions such as the *normal*, *logistic*, and *t* distributions:
    -   $\mu\ \equiv$ mean $\equiv$ median $\equiv$ mode
-   For the rest we have to explicitly state what we are trying to explain

When building models, ask yourself whether you are trying to **model or predict**, then ask yourself whether you are trying to model or predict

-   The expected value (mean)
-   The middle value (median)
-   The most likely value (mode)

## Log-transform example

-   Suppose your values are very skew to the right (some values much larger than others) then it is common to take a log transform to avoid those extreme values dominating the analysis.
-   OLS regression then assumes a conditional normal distribution

$$\begin{aligned}
\log Y_i & \sim N\left(\mu_i = \mathbf{x}_i \boldsymbol\beta,\ \sigma^2\right) \\
\equiv Y_i & \sim logN\text{ with median }e^{\mu_i}=e^{\beta_0+\beta_1 x_{1i}+\dots}
\end{aligned}$$

-   On the original scale you've actually fitted a multiplicative model for the median.
    -   If done directly this would be called a non-linear quantile regression model.

## Quantile regression {.scrollable}

-   We don't have to focus only on the relationship between the explanatory variables and the expected or typical value!
-   Have you ever considered regression on the 95% Value-at-Risk (VaR) directly? 
    -   The factors explaining it might be quite different to the factors explaining the expected value.

The simplest quantile regression is *median* regression. One way to do it is to just replace the *normal* distribution with a *Laplace* distribution (double exponential - one exponential tail pointing left and one pointing right).

```{r}
dexp_plot <- \() {
dexp_x <- seq(-5, 5, l = 401)
dexp_data <- data.frame(x = c(dexp_x, dexp_x), 
                        Density = c(0.5*dexp(abs(dexp_x)), dnorm(dexp_x)), 
                        Distribution = rep(c("Laplace", "Normal"), each = length(dexp_x)))
dexp_data |> ggplot(aes(x = x, y = Density, colour = Distribution, 
                        linetype = Distribution)) + 
    scale_colour_viridis_d(option = "H") + 
  geom_line()
}
dexp_plot() |> ggplotly(width = 900, height = 400) |> 
  layout(legend = list(x = 100, y = 0.5))
```


From the *Laplace* distribution the logical evolution is the *Skew Laplace* distribution. For this distribution the lower quantiles produce right skew and upper quantiles produce left skew. This makes sense when modelling a simple sample **but it makes less sense when modelling residuals**, where upper observations could easily have right skew residuals. 

Instead, I would recommend the *skew exponential power distribution* for flexible modelling.


$$F_{\text{SEP}}\left(y \left| \mu, \sigma, \kappa_1, \kappa_2, p_0 \right.\right) =
        \begin{cases}
            p_0 \left(1 - G\left(\frac{1}{\kappa_1}\left(\frac{\mu - y}{2 p_0 \sigma K_1}\right)^{\kappa_1}, \frac{1}{\kappa_1}\right)\right), & y \leq \mu, \\
            p_0 + \left(1 - p_0\right) G\left(\frac{1}{\kappa_2}\left(\frac{y - \mu}{2\left(1 - p_0\right) \sigma K_2}\right)^{\kappa_2}, \frac{1}{\kappa_2}\right), & y > \mu,
        \end{cases}$$
        
where $G\left(a, b\right)$ is the regularized incomplete gamma function:

The SEP distribution reduces to a normal distribution with mean $\mu$ and standard deviation $\sigma/\sqrt{2\pi}$ when $p_0 = 0.5$ and $\kappa_1 = \kappa_2 = 2$. When $p_0 = 0.5$ and $\kappa_1 = \kappa_2 = 1$, it becomes the symmetric Laplace distribution. As $\left(\kappa_1, \kappa_2\right) \to \infty$, the SEP distribution approaches a uniform distribution.

If we vary $\kappa_2$ it changes the right shape independently of $p_0$.

```{r}
kep <- function(p) {1/(2*p^(1/p)*gamma(1 + 1/p))}

dsep <- function(x, mu = 0, sigma = 1, kappa1 = 1, kappa2 = 1, p0 = 0.5) {
  lt_mu <- x <= mu
  z <- abs(x - mu)
  scale <- ifelse(lt_mu, 2*p0*sigma*kep(kappa1), 2*(1 - p0)*sigma*kep(kappa2))
  shape <- ifelse(lt_mu, kappa1, kappa2)
  (1/sigma)*exp(-(z/scale)^shape/shape)
}

dsep_plot <- \() {
dsep_x <- seq(-5, 5, l = 401)
p0_vals <- c(0.2, 0.5, 0.9)
k2_vals <- c(0.25, 1, 2)
dsep_data <- p0_vals |> lapply(\(p0v) {
  k2_vals |> lapply(\(k2v) {
    data.frame(x = dsep_x, Density = dsep(dsep_x, kappa2 = k2v, p0 = p0v),
               Quantile = paste("Quantile", p0v), Shape = k2v)
  }) |> bind_rows()
}) |> bind_rows() |>
  mutate(Quantile = factor(Quantile), Shape = factor(Shape))
dsep_data |> ggplot(aes(x = x, y = Density, colour = Shape, linetype = Shape)) + 
    scale_colour_viridis_d(option = "H") + 
  geom_line() + facet_wrap(~Quantile)
}
dsep_plot() |> ggplotly(width = 960, height = 360) |> 
  layout(legend = list(x = 100, y = 0.5))
```

## Curvy data

Sometime you just want to fit a curve, but even then ...

```{r}
curves_plot_Gompertz <- \() {
plt <- readRDS('presentation plot curves1.rds')
plt$d$Putrescine <- plt$d$H04
plt$d |> plot_ly(x = ~Hour, y= ~Putrescine, width = fig_w*0.8, height = fig_h*1.2, 
                 color = ~StrainDescription, colors =  viridisLite::plasma(6), 
                 type = 'scatter', mode = 'lines', 
                 legendgroup = ~StrainDescription
                 ) |>
  add_trace(x = ~hour, y = ~value, 
            color = ~StrainDescription, 
            type = 'scatter', mode = 'lines', 
            legendgroup = ~StrainDescription, linetype = ~StrainDescription,
            data = plt$plot_data)
}
curves_plot_Gompertz()
```

# Why distributions?

::: {.callout-warning}
# If you only care about **predictions** 

then just use the most accurate model (depending on your data size and structure) and optimise it using your loss function of choice. Works well when you have a lot of data.
:::

::: {.callout-note}
In a linear model optimising with squared error loss is equivalent to maximising a normal distribution likelihood. **The loss function is like a distribution and matters a lot.**
:::

::: {.callout-caution}
# But note that a point estimate is always wrong

One of the main things that drive statisticians is wanting to understand **uncertainty**.
:::

## Coverage

Which interval is more useful?

1.  Channel A says they're 95% sure tomorrow's maximum temperature is going to be between -20 and 46.
1.  Channel B says they're 95% sure tomorrow's maximum temperature is going to be between 20 and 26.
1.  Channel C says they're 3% sure tomorrow's maximum temperature is going to be 23 (i.e. between 22.5 and 23.5).

Shorter intervals carry more information than wider intervals, and intervals carry more information than point estimates, but only if they're valid. 

-   A 95% interval should cover the target value 95% of the time. 

## Bayesian Inference {.smaller}

::: {.callout-note}
# Inference is about going small to big

We perform experiments and collect samples because we want to [understand]{.mark} the bigger picture - the data generating process.
:::

> The specific sample is often not important. For example, a pharmaceutical company is far more interested in future paying customers than the ones they paid to take part in their clinical trial.

-   The biggest reason I like Bayesian parametric regression models is because they can do inference while still producing useful predictions with accurate coverage.
-   They can accurately describe **both the patterns and uncertainty for future data**.
    -   Frequentist models focus on inference, while Data Science models (see also machine learning, big data, AI models, etc.) focus on predictions.
    -   I am convinced that the Bayesian models make inference easier and more reliable while still making useful predictions.

## Inference example: river cleanup {.smaller}

Constructing a model that has a single parameter that captures the research question, yet also captures the variation, is very useful. 

Here we want to know how much pollution was cleaned up each month, but much of the data was censored. It's tricky to model the data when you don't measure it exactly, but not impossible.

```{r}
#| warning: false
crazyts_plot <- \() {
crazyts <- readRDS('crazyts.rds')
pnttype <- rep('Observed', nrow(crazyts$m_data))
pnttype[crazyts$m_data$IsLeftCensored | crazyts$m_data$IsRightCensored] <- 'Censored'
pnttype <- factor(pnttype, levels = rev(unique(pnttype)))
cols <- viridisLite::magma(6)[c(2,4)] |> setNames(NULL)
plot_ly(width = fig_w, height = fig_h) |> add_trace(x = ~Date, y = ~LogValue, 
            color = ~Location, colors = cols, 
            type = 'scatter', mode = 'markers', 
            symbol = ~pnttype,
            data = crazyts$m_data) |>
  add_trace(x = ~Date, y= ~Value, 
            color = ~Location, linetype = ~Line,
            type = 'scatter', mode = 'lines', 
            colors = cols, data = crazyts$mu_data,
            visible = "legendonly")
}
crazyts_plot()
```























# Example: HIV Viral Load

The ACTG 315 dataset, available in the *ushr* **R** package, includes longitudinal measurements of HIV viral load (log$_{10}$ RNA copies/mL) over time. It features data on 46 patients, with the longest measurement recorded on Day 196 after baseline (Day 0). 

## Data set {.smaller}

```{css}
table.dataTable tbody td {
  font-size: 16px;
  padding: 6px 10px;
}
```

```{r}
actg315 <- ushr::actg315raw
display_data <- actg315 |> select(-Patid)
display_data$PatientID <- actg315$Patid |> factor()
datatable(display_data, rownames = FALSE, filter = "top", 
          options = list(pageLength = 10, dom = "tp"))
```

```{r}
set.seed(12345)
```

```{r}
actg315$CD4 <- actg315$CD4/100
actg315$RNA <- 10^(actg315$log10.RNA)
actg315$threshold <- 1 - (actg315$RNA <= 100)
actg315$log10.RNA.censored <- ifelse(actg315$RNA <= 100, 2, actg315$log10.RNA)
actg315$PatientID <- actg315$Patid |> factor()
actg315_long <- actg315 |>
  pivot_longer(cols = c(log10.RNA., CD4), 
               names_to = "Measure", 
               values_to = "Value")
```

## RNA by time and subject

```{r data2}
actg315_long_filtered <- actg315_long[actg315_long$Measure == "log10.RNA.", ]
RNAplot <- actg315_long_filtered |> ggplot(aes(x = Day, y = Value)) +
  geom_line(aes(colour = PatientID), linewidth = 0.5) +
  scale_colour_viridis_d(option = "F") +
  labs(x = "Day", y = "Log_10  RNA Copies/mL") +
  guides(colour = "none") +
  scale_x_continuous(breaks = seq(0, 1000, by = 14)) + 
  geom_hline(yintercept = 2, colour = "purple", linetype = 2)
RNAplot |> ggplotly(width = 950, height = 600)
```

## Quantile regression fits

```{r}
info <- readRDS("needed_info.rds")
```

```{r}
CalcMu <- function(b1, b2, b3, b4, g, time, cd4_val) {
  log10(exp(b1 - b2*time) + exp(b3 - (b4 + g*cd4_val)*time))
}
quantiles <- c(0.05, 0.25, 0.50, 0.75, 0.95)
quantile_names <- paste0("Percentile", sprintf("%02d", quantiles*100))
NCores <- (parallel::detectCores(logical = FALSE) * 0.8) |> ceiling()
times <- seq(0, 196, 1)
CD4_coefs <- lme4::lmer(CD4 ~ Day + (1 | Patid), data = actg315) |> summary() |> coef()
CD4_seq <- CD4_coefs[2,1]*times + CD4_coefs[1,1]
# info$nm
est_data <- info$e |> t() |> data.frame(row.names = NULL) |> setNames(info$nm)
est_data <- data.frame(Quantile = quantiles, Percentile = quantile_names, est_data)
mu_curves <- seq_len(nrow(est_data)) |> sapply(\(i) {
  d <- est_data[i,]
  CalcMu(d$beta_rna.1., d$beta_rna.2., d$beta_rna.3., d$beta_rna.4., 
         d$gamma_rna, times, CD4_seq)
}) |> data.frame() |> setNames(quantiles)
mu_curves_long <- mu_curves |> mutate(Day = times) |> 
  pivot_longer(-Day, names_to = "Quantile", values_to = "Value")
RNAplot_SEP <- actg315_long_filtered |> ggplot(aes(x = Day, y = Value)) +
  geom_line(aes(colour = PatientID), linewidth = 0.2, alpha = 0.3) +
  scale_colour_viridis_d(option = "F") +
  labs(x = "Day", y = "Log_10  RNA Copies/mL") +
  guides(colour = "none") +
  scale_x_continuous(breaks = seq(0, 1000, by = 14)) + 
  geom_hline(yintercept = 2, colour = "purple", linetype = 2) + 
  geom_line(aes(linetype = Quantile), data = mu_curves_long, linewidth = 1, colour = "#013002")
RNAplot_SEP |> ggplotly(width = 950, height = 600)
```

# Example: Medication adherence

In this study you are interested in how long people are going to keep taking the medication (persistence) and how consistently they are taking the medication (adherence). Further, you know that these two behaviours interact, and you want to know what the effect of an intervention is on the joint behaviour.

In this case we fitted a joint model where the output of the adherence model feeds into the persistence model.

## Adherence and persistence {.scrollable}


```{r}
knitr::include_graphics("adherence_plot.svg")
knitr::include_graphics("persistence_plot.svg")
```



## Adherence by age, time, and treatment

```{r}
create_predictions_by_age_3d_plotly <- \() {
  predictions_by_age <- readRDS("predictions_by_age_jointmodel.rds")
  Ages <- seq(40,70)
  KUMparsims <- "KUMJointParameterSims.rds" |> readRDS()
  KM_plot_data <- KUMparsims$KMCurve |> mutate(
    Treatment = Treatment |> str_remove("^.*="),
    Time = Time/30
  )
  treatment_plot_lines <- unique(KM_plot_data$Treatment)
  gap <- 0.02
  endpoint <- 12
  time_seq <- seq(0, endpoint, gap)
  
  est1 <- predictions_by_age |> sapply(\(d) {
    d$Estimate[d$Treatment == treatment_plot_lines[1]]}
    ) |> t()
  est1l <- predictions_by_age |> sapply(\(d) {
    d$Cred_Lower[d$Treatment == treatment_plot_lines[1]]}
    ) |> t()
  est1u <- predictions_by_age |> sapply(\(d) {
    d$Cred_Upper[d$Treatment == treatment_plot_lines[1]]}
    ) |> t()
  est2 <- predictions_by_age |> sapply(\(d) {
    d$Estimate[d$Treatment == treatment_plot_lines[2]]}
    ) |> t()
  est2l <- predictions_by_age |> sapply(\(d) {
    d$Cred_Lower[d$Treatment == treatment_plot_lines[2]]}
    ) |> t()
  est2u <- predictions_by_age |> sapply(\(d) {
    d$Cred_Upper[d$Treatment == treatment_plot_lines[2]]}
    ) |> t()
  
  m <- 0.76
  plot_ly(x = time_seq, y = Ages, z = est1, type = 'surface', 
          width = fig_w*0.9, height = fig_h*1.5, colorscale = "Blues", opacity = 1, 
          name = treatment_plot_lines[1], cmin = m, cmax = 1, 
          colorbar = list(title = list(text = treatment_plot_lines[1]),
                          nticks = 8)) |> 
    layout(scene = list(
      xaxis = list(title = "Month", range = c(0, 12)),
      yaxis = list(title = "Age", range = c(40,70)),
      zaxis = list(title = "Adherence", range = c(m,1))
    )) |>
    add_surface(z = est1l, colorscale = "Blues", showscale = FALSE, 
                opacity = 0.3, name = treatment_plot_lines[1], 
                cmin = m, cmax = 1) |> 
    add_surface(z = est1u, colorscale = "Blues", showscale = FALSE, 
                opacity = 0.3, name = treatment_plot_lines[1], 
                cmin = m, cmax = 1) |> 
    add_surface(z = est2, colorscale = "Reds", opacity = 1, 
                cmin = m, cmax = 1, reversescale = TRUE,
                name = treatment_plot_lines[2],
                colorbar = list(title = list(text = treatment_plot_lines[2]))) |>  
    add_surface(z = est2l, colorscale = "Reds", showscale = FALSE, 
                opacity = 0.3, name = treatment_plot_lines[2], 
                cmin = m, cmax = 1, reversescale = TRUE) |> 
    add_surface(z = est2u, colorscale = "Reds", showscale = FALSE, 
                opacity = 0.3, name = treatment_plot_lines[2], 
                cmin = m, cmax = 1, reversescale = TRUE)
}
create_predictions_by_age_3d_plotly()
```

## Web interface {.smaller}

-   The company that owns the data (typically the one who developed the medication) can host an app on their website that would allow a doctor or pharmacist to type in a patient's record anonymously and get reasonable predictions.

```{r}
knitr::include_graphics("Screenshot_of_app.png")
```


# Example: Giraffes


## The giraffe video data {.smaller}

Kaitlyn Taylor, M student in Animal Science, went to 3 reserves and filmed 252 video of giraffes reacting to sounds that she played them, then coded the behaviours for 180s from the sound (for 55 behaviours).

```{r}
get_G_data <- function() {
  filecore <- 'TaylorK_2025Jan'
  # Read in data
  d_raw <- read.xlsx(paste0(filecore,'.xlsx'), "Details")
  a_info <- read.xlsx(paste0(filecore,'.xlsx'), "Animals")
  v_info <- read.xlsx(paste0(filecore,'.xlsx'), "Variables")
  s_info <- read.xlsx(paste0(filecore,'.xlsx'), "Sounds")
  sites <- c("APGR", "FGR", "WGL")
  d_raw$Location[d_raw$Location %in% "Amanzi_Tikva"] <- "Amanzi"
  n_sites <- length(sites)
  v_data <- readRDS("TaylorK.rds")
  v_data$Location[v_data$Location %in% "Amanzi_Tikva"] <- "Amanzi"
  old_sites <- c("Amanzi", "NavalHill", "Weltevreden")
  d_raw$Location <- sites[match(d_raw$Location, old_sites)]
  v_data$Location <- sites[match(v_data$Location, old_sites)]
  d_raw$Code <- a_info$Code[match(d_raw$ID, a_info$Name)]
  d_int <- d_raw[,1:22]
  n_obs <- nrow(d_int)
  v_info <- v_info |> arrange(Vnum) # Ensure variable information is sorted correctly
  intensity_max <- function(state) {
    max(c(state) * v_info$Intensity)
  }
  adjustment <- 12
  d <- readRDS("procced_data5.rds")
  d$Std_grp_sz <- scale(d$Group_Size)
  d$Std_Speaker_dist <- scale(d$Speaker_Distance)
  d$Std_Wind_speed <- scale(d$Wind_Speed)
  d$Sex_Age <- ifelse(d$Age %in% c("Adult", "SubAdult"), paste(d$Age, d$Sex, sep = "_"), d$Age)
  d$Date <- d$Date |> dmy()
  d <- d |> group_by(Location, Sound_Type) |> 
    mutate(Std_Date = Date |> decimal_date() |> scale()) |> 
    ungroup()
  d$HNR <- s_info$HNR[match(d$Sound_Variation, s_info$Sound_Variation)] |> scale()
  d$Sex_Age[d$Sex_Age %in% "BigBull"] <- "Adult_M"
  fit_R <- readRDS("fit_R.rds")
list(d = d, v_info = v_info, fit_R = fit_R, v_data = v_data)
}
Gd <- get_G_data()
```

```{r}
plot_model <- function(fit, current_data, yl, y_name) {
  y_name <- sym(y_name)
  sounds <- sort(unique(current_data$Sound_Type))
  sites <- c("APGR", "FGR", "WGL")
  pred_d <- expand.grid(Location = sites, Std_Date = 0, 
                      Sex_Age = sort(unique(current_data$Sex_Age)), Std_grp_sz = 0, 
                      Std_Speaker_dist = 0, Std_Wind_speed = 0,
                      Sound_Type = sounds)
  pred_d_mmat <- model.matrix(~ Location * Sound_Type * Std_Date + Sex_Age + Std_grp_sz + 
                  Std_Speaker_dist + Std_Wind_speed, data = pred_d)
  post_sims <- fit |> rstanarm::as_draws_matrix()
  nsims <- nrow(post_sims)
  post_sims_beta <- post_sims[, seq_len(ncol(pred_d_mmat))]
  post_sims_names <- colnames(post_sims)
  expected_sims_mat <- pred_d_mmat %*% t(post_sims_beta)
  pred_d$Expected <- rowMeans(expected_sims_mat)
  intervals <- expected_sims_mat |> apply(1, shortestinterval)
  pred_d$Lower <- intervals[1,]
  pred_d$Upper <- intervals[2,]
  pred_d |> ggplot(aes(x = Sex_Age, colour = Sound_Type)) + 
  facet_grid(cols = vars(Location)) + 
  geom_segment(aes(y = Lower, yend = Upper), 
               arrow = arrow(angle = 90, ends = "both", length = unit(0.1, "inches")),
               position = position_dodge(width = 0.4)) + 
  geom_point(aes(y = Expected), size = 3, shape = 4, 
             position = position_dodge(width = 0.4)) +
  geom_jitter(data = current_data, aes(y = !!y_name), width = 0.2, height = 0.1, size = 1) + 
  labs(x = "Age, sex, and location of giraffe",
       y = yl, colour = "Sound Type") + 
  theme(axis.text.x = element_text(size = 6), legend.position = "bottom")
}
```

```{r}
plot_model(Gd$fit_R$R_ST, Gd$d, "Reaction intensity", "Reaction")
```








# Conclusion

-   An expansive statistical toolbox is incredibly useful when encountering real data.
-   Real data is difficult but rewarding to work with.
-   So **expand your statistical toolbox to expand your impact**.

> This presentation was created using the Reveal.js format in [Quarto](https://quarto.org/), using the [RStudio IDE](https://posit.co/products/open-source/rstudio/). Font and line colours according to UFS branding, and background image using image editor [GIMP](https://www.gimp.org/) by compositing images from CoPilot.



