--- title: "Generalized Linear Mixed Models (cont'd)" subtitle: "Analysis of Ecological and Environmental Data
QERM 514" author: "Mark Scheuerell" date: "29 May 2026" output: ioslides_presentation: wide: true css: ../lecture_slides.css --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE) ``` ## Goals for today * Understand some of the diagnostics available for evaluating GLMM fits * Become familiar with some of the inferential tools for GLMMs ## Diagnostics Diagnostics for GLMMs are similar to those for GLMs, but we are limited in our choices ## {.smaller data-background=brown_tree_snake.jpg data-background-size=100%} ## Brown tree snakes Our data consist of counts of the number of eggs per female at 10 locations over 13 years We are interested in the fixed effect of body size and the random effects of location and year We'll begin with only the effects of body size and location ## Brown tree snakes ```{r tree_snakes, fig.height = 5, fig.width = 7, fig.align = "center"} ## code adapted from Sarah Converse (2019) set.seed(514) ## sample and group sizes nn <- 234 n_loc <- 10 n_yrs <- 13 ## grand mean mu <- 1 ## slope beta <- 0.5 ## random effects re_loc <- rnorm(n_loc, 0, 0.5) re_year <- rnorm(n_yrs, 0, 0.25) ## covariates loc <- sample(seq(n_loc), nn, replace = TRUE) year <- sample(seq(n_yrs), nn, replace = TRUE) size <- runif(nn, -1, 1) |> round(2) ## mean number of eggs mean <- exp(mu + re_loc[loc] + re_year[year] + beta * size) ## number of eggs eggs <- rpois(nn, mean) ## data frame df_eggs <- data.frame(cbind(eggs, size, loc, year)) ## plot area par(mai = c(0.9, 0.9, 0.1, 0.1), omi = c(0, 0, 0, 0), bg = NA, cex.main = 1.3, cex.lab = 1.3) ## histogram of eggs hist(eggs, breaks = seq(0, max(eggs)), las = 1, col = "brown", border = "gray", main = "", xlab = "Number of eggs") ``` ## Let's fit some models ```{r echo = TRUE, results = TRUE} library(lme4) ## set control parameters for model fitting cntrl <- glmerControl(optimizer = "bobyqa", tol = 1e-4, optCtrl = list(maxfun = 100000)) ## fixed effect of size plus random effect of location fit_1 <- glmer(eggs ~ size + (1|loc), family = poisson, data = df_eggs, control = cntrl) ## fixed effect of size plus random effects of location & year fit_2 <- glmer(eggs ~ size + (1|loc) + (1|year), family = poisson, data = df_eggs, control = cntrl) ``` ## Goodness of fit Recall our goodness of fit test based on the Pearson's $\chi^2$ $$ X^2 = \sum_{i = 1}^n \frac{(O_i - E_i)^2}{E_i} \sim \chi^2_{(n - 1)} $$
where $O_i$ is the observed count and $E_i$ is the expected count ## Pearson's $\chi^2$ statistic For a binomial distribution $$ X^2 = \sum_{i = 1}^n \frac{(y_i - n_i \hat{p}_i)^2}{n_i \hat{p}_i} $$ For a Poisson distribution $$ X^2 = \sum_{i = 1}^n \frac{(y_i - \lambda_i)^2}{\lambda_i} $$ ## Goodness of fit $H_0$: Our model is correctly specified ```{r ted_gof, echo = TRUE} ## Pearson's X^2 statistic X2 <- sum((eggs - fitted(fit_2))^2 / fitted(fit_2)) ## likelihood ratio test pchisq(X2, df = nn - length(coef(fit_2)), lower.tail = FALSE) ``` The $p$-value is large so we cannot reject $H_0$ ## Model diagnostics Check for heteroscedasticity ```{r heteroscas, echo = FALSE, results = FALSE, fig.width = 5, fig.height = 5, fig.align = "center"} ## plot residuals plot(fit_2, sqrt(abs(resid(.))) ~ fitted(.), pch = 16, type = c("p", "smooth"), ylab = "Residual", xlab = "Fitted value") ``` ## Diagnostics Use [`DHARMa::simulateResiduals`](https://cran.r-project.org/web/packages/DHARMa/vignettes/DHARMa.html#calculating-scaled-residuals) to scale the residuals ```{r diagnostics, echo = FALSE, results = FALSE, fig.width = 8, fig.height = 4.5, fig.align = "center"} ## MC residuals sim_output <- DHARMa::simulateResiduals(fit_2, n = 1000) ## set plot area par(mai = c(0.9, 0.9, 0.3, 0.1), omi = c(0, 0, 0, 0)) ## residual plots plot(sim_output) ``` ## Model diagnostics ```{r re_diagnostics, fig.width = 9, fig.height = 4, fig.align = "center"} ## set plot area par(mfrow = c(1,3), mai = c(0.9, 0.9, 0.4, 0.1), omi = c(0, 0, 0, 0), cex.main = 1.5, cex.lab = 1.5, cex.axis = 1.4) ## qq resids qqnorm(residuals(fit_2), main = "QQ plot (residuals)", pch = 16) qqline(residuals(fit_2)) ## qq RE's qqnorm(unlist(ranef(fit_2)), main = "QQ plot (RE's)", pch = 16) qqline(unlist(ranef(fit_2))) ## resids vs fitted plot(fitted(fit_2), residuals(fit_2), pch = 16, xlab = "Fitted", ylab = "Residuals", main = "Residuals vs fitted") abline(h=0, lty = "dashed") ``` ## Leverage For other models, we can calculate the leverages to evaluate potentially extreme values in predictor space For GLMMs, however, the leverages depend on the estimated variance-covariance matrices of the random effects ## Leverage We can fit a GLM and evaluate leverage of the fixed effects ```{r leverage, echo = FALSE, results = FALSE, fig.width = 5.5, fig.height = 4.5, fig.align = "center"} ## fit fixed effects only fit_fixed <- glm(eggs ~ size, data = df_eggs, family = poisson) ## set plot area par(mai = c(0.9, 0.9, 0.1, 0.1), omi = c(0, 0, 0, 0), cex.axis = 1.3, cex.lab = 1.5) faraway::halfnorm(hatvalues(fit_fixed)) ``` ## Cook's Distance For other models, we can calculate Cook's distances to identify potentially influential data points For GLMMs, however, the Cook's distances involve derivatives of the likelihood with respect to the random effects (this is an active area of research) ## Cook's Distance Again, we can fit a GLM and look at the Cook's distances for the fixed effects ```{r cooks, echo = FALSE, results = FALSE, fig.width = 5.5, fig.height = 4.5, fig.align = "center"} ## set plot area par(mai = c(0.9, 0.9, 0.1, 0.1), omi = c(0, 0, 0, 0), cex.axis = 1.3, cex.lab = 1.5) ## calculate Cook's D cooks_d <- cooks.distance(fit_fixed) ## plot Cook's D faraway::halfnorm(cooks_d) ``` ## Large Cook's distances (from GLM) Large snakes with lots of eggs ```{r which_cooks, echo = TRUE, results = TRUE} ## large Cook's D cooks_big <- sort(cooks_d, decreasing = TRUE)[1:2] ## details about them df_eggs[as.numeric(names(cooks_big)),] ``` ## Inference for fixed effects {.smaller} We can test the significance of the fixed effects via a $\chi^2$ test
by comparing models with and without the effect(s) ```{r chi2_fixed, echo = TRUE} ## fit reduced model with only random effects fit_0 <- glmer(eggs ~ (1 | loc) + (1 | year), data = df_eggs, family = poisson) ## compare null and full models anova(fit_2, fit_0) ``` ## Inference for random effects {.smaller} We can test the significance of the random effects via a $\chi^2$ test
by comparing models with and without the effect(s) ```{r chi2_rdm, echo = TRUE} ## fit reduced model with only fixed effects fit_0 <- glm(eggs ~ size, data = df_eggs, family = poisson(link = "log")) ## compare null and full models anova(fit_2, fit_0) ``` ## Inference for fixed effects {.smaller} We can evaluate our fixed effects with AIC - as long as we keep the random effects unchanged ```{r aic, echo = TRUE, results = TRUE} ## fit null model with only RE's fit_0 <- glmer(eggs ~ (1 | loc) + (1 | year), family = poisson, data = df_eggs) ## compare AIC AIC(fit_0, fit_2) ``` ## Inference for random effects {.smaller} We can test the significance of the random effects using a bootstrap LRT A significant p-value leads us to reject the null hypothesis that the simpler model is adequate ```{r boot_LRT, echo = TRUE, results = TRUE} ## fit_1 and fit_2 fit on slide 7 ## threshold deviance lrt <- 2 * (logLik(fit_2) - logLik(fit_1)) ## bootstrapped estimates b_boot <- rep(NA, 100) for(i in 1:100) { y <- unlist(simulate(fit_1)) b_null <- update(fit_1, y ~ ., control = cntrl) b_alt <- update(fit_2, y ~ ., control = cntrl) b_boot[i] <- as.numeric(2 * (logLik(b_alt) - logLik(b_null))) } ## approximate p-value (p_value <- mean(b_boot > lrt)) ``` ## Inference For confidence intervals, we will again use bootstrapping * We will cover this in lab ## Overdispersion As with GLMs, we can check GLMMs for evidence of overdispersion, which we estimate as $$ \hat{c} = \frac{X^2}{n - k} $$
Let's do so for our snake model applied to another data set ```{r overdispersion, echo = FALSE} ## generate nb data eggs_nb <- rnbinom(nn, mu = mean, size = 1/2) df_eggs$eggs_nb <- eggs_nb ## non-dispersed model snakes <- glmer(eggs_nb ~ size + (1 | loc) + (1 | year), family = poisson, data = df_eggs) ``` ## Overdispersion ```{r ted_over, echo = TRUE} ## Pearson's X^2 statistic X2 <- sum((eggs - fitted(snakes))^2 / fitted(snakes)) ## number of parameters k <- length(coef(snakes)) + length(ranef(snakes)) ## overdispersion parameter (c_hat <- X2 / (nn - k)) ## test for overdispersion pchisq(deviance(snakes), k, lower.tail = FALSE) ``` ## Brown tree snakes | Negative binomial ```{r fit_nb, echo = TRUE, results = FALSE} ## neg binomial model for overdispersion fit_nb <- glmer.nb(eggs_nb ~ size + (1 | loc) + (1 | year), data = df_eggs, control = cntrl) ``` ## Brown tree snakes {.smaller} ```{r, echo = FALSE, results = TRUE} ## model summary summary(fit_nb) ``` ## Summary * Diagnostics for GLMMs are an art rather than a science * Use LRTs to test random effects (or just leave them in) * Use robust inferential methods like bootstrapped confidence intervals