Today we learn:

Load packages for today:

library(lmerTest)
## Loading required package: lme4
## Loading required package: Matrix
## 
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
## 
##     lmer
## The following object is masked from 'package:stats':
## 
##     step
library(ggplot2)

Load data:

download.file("https://timotheenivalis.github.io/data/thorndata.csv", 
              destfile = "data/thorndata.csv")

download.file("https://timotheenivalis.github.io/data/droughtdata.csv", 
                        destfile = "data/droughtdata.csv")

download.file("https://timotheenivalis.github.io/data/roo.csv", 
                        destfile = "data/roo.csv")

download.file("https://timotheenivalis.github.io/data/storks.csv", 
                        destfile = "data/storks.csv")

Visualize

We model the relationship EscapeDistance / Tail_Length.

roo <- read.csv("data/roo.csv")

ggplot(roo, aes(x=Tail_Length, y=EscapeDistance)) + 
  geom_point(aes(color=as.factor(Year)), alpha=0.5) + 
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

mm0 <- lmer(EscapeDistance ~ Tail_Length + (1|Year), data = roo)
summary(mm0)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: EscapeDistance ~ Tail_Length + (1 | Year)
##    Data: roo
## 
## REML criterion at convergence: 15630
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.6300 -0.6676 -0.0983  0.5826  6.0700 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  Year     (Intercept)  4.108   2.027   
##  Residual             33.561   5.793   
## Number of obs: 2456, groups:  Year, 8
## 
## Fixed effects:
##               Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)  -25.52373    1.18717   48.52328  -21.50   <2e-16 ***
## Tail_Length    1.14330    0.01763 2451.96379   64.85   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## Tail_Length -0.789
# Fixed effects:
fixef(mm0)
## (Intercept) Tail_Length 
##  -25.523733    1.143302
# Random deviations (a.k.a. "random predictions"):
ranef(mm0)
## $Year
##      (Intercept)
## 2006  0.05975949
## 2007 -1.62167497
## 2008 -2.07637086
## 2009  0.45260470
## 2010 -1.07825970
## 2011 -1.09178499
## 2012  1.27687634
## 2013  4.07885000
## 
## with conditional variances for "Year"
# Fixed effects plus random deviations:
coef(mm0)
## $Year
##      (Intercept) Tail_Length
## 2006   -25.46397    1.143302
## 2007   -27.14541    1.143302
## 2008   -27.60010    1.143302
## 2009   -25.07113    1.143302
## 2010   -26.60199    1.143302
## 2011   -26.61552    1.143302
## 2012   -24.24686    1.143302
## 2013   -21.44488    1.143302
## 
## attr(,"class")
## [1] "coef.mer"
#Random effect variance components
as.numeric(VarCorr(mm0)) 
## [1] 4.108495

Visualize the year slope predictions:

predlm <- as.data.frame(coef( mm0 )$Year)
predlm$Year <- rownames(predlm)
mainslope <- as.data.frame(t(fixef(mm0)))

ggplot(roo, aes(x=Tail_Length, y=EscapeDistance)) + 
  geom_point(aes(color=as.factor(Year)), alpha=0.5) + 
  geom_smooth(method = "lm")+
  geom_abline(inherit.aes = FALSE, 
              data = predlm,
              aes(intercept=`(Intercept)`,
                  slope=Tail_Length, 
                  color=as.factor(Year))) + 
  geom_abline(inherit.aes = FALSE, 
              data = mainslope,
              aes(intercept=`(Intercept)`,
                  slope=Tail_Length), size=2) 
## Warning: Ignoring unknown parameters: inherit.aes

## Warning: Ignoring unknown parameters: inherit.aes
## `geom_smooth()` using formula 'y ~ x'

Visualize the distribution of year effects and where to expect new years:

yeardeviations <- as.data.frame(ranef(mm0)$Year)
yeardeviations$Year <- row.names(yeardeviations)

ggplot(yeardeviations, aes(x=`(Intercept)`, color=Year)) + 
  geom_vline(aes(xintercept=`(Intercept)`, color=Year), size=2)

as.numeric(VarCorr(mm0))
## [1] 4.108495
xgrid <- seq(-8,8, by = 0.01)
randomdens <- dnorm(x = xgrid, mean = 0, sd = sqrt(as.numeric(VarCorr(mm0))))
REdensity <- data.frame(xgrid=xgrid, randomdens=randomdens)

ggplot(REdensity, aes(x=xgrid, y=randomdens)) + geom_line()+
  geom_vline(data= yeardeviations,
             aes(xintercept=`(Intercept)`, color=Year), size=2, alpha=0.5)

Visualize the projected range of observed and unobserved years around linear regression:

Int_low95 <- fixef(mm0)["(Intercept)"] - 1.96 * sqrt(as.numeric(VarCorr(mm0)))
Int_upp95 <- fixef(mm0)["(Intercept)"] + 1.96 * sqrt(as.numeric(VarCorr(mm0))) 

Tail_Length <- seq(20,75, by = 1)


intervalRE <- data.frame(Tail_Length=Tail_Length,
                         low95 =  Int_low95 + fixef(mm0)["Tail_Length"]*Tail_Length,
                         upp95 = Int_upp95 + fixef(mm0)["Tail_Length"]*Tail_Length)

ggplot(roo, aes(x=Tail_Length, y=EscapeDistance)) + 
  geom_point(aes(color=as.factor(Year)), alpha=0.5) + 
  geom_abline(inherit.aes = FALSE, 
              data = predlm,
              aes(intercept=`(Intercept)`,
                  slope=Tail_Length, 
                  color=as.factor(Year))) + 
  geom_abline(inherit.aes = FALSE, 
              data = mainslope,
              aes(intercept=`(Intercept)`,
                  slope=Tail_Length), size=2) +
  geom_ribbon(inherit.aes = FALSE,
                data=intervalRE,
                aes(x=Tail_Length,ymin=low95, ymax=upp95), alpha=0.3)
## Warning: Ignoring unknown parameters: inherit.aes

## Warning: Ignoring unknown parameters: inherit.aes

# Adding residual variation for the year 2006:

Int_low95 <- coef(mm0)$Year["2013", "(Intercept)"] - 1.96 * sigma(mm0)
Int_upp95 <- coef(mm0)$Year["2013", "(Intercept)"] + 1.96 * sigma(mm0)

Tail_Length <- seq(20,75, by = 1)

intervalPrediction2013 <- data.frame(Tail_Length=Tail_Length,
                         low95 =  Int_low95 + fixef(mm0)["Tail_Length"]*Tail_Length,
                         upp95 = Int_upp95 + fixef(mm0)["Tail_Length"]*Tail_Length)

ggplot(roo, aes(x=Tail_Length, y=EscapeDistance)) + 
  geom_point(aes(color=as.factor(Year)), alpha=0.5) + 
  geom_abline(inherit.aes = FALSE, 
              data = predlm,
              aes(intercept=`(Intercept)`,
                  slope=Tail_Length, 
                  color=as.factor(Year))) + 
  geom_abline(inherit.aes = FALSE, 
              data = mainslope,
              aes(intercept=`(Intercept)`,
                  slope=Tail_Length), size=2) +
  geom_ribbon(inherit.aes = FALSE,
                data=intervalRE,
                aes(x=Tail_Length,ymin=low95, ymax=upp95), alpha=0.3)+
  geom_ribbon(inherit.aes = FALSE,
                data=intervalPrediction2013,
                aes(x=Tail_Length,ymin=low95, ymax=upp95, fill="2013"), alpha=0.3)
## Warning: Ignoring unknown parameters: inherit.aes

## Warning: Ignoring unknown parameters: inherit.aes

Repeat the process for id, a random effect that captures far more variation:

mmid <- lmer(EscapeDistance ~ Tail_Length + (1|id), data = roo)
summary(mmid)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: EscapeDistance ~ Tail_Length + (1 | id)
##    Data: roo
## 
## REML criterion at convergence: 15261.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.6347 -0.5656 -0.0712  0.5400  4.5110 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 18.08    4.252   
##  Residual             20.19    4.493   
## Number of obs: 2456, groups:  id, 818
## 
## Fixed effects:
##               Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)  -21.37495    0.96619 2358.89369  -22.12   <2e-16 ***
## Tail_Length    1.04703    0.01853 2353.53387   56.51   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## Tail_Length -0.981
Int_low95 <- fixef(mmid)["(Intercept)"] - 1.96 * sqrt(as.numeric(VarCorr(mmid)))
Int_upp95 <- fixef(mmid)["(Intercept)"] + 1.96 * sqrt(as.numeric(VarCorr(mmid))) 

Tail_Length <- seq(20,75, by = 1)

intervalRE <- data.frame(Tail_Length=Tail_Length,
                         low95 =  Int_low95 + fixef(mmid)["Tail_Length"]*Tail_Length,
                         upp95 = Int_upp95 + fixef(mmid)["Tail_Length"]*Tail_Length)

mainslope <- as.data.frame(t(fixef(mmid)))

ggplot(roo, aes(x=Tail_Length, y=EscapeDistance)) + 
  geom_point(aes(color=as.factor(Year)), alpha=0.5) + 
  geom_abline(inherit.aes = FALSE, 
              data = mainslope,
              aes(intercept=`(Intercept)`,
                  slope=Tail_Length), size=2) +
  geom_ribbon(inherit.aes = FALSE,
                data=intervalRE,
                aes(x=Tail_Length,ymin=low95, ymax=upp95), alpha=0.3)
## Warning: Ignoring unknown parameters: inherit.aes

Year as fixed or random effect?

## [1] 217
## 
## 1991 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 
##    1    1    4    3    3    5    5    5    7    7    8    6    8    7   10    9 
## 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 
##   10   10   10    9   10    9   10   10   10   10   10   10   10

Year as a fixed effect models a linear change in the response with years. This does not correct for the non-independence between data collected on a given year.

summary(lm(babies ~ 1 + year, data = data))
## 
## Call:
## lm(formula = babies ~ 1 + year, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6103 -1.4083 -0.1328  1.1969  5.3491 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 986.31562   34.83814  28.311  < 2e-16 ***
## year          0.10678    0.01734   6.158 3.58e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.86 on 215 degrees of freedom
## Multiple R-squared:  0.1499, Adjusted R-squared:  0.146 
## F-statistic: 37.92 on 1 and 215 DF,  p-value: 3.579e-09
summary(lm(babies ~ 1 + storks, data = data))
## 
## Call:
## lm(formula = babies ~ 1 + storks, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0552 -1.3745 -0.0409  1.1854  4.7969 
## 
## Coefficients:
##              Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) 1.200e+03  1.612e-01 7445.831  < 2e-16 ***
## storks      4.066e-03  1.007e-03    4.037 7.53e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.945 on 215 degrees of freedom
## Multiple R-squared:  0.07046,    Adjusted R-squared:  0.06614 
## F-statistic:  16.3 on 1 and 215 DF,  p-value: 7.526e-05
summary(lm(babies ~ 1 + storks +  year, data = data))
## 
## Call:
## lm(formula = babies ~ 1 + storks + year, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6607 -1.4066 -0.1324  1.1480  5.4436 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.618e+02  5.282e+01  18.208  < 2e-16 ***
## storks      -9.056e-04  1.463e-03  -0.619    0.537    
## year         1.190e-01  2.634e-02   4.519 1.03e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.863 on 214 degrees of freedom
## Multiple R-squared:  0.1514, Adjusted R-squared:  0.1435 
## F-statistic:  19.1 on 2 and 214 DF,  p-value: 2.341e-08

Year as a random effect models differences among years, without considering that years come in a specific order. The random effect correct the estimation of the fixed effect for non-independence between data collected on a give year. However, the random effect does not correct for linear changes in other fixed effects with time.

summary(lmer(babies ~ 1 + (1|year), data = data))
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: babies ~ 1 + (1 | year)
##    Data: data
## 
## REML criterion at convergence: 850.9
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.40554 -0.61816 -0.05888  0.64060  3.08569 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  year     (Intercept) 1.942    1.394   
##  Residual             2.293    1.514   
## Number of obs: 217, groups:  year, 29
## 
## Fixed effects:
##             Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept) 1200.631      0.284   26.870    4228   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lmer(babies ~ 1 + storks + (1|year), data = data))
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: babies ~ 1 + storks + (1 | year)
##    Data: data
## 
## REML criterion at convergence: 857.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.4089 -0.6134 -0.1086  0.6304  2.8922 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  year     (Intercept) 1.672    1.293   
##  Residual             2.296    1.515   
## Number of obs: 217, groups:  year, 29
## 
## Fixed effects:
##              Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept) 1.200e+03  3.127e-01 2.748e+01 3838.79   <2e-16 ***
## storks      4.473e-03  2.140e-03 2.570e+01    2.09   0.0467 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##        (Intr)
## storks -0.522

Sometimes you need both. Year can be both a fixed and a random effect, meaning we want to model a linear change, and the dependence between data collected on the same years.

summary(lmer(babies ~ 1 + storks +  year + (1|year) , data = data))
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: babies ~ 1 + storks + year + (1 | year)
##    Data: data
## 
## REML criterion at convergence: 856
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.4804 -0.6266 -0.0410  0.6304  3.0011 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  year     (Intercept) 1.351    1.162   
##  Residual             2.296    1.515   
## Number of obs: 217, groups:  year, 29
## 
## Fixed effects:
##               Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)  9.827e+02  8.868e+01  3.226e+01  11.081 1.56e-12 ***
## storks      -5.718e-04  2.843e-03  2.685e+01  -0.201   0.8421    
## year         1.086e-01  4.426e-02  3.222e+01   2.454   0.0197 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##        (Intr) storks
## storks  0.723       
## year   -1.000 -0.724

Try with the roo dataset:

summary(lm(EscapeDistance ~ Tail_Length + Year , data = roo))
## 
## Call:
## lm(formula = EscapeDistance ~ Tail_Length + Year, data = roo)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -22.101  -4.024  -0.593   3.493  36.124 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.389e+03  1.100e+02  -12.63   <2e-16 ***
## Tail_Length  1.086e+00  1.695e-02   64.09   <2e-16 ***
## Year         6.802e-01  5.467e-02   12.44   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.917 on 2453 degrees of freedom
## Multiple R-squared:  0.6269, Adjusted R-squared:  0.6266 
## F-statistic:  2061 on 2 and 2453 DF,  p-value: < 2.2e-16
summary(lmer(EscapeDistance ~Tail_Length + (1|Year), data = roo))
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: EscapeDistance ~ Tail_Length + (1 | Year)
##    Data: roo
## 
## REML criterion at convergence: 15630
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.6300 -0.6676 -0.0983  0.5826  6.0700 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  Year     (Intercept)  4.108   2.027   
##  Residual             33.561   5.793   
## Number of obs: 2456, groups:  Year, 8
## 
## Fixed effects:
##               Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)  -25.52373    1.18717   48.52328  -21.50   <2e-16 ***
## Tail_Length    1.14330    0.01763 2451.96379   64.85   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## Tail_Length -0.789
summary(lmer(EscapeDistance ~Tail_Length + Year + (1|Year), data = roo))
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: EscapeDistance ~ Tail_Length + Year + (1 | Year)
##    Data: roo
## 
## REML criterion at convergence: 15627.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.6240 -0.6688 -0.0988  0.5875  6.0857 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  Year     (Intercept)  2.68    1.637   
##  Residual             33.56    5.793   
## Number of obs: 2456, groups:  Year, 8
## 
## Fixed effects:
##               Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept) -1.122e+03  5.206e+02  5.822e+00  -2.155   0.0760 .  
## Tail_Length  1.142e+00  1.761e-02  2.436e+03  64.867   <2e-16 ***
## Year         5.456e-01  2.591e-01  5.822e+00   2.106   0.0812 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) Tl_Lng
## Tail_Length -0.011       
## Year        -1.000  0.009

Uncertainty in random effects

Lme4 summary does not show any measure of uncertainty in random effect variances. How do we know whether a random effect is statistically significant, or what range of values are likely?

You can test a random effect using a Likelihood Ratio Test that compares a model with and without the random effect. The models need to be nested, which means one model is a subset of the other one.

mm0 <- lmer(EscapeDistance ~ Age*Sex + (1|Year), data = roo)
m0 <- lm(EscapeDistance ~ Age*Sex, data = roo)
anova(mm0, m0)
## refitting model(s) with ML (instead of REML)
## Data: roo
## Models:
## m0: EscapeDistance ~ Age * Sex
## mm0: EscapeDistance ~ Age * Sex + (1 | Year)
##     Df   AIC   BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)    
## m0   5 15052 15082 -7521.2    15042                             
## mm0  6 15005 15040 -7496.3    14993 49.802      1  1.701e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mm1 <- lmer(EscapeDistance ~ Age*Sex + (1|Year) + (1|id), data = roo)
mm2 <- lmer(EscapeDistance ~ Age*Sex + (1|Year), data = roo)
mm3 <- lmer(EscapeDistance ~ Age*Sex + (1|id), data = roo)
anova(mm1, mm2)
## refitting model(s) with ML (instead of REML)
## Data: roo
## Models:
## mm2: EscapeDistance ~ Age * Sex + (1 | Year)
## mm1: EscapeDistance ~ Age * Sex + (1 | Year) + (1 | id)
##     Df   AIC   BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)    
## mm2  6 15005 15040 -7496.3    14993                             
## mm1  7 14584 14625 -7285.2    14570 422.24      1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(mm1,mm3)
## refitting model(s) with ML (instead of REML)
## Data: roo
## Models:
## mm3: EscapeDistance ~ Age * Sex + (1 | id)
## mm1: EscapeDistance ~ Age * Sex + (1 | Year) + (1 | id)
##     Df   AIC   BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)    
## mm3  6 14629 14664 -7308.6    14617                             
## mm1  7 14584 14625 -7285.2    14570 46.837      1  7.713e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Estimate confidence intervals of standard deviation parameters:

confint(mm1)
## Computing profile confidence intervals ...
##                        2.5 %     97.5 %
## .sig01             2.9833569   3.583530
## .sig02             0.5156091   1.666197
## .sigma             3.8752567   4.151607
## (Intercept)       38.9228850  40.642918
## AgeYoung         -15.4480436 -14.229890
## SexMale            2.9932686   4.572653
## AgeYoung:SexMale  -3.8330801  -2.037551

.sig01 is the standard deviation for the first random effect, .sig02 for the second random effect, .sigma for the residual standard deviation.

What about uncertainty in the random level predictions produced by ranef() ? You can’t with lme4, and it is very difficult in general with maximum likelihood methods. You need to go Bayesian (for instance with MCMCglmm), but we won’t cover that here.