library(lmerTest)
## Loading required package: lme4
## Loading required package: Matrix
##
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
##
## lmer
## The following object is masked from 'package:stats':
##
## step
library(ggplot2)
Take-home:
Imagine you have measured how much a plant species is attacked by big herbivores as a function of how many thorns the plant grows on stems. You have collected data in 5 locations and visualise the relationship between herbivory and quantity of thorns
thorndata <- read.csv("data/thorndata.csv")
str(thorndata)
## 'data.frame': 100 obs. of 3 variables:
## $ herbivory : num 4.53 4.6 3.89 4.05 3.73 ...
## $ thorndensity: num 1.72 1.75 2.2 2.3 2.43 ...
## $ site : Factor w/ 5 levels "a","b","c","d",..: 1 1 1 1 1 1 1 1 1 1 ...
ggplot(aes(x=thorndensity, y= herbivory), data = thorndata) + geom_point()+
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
m0 <- lm(thorndensity ~ herbivory, data=thorndata)
summary(m0)
##
## Call:
## lm(formula = thorndensity ~ herbivory, data = thorndata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.2660 -0.4745 -0.2017 0.3563 2.8093
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.9691 0.5428 3.628 0.000456 ***
## herbivory 0.4447 0.1264 3.520 0.000657 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9627 on 98 degrees of freedom
## Multiple R-squared: 0.1122, Adjusted R-squared: 0.1032
## F-statistic: 12.39 on 1 and 98 DF, p-value: 0.0006566
plot(m0)
library(performance)
check_model(m0)
## Not enough model terms in the conditional part of the model to check for multicollinearity.
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 100 rows containing missing values (geom_text_repel).
Model checks suggest some major unexplained structure in the data. It is likely due to site, which we have not included in our model.
Let’s visualize the data with colors related to site:
ggplot(aes(x=thorndensity, y= herbivory, color=site), data = thorndata) + geom_point()+
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
Indeed, site is a major source of variation.
A model that account for site changes the direction of the effect of thorndensity:
m1 <- lm( herbivory ~thorndensity+ as.factor(site), data=thorndata)
summary(m1)
##
## Call:
## lm(formula = herbivory ~ thorndensity + as.factor(site), data = thorndata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.68769 -0.28889 0.04982 0.24924 1.39683
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.0471 0.3872 15.617 < 2e-16 ***
## thorndensity -0.9752 0.1414 -6.899 6.02e-10 ***
## as.factor(site)b 0.9400 0.1762 5.334 6.62e-07 ***
## as.factor(site)c 1.9958 0.2147 9.295 5.80e-15 ***
## as.factor(site)d 2.9706 0.2812 10.562 < 2e-16 ***
## as.factor(site)e 3.7674 0.4219 8.929 3.48e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4875 on 94 degrees of freedom
## Multiple R-squared: 0.6151, Adjusted R-squared: 0.5947
## F-statistic: 30.05 on 5 and 94 DF, p-value: < 2.2e-16
anova(m1)
## Analysis of Variance Table
##
## Response: herbivory
## Df Sum Sq Mean Sq F value Pr(>F)
## thorndensity 1 6.5156 6.5156 27.412 9.992e-07 ***
## as.factor(site) 4 29.1932 7.2983 30.705 2.407e-16 ***
## Residuals 94 22.3426 0.2377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
If we want to visualize what the model m1 predicts in ggplot, we need some more coding:
predlm <- predict(m1, interval = "confidence")
datlm = cbind(thorndata, predlm)
predlm <- predict(m1, interval = "confidence")
datalm <- cbind(thorndata, predlm)
head(datalm)
## herbivory thorndensity site fit lwr upr
## 1 4.526133 1.716481 a 4.373192 4.037951 4.708433
## 2 4.599632 1.748753 a 4.341719 4.013343 4.670096
## 3 3.888889 2.203868 a 3.897880 3.650773 4.144988
## 4 4.054832 2.295535 a 3.808484 3.572708 4.044260
## 5 3.727945 2.427895 a 3.679403 3.455740 3.903066
## 6 3.745884 2.634347 a 3.478066 3.261607 3.694525
ggplot(datalm, aes(x=thorndensity, y=herbivory, color=site)) +
geom_point() + geom_line(aes(y=fit), size=2, alpha=0.9) +
geom_ribbon(aes(ymin=lwr, ymax=upr, fill=site), alpha=0.1)
check_model(m1)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 100 rows containing missing values (geom_text_repel).
The default model used by ggplot corresponds to a different question: does the relationship herbivory/thorndensity varies among sites?
ggplot(aes(x=thorndensity, y= herbivory, color=site), data = thorndata) + geom_point()+
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
m2 <- lm(herbivory ~ thorndensity * as.factor(site), data=thorndata)
summary(m2)
##
## Call:
## lm(formula = herbivory ~ thorndensity * as.factor(site), data = thorndata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.53118 -0.26592 0.00189 0.22158 1.21267
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.1978 0.7527 9.563 2.31e-15 ***
## thorndensity -1.4130 0.2835 -4.985 2.99e-06 ***
## as.factor(site)b 3.0457 2.0111 1.514 0.1334
## as.factor(site)c 1.4588 2.9031 0.503 0.6165
## as.factor(site)d -0.3892 1.6854 -0.231 0.8179
## as.factor(site)e 1.7014 1.2876 1.321 0.1897
## thorndensity:as.factor(site)b -0.5696 0.6419 -0.887 0.3773
## thorndensity:as.factor(site)c 0.2713 0.8113 0.334 0.7389
## thorndensity:as.factor(site)d 0.9524 0.4507 2.113 0.0374 *
## thorndensity:as.factor(site)e 0.6070 0.3425 1.772 0.0797 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4764 on 90 degrees of freedom
## Multiple R-squared: 0.6481, Adjusted R-squared: 0.6129
## F-statistic: 18.41 on 9 and 90 DF, p-value: < 2.2e-16
anova(m2)
## Analysis of Variance Table
##
## Response: herbivory
## Df Sum Sq Mean Sq F value Pr(>F)
## thorndensity 1 6.5156 6.5156 28.7028 6.432e-07 ***
## as.factor(site) 4 29.1932 7.2983 32.1507 < 2.2e-16 ***
## thorndensity:as.factor(site) 4 1.9124 0.4781 2.1061 0.08653 .
## Residuals 90 20.4302 0.2270
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
check_model(m2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 100 rows containing missing values (geom_text_repel).
It seems like there is some sign of variation in the effect of thorndensity among sites, but it is not clear. The model with interaction is not necessary if we just want to know what is the effect of thorndensity on herbivory on average. (so, m1 is better given our question.)
library(lmerTest)
mm1 <- lmer(thorndensity ~ herbivory + (1|site), data=thorndata)
summary(mm1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: thorndensity ~ herbivory + (1 | site)
## Data: thorndata
##
## REML criterion at convergence: 67
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.15700 -0.54602 -0.04339 0.54773 2.63404
##
## Random effects:
## Groups Name Variance Std.Dev.
## site (Intercept) 1.48095 1.2169
## Residual 0.08401 0.2899
## Number of obs: 100, groups: site, 5
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 5.29049 0.58443 5.26551 9.052 0.000209 ***
## herbivory -0.34085 0.04991 94.36965 -6.829 8.23e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## herbivory -0.361
Why random effect?
Fixed or random effect?
Our previous models were over-confident, because they failed to acknowledge that there were multiple measurements taken on the same plants. Therefore, the residuals were non-independent.
drought_data <- read.csv("data/droughtdata.csv")
ggplot(drought_data, aes(x=interaction(Genotype, WaterCondition),
y=Temperature, color=as.factor(plant)))+ geom_point()+xlab("Genotype-by-exposure")
mint <- lmer(Temperature ~ 1 + Genotype*WaterCondition + (1|plant), data = drought_data)
summary(mint)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Temperature ~ 1 + Genotype * WaterCondition + (1 | plant)
## Data: drought_data
##
## REML criterion at convergence: 95.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.51698 -0.57122 -0.07203 0.71235 1.93025
##
## Random effects:
## Groups Name Variance Std.Dev.
## plant (Intercept) 3.1420 1.7726
## Residual 0.8815 0.9389
## Number of obs: 32, groups: plant, 8
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 31.6875 1.2966 4.0000 24.439 1.66e-05
## GenotypeWT 6.9000 1.8337 4.0000 3.763 0.0197
## WaterConditionNormal 0.3875 1.8337 4.0000 0.211 0.8430
## GenotypeWT:WaterConditionNormal -7.1250 2.5932 4.0000 -2.748 0.0515
##
## (Intercept) ***
## GenotypeWT *
## WaterConditionNormal
## GenotypeWT:WaterConditionNormal .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) GntyWT WtrCnN
## GenotypeWT -0.707
## WtrCndtnNrm -0.707 0.500
## GntypWT:WCN 0.500 -0.707 -0.707
anova(mint)
## Type III Analysis of Variance Table with Satterthwaite's method
## Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
## Genotype 5.8402 5.8402 1 4 6.6257 0.06172 .
## WaterCondition 5.2854 5.2854 1 4 5.9962 0.07054 .
## Genotype:WaterCondition 6.6542 6.6542 1 4 7.5491 0.05150 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Now the result of an interaction genotype-by-water appears unclear. It seems possible that the result is entirelly due to having used two atypical plants for a given treatment. We should repeat the experiment with more plants!
Last time we saw there was clear interaction Age-by-Sex for Escape distance in the kangaroo data set.
roo <- read.csv("data/roo.csv")
ggplot(roo, aes(x=Tail_Length, y=EscapeDistance, color=Age)) +
geom_point() + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
summary(lm(EscapeDistance ~ Age*Sex, data = roo))
##
## Call:
## lm(formula = EscapeDistance ~ Age * Sex, data = roo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.4427 -3.4427 -0.2214 3.5573 27.1196
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.2214 0.1777 226.367 <2e-16 ***
## AgeYoung -14.8257 0.2977 -49.808 <2e-16 ***
## SexMale 4.2213 0.2782 15.174 <2e-16 ***
## AgeYoung:SexMale -3.7366 0.4278 -8.735 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.177 on 2452 degrees of freedom
## Multiple R-squared: 0.7145, Adjusted R-squared: 0.7142
## F-statistic: 2046 on 3 and 2452 DF, p-value: < 2.2e-16
However, all data points were not collected independently of each others. We measured individuals multiple times and we repeated the experiment on multiple years.
Transform this model into a mixed model with id and year as random effects. Does that change the parameter estimates and Std. Error?
m1 <- lmer(EscapeDistance ~ Sex*Age + (1|id) + (1|Year), data = roo)
summary(m1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: EscapeDistance ~ Sex * Age + (1 | id) + (1 | Year)
## Data: roo
##
## REML criterion at convergence: 14572
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.7235 -0.5637 -0.0198 0.5645 6.5002
##
## Random effects:
## Groups Name Variance Std.Dev.
## id (Intercept) 10.7936 3.2854
## Year (Intercept) 0.8944 0.9457
## Residual 16.0825 4.0103
## Number of obs: 2456, groups: id, 818; Year, 8
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 39.7782 0.4295 14.4181 92.609 < 2e-16 ***
## SexMale 3.7820 0.4028 923.2886 9.390 < 2e-16 ***
## AgeYoung -14.8392 0.3098 2303.3270 -47.894 < 2e-16 ***
## SexMale:AgeYoung -2.9305 0.4576 2257.7653 -6.404 1.84e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) SexMal AgeYng
## SexMale -0.406
## AgeYoung -0.350 0.359
## SexMl:AgYng 0.237 -0.643 -0.647
anova(m1)
## Type III Analysis of Variance Table with Satterthwaite's method
## Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
## Sex 899 899 1 660.73 55.911 2.420e-13 ***
## Age 75583 75583 1 1940.15 4699.689 < 2.2e-16 ***
## Sex:Age 660 660 1 2257.77 41.012 1.835e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The results do not change dramatically, but parameters estimate change a bit and the uncertainty in the parameter estimates increases (larger standard errors and p-values).