Linear models, interpretation, prediction, additive effects, interactions

library(ggplot2)
library(performance)
library(emmeans)
library(plyr)

Goals for today:

Understand the ouput of lm(), summary(lm()), anova(lm()), emmeans(lm())
Be able to plot model predictions from lm()
Understand the difference between additive and full-factorial models, choose the appropriate model for your question, and interpret the results correctly

Practice with lm() and associated functions

Practice fit and prediction with drought data

# download.file("https://timotheenivalis.github.io/data/droughtdata.csv", 
#              destfile = "data/droughtdata.csv")

drought_data <- read.csv("data/droughtdata.csv")
str(drought_data)

## 'data.frame':    32 obs. of  4 variables:
##  $ plant         : int  1 1 1 1 2 2 2 2 3 3 ...
##  $ Genotype      : Factor w/ 2 levels "mutant","WT": 2 2 2 2 2 2 2 2 1 1 ...
##  $ WaterCondition: Factor w/ 2 levels "Drought","Normal": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Temperature   : num  30 31 31.6 31.9 32 32.1 32.7 33.5 30 31 ...

wt_data <- drought_data[drought_data$Genotype=="WT",]

str(drought_data)

## 'data.frame':    32 obs. of  4 variables:
##  $ plant         : int  1 1 1 1 2 2 2 2 3 3 ...
##  $ Genotype      : Factor w/ 2 levels "mutant","WT": 2 2 2 2 2 2 2 2 1 1 ...
##  $ WaterCondition: Factor w/ 2 levels "Drought","Normal": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Temperature   : num  30 31 31.6 31.9 32 32.1 32.7 33.5 30 31 ...

mutant_data <- drought_data[drought_data$Genotype=="mutant",]

mmut <- lm(Temperature ~ WaterCondition, data = mutant_data)
summary(mmut)

## 
## Call:
## lm(formula = Temperature ~ WaterCondition, data = mutant_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.78750 -0.85938 -0.03125  0.99687  3.01250 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           31.6875     0.5739  55.218   <2e-16 ***
## WaterConditionNormal   0.3875     0.8116   0.477     0.64    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.623 on 14 degrees of freedom
## Multiple R-squared:  0.01602,    Adjusted R-squared:  -0.05426 
## F-statistic: 0.228 on 1 and 14 DF,  p-value: 0.6404

ggplot(mutant_data, aes(x=WaterCondition, y=Temperature)) + 
  geom_violin() + geom_jitter()

Visualize confidence interval:

pred <- c(coefficients(mmut)[1]+coefficients(mmut)[2], coefficients(mmut)[1])

confpred <- as.data.frame(emmeans(mmut, ~WaterCondition))

ggplot(mutant_data, aes(x=WaterCondition, y=Temperature)) + 
  geom_violin() + geom_jitter() + 
  geom_point(data = confpred, mapping = aes(x=WaterCondition, y=emmean), inherit.aes = FALSE, size=10, color="red") +
  geom_errorbar(data = confpred, aes(x=WaterCondition, ymin=lower.CL, ymax =upper.CL), inherit.aes = FALSE, color="blue")

What does that mean?

Visualize prediction interval:

confpred <- as.data.frame(emmeans(mmut, ~WaterCondition))

confpred$lower.PI <- confpred$emmean - 1.96*sigma(mmut)
confpred$upper.PI <- confpred$emmean + 1.96*sigma(mmut)


ggplot(mutant_data, aes(x=WaterCondition, y=Temperature)) + 
  geom_violin() + geom_jitter() + 
  geom_point(data = confpred, mapping = aes(x=WaterCondition, y=emmean), inherit.aes = FALSE, size=10, color="red") +
  geom_errorbar(data = confpred, aes(x=WaterCondition, ymin=lower.CL, ymax =upper.CL), inherit.aes = FALSE, color="blue", width=0.2)+
  geom_errorbar(data = confpred, aes(x=WaterCondition, ymin=lower.PI, ymax = upper.PI), inherit.aes = FALSE, color="red", width= 0.5)

Not too bad. Although the variation seems over-estimated in the Normal water condition, and possibly underestimated in the Drought water condition. However, given the sample size it is not really concerning.

We can repeat the procedure for the wild type data:

wt_data <- drought_data[drought_data$Genotype=="WT",]
mwt <- lm(Temperature ~ WaterCondition, data = wt_data)
summary(mwt)

## 
## Call:
## lm(formula = Temperature ~ WaterCondition, data = wt_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7875 -0.9844  0.1000  1.0625  3.4125 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           38.5875     0.5830  66.187  < 2e-16 ***
## WaterConditionNormal  -6.7375     0.8245  -8.172 1.07e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.649 on 14 degrees of freedom
## Multiple R-squared:  0.8267, Adjusted R-squared:  0.8143 
## F-statistic: 66.78 on 1 and 14 DF,  p-value: 1.069e-06

preddf <- data.frame(WaterCondition = unique(mutant_data$WaterCondition),
           prediction = pred)

confpred <- as.data.frame(emmeans(mwt, ~WaterCondition))

confpred$lower.PI <- confpred$emmean - 1.96*sigma(mwt)
confpred$upper.PI <- confpred$emmean + 1.96*sigma(mwt)

ggplot(wt_data, aes(x=WaterCondition, y=Temperature)) + 
  geom_violin() + geom_jitter() + 
  geom_point(data = confpred, mapping = aes(x=WaterCondition, y=emmean), inherit.aes = FALSE, size=10, color="red") +
  geom_errorbar(data = confpred, aes(x=WaterCondition, ymin=lower.PI, ymax = upper.PI), inherit.aes = FALSE, color="red", width= 0.5) + 
geom_errorbar(data = confpred, aes(x=WaterCondition, ymin=lower.CL, ymax = upper.CL), inherit.aes = FALSE, color="blue", width= 0.2)

Practice with the full data set

mall <- lm(Temperature ~ WaterCondition + Genotype, data = drought_data)
summary(mall)

## 
## Call:
## lm(formula = Temperature ~ WaterCondition + Genotype, data = drought_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5688 -1.7406 -0.2125  1.8063  5.1938 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           33.4688     0.7553  44.310  < 2e-16 ***
## WaterConditionNormal  -3.1750     0.8722  -3.640 0.001052 ** 
## GenotypeWT             3.3375     0.8722   3.827 0.000639 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.467 on 29 degrees of freedom
## Multiple R-squared:  0.4903, Adjusted R-squared:  0.4551 
## F-statistic: 13.95 on 2 and 29 DF,  p-value: 5.705e-05

confpred <- as.data.frame(emmeans(mall, ~ WaterCondition + Genotype))
confpred$lower.PI <- confpred$emmean - 1.96*sigma(mall)
confpred$upper.PI <- confpred$emmean + 1.96*sigma(mall)

ggplot(drought_data, aes(x=interaction(WaterCondition,Genotype), y=Temperature)) + 
  geom_violin() + geom_jitter() +
   geom_point(data = confpred, mapping = aes(x=interaction(WaterCondition,Genotype), y=emmean), inherit.aes = FALSE, size=10, color="red") +
    geom_errorbar(data = confpred, aes(x=interaction(WaterCondition,Genotype), ymin=lower.PI, ymax = upper.PI), inherit.aes = FALSE, color="red", width= 0.5)+ 
geom_errorbar(data = confpred, aes(x=interaction(WaterCondition,Genotype), ymin=lower.CL, ymax = upper.CL), inherit.aes = FALSE, color="blue", width= 0.2)

It looks wrong! Not necessarily, it depends what we want to estimate.

Additive and interactive (full factorial) models

Here, the question that led to the experiment was: does the mutant deal better with drought than the wild type? In other words:

mallinter <- lm(Temperature ~ WaterCondition * Genotype, data = drought_data)

confpred <- as.data.frame(emmeans(mallinter, ~ WaterCondition * Genotype))
confpred$lower.PI <- confpred$emmean - 1.96*sigma(mall)
confpred$upper.PI <- confpred$emmean + 1.96*sigma(mall)

ggplot(drought_data, aes(x=interaction(WaterCondition,Genotype), y=Temperature)) + 
  geom_violin() + geom_jitter() +
   geom_point(data = confpred, mapping = aes(x=interaction(WaterCondition,Genotype), y=emmean), inherit.aes = FALSE, size=10, color="red") +
    geom_errorbar(data = confpred, aes(x=interaction(WaterCondition,Genotype), ymin=lower.PI, ymax = upper.PI), inherit.aes = FALSE, color="red", width= 0.5)+ 
geom_errorbar(data = confpred, aes(x=interaction(WaterCondition,Genotype), ymin=lower.CL, ymax = upper.CL), inherit.aes = FALSE, color="blue", width= 0.2)

Now we know this model fits the data well, let’s try to understand why based on the summary:

summary(mallinter)

## 
## Call:
## lm(formula = Temperature ~ WaterCondition * Genotype, data = drought_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7875 -0.9062  0.0313  1.0625  3.4125 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      31.6875     0.5785  54.780  < 2e-16 ***
## WaterConditionNormal              0.3875     0.8181   0.474    0.639    
## GenotypeWT                        6.9000     0.8181   8.435 3.58e-09 ***
## WaterConditionNormal:GenotypeWT  -7.1250     1.1569  -6.159 1.20e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.636 on 28 degrees of freedom
## Multiple R-squared:  0.7835, Adjusted R-squared:  0.7603 
## F-statistic: 33.78 on 3 and 28 DF,  p-value: 1.921e-09

# Water Drought, Genotype Mutant
coef(mallinter)["(Intercept)"]

## (Intercept) 
##     31.6875

# Water Normal, Genotype Mutant
coef(mallinter)["(Intercept)"] + coef(mallinter)["WaterConditionNormal"]

## (Intercept) 
##      32.075

# Water Drought, Genotype WT
coef(mallinter)["(Intercept)"] + coef(mallinter)["GenotypeWT"]

## (Intercept) 
##     38.5875

# Water Normal, Genotype WT

coef(mallinter)["(Intercept)"] + coef(mallinter)["GenotypeWT"]+ coef(mallinter)["WaterConditionNormal"]  ## WRONG

## (Intercept) 
##      38.975

coef(mallinter)["(Intercept)"] + coef(mallinter)["GenotypeWT"]+ coef(mallinter)["WaterConditionNormal"] + coef(mallinter)["WaterConditionNormal:GenotypeWT"] ## RIGHT

## (Intercept) 
##       31.85

Draw transitions with a pencil.

What question did we want to ask?

Are we trying to build a predictive model?
Are we interested in average effects?
Are we interested in differences in differences?

summary(mallinter)

## 
## Call:
## lm(formula = Temperature ~ WaterCondition * Genotype, data = drought_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7875 -0.9062  0.0313  1.0625  3.4125 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      31.6875     0.5785  54.780  < 2e-16 ***
## WaterConditionNormal              0.3875     0.8181   0.474    0.639    
## GenotypeWT                        6.9000     0.8181   8.435 3.58e-09 ***
## WaterConditionNormal:GenotypeWT  -7.1250     1.1569  -6.159 1.20e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.636 on 28 degrees of freedom
## Multiple R-squared:  0.7835, Adjusted R-squared:  0.7603 
## F-statistic: 33.78 on 3 and 28 DF,  p-value: 1.921e-09

anova(mallinter)

Why not to just do two tests and see that one is significant and the other not? Many cases where differences in p-values is unrelated to differences in differences. Differences in sample sizes or variability can generate differences in p-values for a predictor although effects are the same across other predictors.

Below is an example of a difference in p-value giving the wrong impression that differences are different, while they are not. You don’t need to worry about what the code does exactly. Just know it simulates data with an effect of the variable x, and that this effect is the same in both populations (pop). The sample size is very different among populations though.

set.seed(1859)
mean1 <- 0
mean2 <- 0.15

samp1 <- 8
samp2 <- 1000

sdall <- 3

dat <- data.frame(y = c(rnorm(n = samp1, mean = mean1, sd = sdall),
                rnorm(n = samp1, mean = mean2, sd = sdall),
                rnorm(n = samp2, mean = mean1, sd = sdall),
                rnorm(n = samp2, mean = mean2, sd = sdall)),
           x = c(rep("A", times=samp1), 
                 rep("B", times=samp1),
                 rep("A", times=samp2),
                 rep("B", times=samp2)),
           pop = c(rep("p1", times=2*samp1), 
                   rep("p2", times=2*samp2))
            )

If we fit models separately for populations 1 and 2, we get one significant result and the other non-significant:

summary(lm(y ~ x ,  data = dat[dat$pop=="p1",]))

## 
## Call:
## lm(formula = y ~ x, data = dat[dat$pop == "p1", ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5792 -1.4286 -0.0339  1.2641  3.4056 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  -0.6468     0.7680  -0.842    0.414
## xB            1.5616     1.0861   1.438    0.172
## 
## Residual standard error: 2.172 on 14 degrees of freedom
## Multiple R-squared:  0.1287, Adjusted R-squared:  0.06642 
## F-statistic: 2.067 on 1 and 14 DF,  p-value: 0.1725

summary(lm(y ~ x ,  data = dat[dat$pop=="p2",]))

## 
## Call:
## lm(formula = y ~ x, data = dat[dat$pop == "p2", ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.2221 -2.1222 -0.0161  2.0985 11.4287 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  0.05277    0.09520   0.554  0.57941   
## xB           0.34811    0.13463   2.586  0.00979 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.01 on 1998 degrees of freedom
## Multiple R-squared:  0.003335,   Adjusted R-squared:  0.002836 
## F-statistic: 6.686 on 1 and 1998 DF,  p-value: 0.00979

That’s just an artefact of sample size though.

If we fit a single model with an interaction, we realize that there is no clear evidence that the differences among x are different among populations:

summary(mtot <- lm(y ~ x*pop ,  data = dat))

## 
## Call:
## lm(formula = y ~ x * pop, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.2221 -2.1200 -0.0161  2.0949 11.4287 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  -0.6468     1.0626  -0.609    0.543
## xB            1.5616     1.5027   1.039    0.299
## popp2         0.6996     1.0668   0.656    0.512
## xB:popp2     -1.2135     1.5087  -0.804    0.421
## 
## Residual standard error: 3.005 on 2012 degrees of freedom
## Multiple R-squared:  0.003863,   Adjusted R-squared:  0.002378 
## F-statistic: 2.601 on 3 and 2012 DF,  p-value: 0.05056

anova(mtot)

Practice with factors

Kangaroos behavioural data: At what distance do individuals hop-away when you approach them?

We know Age has a strong effect on EscapeDistance. We wonder whether the effect is the same for both males and females.

roo <- read.csv("data/roo.csv")

summary(lm(EscapeDistance ~ 1 + Sex, data = roo))

## 
## Call:
## lm(formula = EscapeDistance ~ 1 + Sex, data = roo)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.431  -7.939   1.061   7.569  26.569 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  34.9386     0.2666 131.047   <2e-16 ***
## SexMale       0.4924     0.3918   1.257    0.209    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.683 on 2454 degrees of freedom
## Multiple R-squared:  0.000643,   Adjusted R-squared:  0.0002358 
## F-statistic: 1.579 on 1 and 2454 DF,  p-value: 0.209

summary(lm(EscapeDistance ~ 1 + Age, data = roo))

## 
## Call:
## lm(formula = EscapeDistance ~ 1 + Age, data = roo)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.9435  -3.6575   0.0565   3.3425  27.3425 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  41.9435     0.1430  293.32   <2e-16 ***
## AgeYoung    -16.2860     0.2217  -73.47   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.415 on 2454 degrees of freedom
## Multiple R-squared:  0.6875, Adjusted R-squared:  0.6873 
## F-statistic:  5398 on 1 and 2454 DF,  p-value: < 2.2e-16

summary(lm(EscapeDistance ~ 1 + Age*Sex, data = roo))

## 
## Call:
## lm(formula = EscapeDistance ~ 1 + Age * Sex, data = roo)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -23.4427  -3.4427  -0.2214   3.5573  27.1196 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       40.2214     0.1777 226.367   <2e-16 ***
## AgeYoung         -14.8257     0.2977 -49.808   <2e-16 ***
## SexMale            4.2213     0.2782  15.174   <2e-16 ***
## AgeYoung:SexMale  -3.7366     0.4278  -8.735   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.177 on 2452 degrees of freedom
## Multiple R-squared:  0.7145, Adjusted R-squared:  0.7142 
## F-statistic:  2046 on 3 and 2452 DF,  p-value: < 2.2e-16

ggplot(roo, aes(x=interaction(Sex, Age), y=EscapeDistance)) + geom_violin()

So we have good evidence that the age differences are different in males and females; or equivalently, the sex differences are different in young and adults.

Now, let’s say you want to estimate the effect of sex on average. You know that years where quite different sex ration through years. So you would like to control for year, as a factor. Which model do you fit?

An additive model ?

summary(lm(EscapeDistance ~ 1 + Sex + as.factor(Year), data = roo))

## 
## Call:
## lm(formula = EscapeDistance ~ 1 + Sex + as.factor(Year), data = roo)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -26.196  -7.619   1.351   7.381  26.169 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          30.8309     0.6880  44.811  < 2e-16 ***
## SexMale               0.5774     0.3878   1.489 0.136598    
## as.factor(Year)2007   4.2654     0.7899   5.400 7.31e-08 ***
## as.factor(Year)2008   3.4286     0.8210   4.176 3.07e-05 ***
## as.factor(Year)2009   4.6459     0.7999   5.808 7.14e-09 ***
## as.factor(Year)2010   6.7877     0.8652   7.846 6.38e-15 ***
## as.factor(Year)2011   6.4078     1.1875   5.396 7.46e-08 ***
## as.factor(Year)2012   4.7933     1.0000   4.793 1.74e-06 ***
## as.factor(Year)2013   2.8913     0.8372   3.454 0.000562 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.547 on 2447 degrees of freedom
## Multiple R-squared:  0.03117,    Adjusted R-squared:  0.028 
## F-statistic:  9.84 on 8 and 2447 DF,  p-value: 1.5e-13

Or an interactive model?

summary(mx <- lm(EscapeDistance ~ 1 + Sex * as.factor(Year), data = roo))

## 
## Call:
## lm(formula = EscapeDistance ~ 1 + Sex * as.factor(Year), data = roo)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.469  -7.234   1.012   7.361  24.191 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  32.8091     0.9049  36.257  < 2e-16 ***
## SexMale                      -3.6441     1.3219  -2.757 0.005882 ** 
## as.factor(Year)2007           2.4250     1.0669   2.273 0.023119 *  
## as.factor(Year)2008           2.6709     1.1548   2.313 0.020812 *  
## as.factor(Year)2009           2.1789     1.0866   2.005 0.045042 *  
## as.factor(Year)2010           3.8397     1.1640   3.299 0.000986 ***
## as.factor(Year)2011           4.2951     1.6418   2.616 0.008948 ** 
## as.factor(Year)2012           0.3909     1.3293   0.294 0.768726    
## as.factor(Year)2013           1.1805     1.1349   1.040 0.298361    
## SexMale:as.factor(Year)2007   3.9031     1.5766   2.476 0.013369 *  
## SexMale:as.factor(Year)2008   2.0167     1.6362   1.233 0.217876    
## SexMale:as.factor(Year)2009   5.2956     1.5944   3.321 0.000909 ***
## SexMale:as.factor(Year)2010   6.4641     1.7284   3.740 0.000188 ***
## SexMale:as.factor(Year)2011   4.4965     2.3627   1.903 0.057138 .  
## SexMale:as.factor(Year)2012  10.0324     2.0050   5.004 6.03e-07 ***
## SexMale:as.factor(Year)2013   3.6292     1.6693   2.174 0.029796 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.491 on 2440 degrees of freedom
## Multiple R-squared:  0.04537,    Adjusted R-squared:  0.0395 
## F-statistic:  7.73 on 15 and 2440 DF,  p-value: < 2.2e-16