library(ggplot2)
library(performance)
library(emmeans)
library(plyr)
Goals for today:
# download.file("https://timotheenivalis.github.io/data/droughtdata.csv",
# destfile = "data/droughtdata.csv")
drought_data <- read.csv("data/droughtdata.csv")
str(drought_data)
## 'data.frame': 32 obs. of 4 variables:
## $ plant : int 1 1 1 1 2 2 2 2 3 3 ...
## $ Genotype : Factor w/ 2 levels "mutant","WT": 2 2 2 2 2 2 2 2 1 1 ...
## $ WaterCondition: Factor w/ 2 levels "Drought","Normal": 2 2 2 2 2 2 2 2 2 2 ...
## $ Temperature : num 30 31 31.6 31.9 32 32.1 32.7 33.5 30 31 ...
wt_data <- drought_data[drought_data$Genotype=="WT",]
str(drought_data)
## 'data.frame': 32 obs. of 4 variables:
## $ plant : int 1 1 1 1 2 2 2 2 3 3 ...
## $ Genotype : Factor w/ 2 levels "mutant","WT": 2 2 2 2 2 2 2 2 1 1 ...
## $ WaterCondition: Factor w/ 2 levels "Drought","Normal": 2 2 2 2 2 2 2 2 2 2 ...
## $ Temperature : num 30 31 31.6 31.9 32 32.1 32.7 33.5 30 31 ...
mutant_data <- drought_data[drought_data$Genotype=="mutant",]
mmut <- lm(Temperature ~ WaterCondition, data = mutant_data)
summary(mmut)
##
## Call:
## lm(formula = Temperature ~ WaterCondition, data = mutant_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.78750 -0.85938 -0.03125 0.99687 3.01250
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.6875 0.5739 55.218 <2e-16 ***
## WaterConditionNormal 0.3875 0.8116 0.477 0.64
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.623 on 14 degrees of freedom
## Multiple R-squared: 0.01602, Adjusted R-squared: -0.05426
## F-statistic: 0.228 on 1 and 14 DF, p-value: 0.6404
ggplot(mutant_data, aes(x=WaterCondition, y=Temperature)) +
geom_violin() + geom_jitter()
Visualize confidence interval:
pred <- c(coefficients(mmut)[1]+coefficients(mmut)[2], coefficients(mmut)[1])
confpred <- as.data.frame(emmeans(mmut, ~WaterCondition))
ggplot(mutant_data, aes(x=WaterCondition, y=Temperature)) +
geom_violin() + geom_jitter() +
geom_point(data = confpred, mapping = aes(x=WaterCondition, y=emmean), inherit.aes = FALSE, size=10, color="red") +
geom_errorbar(data = confpred, aes(x=WaterCondition, ymin=lower.CL, ymax =upper.CL), inherit.aes = FALSE, color="blue")
What does that mean?
Visualize prediction interval:
confpred <- as.data.frame(emmeans(mmut, ~WaterCondition))
confpred$lower.PI <- confpred$emmean - 1.96*sigma(mmut)
confpred$upper.PI <- confpred$emmean + 1.96*sigma(mmut)
ggplot(mutant_data, aes(x=WaterCondition, y=Temperature)) +
geom_violin() + geom_jitter() +
geom_point(data = confpred, mapping = aes(x=WaterCondition, y=emmean), inherit.aes = FALSE, size=10, color="red") +
geom_errorbar(data = confpred, aes(x=WaterCondition, ymin=lower.CL, ymax =upper.CL), inherit.aes = FALSE, color="blue", width=0.2)+
geom_errorbar(data = confpred, aes(x=WaterCondition, ymin=lower.PI, ymax = upper.PI), inherit.aes = FALSE, color="red", width= 0.5)
Not too bad. Although the variation seems over-estimated in the Normal water condition, and possibly underestimated in the Drought water condition. However, given the sample size it is not really concerning.
We can repeat the procedure for the wild type data:
wt_data <- drought_data[drought_data$Genotype=="WT",]
mwt <- lm(Temperature ~ WaterCondition, data = wt_data)
summary(mwt)
##
## Call:
## lm(formula = Temperature ~ WaterCondition, data = wt_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7875 -0.9844 0.1000 1.0625 3.4125
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.5875 0.5830 66.187 < 2e-16 ***
## WaterConditionNormal -6.7375 0.8245 -8.172 1.07e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.649 on 14 degrees of freedom
## Multiple R-squared: 0.8267, Adjusted R-squared: 0.8143
## F-statistic: 66.78 on 1 and 14 DF, p-value: 1.069e-06
preddf <- data.frame(WaterCondition = unique(mutant_data$WaterCondition),
prediction = pred)
confpred <- as.data.frame(emmeans(mwt, ~WaterCondition))
confpred$lower.PI <- confpred$emmean - 1.96*sigma(mwt)
confpred$upper.PI <- confpred$emmean + 1.96*sigma(mwt)
ggplot(wt_data, aes(x=WaterCondition, y=Temperature)) +
geom_violin() + geom_jitter() +
geom_point(data = confpred, mapping = aes(x=WaterCondition, y=emmean), inherit.aes = FALSE, size=10, color="red") +
geom_errorbar(data = confpred, aes(x=WaterCondition, ymin=lower.PI, ymax = upper.PI), inherit.aes = FALSE, color="red", width= 0.5) +
geom_errorbar(data = confpred, aes(x=WaterCondition, ymin=lower.CL, ymax = upper.CL), inherit.aes = FALSE, color="blue", width= 0.2)
mall <- lm(Temperature ~ WaterCondition + Genotype, data = drought_data)
summary(mall)
##
## Call:
## lm(formula = Temperature ~ WaterCondition + Genotype, data = drought_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5688 -1.7406 -0.2125 1.8063 5.1938
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.4688 0.7553 44.310 < 2e-16 ***
## WaterConditionNormal -3.1750 0.8722 -3.640 0.001052 **
## GenotypeWT 3.3375 0.8722 3.827 0.000639 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.467 on 29 degrees of freedom
## Multiple R-squared: 0.4903, Adjusted R-squared: 0.4551
## F-statistic: 13.95 on 2 and 29 DF, p-value: 5.705e-05
confpred <- as.data.frame(emmeans(mall, ~ WaterCondition + Genotype))
confpred$lower.PI <- confpred$emmean - 1.96*sigma(mall)
confpred$upper.PI <- confpred$emmean + 1.96*sigma(mall)
ggplot(drought_data, aes(x=interaction(WaterCondition,Genotype), y=Temperature)) +
geom_violin() + geom_jitter() +
geom_point(data = confpred, mapping = aes(x=interaction(WaterCondition,Genotype), y=emmean), inherit.aes = FALSE, size=10, color="red") +
geom_errorbar(data = confpred, aes(x=interaction(WaterCondition,Genotype), ymin=lower.PI, ymax = upper.PI), inherit.aes = FALSE, color="red", width= 0.5)+
geom_errorbar(data = confpred, aes(x=interaction(WaterCondition,Genotype), ymin=lower.CL, ymax = upper.CL), inherit.aes = FALSE, color="blue", width= 0.2)
It looks wrong! Not necessarily, it depends what we want to estimate.
Here, the question that led to the experiment was: does the mutant deal better with drought than the wild type? In other words:
mallinter <- lm(Temperature ~ WaterCondition * Genotype, data = drought_data)
confpred <- as.data.frame(emmeans(mallinter, ~ WaterCondition * Genotype))
confpred$lower.PI <- confpred$emmean - 1.96*sigma(mall)
confpred$upper.PI <- confpred$emmean + 1.96*sigma(mall)
ggplot(drought_data, aes(x=interaction(WaterCondition,Genotype), y=Temperature)) +
geom_violin() + geom_jitter() +
geom_point(data = confpred, mapping = aes(x=interaction(WaterCondition,Genotype), y=emmean), inherit.aes = FALSE, size=10, color="red") +
geom_errorbar(data = confpred, aes(x=interaction(WaterCondition,Genotype), ymin=lower.PI, ymax = upper.PI), inherit.aes = FALSE, color="red", width= 0.5)+
geom_errorbar(data = confpred, aes(x=interaction(WaterCondition,Genotype), ymin=lower.CL, ymax = upper.CL), inherit.aes = FALSE, color="blue", width= 0.2)
Now we know this model fits the data well, let’s try to understand why based on the summary:
summary(mallinter)
##
## Call:
## lm(formula = Temperature ~ WaterCondition * Genotype, data = drought_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7875 -0.9062 0.0313 1.0625 3.4125
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.6875 0.5785 54.780 < 2e-16 ***
## WaterConditionNormal 0.3875 0.8181 0.474 0.639
## GenotypeWT 6.9000 0.8181 8.435 3.58e-09 ***
## WaterConditionNormal:GenotypeWT -7.1250 1.1569 -6.159 1.20e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.636 on 28 degrees of freedom
## Multiple R-squared: 0.7835, Adjusted R-squared: 0.7603
## F-statistic: 33.78 on 3 and 28 DF, p-value: 1.921e-09
# Water Drought, Genotype Mutant
coef(mallinter)["(Intercept)"]
## (Intercept)
## 31.6875
# Water Normal, Genotype Mutant
coef(mallinter)["(Intercept)"] + coef(mallinter)["WaterConditionNormal"]
## (Intercept)
## 32.075
# Water Drought, Genotype WT
coef(mallinter)["(Intercept)"] + coef(mallinter)["GenotypeWT"]
## (Intercept)
## 38.5875
# Water Normal, Genotype WT
coef(mallinter)["(Intercept)"] + coef(mallinter)["GenotypeWT"]+ coef(mallinter)["WaterConditionNormal"] ## WRONG
## (Intercept)
## 38.975
coef(mallinter)["(Intercept)"] + coef(mallinter)["GenotypeWT"]+ coef(mallinter)["WaterConditionNormal"] + coef(mallinter)["WaterConditionNormal:GenotypeWT"] ## RIGHT
## (Intercept)
## 31.85
Draw transitions with a pencil.
summary(mallinter)
##
## Call:
## lm(formula = Temperature ~ WaterCondition * Genotype, data = drought_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7875 -0.9062 0.0313 1.0625 3.4125
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.6875 0.5785 54.780 < 2e-16 ***
## WaterConditionNormal 0.3875 0.8181 0.474 0.639
## GenotypeWT 6.9000 0.8181 8.435 3.58e-09 ***
## WaterConditionNormal:GenotypeWT -7.1250 1.1569 -6.159 1.20e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.636 on 28 degrees of freedom
## Multiple R-squared: 0.7835, Adjusted R-squared: 0.7603
## F-statistic: 33.78 on 3 and 28 DF, p-value: 1.921e-09
anova(mallinter)
Why not to just do two tests and see that one is significant and the other not? Many cases where differences in p-values is unrelated to differences in differences. Differences in sample sizes or variability can generate differences in p-values for a predictor although effects are the same across other predictors.
Below is an example of a difference in p-value giving the wrong impression that differences are different, while they are not. You don’t need to worry about what the code does exactly. Just know it simulates data with an effect of the variable x, and that this effect is the same in both populations (pop). The sample size is very different among populations though.
set.seed(1859)
mean1 <- 0
mean2 <- 0.15
samp1 <- 8
samp2 <- 1000
sdall <- 3
dat <- data.frame(y = c(rnorm(n = samp1, mean = mean1, sd = sdall),
rnorm(n = samp1, mean = mean2, sd = sdall),
rnorm(n = samp2, mean = mean1, sd = sdall),
rnorm(n = samp2, mean = mean2, sd = sdall)),
x = c(rep("A", times=samp1),
rep("B", times=samp1),
rep("A", times=samp2),
rep("B", times=samp2)),
pop = c(rep("p1", times=2*samp1),
rep("p2", times=2*samp2))
)
If we fit models separately for populations 1 and 2, we get one significant result and the other non-significant:
summary(lm(y ~ x , data = dat[dat$pop=="p1",]))
##
## Call:
## lm(formula = y ~ x, data = dat[dat$pop == "p1", ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5792 -1.4286 -0.0339 1.2641 3.4056
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.6468 0.7680 -0.842 0.414
## xB 1.5616 1.0861 1.438 0.172
##
## Residual standard error: 2.172 on 14 degrees of freedom
## Multiple R-squared: 0.1287, Adjusted R-squared: 0.06642
## F-statistic: 2.067 on 1 and 14 DF, p-value: 0.1725
summary(lm(y ~ x , data = dat[dat$pop=="p2",]))
##
## Call:
## lm(formula = y ~ x, data = dat[dat$pop == "p2", ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.2221 -2.1222 -0.0161 2.0985 11.4287
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.05277 0.09520 0.554 0.57941
## xB 0.34811 0.13463 2.586 0.00979 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.01 on 1998 degrees of freedom
## Multiple R-squared: 0.003335, Adjusted R-squared: 0.002836
## F-statistic: 6.686 on 1 and 1998 DF, p-value: 0.00979
That’s just an artefact of sample size though.
If we fit a single model with an interaction, we realize that there is no clear evidence that the differences among x are different among populations:
summary(mtot <- lm(y ~ x*pop , data = dat))
##
## Call:
## lm(formula = y ~ x * pop, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.2221 -2.1200 -0.0161 2.0949 11.4287
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.6468 1.0626 -0.609 0.543
## xB 1.5616 1.5027 1.039 0.299
## popp2 0.6996 1.0668 0.656 0.512
## xB:popp2 -1.2135 1.5087 -0.804 0.421
##
## Residual standard error: 3.005 on 2012 degrees of freedom
## Multiple R-squared: 0.003863, Adjusted R-squared: 0.002378
## F-statistic: 2.601 on 3 and 2012 DF, p-value: 0.05056
anova(mtot)
Kangaroos behavioural data: At what distance do individuals hop-away when you approach them?
We know Age has a strong effect on EscapeDistance. We wonder whether the effect is the same for both males and females.
roo <- read.csv("data/roo.csv")
summary(lm(EscapeDistance ~ 1 + Sex, data = roo))
##
## Call:
## lm(formula = EscapeDistance ~ 1 + Sex, data = roo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.431 -7.939 1.061 7.569 26.569
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.9386 0.2666 131.047 <2e-16 ***
## SexMale 0.4924 0.3918 1.257 0.209
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.683 on 2454 degrees of freedom
## Multiple R-squared: 0.000643, Adjusted R-squared: 0.0002358
## F-statistic: 1.579 on 1 and 2454 DF, p-value: 0.209
summary(lm(EscapeDistance ~ 1 + Age, data = roo))
##
## Call:
## lm(formula = EscapeDistance ~ 1 + Age, data = roo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.9435 -3.6575 0.0565 3.3425 27.3425
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.9435 0.1430 293.32 <2e-16 ***
## AgeYoung -16.2860 0.2217 -73.47 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.415 on 2454 degrees of freedom
## Multiple R-squared: 0.6875, Adjusted R-squared: 0.6873
## F-statistic: 5398 on 1 and 2454 DF, p-value: < 2.2e-16
summary(lm(EscapeDistance ~ 1 + Age*Sex, data = roo))
##
## Call:
## lm(formula = EscapeDistance ~ 1 + Age * Sex, data = roo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.4427 -3.4427 -0.2214 3.5573 27.1196
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.2214 0.1777 226.367 <2e-16 ***
## AgeYoung -14.8257 0.2977 -49.808 <2e-16 ***
## SexMale 4.2213 0.2782 15.174 <2e-16 ***
## AgeYoung:SexMale -3.7366 0.4278 -8.735 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.177 on 2452 degrees of freedom
## Multiple R-squared: 0.7145, Adjusted R-squared: 0.7142
## F-statistic: 2046 on 3 and 2452 DF, p-value: < 2.2e-16
ggplot(roo, aes(x=interaction(Sex, Age), y=EscapeDistance)) + geom_violin()
So we have good evidence that the age differences are different in males and females; or equivalently, the sex differences are different in young and adults.
Now, let’s say you want to estimate the effect of sex on average. You know that years where quite different sex ration through years. So you would like to control for year, as a factor. Which model do you fit?
An additive model ?
summary(lm(EscapeDistance ~ 1 + Sex + as.factor(Year), data = roo))
##
## Call:
## lm(formula = EscapeDistance ~ 1 + Sex + as.factor(Year), data = roo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.196 -7.619 1.351 7.381 26.169
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.8309 0.6880 44.811 < 2e-16 ***
## SexMale 0.5774 0.3878 1.489 0.136598
## as.factor(Year)2007 4.2654 0.7899 5.400 7.31e-08 ***
## as.factor(Year)2008 3.4286 0.8210 4.176 3.07e-05 ***
## as.factor(Year)2009 4.6459 0.7999 5.808 7.14e-09 ***
## as.factor(Year)2010 6.7877 0.8652 7.846 6.38e-15 ***
## as.factor(Year)2011 6.4078 1.1875 5.396 7.46e-08 ***
## as.factor(Year)2012 4.7933 1.0000 4.793 1.74e-06 ***
## as.factor(Year)2013 2.8913 0.8372 3.454 0.000562 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.547 on 2447 degrees of freedom
## Multiple R-squared: 0.03117, Adjusted R-squared: 0.028
## F-statistic: 9.84 on 8 and 2447 DF, p-value: 1.5e-13
Or an interactive model?
summary(mx <- lm(EscapeDistance ~ 1 + Sex * as.factor(Year), data = roo))
##
## Call:
## lm(formula = EscapeDistance ~ 1 + Sex * as.factor(Year), data = roo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.469 -7.234 1.012 7.361 24.191
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.8091 0.9049 36.257 < 2e-16 ***
## SexMale -3.6441 1.3219 -2.757 0.005882 **
## as.factor(Year)2007 2.4250 1.0669 2.273 0.023119 *
## as.factor(Year)2008 2.6709 1.1548 2.313 0.020812 *
## as.factor(Year)2009 2.1789 1.0866 2.005 0.045042 *
## as.factor(Year)2010 3.8397 1.1640 3.299 0.000986 ***
## as.factor(Year)2011 4.2951 1.6418 2.616 0.008948 **
## as.factor(Year)2012 0.3909 1.3293 0.294 0.768726
## as.factor(Year)2013 1.1805 1.1349 1.040 0.298361
## SexMale:as.factor(Year)2007 3.9031 1.5766 2.476 0.013369 *
## SexMale:as.factor(Year)2008 2.0167 1.6362 1.233 0.217876
## SexMale:as.factor(Year)2009 5.2956 1.5944 3.321 0.000909 ***
## SexMale:as.factor(Year)2010 6.4641 1.7284 3.740 0.000188 ***
## SexMale:as.factor(Year)2011 4.4965 2.3627 1.903 0.057138 .
## SexMale:as.factor(Year)2012 10.0324 2.0050 5.004 6.03e-07 ***
## SexMale:as.factor(Year)2013 3.6292 1.6693 2.174 0.029796 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.491 on 2440 degrees of freedom
## Multiple R-squared: 0.04537, Adjusted R-squared: 0.0395
## F-statistic: 7.73 on 15 and 2440 DF, p-value: < 2.2e-16