Today:

  * Understand why linear models do not work well with some types of data, such as binary data.
  * Fit generalized linear models, in particular binomial GLMs (a.k.a. logistic regression).
  * Interpret and visualise binomial GLMs.

download.file("https://timotheenivalis.github.io/data/survivalweight.csv", 
              destfile = "data/survivalweight.csv")

download.file("https://timotheenivalis.github.io/data/voles.csv", 
              destfile = "data/voles.csv")
library(ggplot2)
library(performance)

Failure of linear models

Here is a typical linear model (linear regression) performing okay:

set.seed(123)
x <- rnorm(20)
y <- 1 + x + rnorm(20)

datlinear <- data.frame(x=x, y=y)
lm0 <- lm(y~x, data = datlinear)
  
ggplot(datlinear, aes(x=x, y=y)) +
  geom_smooth(method="lm") + geom_point() +
  geom_segment(aes(x=x, y=y, xend=x, yend=lm0$fitted.values))
## `geom_smooth()` using formula 'y ~ x'

check_model(lm0)
## Not enough model terms in the conditional part of the model to check for multicollinearity.
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 20 rows containing missing values (geom_text_repel).

Now a model with the same structure, but fitted to different data (binary data), has much more questionable performance:

set.seed(123)
x <- rnorm(30)
latent <- 1 + 2*x + rnorm(30, sd = 0.5)
y <- 1/(1+exp(-latent))
obs <- sapply(y, FUN=function(x){rbinom(1,1,x)})

datbinary <- data.frame(x=x, y=obs)
lm1 <- lm(y~x, data = datbinary)

ggplot(datbinary, aes(x=x, y=y)) +
  geom_smooth(method="lm", fullrange=TRUE) + geom_point() +
  geom_segment(aes(x=x, y=y, xend=x, yend=lm1$fitted.values)) +
  xlim(c(-3,2))
## `geom_smooth()` using formula 'y ~ x'

check_model(lm1)
## Not enough model terms in the conditional part of the model to check for multicollinearity.
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 30 rows containing missing values (geom_text_repel).
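One way to see the problem concretely: nothing stops the straight line from predicting values below 0 or above 1, which cannot be probabilities. A quick check (a sketch; the exact values depend on the simulated data):

# linear-model predictions at the edges of the plotted range
predict(lm1, newdata = data.frame(x = c(-3, 2)))
# typically one value is negative and the other exceeds 1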

Fit GLM for binary data: “binomial” family

glm1 <- glm(y~x, data = datbinary, family = "binomial")

ggplot(datbinary, aes(x=x, y=y)) +
  geom_smooth(method="glm", method.args = list(family="binomial"), fullrange=TRUE) + geom_point() +
  geom_segment(aes(x=x, y=y, xend=x, yend=glm1$fitted.values)) +
  xlim(c(-3,2))
## `geom_smooth()` using formula 'y ~ x'

For binary binomial GLMs there are no assumptions about the distribution of the residuals, apart from the independence of the data-generating process. The usual diagnostic plots are not really useful: they look ugly, but that is fine, because they assess a linear model’s assumptions, not a GLM’s.

plot(glm1)

check_model(glm1)
## Not enough model terms in the conditional part of the model to check for multicollinearity.
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 29 rows containing missing values (geom_text_repel).
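If you do want a residual check that is meaningful for binary data, the performance package (already loaded above) provides binned residuals (after Gelman & Hill): residuals are averaged within bins of fitted values and compared with their expected variability. A minimal sketch:

# binned residuals: a residual diagnostic designed for binary GLMs
binned_residuals(glm1)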

How to interpret results?

summary(glm1)
## 
## Call:
## glm(formula = y ~ x, family = "binomial", data = datbinary)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3629  -0.6372   0.2605   0.6473   1.2642  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   1.2915     0.6017   2.147   0.0318 *
## x             2.0488     0.8032   2.551   0.0108 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 38.191  on 29  degrees of freedom
## Residual deviance: 25.764  on 28  degrees of freedom
## AIC: 29.764
## 
## Number of Fisher Scoring iterations: 5
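The estimates are on the logit scale. One common way to read them (an aside, not shown in the output above) is as odds ratios: exponentiating a coefficient gives the multiplicative change in the odds of observing a 1.

# exponentiated coefficients are odds ratios
exp(coef(glm1))
# exp(2.0488) is about 7.8: each unit increase in x multiplies the odds of y = 1 by about 7.8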
# linear predictor by hand: intercept + slope * x
coef(glm1)[1] + coef(glm1)[2]*datbinary$x
##  [1]  0.14322184  0.81993689  4.48500716  1.43598230  1.55640945  4.80535080
##  [7]  2.23585006 -1.30033352 -0.11569985  0.37845217  3.79942416  2.02871144
## [13]  2.11262545  1.51829148  0.15271703  4.95255333  2.31152101 -2.73768169
## [19]  2.72846304  0.32286936 -0.89623315  0.84493754 -0.81055384 -0.20182810
## [25]  0.01094386 -2.16417356  3.00798325  1.60575560 -1.04029094  3.86034140
# same values: by default predict() returns the link (logit) scale
predict(glm1)
##           1           2           3           4           5           6 
##  0.14322184  0.81993689  4.48500716  1.43598230  1.55640945  4.80535080 
##           7           8           9          10          11          12 
##  2.23585006 -1.30033352 -0.11569985  0.37845217  3.79942416  2.02871144 
##          13          14          15          16          17          18 
##  2.11262545  1.51829148  0.15271703  4.95255333  2.31152101 -2.73768169 
##          19          20          21          22          23          24 
##  2.72846304  0.32286936 -0.89623315  0.84493754 -0.81055384 -0.20182810 
##          25          26          27          28          29          30 
##  0.01094386 -2.16417356  3.00798325  1.60575560 -1.04029094  3.86034140
datbinary$latent_y <- predict(glm1)

Visualise the data (black) vs. what the regression coefficients predict (red):

ggplot(datbinary, aes(x=x, y=y))+ geom_point(color="black")+
  geom_point(inherit.aes = FALSE, aes(x=x, y=latent_y), color="red")

The difference arises because the regression coefficients of a GLM are expressed on a different scale from the data.

To go from the data to the regression scale we apply a logit transform:

\[ \mathrm{logit}(y) = \log(\frac{y}{1-y}) \]

To go from model predictions on the regression scale to the data scale we apply the inverse transformation, which is \[ \mathrm{logit}^{-1}(y) = \frac{1}{1+e^{-y}} \] In R you can compute this inverse logit with plogis:

plogis(0)
## [1] 0.5
1/(1+exp(-0))
## [1] 0.5
plogis(0.98)
## [1] 0.7271082
1/(1+exp(-0.98))
## [1] 0.7271082
plogis(-0.98)
## [1] 0.2728918
1/(1+exp(-(-0.98)))
## [1] 0.2728918
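The logit itself is available in R as qlogis, the inverse of plogis:

qlogis(0.5)           # logit(0.5) = 0
qlogis(plogis(0.98))  # recovers 0.98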

We can better understand what the model says after the plogis transformation:

datbinary$transformed_latent_y <- plogis(predict(glm1))

ggplot(datbinary, aes(x=x, y=y))+ geom_point(color="black")+
  geom_point(inherit.aes = FALSE, aes(x=x, y=transformed_latent_y), color="red")

The model predictions obtained from the regression parameters are probabilities of observing a 1 rather than a 0.
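Instead of applying plogis by hand, predict() can return predictions directly on the data (probability) scale; a quick check that the two routes agree:

# probabilities directly, without manual back-transformation
p1 <- predict(glm1, type = "response")
p2 <- plogis(predict(glm1))
all.equal(p1, p2)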

How does the model relate these predicted probabilities to the data? Through the Bernoulli distribution, rbinom(n=, size=1, prob=), a special case of the binomial distribution with size=1.

rbinom(n = 100, size = 1, prob = 0.9)
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1
##  [38] 1 0 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1
##  [75] 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1
sapply(datbinary$transformed_latent_y, 
       function(x) rbinom(n = 10, size = 1, prob = x))
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
##  [1,]    1    1    1    1    1    1    1    0    0     1     1     1     1
##  [2,]    0    1    1    1    1    1    1    0    0     1     0     1     0
##  [3,]    1    1    1    1    0    1    1    1    1     0     1     1     1
##  [4,]    1    1    1    1    1    1    1    0    1     1     1     1     1
##  [5,]    1    1    1    1    1    1    1    0    1     1     1     1     1
##  [6,]    0    1    1    1    1    1    1    1    1     0     1     1     0
##  [7,]    0    0    1    1    1    1    1    0    0     1     1     1     1
##  [8,]    0    1    1    1    1    1    1    0    0     0     1     1     1
##  [9,]    0    1    1    1    1    1    1    0    0     1     1     1     1
## [10,]    0    1    1    0    1    1    1    1    0     1     1     1     1
##       [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25]
##  [1,]     1     1     1     1     0     1     0     0     1     0     1     1
##  [2,]     0     1     1     1     0     1     0     1     1     0     0     0
##  [3,]     1     1     1     0     0     1     0     1     1     0     0     1
##  [4,]     1     1     1     1     0     1     1     1     1     0     1     1
##  [5,]     1     1     1     1     0     1     1     0     1     0     0     0
##  [6,]     1     0     1     1     0     1     0     0     0     1     1     0
##  [7,]     0     0     1     1     1     1     1     1     1     0     1     1
##  [8,]     1     1     1     1     0     1     1     0     0     1     0     1
##  [9,]     1     0     1     1     0     1     1     0     1     0     0     1
## [10,]     1     0     1     1     1     1     1     0     0     0     1     1
##       [,26] [,27] [,28] [,29] [,30]
##  [1,]     0     1     1     0     1
##  [2,]     0     1     1     1     1
##  [3,]     0     1     0     0     1
##  [4,]     0     1     1     1     1
##  [5,]     1     1     0     0     1
##  [6,]     0     1     1     0     1
##  [7,]     0     1     0     1     1
##  [8,]     0     1     1     0     1
##  [9,]     0     1     1     1     1
## [10,]     0     1     1     0     1
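Averaging many simulated 0/1 draws per observation recovers the predicted probabilities, a quick sanity check that the simulation matches the model (a sketch; the tolerance depends on the number of draws):

# proportions of 1s across many draws approach the predicted probabilities
sims <- sapply(datbinary$transformed_latent_y,
               function(p) rbinom(n = 1000, size = 1, prob = p))
round(colMeans(sims) - datbinary$transformed_latent_y, 2)  # all close to 0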

From this example, we saw what a GLM is:

  1. A linear function (response = intercept + slope × predictor . . . ), which is what you see with summary(glm1)
  2. A “link function”, a map between the scale of the linear function (−∞ to +∞) and that of the probability distribution (0 to 1 for Bernoulli)
  3. A probability distribution (Bernoulli, binomial, Poisson. . . ) assumed to generate the data (either 0 or 1 for Bernoulli); the sketch below puts these three pieces together in code
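As a recap, a minimal sketch recomputing the fitted probabilities from the three components (using the coefficients estimated above):

# 1. linear function: intercept + slope * predictor
eta <- coef(glm1)[1] + coef(glm1)[2]*datbinary$x
# 2. inverse link function: maps (-Inf, Inf) onto (0, 1)
p <- plogis(eta)
# 3. probability distribution: Bernoulli draws given those probabilities
y_sim <- rbinom(n = length(p), size = 1, prob = p)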

Practice

survdat <- read.csv("data/survivalweight.csv")

str(survdat)
## 'data.frame':    300 obs. of  4 variables:
##  $ X       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ survival: int  1 1 1 0 1 0 0 0 1 1 ...
##  $ weight  : num  27.9 27 34.8 31 34.6 ...
##  $ sex     : Factor w/ 2 levels "Female","Male": 2 2 1 2 1 1 1 2 2 2 ...
  1. Model effect of weight
ggplot(survdat, aes(x=weight, y=survival)) +
   geom_jitter(width = 0.0, height = 0.02)  +
  geom_smooth(method = "glm", method.args = list(family="binomial"))
## `geom_smooth()` using formula 'y ~ x'

mw <- glm(survival ~ weight, data=survdat, family = "binomial")
summary(mw)
## 
## Call:
## glm(formula = survival ~ weight, family = "binomial", data = survdat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4751  -0.6718  -0.3502   0.7168   2.2858  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -18.0472     2.0750  -8.697   <2e-16 ***
## weight        0.5748     0.0670   8.578   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 402.13  on 299  degrees of freedom
## Residual deviance: 273.63  on 298  degrees of freedom
## AIC: 277.63
## 
## Number of Fisher Scoring iterations: 5
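A useful by-product of the coefficients: the predicted survival probability is 0.5 where the linear predictor crosses 0, i.e. at weight = −intercept/slope (an aside, computed from the summary above):

# weight at which predicted survival probability is 0.5
-coef(mw)[1] / coef(mw)[2]  # about 31.4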

What probability of survival does the model predict for a weight of 30?

plogis(coef(mw)[1] + coef(mw)[2]*30)
## (Intercept) 
##   0.3090714
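The same number comes out of predict() on the response scale:

# equivalent prediction with predict()
predict(mw, newdata = data.frame(weight = 30), type = "response")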
ggplot(survdat, aes(x=weight, y=survival)) +
   geom_jitter(width = 0.0, height = 0.02)  +
  geom_smooth(method = "glm", method.args = list(family="binomial"))+
  geom_point(x=30, y=plogis(coef(mw)[1] + coef(mw)[2]*30), color="red")
## `geom_smooth()` using formula 'y ~ x'

  2. Model effect of sex
ggplot(survdat, aes(x=sex, y=survival)) +
  geom_jitter(width = 0.1, height = 0.02) 

msex <- glm(survival ~ sex, data=survdat, family = "binomial")
summary(msex)
## 
## Call:
## glm(formula = survival ~ sex, family = "binomial", data = survdat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0277  -1.0277  -0.9695   1.3349   1.4006  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  -0.5108     0.1721  -2.968    0.003 **
## sexMale       0.1479     0.2369   0.624    0.532   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 402.13  on 299  degrees of freedom
## Residual deviance: 401.74  on 298  degrees of freedom
## AIC: 405.74
## 
## Number of Fisher Scoring iterations: 4
plogis(coef(msex)[1]) #female prediction
## (Intercept) 
##       0.375
plogis(coef(msex)[1]+coef(msex)[2]) #male prediction
## (Intercept) 
##   0.4102564
# automated calculation with predict:
newdat <- data.frame(sex=c("Female", "Male")) 
(newdat$pred <- predict(object = msex, newdata = newdat, type = "response"))
##         1         2 
## 0.3750000 0.4102564
ggplot(survdat, aes(x=sex, y=survival)) +
  geom_jitter(width = 0.1, height = 0.02) +
  geom_point(data = newdat, aes(x=sex, y=pred, color=sex))

# Adding SE (approximate CI)
newdat <- data.frame(sex=c("Female", "Male")) 
predobj <- predict(object = msex, newdata = newdat,
                        type = "response", se.fit=TRUE)
newdat$pred <- predobj$fit
newdat$SE <- predobj$se.fit
newdat$lowci <- newdat$pred -1.96*newdat$SE
newdat$upci <- newdat$pred +1.96*newdat$SE

ggplot(survdat, aes(x=sex, y=survival)) +
  geom_jitter(width = 0.1, height = 0.02) +
  geom_point(data = newdat, aes(x=sex, y=pred, color=sex))+
  geom_errorbar(inherit.aes = FALSE,
                data = newdat, aes(x=sex, ymin=lowci, ymax=upci, color=sex), width=0.2)

  3. Interaction weight:sex?
ggplot(survdat, aes(x=weight, y=survival, color=sex)) +
  geom_point() + geom_smooth(method = "glm", method.args = list(family="binomial"))
## `geom_smooth()` using formula 'y ~ x'

msw <- glm(survival ~ sex*weight, data=survdat, family = "binomial")
summary(msw)
## 
## Call:
## glm(formula = survival ~ sex * weight, family = "binomial", data = survdat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1963  -0.6817  -0.3200   0.7037   2.5095  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -14.20374    2.51039  -5.658 1.53e-08 ***
## sexMale         -9.50348    4.42085  -2.150   0.0316 *  
## weight           0.44572    0.08072   5.522 3.36e-08 ***
## sexMale:weight   0.31769    0.14294   2.223   0.0262 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 402.13  on 299  degrees of freedom
## Residual deviance: 267.32  on 296  degrees of freedom
## AIC: 275.32
## 
## Number of Fisher Scoring iterations: 5
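With an interaction, each sex has its own slope on the logit scale; the male slope is the sum of the weight and interaction coefficients (names taken from the summary above):

# sex-specific slopes on the logit scale
coef(msw)["weight"]                                 # female slope, about 0.45
coef(msw)["weight"] + coef(msw)["sexMale:weight"]   # male slope, about 0.76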
  4. Draw predictions and CIs for low and high weights in the case of an interaction.

You must calculate the CI on the link scale first, and only then back-transform (i.e., apply plogis); otherwise the CI can go below 0 or above 1. On the data scale, GLM CIs are asymmetrical.

newdat <- expand.grid(sex=c("Female", "Male"), weight=c(18,40)) 

# this is inexact:
predobj <- predict(object = msw, newdata = newdat,
                        type = "response", se.fit=TRUE)
newdat$pred <- predobj$fit
newdat$SE <- predobj$se.fit
newdat$lowci <- newdat$pred -1.96*newdat$SE
newdat$upci <- newdat$pred +1.96*newdat$SE
newdat # some values beyond 0 / 1
##      sex weight         pred           SE         lowci         upci
## 1 Female     18 2.064623e-03 0.0022019830 -2.251264e-03 0.0063805095
## 2   Male     18 4.697992e-05 0.0000716602 -9.347407e-05 0.0001874339
## 3 Female     40 9.740463e-01 0.0190829292  9.366438e-01 1.0114488538
## 4   Male     40 9.989197e-01 0.0011984553  9.965707e-01 1.0012686238

The correct way:

predobj <- predict(object = msw, newdata = newdat,
                        type = "link", se.fit=TRUE)
newdat$pred <- plogis(predobj$fit)
newdat$lowci <- plogis(predobj$fit -1.96*predobj$se.fit)
newdat$upci <- plogis(predobj$fit +1.96*predobj$se.fit)
newdat
##      sex weight         pred           SE        lowci         upci
## 1 Female     18 2.064623e-03 0.0022019830 2.546246e-04 0.0165282718
## 2   Male     18 4.697992e-05 0.0000716602 2.363076e-06 0.0009332137
## 3 Female     40 9.740463e-01 0.0190829292 8.952587e-01 0.9939682650
## 4   Male     40 9.989197e-01 0.0011984553 9.905549e-01 0.9998773453
ggplot(survdat, aes(x=weight, y=survival, color=sex)) +
  geom_point(alpha=0.1) + geom_smooth(method = "glm", method.args = list(family="binomial"))+
  geom_point(data=newdat, aes(x=weight, color=sex, y=pred), size=2, alpha=0.5)+
geom_errorbar(data=newdat,  inherit.aes = FALSE,
              aes(x=weight, color=sex, ymin=lowci, ymax=upci), width=1)
## `geom_smooth()` using formula 'y ~ x'

More practice

Explain variation in annual survival in the vole data set.

voles <- read.csv("data/voles.csv")
head(voles)
##         id year    sex age mass survival reproduction fitness      StMass
## 1 CN06-001 2006 Female   A   34        0            0       0  0.29151301
## 2 CN06-002 2006   Male   A   37        0            4       4  0.58055655
## 3 CN06-003 2006 Female   A   30        0            0       0 -0.09387836
## 4 CN06-004 2006 Female   A   35        0            2       2  0.38786086
## 5 CN06-005 2006   Male   A   45        0            5       5  1.35133930
## 6 CN06-006 2006   Male   A   35        0            2       2  0.38786086
##     mother
## 1     <NA>
## 2     <NA>
## 3 CN06-013
## 4     <NA>
## 5     <NA>
## 6     <NA>
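A possible starting point (one candidate model among many; choosing and checking predictors is the point of the exercise):

# one candidate model: survival as a function of mass and sex
mvole <- glm(survival ~ mass * sex, data = voles, family = "binomial")
summary(mvole)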