Intro to Data Analysis with R: Session 4

UCI Data Science Initiative

April 20, 2018

Session 4 Agenda

O-Ring Data

O-Ring Data, contd.

More information about the O-ring data is available here.

oring <- read.table("https://archive.ics.uci.edu/ml/machine-learning-databases/space-shuttle/o-ring-erosion-only.data")
names(oring) <- c("n_risk", "n_fail", "temp", "psi", "order")
head(oring)
##   n_risk n_fail temp psi order
## 1      6      0   66  50     1
## 2      6      1   70  50     2
## 3      6      0   69  50     3
## 4      6      0   68  50     4
## 5      6      0   67  50     5
## 6      6      0   72  50     6
summary(oring)
##      n_risk      n_fail            temp            psi       
##  Min.   :6   Min.   :0.0000   Min.   :53.00   Min.   : 50.0  
##  1st Qu.:6   1st Qu.:0.0000   1st Qu.:67.00   1st Qu.: 75.0  
##  Median :6   Median :0.0000   Median :70.00   Median :200.0  
##  Mean   :6   Mean   :0.3043   Mean   :69.57   Mean   :152.2  
##  3rd Qu.:6   3rd Qu.:0.5000   3rd Qu.:75.00   3rd Qu.:200.0  
##  Max.   :6   Max.   :2.0000   Max.   :81.00   Max.   :200.0  
##      order     
##  Min.   : 1.0  
##  1st Qu.: 6.5  
##  Median :12.0  
##  Mean   :12.0  
##  3rd Qu.:17.5  
##  Max.   :23.0

Visualizing the O-Ring Data

oring <- oring[order(oring$temp), ]  # sort by ascending temp
# some flights have the same number of failures & launch temp, so make point size vary 
pt.size <- rep(1, nrow(oring))
dups <- oring[duplicated(cbind(oring$temp, oring$n_fail)), c("n_fail", "temp")]
pt.size[as.numeric(rownames(dups))] <- 2
trips <- dups[duplicated(dups), ]
pt.size[as.numeric(rownames(trips))] <- 3
plot(oring$temp, oring$n_fail, pch = 20, cex = pt.size, 
     xlab = "Temperature (deg F)", ylab = "Number of Failures")  

Naive Analysis of the O-Ring Data

## Fit a linear model
linearReg <- lm(n_fail ~ temp, data = oring)
summary(linearReg)
## 
## Call:
## lm(formula = n_fail ~ temp, data = oring)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.50921 -0.30810  0.00794  0.20905  0.74381 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.30159    0.83110   5.176 3.96e-05 ***
## temp        -0.05746    0.01189  -4.833 8.90e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3935 on 21 degrees of freedom
## Multiple R-squared:  0.5266, Adjusted R-squared:  0.5041 
## F-statistic: 23.36 on 1 and 21 DF,  p-value: 8.895e-05

Naive Analysis Results

## Predict the output for the Challenger launch day
launchDay <- data.frame(temp = 31)
predict(linearReg, launchDay, interval="predict")  # point estimate & 95% CI
##        fit      lwr     upr
## 1 2.520317 1.252255 3.78838
## Plot the fit extrapolating to launch day
plot(oring$temp, oring$n_fail, pch = 20, cex = pt.size/2, 
     xlab = "Temperature (deg F)", ylab = "Num O-Ring Failures",
     xlim = c(30, 85), ylim = c(0, 5))
abline(reg = linearReg, col = "red", lwd = 2)
points(x=31, y=2.52, pch=4, col="black", lwd=2)
x <- seq(25,90,.1)
pred <- predict(linearReg, data.frame(temp=x), interval="predict")
lines(x, pred[,2], lty=2, col="red")
lines(x, pred[,3], lty=2, col="red")

Generalized Linear Models (GLMs)

Logistic Regression

GLMs in R

Logistic Regression - Format Data

## Reformat the response variable
oring$fail <- as.numeric(oring$n_fail > 0)
head(oring)
##    n_risk n_fail temp psi order fail
## 14      6      2   53 200    14    1
## 9       6      1   57 200     9    1
## 23      6      1   58 200    23    1
## 10      6      1   63 200    10    1
## 1       6      0   66  50     1    0
## 5       6      0   67  50     5    0
plot(oring$temp, oring$fail, pch = 20, cex = pt.size,
     xlab = "Temperature (deg F)", ylab = "O-Ring Failure")

Logistic Regression - Fit the Model

## Fit a GLM with logistic link
logisticReg <- glm(fail ~ temp, data = oring, family=binomial(link="logit"))
summary(logisticReg)
## 
## Call:
## glm(formula = fail ~ temp, family = binomial(link = "logit"), 
##     data = oring)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.00214  -0.57815  -0.21802   0.04401   2.01706  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  23.7750    11.8204   2.011   0.0443 *
## temp         -0.3667     0.1752  -2.093   0.0363 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 26.402  on 22  degrees of freedom
## Residual deviance: 14.426  on 21  degrees of freedom
## AIC: 18.426
## 
## Number of Fisher Scoring iterations: 6

Logistic Regression

names(logisticReg)
##  [1] "coefficients"      "residuals"         "fitted.values"    
##  [4] "effects"           "R"                 "rank"             
##  [7] "qr"                "family"            "linear.predictors"
## [10] "deviance"          "aic"               "null.deviance"    
## [13] "iter"              "weights"           "prior.weights"    
## [16] "df.residual"       "df.null"           "y"                
## [19] "converged"         "boundary"          "model"            
## [22] "call"              "formula"           "terms"            
## [25] "data"              "offset"            "control"          
## [28] "method"            "contrasts"         "xlevels"

Logistic Regression - Plot the Fit

plot(oring$temp, oring$fail, pch = 20, cex = pt.size,
     xlab = "Temperature (deg F)", ylab = "O-Ring Failure")
curve(predict(logisticReg, data.frame(temp=x), type="response"), add=TRUE, col="red", lwd=2)

Logistic Regression - Prediction

To get probabilities, be sure to use type = "response".

launchDay
##   temp
## 1   31
predict(logisticReg, launchDay, type="response")
##         1 
## 0.9999959

predict() simply computes the linear combination & evaulates the inverse link

XB <- as.numeric(coef(logisticReg)[1] + coef(logisticReg)[2]*31)
XB  # estimated log-odds of o-ring failure on launch day
## [1] 12.40723
exp(XB)/(1 + exp(XB))  # estimated probability of o-ring failure on launch day
## [1] 0.9999959

Logistic Regression - Exponentiated CIs

exp(confint(logisticReg, "temp", level=0.95))
##     2.5 %    97.5 % 
## 0.4159216 0.8853986
exp(confint.default(logisticReg, "temp", level=0.95))
##          2.5 %    97.5 %
## temp 0.4916331 0.9768919

Logistic Regression - Updates

logisticReg2 = update(logisticReg, . ~ . + psi, data=oring)
summary(logisticReg2)
## 
## Call:
## glm(formula = fail ~ temp + psi, family = binomial(link = "logit"), 
##     data = oring)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.00837  -0.54507  -0.28098   0.02512   2.21488  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept) 21.843631  11.936459   1.830   0.0673 .
## temp        -0.350098   0.172977  -2.024   0.0430 *
## psi          0.006007   0.009749   0.616   0.5378  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 26.402  on 22  degrees of freedom
## Residual deviance: 14.033  on 20  degrees of freedom
## AIC: 20.033
## 
## Number of Fisher Scoring iterations: 6

Logistic Regression - Model Comparison

anova(logisticReg, logisticReg2, test='LRT')
## Analysis of Deviance Table
## 
## Model 1: fail ~ temp
## Model 2: fail ~ temp + psi
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1        21     14.426                     
## 2        20     14.033  1   0.3929   0.5308

End of Session 4

Make sure you submit a course evaluation survey!