Intro to R Workshop: Session 5

UCI Data Science Initiative

April 13, 2017

Session 5 Agenda

Line Charts
Logistic Regression: data, exploratory data analysis, model fitting, confidence intervals, predictions
Model Comparison

Line Charts

cars = c(2,3,4,3,5)
plot(cars, type = "o")
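The type argument controls how the series is drawn; "o" plots points over a connecting line. A couple of common alternatives, for comparison:

plot(cars, type = "l")   # lines only
plot(cars, type = "b")   # both points and lines, with gaps at the points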

Line Charts - Change Some Attributes

plot(cars, type="o", col="red", ylim=c(0.5,5), 
     xlab="Months", ylab="Sales in Millions",
     main="Auto Sales")

Line Charts - Add Another Line

plot(cars, type="o", col="red", ylim=c(0.5,5), 
     xlab="Months", ylab="Sales in Millions",
     main="Auto Sales")
suvs = c(1,4,3,3,4)
lines(suvs, type="o", col="blue", lty=2, pch=6)

Line Charts - Add Horizontal Lines

plot(cars, type="o", col="red", ylim=c(0.5,5), 
     xlab="Months", ylab="Sales in Millions",
     main="Auto Sales")
lines(suvs, type="o", col="blue", lty=2, pch=6)
abline(h=c(3,4), lty=2, col="darkgrey") # h for horizontal

Line Charts - Add Legend

plot(cars, type="o", col="red", ylim=c(0.5,5), 
     xlab="Months", ylab="Sales in Millions",
     main="Auto Sales")
lines(suvs, type="o", col="blue", lty=2, pch=6)
abline(h=c(3,4), lty=2, col="darkgrey") # h for horizontal
legend(4.2, 1.2, c("cars","suvs"), col=c("red","blue"), lty=1:2, pch=c(1,6), bty="n")

Logistic Regression - Data

library(car)
data(Mroz) # load Mroz data
str(Mroz)
## 'data.frame':    753 obs. of  8 variables:
##  $ lfp : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ k5  : int  1 0 1 0 1 0 0 0 0 0 ...
##  $ k618: int  0 2 3 3 2 0 2 0 2 2 ...
##  $ age : int  32 30 35 34 31 54 37 54 48 39 ...
##  $ wc  : Factor w/ 2 levels "no","yes": 1 1 1 1 2 1 2 1 1 1 ...
##  $ hc  : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ lwg : num  1.2102 0.3285 1.5141 0.0921 1.5243 ...
##  $ inc : num  10.9 19.5 12 6.8 20.1 ...

Logistic Regression - Data Description

The Mroz data describe 753 married women (see ?Mroz in the car package) with 8 variables:

lfp: labor-force participation (no/yes) - the response
k5: number of children 5 years old or younger
k618: number of children 6 to 18 years old
age: age in years
wc: wife's college attendance (no/yes)
hc: husband's college attendance (no/yes)
lwg: log expected wage rate
inc: family income excluding the wife's income (in $1,000s)

Logistic Regression - Exploratory Data Analysis (EDA)

summary(Mroz)
##   lfp            k5              k618            age          wc     
##  no :325   Min.   :0.0000   Min.   :0.000   Min.   :30.00   no :541  
##  yes:428   1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:36.00   yes:212  
##            Median :0.0000   Median :1.000   Median :43.00            
##            Mean   :0.2377   Mean   :1.353   Mean   :42.54            
##            3rd Qu.:0.0000   3rd Qu.:2.000   3rd Qu.:49.00            
##            Max.   :3.0000   Max.   :8.000   Max.   :60.00            
##    hc           lwg               inc        
##  no :458   Min.   :-2.0541   Min.   :-0.029  
##  yes:295   1st Qu.: 0.8181   1st Qu.:13.025  
##            Median : 1.0684   Median :17.700  
##            Mean   : 1.0971   Mean   :20.129  
##            3rd Qu.: 1.3997   3rd Qu.:24.466  
##            Max.   : 3.2189   Max.   :96.000
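A quick check not shown on the slide, but worth doing before modeling, is to confirm the data contain no missing values:

anyNA(Mroz)            # TRUE if any value in the data frame is missing
colSums(is.na(Mroz))   # number of missing values per variable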

Logistic Regression - EDA

barplot(table(Mroz$lfp), col = "blue", 
        main = "Count of Females by Labor Force Participation",
        xlab = "Labor Force Participation Status", ylab = "Number of Females")

Logistic Regression - EDA

counts <- table(Mroz$lfp, Mroz$k5)
barplot(counts, col = c("blue","red"),
        main = "Labor Force Participation by # of Children < 5",
        xlab = "# of Children < 5", ylab = "Count",
        legend = rownames(counts))

Logistic Regression - EDA

counts <- table(Mroz$lfp, Mroz$k5)
barplot(counts, col = c("blue","red"),
        main = "Labor Force Participation by # of Children < 5",
        xlab = "# of Children < 5", ylab = "Count",
        legend = rownames(counts), beside = TRUE)
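Raw counts are hard to compare when the groups differ in size; a small sketch (not on the original slide) that converts the same table to column proportions, i.e., the share in and out of the labor force at each value of k5:

props <- prop.table(counts, margin = 2)   # each column (value of k5) sums to 1
round(props, 2)
barplot(props, col = c("blue","red"),
        main = "LFP Share by # of Children < 5",
        xlab = "# of Children < 5", ylab = "Proportion",
        legend = rownames(counts))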

Logistic Regression - EDA

plot(Mroz$age, Mroz$lwg, col = ifelse(Mroz$lfp == "yes", "red", "blue"),
     main = "Age vs. Log Wage by LFP", xlab = "Age (Years)", ylab = "Log Wage")
legend(x = 50, y = -1.5, legend = c("In LF", "Out of LF"),
       fill = c("red", "blue"), bty = "n")
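Boxplots are another quick way to compare a continuous variable across the two participation groups; a sketch using age:

boxplot(age ~ lfp, data = Mroz, col = c("blue", "red"),
        main = "Age by Labor Force Participation",
        xlab = "Labor Force Participation Status", ylab = "Age (Years)")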

Logistic Regression - Model Fit

fitLogistic <- glm(lfp ~ k5 + age, family=binomial(logit), data=Mroz)
fitLogistic
## 
## Call:  glm(formula = lfp ~ k5 + age, family = binomial(logit), data = Mroz)
## 
## Coefficients:
## (Intercept)           k5          age  
##     3.08578     -1.32042     -0.05847  
## 
## Degrees of Freedom: 752 Total (i.e. Null);  750 Residual
## Null Deviance:       1030 
## Residual Deviance: 964.5     AIC: 970.5

Logistic Regression - Model Fit

summary(fitLogistic)
## 
## Call:
## glm(formula = lfp ~ k5 + age, family = binomial(logit), data = Mroz)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7698  -1.1719   0.7588   1.0144   1.9663  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.08578    0.49712   6.207 5.39e-10 ***
## k5          -1.32042    0.18638  -7.085 1.39e-12 ***
## age         -0.05847    0.01090  -5.364 8.13e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1029.75  on 752  degrees of freedom
## Residual deviance:  964.48  on 750  degrees of freedom
## AIC: 970.48
## 
## Number of Fisher Scoring iterations: 4
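If you need the numbers from this table programmatically rather than just printed, the coefficient matrix can be extracted from the summary object:

coefTable <- coef(summary(fitLogistic))   # matrix with Estimate, Std. Error, z value, Pr(>|z|)
coefTable[, "Estimate"]                   # same as coef(fitLogistic)
coefTable["k5", "Pr(>|z|)"]               # p-value for the k5 coefficient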

Logistic Regression - Model Fit

names(fitLogistic)
##  [1] "coefficients"      "residuals"         "fitted.values"    
##  [4] "effects"           "R"                 "rank"             
##  [7] "qr"                "family"            "linear.predictors"
## [10] "deviance"          "aic"               "null.deviance"    
## [13] "iter"              "weights"           "prior.weights"    
## [16] "df.residual"       "df.null"           "y"                
## [19] "converged"         "boundary"          "model"            
## [22] "call"              "formula"           "terms"            
## [25] "data"              "offset"            "control"          
## [28] "method"            "contrasts"         "xlevels"

Logistic Regression - Exponentiated CIs

exp(confint(fitLogistic, level=0.95))
##                 2.5 %     97.5 %
## (Intercept) 8.3698492 58.8728998
## k5          0.1832502  0.3808541
## age         0.9230190  0.9633534
exp(confint.default(fitLogistic, level=0.95))
##                 2.5 %     97.5 %
## (Intercept) 8.2602111 57.9810036
## k5          0.1853124  0.3847626
## age         0.9232736  0.9635756
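Note that confint() profiles the likelihood while confint.default() uses the Wald (normal-approximation) interval, which is why the two sets of limits differ slightly. The coefficients themselves can be exponentiated the same way to get odds ratios; for example, each additional child under 5 multiplies the estimated odds of participation by exp(-1.320), roughly 0.27:

exp(coef(fitLogistic))   # odds ratios (multiplicative effects on the odds of lfp = "yes")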

Logistic Regression - Predictions

predict() returns the linear predictor (XB) by default; to get probabilities, be sure to use type = "response".

head(Mroz,1)
##   lfp k5 k618 age wc hc      lwg   inc
## 1 yes  1    0  32 no no 1.210165 10.91
XB <- as.vector(head(predict(fitLogistic),1))
XB
## [1] -0.1055827
p <- as.vector(head(predict(fitLogistic, type = "response"),1))
p
## [1] 0.4736288
exp(XB)/(1 + exp(XB))  # inverse logit function
## [1] 0.4736288
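predict() also accepts a newdata argument, so the same fit can produce probabilities for covariate values that are not in the data; a sketch with a made-up observation (k5 = 0, age = 40):

newObs <- data.frame(k5 = 0, age = 40)                     # hypothetical covariate values
predict(fitLogistic, newdata = newObs, type = "response")  # predicted P(lfp = "yes")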

Logistic Regression - Updates

fitLogistic2 = update(fitLogistic, . ~ . + inc + lwg, data=Mroz)
fitLogistic2
## 
## Call:  glm(formula = lfp ~ k5 + age + inc + lwg, family = binomial(logit), 
##     data = Mroz)
## 
## Coefficients:
## (Intercept)           k5          age          inc          lwg  
##     2.75867     -1.34227     -0.05900     -0.02464      0.78765  
## 
## Degrees of Freedom: 752 Total (i.e. Null);  748 Residual
## Null Deviance:       1030 
## Residual Deviance: 925.2     AIC: 935.2

Model Comparison

anova(fitLogistic, fitLogistic2, test='LRT')
## Analysis of Deviance Table
## 
## Model 1: lfp ~ k5 + age
## Model 2: lfp ~ k5 + age + inc + lwg
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       750     964.48                          
## 2       748     925.17  2   39.318 2.899e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
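AIC offers another quick comparison of the two fits (lower is better); here it points the same way as the likelihood-ratio test:

AIC(fitLogistic, fitLogistic2)   # returns df and AIC for each model; fitLogistic2 has the lower AIC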

End of Session 5

Break