April 13, 2017
More Plotting (Line Charts)
Logistic Regression
cars = c(2,3,4,3,5)
plot(cars, type = "o")
plot(cars, type="o", col="red", ylim=c(0.5,5),
xlab="Months", ylab="Sales in Millions",
main="Autos Sales")
plot(cars, type="o", col="red", ylim=c(0.5,5),
xlab="Months", ylab="Sales in Millions",
main="Autos Sales")
suvs = c(1,4,3,3,4)
lines(suvs, type="o", col="blue", lty=2, pch=6)
plot(cars, type="o", col="red", ylim=c(0.5,5),
xlab="Months", ylab="Sales in Millions",
main="Autos Sales")
lines(suvs, type="o", col="blue", lty=2, pch=6)
abline(h=c(3,4), lty=2, col="darkgrey") # h for horizontal
plot(cars, type="o", col="red", ylim=c(0.5,5),
xlab="Months", ylab="Sales in Millions",
main="Autos Sales")
lines(suvs, type="o", col="blue", lty=2, pch=6)
abline(h=c(3,4), lty=2, col="darkgrey") # h for horizontal
legend(4.2, 1.2, c("cars","suvs"), col=c("red","blue"), lty=1:2, pch=c(1,6), bty="n")
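legend() also accepts a keyword position such as "bottomright" or "topleft" in place of x/y coordinates, which avoids hand-tuning the placement. A minimal sketch, assuming the plot above is still the active device:
legend("bottomright", legend = c("cars", "suvs"),
       col = c("red", "blue"), lty = 1:2, pch = c(1, 6), bty = "n")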
glm() is used to fit generalized linear models (logistic regression in this example). We will work with the Mroz labor force participation data from the car package.
library(car)
data(Mroz) # load Mroz data
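As a general pattern, glm() takes a model formula, a family (with an optional link function), and a data frame. A minimal sketch of the call, using placeholder names (y, x1, x2, myData) rather than the Mroz variables:
fit <- glm(y ~ x1 + x2, family = binomial(link = "logit"), data = myData)  # placeholder names
summary(fit)  # coefficients are reported on the log-odds scale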
str(Mroz)
## 'data.frame': 753 obs. of 8 variables:
## $ lfp : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ k5 : int 1 0 1 0 1 0 0 0 0 0 ...
## $ k618: int 0 2 3 3 2 0 2 0 2 2 ...
## $ age : int 32 30 35 34 31 54 37 54 48 39 ...
## $ wc : Factor w/ 2 levels "no","yes": 1 1 1 1 2 1 2 1 1 1 ...
## $ hc : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ lwg : num 1.2102 0.3285 1.5141 0.0921 1.5243 ...
## $ inc : num 10.9 19.5 12 6.8 20.1 ...
summary(Mroz)
## lfp k5 k618 age wc
## no :325 Min. :0.0000 Min. :0.000 Min. :30.00 no :541
## yes:428 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:36.00 yes:212
## Median :0.0000 Median :1.000 Median :43.00
## Mean :0.2377 Mean :1.353 Mean :42.54
## 3rd Qu.:0.0000 3rd Qu.:2.000 3rd Qu.:49.00
## Max. :3.0000 Max. :8.000 Max. :60.00
## hc lwg inc
## no :458 Min. :-2.0541 Min. :-0.029
## yes:295 1st Qu.: 0.8181 1st Qu.:13.025
## Median : 1.0684 Median :17.700
## Mean : 1.0971 Mean :20.129
## 3rd Qu.: 1.3997 3rd Qu.:24.466
## Max. : 3.2189 Max. :96.000
barplot(table(Mroz$lfp), col = "blue",
main = "Count of Females by Labor Force Participation",
xlab = "Labor Force Participation Status", ylab = "Number of Females")
counts <- table(Mroz$lfp, Mroz$k5)
barplot(counts, col = c("blue","red"),
main = "Labor Force Participation by # of Children < 5",
xlab = "# of Children < 5", ylab = "Count",
legend = rownames(counts))
counts <- table(Mroz$lfp, Mroz$k5)
barplot(counts, col = c("blue","red"),
main = "Labor Force Participation by # of Children < 5",
xlab = "# of Children < 5", ylab = "Count",
legend = rownames(counts), beside = TRUE)
plot(Mroz$age, Mroz$lwg, col = ifelse(Mroz$lfp == "yes", "red", "blue"),
main = "Age vs. Log Wage by LFP", xlab = "Age (Years)", ylab = "Wage")
legend(x = 50, y = -1.5, legend = c("in LF", "Out of LF"),
fill = c("red", "blue"), bty = "n")
fitLogistic <- glm(lfp ~ k5 + age, family=binomial(logit), data=Mroz)
fitLogistic
##
## Call: glm(formula = lfp ~ k5 + age, family = binomial(logit), data = Mroz)
##
## Coefficients:
## (Intercept) k5 age
## 3.08578 -1.32042 -0.05847
##
## Degrees of Freedom: 752 Total (i.e. Null); 750 Residual
## Null Deviance: 1030
## Residual Deviance: 964.5 AIC: 970.5
summary(fitLogistic)
##
## Call:
## glm(formula = lfp ~ k5 + age, family = binomial(logit), data = Mroz)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7698 -1.1719 0.7588 1.0144 1.9663
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.08578 0.49712 6.207 5.39e-10 ***
## k5 -1.32042 0.18638 -7.085 1.39e-12 ***
## age -0.05847 0.01090 -5.364 8.13e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1029.75 on 752 degrees of freedom
## Residual deviance: 964.48 on 750 degrees of freedom
## AIC: 970.48
##
## Number of Fisher Scoring iterations: 4
names(fitLogistic)
## [1] "coefficients" "residuals" "fitted.values"
## [4] "effects" "R" "rank"
## [7] "qr" "family" "linear.predictors"
## [10] "deviance" "aic" "null.deviance"
## [13] "iter" "weights" "prior.weights"
## [16] "df.residual" "df.null" "y"
## [19] "converged" "boundary" "model"
## [22] "call" "formula" "terms"
## [25] "data" "offset" "control"
## [28] "method" "contrasts" "xlevels"
exp(confint(fitLogistic, level=0.95))
## 2.5 % 97.5 %
## (Intercept) 8.3698492 58.8728998
## k5 0.1832502 0.3808541
## age 0.9230190 0.9633534
exp(confint.default(fitLogistic, level=0.95))
## 2.5 % 97.5 %
## (Intercept) 8.2602111 57.9810036
## k5 0.1853124 0.3847626
## age 0.9232736 0.9635756
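Exponentiated coefficients can be read as odds ratios; to pair point estimates with the intervals above:
exp(coef(fitLogistic))  # point estimates on the odds-ratio scale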
To get probabilities, be sure to use type = "response".
head(Mroz,1)
## lfp k5 k618 age wc hc lwg inc
## 1 yes 1 0 32 no no 1.210165 10.91
XB <- as.vector(head(predict(fitLogistic),1))
XB
## [1] -0.1055827
p <- as.vector(head(predict(fitLogistic, type = "response"),1))
p
## [1] 0.4736288
exp(XB)/(1 + exp(XB)) # inverse logit function
## [1] 0.4736288
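predict() can also score new observations. A minimal sketch with a hypothetical covariate profile (the same k5 and age values as the first row above), so the result should match p:
newObs <- data.frame(k5 = 1, age = 32)  # hypothetical profile
predict(fitLogistic, newdata = newObs, type = "response")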
fitLogistic2 = update(fitLogistic, . ~ . + inc + lwg, data=Mroz)
fitLogistic2
##
## Call: glm(formula = lfp ~ k5 + age + inc + lwg, family = binomial(logit),
## data = Mroz)
##
## Coefficients:
## (Intercept) k5 age inc lwg
## 2.75867 -1.34227 -0.05900 -0.02464 0.78765
##
## Degrees of Freedom: 752 Total (i.e. Null); 748 Residual
## Null Deviance: 1030
## Residual Deviance: 925.2 AIC: 935.2
anova(fitLogistic, fitLogistic2, test='LRT')
## Analysis of Deviance Table
##
## Model 1: lfp ~ k5 + age
## Model 2: lfp ~ k5 + age + inc + lwg
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 750 964.48
## 2 748 925.17 2 39.318 2.899e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
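The likelihood ratio test favors the larger model. As a quick complementary check (a minimal sketch), the fits can also be compared by AIC, with lower being better:
AIC(fitLogistic, fitLogistic2)  # lower AIC indicates the better-fitting model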
Break