April 13, 2017
T-Tests
Plotting
Linear Regression
A t-test can be used to draw statistical conclusions about parameters of interest in the data
Is the mean of this data different from zero (or another number)?
Is the mean of this data different from that data?
Is the regression slope coefficient different from zero?
T-tests can be categorized into two groups:
One-sample t-test
Two-sample t-test
# One-sample t-tests ---------------------------------------------------------
# Draw a reproducible sample of 100 observations from N(mean = 0, sd = 1).
set.seed(123)
oneSampData <- rnorm(100, mean = 0, sd = 1)
mean(oneSampData)  # sample mean: near, but not exactly, 0
## [1] 0.09040591
sd(oneSampData)    # sample sd: near 1
## [1] 0.9128159
# Test H0: true mean equals 0 (the default null value, mu = 0).
oneSampTest.0 <- t.test(oneSampData)
oneSampTest.0
##
## One Sample t-test
##
## data: oneSampData
## t = 0.99041, df = 99, p-value = 0.3244
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -0.09071657 0.27152838
## sample estimates:
## mean of x
## 0.09040591
# t.test() returns a list ("htest" object); these are its components.
names(oneSampTest.0)
## [1] "statistic" "parameter" "p.value" "conf.int" "estimate"
## [6] "null.value" "alternative" "method" "data.name"
oneSampTest.0$statistic  # the t statistic
## t
## 0.9904068
oneSampTest.0$estimate   # the sample mean
## mean of x
## 0.09040591
# Test a different null: H0: true mean equals 0.3
# (rejected at the 5% level here, since p = 0.024 < 0.05).
oneSampTest.mu <- t.test(oneSampData, mu = 0.3)
oneSampTest.mu
##
## One Sample t-test
##
## data: oneSampData
## t = -2.2961, df = 99, p-value = 0.02378
## alternative hypothesis: true mean is not equal to 0.3
## 95 percent confidence interval:
## -0.09071657 0.27152838
## sample estimates:
## mean of x
## 0.09040591
Two sample t-tests are categorized into 3 groups:
T-Test with equal variances
T-Test with un-equal variances
Paired T-Test (one-sample t-test on differences)
# Two-sample t-tests ---------------------------------------------------------
# Two independent samples with different sizes and different true means.
Samp1 <- rnorm(300, mean = 2.5, sd = 1)
Samp2 <- rnorm(500, mean = 3.0, sd = 1) # notice: not the same sample size
t.test(Samp1, Samp2) # default assump: unequal variances (Welch t-test)
##
## Welch Two Sample t-test
##
## data: Samp1 and Samp2
## t = -7.3058, df = 638.78, p-value = 8.249e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.6730166 -0.3878678
## sample estimates:
## mean of x mean of y
## 2.492232 3.022674
t.test(Samp1, Samp2, var.equal = TRUE) # var.equal = TRUE: pooled-variance (classical) t-test
##
## Two Sample t-test
##
## data: Samp1 and Samp2
## t = -7.2723, df = 798, p-value = 8.447e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.6736199 -0.3872646
## sample estimates:
## mean of x mean of y
## 2.492232 3.022674
# Paired test = one-sample t-test on the differences; the two vectors
# must have equal length, so only the first 300 values of Samp2 are used.
t.test(Samp1, Samp2[1:300], paired = TRUE) # must be of the same sample size
##
## Paired t-test
##
## data: Samp1 and Samp2[1:300]
## t = -5.7581, df = 299, p-value = 2.109e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.6428004 -0.3153406
## sample estimates:
## mean of the differences
## -0.4790705
hist(Samp1)  # quick look at the distribution of Samp1
With a vertical line drawn at the sample mean
# Histogram of Samp1 on the density scale (freq = FALSE), with a red
# vertical line at the sample mean. Use TRUE/FALSE, never T/F: T and F
# are ordinary variables that can be reassigned.
hist(Samp1, main = "Density of Samp1", freq = FALSE, col = "grey")
abline(v = mean(Samp1), col = "red", lwd = 2)
# Overlay both sample densities on one plot and mark each mean.
hist(Samp1, main = "Densities of Samp1 and Samp2", freq = FALSE, col = "grey", xlab = "")
abline(v = mean(Samp1), col = "red", lwd = 2)
hist(Samp2, freq = FALSE, add = TRUE)  # add = TRUE draws onto the existing plot
abline(v = mean(Samp2), col = "blue", lwd = 2)
legend("topright", legend = c("Samp1", "Samp2"),
fill = c("grey","white"), bty = "n", cex = 1.3)
# Same comparison using smoothed kernel density estimates instead of bars.
plot(density(Samp1), col="red", main="Densities of Samp1 and Samp2", xlab="")
abline(v = mean(Samp1), col = "red", lwd = 2, lty=2)
lines(density(Samp2), col="blue")
abline(v = mean(Samp2), col = "blue", lwd = 2, lty = 2)
legend("topright", legend = c("Samp1", "Samp2"),
fill = c("red","blue"), bty = "n", cex = 1.3)
Here we use the “Prestige” dataset from the car
package, which has the following variables
education: Average education of occupational incumbents, years, in 1971.
income: Average income of incumbents, dollars, in 1971.
women: Percentage of incumbents who are women.
prestige: Pineo-Porter prestige score for occupation, from a social survey conducted in the mid-1960s.
census: Canadian Census occupational code.
type: Type of occupation, a factor with levels bc (blue collar), prof (professional/managerial), and wc (white collar)
# Load the Prestige data set from the car package (install once if needed).
# install.packages("car", dependencies=TRUE)
library(car)
data(Prestige) # load the data
str(Prestige)  # structure: 102 occupations (rows) x 6 variables
## 'data.frame': 102 obs. of 6 variables:
## $ education: num 13.1 12.3 12.8 11.4 14.6 ...
## $ income : int 12351 25879 9271 8865 8403 11030 8258 14163 11377 11023 ...
## $ women : num 11.16 4.02 15.7 9.11 11.68 ...
## $ prestige : num 68.8 69.1 63.4 56.8 73.5 77.6 72.6 78.1 73.1 68.8 ...
## $ census : int 1113 1130 1171 1175 2111 2113 2133 2141 2143 2153 ...
## $ type : Factor w/ 3 levels "bc","prof","wc": 2 2 2 2 2 2 2 2 2 2 ...
summary(Prestige)  # note the 4 NA's in "type" at the bottom right
## education income women prestige
## Min. : 6.380 Min. : 611 Min. : 0.000 Min. :14.80
## 1st Qu.: 8.445 1st Qu.: 4106 1st Qu.: 3.592 1st Qu.:35.23
## Median :10.540 Median : 5930 Median :13.600 Median :43.60
## Mean :10.738 Mean : 6798 Mean :28.979 Mean :46.83
## 3rd Qu.:12.648 3rd Qu.: 8187 3rd Qu.:52.203 3rd Qu.:59.27
## Max. :15.970 Max. :25879 Max. :97.510 Max. :87.20
## census type
## Min. :1113 bc :44
## 1st Qu.:3120 prof:31
## Median :5135 wc :23
## Mean :5402 NA's: 4
## 3rd Qu.:8312
## Max. :9517
See 4 NA values for “type.” Let’s look at them.
# Subset to the rows where type is NA; row names identify the occupations.
Prestige[is.na(Prestige$type),]
## education income women prestige census type
## athletes 11.44 8206 8.13 54.1 3373 <NA>
## newsboys 9.62 918 7.00 14.8 5143 <NA>
## babysitters 9.46 611 96.53 25.9 6147 <NA>
## farmers 6.84 3643 3.60 44.1 7112 <NA>
Find their row indices and corresponding names…
# which() converts the logical NA mask into integer row positions.
ind <- which(is.na(Prestige$type)) # gives index numbers of the NAs in the vector
rbind(index=ind, name=rownames(Prestige)[ind]) # print index with rowname
## [,1] [,2] [,3] [,4]
## index "34" "53" "63" "67"
## name "athletes" "newsboys" "babysitters" "farmers"
Let’s recode newsboys, babysitters, and farmers as blue collar and exclude athletes.
Use rep()
to create a vector with 3 elements of “bc”
# Drop the first NA index (athletes, left as NA for now) and recode the
# remaining occupations (newsboys, babysitters, farmers) as blue collar.
ind.ch <- ind[-1]
# Size the replacement from ind.ch itself rather than hard-coding 3, so the
# recode stays correct if the set of rows ever changes.
Prestige[ind.ch,"type"] <- rep("bc", length(ind.ch))
summary(Prestige$type)
## bc prof wc NA's
## 47 31 23 1
Exclude athletes…
# na.omit() drops the one remaining row with an NA (athletes).
Prestige <- na.omit(Prestige)
summary(Prestige$type)
## bc prof wc
## 47 31 23
# Distribution of the outcome variable.
hist(Prestige$prestige, col = "grey",
main = "Histogram of Prestige Score", xlab = "Prestige Score")
# Scatterplot of the outcome against education.
plot(Prestige$education, Prestige$prestige,
main = "Prestige Score by Education",
xlab = "Ave. Years of Education", ylab = "Prestige Score")
abline()
adds a straight line to an existing plot, and lm()
returns a list object that contains the regression coefficients (among other components).
# Scatterplot of prestige against education, overlaid with the fitted OLS
# line (solid red) and a lowess smoother (dashed blue), plus a legend.
plot(
  Prestige$education, Prestige$prestige,
  main = "Prestige Score by Education",
  xlab = "Avg Years of Education", ylab = "Prestige Score"
)
edu.fit <- lm(prestige ~ education, data = Prestige)
abline(reg = edu.fit, col = "red", lwd = 2)
smoothed <- lowess(Prestige$education, Prestige$prestige)
lines(smoothed, col = "blue", lty = 2, lwd = 2)
legend(
  "topleft", legend = c("Regression Line", "Smoother"),
  col = c("red","blue"), lwd = c(2,2), lty = c(1,2), bty = "n"
)
Use the scatterplotMatrix()
function from the car
package
# Use direct ordering of the variables to control how they are plotted
scatterplotMatrix(Prestige[,c("prestige","education","income","women")])
# Boxplots of prestige score within each occupation type.
boxplot(prestige ~ type, data = Prestige, col = "grey",
main = "Distribution of Prestige Score by Types",
xlab = "Occupation Types", ylab = "Prestige Score")
# Multiple linear regression: prestige on education, income, and women.
myReg <- lm(prestige ~ education + income + women, data = Prestige)
myReg  # printing an lm object shows only the call and the coefficients
##
## Call:
## lm(formula = prestige ~ education + income + women, data = Prestige)
##
## Coefficients:
## (Intercept) education income women
## -6.801684 4.182482 0.001315 -0.008304
# The fitted model is a list; these are its components.
names(myReg)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
# Full regression table: estimates, standard errors, t values, p-values,
# residual SE, R-squared, and the overall F test.
summary(myReg)
##
## Call:
## lm(formula = prestige ~ education + income + women, data = Prestige)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.7831 -5.4102 -0.1204 5.1821 17.5318
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.8016842 3.2543983 -2.090 0.0392 *
## education 4.1824817 0.3907842 10.703 < 2e-16 ***
## income 0.0013153 0.0002791 4.712 8.2e-06 ***
## women -0.0083038 0.0306187 -0.271 0.7868
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.883 on 97 degrees of freedom
## Multiple R-squared: 0.798, Adjusted R-squared: 0.7917
## F-statistic: 127.7 on 3 and 97 DF, p-value: < 2.2e-16
# The summary is itself a list with different components than the lm object.
sum.myReg = summary(myReg)
names(sum.myReg) # show different contents
## [1] "call" "terms" "residuals" "coefficients"
## [5] "aliased" "sigma" "df" "r.squared"
## [9] "adj.r.squared" "fstatistic" "cov.unscaled"
names(myReg) # this is what we had previously
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
# 95% confidence interval for a single coefficient...
confint(myReg, 'income', level=0.95)
## 2.5 % 97.5 %
## income 0.000761253 0.001869315
# ...and for all coefficients at once.
confint(myReg, level=0.95)
## 2.5 % 97.5 %
## (Intercept) -13.260764035 -0.342604455
## education 3.406883236 4.958080173
## income 0.000761253 0.001869315
## women -0.069073423 0.052465913
# update() refits myReg with "type" added as a predictor
# (~ . + type means: keep the current right-hand side and add type).
# The factor enters as dummy variables relative to the baseline level "bc".
mod = update(myReg, ~ . + type); summary(mod)
##
## Call:
## lm(formula = prestige ~ education + income + women + type, data = Prestige)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.1126 -4.9844 0.0942 5.1318 19.4630
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.7378124 5.5112850 0.497 0.6205
## education 3.1246575 0.6644750 4.702 8.70e-06 ***
## income 0.0011764 0.0002702 4.355 3.36e-05 ***
## women 0.0050863 0.0308647 0.165 0.8695
## typeprof 8.5502774 4.0724806 2.100 0.0384 *
## typewc -1.1457779 2.7280276 -0.420 0.6754
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.536 on 95 degrees of freedom
## Multiple R-squared: 0.8192, Adjusted R-squared: 0.8097
## F-statistic: 86.1 on 5 and 95 DF, p-value: < 2.2e-16
levels(Prestige$type)  # the first level ("bc") is the current baseline
## [1] "bc" "prof" "wc"
# Make "prof" the reference category; the dummy coefficients below are then
# expressed relative to prof (the fit itself — R^2, residuals — is unchanged).
Prestige$type = relevel(Prestige$type, "prof")
levels(Prestige$type)
## [1] "prof" "bc" "wc"
mod = update(myReg, ~ . + type); summary(mod)
##
## Call:
## lm(formula = prestige ~ education + income + women + type, data = Prestige)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.1126 -4.9844 0.0942 5.1318 19.4630
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.2880898 9.1659054 1.232 0.22116
## education 3.1246575 0.6644750 4.702 8.70e-06 ***
## income 0.0011764 0.0002702 4.355 3.36e-05 ***
## women 0.0050863 0.0308647 0.165 0.86946
## typebc -8.5502774 4.0724806 -2.100 0.03842 *
## typewc -9.6960553 2.9391152 -3.299 0.00137 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.536 on 95 degrees of freedom
## Multiple R-squared: 0.8192, Adjusted R-squared: 0.8097
## F-statistic: 86.1 on 5 and 95 DF, p-value: < 2.2e-16
# Standard lm diagnostic plots (residuals vs fitted, normal Q-Q,
# scale-location, residuals vs leverage) arranged in a 2x2 grid.
# par() returns the previous settings, so save them and restore afterwards
# instead of leaving the graphics device stuck in 2x2 mode.
op <- par(mfrow = c(2, 2), oma = c(0, 0, 2, 0))
plot(myReg)
par(op)
Predict the output for a new input
# Predict prestige for one new observation, with a 95% prediction interval
# (wider than a confidence interval: it covers a single new response).
newData = data.frame(education=13.2, income=12000, women=12)
# Spell the interval type out in full: the valid value is "prediction";
# the original "predict" only worked through partial argument matching.
predict(myReg, newData, interval="prediction")
## fit lwr upr
## 1 64.09084 48.2459 79.93578
BREAK!