Intro to R Workshop: Session 4

UCI Data Science Initiative

April 13, 2017

Session 4 Agenda

  1. T-Tests

  2. Plotting

  3. Linear Regression

T-Test in R

A t-test can be used to draw statistical conclusions about a parameter of interest (such as a population mean) from the data

T-tests can be categorized into two groups: one-sample t-tests and two-sample t-tests.

One-Sample T-Test (Create Data)

# Simulate data for a one-sample t-test: 100 draws from N(0, 1).
set.seed(123)  # fix the RNG state so the results below are reproducible
oneSampData <- rnorm(100, mean = 0, sd = 1)

# The sample mean and SD differ slightly from the true values (0 and 1).
mean(oneSampData)
## [1] 0.09040591
sd(oneSampData)
## [1] 0.9128159

One-Sample T-Test (True Mean Equal to 0)

# Test H0: true mean = 0 (mu = 0 is the default null value for t.test).
oneSampTest.0 <- t.test(oneSampData) 
oneSampTest.0
## 
##  One Sample t-test
## 
## data:  oneSampData
## t = 0.99041, df = 99, p-value = 0.3244
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.09071657  0.27152838
## sample estimates:
##  mean of x 
## 0.09040591
# t.test() returns a list ("htest" object); these are its components.
names(oneSampTest.0) 
## [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
## [6] "null.value"  "alternative" "method"      "data.name"
# Individual components can be extracted with $.
oneSampTest.0$statistic
##         t 
## 0.9904068
oneSampTest.0$estimate
##  mean of x 
## 0.09040591

One-Sample T-Test (True Mean Equal to \(\mu\))

# Test a different null hypothesis: H0: true mean = 0.3 (set via mu =).
oneSampTest.mu <- t.test(oneSampData, mu = 0.3)
oneSampTest.mu
## 
##  One Sample t-test
## 
## data:  oneSampData
## t = -2.2961, df = 99, p-value = 0.02378
## alternative hypothesis: true mean is not equal to 0.3
## 95 percent confidence interval:
##  -0.09071657  0.27152838
## sample estimates:
##  mean of x 
## 0.09040591

Two-Sample T-Test

Two-sample t-tests are categorized into 3 groups: unequal-variance (Welch), equal-variance (pooled), and paired t-tests.

Two-Sample T-Test (Un-equal Variances)

# Two independent samples with different true means (and sample sizes).
Samp1 <- rnorm(300, mean = 2.5, sd = 1)
Samp2 <- rnorm(500, mean = 3.0, sd = 1) # notice: not the same sample size
t.test(Samp1, Samp2)  # default assump: unequal variances
## 
##  Welch Two Sample t-test
## 
## data:  Samp1 and Samp2
## t = -7.3058, df = 638.78, p-value = 8.249e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.6730166 -0.3878678
## sample estimates:
## mean of x mean of y 
##  2.492232  3.022674

Two-Sample T-Test (Equal Variances)

t.test(Samp1, Samp2, var.equal = TRUE)  # var.equal = TRUE: pooled-variance t-test
## 
##  Two Sample t-test
## 
## data:  Samp1 and Samp2
## t = -7.2723, df = 798, p-value = 8.447e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.6736199 -0.3872646
## sample estimates:
## mean of x mean of y 
##  2.492232  3.022674

Two-Sample T-Test (Paired T Test)

# Paired test treats observations as matched pairs, so the two vectors
# must have equal length; here Samp2 is truncated to the first 300 values.
t.test(Samp1, Samp2[1:300], paired = TRUE) # must be of the same sample size
## 
##  Paired t-test
## 
## data:  Samp1 and Samp2[1:300]
## t = -5.7581, df = 299, p-value = 2.109e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.6428004 -0.3153406
## sample estimates:
## mean of the differences 
##              -0.4790705

Basic Histogram

hist(Samp1)

Relative Frequency Histogram

With line at the mean

# Density-scaled histogram of Samp1 with the sample mean marked.
# freq = FALSE puts density (not counts) on the y-axis.
# Use TRUE/FALSE, not T/F: T and F are ordinary variables and can be reassigned.
hist(Samp1, main = "Density of Samp1", freq = FALSE, col = "grey") 
abline(v = mean(Samp1), col = "red", lwd = 2)  # vertical line at the sample mean

Overlayed Histograms with a Legend

# Overlay two density-scaled histograms and mark each sample mean.
# Use TRUE/FALSE, not T/F: T and F are ordinary variables and can be reassigned.
hist(Samp1, main = "Densities of Samp1 and Samp2", freq = FALSE, col = "grey", xlab="") 
abline(v = mean(Samp1), col = "red", lwd = 2)
hist(Samp2, freq = FALSE, add = TRUE)  # add = TRUE draws on the existing plot
abline(v = mean(Samp2), col = "blue", lwd = 2)
legend("topright", legend = c("Samp1", "Samp2"),
       fill = c("grey","white"), bty = "n", cex = 1.3)

Overlayed Density Plots

# Kernel density estimates of both samples on one plot, with dashed
# vertical lines at each sample mean.
plot(density(Samp1), col="red", main="Densities of Samp1 and Samp2", xlab="")
abline(v = mean(Samp1), col = "red", lwd = 2, lty=2)
lines(density(Samp2), col="blue")  # lines() adds to the existing plot
abline(v = mean(Samp2), col = "blue", lwd = 2, lty = 2)
legend("topright", legend = c("Samp1", "Samp2"),
       fill = c("red","blue"), bty = "n", cex = 1.3)

Linear Regression Data Set Description

Here we use the “Prestige” dataset from the car package; its variables are shown by str() below.

Load Data

# install.packages("car", dependencies=TRUE)
library(car)
data(Prestige) # load the data
# str() shows the structure: 102 rows, 6 variables, and each column's type.
str(Prestige)
## 'data.frame':    102 obs. of  6 variables:
##  $ education: num  13.1 12.3 12.8 11.4 14.6 ...
##  $ income   : int  12351 25879 9271 8865 8403 11030 8258 14163 11377 11023 ...
##  $ women    : num  11.16 4.02 15.7 9.11 11.68 ...
##  $ prestige : num  68.8 69.1 63.4 56.8 73.5 77.6 72.6 78.1 73.1 68.8 ...
##  $ census   : int  1113 1130 1171 1175 2111 2113 2133 2141 2143 2153 ...
##  $ type     : Factor w/ 3 levels "bc","prof","wc": 2 2 2 2 2 2 2 2 2 2 ...

Look at the Data

# Five-number summaries for numeric columns; level counts for the factor.
summary(Prestige)
##    education          income          women           prestige    
##  Min.   : 6.380   Min.   :  611   Min.   : 0.000   Min.   :14.80  
##  1st Qu.: 8.445   1st Qu.: 4106   1st Qu.: 3.592   1st Qu.:35.23  
##  Median :10.540   Median : 5930   Median :13.600   Median :43.60  
##  Mean   :10.738   Mean   : 6798   Mean   :28.979   Mean   :46.83  
##  3rd Qu.:12.648   3rd Qu.: 8187   3rd Qu.:52.203   3rd Qu.:59.27  
##  Max.   :15.970   Max.   :25879   Max.   :97.510   Max.   :87.20  
##      census       type   
##  Min.   :1113   bc  :44  
##  1st Qu.:3120   prof:31  
##  Median :5135   wc  :23  
##  Mean   :5402   NA's: 4  
##  3rd Qu.:8312            
##  Max.   :9517

See 4 NA values for “type.” Let’s look at them.

Examine Observations with NA values

# Logical subsetting: keep only the rows whose "type" is NA.
Prestige[is.na(Prestige$type),]
##             education income women prestige census type
## athletes        11.44   8206  8.13     54.1   3373 <NA>
## newsboys         9.62    918  7.00     14.8   5143 <NA>
## babysitters      9.46    611 96.53     25.9   6147 <NA>
## farmers          6.84   3643  3.60     44.1   7112 <NA>

Find their row indices and corresponding names…

ind <- which(is.na(Prestige$type))  # gives index numbers of the NAs in the vector
rbind(index=ind, name=rownames(Prestige)[ind])  # print index with rowname
##       [,1]       [,2]       [,3]          [,4]     
## index "34"       "53"       "63"          "67"     
## name  "athletes" "newsboys" "babysitters" "farmers"

Recode/Drop NA Values

Let’s recode newsboys, babysitters, and farmers as blue collar and exclude athletes.

Use rep() to create a vector with 3 elements of “bc”

# Drop the first NA index (athletes); recode the remaining three as "bc".
ind.ch <- ind[-1]
Prestige[ind.ch,"type"] <- rep("bc", 3)
summary(Prestige$type)
##   bc prof   wc NA's 
##   47   31   23    1

Exclude athletes…

# na.omit() drops every row containing an NA — here only athletes remains.
Prestige <- na.omit(Prestige) 
summary(Prestige$type)
##   bc prof   wc 
##   47   31   23

Plotting - Histograms

# Frequency histogram of the prestige score with a custom title and x label.
hist(Prestige$prestige, col = "grey", 
     main = "Histogram of Prestige Score", xlab = "Prestige Score")

Plotting - Basic Scatter Plot

# Scatter plot: prestige score (y) against average years of education (x).
plot(Prestige$education, Prestige$prestige,
     main = "Prestige Score by Education",
     xlab = "Ave. Years of Education", ylab = "Prestige Score")

Plotting - Including Regression/Smoother Lines

abline() adds a line to the plot, and lm() returns a list object that contains the regression coefficients.

plot(Prestige$education, Prestige$prestige,
     main = "Prestige Score by Education",
     xlab = "Avg Years of Education", ylab = "Prestige Score")
# abline(reg = ) draws the fitted least-squares line from the lm() object.
abline(reg = lm(prestige ~ education, data = Prestige), col = "red", lwd = 2)
# lowess() gives a nonparametric smoother for comparison with the linear fit.
lines(lowess(Prestige$education, Prestige$prestige), col = "blue", lty = 2, lwd = 2)
legend("topleft",legend = c("Regression Line", "Smoother"), col = c("red","blue"),
       lwd = c(2,2), lty = c(1,2), bty = "n")

Plotting - Scatter Plot Matrix

Use the scatterplotMatrix() function from the car package

# Use direct ordering of the variables to control how they are plotted
scatterplotMatrix(Prestige[,c("prestige","education","income","women")])

Plotting - Boxplots

# Formula interface: one box of prestige scores per occupation type.
boxplot(prestige ~ type, data = Prestige, col = "grey",
        main = "Distribution of Prestige Score by Types",
        xlab = "Occupation Types", ylab = "Prestige Score")

Linear Regression - Fit the Model

# Multiple linear regression: prestige on education, income, and women.
myReg <- lm(prestige ~ education + income + women, data = Prestige)
# Printing the lm object shows only the call and the coefficients.
myReg
## 
## Call:
## lm(formula = prestige ~ education + income + women, data = Prestige)
## 
## Coefficients:
## (Intercept)    education       income        women  
##   -6.801684     4.182482     0.001315    -0.008304
# The lm object is a list; these components can be extracted with $.
names(myReg)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"

Linear Regression - Summary of Fit

# summary() adds standard errors, t-tests, residual SE, and R-squared.
summary(myReg)
## 
## Call:
## lm(formula = prestige ~ education + income + women, data = Prestige)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.7831  -5.4102  -0.1204   5.1821  17.5318 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.8016842  3.2543983  -2.090   0.0392 *  
## education    4.1824817  0.3907842  10.703  < 2e-16 ***
## income       0.0013153  0.0002791   4.712  8.2e-06 ***
## women       -0.0083038  0.0306187  -0.271   0.7868    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.883 on 97 degrees of freedom
## Multiple R-squared:  0.798,  Adjusted R-squared:  0.7917 
## F-statistic: 127.7 on 3 and 97 DF,  p-value: < 2.2e-16

Linear Regression - “summary” Contents

# The summary object is itself a list, distinct from the lm object.
# Use <- for assignment (R convention; = is reserved for function arguments).
sum.myReg <- summary(myReg)
names(sum.myReg) # show different contents
##  [1] "call"          "terms"         "residuals"     "coefficients" 
##  [5] "aliased"       "sigma"         "df"            "r.squared"    
##  [9] "adj.r.squared" "fstatistic"    "cov.unscaled"
names(myReg) # this is what we had previously
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"

Linear Regression - Confidence Interval

# 95% confidence interval for a single coefficient ...
confint(myReg, 'income', level=0.95)
##              2.5 %      97.5 %
## income 0.000761253 0.001869315
# ... or for all coefficients at once.
confint(myReg, level=0.95)
##                     2.5 %       97.5 %
## (Intercept) -13.260764035 -0.342604455
## education     3.406883236  4.958080173
## income        0.000761253  0.001869315
## women        -0.069073423  0.052465913

Linear Regression - Adding Variables

# Add the occupation-type factor to the model with update().
# Use <- for assignment and one statement per line (avoid ";" chaining).
mod <- update(myReg, ~ . + type)
summary(mod)
## 
## Call:
## lm(formula = prestige ~ education + income + women + type, data = Prestige)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.1126  -4.9844   0.0942   5.1318  19.4630 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.7378124  5.5112850   0.497   0.6205    
## education    3.1246575  0.6644750   4.702 8.70e-06 ***
## income       0.0011764  0.0002702   4.355 3.36e-05 ***
## women        0.0050863  0.0308647   0.165   0.8695    
## typeprof     8.5502774  4.0724806   2.100   0.0384 *  
## typewc      -1.1457779  2.7280276  -0.420   0.6754    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.536 on 95 degrees of freedom
## Multiple R-squared:  0.8192, Adjusted R-squared:  0.8097 
## F-statistic:  86.1 on 5 and 95 DF,  p-value: < 2.2e-16

Linear Regression - Relevel a Factor

levels(Prestige$type)
## [1] "bc"   "prof" "wc"
# Make "prof" the reference (baseline) level of the type factor.
# Use <- for assignment (R convention; = is reserved for function arguments).
Prestige$type <- relevel(Prestige$type, "prof")

levels(Prestige$type)
## [1] "prof" "bc"   "wc"

Linear Regression - Relevel a Factor

# Refit after releveling: coefficients are now contrasts against "prof".
# Use <- for assignment and one statement per line (avoid ";" chaining).
mod <- update(myReg, ~ . + type)
summary(mod)
## 
## Call:
## lm(formula = prestige ~ education + income + women + type, data = Prestige)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.1126  -4.9844   0.0942   5.1318  19.4630 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.2880898  9.1659054   1.232  0.22116    
## education    3.1246575  0.6644750   4.702 8.70e-06 ***
## income       0.0011764  0.0002702   4.355 3.36e-05 ***
## women        0.0050863  0.0308647   0.165  0.86946    
## typebc      -8.5502774  4.0724806  -2.100  0.03842 *  
## typewc      -9.6960553  2.9391152  -3.299  0.00137 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.536 on 95 degrees of freedom
## Multiple R-squared:  0.8192, Adjusted R-squared:  0.8097 
## F-statistic:  86.1 on 5 and 95 DF,  p-value: < 2.2e-16

Linear Regression - Diagnostics

# Standard 2x2 grid of lm diagnostic plots.
# Save the previous graphical parameters and restore them afterwards so
# the changed layout does not leak into later plots in the session.
op <- par(mfrow = c(2, 2), oma = c(0, 0, 2, 0))
plot(myReg)
par(op)  # restore previous settings

Linear Regression - Predict

Predict the output for a new input

# Predict prestige (with a prediction interval) for one new observation.
# Use <- for assignment; spell out interval = "prediction" rather than
# relying on partial argument-value matching of "predict".
newData <- data.frame(education = 13.2, income = 12000, women = 12)
predict(myReg, newData, interval = "prediction")
##        fit     lwr      upr
## 1 64.09084 48.2459 79.93578

End of Session 4

BREAK!