Intro To R Workshop: Session 1

UCI Data Science Initiative

Emma Smith and Chris Galbraith

April 13, 2017

Introduction

Introduction

Session 1 - Agenda

  1. RStudio
  2. Data Types in R
  3. Data Structures in R
  4. Subsetting in R

What is R?

RStudio:

  1. RStudio is a free and open source integrated development environment (IDE) for R.
  2. To download RStudio please visit: http://rstudio.org/
  3. Please note that you must have R already installed before installing RStudio.

Fundamentals

Data Types in R:

Data Structures in R:

  1. One-dimensional:
    • Vectors
  2. Multi-dimensional:
    • Matrices
    • Data frames

 

Vectors in R:

numVec <- c(2,3,4)      # <- is the assignment operator; c() combines values into a vector
numVec
## [1] 2 3 4

Examples of Vectors

Examples of character, logical, and complex vectors:

# Character vector: each element is a quoted string
charVec <- c("red", "green", "blue")
charVec
## [1] "red"   "green" "blue"
# Logical vector: T and F are shorthand for TRUE and FALSE
logVec <- c(TRUE, FALSE, FALSE, T, F)
logVec
## [1]  TRUE FALSE FALSE  TRUE FALSE
# Complex vector: the i suffix marks the imaginary part
compVec <- c(1 + 0i, 3 + 1i)
compVec
## [1] 1+0i 3+1i

Special Values:

There are some special values in R:

# The L suffix creates integer values
intVec <- c(1L, 2L, 3L, 4L)
intVec
## [1] 1 2 3 4
# Inf is infinity: 0 / Inf is 0, while Inf / Inf is undefined (NaN)
a <- Inf
b <- 0
rslt <- c(b / a, a / a)
rslt
## [1]   0 NaN

Data Type Coercion:

# c() coerces mixed inputs to a single common type.
numCharVec <- c(3.14, "a")
numCharVec                 # What do you expect to be printed?

# TRUE is spelled out: T is an ordinary variable that can be reassigned
numLogVec <- c(pi, TRUE)
numLogVec

charLogVec <- c("a", TRUE)
charLogVec

Data Type Coercion:

# seq() steps from 1200 in increments of 15 and stops before exceeding 1300
numVec <- seq(from = 1200, to = 1300, by = 15)
numVec
## [1] 1200 1215 1230 1245 1260 1275 1290
# as() explicitly coerces an object to the named class
numToChar <- as(numVec, "character")
numToChar
## [1] "1200" "1215" "1230" "1245" "1260" "1275" "1290"
# FALSE/TRUE are spelled out in place of the reassignable shorthands F/T
logVec <- c(FALSE, TRUE, FALSE, TRUE, TRUE)
as(logVec, "numeric")
## [1] 0 1 0 1 1

Data Type Coercion:

# Coercing complex to numeric keeps only the real parts and raises a warning
compVec <- c(12+10i, 1+6i, -3-2i)
as(compVec, "numeric")
## Warning in asMethod(object): imaginary parts discarded in coercion
## [1] 12  1 -3
# Strings that cannot be read as numbers ("zero") become NA, with a warning
charVec <- c("2.5", "3", "2.8", "1.5", "zero")
as(charVec, "numeric")
## Warning in asMethod(object): NAs introduced by coercion
## [1] 2.5 3.0 2.8 1.5  NA

Factors:

# rep(..., times = 3) repeats the whole two-element vector three times
Gender <- rep(c("Female", "Male"), times = 3)
Gender
## [1] "Female" "Male"   "Female" "Male"   "Female" "Male"
# factor() turns the character vector into a categorical variable with levels
GenderFac1 <- factor(Gender)
GenderFac1
## [1] Female Male   Female Male   Female Male  
## Levels: Female Male

Factors:

levels(GenderFac1)  # the distinct categories of the factor
## [1] "Female" "Male"
table(GenderFac1)   # number of observations in each level
## GenderFac1
## Female   Male 
##      3      3
unclass(GenderFac1) # returns the factor as integer codes, keeping the levels as an attribute
## [1] 1 2 1 2 1 2
## attr(,"levels")
## [1] "Female" "Male"

Factors:

GenderFac1 # levels are ordered alphabetically by default; the 1st level is the base level
## [1] Female Male   Female Male   Female Male  
## Levels: Female Male
# The levels argument sets the level order explicitly, making "Male" the first level
GenderFac2 <- factor(Gender, levels = c("Male", "Female"))
GenderFac1
## [1] Female Male   Female Male   Female Male  
## Levels: Female Male
GenderFac2 # same values as GenderFac1; only the order of the levels differs
## [1] Female Male   Female Male   Female Male  
## Levels: Male Female

Missing Values:

a <- c(1,2)
a[3]        # indexing past the end of the vector gives NA (a missing value)
## [1] NA
b <- 0/0    # 0/0 is undefined, so the result is NaN (not a number)
b
## [1] NaN

Matrices:

myMat <- matrix(nrow = 2, ncol = 4)   # no data supplied, so every cell is NA
myMat
##      [,1] [,2] [,3] [,4]
## [1,]   NA   NA   NA   NA
## [2,]   NA   NA   NA   NA
attributes(myMat)                     # the matrix carries a dim attribute: rows, then columns
## $dim
## [1] 2 4

Matrices:

# Build a 2 x 4 matrix from the integers 1 to 8
myMat <- matrix(data = 1:8, nrow = 2, ncol = 4, byrow = FALSE)
myMat # byrow = FALSE (the default) fills the matrix column by column
##      [,1] [,2] [,3] [,4]
## [1,]    1    3    5    7
## [2,]    2    4    6    8

A matrix is a special case of a vector:

myVec <- 1:8
myVec
## [1] 1 2 3 4 5 6 7 8
dim(myVec) <- c(2,4)   # assigning a dim attribute reshapes the vector into a 2 x 4 matrix
myVec
##      [,1] [,2] [,3] [,4]
## [1,]    1    3    5    7
## [2,]    2    4    6    8

Other Ways to Create a Matrix:

# vec2 and vec3 are drawn at random, so the printed values differ from run to run
vec1 <- 1:4
vec2 <- sample(1:100, 4, replace = FALSE)   # 4 distinct integers between 1 and 100
vec3 <- rnorm(4, mean = 0, sd = 1)          # 4 draws from a standard normal distribution
colMat <- cbind(vec1, vec2, vec3)           # cbind() binds the vectors together as columns
colMat
##      vec1 vec2       vec3
## [1,]    1   99 -1.0990771
## [2,]    2   11 -1.5036870
## [3,]    3   20 -1.8075410
## [4,]    4   74  0.1202505

Other Ways to Create a Matrix:

# vec2 and vec3 are drawn at random, so the printed values differ from run to run
vec1 <- 1:4
vec2 <- sample(1:100, 4, replace = FALSE)   # 4 distinct integers between 1 and 100
vec3 <- rnorm(4, mean = 0, sd = 1)          # 4 draws from a standard normal distribution
rowMat <- rbind(vec1, vec2, vec3)           # rbind() binds the vectors together as rows
rowMat
##           [,1]       [,2]        [,3]       [,4]
## vec1  1.000000  2.0000000  3.00000000  4.0000000
## vec2 11.000000 97.0000000 18.00000000 61.0000000
## vec3 -0.629213 -0.8334383 -0.02996309  0.9864132

Lists:

# A vector holds a single type, so the mixed inputs are all coerced to character.
# TRUE is spelled out: T is an ordinary variable that can be reassigned.
myVec <- c(10, "R", 10-5i, TRUE)
myVec
## [1] "10"    "R"     "10-5i" "TRUE"

Lists:

# Unlike a vector, a list keeps each element's own type.
# TRUE is spelled out: T is an ordinary variable that can be reassigned.
myList <- list(10, "R", 10-5i, TRUE)
myList
## [[1]]
## [1] 10
## 
## [[2]]
## [1] "R"
## 
## [[3]]
## [1] 10-5i
## 
## [[4]]
## [1] TRUE

Data Frames:

# Build a data frame of 10 simulated student records.
# All values are drawn at random, so the printed output differs from run to run.
studentID <- paste0("S#", sample(6473:7392, 10))   # paste0() concatenates with no separator
score <- sample(0:100, 10)
gender <- sample(c("female", "male"), 10, replace = TRUE)
data <- data.frame(studentID = studentID, score = score, gender = gender)
head(data)   # show the first six rows
##   studentID score gender
## 1    S#6766    48 female
## 2    S#7140     7 female
## 3    S#6571    74   male
## 4    S#7189    89   male
## 5    S#6507    76 female
## 6    S#6525    91   male

Subsetting:

Subsetting Examples:

myVec <- 1:10
myVec[3]        # [ ] extracts elements of a vector by position
## [1] 3
# TRUE is spelled out: T is an ordinary variable that can be reassigned
myList <- list(obj1 = "a", obj2 = 10, obj3 = TRUE, obj4 = 10-5i)
myList[[3]]     # [[ ]] extracts a single list component by position
## [1] TRUE
myList$obj4     # $ extracts a list component by name
## [1] 10-5i

Subsetting with [ ]:

x <- seq(from=0, to=100, by=10) 
x
##  [1]   0  10  20  30  40  50  60  70  80  90 100
x[1:3]           # select the first, second, and third elements
## [1]  0 10 20
x[c(2,4,6)]      # select the second, fourth, and sixth elements
## [1] 10 30 50

Subsetting with [ ]: Index Vectors

x <- seq(from=0, to=100,by=10) 
x
##  [1]   0  10  20  30  40  50  60  70  80  90 100
IndVec <- c(1, 2, 3, 4, 5)       # index vector to select the first 5 elements 
x[IndVec]                        # the positions can be stored in a variable and reused
## [1]  0 10 20 30 40

Index Vectors:

Example:

# Both lines sample at random, so the printed values differ from run to run
grades <- sample(0:100, 10)                  # 10 distinct grades between 0 and 100
names(grades) <- sample(letters[1:10], 10)   # label them with the letters a-j in random order
grades
##  b  f  i  j  h  g  e  d  c  a 
## 79 64  3 32 48  0 76 94 96 72

1. Logical Index Vector

# each = 5 repeats every value five times in turn: five TRUEs, then five FALSEs.
# TRUE/FALSE are spelled out in place of the reassignable shorthands T/F.
logIndVec <- rep(c(TRUE, FALSE), each = 5)
logIndVec
##  [1]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
grades[logIndVec]   # keeps the elements at the TRUE positions
##  b  f  i  j  h 
## 79 64  3 32 48

1. Logical Index Vector

logIndVec <- grades > 60   # the comparison is applied element-wise and yields a logical vector
logIndVec
##     b     f     i     j     h     g     e     d     c     a 
##  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE
grades[logIndVec]          # keeps only the grades above 60
##  b  f  e  d  c  a 
## 79 64 76 94 96 72

2. Index Vector of Positive Integers

posIndVec <- 4:7     # positions of the elements to keep
posIndVec
## [1] 4 5 6 7
grades[posIndVec]    # selects the fourth through seventh elements
##  j  h  g  e 
## 32 48  0 76

3. Index Vector of Negative Integers

# Negative indices exclude the corresponding positions
negIndVec <- -(1:5)
negIndVec
## [1] -1 -2 -3 -4 -5
grades[negIndVec]   # everything except the first five elements
##  g  e  d  c  a 
##  0 76 94 96 72

4. Vector of Character Strings

# A character index selects elements by name
chIndVec <- "a"
chIndVec
## [1] "a"
grades[chIndVec]   # the element named "a"
##  a 
## 72

Subsetting Matrices:

# Eight values in two rows give four columns, filled column by column
myMat <- matrix(1:8, nrow = 2)
myMat
##      [,1] [,2] [,3] [,4]
## [1,]    1    3    5    7
## [2,]    2    4    6    8

Subsetting Matrices:

# Matrix elements are addressed as [row, column]; an empty index selects everything
myMat[1,1]            # retrieve the element in the first row, first column
## [1] 1
myMat[2,]             # retrieve the second row
## [1] 2 4 6 8
myMat[,3]             # retrieve the third column
## [1] 5 6

Subsetting Matrices:

myMat[1,1]                 # by default the result is simplified to a plain vector
## [1] 1
myMat[1,1, drop = FALSE]   # drop = FALSE keeps the result as a (1 x 1) matrix
##      [,1]
## [1,]    1
myMat[2,, drop = FALSE]    # the second row, kept as a 1 x 4 matrix
##      [,1] [,2] [,3] [,4]
## [1,]    2    4    6    8

Subsetting Lists:

# A named list with three components of length 3.
# The values are drawn at random, so the printed output differs from run to run.
myList <- list(
  ID = paste0("ID", sample(100:199, 3)),   # paste0() concatenates with no separator
  Age = sample(18:99, 3),
  Gender = sample(c("M", "F"), 3, replace = TRUE)
)
myList
## $ID
## [1] "ID100" "ID180" "ID136"
## 
## $Age
## [1] 38 92 75
## 
## $Gender
## [1] "M" "F" "M"
myList[1] # subset is still a list
## $ID
## [1] "ID100" "ID180" "ID136"

Subsetting Lists:

# [ ] returns a sub-list, while [[ ]] and $ extract the component itself
myList[1:2] # return the first two objects; subset is still a list
## $ID
## [1] "ID100" "ID180" "ID136"
## 
## $Age
## [1] 38 92 75
myList[[1]] # return the 1st object; subset is a character vector
## [1] "ID100" "ID180" "ID136"
myList$ID # alternative to [[]]
## [1] "ID100" "ID180" "ID136"

Subsetting Lists:

myList[[1]][2] # return the 2nd element of the 1st object
## [1] "ID180"
myList$ID[2]     # same element, selecting the component by name
## [1] "ID180"
myList[[c(1,2)]] # a vector inside [[ ]] indexes recursively: 1st object, then its 2nd element
## [1] "ID180"

Subsetting Data Frames:

library(datasets)   # package that provides the example data sets
data(quakes) # ?quakes for more info
str(quakes)         # compact summary: dimensions plus each column's type and first values
## 'data.frame':    1000 obs. of  5 variables:
##  $ lat     : num  -20.4 -20.6 -26 -18 -20.4 ...
##  $ long    : num  182 181 184 182 182 ...
##  $ depth   : int  562 650 42 626 649 195 82 194 211 622 ...
##  $ mag     : num  4.8 4.2 5.4 4.1 4 4 4.8 4.4 4.7 4.3 ...
##  $ stations: int  41 15 43 19 11 12 43 15 35 19 ...

Subsetting Data Frames:

quakes[1:10,]   # rows 1 to 10; the empty column index keeps every column
##       lat   long depth mag stations
## 1  -20.42 181.62   562 4.8       41
## 2  -20.62 181.03   650 4.2       15
## 3  -26.00 184.10    42 5.4       43
## 4  -17.97 181.66   626 4.1       19
## 5  -20.42 181.96   649 4.0       11
## 6  -19.68 184.31   195 4.0       12
## 7  -11.70 166.10    82 4.8       43
## 8  -28.11 181.93   194 4.4       15
## 9  -28.74 181.74   211 4.7       35
## 10 -17.47 179.59   622 4.3       19

Subsetting Data Frames:

head(quakes$long)                  # $ extracts one column as a vector; head() shows the first six values
## [1] 181.62 181.03 184.10 181.66 181.96 184.31
head(quakes[,c("lat", "long")])    # select several columns by name; the result is still a data frame
##      lat   long
## 1 -20.42 181.62
## 2 -20.62 181.03
## 3 -26.00 184.10
## 4 -17.97 181.66
## 5 -20.42 181.96
## 6 -19.68 184.31