MKT500V Introduction to R Day 1: Basics

Sally Chen

8/25/2018

About Me

About You

Slides

http://rpubs.com/sallychen/301448

What is R & Rstudio

Course Objectives

Today: Contents & Agenda

If you have not installed R & RStudio, use the RStudio Cloud version for today’s session

https://rstudio.cloud

Data Classes and Objects

Demo: Class and Objects

x1 = 0.02  #assign values to object with =
y1 <- "0.01"  #assign values to object with <-
z1 = TRUE
class(x1)
## [1] "numeric"
class(y1)
## [1] "character"
class(z1)
## [1] "logical"
X1 = 100  #Case sensitive x1 != X1
X1
## [1] 100
x1
## [1] 0.02
1x = 200  # invalid: an object name cannot start with a number
## Error: <text>:1:2: unexpected symbol
## 1: 1x
##      ^
x1  # original value of x1 = 0.02
## [1] 0.02
x1 + 5
## [1] 5.02
x1  # x1 is not overwritten
## [1] 0.02
x1 = x1 + 5  # x1 is overwritten
x1  # x1 has a new value
## [1] 5.02
class("TRUE")
## [1] "character"
"TRUE" + 1
## Error in "TRUE" + 1: non-numeric argument to binary operator
class(TRUE)
## [1] "logical"
TRUE + 1
## [1] 2
2 + "2"
## Error in 2 + "2": non-numeric argument to binary operator

Data Structures

Data Structures

Vector

Demo: Vector

vector("numeric", length = 3)  # an empty numeric vector of length 3 (filled with 0)
## [1] 0 0 0
numeric(3)  # shorthand that gives the same result
## [1] 0 0 0
vector("character", length = 5)  # an empty character vector of length 5 (filled with "")
## [1] "" "" "" "" ""
character(5)  # shorthand that gives the same result
## [1] "" "" "" "" ""
x = c(1, 2, 3, 4)  #numeric vector
y = c("1", "2", "hello", "R")  #character vector
c(1, "hello", TRUE)  # a vector holds one type only: every element is coerced to character (note the quotes around 1 and TRUE in the output)
## [1] "1"     "hello" "TRUE"
x
## [1] 1 2 3 4
length(x)
## [1] 4
class(x)
## [1] "numeric"
y
## [1] "1"     "2"     "hello" "R"
length(y)
## [1] 4
class(y)
## [1] "character"
x
## [1] 1 2 3 4
x <- c(x, 5)  # add a new element to x

Constructing numeric vectors of specific structure

a = 3:7
a
## [1] 3 4 5 6 7
seq(from = 1, to = 9, by = 2)
## [1] 1 3 5 7 9
a = rep(3, 4)  # repeat the value 3 four times
a
## [1] 3 3 3 3

Some functions for numeric vectors & vector calculation

x
## [1] 1 2 3 4 5
class(x)  # x is a numeric vector
## [1] "numeric"
length(x)  # return the length of x
## [1] 5
sd(x)  # return the standard deviation of x
## [1] 1.581139
min(x)  # return the minimum of x
## [1] 1
max(x)  # return the maximum of x
## [1] 5
x + 1  # add 1 to each element
## [1] 2 3 4 5 6
exp(x)  # take the exponential of each element
## [1]   2.718282   7.389056  20.085537  54.598150 148.413159
x[1] + 10  # add 10 to the first element (result is only printed)
## [1] 11
x[1]  # x[1] is not overwritten
## [1] 1
x[1] = x[1] + 10  # x[1] is overwritten
x[1]
## [1] 11

Accessing data from numeric or character vectors

x
## [1] 11  2  3  4  5
x[1]  # the first element
## [1] 11
x[1:3]  # elements 1 through 3
## [1] 11  2  3
x[c(1, 3)]  # elements 1 and 3
## [1] 11  3
x[-3]  # every element except the 3rd
## [1] 11  2  4  5

Data Type Coercion: Class really matters!

z = c("1", "2", "3")  # create a character vector
class(z)
## [1] "character"
mean(z)  # calculate mean
## Warning in mean.default(z): argument is not numeric or logical: returning
## NA
## [1] NA
is.numeric(z)  # check whether it is numeric
## [1] FALSE
z = as.numeric(z)  # transform data type
z
## [1] 1 2 3
mean(z)
## [1] 2
z = as.character(z)
z
## [1] "1" "2" "3"

Missing Values in Vector

x = c(1, 4, 7, NA, 12, 19, 15, 21, 20)
mean(x)
## [1] NA
is.na(x)
## [1] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
anyNA(x)
## [1] TRUE
mean(x, na.rm = TRUE)  # remove NA from x, then calculate the mean()
## [1] 12.375

In class Exercise 1

nv2 = seq(2, 20, 2)
length(nv2)
## [1] 10
nv2[6]
## [1] 12
nv2 = c(nv2, 22)
nv2 = nv2[-1]
print(nv2)
##  [1]  4  6  8 10 12 14 16 18 20 22
mean(nv2)
## [1] 13
min(nv2)
## [1] 4
max(nv2)
## [1] 22
sd(nv2)
## [1] 6.055301
nv3 = 1:10
nv3 = nv3 + 3
nv3 = as.character(nv3)
print(nv3)
##  [1] "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13"

Function: Basic

# Template for defining a function: `arg1`, `arg2`, `statements`, and `object`
# are placeholders to be replaced with real argument names and code.
myfunction <- function(arg1, arg2){

# body: the statements that do the work
statements

# value handed back to the caller
return(object)

}

# Template call: arguments are supplied by name; `value1` and `value2` are placeholders.
myfunction(arg1 = value1, arg2 = value2)

Built-in Function

s = c(1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2)
s
##  [1] 1.0 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2.0
s = seq(from = 1, to = 2, by = 0.1)
s
##  [1] 1.0 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2.0
s = seq(from = 2, to = 4, by = 0.2)
s
##  [1] 2.0 2.2 2.4 2.6 2.8 3.0 3.2 3.4 3.6 3.8 4.0

Arguments matching: How to pass values to arguments?

seq(from = 1.1, to = 3.5, by = 0.3)  # arguments matched by name
## [1] 1.1 1.4 1.7 2.0 2.3 2.6 2.9 3.2 3.5
seq(1.1, 3.5, 0.3)  # arguments matched by position (from, to, by)
## [1] 1.1 1.4 1.7 2.0 2.3 2.6 2.9 3.2 3.5
seq(by = 0.3, from = 1.1, to = 3.5)  # named arguments can be given in any order
## [1] 1.1 1.4 1.7 2.0 2.3 2.6 2.9 3.2 3.5

Default argument values in functions

# help(rnorm)
rnorm()  # call rnorm() without specifying the length n
## Error in rnorm(): argument "n" is missing, with no default
e = rnorm(n = 1000)  # 1000 observations from Standard normal N(0,1) 
mean(e)  # check the mean
## [1] 0.03828554
sd(e)  # check the standard deviation
## [1] 0.9784108
hist(e)  # use hist() to plot the distribution

e1 = rnorm(n = 1000, mean = 3, sd = 5)  # 1000 values from a normal distribution with mean 3 and standard deviation 5
mean(e1)
## [1] 2.841117
sd(e1)
## [1] 5.028204
hist(e1)

In class Exercise 2

u1 = runif(1000)
min(u1)
## [1] 0.002355184
max(u1)
## [1] 0.9977875
mean(u1)
## [1] 0.4973576
hist(u1)

u2 = runif(1000, -5, 5)
min(u2)
## [1] -4.996475
max(u2)
## [1] 4.96678
mean(u2)
## [1] -0.01792917
hist(u2)

Short-break

Let’s take a short break of 10 minutes

Matrix

Demo: constructing a matrix

matrix(data = 1:6, nrow = 3, ncol = 2)
##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6
matrix(data = 1:6, nrow = 3, ncol = 2, byrow = TRUE)
##      [,1] [,2]
## [1,]    1    2
## [2,]    3    4
## [3,]    5    6
a1 = 1:3
a2 = 4:6
cbind(a1, a2)
##      a1 a2
## [1,]  1  4
## [2,]  2  5
## [3,]  3  6
a1 = 1:6
a2 = 4:5  # what if a1 and a2 have different lengths?
cbind(a1, a2)  # the shorter vector a2 is recycled (4, 5, 4, 5, ...) to fill all 6 rows
##      a1 a2
## [1,]  1  4
## [2,]  2  5
## [3,]  3  4
## [4,]  4  5
## [5,]  5  4
## [6,]  6  5
a1 = 1:3
a2 = 4:6
rbind(a1, a2)
##    [,1] [,2] [,3]
## a1    1    2    3
## a2    4    5    6
matrix(c(1, 2, 3, "4", 5, "hello"), nrow = 3, ncol = 2)  # c() coerces every element to character, so the matrix is a character matrix
##      [,1] [,2]   
## [1,] "1"  "4"    
## [2,] "2"  "5"    
## [3,] "3"  "hello"
a = matrix(data = 1:6, nrow = 3, ncol = 2)
nrow(a)  #return the row dimension
## [1] 3
ncol(a)  #return the col dimension
## [1] 2
dim(a)  #return row,col
## [1] 3 2

Accessing data from matrix objects

Demo: Access data from matrix objects

a = matrix(1:6, nrow = 3, ncol = 2)  #create a 3*2 matrix
a
##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6
a[1, 1]  # access the first cell
## [1] 1
a[1, ]  # access the first row
## [1] 1 4
a[, 2]  # access the second column
## [1] 4 5 6
a[1:2, 1:2]  # access the first 2 rows and the first 2 columns
##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5

Scalar & Matrix Calculations

a
##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6
a + 1  # add 1 to each element of the matrix
##      [,1] [,2]
## [1,]    2    5
## [2,]    3    6
## [3,]    4    7
a * 2  # multiply 2 to each element of the matrix
##      [,1] [,2]
## [1,]    2    8
## [2,]    4   10
## [3,]    6   12

Elementwise Matrix Calculations

a + a  # element-wise addition
##      [,1] [,2]
## [1,]    2    8
## [2,]    4   10
## [3,]    6   12
a - a  # element-wise subtraction
##      [,1] [,2]
## [1,]    0    0
## [2,]    0    0
## [3,]    0    0
b = matrix(1:6, nrow = 2, ncol = 3)
a
##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6
b
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6
a + b
## Error in a + b: non-conformable arrays
a
##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6
a * a  # element-wise multiplication (each cell squared), not the matrix product %*%
##      [,1] [,2]
## [1,]    1   16
## [2,]    4   25
## [3,]    9   36

Transpose & Matrix algebraic multiplication

a
##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6
dim(a)  # 3*2 matrix
## [1] 3 2
t(a)
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    6
dim(t(a))  #2*3 matrix
## [1] 2 3
a  # 3*2 dimension
##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6
t(a)  # 2*3 dimension
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    6
a %*% t(a)  # 3*3 dimension
##      [,1] [,2] [,3]
## [1,]   17   22   27
## [2,]   22   29   36
## [3,]   27   36   45
t(a) %*% a  # 2*2 dimension
##      [,1] [,2]
## [1,]   14   32
## [2,]   32   77

Matrix dimensions matching

a  # 3*2
##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6
t(a)  # 2*3
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    6
a + t(a)  # a and t(a) have different dimensions
## Error in a + t(a): non-conformable arrays
a %*% a
## Error in a %*% a: non-conformable arguments

Functions on Matrix

a
##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6
sum(a)  # sum of all the elements
## [1] 21
mean(a)  # mean of all the elements
## [1] 3.5
a
##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6
apply(a, MARGIN = 1, sum)  # calculate the sum of each row (MARGIN = 1 means rows)
## [1] 5 7 9
apply(a, MARGIN = 2, mean)  # mean of each column
## [1] 2 5

In class exercise 3

v3 = seq(1, 29, 2)
m1 = matrix(v3, 3, 5)
m2 = matrix(v3, 5, 3)
m1 = m1 + 1
m2 = m2 * 2
t(m2)
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    2    6   10   14   18
## [2,]   22   26   30   34   38
## [3,]   42   46   50   54   58
m1 + t(m2)
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    4   14   24   34   44
## [2,]   26   36   46   56   66
## [3,]   48   58   68   78   88
m1 %*% m2
##      [,1] [,2] [,3]
## [1,]  940 2340 3740
## [2,] 1040 2640 4240
## [3,] 1140 2940 4740

Let’s Take a Rest

See you back at 1:30pm

Welcome Back!

It’s R time again… Any questions?

Recall: What we have learned

Data.frame

Example: a Built-in Data Frame

help(mtcars)
head(mtcars, 2)  # see the first 2 rows
##               mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4      21   6  160 110  3.9 2.620 16.46  0  1    4    4
## Mazda RX4 Wag  21   6  160 110  3.9 2.875 17.02  0  1    4    4
tail(mtcars, 2)
##                mpg cyl disp  hp drat   wt qsec vs am gear carb
## Maserati Bora 15.0   8  301 335 3.54 3.57 14.6  0  1    5    8
## Volvo 142E    21.4   4  121 109 4.11 2.78 18.6  1  1    4    2
colnames(mtcars)  # names of columns
##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"
rownames(mtcars)  # name of rows
##  [1] "Mazda RX4"           "Mazda RX4 Wag"       "Datsun 710"         
##  [4] "Hornet 4 Drive"      "Hornet Sportabout"   "Valiant"            
##  [7] "Duster 360"          "Merc 240D"           "Merc 230"           
## [10] "Merc 280"            "Merc 280C"           "Merc 450SE"         
## [13] "Merc 450SL"          "Merc 450SLC"         "Cadillac Fleetwood" 
## [16] "Lincoln Continental" "Chrysler Imperial"   "Fiat 128"           
## [19] "Honda Civic"         "Toyota Corolla"      "Toyota Corona"      
## [22] "Dodge Challenger"    "AMC Javelin"         "Camaro Z28"         
## [25] "Pontiac Firebird"    "Fiat X1-9"           "Porsche 914-2"      
## [28] "Lotus Europa"        "Ford Pantera L"      "Ferrari Dino"       
## [31] "Maserati Bora"       "Volvo 142E"
ncol(mtcars)  # number of columns
## [1] 11
nrow(mtcars)  # number of rows
## [1] 32

Creating a data.frame object

name = c("Messi", "Ronaldo", "Neymar")  # a character vector
age = c(31, 33, 26)  # a numeric vector
golden_ball = c(TRUE, TRUE, FALSE)
players = data.frame(name, age, golden_ball)
head(players)
##      name age golden_ball
## 1   Messi  31        TRUE
## 2 Ronaldo  33        TRUE
## 3  Neymar  26       FALSE
golden_ball = c(TRUE, TRUE)  # golden_ball now has only 2 values, one fewer than name
length(name)
## [1] 3
length(golden_ball)
## [1] 2
data.frame(name, golden_ball)  # data.frame() does not accept columns of different lengths
## Error in data.frame(name, golden_ball): arguments imply differing number of rows: 3, 2
golden_ball = c(TRUE, TRUE, NA)  # use NA to indicate missing value 
data.frame(name, age, golden_ball)
##      name age golden_ball
## 1   Messi  31        TRUE
## 2 Ronaldo  33        TRUE
## 3  Neymar  26          NA
m = matrix(1:6, nrow = 2, ncol = 3)
class(m)
## [1] "matrix"
m
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6
m = as.data.frame(m)
class(m)
## [1] "data.frame"
m
##   V1 V2 V3
## 1  1  3  5
## 2  2  4  6

Accessing data from a data.frame

players[1, ]
##    name age golden_ball
## 1 Messi  31        TRUE
players[, 2]
## [1] 31 33 26
players[1:2, ]
##      name age golden_ball
## 1   Messi  31        TRUE
## 2 Ronaldo  33        TRUE
players[, 1:2]
##      name age
## 1   Messi  31
## 2 Ronaldo  33
## 3  Neymar  26
players$name
## [1] Messi   Ronaldo Neymar 
## Levels: Messi Neymar Ronaldo
players$name[2]
## [1] Ronaldo
## Levels: Messi Neymar Ronaldo

Operations on data.frame

players$age + 2  # add 2 to all the element of the age column
## [1] 33 35 28
players$age[1] + 3  # add 3 to the first element of the age column
## [1] 34
assists = c(9, 8, 3)
players <- cbind(players, assists)  # add one column with cbind()
players
##      name age golden_ball assists
## 1   Messi  31        TRUE       9
## 2 Ronaldo  33        TRUE       8
## 3  Neymar  26       FALSE       3
players$goals = c(20, 30, 10)  # add one column with $
players
##      name age golden_ball assists goals
## 1   Messi  31        TRUE       9    20
## 2 Ronaldo  33        TRUE       8    30
## 3  Neymar  26       FALSE       3    10
new_player = data.frame(name = "Suarez", age = 31, golden_ball = FALSE, goals = 40, 
    assists = 4)  # a one-row data.frame holding the new player
rbind(players, new_player)  # add the new row to the existing data.frame (result is only printed here)
##      name age golden_ball assists goals
## 1   Messi  31        TRUE       9    20
## 2 Ronaldo  33        TRUE       8    30
## 3  Neymar  26       FALSE       3    10
## 4  Suarez  31       FALSE       4    40
players <- rbind(players, new_player)

Functions on data.frame

mean(mtcars$mpg)
## [1] 20.09062
max(mtcars$carb)
## [1] 8
hist(mtcars$mpg)

summary(mtcars)
##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000
summary(mtcars$mpg)  # summary of a specific column
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.40   15.42   19.20   20.09   22.80   33.90

In class exercise 4

Name Gender TenK PR Qualified
Sally F 55 52 FALSE
Mike M 46 44 TRUE
Carol F 62 58 FALSE
HalfMarathon
120
100
140
Name Gender TenK PR Qualified HalfMarathon
Sage M 40 42 TRUE 81
Name = c("Sally", "Mike", "Carol")
Gender = c("F", "M", "F")
TenK = c(55, 46, 62)
PR = c(52, 44, 58)
Qualified = c(FALSE, TRUE, FALSE)
running = data.frame(Name, Gender, TenK, PR, Qualified)
running
##    Name Gender TenK PR Qualified
## 1 Sally      F   55 52     FALSE
## 2  Mike      M   46 44      TRUE
## 3 Carol      F   62 58     FALSE
running$HalfMarathon = c(120, 100, 140)
running
##    Name Gender TenK PR Qualified HalfMarathon
## 1 Sally      F   55 52     FALSE          120
## 2  Mike      M   46 44      TRUE          100
## 3 Carol      F   62 58     FALSE          140
newrunner = data.frame(Name = "Sage", Gender = "M", TenK = 40, PR = 42, Qualified = TRUE, 
    HalfMarathon = 81)
running = rbind(running, newrunner)
running
##    Name Gender TenK PR Qualified HalfMarathon
## 1 Sally      F   55 52     FALSE          120
## 2  Mike      M   46 44      TRUE          100
## 3 Carol      F   62 58     FALSE          140
## 4  Sage      M   40 42      TRUE           81

List

my_list = list(players, c(1, 2, 3), matrix(1:6, 2, 3))
my_list
## [[1]]
##      name age golden_ball assists goals
## 1   Messi  31        TRUE       9    20
## 2 Ronaldo  33        TRUE       8    30
## 3  Neymar  26       FALSE       3    10
## 4  Suarez  31       FALSE       4    40
## 
## [[2]]
## [1] 1 2 3
## 
## [[3]]
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6

List: Reference & Sublist

my_list[[1]]  # returns the object stored as the 1st member of my_list 
##      name age golden_ball assists goals
## 1   Messi  31        TRUE       9    20
## 2 Ronaldo  33        TRUE       8    30
## 3  Neymar  26       FALSE       3    10
## 4  Suarez  31       FALSE       4    40
class(my_list[[1]])  # it is a data.frame
## [1] "data.frame"
summary(my_list[[1]])
##       name        age        golden_ball        assists         goals     
##  Messi  :1   Min.   :26.00   Mode :logical   Min.   :3.00   Min.   :10.0  
##  Neymar :1   1st Qu.:29.75   FALSE:2         1st Qu.:3.75   1st Qu.:17.5  
##  Ronaldo:1   Median :31.00   TRUE :2         Median :6.00   Median :25.0  
##  Suarez :1   Mean   :30.25   NA's :0         Mean   :6.00   Mean   :25.0  
##              3rd Qu.:31.50                   3rd Qu.:8.25   3rd Qu.:32.5  
##              Max.   :33.00                   Max.   :9.00   Max.   :40.0
my_list[1]  # return a sublist 
## [[1]]
##      name age golden_ball assists goals
## 1   Messi  31        TRUE       9    20
## 2 Ronaldo  33        TRUE       8    30
## 3  Neymar  26       FALSE       3    10
## 4  Suarez  31       FALSE       4    40
class(my_list[1])  # it is a list
## [1] "list"
mean(my_list[1])  # my_list[1] is a list, mean() does not work on list
## Warning in mean.default(my_list[1]): argument is not numeric or logical:
## returning NA
## [1] NA
my_list[2:3]
## [[1]]
## [1] 1 2 3
## 
## [[2]]
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6
my_list[c(1, 3)]
## [[1]]
##      name age golden_ball assists goals
## 1   Messi  31        TRUE       9    20
## 2 Ronaldo  33        TRUE       8    30
## 3  Neymar  26       FALSE       3    10
## 4  Suarez  31       FALSE       4    40
## 
## [[2]]
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6
my_list[[2]]  # a vector
## [1] 1 2 3
my_list[2]  # a list 
## [[1]]
## [1] 1 2 3
mean(my_list[[2]])  # mean() works on vector
## [1] 2
mean(my_list[2])  # mean() does not work on list
## Warning in mean.default(my_list[2]): argument is not numeric or logical:
## returning NA
## [1] NA
my_list[[3]]
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6
my_list[[3]] = matrix(1:6, 3, 2)
my_list[[3]]
##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6
my_list[[4]] = runif(5)
my_list[[4]]
## [1] 0.5553994 0.5335197 0.6975176 0.2739090 0.2006292
my_list[[4]] <- NULL
my_list[[4]]  # the 4th member is already deleted
## Error in my_list[[4]]: subscript out of bounds
length(my_list)
## [1] 3

In class exercise 5

new_list = list(runif(10, -2, 2), matrix(1:10, 2, 5, byrow = TRUE), running)
new_list
## [[1]]
##  [1] -0.77500241  1.50246652  1.39566400  0.18635356 -0.24914866
##  [6] -1.71086730 -0.10150976  0.63850541 -0.07045173 -1.14813332
## 
## [[2]]
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    2    3    4    5
## [2,]    6    7    8    9   10
## 
## [[3]]
##    Name Gender TenK PR Qualified HalfMarathon
## 1 Sally      F   55 52     FALSE          120
## 2  Mike      M   46 44      TRUE          100
## 3 Carol      F   62 58     FALSE          140
## 4  Sage      M   40 42      TRUE           81
new_list[[1]] = new_list[[1]] + 2
sum(new_list[[2]])
## [1] 55
new_list[[3]] <- NULL

Factor

Why we use factors

Demo: Factor

direction = c("North", "West", "North", "East", "South", "West", "North", "South")  #create a character vector
direction
## [1] "North" "West"  "North" "East"  "South" "West"  "North" "South"
class(direction)
## [1] "character"
factor_direction = factor(direction)  # create a factor of direction
factor_direction
## [1] North West  North East  South West  North South
## Levels: East North South West
class(factor_direction)
## [1] "factor"
levels(factor_direction)
## [1] "East"  "North" "South" "West"
table(factor_direction)
## factor_direction
##  East North South  West 
##     1     3     2     2
seasons = c("Spring", "Fall", "Summer", "Spring", "Fall", "Winter", "Winter")

factor_seasons = factor(seasons, levels = c("Spring", "Summer", "Fall", "Winter"), 
    ordered = TRUE)
factor_seasons[1] < factor_seasons[2]  # Spring < Fall: Spring comes before Fall in the declared level order
## [1] TRUE
factor_seasons[2] < factor_seasons[3]  # Fall < Summer: Fall comes after Summer in the declared level order
## [1] FALSE
head(women)
##   height weight
## 1     58    115
## 2     59    117
## 3     60    120
## 4     61    123
## 5     62    126
## 6     63    129
women$height
##  [1] 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
factor_height = cut(women$height, 3)
table(factor_height)
## factor_height
##   (58,62.7] (62.7,67.3]   (67.3,72] 
##           5           5           5
factor_height = cut(women$height, 3, labels = c("Low", "Medium", "High"))
table(factor_height)
## factor_height
##    Low Medium   High 
##      5      5      5

In class exercise 6

mons = factor(c("March", "April", "January", "November", "January", "September", 
    "October", "September", "November", "August", "January", "November", "November", 
    "February", "May", "August", "July", "December", "August", "August", "September", 
    "November", "February", "April"), levels = c("January", "February", "March", 
    "April", "May", "June", "July", "August", "September", "October", "November", 
    "December"), ordered = TRUE)
table(mons)
## mons
##   January  February     March     April       May      June      July 
##         3         2         1         2         1         0         1 
##    August September   October  November  December 
##         4         3         1         5         1
factor_weight = cut(women$weight, 2, labels = c("Low", "High"))
factor_weight
##  [1] Low  Low  Low  Low  Low  Low  Low  Low  Low  High High High High High
## [15] High
## Levels: Low High
table(factor_weight)
## factor_weight
##  Low High 
##    9    6

Q&A

Any Questions?

See you tomorrow 8:30am