- The RStudio environment
- Basic operators
- Basic data types
- Define variables
variable_name <- some_value
target <- data
A vector is a sequence of values, all of the same type.
x <- c(1, 3, 7, 15) # c stands for "combine"
x
## [1] 1 3 7 15
is.vector(x)
## [1] TRUE
length(x) # find the number of elements in a vector
## [1] 4
seq(from=1, to=10) # sequence
## [1] 1 2 3 4 5 6 7 8 9 10
1:10 # sequence shorthand
## [1] 1 2 3 4 5 6 7 8 9 10
seq(from=1, to=10, by=2) # sequence
## [1] 1 3 5 7 9
rep(7, times=3) # repeat
## [1] 7 7 7
vec <- c(10, 20, 7, 13) # assigning a vector to a variable
vec
## [1] 10 20 7 13
names(vec) <- c("value1", "value2", "value3", "value4")
vec
## value1 value2 value3 value4 ## 10 20 7 13
vec <- c("value1"=10, "value2"=20, "value3"=7, "value4"=13) #same result
vec <- c(value1=10, value2=20, value3=7, value4=13) #same result
vec1 <- c(1, 3, 5)
vec2 <- c(11, 13, 15)
c(vec1, vec2, c(21, 23, 25))
## [1] 1 3 5 11 13 15 21 23 25
Try to find the most efficient way (fewest characters) to create the following vectors:
## [1] 10 11 12 13 14 15 16 17 18 19 20
## [1] -30 -25 -20 -15 -10 -5 0 5 10 15 20 25 30
## apple banana cherry ## 4 5 3
Vector computations are performed element-wise
earnings <- c(10, 20, 30, 40)
expenses <- c(5, 25, 25, 10)
5 * earnings
## [1] 50 100 150 200
earnings - expenses
## [1] 5 -5 5 30
earnings * c(1, 2, 3, 4)
## [1] 10 40 90 160
x <- c(10, 20, 30)
Is there a value of 20 in x?
20 %in% x
## [1] TRUE
Is there a value of 40 in x?
40 %in% x
## [1] FALSE
basket <- c("apple", "banana", "cherry")
Does basket have apple?
"apple" %in% basket
## [1] TRUE
Does basket have cheese?
"cheese" %in% basket
## [1] FALSE
In the real world, your data may contain missing values. In R, we use NA (upper case) to represent a missing value.
vec = c(1, 4, NA, 2)
vec
## [1] 1 4 NA 2
sum(vec)
## [1] NA
max(vec)
## [1] NA
NA creates problems for most numerical functions.
For example, we cannot add NA to other numbers.
To apply these numerical functions to data that contains NAs, we can simply remove the NAs from the calculation. That is,
sum(vec, na.rm = T) # remove NAs before calculating the sum
## [1] 7
max(vec, na.rm = T) # remove NAs before getting the max value
## [1] 4
Recycling repeats elements in the shorter vector until its length matches the longer vector
u <- c(10, 20)
v <- c(1, 2, 3, 4, 5)
u + v # the shorter vector will be recycled to match the longer vector
## [1] 11 22 13 24 15
Under the hood:
u + v
= c(10, 20) + c(1, 2, 3, 4, 5)
= c(10, 20, 10, 20, 10) + c(1, 2, 3, 4, 5) # recycling
= c(10+1, 20+2, 10+3, 20+4, 10+5) # element-wise operation
= c(11, 22, 13, 24, 15)
Without typing the following into R, guess what the result of the last line would be:
vec1 <- 1:4
vec2 <- c(1, 0, 0,0)
vec1 - vec2
Another more challenging one:
seq(from=0, to=10, by=2) + 3:1
You can retrieve elements from a vector by specifying the indexes of the elements. This operation is also known as subsetting.
vec <- c("value1"=10, "value2"=20, "value3"=30, "value4"=40)
vec[1] # get the element at index 1
## value1 ## 10
vec["value3"] # get the element whose name matches the string
## value3 ## 30
You can provide more than just one index.
vec[1:3] # specify a vector of indexes
## value1 value2 value3 ## 10 20 30
vec[c(3, 2, 1, 4)] # return with the specified order
## value3 value2 value1 value4 ## 30 20 10 40
vec[c("value4", "value4")]
## value4 value4 ## 40 40
vec <- c("value1"=10, "value2"=20, "value3"=30, "value4"=40)
vec[-1] # all but the first element
## value2 value3 value4 ## 20 30 40
vec[-c(1, 2)] # all but the first and the second elements
## value3 value4 ## 30 40
A list is also a container for values, but it can accommodate items of different data types.
x <- list("Bob", c(100,80,90))
x
## [[1]] ## [1] "Bob" ## ## [[2]] ## [1] 100 80 90
Just like vectors, you can give each element a name:
x <- list(name="Bob", grades=c(100,80,90))
x
## $name ## [1] "Bob" ## ## $grades ## [1] 100 80 90
x[2] # get the second element as a list
## $grades ## [1] 100 80 90
x["grades"] # get the element named "grades" as a list
## $grades ## [1] 100 80 90
y1 = x[2]
class(y1)
## [1] "list"
x[[2]] # get the second element as a vector
## [1] 100 80 90
y2 = x[[2]]
class(y2)
## [1] "numeric"
x[["grades"]] # get the element named "grades" as a vector
## [1] 100 80 90
x$grades # most common/readable way to retrieve an element by name
## [1] 100 80 90
Create the following list and use the function mean() to get the GPA (grade point average) from her grade points:
## $name ## [1] "Anna" ## ## $is_female ## [1] TRUE ## ## $age ## [1] 22 ## ## $enrollment ## [1] "MIS4710" "MIS4720" "MIS4730" ## ## $grade_point ## [1] 4 3 4
A matrix is a collection of data elements arranged in a two-dimensional rectangular layout.
A <- matrix( 1:6, # the data elements
             nrow=2, # number of rows
             ncol=3, # number of columns
             byrow = TRUE) # fill matrix by rows
A
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    6
A data frame is a set of vectors of equal length. Think of a data frame as an Excel sheet or a database table.
Column names are preserved or guessed if not explicitly set
course <- c("MIS4730", "MIS4710", "MIS4950", "MIS1234")
num_of_students <- c(20, 10, 40, 30)
data_analytics_minor <- c(TRUE, TRUE, TRUE, FALSE)
df <- data.frame(course, n_students=num_of_students, data_analytics_minor,
stringsAsFactors=F)
df # notice the column names and row names
## course n_students data_analytics_minor ## 1 MIS4730 20 TRUE ## 2 MIS4710 10 TRUE ## 3 MIS4950 40 TRUE ## 4 MIS1234 30 FALSE
ncol(df) # number of columns
## [1] 3
nrow(df) # number of rows
## [1] 4
colnames(df) # get column names
## [1] "course" "n_students" "data_analytics_minor"
rownames(df) # get row names
## [1] "1" "2" "3" "4"
You can change column and row names:
df2 <- df # create a copy of df, and name it as "df2"
colnames(df2) <- c("col1", "col2", "col3") # assign column names
colnames(df2) # they were "course", "n_students", "data_analytics_minor"
## [1] "col1" "col2" "col3"
rownames(df2) <- c("row1", "row2", "row3", "row4") # assign row names
rownames(df2) # they were "1", "2", "3", "4"
## [1] "row1" "row2" "row3" "row4"
There are many ways you can get values out of a column:
dataframe_name$column_name
df$course
## [1] "MIS4730" "MIS4710" "MIS4950" "MIS1234"
df$n_students
## [1] 20 10 40 30
df
## course n_students data_analytics_minor ## 1 MIS4730 20 TRUE ## 2 MIS4710 10 TRUE ## 3 MIS4950 40 TRUE ## 4 MIS1234 30 FALSE
All of a row
df[2,] # row 2
## course n_students data_analytics_minor ## 2 MIS4710 10 TRUE
df
## course n_students data_analytics_minor ## 1 MIS4730 20 TRUE ## 2 MIS4710 10 TRUE ## 3 MIS4950 40 TRUE ## 4 MIS1234 30 FALSE
Multiple rows
df[c(1,3),] # rows 1 & 3
## course n_students data_analytics_minor ## 1 MIS4730 20 TRUE ## 3 MIS4950 40 TRUE
df[2,1] # row 2, column 1
## [1] "MIS4710"
df[c(3,4),c(1,2)] # rows 3 & 4, columns 1 & 2
## course n_students ## 3 MIS4950 40 ## 4 MIS1234 30
df["2","n_students"] # "2": row name, "n_students": column name
## [1] 10
Rows matching a condition:
df
## course n_students data_analytics_minor ## 1 MIS4730 20 TRUE ## 2 MIS4710 10 TRUE ## 3 MIS4950 40 TRUE ## 4 MIS1234 30 FALSE
# Be careful that it's df$course in the brackets, not just course
df[df$course == 'MIS4730', ]
## course n_students data_analytics_minor ## 1 MIS4730 20 TRUE
df
## course n_students data_analytics_minor ## 1 MIS4730 20 TRUE ## 2 MIS4710 10 TRUE ## 3 MIS4950 40 TRUE ## 4 MIS1234 30 FALSE
df$course == 'MIS4730'
## [1] TRUE FALSE FALSE FALSE
df[df$course == 'MIS4730', ] # is interpreted by R as the following
df[c(TRUE, FALSE, FALSE, FALSE), ]
The result is that you are getting the TRUE row(s).
This lab assignment is the first part of Assignment 1. It involves 2 tasks (see the next 2 slides). Once you finish the following tasks, please put everything in one single R file with the file name assignment1-Part1.R (.R is the file extension) and upload it to Canvas (Lab Assignment 1-Part 1).
You will lose 50% of the points if you use a different file name or put your code in multiple files.
In addition, lab assignments will be graded based on:
## $name ## [1] "Alex" "Bob" "Claire" "Denise" ## ## $female ## [1] FALSE FALSE TRUE TRUE ## ## $age ## [1] 20 25 30 35
## [1] "Bob"
## name female age ## row_1 Alex FALSE 20 ## row_2 Bob FALSE 25 ## row_3 Claire TRUE 30 ## row_4 Denise TRUE 35