# Calculate 3 + 4
3 + 4
## [1] 7
# Calculate 6 + 12
6+12
## [1] 18
Note how the # symbol is used to add comments on the R code. Modulo returns the remainder of the division of the number to the left by the number on its right
# An addition
5 + 5
## [1] 10
# A subtraction
5 - 5
## [1] 0
# A multiplication
3 * 5
## [1] 15
# A division
(5 + 5) / 2
## [1] 5
# Exponentiation
2^5
## [1] 32
# Modulo
28 %% 6
## [1] 4
# Assign the value 42 to x
x <- 42
# Print out the value of the variable x
x
## [1] 42
# Assign the value 5 to the variable my_apples
my_apples <-5
# Print out the value of the variable my_apples
my_apples
## [1] 5
# Assign a value to the variables my_apples and my_oranges
my_apples <- 5
my_oranges <- 6
# Add these two variables together
my_apples + my_oranges
## [1] 11
# Create the variable my_fruit
my_fruit <- my_apples + my_oranges
# Assign a value to the variable my_apples
my_apples <- 5
# Fix the assignment of my_oranges
my_oranges <- 6
# Create the variable my_fruit and print it out
my_fruit <- my_apples + my_oranges
my_fruit
## [1] 11
Note that R is case sensitive!
# Change my_numeric to be 42
my_numeric <- 42
# Change my_character to be "universe"
my_character <- "universe"
# Change my_logical to be FALSE
my_logical <- FALSE
# Declare variables of different types
my_numeric <- 42
my_character <- "universe"
my_logical <- FALSE
# Check class of my_numeric
class(my_numeric)
## [1] "numeric"
# Check class of my_character
class(my_character)
## [1] "character"
# Check class of my_logical
class(my_logical)
## [1] "logical"
# Define the variable vegas
vegas <- "Go!"
Vectors are one-dimension arrays that can hold numeric data, character data, or logical data. In other words, a vector is a simple tool to store data. In R, you create a vector with the combine function c().
numeric_vector <- c(1, 10, 49)
character_vector <- c("a", "b", "c")
# Complete the code for boolean_vector
boolean_vector <- c(TRUE,FALSE,TRUE)
# Poker winnings from Monday to Friday
poker_vector <- c(140, -50, 20, -120, 240)
# Roulette winnings from Monday to Friday
roulette_vector <- c(-24, -50, 100, -350, 10)
You can give a name to the elements of a vector with the names() function. Have a look at this example:
some_vector <- c("John Doe", "poker player")
names(some_vector) <- c("Name", "Profession")
# Poker winnings from Monday to Friday
poker_vector <- c(140, -50, 20, -120, 240)
# Roulette winnings from Monday to Friday
roulette_vector <- c(-24, -50, 100, -350, 10)
# Assign days as names of poker_vector
names(poker_vector) <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
# Assign days as names of roulette_vector
names(roulette_vector) <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
In the previous exercises you probably experienced that it is boring and frustrating to type and retype information such as the days of the week. However, when you look at it from a higher perspective, there is a more efficient way to do this, namely, to assign the days of the week vector to a variable!
# Poker winnings from Monday to Friday
poker_vector <- c(140, -50, 20, -120, 240)
# Roulette winnings from Monday to Friday
roulette_vector <- c(-24, -50, 100, -350, 10)
# The variable days_vector
days_vector <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
# Assign the names of the day to roulette_vector and poker_vector
names(poker_vector) <- days_vector
names(roulette_vector) <- days_vector
You want to find out the following type of information:
How much has been your overall profit or loss per day of the week?
Have you lost money over the week in total?
Are you winning/losing money on poker or on roulette?
Instructions
A_vector <- c(1, 2, 3)
B_vector <- c(4, 5, 6)
# Take the sum of A_vector and B_vector
total_vector <- A_vector + B_vector
# Print out total_vector
total_vector
## [1] 5 7 9
# Poker and roulette winnings from Monday to Friday:
poker_vector <- c(140, -50, 20, -120, 240)
roulette_vector <- c(-24, -50, 100, -350, 10)
days_vector <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
names(poker_vector) <- days_vector
names(roulette_vector) <- days_vector
# Assign to total_daily how much you won/lost on each day
total_daily <- poker_vector + roulette_vector
# Poker and roulette winnings from Monday to Friday:
poker_vector <- c(140, -50, 20, -120, 240)
roulette_vector <- c(-24, -50, 100, -350, 10)
days_vector <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
names(poker_vector) <- days_vector
names(roulette_vector) <- days_vector
# Total winnings with poker
total_poker <- sum(poker_vector)
# Total winnings with roulette
total_roulette <- sum (roulette_vector)
# Total winnings overall
total_week <- sum( total_poker + total_roulette)
# Print out total_week
total_week
## [1] -84
# Poker and roulette winnings from Monday to Friday:
poker_vector <- c(140, -50, 20, -120, 240)
roulette_vector <- c(-24, -50, 100, -350, 10)
days_vector <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
names(poker_vector) <- days_vector
names(roulette_vector) <- days_vector
# Calculate total gains for poker and roulette
total_poker <- sum (poker_vector)
total_roulette <- sum (roulette_vector)
# Check if you realized higher total gains in poker than in roulette
total_poker>total_roulette
## [1] TRUE
To select the first element of the vector, you type poker_vector[1]. To select the second element of the vector, you type poker_vector[2], etc. > Notice that the first element in a vector has index 1, not 0 as in many other programming languages.
# Poker and roulette winnings from Monday to Friday:
poker_vector <- c(140, -50, 20, -120, 240)
roulette_vector <- c(-24, -50, 100, -350, 10)
days_vector <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
names(poker_vector) <- days_vector
names(roulette_vector) <- days_vector
# Define a new variable based on a selection
poker_wednesday <- poker_vector[3]
How to selecting multiple elements from a vector; suppose you want to select the first and the fifth day of the week: use the vector c(1, 5) between the square brackets.
# Poker and roulette winnings from Monday to Friday:
poker_vector <- c(140, -50, 20, -120, 240)
roulette_vector <- c(-24, -50, 100, -350, 10)
days_vector <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
names(poker_vector) <- days_vector
names(roulette_vector) <- days_vector
# Define a new variable based on a selection
poker_midweek <- poker_vector[c(2,3,4)]
Another way to find the mid-week results is poker_vector[2:4], which generates a vector with all natural numbers from 2 up to 4.
# Poker and roulette winnings from Monday to Friday:
poker_vector <- c(140, -50, 20, -120, 240)
roulette_vector <- c(-24, -50, 100, -350, 10)
days_vector <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
names(poker_vector) <- days_vector
names(roulette_vector) <- days_vector
# Define a new variable based on a selection
roulette_selection_vector <- roulette_vector[2:5]
Another way to tackle the previous exercise is by using the names of the vector elements (Monday, Tuesday, …) instead of their numeric positions. For example: poker_vector[“Monday”].You can also use the element names to select multiple elements, for example: poker_vector[c(“Monday”,“Tuesday”)]
# Poker and roulette winnings from Monday to Friday:
poker_vector <- c(140, -50, 20, -120, 240)
roulette_vector <- c(-24, -50, 100, -350, 10)
days_vector <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
names(poker_vector) <- days_vector
names(roulette_vector) <- days_vector
# Select poker results for Monday, Tuesday and Wednesday
poker_start <- poker_vector[c("Monday", "Tuesday", "Wednesday")]
# Calculate the average of the elements in poker_start
mean(poker_start)
## [1] 36.66667
The (logical) comparison operators known to R are:
you can use these comparison operators also on vectors. For example:
c(4, 5, 6) > 5
[1] FALSE FALSE TRUE
# Poker and roulette winnings from Monday to Friday:
poker_vector <- c(140, -50, 20, -120, 240)
roulette_vector <- c(-24, -50, 100, -350, 10)
days_vector <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
names(poker_vector) <- days_vector
names(roulette_vector) <- days_vector
# Which days did you make money on poker?
selection_vector <- poker_vector>0
# Print out selection_vector
selection_vector
## Monday Tuesday Wednesday Thursday Friday
## TRUE FALSE TRUE FALSE TRUE
# Poker and roulette winnings from Monday to Friday:
poker_vector <- c(140, -50, 20, -120, 240)
roulette_vector <- c(-24, -50, 100, -350, 10)
days_vector <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
names(poker_vector) <- days_vector
names(roulette_vector) <- days_vector
# Which days did you make money on poker?
selection_vector <- poker_vector > 0
# Select from poker_vector these days
poker_winning_days <- poker_vector[selection_vector]
# Poker and roulette winnings from Monday to Friday:
poker_vector <- c(140, -50, 20, -120, 240)
roulette_vector <- c(-24, -50, 100, -350, 10)
days_vector <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
names(poker_vector) <- days_vector
names(roulette_vector) <- days_vector
# Which days did you make money on roulette?
selection_vector <-roulette_vector>0
# Select from roulette_vector these days
roulette_winning_days <- roulette_vector[selection_vector]
In R, a matrix is a collection of elements of the same data type (numeric, character, or logical) arranged into a fixed number of rows and columns. Since you are only working with rows and columns, a matrix is called two-dimensional.
You can construct a matrix in R with the matrix() function. Consider the following example: matrix(1:9, byrow = TRUE, nrow = 3)
The first argument is the collection of elements that R will arrange into the rows and columns of the matrix. Here, we use 1:9 which is a shortcut for c(1, 2, 3, 4, 5, 6, 7, 8, 9).
The argument byrow indicates that the matrix is filled by the rows. If we want the matrix to be filled by the columns, we just place byrow = FALSE.
The third argument nrow indicates that the matrix should have three rows.
Instructions
matrix(1:9,byrow=TRUE,nrow=3)
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 4 5 6
## [3,] 7 8 9
# Box office Star Wars (in millions!)
new_hope <- c(460.998, 314.4)
empire_strikes <- c(290.475, 247.900)
return_jedi <- c(309.306, 165.8)
# Create box_office
box_office <- c(new_hope, empire_strikes, return_jedi)
# Construct star_wars_matrix
star_wars_matrix <- matrix(data=box_office, byrow=TRUE,nrow=3)
Similar to vectors, you can add names for the rows and the columns of a matrix
rownames(my_matrix) <- row_names_vector
colnames(my_matrix) <- col_names_vector
# Box office Star Wars (in millions!)
new_hope <- c(460.998, 314.4)
empire_strikes <- c(290.475, 247.900)
return_jedi <- c(309.306, 165.8)
# Construct matrix
star_wars_matrix <- matrix(c(new_hope, empire_strikes, return_jedi), nrow = 3, byrow = TRUE)
# Vectors region and titles, used for naming
region <- c("US", "non-US")
titles <- c("A New Hope", "The Empire Strikes Back", "Return of the Jedi")
# Name the columns with region
colnames(star_wars_matrix)<-region
# Name the rows with titles
rownames(star_wars_matrix)<-titles
# Print out star_wars_matrix
star_wars_matrix
## US non-US
## A New Hope 460.998 314.4
## The Empire Strikes Back 290.475 247.9
## Return of the Jedi 309.306 165.8
n R, the function rowSums() conveniently calculates the totals for each row of a matrix. This function creates a new vector:
rowSums(my_matrix)
# Construct star_wars_matrix
box_office <- c(460.998, 314.4, 290.475, 247.900, 309.306, 165.8)
region <- c("US", "non-US")
titles <- c("A New Hope",
"The Empire Strikes Back",
"Return of the Jedi")
star_wars_matrix <- matrix(box_office,
nrow = 3, byrow = TRUE,
dimnames = list(titles, region))
# Calculate worldwide box office figures
worldwide_vector <- rowSums(star_wars_matrix)
You can add a column or multiple columns to a matrix with the cbind() function, which merges matrices and/or vectors together by column. For example:
big_matrix <- cbind(matrix1, matrix2, vector1 ...)
# Construct star_wars_matrix
box_office <- c(460.998, 314.4, 290.475, 247.900, 309.306, 165.8)
region <- c("US", "non-US")
titles <- c("A New Hope",
"The Empire Strikes Back",
"Return of the Jedi")
star_wars_matrix <- matrix(box_office,
nrow = 3, byrow = TRUE,
dimnames = list(titles, region))
# The worldwide box office figures
worldwide_vector <- rowSums(star_wars_matrix)
# Bind the new variable worldwide_vector as a column to star_wars_matrix
all_wars_matrix <- cbind(star_wars_matrix,worldwide_vector)
# Construct star_wars_matrix2
box_office <- c(474.5, 552.5, 310.7, 338.7, 380.3, 468.5)
region <- c("US", "non-US")
titles <- c("The Phantom Menace",
"Attack of the Clones",
"Revenge of the Sith")
star_wars_matrix2 <- matrix(box_office,
nrow = 3, byrow = TRUE,
dimnames = list(titles, region))
Just like every action has a reaction, every cbind() has an rbind(). (We admit, we are pretty bad with metaphors.)
# star_wars_matrix and star_wars_matrix2 are available in your workspace
star_wars_matrix
## US non-US
## A New Hope 460.998 314.4
## The Empire Strikes Back 290.475 247.9
## Return of the Jedi 309.306 165.8
star_wars_matrix2
## US non-US
## The Phantom Menace 474.5 552.5
## Attack of the Clones 310.7 338.7
## Revenge of the Sith 380.3 468.5
# Combine both Star Wars trilogies in one matrix
all_wars_matrix <- rbind(star_wars_matrix,star_wars_matrix2)
Just like cbind() has rbind(), colSums() has rowSums()
# all_wars_matrix is available in your workspace
all_wars_matrix
## US non-US
## A New Hope 460.998 314.4
## The Empire Strikes Back 290.475 247.9
## Return of the Jedi 309.306 165.8
## The Phantom Menace 474.500 552.5
## Attack of the Clones 310.700 338.7
## Revenge of the Sith 380.300 468.5
# Total revenue for US and non-US
total_revenue_vector <- colSums(all_wars_matrix)
# Print out total_revenue_vector
total_revenue_vector
## US non-US
## 2226.279 2087.800
Similar to vectors, you can use the square brackets [ ] to select one or multiple elements from a matrix. Whereas vectors have one dimension, matrices have two dimensions. You should therefore use a comma to separate the rows you want to select from the columns. For example:
If you want to select all elements of a row or a column, no number is needed before or after the comma, respectively:
my_matrix[,1] selects all elements of the first column.
my_matrix[1,] selects all elements of the first row.
Instructions
# all_wars_matrix is available in your workspace
all_wars_matrix
## US non-US
## A New Hope 460.998 314.4
## The Empire Strikes Back 290.475 247.9
## Return of the Jedi 309.306 165.8
## The Phantom Menace 474.500 552.5
## Attack of the Clones 310.700 338.7
## Revenge of the Sith 380.300 468.5
# Select the non-US revenue for all movies
non_us_all <- all_wars_matrix[,2]
# Average non-US revenue
mean(non_us_all)
## [1] 347.9667
# Select the non-US revenue for first two movies
non_us_some <- all_wars_matrix[1:2,2]
# Average non-US revenue for first two movies
mean(non_us_some)
## [1] 281.15
Similar to what you have learned with vectors, the standard operators like +, -, /, *, etc. work in an element-wise way on matrices in R.
For example, 2 * my_matrix multiplies each element of my_matrix by two.
# all_wars_matrix is available in your workspace
all_wars_matrix
## US non-US
## A New Hope 460.998 314.4
## The Empire Strikes Back 290.475 247.9
## Return of the Jedi 309.306 165.8
## The Phantom Menace 474.500 552.5
## Attack of the Clones 310.700 338.7
## Revenge of the Sith 380.300 468.5
# Estimate the visitors
visitors <- all_wars_matrix/5
# Print the estimate to the console
visitors
## US non-US
## A New Hope 92.1996 62.88
## The Empire Strikes Back 58.0950 49.58
## Return of the Jedi 61.8612 33.16
## The Phantom Menace 94.9000 110.50
## Attack of the Clones 62.1400 67.74
## Revenge of the Sith 76.0600 93.70
Just like 2 * my_matrix multiplied every element of my_matrix by two, my_matrix1 * my_matrix2 creates a matrix where each element is the product of the corresponding elements in my_matrix1 and my_matrix2.
# Construct ticket_prices_matrix
box_office <- c(5.0, 5.0,6.0,6.0,7.0,7.0,4.0,4.0,4.5,4.5,4.9,4.9)
region <- c("US", "non-US")
titles <- c("A New Hope",
"The Empire Strikes Back",
"Return of the Jedi",
"The Phantom Menace",
"Attack of the Clones",
"Revenge of the Sith")
ticket_prices_matrix <- matrix(box_office,
nrow = 6, byrow = TRUE,
dimnames = list(titles, region))
ticket_prices_matrix
## US non-US
## A New Hope 5.0 5.0
## The Empire Strikes Back 6.0 6.0
## Return of the Jedi 7.0 7.0
## The Phantom Menace 4.0 4.0
## Attack of the Clones 4.5 4.5
## Revenge of the Sith 4.9 4.9
# all_wars_matrix and ticket_prices_matrix are available in your workspace
all_wars_matrix
## US non-US
## A New Hope 460.998 314.4
## The Empire Strikes Back 290.475 247.9
## Return of the Jedi 309.306 165.8
## The Phantom Menace 474.500 552.5
## Attack of the Clones 310.700 338.7
## Revenge of the Sith 380.300 468.5
ticket_prices_matrix
## US non-US
## A New Hope 5.0 5.0
## The Empire Strikes Back 6.0 6.0
## Return of the Jedi 7.0 7.0
## The Phantom Menace 4.0 4.0
## Attack of the Clones 4.5 4.5
## Revenge of the Sith 4.9 4.9
# Estimated number of visitors
visitors <- all_wars_matrix/ ticket_prices_matrix
# US visitors
us_visitors <- visitors[,1]
# Average number of US visitors
mean(us_visitors)
## [1] 75.01339
The term factor refers to a statistical data type used to store categorical variables.
The difference between a categorical variable and a continuous variable is that a categorical variable can belong to a limited number of categories.
A continuous variable, on the other hand, can correspond to an infinite number of values.
A good example of a categorical variable is sex.
# Assign to the variable theory what this chapter is about!
theory <- "factors"
To create factors in R, you make use of the function factor()
First thing that you have to do is create a vector that contains all the observations that belong to a limited number of categories.
The function factor() will encode the vector as a factor
# Sex vector
sex_vector <- c("Male", "Female", "Female", "Male", "Male")
# Convert sex_vector to a factor
factor_sex_vector <- factor(sex_vector)
# Print out factor_sex_vector
factor_sex_vector
## [1] Male Female Female Male Male
## Levels: Female Male
There are two types of categorical variables: a nominal categorical variable and an ordinal categorical variable.
A nominal variable is a categorical variable without an implied order. For example, think of the categorical variable animals_vector with the categories “Elephant”, “Giraffe”, “Donkey” and “Horse”
In contrast, ordinal variables do have a natural ordering.For example the categorical variable temperature_vector with the categories: “Low”, “Medium” and “High”.
# Animals
animals_vector <- c("Elephant", "Giraffe", "Donkey", "Horse")
factor_animals_vector <- factor(animals_vector)
factor_animals_vector
## [1] Elephant Giraffe Donkey Horse
## Levels: Donkey Elephant Giraffe Horse
# Temperature
temperature_vector <- c("High", "Low", "High","Low", "Medium")
factor_temperature_vector <- factor(temperature_vector, order = TRUE, levels = c("Low", "Medium", "High"))
factor_temperature_vector
## [1] High Low High Low Medium
## Levels: Low < Medium < High
Sometimes you will want to change the names of these levels for clarity or other reasons. R allows you to do this with the function levels():
levels(factor_vector) <- c("name1", "name2",...)
Watch out: the order with which you assign the levels is important. If you don’t specify the levels of the factor when creating the vector, R will automatically assign them alphabetically. To correctly map “F” to “Female” and “M” to “Male”, the levels should be set to c(“Female”, “Male”), in this order.
# Code to build factor_survey_vector
survey_vector <- c("M", "F", "F", "M", "M")
factor_survey_vector <- factor(survey_vector)
# Specify the levels of factor_survey_vector
levels(factor_survey_vector) <- c("Female", "Male")
factor_survey_vector
## [1] Male Female Female Male Male
## Levels: Female Male
Summary () will give you a quick overview of the contents of a variable.
# Build factor_survey_vector with clean levels
survey_vector <- c("M", "F", "F", "M", "M")
factor_survey_vector <- factor(survey_vector)
levels(factor_survey_vector) <- c("Female", "Male")
factor_survey_vector
## [1] Male Female Female Male Male
## Levels: Female Male
# Generate summary for survey_vector
summary(survey_vector)
## Length Class Mode
## 5 character character
# Generate summary for factor_survey_vector
summary(factor_survey_vector)
## Female Male
## 2 3
What happens when you try to compare elements of a factor?
# Build factor_survey_vector with clean levels
survey_vector <- c("M", "F", "F", "M", "M")
factor_survey_vector <- factor(survey_vector)
levels(factor_survey_vector) <- c("Female", "Male")
# Male
male <- factor_survey_vector[1]
# Female
female <- factor_survey_vector[2]
# Battle of the sexes: Male 'larger' than female?
male > female
## Warning in Ops.factor(male, female): '>' not meaningful for factors
## [1] NA
Since “Male” and “Female” are unordered (or nominal) factor levels, R returns a warning message, telling you that the greater than operator is not meaningful.
# Create speed_vector
speed_vector <- c("medium","slow","slow","medium", "fast")
To create an ordered factor, you have to add two additional arguments: ordered and levels.
factor(some_vector,
ordered = TRUE,
levels = c("lev1", "lev2" ...))
By setting the argument ordered to TRUE in the function factor(), you indicate that the factor is ordered. With the argument levels you give the values of the factor in the correct order.
# Create speed_vector
speed_vector <- c("medium", "slow", "slow", "medium", "fast")
# Convert speed_vector to ordered factor vector
factor_speed_vector <- factor(speed_vector,
ordered=TRUE,
levels=c("slow","medium","fast"))
# Print factor_speed_vector
factor_speed_vector
## [1] medium slow slow medium fast
## Levels: slow < medium < fast
summary(factor_speed_vector)
## slow medium fast
## 2 2 1
# Create factor_speed_vector
speed_vector <- c("medium", "slow", "slow", "medium", "fast")
factor_speed_vector <- factor(speed_vector, ordered = TRUE, levels = c("slow", "medium", "fast"))
# Factor value for second data analyst
da2 <- factor_speed_vector[2]
# Factor value for fifth data analyst
da5 <- factor_speed_vector[5]
# Is data analyst 2 faster than data analyst 5?
da2>da5
## [1] FALSE
You may remember from the chapter about matrices that all the elements that you put in a matrix should be of the same type.
You will often find yourself working with datasets that contain different data types instead of only one.
# Print out built-in R data frame
mtcars
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
It is often useful to show only a small part of the entire dataset.
The function head() enables you to show the first observations of a data frame. Similarly, the function tail() prints out the last observations in your dataset.
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
The function str() shows you the structure of your dataset. For a data frame it tells you:
The total number of observations (e.g. 32 car types)
The total number of variables (e.g. 11 car features)
A full list of the variables names (e.g. mpg, cyl … )
The data type of each variable (e.g. num)
The first observations
Instructions
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
You construct a data frame with the data.frame() function. As arguments, you pass the vectors from before: they will become the different columns of your data frame. Because every column has the same length, the vectors you pass should also have the same length. But don’t forget that it is possible (and likely) that they contain different types of data.
# Definition of vectors
name <- c("Mercury", "Venus", "Earth",
"Mars", "Jupiter", "Saturn",
"Uranus", "Neptune")
type <- c("Terrestrial planet",
"Terrestrial planet",
"Terrestrial planet",
"Terrestrial planet", "Gas giant",
"Gas giant", "Gas giant", "Gas giant")
diameter <- c(0.382, 0.949, 1, 0.532,
11.209, 9.449, 4.007, 3.883)
rotation <- c(58.64, -243.02, 1, 1.03,
0.41, 0.43, -0.72, 0.67)
rings <- c(FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE)
# Create a data frame from the vectors
planets_df <- data.frame(name, type, diameter, rotation, rings)
str(planets_df)
## 'data.frame': 8 obs. of 5 variables:
## $ name : chr "Mercury" "Venus" "Earth" "Mars" ...
## $ type : chr "Terrestrial planet" "Terrestrial planet" "Terrestrial planet" "Terrestrial planet" ...
## $ diameter: num 0.382 0.949 1 0.532 11.209 ...
## $ rotation: num 58.64 -243.02 1 1.03 0.41 ...
## $ rings : logi FALSE FALSE FALSE FALSE TRUE TRUE ...
Similar to vectors and matrices, you select elements from a data frame with the help of square brackets [ ]. By using a comma, you can indicate what to select from the rows and the columns respectively. For example:
Sometimes you want to select all elements of a row or column. For example, my_df[1, ] selects all elements of the first row.
# The planets_df data frame from the previous exercise is pre-loaded
# Print out diameter of Mercury (row 1, column 3)
planets_df[1,3]
## [1] 0.382
# Print out data for Mars (entire fourth row)
planets_df[4,]
## name type diameter rotation rings
## 4 Mars Terrestrial planet 0.532 1.03 FALSE
Instead of using numerics to select elements of a data frame, you can also use the variable names to select columns of a data frame.
# The planets_df data frame from the previous exercise is pre-loaded
# Select first 5 values of diameter column
planets_df[1:5,"diameter"]
## [1] 0.382 0.949 1.000 0.532 11.209
If you want to select all elements of the variable there is a short-cut. If your columns have names, you can use the $ sign.
# planets_df is pre-loaded in your workspace
# Select the rings variable from planets_df
rings_vector <- planets_df$rings
# Print out rings_vector
rings_vector
## [1] FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE
# planets_df and rings_vector are pre-loaded in your workspace
# Adapt the code to select all columns for planets with rings
planets_df[rings_vector,]
## name type diameter rotation rings
## 5 Jupiter Gas giant 11.209 0.41 TRUE
## 6 Saturn Gas giant 9.449 0.43 TRUE
## 7 Uranus Gas giant 4.007 -0.72 TRUE
## 8 Neptune Gas giant 3.883 0.67 TRUE
You should see the subset() function as a short-cut to do exactly the same as what you did in the previous exercises.
subset(my_df, subset = some_condition)
The first argument of subset() specifies the dataset for which you want a subset. By adding the second argument, you give R the necessary information and conditions to select the correct subset.
# planets_df is pre-loaded in your workspace
# Select planets with diameter < 1
subset(planets_df,subset=diameter<1)
## name type diameter rotation rings
## 1 Mercury Terrestrial planet 0.382 58.64 FALSE
## 2 Venus Terrestrial planet 0.949 -243.02 FALSE
## 4 Mars Terrestrial planet 0.532 1.03 FALSE
Order() is a function that gives you the ranked position of each element when it is applied on a variable.
# planets_df is pre-loaded in your workspace
# Use order() to create positions
positions <- order(planets_df$diameter)
# Use positions to sort planets_df
planets_df[positions,]
## name type diameter rotation rings
## 1 Mercury Terrestrial planet 0.382 58.64 FALSE
## 4 Mars Terrestrial planet 0.532 1.03 FALSE
## 2 Venus Terrestrial planet 0.949 -243.02 FALSE
## 3 Earth Terrestrial planet 1.000 1.00 FALSE
## 8 Neptune Gas giant 3.883 0.67 TRUE
## 7 Uranus Gas giant 4.007 -0.72 TRUE
## 6 Saturn Gas giant 9.449 0.43 TRUE
## 5 Jupiter Gas giant 11.209 0.41 TRUE
# Just submit the answer
A list in R is similar to your to-do list at work or school: the different items on that list most likely differ in length, characteristic, and type of activity that has to be done.
A list in R allows you to gather a variety of objects under one name (that is, the name of the list) in an ordered way. These objects can be matrices, vectors, data frames, even other lists, etc. It is not even required that these objects are related to each other in any way.
You can store practically any piece of information in it
# Just submit the answer to start the first exercise on lists.
To construct a list you use the function list():
my_list <- list(comp1, comp2 ...)
# Vector with numerics from 1 up to 10
my_vector <- 1:10
# Matrix with numerics from 1 up to 9
my_matrix <- matrix(1:9, ncol = 3)
# First 10 elements of the built-in data frame mtcars
my_df <- mtcars[1:10,]
# Construct list with these different elements:
my_list <- list(my_vector, my_matrix, my_df)
my_list <- list(name1 = your_comp1,
name2 = your_comp2)
This creates a list with components that are named name1, name2, and so on. If you want to name your lists after you’ve created them, you can use the names() function as you did with vectors. The following commands are fully equivalent to the assignment above:
my_list <- list(your_comp1, your_comp2)
names(my_list) <- c("name1", "name2")
# Vector with numerics from 1 up to 10
my_vector <- 1:10
# Matrix with numerics from 1 up to 9
my_matrix <- matrix(1:9, ncol = 3)
# First 10 elements of the built-in data frame mtcars
my_df <- mtcars[1:10,]
# Adapt list() call to give the components names
my_list <- list(vec=my_vector, mat=my_matrix, df=my_df)
# Print out my_list
my_list
## $vec
## [1] 1 2 3 4 5 6 7 8 9 10
##
## $mat
## [,1] [,2] [,3]
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
##
## $df
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
Do not forget to name the list components accordingly (names are moviename, actors and reviews).
# Elements for shining_list
mov<-"The Shining"
act<-c("Jack Nicholson","Shelley Duvall","Danny Lloyd","Scatman Crothers","Barry Nelson")
scores<-c(4.5,4.0,5.0)
sources<-c("IMDb1","IMDb2","IMDb3")
comments<-c("Best Horror Film I Have Ever Seen","A truly brilliant and scary film from Stanley Kubrick","A masterpiece of psychological horror")
rev<-data.frame(scores,sources,comments)
# The variables mov, act and rev are available
# Finish the code to build shining_list
shining_list <- list(moviename = mov,actors = act, reviews = rev)
One way to select a component is using the numbered position of that component. For example, to “grab” the first component of shining_list you type
shining_list[[1]]
Important to remember: to select elements from vectors, you use single square brackets: [ ]. Don’t mix them up!
You can also refer to the names of the components, with [[ ]] or with the $ sign. Both will select the data frame representing the reviews:
shining_list[["reviews"]]
shining_list$reviews
Besides selecting components, you often need to select specific elements out of these components. For example, with shining_list[[2]][1] you select from the second component, actors (shining_list[[2]]), the first element ([1]).
# shining_list is already pre-loaded in the workspace
# Print out the vector representing the actors
shining_list[["actors"]]
## [1] "Jack Nicholson" "Shelley Duvall" "Danny Lloyd" "Scatman Crothers"
## [5] "Barry Nelson"
# Print the second element of the vector representing the actors
shining_list[[2]][2]
## [1] "Shelley Duvall"
# Use the table from the exercise to define the comments and scores vectors
scores <- c(4.6, 5, 4.8, 5, 4.2)
comments <- c("I would watch it again", "Amazing!", "I liked it", "One of the best movies", "Fascinating plot")
# Save the average of the scores vector as avg_review
avg_review<-mean(scores)
# Combine scores and comments into the reviews_df data frame
reviews_df<-data.frame(scores,comments)
# Create and print out a list, called departed_list
departed_list<-list(mov,act,reviews_df,avg_review)
departed_list
## [[1]]
## [1] "The Shining"
##
## [[2]]
## [1] "Jack Nicholson" "Shelley Duvall" "Danny Lloyd" "Scatman Crothers"
## [5] "Barry Nelson"
##
## [[3]]
## scores comments
## 1 4.6 I would watch it again
## 2 5.0 Amazing!
## 3 4.8 I liked it
## 4 5.0 One of the best movies
## 5 4.2 Fascinating plot
##
## [[4]]
## [1] 4.72