I. Introduction
II. Data preparation: collection and loading
Next, you will need to install several packages which are required for collecting near real-time tweets and load them into the workspace.
Module 1: R Basics, Functions, and Data Types
1.1 Motivation
1.2 R Basics
log(1)## [1] 0
exp(1)## [1] 2.718282
log(exp(1))## [1] 1
# EX 2: Variable names
# Load package and data
library(dslabs)
data(murders)
# Use the function names to extract the variable names
names(murders)## [1] "state" "abb" "region" "population" "total"
# EX 3: Examining Variables
# To access the population variable from the murders dataset use this code:
p <- murders$population
# To determine the class of object `p` we use this code:
class(p)## [1] "numeric"
# Use the accessor to extract state abbreviations and assign it to a
a <- murders$abb
# Determine the class of a
class(a)## [1] "character"
# EX 4: Multiple ways to access variables
# We extract the population like this:
p <- murders$population
# This is how we do the same with the square brackets:
o <- murders[["population"]]
# We can confirm these two are the same
identical(o, p)## [1] TRUE
# Use square brackets to extract `abb` from `murders` and assign it to b
b <- murders[["abb"]]
# Check if `a` and `b` are identical
identical(a,b)## [1] TRUE
1.3 Data Types
class(2)## [1] "numeric"
class("programming")## [1] "character"
class(ls)## [1] "function"
class(murders)## [1] "data.frame"
class(murders$state)## [1] "character"
class(murders$region)## [1] "factor"
# structure
str(murders)## 'data.frame': 51 obs. of 5 variables:
## $ state : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ abb : chr "AL" "AK" "AZ" "AR" ...
## $ region : Factor w/ 4 levels "Northeast","South",..: 2 4 4 2 4 4 1 2 2 2 ...
## $ population: num 4779736 710231 6392017 2915918 37253956 ...
## $ total : num 135 19 232 93 1257 ...
names(murders)## [1] "state" "abb" "region" "population" "total"
head(murders)## state abb region population total
## 1 Alabama AL South 4779736 135
## 2 Alaska AK West 710231 19
## 3 Arizona AZ West 6392017 232
## 4 Arkansas AR South 2915918 93
## 5 California CA West 37253956 1257
## 6 Colorado CO West 5029196 65
# EX 2: Variable names
# Load package and data
library(dslabs)
data(murders)
# Use the function names to extract the variable names
names(murders)## [1] "state" "abb" "region" "population" "total"
# EX 5: Factors
# We can see the class of the region variable using class
class(murders$region)## [1] "factor"
# Determine the number of regions included in this variable
length(levels(murders$region))## [1] 4
# EX 6: Tables
# Here is an example of what the table function does
x <- c("a", "a", "b", "b", "b", "c")
table(x)## x
## a b c
## 2 3 1
# Write one line of code to show the number of states per region
table(murders$region)##
## Northeast South North Central West
## 9 17 12 13
Module 2: Vectors, Sorting
2.1 Vectors
c stands for concatenate
codes <- c(italy=380, canada=124, egypt=818)
codes## italy canada egypt
## 380 124 818
Use square brackets [ ] to access an element of a vector
codes[2]## canada
## 124
codes[1:2]## italy canada
## 380 124
codes["canada"]## canada
## 124
# To select several elements by name, pass ONE character vector inside the brackets;
# two comma-separated subscripts (codes["egypt","canada"]) are not valid for a vector.
# codes[c("egypt", "canada")]
# Start the coercion example: x is an integer sequence 1..5
x <- 1:5
x## [1] 1 2 3 4 5
y <- as.character(x)
y## [1] "1" "2" "3" "4" "5"
z <- as.numeric(y)
z## [1] 1 2 3 4 5
x <- c("1", "b","3")
x## [1] "1" "b" "3"
y <- as.numeric(x)## Warning: NAs introduced by coercion
y## [1] 1 NA 3
# EX 1: Numeric Vectors
# Here is an example creating a numeric vector named cost
cost <- c(50, 75, 90, 100, 150)
# Create a numeric vector to store the temperatures listed in the instructions into a vector named temp
# Make sure to follow the same order in the instructions
# First attempt: mixing city names with numbers makes c() coerce every element
# to character, so this is NOT a numeric vector (note the quoted "35", "88", ...
# in the printout below)
temp <- c("Beijing", 35, "Lagos", 88, "Paris", 42, "Rio de Janeiro", 84, "San Juan", 81, "Toronto", 30)
temp## [1] "Beijing" "35" "Lagos" "88"
## [5] "Paris" "42" "Rio de Janeiro" "84"
## [9] "San Juan" "81" "Toronto" "30"
# Corrected version: temperatures only, in the same city order
temp <- c(35, 88, 42, 84, 81, 30)
# EX 2: Character vectors
# here is an example of how to create a character vector
food <- c("pizza", "burgers", "salads", "cheese", "pasta")
# Create a character vector called city to store the city names
# Make sure to follow the same order as in the instructions
city <- c("Beijing", "Lagos", "Paris", "Rio de Janeiro", "San Juan","Toronto")
city## [1] "Beijing" "Lagos" "Paris" "Rio de Janeiro"
## [5] "San Juan" "Toronto"
# EX 3: Connecting Numeric and Character Vectors
# Associate the cost values with its corresponding food item
cost <- c(50, 75, 90, 100, 150)
food <- c("pizza", "burgers", "salads", "cheese", "pasta")
names(cost) <- food
# You already wrote this code
temp <- c(35, 88, 42, 84, 81, 30)
city <- c("Beijing", "Lagos", "Paris", "Rio de Janeiro", "San Juan", "Toronto")
# Associate the temperature values with its corresponding city
names(temp) <- city
temp## Beijing Lagos Paris Rio de Janeiro San Juan
## 35 88 42 84 81
## Toronto
## 30
# EX 4: Subsetting vectors
# cost of the last 3 items in our food list:
cost[3:5]## salads cheese pasta
## 90 100 150
# temperatures of the first three cities in the list
# (R vectors are 1-indexed; a 0 index is silently dropped, so the range starts at 1):
temp[1:3]## Beijing Lagos Paris
## 35 88 42
temp[c(1,2,3)]## Beijing Lagos Paris
## 35 88 42
# EX 5: Subsetting vectors continued...
# Access the cost of pizza and pasta from our food list
cost[c(1,5)]## pizza pasta
## 50 150
# Define temp
temp <- c(35, 88, 42, 84, 81, 30)
city <- c("Beijing", "Lagos", "Paris", "Rio de Janeiro", "San Juan", "Toronto")
names(temp) <- city
# Access the temperatures of Paris and San Juan
temp[c(3,5)]## Paris San Juan
## 42 81
# EX 6: Sequences
# Create a vector m of integers that starts at 32 and ends at 99.
m <- 32:99
# Determine the length of object m.
length(m)## [1] 68
# Create a vector x of integers that starts 12 and ends at 73.
x <- 12:73
# Determine the length of object x.
length(x)## [1] 62
# EX 7: Sequences continued...
# Create a vector with the multiples of 7, smaller than 50.
seq(7, 49, 7) ## [1] 7 14 21 28 35 42 49
# Create a vector containing all the positive odd numbers smaller than 100.
# The numbers should be in ascending order
seq(1, 99, 2)## [1] 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 33 35 37 39 41 43 45
## [24] 47 49 51 53 55 57 59 61 63 65 67 69 71 73 75 77 79 81 83 85 87 89 91
## [47] 93 95 97 99
# EX 8: Sequences and length
# We can create a vector with the multiples of 7, smaller than 50 like this
seq(7, 49, 7) ## [1] 7 14 21 28 35 42 49
# But note that the second argument does not need to be last number.
# It simply determines the maximum value permitted.
# so the following line of code produces the same vector as seq(7, 49, 7)
seq(7, 50, 7)## [1] 7 14 21 28 35 42 49
# Create a sequence of numbers from 6 to 55, with 4/7 increments and determine its length
length(seq(6, 55, 4/7))## [1] 86
# EX 9: Sequences of certain length
# Store the sequence in the object a
a <- seq(1, 10, length.out = 100)
# Determine the class of a
class(a)## [1] "numeric"
# EX 10: Integers
# Store the sequence in the object a
a <- seq(1, 10)
# Determine the class of a
class(a)## [1] "integer"
# EX 11: Integers and Numerics
# Check the class of 1, assigned to the object a
class(1)## [1] "numeric"
# Confirm the class of 1L is integer
class(1L)## [1] "integer"
# EX 12: Coercion
# Define the vector x
x <- c(1, 3, 5,"a")
# Note that the x is character vector
class(x)## [1] "character"
# Typecast the vector to get an integer vector
# You will get a warning but that is ok
x <- as.integer(x)## Warning: NAs introduced by coercion
2.2 Sorting
# how many murders
sort(murders$total)## [1] 2 4 5 5 7 8 11 12 12 16 19 21 22 27
## [15] 32 36 38 53 63 65 67 84 93 93 97 97 99 111
## [29] 116 118 120 135 142 207 219 232 246 250 286 293 310 321
## [43] 351 364 376 413 457 517 669 805 1257
x <- c(31,4,15,92,65)
sort(x)## [1] 4 15 31 65 92
index <- order(x)
index## [1] 2 3 1 5 4
# first we order the total murders and save it to index
index <- order(murders$total)
# then we use index to look up state ordered by murdercount from low to high
murders$abb[index]## [1] "VT" "ND" "NH" "WY" "HI" "SD" "ME" "ID" "MT" "RI" "AK" "IA" "UT" "WV"
## [15] "NE" "OR" "DE" "MN" "KS" "CO" "NM" "NV" "AR" "WA" "CT" "WI" "DC" "OK"
## [29] "KY" "MA" "MS" "AL" "IN" "SC" "TN" "AZ" "NJ" "VA" "NC" "MD" "OH" "MO"
## [43] "LA" "IL" "GA" "MI" "PA" "NY" "FL" "TX" "CA"
# which is the max murder number
max(murders$total)## [1] 1257
# use which.max to look up the state
i_max <- which.max(murders$total)
i_max## [1] 5
murders$state[i_max]## [1] "California"
# which is the min murder number
min(murders$total)## [1] 2
# use which.min to look up the state
i_min <- which.min(murders$total)
i_min## [1] 46
murders$state[i_min]## [1] "Vermont"
# ranking
rank(x)## [1] 3 1 2 5 4
# EX 1: sort
# Access the `state` variable and store it in an object
states <- murders$state
# Sort the object alphabetically and redefine the object
states <- sort(states)
# Report the first alphabetical value
states[1]## [1] "Alabama"
# Access population values from the dataset and store it in pop
pop <- murders$population
# Sort the object and save it in the same object
pop <- sort(pop)
# Report the smallest population size
pop[1]## [1] 563626
# EX 2: order
# Access population from the dataset and store it in pop
pop <- murders$population
# Use the command order, to order pop and store in object o
o <- order(pop)
# Find the index number of the entry with the smallest population size
o[1]## [1] 51
# EX 3: New Codes
# Find the smallest value for variable total
which.min(murders$total)## [1] 46
# Find the smallest value for population
which.min(murders$population)## [1] 51
# EX 4:Using the output of order
# Define the variable i to be the index of the smallest state
i <- which.min(murders$population)
# Define variable states to hold the states
states <- murders$state
# Use the index you just defined to find the state with the smallest population
states[i]## [1] "Wyoming"
# EX 5: Ranks
# Store temperatures in an object
temp <- c(35, 88, 42, 84, 81, 30)
# Store city names in an object
city <- c("Beijing", "Lagos", "Paris", "Rio de Janeiro", "San Juan", "Toronto")
# Create data frame with city names and temperature
city_temps <- data.frame(name = city, temperature = temp)
# Define a variable states to be the state names
states <- murders$state
# Define a variable ranks to determine the population size ranks
ranks <- rank(murders$population)
# Create a data frame my_df with the state name and its rank
my_df <- data.frame(states = states, ranks = ranks)
my_df## states ranks
## 1 Alabama 29
## 2 Alaska 5
## 3 Arizona 36
## 4 Arkansas 20
## 5 California 51
## 6 Colorado 30
## 7 Connecticut 23
## 8 Delaware 7
## 9 District of Columbia 2
## 10 Florida 49
## 11 Georgia 44
## 12 Hawaii 12
## 13 Idaho 13
## 14 Illinois 47
## 15 Indiana 37
## 16 Iowa 22
## 17 Kansas 19
## 18 Kentucky 26
## 19 Louisiana 27
## 20 Maine 11
## 21 Maryland 33
## 22 Massachusetts 38
## 23 Michigan 43
## 24 Minnesota 31
## 25 Mississippi 21
## 26 Missouri 34
## 27 Montana 8
## 28 Nebraska 14
## 29 Nevada 17
## 30 New Hampshire 10
## 31 New Jersey 41
## 32 New Mexico 16
## 33 New York 48
## 34 North Carolina 42
## 35 North Dakota 4
## 36 Ohio 45
## 37 Oklahoma 24
## 38 Oregon 25
## 39 Pennsylvania 46
## 40 Rhode Island 9
## 41 South Carolina 28
## 42 South Dakota 6
## 43 Tennessee 35
## 44 Texas 50
## 45 Utah 18
## 46 Vermont 3
## 47 Virginia 40
## 48 Washington 39
## 49 West Virginia 15
## 50 Wisconsin 32
## 51 Wyoming 1
# EX 6: Data Frames, Ranks and Orders
# Define a variable states to be the state names from the murders data frame
states <- murders$state
# Define a variable ranks to determine the population size ranks
ranks <- rank(murders$population)
# Define a variable ind to store the indexes needed to order the population values
ind <- order(murders$population)
# Create a data frame my_df with the state name and its rank and ordered from least populous to most
my_df <- data.frame(states = states[ind], ranks = ranks[ind])
my_df## states ranks
## 1 Wyoming 1
## 2 District of Columbia 2
## 3 Vermont 3
## 4 North Dakota 4
## 5 Alaska 5
## 6 South Dakota 6
## 7 Delaware 7
## 8 Montana 8
## 9 Rhode Island 9
## 10 New Hampshire 10
## 11 Maine 11
## 12 Hawaii 12
## 13 Idaho 13
## 14 Nebraska 14
## 15 West Virginia 15
## 16 New Mexico 16
## 17 Nevada 17
## 18 Utah 18
## 19 Kansas 19
## 20 Arkansas 20
## 21 Mississippi 21
## 22 Iowa 22
## 23 Connecticut 23
## 24 Oklahoma 24
## 25 Oregon 25
## 26 Kentucky 26
## 27 Louisiana 27
## 28 South Carolina 28
## 29 Alabama 29
## 30 Colorado 30
## 31 Minnesota 31
## 32 Wisconsin 32
## 33 Maryland 33
## 34 Missouri 34
## 35 Tennessee 35
## 36 Arizona 36
## 37 Indiana 37
## 38 Massachusetts 38
## 39 Washington 39
## 40 Virginia 40
## 41 New Jersey 41
## 42 North Carolina 42
## 43 Michigan 43
## 44 Georgia 44
## 45 Ohio 45
## 46 Pennsylvania 46
## 47 Illinois 47
## 48 New York 48
## 49 Florida 49
## 50 Texas 50
## 51 California 51
# EX 7: NA
# Using new dataset
library(dslabs)
data(na_example)
# Checking the structure
str(na_example)## int [1:1000] 2 1 3 2 1 3 1 4 3 2 ...
# Find out the mean of the entire dataset
mean(na_example)## [1] NA
# Use is.na to create a logical index ind that tells which entries are NA
ind <- is.na(na_example)
# Determine how many NA ind has using the sum function
sum(ind)## [1] 145
# EX 8: Removing NAs
# Note what we can do with the ! operator
x <- c(1, 2, 3)
ind <- c(FALSE, TRUE, FALSE)
x[!ind]## [1] 1 3
# Create the ind vector
library(dslabs)
data(na_example)
ind <- is.na(na_example)
# We saw that this gives an NA
mean(na_example)## [1] NA
# Compute the average, for entries of na_example that are not NA
mean(na_example[!ind])## [1] 2.301754
2.3 Vector Arithmetic
# which state is the biggest:
murders$state[which.max(murders$population)]## [1] "California"
# How many people:
max(murders$population)## [1] 37253956
Example of elementwise operations on vectors
# heights in inches (multiplying by 2.54 converts them to centimetres)
heights <- c(69,62,66,70,70,73,67,73,67,70)
heights * 2.54## [1] 175.26 157.48 167.64 177.80 177.80 185.42 170.18 185.42 170.18 177.80
murder_rate <- murders$total/murders$population*100000
murder_rate## [1] 2.8244238 2.6751860 3.6295273 3.1893901 3.3741383 1.2924531
## [7] 2.7139722 4.2319369 16.4527532 3.3980688 3.7903226 0.5145920
## [13] 0.7655102 2.8369608 2.1900730 0.6893484 2.2081106 2.6732010
## [19] 7.7425810 0.8280881 5.0748655 1.8021791 4.1786225 0.9992600
## [25] 4.0440846 5.3598917 1.2128379 1.7521372 3.1104763 0.3798036
## [31] 2.7980319 3.2537239 2.6679599 2.9993237 0.5947151 2.6871225
## [37] 2.9589340 0.9396843 3.5977513 1.5200933 4.4753235 0.9825837
## [43] 3.4509357 3.2013603 0.7959810 0.3196211 3.1246001 1.3829942
## [49] 1.4571013 1.7056487 0.8871131
murders$state[order(murder_rate,decreasing=TRUE)]## [1] "District of Columbia" "Louisiana" "Missouri"
## [4] "Maryland" "South Carolina" "Delaware"
## [7] "Michigan" "Mississippi" "Georgia"
## [10] "Arizona" "Pennsylvania" "Tennessee"
## [13] "Florida" "California" "New Mexico"
## [16] "Texas" "Arkansas" "Virginia"
## [19] "Nevada" "North Carolina" "Oklahoma"
## [22] "Illinois" "Alabama" "New Jersey"
## [25] "Connecticut" "Ohio" "Alaska"
## [28] "Kentucky" "New York" "Kansas"
## [31] "Indiana" "Massachusetts" "Nebraska"
## [34] "Wisconsin" "Rhode Island" "West Virginia"
## [37] "Washington" "Colorado" "Montana"
## [40] "Minnesota" "South Dakota" "Oregon"
## [43] "Wyoming" "Maine" "Utah"
## [46] "Idaho" "Iowa" "North Dakota"
## [49] "Hawaii" "New Hampshire" "Vermont"
# EX 1: Vectorized operations
# Assign city names to `city`
city <- c("Beijing", "Lagos", "Paris", "Rio de Janeiro", "San Juan", "Toronto")
# Store temperature values in `temp`
temp <- c(35, 88, 42, 84, 81, 30)
# Convert temperature into Celsius and overwrite the original values of 'temp' with these Celsius values
temp <- (temp-32) * 5/9
# Create a data frame `city_temps`
city_temps <- data.frame(name = city, temperature = temp)
city_temps## name temperature
## 1 Beijing 1.666667
## 2 Lagos 31.111111
## 3 Paris 5.555556
## 4 Rio de Janeiro 28.888889
## 5 San Juan 27.222222
## 6 Toronto -1.111111
# EX 2: Vectorized operations continued...
# Define an object `x` with the numbers 1 through 100
x <- seq(1, 100)
# Sum the equation
sum(1/x^2) ## [1] 1.634984
# EX 3:Vectorized operation continued...
# Load the data
library(dslabs)
data(murders)
# Store the per 100,000 murder rate for each state in murder_rate
murder_rate <- murders$total / murders$population * 100000
# Calculate the average murder rate in the US
sum(murder_rate) / length(murder_rate)## [1] 2.779125
mean(murder_rate)## [1] 2.779125
Module 3: Indexing, Data Wrangling, Plots
3.1 Indexing
murder_rate <- murders$total / murders$population * 100000
# murder rate in Italy is 0.71, find US states with similar or lower rates
# (`<=` so that a rate of exactly 0.71 counts as "similar", consistent with the
# `rate <= 0.71` filters used in the dplyr section of these notes)
index <- murder_rate <= 0.71
index## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [34] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [45] FALSE TRUE FALSE FALSE FALSE FALSE FALSE
# which states
murders$state[index]## [1] "Hawaii" "Iowa" "New Hampshire" "North Dakota"
## [5] "Vermont"
# how many states
sum(index)## [1] 5
# we want to find states with mountains (West) that are also safe (murder_rate <= 1)
west <- murders$region == "West"
safe <- murder_rate <= 1
index <- safe & west
murders$state[index]## [1] "Hawaii" "Idaho" "Oregon" "Utah" "Wyoming"
which
x <- c(FALSE, TRUE, FALSE, TRUE, TRUE, FALSE)
which(x)## [1] 2 4 5
# Ex: we want to look up the murder rate in Massachusetts
index <- which(murders$state =="Massachusetts")
index## [1] 22
# so to get the murder rate, we use the index
murder_rate[index]## [1] 1.802179
# Now we want to match several states
index <- match(c("New York", "Florida", "Texas"), murders$state)
index## [1] 33 10 44
# To confirm we got it right
murder_state <- murders$state
murder_state[index]## [1] "New York" "Florida" "Texas"
# and the murder rate of these states
murder_rate[index]## [1] 2.667960 3.398069 3.201360
x <- c("a", "b", "c", "d", "e")
y <- c("a", "d", "f")
# so we can ask if y is in x
y %in% x## [1] TRUE TRUE FALSE
# check if three states are actually states
c("Boston", "Dakota", "Washington") %in% murders$state## [1] FALSE FALSE TRUE
# EX 1: Logical Vectors
# Store the murder rate per 100,000 for each state, in `murder_rate`
murder_rate <- murders$total / murders$population * 100000
#
# Store the `murder_rate < 1` in `low`
low <- murder_rate < 1# EX 2: which
# Store the murder rate per 100,000 for each state, in murder_rate
murder_rate <- murders$total/murders$population*100000
# Store the murder_rate < 1 in low
low <- murder_rate < 1
# Get the indices of entries that are below 1
which(low)## [1] 12 13 16 20 24 30 35 38 42 45 46 51
# EX 3: Ordering vectors
# Store the murder rate per 100,000 for each state, in murder_rate
murder_rate <- murders$total/murders$population*100000
# Store the murder_rate < 1 in low
low <- murder_rate < 1
# Names of states with murder rates lower than 1
murders$state[low]## [1] "Hawaii" "Idaho" "Iowa" "Maine"
## [5] "Minnesota" "New Hampshire" "North Dakota" "Oregon"
## [9] "South Dakota" "Utah" "Vermont" "Wyoming"
# EX 4: Filtering
# Store the murder rate per 100,000 for each state, in `murder_rate`
murder_rate <- murders$total/murders$population*100000
# Store the `murder_rate < 1` in `low`
low <- murder_rate < 1
# Create a vector ind for states in the Northeast and with murder rates lower than 1.
ind <- (murders$region == "Northeast") & (murder_rate < 1)
# Names of states in `ind`
murders$state[ind]## [1] "Maine" "New Hampshire" "Vermont"
# EX 5: Filtering continued
# Store the murder rate per 100,000 for each state, in murder_rate
murder_rate <- murders$total/murders$population*100000
# Compute average murder rate and store in avg using `mean`
avg <- mean(murder_rate)
# How many states have murder rates below avg ? Check using sum
sum(murder_rate < avg)## [1] 27
# EX 6: Match
# Store the 3 abbreviations in abbs in a vector (remember that they are character vectors and need quotes)
abbs <- c("AK", "MI", "IA")
# Match the abbs to the murders$abb and store in `ind`
ind <- match(abbs , murders$abb)
# Print state names from `ind`
murders$state[ind]## [1] "Alaska" "Michigan" "Iowa"
# EX 7: %in%
# Store the 5 abbreviations in `abbs`. (remember that they are character vectors)
abbs <- c("MA", "ME", "MI", "MO", "MU")
# Use the %in% command to check if the entries of abbs are abbreviations in the murders data frame
abbs %in% murders$abb## [1] TRUE TRUE TRUE TRUE FALSE
# EX 8: Logical operator
# Store the 5 abbreviations in abbs. (remember that they are character vectors)
abbs <- c("MA", "ME", "MI", "MO", "MU")
# Use the `which` command and `!` operator to find out which abbreviation are not actually part of the dataset and store in ind
ind <- which(!abbs %in% murders$abb)
# What are the entries of abbs that are not actual abbreviations
abbs[ind]## [1] "MU"
3.2 Basic Data Wrangling
library(dplyr)
# We want to add the murder rate (per 100,000 people) to the table as a new column
murders <- mutate(murders, rate = total / population * 100000)
murders## state abb region population total rate
## 1 Alabama AL South 4779736 135 2.8244238
## 2 Alaska AK West 710231 19 2.6751860
## 3 Arizona AZ West 6392017 232 3.6295273
## 4 Arkansas AR South 2915918 93 3.1893901
## 5 California CA West 37253956 1257 3.3741383
## 6 Colorado CO West 5029196 65 1.2924531
## 7 Connecticut CT Northeast 3574097 97 2.7139722
## 8 Delaware DE South 897934 38 4.2319369
## 9 District of Columbia DC South 601723 99 16.4527532
## 10 Florida FL South 19687653 669 3.3980688
## 11 Georgia GA South 9920000 376 3.7903226
## 12 Hawaii HI West 1360301 7 0.5145920
## 13 Idaho ID West 1567582 12 0.7655102
## 14 Illinois IL North Central 12830632 364 2.8369608
## 15 Indiana IN North Central 6483802 142 2.1900730
## 16 Iowa IA North Central 3046355 21 0.6893484
## 17 Kansas KS North Central 2853118 63 2.2081106
## 18 Kentucky KY South 4339367 116 2.6732010
## 19 Louisiana LA South 4533372 351 7.7425810
## 20 Maine ME Northeast 1328361 11 0.8280881
## 21 Maryland MD South 5773552 293 5.0748655
## 22 Massachusetts MA Northeast 6547629 118 1.8021791
## 23 Michigan MI North Central 9883640 413 4.1786225
## 24 Minnesota MN North Central 5303925 53 0.9992600
## 25 Mississippi MS South 2967297 120 4.0440846
## 26 Missouri MO North Central 5988927 321 5.3598917
## 27 Montana MT West 989415 12 1.2128379
## 28 Nebraska NE North Central 1826341 32 1.7521372
## 29 Nevada NV West 2700551 84 3.1104763
## 30 New Hampshire NH Northeast 1316470 5 0.3798036
## 31 New Jersey NJ Northeast 8791894 246 2.7980319
## 32 New Mexico NM West 2059179 67 3.2537239
## 33 New York NY Northeast 19378102 517 2.6679599
## 34 North Carolina NC South 9535483 286 2.9993237
## 35 North Dakota ND North Central 672591 4 0.5947151
## 36 Ohio OH North Central 11536504 310 2.6871225
## 37 Oklahoma OK South 3751351 111 2.9589340
## 38 Oregon OR West 3831074 36 0.9396843
## 39 Pennsylvania PA Northeast 12702379 457 3.5977513
## 40 Rhode Island RI Northeast 1052567 16 1.5200933
## 41 South Carolina SC South 4625364 207 4.4753235
## 42 South Dakota SD North Central 814180 8 0.9825837
## 43 Tennessee TN South 6346105 219 3.4509357
## 44 Texas TX South 25145561 805 3.2013603
## 45 Utah UT West 2763885 22 0.7959810
## 46 Vermont VT Northeast 625741 2 0.3196211
## 47 Virginia VA South 8001024 250 3.1246001
## 48 Washington WA West 6724540 93 1.3829942
## 49 West Virginia WV South 1852994 27 1.4571013
## 50 Wisconsin WI North Central 5686986 97 1.7056487
## 51 Wyoming WY West 563626 5 0.8871131
filter(murders, rate <= 0.71)## state abb region population total rate
## 1 Hawaii HI West 1360301 7 0.5145920
## 2 Iowa IA North Central 3046355 21 0.6893484
## 3 New Hampshire NH Northeast 1316470 5 0.3798036
## 4 North Dakota ND North Central 672591 4 0.5947151
## 5 Vermont VT Northeast 625741 2 0.3196211
select specific columns
new_table <- select(murders,state,region,rate)
new_table## state region rate
## 1 Alabama South 2.8244238
## 2 Alaska West 2.6751860
## 3 Arizona West 3.6295273
## 4 Arkansas South 3.1893901
## 5 California West 3.3741383
## 6 Colorado West 1.2924531
## 7 Connecticut Northeast 2.7139722
## 8 Delaware South 4.2319369
## 9 District of Columbia South 16.4527532
## 10 Florida South 3.3980688
## 11 Georgia South 3.7903226
## 12 Hawaii West 0.5145920
## 13 Idaho West 0.7655102
## 14 Illinois North Central 2.8369608
## 15 Indiana North Central 2.1900730
## 16 Iowa North Central 0.6893484
## 17 Kansas North Central 2.2081106
## 18 Kentucky South 2.6732010
## 19 Louisiana South 7.7425810
## 20 Maine Northeast 0.8280881
## 21 Maryland South 5.0748655
## 22 Massachusetts Northeast 1.8021791
## 23 Michigan North Central 4.1786225
## 24 Minnesota North Central 0.9992600
## 25 Mississippi South 4.0440846
## 26 Missouri North Central 5.3598917
## 27 Montana West 1.2128379
## 28 Nebraska North Central 1.7521372
## 29 Nevada West 3.1104763
## 30 New Hampshire Northeast 0.3798036
## 31 New Jersey Northeast 2.7980319
## 32 New Mexico West 3.2537239
## 33 New York Northeast 2.6679599
## 34 North Carolina South 2.9993237
## 35 North Dakota North Central 0.5947151
## 36 Ohio North Central 2.6871225
## 37 Oklahoma South 2.9589340
## 38 Oregon West 0.9396843
## 39 Pennsylvania Northeast 3.5977513
## 40 Rhode Island Northeast 1.5200933
## 41 South Carolina South 4.4753235
## 42 South Dakota North Central 0.9825837
## 43 Tennessee South 3.4509357
## 44 Texas South 3.2013603
## 45 Utah West 0.7959810
## 46 Vermont Northeast 0.3196211
## 47 Virginia South 3.1246001
## 48 Washington West 1.3829942
## 49 West Virginia South 1.4571013
## 50 Wisconsin North Central 1.7056487
## 51 Wyoming West 0.8871131
filter(new_table, rate <= 0.71)## state region rate
## 1 Hawaii West 0.5145920
## 2 Iowa North Central 0.6893484
## 3 New Hampshire Northeast 0.3798036
## 4 North Dakota North Central 0.5947151
## 5 Vermont Northeast 0.3196211
using pipe to put it all together
murders %>% select(state,region,rate) %>% filter(rate <= 0.71)## state region rate
## 1 Hawaii West 0.5145920
## 2 Iowa North Central 0.6893484
## 3 New Hampshire Northeast 0.3798036
## 4 North Dakota North Central 0.5947151
## 5 Vermont Northeast 0.3196211
creating frames
# Build a small data frame of exam grades.
# stringsAsFactors = TRUE is spelled out because the data.frame() default changed
# to FALSE in R 4.0.0; with it, the names column is a factor, which is what the
# class() check below reports.
grades <- data.frame(names = c("John", "juan", "Jean", "Yao"),
                     exam_1 = c(95, 80, 90, 85),
                     exam_2 = c(90, 85, 85, 90),
                     stringsAsFactors = TRUE)
grades## names exam_1 exam_2
## 1 John 95 90
## 2 juan 80 85
## 3 Jean 90 85
## 4 Yao 85 90
class(grades$names)## [1] "factor"
# Same grades table, but with the names column kept as plain character strings
# instead of being converted to a factor
grades <- data.frame(
  names = c("John", "juan", "Jean", "Yao"),
  exam_1 = c(95, 80, 90, 85),
  exam_2 = c(90, 85, 85, 90),
  stringsAsFactors = FALSE
)
class(grades$names)## [1] "character"
# EX 1: dplyr
# Loading data
library(dslabs)
data(murders)
# Loading dplyr
library(dplyr)
# Redefine murders so that it includes column named rate with the per 100,000 murder rates
murders <- mutate(murders, rate=total/population * 100000)# EX 2: mutate
# Note that if you want ranks from highest to lowest you can take the negative and then compute the ranks
x <- c(88, 100, 83, 92, 94)
rank(-x)## [1] 4 1 5 3 2
# Defining rate
rate <- murders$total / murders$population * 100000
# Redefine murders to include a column named rank
# with the ranks of rate from highest to lowest.
# The new column has to be named explicitly: an unnamed expression is stored under
# the literal name `rank(-rate)` (the header that appears in the printout below,
# which was generated from the unnamed version) rather than `rank`.
murders <- mutate(murders, rank = rank(-rate))
murders## state abb region population total rate
## 1 Alabama AL South 4779736 135 2.8244238
## 2 Alaska AK West 710231 19 2.6751860
## 3 Arizona AZ West 6392017 232 3.6295273
## 4 Arkansas AR South 2915918 93 3.1893901
## 5 California CA West 37253956 1257 3.3741383
## 6 Colorado CO West 5029196 65 1.2924531
## 7 Connecticut CT Northeast 3574097 97 2.7139722
## 8 Delaware DE South 897934 38 4.2319369
## 9 District of Columbia DC South 601723 99 16.4527532
## 10 Florida FL South 19687653 669 3.3980688
## 11 Georgia GA South 9920000 376 3.7903226
## 12 Hawaii HI West 1360301 7 0.5145920
## 13 Idaho ID West 1567582 12 0.7655102
## 14 Illinois IL North Central 12830632 364 2.8369608
## 15 Indiana IN North Central 6483802 142 2.1900730
## 16 Iowa IA North Central 3046355 21 0.6893484
## 17 Kansas KS North Central 2853118 63 2.2081106
## 18 Kentucky KY South 4339367 116 2.6732010
## 19 Louisiana LA South 4533372 351 7.7425810
## 20 Maine ME Northeast 1328361 11 0.8280881
## 21 Maryland MD South 5773552 293 5.0748655
## 22 Massachusetts MA Northeast 6547629 118 1.8021791
## 23 Michigan MI North Central 9883640 413 4.1786225
## 24 Minnesota MN North Central 5303925 53 0.9992600
## 25 Mississippi MS South 2967297 120 4.0440846
## 26 Missouri MO North Central 5988927 321 5.3598917
## 27 Montana MT West 989415 12 1.2128379
## 28 Nebraska NE North Central 1826341 32 1.7521372
## 29 Nevada NV West 2700551 84 3.1104763
## 30 New Hampshire NH Northeast 1316470 5 0.3798036
## 31 New Jersey NJ Northeast 8791894 246 2.7980319
## 32 New Mexico NM West 2059179 67 3.2537239
## 33 New York NY Northeast 19378102 517 2.6679599
## 34 North Carolina NC South 9535483 286 2.9993237
## 35 North Dakota ND North Central 672591 4 0.5947151
## 36 Ohio OH North Central 11536504 310 2.6871225
## 37 Oklahoma OK South 3751351 111 2.9589340
## 38 Oregon OR West 3831074 36 0.9396843
## 39 Pennsylvania PA Northeast 12702379 457 3.5977513
## 40 Rhode Island RI Northeast 1052567 16 1.5200933
## 41 South Carolina SC South 4625364 207 4.4753235
## 42 South Dakota SD North Central 814180 8 0.9825837
## 43 Tennessee TN South 6346105 219 3.4509357
## 44 Texas TX South 25145561 805 3.2013603
## 45 Utah UT West 2763885 22 0.7959810
## 46 Vermont VT Northeast 625741 2 0.3196211
## 47 Virginia VA South 8001024 250 3.1246001
## 48 Washington WA West 6724540 93 1.3829942
## 49 West Virginia WV South 1852994 27 1.4571013
## 50 Wisconsin WI North Central 5686986 97 1.7056487
## 51 Wyoming WY West 563626 5 0.8871131
## rank(-rate)
## 1 23
## 2 27
## 3 10
## 4 17
## 5 14
## 6 38
## 7 25
## 8 6
## 9 1
## 10 13
## 11 9
## 12 49
## 13 46
## 14 22
## 15 31
## 16 47
## 17 30
## 18 28
## 19 2
## 20 44
## 21 4
## 22 32
## 23 7
## 24 40
## 25 8
## 26 3
## 27 39
## 28 33
## 29 19
## 30 50
## 31 24
## 32 15
## 33 29
## 34 20
## 35 48
## 36 26
## 37 21
## 38 42
## 39 11
## 40 35
## 41 5
## 42 41
## 43 12
## 44 16
## 45 45
## 46 51
## 47 18
## 48 37
## 49 36
## 50 34
## 51 43
# EX 3: select
# Load dplyr
library(dplyr)
# Use select to only show state names and abbreviations from murders
select(murders, state, abb)## state abb
## 1 Alabama AL
## 2 Alaska AK
## 3 Arizona AZ
## 4 Arkansas AR
## 5 California CA
## 6 Colorado CO
## 7 Connecticut CT
## 8 Delaware DE
## 9 District of Columbia DC
## 10 Florida FL
## 11 Georgia GA
## 12 Hawaii HI
## 13 Idaho ID
## 14 Illinois IL
## 15 Indiana IN
## 16 Iowa IA
## 17 Kansas KS
## 18 Kentucky KY
## 19 Louisiana LA
## 20 Maine ME
## 21 Maryland MD
## 22 Massachusetts MA
## 23 Michigan MI
## 24 Minnesota MN
## 25 Mississippi MS
## 26 Missouri MO
## 27 Montana MT
## 28 Nebraska NE
## 29 Nevada NV
## 30 New Hampshire NH
## 31 New Jersey NJ
## 32 New Mexico NM
## 33 New York NY
## 34 North Carolina NC
## 35 North Dakota ND
## 36 Ohio OH
## 37 Oklahoma OK
## 38 Oregon OR
## 39 Pennsylvania PA
## 40 Rhode Island RI
## 41 South Carolina SC
## 42 South Dakota SD
## 43 Tennessee TN
## 44 Texas TX
## 45 Utah UT
## 46 Vermont VT
## 47 Virginia VA
## 48 Washington WA
## 49 West Virginia WV
## 50 Wisconsin WI
## 51 Wyoming WY
# EX 4: filter
# Add the necessary columns
murders <- mutate(murders, rate = total/population * 100000, rank = rank(-rate))
# Filter to show the top 5 states with the highest murder rates
filter(murders, rank <= 5)## state abb region population total rate
## 1 District of Columbia DC South 601723 99 16.452753
## 2 Louisiana LA South 4533372 351 7.742581
## 3 Maryland MD South 5773552 293 5.074866
## 4 Missouri MO North Central 5988927 321 5.359892
## 5 South Carolina SC South 4625364 207 4.475323
## rank(-rate) rank
## 1 1 1
## 2 2 2
## 3 4 4
## 4 3 3
## 5 5 5
# EX 5: filter with !=
# Use filter to create a new data frame no_south
no_south <- filter(murders, region != "South")
# Use nrow() to calculate the number of rows
nrow(no_south)## [1] 34
# EX 6: filter with %in%
# Create a new data frame called murders_nw with only the states from the northeast and the west
murders_nw <- filter(murders, region %in% c("Northeast", "West"))
# Number of states (rows) in this category
nrow(murders_nw)## [1] 22
# EX 7: filtering by two conditions
# add the rate column
murders <- mutate(murders, rate = total / population * 100000, rank = rank(-rate))
# Create a table, call it `my_states`, that satisfies both the conditions
my_states <- filter(murders, region %in% c("Northeast", "West") & rate < 1)
# Use select to show only the state name, the murder rate and the rank
select(my_states, state, rate, rank)## state rate rank
## 1 Hawaii 0.5145920 49
## 2 Idaho 0.7655102 46
## 3 Maine 0.8280881 44
## 4 New Hampshire 0.3798036 50
## 5 Oregon 0.9396843 42
## 6 Utah 0.7959810 45
## 7 Vermont 0.3196211 51
## 8 Wyoming 0.8871131 43
# EX 8: Using the pipe %>%
## Define the rate and rank column
murders <- mutate(murders, rate = total / population * 100000, rank = rank(-rate))
# show the result and only include the state, rate, and rank columns, all in one line
filter(murders, region %in% c("Northeast", "West") & rate < 1) %>%
select(state, rate, rank)## state rate rank
## 1 Hawaii 0.5145920 49
## 2 Idaho 0.7655102 46
## 3 Maine 0.8280881 44
## 4 New Hampshire 0.3798036 50
## 5 Oregon 0.9396843 42
## 6 Utah 0.7959810 45
## 7 Vermont 0.3196211 51
## 8 Wyoming 0.8871131 43
# EX 9: mutate, filter and select
# Loading the libraries
library(dplyr)
data(murders)
# Create new data frame called my_states (with specifications in the instructions)
# Build my_states in one pipeline: add the rate and rank columns, keep the
# Northeast/West states whose rate is below 1, then keep three columns
my_states <- murders %>%
  mutate(rate = total / population * 100000) %>%
  mutate(rank = rank(-rate)) %>%
  filter(rate < 1, region %in% c("Northeast", "West")) %>%
  select(state, rate, rank)
my_states## state rate rank
## 1 Hawaii 0.5145920 49
## 2 Idaho 0.7655102 46
## 3 Maine 0.8280881 44
## 4 New Hampshire 0.3798036 50
## 5 Oregon 0.9396843 42
## 6 Utah 0.7959810 45
## 7 Vermont 0.3196211 51
## 8 Wyoming 0.8871131 43
3.3 Basic Plots
More populous states have more murders.
# Scale the population to millions so the variable's contents match its name
# (and the plot's x axis reads in millions of people)
population_in_millions <- murders$population / 10^6
# Total number of gun murders per state
total_gun_murders <- murders$total
plot(population_in_millions, total_gun_murders)
To look at the distribution of the data, we use a histogram.
class(murders$rate)## [1] "NULL"
murders <- mutate(murders, rate = total / population * 100000, rank = rank(-rate))
murders## state abb region population total rate rank
## 1 Alabama AL South 4779736 135 2.8244238 23
## 2 Alaska AK West 710231 19 2.6751860 27
## 3 Arizona AZ West 6392017 232 3.6295273 10
## 4 Arkansas AR South 2915918 93 3.1893901 17
## 5 California CA West 37253956 1257 3.3741383 14
## 6 Colorado CO West 5029196 65 1.2924531 38
## 7 Connecticut CT Northeast 3574097 97 2.7139722 25
## 8 Delaware DE South 897934 38 4.2319369 6
## 9 District of Columbia DC South 601723 99 16.4527532 1
## 10 Florida FL South 19687653 669 3.3980688 13
## 11 Georgia GA South 9920000 376 3.7903226 9
## 12 Hawaii HI West 1360301 7 0.5145920 49
## 13 Idaho ID West 1567582 12 0.7655102 46
## 14 Illinois IL North Central 12830632 364 2.8369608 22
## 15 Indiana IN North Central 6483802 142 2.1900730 31
## 16 Iowa IA North Central 3046355 21 0.6893484 47
## 17 Kansas KS North Central 2853118 63 2.2081106 30
## 18 Kentucky KY South 4339367 116 2.6732010 28
## 19 Louisiana LA South 4533372 351 7.7425810 2
## 20 Maine ME Northeast 1328361 11 0.8280881 44
## 21 Maryland MD South 5773552 293 5.0748655 4
## 22 Massachusetts MA Northeast 6547629 118 1.8021791 32
## 23 Michigan MI North Central 9883640 413 4.1786225 7
## 24 Minnesota MN North Central 5303925 53 0.9992600 40
## 25 Mississippi MS South 2967297 120 4.0440846 8
## 26 Missouri MO North Central 5988927 321 5.3598917 3
## 27 Montana MT West 989415 12 1.2128379 39
## 28 Nebraska NE North Central 1826341 32 1.7521372 33
## 29 Nevada NV West 2700551 84 3.1104763 19
## 30 New Hampshire NH Northeast 1316470 5 0.3798036 50
## 31 New Jersey NJ Northeast 8791894 246 2.7980319 24
## 32 New Mexico NM West 2059179 67 3.2537239 15
## 33 New York NY Northeast 19378102 517 2.6679599 29
## 34 North Carolina NC South 9535483 286 2.9993237 20
## 35 North Dakota ND North Central 672591 4 0.5947151 48
## 36 Ohio OH North Central 11536504 310 2.6871225 26
## 37 Oklahoma OK South 3751351 111 2.9589340 21
## 38 Oregon OR West 3831074 36 0.9396843 42
## 39 Pennsylvania PA Northeast 12702379 457 3.5977513 11
## 40 Rhode Island RI Northeast 1052567 16 1.5200933 35
## 41 South Carolina SC South 4625364 207 4.4753235 5
## 42 South Dakota SD North Central 814180 8 0.9825837 41
## 43 Tennessee TN South 6346105 219 3.4509357 12
## 44 Texas TX South 25145561 805 3.2013603 16
## 45 Utah UT West 2763885 22 0.7959810 45
## 46 Vermont VT Northeast 625741 2 0.3196211 51
## 47 Virginia VA South 8001024 250 3.1246001 18
## 48 Washington WA West 6724540 93 1.3829942 37
## 49 West Virginia WV South 1852994 27 1.4571013 36
## 50 Wisconsin WI North Central 5686986 97 1.7056487 34
## 51 Wyoming WY West 563626 5 0.8871131 43
hist(murders$rate)#One extreme value
murders$state[which.max(murders$rate)]## [1] "District of Columbia"
Boxplots are good for comparing different groupings, such as regions.
boxplot(rate~region, data= murders)
# EX 1: Scatterplots
# Load the datasets and define some variables
library(dslabs)
data(murders)
population_in_millions <- murders$population/10^6
total_gun_murders <- murders$total
plot(population_in_millions, total_gun_murders)# Transform population using the log10 transformation and save to object log10_population
log10_population <- log10(murders$population)
# Transform total gun murders using log10 transformation and save to object log10_total_gun_murders
log10_total_gun_murders <- log10(total_gun_murders)
# Create a scatterplot with the log scale transformed population and murders
plot(log10_population, log10_total_gun_murders)# EX 2: Histograms
# Store the population in millions and save to population_in_millions
population_in_millions <- murders$population/10^6
# Create a histogram of this variable
hist(population_in_millions)# EX 3: Boxplots
# Create a boxplot of state populations by region for the murders dataset
boxplot(murders$population~murders$region)
Module 4: Programming Basics
4.1 Introduction to Programming in R
4.2 Conditionals
library(dslabs)
data(murders)
murder_rate <- murders$total/murders$population*100000
ind <- which.min(murder_rate)
if(murder_rate[ind] < 0.5) {
print(murders$state[ind])
} else{
print("No state has murder rate that low")
}## [1] "Vermont"
ind <- which.min(murder_rate)
if(murder_rate[ind] < 0.25) {
print(murders$state[ind])
} else{
print("No state has murder rate that low")
}## [1] "No state has murder rate that low"
a <- c(0,1,2,-4,5)
result <- ifelse(a > 0, 1/a, NA)
result## [1] NA 1.0 0.5 NA 0.2
4.3 Functions
# Arithmetic mean of a vector: the total of its elements divided by
# the number of elements.
avg <- function(x) {
  sum(x) / length(x)
}
x <- c(5,4,3,2)
avg(x)## [1] 3.5
4.4 For Loops
# Sum of the integers 1, 2, ..., n.
#   n: a single non-negative whole number.
# Returns 1 + 2 + ... + n, and 0 when n is 0.
compute_s_n <- function(n) {
  # seq_len(0) is empty, whereas 1:0 counts down to c(1, 0) and would
  # make compute_s_n(0) return 1 instead of 0
  x <- seq_len(n)
  sum(x)
}
compute_s_n(3) # 1+2+3## [1] 6
compute_s_n(100)## [1] 5050
We now want to repeat the process 25 times.
# Number of values of n to evaluate
m <- 25
# Preallocate a numeric vector for the m results; vector(length = m) would
# create a logical vector that is only coerced on the first assignment
s_n <- vector("numeric", m)
# seq_len(m) yields no iterations when m is 0, unlike 1:m
for (n in seq_len(m)) {
  s_n[n] <- compute_s_n(n)
}
s_n## [1] 1 3 6 10 15 21 28 36 45 55 66 78 91 105 120 136 153
## [18] 171 190 210 231 253 276 300 325
n <- 1:m
plot(n, s_n)
Instead of loops, we use:
- apply
- sapply
- tapply
# EX 2: Conditionals
# Assign the state abbreviation when the state name is longer than 8 characters
new_names <- ifelse(nchar(murders$state)>8, murders$abb, murders$state)
new_names## [1] "Alabama" "Alaska" "Arizona" "Arkansas" "CA" "Colorado"
## [7] "CT" "Delaware" "DC" "Florida" "Georgia" "Hawaii"
## [13] "Idaho" "Illinois" "Indiana" "Iowa" "Kansas" "Kentucky"
## [19] "LA" "Maine" "Maryland" "MA" "Michigan" "MN"
## [25] "MS" "Missouri" "Montana" "Nebraska" "Nevada" "NH"
## [31] "NJ" "NM" "New York" "NC" "ND" "Ohio"
## [37] "Oklahoma" "Oregon" "PA" "RI" "SC" "SD"
## [43] "TN" "Texas" "Utah" "Vermont" "Virginia" "WA"
## [49] "WV" "WI" "Wyoming"
# EX 4: Defining functions
# Create function called `sum_n`
# Sum of the integers from 1 to n.
#   n: a single non-negative whole number.
# Returns 1 + 2 + ... + n, and 0 when n is 0.
sum_n <- function(n) {
  # seq_len() gives an empty sequence for n = 0; 1:n would give c(1, 0)
  x <- seq_len(n)
  sum(x)
}
# Determine the sum of integers from 1 to 5000
sum_n(5000)## [1] 12502500
# EX 5: Defining functions continued...
# Create `altman_plot`
# altman_plot: scatterplot of the sum of two vectors against their difference.
#   x, y: vectors that can be added and subtracted elementwise
#         (presumably numeric and of equal length -- not checked here).
# Plots x + y on the horizontal axis and y - x on the vertical axis.
altman_plot <- function(x, y){
plot(x + y, y - x)
}
x <- c(1,2,3,4,5)
y <- c(2,4,6,8,10)
altman_plot(x,y)
# Run this code
x <- 3
# my_func: returns its argument y plus 5.
my_func <- function(y){
# `<-` inside a function assigns in the function's own environment, so this
# `x` is local (and unused here); the `x` defined outside is left untouched.
x <- 5
y+5
}
# Print value of x
x## [1] 3
# EX 7: For loops
# Here is a function that adds numbers from 1 to n
# example_func: adds the integers from 1 to n.
#   n: a single non-negative whole number.
# Returns 1 + 2 + ... + n, and 0 when n is 0.
example_func <- function(n) {
  # seq_len(n) is empty for n = 0, whereas 1:n would be c(1, 0)
  x <- seq_len(n)
  sum(x)
}
# Here is the sum of the first 100 numbers
example_func(100)## [1] 5050
# Write the function with argument n, with the above mentioned specifications and store it in `compute_s_n`
# Sum of the squares of the integers 1, 2, ..., n.
#   n: a single non-negative whole number.
# Returns 1^2 + 2^2 + ... + n^2, and 0 when n is 0.
compute_s_n <- function(n) {
  # seq_len(0) is empty; 1:0 would be c(1, 0) and give a sum of 1
  x <- seq_len(n)
  sum(x^2)
}
# Report the value of the sum when n=10
compute_s_n(10)## [1] 385
# EX 8: For loops continued...
# Define a function and store it in `compute_s_n`
# Sum of squares 1^2 + 2^2 + ... + n^2 for a non-negative whole number n.
# Returns 0 when n is 0.
compute_s_n <- function(n) {
  # Use seq_len() rather than 1:n so that n = 0 produces an empty sequence
  x <- seq_len(n)
  sum(x^2)
}
# Create a vector for storing results
s_n <- numeric(25)
# Fill position i with the sum of squares of 1..i
for (i in seq_along(s_n)) {
  s_n[i] <- compute_s_n(i)
}
# EX 9: Checking our math
# Define the function
# Sum of squares 1^2 + 2^2 + ... + n^2 for a non-negative whole number n.
# Returns 0 when n is 0.
compute_s_n <- function(n) {
  # 1:n would count down to c(1, 0) for n = 0; seq_len(n) is empty instead
  x <- seq_len(n)
  sum(x^2)
}
# Define the vector of n
n <- 1:25
# Define the vector to store data
s_n <- vector("numeric", 25)
for(i in n){
s_n[i] <- compute_s_n(i)
}
# Create the plot
plot(n, s_n)
# EX 10: Checking our math continued
# Define the function
# Sum of squares 1^2 + 2^2 + ... + n^2 for a non-negative whole number n.
# Returns 0 when n is 0, which agrees with the closed form n(n+1)(2n+1)/6.
compute_s_n <- function(n) {
  # seq_len(n) is empty for n = 0, so the sum is 0 rather than 1^2 + 0^2
  x <- seq_len(n)
  sum(x^2)
}
# Define the vector of n
n <- 1:25
# Define the vector to store data
s_n <- vector("numeric", 25)
for(i in n){
s_n[i] <- compute_s_n(i)
}
# Check that s_n is identical to the formula given in the instructions.
identical(s_n, n*(n+1)*(2*n+1)/6)## [1] TRUE