I. Introduction

II. Data preparation: collection and loading

Next, you will need to install several packages which are required for collecting near real-time tweets and load them into the workspace.

Module 1: R Basics, Functions, and Data Types

1.1 Motivation

1.2 R Basics

log(1)

## [1] 0

exp(1)

## [1] 2.718282

log(exp(1))

## [1] 1

# EX 2: Variable names

# Load package and data

library(dslabs)
data(murders)

# Use the function names to extract the variable names 
names(murders)

## [1] "state"      "abb"        "region"     "population" "total"

# EX 3: Examining Variables

# To access the population variable from the murders dataset use this code:
p <- murders$population 

# To determine the class of object `p` we use this code:
class(p)

## [1] "numeric"

# Use the accessor to extract state abbreviations and assign it to a
a <- murders$abb

# Determine the class of a
class(a)

## [1] "character"

# EX 4: Multiple ways to access variables

# We extract the population like this:
p <- murders$population

# This is how we do the same with the square brackets:
o <- murders[["population"]] 

# We can confirm these two are the same
identical(o, p)

## [1] TRUE

# Use square brackets to extract `abb` from `murders` and assign it to b
b <- murders[["abb"]]
# Check if `a` and `b` are identical 
identical(a,b)

## [1] TRUE

1.3 Data Types

class(2)

## [1] "numeric"

class("programming")

## [1] "character"

class(ls)

## [1] "function"

class(murders)

## [1] "data.frame"

class(murders$state)

## [1] "character"

class(murders$region)

## [1] "factor"

# structure
str(murders)

## 'data.frame':    51 obs. of  5 variables:
##  $ state     : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ abb       : chr  "AL" "AK" "AZ" "AR" ...
##  $ region    : Factor w/ 4 levels "Northeast","South",..: 2 4 4 2 4 4 1 2 2 2 ...
##  $ population: num  4779736 710231 6392017 2915918 37253956 ...
##  $ total     : num  135 19 232 93 1257 ...

names(murders)

## [1] "state"      "abb"        "region"     "population" "total"

head(murders)

##        state abb region population total
## 1    Alabama  AL  South    4779736   135
## 2     Alaska  AK   West     710231    19
## 3    Arizona  AZ   West    6392017   232
## 4   Arkansas  AR  South    2915918    93
## 5 California  CA   West   37253956  1257
## 6   Colorado  CO   West    5029196    65

# EX 2: Variable names

# Load package and data

library(dslabs)
data(murders)

# Use the function names to extract the variable names 
names(murders)

## [1] "state"      "abb"        "region"     "population" "total"

# EX 5: Factors

# We can see the class of the region variable using class
class(murders$region)

## [1] "factor"

# Determine the number of regions included in this variable 
length(levels(murders$region))

## [1] 4

# EX 6: Tables

# Here is an example of what the table function does
x <- c("a", "a", "b", "b", "b", "c")
table(x)

## x
## a b c 
## 2 3 1

# Write one line of code to show the number of states per region
table(murders$region)

## 
##     Northeast         South North Central          West 
##             9            17            12            13

Module 2: Vectors, Sorting

2.1 Vectors

c stands for concatenate

codes <- c(italy=380, canada=124, egypt=818)
codes

##  italy canada  egypt 
##    380    124    818

Use square brackets `[ ]` to access an element of a vector

codes[2]

## canada 
##    124

codes[1:2]

##  italy canada 
##    380    124

codes["canada"]

## canada 
##    124

# codes["egypt","canada"]  # error: a vector has only one dimension; use codes[c("egypt", "canada")] instead

x <- 1:5
x

## [1] 1 2 3 4 5

y <- as.character(x)
y

## [1] "1" "2" "3" "4" "5"

z <- as.numeric(y)
z

## [1] 1 2 3 4 5

# Coercion can fail: "b" has no numeric interpretation, so as.numeric()
# returns NA for that element and emits a warning instead of stopping.
x <- c("1", "b", "3")
x

## [1] "1" "b" "3"

y <- as.numeric(x)

## Warning: NAs introduced by coercion

# Print y so that the coerced values shown below are produced by the code
y

## [1]  1 NA  3

# EX 1: Numeric Vectors

# Here is an example creating a numeric vector named cost
cost <- c(50, 75, 90, 100, 150)

# Create a numeric vector to store the temperatures listed in the instructions into a vector named temp
# Make sure to follow the same order in the instructions
# NOTE: this first attempt is NOT numeric. Mixing city names and numbers in
# c() coerces every element to character, which is why the output below
# shows the temperatures in quotes ("35", "88", ...).
temp <- c("Beijing", 35, "Lagos", 88, "Paris", 42, "Rio de Janeiro", 84, "San Juan", 81, "Toronto", 30)
temp

##  [1] "Beijing"        "35"             "Lagos"          "88"            
##  [5] "Paris"          "42"             "Rio de Janeiro" "84"            
##  [9] "San Juan"       "81"             "Toronto"        "30"

# Corrected version: only the temperatures, in the same order as above,
# so temp is a numeric vector
temp <- c(35, 88, 42, 84, 81, 30)

# EX 2: Character vectors

# here is an example of how to create a character vector
food <- c("pizza", "burgers", "salads", "cheese", "pasta")

# Create a character vector called city to store the city names
# Make sure to follow the same order as in the instructions
city <- c("Beijing", "Lagos", "Paris",  "Rio de Janeiro", "San Juan","Toronto")
city

## [1] "Beijing"        "Lagos"          "Paris"          "Rio de Janeiro"
## [5] "San Juan"       "Toronto"

# EX 3: Connecting Numeric and Character Vectors

# Associate the cost values with its corresponding food item
cost <- c(50, 75, 90, 100, 150)
food <- c("pizza", "burgers", "salads", "cheese", "pasta")
names(cost) <- food

# You already wrote this code
temp <- c(35, 88, 42, 84, 81, 30)
city <- c("Beijing", "Lagos", "Paris", "Rio de Janeiro", "San Juan", "Toronto")

# Associate the temperature values with its corresponding city
names(temp) <- city
temp

##        Beijing          Lagos          Paris Rio de Janeiro       San Juan 
##             35             88             42             84             81 
##        Toronto 
##             30

# EX 4: Subsetting vectors

# cost of the last 3 items in our food list:
cost[3:5]

## salads cheese  pasta 
##     90    100    150

# temperatures of the first three cities in the list:
# R vectors are 1-indexed, so the first three elements are 1:3. An index of 0
# selects nothing and is silently dropped, so 0:3 only works by accident.
temp[1:3]

## Beijing   Lagos   Paris 
##      35      88      42

# The same selection with the positions spelled out explicitly
temp[c(1, 2, 3)]

## Beijing   Lagos   Paris 
##      35      88      42

# EX 5: Subsetting vectors continued...

# Access the cost of pizza and pasta from our food list 
cost[c(1,5)]

## pizza pasta 
##    50   150

# Define temp
temp <- c(35, 88, 42, 84, 81, 30)
city <- c("Beijing", "Lagos", "Paris", "Rio de Janeiro", "San Juan", "Toronto")
names(temp) <- city

# Access the temperatures of Paris and San Juan
temp[c(3,5)]

##    Paris San Juan 
##       42       81

# EX 6: Sequences

# Integer sequence running from 32 up to and including 99.
m <- seq(32, 99)

# Number of elements in m.
length(m)

## [1] 68

# Integer sequence running from 12 up to and including 73.
x <- seq(12, 73)

# Number of elements in x.
length(x)

## [1] 62

# EX 7: Sequences continued...

# Create a vector with the multiples of 7, smaller than 50.
seq(7, 49, 7)

## [1]  7 14 21 28 35 42 49

# Create a vector containing all the positive odd numbers smaller than 100.
# The numbers should be in ascending order
seq(1, 99, 2)

##  [1]  1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31 33 35 37 39 41 43 45
## [24] 47 49 51 53 55 57 59 61 63 65 67 69 71 73 75 77 79 81 83 85 87 89 91
## [47] 93 95 97 99

# EX 8: Sequences and length

# We can create a vector with the multiples of 7, smaller than 50, like this
seq(7, 49, 7)

## [1]  7 14 21 28 35 42 49

# But note that the second argument does not need to be last number.
# It simply determines the maximum value permitted.
# so the following line of code produces the same vector as seq(7, 49, 7)
seq(7, 50, 7)

## [1]  7 14 21 28 35 42 49

# Create a sequence of numbers from 6 to 55, with 4/7 increments and determine its length
length(seq(6, 55, 4/7))

## [1] 86

# EX 9: Sequences of certain length

# Store the sequence in the object a
a <- seq(1, 10, length.out = 100)

# Determine the class of a
class(a)

## [1] "numeric"

# EX 10: Integers

# Store the sequence in the object a
a <- seq(1, 10)

# Determine the class of a
class(a)

## [1] "integer"

# EX 11: Integers and Numerics

# Assign 1 to the object a and check its class: a bare numeric literal is
# stored as a double, so the class is "numeric"
a <- 1
class(a)

## [1] "numeric"

# The L suffix makes the literal an integer; confirm the class of 1L
class(1L)

## [1] "integer"

# EX 12: Coercion

# Define the vector x
x <- c(1, 3, 5,"a")

# Note that the x is character vector
class(x)

## [1] "character"

# Typecast the vector to get an integer vector
# You will get a warning but that is ok
x <- as.integer(x)

## Warning: NAs introduced by coercion

2.2 Sorting

# how many murders
sort(murders$total)

##  [1]    2    4    5    5    7    8   11   12   12   16   19   21   22   27
## [15]   32   36   38   53   63   65   67   84   93   93   97   97   99  111
## [29]  116  118  120  135  142  207  219  232  246  250  286  293  310  321
## [43]  351  364  376  413  457  517  669  805 1257

x <- c(31,4,15,92,65)

sort(x)

## [1]  4 15 31 65 92

index <- order(x)
index

## [1] 2 3 1 5 4

# first we order the total murders and save it to index
index <- order(murders$total)

# then we use index to look up state ordered by murdercount from low to high
murders$abb[index]

##  [1] "VT" "ND" "NH" "WY" "HI" "SD" "ME" "ID" "MT" "RI" "AK" "IA" "UT" "WV"
## [15] "NE" "OR" "DE" "MN" "KS" "CO" "NM" "NV" "AR" "WA" "CT" "WI" "DC" "OK"
## [29] "KY" "MA" "MS" "AL" "IN" "SC" "TN" "AZ" "NJ" "VA" "NC" "MD" "OH" "MO"
## [43] "LA" "IL" "GA" "MI" "PA" "NY" "FL" "TX" "CA"

# which is the max murder number
max(murders$total)

## [1] 1257

# use which.max to find the index of the maximum, then look up the state
i_max <- which.max(murders$total)
i_max

## [1] 5

murders$state[i_max]

## [1] "California"

# which is the min murder number
min(murders$total)

## [1] 2

# use which.min to find the index of the minimum, then look up the state
i_min <- which.min(murders$total)
i_min

## [1] 46

murders$state[i_min]

## [1] "Vermont"

# ranking
rank(x)

## [1] 3 1 2 5 4

# EX 1: sort

# Access the `state` variable and store it in an object 
states <- murders$state 

# Sort the object alphabetically and redefine the object 
states <- sort(states) 

# Report the first alphabetical value  
states[1]

## [1] "Alabama"

# Access population values from the dataset and store it in pop
pop <- murders$population
# Sort the object and save it in the same object 
pop <- sort(pop)
# Report the smallest population size 
pop[1]

## [1] 563626

# EX 2: order

# Access population from the dataset and store it in pop
pop <- murders$population

# Use the command order, to order pop and store in object o
o <- order(pop)
# Find the index number of the entry with the smallest population size
o[1]

## [1] 51

# EX 3: New Codes

# Find the smallest value for variable total 
which.min(murders$total)

## [1] 46

# Find the smallest value for population
which.min(murders$population)

## [1] 51

# EX 4:Using the output of order

# Define the variable i to be the index of the smallest state
i <- which.min(murders$population)

# Define variable states to hold the states
states <- murders$state

# Use the index you just defined to find the state with the smallest population
states[i]

## [1] "Wyoming"

# EX 5: Ranks

# Store temperatures in an object 
temp <- c(35, 88, 42, 84, 81, 30)

# Store city names in an object 
city <- c("Beijing", "Lagos", "Paris", "Rio de Janeiro", "San Juan", "Toronto")

# Create data frame with city names and temperature 
city_temps <- data.frame(name = city, temperature = temp)

# Define a variable states to be the state names 
states <- murders$state

# Define a variable ranks to determine the population size ranks 
ranks <- rank(murders$population)

# Create a data frame my_df with the state name and its rank
my_df <- data.frame(states = states, ranks = ranks)
my_df

##                  states ranks
## 1               Alabama    29
## 2                Alaska     5
## 3               Arizona    36
## 4              Arkansas    20
## 5            California    51
## 6              Colorado    30
## 7           Connecticut    23
## 8              Delaware     7
## 9  District of Columbia     2
## 10              Florida    49
## 11              Georgia    44
## 12               Hawaii    12
## 13                Idaho    13
## 14             Illinois    47
## 15              Indiana    37
## 16                 Iowa    22
## 17               Kansas    19
## 18             Kentucky    26
## 19            Louisiana    27
## 20                Maine    11
## 21             Maryland    33
## 22        Massachusetts    38
## 23             Michigan    43
## 24            Minnesota    31
## 25          Mississippi    21
## 26             Missouri    34
## 27              Montana     8
## 28             Nebraska    14
## 29               Nevada    17
## 30        New Hampshire    10
## 31           New Jersey    41
## 32           New Mexico    16
## 33             New York    48
## 34       North Carolina    42
## 35         North Dakota     4
## 36                 Ohio    45
## 37             Oklahoma    24
## 38               Oregon    25
## 39         Pennsylvania    46
## 40         Rhode Island     9
## 41       South Carolina    28
## 42         South Dakota     6
## 43            Tennessee    35
## 44                Texas    50
## 45                 Utah    18
## 46              Vermont     3
## 47             Virginia    40
## 48           Washington    39
## 49        West Virginia    15
## 50            Wisconsin    32
## 51              Wyoming     1

# EX 6: Data Frames, Ranks and Orders

# Define a variable states to be the state names from the murders data frame
states <- murders$state

# Define a variable ranks to determine the population size ranks 
ranks <- rank(murders$population)

# Define a variable ind to store the indexes needed to order the population values
ind <- order(murders$population)

# Create a data frame my_df with the state name and its rank and ordered from least populous to most 
my_df <- data.frame(states = states[ind], ranks = ranks[ind])
my_df

##                  states ranks
## 1               Wyoming     1
## 2  District of Columbia     2
## 3               Vermont     3
## 4          North Dakota     4
## 5                Alaska     5
## 6          South Dakota     6
## 7              Delaware     7
## 8               Montana     8
## 9          Rhode Island     9
## 10        New Hampshire    10
## 11                Maine    11
## 12               Hawaii    12
## 13                Idaho    13
## 14             Nebraska    14
## 15        West Virginia    15
## 16           New Mexico    16
## 17               Nevada    17
## 18                 Utah    18
## 19               Kansas    19
## 20             Arkansas    20
## 21          Mississippi    21
## 22                 Iowa    22
## 23          Connecticut    23
## 24             Oklahoma    24
## 25               Oregon    25
## 26             Kentucky    26
## 27            Louisiana    27
## 28       South Carolina    28
## 29              Alabama    29
## 30             Colorado    30
## 31            Minnesota    31
## 32            Wisconsin    32
## 33             Maryland    33
## 34             Missouri    34
## 35            Tennessee    35
## 36              Arizona    36
## 37              Indiana    37
## 38        Massachusetts    38
## 39           Washington    39
## 40             Virginia    40
## 41           New Jersey    41
## 42       North Carolina    42
## 43             Michigan    43
## 44              Georgia    44
## 45                 Ohio    45
## 46         Pennsylvania    46
## 47             Illinois    47
## 48             New York    48
## 49              Florida    49
## 50                Texas    50
## 51           California    51

# EX 7: NA

# Using new dataset 
library(dslabs)
data(na_example)

# Checking the structure 
str(na_example)

##  int [1:1000] 2 1 3 2 1 3 1 4 3 2 ...

# Find out the mean of the entire dataset 
mean(na_example)

## [1] NA

# Use is.na to create a logical index ind that tells which entries are NA
ind <- is.na(na_example)

# Determine how many NA ind has using the sum function
sum(ind)

## [1] 145

# EX 8: Removing NAs

# Note what we can do with the ! operator
x <- c(1, 2, 3)
ind <- c(FALSE, TRUE, FALSE)
x[!ind]

## [1] 1 3

# Create the ind vector
library(dslabs)
data(na_example)
ind <- is.na(na_example)

# We saw that this gives an NA
mean(na_example)

## [1] NA

# Compute the average, for entries of na_example that are not NA 
mean(na_example[!ind])

## [1] 2.301754

2.3 Vector Arithmetic

# which state is the biggest:
murders$state[which.max(murders$population)]

## [1] "California"

# How many people:
max(murders$population)

## [1] 37253956

Example of elementwise operations on vectors

# heights in inches (2.54 is the number of centimeters per inch), so the
# elementwise product below gives every height in centimeters
heights <- c(69,62,66,70,70,73,67,73,67,70)
heights * 2.54

##  [1] 175.26 157.48 167.64 177.80 177.80 185.42 170.18 185.42 170.18 177.80

murder_rate <- murders$total/murders$population*100000
murder_rate

##  [1]  2.8244238  2.6751860  3.6295273  3.1893901  3.3741383  1.2924531
##  [7]  2.7139722  4.2319369 16.4527532  3.3980688  3.7903226  0.5145920
## [13]  0.7655102  2.8369608  2.1900730  0.6893484  2.2081106  2.6732010
## [19]  7.7425810  0.8280881  5.0748655  1.8021791  4.1786225  0.9992600
## [25]  4.0440846  5.3598917  1.2128379  1.7521372  3.1104763  0.3798036
## [31]  2.7980319  3.2537239  2.6679599  2.9993237  0.5947151  2.6871225
## [37]  2.9589340  0.9396843  3.5977513  1.5200933  4.4753235  0.9825837
## [43]  3.4509357  3.2013603  0.7959810  0.3196211  3.1246001  1.3829942
## [49]  1.4571013  1.7056487  0.8871131

murders$state[order(murder_rate,decreasing=TRUE)]

##  [1] "District of Columbia" "Louisiana"            "Missouri"            
##  [4] "Maryland"             "South Carolina"       "Delaware"            
##  [7] "Michigan"             "Mississippi"          "Georgia"             
## [10] "Arizona"              "Pennsylvania"         "Tennessee"           
## [13] "Florida"              "California"           "New Mexico"          
## [16] "Texas"                "Arkansas"             "Virginia"            
## [19] "Nevada"               "North Carolina"       "Oklahoma"            
## [22] "Illinois"             "Alabama"              "New Jersey"          
## [25] "Connecticut"          "Ohio"                 "Alaska"              
## [28] "Kentucky"             "New York"             "Kansas"              
## [31] "Indiana"              "Massachusetts"        "Nebraska"            
## [34] "Wisconsin"            "Rhode Island"         "West Virginia"       
## [37] "Washington"           "Colorado"             "Montana"             
## [40] "Minnesota"            "South Dakota"         "Oregon"              
## [43] "Wyoming"              "Maine"                "Utah"                
## [46] "Idaho"                "Iowa"                 "North Dakota"        
## [49] "Hawaii"               "New Hampshire"        "Vermont"

# EX 1: Vectorized operations

# Assign city names to `city` 
city <- c("Beijing", "Lagos", "Paris", "Rio de Janeiro", "San Juan", "Toronto")

# Store temperature values in `temp`
temp <- c(35, 88, 42, 84, 81, 30)

# Convert temperature into Celsius and overwrite the original values of 'temp' with these Celsius values

temp <- (temp-32) * 5/9

# Create a data frame `city_temps` 
city_temps <- data.frame(name = city, temperature = temp)
city_temps

##             name temperature
## 1        Beijing    1.666667
## 2          Lagos   31.111111
## 3          Paris    5.555556
## 4 Rio de Janeiro   28.888889
## 5       San Juan   27.222222
## 6        Toronto   -1.111111

# EX 2: Vectorized operations continued...

# x holds the integers from 1 to 100
x <- 1:100

# Add up the reciprocals of the squares: 1/1^2 + 1/2^2 + ... + 1/100^2
sum(1 / x^2)

## [1] 1.634984

# EX 3:Vectorized operation continued...

# Load the data
library(dslabs)
data(murders)

# Store the per 100,000 murder rate for each state in murder_rate
murder_rate <- murders$total / murders$population * 100000 
# Calculate the average murder rate in the US 
sum(murder_rate) / length(murder_rate)

## [1] 2.779125

mean(murder_rate)

## [1] 2.779125

Module 3: Indexing, Data Wrangling, Plots

3.1 Indexing

# Murder rate per 100,000 inhabitants for every state
murder_rate <- murders$total / murders$population * 100000

# The murder rate in Italy is 0.71 per 100,000; find the US states with a
# similar or lower rate. `<=` is used so that a rate equal to 0.71 counts as
# "similar", consistent with the filter(murders, rate <= 0.71) calls used in
# the data wrangling section.
index <- murder_rate <= 0.71
index

##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12]  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [34] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [45] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE

# which states: a logical index keeps the entries where index is TRUE
murders$state[index]

## [1] "Hawaii"        "Iowa"          "New Hampshire" "North Dakota" 
## [5] "Vermont"

# how many states: TRUE counts as 1 and FALSE as 0 when summed
sum(index)

## [1] 5

# we want to find states that have mountains (region West) and are safe (murder_rate <= 1)
west <- murders$region == "West"
safe <- murder_rate <= 1
index <- safe & west

murders$state[index]

## [1] "Hawaii"  "Idaho"   "Oregon"  "Utah"    "Wyoming"

which

x <- c(FALSE, TRUE, FALSE, TRUE, TRUE, FALSE)
which(x)

## [1] 2 4 5

# Example: we want to look up the murder rate in Massachusetts
index <- which(murders$state =="Massachusetts")
index

## [1] 22

# so to get the murder rate, we use the index
murder_rate[index]

## [1] 1.802179

# Now we want to match several states
index <- match(c("New York", "Florida", "Texas"), murders$state)
index

## [1] 33 10 44

# To confirm we got it right
murder_state <- murders$state
murder_state[index]

## [1] "New York" "Florida"  "Texas"

# and the murder rate of these states
murder_rate[index]

## [1] 2.667960 3.398069 3.201360

x <- c("a", "b", "c", "d", "e")
y <- c("a", "d", "f")

# so we can ask if y is in x
y %in% x

## [1]  TRUE  TRUE FALSE

# check if three states are actually states
c("Boston", "Dakota", "Washington") %in% murders$state

## [1] FALSE FALSE  TRUE

# EX 1: Logical Vectors

# Store the murder rate per 100,000 for each state, in `murder_rate`
murder_rate <- murders$total / murders$population * 100000
# 
# Store the `murder_rate < 1` in `low` 
low <- murder_rate < 1

# EX 2: which

# Store the murder rate per 100,000 for each state, in murder_rate
murder_rate <- murders$total/murders$population*100000

# Store the murder_rate < 1 in low 
low <- murder_rate < 1

# Get the indices of entries that are below 1
which(low)

##  [1] 12 13 16 20 24 30 35 38 42 45 46 51

# EX 3: Ordering vectors

# Store the murder rate per 100,000 for each state, in murder_rate
murder_rate <- murders$total/murders$population*100000

# Store the murder_rate < 1 in low 
low <- murder_rate < 1

# Names of states with murder rates lower than 1
murders$state[low]

##  [1] "Hawaii"        "Idaho"         "Iowa"          "Maine"        
##  [5] "Minnesota"     "New Hampshire" "North Dakota"  "Oregon"       
##  [9] "South Dakota"  "Utah"          "Vermont"       "Wyoming"

# EX 4: Filtering

# Store the murder rate per 100,000 for each state, in `murder_rate`
murder_rate <- murders$total/murders$population*100000

# Store the `murder_rate < 1` in `low` 
low <- murder_rate < 1

# Create a vector ind for states in the Northeast and with murder rates lower than 1. 
ind <- (murders$region == "Northeast") & (murder_rate < 1)

# Names of states in `ind` 
murders$state[ind]

## [1] "Maine"         "New Hampshire" "Vermont"

# EX 5: Filtering continued

# Store the murder rate per 100,000 for each state, in murder_rate
murder_rate <- murders$total/murders$population*100000


# Compute average murder rate and store in avg using `mean` 
avg <- mean(murder_rate)

# How many states have murder rates below avg ? Check using sum 
sum(murder_rate < avg)

## [1] 27

# EX 6: Match

# Store the 3 abbreviations in abbs in a vector (remember that they are character vectors and need quotes)
abbs <- c("AK", "MI", "IA")

# Match the abbs to the murders$abb and store in `ind`
ind <- match(abbs , murders$abb)

# Print state names from `ind`
murders$state[ind]

## [1] "Alaska"   "Michigan" "Iowa"

# EX 7: %in%

# Store the 5 abbreviations in `abbs`. (remember that they are character vectors)
abbs <- c("MA", "ME", "MI", "MO", "MU")

# Use the %in% command to check if the entries of abbs are abbreviations in the murders data frame
abbs %in% murders$abb

## [1]  TRUE  TRUE  TRUE  TRUE FALSE

# EX 8: Logical operator

# Store the 5 abbreviations in abbs. (remember that they are character vectors)
abbs <- c("MA", "ME", "MI", "MO", "MU") 

# Use the `which` command and `!` operator to find out which abbreviation are not actually part of the dataset and store in ind

ind <- which(!abbs %in% murders$abb)

# What are the entries of abbs that are not actual abbreviations
abbs[ind]

## [1] "MU"

3.2 Basic Data Wrangling

library(dplyr)

We want to add the murder rate to the table.

murders <- mutate(murders, rate=total/population * 100000)
murders

##                   state abb        region population total       rate
## 1               Alabama  AL         South    4779736   135  2.8244238
## 2                Alaska  AK          West     710231    19  2.6751860
## 3               Arizona  AZ          West    6392017   232  3.6295273
## 4              Arkansas  AR         South    2915918    93  3.1893901
## 5            California  CA          West   37253956  1257  3.3741383
## 6              Colorado  CO          West    5029196    65  1.2924531
## 7           Connecticut  CT     Northeast    3574097    97  2.7139722
## 8              Delaware  DE         South     897934    38  4.2319369
## 9  District of Columbia  DC         South     601723    99 16.4527532
## 10              Florida  FL         South   19687653   669  3.3980688
## 11              Georgia  GA         South    9920000   376  3.7903226
## 12               Hawaii  HI          West    1360301     7  0.5145920
## 13                Idaho  ID          West    1567582    12  0.7655102
## 14             Illinois  IL North Central   12830632   364  2.8369608
## 15              Indiana  IN North Central    6483802   142  2.1900730
## 16                 Iowa  IA North Central    3046355    21  0.6893484
## 17               Kansas  KS North Central    2853118    63  2.2081106
## 18             Kentucky  KY         South    4339367   116  2.6732010
## 19            Louisiana  LA         South    4533372   351  7.7425810
## 20                Maine  ME     Northeast    1328361    11  0.8280881
## 21             Maryland  MD         South    5773552   293  5.0748655
## 22        Massachusetts  MA     Northeast    6547629   118  1.8021791
## 23             Michigan  MI North Central    9883640   413  4.1786225
## 24            Minnesota  MN North Central    5303925    53  0.9992600
## 25          Mississippi  MS         South    2967297   120  4.0440846
## 26             Missouri  MO North Central    5988927   321  5.3598917
## 27              Montana  MT          West     989415    12  1.2128379
## 28             Nebraska  NE North Central    1826341    32  1.7521372
## 29               Nevada  NV          West    2700551    84  3.1104763
## 30        New Hampshire  NH     Northeast    1316470     5  0.3798036
## 31           New Jersey  NJ     Northeast    8791894   246  2.7980319
## 32           New Mexico  NM          West    2059179    67  3.2537239
## 33             New York  NY     Northeast   19378102   517  2.6679599
## 34       North Carolina  NC         South    9535483   286  2.9993237
## 35         North Dakota  ND North Central     672591     4  0.5947151
## 36                 Ohio  OH North Central   11536504   310  2.6871225
## 37             Oklahoma  OK         South    3751351   111  2.9589340
## 38               Oregon  OR          West    3831074    36  0.9396843
## 39         Pennsylvania  PA     Northeast   12702379   457  3.5977513
## 40         Rhode Island  RI     Northeast    1052567    16  1.5200933
## 41       South Carolina  SC         South    4625364   207  4.4753235
## 42         South Dakota  SD North Central     814180     8  0.9825837
## 43            Tennessee  TN         South    6346105   219  3.4509357
## 44                Texas  TX         South   25145561   805  3.2013603
## 45                 Utah  UT          West    2763885    22  0.7959810
## 46              Vermont  VT     Northeast     625741     2  0.3196211
## 47             Virginia  VA         South    8001024   250  3.1246001
## 48           Washington  WA          West    6724540    93  1.3829942
## 49        West Virginia  WV         South    1852994    27  1.4571013
## 50            Wisconsin  WI North Central    5686986    97  1.7056487
## 51              Wyoming  WY          West     563626     5  0.8871131

filter(murders, rate <= 0.71)

##           state abb        region population total      rate
## 1        Hawaii  HI          West    1360301     7 0.5145920
## 2          Iowa  IA North Central    3046355    21 0.6893484
## 3 New Hampshire  NH     Northeast    1316470     5 0.3798036
## 4  North Dakota  ND North Central     672591     4 0.5947151
## 5       Vermont  VT     Northeast     625741     2 0.3196211

select specific columns

new_table <- select(murders,state,region,rate)
new_table

##                   state        region       rate
## 1               Alabama         South  2.8244238
## 2                Alaska          West  2.6751860
## 3               Arizona          West  3.6295273
## 4              Arkansas         South  3.1893901
## 5            California          West  3.3741383
## 6              Colorado          West  1.2924531
## 7           Connecticut     Northeast  2.7139722
## 8              Delaware         South  4.2319369
## 9  District of Columbia         South 16.4527532
## 10              Florida         South  3.3980688
## 11              Georgia         South  3.7903226
## 12               Hawaii          West  0.5145920
## 13                Idaho          West  0.7655102
## 14             Illinois North Central  2.8369608
## 15              Indiana North Central  2.1900730
## 16                 Iowa North Central  0.6893484
## 17               Kansas North Central  2.2081106
## 18             Kentucky         South  2.6732010
## 19            Louisiana         South  7.7425810
## 20                Maine     Northeast  0.8280881
## 21             Maryland         South  5.0748655
## 22        Massachusetts     Northeast  1.8021791
## 23             Michigan North Central  4.1786225
## 24            Minnesota North Central  0.9992600
## 25          Mississippi         South  4.0440846
## 26             Missouri North Central  5.3598917
## 27              Montana          West  1.2128379
## 28             Nebraska North Central  1.7521372
## 29               Nevada          West  3.1104763
## 30        New Hampshire     Northeast  0.3798036
## 31           New Jersey     Northeast  2.7980319
## 32           New Mexico          West  3.2537239
## 33             New York     Northeast  2.6679599
## 34       North Carolina         South  2.9993237
## 35         North Dakota North Central  0.5947151
## 36                 Ohio North Central  2.6871225
## 37             Oklahoma         South  2.9589340
## 38               Oregon          West  0.9396843
## 39         Pennsylvania     Northeast  3.5977513
## 40         Rhode Island     Northeast  1.5200933
## 41       South Carolina         South  4.4753235
## 42         South Dakota North Central  0.9825837
## 43            Tennessee         South  3.4509357
## 44                Texas         South  3.2013603
## 45                 Utah          West  0.7959810
## 46              Vermont     Northeast  0.3196211
## 47             Virginia         South  3.1246001
## 48           Washington          West  1.3829942
## 49        West Virginia         South  1.4571013
## 50            Wisconsin North Central  1.7056487
## 51              Wyoming          West  0.8871131

filter(new_table, rate <= 0.71)

##           state        region      rate
## 1        Hawaii          West 0.5145920
## 2          Iowa North Central 0.6893484
## 3 New Hampshire     Northeast 0.3798036
## 4  North Dakota North Central 0.5947151
## 5       Vermont     Northeast 0.3196211

Using the pipe to put it all together:

murders %>% select(state,region,rate) %>% filter(rate <= 0.71)

##           state        region      rate
## 1        Hawaii          West 0.5145920
## 2          Iowa North Central 0.6893484
## 3 New Hampshire     Northeast 0.3798036
## 4  North Dakota North Central 0.5947151
## 5       Vermont     Northeast 0.3196211

Creating data frames:

# Build a small grade book to inspect how data.frame() stores text columns.
# stringsAsFactors = TRUE is spelled out because the default changed to FALSE
# in R 4.0.0; without it, `names` would be a character vector and the
# class() check that follows would no longer report "factor".
grades <- data.frame(
  names = c("John", "juan", "Jean", "Yao"),
  exam_1 = c(95, 80, 90, 85),
  exam_2 = c(90, 85, 85, 90),
  stringsAsFactors = TRUE
)
grades

##   names exam_1 exam_2
## 1  John     95     90
## 2  juan     80     85
## 3  Jean     90     85
## 4   Yao     85     90

class(grades$names)

## [1] "factor"

# Same grade book, but keep the text column as plain character strings
# instead of converting it to a factor.
grades <- data.frame(
  names = c("John", "juan", "Jean", "Yao"),
  exam_1 = c(95, 80, 90, 85),
  exam_2 = c(90, 85, 85, 90),
  stringsAsFactors = FALSE
)

class(grades$names)

## [1] "character"

# EX 1: dplyr

# Loading data
library(dslabs)
data(murders)

# Loading dplyr
library(dplyr)

# Add a `rate` column to murders holding the murder rate per 100,000 people
murders <- murders %>%
  mutate(rate = total / population * 100000)

# EX 2: mutate
  
# Note that if you want ranks from highest to lowest you can take the negative and then compute the ranks 
x <- c(88, 100, 83, 92, 94)
rank(-x)

## [1] 4 1 5 3 2

# Defining rate
# Per-100,000 murder rate computed as a standalone vector in the workspace.
rate <-  murders$total/ murders$population * 100000

# Redefine murders to include a column named rank
# with the ranks of rate from highest to lowest
# NOTE(review): the new column is not given a name here, so it is labelled
# `rank(-rate)` in the printed table; writing `rank = rank(-rate)` would
# produce a column literally called `rank`.
murders <- mutate(murders, rank(-rate))
murders

##                   state abb        region population total       rate
## 1               Alabama  AL         South    4779736   135  2.8244238
## 2                Alaska  AK          West     710231    19  2.6751860
## 3               Arizona  AZ          West    6392017   232  3.6295273
## 4              Arkansas  AR         South    2915918    93  3.1893901
## 5            California  CA          West   37253956  1257  3.3741383
## 6              Colorado  CO          West    5029196    65  1.2924531
## 7           Connecticut  CT     Northeast    3574097    97  2.7139722
## 8              Delaware  DE         South     897934    38  4.2319369
## 9  District of Columbia  DC         South     601723    99 16.4527532
## 10              Florida  FL         South   19687653   669  3.3980688
## 11              Georgia  GA         South    9920000   376  3.7903226
## 12               Hawaii  HI          West    1360301     7  0.5145920
## 13                Idaho  ID          West    1567582    12  0.7655102
## 14             Illinois  IL North Central   12830632   364  2.8369608
## 15              Indiana  IN North Central    6483802   142  2.1900730
## 16                 Iowa  IA North Central    3046355    21  0.6893484
## 17               Kansas  KS North Central    2853118    63  2.2081106
## 18             Kentucky  KY         South    4339367   116  2.6732010
## 19            Louisiana  LA         South    4533372   351  7.7425810
## 20                Maine  ME     Northeast    1328361    11  0.8280881
## 21             Maryland  MD         South    5773552   293  5.0748655
## 22        Massachusetts  MA     Northeast    6547629   118  1.8021791
## 23             Michigan  MI North Central    9883640   413  4.1786225
## 24            Minnesota  MN North Central    5303925    53  0.9992600
## 25          Mississippi  MS         South    2967297   120  4.0440846
## 26             Missouri  MO North Central    5988927   321  5.3598917
## 27              Montana  MT          West     989415    12  1.2128379
## 28             Nebraska  NE North Central    1826341    32  1.7521372
## 29               Nevada  NV          West    2700551    84  3.1104763
## 30        New Hampshire  NH     Northeast    1316470     5  0.3798036
## 31           New Jersey  NJ     Northeast    8791894   246  2.7980319
## 32           New Mexico  NM          West    2059179    67  3.2537239
## 33             New York  NY     Northeast   19378102   517  2.6679599
## 34       North Carolina  NC         South    9535483   286  2.9993237
## 35         North Dakota  ND North Central     672591     4  0.5947151
## 36                 Ohio  OH North Central   11536504   310  2.6871225
## 37             Oklahoma  OK         South    3751351   111  2.9589340
## 38               Oregon  OR          West    3831074    36  0.9396843
## 39         Pennsylvania  PA     Northeast   12702379   457  3.5977513
## 40         Rhode Island  RI     Northeast    1052567    16  1.5200933
## 41       South Carolina  SC         South    4625364   207  4.4753235
## 42         South Dakota  SD North Central     814180     8  0.9825837
## 43            Tennessee  TN         South    6346105   219  3.4509357
## 44                Texas  TX         South   25145561   805  3.2013603
## 45                 Utah  UT          West    2763885    22  0.7959810
## 46              Vermont  VT     Northeast     625741     2  0.3196211
## 47             Virginia  VA         South    8001024   250  3.1246001
## 48           Washington  WA          West    6724540    93  1.3829942
## 49        West Virginia  WV         South    1852994    27  1.4571013
## 50            Wisconsin  WI North Central    5686986    97  1.7056487
## 51              Wyoming  WY          West     563626     5  0.8871131
##    rank(-rate)
## 1           23
## 2           27
## 3           10
## 4           17
## 5           14
## 6           38
## 7           25
## 8            6
## 9            1
## 10          13
## 11           9
## 12          49
## 13          46
## 14          22
## 15          31
## 16          47
## 17          30
## 18          28
## 19           2
## 20          44
## 21           4
## 22          32
## 23           7
## 24          40
## 25           8
## 26           3
## 27          39
## 28          33
## 29          19
## 30          50
## 31          24
## 32          15
## 33          29
## 34          20
## 35          48
## 36          26
## 37          21
## 38          42
## 39          11
## 40          35
## 41           5
## 42          41
## 43          12
## 44          16
## 45          45
## 46          51
## 47          18
## 48          37
## 49          36
## 50          34
## 51          43

# EX 3: select

# Load dplyr
library(dplyr)

# Use select to only show state names and abbreviations from murders
select(murders, state, abb)

##                   state abb
## 1               Alabama  AL
## 2                Alaska  AK
## 3               Arizona  AZ
## 4              Arkansas  AR
## 5            California  CA
## 6              Colorado  CO
## 7           Connecticut  CT
## 8              Delaware  DE
## 9  District of Columbia  DC
## 10              Florida  FL
## 11              Georgia  GA
## 12               Hawaii  HI
## 13                Idaho  ID
## 14             Illinois  IL
## 15              Indiana  IN
## 16                 Iowa  IA
## 17               Kansas  KS
## 18             Kentucky  KY
## 19            Louisiana  LA
## 20                Maine  ME
## 21             Maryland  MD
## 22        Massachusetts  MA
## 23             Michigan  MI
## 24            Minnesota  MN
## 25          Mississippi  MS
## 26             Missouri  MO
## 27              Montana  MT
## 28             Nebraska  NE
## 29               Nevada  NV
## 30        New Hampshire  NH
## 31           New Jersey  NJ
## 32           New Mexico  NM
## 33             New York  NY
## 34       North Carolina  NC
## 35         North Dakota  ND
## 36                 Ohio  OH
## 37             Oklahoma  OK
## 38               Oregon  OR
## 39         Pennsylvania  PA
## 40         Rhode Island  RI
## 41       South Carolina  SC
## 42         South Dakota  SD
## 43            Tennessee  TN
## 44                Texas  TX
## 45                 Utah  UT
## 46              Vermont  VT
## 47             Virginia  VA
## 48           Washington  WA
## 49        West Virginia  WV
## 50            Wisconsin  WI
## 51              Wyoming  WY

# EX 4: filter

# Add the rate and rank columns needed for filtering
murders <- murders %>%
  mutate(
    rate = total / population * 100000,
    rank = rank(-rate)
  )

# Keep only the five states with the highest murder rates (rank 1 to 5)
murders %>% filter(rank <= 5)

##                  state abb        region population total      rate
## 1 District of Columbia  DC         South     601723    99 16.452753
## 2            Louisiana  LA         South    4533372   351  7.742581
## 3             Maryland  MD         South    5773552   293  5.074866
## 4             Missouri  MO North Central    5988927   321  5.359892
## 5       South Carolina  SC         South    4625364   207  4.475323
##   rank(-rate) rank
## 1           1    1
## 2           2    2
## 3           4    4
## 4           3    3
## 5           5    5

# EX 5: filter with !=

# Build no_south: every row of murders whose region is not "South"
no_south <- murders %>% filter(region != "South")

# Count how many rows remain
nrow(no_south)

## [1] 34

# EX 6: filter with %in%

# Build murders_nw: only the rows whose region is Northeast or West
murders_nw <- murders %>% filter(region %in% c("Northeast", "West"))

# Number of states (rows) in this category
nrow(murders_nw)

## [1] 22

# EX 7: filtering by two conditions

# Add the rate and rank columns
murders <- murders %>%
  mutate(rate = total / population * 100000, rank = rank(-rate))

# my_states: Northeast or West rows with a murder rate below 1
# (separate filter() arguments are combined with AND)
my_states <- filter(murders, region %in% c("Northeast", "West"), rate < 1)

# Show only the state name, the murder rate and the rank
select(my_states, state, rate, rank)

##           state      rate rank
## 1        Hawaii 0.5145920   49
## 2         Idaho 0.7655102   46
## 3         Maine 0.8280881   44
## 4 New Hampshire 0.3798036   50
## 5        Oregon 0.9396843   42
## 6          Utah 0.7959810   45
## 7       Vermont 0.3196211   51
## 8       Wyoming 0.8871131   43

# EX 8: Using the pipe %>%

# Define the rate and rank columns
murders <- murders %>%
  mutate(rate = total / population * 100000, rank = rank(-rate))

# Filter and select in a single pipeline, showing only state, rate and rank
murders %>%
  filter(region %in% c("Northeast", "West") & rate < 1) %>%
  select(state, rate, rank)

##           state      rate rank
## 1        Hawaii 0.5145920   49
## 2         Idaho 0.7655102   46
## 3         Maine 0.8280881   44
## 4 New Hampshire 0.3798036   50
## 5        Oregon 0.9396843   42
## 6          Utah 0.7959810   45
## 7       Vermont 0.3196211   51
## 8       Wyoming 0.8871131   43

# EX 9: mutate, filter and select

# Loading the libraries
library(dplyr)
data(murders)

# Build my_states in one pipeline: add rate and rank, keep the Northeast/West
# rows with a rate below 1, and retain only state, rate and rank
my_states <- murders %>%
  mutate(
    rate = total / population * 100000,
    rank = rank(-rate)
  ) %>%
  filter(rate < 1, region %in% c("Northeast", "West")) %>%
  select(state, rate, rank)

my_states

##           state      rate rank
## 1        Hawaii 0.5145920   49
## 2         Idaho 0.7655102   46
## 3         Maine 0.8280881   44
## 4 New Hampshire 0.3798036   50
## 5        Oregon 0.9396843   42
## 6          Utah 0.7959810   45
## 7       Vermont 0.3196211   51
## 8       Wyoming 0.8871131   43

3.3 Basic Plots

More populous states have more murders.

# Scale population to millions so the values match the variable name (and the
# default axis label that plot() takes from it).
population_in_millions <- murders$population / 10^6
total_gun_murders <- murders$total

# Scatterplot of total gun murders against state population (in millions)
plot(population_in_millions, total_gun_murders)

To look at the distribution of the data, we use a histogram.

class(murders$rate)

## [1] "NULL"

murders <- mutate(murders, rate =  total / population * 100000, rank = rank(-rate))
murders

##                   state abb        region population total       rate rank
## 1               Alabama  AL         South    4779736   135  2.8244238   23
## 2                Alaska  AK          West     710231    19  2.6751860   27
## 3               Arizona  AZ          West    6392017   232  3.6295273   10
## 4              Arkansas  AR         South    2915918    93  3.1893901   17
## 5            California  CA          West   37253956  1257  3.3741383   14
## 6              Colorado  CO          West    5029196    65  1.2924531   38
## 7           Connecticut  CT     Northeast    3574097    97  2.7139722   25
## 8              Delaware  DE         South     897934    38  4.2319369    6
## 9  District of Columbia  DC         South     601723    99 16.4527532    1
## 10              Florida  FL         South   19687653   669  3.3980688   13
## 11              Georgia  GA         South    9920000   376  3.7903226    9
## 12               Hawaii  HI          West    1360301     7  0.5145920   49
## 13                Idaho  ID          West    1567582    12  0.7655102   46
## 14             Illinois  IL North Central   12830632   364  2.8369608   22
## 15              Indiana  IN North Central    6483802   142  2.1900730   31
## 16                 Iowa  IA North Central    3046355    21  0.6893484   47
## 17               Kansas  KS North Central    2853118    63  2.2081106   30
## 18             Kentucky  KY         South    4339367   116  2.6732010   28
## 19            Louisiana  LA         South    4533372   351  7.7425810    2
## 20                Maine  ME     Northeast    1328361    11  0.8280881   44
## 21             Maryland  MD         South    5773552   293  5.0748655    4
## 22        Massachusetts  MA     Northeast    6547629   118  1.8021791   32
## 23             Michigan  MI North Central    9883640   413  4.1786225    7
## 24            Minnesota  MN North Central    5303925    53  0.9992600   40
## 25          Mississippi  MS         South    2967297   120  4.0440846    8
## 26             Missouri  MO North Central    5988927   321  5.3598917    3
## 27              Montana  MT          West     989415    12  1.2128379   39
## 28             Nebraska  NE North Central    1826341    32  1.7521372   33
## 29               Nevada  NV          West    2700551    84  3.1104763   19
## 30        New Hampshire  NH     Northeast    1316470     5  0.3798036   50
## 31           New Jersey  NJ     Northeast    8791894   246  2.7980319   24
## 32           New Mexico  NM          West    2059179    67  3.2537239   15
## 33             New York  NY     Northeast   19378102   517  2.6679599   29
## 34       North Carolina  NC         South    9535483   286  2.9993237   20
## 35         North Dakota  ND North Central     672591     4  0.5947151   48
## 36                 Ohio  OH North Central   11536504   310  2.6871225   26
## 37             Oklahoma  OK         South    3751351   111  2.9589340   21
## 38               Oregon  OR          West    3831074    36  0.9396843   42
## 39         Pennsylvania  PA     Northeast   12702379   457  3.5977513   11
## 40         Rhode Island  RI     Northeast    1052567    16  1.5200933   35
## 41       South Carolina  SC         South    4625364   207  4.4753235    5
## 42         South Dakota  SD North Central     814180     8  0.9825837   41
## 43            Tennessee  TN         South    6346105   219  3.4509357   12
## 44                Texas  TX         South   25145561   805  3.2013603   16
## 45                 Utah  UT          West    2763885    22  0.7959810   45
## 46              Vermont  VT     Northeast     625741     2  0.3196211   51
## 47             Virginia  VA         South    8001024   250  3.1246001   18
## 48           Washington  WA          West    6724540    93  1.3829942   37
## 49        West Virginia  WV         South    1852994    27  1.4571013   36
## 50            Wisconsin  WI North Central    5686986    97  1.7056487   34
## 51              Wyoming  WY          West     563626     5  0.8871131   43

hist(murders$rate)

#One extreme value
murders$state[which.max(murders$rate)]

## [1] "District of Columbia"

Boxplots are good for comparing different groupings, such as regions.

boxplot(rate~region, data= murders)

# EX 1: Scatterplots

# Load the datasets and define some variables
library(dslabs)
data(murders)

population_in_millions <- murders$population/10^6
total_gun_murders <- murders$total

plot(population_in_millions, total_gun_murders)

# Transform population using the log10 transformation and save to object log10_population
log10_population <- log10(murders$population)

# Transform total gun murders using log10 transformation and save to object log10_total_gun_murders
log10_total_gun_murders <- log10(total_gun_murders)

# Create a scatterplot with the log scale transformed population and murders 
plot(log10_population, log10_total_gun_murders)

# EX 2: Histograms

# Store the population in millions and save to population_in_millions 
population_in_millions <- murders$population/10^6


# Create a histogram of this variable
hist(population_in_millions)

# EX 3: Boxplots

# Create a boxplot of state populations by region for the murders dataset.
# The columns are looked up through the `data` argument, so the formula stays
# readable and any axis labels derived from it show the bare column names
# rather than `murders$...`.
boxplot(population ~ region, data = murders)

Module 4: Programming Basics

4.1 Introduction to Programming in R

4.2 Conditionals

library(dslabs)
data(murders)
# Murder rate per 100,000 inhabitants for every row of murders
murder_rate <- murders$total / murders$population * 100000

# Position of the lowest rate
ind <- which.min(murder_rate)

# Print that state when its rate is under 0.5; otherwise say none qualifies
if (!(murder_rate[ind] < 0.5)) {
  print("No state has murder rate that low")
} else {
  print(murders$state[ind])
}

## [1] "Vermont"

# Position of the lowest rate
ind <- which.min(murder_rate)

# Same check with a stricter threshold of 0.25
if (murder_rate[ind] >= 0.25) {
  print("No state has murder rate that low")
} else {
  print(murders$state[ind])
}

## [1] "No state has murder rate that low"

a <- c(0, 1, 2, -4, 5)

# Element-wise conditional: NA for entries that are not positive,
# the reciprocal for the rest
result <- ifelse(a <= 0, NA, 1 / a)
result

## [1]  NA 1.0 0.5  NA 0.2

4.3 Functions

# Arithmetic mean of a numeric vector: the total of its elements divided by
# how many elements there are.
avg <- function(x) {
  sum(x) / length(x)
}

x <- c(5,4,3,2)

avg(x)

## [1] 3.5

4.4 For Loops

# Sum of the first n positive integers, 1 + 2 + ... + n.
# seq_len() is used instead of 1:n so that n = 0 yields an empty sequence
# (sum 0) rather than c(1, 0) (sum 1).
compute_s_n <- function(n) {
  sum(seq_len(n))
}

compute_s_n(3) # 1+2+3

## [1] 6

compute_s_n(100)

## [1] 5050

We now want to repeat the process 25 times, once for each n from 1 to 25.

m <- 25
# Preallocate a numeric vector of length m to hold the results
# (vector(length = m) would create a logical vector instead)
s_n <- vector("numeric", m)

# Fill position n with compute_s_n(n) for n = 1, ..., m
for (n in seq_len(m)) {
  s_n[n] <- compute_s_n(n)
}

s_n

##  [1]   1   3   6  10  15  21  28  36  45  55  66  78  91 105 120 136 153
## [18] 171 190 210 231 253 276 300 325

n <- 1:m
plot(n, s_n)

Instead of loops, we usually use:

apply
sapply
tapply #

# EX 2: Conditionals

# Use the abbreviation for states whose name has more than 8 characters,
# and the full state name otherwise
new_names <- with(murders, ifelse(nchar(state) > 8, abb, state))
new_names

##  [1] "Alabama"  "Alaska"   "Arizona"  "Arkansas" "CA"       "Colorado"
##  [7] "CT"       "Delaware" "DC"       "Florida"  "Georgia"  "Hawaii"  
## [13] "Idaho"    "Illinois" "Indiana"  "Iowa"     "Kansas"   "Kentucky"
## [19] "LA"       "Maine"    "Maryland" "MA"       "Michigan" "MN"      
## [25] "MS"       "Missouri" "Montana"  "Nebraska" "Nevada"   "NH"      
## [31] "NJ"       "NM"       "New York" "NC"       "ND"       "Ohio"    
## [37] "Oklahoma" "Oregon"   "PA"       "RI"       "SC"       "SD"      
## [43] "TN"       "Texas"    "Utah"     "Vermont"  "Virginia" "WA"      
## [49] "WV"       "WI"       "Wyoming"

# EX 4: Defining functions

# Create function called `sum_n`
# Returns 1 + 2 + ... + n. seq_len() keeps n = 0 an empty sum (0); with 1:n
# the sequence would be c(1, 0) and the result 1.
sum_n <- function(n) {
  sum(seq_len(n))
}

# Determine the sum of integers from 1 to 5000
sum_n(5000)

## [1] 12502500

# EX 5: Defining functions continued...

# Create `altman_plot`
# Draws a scatterplot with the sum x + y on the horizontal axis and the
# difference y - x on the vertical axis.
# Arguments:
#   x, y: values combined with `+` and `-` before plotting
#         (presumably numeric vectors of equal length -- TODO confirm)
# Called for its plotting side effect.
altman_plot <- function(x, y){
    plot(x + y, y - x)
}

x <- c(1,2,3,4,5)

y <- c(2,4,6,8,10)

altman_plot(x,y)

# Run this code
x <- 3

# The assignment to x inside the body creates a variable local to the
# function call; the x defined above is left untouched.
my_func <- function(y) {
  x <- 5
  y + 5
}

# Print value of x 
    
x

## [1] 3

# EX 7: For loops
# Here is a function that adds numbers from 1 to n.
# seq_len() is used so that n = 0 gives an empty sum (0) instead of summing
# the sequence c(1, 0) that 1:0 would produce.
example_func <- function(n) {
  sum(seq_len(n))
}

# Here is the sum of the first 100 numbers
example_func(100)

## [1] 5050

# compute_s_n(n): sum of the squares of the first n positive integers,
# 1^2 + 2^2 + ... + n^2.
# seq_len() keeps n = 0 an empty sum (0) instead of squaring c(1, 0).
compute_s_n <- function(n) {
  sum(seq_len(n)^2)
}

# Report the value of the sum when n=10
compute_s_n(10)

## [1] 385

# EX 8: For loops continued...

# Define a function and store it in `compute_s_n`:
# sum of the squares 1^2 + 2^2 + ... + n^2.
# seq_len() keeps n = 0 an empty sum (0) instead of squaring c(1, 0).
compute_s_n <- function(n) {
  sum(seq_len(n)^2)
}

# Create a vector for storing results
s_n <- vector("numeric", 25)

# Store compute_s_n(i) in position i for every slot of s_n
for (i in seq_along(s_n)) {
  s_n[i] <- compute_s_n(i)
}

# EX 9: Checking our math

# Define the function: sum of the squares 1^2 + 2^2 + ... + n^2.
# seq_len() keeps n = 0 an empty sum (0) instead of squaring c(1, 0).
compute_s_n <- function(n) {
  sum(seq_len(n)^2)
}

# Define the vector of n
n <- 1:25

# Define the vector to store data, one slot per value of n
s_n <- vector("numeric", length(n))
for (i in n) {
  s_n[i] <- compute_s_n(i)
}

#  Create the plot 
plot(n, s_n)

# EX 10: Checking our math continued

# Define the function: sum of the squares 1^2 + 2^2 + ... + n^2.
# seq_len() keeps n = 0 an empty sum (0) instead of squaring c(1, 0).
compute_s_n <- function(n) {
  sum(seq_len(n)^2)
}

# Define the vector of n
n <- 1:25

# Define the vector to store data, one slot per value of n
s_n <- vector("numeric", length(n))
for (i in n) {
  s_n[i] <- compute_s_n(i)
}

# Check that s_n is identical to the closed-form formula
# n(n + 1)(2n + 1) / 6.
identical(s_n, n * (n + 1) * (2 * n + 1) / 6)

## [1] TRUE

Data Science R Basics

H. Gjerning

2019-03-10