## Example
# ---
# Assign the value 5 to the variable x and the value 16 to the variable y.
# ---
#
x <- 5
y <- 16
## Then add x and y together using the addition operator +
# ---
#
x + y
## [1] 21
## Challenge
# ---
# Question: Using our understanding from the example above,
# subtract x from y (that is, compute y - x) using the operator -
# ---
# OUR CODE GOES BELOW
#
# "Subtract x from y" makes y the left operand; x - y would instead
# subtract y from x and give -11.
y - x
## [1] 11
## Example
# ---
# Build two vectors, v and t, mixing numeric, logical and complex values
# ---
v <- c(3, 1, TRUE, 2 + 3i)
t <- c(4, 1, FALSE, 2 + 3i)
# Combine them with the element-wise logical AND operator &
v & t
## [1] TRUE TRUE FALSE TRUE
## Example
# ---
# Rebuild the vectors v and t with different values
# ---
v <- c(0, 0, TRUE, 2 + 2i)
t <- c(0, 3, TRUE, 2 + 3i)
# Combine them with the element-wise logical OR operator |
v | t
## [1] FALSE TRUE TRUE TRUE
## Example
# ---
# Question: Let's create two vectors v and t
# ---
#
v <- c(3, 1, TRUE, 2 + 3i)
t <- c(4, 1, FALSE, 2 + 3i)
# This time, use the scalar logical OR operator ||.
# Unlike the element-wise |, || works on single values only (since R 4.3.0
# it raises an error when given longer vectors), so we compare the first
# element of each vector.
v[1] || t[1]
## [1] TRUE
## Example
# ---
# Question: Create the variables v1, v2 and v3, assigning the same vector to
# each of them with one of the left assignment operators <-, <<- and =
# ---
v1 <- c(3, 1, TRUE, 2 + 3i)
v2 <<- c(3, 1, TRUE, 2 + 3i)
v3 = c(3, 1, TRUE, 2 + 3i)
# Print v1
v1
## [1] 3+0i 1+0i 1+0i 2+3i
# Print v2
v2
## [1] 3+0i 1+0i 1+0i 2+3i
# Print v3
v3
## [1] 3+0i 1+0i 1+0i 2+3i
## Challenge
# ---
# Question: Use the right assignment operators -> and ->> to assign vectors
# to the variables v1 and v2
# ---
c(3, 1, TRUE, 2 + 3i) -> v1
c(3, 1, TRUE, 2 + 3i) ->> v2
# Print v1 to see what was stored
v1
## [1] 3+0i 1+0i 1+0i 2+3i
# Print v2 as well
v2
## [1] 3+0i 1+0i 1+0i 2+3i
## Example
# ---
# Question: Assign the vectors below to their variables, using a different
# assignment operator (=, <- and ->) for each one
# ---
variable.1 = c(3, 4, 5, 6)
variable.2 <- c("Hello", " there")
c(TRUE, 2) -> variable.3
# Print each variable to see what was stored
variable.1
## [1] 3 4 5 6
variable.2
## [1] "Hello" " there"
variable.3
## [1] 1 2
## Example
# ---
# Question: To learn about the numeric data type,
# assign the value 62.4 to the variable m
# ---
m <- 62.4
# Print the value of m, then its class
m
## [1] 62.4
class(m)
## [1] "numeric"
## Example
# ---
# Create the integer 3 and assign it to the variable n
# ---
n <- as.integer(3)
# Print n to see what it holds, then its class
n
## [1] 3
class(n)
## [1] "integer"
# As another example, convert 3.14 to an integer
# and assign the converted value to the variable p
p <- as.integer(3.14)
# Print p to see the value that was stored, then its class
p
## [1] 3
class(p)
## [1] "integer"
## Example
# ---
# A complex number is assigned to a variable in the same way; here to k
# ---
k <- 1 + 2i
# Print k, then its class
k
## [1] 1+2i
class(k)
## [1] "complex"
## Example
# ---
# To obtain a logical value, first create the two variables x and y
# ---
x <- 4
y <- 6
# Check whether x is greater than y and keep the answer in z
z <- x > y
# Print the logical value, then its class
z
## [1] FALSE
class(z)
## [1] "logical"
## Example
# ---
# Convert the value 62.48 to a character string and store it in the variable g
# ---
g <- as.character(62.48)
# Print the character string g, then its class
g
## [1] "62.48"
class(g)
## [1] "character"
## Example
# ---
# Lists are R objects which can contain elements of different types,
# such as numbers, strings, vectors, matrices, functions and other lists.
# ---
# Question: Create a list using the list() function
# ---
alist <- list(
  "Red",
  "Blue",
  c(42, 36, 1),
  FALSE,
  73.91,
  128.6
)
# Print alist
alist
## [[1]]
## [1] "Red"
##
## [[2]]
## [1] "Blue"
##
## [[3]]
## [1] 42 36 1
##
## [[4]]
## [1] FALSE
##
## [[5]]
## [1] 73.91
##
## [[6]]
## [1] 128.6
## Example
# ---
# Question: Create the vectors a, b and c
# ---
# numeric vector
a <- c(1, 2, 5.3, 6, -2, 4)
# character vector
b <- c("one", "two", "three")
# logical vector; naming it c does not stop later c(...) calls from working,
# as the rest of this script shows
c <- c(TRUE, TRUE, TRUE, FALSE, TRUE, FALSE)
# Print the vectors a, b and c
a
## [1] 1.0 2.0 5.3 6.0 -2.0 4.0
b
## [1] "one" "two" "three"
c
## [1] TRUE TRUE TRUE FALSE TRUE FALSE
## Example
# ---
# The elements of a list can be given names, which then allow
# access to those elements, as shown below
# ---
# Question: Create a list containing a vector, a matrix and a list
# ---
list_data <- list(
  c("Jan", "Feb", "Mar"),
  matrix(c(3, 9, 5, 1, -2, 8), nrow = 2),
  list("green", 12.3)
)
# Give a name to each of the three elements
names(list_data) <- c("1st Quarter", "A_Matrix", "A Inner list")
# Print the list list_data
list_data
## $`1st Quarter`
## [1] "Jan" "Feb" "Mar"
##
## $A_Matrix
## [,1] [,2] [,3]
## [1,] 3 5 -2
## [2,] 9 1 8
##
## $`A Inner list`
## $`A Inner list`[[1]]
## [1] "green"
##
## $`A Inner list`[[2]]
## [1] 12.3
## Example
# ---
# The names() function gives a name to each element of a vector
# ---
a <- c("Serena Williams", "Tennis Player")
names(a) <- c("Name", "Profession")
# Print the named vector
a
## Name Profession
## "Serena Williams" "Tennis Player"
## Example
# ---
# The elements of a list can be reached with square brackets, using
# numeric indices, negative (excluding) indices or element names.
# ---
# Question: Let's now create a list ls
# ---
ls <- list(
  first = 2,
  second = 4,
  third = list(fourth = 3.2, fifth = 6.4)
)
# Select the first two elements by position
ls[1:2]
## $first
## [1] 2
##
## $second
## [1] 4
# Select everything except the third element
ls[-3]
## $first
## [1] 2
##
## $second
## [1] 4
# Select the same two elements by name
ls[c("first", "second")]
## $first
## [1] 2
##
## $second
## [1] 4
## Example
# ---
# Create a numeric vector a
# ---
a <- c(1, 2, 5.3, 6, -2, 4)
# Select the 2nd and 3rd elements of the vector
a[c(2, 3)]
## [1] 2.0 5.3
## Example
# ---
# New elements can be added to a list by assigning to a position past its end
# ---
# Question: Create a list containing a vector and a matrix
# ---
list_data <- list(
  c("Jan", "Feb", "Mar"),
  matrix(c(3, 9, 5, 1, -2, 8), nrow = 2)
)
# Add an element at position 4 and print it. The list only had two elements,
# so position 3 is created as well and holds NULL (see the full print below).
list_data[4] <- "New element"
list_data[4]
## [[1]]
## [1] "New element"
# Name all four positions, then print the whole list
names(list_data) <- c("Vector", "Matrix", "NULL", "Character")
list_data
## $Vector
## [1] "Jan" "Feb" "Mar"
##
## $Matrix
## [,1] [,2] [,3]
## [1,] 3 5 -2
## [2,] 9 1 8
##
## $`NULL`
## NULL
##
## $Character
## [1] "New element"
## Challenge
# ---
# Question: Create an empty list named months_of_the_year,
# then add all 12 months of the year
# ---
# Start from an empty list and print it
months_of_the_year <- list()
months_of_the_year
## list()
# Fill positions 1 to 12 in one assignment, one month per list element
months_of_the_year[1:12] <- as.list(c(
  "January", "February", "March", "April", "May", "June",
  "July", "August", "September", "October", "November", "December"
))
# Label every position with its ordinal name
names(months_of_the_year) <- c(
  "First", "Second", "Third", "Fourth", "Fifth", "Sixth",
  "Seventh", "Eighth", "Ninth", "Tenth", "Eleventh", "Twelfth"
)
# Print the completed list
months_of_the_year
## $First
## [1] "January"
##
## $Second
## [1] "February"
##
## $Third
## [1] "March"
##
## $Fourth
## [1] "April"
##
## $Fifth
## [1] "May"
##
## $Sixth
## [1] "June"
##
## $Seventh
## [1] "July"
##
## $Eighth
## [1] "August"
##
## $Ninth
## [1] "September"
##
## $Tenth
## [1] "October"
##
## $Eleventh
## [1] "November"
##
## $Twelfth
## [1] "December"
## Example
# ---
# Question: Vectors can be combined with the function c(), as shown
# ---
p <- c(1, 2, 3)
q <- c("aa", "bb", "cc")
# Print the combined vector; the numbers are converted to character strings
c(p, q)
## [1] "1" "2" "3" "aa" "bb" "cc"
## Challenge
# ---
# Question: Combine the following vectors and print out the result below
# ---
a <- c("Serena Williams", "Tennis Player")
names(a) <- c("Name", "Profession")
# Combine the names of a with the values of a
c(names(a), a)
## Name Profession
## "Name" "Profession" "Serena Williams" "Tennis Player"
## Example
# ---
# Question: Let's create a matrix mymat with 4 rows and 3 columns
# ---
mymat <- matrix(1:12, nrow = 4, ncol = 3)
# Print mymat; the values fill the matrix column by column
mymat
## [,1] [,2] [,3]
## [1,] 1 5 9
## [2,] 2 6 10
## [3,] 3 7 11
## [4,] 4 8 12
## Example
# ---
# Question: Let's use the byrow=TRUE argument to fill the matrix
# row by row instead of column by column
# ---
mymat <- matrix(1:12, ncol = 3, byrow = TRUE)
# Print the new mymat
mymat
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 4 5 6
## [3,] 7 8 9
## [4,] 10 11 12
## Example
# ---
# To remember what is stored in a matrix, you can name its columns and rows.
# The names also make the data easier to read and to select from.
# ---
# Question: Let's create the vectors kenya, ethiopia and chad
# ---
kenya <- c(460.998, 314.4)
ethiopia <- c(290.475, 247.900)
chad <- c(309.306, 165.8)
# Stack the three vectors into a matrix with one row per country
geography_matrix <- matrix(
  c(kenya, ethiopia, chad),
  nrow = 3,
  byrow = TRUE
)
# The vectors location and countries hold the labels used for naming
location <- c("Lat", "Long")
countries <- c("Kenya", "Ethiopia", "Chad")
# Name the columns with location
colnames(geography_matrix) <- location
# Name the rows with countries
rownames(geography_matrix) <- countries
# Finally print geography_matrix
geography_matrix
## Lat Long
## Kenya 460.998 314.4
## Ethiopia 290.475 247.9
## Chad 309.306 165.8
## Challenge
# ---
# Question: Create a matrix family with column names Name, Age, Gender and Occupation.
# Populate it with 5 of your own family members.
# ---
# One vector per family member; mixing text and numbers makes every value text
fam1 <- c("Rose", 56, "Female", "Self")
fam2 <- c("Caro", 31, "Female", "Self")
fam3 <- c("Jeff", 24, "Male", "Unemployed")
# One row per family member
fam_matrix <- matrix(c(fam1, fam2, fam3), nrow = 3, byrow = TRUE)
# Label the columns
cols <- c("Name", "Age", "Gender", "Occupation")
colnames(fam_matrix) <- cols
fam_matrix
## Name Age Gender Occupation
## [1,] "Rose" "56" "Female" "Self"
## [2,] "Caro" "31" "Female" "Self"
## [3,] "Jeff" "24" "Male" "Unemployed"
You can add a column to a matrix using the cbind() function.
## Example
# ---
# Question: Create the 3 x 3 matrix x below
# ---
x <- matrix(1:9, nrow = 3)
x
## [,1] [,2] [,3]
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
# Then add a fourth column with cbind()
x_new <- cbind(x, c(1, 2, 3))
x_new
## [,1] [,2] [,3] [,4]
## [1,] 1 4 7 1
## [2,] 2 5 8 2
## [3,] 3 6 9 3
## Challenge
# ---
# Question: Add a column residence to the fictional family matrix created earlier
# ---
# cbind() appends the vector as a fifth column.
# NOTE(review): the new column gets no column name (the printed header below
# has only four labels); writing Residence = c(...) inside cbind() would label it.
fam_new <- cbind(fam_matrix, c("Kenya", "USA", "Kenya"))
fam_new
## Name Age Gender Occupation
## [1,] "Rose" "56" "Female" "Self" "Kenya"
## [2,] "Caro" "31" "Female" "Self" "USA"
## [3,] "Jeff" "24" "Male" "Unemployed" "Kenya"
## Example
# ---
# Question: Create the 3 x 3 matrix x
# ---
x <- matrix(1:9, nrow = 3)
x
## [,1] [,2] [,3]
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
# A row is added with the rbind() function
x_new2 <- rbind(x, c(1, 2, 3))
x_new2
## [,1] [,2] [,3]
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
## [4,] 1 2 3
## Challenge
# ---
# Question: Add a fictional character to your fictional family matrix
# ---
# rbind() appends the new member as a fourth row; the vector supplies one
# value for each of the five columns of fam_new
fam_new2 <- rbind(
  fam_new,
  c("Change", 8, "Male", "Changes", "Everywhere")
)
fam_new2
## Name Age Gender Occupation
## [1,] "Rose" "56" "Female" "Self" "Kenya"
## [2,] "Caro" "31" "Female" "Self" "USA"
## [3,] "Jeff" "24" "Male" "Unemployed" "Kenya"
## [4,] "Change" "8" "Male" "Changes" "Everywhere"
To select an element of a matrix, one needs to specify both the row and the column as shown:
## Example
# ---
# Question: Create the following matrix
# ---
x <- matrix(1:9, nrow = 3)
x
## [,1] [,2] [,3]
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
# Select elements from the matrix above using [row, column] indexing;
# leaving the row or the column empty selects all of them.
# ---
x[1, 3] # the element at the 1st row, 3rd column
## [1] 7
x[2, ] # the 2nd row
## [1] 2 5 8
x[, 3] # the 3rd column
## [1] 7 8 9
## Challenge
# ---
# Question: Select the last member of your family
# ---
# The family matrix has four rows, so the last member is row 4
fam_new2[4, ]
## Name Age Gender Occupation
## "Change" "8" "Male" "Changes" "Everywhere"
## Challenge
# ---
# Question: Select the first member of your family
# ---
fam_new2[1, ]
## Name Age Gender Occupation
## "Rose" "56" "Female" "Self" "Kenya"
## Example
# ---
# Question: Matrix addition and subtraction require the matrices to have the same dimensions.
# Let's start by creating the 2 x 3 matrices x and y.
# ---
x <- matrix(c(3, 9, -1, 4, 2, 6), nrow = 2)
y <- matrix(c(5, 2, 0, 9, 3, 4), nrow = 2)
# Print matrix x
x
## [,1] [,2] [,3]
## [1,] 3 -1 2
## [2,] 9 4 6
# Print matrix y
y
## [,1] [,2] [,3]
## [1,] 5 0 3
## [2,] 2 9 4
# Add the matrices element by element and print the result
x + y
## [,1] [,2] [,3]
## [1,] 8 -1 5
## [2,] 11 13 10
# Subtract the matrices element by element
x - y
## [,1] [,2] [,3]
## [1,] -2 -1 -1
## [2,] 7 -5 2
## Challenge
# ---
# Question: Let's create two 2 x 3 matrices x and y
# ---
x <- matrix(c(3, 9, -1, 4, 2, 6), nrow = 2)
y <- matrix(c(5, 2, 0, 9, 3, 4), nrow = 2)
x
## [,1] [,2] [,3]
## [1,] 3 -1 2
## [2,] 9 4 6
y
## [,1] [,2] [,3]
## [1,] 5 0 3
## [2,] 2 9 4
# Multiply the matrices element by element with * and store the result in z
z <- x * y
# Print the matrix z
z
## [,1] [,2] [,3]
## [1,] 15 0 6
## [2,] 18 36 24
# Lastly divide x by y element by element and store the result in z;
# the entry -1 / 0 comes out as -Inf
z <- x / y
z
## [,1] [,2] [,3]
## [1,] 0.6 -Inf 0.6666667
## [2,] 4.5 0.4444444 1.5000000
As we have noted, factors are variables in R which take on a limited number of different values; such variables are often referred to as categorical variables.
In a dataset, we can distinguish two types of variables: categorical and continuous.
In a categorical variable, the value is limited and usually based on a particular finite group. For example, a categorical variable can be countries, year, gender, occupation. A continuous variable, however, can take any values, from integer to decimal. For example, we can have the revenue, price of a share, etc.
## Example
# ---
# Question: Let's create a vector v
# ---
v <- c(1, 3, 5, 8, 2, 1, 3, 5, 3, 5)
v
## [1] 1 3 5 8 2 1 3 5 3 5
# Check whether this vector is a factor
is.factor(v)
## [1] FALSE
## Challenge
# ---
# Question: Convert v to a factor as shown and work out why the output
# lists the levels 1 2 3 5 8
# ---
factor(v)
## [1] 1 3 5 8 2 1 3 5 3 5
## Levels: 1 2 3 5 8
## Example
# ---
# Question: Assign the factor of v to x and print out x
# ---
x <- factor(v)
x
## [1] 1 3 5 8 2 1 3 5 3 5
## Levels: 1 2 3 5 8
## Challenge
# ---
# Question: Determine whether x is a factor below.
# Hint: Do it the same way as when checking whether the vector v is a factor
# ---
is.factor(x)
## [1] TRUE
## Example
# ---
# Question: First create a vector as input and check whether it is a factor,
# then apply the factor function to create a factor from the vector
# ---
data <- c(
  "East", "West", "East", "North", "North", "East",
  "West", "West", "West", "East", "North"
)
# Print the vector
data
## [1] "East" "West" "East" "North" "North" "East" "West" "West" "West"
## [10] "East" "North"
# Check whether the vector is a factor
is.factor(data)
## [1] FALSE
# Create a factor from the vector with the factor function
factor_data <- factor(data)
# Look at the newly created factor
factor_data
## [1] East West East North North East West West West East North
## Levels: East North West
# Check whether the new object is a factor
is.factor(factor_data)
## [1] TRUE
# Example
# ---
# Creating a factor, determine and check the levels
# ---
# OUR CODE GOES BELOW
#
sex <- factor(c("male", "female", "female", "male"))
# Determining the levels
levels(sex)
## [1] "female" "male"
# Then checking the number of levels using nlevels()
nlevels(sex)
## [1] 2
# Sometimes, the order of the factors does not matter, other times you might want to specify the order
# because it is meaningful (e.g., “low”, “medium”, “high”) or it is required by particular type of analysis.
# Additionally, specifying the order of the levels allows us to compare levels:
food <- factor(c("low", "high", "medium", "high", "low", "medium", "high"))
# then print out levels of food; without an explicit order they are sorted alphabetically
levels(food)
## [1] "high" "low" "medium"
nlevels(food)
## [1] 3
# Now specify the meaningful order explicitly; ordered = TRUE makes the
# factor ordered so that its values can be compared with < and >
food <- factor(food, levels = c("low", "medium", "high"), ordered = TRUE)
levels(food)
## [1] "low" "medium" "high"
# The first value ("low") now ranks below the second value ("high")
food[1] < food[2]
## [1] TRUE
A data frame is used for storing data tables. Unlike a matrix, each column of a data frame can contain a different mode of data.
## Example
# ---
# Question: Let's create a data frame BMI with one row per person
# ---
BMI <- data.frame(
  gender = c("Male", "Male", "Female"),
  height = c(152, 171.5, 165),
  weight = c(81, 93, 78),
  Age = c(42, 38, 26)
)
# Print the data frame
BMI
## gender height weight Age
## 1 Male 152.0 81 42
## 2 Male 171.5 93 38
## 3 Female 165.0 78 26
## Challenge
# ---
# Question: Create a data frame family with column names Name, Age, Gender and Occupation.
# Populate it with 5 of your own family members.
# ---
fam_df <- data.frame(
  Name = c("Rose", "Caro", "Jeff"),
  Age = c(56, 31, 24),
  Gender = c("Female", "Female", "Male"),
  Occupation = c("Self", "Self", "Unemployed")
)
# Print the data frame
fam_df
## Name Age Gender Occupation
## 1 Rose 56 Female Self
## 2 Caro 31 Female Self
## 3 Jeff 24 Male Unemployed
## Example
# ---
# Question: Selecting elements from the BMI dataframe with [rows, columns]
# ---
# Row 1
BMI[1, ]
## gender height weight Age
## 1 Male 152 81 42
# Rows 1 to 2
BMI[1:2, ]
## gender height weight Age
## 1 Male 152.0 81 42
## 2 Male 171.5 93 38
# Column 1 (a single column comes back as a plain vector)
BMI[, 1]
## [1] "Male" "Male" "Female"
# Columns 1 to 2
BMI[, 1:2]
## gender height
## 1 Male 152.0
## 2 Male 171.5
## 3 Female 165.0
# The value at row 1 of column 2
BMI[1, 2]
## [1] 152
## Challenge
# ---
# Question: Select the column 2 from the BMI dataframe
# ---
# OUR CODE GOES BELOW
#
# The question asks about BMI (not the family data frame); its second
# column is height, returned here as a numeric vector
BMI[, 2]
## [1] 152.0 171.5 165.0
## Challenge
# ---
# Question: Select the second and third members of your family
# ---
# Rows 2 to 3 of the family data frame, keeping every column
fam_df[2:3, ]
## Name Age Gender Occupation
## 2 Caro 31 Female Self
## 3 Jeff 24 Male Unemployed
## Example
# ---
# Question: Sort the BMI dataframe by using the order() function
# ---
# order() returns the row positions that put a column into sorted order;
# indexing BMI with those positions rearranges its rows.
#
# Sort in ascending order by gender
# ---
#
sorted_by_gender <- BMI[order(BMI$gender), ]
# Print out sorted_by_gender below
# ---
#
sorted_by_gender
## gender height weight Age
## 3 Female 165.0 78 26
## 1 Male 152.0 81 42
## 2 Male 171.5 93 38
# Sort in descending order by weight (negating a numeric column reverses its order)
# ---
#
sorted_by_weight <- BMI[order(-BMI$weight), ]
# Print out sorted_by_weight below
# ---
#
sorted_by_weight
## gender height weight Age
## 2 Male 171.5 93 38
## 1 Male 152.0 81 42
## 3 Female 165.0 78 26
# And sort in descending order by gender below
# ---
# OUR CODE GOES BELOW
#
# gender holds character strings, so it cannot be negated like weight;
# decreasing = TRUE asks order() for the reverse ordering instead
sorted_by_gender2 <- BMI[order(BMI$gender, decreasing = TRUE), ]
sorted_by_gender2
## gender height weight Age
## 1 Male 152.0 81 42
## 2 Male 171.5 93 38
## 3 Female 165.0 78 26
As we have mentioned, a data table provides an enhanced version of data.frames.
The data.table R package is considered one of the fastest packages for data manipulation.
# Load the data.table package
# ---
#
library(data.table)
## Example
# ---
# Question: Create a data table DT with an ID column and three integer columns
# ---
DT <- data.table(
  ID = c("b", "b", "b", "a", "a", "c"),
  a = 1:6,
  b = 7:12,
  c = 13:18
)
# Print it to see what it looks like
DT
## ID a b c
## 1: b 1 7 13
## 2: b 2 8 14
## 3: b 3 9 15
## 4: a 4 10 16
## 5: a 5 11 17
## 6: c 6 12 18
## Example
# ---
# Question: Select elements from the given data table DT
# ---
# Row 1
DT[1, ]
## ID a b c
## 1: b 1 7 13
# Rows 1 to 2
DT[1:2, ]
## ID a b c
## 1: b 1 7 13
## 2: b 2 8 14
# Column 1; the result is still a data table, not a plain vector
DT[, 1]
## ID
## 1: b
## 2: b
## 3: b
## 4: a
## 5: a
## 6: c
# Columns 1 to 2
DT[, 1:2]
## ID a
## 1: b 1
## 2: b 2
## 3: b 3
## 4: a 4
## 5: a 5
## 6: c 6
# Row 1 of column 2
DT[1, 2]
## a
## 1: 1
# Select the third and fourth rows from the data table
DT[3:4, ]
## ID a b c
## 1: b 3 9 15
## 2: a 4 10 16
## Example
# ---
# Question: Sorting the data table in ascending order by c
# ---
# order() gives the row positions that sort column c
sorted_by_c <- DT[order(DT$c), ]
# Print sorted_by_c
sorted_by_c
## ID a b c
## 1: b 1 7 13
## 2: b 2 8 14
## 3: b 3 9 15
## 4: a 4 10 16
## 5: a 5 11 17
## 6: c 6 12 18
# Sort in descending order by b; negating the column reverses the order
sorted_by_b <- DT[order(-DT$b), ]
# Finally print sorted_by_b
sorted_by_b
## ID a b c
## 1: c 6 12 18
## 2: a 5 11 17
## 3: a 4 10 16
## 4: b 3 9 15
## 5: b 2 8 14
## 6: b 1 7 13
Tibbles are data frames, but they tweak some older behaviours to make life a little easier. They also have an enhanced print() method which makes them easier to use with large datasets containing complex objects.
You can create a new tibble from individual vectors with tibble(). tibble() will automatically recycle inputs of length 1, and allows you to refer to variables that you just created, as shown below.
# First, we load the tibble package
library(tibble)
## Example
# ---
# Question: Create a tibble tb
# ---
# y is a single value that is recycled to every row,
# and z is computed from the x and y columns defined just before it
tb <- tibble(
  x = 1:5,
  y = 1,
  z = x^2 + y
)
# Print the created tibble
tb
## # A tibble: 5 x 3
## x y z
## <int> <dbl> <dbl>
## 1 1 1 2
## 2 2 1 5
## 3 3 1 10
## 4 4 1 17
## 5 5 1 26
## Example
# ---
# Question: Find out what each of the following selections prints
# ---
# Row 1
tb[1, ]
## # A tibble: 1 x 3
## x y z
## <int> <dbl> <dbl>
## 1 1 1 2
# Rows 1 to 2
tb[1:2, ]
## # A tibble: 2 x 3
## x y z
## <int> <dbl> <dbl>
## 1 1 1 2
## 2 2 1 5
# Column 1; the result is still a tibble
tb[, 1]
## # A tibble: 5 x 1
## x
## <int>
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
# Columns 1 to 2
tb[, 1:2]
## # A tibble: 5 x 2
## x y
## <int> <dbl>
## 1 1 1
## 2 2 1
## 3 3 1
## 4 4 1
## 5 5 1
# Select the second and third rows
tb[2:3, ]
## # A tibble: 2 x 3
## x y z
## <int> <dbl> <dbl>
## 1 2 1 5
## 2 3 1 10
## Example
# ---
# Question: Find out what happens when we sort by doing the following
# ---
# Ascending order by z
sorted_by_1 <- tb[order(tb$z), ]
sorted_by_1
## # A tibble: 5 x 3
## x y z
## <int> <dbl> <dbl>
## 1 1 1 2
## 2 2 1 5
## 3 3 1 10
## 4 4 1 17
## 5 5 1 26
# Descending order by x; negating the column reverses the order
sorted_by_2 <- tb[order(-tb$x), ]
sorted_by_2
## # A tibble: 5 x 3
## x y z
## <int> <dbl> <dbl>
## 1 5 1 26
## 2 4 1 17
## 3 3 1 10
## 4 2 1 5
## 5 1 1 2
# Sort tb in ascending order by x
sorted_by_3 <- tb[order(tb$x), ]
sorted_by_3
## # A tibble: 5 x 3
## x y z
## <int> <dbl> <dbl>
## 1 1 1 2
## 2 2 1 5
## 3 3 1 10
## 4 4 1 17
## 5 5 1 26
In R, the missing values are shown by the symbol NA. To identify missing values in your dataset the function is.na() is normally used.
## Example
# ---
# Let's create a dataset dt in which the third row has a missing Name and Age
# ---
Name <- c("John", "Tim", NA)
Sex <- c("men", "men", "women")
Age <- c(45, 53, NA)
dt <- data.frame(Name, Sex, Age)
# Print the dataset
dt
## Name Sex Age
## 1 John men 45
## 2 Tim men 53
## 3 <NA> women NA
# Identify the missing data with the function is.na(),
# which marks every missing cell as TRUE
is.na(dt)
## Name Sex Age
## [1,] FALSE FALSE FALSE
## [2,] FALSE FALSE FALSE
## [3,] TRUE FALSE TRUE
# Example
# ---
# The total number of missing values in each column is found
# by summing those TRUE marks with the function colSums()
# ---
colSums(is.na(dt))
## Name Sex Age
## 1 0 1
Using na.omit() to omit all rows containing missing values.
## Example
# ---
# Question: Show all rows from the dataset which don't contain any missing values
# ---
# OUR CODE GOES BELOW
#
na.omit(dt)
## Name Sex Age
## 1 John men 45
## 2 Tim men 53
## Example
# ---
# Question: Recode/fill the missing value in a column with a number
# ---
# OUR CODE GOES BELOW
#
# Work on a copy so that dt keeps its missing Age for the next example
dt_filled <- dt
dt_filled$Age[is.na(dt_filled$Age)] <- 99
dt_filled
## Name Sex Age
## 1 John men 45
## 2 Tim men 53
## 3 <NA> women 99
## Example
# ---
# Question: Recode or fill the missing value in a column with the mean value of the column
# ---
# OUR CODE GOES BELOW
#
# na.rm = TRUE makes mean() skip the missing entry, so the fill value is
# the mean of the known ages 45 and 53, which is 49
dt$Age[is.na(dt$Age)] <- mean(dt$Age, na.rm = TRUE)
# print the dt table below
dt
## Name Sex Age
## 1 John men 45
## 2 Tim men 53
## 3 <NA> women 49
## Challenge 1
# ---
# Question: Using the given bus dataset below, recode the missing values of the payment_method
# and travel_to columns with the appropriate values
# ---
# OUR CODE GOES BELOW
#
# Let's first of all import our data table
# ---
# The two commented-out lines below are an alternative that downloads the data
# with data.table::fread(); the active line reads a local copy instead.
# NOTE(review): the path is specific to one Windows machine — adjust it to
# wherever buses-western-nairobi.csv is stored before running.
#
#library("data.table")
#bus_dataset <- fread('http://bit.ly/BusNairobiWesternTransport')
bus_dataset <- read.csv("C:/Users/user/Downloads/buses-western-nairobi.csv")
# First have a look at the first rows of the dataset
# --
#
head(bus_dataset)
## ride_id seat_number payment_method payment_receipt travel_date travel_time
## 1 1442 15A Mpesa UZUEHCBUSO 17-10-17 7:15
## 2 5437 14A Mpesa TIHLBUSGTE 19-11-17 7:12
## 3 5710 8B Mpesa EQX8Q5G19O 26-11-17 7:05
## 4 5777 19A Mpesa SGP18CL0ME 27-11-17 7:10
## 5 5778 11A Mpesa BM97HFRGL9 27-11-17 7:12
## 6 5777 18B Mpesa B6PBDU30IZ 27-11-17 7:10
## travel_from travel_to car_type max_capacity
## 1 Migori Nairobi Bus 49
## 2 Migori Nairobi Bus 49
## 3 Keroka Nairobi Bus 49
## 4 Homa Bay Nairobi Bus 49
## 5 Migori Nairobi Bus 49
## 6 Homa Bay Nairobi Bus 49
# Count the NA values in each column.
# NOTE(review): every count is 0 and nothing is recoded afterwards; if the
# file marks missing text values with blank cells they would not be counted
# as NA here — confirm against the raw file.
colSums(is.na(bus_dataset))
## ride_id seat_number payment_method payment_receipt travel_date
## 0 0 0 0 0
## travel_time travel_from travel_to car_type max_capacity
## 0 0 0 0 0
## Challenge 2
# ---
# Question: Clean the given dataset
# ---
# Dataset url = http://bit.ly/MS-PropertyDataset
# ---
# Load the property data from the local copy of the CSV file and display it
df_property <- read.csv("C:/Users/user/Downloads/property-data.csv")
df_property
## PID ST_NUM ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH SQ_FT
## 1 100001000 104 PUTNAM Y 3 1 1000
## 2 100002000 197 LEXINGTON N 3 1.5 --
## 3 100003000 NA LEXINGTON N n/a 1 850
## 4 100004000 201 BERKELEY 12 1 NaN 700
## 5 NA 203 BERKELEY Y 3 2 1600
## 6 100006000 207 BERKELEY Y <NA> 1 800
## 7 100007000 NA WASHINGTON 2 HURLEY 950
## 8 100008000 213 TREMONT Y 1 1
## 9 100009000 215 TREMONT Y na 2 1800
# Count the NA values in each column
colSums(is.na(df_property))
## PID ST_NUM ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH
## 1 2 0 0 1 0
## SQ_FT
## 0
# Keep only the rows in which no column is NA.
# NOTE(review): placeholder text such as "--", "na", "NaN" and blank cells is
# not NA, so those rows survive this step (rows 2, 4, 8 and 9 in the printed
# result); the cleaning is therefore incomplete.
df_property_clean <- na.omit(df_property)
df_property_clean
## PID ST_NUM ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH SQ_FT
## 1 100001000 104 PUTNAM Y 3 1 1000
## 2 100002000 197 LEXINGTON N 3 1.5 --
## 4 100004000 201 BERKELEY 12 1 NaN 700
## 8 100008000 213 TREMONT Y 1 1
## 9 100009000 215 TREMONT Y na 2 1800
## Challenge 3
# ---
# Question:
# ---
# Dataset url = http://bit.ly/AirQualityDataset
# ---
# OUR CODE GOES BELOW
#
# NO DATASET AVAILABLE
In the process of producing, collecting, processing and analyzing data, outliers can come from many sources and hide in many dimensions. An outlier is an observation that is numerically distant from the rest of the data. When reviewing a boxplot, an outlier is defined as a data point that is located outside the fences (“whiskers”) of the boxplot.
## Example
# ---
# Let's create the vector A
# ---
A <- c(3, 2, 5, 6, 4, 8, 1, 2, 30, 2, 4)
# Print it
A
## [1] 3 2 5 6 4 8 1 2 30 2 4
# Draw a boxplot to help us visualise any existing outliers
boxplot(A)
# boxplot.stats() computes the statistics behind the boxplot;
# its out component lists the outliers in the vector
boxplot.stats(A)$out
## [1] 30
Outliers should be investigated carefully. Often they contain valuable information about the process under investigation or the data gathering and recording process. Before considering the possible elimination of these points from the data, one should try to understand why they appeared and whether it is likely similar values will continue to appear. Of course, outliers are often bad data points.
An obvious inconsistency occurs when a record contains a value or combination of values that cannot correspond to a real-world situation. For example, a person’s age cannot be negative, a man cannot be pregnant and an under-aged person cannot possess a drivers license.
## Example
# ---
# Say that, in our vector A above, values above 20 are obvious inconsistencies;
# we then use a logical index to check for them
# ---
# The comparison gives one TRUE/FALSE per element of A: TRUE marks a value
# greater than 20 (despite the variable's name), FALSE marks every other value.
#
non_greater_than_20 <- A > 20
# printing out non_greater_than_20
non_greater_than_20
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::between() masks data.table::between()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks data.table::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
# Challenge
# ---
# Question: Using the given bus dataset below, determine whether there are any obvious inconsistencies
# ---
# Dataset url = http://bit.ly/BusNairobiWesternTransport
# ---
# OUR CODE GOES BELOW
#
# Importing our dataset
# ---
# bus_dataset was already read in for Challenge 1, so it is only previewed here.
# The commented-out lines show how to install and load the packages if needed.
#
#install.packages("data.table") # install package data.table to work with data tables
#library(data.table) # load package
#install.packages("tidyverse") # install packages to work with data frame - extends into visualization
head(bus_dataset)
## ride_id seat_number payment_method payment_receipt travel_date travel_time
## 1 1442 15A Mpesa UZUEHCBUSO 17-10-17 7:15
## 2 5437 14A Mpesa TIHLBUSGTE 19-11-17 7:12
## 3 5710 8B Mpesa EQX8Q5G19O 26-11-17 7:05
## 4 5777 19A Mpesa SGP18CL0ME 27-11-17 7:10
## 5 5778 11A Mpesa BM97HFRGL9 27-11-17 7:12
## 6 5777 18B Mpesa B6PBDU30IZ 27-11-17 7:10
## travel_from travel_to car_type max_capacity
## 1 Migori Nairobi Bus 49
## 2 Migori Nairobi Bus 49
## 3 Keroka Nairobi Bus 49
## 4 Homa Bay Nairobi Bus 49
## 5 Migori Nairobi Bus 49
## 6 Homa Bay Nairobi Bus 49
# Previewing the dataset
# ---
# str() lists every column with its type and first values
#
#View(bus_dataset)
str(bus_dataset)
## 'data.frame': 51645 obs. of 10 variables:
## $ ride_id : int 1442 5437 5710 5777 5778 5777 5777 5778 5778 5781 ...
## $ seat_number : chr "15A" "14A" "8B" "19A" ...
## $ payment_method : chr "Mpesa" "Mpesa" "Mpesa" "Mpesa" ...
## $ payment_receipt: chr "UZUEHCBUSO" "TIHLBUSGTE" "EQX8Q5G19O" "SGP18CL0ME" ...
## $ travel_date : chr "17-10-17" "19-11-17" "26-11-17" "27-11-17" ...
## $ travel_time : chr "7:15" "7:12" "7:05" "7:10" ...
## $ travel_from : chr "Migori" "Migori" "Keroka" "Homa Bay" ...
## $ travel_to : chr "Nairobi" "Nairobi" "Nairobi" "Nairobi" ...
## $ car_type : chr "Bus" "Bus" "Bus" "Bus" ...
## $ max_capacity : int 49 49 49 49 49 49 49 49 49 49 ...
# dim() gives the number of rows and columns
dim(bus_dataset)
## [1] 51645 10
# class() confirms that the dataset is a plain data frame
class(bus_dataset)
## [1] "data.frame"
# Identifying the numeric class in the data and evaluating if there are any outliers
# ---
# OUR CODE GOES BELOW
#
# max_capacity is one of the two integer columns reported by str() above;
# a boxplot of it shows any values that lie far from the rest
boxplot(bus_dataset$max_capacity)
R checks for duplicates across rows through the duplicated() function.
## Example
# ---
# Question: Identify duplicate data in the given dataframe
# ---
# OUR CODE GOES BELOW
#
# Creating our vectors
# ---
#
x1 <- c(2, 4, 5, 6)
x2 <- c(2, 3, 5, 6)
x3 <- c(2, 4, 5, 6)
x4 <- c(2, 4, 5, 6)
# Create a dataframe df from the above vectors
# ---
#
df <- data.frame(rbind(x1, x2, x3, x4))
# Then printing out this dataset
df
## X1 X2 X3 X4
## x1 2 4 5 6
## x2 2 3 5 6
## x3 2 4 5 6
## x4 2 4 5 6
# Now lets find the duplicated rows in the dataset df
# and assign to a variable duplicated_rows below
# ---
#
duplicated_rows <- df[duplicated(df),]
# Lets print out the variable duplicated_rows and see these duplicated rows
# ---
# OUR CODE GOES BELOW
#
duplicated_rows
## X1 X2 X3 X4
## x3 2 4 5 6
## x4 2 4 5 6
# Removing these duplicated rows in the dataset or
# showing these unique items and assigning to a variable unique_items below
# ---
#
unique_items <- df[!duplicated(df), ]
# What about seeing what these unique items are?
# ---
#
unique_items
## X1 X2 X3 X4
## x1 2 4 5 6
## x2 2 3 5 6
# Now there is another way we can also remove duplicated rows
# in the dataset or show the unique items;
# We simply use the unique() function
# ---
#
unique_items2 <- unique(df)
# After having assigned the unique items to the variable unique_items2,
# we will now print out this variable and have a look at these unique items
# ---
# OUR CODE GOES BELOW
#
unique_items2
## X1 X2 X3 X4
## x1 2 4 5 6
## x2 2 3 5 6
## Challenge
# ---
# Question: Display and delete the duplicate records in the iris dataset below:
# ---
# OUR CODE GOES BELOW
#
# Showing the first 6 records in the iris dataset
# ---
#
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Deleting duplicate records
# ---
# OUR CODE GOES BELOW
#
# Rows of iris that repeat an earlier row
duplicates <- iris[which(duplicated(iris)), ]
duplicates
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 143 5.8 2.7 5.1 1.9 virginica
# Keep only the first occurrence of every row
iris_unique <- iris[!duplicated(iris), , drop = FALSE]
head(iris_unique)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## Challenge
# ---
# Question: Drop duplicate records in the video_games dataset from the url
# ---
# OUR CODE GOES BELOW
#
# Reading our dataset
# ---
# The file appears to have no header row: read with the default header = TRUE,
# its first record (151603712, The Elder Scrolls V Skyrim, purchase, 1.0, 0)
# became the column names and was dropped from the data.
# Passing header = FALSE keeps that record and names the columns V1..V5.
#
video_games <- read.csv("C:/Users/user/Downloads/steam-200k.csv", header = FALSE)
# Previewing the first 6 records of the video games dataset
# ---
#
head(video_games)
## V1 V2 V3 V4 V5
## 1 151603712 The Elder Scrolls V Skyrim purchase 1.0 0
## 2 151603712 The Elder Scrolls V Skyrim play 273.0 0
## 3 151603712 Fallout 4 purchase 1.0 0
## 4 151603712 Fallout 4 play 87.0 0
## 5 151603712 Spore purchase 1.0 0
## 6 151603712 Spore play 14.9 0
# Cleaning our dataset
# ---
# OUR CODE GOES BELOW
#
# unique() keeps the first occurrence of each record and drops exact repeats
games_unique <- unique(video_games)
head(games_unique)
## V1 V2 V3 V4 V5
## 1 151603712 The Elder Scrolls V Skyrim purchase 1.0 0
## 2 151603712 The Elder Scrolls V Skyrim play 273.0 0
## 3 151603712 Fallout 4 purchase 1.0 0
## 4 151603712 Fallout 4 play 87.0 0
## 5 151603712 Spore purchase 1.0 0
## 6 151603712 Spore play 14.9 0
Before embarking on developing statistical models and generating predictions, it is essential to understand our data. This is typically done using conventional numerical and graphical methods.
## Example
# ---
# We will be using the hills dataset in this section,
# this dataset contains information on hill climbs made by various athletes
# ---
# OUR CODE GOES BELOW
#
# Loading the MASS package and printing the full hills dataset
# ---
#
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
hills
## dist climb time
## Greenmantle 2.5 650 16.083
## Carnethy 6.0 2500 48.350
## Craig Dunain 6.0 900 33.650
## Ben Rha 7.5 800 45.600
## Ben Lomond 8.0 3070 62.267
## Goatfell 8.0 2866 73.217
## Bens of Jura 16.0 7500 204.617
## Cairnpapple 6.0 800 36.367
## Scolty 5.0 800 29.750
## Traprain 6.0 650 39.750
## Lairig Ghru 28.0 2100 192.667
## Dollar 5.0 2000 43.050
## Lomonds 9.5 2200 65.000
## Cairn Table 6.0 500 44.133
## Eildon Two 4.5 1500 26.933
## Cairngorm 10.0 3000 72.250
## Seven Hills 14.0 2200 98.417
## Knock Hill 3.0 350 78.650
## Black Hill 4.5 1000 17.417
## Creag Beag 5.5 600 32.567
## Kildcon Hill 3.0 300 15.950
## Meall Ant-Suidhe 3.5 1500 27.900
## Half Ben Nevis 6.0 2200 47.633
## Cow Hill 2.0 900 17.933
## N Berwick Law 3.0 600 18.683
## Creag Dubh 4.0 2000 26.217
## Burnswark 6.0 800 34.433
## Largo Law 5.0 950 28.567
## Criffel 6.5 1750 50.500
## Acmony 5.0 500 20.950
## Ben Nevis 10.0 4400 85.583
## Knockfarrel 6.0 600 32.383
## Two Breweries 18.0 5200 170.250
## Cockleroi 4.5 850 28.100
## Moffat Chase 20.0 5000 159.833
## Example
# ---
# Question: Find the mean of the distance covered by the athletes
# and assigning the mean to the variable athletes.dist.mean
# ---
# OUR CODE GOES BELOW
#
athletes.dist.mean <- mean(hills$dist)
# Printing out
# ---
#
athletes.dist.mean
## [1] 7.528571
## Example
# ---
# Question: Find the median which is the middle most value of the distance covered dist
# ---
# OUR CODE GOES BELOW
#
athletes.dist.median <- median(hills$dist)
# Printing out athletes.dist.median
# ---
#
athletes.dist.median
## [1] 6
## Example
# ---
# Question: Find the mode which is the value that has highest number of occurrences in a set of data.
# ---
# OUR CODE GOES BELOW
#
# Unfortunately, R does not have a standard built-in function to calculate the mode, so we have to build one
# We create the getmode() function that will perform the mode operation for us
# ---
#
# Return the most frequent value of v (the statistical mode).
# When several values tie for the highest count, the one that appears first in v is returned.
getmode <- function(v) {
  distinct_values <- unique(v)
  # Position of each element of v within distinct_values, then a count per position
  occurrences <- tabulate(match(v, distinct_values))
  distinct_values[which.max(occurrences)]
}
# Calculating the mode using our getmode() function
# ---
#
athletes.dist.mode <- getmode(hills$dist)
# Then printing out athletes.dist.mode
# ---
# OUR CODE GOES BELOW
#
athletes.dist.mode
## [1] 6
## Challenge
# ---
# Question: Find the mean, median, mode of the total evening calls given the following dataset
# ---
# Dataset url = http://bit.ly/CustomerSignatureforChurnAnalysis
# ---
# OUR CODE GOES BELOW
# Previewing the first 6 rows of this dataset
# ---
#
customer = read.csv("C:/Users/user/Downloads/customer_signature_for_churn_analysis.csv")
head(customer)
## recordID state account_length area_code international_plan voice_mail_plan
## 1 1 HI 101 510 no no
## 2 2 MT 137 510 no no
## 3 3 OH 103 408 no yes
## 4 4 NM 99 415 no no
## 5 5 SC 108 415 no no
## 6 6 IA 117 415 no no
## number_vmail_messages total_day_minutes total_day_calls total_day_charge
## 1 0 70.9 123 12.05
## 2 0 223.6 86 38.01
## 3 29 294.7 95 50.10
## 4 0 216.8 123 36.86
## 5 0 197.4 78 33.56
## 6 0 226.5 85 38.51
## total_eve_minutes total_eve_calls total_eve_charge total_night_minutes
## 1 211.9 73 18.01 236.0
## 2 244.8 139 20.81 94.2
## 3 237.3 105 20.17 300.3
## 4 126.4 88 10.74 220.6
## 5 124.0 101 10.54 204.5
## 6 141.6 68 12.04 223.0
## total_night_calls total_night_charge total_intl_minutes total_intl_calls
## 1 73 10.62 10.6 3
## 2 81 4.24 9.5 7
## 3 127 13.51 13.7 6
## 4 82 9.93 15.7 2
## 5 107 9.20 7.7 4
## 6 90 10.04 6.9 5
## total_intl_charge number_customer_service_calls churn customer_id
## 1 2.86 3 no 23383607
## 2 2.57 0 no 22550362
## 3 3.70 1 no 59063354
## 4 4.24 1 no 25464504
## 5 2.08 2 no 691824
## 6 1.86 1 no 24456543
str(customer)
## 'data.frame': 12892 obs. of 22 variables:
## $ recordID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ state : chr "HI" "MT" "OH" "NM" ...
## $ account_length : int 101 137 103 99 108 117 63 94 138 128 ...
## $ area_code : int 510 510 408 415 415 415 415 408 510 415 ...
## $ international_plan : chr "no" "no" "no" "no" ...
## $ voice_mail_plan : chr "no" "no" "yes" "no" ...
## $ number_vmail_messages : int 0 0 29 0 0 0 32 0 0 43 ...
## $ total_day_minutes : num 70.9 223.6 294.7 216.8 197.4 ...
## $ total_day_calls : int 123 86 95 123 78 85 124 97 117 100 ...
## $ total_day_charge : num 12.1 38 50.1 36.9 33.6 ...
## $ total_eve_minutes : num 212 245 237 126 124 ...
## $ total_eve_calls : int 73 139 105 88 101 68 125 112 46 89 ...
## $ total_eve_charge : num 18 20.8 20.2 10.7 10.5 ...
## $ total_night_minutes : num 236 94.2 300.3 220.6 204.5 ...
## $ total_night_calls : int 73 81 127 82 107 90 120 106 71 92 ...
## $ total_night_charge : num 10.62 4.24 13.51 9.93 9.2 ...
## $ total_intl_minutes : num 10.6 9.5 13.7 15.7 7.7 6.9 12.9 11.1 9.9 11.9 ...
## $ total_intl_calls : int 3 7 6 2 4 5 3 6 4 1 ...
## $ total_intl_charge : num 2.86 2.57 3.7 4.24 2.08 1.86 3.48 3 2.67 3.21 ...
## $ number_customer_service_calls: int 3 0 1 1 2 1 1 0 2 0 ...
## $ churn : chr "no" "no" "no" "no" ...
## $ customer_id : num 23383607 22550362 59063354 25464504 691824 ...
# Finding the mean
# ---
#
customer.mean.eve.calls <- mean(customer$total_eve_calls)
customer.mean.eve.calls
# Finding the median
# ---
#
customer.median.eve.calls <- median(customer$total_eve_calls)
customer.median.eve.calls
# Finding the mode
# ---
# R has no built-in statistical mode, so we reuse the getmode() helper
# defined in the mode example earlier in this file
#
customer.mode.eve.calls <- getmode(customer$total_eve_calls)
customer.mode.eve.calls
## Example
# ---
# Question: Find the minimum element of the distance using the min() function
# ---
# OUR CODE GOES BELOW
#
athletes.dist.min <- min(hills$dist)
# And then printing athletes.dist.min to show the minimum element
#
athletes.dist.min
## [1] 2
## Example
# ---
# Question: Find the maximum element of the distance using the function max()
# ---
# OUR CODE GOES BELOW
#
athletes.dist.max <- max(hills$dist)
# Then printing out the variable athletes.dist.max to show that maximum element
# ---
# OUR CODE GOES BELOW
#
athletes.dist.max
## [1] 28
## Example
# ---
# Find the range (minimum and maximum) of the distance using the function range() as shown below
# ---
#
athletes.dist.range <- range(hills$dist)
# Printing out the variable athletes.dist.range to show the range
# ---
#
athletes.dist.range
## [1] 2 28
## Example
# ---
# Question: Get the first and the third quartile together with the range
# and the median using the quantile() function
# ---
# OUR CODE GOES BELOW
#
athletes.dist.quantile <- quantile(hills$dist)
# Printing out the variable athletes.dist.quantile to show the quantiles
# ---
# OUR CODE GOES BELOW
#
athletes.dist.quantile
## 0% 25% 50% 75% 100%
## 2.0 4.5 6.0 8.0 28.0
## Example
# ---
# Question: Find the variance of the distance using the var() function as shown below
# ---
# OUR CODE GOES BELOW
#
athletes.dist.variance <- var(hills$dist)
# Printing out the variable athletes.dist.variance to show the variance
#
athletes.dist.variance
## [1] 30.51387
The variance is a numerical measure of how the data values are dispersed around the mean.
## Example
# ---
# Question: Find the standard deviation of the distance using the sd() function
# ---
# OUR CODE GOES BELOW
#
athletes.dist.sd <- sd(hills$dist)
# Printing out the variable athletes.dist.sd to show the standard deviation
# ---
#
athletes.dist.sd
## [1] 5.523936
# Challenge
# ---
# Question: Find the minimum, maximum, range, quantile, variance
# and standard deviation for total day calls using the given dataset
# ---
# Dataset url = http://bit.ly/CustomerSignatureforChurnAnalysis
# ---
# OUR CODE GOES BELOW
#
# Find the minimum of total day calls
# ---
# OUR CODE GOES BELOW
#
customer.min.calls <- min(customer$total_day_calls)
customer.min.calls
## [1] 0
# Find the maximum i.e. max() total day calls
# ---
# OUR CODE GOES BELOW
#
customer.max.calls <- max(customer$total_day_calls)
customer.max.calls
## [1] 165
# Find the range i.e. range() of total day calls
# ---
# OUR CODE GOES BELOW
#
# range() returns the minimum and the maximum together as a length-2 vector
customer.range.calls <- range(customer$total_day_calls)
customer.range.calls
## [1] 0 165
# Find the quantile of total day calls
# ---
# OUR CODE GOES BELOW
#
customer.quantile.calls <- quantile(customer$total_day_calls)
customer.quantile.calls
## 0% 25% 50% 75% 100%
## 0 87 101 114 165
# Find the variance of total day calls
# ---
# OUR CODE GOES BELOW
#
customer.var.calls <- var(customer$total_day_calls)
customer.var.calls
## [1] 397.8691
# Find the standard deviation of total day calls
# ---
# OUR CODE GOES BELOW
#
customer.sd.calls <- sd(customer$total_day_calls)
customer.sd.calls
## [1] 19.94666
## Example
# ---
# Question: Lets create a boxplot graph for the distance using the boxplot() function
# ---
# OUR CODE GOES BELOW
#
boxplot(hills$dist)
The box plot of an observation variable is a graphical representation based on its quartiles, as well as its smallest and largest values. It attempts to provide a visual shape of the data distribution.
# Demonstration data frame: integer columns x, y and a, plus a constant column z
DF <- data.frame(
  x = seq_len(10),
  y = rev(seq_len(10)),
  z = rep(5, times = 10),
  a = seq_len(10) + 10L
)
DF
## x y z a
## 1 1 10 5 11
## 2 2 9 5 12
## 3 3 8 5 13
## 4 4 7 5 14
## 5 5 6 5 15
## 6 6 5 5 16
## 7 7 4 5 17
## 8 8 3 5 18
## 9 9 2 5 19
## 10 10 1 5 20
# Names of the columns to discard; every other column is kept in its original order
drops <- c("x", "z")
new_df <- DF[, setdiff(names(DF), drops)]
new_df
## y a
## 1 10 11
## 2 9 12
## 3 8 13
## 4 7 14
## 5 6 15
## 6 5 16
## 7 4 17
## 8 3 18
## 9 2 19
## 10 1 20
A bar graph of a qualitative data sample consists of vertical parallel bars that show the frequency distribution graphically.
## Example
# ---
# Create a frequency distribution of the School variable
# ---
# Dataset Info: For this example, we will use the painters dataset that ships with the MASS package.
# ---
# OUR CODE GOES BELOW
#
# Previewing the first six rows of the painters dataset
# ---
# OUR CODE GOES BELOW
#
head(painters)
## Composition Drawing Colour Expression School
## Da Udine 10 8 16 3 A
## Da Vinci 15 16 4 14 A
## Del Piombo 8 13 16 7 A
## Del Sarto 12 16 9 8 A
## Fr. Penni 0 15 8 0 A
## Guilio Romano 15 16 4 14 A
dops <- c("School")
painters_n = painters[ , !(names(painters) %in% dops)]
head(painters_n)
## Composition Drawing Colour Expression
## Da Udine 10 8 16 3
## Da Vinci 15 16 4 14
## Del Piombo 8 13 16 7
## Del Sarto 12 16 9 8
## Fr. Penni 0 15 8 0
## Guilio Romano 15 16 4 14
# Fetching the school column
# ---
#
school <- painters$School
# Applying the table() function will compute the frequency distribution of the School variable
# ---
#
school_frequency <- table(school)
# Printing school_frequency below
# ---
#
school_frequency
## school
## A B C D E F G H
## 10 6 6 10 7 4 7 4
# Then applying the barplot function to produce its bar graph
# ---
#
barplot(school_frequency)
## Challenge
# ---
# Question: Create a bar graph of the total day calls in the customer signature dataset
# ---
# Dataset url = http://bit.ly/CustomerSignatureforChurnAnalysis
# ---
# OUR CODE GOES BELOW
#
day_calls <- table(customer$total_day_calls)
day_calls
##
## 0 30 34 35 36 39 40 42 44 45 46 47 48 49 50 51 52 53 54 55
## 6 4 1 3 4 2 7 6 10 9 1 9 11 7 1 12 14 12 25 25
## 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
## 25 29 27 23 30 54 31 52 26 69 61 86 71 67 100 86 80 90 108 79
## 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
## 104 142 155 141 174 143 142 184 164 195 171 192 256 214 211 243 250 224 248 270
## 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
## 247 274 249 236 250 249 289 213 261 290 255 261 260 225 262 163 259 204 224 192
## 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
## 171 223 125 162 163 173 157 162 111 126 112 96 83 92 88 51 57 75 83 35
## 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 156 157 158
## 33 47 53 34 45 42 21 26 18 33 21 26 18 4 22 19 4 5 4 11
## 160 163 165
## 4 4 4
barplot(day_calls)
A histogram shows the frequency distribution of a quantitative variable. The area of each bar is equal to the frequency of items found in each class.
## Example
# ---
# Create a histogram using the faithful dataset
# ---
# Hint: we will use an R built-in data frame called faithful
# ---
# OUR CODE GOES BELOW
#
# Preview the first six rows of the faithful dataset
# ---
# OUR CODE GOES BELOW
#
head(faithful)
## eruptions waiting
## 1 3.600 79
## 2 1.800 54
## 3 3.333 74
## 4 2.283 62
## 5 4.533 85
## 6 2.883 55
# Then applying the hist() function to produce the histogram of the eruptions variable
# ---
#
hist(faithful$eruptions)
## Challenge
# ---
# Question: Create a histogram of the total day minutes in the customer signature dataset
# ---
# Dataset url = http://bit.ly/CustomerSignatureforChurnAnalysis
# ---
# OUR CODE GOES BELOW
#
hist(customer$total_day_minutes)
Covariance is a statistical measure of the degree to which two variables vary together. If greater values of one variable mainly correspond to greater values of the other, and smaller values to smaller values, the variables show similar behavior and the covariance is positive. If greater values of one variable mainly correspond to smaller values of the other, the variables show opposite behavior and the covariance is negative. If greater values of one variable are paired about equally often with greater and smaller values of the other, the covariance will be near zero.
## Example
# ---
# Question: Find the covariance of eruption duration and waiting time in the data set faithful
# ---
# OUR CODE GOES BELOW
#
# Printing out the first 6 rows of the dataset
# ---
#
head(faithful)
## eruptions waiting
## 1 3.600 79
## 2 1.800 54
## 3 3.333 74
## 4 2.283 62
## 5 4.533 85
## 6 2.883 55
# Pull the eruptions and waiting columns of faithful out into standalone vectors
# ---
#
eruptions <- faithful[["eruptions"]]
waiting <- faithful[["waiting"]]
# cov() computes the covariance between the two vectors
# ---
#
cov(x = eruptions, y = waiting)
## [1] 13.97781
The covariance of eruption duration and waiting time is about 13.98. It indicates a positive linear relationship between the two variables.
## Challenge
# ---
# Question: Find out the covariance of Bwt and Hwt in the cats dataset
# ---
# OUR CODE GOES BELOW
#
# Previewing the cats dataset
# ---
#
head(cats)
## Sex Bwt Hwt
## 1 F 2.0 7.0
## 2 F 2.0 7.4
## 3 F 2.0 9.5
## 4 F 2.1 7.2
## 5 F 2.1 7.3
## 6 F 2.1 7.6
# Finding out the covariance
# ---
# OUR CODE GOES BELOW
#
bwt <- cats$Bwt
hwt <- cats$Hwt
cov(bwt,hwt)
## [1] 0.9501127
The correlation coefficient of two variables in a data set equals their covariance divided by the product of their individual standard deviations. It is a normalized measurement of how the two are linearly related. If the correlation coefficient is close to 1, the variables are positively linearly related and the scatter plot falls almost along a straight line with positive slope. If it is close to -1, the variables are negatively linearly related and the scatter plot falls almost along a straight line with negative slope. If it is close to zero, there is only a weak linear relationship between the variables.
## Example
# ---
# Question: Find the correlation coefficient of eruption duration and waiting time in the faithful dataset
# ---
# OUR CODE GOES BELOW
#
# Assigning the eruptions column to the variable eruptions
# ---
#
eruptions <- faithful$eruptions
# Assigning the waiting column to the variable waiting
# ---
#
waiting<- faithful$waiting
# Using the cor() function to determine the correlation coefficient
# ---
#
cor(eruptions, waiting)
## [1] 0.9008112
The correlation coefficient of eruption duration and waiting time is 0.90081. Because it is close to 1, we can conclude that the variables are positively linearly related.
## Challenge
# ---
# Question: Find out the correlation coefficient of Bwt and Hwt in the cats data set below:
# ---
# OUR CODE GOES BELOW
#
# Previewing the cats dataset by first loading the MASS library
# then displaying the first 6 records of this dataset
library(MASS)
head(cats)
## Sex Bwt Hwt
## 1 F 2.0 7.0
## 2 F 2.0 7.4
## 3 F 2.0 9.5
## 4 F 2.1 7.2
## 5 F 2.1 7.3
## 6 F 2.1 7.6
cor(bwt,hwt)
## [1] 0.8041274
## Challenge
# ---
# Question: Create a correlation matrix in R using the cor() function
# ---
# Hint: http://bit.ly/RDocumentationCorrMatrix
# ---
# Dataset url = http://bit.ly/HousingDatainR
# ---
# OUR CODE GOES BELOW
#
#hous = fread("http://bit.ly/HousingDatainR")
#hous
# Correlation matrix of the four numeric painters columns held in painters_n.
# cor() computes a single coefficient type per call; given the full vector of
# choices it uses only the first ("pearson"), so we name that method explicitly.
cor(painters_n, method = "pearson")
## Composition Drawing Colour Expression
## Composition 1.00000000 0.4154456 -0.09758818 0.6571846
## Drawing 0.41544563 1.0000000 -0.51696052 0.5737066
## Colour -0.09758818 -0.5169605 1.00000000 -0.1995179
## Expression 0.65718460 0.5737066 -0.19951793 1.0000000
A scatter plot is a two-dimensional data visualization that uses dots to represent the values obtained for two different variables - one plotted along the x-axis and the other plotted along the y-axis. Scatter plots are used when you want to show the relationship between two variables. They are sometimes called correlation plots because they show how two variables are correlated.
## Example
# ---
# Question: Create a scatter plot of the eruption durations and waiting intervals from the faithful dataset
# ---
# OUR CODE GOES BELOW
#
# Assigning the eruptions column to the variable eruptions
# ---
#
eruptions <- faithful$eruptions
# Assigning the waiting column to the variable waiting
# ---
#
waiting <- faithful$waiting
# Creating the scatter plot using eruptions and waiting
# ---
#
plot(eruptions, waiting, xlab="Eruption duration", ylab="Time waited")
The scatter plot above reveals a positive linear relationship between eruptions and waiting.
# Challenge
# ---
# Question: Using the cats dataset, create a scatter plot of the Bwt and Hwt variables.
# Does it reveal any relationship between these variables?
# ---
# OUR CODE GOES BELOW
#
# Previewing the cats dataset
# ---
#
#head(cats)
plot(bwt, hwt, xlab="bwt", ylab="hwt")