## Example
# ----
# Store the value 5 in the variable x and the value 16 in the variable y.
# ---
#
x <- 5
y <- 16

## Next, sum x and y with the addition operator +
# ---
#
print(x + y)
## [1] 21
## Challenge 
# ---
# Question: Using our understanding in the above example, 
# let's now subtract x from y using the operator - below
# ---
# OUR CODE GOES BELOW
# 

# "Subtract x from y" means y - x (16 - 5); x - y would give -11 instead.
y - x
## [1] 11
## Example
# ---
# Question: Let's create two vectors v and t
# (mixing numbers, logicals and a complex value in c() is allowed)
# ---
#
v <- c(3, 1, TRUE, 2 + 3i)
t <- c(4, 1, FALSE, 2 + 3i)

# Apply the element-wise logical AND operator &:
# a position is TRUE only when both inputs are non-zero / TRUE there
print(v & t)
## [1]  TRUE  TRUE FALSE  TRUE
## Example
# ---
# Question: Create the two vectors v and t again, with new values
# ---
#
v <- c(0, 0, TRUE, 2 + 2i)
t <- c(0, 3, TRUE, 2 + 3i)

# Apply the element-wise logical OR operator |:
# a position is FALSE only when both inputs are zero / FALSE there
print(v | t)
## [1] FALSE  TRUE  TRUE  TRUE
## Example 
# ---
# Question: Let's create two vectors v and t
# ---
# 
v <- c(3, 1, TRUE, 2 + 3i)
t <- c(4, 1, FALSE, 2 + 3i)

# This time, use the scalar logical OR operator ||.
# Unlike the element-wise |, the || operator works on single values only.
# Since R 4.3.0 it raises an error when either side is longer than one
# element, so the first elements are compared explicitly here.
v[1] || t[1]
## [1] TRUE
## Example 
# ---
# Question: Create variables v1, v2 and v3, assigning them with vectors 
# using the left assignment operators <-, <<- and =
# ---
# 
v1 <- c(3,1,TRUE,2+3i) 
v2 <<- c(3,1,TRUE,2+3i) 
v3 = c(3,1,TRUE,2+3i) 

# Then we print out v1 below
# (every element is shown as a complex number because the vector
# contains the complex value 2+3i)
v1 
## [1] 3+0i 1+0i 1+0i 2+3i
# And print out v2 below
# ---
# OUR CODE GOES BELOW
# 
v2
## [1] 3+0i 1+0i 1+0i 2+3i
# And now print out v3 below
# ---
# OUR CODE GOES BELOW
# 
v3
## [1] 3+0i 1+0i 1+0i 2+3i
## Challenge
# ---
# Question: Use the right assignment operators (-> and ->>) to assign
# vectors to the variables v1 and v2
# ---
# 
c(3,1,TRUE,2+3i) -> v1 
c(3,1,TRUE,2+3i) ->> v2 

# Then print out variable v1 and see what has happened
# ---
# OUR CODE GOES HERE
# 
v1
## [1] 3+0i 1+0i 1+0i 2+3i
# And also print out variable v2
# ---
# OUR CODE GOES HERE
# 
v2
## [1] 3+0i 1+0i 1+0i 2+3i
## Example
# ---
# Question: Use the assignment operators =, <- and -> to assign the vectors
# to the respective variables as shown below;
# ---
# OUR CODE GOES HERE
# 
variable.1 = c(3,4,5,6)  
variable.2 <- c("Hello"," there")  
c(TRUE,2) -> variable.3 

# Print the three variables to see what has happened
# (TRUE is shown as 1 in variable.3 because it is combined with a number)
# --- 
# 
variable.1 
## [1] 3 4 5 6
variable.2
## [1] "Hello"  " there"
variable.3
## [1] 1 2
## Example
# ---
# Question: To explore the numeric data type,
# store the value 62.4 in the variable m
# ---
# OUR CODE GOES BELOW
#
m <- 62.4

# Display the value held by m
# ---
# OUR CODE BELOW
#
print(m)
## [1] 62.4
# A decimal value like this has class "numeric"
print(class(m))
## [1] "numeric"
## Example
# ---
# Create the integer 3 and store it in the variable n
# ---
#
n <- as.integer(3)

# Display n to see what it holds
# ---
#
print(n)
## [1] 3
print(class(n))
## [1] "integer"
# As a second example, convert 3.14 to an integer
# and store the converted value in the variable p
# ---
# OUR CODE GOES BELOW
#
p <- as.integer(3.14)

# Display p: the fractional part has been dropped
# ---
# OUR CODE GOES BELOW
#
print(p)
## [1] 3
print(class(p))
## [1] "integer"
## Example
# ---
# A complex number can be stored in a variable too; here it goes into k
# ---
#
k <- 1 + 2i

# Display k
# ---
# OUR CODE GOES BELOW
#
print(k)
## [1] 1+2i
print(class(k))
## [1] "complex"
## Example
# ---
# To produce a logical value, first create the two variables x and y
# ---
#
x <- 4
y <- 6

# Test whether x is greater than y and keep the answer in z
# ---
#
z <- x > y

# Display the resulting logical value
# ---
# OUR CODE GOES BELOW
#
print(z)
## [1] FALSE
print(class(z))
## [1] "logical"
## Example
# ---
# Convert the value 62.48 to a string and store it in the variable g
# ---
#
g <- as.character(62.48)

# Display the character string g (note the quotes in the output)
# ---
# OUR CODE GOES BELOW
#
print(g)
## [1] "62.48"
print(class(g))
## [1] "character"

Lists and Vectors

1. Creating

Lists

1.1 Lists Code Example

## Example
# ---
# A list is an R object whose elements may have different types:
# numbers, strings, vectors, matrices, functions and even other lists.
# ---
# Question: Create a list using the list() function
# ---
# OUR CODE GOES BELOW
# ---
#
alist <- list(
  "Red",
  "Blue",
  c(42, 36, 1),
  FALSE,
  73.91,
  128.6
)

# Display alist; each element is printed under its [[index]]
# ---
# OUR CODE GOES BELOW
#
print(alist)
## [[1]]
## [1] "Red"
## 
## [[2]]
## [1] "Blue"
## 
## [[3]]
## [1] 42 36  1
## 
## [[4]]
## [1] FALSE
## 
## [[5]]
## [1] 73.91
## 
## [[6]]
## [1] 128.6

Vectors

## Example
# ---
# Question: Create vectors a, b and c
# ---
#
# numeric vector
a <- c(1, 2, 5.3, 6, -2, 4)
# character vector
b <- c("one", "two", "three")
# logical vector
c <- c(TRUE, TRUE, TRUE, FALSE, TRUE, FALSE)

# Display the three vectors a, b and c
# ---
# OUR CODE GOES BELOW
#
print(a)
## [1]  1.0  2.0  5.3  6.0 -2.0  4.0
print(b)
## [1] "one"   "two"   "three"
print(c)
## [1]  TRUE  TRUE  TRUE FALSE  TRUE FALSE

2. Naming

Lists

2.1 Naming Lists Code Example

## Example
# ---
# List elements can carry names, and those names can then be used
# to reach the elements, as shown below
# ---
# Question: Create a list containing a vector, a matrix and a list
# ---
#
list_data <- list(
  c("Jan", "Feb", "Mar"),
  matrix(c(3, 9, 5, 1, -2, 8), nrow = 2),
  list("green", 12.3)
)

# Attach a name to each of the three elements
names(list_data) <- c("1st Quarter", "A_Matrix", "A Inner list")

# Display list_data; names that are not plain identifiers are shown in backticks
# ---
# OUR CODE GOES BELOW
#

print(list_data)
## $`1st Quarter`
## [1] "Jan" "Feb" "Mar"
## 
## $A_Matrix
##      [,1] [,2] [,3]
## [1,]    3    5   -2
## [2,]    9    1    8
## 
## $`A Inner list`
## $`A Inner list`[[1]]
## [1] "green"
## 
## $`A Inner list`[[2]]
## [1] 12.3

Vectors

2.2 Naming Vectors Code Example

## Example
# ---
# The names() function attaches a name to each element of a vector
# ---
#
a <- c("Serena Williams", "Tennis Player")
names(a) <- c("Name", "Profession")

# Display the named vector: the names appear above the values
# ---
# OUR CODE GOES BELOW
#
print(a)
##              Name        Profession 
## "Serena Williams"   "Tennis Player"

3. Selection

Lists

3.1 List Selection Code Example

## Example
# ---
# List elements are accessed with square brackets, using numeric indices,
# negative indices (to exclude) or element names.
# ---
# Question: Let's now create a list ls
# ---
# OUR CODE BELOW
#
ls <- list(
  first = 2,
  second = 4,
  third = list(fourth = 3.2, fifth = 6.4)
)

# Three ways of selecting the first two elements
# ---
#
# by position
ls[1:2]
## $first
## [1] 2
## 
## $second
## [1] 4
# by excluding the third element
ls[-3]
## $first
## [1] 2
## 
## $second
## [1] 4
# by name
ls[c("first", "second")]
## $first
## [1] 2
## 
## $second
## [1] 4

Vectors

3.2 Vector Selection Code Example

## Example
# ---
# Create a numeric vector a
# ---
# OUR CODE GOES BELOW
#
a <- c(1, 2, 5.3, 6, -2, 4)

# Pick out the 2nd and 3rd elements of the vector
print(a[2:3])
## [1] 2.0 5.3

4. Adding

Lists

4.1 Adding Lists Code Example

## Example
# ---
# New elements are added to a list by assigning to a new index, as shown
# ---
# Question: Create a list containing a vector and a matrix
# ---
# OUR CODE GOES BELOW
#
list_data <- list(
  c("Jan", "Feb", "Mar"),
  matrix(c(3, 9, 5, 1, -2, 8), nrow = 2)
)

# Assign a new element at index 4, then print that element.
# The list only had 2 elements, so index 3 is filled with NULL
# (see the full printout further down).
# ---
#
list_data[4] <- "New element"
print(list_data[4])
## [[1]]
## [1] "New element"
names(list_data) <- c("Vector", "Matrix", "NULL", "Character")

print(list_data)
## $Vector
## [1] "Jan" "Feb" "Mar"
## 
## $Matrix
##      [,1] [,2] [,3]
## [1,]    3    5   -2
## [2,]    9    1    8
## 
## $`NULL`
## NULL
## 
## $Character
## [1] "New element"
## Challenge
# ---
# Question: Create an empty list named months_of_the_year, 
# then add all 12 months of the year
# ---
# OUR CODE GOES BELOW
# 
months_of_the_year <- list()
months_of_the_year
## list()
# Fill positions 1 to 12 in one vectorized assignment instead of twelve
# separate ones; month.name is R's built-in vector of English month names,
# and each name becomes its own list element.
months_of_the_year[seq_along(month.name)] <- month.name

names(months_of_the_year) <- c(
  "First", "Second", "Third", "Fourth", "Fifth", "Sixth",
  "Seventh", "Eighth", "Ninth", "Tenth", "Eleventh", "Twelfth"
)

months_of_the_year
## $First
## [1] "January"
## 
## $Second
## [1] "February"
## 
## $Third
## [1] "March"
## 
## $Fourth
## [1] "April"
## 
## $Fifth
## [1] "May"
## 
## $Sixth
## [1] "June"
## 
## $Seventh
## [1] "July"
## 
## $Eighth
## [1] "August"
## 
## $Ninth
## [1] "September"
## 
## $Tenth
## [1] "October"
## 
## $Eleventh
## [1] "November"
## 
## $Twelfth
## [1] "December"

Vectors

4.2 Adding Vectors Code Example

## Example
# ---
# Question: Vectors can be combined via the function c as shown
# ---
# OUR CODE GOES BELOW
#
p <- c(1, 2, 3)
q <- c("aa", "bb", "cc")

# Display the combined vector
# (the numbers are shown quoted: the result is a character vector)
# ---
#
print(c(p, q))
## [1] "1"  "2"  "3"  "aa" "bb" "cc"
## Challenge
# ---
# Question: Combine the following vectors and print out the result below
# ---
# OUR CODE GOES BELOW
#
a <- c("Serena Williams", "Tennis Player")
names(a) <- c("Name", "Profession")

# names(a) itself carries no names, so the first two elements of the
# result are shown with blank labels
print(c(names(a), a))
##                                                  Name        Profession 
##            "Name"      "Profession" "Serena Williams"   "Tennis Player"
## Example
# ---
# Question: Let's create a matrix mymat
# ---
#
mymat <- matrix(1:12, nrow = 4, ncol = 3)

# Display mymat: the values 1 to 12 fill the matrix column by column
# ---
# OUR CODE GOES BELOW
#

print(mymat)
##      [,1] [,2] [,3]
## [1,]    1    5    9
## [2,]    2    6   10
## [3,]    3    7   11
## [4,]    4    8   12
## Example
# ---
# Question: Let's use the byrow=TRUE argument to create a matrix
# that is filled row by row instead of column by column
# ---
mymat <- matrix(
  1:12,
  ncol = 3,
  byrow = TRUE
)

# Display the mymat variable: each row now holds consecutive values
print(mymat)
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    6
## [3,]    7    8    9
## [4,]   10   11   12

2. Naming

2.1 Matrix Naming Code Example

## Example
# ---
# Row and column names document what a matrix stores.
# They make the data easier to read and let you select elements by name.
# ---
# Question: Lets create the vectors kenya, ethiopia and chad
# ---
# OUR CODE GOES BELOW
#
kenya <- c(460.998, 314.4)
ethiopia <- c(290.475, 247.900)
chad <- c(309.306, 165.8)

# Stack the three vectors as the rows of geography_matrix
# ---
#
geography_matrix <- matrix(
  c(kenya, ethiopia, chad),
  nrow = 3,
  byrow = TRUE
)

# The two vectors location and countries hold the labels used for naming
# ---
#
location <- c("Lat", "Long")
countries <- c("Kenya", "Ethiopia", "Chad")

# Label the columns with location
# ---
#
colnames(geography_matrix) <- location

# Label the rows with countries
# ---
#
rownames(geography_matrix) <- countries

# And lastly print out geography_matrix
# ---
#
print(geography_matrix)
##              Lat  Long
## Kenya    460.998 314.4
## Ethiopia 290.475 247.9
## Chad     309.306 165.8
## Challenge
# ---
# Question: Create a matrix family with column names Name, Age, Gender and Occupation.
# Populate it with 5 of your own family members.
# ---
# OUR CODE GOES HERE
#

# One vector per family member (the ages are shown quoted in the output
# because each vector mixes text and numbers)
fam1 <- c("Rose", 56, "Female", "Self")
fam2 <- c("Caro", 31, "Female", "Self")
fam3 <- c("Jeff", 24, "Male", "Unemployed")

# One row per member
fam_matrix <- matrix(
  c(fam1, fam2, fam3),
  nrow = 3,
  byrow = TRUE
)

cols <- c("Name", "Age", "Gender", "Occupation")
colnames(fam_matrix) <- cols

print(fam_matrix)
##      Name   Age  Gender   Occupation  
## [1,] "Rose" "56" "Female" "Self"      
## [2,] "Caro" "31" "Female" "Self"      
## [3,] "Jeff" "24" "Male"   "Unemployed"

3. Adding a Column

You can add a column to a matrix using the cbind() function.

3.1 Adding a Column Code Example

## Example
# ---
# Question: Create the matrix x below
# ---
# OUR CODE GOES BELOW
#
x <- matrix(1:9, nrow = 3)
print(x)
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9
# Append a fourth column with cbind()
# ---
#
x_new <- cbind(x, c(1, 2, 3))
print(x_new)
##      [,1] [,2] [,3] [,4]
## [1,]    1    4    7    1
## [2,]    2    5    8    2
## [3,]    3    6    9    3
## Challenge
# ---
# Question: Add a column residence to your fictional family matrix that you had created earlier
# ---
# OUR CODE GOES BELOW
#

# NOTE(review): the appended column is unnamed, so its header is blank in the
# output below; cbind(fam_matrix, Residence = ...) would label it.
residence <- c("Kenya", "USA", "Kenya")
fam_new <- cbind(fam_matrix, residence, deparse.level = 0)
print(fam_new)
##      Name   Age  Gender   Occupation          
## [1,] "Rose" "56" "Female" "Self"       "Kenya"
## [2,] "Caro" "31" "Female" "Self"       "USA"  
## [3,] "Jeff" "24" "Male"   "Unemployed" "Kenya"

4. Adding a Row

4.1 Adding a Row Code Example

## Example
# ---
# Question: Creating a matrix x
# ---
#
x <- matrix(1:9, nrow = 3)
print(x)
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9
# A row is added with the rbind() function
# ---
#
x_new2 <- rbind(x, c(1, 2, 3))
print(x_new2)
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9
## [4,]    1    2    3
## Challenge
# ---
# Question: Add a fictional character to your fictional family matrix
# ---
# OUR CODE GOES BELOW
#

# One value per existing column, including the residence column added earlier
fam_new2 <- rbind(
  fam_new,
  c("Change", 8, "Male", "Changes", "Everywhere")
)
print(fam_new2)
##      Name     Age  Gender   Occupation               
## [1,] "Rose"   "56" "Female" "Self"       "Kenya"     
## [2,] "Caro"   "31" "Female" "Self"       "USA"       
## [3,] "Jeff"   "24" "Male"   "Unemployed" "Kenya"     
## [4,] "Change" "8"  "Male"   "Changes"    "Everywhere"

5. Selecting a Matrix

To select an element of a matrix, one needs to specify both the row and the column as shown:

## Example
# ---
# Question: Create the following matrix
# ---
# OUR CODE GOES BELOW
#
x <- matrix(1:9, nrow = 3)
print(x)
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9
# Select elements from the above matrix
# using [row, column]; leaving one side empty selects all of it.
# ---
#
# the element at the 1st row, 3rd column
x[1, 3]
## [1] 7
# the 2nd row
x[2, ]
## [1] 2 5 8
# the 3rd column
x[, 3]
## [1] 7 8 9
## Challenge 
# ---
# Question: Select the last member of your family 
# ---
# OUR CODE GOES BELOW
# 

# Use nrow() rather than a hard-coded 4 so the selection still picks the
# last row if more members are added.
fam_new2[nrow(fam_new2), ]
##         Name          Age       Gender   Occupation              
##     "Change"          "8"       "Male"    "Changes" "Everywhere"
## Challenge
# ---
# Question: Select the first member of your family
# ---
# OUR CODE GOES BELOW
#

# Row 1, all columns
print(fam_new2[1, ])
##       Name        Age     Gender Occupation            
##     "Rose"       "56"   "Female"     "Self"    "Kenya"

6. Operations

Matrix addition & subtraction

6.1 Matrix Addition & Subtraction Code Example

## Example
# ---
# Question: Matrix addition and subtraction require the matrices to have the same dimensions.
# Let's start by creating matrices x and y.
# ---
# OUR CODE GOES BELOW
#
x <- matrix(c(3, 9, -1, 4, 2, 6), nrow = 2)
y <- matrix(c(5, 2, 0, 9, 3, 4), nrow = 2)

# Display matrix x
# ---
#
print(x)
##      [,1] [,2] [,3]
## [1,]    3   -1    2
## [2,]    9    4    6
# Display matrix y
# ---
#
print(y)
##      [,1] [,2] [,3]
## [1,]    5    0    3
## [2,]    2    9    4
# Sum of the two matrices, cell by cell
# ---
#
print(x + y)
##      [,1] [,2] [,3]
## [1,]    8   -1    5
## [2,]   11   13   10
# Difference of the two matrices, cell by cell
# ---
#
print(x - y)
##      [,1] [,2] [,3]
## [1,]   -2   -1   -1
## [2,]    7   -5    2

Matrix Multiplication & Division

6.2 Matrix Multiplication & Division Code Example

## Challenge
# ---
# Question: Let's create two 2 x 3 matrices x and y
# ---
# OUR CODE GOES BELOW
#
x <- matrix(c(3, 9, -1, 4, 2, 6), nrow = 2)
y <- matrix(c(5, 2, 0, 9, 3, 4), nrow = 2)

print(x)
##      [,1] [,2] [,3]
## [1,]    3   -1    2
## [2,]    9    4    6
print(y)
##      [,1] [,2] [,3]
## [1,]    5    0    3
## [2,]    2    9    4
# Multiply the matrices and store the result in the variable z.
# Note that * multiplies cell by cell (as the output shows);
# it is not the algebraic matrix product.
# ---
#
z <- x * y
# Display the matrix z
# ---
#
print(z)
##      [,1] [,2] [,3]
## [1,]   15    0    6
## [2,]   18   36   24
# Lastly divide matrix x by y, again cell by cell, and store the result in z.
# The -Inf in the output comes from dividing -1 by 0.
# ---
#
z <- x / y
print(z)
##      [,1]      [,2]      [,3]
## [1,]  0.6      -Inf 0.6666667
## [2,]  4.5 0.4444444 1.5000000

Factors, Data Frames, Data Tables and Tibbles

1. Factors

As we have noted, factors are variables in R which take on a limited number of different values; such variables are often referred to as categorical variables.

In a dataset, we can distinguish two types of variables: categorical and continuous.

In a categorical variable, the value is limited and usually based on a particular finite group. For example, a categorical variable can be countries, year, gender, occupation. A continuous variable, however, can take any values, from integer to decimal. For example, we can have the revenue, price of a share, etc.

1.1 Factors Code Example

## Example
# ---
# Question: Lets create a vector v
# ---
# OUR CODE GOES BELOW
#
v <- c(1, 3, 5, 8, 2, 1, 3, 5, 3, 5)
print(v)
##  [1] 1 3 5 8 2 1 3 5 3 5
# Check whether this vector is a factor
# ---
#
print(is.factor(v))
## [1] FALSE
## Challenge
# ---
# Question: Calculate the categorical distribution as shown and work out why the output looks as it does
# (the Levels line lists each distinct value once, in sorted order)
# ---
# OUR CODE GOES BELOW
#
print(factor(v))
##  [1] 1 3 5 8 2 1 3 5 3 5
## Levels: 1 2 3 5 8
## Example
# ---
# Question: Assign factor v to x and print out x
# ---
# OUR CODE GOES BELOW
#
x <- factor(v)

print(x)
##  [1] 1 3 5 8 2 1 3 5 3 5
## Levels: 1 2 3 5 8
## Challenge
# ---
# Question: Determine whether x is a factor below.
# Hint: Use the same check that was applied to the vector v
# ---
# OUR CODE GOES BELOW
#

print(is.factor(x))
## [1] TRUE

1.2 Factors Code Example

## Example
# ---
# Question: First we create a vector as input, check whether it is a factor,
# then apply the factor function to create a factor from the vector
# ---
# OUR CODE GOES BELOW
#
data <- c(
  "East", "West", "East", "North", "North", "East",
  "West", "West", "West", "East", "North"
)

# Display the vector
print(data)
##  [1] "East"  "West"  "East"  "North" "North" "East"  "West"  "West"  "West" 
## [10] "East"  "North"
# A plain character vector is not a factor
print(is.factor(data))
## [1] FALSE
# Convert the vector into a factor with factor()
factor_data <- factor(data)

# Display the new factor: values lose their quotes and the levels are listed
print(factor_data)
##  [1] East  West  East  North North East  West  West  West  East  North
## Levels: East North West
# Confirm that the result is a factor
print(is.factor(factor_data))
## [1] TRUE

1.3 Factors Code Example

# Example  
# ---
# Creating a factor, determine and check the levels 
# ---
# OUR CODE GOES BELOW
# 
sex <- factor(c("male", "female", "female", "male"))

# Determining the levels
levels(sex)
## [1] "female" "male"
# Then checking the number of levels using nlevels()
nlevels(sex)
## [1] 2
# Sometimes, the order of the factors does not matter, other times you might want to specify the order 
# because it is meaningful (e.g., “low”, “medium”, “high”) or it is required by particular type of analysis. 
# Additionally, specifying the order of the levels allows us to compare levels:

food <- factor(c("low", "high", "medium", "high", "low", "medium", "high"))

# then print out levels of food
# (without an explicit order, the levels are sorted alphabetically)
levels(food)
## [1] "high"   "low"    "medium"
nlevels(food)
## [1] 3
# Now specify the meaningful order with the levels argument and mark the
# factor as ordered, which is what makes comparisons between levels possible
food <- factor(food, levels = c("low", "medium", "high"), ordered = TRUE)
levels(food)
## [1] "low"    "medium" "high"
# Compare the first value ("low") with the second ("high")
food[1] < food[2]
## [1] TRUE

2. Data Frames

A data frame is used for storing data tables. Unlike in a matrix, each column of a data frame can contain a different mode of data.

Creating a Dataframe

2.1 Creating a Dataframe Code Example

## Example
# ---
# Question: Lets create a data frame BMI
# ---
# OUR CODE GOES BELOW
#
BMI <- data.frame(
  gender = c("Male", "Male", "Female"),
  height = c(152, 171.5, 165),
  weight = c(81, 93, 78),
  Age = c(42, 38, 26)
)

# Display the data frame: one row per person, one column per variable
print(BMI)
##   gender height weight Age
## 1   Male  152.0     81  42
## 2   Male  171.5     93  38
## 3 Female  165.0     78  26
## Challenge
# ---
# Question: Create a data frame family with column names Name, Age, Gender and Occupation.
# Populate it with 5 of your own family members.
# ---
# OUR CODE GOES BELOW
#

fam_df <- data.frame(
  Name = c("Rose", "Caro", "Jeff"),
  Age = c(56, 31, 24),
  Gender = c("Female", "Female", "Male"),
  Occupation = c("Self", "Self", "Unemployed")
)

# Display the data frame (unlike the matrix version, Age stays numeric)
print(fam_df)
##   Name Age Gender Occupation
## 1 Rose  56 Female       Self
## 2 Caro  31 Female       Self
## 3 Jeff  24   Male Unemployed

Selecting Elements From a DataFrame

2.2 Selecting Elements From a DataFrame Code Example

## Example
# ---
# Question: Selecting elements from the BMI dataframe
# ---
# OUR CODE GOES BELOW
#

# row 1, all columns
BMI[1, ]
##   gender height weight Age
## 1   Male    152     81  42
# rows 1 to 2, all columns
BMI[1:2, ]
##   gender height weight Age
## 1   Male  152.0     81  42
## 2   Male  171.5     93  38
# column 1 (returned as a plain vector)
BMI[, 1]
## [1] "Male"   "Male"   "Female"
# columns 1 to 2 (returned as a data frame)
BMI[, 1:2]
##   gender height
## 1   Male  152.0
## 2   Male  171.5
## 3 Female  165.0
# the single value in row 1 of column 2
BMI[1, 2]
## [1] 152
## Challenge 
# ---
# Question: Select the column 2 from the BMI dataframe
# ---
# OUR CODE GOES BELOW
# 

# The question asks about BMI (not fam_df); column 2 of BMI is height.
BMI[, 2]
## [1] 152.0 171.5 165.0
## Challenge
# ---
# Question: Select the second and third members of your family
# ---
# OUR CODE GOES BELOW
#

# rows 2 to 3, all columns
print(fam_df[2:3, ])
##   Name Age Gender Occupation
## 2 Caro  31 Female       Self
## 3 Jeff  24   Male Unemployed

Sorting

2.3 Sorting Code Example

## Example
# ---
# Question: Sort the BMI dataframe by using the order() function
# ---
#

# Ascending sort on the gender column:
# order() returns the row positions in sorted order, which then index BMI
# ---
#
sorted_by_gender <- BMI[order(BMI$gender), ]

# Display sorted_by_gender
# ---
#
print(sorted_by_gender)
##   gender height weight Age
## 3 Female  165.0     78  26
## 1   Male  152.0     81  42
## 2   Male  171.5     93  38
# Descending sort on weight, obtained by negating the numeric column
# ---
#
sorted_by_weight <- BMI[order(-BMI$weight), ]

# Display sorted_by_weight
# ---
#
print(sorted_by_weight)
##   gender height weight Age
## 2   Male  171.5     93  38
## 1   Male  152.0     81  42
## 3 Female  165.0     78  26
# And sort in descending order by gender below
# ---
# OUR CODE GOES BELOW
# 
# gender is a character column, so it cannot be negated like weight;
# pass decreasing = TRUE to order() instead.
sorted_by_gender2 <- BMI[order(BMI$gender, decreasing = TRUE), ]
sorted_by_gender2
##   gender height weight Age
## 1   Male  152.0     81  42
## 2   Male  171.5     93  38
## 3 Female  165.0     78  26

3. Data Tables

As we have mentioned, a data table provides an enhanced version of data.frames.

Creating a Data Table

The data.table R package is considered as the fastest package for data manipulation.

3.2 Creating a Data Table Code Example

# Load the data.table package
# ---
#
library(data.table)
## Example
# ---
# Question: Create a data table DT
# ---
#
DT <- data.table(
  ID = c("b", "b", "b", "a", "a", "c"),
  a = 1:6,
  b = 7:12,
  c = 13:18
)

# Display the data table (row numbers are shown followed by a colon)
# ---
# OUR CODE GOES BELOW
#

DT
##    ID a  b  c
## 1:  b 1  7 13
## 2:  b 2  8 14
## 3:  b 3  9 15
## 4:  a 4 10 16
## 5:  a 5 11 17
## 6:  c 6 12 18

Selecting Elements From a Data Table

3.3 Selecting Elements From a Data Table Code Example

## Example
# ---
# Question: Select elements from the given datatable DT
# ---
# OUR CODE GOES BELOW
#

# row 1
DT[1, ]
##    ID a b  c
## 1:  b 1 7 13
# rows 1 to 2
DT[1:2, ]
##    ID a b  c
## 1:  b 1 7 13
## 2:  b 2 8 14
# column 1 comes back as a one-column data table
DT[, 1]
##    ID
## 1:  b
## 2:  b
## 3:  b
## 4:  a
## 5:  a
## 6:  c
# columns 1 to 2
DT[, 1:2]
##    ID a
## 1:  b 1
## 2:  b 2
## 3:  b 3
## 4:  a 4
## 5:  a 5
## 6:  c 6
# row 1 of column 2 also comes back as a data table
DT[1, 2]
##    a
## 1: 1
# Select the third and fourth rows from the data table
# ---
# OUR CODE GOES BELOW
#
DT[3:4, ]
##    ID a  b  c
## 1:  b 3  9 15
## 2:  a 4 10 16

Sorting a Data Table

3.4 Sorting a Data Table Code Example

## Example
# ---
# Question: Sorting the datatable in ascending order by c
# ---
# OUR CODE GOES BELOW
#

# Ascending sort on column c
#
sorted_by_c <- DT[order(DT$c), ]

# Display sorted_by_c (column c was already ascending, so the order is unchanged)
# ---
# OUR CODE GOES BELOW
#
sorted_by_c
##    ID a  b  c
## 1:  b 1  7 13
## 2:  b 2  8 14
## 3:  b 3  9 15
## 4:  a 4 10 16
## 5:  a 5 11 17
## 6:  c 6 12 18
# Descending sort on column b, obtained by negating the numeric column
# ---
#
sorted_by_b <- DT[order(-DT$b), ]

# Display sorted_by_b
# ---
# OUR CODE GOES BELOW
#

sorted_by_b
##    ID a  b  c
## 1:  c 6 12 18
## 2:  a 5 11 17
## 3:  a 4 10 16
## 4:  b 3  9 15
## 5:  b 2  8 14
## 6:  b 1  7 13

4. Tibbles

Tibbles are data frames, but they tweak some older behaviours to make life a little easier. They also have an enhanced print() method which makes them easier to use with large datasets containing complex objects.

Creating a Tibble

You can create a new tibble from individual vectors with tibble(). tibble() will automatically recycle inputs of length 1, and allows you to refer to variables that you just created, as shown below.

4.1 Creating a Tibble Code Example

# Load the tibble package first
library(tibble)
## Example
# ---
# Question: Create a tibble tb
# ---
# OUR CODE GOES BELOW
#

# Build the tibble tb: y is a single value repeated for every row,
# and z is computed from the x and y columns defined just above it
tb <- tibble(
  x = 1:5,
  y = 1,
  z = x^2 + y
)

# Display the tibble (the header shows its size and each column's type)
# ---
# OUR CODE GOES BELOW
#
tb
## # A tibble: 5 x 3
##       x     y     z
##   <int> <dbl> <dbl>
## 1     1     1     2
## 2     2     1     5
## 3     3     1    10
## 4     4     1    17
## 5     5     1    26

Selecting a Tibble Code Example

4.2 Selecting a Tibble Code Example

## Example
# ---
# Question: Find out what happens when we print the following
# ---
# OUR CODE GOES BELOW
#
# row 1
tb[1, ]
## # A tibble: 1 x 3
##       x     y     z
##   <int> <dbl> <dbl>
## 1     1     1     2
# rows 1 to 2
tb[1:2, ]
## # A tibble: 2 x 3
##       x     y     z
##   <int> <dbl> <dbl>
## 1     1     1     2
## 2     2     1     5
# column 1 comes back as a one-column tibble
tb[, 1]
## # A tibble: 5 x 1
##       x
##   <int>
## 1     1
## 2     2
## 3     3
## 4     4
## 5     5
# columns 1 to 2
tb[, 1:2]
## # A tibble: 5 x 2
##       x     y
##   <int> <dbl>
## 1     1     1
## 2     2     1
## 3     3     1
## 4     4     1
## 5     5     1
# Select the second and third rows
# ---
# OUR CODE GOES BELOW
#
tb[2:3, ]
## # A tibble: 2 x 3
##       x     y     z
##   <int> <dbl> <dbl>
## 1     2     1     5
## 2     3     1    10

Sorting a Tibble

4.3 Sorting a Tibble Code Example

## Example
# ---
# Question: Find out what happens when we sort by doing the following
# ---
#
# ascending by z
sorted_by_1 <- tb[order(tb$z), ]
sorted_by_1
## # A tibble: 5 x 3
##       x     y     z
##   <int> <dbl> <dbl>
## 1     1     1     2
## 2     2     1     5
## 3     3     1    10
## 4     4     1    17
## 5     5     1    26
# descending by x (the numeric column is negated)
sorted_by_2 <- tb[order(-tb$x), ]
sorted_by_2
## # A tibble: 5 x 3
##       x     y     z
##   <int> <dbl> <dbl>
## 1     5     1    26
## 2     4     1    17
## 3     3     1    10
## 4     2     1     5
## 5     1     1     2
# Sort tb in ascending order by x below
# ---
# OUR CODE GOES BELOW
#

sorted_by_3 <- tb[order(tb$x), ]
sorted_by_3
## # A tibble: 5 x 3
##       x     y     z
##   <int> <dbl> <dbl>
## 1     1     1     2
## 2     2     1     5
## 3     3     1    10
## 4     4     1    17
## 5     5     1    26

Missing Data

1. Finding Missing Values

In R, the missing values are shown by the symbol NA. To identify missing values in your dataset the function is.na() is normally used.

Finding Missing Values Example 1.1

## Example
# ---
# Lets create a dataset dt
# ---
# OUR CODE GOES BELOW
#
Name <- c("John", "Tim", NA)
Sex <- c("men", "men", "women")
Age <- c(45, 53, NA)
dt <- data.frame(Name, Sex, Age)

# Display the dataset; missing entries are shown as <NA> or NA
print(dt)
##   Name   Sex Age
## 1 John   men  45
## 2  Tim   men  53
## 3 <NA> women  NA
# Identify the missing data in the dataset with is.na():
# TRUE marks a missing cell
# ---
#
print(is.na(dt))
##       Name   Sex   Age
## [1,] FALSE FALSE FALSE
## [2,] FALSE FALSE FALSE
## [3,]  TRUE FALSE  TRUE
# Example
# ---
# The number of missing values in each column
# is obtained by summing those TRUE values with colSums()
# ---
# OUR CODE GOES BELOW
#
print(colSums(is.na(dt)))
## Name  Sex  Age 
##    1    0    1

2. Dealing with Missing Values

Using na.omit() to omit all rows containing missing values.

Dealing with Missing Values Code Example 2.1

## Example
# ---
# Question: Show all rows from the dataset which don't contain any missing values
# ---
# OUR CODE GOES BELOW
#
# na.omit() returns only the complete rows
print(na.omit(dt))
##   Name Sex Age
## 1 John men  45
## 2  Tim men  53

Dealing with Missing Values Code Example 2.2 (fill the missing value in a column with a number)

## Example
# ---
# Question: Recode/fill the missing value in a column with a number
# ---
# OUR CODE GOES BELOW
#
# Locate the missing ages, then overwrite just those positions with 99
age_missing <- is.na(dt$Age)
dt$Age[age_missing] <- 99

print(dt)
##   Name   Sex Age
## 1 John   men  45
## 2  Tim   men  53
## 3 <NA> women  99

Dealing with Missing Values Code Example 2.3 (fill the missing value in a column with the mean value of the column)

## Example
# ---
# Question: Recode or fill the missing value in a column with the mean value of the column
# ---
# OUR CODE GOES BELOW
#
# The previous example already replaced the missing Age with 99, so rebuild
# the dataset with its original missing values first; otherwise there would
# be nothing left to fill.
dt <- data.frame(
  Name = c("John", "Tim", NA),
  Sex = c("men", "men", "women"),
  Age = c(45, 53, NA)
)

# na.rm = TRUE makes mean() ignore the missing entry; without it the mean
# itself would be NA.
dt$Age[is.na(dt$Age)] <- mean(dt$Age, na.rm = TRUE)

# print the dt table below

dt
##   Name   Sex Age
## 1 John   men  45
## 2  Tim   men  53
## 3 <NA> women  49
## Challenge 1
# ---
# Question: Using the given bus dataset below, recode the missing values of the payment_method 
# and travel_to columns with athen appropriate values
# ---
# OUR CODE GOES BELOW
#

# Lets first of all import our data table
# ---
#
#library("data.table")
#bus_dataset <- fread('http://bit.ly/BusNairobiWesternTransport')

bus_dataset <- read.csv("C:/Users/user/Downloads/buses-western-nairobi.csv")


# First, have a look at the dataset
# --
#
head(bus_dataset)
##   ride_id seat_number payment_method payment_receipt travel_date travel_time
## 1    1442         15A          Mpesa      UZUEHCBUSO    17-10-17        7:15
## 2    5437         14A          Mpesa      TIHLBUSGTE    19-11-17        7:12
## 3    5710          8B          Mpesa      EQX8Q5G19O    26-11-17        7:05
## 4    5777         19A          Mpesa      SGP18CL0ME    27-11-17        7:10
## 5    5778         11A          Mpesa      BM97HFRGL9    27-11-17        7:12
## 6    5777         18B          Mpesa      B6PBDU30IZ    27-11-17        7:10
##   travel_from travel_to car_type max_capacity
## 1      Migori   Nairobi      Bus           49
## 2      Migori   Nairobi      Bus           49
## 3      Keroka   Nairobi      Bus           49
## 4    Homa Bay   Nairobi      Bus           49
## 5      Migori   Nairobi      Bus           49
## 6    Homa Bay   Nairobi      Bus           49
colSums(is.na(bus_dataset))
##         ride_id     seat_number  payment_method payment_receipt     travel_date 
##               0               0               0               0               0 
##     travel_time     travel_from       travel_to        car_type    max_capacity 
##               0               0               0               0               0
## Challenge 2
# ---
# Question: Clean the given dataset 
# ---
# Dataset url = http://bit.ly/MS-PropertyDataset
# ---
# OUR CODE GOES BELOW
# 

df_property <- read.csv("C:/Users/user/Downloads/property-data.csv")

df_property
##         PID ST_NUM    ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH SQ_FT
## 1 100001000    104     PUTNAM            Y            3        1  1000
## 2 100002000    197  LEXINGTON            N            3      1.5    --
## 3 100003000     NA  LEXINGTON            N          n/a        1   850
## 4 100004000    201   BERKELEY           12            1      NaN   700
## 5        NA    203   BERKELEY            Y            3        2  1600
## 6 100006000    207   BERKELEY            Y         <NA>        1   800
## 7 100007000     NA WASHINGTON                         2   HURLEY   950
## 8 100008000    213    TREMONT            Y            1        1      
## 9 100009000    215    TREMONT            Y           na        2  1800
colSums(is.na(df_property))
##          PID       ST_NUM      ST_NAME OWN_OCCUPIED NUM_BEDROOMS     NUM_BATH 
##            1            2            0            0            1            0 
##        SQ_FT 
##            0
# Drop every row that holds at least one NA.
# NOTE(review): na.omit() only removes real NA values; text placeholders such as
# "--", "na" and empty strings are ordinary strings to R and remain in the result.
df_property_clean <- na.omit(object = df_property)

df_property_clean
##         PID ST_NUM   ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH SQ_FT
## 1 100001000    104    PUTNAM            Y            3        1  1000
## 2 100002000    197 LEXINGTON            N            3      1.5    --
## 4 100004000    201  BERKELEY           12            1      NaN   700
## 8 100008000    213   TREMONT            Y            1        1      
## 9 100009000    215   TREMONT            Y           na        2  1800
## Challenge 3
# ---
# Question: 
# ---
# Dataset url = http://bit.ly/AirQualityDataset
# ---
# OUR CODE GOES BELOW
#

# NO DATASET AVAILABLE

Outliers

1. Screening for Outliers

In the process of producing, collecting, processing and analyzing data, outliers can come from many sources and hide in many dimensions. An outlier is an observation that is numerically distant from the rest of the data. When reviewing a boxplot, an outlier is defined as a data point that is located outside the fences (“whiskers”) of the boxplot.

## Example 
# ---
# Let's create the vector A
# ---
# 

A <- c(3, 2, 5, 6, 4, 8, 1, 2, 30, 2, 4)

# then print it out 
A
##  [1]  3  2  5  6  4  8  1  2 30  2  4
# We then plot a boxplot to help us visualise any existing outliers 
# ---
#  
boxplot(A)

# Then use the function boxplot.stats which lists the outliers in the vectors
# ---
# 
boxplot.stats(A)$out
## [1] 30

Outliers should be investigated carefully. Often they contain valuable information about the process under investigation or the data gathering and recording process. Before considering the possible elimination of these points from the data, one should try to understand why they appeared and whether it is likely similar values will continue to appear. Of course, outliers are often bad data points.

2. Obvious Inconsistencies

An obvious inconsistency occurs when a record contains a value or combination of values that cannot correspond to a real-world situation. For example, a person’s age cannot be negative, a man cannot be pregnant and an underage person cannot possess a driver’s license.

## Example 
# ---
# Say that in our vector A above, values greater than 20 are obvious inconsistencies;
# we can then use a logical index to flag them
# ---
#
non_greater_than_20 <- A > 20

# printing out non_greater_than_20 (despite its name, TRUE marks the values greater than 20)
non_greater_than_20
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::between()   masks data.table::between()
## x dplyr::filter()    masks stats::filter()
## x dplyr::first()     masks data.table::first()
## x dplyr::lag()       masks stats::lag()
## x dplyr::last()      masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
# Challenge 
# ---
# Question: Use the given bus dataset below, determine whether there are any obvious inconsistencies 
# ---
# Dataset url = http://bit.ly/BusNairobiWesternTransport
# ---
# OUR CODE GOES BELOW
# 



# Importing our database
# ---
# 
#install.packages("data.table") # install package data.table to work with data tables
#library(data.table) # load package
#install.packages("tidyverse") # install packages to work with data frame - extends into visualization

head(bus_dataset)
##   ride_id seat_number payment_method payment_receipt travel_date travel_time
## 1    1442         15A          Mpesa      UZUEHCBUSO    17-10-17        7:15
## 2    5437         14A          Mpesa      TIHLBUSGTE    19-11-17        7:12
## 3    5710          8B          Mpesa      EQX8Q5G19O    26-11-17        7:05
## 4    5777         19A          Mpesa      SGP18CL0ME    27-11-17        7:10
## 5    5778         11A          Mpesa      BM97HFRGL9    27-11-17        7:12
## 6    5777         18B          Mpesa      B6PBDU30IZ    27-11-17        7:10
##   travel_from travel_to car_type max_capacity
## 1      Migori   Nairobi      Bus           49
## 2      Migori   Nairobi      Bus           49
## 3      Keroka   Nairobi      Bus           49
## 4    Homa Bay   Nairobi      Bus           49
## 5      Migori   Nairobi      Bus           49
## 6    Homa Bay   Nairobi      Bus           49
# Previewing the dataset
# ---
# 
#View(bus_dataset)
str(bus_dataset)
## 'data.frame':    51645 obs. of  10 variables:
##  $ ride_id        : int  1442 5437 5710 5777 5778 5777 5777 5778 5778 5781 ...
##  $ seat_number    : chr  "15A" "14A" "8B" "19A" ...
##  $ payment_method : chr  "Mpesa" "Mpesa" "Mpesa" "Mpesa" ...
##  $ payment_receipt: chr  "UZUEHCBUSO" "TIHLBUSGTE" "EQX8Q5G19O" "SGP18CL0ME" ...
##  $ travel_date    : chr  "17-10-17" "19-11-17" "26-11-17" "27-11-17" ...
##  $ travel_time    : chr  "7:15" "7:12" "7:05" "7:10" ...
##  $ travel_from    : chr  "Migori" "Migori" "Keroka" "Homa Bay" ...
##  $ travel_to      : chr  "Nairobi" "Nairobi" "Nairobi" "Nairobi" ...
##  $ car_type       : chr  "Bus" "Bus" "Bus" "Bus" ...
##  $ max_capacity   : int  49 49 49 49 49 49 49 49 49 49 ...
dim(bus_dataset)
## [1] 51645    10
class(bus_dataset)
## [1] "data.frame"
# Identifying the numeric class in the data and evaluating if there are any outliers
# ---
# OUR CODE GOES BELOW
# 

boxplot(bus_dataset$max_capacity)

Duplicated Data

1. Identifying Duplicated Data

R checks for duplicates across rows through the duplicated() function.

Identifying Duplicated Data Code Example 1.1

## Example 
# ---
# Question: Identify duplicate data in the given dataframe
# ---
# OUR CODE GOES BELOW
# 

# Creating our vectors
# ---
# 
x1 <- c(2, 4, 5, 6)
x2 <- c(2, 3, 5, 6)
x3 <- c(2, 4, 5, 6)
x4 <- c(2, 4, 5, 6)

# Create a dataframe df from the above vectors
# ---
#
df <- data.frame(rbind(x1, x2, x3, x4))

# Then printing out this dataset 
df
##    X1 X2 X3 X4
## x1  2  4  5  6
## x2  2  3  5  6
## x3  2  4  5  6
## x4  2  4  5  6
# Now lets find the duplicated rows in the dataset df 
# and assign to a variable duplicated_rows below
# ---
# 
duplicated_rows <- df[duplicated(df),]

# Lets print out the variable duplicated_rows and see these duplicated rows 
# ---
# OUR CODE GOES BELOW
#

duplicated_rows
##    X1 X2 X3 X4
## x3  2  4  5  6
## x4  2  4  5  6
# Removing these duplicated rows in the dataset or 
# showing these unique items and assigning to a variable unique_items below
# ---
#
unique_items <- df[!duplicated(df), ]

# What about seeing what these unique items are?
# ---
#
unique_items
##    X1 X2 X3 X4
## x1  2  4  5  6
## x2  2  3  5  6
# Now there is another way we can also remove duplicated rows 
# in the dataset or show the unique items;
# We simply use the unique() function
# ---
#
unique_items2 <- unique(df)

# After having assigned the unique items to the variable unique_items2, 
# we will now print out this variable and have a look at these unique items 
# ---
# OUR CODE GOES BELOW
# 

unique_items2
##    X1 X2 X3 X4
## x1  2  4  5  6
## x2  2  3  5  6
## Challenge 
# ---
# Question: Display the duplicate records in the iris dataset below, then delete them:
# ---
# OUR CODE GOES BELOW 
# 

# Showing the first 6 records in the iris dataset
# ---
# 

head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# Deleting duplicate records
# ---
# OUR CODE GOES BELOW 
# 

duplicates <- iris[duplicated(iris),]
duplicates
##     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 143          5.8         2.7          5.1         1.9 virginica
iris_unique <- unique(iris)
head(iris_unique)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
## Challenge 
# ---
# Question: Drop duplicate records in the video_games dataset from the url 
# ---
# OUR CODE GOES BELOW
# 


# Reading our dataset
# ---
# 
video_games <- read.csv("C:/Users/user/Downloads/steam-200k.csv")

# Previewing the first 6 records of the video games dataset
# ---
# 
head(video_games)
##   X151603712 The.Elder.Scrolls.V.Skyrim purchase  X1.0 X0
## 1  151603712 The Elder Scrolls V Skyrim     play 273.0  0
## 2  151603712                  Fallout 4 purchase   1.0  0
## 3  151603712                  Fallout 4     play  87.0  0
## 4  151603712                      Spore purchase   1.0  0
## 5  151603712                      Spore     play  14.9  0
## 6  151603712          Fallout New Vegas purchase   1.0  0
# Cleaning our dataset 
# ---
# OUR CODE GOES BELOW
# 
games_unique <- unique(video_games)
head(games_unique)
##   X151603712 The.Elder.Scrolls.V.Skyrim purchase  X1.0 X0
## 1  151603712 The Elder Scrolls V Skyrim     play 273.0  0
## 2  151603712                  Fallout 4 purchase   1.0  0
## 3  151603712                  Fallout 4     play  87.0  0
## 4  151603712                      Spore purchase   1.0  0
## 5  151603712                      Spore     play  14.9  0
## 6  151603712          Fallout New Vegas purchase   1.0  0

Univariate Graphical Exploratory Data Analysis

1. Measures of Central Tendency

Before embarking on developing statistical models and generating predictions, it is essential to understand our data. This is typically done using conventional numerical and graphical methods.

## Example 
# ---
# We will be using the hills dataset in this section, 
# this dataset contains information on hill climbs made by various athletes
# ---
# OUR CODE GOES BELOW
# 

# Printing the dataset (hills is small enough to print in full; use head(hills) for only the first six rows)
# ---
# 
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
hills
##                  dist climb    time
## Greenmantle       2.5   650  16.083
## Carnethy          6.0  2500  48.350
## Craig Dunain      6.0   900  33.650
## Ben Rha           7.5   800  45.600
## Ben Lomond        8.0  3070  62.267
## Goatfell          8.0  2866  73.217
## Bens of Jura     16.0  7500 204.617
## Cairnpapple       6.0   800  36.367
## Scolty            5.0   800  29.750
## Traprain          6.0   650  39.750
## Lairig Ghru      28.0  2100 192.667
## Dollar            5.0  2000  43.050
## Lomonds           9.5  2200  65.000
## Cairn Table       6.0   500  44.133
## Eildon Two        4.5  1500  26.933
## Cairngorm        10.0  3000  72.250
## Seven Hills      14.0  2200  98.417
## Knock Hill        3.0   350  78.650
## Black Hill        4.5  1000  17.417
## Creag Beag        5.5   600  32.567
## Kildcon Hill      3.0   300  15.950
## Meall Ant-Suidhe  3.5  1500  27.900
## Half Ben Nevis    6.0  2200  47.633
## Cow Hill          2.0   900  17.933
## N Berwick Law     3.0   600  18.683
## Creag Dubh        4.0  2000  26.217
## Burnswark         6.0   800  34.433
## Largo Law         5.0   950  28.567
## Criffel           6.5  1750  50.500
## Acmony            5.0   500  20.950
## Ben Nevis        10.0  4400  85.583
## Knockfarrel       6.0   600  32.383
## Two Breweries    18.0  5200 170.250
## Cockleroi         4.5   850  28.100
## Moffat Chase     20.0  5000 159.833

Mean Code Example 1.1

## Example  
# ---
# Question: Find the mean of the distance covered by the athletes 
# and assigning the mean to the variable athletes.dist.mean
# ---
# OUR CODE GOES BELOW
# 

athletes.dist.mean <- mean(hills$dist)

# Printing out
# ---
#
athletes.dist.mean
## [1] 7.528571

Median Code Example 1.2

## Example 
# ---
# Question: Find the median which is the middle most value of the distance covered dist
# ---
# OUR CODE GOES BELOW
# 
athletes.dist.median <- median(hills$dist)

# Printing out athletes.dist.median
# ---
# 
athletes.dist.median
## [1] 6

Mode Code Example 1.3

## Example 
# ---
# Question: Find the mode which is the value that has highest number of occurrences in a set of data. 
# ---
# OUR CODE GOES BELOW
# 

# Unfortunately, R does not have a standard built-in function to calculate the mode, so we have to build one.
# We create the getmode() function that will perform the mode operation for us
# ---
# 
# getmode(v): return the most frequent value of the vector v.
# When several values are tied, the one that appears first in v is returned.
getmode <- function(v) {
  distinct_values <- unique(v)
  # match() maps each element to the position of its value in distinct_values,
  # and tabulate() counts how often each position occurs
  occurrence_counts <- tabulate(match(v, distinct_values))
  distinct_values[which.max(occurrence_counts)]
}

# Calculating the mode using our getmode() function
# ---
#
athletes.dist.mode <- getmode(hills$dist)

# Then printing out athletes.dist.mode 
# ---
# OUR CODE GOES BELOW
# 

athletes.dist.mode
## [1] 6
## Challenge 
# ---
# Question: Find the mean, median, mode of the total evening calls given the following dataset 
# ---
# Dataset url = http://bit.ly/CustomerSignatureforChurnAnalysis
# ---
# OUR CODE GOES BELOW 

# Previewing the first 6 rows of this dataset
# ---
# 

# Load the churn dataset from the local CSV copy, then show its first six rows
customer <- read.csv(file = "C:/Users/user/Downloads/customer_signature_for_churn_analysis.csv")
head(customer, n = 6)
##   recordID state account_length area_code international_plan voice_mail_plan
## 1        1    HI            101       510                 no              no
## 2        2    MT            137       510                 no              no
## 3        3    OH            103       408                 no             yes
## 4        4    NM             99       415                 no              no
## 5        5    SC            108       415                 no              no
## 6        6    IA            117       415                 no              no
##   number_vmail_messages total_day_minutes total_day_calls total_day_charge
## 1                     0              70.9             123            12.05
## 2                     0             223.6              86            38.01
## 3                    29             294.7              95            50.10
## 4                     0             216.8             123            36.86
## 5                     0             197.4              78            33.56
## 6                     0             226.5              85            38.51
##   total_eve_minutes total_eve_calls total_eve_charge total_night_minutes
## 1             211.9              73            18.01               236.0
## 2             244.8             139            20.81                94.2
## 3             237.3             105            20.17               300.3
## 4             126.4              88            10.74               220.6
## 5             124.0             101            10.54               204.5
## 6             141.6              68            12.04               223.0
##   total_night_calls total_night_charge total_intl_minutes total_intl_calls
## 1                73              10.62               10.6                3
## 2                81               4.24                9.5                7
## 3               127              13.51               13.7                6
## 4                82               9.93               15.7                2
## 5               107               9.20                7.7                4
## 6                90              10.04                6.9                5
##   total_intl_charge number_customer_service_calls churn customer_id
## 1              2.86                             3    no    23383607
## 2              2.57                             0    no    22550362
## 3              3.70                             1    no    59063354
## 4              4.24                             1    no    25464504
## 5              2.08                             2    no      691824
## 6              1.86                             1    no    24456543
str(customer)
## 'data.frame':    12892 obs. of  22 variables:
##  $ recordID                     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ state                        : chr  "HI" "MT" "OH" "NM" ...
##  $ account_length               : int  101 137 103 99 108 117 63 94 138 128 ...
##  $ area_code                    : int  510 510 408 415 415 415 415 408 510 415 ...
##  $ international_plan           : chr  "no" "no" "no" "no" ...
##  $ voice_mail_plan              : chr  "no" "no" "yes" "no" ...
##  $ number_vmail_messages        : int  0 0 29 0 0 0 32 0 0 43 ...
##  $ total_day_minutes            : num  70.9 223.6 294.7 216.8 197.4 ...
##  $ total_day_calls              : int  123 86 95 123 78 85 124 97 117 100 ...
##  $ total_day_charge             : num  12.1 38 50.1 36.9 33.6 ...
##  $ total_eve_minutes            : num  212 245 237 126 124 ...
##  $ total_eve_calls              : int  73 139 105 88 101 68 125 112 46 89 ...
##  $ total_eve_charge             : num  18 20.8 20.2 10.7 10.5 ...
##  $ total_night_minutes          : num  236 94.2 300.3 220.6 204.5 ...
##  $ total_night_calls            : int  73 81 127 82 107 90 120 106 71 92 ...
##  $ total_night_charge           : num  10.62 4.24 13.51 9.93 9.2 ...
##  $ total_intl_minutes           : num  10.6 9.5 13.7 15.7 7.7 6.9 12.9 11.1 9.9 11.9 ...
##  $ total_intl_calls             : int  3 7 6 2 4 5 3 6 4 1 ...
##  $ total_intl_charge            : num  2.86 2.57 3.7 4.24 2.08 1.86 3.48 3 2.67 3.21 ...
##  $ number_customer_service_calls: int  3 0 1 1 2 1 1 0 2 0 ...
##  $ churn                        : chr  "no" "no" "no" "no" ...
##  $ customer_id                  : num  23383607 22550362 59063354 25464504 691824 ...
# Finding the mean
# ---
# Mean of the total evening calls
customer.mean.eve.calls <- mean(customer$total_eve_calls)
customer.mean.eve.calls

# Finding the median 
# ---
# Median (middle value) of the total evening calls
customer.median.eve.calls <- median(customer$total_eve_calls)
customer.median.eve.calls

# Finding the mode
# ---
# Reusing the getmode() function built in the Mode example above,
# since base R has no function that returns the statistical mode
customer.mode.eve.calls <- getmode(customer$total_eve_calls)
customer.mode.eve.calls

2. Measures of Dispersion

Minimum Code Example 1.4

## Example 
# ---
# Question: Find the minimum element of the distance using the min() function
# ---
# OUR CODE GOES BELOW
# 
athletes.dist.min <- min(hills$dist)

# And then printing athletes.dist.min to show the minimum element
# 
athletes.dist.min
## [1] 2

Maximum Code Example 1.5

## Example
# ---
# Question: Find the maximum element of the distance using the function max() 
# ---
# OUR CODE GOES BELOW 
# 
athletes.dist.max <- max(hills$dist)

# Then printing out the variable athletes.dist.max to show that maximum element
# ---
# OUR CODE GOES BELOW
#

athletes.dist.max
## [1] 28

Range Code Example 1.6

## Example 
# ---
# Question: Find the range (the minimum and maximum elements) of the distance using the function range() as shown below
# ---
# 
athletes.dist.range <- range(hills$dist)

# Printing out the variable athletes.dist.range to show the range 
# ---
#
athletes.dist.range
## [1]  2 28

Quantile Code Example 1.7

## Example 
# ---
# Question: Get the first and the third quartile together with the range 
# and the median using the quantile() function
# ---
# OUR CODE GOES BELOW
# 
athletes.dist.quantile <- quantile(hills$dist)

# Printing out the variable athletes.dist.quantile to show the quantiles
# ---
# OUR CODE GOES BELOW
# 

athletes.dist.quantile
##   0%  25%  50%  75% 100% 
##  2.0  4.5  6.0  8.0 28.0

Variance Code Example 1.8

## Example 
# ---
# Question: Find the variance of the distance using the var() function as shown below
# ---
# OUR CODE GOES BELOW 
# 

athletes.dist.variance <- var(hills$dist)

# Printing out the variable athletes.dist.variance to show the variance 
# 
athletes.dist.variance
## [1] 30.51387

The variance is a numerical measure of how the data values are dispersed around the mean.

Standard Deviation Code Example 1.9

## Example 
# ---
# Question: Find the standard deviation of the distance using the sd() function 
# ---
# OUR CODE GOES BELOW 
# 
athletes.dist.sd <- sd(hills$dist)

# Printing out the variable athletes.dist.sd to show the standard deviation 
# ---
#
athletes.dist.sd
## [1] 5.523936
# Challenge 
# ---
# Question: Find the minimum, maximum, range, quantile, variance 
# and standard deviation for total day calls using the given dataset
# ---
# Dataset url = http://bit.ly/CustomerSignatureforChurnAnalysis
# ---
# OUR CODE GOES BELOW
# 



# Find the minimum of total day calls
# ---
# OUR CODE GOES BELOW
# 
customer.min.calls <- min(customer$total_day_calls)
customer.min.calls
## [1] 0
# Find the maximum i.e. max() total day calls
# ---
# OUR CODE GOES BELOW
#

customer.max.calls <- max(customer$total_day_calls)
customer.max.calls
## [1] 165
# Find the range i.e. range() of total day calls
# ---
# OUR CODE GOES BELOW
# 

# range() returns both extremes as c(minimum, maximum);
# max() alone would only repeat the maximum found in the previous step
customer.range.calls <- range(customer$total_day_calls)
customer.range.calls
## [1]   0 165
# Find the quantile of total day calls
# ---
# OUR CODE GOES BELOW
# 

customer.quantile.calls <- quantile(customer$total_day_calls)
customer.quantile.calls
##   0%  25%  50%  75% 100% 
##    0   87  101  114  165
# Find the variance of total day calls
# ---
# OUR CODE GOES BELOW
# 

customer.var.calls <- var(customer$total_day_calls)
customer.var.calls
## [1] 397.8691
# Find the standard deviation of total day calls
# ---
# OUR CODE GOES BELOW
# 

customer.sd.calls <- sd(customer$total_day_calls)
customer.sd.calls
## [1] 19.94666

3. Univariate Graphical

Box Plots Code Example 3.1

## Example 
# ---
# Question: Lets create a boxplot graph for the distance using the boxplot() function
# ---
# OUR CODE GOES BELOW
# 

boxplot(hills$dist)

The box plot of an observation variable is a graphical representation based on its quartiles, as well as its smallest and largest values. It attempts to provide a visual shape of the data distribution.

# Build a small demo data frame with four columns
DF <- data.frame(x = 1:10, y = 10:1, z = rep(5, times = 10), a = 11:20)
DF
##     x  y z  a
## 1   1 10 5 11
## 2   2  9 5 12
## 3   3  8 5 13
## 4   4  7 5 14
## 5   5  6 5 15
## 6   6  5 5 16
## 7   7  4 5 17
## 8   8  3 5 18
## 9   9  2 5 19
## 10 10  1 5 20
# Names of the columns to leave out
drops <- c("x", "z")
# Keep every column whose name is not listed in drops
new_df <- DF[, setdiff(names(DF), drops)]
new_df
##     y  a
## 1  10 11
## 2   9 12
## 3   8 13
## 4   7 14
## 5   6 15
## 6   5 16
## 7   4 17
## 8   3 18
## 9   2 19
## 10  1 20

Bar Graph Code Example 3.2

A bar graph of a qualitative data sample consists of vertical parallel bars that show the frequency distribution graphically.

## Example 
# ---
# Create a frequency distribution of the School variable
# ---
# Dataset Info: For this example, we will use the painters dataset that ships with the MASS package (loaded earlier with library(MASS)). 
# ---
# OUR CODE GOES BELOW
# 

# Previewing the first six rows of the painters dataset
# ---
# OUR CODE GOES BELOW
#   
head(painters)
##               Composition Drawing Colour Expression School
## Da Udine               10       8     16          3      A
## Da Vinci               15      16      4         14      A
## Del Piombo              8      13     16          7      A
## Del Sarto              12      16      9          8      A
## Fr. Penni               0      15      8          0      A
## Guilio Romano          15      16      4         14      A
# Column to leave out, so that painters_n holds every column except School
dops <- "School"
painters_n <- painters[, !is.element(names(painters), dops)]
head(painters_n)
##               Composition Drawing Colour Expression
## Da Udine               10       8     16          3
## Da Vinci               15      16      4         14
## Del Piombo              8      13     16          7
## Del Sarto              12      16      9          8
## Fr. Penni               0      15      8          0
## Guilio Romano          15      16      4         14
# Fetching the school column
# ---
# 
school <- painters$School

# Applying the table() function will compute the frequency distribution of the School variable
# ---
# 
school_frequency <- table(school)

# Printing school_frequency below
# ---
#

school_frequency
## school
##  A  B  C  D  E  F  G  H 
## 10  6  6 10  7  4  7  4
# Then applying the barplot function to produce its bar graph
# ---
# 
barplot(school_frequency)

## Challenge
# ---
# Question: Create a bar graph of the total day calls in the customer signature dataset
# ---
# Dataset url = http://bit.ly/CustomerSignatureforChurnAnalysis
# ---
# OUR CODE GOES BELOW
#

day_calls <- table(customer$total_day_calls)
day_calls
## 
##   0  30  34  35  36  39  40  42  44  45  46  47  48  49  50  51  52  53  54  55 
##   6   4   1   3   4   2   7   6  10   9   1   9  11   7   1  12  14  12  25  25 
##  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75 
##  25  29  27  23  30  54  31  52  26  69  61  86  71  67 100  86  80  90 108  79 
##  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95 
## 104 142 155 141 174 143 142 184 164 195 171 192 256 214 211 243 250 224 248 270 
##  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 
## 247 274 249 236 250 249 289 213 261 290 255 261 260 225 262 163 259 204 224 192 
## 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 
## 171 223 125 162 163 173 157 162 111 126 112  96  83  92  88  51  57  75  83  35 
## 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 156 157 158 
##  33  47  53  34  45  42  21  26  18  33  21  26  18   4  22  19   4   5   4  11 
## 160 163 165 
##   4   4   4
barplot(day_calls)

Histogram Code Example 3.3

A histogram shows the frequency distribution of a quantitative variable. The area of each bar is equal to the frequency of items found in each class.

## Example
# ---
# Create a histogram using the faithful dataset 
# --- 
# Hint: we will use an R built-in data frame called faithful 
# ---
# OUR CODE GOES BELOW
# 

# Preview the first six rows of the faithful dataset
# ---
# OUR CODE GOES BELOW
# 

head(faithful)
##   eruptions waiting
## 1     3.600      79
## 2     1.800      54
## 3     3.333      74
## 4     2.283      62
## 5     4.533      85
## 6     2.883      55
# Then applying the hist() function to produce the histogram of the eruptions variable 
# ---
# 

hist(faithful$eruptions)

## Challenge 
# ---
# Question: Create a histogram of the total day minutes in the customer signature dataset 
# ---
# Dataset url = http://bit.ly/CustomerSignatureforChurnAnalysis
# ---
# OUR CODE GOES BELOW
# 

hist(customer$total_day_minutes)

Bivariate and Multivariate Graphical Data Analysis

1. Bivariate analysis

Covariance Code Example 1.1

Covariance is a statistical measure of the degree to which two variables vary together. Basically, it is a single number that summarises how the two variables change jointly. If the greater values of one variable mainly correspond with the greater values of the other variable, and the same holds for the smaller values, the variables show similar behavior and the covariance is positive. If the greater values of one variable mainly correspond to the smaller values of the other, the variables show opposite behavior and the covariance is negative. If the greater values of one variable are paired equally often with both greater and lesser values of the other, the covariance will be near zero.

## Example
# ---
# Question: Find the covariance of eruption duration and waiting time in the data set faithful 
# ---
# OUR CODE GOES BELOW
# 

# Printing out the first 6 rows of the dataset
# ---
# 
head(faithful)
##   eruptions waiting
## 1     3.600      79
## 2     1.800      54
## 3     3.333      74
## 4     2.283      62
## 5     4.533      85
## 6     2.883      55
# Assigning the eruptions column to the variable eruptions
# ---
# 
eruptions <- faithful$eruptions

# Assigning the waiting column to the variable waiting
# ---
# 
waiting<- faithful$waiting

# Using the cov() function to determine the covariance
# ---
#
cov(eruptions, waiting)
## [1] 13.97781

The covariance of eruption duration and waiting time is about 13.98. It indicates a positive linear relationship between the two variables.

## Challenge
# ---
# Question: Find out the covariance of Bwt and Hwt in the cats dataset
# ---
# OUR CODE GOES BELOW
# 

# Previewing the cats dataset
# ---
# 
head(cats)
##   Sex Bwt Hwt
## 1   F 2.0 7.0
## 2   F 2.0 7.4
## 3   F 2.0 9.5
## 4   F 2.1 7.2
## 5   F 2.1 7.3
## 6   F 2.1 7.6
# Finding out the covariance
# ---
# OUR CODE GOES BELOW
# 

bwt <- cats$Bwt
hwt <- cats$Hwt

cov(bwt,hwt)
## [1] 0.9501127

Correlation Coefficient Code Example 1.2

The correlation coefficient of two variables in a data set equals their covariance divided by the product of their individual standard deviations. It is a normalized measurement of how the two are linearly related. If the correlation coefficient is close to 1, it indicates that the variables are positively linearly related and the scatter plot falls almost along a straight line with positive slope. If it is close to -1, it indicates that the variables are negatively linearly related and the scatter plot falls almost along a straight line with negative slope. If it is close to zero, it indicates a weak linear relationship between the variables.

## Example 
# ---
# Question: Find the correlation coefficient of eruption duration and waiting time in the faithful dataset
# ---
# OUR CODE GOES BELOW
# 

# Assigning the eruptions column to the variable eruptions
# ---
# 
eruptions <- faithful$eruptions

# Assigning the waiting column to the variable waiting
# ---
#
waiting<- faithful$waiting

# Using the cor() function to determine the correlation coefficient
# ---
#
cor(eruptions, waiting)
## [1] 0.9008112

The correlation coefficient of eruption duration and waiting time is 0.90081. Because it is close to 1, we can conclude that the variables are positively linearly related.

## Challenge 
# ---
# Question: Find out the correlation coefficient of Bwt and Hwt in the cats data set below:
# ---
# OUR CODE GOES BELOW 
# 

# Previewing the cats dataset by first loading the MASS library 
# then displaying the first 6 records of this dataset
library(MASS)
head(cats)
##   Sex Bwt Hwt
## 1   F 2.0 7.0
## 2   F 2.0 7.4
## 3   F 2.0 9.5
## 4   F 2.1 7.2
## 5   F 2.1 7.3
## 6   F 2.1 7.6
cor(bwt,hwt)
## [1] 0.8041274
## Challenge
# ---
# Question: Create a correlation matrix in R using the cor() function
# ---
# Hint: http://bit.ly/RDocumentationCorrMatrix
# ---
# Dataset url = http://bit.ly/HousingDatainR
# ---
# OUR CODE GOES BELOW
# 

#hous = fread("http://bit.ly/HousingDatainR")
#hous

# cor() computes one type of coefficient per call. Passing all three method
# names does not compute all three: only the first one, "pearson", is used.
# Request it explicitly so the call says what it does.
cor(painters_n, method = "pearson")
##             Composition    Drawing      Colour Expression
## Composition  1.00000000  0.4154456 -0.09758818  0.6571846
## Drawing      0.41544563  1.0000000 -0.51696052  0.5737066
## Colour      -0.09758818 -0.5169605  1.00000000 -0.1995179
## Expression   0.65718460  0.5737066 -0.19951793  1.0000000

2. Graphical Techniques

Scatterplot Code Example 2.1

A scatter plot is a two-dimensional data visualization that uses dots to represent the values obtained for two different variables - one plotted along the x-axis and the other plotted along the y-axis. Scatter plots are used when you want to show the relationship between two variables. They are sometimes called correlation plots because they show how two variables are correlated.

## Example 
# ---
# Question: Create a scatter plot of the eruption durations and waiting intervals from the faithful dataset
# ---
# OUR CODE GOES BELOW 
# 

# Assigning the eruptions column to the variable eruptions
# ---
# 
eruptions <- faithful$eruptions

# Assigning the waiting column to the variable waiting
# ---
#
waiting <- faithful$waiting

# Creating the scatter plot using eruptions and waiting
# ---
# 
plot(eruptions, waiting, xlab="Eruption duration", ylab="Time waited")

The scatter plot above reveals a positive linear relationship between eruptions and waiting.

# Challenge 
# ---
# Question: Using the cats dataset, create a scatter plot of the Bwt and Hwt variables. 
# Does it reveal any relationship between these variables?
# ---
# OUR CODE GOES BELOW
# 

# Previewing the cats dataset
# ---
# 
#head(cats)

plot(bwt, hwt, xlab="bwt", ylab="hwt")