# Univariate Graphical Exploratory Data Analysis
## 1. Measures of Central Tendency
## Example
# ---
# We will be using the hills dataset (loaded with the MASS package) in this section.
# Each row of this dataset is a hill race, with its distance (dist), climb (climb) and time (time)
# ---
# OUR CODE GOES BELOW
#
# Printing the first six rows of the dataset
# ---
#
library(MASS)
# Show the first six races of hills
head(hills, n = 6L)
## dist climb time
## Greenmantle 2.5 650 16.083
## Carnethy 6.0 2500 48.350
## Craig Dunain 6.0 900 33.650
## Ben Rha 7.5 800 45.600
## Ben Lomond 8.0 3070 62.267
## Goatfell 8.0 2866 73.217
## Example
# ---
# Question: Find the mean of the distance covered by the athletes
# and assign the mean to the variable athletes.dist.mean
# ---
# OUR CODE GOES BELOW
#
# Arithmetic mean of the dist column of hills
athletes.dist.mean <- mean(hills[["dist"]])
# Display the stored mean
# ---
#
athletes.dist.mean
## [1] 7.528571
#### Median Code Example 1.2
## Example
# ---
# Question: Find the median, which is the middlemost value, of the distance covered (dist)
# ---
# OUR CODE GOES BELOW
#
# Median of the dist column of hills
athletes.dist.median <- with(hills, median(dist))
# Display the stored median
# ---
#
athletes.dist.median
## [1] 6
## Example
# ---
# Question: Find the mode, which is the value that has the highest number of occurrences in a set of data.
# ---
# OUR CODE GOES BELOW
#
# Unfortunately, R does not have a standard built-in function to calculate the mode, so we have to build one
# We create the getmode() function that will perform our mode operation for us
# ---
#
#' Most frequent value (statistical mode) of a vector
#'
#' Counts how often each distinct value of `v` occurs and returns the value
#' with the highest count. When several values tie for the highest count, the
#' one that appears first in `v` is returned.
#'
#' @param v An atomic vector (numeric, character, logical, factor, ...).
#' @param na.rm Should missing values be dropped before counting? Defaults to
#'   `FALSE`, in which case `NA` is counted like any other value and can be
#'   returned as the mode.
#' @return A length-one vector of the same type as `v`; `NA` of that type
#'   when there is nothing to count.
getmode <- function(v, na.rm = FALSE) {
  if (na.rm) {
    v <- v[!is.na(v)]
  }
  uniqv <- unique(v)
  # match() maps every element to the position of its distinct value, so
  # tabulate() yields one occurrence count per entry of uniqv.
  counts <- tabulate(match(v, uniqv))
  # which.max() picks the first maximum, which breaks ties in favour of the
  # value that appears earliest in v.
  uniqv[which.max(counts)]
}
# Calculating the mode using our getmode() function
# ---
#
# Most frequent value of the dist column, found with the getmode() helper
athletes.dist.mode <- getmode(hills[["dist"]])
# Display the stored mode
# ---
#
athletes.dist.mode
## [1] 6
## Challenge
# ---
# Question: Find the mean, median, mode of the total evening calls given the following dataset
# ---
url_1 <- "http://bit.ly/CustomerSignatureforChurnAnalysis"
# ---
# OUR CODE GOES BELOW
# Loading the dataset from the URL and previewing its first 6 rows
# ---
#
library(data.table)
# fread() reads the file behind the URL into a data.table
churn <- fread(url_1)
head(churn)
## recordID state account_length area_code international_plan voice_mail_plan
## 1: 1 HI 101 510 no no
## 2: 2 MT 137 510 no no
## 3: 3 OH 103 408 no yes
## 4: 4 NM 99 415 no no
## 5: 5 SC 108 415 no no
## 6: 6 IA 117 415 no no
## number_vmail_messages total_day_minutes total_day_calls total_day_charge
## 1: 0 70.9 123 12.05
## 2: 0 223.6 86 38.01
## 3: 29 294.7 95 50.10
## 4: 0 216.8 123 36.86
## 5: 0 197.4 78 33.56
## 6: 0 226.5 85 38.51
## total_eve_minutes total_eve_calls total_eve_charge total_night_minutes
## 1: 211.9 73 18.01 236.0
## 2: 244.8 139 20.81 94.2
## 3: 237.3 105 20.17 300.3
## 4: 126.4 88 10.74 220.6
## 5: 124.0 101 10.54 204.5
## 6: 141.6 68 12.04 223.0
## total_night_calls total_night_charge total_intl_minutes total_intl_calls
## 1: 73 10.62 10.6 3
## 2: 81 4.24 9.5 7
## 3: 127 13.51 13.7 6
## 4: 82 9.93 15.7 2
## 5: 107 9.20 7.7 4
## 6: 90 10.04 6.9 5
## total_intl_charge number_customer_service_calls churn customer_id
## 1: 2.86 3 no 23383607
## 2: 2.57 0 no 22550362
## 3: 3.70 1 no 59063354
## 4: 4.24 1 no 25464504
## 5: 2.08 2 no 691824
## 6: 1.86 1 no 24456543
# Mean of total_eve_calls
eve.calls.mean <- mean(churn[["total_eve_calls"]])
eve.calls.mean
## [1] 100.1371
# Median of total_eve_calls
eve.calls.median <- median(churn[["total_eve_calls"]])
eve.calls.median
## [1] 100
# Mode of total_eve_calls, found with the getmode() helper
eve.calls.mode <- getmode(churn[["total_eve_calls"]])
eve.calls.mode
## [1] 105
## 2. Measures of Dispersion
#### Minimum Code Example 1.4
## Example
# ---
# Question: Find the minimum element of the distance using the min() function
# ---
# OUR CODE GOES BELOW
#
# Smallest value in the dist column of hills
athletes.dist.min <- min(hills[["dist"]])
# Display the stored minimum
#
athletes.dist.min
## [1] 2
## Example
# ---
# Question: Find the maximum element of the distance using the function max()
# ---
# OUR CODE GOES BELOW
#
# Largest value in the dist column of hills
athletes.dist.max <- with(hills, max(dist))
# Display the stored maximum
# ---
#
athletes.dist.max
## [1] 28
#### Range Code Example 1.6
## Example
# ---
# Question: Find the range (the minimum and maximum elements) of the distance using the function range() as shown below
# ---
#
# Minimum and maximum of the dist column, returned together as a length-two vector
athletes.dist.range <- range(hills[["dist"]])
# Display the stored range
# ---
#
athletes.dist.range
## [1] 2 28
#### Quantile Code Example 1.7
## Example
# ---
# Question: Get the first and the third quartile together with the range
# and the median using the quantile() function
# ---
# OUR CODE GOES BELOW
#
# Quantiles of the dist column at quantile()'s default probabilities
athletes.dist.quantile <- with(hills, quantile(dist))
# Display the stored quantiles
# ---
#
athletes.dist.quantile
## 0% 25% 50% 75% 100%
## 2.0 4.5 6.0 8.0 28.0
#### Variance Code Example 1.8
## Example
# ---
# Question: Find the variance of the distance using the var() function as shown below
# ---
# OUR CODE GOES BELOW
#
# Variance of the dist column of hills
athletes.dist.variance <- var(hills[["dist"]])
# Display the stored variance
#
athletes.dist.variance
## [1] 30.51387
#### Standard Deviation Code Example 1.9
## Example
# ---
# Question: Find the standard deviation of the distance using the sd() function
# ---
# OUR CODE GOES BELOW
#
# Standard deviation of the dist column of hills
athletes.dist.sd <- with(hills, sd(dist))
# Display the stored standard deviation
# ---
#
athletes.dist.sd
## [1] 5.523936
# Challenge
# ---
# Question: Find the minimum, maximum, range, quantile, variance
# and standard deviation for total day calls using the given dataset
# ---
# Dataset url = http://bit.ly/CustomerSignatureforChurnAnalysis
# ---
# OUR CODE GOES BELOW
#
# Find the minimum, maximum, range, quantiles, variance and standard deviation of total day calls
# ---
# OUR CODE GOES BELOW
#
# Smallest number of day calls
day.calls.min <- min(churn[["total_day_calls"]])
day.calls.min
## [1] 0
# Largest number of day calls
day.calls.max <- max(churn[["total_day_calls"]])
day.calls.max
## [1] 165
# Minimum and maximum together
day.calls.range <- range(churn[["total_day_calls"]])
day.calls.range
## [1] 0 165
# Quantiles at quantile()'s default probabilities
day.calls.quantile <- quantile(churn[["total_day_calls"]])
day.calls.quantile
## 0% 25% 50% 75% 100%
## 0 87 101 114 165
# Variance of the day calls
day.calls.variance <- var(churn[["total_day_calls"]])
day.calls.variance
## [1] 397.8691
# Standard deviation of the day calls
day.calls.std <- sd(churn[["total_day_calls"]])
day.calls.std
## [1] 19.94666
## 3. Univariate Graphical
#### Box Plots Code Example 3.1
## Example
# ---
# Question: Let's create a boxplot graph for the distance using the boxplot() function
# ---
# OUR CODE GOES BELOW
#
# Box plot of the dist column of hills
boxplot(hills[["dist"]])

#### Bar Graph Code Example 3.2
## Example
# ---
# Create a frequency distribution of the School variable
# ---
# Dataset Info: For this example, we will use the painters dataset, which is loaded with the MASS package.
# ---
# OUR CODE GOES BELOW
#
# Previewing the first six rows of the painters dataset
# ---
# OUR CODE GOES BELOW
#
# Show the first six rows of painters
head(painters, n = 6L)
## Composition Drawing Colour Expression School
## Da Udine 10 8 16 3 A
## Da Vinci 15 16 4 14 A
## Del Piombo 8 13 16 7 A
## Del Sarto 12 16 9 8 A
## Fr. Penni 0 15 8 0 A
## Guilio Romano 15 16 4 14 A
# Fetching the school column
# ---
#
# Pull the School column out of painters
# ---
#
school <- painters[["School"]]
# Count how many rows fall into each school with table()
# ---
#
school_frequency <- table(school)
# Display the frequency table
# ---
#
school_frequency
## school
## A B C D E F G H
## 10 6 6 10 7 4 7 4
# Draw the frequency table as a bar graph with barplot()
# ---
#
barplot(school_frequency)

## Challenge
# ---
# Question: Create a bar graph of the total day calls in the customer signature dataset
# ---
# Dataset url = http://bit.ly/CustomerSignatureforChurnAnalysis
# ---
# OUR CODE GOES BELOW
#
# Pull the total_day_calls column out of churn
day_calls <- churn[["total_day_calls"]]
# Count the occurrences of each distinct value, then draw the counts as bars
day.calls_frequency <- table(day_calls)
barplot(day.calls_frequency)

#### Histogram Code Example 3.3
## Example
# ---
# Create a histogram using the faithful dataset
# ---
# Hint: we will use an R built-in data frame called faithful
# ---
# OUR CODE GOES BELOW
#
# Preview the first six rows of the faithful dataset
# ---
# OUR CODE GOES BELOW
#
# Show the first six rows of faithful
head(faithful, n = 6L)
## eruptions waiting
## 1 3.600 79
## 2 1.800 54
## 3 3.333 74
## 4 2.283 62
## 5 4.533 85
## 6 2.883 55
# Draw a histogram of the eruptions column of faithful using hist() with its
# default settings.
# NOTE(review): hist() appears to build its default title and x-axis label from
# the argument expression as written, so the call is kept in exactly this form.
# ---
#
hist(faithful$eruptions)

## Challenge
# ---
# Question: Create a histogram of the total day minutes in the customer signature dataset
# ---
# Dataset url = http://bit.ly/CustomerSignatureforChurnAnalysis
# ---
# OUR CODE GOES BELOW
# Draw a histogram of the total_day_minutes column of churn using hist() with
# its default settings.
# NOTE(review): hist() appears to build its default title and x-axis label from
# the argument expression as written, so the call is kept in exactly this form.
hist(churn$total_day_minutes)
