For your assignment you may be using a different dataset from the one included here.
Always read carefully the instructions on Sakai.
Tasks/questions to be completed/answered are highlighted in larger bolded fonts and numbered according to their section.
We are going to use tidyverse, a collection of R packages designed for data science.
## Loading required package: tidyverse
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.8.0 ✔ stringr 1.2.0
## ✔ readr 1.1.1 ✔ forcats 0.2.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
Name your dataset ‘mydata’ so it is easy to work with.
Commands: read_csv() head() max() min() var() sd()
# Load the advertising data. The first CSV column has no header, so readr
# fills in the name 'X1' (see the warning below).
mydata <- read_csv("data/Advertising.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## TV = col_double(),
## radio = col_double(),
## newspaper = col_double(),
## sales = col_double()
## )
head(mydata)
# Keep each measurement as a stand-alone vector.
sales <- mydata[["sales"]]
tv <- mydata[["TV"]]
radio <- mydata[["radio"]]
newspaper <- mydata[["newspaper"]]
# Give the auto-named index column a meaningful name.
mydata <- rename(mydata, case_number = X1)
head(mydata)
Sales
# Descriptive statistics for sales.
# Extremes
sales_min <- min(sales)
sales_max <- max(sales)
# Range: distance between the extremes (max - min)
sales_range <- sales_max - sales_min
# Central tendency
sales_mean <- mean(sales)
# Dispersion: variance and standard deviation
sales_variance <- var(sales)
sales_sd <- sd(sales)
TV
# Descriptive statistics for tv.
# Extremes
tv_min <- min(tv)
tv_max <- max(tv)
# Range: distance between the extremes (max - min)
tv_range <- tv_max - tv_min
# Central tendency
tv_mean <- mean(tv)
# Dispersion: variance and standard deviation
tv_variance <- var(tv)
tv_sd <- sd(tv)
Radio
# Descriptive statistics for radio.
# Extremes
radio_min <- min(radio)
radio_max <- max(radio)
# Range: distance between the extremes (max - min)
radio_range <- radio_max - radio_min
# Central tendency
radio_mean <- mean(radio)
# Dispersion: variance and standard deviation
radio_variance <- var(radio)
radio_sd <- sd(radio)
Newspaper
# Descriptive statistics for newspaper.
#variable_max
newspaper_max <- max(newspaper)
#variable_min
newspaper_min <- min(newspaper)
#variable_Range max-min
# Must be built from the newspaper extremes; subtracting the radio extremes
# here would silently report the radio range instead.
newspaper_range <- newspaper_max - newspaper_min
#variable_mean
newspaper_mean <- mean(newspaper)
#variable_sd Standard Deviation
newspaper_sd <- sd(newspaper)
#variable_variance
newspaper_variance <- var(newspaper)
Comparison of Features Sales
# Print each sales summary statistic; the "## [1] ..." lines are the rendered console output.
sales_max
## [1] 27
sales_min
## [1] 1.6
sales_range
## [1] 25.4
sales_mean
## [1] 14.0225
sales_sd
## [1] 5.217457
sales_variance
## [1] 27.22185
TV
# Print each tv summary statistic; the "## [1] ..." lines are the rendered console output.
tv_max
## [1] 296.4
tv_min
## [1] 0.7
tv_range
## [1] 295.7
tv_mean
## [1] 147.0425
tv_sd
## [1] 85.85424
tv_variance
## [1] 7370.95
Radio
# Print each radio summary statistic; the "## [1] ..." lines are the rendered console output.
radio_max
## [1] 49.6
radio_min
## [1] 0
radio_range
## [1] 49.6
radio_mean
## [1] 23.264
radio_sd
## [1] 14.84681
radio_variance
## [1] 220.4277
Newspaper
# Print each newspaper summary statistic; the "## [1] ..." lines are the rendered console output.
# NOTE(review): the newspaper_range printed below (49.6) equals the radio range,
# whereas the printed max and min give 114 - 0.3 = 113.7. Verify how
# newspaper_range is assigned.
newspaper_max
## [1] 114
newspaper_min
## [1] 0.3
newspaper_range
## [1] 49.6
newspaper_mean
## [1] 30.554
newspaper_sd
## [1] 21.77862
newspaper_variance
## [1] 474.3083
# Per-column summary (min, quartiles, median, mean, max) of the whole data set.
summary(mydata)
## case_number TV radio newspaper
## Min. : 1.00 Min. : 0.70 Min. : 0.000 Min. : 0.30
## 1st Qu.: 50.75 1st Qu.: 74.38 1st Qu.: 9.975 1st Qu.: 12.75
## Median :100.50 Median :149.75 Median :22.900 Median : 25.75
## Mean :100.50 Mean :147.04 Mean :23.264 Mean : 30.55
## 3rd Qu.:150.25 3rd Qu.:218.82 3rd Qu.:36.525 3rd Qu.: 45.10
## Max. :200.00 Max. :296.40 Max. :49.600 Max. :114.00
## sales
## Min. : 1.60
## 1st Qu.:10.38
## Median :12.90
## Mean :14.02
## 3rd Qu.:17.40
## Max. :27.00
Sales Outliers
# Quantiles of sales at the 0%, 25%, 50%, 75% and 100% levels.
quantile(sales)
## 0% 25% 50% 75% 100%
## 1.600 10.375 12.900 17.400 27.000
** Lower and upper quantile calculation **
# Template: lowerq <- quantile(VARIABLE, probs = 0.25)
#           upperq <- quantile(VARIABLE, probs = 0.75)
# First and third quartiles of sales.
lowerq <- quantile(sales, probs = 0.25)
upperq <- quantile(sales, probs = 0.75)
lowerq
## 25%
## 10.375
upperq
## 75%
## 17.4
Interquartile range calculation
# Template: iqr <- upperq - lowerq
# Interquartile range; the "75%" label in the output is carried over from upperq.
iqr <- upperq - lowerq
iqr
## 75%
## 7.025
** Calculating the upper threshold **
# Template: upper_threshold <- upperq + 1.5 * iqr
# Upper fence: 1.5 IQRs above the third quartile.
upper_threshold <- upperq + 1.5 * iqr
upper_threshold
## 75%
## 27.9375
** Calculating the lower threshold **
# Template: lower_threshold <- lowerq - 1.5 * iqr
# Lower fence: 1.5 IQRs below the first quartile.
lower_threshold <- lowerq - 1.5 * iqr
lower_threshold
## 25%
## -0.1625
** Identify outliers **
# Template: head(VARIABLE[VARIABLE > upper_threshold], 10)
#           head(VARIABLE[VARIABLE < lower_threshold], 10)
# Show at most ten values beyond each fence. head() is used instead of
# indexing with [1:10], which pads the result with NA when fewer than ten
# values match and makes "no outliers" look like missing data.
head(sales[sales > upper_threshold], 10)
## numeric(0)
head(sales[sales < lower_threshold], 10)
## numeric(0)
# Full rows of the data set for the same outliers.
mydata[sales > upper_threshold, ]
mydata[sales < lower_threshold, ]
TV Outliers
# Quantiles of tv at the 0%, 25%, 50%, 75% and 100% levels.
quantile(tv)
## 0% 25% 50% 75% 100%
## 0.700 74.375 149.750 218.825 296.400
** Lower and upper quantile calculation **
# Template: lowerq <- quantile(VARIABLE, probs = 0.25)
#           upperq <- quantile(VARIABLE, probs = 0.75)
# First and third quartiles of tv.
lowerq <- quantile(tv, probs = 0.25)
upperq <- quantile(tv, probs = 0.75)
lowerq
## 25%
## 74.375
upperq
## 75%
## 218.825
Interquartile range calculation
# Template: iqr <- upperq - lowerq
# Interquartile range; the "75%" label in the output is carried over from upperq.
iqr <- upperq - lowerq
iqr
## 75%
## 144.45
** Calculating the upper threshold **
# Template: upper_threshold <- upperq + 1.5 * iqr
# Upper fence: 1.5 IQRs above the third quartile.
upper_threshold <- upperq + 1.5 * iqr
upper_threshold
## 75%
## 435.5
** Calculating the lower threshold **
# Template: lower_threshold <- lowerq - 1.5 * iqr
# Lower fence: 1.5 IQRs below the first quartile.
lower_threshold <- lowerq - 1.5 * iqr
lower_threshold
## 25%
## -142.3
** Identify outliers **
# Template: head(VARIABLE[VARIABLE > upper_threshold], 10)
#           head(VARIABLE[VARIABLE < lower_threshold], 10)
# Show at most ten values beyond each fence. head() is used instead of
# indexing with [1:10], which pads the result with NA when fewer than ten
# values match and makes "no outliers" look like missing data.
head(tv[tv > upper_threshold], 10)
## numeric(0)
head(tv[tv < lower_threshold], 10)
## numeric(0)
# Full rows of the data set for the same outliers.
mydata[tv > upper_threshold, ]
mydata[tv < lower_threshold, ]
Newspaper Outliers
# Quantiles of newspaper at the 0%, 25%, 50%, 75% and 100% levels.
quantile(newspaper)
## 0% 25% 50% 75% 100%
## 0.30 12.75 25.75 45.10 114.00
** Lower and upper quantile calculation **
# Template: lowerq <- quantile(VARIABLE, probs = 0.25)
#           upperq <- quantile(VARIABLE, probs = 0.75)
# First and third quartiles of newspaper.
lowerq <- quantile(newspaper, probs = 0.25)
upperq <- quantile(newspaper, probs = 0.75)
lowerq
## 25%
## 12.75
upperq
## 75%
## 45.1
Interquartile range calculation
# Template: iqr <- upperq - lowerq
# Interquartile range; the "75%" label in the output is carried over from upperq.
iqr <- upperq - lowerq
iqr
## 75%
## 32.35
** Calculating the upper threshold **
# Template: upper_threshold <- upperq + 1.5 * iqr
# Upper fence: 1.5 IQRs above the third quartile.
upper_threshold <- upperq + 1.5 * iqr
upper_threshold
## 75%
## 93.625
** Calculating the lower threshold **
# Template: lower_threshold <- lowerq - 1.5 * iqr
# Lower fence: 1.5 IQRs below the first quartile.
lower_threshold <- lowerq - 1.5 * iqr
lower_threshold
## 25%
## -35.775
** Identify outliers **
# Template: head(VARIABLE[VARIABLE > upper_threshold], 10)
#           head(VARIABLE[VARIABLE < lower_threshold], 10)
# Show at most ten values beyond each fence. head() is used instead of
# indexing with [1:10], which pads the result with NA when fewer than ten
# values match and makes the padding look like missing data.
head(newspaper[newspaper > upper_threshold], 10)
## [1] 114.0 100.9
head(newspaper[newspaper < lower_threshold], 10)
## numeric(0)
# Full rows of the data set for the same outliers.
mydata[newspaper > upper_threshold, ]
mydata[newspaper < lower_threshold, ]
Radio Outliers
# Quantiles of radio at the 0%, 25%, 50%, 75% and 100% levels.
quantile(radio)
## 0% 25% 50% 75% 100%
## 0.000 9.975 22.900 36.525 49.600
** Lower and upper quantile calculation **
# Template: lowerq <- quantile(VARIABLE, probs = 0.25)
#           upperq <- quantile(VARIABLE, probs = 0.75)
# First and third quartiles of radio.
lowerq <- quantile(radio, probs = 0.25)
upperq <- quantile(radio, probs = 0.75)
lowerq
## 25%
## 9.975
upperq
## 75%
## 36.525
Interquartile range calculation
# Template: iqr <- upperq - lowerq
# Interquartile range; the "75%" label in the output is carried over from upperq.
iqr <- upperq - lowerq
iqr
## 75%
## 26.55
** Calculating the upper threshold **
# Template: upper_threshold <- upperq + 1.5 * iqr
# Upper fence: 1.5 IQRs above the third quartile.
upper_threshold <- upperq + 1.5 * iqr
upper_threshold
## 75%
## 76.35
** Calculating the lower threshold **
# Template: lower_threshold <- lowerq - 1.5 * iqr
# Lower fence: 1.5 IQRs below the first quartile.
lower_threshold <- lowerq - 1.5 * iqr
lower_threshold
## 25%
## -29.85
** Identify outliers **
# Template: head(VARIABLE[VARIABLE > upper_threshold], 10)
#           head(VARIABLE[VARIABLE < lower_threshold], 10)
# Show at most ten values beyond each fence. head() is used instead of
# indexing with [1:10], which pads the result with NA when fewer than ten
# values match and makes "no outliers" look like missing data.
head(radio[radio > upper_threshold], 10)
## numeric(0)
head(radio[radio < lower_threshold], 10)
## numeric(0)
# Full rows of the data set for the same outliers.
mydata[radio > upper_threshold, ]
mydata[radio < lower_threshold, ]
# Scatter each variable against the case number, arranged in a 2-column grid.
# Every aesthetic names a column of `mydata`; the TV column is spelled `TV`,
# so mapping `y = tv` would silently fall back to a free-standing global
# vector instead of the data passed to ggplot().
sales_plot1 <- ggplot(data = mydata, aes(x = case_number, y = sales)) + geom_point()
tv_plot2 <- ggplot(data = mydata, aes(x = case_number, y = TV)) + geom_point()
newspaper_plot3 <- ggplot(data = mydata, aes(x = case_number, y = newspaper)) + geom_point()
radio_plot4 <- ggplot(data = mydata, aes(x = case_number, y = radio)) + geom_point()
grid.arrange(sales_plot1, tv_plot2, newspaper_plot3, radio_plot4, ncol = 2)
# Re-order the observations by ascending sales.
newdata <- mydata[order(mydata$sales), ]
# Extract case_number from the newdata
case_number <- newdata$case_number
head(newdata)
# new_VARIABLE <- newdata$VARIABLE
new_sales <- newdata$sales
new_tv <- newdata$TV
new_radio <- newdata$radio
new_newspaper <- newdata$newspaper
# Plot each variable, in sales order, against the sorted case numbers.
# Both aesthetics come from `newdata` itself, so the data passed to ggplot()
# is the data actually drawn (rather than passing `mydata` while plotting
# free-standing vectors taken from `newdata`).
new_sales_plot <- ggplot(data = newdata, aes(x = sort(case_number), y = sales)) +
  geom_point()
new_tv_plot <- ggplot(data = newdata, aes(x = sort(case_number), y = TV)) +
  geom_point()
new_radio_plot <- ggplot(data = newdata, aes(x = sort(case_number), y = radio)) +
  geom_point()
new_newspaper_plot <- ggplot(data = newdata, aes(x = sort(case_number), y = newspaper)) +
  geom_point()
grid.arrange(new_sales_plot, new_tv_plot, new_radio_plot, new_newspaper_plot, ncol = 2)
##Note your observations on what the new plots are revealing in terms of trending relationship. TV and radio ad spend seem to have a similar pattern to sales. Newspaper still does not seem to have much of a pattern. ———-
# Standardise each variable to z-scores, (x - mean) / sd, and draw a histogram
# of the result. ggplot() + geom_histogram() replaces qplot(), which is
# deprecated in current ggplot2 releases.
sales_z_score <- (sales - mean(sales)) / sd(sales)
ggplot(data.frame(sales_z_score), aes(x = sales_z_score)) +
  geom_histogram(binwidth = 0.3)
tv_z_score <- (tv - mean(tv)) / sd(tv)
ggplot(data.frame(tv_z_score), aes(x = tv_z_score)) +
  geom_histogram(binwidth = 0.3)
radio_z_score <- (radio - mean(radio)) / sd(radio)
ggplot(data.frame(radio_z_score), aes(x = radio_z_score)) +
  geom_histogram(binwidth = 0.3)
newspaper_z_score <- (newspaper - mean(newspaper)) / sd(newspaper)
ggplot(data.frame(newspaper_z_score), aes(x = newspaper_z_score)) +
  geom_histogram(binwidth = 0.3)
## The sales z-score histogram seems to be normally distributed, while newspaper is positively skewed and radio and TV are pretty evenly distributed.
# z-score of a sales value of 26.7: its distance from the mean in standard deviations.
z_score <- (26.7 - mean(sales)) / sd(sales)
z_score
## [1] 2.429824