For your assignment you may be using a different dataset than the one included here.
Always read carefully the instructions on Sakai.
Tasks/questions to be completed/answered are highlighted in larger bolded fonts and numbered according to their section.
We are going to use tidyverse, a collection of R packages designed for data science.
Name your dataset ‘mydata’ so it is easy to work with.
Commands: read_csv() head() max() min() var() sd()
# Read the advertising data from disk and preview the first rows.
mydata <- read.csv(file = "data/Advertising.csv")
head(x = mydata)
# The row-index column is read in as `X`; give it a descriptive name.
mydata <- rename(mydata, case_number = X)
CASE NUMBER
# Descriptive statistics for case_number.
# Largest value.
case_number_max <- max(mydata$case_number)
case_number_max
[1] 200
# Smallest value.
case_number_min <- min(mydata$case_number)
case_number_min
[1] 1
# Range as a single number (max - min). Wrapping the difference in range()
# would only return that one value twice (as its own min and max).
case_number_range <- case_number_max - case_number_min
case_number_range
[1] 199
# Arithmetic mean.
case_number_mean <- mean(mydata$case_number)
case_number_mean
[1] 100.5
# Sample standard deviation.
case_number_sd <- sd(mydata$case_number)
case_number_sd
[1] 57.87918
# Sample variance (the square of the standard deviation).
case_number_variance <- var(mydata$case_number)
case_number_variance
[1] 3350
There is a large range for case_number (199). This is expected, because case_number simply numbers the observations from 1 to 200.
SALES
# Descriptive statistics for sales.
# Largest value.
sales_max <- max(mydata$sales)
sales_max
[1] 27
# Smallest value.
sales_min <- min(mydata$sales)
sales_min
[1] 1.6
# Range as a single number (max - min). Wrapping the difference in range()
# would only return that one value twice (as its own min and max).
sales_range <- sales_max - sales_min
sales_range
[1] 25.4
# Arithmetic mean.
sales_mean <- mean(mydata$sales)
sales_mean
[1] 14.0225
# Sample standard deviation.
sales_sd <- sd(mydata$sales)
sales_sd
[1] 5.217457
# Sample variance (the square of the standard deviation).
sales_variance <- var(mydata$sales)
sales_variance
[1] 27.22185
Sales has a relatively smaller range in contrast to case_number and the smallest sd.
RADIO
# Descriptive statistics for radio.
# Largest value.
radio_max <- max(mydata$radio)
radio_max
[1] 49.6
# Smallest value.
radio_min <- min(mydata$radio)
radio_min
[1] 0
# Range as a single number (max - min). Wrapping the difference in range()
# would only return that one value twice (as its own min and max).
radio_range <- radio_max - radio_min
radio_range
[1] 49.6
# Arithmetic mean.
radio_mean <- mean(mydata$radio)
radio_mean
[1] 23.264
# Sample standard deviation.
radio_sd <- sd(mydata$radio)
radio_sd
[1] 14.84681
# Sample variance (the square of the standard deviation).
radio_variance <- var(mydata$radio)
radio_variance
[1] 220.4277
Radio has a smaller variance (220.4) than TV and newspaper, although it is larger than that of sales.
TV
# Descriptive statistics for TV.
# Largest value.
tv_max <- max(mydata$TV)
tv_max
[1] 296.4
# Smallest value.
tv_min <- min(mydata$TV)
tv_min
[1] 0.7
# Range as a single number (max - min). Wrapping the difference in range()
# would only return that one value twice (as its own min and max).
tv_range <- tv_max - tv_min
tv_range
[1] 295.7
# Arithmetic mean.
tv_mean <- mean(mydata$TV)
tv_mean
[1] 147.0425
# Sample standard deviation.
tv_sd <- sd(mydata$TV)
tv_sd
[1] 85.85424
# Sample variance (the square of the standard deviation).
tv_variance <- var(mydata$TV)
tv_variance
[1] 7370.95
NEWSPAPER
# Descriptive statistics for newspaper.
# Largest value.
newspaper_max <- max(mydata$newspaper)
newspaper_max
[1] 114
# Smallest value.
newspaper_min <- min(mydata$newspaper)
newspaper_min
[1] 0.3
# Range as a single number (max - min). Wrapping the difference in range()
# would only return that one value twice (as its own min and max).
newspaper_range <- newspaper_max - newspaper_min
newspaper_range
[1] 113.7
# Arithmetic mean.
newspaper_mean <- mean(mydata$newspaper)
newspaper_mean
[1] 30.554
# Sample standard deviation.
newspaper_sd <- sd(mydata$newspaper)
newspaper_sd
[1] 21.77862
# Sample variance (the square of the standard deviation).
newspaper_variance <- var(mydata$newspaper)
newspaper_variance
[1] 474.3083
Newspaper is comparable to the other variables and has a moderate s.d. (21.8), which lies between radio’s s.d. (14.8) and TV’s s.d. (85.9).
# Minimum, quartiles, mean and maximum of every column in one table.
print(summary(object = mydata))
case_number TV radio newspaper sales
Min. : 1.00 Min. : 0.70 Min. : 0.000 Min. : 0.30 Min. : 1.60
1st Qu.: 50.75 1st Qu.: 74.38 1st Qu.: 9.975 1st Qu.: 12.75 1st Qu.:10.38
Median :100.50 Median :149.75 Median :22.900 Median : 25.75 Median :12.90
Mean :100.50 Mean :147.04 Mean :23.264 Mean : 30.55 Mean :14.02
3rd Qu.:150.25 3rd Qu.:218.82 3rd Qu.:36.525 3rd Qu.: 45.10 3rd Qu.:17.40
Max. :200.00 Max. :296.40 Max. :49.600 Max. :114.00 Max. :27.00
Overall, the three advertising channels cover a large range of values. For instance, maximum values range from 49.6 to 296.4, and mean values range from 23.3 to 147.0. On the other hand, minimum values are more concentrated, from 0 to 0.7.
# Outlier check for radio using the 1.5 * IQR rule.
quantile(mydata$radio, na.rm = TRUE)
0% 25% 50% 75% 100%
0.000 9.975 22.900 36.525 49.600
# lowerq = quantile(VARIABLE)[2]
# upperq = quantile(VARIABLE)[4]
# Element 2 of the quantile vector is the 25% quantile (Q1) and element 4 is
# the 75% quantile (Q3). Without the index, both variables hold all five
# quantiles and the IQR comes out as 0. [[ ]] returns the bare number, so the
# results below print without the "25%"/"75%" labels.
lowerq <- quantile(mydata$radio)[[2]]
upperq <- quantile(mydata$radio)[[4]]
lowerq
[1] 9.975
upperq
[1] 36.525
# iqr = upperq - lowerq
iqr <- upperq - lowerq
iqr
[1] 26.55
# upper_threshold = (iqr * 1.5) + upperq
upper_threshold <- (iqr * 1.5) + upperq
# lower_threshold = lowerq - (iqr * 1.5)
lower_threshold <- lowerq - (iqr * 1.5)
upper_threshold
[1] 76.35
lower_threshold
[1] -29.85
# VARIABLE[ VARIABLE > upper_threshold][1:10]
# VARIABLE[ VARIABLE < lower_threshold][1:10]
# Values above the upper threshold or below the lower threshold are outliers.
mydata$radio[mydata$radio > upper_threshold]
numeric(0)
mydata$radio[mydata$radio < lower_threshold]
numeric(0)
upper_threshold
[1] 76.35
lower_threshold
[1] -29.85
# mydata[ VARIABLE > upper_threshold, ][1:10]
# mydata[ VARIABLE < lower_threshold, ][1:10]
# Full rows of the outlying observations, followed by how many there are.
mydata[mydata$radio > upper_threshold, ]
mydata[mydata$radio < lower_threshold, ]
count(mydata[mydata$radio < lower_threshold, ])
count(mydata[mydata$radio > upper_threshold, ])
# There are no outliers for radio: every value lies between the thresholds.
OUTLIERS FOR TV
# Outlier check for TV using the 1.5 * IQR rule.
# The column is spelled `TV` (upper case). `mydata$tv` does not exist and
# evaluates to NULL, which makes every quantile NA.
quantile(mydata$TV)
0% 25% 50% 75% 100%
0.700 74.375 149.750 218.825 296.400
# lowerq = quantile(VARIABLE)[2]
# upperq = quantile(VARIABLE)[4]
# Element 2 is the 25% quantile (Q1), element 4 the 75% quantile (Q3);
# [[ ]] returns the bare number without the "25%"/"75%" label.
lowerq <- quantile(mydata$TV)[[2]]
upperq <- quantile(mydata$TV)[[4]]
lowerq
[1] 74.375
upperq
[1] 218.825
# iqr = upperq - lowerq
iqr <- upperq - lowerq
iqr
[1] 144.45
# upper_threshold = (iqr * 1.5) + upperq
upper_threshold <- (iqr * 1.5) + upperq
# lower_threshold = lowerq - (iqr * 1.5)
lower_threshold <- lowerq - (iqr * 1.5)
# VARIABLE[ VARIABLE > upper_threshold][1:10]
# VARIABLE[ VARIABLE < lower_threshold][1:10]
# Values above the upper threshold or below the lower threshold are outliers.
mydata$TV[mydata$TV > upper_threshold]
numeric(0)
mydata$TV[mydata$TV < lower_threshold]
numeric(0)
upper_threshold
[1] 435.5
lower_threshold
[1] -142.3
# mydata[ VARIABLE > upper_threshold, ][1:10]
# mydata[ VARIABLE < lower_threshold, ][1:10]
# Full rows of the outlying observations, followed by how many there are.
mydata[mydata$TV > upper_threshold, ]
mydata[mydata$TV < lower_threshold, ]
count(mydata[mydata$TV < lower_threshold, ])
count(mydata[mydata$TV > upper_threshold, ])
# There are no outliers for TV: every value lies between the thresholds.
OUTLIERS FOR NEWSPAPER
# Outlier check for newspaper using the 1.5 * IQR rule.
quantile(mydata$newspaper)
0% 25% 50% 75% 100%
0.30 12.75 25.75 45.10 114.00
# lowerq = quantile(VARIABLE)[2]
# upperq = quantile(VARIABLE)[4]
# Element 2 of the quantile vector is the 25% quantile (Q1) and element 4 is
# the 75% quantile (Q3). Without the index, both variables hold all five
# quantiles and the IQR comes out as 0. [[ ]] returns the bare number, so the
# results below print without the "25%"/"75%" labels.
lowerq <- quantile(mydata$newspaper)[[2]]
upperq <- quantile(mydata$newspaper)[[4]]
lowerq
[1] 12.75
upperq
[1] 45.1
# iqr = upperq - lowerq
iqr <- upperq - lowerq
iqr
[1] 32.35
# upper_threshold = (iqr * 1.5) + upperq
upper_threshold <- (iqr * 1.5) + upperq
# lower_threshold = lowerq - (iqr * 1.5)
lower_threshold <- lowerq - (iqr * 1.5)
# VARIABLE[ VARIABLE > upper_threshold][1:10]
# VARIABLE[ VARIABLE < lower_threshold][1:10]
# Values above the upper threshold or below the lower threshold are outliers.
mydata$newspaper[mydata$newspaper > upper_threshold]
[1] 114.0 100.9
mydata$newspaper[mydata$newspaper < lower_threshold]
numeric(0)
upper_threshold
[1] 93.625
lower_threshold
[1] -35.775
# mydata[ VARIABLE > upper_threshold, ][1:10]
# mydata[ VARIABLE < lower_threshold, ][1:10]
# Full rows of the outlying observations, followed by how many there are.
mydata[mydata$newspaper > upper_threshold, ]
mydata[mydata$newspaper < lower_threshold, ]
count(mydata[mydata$newspaper > upper_threshold, ])
count(mydata[mydata$newspaper < lower_threshold, ])
# There are 2 outliers for newspaper (114.0 and 100.9), both above the upper
# threshold; none fall below the lower threshold.
OUTLIERS FOR SALES
# Outlier check for sales using the 1.5 * IQR rule.
quantile(mydata$sales)
0% 25% 50% 75% 100%
1.600 10.375 12.900 17.400 27.000
# lowerq = quantile(VARIABLE)[2]
# upperq = quantile(VARIABLE)[4]
# Element 2 of the quantile vector is the 25% quantile (Q1) and element 4 is
# the 75% quantile (Q3). Without the index, both variables hold all five
# quantiles and the IQR comes out as 0. [[ ]] returns the bare number, so the
# results below print without the "25%"/"75%" labels.
lowerq <- quantile(mydata$sales)[[2]]
upperq <- quantile(mydata$sales)[[4]]
lowerq
[1] 10.375
upperq
[1] 17.4
# iqr = upperq - lowerq
iqr <- upperq - lowerq
iqr
[1] 7.025
# upper_threshold = (iqr * 1.5) + upperq
upper_threshold <- (iqr * 1.5) + upperq
# lower_threshold = lowerq - (iqr * 1.5)
lower_threshold <- lowerq - (iqr * 1.5)
# VARIABLE[ VARIABLE > upper_threshold][1:10]
# VARIABLE[ VARIABLE < lower_threshold][1:10]
# Values above the upper threshold or below the lower threshold are outliers.
mydata$sales[mydata$sales > upper_threshold]
numeric(0)
mydata$sales[mydata$sales < lower_threshold]
numeric(0)
upper_threshold
[1] 27.9375
lower_threshold
[1] -0.1625
# mydata[ VARIABLE > upper_threshold, ][1:10]
# mydata[ VARIABLE < lower_threshold, ][1:10]
# Full rows of the outlying observations, followed by how many there are.
mydata[mydata$sales > upper_threshold, ]
mydata[mydata$sales < lower_threshold, ]
count(mydata[mydata$sales > upper_threshold, ])
count(mydata[mydata$sales < lower_threshold, ])
# There are no outliers for sales: every value lies between the thresholds.
PLOTTING OUTLIERS
# One horizontal box plot per variable. Columns are named bare inside aes() so
# that ggplot2 looks them up in `data`; writing mydata$column there is
# discouraged by ggplot2 and labels the axis "mydata$column".
p <- ggplot(data = mydata, aes(x = "", y = radio)) +
  geom_boxplot() +
  coord_flip()
p
p <- ggplot(data = mydata, aes(x = "", y = newspaper)) +
  geom_boxplot() +
  coord_flip()
p
p <- ggplot(data = mydata, aes(x = "", y = sales)) +
  geom_boxplot() +
  coord_flip()
p
p <- ggplot(data = mydata, aes(x = "", y = TV)) +
  geom_boxplot() +
  coord_flip()
p
From these box plots, we can see which variables have outliers. Outliers appear at the extremes of a plot, as shown on newspaper’s box plot.
# Smallest and largest value of each column, ignoring missing values.
min(mydata[["radio"]], na.rm = TRUE)
[1] 0
max(mydata[["radio"]], na.rm = TRUE)
[1] 49.6
min(mydata[["newspaper"]], na.rm = TRUE)
[1] 0.3
max(mydata[["newspaper"]], na.rm = TRUE)
[1] 114
min(mydata[["sales"]], na.rm = TRUE)
[1] 1.6
max(mydata[["sales"]], na.rm = TRUE)
[1] 27
min(mydata[["TV"]], na.rm = TRUE)
[1] 0.7
max(mydata[["TV"]], na.rm = TRUE)
[1] 296.4
min(mydata[["case_number"]], na.rm = TRUE)
[1] 1
max(mydata[["case_number"]], na.rm = TRUE)
[1] 200
library(ggplot2)
library(gridExtra)
# grid.arrange(VARIABLE_plot1, VARIABLE_plot2, VARIABLE_plot3, VARIABLE_plot4, ncol = 2)
# Scatter plot of each variable against case_number.
tv_plot <- ggplot(mydata, aes(case_number, TV)) +
  geom_point()
newspaper_plot <- ggplot(mydata, aes(case_number, newspaper)) +
  geom_point()
sales_plot <- ggplot(mydata, aes(case_number, sales)) +
  geom_point()
radio_plot <- ggplot(mydata, aes(case_number, radio)) +
  geom_point()
# Show each plot on its own.
tv_plot
newspaper_plot
sales_plot
radio_plot
# Then lay all four out in a two-column grid.
final <- grid.arrange(
  tv_plot, newspaper_plot, sales_plot, radio_plot,
  ncol = 2
)
From these graphs, one cannot conclude that there is a clear relationship between case_number and any of the other variables. In fact, no line of best fit is apparent in any of these graphs.
# Reorder the rows of mydata by ascending sales.
newdata <- mydata[order(mydata$sales), ]
newdata
# new_VARIABLE = newdata$VARIABLE
new_sales <- newdata$sales
new_TV <- newdata$TV
new_newspaper <- newdata$newspaper
new_radio <- newdata$radio
# Collect the sorted columns in one data frame so that every plot reads x and
# y from the same place, instead of combining the unsorted mydata with
# free-standing sorted vectors. The x-axis is each row's position after
# sorting by sales (1 = lowest sales), so it is labelled as a rank rather than
# as a case number. This gives the same x values as sorting case_number
# whenever case_number runs 1, 2, ..., n, as its min/max/mean above indicate.
sorted_data <- data.frame(
  sales_rank = seq_along(new_sales),
  new_TV = new_TV,
  new_newspaper = new_newspaper,
  new_sales = new_sales,
  new_radio = new_radio
)
newtv_plot <- ggplot(data = sorted_data, aes(x = sales_rank, y = new_TV)) +
  geom_point()
newnewspaper_plot <- ggplot(
  data = sorted_data,
  aes(x = sales_rank, y = new_newspaper)
) +
  geom_point()
newsales_plot <- ggplot(data = sorted_data, aes(x = sales_rank, y = new_sales)) +
  geom_point()
newradio_plot <- ggplot(data = sorted_data, aes(x = sales_rank, y = new_radio)) +
  geom_point()
# Lay all four plots out in a two-column grid.
final <- grid.arrange(
  newtv_plot, newnewspaper_plot, newsales_plot, newradio_plot,
  ncol = 2
)
#grid.arrange(newsales_plot, newtv_plot, newradio_plot, newnews_plot, ncol=2)
These new graphs are drawn after sorting the data by sales, so the x-axis shows each observation’s position in the sales ordering rather than its original case number. new_TV, new_sales, and new_radio have positive trendlines, illustrating that they tend to increase as sales increases. However, the new_sales plot appears to follow a nearly linear line of best fit with the least amount of variation from the trendline, whereas new_radio and new_TV have curved/exponential relationships. On the other hand, new_newspaper doesn’t appear to have any prominent trendline.
----------
# Standardise sales: subtract the mean and divide by the standard deviation.
z_scores <- (mydata$sales - mean(mydata$sales)) / sd(mydata$sales)
# Histogram of the z-scores with bins 0.3 wide. qplot() is deprecated in
# current ggplot2, so the plot is built with ggplot() + geom_histogram().
ggplot(data.frame(z_scores = z_scores), aes(x = z_scores)) +
  geom_histogram(binwidth = 0.3)
This histogram is roughly bell-shaped but positively skewed, which means the mode is less than the median, which in turn is less than the mean.
# z-score of a sales value of 26.7: its distance from the mean of sales,
# measured in standard deviations.
z_score <- (26.7 - mean(mydata[["sales"]])) / sd(mydata[["sales"]])
z_score
[1] 2.429824
Based on the z-value, I would rate $26,700 as a very good performance: it is about 2.4 standard deviations above the mean. However, it does not exceed the upper outlier threshold for sales (Q3 + 1.5 × IQR = 17.4 + 1.5 × 7.025 ≈ 27.94), so it is not an outlier by the IQR rule, although a value this far above the mean may still merit further investigation.
```