For your assignment you may be using a different dataset from the one included here.
Always read carefully the instructions on Sakai.
Tasks/questions to be completed/answered are highlighted in larger bolded fonts and numbered according to their section.
We are going to use the tidyverse, a collection of R packages designed for data science.
Loading required package: tidyverse
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 2.2.1     v purrr   0.2.4
v tibble  1.4.2     v dplyr   0.7.4
v tidyr   0.8.0     v stringr 1.2.0
v readr   1.1.1     v forcats 0.2.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
Loading required package: gridExtra
Attaching package: ‘gridExtra’
The following object is masked from ‘package:dplyr’:
combine
Name your dataset ‘mydata’ so it is easy to work with.
Commands: read_csv() head() max() min() var() sd()
# Load the advertising data into `mydata`. The first column of the CSV has no
# header, so readr fills in the name `X1` for it (reported in the message that
# follows this call).
mydata <- read_csv("data/Advertising.csv")
Missing column names filled in: 'X1' [1]
Parsed with column specification:
cols(
X1 = col_integer(),
TV = col_double(),
radio = col_double(),
newspaper = col_double(),
sales = col_double()
)
# Preview the first rows of the data.
head(mydata)
# Rename the auto-generated index column `X1` to `case_number`.
# The guard makes this step safe to re-run: once the column has been renamed,
# calling rename() again fails with "`X1` contains unknown variables" because
# `X1` no longer exists in `mydata`.
if ("X1" %in% names(mydata)) {
  mydata <- rename(mydata, "case_number" = "X1")
}
SALES
# Keep the sales column as a standalone vector; later steps (quantiles,
# outlier checks, z-scores) work on this vector directly.
sales <- mydata$sales
# Largest observed value of sales
sales_max <- max(mydata$sales)
sales_max
[1] 27
# Smallest observed value of sales
sales_min <- min(mydata$sales)
sales_min
[1] 1.6
# Range of sales: maximum minus minimum
sales_Range <- sales_max - sales_min
sales_Range
[1] 25.4
# Arithmetic mean of sales
sales_mean <- mean(mydata$sales)
sales_mean
[1] 14.0225
# Sample standard deviation of sales
sales_sd <- sd(mydata$sales)
sales_sd
[1] 5.217457
# Sample variance of sales (the square of the standard deviation)
sales_variance <- var(mydata$sales)
sales_variance
[1] 27.22185
TV
# Keep the TV column as a standalone vector named `tv` (lower case); later
# steps work on this vector directly.
tv <- mydata$TV
# Largest observed value of TV
tv_max <- max(mydata$TV)
tv_max
[1] 296.4
# Smallest observed value of TV
tv_min <- min(mydata$TV)
tv_min
[1] 0.7
# Range of TV: maximum minus minimum
tv_Range <- tv_max - tv_min
tv_Range
[1] 295.7
# Arithmetic mean of TV
tv_mean <- mean(mydata$TV)
tv_mean
[1] 147.0425
# Sample standard deviation of TV
tv_sd <- sd(mydata$TV)
tv_sd
[1] 85.85424
# Sample variance of TV (the square of the standard deviation)
tv_variance <- var(mydata$TV)
tv_variance
[1] 7370.95
Radio
# Keep the radio column as a standalone vector; later steps work on this
# vector directly.
radio <- mydata$radio
# Largest observed value of radio
radio_max <- max(mydata$radio)
radio_max
[1] 49.6
# Smallest observed value of radio
radio_min <- min(mydata$radio)
radio_min
[1] 0
# Range of radio: maximum minus minimum
radio_Range <- radio_max - radio_min
radio_Range
[1] 49.6
# Arithmetic mean of radio
radio_mean <- mean(mydata$radio)
radio_mean
[1] 23.264
# Sample standard deviation of radio
radio_sd <- sd(mydata$radio)
radio_sd
[1] 14.84681
# Sample variance of radio (the square of the standard deviation)
radio_variance <- var(mydata$radio)
radio_variance
[1] 220.4277
newspaper
# Keep the newspaper column as a standalone vector; later steps work on this
# vector directly.
newspaper <- mydata$newspaper
# Largest observed value of newspaper
newspaper_max <- max(mydata$newspaper)
newspaper_max
[1] 114
# Smallest observed value of newspaper
newspaper_min <- min(mydata$newspaper)
newspaper_min
[1] 0.3
# Range of newspaper: maximum minus minimum
newspaper_Range <- newspaper_max - newspaper_min
newspaper_Range
[1] 113.7
# Arithmetic mean of newspaper
newspaper_mean <- mean(mydata$newspaper)
newspaper_mean
[1] 30.554
# Sample standard deviation of newspaper
newspaper_sd <- sd(mydata$newspaper)
newspaper_sd
[1] 21.77862
# Sample variance of newspaper (the square of the standard deviation)
newspaper_variance <- var(mydata$newspaper)
newspaper_variance
[1] 474.3083
# Minimum, quartiles, median, mean and maximum of every column in one call.
summary(mydata)
case_number TV radio newspaper sales
Min. : 1.00 Min. : 0.70 Min. : 0.000 Min. : 0.30 Min. : 1.60
1st Qu.: 50.75 1st Qu.: 74.38 1st Qu.: 9.975 1st Qu.: 12.75 1st Qu.:10.38
Median :100.50 Median :149.75 Median :22.900 Median : 25.75 Median :12.90
Mean :100.50 Mean :147.04 Mean :23.264 Mean : 30.55 Mean :14.02
3rd Qu.:150.25 3rd Qu.:218.82 3rd Qu.:36.525 3rd Qu.: 45.10 3rd Qu.:17.40
Max. :200.00 Max. :296.40 Max. :49.600 Max. :114.00 Max. :27.00
# Quantiles of each variable at 0%, 25%, 50%, 75% and 100%
# (minimum, lower quartile, median, upper quartile, maximum).
quantile(sales, na.rm = TRUE)
0% 25% 50% 75% 100%
1.600 10.375 12.900 17.400 27.000
quantile(tv, na.rm = TRUE)
0% 25% 50% 75% 100%
0.700 74.375 149.750 218.825 296.400
quantile(radio, na.rm = TRUE)
0% 25% 50% 75% 100%
0.000 9.975 22.900 36.525 49.600
quantile(newspaper, na.rm = TRUE)
0% 25% 50% 75% 100%
0.30 12.75 25.75 45.10 114.00
# Lower (25%) quartile of each variable, requested directly through `probs`
# rather than picked out of the full five-number quantile table.
lowerqsales <- quantile(sales, probs = 0.25, na.rm = TRUE)
lowerqtv <- quantile(tv, probs = 0.25, na.rm = TRUE)
lowerqradio <- quantile(radio, probs = 0.25, na.rm = TRUE)
lowerqnewspaper <- quantile(newspaper, probs = 0.25, na.rm = TRUE)
# Upper (75%) quartile of each variable.
upperqsales <- quantile(sales, probs = 0.75, na.rm = TRUE)
upperqtv <- quantile(tv, probs = 0.75, na.rm = TRUE)
upperqradio <- quantile(radio, probs = 0.75, na.rm = TRUE)
upperqnewspaper <- quantile(newspaper, probs = 0.75, na.rm = TRUE)
# Interquartile range (IQR) of each variable: upper quartile minus lower quartile.
# The "75%" label printed with each result is only the element name carried
# over from the upper-quartile operand; the number itself is the IQR.
salesiqr = upperqsales - lowerqsales
salesiqr
75%
7.025
tviqr = upperqtv - lowerqtv
tviqr
75%
144.45
radioiqr = upperqradio - lowerqradio
radioiqr
75%
26.55
newspaperiqr = upperqnewspaper - lowerqnewspaper
newspaperiqr
75%
32.35
# Upper outlier threshold: upper quartile + 1.5 * IQR.
# (Again, the "75%" label is an inherited name, not a percentile of the result.)
upper_threshold_sales = (salesiqr * 1.5) + upperqsales
upper_threshold_sales
75%
27.9375
upper_threshold_tv = (tviqr * 1.5) + upperqtv
upper_threshold_tv
75%
435.5
upper_threshold_radio = (radioiqr * 1.5) + upperqradio
upper_threshold_radio
75%
76.35
upper_threshold_newspaper = (newspaperiqr * 1.5) + upperqnewspaper
upper_threshold_newspaper
75%
93.625
# Lower outlier threshold: lower quartile - 1.5 * IQR.
# (The "25%" label is the name inherited from the lower-quartile operand.)
lower_threshold_sales = lowerqsales - (salesiqr * 1.5)
lower_threshold_sales
25%
-0.1625
lower_threshold_tv = lowerqtv - (tviqr * 1.5)
lower_threshold_tv
25%
-142.3
lower_threshold_radio = lowerqradio - (radioiqr * 1.5)
lower_threshold_radio
25%
-29.85
lower_threshold_newspaper = lowerqnewspaper - (newspaperiqr * 1.5)
lower_threshold_newspaper
25%
-35.775
# List up to ten values lying above each upper threshold.
# head(x, 10) is used instead of x[1:10]: indexing past the end of a vector
# pads the result with NA, so a variable with no outliers used to print as ten
# NA values. With head(), "no outliers" prints as numeric(0) and shorter
# results are shown without padding.
head(sales[sales > upper_threshold_sales], 10)
numeric(0)
head(radio[radio > upper_threshold_radio], 10)
numeric(0)
head(tv[tv > upper_threshold_tv], 10)
numeric(0)
head(newspaper[newspaper > upper_threshold_newspaper], 10)
[1] 114.0 100.9
# List up to ten values lying below each lower threshold.
head(sales[sales < lower_threshold_sales], 10)
numeric(0)
head(tv[tv < lower_threshold_tv], 10)
numeric(0)
head(radio[radio < lower_threshold_radio], 10)
numeric(0)
head(newspaper[newspaper < lower_threshold_newspaper], 10)
numeric(0)
# Flag the observations lying beyond each threshold once, then reuse the flags
# both to display the matching rows and to count them.
above_sales <- sales > upper_threshold_sales
above_tv <- tv > upper_threshold_tv
above_radio <- radio > upper_threshold_radio
above_newspaper <- newspaper > upper_threshold_newspaper
below_sales <- sales < lower_threshold_sales
below_tv <- tv < lower_threshold_tv
below_radio <- radio < lower_threshold_radio
below_newspaper <- newspaper < lower_threshold_newspaper

# Rows of `mydata` above the upper thresholds, then below the lower thresholds.
mydata[above_sales, ]
mydata[above_tv, ]
mydata[above_radio, ]
mydata[above_newspaper, ]
mydata[below_sales, ]
mydata[below_tv, ]
mydata[below_radio, ]
mydata[below_newspaper, ]

# Number of flagged rows, in the same order.
count(mydata[above_sales, ])
count(mydata[above_tv, ])
count(mydata[above_radio, ])
count(mydata[above_newspaper, ])
count(mydata[below_sales, ])
count(mydata[below_tv, ])
count(mydata[below_radio, ])
count(mydata[below_newspaper, ])
# Scatter plot of each variable against the case number, arranged in a 2-column grid.
# Every aesthetic names a column of `mydata`. The TV column is spelled `TV`, so
# `y = TV` is used here: `y = tv` matches no column and only worked because a
# free-standing vector called `tv` happened to exist in the workspace.
sales_plot <- ggplot(data = mydata, aes(x = case_number, y = sales)) + geom_point()
tv_plot <- ggplot(data = mydata, aes(x = case_number, y = TV)) + geom_point()
radio_plot <- ggplot(data = mydata, aes(x = case_number, y = radio)) + geom_point()
newspaper_plot <- ggplot(data = mydata, aes(x = case_number, y = newspaper)) + geom_point()
grid.arrange(sales_plot, tv_plot, radio_plot, newspaper_plot, ncol = 2)
# Sort the observations by sales, smallest first.
newdata <- mydata[order(mydata[["sales"]]), ]
# Case numbers in that sales order.
case_number <- newdata[["case_number"]]
head(newdata)
# Columns of the sorted data as standalone vectors
# (pattern: new_VARIABLE <- newdata$VARIABLE).
new_sales <- newdata[["sales"]]
new_tv <- newdata[["TV"]]
new_radio <- newdata[["radio"]]
new_newspaper <- newdata[["newspaper"]]
# Plot each column of the sales-ordered data against the case numbers put back
# into ascending order. The aesthetic expressions are kept verbatim because
# ggplot2 uses their text as the default axis labels.
newsales_plot <- ggplot(
  data = newdata,
  aes(x = case_number[order(case_number)], y = new_sales)
) +
  geom_point()
newtv_plot <- ggplot(
  data = newdata,
  aes(x = case_number[order(case_number)], y = new_tv)
) +
  geom_point()
newradio_plot <- ggplot(
  data = newdata,
  aes(x = case_number[order(case_number)], y = new_radio)
) +
  geom_point()
newnewspaper_plot <- ggplot(
  data = newdata,
  aes(x = case_number[order(case_number)], y = new_newspaper)
) +
  geom_point()
grid.arrange(newsales_plot, newtv_plot, newradio_plot, newnewspaper_plot, ncol = 2)
# z-score: distance of `values` from the mean of `reference`, measured in
# standard deviations of `reference`. By default a vector is standardised
# against itself.
standardize <- function(values, reference = values) {
  (values - mean(reference)) / sd(reference)
}

# Standardise each variable.
z_score_sales <- standardize(sales)
z_score_tv <- standardize(tv)
z_score_radio <- standardize(radio)
z_score_newspaper <- standardize(newspaper)

# Histogram of each set of z-scores (bins 0.3 standard deviations wide).
qplot(x = z_score_sales, geom = "histogram", binwidth = 0.3)
qplot(x = z_score_tv, geom = "histogram", binwidth = 0.3)
qplot(x = z_score_radio, geom = "histogram", binwidth = 0.3)
qplot(x = z_score_newspaper, geom = "histogram", binwidth = 0.3)

# z-score of a single sales value relative to the sales distribution.
x <- 26.7
z_score_calculatecor <- standardize(x, reference = sales)
z_score_calculatecor
[1] 2.429824