For your assignment, you may be using a different dataset than the one included here.
Always read the instructions on Sakai carefully.
Tasks/questions to be completed/answered are highlighted in larger bolded fonts and numbered according to their section.
We are going to use tidyverse, a collection of R packages designed for data science.
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.7.2 ✔ stringr 1.2.0
## ✔ readr 1.1.1 ✔ forcats 0.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
Name your dataset ‘mydata’ so it is easy to work with.
Commands: read_csv() head() max() min() var() sd()
# Load the advertising data. The first CSV column has no header, so readr
# names it X1 (see the warning echoed below).
mydata <- read_csv(file = "data/Advertising.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## TV = col_double(),
## radio = col_double(),
## newspaper = col_double(),
## sales = col_double()
## )
# Preview the first rows of the data.
head(x = mydata)
# Give the auto-named index column a descriptive name.
mydata <- rename(mydata, case_number = X1)
# Pull each measured column out as a standalone vector for the sections below.
TV <- mydata[["TV"]]
radio <- mydata[["radio"]]
newspaper <- mydata[["newspaper"]]
sales <- mydata[["sales"]]
RADIO
# Descriptive statistics for radio. Wrapping an assignment in parentheses
# stores the value and prints it in one step.
# Maximum
(radio_max <- max(mydata[["radio"]]))
## [1] 49.6
# Minimum
(radio_min <- min(mydata[["radio"]]))
## [1] 0
# Range = maximum - minimum
(radio_range <- radio_max - radio_min)
## [1] 49.6
# Mean
(radio_mean <- mean(mydata[["radio"]]))
## [1] 23.264
# Standard deviation
(radio_sd <- sd(mydata[["radio"]]))
## [1] 14.84681
# Variance
(radio_variance <- var(mydata[["radio"]]))
## [1] 220.4277
TV
# Descriptive statistics for TV. Wrapping an assignment in parentheses
# stores the value and prints it in one step.
# Maximum
(TV_max <- max(TV))
## [1] 296.4
# Minimum
(TV_min <- min(TV))
## [1] 0.7
# Range = maximum - minimum
(TV_range <- TV_max - TV_min)
## [1] 295.7
# Mean
(TV_mean <- mean(TV))
## [1] 147.0425
# Standard deviation
(TV_sd <- sd(TV))
## [1] 85.85424
# Variance
(TV_variance <- var(TV))
## [1] 7370.95
NEWSPAPER
# Descriptive statistics for newspaper. Wrapping an assignment in parentheses
# stores the value and prints it in one step.
# Maximum
(newspaper_max <- max(newspaper))
## [1] 114
# Minimum
(newspaper_min <- min(newspaper))
## [1] 0.3
# Range = maximum - minimum
(newspaper_range <- newspaper_max - newspaper_min)
## [1] 113.7
# Mean
(newspaper_mean <- mean(newspaper))
## [1] 30.554
# Standard deviation
(newspaper_sd <- sd(newspaper))
## [1] 21.77862
# Variance
(newspaper_variance <- var(newspaper))
## [1] 474.3083
SALES
# Descriptive statistics for sales. Wrapping an assignment in parentheses
# stores the value and prints it in one step.
# Maximum
(sales_max <- max(sales))
## [1] 27
# Minimum
(sales_min <- min(sales))
## [1] 1.6
# Range = maximum - minimum
(sales_range <- sales_max - sales_min)
## [1] 25.4
# Mean
(sales_mean <- mean(sales))
## [1] 14.0225
# Standard deviation
(sales_sd <- sd(sales))
## [1] 5.217457
# Variance
(sales_variance <- var(sales))
## [1] 27.22185
The max for radio is much lower than the max for TV and newspaper. The minimums are relatively similar. The range for TV is very large. The variance for TV is very large. The sales max and variance are similar (27 and about 27.2).
# Per-column summary of mydata: minimum, quartiles, median, mean and maximum.
summary(mydata)
## case_number TV radio newspaper
## Min. : 1.00 Min. : 0.70 Min. : 0.000 Min. : 0.30
## 1st Qu.: 50.75 1st Qu.: 74.38 1st Qu.: 9.975 1st Qu.: 12.75
## Median :100.50 Median :149.75 Median :22.900 Median : 25.75
## Mean :100.50 Mean :147.04 Mean :23.264 Mean : 30.55
## 3rd Qu.:150.25 3rd Qu.:218.82 3rd Qu.:36.525 3rd Qu.: 45.10
## Max. :200.00 Max. :296.40 Max. :49.600 Max. :114.00
## sales
## Min. : 1.60
## 1st Qu.:10.38
## Median :12.90
## Mean :14.02
## 3rd Qu.:17.40
## Max. :27.00
OUTLIERS FOR TV
# Outlier detection for TV using the 1.5 * IQR rule: a value is an outlier
# when it lies more than 1.5 interquartile ranges below the first quartile or
# above the third quartile.
quantile(TV, na.rm = TRUE)
## 0% 25% 50% 75% 100%
## 0.700 74.375 149.750 218.825 296.400
# Compute both quartiles in one call. unname() drops the "25%"/"75%" labels so
# that values derived from them are not printed under a misleading label.
quartiles_TV <- unname(quantile(TV, probs = c(0.25, 0.75), na.rm = TRUE))
lowerq_TV <- quartiles_TV[1]
upperq_TV <- quartiles_TV[2]
iqr_TV <- upperq_TV - lowerq_TV
iqr_TV
## [1] 144.45
upper_threshold_TV <- (iqr_TV * 1.5) + upperq_TV
upper_threshold_TV
## [1] 435.5
lower_threshold_TV <- lowerq_TV - (iqr_TV * 1.5)
lower_threshold_TV
## [1] -142.3
# which() keeps only positions where the comparison is TRUE, so a missing
# value can never be selected as an NA element or an all-NA row.
high_TV <- which(TV > upper_threshold_TV)
low_TV <- which(TV < lower_threshold_TV)
# Outlying values
TV[high_TV]
## numeric(0)
TV[low_TV]
## numeric(0)
# Full rows of mydata that hold the outlying values
mydata[high_TV, ]
mydata[low_TV, ]
# Number of outliers on each side
count(mydata[high_TV, ])
count(mydata[low_TV, ])
There are 0 outliers in the TV data. Outliers should be removed if there are any as they will skew the data and therefore give us inaccurate readings. Since there are none, there will be no removal.
OUTLIERS FOR RADIO
# Outlier detection for radio using the 1.5 * IQR rule: a value is an outlier
# when it lies more than 1.5 interquartile ranges below the first quartile or
# above the third quartile.
quantile(radio, na.rm = TRUE)
## 0% 25% 50% 75% 100%
## 0.000 9.975 22.900 36.525 49.600
# Compute both quartiles in one call. unname() drops the "25%"/"75%" labels so
# that values derived from them are not printed under a misleading label.
quartiles_radio <- unname(quantile(radio, probs = c(0.25, 0.75), na.rm = TRUE))
lowerq_radio <- quartiles_radio[1]
upperq_radio <- quartiles_radio[2]
iqr_radio <- upperq_radio - lowerq_radio
iqr_radio
## [1] 26.55
upper_threshold_radio <- (iqr_radio * 1.5) + upperq_radio
upper_threshold_radio
## [1] 76.35
lower_threshold_radio <- lowerq_radio - (iqr_radio * 1.5)
lower_threshold_radio
## [1] -29.85
# which() keeps only positions where the comparison is TRUE, so a missing
# value can never be selected as an NA element or an all-NA row.
high_radio <- which(radio > upper_threshold_radio)
low_radio <- which(radio < lower_threshold_radio)
# Outlying values
radio[high_radio]
## numeric(0)
radio[low_radio]
## numeric(0)
# Full rows of mydata that hold the outlying values
mydata[high_radio, ]
mydata[low_radio, ]
# Number of outliers on each side
count(mydata[high_radio, ])
count(mydata[low_radio, ])
There are 0 outliers for radio.
OUTLIERS FOR NEWSPAPER
# Outlier detection for newspaper using the 1.5 * IQR rule: a value is an
# outlier when it lies more than 1.5 interquartile ranges below the first
# quartile or above the third quartile.
# na.rm = TRUE is passed here as well so every quantile() call in this
# analysis treats missing values the same way.
quantile(newspaper, na.rm = TRUE)
## 0% 25% 50% 75% 100%
## 0.30 12.75 25.75 45.10 114.00
# Compute both quartiles in one call. unname() drops the "25%"/"75%" labels so
# that values derived from them are not printed under a misleading label.
quartiles_newspaper <- unname(
  quantile(newspaper, probs = c(0.25, 0.75), na.rm = TRUE)
)
lowerq_newspaper <- quartiles_newspaper[1]
upperq_newspaper <- quartiles_newspaper[2]
iqr_newspaper <- upperq_newspaper - lowerq_newspaper
iqr_newspaper
## [1] 32.35
upper_threshold_newspaper <- (iqr_newspaper * 1.5) + upperq_newspaper
upper_threshold_newspaper
## [1] 93.625
lower_threshold_newspaper <- lowerq_newspaper - (iqr_newspaper * 1.5)
lower_threshold_newspaper
## [1] -35.775
# which() keeps only positions where the comparison is TRUE, so a missing
# value can never be selected as an NA element or an all-NA row.
high_newspaper <- which(newspaper > upper_threshold_newspaper)
low_newspaper <- which(newspaper < lower_threshold_newspaper)
# Outlying values
newspaper[high_newspaper]
## [1] 114.0 100.9
newspaper[low_newspaper]
## numeric(0)
# Full rows of mydata that hold the outlying values
mydata[high_newspaper, ]
mydata[low_newspaper, ]
# Number of outliers on each side
count(mydata[high_newspaper, ])
count(mydata[low_newspaper, ])
There are 2 outliers in the newspaper data.
OUTLIERS FOR SALES
# Outlier detection for sales using the 1.5 * IQR rule: a value is an outlier
# when it lies more than 1.5 interquartile ranges below the first quartile or
# above the third quartile.
quantile(sales, na.rm = TRUE)
## 0% 25% 50% 75% 100%
## 1.600 10.375 12.900 17.400 27.000
# Compute both quartiles in one call. unname() drops the "25%"/"75%" labels so
# that values derived from them are not printed under a misleading label.
quartiles_sales <- unname(quantile(sales, probs = c(0.25, 0.75), na.rm = TRUE))
lowerq_sales <- quartiles_sales[1]
upperq_sales <- quartiles_sales[2]
iqr_sales <- upperq_sales - lowerq_sales
iqr_sales
## [1] 7.025
upper_threshold_sales <- (iqr_sales * 1.5) + upperq_sales
upper_threshold_sales
## [1] 27.9375
lower_threshold_sales <- lowerq_sales - (iqr_sales * 1.5)
lower_threshold_sales
## [1] -0.1625
# which() keeps only positions where the comparison is TRUE, so a missing
# value can never be selected as an NA element.
sales[which(sales > upper_threshold_sales)]
## numeric(0)
sales[which(sales < lower_threshold_sales)]
## numeric(0)
There are 0 outliers in sales.
The dataset represents the amount of advertising per medium for each case number. The data contains values for case number, radio, TV, newspaper and sales. It relates to advertising and how it affects sales. TV advertisements had the highest maximum, and radio had the lowest minimum. The largest range and the largest values were in the TV category.
#grid.arrange(VARIABLE_plot1, VARIABLE_plot2, VARIABLE_plot3, VARIABLE_plot4, ncol=2)
#X1 = col_integer(),
#TV = col_double(),
#radio = col_double(),
#newspaper = col_double(),
#sales = col_double()
# One scatter plot per variable, each plotted against the case number.
TV_plot <- ggplot(mydata, aes(case_number, TV)) +
  geom_point()
radio_plot <- ggplot(mydata, aes(case_number, radio)) +
  geom_point()
newspaper_plot <- ggplot(mydata, aes(case_number, newspaper)) +
  geom_point()
sales_plot <- ggplot(mydata, aes(case_number, sales)) +
  geom_point()
# Draw each plot on its own...
print(TV_plot)
print(radio_plot)
print(newspaper_plot)
print(sales_plot)
# ...then all four together in a two-column grid.
grid.arrange(TV_plot, radio_plot, newspaper_plot, sales_plot, ncol = 2)
# Sort the rows by sales (ascending) so every variable can be viewed in order
# of increasing sales.
newdata <- mydata[order(mydata$sales), ]
# Extract case_number from the newdata
case_number <- newdata$case_number
newdata
# new_VARIABLE <- newdata$VARIABLE
new_sales <- newdata$sales
new_radio <- newdata$radio
new_newspaper <- newdata$newspaper
new_TV <- newdata$TV
# Keep x and y in one data frame so the plots do not mix a column of the
# unsorted mydata with free-standing sorted vectors inside aes(); a name used
# in aes() is looked up in `data` before the global environment.
# x is the row's position in sales order.
# NOTE(review): assumes mydata$case_number is 1..n in file order, so the
# plotted points are unchanged and only the x-axis label differs - confirm.
sorted_plot_data <- data.frame(
  sales_rank = seq_along(new_sales),
  new_TV = new_TV,
  new_radio = new_radio,
  new_newspaper = new_newspaper,
  new_sales = new_sales
)
TV_plot2 <- ggplot(data = sorted_plot_data, aes(x = sales_rank, y = new_TV)) +
  geom_point()
radio_plot2 <- ggplot(
  data = sorted_plot_data,
  aes(x = sales_rank, y = new_radio)
) +
  geom_point()
newspaper_plot2 <- ggplot(
  data = sorted_plot_data,
  aes(x = sales_rank, y = new_newspaper)
) +
  geom_point()
sales_plot2 <- ggplot(
  data = sorted_plot_data,
  aes(x = sales_rank, y = new_sales)
) +
  geom_point()
grid.arrange(TV_plot2, radio_plot2, newspaper_plot2, sales_plot2, ncol = 2)
# Standardise sales: subtract the mean and divide by the standard deviation,
# giving one z-score per observation.
z_score <- (sales - mean(sales)) / sd(sales)
# Histogram of the z-scores with bins 0.3 wide. Built with ggplot() and
# geom_histogram() because qplot() is deprecated in current ggplot2.
ggplot(data.frame(z_score = z_score), aes(x = z_score)) +
  geom_histogram(binwidth = 0.3)
The z-score histogram shows a bell curve. The highest point is at a z-score between roughly -0.5 and -0.2. The highest z-score is around 2.4 and the lowest is approximately -2.4.
# z-score of a single sales value of 26.7, measured against the mean and
# standard deviation of sales. The parentheses store the result and print it.
x <- 26.7
(z_scores <- (x - mean(sales)) / sd(sales))
## [1] 2.429824
I would rate the $26,700 as very good. It is at the highest end of the z-score bell curve. This means it is a highly rated anomaly.