Notebook Instructions


Load Packages in R/RStudio

We are going to use tidyverse, a collection of R packages designed for data science.

## Loading required package: tidyverse
## -- Attaching packages ----------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.2     v dplyr   0.7.4
## v tidyr   0.7.2     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0
## -- Conflicts -------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## Loading required package: gridExtra
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine

Task 1: Quantitative Analysis


1A) Read the csv file into R Studio and display the dataset.

  • Name your dataset ‘mydata’ so it is easy to work with.

  • Commands: read_csv() head() max() min() var() sd()

Extract the assigned features (columns) to perform some analytics.

# Load the advertising data from the working directory into a data frame.
mydata <- read.csv(file = "advertising.csv")
# Display the loaded dataset.
mydata

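Change the variable name “X1” to case_number using the function rename(). (When the file is loaded with read.csv(), as above, the unnamed first column is called “X” rather than “X1”.)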

  • mydata <- rename(mydata, "NEW_VAR_NAME" = "OLD_VAR_NAME")
# Give the row-index column "X" the descriptive name "case_number".
mydata <- mydata %>%
  rename(case_number = X)
mydata

1B) Find the range (difference between min and max), min, max, standard deviation and variance for each assigned feature (use separate chunks for each feature). Compare the features and note any significant differences.

TV

# Maximum of TV
maxTV <- max(mydata$TV)
maxTV
## [1] 296.4
# Minimum of TV
minTV <- min(mydata$TV)
minTV
## [1] 0.7
# Range of TV: difference between the maximum and the minimum
rangeTV <- maxTV - minTV
rangeTV
## [1] 295.7
# Mean of TV
meanTV <- mean(mydata$TV)
meanTV
## [1] 147.0425
# Standard deviation of TV
sdTV <- sd(mydata$TV)
sdTV
## [1] 85.85424
# Variance of TV
varianceTV <- var(mydata$TV)
varianceTV
## [1] 7370.95

radio

# Maximum of radio
maxradio <- max(mydata$radio)
maxradio
## [1] 49.6
# Minimum of radio
minradio <- min(mydata$radio)
minradio
## [1] 0
# Range of radio: difference between the maximum and the minimum
rangeradio <- maxradio - minradio
rangeradio
## [1] 49.6
# Mean of radio
meanradio <- mean(mydata$radio)
meanradio
## [1] 23.264
# Standard deviation of radio
sdradio <- sd(mydata$radio)
sdradio
## [1] 14.84681
# Variance of radio
varianceradio <- var(mydata$radio)
varianceradio
## [1] 220.4277

newspaper

# Maximum of newspaper
maxnewspaper <- max(mydata$newspaper)
maxnewspaper
## [1] 114
# Minimum of newspaper
minnewspaper <- min(mydata$newspaper)
minnewspaper
## [1] 0.3
# Range of newspaper: difference between the maximum and the minimum
rangenewspaper <- maxnewspaper - minnewspaper
rangenewspaper
## [1] 113.7
# Mean of newspaper
meannewspaper <- mean(mydata$newspaper)
meannewspaper
## [1] 30.554
# Standard deviation of newspaper
sdnewspaper <- sd(mydata$newspaper)
sdnewspaper
## [1] 21.77862
# Variance of newspaper
variancenewspaper <- var(mydata$newspaper)
variancenewspaper
## [1] 474.3083

sales

# Maximum of sales
maxsales <- max(mydata$sales)
maxsales
## [1] 27
# Minimum of sales
minsales <- min(mydata$sales)
minsales
## [1] 1.6
# Range of sales: difference between the maximum and the minimum
rangesales <- maxsales - minsales
rangesales
## [1] 25.4
# Mean of sales
meansales <- mean(mydata$sales)
meansales
## [1] 14.0225
# Standard deviation of sales
sdsales <- sd(mydata$sales)
sdsales
## [1] 5.217457
# Variance of sales
variancesales <- var(mydata$sales)
variancesales
## [1] 27.22185

1C) Use the summary() function on the whole dataset to give you a general description of the data. Note any differences between features.

# Five-number summary plus mean for every column of the dataset
summary(object = mydata)
##   case_number           TV             radio          newspaper     
##  Min.   :  1.00   Min.   :  0.70   Min.   : 0.000   Min.   :  0.30  
##  1st Qu.: 50.75   1st Qu.: 74.38   1st Qu.: 9.975   1st Qu.: 12.75  
##  Median :100.50   Median :149.75   Median :22.900   Median : 25.75  
##  Mean   :100.50   Mean   :147.04   Mean   :23.264   Mean   : 30.55  
##  3rd Qu.:150.25   3rd Qu.:218.82   3rd Qu.:36.525   3rd Qu.: 45.10  
##  Max.   :200.00   Max.   :296.40   Max.   :49.600   Max.   :114.00  
##      sales      
##  Min.   : 1.60  
##  1st Qu.:10.38  
##  Median :12.90  
##  Mean   :14.02  
##  3rd Qu.:17.40  
##  Max.   :27.00
# Same summary, one feature at a time
summary(mydata[["TV"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.70   74.38  149.75  147.04  218.82  296.40
summary(mydata[["radio"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   9.975  22.900  23.264  36.525  49.600
summary(mydata[["newspaper"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.30   12.75   25.75   30.55   45.10  114.00
summary(mydata[["sales"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.60   10.38   12.90   14.02   17.40   27.00

Are there any outliers? If not, explain the lack of outliers. If there are, explain what the outliers represent and how many records are outliers. (Use code from notebook-03 to find outliers.)

# First (25%) and third (75%) quartiles of each feature, requested explicitly
# by probability.
lowerquantileTV <- quantile(mydata$TV, probs = 0.25)
upperquantileTV <- quantile(mydata$TV, probs = 0.75)

lowerquantileradio <- quantile(mydata$radio, probs = 0.25)
upperquantileradio <- quantile(mydata$radio, probs = 0.75)

lowerquantilenewspaper <- quantile(mydata$newspaper, probs = 0.25)
upperquantilenewspaper <- quantile(mydata$newspaper, probs = 0.75)

lowerquantilesales <- quantile(mydata$sales, probs = 0.25)
upperquantilesales <- quantile(mydata$sales, probs = 0.75)

IQR calculations

# Interquartile range of each feature: third quartile minus first quartile
iqrTV <- upperquantileTV - lowerquantileTV

iqrradio <- upperquantileradio - lowerquantileradio

iqrnewspaper <- upperquantilenewspaper - lowerquantilenewspaper

iqrsales <- upperquantilesales - lowerquantilesales

Upper threshold (UT)

# Upper outlier threshold of each feature: third quartile + 1.5 * IQR
UTTV <- (iqrTV * 1.5) + upperquantileTV

UTradio <- (iqrradio * 1.5) + upperquantileradio

UTnewspaper <- (iqrnewspaper * 1.5) + upperquantilenewspaper

UTsales <- (iqrsales * 1.5) + upperquantilesales

Lower threshold (LT)

# Lower outlier threshold of each feature: first quartile - 1.5 * IQR
LTTV <- lowerquantileTV - (iqrTV * 1.5)

LTradio <- lowerquantileradio - (iqrradio * 1.5)

LTnewspaper <- lowerquantilenewspaper - (iqrnewspaper * 1.5)

LTsales <- lowerquantilesales - (iqrsales * 1.5)
# Count the records above the upper threshold and below the lower threshold
# for each feature. filter() keeps only rows where the comparison is TRUE, so
# a missing value is never tallied as an outlier.
count(filter(mydata, TV > UTTV))
count(filter(mydata, TV < LTTV))
count(filter(mydata, radio > UTradio))
count(filter(mydata, radio < LTradio))
count(filter(mydata, newspaper > UTnewspaper))
count(filter(mydata, newspaper < LTnewspaper))
count(filter(mydata, sales > UTsales))
count(filter(mydata, sales < LTsales))

There are two outliers that surpass the upper threshold for newspaper. These outliers are two data points that differ drastically from the pattern of the rest of the dataset and lie above the upper threshold calculated from the data.

1D) Write a general description of the dataset using the statistics found in the steps above. Use the min,max range to compare the features, note any significant differences.

TV had a max of 296.4 and a min of 0.7, making the range 295.7. The mean for TV was around 147.04, with a standard deviation of 85.85 and a variance of 7370.95. Radio had a max of 49.6 and a min of 0, making the range 49.6. The mean for radio was around 23.26, with a standard deviation of 14.85 and a variance of 220.43. Newspaper had a max of 114 and a min of 0.3, making the range 113.7. The mean for newspaper was 30.55, with a standard deviation of 21.78 and a variance of 474.31. Lastly, sales had a max of 27 and a min of 1.6, making the range 25.4. The mean for sales was 14.02, with a standard deviation of 5.22 and a variance of 27.22. The highest max was that of TV and the lowest max was that of sales. The highest min was that of sales and the lowest min was that of radio. The highest range was that of TV and the lowest range was that of sales. The highest mean was that of TV and the lowest mean was again that of sales (its values fall in the narrowest band, from 1.6 to 27, so its low max keeps the mean down even though it has the highest min). The highest standard deviation and variance were those of TV, while the lowest standard deviation and variance were those of sales.


Task 2: Qualitative Analysis


2A) Plot each assigned feature on the y-axis, using case_number for the x-axis. Use the given commands to create each plot and create a grid to plot all features. Note any trends/patterns in the data.

  • Commands: VARIABLE_plot <- ggplot(data = mydata, aes(x = VARIABLE, y = VARIABLE)) + geom_point()
  • Commands: grid.arrange(VARIABLE_plot1, VARIABLE_plot2, VARIABLE_plot3, VARIABLE_plot4, ncol=2)
# Scatter plot of each feature against case_number. Columns are referenced by
# bare name inside aes() so they are looked up in `data` and the axes are
# labelled with the column names.
TV_plot <- ggplot(data = mydata, aes(x = case_number, y = TV)) +
  geom_point()
radio_plot <- ggplot(data = mydata, aes(x = case_number, y = radio)) +
  geom_point()
newspaper_plot <- ggplot(data = mydata, aes(x = case_number, y = newspaper)) +
  geom_point()
sales_plot <- ggplot(data = mydata, aes(x = case_number, y = sales)) +
  geom_point()

# Arrange the four plots in a two-column grid.
grid.arrange(sales_plot, TV_plot, radio_plot, newspaper_plot, ncol = 2)

I do not see any patterns in the data.

  • When looking at these plots it is hard to see a particular trend.
  • One way to observe any possible trend in the sales data would be to re-order the data from low to high.
  • The 200 monthly observations are in no particular chronological order.
  • The case numbers are independent sequentially generated numbers. Since each case is independent, we can reorder them.

2B) Re-order sales from low to high, and save the re-ordered data in a new set. As the sales data is re-ordered, the associated values in the other columns follow.

  • Commands: newdata <- mydata[ order(mydata$VARIABLE), ]
# Sort the rows by ascending sales; every other column moves with its row.
newdata <- mydata[order(mydata[["sales"]]), , drop = FALSE]

# Case numbers in the new (sales-sorted) row order
case_number <- newdata[["case_number"]]
head(newdata)

Extract the variables from the new data

# Pull each feature out of the sales-sorted data as its own vector.
new_TV <- newdata[["TV"]]
new_radio <- newdata[["radio"]]
new_newspaper <- newdata[["newspaper"]]
new_sales <- newdata[["sales"]]

Task 3: Standardized Z-Value


3A) Create a histogram of the assigned feature z-scores. Describe the output and note any relevant values.

  • Command: z_score = ( VARIABLE - mean(VARIABLE) ) / sd(VARIABLE)
  • Commands: qplot(x = VARIABLE, geom = "histogram", binwidth = 0.3)
# Standardize each feature: (value - mean) / standard deviation
z_scoreTV <- (mydata$TV - meanTV) / sdTV
z_scoreradio <- (mydata$radio - meanradio) / sdradio
z_scorenewspaper <- (mydata$newspaper - meannewspaper) / sdnewspaper
z_scoresales <- (mydata$sales - meansales) / sdsales

# Histogram of each feature's z-scores, using bins 0.3 wide
qplot(x = z_scoreTV, geom = "histogram", binwidth = 0.3)

qplot(x = z_scoreradio, geom = "histogram", binwidth = 0.3)

qplot(x = z_scorenewspaper, geom = "histogram", binwidth = 0.3)

qplot(x = z_scoresales, geom = "histogram", binwidth = 0.3)

3B) Given a sales value of $26700, calculate the corresponding z-value or z-score.

  • Command: z_score = ( VARIABLE - mean(VARIABLE) ) / sd(VARIABLE)
# z-score of a sales value of 26.7 relative to the observed sales.
# NOTE(review): 26.7 stands in for the $26700 in the question, which assumes
# sales are recorded in thousands -- confirm against the data dictionary.
z_scoresalescalculation <- (26.7 - mean(mydata$sales)) / sd(mydata$sales)
z_scoresalescalculation
## [1] 2.429824

3C) Based on the z-value, how would you rate a $26700 sales value: poor, average, good, or very good performance? Explain your logic.

This is good performance because the z-score is positive, which means the value is above the mean. Z-scores below 1.8 are not good and z-scores above 3 are really good; since this z-score is 2.43, I would classify it as good performance.