Data

Use the data for the breakfast cereals example in Section 4.8 to explore and summarize the data.

# Read the breakfast-cereal data set from the author's local course folder.
Cereals <- read.csv(file = "D:/MSBA/3-Winter 2020/560/data/Cereals.csv")

Part A

Which variables are quantitative/numerical? Which are ordinal? Which are nominal?

# Preview the first six rows to inspect how each column is recorded.
head(Cereals, n = 6L)
##                        name mfr type calories protein fat sodium fiber carbo
## 1                 100%_Bran   N    C       70       4   1    130  10.0   5.0
## 2         100%_Natural_Bran   Q    C      120       3   5     15   2.0   8.0
## 3                  All-Bran   K    C       70       4   1    260   9.0   7.0
## 4 All-Bran_with_Extra_Fiber   K    C       50       4   0    140  14.0   8.0
## 5            Almond_Delight   R    C      110       2   2    200   1.0  14.0
## 6   Apple_Cinnamon_Cheerios   G    C      110       2   2    180   1.5  10.5
##   sugars potass vitamins shelf weight cups   rating
## 1      6    280       25     3      1 0.33 68.40297
## 2      8    135        0     3      1 1.00 33.98368
## 3      5    320       25     3      1 0.33 59.42551
## 4      0    330       25     3      1 0.50 93.70491
## 5      8     NA       25     3      1 0.75 34.38484
## 6     10     70       25     1      1 0.75 29.50954

Part B

Compute the mean, median, min, max, and standard deviation for each of the quantitative variables.

# Summary statistics for columns 4-16 (calories through rating), ignoring
# missing values. Each statistic is applied column-wise and becomes one column
# of the result; rows are named after the cereal variables.
stat_funs <- list(mean = mean, median = median, min = min, max = max, sd = sd)
CSUM <- do.call(
  data.frame,
  lapply(stat_funs, function(stat) sapply(Cereals[, 4:16], stat, na.rm = TRUE))
)
# Set display names afterwards so the name containing a space is kept as-is.
colnames(CSUM) <- c("mean", "Median", "Min", "Max", "Standard Deviation")
CSUM
##                mean    Median      Min       Max Standard Deviation
## calories 106.883117 110.00000 50.00000 160.00000         19.4841191
## protein    2.545455   3.00000  1.00000   6.00000          1.0947897
## fat        1.012987   1.00000  0.00000   5.00000          1.0064726
## sodium   159.675325 180.00000  0.00000 320.00000         83.8322952
## fiber      2.151948   2.00000  0.00000  14.00000          2.3833640
## carbo     14.802632  14.50000  5.00000  23.00000          3.9073256
## sugars     7.026316   7.00000  0.00000  15.00000          4.3786564
## potass    98.666667  90.00000 15.00000 330.00000         70.4106360
## vitamins  28.246753  25.00000  0.00000 100.00000         22.3425225
## shelf      2.207792   2.00000  1.00000   3.00000          0.8325241
## weight     1.029610   1.00000  0.50000   1.50000          0.1504768
## cups       0.821039   0.75000  0.25000   1.50000          0.2327161
## rating    42.665705  40.40021 18.04285  93.70491         14.0472887

Part C

Plot a histogram for each of the quantitative variables.

library(tidyr)
library (ggplot2)

# Stack every column into key/value pairs and preview the first rows.
head(gather(Cereals))
## Warning: attributes are not identical across measure variables;
## they will be dropped
##    key                     value
## 1 name                 100%_Bran
## 2 name         100%_Natural_Bran
## 3 name                  All-Bran
## 4 name All-Bran_with_Extra_Fiber
## 5 name            Almond_Delight
## 6 name   Apple_Cinnamon_Cheerios
# One 10-bin histogram per variable in columns 4-16; each facet gets its own
# x-axis range because the variables are on different scales.
Cereals[, 4:16] %>%
  gather() %>%
  ggplot(mapping = aes(x = value)) +
  geom_histogram(bins = 10) +
  facet_wrap(~key, scales = "free_x")
## Warning: Removed 4 rows containing non-finite values (stat_bin).

Part D

Plot a side-by-side boxplot comparing the calories in hot vs. cold cereals.

# Calories by cereal type: one box per level of `type`.
boxplot(
  calories ~ type,
  data = Cereals,
  main = "Cold (C) vs. Hot (H) cereals",
  xlab = "Type of cereals",
  ylab = "# of calories",
  # Styling of the boxes, median line, whiskers, staples and outlier points.
  col = "blueviolet",
  medcol = "darkgoldenrod1",
  boxlty = 0,
  border = "black",
  whisklty = 1,
  staplelwd = 4,
  outpch = 13,
  outcex = 1,
  outcol = "darkslateblue"
)

Part E

Plot a side-by-side boxplot of consumer rating as a function of the shelf height.

# Consumer rating by shelf, drawn horizontally: ratings run along the bottom
# axis and each shelf value gets its own box.
boxplot(
  rating ~ shelf,
  data = Cereals,
  main = "Impact of Shelf Size on Customer Ratings",
  xlab = "Ratings",
  ylab = "Shelf",
  horizontal = TRUE,
  # Styling of the boxes, median line, whiskers, staples and outlier points.
  col = "blueviolet",
  medcol = "darkgoldenrod1",
  boxlty = 0,
  border = "black",
  whisklty = 1,
  staplelwd = 4,
  outpch = 13,
  outcex = 1,
  outcol = "darkslateblue"
)

Part F

Compute the correlation table for the quantitative variables.

# Pairwise correlations among columns 4-16, computed only from rows that have
# no missing values in any of those columns.
cmCereal <- cor(x = Cereals[, 4:16], use = "complete.obs")
round(cmCereal, digits = 2)
##          calories protein   fat sodium fiber carbo sugars potass vitamins shelf
## calories     1.00    0.03  0.51   0.30 -0.30  0.27   0.57  -0.07     0.26  0.09
## protein      0.03    1.00  0.20   0.01  0.51 -0.04  -0.29   0.58     0.05  0.20
## fat          0.51    0.20  1.00   0.00  0.01 -0.28   0.29   0.20    -0.03  0.28
## sodium       0.30    0.01  0.00   1.00 -0.07  0.33   0.04  -0.04     0.33 -0.12
## fiber       -0.30    0.51  0.01  -0.07  1.00 -0.38  -0.15   0.91    -0.04  0.31
## carbo        0.27   -0.04 -0.28   0.33 -0.38  1.00  -0.45  -0.37     0.25 -0.19
## sugars       0.57   -0.29  0.29   0.04 -0.15 -0.45   1.00   0.00     0.07  0.06
## potass      -0.07    0.58  0.20  -0.04  0.91 -0.37   0.00   1.00     0.00  0.39
## vitamins     0.26    0.05 -0.03   0.33 -0.04  0.25   0.07   0.00     1.00  0.28
## shelf        0.09    0.20  0.28  -0.12  0.31 -0.19   0.06   0.39     0.28  1.00
## weight       0.70    0.23  0.22   0.31  0.25  0.14   0.46   0.42     0.32  0.19
## cups         0.09   -0.24 -0.16   0.12 -0.51  0.36  -0.03  -0.50     0.13 -0.35
## rating      -0.69    0.47 -0.41  -0.38  0.60  0.06  -0.76   0.42    -0.21  0.05
##          weight  cups rating
## calories   0.70  0.09  -0.69
## protein    0.23 -0.24   0.47
## fat        0.22 -0.16  -0.41
## sodium     0.31  0.12  -0.38
## fiber      0.25 -0.51   0.60
## carbo      0.14  0.36   0.06
## sugars     0.46 -0.03  -0.76
## potass     0.42 -0.50   0.42
## vitamins   0.32  0.13  -0.21
## shelf      0.19 -0.35   0.05
## weight     1.00 -0.20  -0.30
## cups      -0.20  1.00  -0.22
## rating    -0.30 -0.22   1.00

Part F Cont.

Generate a matrix plot for these variables.

#make sure to install the "corrplot" and "RColorBrewer" packages if this is your first time using them
library(corrplot)
## corrplot 0.84 loaded
library(RColorBrewer)

# Lower-triangle plot of the correlation matrix using an 8-colour
# purple-orange palette.
corrplot(
  cmCereal,
  type = "lower",
  main = "Correlation matrix",
  mar = c(0, 0, 1, 0),
  tl.cex = 0.8,
  tl.col = "black",
  tl.srt = 45,
  col = brewer.pal(n = 8, name = "PuOr")
)

Part F Normalized

Recreate the correlation table and matrix plot with normalized data.

#make sure to install the "caret" package if this is your first time using it
library(caret)
## Loading required package: lattice
# Standardize columns 4-16 (center each to mean 0, scale to unit sd) and
# recompute the correlation table on the standardized values.
# Note: the missing-value option of preProcess() is `na.remove`, not `na.rm`;
# an `na.rm` argument would be swallowed by `...` and silently ignored.
normCereal <- preProcess(
  Cereals[, 4:16],
  method = c("center", "scale"),
  na.remove = TRUE
)
NormcmCereal <- predict(normCereal, Cereals[, 4:16])

# Pearson correlation is unaffected by centering and scaling, so this table is
# expected to match the one computed from the raw columns.
cmCereal2 <- cor(NormcmCereal, use = "complete.obs")
round(cmCereal2, 2)
##          calories protein   fat sodium fiber carbo sugars potass vitamins shelf
## calories     1.00    0.03  0.51   0.30 -0.30  0.27   0.57  -0.07     0.26  0.09
## protein      0.03    1.00  0.20   0.01  0.51 -0.04  -0.29   0.58     0.05  0.20
## fat          0.51    0.20  1.00   0.00  0.01 -0.28   0.29   0.20    -0.03  0.28
## sodium       0.30    0.01  0.00   1.00 -0.07  0.33   0.04  -0.04     0.33 -0.12
## fiber       -0.30    0.51  0.01  -0.07  1.00 -0.38  -0.15   0.91    -0.04  0.31
## carbo        0.27   -0.04 -0.28   0.33 -0.38  1.00  -0.45  -0.37     0.25 -0.19
## sugars       0.57   -0.29  0.29   0.04 -0.15 -0.45   1.00   0.00     0.07  0.06
## potass      -0.07    0.58  0.20  -0.04  0.91 -0.37   0.00   1.00     0.00  0.39
## vitamins     0.26    0.05 -0.03   0.33 -0.04  0.25   0.07   0.00     1.00  0.28
## shelf        0.09    0.20  0.28  -0.12  0.31 -0.19   0.06   0.39     0.28  1.00
## weight       0.70    0.23  0.22   0.31  0.25  0.14   0.46   0.42     0.32  0.19
## cups         0.09   -0.24 -0.16   0.12 -0.51  0.36  -0.03  -0.50     0.13 -0.35
## rating      -0.69    0.47 -0.41  -0.38  0.60  0.06  -0.76   0.42    -0.21  0.05
##          weight  cups rating
## calories   0.70  0.09  -0.69
## protein    0.23 -0.24   0.47
## fat        0.22 -0.16  -0.41
## sodium     0.31  0.12  -0.38
## fiber      0.25 -0.51   0.60
## carbo      0.14  0.36   0.06
## sugars     0.46 -0.03  -0.76
## potass     0.42 -0.50   0.42
## vitamins   0.32  0.13  -0.21
## shelf      0.19 -0.35   0.05
## weight     1.00 -0.20  -0.30
## cups      -0.20  1.00  -0.22
## rating    -0.30 -0.22   1.00
# Lower-triangle plot of the correlation matrix of the standardized data,
# using the same 8-colour purple-orange palette.
corrplot(
  cmCereal2,
  type = "lower",
  main = "Normalized Correlation matrix",
  mar = c(0, 0, 1, 0),
  tl.cex = 0.8,
  tl.col = "black",
  tl.srt = 45,
  col = brewer.pal(n = 8, name = "PuOr")
)