Use the data for the breakfast cereals example in Section 4.8 to explore and summarize the data.
# Load the cereals data set into a data frame.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running elsewhere.
Cereals <- read.csv(
  file = "D:/MSBA/3-Winter 2020/560/data/Cereals.csv"
)
Which variables are quantitative/numerical? Which are ordinal? Which are nominal?
# Preview the first six rows to inspect the column names and value types.
head(Cereals, n = 6L)
## name mfr type calories protein fat sodium fiber carbo
## 1 100%_Bran N C 70 4 1 130 10.0 5.0
## 2 100%_Natural_Bran Q C 120 3 5 15 2.0 8.0
## 3 All-Bran K C 70 4 1 260 9.0 7.0
## 4 All-Bran_with_Extra_Fiber K C 50 4 0 140 14.0 8.0
## 5 Almond_Delight R C 110 2 2 200 1.0 14.0
## 6 Apple_Cinnamon_Cheerios G C 110 2 2 180 1.5 10.5
## sugars potass vitamins shelf weight cups rating
## 1 6 280 25 3 1 0.33 68.40297
## 2 8 135 0 3 1 1.00 33.98368
## 3 5 320 25 3 1 0.33 59.42551
## 4 0 330 25 3 1 0.50 93.70491
## 5 8 NA 25 3 1 0.75 34.38484
## 6 10 70 25 1 1 0.75 29.50954
Compute the mean, median, min, max, and standard deviation for each of the quantitative variables.
# Summary statistics (mean, median, min, max, standard deviation) for the
# columns in positions 4 to 16, ignoring missing values.
# One vapply() pass computes all five statistics per column and enforces that
# each column yields exactly five numeric values; the result is a 5 x k matrix
# (statistics in rows, variables in columns), transposed so that each variable
# becomes a row.
CSUM <- as.data.frame(t(vapply(
  Cereals[, 4:16],
  function(x) {
    c(
      mean(x, na.rm = TRUE),
      median(x, na.rm = TRUE),
      min(x, na.rm = TRUE),
      max(x, na.rm = TRUE),
      sd(x, na.rm = TRUE)
    )
  },
  numeric(5)
)))
# Display labels for the five statistics, in the order computed above.
colnames(CSUM) <- c("mean", "Median", "Min", "Max", "Standard Deviation")
CSUM
## mean Median Min Max Standard Deviation
## calories 106.883117 110.00000 50.00000 160.00000 19.4841191
## protein 2.545455 3.00000 1.00000 6.00000 1.0947897
## fat 1.012987 1.00000 0.00000 5.00000 1.0064726
## sodium 159.675325 180.00000 0.00000 320.00000 83.8322952
## fiber 2.151948 2.00000 0.00000 14.00000 2.3833640
## carbo 14.802632 14.50000 5.00000 23.00000 3.9073256
## sugars 7.026316 7.00000 0.00000 15.00000 4.3786564
## potass 98.666667 90.00000 15.00000 330.00000 70.4106360
## vitamins 28.246753 25.00000 0.00000 100.00000 22.3425225
## shelf 2.207792 2.00000 1.00000 3.00000 0.8325241
## weight 1.029610 1.00000 0.50000 1.50000 0.1504768
## cups 0.821039 0.75000 0.25000 1.50000 0.2327161
## rating 42.665705 40.40021 18.04285 93.70491 14.0472887
Plot a histogram for each of the quantitative variables.
library(tidyr)
library (ggplot2)
# Stack every column of the data frame into key/value pairs and preview the
# first rows. gather() emits a warning here because the columns' attributes
# are not identical, so they are dropped.
head(gather(Cereals))
## Warning: attributes are not identical across measure variables;
## they will be dropped
## key value
## 1 name 100%_Bran
## 2 name 100%_Natural_Bran
## 3 name All-Bran
## 4 name All-Bran_with_Extra_Fiber
## 5 name Almond_Delight
## 6 name Apple_Cinnamon_Cheerios
# One histogram per variable: columns 4 to 16 are stacked into key/value
# pairs, then faceted by variable name. Each panel gets its own x-axis range
# because the variables are on different scales. Non-finite (missing) values
# are removed by the binning step with a warning.
ggplot(data = gather(Cereals[, 4:16]), mapping = aes(x = value)) +
  facet_wrap(facets = ~key, scales = "free_x") +
  geom_histogram(bins = 10)
## Warning: Removed 4 rows containing non-finite values (stat_bin).
Plot a side-by-side boxplot comparing the calories in hot vs. cold cereals.
# Side-by-side boxplots of calories, one box per level of `type`.
boxplot(
  calories ~ type,
  data = Cereals,
  main = "Cold (C) vs. Hot (H) cereals",
  xlab = "Type of cereals",
  ylab = "# of calories",
  # Box: filled, outline suppressed (line type 0), highlighted median line.
  col = "blueviolet", border = "black", boxlty = 0,
  medcol = "darkgoldenrod1",
  # Whiskers, staples and outlier markers.
  whisklty = 1, staplelwd = 4,
  outpch = 13, outcex = 1, outcol = "darkslateblue"
)
Plot a side-by-side boxplot of consumer rating as a function of the shelf height.
# Side-by-side boxplots of rating, one box per shelf value.
# The boxes are drawn horizontally, so the rating values run along the x-axis
# and the shelf groups along the y-axis; the axis labels follow that layout.
boxplot(
  rating ~ shelf,
  data = Cereals,
  horizontal = TRUE,
  main = "Impact of Shelf Size on Customer Ratings",
  xlab = "Ratings",
  ylab = "Shelf",
  # Box: filled, outline suppressed (line type 0), highlighted median line.
  col = "blueviolet", border = "black", boxlty = 0,
  medcol = "darkgoldenrod1",
  # Whiskers, staples and outlier markers.
  whisklty = 1, staplelwd = 4,
  outpch = 13, outcex = 1, outcol = "darkslateblue"
)
Compute the correlation table for the quantitative variables.
# Pairwise correlation matrix for columns 4 to 16. Rows containing any missing
# value are excluded from every pair ("complete.obs").
cmCereal <- cor(x = Cereals[, 4:16], use = "complete.obs")
# Show the matrix rounded to two decimals for readability.
round(cmCereal, digits = 2)
## calories protein fat sodium fiber carbo sugars potass vitamins shelf
## calories 1.00 0.03 0.51 0.30 -0.30 0.27 0.57 -0.07 0.26 0.09
## protein 0.03 1.00 0.20 0.01 0.51 -0.04 -0.29 0.58 0.05 0.20
## fat 0.51 0.20 1.00 0.00 0.01 -0.28 0.29 0.20 -0.03 0.28
## sodium 0.30 0.01 0.00 1.00 -0.07 0.33 0.04 -0.04 0.33 -0.12
## fiber -0.30 0.51 0.01 -0.07 1.00 -0.38 -0.15 0.91 -0.04 0.31
## carbo 0.27 -0.04 -0.28 0.33 -0.38 1.00 -0.45 -0.37 0.25 -0.19
## sugars 0.57 -0.29 0.29 0.04 -0.15 -0.45 1.00 0.00 0.07 0.06
## potass -0.07 0.58 0.20 -0.04 0.91 -0.37 0.00 1.00 0.00 0.39
## vitamins 0.26 0.05 -0.03 0.33 -0.04 0.25 0.07 0.00 1.00 0.28
## shelf 0.09 0.20 0.28 -0.12 0.31 -0.19 0.06 0.39 0.28 1.00
## weight 0.70 0.23 0.22 0.31 0.25 0.14 0.46 0.42 0.32 0.19
## cups 0.09 -0.24 -0.16 0.12 -0.51 0.36 -0.03 -0.50 0.13 -0.35
## rating -0.69 0.47 -0.41 -0.38 0.60 0.06 -0.76 0.42 -0.21 0.05
## weight cups rating
## calories 0.70 0.09 -0.69
## protein 0.23 -0.24 0.47
## fat 0.22 -0.16 -0.41
## sodium 0.31 0.12 -0.38
## fiber 0.25 -0.51 0.60
## carbo 0.14 0.36 0.06
## sugars 0.46 -0.03 -0.76
## potass 0.42 -0.50 0.42
## vitamins 0.32 0.13 -0.21
## shelf 0.19 -0.35 0.05
## weight 1.00 -0.20 -0.30
## cups -0.20 1.00 -0.22
## rating -0.30 -0.22 1.00
Generate a matrix plot for these variables.
# Make sure to install the "corrplot" and "RColorBrewer" packages if this is your first time using them.
library(corrplot)
## corrplot 0.84 loaded
library(RColorBrewer)
# Draw the lower triangle of the correlation matrix, coloured with an
# 8-step "PuOr" palette from RColorBrewer.
corrplot(
  cmCereal,
  type = "lower",
  col = brewer.pal(n = 8, name = "PuOr"),
  # Variable labels: slightly smaller, black, rotated 45 degrees.
  tl.cex = 0.8, tl.col = "black", tl.srt = 45,
  # A non-zero top margin leaves room for the title.
  main = "Correlation matrix",
  mar = c(0, 0, 1, 0)
)
Recreate the correlation table and matrix plot with normalized data.
# Make sure to install the "caret" package if this is your first time using it.
library(caret)
## Loading required package: lattice
# Estimate centering and scaling parameters for columns 4 to 16.
# preProcess() has no `na.rm` argument (it would be swallowed by `...` and
# ignored); the argument that controls missing-value handling is `na.remove`.
normCereal <- preProcess(
  Cereals[, 4:16],
  method = c("center", "scale"),
  na.remove = TRUE
)
# Apply the fitted transformation. Despite its name, NormcmCereal holds the
# normalized data, not a correlation matrix.
NormcmCereal <- predict(normCereal, Cereals[, 4:16])
# Correlation matrix of the normalized data, using only complete rows.
# Correlation is unaffected by centering and scaling, so this table is
# expected to match the one computed from the raw data.
cmCereal2 <- cor(NormcmCereal, use = "complete.obs")
round(cmCereal2, 2)
## calories protein fat sodium fiber carbo sugars potass vitamins shelf
## calories 1.00 0.03 0.51 0.30 -0.30 0.27 0.57 -0.07 0.26 0.09
## protein 0.03 1.00 0.20 0.01 0.51 -0.04 -0.29 0.58 0.05 0.20
## fat 0.51 0.20 1.00 0.00 0.01 -0.28 0.29 0.20 -0.03 0.28
## sodium 0.30 0.01 0.00 1.00 -0.07 0.33 0.04 -0.04 0.33 -0.12
## fiber -0.30 0.51 0.01 -0.07 1.00 -0.38 -0.15 0.91 -0.04 0.31
## carbo 0.27 -0.04 -0.28 0.33 -0.38 1.00 -0.45 -0.37 0.25 -0.19
## sugars 0.57 -0.29 0.29 0.04 -0.15 -0.45 1.00 0.00 0.07 0.06
## potass -0.07 0.58 0.20 -0.04 0.91 -0.37 0.00 1.00 0.00 0.39
## vitamins 0.26 0.05 -0.03 0.33 -0.04 0.25 0.07 0.00 1.00 0.28
## shelf 0.09 0.20 0.28 -0.12 0.31 -0.19 0.06 0.39 0.28 1.00
## weight 0.70 0.23 0.22 0.31 0.25 0.14 0.46 0.42 0.32 0.19
## cups 0.09 -0.24 -0.16 0.12 -0.51 0.36 -0.03 -0.50 0.13 -0.35
## rating -0.69 0.47 -0.41 -0.38 0.60 0.06 -0.76 0.42 -0.21 0.05
## weight cups rating
## calories 0.70 0.09 -0.69
## protein 0.23 -0.24 0.47
## fat 0.22 -0.16 -0.41
## sodium 0.31 0.12 -0.38
## fiber 0.25 -0.51 0.60
## carbo 0.14 0.36 0.06
## sugars 0.46 -0.03 -0.76
## potass 0.42 -0.50 0.42
## vitamins 0.32 0.13 -0.21
## shelf 0.19 -0.35 0.05
## weight 1.00 -0.20 -0.30
## cups -0.20 1.00 -0.22
## rating -0.30 -0.22 1.00
# Draw the lower triangle of the correlation matrix of the normalized data,
# coloured with an 8-step "PuOr" palette from RColorBrewer.
corrplot(
  cmCereal2,
  type = "lower",
  col = brewer.pal(n = 8, name = "PuOr"),
  # Variable labels: slightly smaller, black, rotated 45 degrees.
  tl.cex = 0.8, tl.col = "black", tl.srt = 45,
  # A non-zero top margin leaves room for the title.
  main = "Normalized Correlation matrix",
  mar = c(0, 0, 1, 0)
)