library(readxl)
# Hard-coded absolute location of the cereals workbook.
cereals_path <- "D:/UST-Global/docs/CENA-DataScience/TrainingDocs_R_DrVinod/DataSets/cereals_practice.xlsx"
cereals_practice <- read_excel(path = cereals_path)

# Work on a copy named `cp`; `cereals_practice` keeps the data as read.
cp <- cereals_practice
# Print each column's name, type and leading values.
str(object = cp)
## Classes 'tbl_df', 'tbl' and 'data.frame': 77 obs. of 16 variables:
## $ name : chr "100%_Bran" "100%_Natural_Bran" "All-Bran" "All-Bran_with_Extra_Fiber" ...
## $ mfr : chr "N" "Q" "K" "K" ...
## $ type : chr "C" "C" "C" "C" ...
## $ calories: num 70 120 70 50 110 110 110 130 90 90 ...
## $ protein : num 4 3 4 4 2 2 2 3 2 3 ...
## $ fat : num 1 5 1 0 2 2 0 2 1 0 ...
## $ sodium : num 130 15 260 140 200 180 125 210 200 210 ...
## $ fiber : num 10 2 9 14 1 1.5 1 2 4 5 ...
## $ carbo : num 5 8 7 8 14 10.5 11 18 15 13 ...
## $ sugars : num 6 8 5 0 8 10 14 8 6 5 ...
## $ potass : num 280 135 320 330 NA 70 30 100 125 190 ...
## $ vitamins: num 25 0 25 25 25 25 25 25 25 25 ...
## $ shelf : num 3 3 3 3 3 1 2 3 1 3 ...
## $ weight : num 1 1 1 1 1 1 1 1.33 1 1 ...
## $ cups : num 0.33 1 0.33 0.5 0.75 0.75 1 0.75 0.67 0.67 ...
## $ rating : num 68.4 34 59.4 93.7 34.4 ...
# Column-wise summary of the data as read; the "NA's" entries in its output
# show which columns contain missing values.
summary(cp)
## name mfr type calories
## Length:77 Length:77 Length:77 Min. : 50.0
## Class :character Class :character Class :character 1st Qu.:100.0
## Mode :character Mode :character Mode :character Median :110.0
## Mean :106.9
## 3rd Qu.:110.0
## Max. :160.0
##
## protein fat sodium fiber
## Min. :1.000 Min. :0.000 Min. : 0.0 Min. : 0.000
## 1st Qu.:2.000 1st Qu.:0.000 1st Qu.:130.0 1st Qu.: 1.000
## Median :3.000 Median :1.000 Median :180.0 Median : 2.000
## Mean :2.545 Mean :1.013 Mean :159.7 Mean : 2.152
## 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:210.0 3rd Qu.: 3.000
## Max. :6.000 Max. :5.000 Max. :320.0 Max. :14.000
##
## carbo sugars potass vitamins
## Min. : 5.0 Min. : 0.000 Min. : 15.00 Min. : 0.00
## 1st Qu.:12.0 1st Qu.: 3.000 1st Qu.: 42.50 1st Qu.: 25.00
## Median :14.5 Median : 7.000 Median : 90.00 Median : 25.00
## Mean :14.8 Mean : 7.026 Mean : 98.67 Mean : 28.25
## 3rd Qu.:17.0 3rd Qu.:11.000 3rd Qu.:120.00 3rd Qu.: 25.00
## Max. :23.0 Max. :15.000 Max. :330.00 Max. :100.00
## NA's :1 NA's :1 NA's :2
## shelf weight cups rating
## Min. :1.000 Min. :0.50 Min. :0.250 Min. :18.04
## 1st Qu.:1.000 1st Qu.:1.00 1st Qu.:0.670 1st Qu.:33.17
## Median :2.000 Median :1.00 Median :0.750 Median :40.40
## Mean :2.208 Mean :1.03 Mean :0.821 Mean :42.67
## 3rd Qu.:3.000 3rd Qu.:1.00 3rd Qu.:1.000 3rd Qu.:50.83
## Max. :3.000 Max. :1.50 Max. :1.500 Max. :93.70
##
#Fill in (impute) the missing values instead of dropping the affected rows,
#using k-nearest-neighbour imputation (kNN) from the VIM package
#install.packages("VIM")
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# k-nearest-neighbour imputation (VIM::kNN) of the missing values in `cp`.
# Neighbours are found using the numeric columns only. With the default
# (all columns) the character columns name/mfr/type take part in the
# distance computation, which is presumably what raises the repeated
# "NAs introduced by coercion" warnings; `name` is a per-cereal label and
# carries no similarity information anyway.
# `variable` and `imp_var` keep their defaults, so the result still holds
# every original column plus one logical `<column>_imp` flag per column.
# NOTE(review): imputed values may differ slightly from a run that used all
# columns as distance variables.
numeric_vars <- names(cp)[vapply(cp, is.numeric, logical(1))]
cpImpute <- kNN(cp, dist_var = numeric_vars)
## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion
## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion
## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion
## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion
## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion
## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion
## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion
## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion
## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion
## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion
## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion
## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion
## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion
## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion
## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion
## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion
## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion
## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion
library(psych)
# Summary after imputation; the logical `*_imp` columns in its output count
# how many values were filled in per column.
summary(cpImpute)
## name mfr type calories
## Length:77 Length:77 Length:77 Min. : 50.0
## Class :character Class :character Class :character 1st Qu.:100.0
## Mode :character Mode :character Mode :character Median :110.0
## Mean :106.9
## 3rd Qu.:110.0
## Max. :160.0
## protein fat sodium fiber
## Min. :1.000 Min. :0.000 Min. : 0.0 Min. : 0.000
## 1st Qu.:2.000 1st Qu.:0.000 1st Qu.:130.0 1st Qu.: 1.000
## Median :3.000 Median :1.000 Median :180.0 Median : 2.000
## Mean :2.545 Mean :1.013 Mean :159.7 Mean : 2.152
## 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:210.0 3rd Qu.: 3.000
## Max. :6.000 Max. :5.000 Max. :320.0 Max. :14.000
## carbo sugars potass vitamins
## Min. : 5.00 Min. : 0.000 Min. : 15.00 Min. : 0.00
## 1st Qu.:12.00 1st Qu.: 3.000 1st Qu.: 45.00 1st Qu.: 25.00
## Median :14.00 Median : 7.000 Median : 90.00 Median : 25.00
## Mean :14.75 Mean : 7.013 Mean : 98.18 Mean : 28.25
## 3rd Qu.:17.00 3rd Qu.:11.000 3rd Qu.:120.00 3rd Qu.: 25.00
## Max. :23.00 Max. :15.000 Max. :330.00 Max. :100.00
## shelf weight cups rating
## Min. :1.000 Min. :0.50 Min. :0.250 Min. :18.04
## 1st Qu.:1.000 1st Qu.:1.00 1st Qu.:0.670 1st Qu.:33.17
## Median :2.000 Median :1.00 Median :0.750 Median :40.40
## Mean :2.208 Mean :1.03 Mean :0.821 Mean :42.67
## 3rd Qu.:3.000 3rd Qu.:1.00 3rd Qu.:1.000 3rd Qu.:50.83
## Max. :3.000 Max. :1.50 Max. :1.500 Max. :93.70
## name_imp mfr_imp type_imp calories_imp
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:77 FALSE:77 FALSE:77 FALSE:77
##
##
##
##
## protein_imp fat_imp sodium_imp fiber_imp
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:77 FALSE:77 FALSE:77 FALSE:77
##
##
##
##
## carbo_imp sugars_imp potass_imp vitamins_imp
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:76 FALSE:76 FALSE:75 FALSE:77
## TRUE :1 TRUE :1 TRUE :2
##
##
##
## shelf_imp weight_imp cups_imp rating_imp
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:77 FALSE:77 FALSE:77 FALSE:77
##
##
##
##
# Drop the `*_imp` indicator columns that appear in cpImpute: keep only the
# 16 original variables, in their original order, and overwrite `cp`.
original_columns <- c(
  "name", "mfr", "type", "calories", "protein", "fat", "sodium", "fiber",
  "carbo", "sugars", "potass", "vitamins", "shelf", "weight", "cups", "rating"
)
cp <- subset(cpImpute, select = original_columns)
# Summarise the cleaned data.
summary(cp)
## name mfr type calories
## Length:77 Length:77 Length:77 Min. : 50.0
## Class :character Class :character Class :character 1st Qu.:100.0
## Mode :character Mode :character Mode :character Median :110.0
## Mean :106.9
## 3rd Qu.:110.0
## Max. :160.0
## protein fat sodium fiber
## Min. :1.000 Min. :0.000 Min. : 0.0 Min. : 0.000
## 1st Qu.:2.000 1st Qu.:0.000 1st Qu.:130.0 1st Qu.: 1.000
## Median :3.000 Median :1.000 Median :180.0 Median : 2.000
## Mean :2.545 Mean :1.013 Mean :159.7 Mean : 2.152
## 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:210.0 3rd Qu.: 3.000
## Max. :6.000 Max. :5.000 Max. :320.0 Max. :14.000
## carbo sugars potass vitamins
## Min. : 5.00 Min. : 0.000 Min. : 15.00 Min. : 0.00
## 1st Qu.:12.00 1st Qu.: 3.000 1st Qu.: 45.00 1st Qu.: 25.00
## Median :14.00 Median : 7.000 Median : 90.00 Median : 25.00
## Mean :14.75 Mean : 7.013 Mean : 98.18 Mean : 28.25
## 3rd Qu.:17.00 3rd Qu.:11.000 3rd Qu.:120.00 3rd Qu.: 25.00
## Max. :23.00 Max. :15.000 Max. :330.00 Max. :100.00
## shelf weight cups rating
## Min. :1.000 Min. :0.50 Min. :0.250 Min. :18.04
## 1st Qu.:1.000 1st Qu.:1.00 1st Qu.:0.670 1st Qu.:33.17
## Median :2.000 Median :1.00 Median :0.750 Median :40.40
## Mean :2.208 Mean :1.03 Mean :0.821 Mean :42.67
## 3rd Qu.:3.000 3rd Qu.:1.00 3rd Qu.:1.000 3rd Qu.:50.83
## Max. :3.000 Max. :1.50 Max. :1.500 Max. :93.70
#1) How many cereals in each type?
# Frequency table of the `type` column (one count per distinct type code).
# NOTE(review): `counts` is not read by any code visible in this script; the
# pie chart code builds the same table again under the name `a`.
counts<-table(cp$type)
library(plotrix)
##
## Attaching package: 'plotrix'
## The following object is masked from 'package:psych':
##
## rescale
# 3D pie chart of the number of cereals per type.
a <- table(cp$type)
# Attach display names to the type codes by name instead of by position, so
# a label can never end up on the wrong slice. Assumes the codes are "C" and
# "H", as the original positional labels imply; any other code is shown
# as-is.
type_names <- c(C = "Cold", H = "Hot")
grp <- ifelse(names(a) %in% names(type_names), type_names[names(a)], names(a))
# Label format: "<type> <count>#".
lbl <- paste0(grp, " ", as.vector(a), "#")
pie3D(a,
      labels = lbl,
      labelcex = 0.9,
      main = "Cereals' Count in Each Type",
      col = rainbow(length(a)))
#2) How calories and potassium are distributed

# Draw a histogram of `values` and overlay the normal density that has the
# same mean and standard deviation, rescaled from density to counts.
#   values: numeric vector without NA
#   label : variable name used for the x axis label and in the title
#   fill  : bar colour
# Returns the histogram object invisibly.
plot_hist_with_normal <- function(values, label, fill) {
  h <- hist(values,
            breaks = 10,
            col = fill,
            xlab = label,
            main = paste("Histogram of", label, "with Normal curve"))
  # 40 evenly spaced points across the data range. Passing length(40), which
  # is 1, as the third positional argument would make it a step size of 1.
  xfit <- seq(min(values), max(values), length.out = 40)
  yfit <- dnorm(xfit, mean = mean(values), sd = sd(values))
  # Density -> expected count per bin: multiply by bin width and sample size.
  yfit <- yfit * diff(h$mids[1:2]) * length(values)
  lines(xfit, yfit, col = "blue", lwd = 2)
  invisible(h)
}

# Draw a filled kernel density estimate of `values`.
#   values: numeric vector without NA
#   label : variable name used in the title
# Returns the density object invisibly.
plot_kernel_density <- function(values, label) {
  d <- density(values)
  plot(d, main = paste("Kernel Density of", label))
  polygon(d, col = "red", border = "black")
  invisible(d)
}

# 2 x 2 grid filled column by column. The previous graphics settings are
# restored afterwards so that later base plots are not squeezed into a cell.
old_par <- par(mfcol = c(2, 2))
#Distribution of calories
plot_hist_with_normal(cp$calories, "Calories", "red")
plot_kernel_density(cp$calories, "Calories")
#Distribution of Potassium
plot_hist_with_normal(cp$potass, "Potassium", "green")
plot_kernel_density(cp$potass, "Potassium")
par(old_par)
# Min, quartiles, median, mean and max of calories; the quartiles give the
# interquartile range noted below the output.
summary(cp$calories)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 50.0 100.0 110.0 106.9 110.0 160.0
# IQR (3rd Qu. - 1st Qu.) = 110 - 100 = 10
# Same summary for potassium.
summary(cp$potass)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 15.00 45.00 90.00 98.18 120.00 330.00
# IQR (3rd Qu. - 1st Qu.) = 120 - 45 = 75
#3) How calories related to carbohydrates, sugar and fat?
library(psych)
# Scatter-plot matrix of calories, carbohydrates, sugars and fat.
nutrient_columns <- c("calories", "carbo", "sugars", "fat")
cp1 <- subset(cp, select = nutrient_columns)
pairs.panels(
  cp1[, seq_len(4)],
  method = "pearson",   # correlation method
  hist.col = "red",
  main = "Bivariate Scatter Plots Along With Histogram and Pearson Correlation",
  density = TRUE,       # show density plots
  ellipses = TRUE,      # show correlation ellipses
  lm = TRUE             # linear regression fits
)
#4) We know that eating too much fat, cholesterol, or sodium may increase the
#risk of heart diseases or high blood pressure. We need to keep these nutrients as low as
#possible. Can we analyze on the cereal dataset and come out with the healthiest among
#them.
library("ggplot2")
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
# Shared base for both charts: sodium against fat, with common axis labels
# and title.
fat_sodium_base <- ggplot(cp, aes(x = fat, y = sodium)) +
  labs(
    x = "Fat",
    y = "Sodium",
    title = "Weighted scatterplot of Manufacturer vs Sodium and Fat levels"
  )

# All manufacturers in one panel, distinguished by colour.
fat_sodium_base +
  geom_point(aes(color = mfr))

# One panel per manufacturer, laid out in two rows.
fat_sodium_base +
  geom_point() +
  facet_wrap(~mfr, nrow = 2)
#5) List of manufacturers selling cereals having high calories
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# "High calorie" means above the median calories per serving, which is what
# the chart title states; the cutoff is derived from the data instead of
# being hard-coded.
calorie_cutoff <- median(cp$calories)
# One row per (calories, manufacturer) combination among the high-calorie
# cereals; column `n` holds how many cereals share that combination.
cp_select <- cp %>%
  dplyr::filter(calories > calorie_cutoff) %>%
  count(calories, mfr)
x <- cp_select$calories
brand <- cp_select$mfr
# Each slice is sized by its calorie value; pct is that value's share of the
# sum over all slices.
pct <- round(x / sum(x) * 100)
lbl <- paste0(brand, "-Cal(", x, ")-", pct, "%")
pie(x,
    labels = lbl,
    cex = 0.8,
    col = rainbow(length(x)),
    main = "Piechart of manufacturers selling cereals having high calories (> median)")
#6)Can we list down the manufacturer selling cereals with high calorie and minimum sugar content
# Calories against sugars; manufacturer is encoded by both colour and point
# shape, with the shapes taken from the manual list 15 to 23.
ggplot(data = cp, mapping = aes(x = sugars, y = calories)) +
  geom_point(mapping = aes(color = mfr, shape = mfr), size = 6) +
  scale_shape_manual(values = 15:23)
#7)Is there any relation between the calories per serving with the placement of cereal
#in the shelves? For eg: healthier cereals are in shelf 1 or so.
# Dot chart of calories per serving, one section per shelf.
shelf <- factor(cp$shelf)
# One colour per shelf level, expanded to one entry per cereal. A bare
# three-colour vector is recycled over the rows in data order, so its colours
# do not line up with the shelf sections.
shelf_colors <- c("darkgreen", "red", "orange")[as.integer(shelf)]
dotchart(cp$calories,
         groups = shelf,
         xlab = "Calories",
         ylab = "Shelf",  # the sections are shelves, not manufacturers
         color = shelf_colors,
         cex = 1.5,
         pch = 17)
#8)How ratings related with calories?
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:psych':
##
## logit
# car::scatterplot of rating against calories, drawn in green.
scatterplot(
  rating ~ calories,
  data = cp,
  main = "Rating vs Calories",
  xlab = "Calories",
  ylab = "Rating",
  col = "green"
)
#9)Which nutrients are essential for a nutritious breakfast per rating?
#Plots for rating vs various variables
# 3 x 3 grid filled column by column: eight scatter plots plus one box plot.
# The previous graphics settings are restored at the end so that later base
# plots get the full device again.
old_par <- par(mfcol = c(3, 3))

# Column name -> label used for the x axis and in the plot title.
rating_predictors <- c(
  calories = "Calories",
  protein = "Protein",
  fat = "Fat",
  sodium = "Sodium",
  fiber = "Fiber",
  carbo = "Carbohydrates",
  sugars = "Sugar",
  potass = "Potassium"
)

# Rating against each predictor in turn, with the least-squares line in red.
for (predictor in names(rating_predictors)) {
  label <- rating_predictors[[predictor]]
  plot(cp[[predictor]], cp$rating,
       xlab = label,
       ylab = "Rating",
       main = paste("Rating vs", label),
       col = "blue")
  abline(lm(cp$rating ~ cp[[predictor]]), col = "red")
}

#rating vs vitamins
# Vitamins is shown as a box plot, one box per distinct vitamins value.
boxplot(rating ~ vitamins,
        data = cp,
        xlab = "Vitamins",
        ylab = "Ratings",
        main = "Rating vs Vitamins",
        col = c("red", "green", "blue"))

par(old_par)
#10)Is there any significance of manufacturer with regards to cereal rating?
# Box plot of rating for each manufacturer, filled with seven colours from
# topo.colors().
box_fill <- topo.colors(n = 7)
boxplot(
  rating ~ mfr,
  data = cp,
  col = box_fill,
  main = "Rating vs Manufacturer",
  xlab = "Manufacturer",
  ylab = "Ratings"
)
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# R Markdown template example: summarise the `cars` data (columns speed and
# dist, as the output below shows).
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.