Project_Cereal

# Read the cereal workbook from its (machine-specific) location on disk.
library(readxl)
cereals_practice <- read_excel(
  path = "D:/UST-Global/docs/CENA-DataScience/TrainingDocs_R_DrVinod/DataSets/cereals_practice.xlsx"
)

# Impute dataset: work on a copy (`cp`) so the raw import stays available.
cp <- cereals_practice
# Show each column's type together with its first few values.
str(object = cp)

## Classes 'tbl_df', 'tbl' and 'data.frame':    77 obs. of  16 variables:
##  $ name    : chr  "100%_Bran" "100%_Natural_Bran" "All-Bran" "All-Bran_with_Extra_Fiber" ...
##  $ mfr     : chr  "N" "Q" "K" "K" ...
##  $ type    : chr  "C" "C" "C" "C" ...
##  $ calories: num  70 120 70 50 110 110 110 130 90 90 ...
##  $ protein : num  4 3 4 4 2 2 2 3 2 3 ...
##  $ fat     : num  1 5 1 0 2 2 0 2 1 0 ...
##  $ sodium  : num  130 15 260 140 200 180 125 210 200 210 ...
##  $ fiber   : num  10 2 9 14 1 1.5 1 2 4 5 ...
##  $ carbo   : num  5 8 7 8 14 10.5 11 18 15 13 ...
##  $ sugars  : num  6 8 5 0 8 10 14 8 6 5 ...
##  $ potass  : num  280 135 320 330 NA 70 30 100 125 190 ...
##  $ vitamins: num  25 0 25 25 25 25 25 25 25 25 ...
##  $ shelf   : num  3 3 3 3 3 1 2 3 1 3 ...
##  $ weight  : num  1 1 1 1 1 1 1 1.33 1 1 ...
##  $ cups    : num  0.33 1 0.33 0.5 0.75 0.75 1 0.75 0.67 0.67 ...
##  $ rating  : num  68.4 34 59.4 93.7 34.4 ...

# Per-column summary; an "NA's" row in the output marks a column that has
# missing values.
summary(cp)

##      name               mfr                type              calories    
##  Length:77          Length:77          Length:77          Min.   : 50.0  
##  Class :character   Class :character   Class :character   1st Qu.:100.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :110.0  
##                                                           Mean   :106.9  
##                                                           3rd Qu.:110.0  
##                                                           Max.   :160.0  
##                                                                          
##     protein           fat            sodium          fiber       
##  Min.   :1.000   Min.   :0.000   Min.   :  0.0   Min.   : 0.000  
##  1st Qu.:2.000   1st Qu.:0.000   1st Qu.:130.0   1st Qu.: 1.000  
##  Median :3.000   Median :1.000   Median :180.0   Median : 2.000  
##  Mean   :2.545   Mean   :1.013   Mean   :159.7   Mean   : 2.152  
##  3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.:210.0   3rd Qu.: 3.000  
##  Max.   :6.000   Max.   :5.000   Max.   :320.0   Max.   :14.000  
##                                                                  
##      carbo          sugars           potass          vitamins     
##  Min.   : 5.0   Min.   : 0.000   Min.   : 15.00   Min.   :  0.00  
##  1st Qu.:12.0   1st Qu.: 3.000   1st Qu.: 42.50   1st Qu.: 25.00  
##  Median :14.5   Median : 7.000   Median : 90.00   Median : 25.00  
##  Mean   :14.8   Mean   : 7.026   Mean   : 98.67   Mean   : 28.25  
##  3rd Qu.:17.0   3rd Qu.:11.000   3rd Qu.:120.00   3rd Qu.: 25.00  
##  Max.   :23.0   Max.   :15.000   Max.   :330.00   Max.   :100.00  
##  NA's   :1      NA's   :1        NA's   :2                        
##      shelf           weight          cups           rating     
##  Min.   :1.000   Min.   :0.50   Min.   :0.250   Min.   :18.04  
##  1st Qu.:1.000   1st Qu.:1.00   1st Qu.:0.670   1st Qu.:33.17  
##  Median :2.000   Median :1.00   Median :0.750   Median :40.40  
##  Mean   :2.208   Mean   :1.03   Mean   :0.821   Mean   :42.67  
##  3rd Qu.:3.000   3rd Qu.:1.00   3rd Qu.:1.000   3rd Qu.:50.83  
##  Max.   :3.000   Max.   :1.50   Max.   :1.500   Max.   :93.70  
##

#Fill in (impute) the NA values instead of dropping the affected observations,
#using the k-nearest-neighbours (kNN) method from the VIM package
#install.packages("VIM")
library(VIM)

## Loading required package: colorspace

## Loading required package: grid

## Loading required package: data.table

## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.

## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

# Impute the missing values in `cp` with VIM's kNN(). The result keeps the
# original columns and appends one logical `<column>_imp` flag per column that
# marks which values were imputed.
# NOTE(review): the repeated gowerD "NAs introduced by coercion" warnings
# presumably come from the character columns (name, mfr, type) - confirm.
cpImpute <- kNN(cp)

## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion

## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion

## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion

## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion

## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion

## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion

## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion

## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion

## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion

## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion

## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion

## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion

## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion

## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion

## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion

## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion

## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion

## Warning in gowerD(don_dist_var, imp_dist_var, weights = weightsx,
## numericalX, : NAs introduced by coercion

# psych is loaded here; its pairs.panels() is used later in the script.
library(psych)
# Check the imputation result: the summary lists the original columns followed
# by the logical *_imp flags, whose TRUE counts show how many values were
# filled in per column.
summary(cpImpute)

##      name               mfr                type              calories    
##  Length:77          Length:77          Length:77          Min.   : 50.0  
##  Class :character   Class :character   Class :character   1st Qu.:100.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :110.0  
##                                                           Mean   :106.9  
##                                                           3rd Qu.:110.0  
##                                                           Max.   :160.0  
##     protein           fat            sodium          fiber       
##  Min.   :1.000   Min.   :0.000   Min.   :  0.0   Min.   : 0.000  
##  1st Qu.:2.000   1st Qu.:0.000   1st Qu.:130.0   1st Qu.: 1.000  
##  Median :3.000   Median :1.000   Median :180.0   Median : 2.000  
##  Mean   :2.545   Mean   :1.013   Mean   :159.7   Mean   : 2.152  
##  3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.:210.0   3rd Qu.: 3.000  
##  Max.   :6.000   Max.   :5.000   Max.   :320.0   Max.   :14.000  
##      carbo           sugars           potass          vitamins     
##  Min.   : 5.00   Min.   : 0.000   Min.   : 15.00   Min.   :  0.00  
##  1st Qu.:12.00   1st Qu.: 3.000   1st Qu.: 45.00   1st Qu.: 25.00  
##  Median :14.00   Median : 7.000   Median : 90.00   Median : 25.00  
##  Mean   :14.75   Mean   : 7.013   Mean   : 98.18   Mean   : 28.25  
##  3rd Qu.:17.00   3rd Qu.:11.000   3rd Qu.:120.00   3rd Qu.: 25.00  
##  Max.   :23.00   Max.   :15.000   Max.   :330.00   Max.   :100.00  
##      shelf           weight          cups           rating     
##  Min.   :1.000   Min.   :0.50   Min.   :0.250   Min.   :18.04  
##  1st Qu.:1.000   1st Qu.:1.00   1st Qu.:0.670   1st Qu.:33.17  
##  Median :2.000   Median :1.00   Median :0.750   Median :40.40  
##  Mean   :2.208   Mean   :1.03   Mean   :0.821   Mean   :42.67  
##  3rd Qu.:3.000   3rd Qu.:1.00   3rd Qu.:1.000   3rd Qu.:50.83  
##  Max.   :3.000   Max.   :1.50   Max.   :1.500   Max.   :93.70  
##   name_imp        mfr_imp         type_imp       calories_imp   
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:77        FALSE:77        FALSE:77        FALSE:77       
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  protein_imp      fat_imp        sodium_imp      fiber_imp      
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:77        FALSE:77        FALSE:77        FALSE:77       
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  carbo_imp       sugars_imp      potass_imp      vitamins_imp   
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:76        FALSE:76        FALSE:75        FALSE:77       
##  TRUE :1         TRUE :1         TRUE :2                        
##                                                                 
##                                                                 
##                                                                 
##  shelf_imp       weight_imp       cups_imp       rating_imp     
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:77        FALSE:77        FALSE:77        FALSE:77       
##                                                                 
##                                                                 
##                                                                 
##

# Keep only the 16 original variables, dropping the logical *_imp indicator
# columns that kNN() appended, and store the result back into `cp`.
cp <- subset(
  cpImpute,
  select = c(
    "name", "mfr", "type", "calories",
    "protein", "fat", "sodium", "fiber",
    "carbo", "sugars", "potass", "vitamins",
    "shelf", "weight", "cups", "rating"
  )
)

# Summarise the cleaned data set.
summary(object = cp)

##      name               mfr                type              calories    
##  Length:77          Length:77          Length:77          Min.   : 50.0  
##  Class :character   Class :character   Class :character   1st Qu.:100.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :110.0  
##                                                           Mean   :106.9  
##                                                           3rd Qu.:110.0  
##                                                           Max.   :160.0  
##     protein           fat            sodium          fiber       
##  Min.   :1.000   Min.   :0.000   Min.   :  0.0   Min.   : 0.000  
##  1st Qu.:2.000   1st Qu.:0.000   1st Qu.:130.0   1st Qu.: 1.000  
##  Median :3.000   Median :1.000   Median :180.0   Median : 2.000  
##  Mean   :2.545   Mean   :1.013   Mean   :159.7   Mean   : 2.152  
##  3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.:210.0   3rd Qu.: 3.000  
##  Max.   :6.000   Max.   :5.000   Max.   :320.0   Max.   :14.000  
##      carbo           sugars           potass          vitamins     
##  Min.   : 5.00   Min.   : 0.000   Min.   : 15.00   Min.   :  0.00  
##  1st Qu.:12.00   1st Qu.: 3.000   1st Qu.: 45.00   1st Qu.: 25.00  
##  Median :14.00   Median : 7.000   Median : 90.00   Median : 25.00  
##  Mean   :14.75   Mean   : 7.013   Mean   : 98.18   Mean   : 28.25  
##  3rd Qu.:17.00   3rd Qu.:11.000   3rd Qu.:120.00   3rd Qu.: 25.00  
##  Max.   :23.00   Max.   :15.000   Max.   :330.00   Max.   :100.00  
##      shelf           weight          cups           rating     
##  Min.   :1.000   Min.   :0.50   Min.   :0.250   Min.   :18.04  
##  1st Qu.:1.000   1st Qu.:1.00   1st Qu.:0.670   1st Qu.:33.17  
##  Median :2.000   Median :1.00   Median :0.750   Median :40.40  
##  Mean   :2.208   Mean   :1.03   Mean   :0.821   Mean   :42.67  
##  3rd Qu.:3.000   3rd Qu.:1.00   3rd Qu.:1.000   3rd Qu.:50.83  
##  Max.   :3.000   Max.   :1.50   Max.   :1.500   Max.   :93.70

#1) How many cereals in each type?
# Tabulate the number of cereals per `type` value.
# NOTE(review): `counts` is not referenced again in the visible script; the
# pie chart code further down builds its own table - confirm before removing.
counts<-table(cp$type)
# plotrix provides pie3D(), which draws the chart for this question.
library(plotrix)

## 
## Attaching package: 'plotrix'

## The following object is masked from 'package:psych':
## 
##     rescale

# Count the cereals per type code and show the counts as a 3D pie chart.
a <- table(cp$type)

# Look the display name up by type code rather than by position in the table,
# so a missing or unexpected type code cannot shift the labels onto the wrong
# slice. Unknown codes fall back to the code itself.
type_names <- c(C = "Cold", H = "Hot")
grp <- ifelse(names(a) %in% names(type_names),
              type_names[names(a)],
              names(a))

# Slice label format: "<display name> <count>#".
lbl <- paste0(grp, " ", as.vector(a), "#")

pie3D(a,
      labels = lbl,
      labelcex = 0.9,
      main = "Cereals' Count in Each Type",
      col = rainbow(length(a)))

#2) How calories and potassium are distributed

# Draw a histogram of `x` with a fitted normal curve, followed by a kernel
# density plot of the same values (two plots in total).
#   x      numeric vector to plot; must not contain NA
#   label  variable name used for the x-axis label and both plot titles
#   fill   fill colour of the histogram bars
# Returns the histogram object invisibly.
plot_distribution <- function(x, label, fill) {
  h <- hist(x,
            breaks = 10,
            col = fill,
            xlab = label,
            main = paste("Histogram of", label, "with Normal curve"))

  # Evaluate the normal density on 40 evenly spaced points. `length.out` must
  # be named: a positional third argument to seq() is the step size `by`.
  xfit <- seq(min(x), max(x), length.out = 40)
  yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))

  # Rescale the density to the histogram's count axis (bin width * n).
  yfit <- yfit * diff(h$mids[1:2]) * length(x)
  lines(xfit, yfit, col = "blue", lwd = 2)

  d <- density(x)
  plot(d, main = paste("Kernel Density of", label))
  polygon(d, col = "red", border = "black")

  invisible(h)
}

# Show the four plots on one page in a 2 x 2 grid, then restore the previous
# layout so that later base-graphics plots are not squeezed into this grid.
old_par <- par(mfcol = c(2, 2))

#Distribution of calories
plot_distribution(cp$calories, "Calories", "red")

#Distribution of Potassium
plot_distribution(cp$potass, "Potassium", "green")

par(old_par)

# Min, quartiles, median, mean and max of calories; the quartiles are used for
# the IQR noted after the output.
summary(cp$calories)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    50.0   100.0   110.0   106.9   110.0   160.0

#IQR = 110-100 = 10

# Min, quartiles, median, mean and max of potassium; the quartiles are used
# for the IQR noted after the output.
summary(cp$potass)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   45.00   90.00   98.18  120.00  330.00

#IQR = 120-45 = 75

#3) How are calories related to carbohydrates, sugar and fat?
library(psych)

# Keep only the four variables being compared.
cp1 <- subset(cp, select = c(calories, carbo, sugars, fat))

# Scatter-plot matrix of the four variables via psych's pairs.panels().
pairs.panels(
  cp1,
  method = "pearson",  # correlation method
  lm = TRUE,           # linear regression fits
  ellipses = TRUE,     # show correlation ellipses
  density = TRUE,      # show density plots
  hist.col = "red",
  main = "Bivariate Scatter Plots Along With Histogram and Pearson Correlation"
)

#4) We know that eating too much fat, cholesterol, or sodium may increase the
#risk of heart disease or high blood pressure, so these nutrients should be kept
#as low as possible. Can we analyze the cereal dataset and identify the
#healthiest cereals among them?
library("ggplot2")

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

# Fat against sodium for every cereal, with points coloured by manufacturer.
ggplot(cp, aes(x = fat, y = sodium)) +
  geom_point(aes(color = mfr)) +
  labs(
    x = "Fat",
    y = "Sodium",
    title = "Weighted scatterplot of Manufacturer vs Sodium and Fat levels"
  )

# The same relationship split into one panel per manufacturer, in two rows.
ggplot(cp, aes(x = fat, y = sodium)) +
  geom_point() +
  facet_wrap(~mfr, nrow = 2) +
  labs(
    x = "Fat",
    y = "Sodium",
    title = "Weighted scatterplot of Manufacturer vs Sodium and Fat levels"
  )

#5) List of manufacturers selling cereals having high calories
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Keep the cereals with more than 110 calories and reduce them to one row per
# distinct (calories, manufacturer) combination; count() also adds the number
# of cereals in each combination as column `n`.
cp_select <- cp %>%
  subset(calories > 110, select = c(mfr, calories)) %>%
  count(calories, mfr)

x <- cp_select$calories
brand <- cp_select$mfr

# Each slice's share of the summed calorie levels, as a whole percentage.
pct <- round(x / sum(x) * 100)
# Label format: "<manufacturer>-Cal(<calories>)-<percent>%".
lbl <- paste0(brand, "-Cal(", x, ")-", pct, "%")

# NOTE(review): slices are sized by the calorie level itself; the per-group
# cereal count `n` from count() is not used - confirm this is intended.
pie(x,
    labels = lbl,
    cex = 0.8,
    col = rainbow(length(x)),
    main = "Piechart of manufacturers selling cereals having high calories (> median)")

#6) Can we list the manufacturers selling cereals with high calories and
#   minimum sugar content?

# Calories against sugars; both the colour and the symbol of a point identify
# its manufacturer.
ggplot(data = cp, mapping = aes(x = sugars, y = calories)) +
  geom_point(aes(color = mfr, shape = mfr), size = 6) +
  scale_shape_manual(values = 15:23)

#7)Is there any relation between the calories per serving with the placement of cereal 
#in the shelves? For eg: healthier cereals are in shelf 1 or so.

# dotchart() applies `color` point by point, so a bare three-colour vector is
# recycled by row position and does not follow the shelf. Look the colour up
# per cereal from its shelf so that each shelf group is drawn in one colour.
shelf <- factor(cp$shelf)
shelf_cols <- c("darkgreen", "red", "orange")

dotchart(cp$calories,
         groups = shelf,
         xlab = "Calories",
         ylab = "Shelf",  # the groups on this axis are shelves
         color = shelf_cols[as.integer(shelf)],
         cex = 1.5,
         pch = 17)

#8)How ratings related with calories?
library(car)

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following object is masked from 'package:psych':
## 
##     logit

# Rating against calories, drawn with car's scatterplot().
scatterplot(
  rating ~ calories,
  data = cp,
  main = "Rating vs Calories",
  xlab = "Calories",
  ylab = "Rating",
  col = "green"
)

#9)Which nutrients are essential for a nutritious breakfast per rating?
#Plots for rating vs various variables

# Column name -> label used on the x axis and in the title, in plotting order.
nutrient_labels <- c(
  calories = "Calories",
  protein  = "Protein",
  fat      = "Fat",
  sodium   = "Sodium",
  fiber    = "Fiber",
  carbo    = "Carbohydrates",
  sugars   = "Sugar",
  potass   = "Potassium"
)

# 3 x 3 grid: eight scatter plots plus the vitamins box plot. The previous
# layout is saved so it can be restored once the grid is complete.
old_par <- par(mfcol = c(3, 3))

for (nutrient in names(nutrient_labels)) {
  label <- nutrient_labels[[nutrient]]
  # Build `rating ~ <nutrient>` once and use it for both the plot and the fit,
  # so both take their variables from `cp`.
  rating_formula <- reformulate(nutrient, response = "rating")

  plot(rating_formula,
       data = cp,
       xlab = label,
       ylab = "Rating",
       main = paste("Rating vs", label),
       col = "blue")
  # Overlay the least-squares line of rating on this nutrient.
  abline(lm(rating_formula, data = cp), col = "red")
}

#rating vs vitamins
# Box plot of rating for each distinct vitamins value.
boxplot(rating~vitamins, 
        data = cp, 
        xlab="Vitamins",
        ylab="Ratings",
        main="Rating vs Vitamins",
        col=c("red","green","blue"))

# Restore the layout so later base-graphics plots get a full page.
par(old_par)

#10) Is there any significance of manufacturer with regards to cereal rating?

# Box plot of rating for each manufacturer, filled from a 7-colour topo palette.
boxplot(
  formula = rating ~ mfr,
  data = cp,
  main = "Rating vs Manufacturer",
  xlab = "Manufacturer",
  ylab = "Ratings",
  col = topo.colors(7)
)

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)

##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Project_Cereal_Aswathy

Aswathy Nair

December 28, 2017

R Markdown

Including Plots