October 2018

Abstract

This data file contains nutritional information and grocery shelf location for 77 breakfast cereals. Current research states that adults should consume no more than 30% of their calories in the form of fat, that they need about 50 grams (women) or 63 grams (men) of protein daily, and that they should provide for the remainder of their caloric intake with complex carbohydrates. One gram of fat contains 9 calories, while carbohydrates and proteins each contain 4 calories per gram. A "good" diet should also contain 20-35 grams of dietary fiber.

Description: Data on several variables for different brands of cereal

-Info : https://ntrda.me/2z0TGjn

About the Cereal Data

  • There are 77 observations in total
  • There are 16 variables, both categorical and numeric, as shown in the table below:

Display Data

# Load knitr first so kable() is available when the table is rendered.
library(knitr)

# Read the cereal data; the first row of the CSV holds the column names.
# NOTE(review): the preview and summary output show -1 in carbo, sugars and
# potass -- presumably a missing-value code; confirm before modelling.
data <- read.csv("cereal.csv", header = TRUE, sep = ",")

# Render the first six rows as a markdown table. `full_width` is not a
# kable() argument (it belongs to kableExtra::kable_styling()), so it is
# dropped; the rendered table is unchanged.
kable(head(data), format = "markdown")
name mfr type calories protein fat sodium fiber carbo sugars potass vitamins shelf weight cups rating
100% Bran N C 70 4 1 130 10.0 5.0 6 280 25 3 1 0.33 68.40297
100% Natural Bran Q C 120 3 5 15 2.0 8.0 8 135 0 3 1 1.00 33.98368
All-Bran K C 70 4 1 260 9.0 7.0 5 320 25 3 1 0.33 59.42551
All-Bran with Extra Fiber K C 50 4 0 140 14.0 8.0 0 330 25 3 1 0.50 93.70491
Almond Delight R C 110 2 2 200 1.0 14.0 8 -1 25 3 1 0.75 34.38484
Apple Cinnamon Cheerios G C 110 2 2 180 1.5 10.5 10 70 25 1 1 0.75 29.50954

You can download the data from this link: http://bit.ly/kaggle_cerealdataset

Outline

  • Descriptive Statistics & Visualization
  • Correlation Analysis
  • Linear Regression

Library

library(ggplot2)
library(ggthemes)
library(corrplot)
## corrplot 0.84 loaded

Descriptive Statistic & Visualization (1/8)

Visualization with pie charts for the categorical variables

##  mfr    type  
##  A: 1   C:74  
##  G:22   H: 3  
##  K:23         
##  N: 6         
##  P: 9         
##  Q: 8         
##  R: 8

Descriptive Statistic & Visualization (2/8)

Summary of all numeric variables

# Per-column summary (min, quartiles, mean, max) of columns 4-15,
# i.e. calories through cups.
summary(data[4:15])
##     calories        protein           fat            sodium     
##  Min.   : 50.0   Min.   :1.000   Min.   :0.000   Min.   :  0.0  
##  1st Qu.:100.0   1st Qu.:2.000   1st Qu.:0.000   1st Qu.:130.0  
##  Median :110.0   Median :3.000   Median :1.000   Median :180.0  
##  Mean   :106.9   Mean   :2.545   Mean   :1.013   Mean   :159.7  
##  3rd Qu.:110.0   3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.:210.0  
##  Max.   :160.0   Max.   :6.000   Max.   :5.000   Max.   :320.0  
##      fiber            carbo          sugars           potass      
##  Min.   : 0.000   Min.   :-1.0   Min.   :-1.000   Min.   : -1.00  
##  1st Qu.: 1.000   1st Qu.:12.0   1st Qu.: 3.000   1st Qu.: 40.00  
##  Median : 2.000   Median :14.0   Median : 7.000   Median : 90.00  
##  Mean   : 2.152   Mean   :14.6   Mean   : 6.922   Mean   : 96.08  
##  3rd Qu.: 3.000   3rd Qu.:17.0   3rd Qu.:11.000   3rd Qu.:120.00  
##  Max.   :14.000   Max.   :23.0   Max.   :15.000   Max.   :330.00  
##     vitamins          shelf           weight          cups      
##  Min.   :  0.00   Min.   :1.000   Min.   :0.50   Min.   :0.250  
##  1st Qu.: 25.00   1st Qu.:1.000   1st Qu.:1.00   1st Qu.:0.670  
##  Median : 25.00   Median :2.000   Median :1.00   Median :0.750  
##  Mean   : 28.25   Mean   :2.208   Mean   :1.03   Mean   :0.821  
##  3rd Qu.: 25.00   3rd Qu.:3.000   3rd Qu.:1.00   3rd Qu.:1.000  
##  Max.   :100.00   Max.   :3.000   Max.   :1.50   Max.   :1.500

Descriptive Statistic & Visualization (3/8)

Visualization of all numeric variables with bar charts & histograms

Descriptive Statistic & Visualization (4/8)

Histogram of "rating" as the response variable

# Histogram of the response variable `rating`, drawn with 25 bins.
ggplot(data, mapping = aes(x = rating)) +
  geom_histogram(bins = 25, fill = "#fd8d3c") +
  labs(title = "Histogram Of Rating (RESPON)") +
  theme_solarized()

Descriptive Statistic & Visualization (5/8)

Density of "rating" as the response variable

# Density estimate of the response variable `rating`; `adjust = 0.4`
# scales the default smoothing bandwidth.
ggplot(data, mapping = aes(x = rating)) +
  labs(title = "Density of Rating (RESPON)") +
  geom_density(adjust = 0.4, fill = "#fd8d3c") +
  theme_solarized()

Descriptive Statistic & Visualization (6/8)

Boxplots of all continuous variables

Descriptive Statistic & Visualization (7/8)

# Boxplots of `rating` for each manufacturer code (`mfr`), one fill
# colour per manufacturer from the "OrRd" Brewer palette.
ggplot(data, mapping = aes(x = mfr, y = rating, fill = mfr)) +
  labs(title = "Rating Based on Manufacturer of Cereal") +
  geom_boxplot() +
  scale_fill_brewer(palette = "OrRd") +
  theme_solarized()

Descriptive Statistic & Visualization (8/8)

# Boxplots of `rating` for each cereal `type`, one fill colour per type
# from the "OrRd" Brewer palette.
ggplot(data, mapping = aes(x = type, y = rating, fill = type)) +
  labs(title = "Rating Based on Type") +
  geom_boxplot() +
  scale_fill_brewer(palette = "OrRd") +
  theme_solarized()

Scatter Plot

Correlation Analysis (1/2)

The correlation between all numeric predictor variables and the response

# Pearson correlation matrix of columns 4-16 (calories through rating).
cor(data[4:16], method = "pearson")
##             calories      protein          fat       sodium       fiber
## calories  1.00000000  0.019066068  0.498609814  0.300649227 -0.29341275
## protein   0.01906607  1.000000000  0.208430990 -0.054674348  0.50033004
## fat       0.49860981  0.208430990  1.000000000 -0.005407464  0.01671924
## sodium    0.30064923 -0.054674348 -0.005407464  1.000000000 -0.07067501
## fiber    -0.29341275  0.500330043  0.016719237 -0.070675009  1.00000000
## carbo     0.25068091 -0.130863648 -0.318043492  0.355983473 -0.35608274
## sugars    0.56234029 -0.329141777  0.270819175  0.101451381 -0.14120539
## potass   -0.06660886  0.549407400  0.193278602 -0.032603467  0.90337367
## vitamins  0.26535630  0.007335371 -0.031156266  0.361476688 -0.03224268
## shelf     0.09723437  0.133864789  0.263691089 -0.069719015  0.29753906
## weight    0.69609108  0.216158486  0.214625033  0.308576451  0.24722563
## cups      0.08719955 -0.244469158 -0.175892142  0.119664615 -0.51306093
## rating   -0.68937603  0.470618465 -0.409283660 -0.401295204  0.58416042
##                carbo      sugars      potass     vitamins       shelf
## calories  0.25068091  0.56234029 -0.06660886  0.265356298  0.09723437
## protein  -0.13086365 -0.32914178  0.54940740  0.007335371  0.13386479
## fat      -0.31804349  0.27081918  0.19327860 -0.031156266  0.26369109
## sodium    0.35598347  0.10145138 -0.03260347  0.361476688 -0.06971902
## fiber    -0.35608274 -0.14120539  0.90337367 -0.032242679  0.29753906
## carbo     1.00000000 -0.33166538 -0.34968522  0.258147549 -0.10179030
## sugars   -0.33166538  1.00000000  0.02169581  0.125137260  0.10043789
## potass   -0.34968522  0.02169581  1.00000000  0.020698687  0.36066341
## vitamins  0.25814755  0.12513726  0.02069869  1.000000000  0.29926167
## shelf    -0.10179030  0.10043789  0.36066341  0.299261665  1.00000000
## weight    0.13513642  0.45064760  0.41630315  0.320324059  0.19076197
## cups      0.36393247 -0.03235762 -0.49519495  0.128404543 -0.33526876
## rating    0.05205466 -0.75967466  0.38016537 -0.240543611  0.02515882
##              weight        cups      rating
## calories  0.6960911  0.08719955 -0.68937603
## protein   0.2161585 -0.24446916  0.47061846
## fat       0.2146250 -0.17589214 -0.40928366
## sodium    0.3085765  0.11966461 -0.40129520
## fiber     0.2472256 -0.51306093  0.58416042
## carbo     0.1351364  0.36393247  0.05205466
## sugars    0.4506476 -0.03235762 -0.75967466
## potass    0.4163032 -0.49519495  0.38016537
## vitamins  0.3203241  0.12840454 -0.24054361
## shelf     0.1907620 -0.33526876  0.02515882
## weight    1.0000000 -0.19958272 -0.29812398
## cups     -0.1995827  1.00000000 -0.20316006
## rating   -0.2981240 -0.20316006  1.00000000

Correlation Analysis (2/2)

The correlation between all numeric predictor variables and the response

# Plot the correlation matrix of columns 4-16 with corrplot.mixed(),
# shrinking the coefficient text and the variable labels.
corrplot.mixed(
  cor(data[, 4:16]),
  number.cex = 0.7,
  tl.cex = 0.6
)

Linear Regression Methods (1/3)

Now it's time to fit the linear regression of the response on the predictor variables

## 
## Call:  glm(formula = rating ~ type + calories + protein + fat + sodium + 
##     fiber + carbo + sugars + potass + vitamins + shelf + weight + 
##     cups, data = data)
## 
## Coefficients:
## (Intercept)        typeH     calories      protein          fat  
##   5.493e+01   -3.917e-08   -2.227e-01    3.273e+00   -1.691e+00  
##      sodium        fiber        carbo       sugars       potass  
##  -5.449e-02    3.443e+00    1.092e+00   -7.249e-01   -3.399e-02  
##    vitamins        shelf       weight         cups  
##  -5.121e-02   -3.700e-08   -4.010e-07    1.430e-07  
## 
## Degrees of Freedom: 76 Total (i.e. Null);  63 Residual
## Null Deviance:       15000 
## Residual Deviance: 5.928e-12     AIC: -2077

Linear Regression Methods (2/3)

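Now it's time to fit the linear regression of the response on the predictor variables. Note that the residual deviance is essentially zero (5.9e-12), so the rating is reproduced almost exactly by a linear combination of the predictors.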

## 
## Call:
## glm(formula = rating ~ type + calories + protein + fat + sodium + 
##     fiber + carbo + sugars + potass + vitamins + shelf + weight + 
##     cups, data = data)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -5.246e-07  -2.573e-07   4.610e-08   2.242e-07   5.663e-07  
## 
## Coefficients:
##               Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)  5.493e+01  3.677e-07  1.494e+08   <2e-16 ***
## typeH       -3.917e-08  2.478e-07 -1.580e-01    0.875    
## calories    -2.227e-01  5.750e-09 -3.873e+07   <2e-16 ***
## protein      3.273e+00  5.149e-08  6.357e+07   <2e-16 ***
## fat         -1.691e+00  6.388e-08 -2.648e+07   <2e-16 ***
## sodium      -5.449e-02  5.179e-10 -1.052e+08   <2e-16 ***
## fiber        3.443e+00  4.434e-08  7.765e+07   <2e-16 ***
## carbo        1.092e+00  1.956e-08  5.584e+07   <2e-16 ***
## sugars      -7.249e-01  2.066e-08 -3.509e+07   <2e-16 ***
## potass      -3.399e-02  1.486e-09 -2.288e+07   <2e-16 ***
## vitamins    -5.121e-02  1.953e-09 -2.622e+07   <2e-16 ***
## shelf       -3.700e-08  5.327e-08 -6.950e-01    0.490    
## weight      -4.010e-07  5.554e-07 -7.220e-01    0.473    
## cups         1.430e-07  1.965e-07  7.280e-01    0.470    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 9.409889e-14)
## 
##     Null deviance: 1.4997e+04  on 76  degrees of freedom
## Residual deviance: 5.9282e-12  on 63  degrees of freedom
## AIC: -2076.5
## 
## Number of Fisher Scoring iterations: 1

Linear Regression Methods (3/3)

Full output of the significant variables

THANK YOU