INTRO

Breakfast cereal is one of the favorite breakfast foods for both children and adults. On some packages you can find words like “fit,” “high protein,” or “healthy.” But is this true? One way to find out is to apply principal component analysis (PCA) to the cereals’ nutritional ingredients, reducing the dimensionality of the dataset while preserving most of its information. This project is based on the “80 Cereals” dataset, which contains nutritional values for 77 different cereals: https://www.kaggle.com/datasets/crawford/80-cereals

Which breakfast cereal ingredient carries the most nutritional information? Maybe the not-so-healthy sugar? Let’s check:

ANALYSIS

Descriptive stats

# Per-column summary of the cereal data (quartiles and mean for numeric columns).
# NOTE(review): the output below reports a minimum of -1 for carbo, sugars and
# potass - presumably a missing-value code; confirm before using these columns.
summary(cereal)
##      name               mfr                type              calories    
##  Length:77          Length:77          Length:77          Min.   : 50.0  
##  Class :character   Class :character   Class :character   1st Qu.:100.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :110.0  
##                                                           Mean   :106.9  
##                                                           3rd Qu.:110.0  
##                                                           Max.   :160.0  
##     protein           fat            sodium          fiber       
##  Min.   :1.000   Min.   :0.000   Min.   :  0.0   Min.   : 0.000  
##  1st Qu.:2.000   1st Qu.:0.000   1st Qu.:130.0   1st Qu.: 1.000  
##  Median :3.000   Median :1.000   Median :180.0   Median : 2.000  
##  Mean   :2.545   Mean   :1.013   Mean   :159.7   Mean   : 2.152  
##  3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.:210.0   3rd Qu.: 3.000  
##  Max.   :6.000   Max.   :5.000   Max.   :320.0   Max.   :14.000  
##      carbo          sugars           potass          vitamins     
##  Min.   :-1.0   Min.   :-1.000   Min.   : -1.00   Min.   :  0.00  
##  1st Qu.:12.0   1st Qu.: 3.000   1st Qu.: 40.00   1st Qu.: 25.00  
##  Median :14.0   Median : 7.000   Median : 90.00   Median : 25.00  
##  Mean   :14.6   Mean   : 6.922   Mean   : 96.08   Mean   : 28.25  
##  3rd Qu.:17.0   3rd Qu.:11.000   3rd Qu.:120.00   3rd Qu.: 25.00  
##  Max.   :23.0   Max.   :15.000   Max.   :330.00   Max.   :100.00  
##      shelf           weight          cups           rating     
##  Min.   :1.000   Min.   :0.50   Min.   :0.250   Min.   :18.04  
##  1st Qu.:1.000   1st Qu.:1.00   1st Qu.:0.670   1st Qu.:33.17  
##  Median :2.000   Median :1.00   Median :0.750   Median :40.40  
##  Mean   :2.208   Mean   :1.03   Mean   :0.821   Mean   :42.67  
##  3rd Qu.:3.000   3rd Qu.:1.00   3rd Qu.:1.000   3rd Qu.:50.83  
##  Max.   :3.000   Max.   :1.50   Max.   :1.500   Max.   :93.70
# Number of rows and columns in the dataset.
dim(cereal)
## [1] 77 16

Nutritional distribution

# Draw one histogram per nutrient on a shared 3 x 3 panel grid.
par(mfrow = c(3, 3))

# Same fill palette for every panel.
hist_fill <- wes_palette("Royal2", 11, type = "continuous")

# Column name -> label used for the x axis and the panel title.
nutrient_labels <- c(
  calories = "Calories", protein = "Protein", fat = "Fat",
  sodium = "Sodium", fiber = "Fiber", carbo = "Carbo",
  sugars = "Sugars", potass = "Potass", vitamins = "Vitamins"
)

for (column in names(nutrient_labels)) {
  label <- nutrient_labels[[column]]
  hist(
    cereal[[column]],
    xlab = label,
    main = paste(label, "distribution"),
    col = hist_fill
  )
}

Standardization of selected columns

# Keep the eight nutrient columns (positions 5 to 12 of `cereal`).
# NOTE(review): carbo, sugars and potass contain -1 values (see the summary
# output) that get scaled like real measurements - confirm whether they should
# be treated as missing instead.
nutrient_cols <- c(
  "protein", "fat", "sodium", "fiber",
  "carbo", "sugars", "potass", "vitamins"
)
nutri <- cereal[nutrient_cols]

# Fit the centring/scaling transform, then apply it to the same columns.
pre <- preProcess(nutri, method = c("center", "scale"))
nutri_s <- predict(pre, nutri)

Checking correlation

# Pairs plot of the standardised nutrient columns.
ggpairs(nutri_s)

# Back to a single-panel layout after the 3 x 3 histogram grid.
par(mfrow = c(1, 1))
correlation <- cor(nutri_s)

# Upper-triangle correlation plot with the coefficients printed on top.
corr_colours <- wes_palette("Royal2", 9, type = "continuous")
corrplot(
  correlation,
  method = "circle",
  type = "upper",
  order = "hclust",
  addCoef.col = "grey30",
  col = corr_colours
)

Potassium (potass) and fiber are highly correlated.

PRINCIPAL COMPONENTS ANALYSIS

# Run PCA on the nutrients that were already centred and scaled above, so
# prcomp() is told not to centre or scale them again. The scaling argument is
# written with its full name, `scale.`, rather than relying on partial
# matching of `scale`.
pca <- prcomp(nutri_s, center = FALSE, scale. = FALSE)
# Loadings of each nutrient on every principal component.
pca$rotation
##                  PC1        PC2         PC3         PC4         PC5         PC6
## protein  -0.43020955 -0.3010297 -0.04226036  0.44085282 -0.14487747 -0.48172465
## fat      -0.19690483  0.3443189  0.33120825  0.73231694  0.01793745  0.35848190
## sodium    0.15661812 -0.3423062  0.53646747  0.04864795  0.65187105 -0.32868478
## fiber    -0.54478378 -0.2028145  0.02700826 -0.32522238  0.11669172  0.21861791
## carbo     0.36093074 -0.5096970 -0.01362805  0.16664513  0.08106192  0.61175926
## sugars    0.03876167  0.4982030  0.51689413 -0.29006145  0.03667417  0.01572122
## potass   -0.55911349 -0.1414933  0.16774992 -0.19764731  0.09673517  0.32758013
## vitamins  0.10221673 -0.3232657  0.55179984 -0.10126088 -0.72309213 -0.04241843
##                  PC7          PC8
## protein  -0.51645313  0.091606439
## fat       0.23992739  0.101438647
## sodium    0.18528599 -0.028858386
## fiber     0.17625558  0.680515306
## carbo    -0.43248908  0.118803916
## sugars   -0.60799612  0.166507396
## potass   -0.08922837 -0.689593448
## vitamins  0.21364994 -0.002523253
# Standard deviation and (cumulative) proportion of variance per component.
summary(pca)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6     PC7
## Standard deviation     1.6126 1.2751 1.2032 0.9435 0.80220 0.62008 0.58894
## Proportion of Variance 0.3251 0.2032 0.1810 0.1113 0.08044 0.04806 0.04336
## Cumulative Proportion  0.3251 0.5283 0.7093 0.8205 0.90098 0.94904 0.99240
##                           PC8
## Standard deviation     0.2466
## Proportion of Variance 0.0076
## Cumulative Proportion  1.0000
# Eigenvalue plot for the fitted components.
fviz_eig(pca)

# Variables projected onto the principal components.
fviz_pca_var(pca, col.var = "turquoise4")

# Individual observations drawn as points and coloured by cos2.
ind_gradient <- wes_palette("Royal2", 3, type = "discrete")
fviz_pca_ind(
  pca,
  col.ind = "cos2",
  geom = "point",
  gradient.cols = ind_gradient
)

At a glance, there are a few nutritional ingredients that could preserve most of the information. Let’s check the contributions of the individual variables:

# Variable-contribution plot for a single principal component.
plot_var_contrib <- function(axis) {
  fviz_contrib(pca, choice = "var", axes = axis)
}

# Contributions to the first and second components, arranged in one figure.
PC1 <- plot_var_contrib(1)
PC2 <- plot_var_contrib(2)
grid.arrange(PC1, PC2)

On the first plot, four ingredients (potass, fiber, protein and carbo) make the biggest contributions to PC1, whereas on the second plot just two ingredients, carbo and sugars, account for more than 50% of the contribution to PC2. And YES! Sugar is here.

How are the components clustered?

# Plot of the PCA with labelled loadings overlaid.
autoplot(
  pca,
  loadings = TRUE,
  loadings.colour = "rosybrown3",
  loadings.label = TRUE,
  loadings.label.size = 5,
  loadings.label.col = "turquoise4"
)

# NOTE(review): `clusters` is not created anywhere in the visible code, and
# `palette` refers to the grDevices function palette() unless it is assigned
# elsewhere - confirm both are defined before this call.
fviz_cluster(clusters, geom = "point", palette = palette)

Three clusters are easily detectable in this dataset. One of them contains sugar and fat, the ingredients that are considered the ‘bad’ ones. The remaining clusters, on the other hand, contain the rest of the ingredients, the ‘good’ ones.

CONCLUSIONS

Breakfast cereals are certainly tasty and eagerly eaten by most people, but if you want to maintain a healthy diet, don’t just read the front of the cereal box. The information about the contents is on the back of the package, and it is not as appealing as the front suggests. Dimensionality reduction techniques are very helpful in analyzing large datasets. But even on a small set like the one presented above, it is possible to detect the variables that retain most of the information contained in the original dataset.