Introduction

This report analyzes the COVID-19 Healthy Diet Dataset [Ren]. The dataset covers 170 countries and contains 30 continuous numeric variables: 23 relate directly to food intake, 6 describe health measures, and 1 gives each country's population. The sections below present code and results for PCA, MDS, clustering and classification.

#load packages (install any that are missing once, outside the report,
#e.g. install.packages('aplpack'))
library(MASS)
library(aplpack)
library(scatterplot3d)
library(corrplot)
library(plotly)   # masks MASS::select, stats::filter and graphics::layout
library(ggplot2)
library(ggcorrplot)
library(GGally)
theme_set(theme_bw())

Data processing and exploration

#import data
raw_data <- read.csv("Food_Supply_Quantity_kg_Data.csv")

#remove rows with missing values
d <- na.omit(raw_data)

#remove the units column
d$Unit..all.except.Population. <- NULL

#Undernourished is read as character because of the "<2.5" entries;
#replace these with 2.5 and convert the column to numeric
d$Undernourished[d$Undernourished == "<2.5"] <- "2.5"
d$Undernourished <- as.numeric(d$Undernourished)

#str(d)
#correlation structure of the 30 numeric variables (labels suppressed)
corrplot(cor(d[,2:31]),tl.pos='n')

#zoom in on the first 10 variables
corrplot(cor(d[,2:31][1:10]),tl.cex=0.6)

#only consider the country names and the 23 food-intake variables
d <- subset(d, select = 1:24)

#use country names as row names
rownames(d) <- d[,1]
d <- d[,-1]

#the variables are all percentages on a common scale,
#so centre the data but do not rescale it
X <- scale(d, center = TRUE, scale = FALSE)

Our data is now in a format suitable for PCA. The correlation plots show that the variables are strongly interrelated, which suggests PCA should be effective; we will now use it to reduce the dimension.
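As a rough check of that claim (a sketch; the 0.3 cutoff is an arbitrary choice made here, not part of the original analysis), we can compute the share of variable pairs with a non-trivial correlation:

cors <- cor(d)
mean(abs(cors[upper.tri(cors)]) > 0.3)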

PCA

pca <- prcomp(X)

# List of elements of PCA object
names(pca)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
#The function prcomp returns a list that includes the 
#standard deviations of the principal components (square roots of eigenvalues),
#the rotation matrix (a.k.a., matrix of eigenvectors), as well as a matrix of scores x.
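#sanity check (a sketch, not part of the original analysis): the scores
#should equal the centred data multiplied by the rotation matrix
all.equal(unname(pca$x), unname(X %*% pca$rotation))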

#load factoextra to help with visualisation (install once if needed)
library(factoextra)
factoextra::fviz_pca_biplot(pca,labelsize = 2, repel = TRUE)

#interpretation: countries plotted close together have similar diets

factoextra::fviz_pca_var(pca, labelsize = 2,repel = TRUE,arrowsize = 0.1,col.ind = "red")

#interpretation
#variables whose arrows are aligned are strongly positively correlated

factoextra::fviz_pca_ind(pca, labelsize = 2,pointsize = 1,repel = TRUE)

#interpretation
#countries plotted close together are similar




# Proportion of variance explained
summary(pca)
## Importance of components:
##                            PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     10.9335 6.3383 4.21410 3.32052 2.59771 2.05560 1.65389
## Proportion of Variance  0.5788 0.1945 0.08599 0.05339 0.03267 0.02046 0.01324
## Cumulative Proportion   0.5788 0.7733 0.85934 0.91273 0.94540 0.96586 0.97911
##                            PC8     PC9    PC10    PC11   PC12    PC13    PC14
## Standard deviation     1.39990 1.08694 0.60550 0.48681 0.4318 0.39717 0.26849
## Proportion of Variance 0.00949 0.00572 0.00178 0.00115 0.0009 0.00076 0.00035
## Cumulative Proportion  0.98860 0.99432 0.99609 0.99724 0.9981 0.99891 0.99926
##                           PC15    PC16    PC17    PC18    PC19    PC20     PC21
## Standard deviation     0.21730 0.18605 0.15907 0.13365 0.12856 0.10976 0.003553
## Proportion of Variance 0.00023 0.00017 0.00012 0.00009 0.00008 0.00006 0.000000
## Cumulative Proportion  0.99948 0.99965 0.99978 0.99986 0.99994 1.00000 1.000000
##                             PC22      PC23
## Standard deviation     0.0003278 2.754e-05
## Proportion of Variance 0.0000000 0.000e+00
## Cumulative Proportion  1.0000000 1.000e+00
plot(cumsum(pca$sdev^2 / sum(pca$sdev^2)), type="b",
     main="Cumulative proportion of variance against the number of PCs", cex.main=0.9,
     xlab="Number of PCs", ylab="Cumulative proportion of variance")

We have performed PCA: the first four principal components retain over 90% of the total variance, reducing the 23 food variables to 4 derived columns. Next we explore MDS and see whether it supports similar conclusions.
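To carry the reduced representation forward, the first four component scores can be extracted directly from the prcomp object; scores4 below is a name introduced here as a sketch, not one used elsewhere in this report.

# First four PC scores: one row per country, four derived columns
scores4 <- pca$x[, 1:4]
dim(scores4)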

MDS

d1 <- dist(d) # Euclidean distances between the rows (countries)
# Apply classical MDS (2 dimensions by default)
mds1 <- cmdscale(d = d1, eig = TRUE)
# eigenvalues
eigen <- mds1$eig

# Goodness-of-fit criterion based on the absolute eigenvalues
goodness <- cumsum(abs(mds1$eig/sum(abs(mds1$eig))))
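# Inspect the fit (a quick check, output not shown in the original):
# the share of absolute eigenvalue mass carried by the first two
# dimensions, and cmdscale's own goodness-of-fit statistics
goodness[1:2]
mds1$GOF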


# Plot 2d classical MDS configuration
cmd_2d <- mds1$points
plot(cmd_2d, type = "n", asp = 1, xlab = "MDS dim 1", ylab = "MDS dim 2")
text(cmd_2d, labels = rownames(cmd_2d))

# Compute minimum spanning tree on the full distance matrix
tree <- ape::mst(d1)

# Re-draw the configuration with the tree overlaid; countries joined by
# short edges are genuinely close in the full-dimensional space
plot(cmd_2d, type = "p", asp = 1, 
     xlab = "MDS dim 1", ylab = "MDS dim 2")

# Loop over countries
for(i in 1:nrow(tree)) {
  # Identify countries adjacent to country i in the tree
  which_adj <- which(tree[i,] == 1)
  
  # Add a segment between countries i and j if adjacent
  segments(x0 = cmd_2d[i,1], 
           y0 = cmd_2d[i,2], 
           x1 = cmd_2d[which_adj,1], 
           y1 = cmd_2d[which_adj,2], 
           col = "grey")
}

Clustering

#k-means

# Compute the total within-cluster sum of squares for K = 2..10
set.seed(123)
k <- 2:10
sse <- numeric(length(k))
for(i in k) {
  km <- kmeans(X, centers=i, nstart=25)
  sse[i-1] <- km$tot.withinss
}

# Elbow plot: the SSE decreases monotonically with K, so rather than the
# minimum we look for the "elbow" where the decrease levels off
plot(k, sse, type="b", pch=19, frame=FALSE, xlab="Number of clusters K", ylab="SSE")
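The elbow heuristic is subjective. As a complementary check, here is a sketch using the average silhouette width; it assumes the cluster package, which is not used elsewhere in this report.

library(cluster)
dX <- dist(X)
avg_sil <- sapply(2:10, function(kk) {
  km <- kmeans(X, centers = kk, nstart = 25)
  mean(silhouette(km$cluster, dX)[, 3])
})
# Higher average silhouette width indicates better-separated clusters
plot(2:10, avg_sil, type = "b", xlab = "Number of clusters K",
     ylab = "Average silhouette width")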

# Choose the number of clusters from the elbow in the plot above
n_clusters <- 4

# Perform k-means clustering
set.seed(123)
kmeans_fit <- kmeans(X, centers = n_clusters, nstart = 25)
cluster <- kmeans_fit$cluster
fviz_cluster(list(data = X, cluster = cluster), ellipse.type = "t", geom = "point", stand = FALSE)

# Load required packages
library(maps)
library(mapdata)
library(ggplot2)

# Create a data frame with the country names and their clusters
country_clusters <- data.frame(region = row.names(X), cluster = cluster)

# Load the world map data
map.world <- map_data("world")

# Merge the country clusters with the world map data
map.world <- merge(map.world, country_clusters, by = "region")

# Plot the map with the clusters colored by cluster number
ggplot(map.world, aes(x = long, y = lat, group = group, fill = factor(cluster))) +
  geom_polygon() +
  scale_fill_discrete(name = "Cluster") +
  theme_void() +
  theme(legend.position = "bottom")
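One caveat: merge() keeps only exact name matches, and map_data("world") spells several countries differently from our dataset (for example "United States of America" versus "USA"), so those countries are silently dropped from the map. A partial fix, as a sketch: recode the known mismatches before merging (this assumes the dplyr package and covers only a few cases, not all of them).

library(dplyr)
country_clusters$region <- recode(country_clusters$region,
  "United States of America" = "USA",
  "United Kingdom"           = "UK",
  "Korea, South"             = "South Korea",
  "Russian Federation"       = "Russia")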

#The same code can be re-run with centers = 6 to get results for 6 clusters; a small helper sketch follows.
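To avoid copy-pasting, the fitting-and-plotting step can be wrapped in a small helper; plot_kmeans_clusters below is a name introduced here as a sketch, not a function used elsewhere in this report.

plot_kmeans_clusters <- function(k) {
  set.seed(123)
  fit <- kmeans(X, centers = k, nstart = 25)
  fviz_cluster(list(data = X, cluster = fit$cluster),
               ellipse.type = "t", geom = "point", stand = FALSE)
}
plot_kmeans_clusters(6)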

Hierarchical Clustering

#clustering hierarchical

# Perform clustering using hierarchical clustering
hc <- hclust(dist(X))

# Plot the dendrogram
plot(hc)

# Cut the dendrogram into k clusters
k <- 4
clusters <- cutree(hc, k)

# Print the number of countries in each cluster
table(clusters)
## clusters
##  1  2  3  4 
## 25 45 66 18
# Print the countries in each cluster
for (i in 1:k) {
  cat("Cluster ", i, ":\n")
  print(rownames(X)[which(clusters == i)])
  cat("\n")
}
## Cluster  1 :
##  [1] "Afghanistan"                      "Bangladesh"                      
##  [3] "Burkina Faso"                     "Cambodia"                        
##  [5] "Chad"                             "Djibouti"                        
##  [7] "Ethiopia"                         "Gambia"                          
##  [9] "Guinea-Bissau"                    "Indonesia"                       
## [11] "Iraq"                             "Lao People's Democratic Republic"
## [13] "Lesotho"                          "Madagascar"                      
## [15] "Nepal"                            "Niger"                           
## [17] "Philippines"                      "Senegal"                         
## [19] "Sierra Leone"                     "Sri Lanka"                       
## [21] "Thailand"                         "Timor-Leste"                     
## [23] "Yemen"                            "Zambia"                          
## [25] "Zimbabwe"                        
## 
## Cluster  2 :
##  [1] "Albania"                  "Argentina"               
##  [3] "Australia"                "Austria"                 
##  [5] "Belgium"                  "Brazil"                  
##  [7] "Bulgaria"                 "Costa Rica"              
##  [9] "Croatia"                  "Cyprus"                  
## [11] "Czechia"                  "Denmark"                 
## [13] "Estonia"                  "Finland"                 
## [15] "France"                   "Germany"                 
## [17] "Greece"                   "Hungary"                 
## [19] "Iceland"                  "Ireland"                 
## [21] "Israel"                   "Italy"                   
## [23] "Kazakhstan"               "Latvia"                  
## [25] "Lithuania"                "Luxembourg"              
## [27] "Mongolia"                 "Montenegro"              
## [29] "Netherlands"              "New Zealand"             
## [31] "Norway"                   "Pakistan"                
## [33] "Poland"                   "Portugal"                
## [35] "Romania"                  "Russian Federation"      
## [37] "Serbia"                   "Slovakia"                
## [39] "Slovenia"                 "Spain"                   
## [41] "Sweden"                   "Switzerland"             
## [43] "United Kingdom"           "United States of America"
## [45] "Uruguay"                 
## 
## Cluster  3 :
##  [1] "Algeria"                            "Armenia"                           
##  [3] "Azerbaijan"                         "Barbados"                          
##  [5] "Belarus"                            "Belize"                            
##  [7] "Bolivia"                            "Bosnia and Herzegovina"            
##  [9] "Botswana"                           "Cabo Verde"                        
## [11] "China"                              "Colombia"                          
## [13] "Cuba"                               "Dominica"                          
## [15] "Dominican Republic"                 "Ecuador"                           
## [17] "Egypt"                              "El Salvador"                       
## [19] "Eswatini"                           "Fiji"                              
## [21] "Gabon"                              "Georgia"                           
## [23] "Guatemala"                          "Guyana"                            
## [25] "Honduras"                           "India"                             
## [27] "Iran (Islamic Republic of)"         "Jamaica"                           
## [29] "Japan"                              "Jordan"                            
## [31] "Kenya"                              "Korea, South"                      
## [33] "Kuwait"                             "Kyrgyzstan"                        
## [35] "Lebanon"                            "Malaysia"                          
## [37] "Maldives"                           "Mali"                              
## [39] "Malta"                              "Mauritania"                        
## [41] "Mauritius"                          "Mexico"                            
## [43] "Morocco"                            "Namibia"                           
## [45] "Nicaragua"                          "North Macedonia"                   
## [47] "Oman"                               "Panama"                            
## [49] "Paraguay"                           "Peru"                              
## [51] "Saint Vincent and the Grenadines"   "Samoa"                             
## [53] "Sao Tome and Principe"              "Saudi Arabia"                      
## [55] "South Africa"                       "Sudan"                             
## [57] "Suriname"                           "Trinidad and Tobago"               
## [59] "Tunisia"                            "Turkey"                            
## [61] "Ukraine"                            "United Arab Emirates"              
## [63] "Uzbekistan"                         "Vanuatu"                           
## [65] "Venezuela (Bolivarian Republic of)" "Vietnam"                           
## 
## Cluster  4 :
##  [1] "Angola"                      "Benin"                      
##  [3] "Cameroon"                    "Central African Republic"   
##  [5] "Congo"                       "Cote d'Ivoire"              
##  [7] "Ghana"                       "Guinea"                     
##  [9] "Haiti"                       "Liberia"                    
## [11] "Malawi"                      "Mozambique"                 
## [13] "Nigeria"                     "Rwanda"                     
## [15] "Solomon Islands"             "Togo"                       
## [17] "Uganda"                      "United Republic of Tanzania"
fviz_cluster(list(data = X, cluster = clusters), ellipse.type = "t", geom = "point", stand = FALSE)
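Since the hierarchical solution also uses four clusters, it is natural to ask how closely it agrees with the k-means partition. A quick cross-tabulation is a simple check (a sketch, output not shown; cluster labels are arbitrary, so agreement shows up as rows dominated by a single cell):

table(kmeans = cluster, hclust = clusters)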

#The same code as above can be used to generate the corresponding country plots

Classification

#Re-derive the COVID death rates, which we removed from the data set earlier

raw_data2 <- read.csv("Food_Supply_Quantity_kg_Data.csv")
d2 <- na.omit(raw_data2)
deaths <- d2$Deaths

#turn deaths into a categorical variable with low, medium and high tertiles
quantiles <- quantile(deaths, c(0.33, 0.67))
categories <- cut(deaths, breaks = c(-Inf, quantiles, Inf), labels = c("low", "medium", "high"))
categories <- data.frame(categories)
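#quick check (a sketch, output not shown): the tertile split should give
#roughly equal numbers of low, medium and high countries
table(categories$categories)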

#Now do classification


# Load the rpart package
library(rpart)

# Set the seed for reproducibility
set.seed(123)

#add categories to X

X_withdead <- cbind(X,categories)

# Split the data into training and test sets
train_index <- sample(nrow(X_withdead), nrow(X_withdead)*0.7, replace = FALSE)
X_train <- as.data.frame(X_withdead[train_index, ])
X_test <- as.data.frame(X_withdead[-train_index, ])

# Train a decision tree on the training set
tree_model <- rpart(categories ~ ., data = X_train, method = "class")

# Make predictions on the test set
test_predictions <- predict(tree_model, X_test, type = "class")

# Create a confusion matrix to evaluate the performance
confusion_matrix <- table(test_predictions, X_test$categories)
print(confusion_matrix)
##                 
## test_predictions low medium high
##           low     10      9    4
##           medium   2      3    2
##           high     1      6   10
# Calculate overall accuracy
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Overall Accuracy:", round(accuracy, 2)))
## [1] "Overall Accuracy: 0.49"
# Calculate recall (sensitivity) for each category: with predictions in the
# rows and the truth in the columns, this is diag / column totals
recall <- diag(confusion_matrix) / colSums(confusion_matrix)
names(recall) <- levels(X_train$categories)
print(recall)
##       low    medium      high 
## 0.7692308 0.1666667 0.6250000
# Calculate precision for each category: diag / row totals
precision <- diag(confusion_matrix) / rowSums(confusion_matrix)
names(precision) <- levels(X_train$categories)
print(precision)
##       low    medium      high 
## 0.4347826 0.4285714 0.5882353
# Plot the decision tree (install rpart.plot once if needed)
library(rpart.plot)
rpart.plot(tree_model, type = 3, extra = 101, under = TRUE, fallen.leaves = TRUE)

Now we’ll perform classification but with only two classes to see if it improves performance.

#With only ~150 countries we lack the data for reliable three-class prediction,
#so try a simpler binary split into low and high instead
quantiles1 <- quantile(deaths, c(0.5))
categories1 <- cut(deaths, breaks = c(-Inf, quantiles1, Inf), labels = c("low", "high"))
categories1<- data.frame(categories1)


#add categories to X

X_withdead1 <- cbind(X,categories1)

# Split the data into training and test sets
train_index <- sample(nrow(X_withdead1), nrow(X_withdead1)*0.7, replace = FALSE)
X_train <- as.data.frame(X_withdead1[train_index, ])
X_test <- as.data.frame(X_withdead1[-train_index, ])

# Train a decision tree on the training set
tree_model <- rpart(categories1 ~ ., data = X_train, method = "class")

# Make predictions on the test set
test_predictions <- predict(tree_model, X_test, type = "class")

# Create a confusion matrix to evaluate the performance
confusion_matrix <- table(test_predictions, X_test$categories1)
print(confusion_matrix)
##                 
## test_predictions low high
##             low   16    9
##             high   4   18
# Calculate overall accuracy
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Overall Accuracy:", round(accuracy, 2)))
## [1] "Overall Accuracy: 0.72"
# Calculate recall (sensitivity) for each category: diag / column totals
recall <- diag(confusion_matrix) / colSums(confusion_matrix)
names(recall) <- levels(X_train$categories1)
print(recall)
##       low      high 
## 0.8000000 0.6666667
# Calculate precision for each category: diag / row totals
precision <- diag(confusion_matrix) / rowSums(confusion_matrix)
names(precision) <- levels(X_train$categories1)
print(precision)
##       low      high 
## 0.6400000 0.8181818
# Plot the decision tree
library(rpart.plot)
rpart.plot(tree_model, type = 3, extra = 101, under = TRUE, fallen.leaves = TRUE)
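At 0.72 overall accuracy against a 0.5 chance baseline from the median split, the two-class tree is a clear improvement on the three-class one. Accuracy from a single 30% holdout of roughly 47 countries is noisy, though, so rpart's built-in cross-validation table offers a useful complementary check (a sketch, output not shown):

printcp(tree_model)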