This report analyzes the COVID-19 Healthy Diet Dataset [Ren]. The dataset contains information on 170 countries with 30 continuous numeric variables: 23 relate directly to food intake, 6 relate to health measures, and 1 gives the population of each country. This document contains sections with code for PCA, MDS, clustering and classification.
#load packages
library(MASS)
install.packages('aplpack')
library(aplpack)
library(scatterplot3d)
install.packages('corrplot')
library(corrplot)
## corrplot 0.92 loaded
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
install.packages('ggplot2')
library(ggplot2)
install.packages('ggcorrplot')
library(ggcorrplot)
install.packages('GGally')
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
theme_set(theme_bw())
#import data
raw_data <- read.csv("Food_Supply_Quantity_kg_Data.csv")
#get rid of NAs
d <- na.omit(raw_data)
#remove units column
d['Unit..all.except.Population.'] = NULL
#replace the "<2.5" entries in Undernourished with 2.5 and convert the column to numeric
d$Undernourished[d$Undernourished == "<2.5"] <- 2.5
d$Undernourished <- as.numeric(as.character(d$Undernourished))
#str(d)
corrplot(cor(d[,2:31]),tl.pos='n')
corrplot(cor(d[,2:31][1:10]),tl.cex=0.6)
#only consider food variables
d <- subset(d, select = 1:24)
rownames(d) <- d[,1]
d <- d[,-1]
#all food variables are percentages on the same scale, so we only centre the data (no rescaling)
X <- scale(d, center = TRUE, scale = FALSE)
Now we’ve got our data in a format which is suitable for PCA. We can see that the data is rich in correlations, which suggests PCA should be effective. We’ll now attempt to reduce the dimensionality using PCA.
pca <- prcomp(X)
# List of elements of PCA object
names(pca)
## [1] "sdev" "rotation" "center" "scale" "x"
#The function prcomp returns a list that includes the
#standard deviations of the principal components (square roots of eigenvalues),
#the rotation matrix (a.k.a., matrix of eigenvectors), as well as a matrix of scores x.
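# Quick sketch (not part of the original analysis): the scores can be recomputed by hand
# from the centred data and the rotation matrix, and the variance proportions reported by
# summary(pca) below follow directly from the standard deviations.
recomputed_scores <- scale(X, center = pca$center, scale = FALSE) %*% pca$rotation
all.equal(unname(pca$x), unname(recomputed_scores))  # TRUE up to floating-point tolerance
prop_var <- pca$sdev^2 / sum(pca$sdev^2)
round(head(prop_var, 4), 3)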
#install this package to help with visualisation
install.packages('factoextra')
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
factoextra::fviz_pca_biplot(pca,labelsize = 2, repel = TRUE)
#interpretation: points that are close together represent countries with similar food-intake profiles
factoextra::fviz_pca_var(pca, labelsize = 2, repel = TRUE, arrowsize = 0.1, col.ind = "red")
#interpretation: if two arrows are closely aligned, the corresponding variables are more strongly correlated
factoextra::fviz_pca_ind(pca, labelsize = 2, pointsize = 1, repel = TRUE)
#interpretation: close points correspond to similar countries
# Proportion of variance explained
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 10.9335 6.3383 4.21410 3.32052 2.59771 2.05560 1.65389
## Proportion of Variance 0.5788 0.1945 0.08599 0.05339 0.03267 0.02046 0.01324
## Cumulative Proportion 0.5788 0.7733 0.85934 0.91273 0.94540 0.96586 0.97911
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 1.39990 1.08694 0.60550 0.48681 0.4318 0.39717 0.26849
## Proportion of Variance 0.00949 0.00572 0.00178 0.00115 0.0009 0.00076 0.00035
## Cumulative Proportion 0.98860 0.99432 0.99609 0.99724 0.9981 0.99891 0.99926
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.21730 0.18605 0.15907 0.13365 0.12856 0.10976 0.003553
## Proportion of Variance 0.00023 0.00017 0.00012 0.00009 0.00008 0.00006 0.000000
## Cumulative Proportion 0.99948 0.99965 0.99978 0.99986 0.99994 1.00000 1.000000
## PC22 PC23
## Standard deviation 0.0003278 2.754e-05
## Proportion of Variance 0.0000000 0.000e+00
## Cumulative Proportion 1.0000000 1.000e+00
plot(cumsum(pca$sdev^2 / sum(pca$sdev^2)), type="b",main="Plot showing the number of PCs against the cumulative variance",cex.main=0.9,
xlab="Number of PCs", ylab="Cumulative Variance"
)
We’ve performed PCA and reduced the data to 4 principal components while
retaining over 90% of the variance. Now we’ll explore MDS and see if we
can draw any conclusions.
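To make that reduction explicit, the first four principal component scores can be pulled straight out of the prcomp object for any later analysis; a minimal sketch (the name X_reduced is just an illustrative choice):
# Keep the scores on the first 4 PCs (>90% of total variance per summary(pca))
X_reduced <- pca$x[, 1:4]
dim(X_reduced)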
d1 <- dist(d) # euclidean distances between the rows
# Apply classical MDS
mds1 <- cmdscale(d = d1, eig = TRUE)
# eigenvalues
eigen <- mds1$eig
# Goodness of fit criterion based on absolute value
goodness<- cumsum(abs(mds1$eig/sum(abs(mds1$eig))))
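# The goodness vector is never printed above, so as a quick check (sketch) we can
# look at how well two dimensions represent the distances before interpreting the plot
round(goodness[1:2], 3)
mds1$GOF  # cmdscale's own goodness-of-fit criteria for the returned 2-D configuration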
# Plot 2d classical MDS configuration
cmd_2d <- mds1$points
#make a plot
mdsoutplot <- plot(cmd_2d, type = "n", asp = 1, xlab = "MDS dim 1", ylab = "MDS dim 2")
text(cmd_2d, labels = rownames(cmd_2d))
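# Side note (sketch): classical MDS on Euclidean distances is equivalent to PCA on the
# centred data, so the 2-D configuration should match the first two PC scores up to sign
cor(cmd_2d[, 1], pca$x[rownames(cmd_2d), 1])  # expect approximately +1 or -1
cor(cmd_2d[, 2], pca$x[rownames(cmd_2d), 2])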
# Compute minimum spanning tree
tree <- ape::mst(d1)
# Create plot
mdsout <- plot(cmd_2d, type = "p", asp = 1,
xlab = "MDS dim 1", ylab = "MDS dim 2")
# Loop over countries
for(i in 1:nrow(tree)) {
# Identify countries adjacent to country i in the minimum spanning tree
which_adj <- which(tree[i,] == 1)
# Add a segment between countries i and j if they are adjacent
segments(x0 = cmd_2d[i,1],
y0 = cmd_2d[i,2],
x1 = cmd_2d[which_adj,1],
y1 = cmd_2d[which_adj,2],
col = "grey")
}
#k-means
# Compute the total within-cluster SSE for a range of K to build an elbow plot
set.seed(123)
k <- 2:10
sse <- numeric(length(k))
for(i in k) {
km <- kmeans(X, centers=i, nstart=25)
sse[i-1] <- km$tot.withinss
}
# Plot elbow plot: SSE always decreases as K grows, so we look for the "elbow"
# where the curve flattens rather than the minimum SSE
plot(k, sse, type="b", pch=19, frame=FALSE, xlab="Number of clusters K", ylab="SSE")
#abline(v=4, col="red", lty=2)  # optionally mark the chosen elbow
# Choose the number of clusters
n_clusters <- 4
# Perform k-means clustering
set.seed(123)
kmeans_fit <- kmeans(X, centers = n_clusters, nstart = 25)
cluster = kmeans_fit$cluster
fviz_cluster(list(data = X, cluster = cluster), ellipse.type = "t", geom = "point", stand = FALSE)
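# Sanity check on K = 4 (a sketch, assuming the cluster package is installed;
# it is not used elsewhere in this report): compare average silhouette widths across K
library(cluster)
avg_sil <- sapply(2:10, function(kk) {
  km <- kmeans(X, centers = kk, nstart = 25)
  mean(silhouette(km$cluster, dist(X))[, "sil_width"])
})
plot(2:10, avg_sil, type = "b", xlab = "Number of clusters K", ylab = "Average silhouette width")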
# Load required packages
library(maps)
library(mapdata)
library(ggplot2)
# Create a data frame with the country names and their clusters
country_clusters <- data.frame(region = row.names(X), cluster = cluster)
# Load the world map data
map.world <- map_data("world")
# Merge the country clusters with the world map data
map.world <- merge(map.world, country_clusters, by = "region")
# Plot the map with the clusters colored by cluster number
ggplot(map.world, aes(x = long, y = lat, group = group, fill = factor(cluster))) +
geom_polygon() +
scale_fill_discrete(name = "Cluster") +
theme_void() +
theme(legend.position = "bottom")
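# Caveat (sketch): merge() keeps only countries whose names match map_data("world") exactly,
# so names such as "United States of America" (which the map data calls "USA") silently drop
# out of the plot. An illustrative, non-exhaustive fix is to recode mismatched names first:
name_fixes <- c("United States of America" = "USA",
                "United Kingdom" = "UK",
                "Korea, South" = "South Korea")
country_clusters$region <- ifelse(country_clusters$region %in% names(name_fixes),
                                  name_fixes[country_clusters$region],
                                  country_clusters$region)
# Any countries still missing from the map data after recoding
setdiff(country_clusters$region, map_data("world")$region)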
#We can use the same code to get results for 6 clusters.
#hierarchical clustering
# Perform hierarchical clustering (complete linkage by default) on Euclidean distances
hc <- hclust(dist(X))
# Plot the dendrogram
plot(hc)
# Cut the dendrogram into k clusters
k <- 4
clusters <- cutree(hc, k)
# Print the number of countries in each cluster
table(clusters)
## clusters
## 1 2 3 4
## 25 45 66 18
# Print the countries in each cluster
for (i in 1:k) {
cat("Cluster ", i, ":\n")
print(rownames(X)[which(clusters == i)])
cat("\n")
}
## Cluster 1 :
## [1] "Afghanistan" "Bangladesh"
## [3] "Burkina Faso" "Cambodia"
## [5] "Chad" "Djibouti"
## [7] "Ethiopia" "Gambia"
## [9] "Guinea-Bissau" "Indonesia"
## [11] "Iraq" "Lao People's Democratic Republic"
## [13] "Lesotho" "Madagascar"
## [15] "Nepal" "Niger"
## [17] "Philippines" "Senegal"
## [19] "Sierra Leone" "Sri Lanka"
## [21] "Thailand" "Timor-Leste"
## [23] "Yemen" "Zambia"
## [25] "Zimbabwe"
##
## Cluster 2 :
## [1] "Albania" "Argentina"
## [3] "Australia" "Austria"
## [5] "Belgium" "Brazil"
## [7] "Bulgaria" "Costa Rica"
## [9] "Croatia" "Cyprus"
## [11] "Czechia" "Denmark"
## [13] "Estonia" "Finland"
## [15] "France" "Germany"
## [17] "Greece" "Hungary"
## [19] "Iceland" "Ireland"
## [21] "Israel" "Italy"
## [23] "Kazakhstan" "Latvia"
## [25] "Lithuania" "Luxembourg"
## [27] "Mongolia" "Montenegro"
## [29] "Netherlands" "New Zealand"
## [31] "Norway" "Pakistan"
## [33] "Poland" "Portugal"
## [35] "Romania" "Russian Federation"
## [37] "Serbia" "Slovakia"
## [39] "Slovenia" "Spain"
## [41] "Sweden" "Switzerland"
## [43] "United Kingdom" "United States of America"
## [45] "Uruguay"
##
## Cluster 3 :
## [1] "Algeria" "Armenia"
## [3] "Azerbaijan" "Barbados"
## [5] "Belarus" "Belize"
## [7] "Bolivia" "Bosnia and Herzegovina"
## [9] "Botswana" "Cabo Verde"
## [11] "China" "Colombia"
## [13] "Cuba" "Dominica"
## [15] "Dominican Republic" "Ecuador"
## [17] "Egypt" "El Salvador"
## [19] "Eswatini" "Fiji"
## [21] "Gabon" "Georgia"
## [23] "Guatemala" "Guyana"
## [25] "Honduras" "India"
## [27] "Iran (Islamic Republic of)" "Jamaica"
## [29] "Japan" "Jordan"
## [31] "Kenya" "Korea, South"
## [33] "Kuwait" "Kyrgyzstan"
## [35] "Lebanon" "Malaysia"
## [37] "Maldives" "Mali"
## [39] "Malta" "Mauritania"
## [41] "Mauritius" "Mexico"
## [43] "Morocco" "Namibia"
## [45] "Nicaragua" "North Macedonia"
## [47] "Oman" "Panama"
## [49] "Paraguay" "Peru"
## [51] "Saint Vincent and the Grenadines" "Samoa"
## [53] "Sao Tome and Principe" "Saudi Arabia"
## [55] "South Africa" "Sudan"
## [57] "Suriname" "Trinidad and Tobago"
## [59] "Tunisia" "Turkey"
## [61] "Ukraine" "United Arab Emirates"
## [63] "Uzbekistan" "Vanuatu"
## [65] "Venezuela (Bolivarian Republic of)" "Vietnam"
##
## Cluster 4 :
## [1] "Angola" "Benin"
## [3] "Cameroon" "Central African Republic"
## [5] "Congo" "Cote d'Ivoire"
## [7] "Ghana" "Guinea"
## [9] "Haiti" "Liberia"
## [11] "Malawi" "Mozambique"
## [13] "Nigeria" "Rwanda"
## [15] "Solomon Islands" "Togo"
## [17] "Uganda" "United Republic of Tanzania"
fviz_cluster(list(data = X, cluster = clusters), ellipse.type = "t", geom = "point", stand = FALSE)
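# Comparison with the K = 4 k-means solution above (sketch): cross-tabulate the two labelings.
# Label numbers are arbitrary, so agreement shows up as one dominant cell per row
table(kmeans = cluster, hierarchical = clusters)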
#The same code as above can be used to generate the other country plots
#Add the covid deaths to the data set since we previously removed it
raw_data2 <- read.csv("Food_Supply_Quantity_kg_Data.csv")
d2 <- na.omit(raw_data2)
deaths <- d2$Deaths
#convert deaths into a categorical variable with low, medium and high levels (tertile cut)
quantiles <- quantile(deaths, c(0.33, 0.67))
categories <- cut(deaths, breaks = c(-Inf, quantiles, Inf), labels = c("low", "medium", "high"))
categories<- data.frame(categories)
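# Quick check (sketch) that the tertile cut gives roughly balanced classes
table(categories$categories)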
#Now do classification
# Load the rpart package
library(rpart)
# Set the seed for reproducibility
set.seed(123)
#add categories to X
X_withdead <- cbind(X,categories)
# Split the data into training and test sets
train_index <- sample(nrow(X_withdead), nrow(X_withdead)*0.7, replace = FALSE)
X_train <- as.data.frame(X_withdead[train_index, ])
X_test <- as.data.frame(X_withdead[-train_index, ])
# Train a decision tree on the training set
tree_model <- rpart(categories ~ ., data = X_train, method = "class")
# Make predictions on the test set
test_predictions <- predict(tree_model, data.frame(X_test), type = "class")
# Create a confusion matrix to evaluate the performance
confusion_matrix <- table(test_predictions, X_test$categories)
print(confusion_matrix)
##
## test_predictions low medium high
## low 10 9 4
## medium 2 3 2
## high 1 6 10
# Calculate overall accuracy
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Overall Accuracy:", round(accuracy, 2)))
## [1] "Overall Accuracy: 0.49"
# Calculate precision for each category
# (rows of the confusion matrix are predictions, so precision = diag / row sums)
precision <- diag(confusion_matrix) / rowSums(confusion_matrix)
names(precision) <- levels(X_train$categories)
print(precision)
## low medium high
## 0.4347826 0.4285714 0.5882353
# Calculate recall (sensitivity) for each category
# (columns are the true classes, so recall = diag / column sums)
recall <- diag(confusion_matrix) / colSums(confusion_matrix)
names(recall) <- levels(X_train$categories)
print(recall)
## low medium high
## 0.7692308 0.1666667 0.6250000
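# For context (sketch): compare against a naive baseline that always predicts
# the most frequent true class in the test set
baseline <- max(colSums(confusion_matrix)) / sum(confusion_matrix)
print(paste("Majority-class baseline accuracy:", round(baseline, 2)))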
# Plot the decision tree
install.packages("rpart.plot")
library(rpart.plot)
rpart.plot(tree_model, type = 3, extra = 101, under = TRUE, fallen.leaves = TRUE)
Now we’ll perform classification but with only two classes to see if it
improves performance.
#There may not be enough data to separate three categories reliably,
#so instead try to classify just two levels: low and high
quantiles1 <- quantile(deaths, c(0.5))
categories1 <- cut(deaths, breaks = c(-Inf, quantiles1, Inf), labels = c("low", "high"))
categories1<- data.frame(categories1)
#add categories to X
X_withdead1 <- cbind(X,categories1)
# Split the data into training and test sets
train_index <- sample(nrow(X_withdead1), nrow(X_withdead1)*0.7, replace = FALSE)
X_train <- as.data.frame(X_withdead1[train_index, ])
X_test <- as.data.frame(X_withdead1[-train_index, ])
# Train a decision tree on the training set
tree_model <- rpart(categories1 ~ ., data = X_train, method = "class")
# Make predictions on the test set
test_predictions <- predict(tree_model, data.frame(X_test), type = "class")
# Create a confusion matrix to evaluate the performance
confusion_matrix <- table(test_predictions, X_test$categories1)
print(confusion_matrix)
##
## test_predictions low high
## low 16 9
## high 4 18
# Calculate overall accuracy
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Overall Accuracy:", round(accuracy, 2)))
## [1] "Overall Accuracy: 0.72"
# Calculate precision for each category (precision = diag / row sums; rows are predictions)
precision <- diag(confusion_matrix) / rowSums(confusion_matrix)
names(precision) <- levels(X_train$categories1)
print(precision)
## low high
## 0.6400000 0.8181818
# Calculate recall (sensitivity) for each category (recall = diag / column sums; columns are truth)
recall <- diag(confusion_matrix) / colSums(confusion_matrix)
names(recall) <- levels(X_train$categories1)
print(recall)
## low high
## 0.8000000 0.6666667
# Plot the decision tree
library(rpart.plot)
rpart.plot(tree_model, type = 3, extra = 101, under = TRUE, fallen.leaves = TRUE)
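A single 70/30 split on this relatively small dataset leaves a small test set, so the apparent improvement from using two classes is worth checking under resampling. Below is a minimal sketch of a repeated-split accuracy estimate (the number of repeats, 50, is an arbitrary illustrative choice).
# Repeated random 70/30 splits for a more stable estimate of two-class test accuracy
set.seed(123)
accs <- replicate(50, {
  idx <- sample(nrow(X_withdead1), nrow(X_withdead1) * 0.7)
  fit <- rpart(categories1 ~ ., data = X_withdead1[idx, ], method = "class")
  pred <- predict(fit, X_withdead1[-idx, ], type = "class")
  mean(pred == X_withdead1$categories1[-idx])
})
round(c(mean = mean(accs), sd = sd(accs)), 2)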