Combine Clustering and Dimension Reduction in Health Survey in Covid-19

1. Project introduction

In this project, I used data from a survey of Covid-19’s impact to student life in Dimension reduction and Clustering data in Kaggle (https://www.kaggle.com/kunal28chaturvedi/covid19-and-its-impact-on-students). The project includes 8 parts:
- Data proceessing
- Analysis of correlation
- MDS
- PCA
- Clustering
- Data analysis

2. Data processing

covidSurvey <- read.csv('COVID-19 Survey Student Responses.csv',na.strings = 'NA')
covidSurvey <- na.omit(covidSurvey)
covidSurvey$ID <- NULL # Remove unnecessary columns
covidSurvey$Region.of.residence <- NULL # Remove unnecessary columns
rating <- data.frame(rate = unique(covidSurvey$Rating.of.Online.Class.experience), online.class.rate = c(1,2,-2,0,-1))
covidSurvey.healthissues <- as.data.frame(covidSurvey$Health.issue.during.lockdown)
covidSurvey.weightchange <- as.data.frame(covidSurvey$Change.in.your.weight)
covidSurvey <- merge(covidSurvey, rating, by.x = 'Rating.of.Online.Class.experience', by.y = 'rate')
colnames(covidSurvey)

##  [1] "Rating.of.Online.Class.experience"                                                 
##  [2] "Age.of.Subject"                                                                    
##  [3] "Time.spent.on.Online.Class"                                                        
##  [4] "Medium.for.online.class"                                                           
##  [5] "Time.spent.on.self.study"                                                          
##  [6] "Time.spent.on.fitness"                                                             
##  [7] "Time.spent.on.sleep"                                                               
##  [8] "Time.spent.on.social.media"                                                        
##  [9] "Prefered.social.media.platform"                                                    
## [10] "Time.spent.on.TV"                                                                  
## [11] "Number.of.meals.per.day"                                                           
## [12] "Change.in.your.weight"                                                             
## [13] "Health.issue.during.lockdown"                                                      
## [14] "Stress.busters"                                                                    
## [15] "Time.utilized"                                                                     
## [16] "Do.you.find.yourself.more.connected.with.your.family..close.friends...relatives..."
## [17] "What.you.miss.the.most"                                                            
## [18] "online.class.rate"

covidSurvey <- covidSurvey[,c(2,3,5,6,7,8,10,11,18)] # take only number variables for dimension MDS and PCA
covidSurvey$Time.spent.on.TV <- as.numeric(covidSurvey$Time.spent.on.TV)

summary(covidSurvey)

##  Age.of.Subject  Time.spent.on.Online.Class Time.spent.on.self.study
##  Min.   : 7.00   Min.   : 0.000             Min.   : 0.00           
##  1st Qu.:17.00   1st Qu.: 2.000             1st Qu.: 2.00           
##  Median :20.00   Median : 3.000             Median : 2.00           
##  Mean   :20.12   Mean   : 3.299             Mean   : 2.92           
##  3rd Qu.:21.00   3rd Qu.: 5.000             3rd Qu.: 4.00           
##  Max.   :59.00   Max.   :10.000             Max.   :18.00           
##  Time.spent.on.fitness Time.spent.on.sleep Time.spent.on.social.media
##  Min.   :0.0000        Min.   : 4.000      Min.   : 0.000            
##  1st Qu.:0.0000        1st Qu.: 7.000      1st Qu.: 1.000            
##  Median :1.0000        Median : 8.000      Median : 2.000            
##  Mean   :0.7619        Mean   : 7.853      Mean   : 2.355            
##  3rd Qu.:1.0000        3rd Qu.: 8.750      3rd Qu.: 3.000            
##  Max.   :5.0000        Max.   :15.000      Max.   :10.000            
##  Time.spent.on.TV Number.of.meals.per.day online.class.rate
##  Min.   : 1.000   Min.   :1.000           Min.   :-2.0000  
##  1st Qu.: 2.000   1st Qu.:2.000           1st Qu.:-2.0000  
##  Median :10.000   Median :3.000           Median : 0.0000  
##  Mean   : 7.905   Mean   :2.926           Mean   :-0.3758  
##  3rd Qu.:13.000   3rd Qu.:3.000           3rd Qu.: 1.0000  
##  Max.   :25.000   Max.   :8.000           Max.   : 2.0000

3. Analysis of correlation

covidSurvey.n<-as.matrix(as.data.frame(lapply(covidSurvey, scale)))
covidSurvey.n.cor <-cor(covidSurvey.n, method="pearson")
corrplot(covidSurvey.n.cor, order ="alphabet", tl.cex=0.6, na.label = "NA")

We can see low correlation across variables. This means difficulties in dimension reduction.
## Multidimensional scaling

distSurvey <- dist(covidSurvey)
mds1<-cmdscale(distSurvey, k=2)
summary(mds1)

##        V1                 V2          
##  Min.   :-13.8809   Min.   :-21.4717  
##  1st Qu.: -3.3784   1st Qu.: -2.9721  
##  Median : -0.3216   Median :  0.1531  
##  Mean   :  0.0000   Mean   :  0.0000  
##  3rd Qu.:  3.0498   3rd Qu.:  4.6014  
##  Max.   : 32.1992   Max.   : 10.7590

plot(mds1[,1], mds1[,2])

library(maptools)

## Loading required package: sp

## Checking rgeos availability: FALSE
##      Note: when rgeos is not available, polygon geometry     computations in maptools depend on gpclib,
##      which has a restricted licence. It is disabled by default;
##      to enable gpclib, type gpclibPermit()

plot(jitter(mds1[,1], amount=5), jitter(mds1[,2], amount=5), xlim=c(-30, 50), ylim=c(-30, 20))

Outliers

x.out<-which(mds1[,1]>20) 
y.out<-which(mds1[,2]<(-15) | mds1[,2]>10) 
out.all<-unique(c(y.out, x.out)) # merging into single dataset
covidSurvey[out.all,]

##     Age.of.Subject Time.spent.on.Online.Class Time.spent.on.self.study
## 17              59                          3                      6.0
## 92              33                          4                      1.0
## 248             34                          2                      1.0
## 301             23                          2                      2.0
## 324             24                          6                      6.0
## 368             40                          5                      2.0
## 455              9                          5                      2.0
## 518             50                          2                      2.0
## 734             30                          2                      0.0
## 779             46                          2                      0.5
## 879             22                          3                      0.0
## 23              43                          2                      0.0
## 316             40                          1                      2.0
## 385             45                          3                      4.0
## 670             52                          0                      2.0
## 761             40                          5                      3.0
##     Time.spent.on.fitness Time.spent.on.sleep Time.spent.on.social.media
## 17                   1.00                   9                        0.5
## 92                   0.00                   7                        1.0
## 248                  0.00                   7                        6.0
## 301                  0.30                   8                        1.0
## 324                  0.00                   7                        2.0
## 368                  1.00                   8                        4.0
## 455                  0.60                   7                        1.0
## 518                  0.00                   6                        2.0
## 734                  1.00                   8                        1.0
## 779                  0.25                   7                        1.0
## 879                  5.00                   8                       10.0
## 23                   1.00                   6                        2.0
## 316                  1.00                   6                        4.0
## 385                  1.00                   8                        2.0
## 670                  1.00                   8                        2.0
## 761                  0.00                   6                        3.0
##     Time.spent.on.TV Number.of.meals.per.day online.class.rate
## 17                10                       2                 0
## 92                24                       4                 0
## 248               20                       3                 0
## 301               24                       3                 0
## 324               25                       3                 0
## 368               20                       3                 0
## 455                2                       4                 2
## 518               10                       3                 1
## 734               22                       4                -2
## 779               10                       3                -2
## 879               25                       3                -2
## 23                 7                       3                 0
## 316                2                       2                 0
## 385               10                       3                 2
## 670                2                       4                 1
## 761                2                       3                -2

Principal Component Analysis

Eigenvalues on the basis of covariance

covidSurvey.cov<-cov(covidSurvey.n)
covidSurvey.eigen <-eigen(covidSurvey.cov)
covidSurvey.eigen$values

## [1] 1.5120971 1.3163938 1.1591267 1.0437197 0.9627265 0.8470104 0.7737525
## [8] 0.7500375 0.6351357

head(covidSurvey.eigen$vectors)

##            [,1]        [,2]        [,3]         [,4]        [,5]         [,6]
## [1,]  0.1427674  0.46121839  0.16097579 -0.607940205  0.09964294  0.327238353
## [2,] -0.4921394 -0.19824316  0.17211719  0.235312344 -0.46110412 -0.008692502
## [3,] -0.4838284  0.29776854 -0.15420463  0.003083643 -0.01762121  0.312149372
## [4,] -0.1276494 -0.21704996 -0.55047407 -0.463427962  0.09824798 -0.561906241
## [5,]  0.4323654 -0.32124677 -0.17327519  0.340733537  0.35285508  0.298158314
## [6,]  0.4403292  0.07372548  0.05000258 -0.102932740 -0.66352125 -0.088899577
##             [,7]       [,8]        [,9]
## [1,]  0.23606354 -0.1822778 -0.40672561
## [2,]  0.31986560 -0.1792710 -0.53512387
## [3,] -0.48541755  0.5383389 -0.17397891
## [4,]  0.03603518  0.1220170 -0.27790250
## [5,]  0.07398794  0.2480469 -0.53228797
## [6,]  0.16652629  0.5577529  0.02442102

PCA loadings

surveyPCA1 <- prcomp(covidSurvey.n, center=FALSE, scale.=FALSE)
summary(surveyPCA1)

## Importance of components:
##                          PC1    PC2    PC3   PC4    PC5     PC6     PC7     PC8
## Standard deviation     1.230 1.1473 1.0766 1.022 0.9812 0.92033 0.87963 0.86605
## Proportion of Variance 0.168 0.1463 0.1288 0.116 0.1070 0.09411 0.08597 0.08334
## Cumulative Proportion  0.168 0.3143 0.4431 0.559 0.6660 0.76012 0.84609 0.92943
##                            PC9
## Standard deviation     0.79695
## Proportion of Variance 0.07057
## Cumulative Proportion  1.00000

Plot cumulative variance

plot(summary(surveyPCA1)$importance[3,],type = 'l')

The cumulative variance line is close to diagonal line. This means the first PCs cannot explains the variance significantly alone.

Contributions of individual variables to PC

var<-get_pca_var(surveyPCA1)
a<-fviz_contrib(surveyPCA1, "var", axes=1, xtickslab.rt=90) 
b<-fviz_contrib(surveyPCA1, "var", axes=2, xtickslab.rt=90)
grid.arrange(a,b,ncol = 2,top='Contribution to the first two Principal Components')

rotated PCA

surveyPCA2<-principal(covidSurvey.n, nfactors=5, rotate="varimax")
summary(surveyPCA2)

## 
## Factor analysis with Call: principal(r = covidSurvey.n, nfactors = 5, rotate = "varimax")
## 
## Test of the hypothesis that 5 factors are sufficient.
## The degrees of freedom for the model is 1  and the objective function was  1.89 
## The number of observations was  1131  with Chi Square =  2120.18  with prob <  0 
## 
## The root mean square of the residuals (RMSA) is  0.13

print(loadings(surveyPCA2), digits=4, cutoff=0.4, sort=TRUE)

## 
## Loadings:
##                            RC1     RC2     RC4     RC3     RC5    
## Time.spent.on.self.study    0.5606                                
## Time.spent.on.sleep        -0.8158                                
## Time.spent.on.TV                    0.7195                        
## online.class.rate                   0.6920                        
## Age.of.Subject                             -0.8157                
## Time.spent.on.Online.Class  0.5228          0.6310                
## Time.spent.on.fitness                               0.7319        
## Number.of.meals.per.day                             0.7577        
## Time.spent.on.social.media                                  0.8536
## 
##                   RC1    RC2    RC4    RC3    RC5
## SS loadings    1.3330 1.2145 1.1572 1.1506 1.1387
## Proportion Var 0.1481 0.1349 0.1286 0.1278 0.1265
## Cumulative Var 0.1481 0.2831 0.4116 0.5395 0.6660

Quality measures of PCA

Scree plots

fviz_eig(surveyPCA1, choice='eigenvalue') # eigenvalues on y-axis

fviz_eig(surveyPCA1)

Complexity and uniqueness

surveyPCA2$complexity

##             Age.of.Subject Time.spent.on.Online.Class 
##                   1.216081                   2.179917 
##   Time.spent.on.self.study      Time.spent.on.fitness 
##                   2.206988                   1.531358 
##        Time.spent.on.sleep Time.spent.on.social.media 
##                   1.087518                   1.026279 
##           Time.spent.on.TV    Number.of.meals.per.day 
##                   1.449103                   1.427497 
##          online.class.rate 
##                   1.549264

surveyPCA2$uniquenesses

##             Age.of.Subject Time.spent.on.Online.Class 
##                  0.2638081                  0.2852104 
##   Time.spent.on.self.study      Time.spent.on.fitness 
##                  0.5014420                  0.3286567 
##        Time.spent.on.sleep Time.spent.on.social.media 
##                  0.3056346                  0.2618577 
##           Time.spent.on.TV    Number.of.meals.per.day 
##                  0.3658643                  0.3091674 
##          online.class.rate 
##                  0.3842950

Visualization of PCA

autoplot(surveyPCA1, loadings=TRUE, loadings.colour='blue', loadings.label=TRUE,loadings.label.size=3)

surveyKM <- eclust(covidSurvey.n, k=3, graph = F)
autoplot(surveyPCA1, data=surveyKM, colour="cluster")

4. Clustering

Assessing Clustering Tendency

hopkins(covidSurvey.n, n=nrow(covidSurvey.n)-1)

## $H
## [1] 0.2098456

The result of hopkins statistics is 1 - 0.2098456 = 0.7901544. This means the dataset can be clustered.

###Optimal number of clusters

opt_km1 <- fviz_nbclust(covidSurvey.n,kmeans,method = "wss") + ggtitle("k-means")
opt_km2 <- fviz_nbclust(covidSurvey.n,pam,method = "wss") + ggtitle("PAM")
opt_km3 <- fviz_nbclust(covidSurvey.n,clara,method = "wss") + ggtitle("CLARA")
grid.arrange(opt_km1,opt_km2, opt_km3, ncol=2, top = "Optimal number of clusters using Within-Cluster-Sum of Squared Errors method")

###Stripes for k-means

d1<-cclust(covidSurvey.n, 3, dist="euclidean")

## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'

## Also defined by 'kernlab'

## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'

## Also defined by 'kernlab'

stripes(d1)

###Cluster data

covidSurvey.select <- covidSurvey.n[-out.all,]
c_km4 <-eclust(covidSurvey.select, "kmeans", k= 4, graph = F)
c_pam4 <-eclust(covidSurvey.select, "pam", k= 4, graph = F)
c_clara4 <-eclust(covidSurvey.select, "clara", k= 4, graph = F)
plotkm4 <- fviz_cluster(c_km4, geom = c("point")) + ggtitle('K-means with 4 clusters')
plotpam4 <- fviz_cluster(c_pam4, geom = c("point")) + ggtitle('PAM with 4 clusters')
plotclara4 <- fviz_cluster(c_clara4, geom = c("point")) + ggtitle('CLARA with 4 clusters')
grid.arrange(arrangeGrob(plotkm4,plotpam4,plotclara4 ,ncol=2 , top = "Clustering data into 4 groups"))

On the clustering graphs, Dim1 and Dim2 cannot clearly separate clusters in the visualization.

###Hierachical Clustering

d1 <- dist(covidSurvey.select, method = "euclidean")
cluster_hierarchical <- hclust(d1, method = "complete" )
sub_grp_1 <- cutree(cluster_hierarchical, k = 4)
plot(cluster_hierarchical,)
rect.hclust(cluster_hierarchical, k=4)

###Measure the clustering quality Silhouette index

slh_km <- fviz_silhouette(c_km4) + ggtitle("k-means")

##   cluster size ave.sil.width
## 1       1  259          0.08
## 2       2  335          0.08
## 3       3  310          0.14
## 4       4  211          0.06

slh_pam <- fviz_silhouette(c_pam4)+ ggtitle("PAM")

##   cluster size ave.sil.width
## 1       1  417          0.07
## 2       2  343          0.08
## 3       3  292          0.10
## 4       4   63          0.27

slh_clara <- fviz_silhouette(c_clara4) + ggtitle("CLARA")

##   cluster size ave.sil.width
## 1       1  520          0.09
## 2       2  375          0.10
## 3       3  122          0.04
## 4       4   98          0.11

grid.arrange(arrangeGrob(slh_km,slh_pam,slh_clara, ncol=2 , top = "Silhoette statistic"))

Silhouette values for both 3 methods are quite low, less than 15%. The clustering quality is not assured.

Calinski-Harabasz & Duda-Hart

c_km5 <-eclust(covidSurvey.select, "kmeans", k= 5, graph = F)
round(calinhara(covidSurvey.select, c_km4$cluster),4)

## [1] 107.4357

round(calinhara(covidSurvey.select, c_km5$cluster),4)

## [1] 104.6403