In this project, I used data from a survey of Covid-19’s impact to student life in Dimension reduction and Clustering data in Kaggle (https://www.kaggle.com/kunal28chaturvedi/covid19-and-its-impact-on-students). The project includes 8 parts:
- Data proceessing
- Analysis of correlation
- MDS
- PCA
- Clustering
- Data analysis
covidSurvey <- read.csv('COVID-19 Survey Student Responses.csv',na.strings = 'NA')
covidSurvey <- na.omit(covidSurvey)
covidSurvey$ID <- NULL # Remove unnecessary columns
covidSurvey$Region.of.residence <- NULL # Remove unnecessary columns
rating <- data.frame(rate = unique(covidSurvey$Rating.of.Online.Class.experience), online.class.rate = c(1,2,-2,0,-1))
covidSurvey.healthissues <- as.data.frame(covidSurvey$Health.issue.during.lockdown)
covidSurvey.weightchange <- as.data.frame(covidSurvey$Change.in.your.weight)
covidSurvey <- merge(covidSurvey, rating, by.x = 'Rating.of.Online.Class.experience', by.y = 'rate')
colnames(covidSurvey)
## [1] "Rating.of.Online.Class.experience"
## [2] "Age.of.Subject"
## [3] "Time.spent.on.Online.Class"
## [4] "Medium.for.online.class"
## [5] "Time.spent.on.self.study"
## [6] "Time.spent.on.fitness"
## [7] "Time.spent.on.sleep"
## [8] "Time.spent.on.social.media"
## [9] "Prefered.social.media.platform"
## [10] "Time.spent.on.TV"
## [11] "Number.of.meals.per.day"
## [12] "Change.in.your.weight"
## [13] "Health.issue.during.lockdown"
## [14] "Stress.busters"
## [15] "Time.utilized"
## [16] "Do.you.find.yourself.more.connected.with.your.family..close.friends...relatives..."
## [17] "What.you.miss.the.most"
## [18] "online.class.rate"
covidSurvey <- covidSurvey[,c(2,3,5,6,7,8,10,11,18)] # take only number variables for dimension MDS and PCA
covidSurvey$Time.spent.on.TV <- as.numeric(covidSurvey$Time.spent.on.TV)
summary(covidSurvey)
## Age.of.Subject Time.spent.on.Online.Class Time.spent.on.self.study
## Min. : 7.00 Min. : 0.000 Min. : 0.00
## 1st Qu.:17.00 1st Qu.: 2.000 1st Qu.: 2.00
## Median :20.00 Median : 3.000 Median : 2.00
## Mean :20.12 Mean : 3.299 Mean : 2.92
## 3rd Qu.:21.00 3rd Qu.: 5.000 3rd Qu.: 4.00
## Max. :59.00 Max. :10.000 Max. :18.00
## Time.spent.on.fitness Time.spent.on.sleep Time.spent.on.social.media
## Min. :0.0000 Min. : 4.000 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.: 7.000 1st Qu.: 1.000
## Median :1.0000 Median : 8.000 Median : 2.000
## Mean :0.7619 Mean : 7.853 Mean : 2.355
## 3rd Qu.:1.0000 3rd Qu.: 8.750 3rd Qu.: 3.000
## Max. :5.0000 Max. :15.000 Max. :10.000
## Time.spent.on.TV Number.of.meals.per.day online.class.rate
## Min. : 1.000 Min. :1.000 Min. :-2.0000
## 1st Qu.: 2.000 1st Qu.:2.000 1st Qu.:-2.0000
## Median :10.000 Median :3.000 Median : 0.0000
## Mean : 7.905 Mean :2.926 Mean :-0.3758
## 3rd Qu.:13.000 3rd Qu.:3.000 3rd Qu.: 1.0000
## Max. :25.000 Max. :8.000 Max. : 2.0000
covidSurvey.n<-as.matrix(as.data.frame(lapply(covidSurvey, scale)))
covidSurvey.n.cor <-cor(covidSurvey.n, method="pearson")
corrplot(covidSurvey.n.cor, order ="alphabet", tl.cex=0.6, na.label = "NA")
We can see low correlation across variables. This means difficulties in dimension reduction.
## Multidimensional scaling
distSurvey <- dist(covidSurvey)
mds1<-cmdscale(distSurvey, k=2)
summary(mds1)
## V1 V2
## Min. :-13.8809 Min. :-21.4717
## 1st Qu.: -3.3784 1st Qu.: -2.9721
## Median : -0.3216 Median : 0.1531
## Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 3.0498 3rd Qu.: 4.6014
## Max. : 32.1992 Max. : 10.7590
plot(mds1[,1], mds1[,2])
library(maptools)
## Loading required package: sp
## Checking rgeos availability: FALSE
## Note: when rgeos is not available, polygon geometry computations in maptools depend on gpclib,
## which has a restricted licence. It is disabled by default;
## to enable gpclib, type gpclibPermit()
plot(jitter(mds1[,1], amount=5), jitter(mds1[,2], amount=5), xlim=c(-30, 50), ylim=c(-30, 20))
x.out<-which(mds1[,1]>20)
y.out<-which(mds1[,2]<(-15) | mds1[,2]>10)
out.all<-unique(c(y.out, x.out)) # merging into single dataset
covidSurvey[out.all,]
## Age.of.Subject Time.spent.on.Online.Class Time.spent.on.self.study
## 17 59 3 6.0
## 92 33 4 1.0
## 248 34 2 1.0
## 301 23 2 2.0
## 324 24 6 6.0
## 368 40 5 2.0
## 455 9 5 2.0
## 518 50 2 2.0
## 734 30 2 0.0
## 779 46 2 0.5
## 879 22 3 0.0
## 23 43 2 0.0
## 316 40 1 2.0
## 385 45 3 4.0
## 670 52 0 2.0
## 761 40 5 3.0
## Time.spent.on.fitness Time.spent.on.sleep Time.spent.on.social.media
## 17 1.00 9 0.5
## 92 0.00 7 1.0
## 248 0.00 7 6.0
## 301 0.30 8 1.0
## 324 0.00 7 2.0
## 368 1.00 8 4.0
## 455 0.60 7 1.0
## 518 0.00 6 2.0
## 734 1.00 8 1.0
## 779 0.25 7 1.0
## 879 5.00 8 10.0
## 23 1.00 6 2.0
## 316 1.00 6 4.0
## 385 1.00 8 2.0
## 670 1.00 8 2.0
## 761 0.00 6 3.0
## Time.spent.on.TV Number.of.meals.per.day online.class.rate
## 17 10 2 0
## 92 24 4 0
## 248 20 3 0
## 301 24 3 0
## 324 25 3 0
## 368 20 3 0
## 455 2 4 2
## 518 10 3 1
## 734 22 4 -2
## 779 10 3 -2
## 879 25 3 -2
## 23 7 3 0
## 316 2 2 0
## 385 10 3 2
## 670 2 4 1
## 761 2 3 -2
covidSurvey.cov<-cov(covidSurvey.n)
covidSurvey.eigen <-eigen(covidSurvey.cov)
covidSurvey.eigen$values
## [1] 1.5120971 1.3163938 1.1591267 1.0437197 0.9627265 0.8470104 0.7737525
## [8] 0.7500375 0.6351357
head(covidSurvey.eigen$vectors)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.1427674 0.46121839 0.16097579 -0.607940205 0.09964294 0.327238353
## [2,] -0.4921394 -0.19824316 0.17211719 0.235312344 -0.46110412 -0.008692502
## [3,] -0.4838284 0.29776854 -0.15420463 0.003083643 -0.01762121 0.312149372
## [4,] -0.1276494 -0.21704996 -0.55047407 -0.463427962 0.09824798 -0.561906241
## [5,] 0.4323654 -0.32124677 -0.17327519 0.340733537 0.35285508 0.298158314
## [6,] 0.4403292 0.07372548 0.05000258 -0.102932740 -0.66352125 -0.088899577
## [,7] [,8] [,9]
## [1,] 0.23606354 -0.1822778 -0.40672561
## [2,] 0.31986560 -0.1792710 -0.53512387
## [3,] -0.48541755 0.5383389 -0.17397891
## [4,] 0.03603518 0.1220170 -0.27790250
## [5,] 0.07398794 0.2480469 -0.53228797
## [6,] 0.16652629 0.5577529 0.02442102
surveyPCA1 <- prcomp(covidSurvey.n, center=FALSE, scale.=FALSE)
summary(surveyPCA1)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## Standard deviation 1.230 1.1473 1.0766 1.022 0.9812 0.92033 0.87963 0.86605
## Proportion of Variance 0.168 0.1463 0.1288 0.116 0.1070 0.09411 0.08597 0.08334
## Cumulative Proportion 0.168 0.3143 0.4431 0.559 0.6660 0.76012 0.84609 0.92943
## PC9
## Standard deviation 0.79695
## Proportion of Variance 0.07057
## Cumulative Proportion 1.00000
plot(summary(surveyPCA1)$importance[3,],type = 'l')
The cumulative variance line is close to diagonal line. This means the first PCs cannot explains the variance significantly alone.
var<-get_pca_var(surveyPCA1)
a<-fviz_contrib(surveyPCA1, "var", axes=1, xtickslab.rt=90)
b<-fviz_contrib(surveyPCA1, "var", axes=2, xtickslab.rt=90)
grid.arrange(a,b,ncol = 2,top='Contribution to the first two Principal Components')
surveyPCA2<-principal(covidSurvey.n, nfactors=5, rotate="varimax")
summary(surveyPCA2)
##
## Factor analysis with Call: principal(r = covidSurvey.n, nfactors = 5, rotate = "varimax")
##
## Test of the hypothesis that 5 factors are sufficient.
## The degrees of freedom for the model is 1 and the objective function was 1.89
## The number of observations was 1131 with Chi Square = 2120.18 with prob < 0
##
## The root mean square of the residuals (RMSA) is 0.13
print(loadings(surveyPCA2), digits=4, cutoff=0.4, sort=TRUE)
##
## Loadings:
## RC1 RC2 RC4 RC3 RC5
## Time.spent.on.self.study 0.5606
## Time.spent.on.sleep -0.8158
## Time.spent.on.TV 0.7195
## online.class.rate 0.6920
## Age.of.Subject -0.8157
## Time.spent.on.Online.Class 0.5228 0.6310
## Time.spent.on.fitness 0.7319
## Number.of.meals.per.day 0.7577
## Time.spent.on.social.media 0.8536
##
## RC1 RC2 RC4 RC3 RC5
## SS loadings 1.3330 1.2145 1.1572 1.1506 1.1387
## Proportion Var 0.1481 0.1349 0.1286 0.1278 0.1265
## Cumulative Var 0.1481 0.2831 0.4116 0.5395 0.6660
Scree plots
fviz_eig(surveyPCA1, choice='eigenvalue') # eigenvalues on y-axis
fviz_eig(surveyPCA1)
Complexity and uniqueness
surveyPCA2$complexity
## Age.of.Subject Time.spent.on.Online.Class
## 1.216081 2.179917
## Time.spent.on.self.study Time.spent.on.fitness
## 2.206988 1.531358
## Time.spent.on.sleep Time.spent.on.social.media
## 1.087518 1.026279
## Time.spent.on.TV Number.of.meals.per.day
## 1.449103 1.427497
## online.class.rate
## 1.549264
surveyPCA2$uniquenesses
## Age.of.Subject Time.spent.on.Online.Class
## 0.2638081 0.2852104
## Time.spent.on.self.study Time.spent.on.fitness
## 0.5014420 0.3286567
## Time.spent.on.sleep Time.spent.on.social.media
## 0.3056346 0.2618577
## Time.spent.on.TV Number.of.meals.per.day
## 0.3658643 0.3091674
## online.class.rate
## 0.3842950
autoplot(surveyPCA1, loadings=TRUE, loadings.colour='blue', loadings.label=TRUE,loadings.label.size=3)
surveyKM <- eclust(covidSurvey.n, k=3, graph = F)
autoplot(surveyPCA1, data=surveyKM, colour="cluster")
hopkins(covidSurvey.n, n=nrow(covidSurvey.n)-1)
## $H
## [1] 0.2098456
The result of hopkins statistics is 1 - 0.2098456 = 0.7901544. This means the dataset can be clustered.
###Optimal number of clusters
opt_km1 <- fviz_nbclust(covidSurvey.n,kmeans,method = "wss") + ggtitle("k-means")
opt_km2 <- fviz_nbclust(covidSurvey.n,pam,method = "wss") + ggtitle("PAM")
opt_km3 <- fviz_nbclust(covidSurvey.n,clara,method = "wss") + ggtitle("CLARA")
grid.arrange(opt_km1,opt_km2, opt_km3, ncol=2, top = "Optimal number of clusters using Within-Cluster-Sum of Squared Errors method")
###Stripes for k-means
d1<-cclust(covidSurvey.n, 3, dist="euclidean")
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
stripes(d1)
###Cluster data
covidSurvey.select <- covidSurvey.n[-out.all,]
c_km4 <-eclust(covidSurvey.select, "kmeans", k= 4, graph = F)
c_pam4 <-eclust(covidSurvey.select, "pam", k= 4, graph = F)
c_clara4 <-eclust(covidSurvey.select, "clara", k= 4, graph = F)
plotkm4 <- fviz_cluster(c_km4, geom = c("point")) + ggtitle('K-means with 4 clusters')
plotpam4 <- fviz_cluster(c_pam4, geom = c("point")) + ggtitle('PAM with 4 clusters')
plotclara4 <- fviz_cluster(c_clara4, geom = c("point")) + ggtitle('CLARA with 4 clusters')
grid.arrange(arrangeGrob(plotkm4,plotpam4,plotclara4 ,ncol=2 , top = "Clustering data into 4 groups"))
On the clustering graphs, Dim1 and Dim2 cannot clearly separate clusters in the visualization.
###Hierachical Clustering
d1 <- dist(covidSurvey.select, method = "euclidean")
cluster_hierarchical <- hclust(d1, method = "complete" )
sub_grp_1 <- cutree(cluster_hierarchical, k = 4)
plot(cluster_hierarchical,)
rect.hclust(cluster_hierarchical, k=4)
###Measure the clustering quality Silhouette index
slh_km <- fviz_silhouette(c_km4) + ggtitle("k-means")
## cluster size ave.sil.width
## 1 1 259 0.08
## 2 2 335 0.08
## 3 3 310 0.14
## 4 4 211 0.06
slh_pam <- fviz_silhouette(c_pam4)+ ggtitle("PAM")
## cluster size ave.sil.width
## 1 1 417 0.07
## 2 2 343 0.08
## 3 3 292 0.10
## 4 4 63 0.27
slh_clara <- fviz_silhouette(c_clara4) + ggtitle("CLARA")
## cluster size ave.sil.width
## 1 1 520 0.09
## 2 2 375 0.10
## 3 3 122 0.04
## 4 4 98 0.11
grid.arrange(arrangeGrob(slh_km,slh_pam,slh_clara, ncol=2 , top = "Silhoette statistic"))
Silhouette values for both 3 methods are quite low, less than 15%. The clustering quality is not assured.
Calinski-Harabasz & Duda-Hart
c_km5 <-eclust(covidSurvey.select, "kmeans", k= 5, graph = F)
round(calinhara(covidSurvey.select, c_km4$cluster),4)
## [1] 107.4357
round(calinhara(covidSurvey.select, c_km5$cluster),4)
## [1] 104.6403