Import Dataset
data <- read.csv("C:/Users/notebook/Downloads/VaccineHesitancy_assign.csv",header = TRUE)
Filter Selangor Data
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
c <- sqldf('SELECT Age, Education, State, PClinicalBarriers, PAccessBarriers, TrustVaccineProc, TrustGov, PIndBenefits, Respondent FROM data WHERE State == "Selangor" ORDER BY Respondent ASC')
#Mutate dataset
c %<>%
mutate(
AgeClass = ifelse(Age<50 , "Below 50", "Above 50"),
AgeClass = as.factor(AgeClass)) %>%
arrange(AgeClass)
#Convert row name as respondent number
rownames(c) <- c$Respondent
Summary Statistics
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.2
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ stringr 1.4.1
## ✔ tidyr 1.2.1 ✔ forcats 0.5.2
## ✔ readr 2.1.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr)
#Malaysia data
msia <- sqldf('SELECT Age, Education, State, PClinicalBarriers, PAccessBarriers, TrustVaccineProc, TrustGov, PIndBenefits, Respondent FROM data')
#Remove missing values
dt = subset(msia, Education != "None")
#Descriptive statistics
summary(dt)
## Age Education State PClinicalBarriers
## Min. :18.00 Length:785 Length:785 Min. : 6.00
## 1st Qu.:27.00 Class :character Class :character 1st Qu.:15.00
## Median :33.00 Mode :character Mode :character Median :18.00
## Mean :35.52 Mean :17.76
## 3rd Qu.:43.00 3rd Qu.:21.00
## Max. :79.00 Max. :30.00
## PAccessBarriers TrustVaccineProc TrustGov PIndBenefits
## Min. : 4.00 Min. : 4.00 Min. : 5.00 Min. : 3.00
## 1st Qu.:10.00 1st Qu.:13.00 1st Qu.:12.00 1st Qu.:10.00
## Median :13.00 Median :16.00 Median :15.00 Median :12.00
## Mean :12.71 Mean :15.35 Mean :15.87 Mean :11.86
## 3rd Qu.:15.00 3rd Qu.:18.00 3rd Qu.:20.00 3rd Qu.:14.00
## Max. :20.00 Max. :20.00 Max. :25.00 Max. :15.00
## Respondent
## Min. : 1.0
## 1st Qu.: 232.0
## Median : 477.0
## Mean : 802.5
## 3rd Qu.:1584.0
## Max. :1926.0
#Count of states in Malaysia data
dt %>%
group_by(State) %>%
summarise(Count = n()) %>%
arrange(desc(Count),State)
## # A tibble: 16 × 2
## State Count
## <chr> <int>
## 1 Selangor 223
## 2 Kuala Lumpur 165
## 3 Johor 88
## 4 Penang 67
## 5 Negeri Sembilan 38
## 6 Perak 37
## 7 Kedah 35
## 8 Sarawak 26
## 9 Kelantan 24
## 10 Malacca 23
## 11 Sabah 21
## 12 Pahang 20
## 13 Terengganu 14
## 14 Perlis 2
## 15 Labuan 1
## 16 Putrajaya 1
#Comparison of mean values between Malaysia and Selangor data
#Selangor
colMeans(c[sapply(c,is.numeric)],na.rm=TRUE)
## Age PClinicalBarriers PAccessBarriers TrustVaccineProc
## 36.68610 17.61435 12.60987 15.17040
## TrustGov PIndBenefits Respondent
## 14.87444 11.57399 923.40359
#Malaysia
colMeans(dt[sapply(dt,is.numeric)],na.rm=TRUE)
## Age PClinicalBarriers PAccessBarriers TrustVaccineProc
## 35.51975 17.75541 12.71465 15.34904
## TrustGov PIndBenefits Respondent
## 15.87006 11.85860 802.49554
Data Visualization
#Boxplot of vaccination barriers against education (fill by age class)
bp1 <- ggplot(c, aes(x=Education, y = PClinicalBarriers, fill = AgeClass)) +
geom_boxplot() +
ggtitle("PClinicalBarriers vs Education Boxplot")
bp2 <- ggplot(c, aes(x=Education, y = PAccessBarriers, fill = AgeClass)) +
geom_boxplot() +
ggtitle("PAccessBarriers vs Education Boxplot")
#Boxplot of vaccination barriers against education (fill by education)
bp3 <- ggplot(c, aes(x=Education, y = PClinicalBarriers, fill = Education)) +
geom_boxplot() +
ggtitle("PClinicalBarriers vs Education Boxplot")
bp4 <- ggplot(c, aes(x=Education, y = PAccessBarriers, fill = Education)) +
geom_boxplot() +
ggtitle("PAccessBarriers vs Education Boxplot")
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(bp3,bp4,bp1,bp2, ncol=2, nrow=2)

#Violin Plot of confidence level in COVID-19 prevention efforts towards age class
vp1 <- ggplot(c, aes(AgeClass, TrustVaccineProc, fill = AgeClass)) +
geom_violin(aes(color = AgeClass), trim = T) +
geom_boxplot(width=0.1) +
ggtitle("TrustVaccineProc vs AgeClass Violin Plot")
vp2 <- ggplot(c, aes(AgeClass, TrustGov, fill = AgeClass)) +
geom_violin(aes(color = AgeClass), trim = T) +
geom_boxplot(width=0.1) +
theme(legend.position="right") +
ggtitle("TrustGov vs AgeClass Violin Plot")
vp3 <- ggplot(c, aes(AgeClass, PIndBenefits, fill = AgeClass)) +
geom_violin(aes(color = AgeClass), trim = T) +
geom_boxplot(width=0.1) +
theme(legend.position="right") +
ggtitle("PIndBenefits vs AgeClass Violin Plot")
grid.arrange(vp3,vp2,vp1, ncol=2, nrow=2)

#Correlation matrix of selected variables
library(PerformanceAnalytics)
## Warning: package 'PerformanceAnalytics' was built under R version 4.2.2
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
pca = c[,c(4:8)]
chart.Correlation(pca, histogram = FALSE, method = "pearson") +
mtext("Correlation Matrix of Variables Selected", side=3, line=1,cex=1.5)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter

## integer(0)
Principal Component Analysis
pca = c[,c(4:8)]
pr.out =prcomp (pca, scale =TRUE)
#PCA loading vectors
pr.out$rotation
## PC1 PC2 PC3 PC4 PC5
## PClinicalBarriers 0.3835140 0.6074780 -0.32855703 0.5490079 0.27299833
## PAccessBarriers 0.3792111 0.5571329 0.38399710 -0.6296009 -0.04416841
## TrustVaccineProc -0.5773626 0.2164170 -0.04122352 -0.2340373 0.75056197
## TrustGov -0.4658656 0.4278675 -0.50448816 -0.1702014 -0.56251378
## PIndBenefits -0.3984189 0.3011088 0.69884821 0.4673874 -0.20917943
#Proportion of variance (PCA)
summary(pr.out)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 1.444 1.0110 0.9254 0.8035 0.62533
## Proportion of Variance 0.417 0.2044 0.1713 0.1291 0.07821
## Cumulative Proportion 0.417 0.6214 0.7927 0.9218 1.00000
#Scree plot
pr.var = pr.out$sdev^2
pr.var
## [1] 2.0850034 1.0220598 0.8563540 0.6455506 0.3910322
pve=pr.var/sum(pr.var )
pve
## [1] 0.41700068 0.20441196 0.17127079 0.12911013 0.07820644
#Scree plot
plot(pve , xlab=" Principal Component ", ylab=" Proportion of Variance Explained ", ylim=c(0,1) ,type="b",main = "Scree Plot",col=3)

library(ggfortify)
## Warning: package 'ggfortify' was built under R version 4.2.2
#PCA grouped by education
autoplot(pr.out, data = c, colour = 'Education', shape = FALSE, label.size = 3, loadings = TRUE, loadings.colour = 'black', loadings.label = TRUE, loadings.label.size = 3, main="PCA Biplot filled by Education Level", xlim=c(-0.25,0.25)) +
ggeasy::easy_center_title() +
geom_vline(xintercept=0,linetype="dotted") +
geom_hline(yintercept=0,linetype="dotted")

#PCA grouped by age class
autoplot(pr.out, data = c, colour = 'AgeClass', shape = FALSE, label.size = 3, loadings = TRUE, loadings.colour = 'black', loadings.label = TRUE, loadings.label.size = 3, main ="PCA Biplot filled by Age Class", xlim=c(-0.25,0.25),legend.title="Age Class") +
ggeasy::easy_center_title() +
geom_vline(xintercept=0,linetype="dotted") +
geom_hline(yintercept=0,linetype="dotted") +
guides(colour = guide_legend(title = "Age Class"))

K-means Clustering
library(NbClust)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.2.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#Elbow method
fviz_nbclust(pr.out$x[, 1:2], kmeans, method = "wss") +
geom_vline(xintercept = 4, linetype = 2) +
labs(subtitle = "Elbow method")

#K-means clustering
kmeans_4 = kmeans(pr.out$x[, 1:2],centers = 4, nstart = 20)
kmeans_4
## K-means clustering with 4 clusters of sizes 44, 56, 89, 34
##
## Cluster means:
## PC1 PC2
## 1 1.9315927 -0.39152347
## 2 -1.7583179 -0.51876959
## 3 0.2752355 -0.08588279
## 4 -0.3241246 1.58593230
##
## Clustering vector:
## 61 84 106 146 155 164 168 191 226 230 240 333 337 614 644 667
## 3 4 4 3 2 3 3 3 2 4 2 3 2 2 3 2
## 1388 1391 1394 1414 1464 1511 1557 1584 1587 1589 1617 1773 19 22 28 34
## 3 3 2 2 2 1 2 3 2 3 1 3 3 3 3 3
## 46 59 60 77 82 86 93 94 101 109 114 123 134 140 145 150
## 2 1 2 4 4 2 4 3 1 3 1 3 3 3 4 1
## 154 159 174 177 181 185 189 192 197 200 204 205 208 212 214 223
## 3 3 2 4 3 1 4 2 2 2 4 3 4 3 3 3
## 225 238 244 249 255 264 268 270 275 276 277 279 293 303 308 334
## 3 2 2 3 4 1 3 1 2 3 4 2 4 3 2 2
## 348 352 353 359 360 364 367 371 372 377 381 387 391 393 394 396
## 4 1 1 2 4 4 3 4 1 3 3 1 2 2 2 3
## 397 399 405 423 433 435 439 442 451 455 458 459 462 467 471 478
## 3 4 4 4 3 1 2 3 4 1 1 2 3 2 4 4
## 486 488 492 496 504 506 507 509 511 512 519 522 525 527 529 553
## 3 2 2 1 3 3 2 1 4 1 4 1 2 3 1 3
## 557 568 580 608 609 612 628 630 631 648 1385 1405 1412 1415 1418 1426
## 1 3 3 2 4 3 3 1 4 4 2 2 4 3 3 3
## 1427 1443 1452 1486 1494 1507 1522 1527 1535 1539 1544 1546 1564 1574 1603 1619
## 1 1 1 1 2 3 2 3 3 3 2 3 1 2 1 3
## 1694 1697 1715 1720 1724 1725 1728 1733 1736 1737 1738 1739 1741 1743 1745 1746
## 3 1 1 3 1 1 3 2 3 3 1 3 3 2 1 1
## 1748 1752 1754 1755 1756 1758 1767 1768 1769 1770 1777 1789 1792 1795 1797 1800
## 1 3 2 3 3 4 3 3 2 3 2 3 1 1 1 3
## 1802 1803 1806 1811 1812 1832 1834 1841 1843 1850 1863 1864 1866 1867 1868 1869
## 3 1 3 3 1 3 2 2 3 3 3 3 1 3 4 3
## 1875 1882 1890 1893 1900 1903 1905 1906 1907 1908 1912 1913 1917 1921 1926
## 4 3 3 2 3 3 2 2 3 2 2 2 4 1 2
##
## Within cluster sum of squares by cluster:
## [1] 59.97550 67.89986 58.50801 47.78194
## (between_SS / total_SS = 66.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
kmeans_4$tot.withinss #quality of the partition
## [1] 234.1653
#K-means clustering plot
fviz_cluster(kmeans_4,pr.out$x[, 1:2], ylim = c(-3,5)) +
ggtitle("K-means Clustering Plot") +
ggeasy::easy_center_title()

#Mean of each variable in every cluster
aggregate(pca, by=list(cluster = kmeans_4$cluster), mean)
## cluster PClinicalBarriers PAccessBarriers TrustVaccineProc TrustGov
## 1 1 19.65909 14.590909 11.20455 9.431818
## 2 2 12.89286 9.392857 17.98214 17.910714
## 3 3 17.85393 12.820225 14.60674 13.921348
## 4 4 22.11765 14.794118 17.14706 19.411765
## PIndBenefits
## 1 9.363636
## 2 13.089286
## 3 11.280899
## 4 12.705882
Exploratory Factor Analysis
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
#Convert to correlation matrix
mpca = as.data.frame(pca)
mmpca = cor(mpca)
View(mmpca)
#Kaiser-Meyer-Olkin (KMO) test
KMO(mpca)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = mpca)
## Overall MSA = 0.62
## MSA for each item =
## PClinicalBarriers PAccessBarriers TrustVaccineProc TrustGov
## 0.61 0.71 0.60 0.58
## PIndBenefits
## 0.71
#Bartlett’s statistical test
cortest.bartlett(mmpca, n=223, diag = TRUE)
## $chisq
## [1] 170.134
##
## $p.value
## [1] 2.602334e-31
##
## $df
## [1] 10
#Oblique factor rotation technique
factanal(mpca, factors = 2, n.obs = 223, rotation = "promax")
##
## Call:
## factanal(x = mpca, factors = 2, n.obs = 223, rotation = "promax")
##
## Uniquenesses:
## PClinicalBarriers PAccessBarriers TrustVaccineProc TrustGov
## 0.005 0.861 0.153 0.647
## PIndBenefits
## 0.824
##
## Loadings:
## Factor1 Factor2
## PClinicalBarriers 1.004
## PAccessBarriers -0.208 0.244
## TrustVaccineProc 0.916
## TrustGov 0.626 0.124
## PIndBenefits 0.399
##
## Factor1 Factor2
## SS loadings 1.433 1.085
## Proportion Var 0.287 0.217
## Cumulative Var 0.287 0.504
##
## Factor Correlations:
## Factor1 Factor2
## Factor1 1.000 -0.351
## Factor2 -0.351 1.000
##
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 2.5 on 1 degree of freedom.
## The p-value is 0.114