Import Dataset

data <- read.csv("C:/Users/notebook/Downloads/VaccineHesitancy_assign.csv",header = TRUE)

Filter Selangor Data

library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
c <- sqldf('SELECT Age, Education, State, PClinicalBarriers, PAccessBarriers, TrustVaccineProc, TrustGov, PIndBenefits, Respondent FROM data WHERE State == "Selangor" ORDER BY Respondent ASC')

#Mutate dataset
c %<>%                          
  mutate( 
      AgeClass = ifelse(Age<50 , "Below 50", "Above 50"),  
            AgeClass = as.factor(AgeClass)) %>% 
    arrange(AgeClass) 

#Convert row name as respondent number
rownames(c) <- c$Respondent

Summary Statistics

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.2
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ stringr 1.4.1
## ✔ tidyr   1.2.1     ✔ forcats 0.5.2
## ✔ readr   2.1.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(dplyr)

#Malaysia data
msia <- sqldf('SELECT Age, Education, State, PClinicalBarriers, PAccessBarriers, TrustVaccineProc, TrustGov, PIndBenefits, Respondent FROM data')

#Remove missing values
dt = subset(msia, Education != "None")

#Descriptive statistics
summary(dt)
##       Age         Education            State           PClinicalBarriers
##  Min.   :18.00   Length:785         Length:785         Min.   : 6.00    
##  1st Qu.:27.00   Class :character   Class :character   1st Qu.:15.00    
##  Median :33.00   Mode  :character   Mode  :character   Median :18.00    
##  Mean   :35.52                                         Mean   :17.76    
##  3rd Qu.:43.00                                         3rd Qu.:21.00    
##  Max.   :79.00                                         Max.   :30.00    
##  PAccessBarriers TrustVaccineProc    TrustGov      PIndBenefits  
##  Min.   : 4.00   Min.   : 4.00    Min.   : 5.00   Min.   : 3.00  
##  1st Qu.:10.00   1st Qu.:13.00    1st Qu.:12.00   1st Qu.:10.00  
##  Median :13.00   Median :16.00    Median :15.00   Median :12.00  
##  Mean   :12.71   Mean   :15.35    Mean   :15.87   Mean   :11.86  
##  3rd Qu.:15.00   3rd Qu.:18.00    3rd Qu.:20.00   3rd Qu.:14.00  
##  Max.   :20.00   Max.   :20.00    Max.   :25.00   Max.   :15.00  
##    Respondent    
##  Min.   :   1.0  
##  1st Qu.: 232.0  
##  Median : 477.0  
##  Mean   : 802.5  
##  3rd Qu.:1584.0  
##  Max.   :1926.0
#Count of states in Malaysia data
dt %>% 
  group_by(State) %>% 
  summarise(Count = n()) %>% 
  arrange(desc(Count),State)
## # A tibble: 16 × 2
##    State           Count
##    <chr>           <int>
##  1 Selangor          223
##  2 Kuala Lumpur      165
##  3 Johor              88
##  4 Penang             67
##  5 Negeri Sembilan    38
##  6 Perak              37
##  7 Kedah              35
##  8 Sarawak            26
##  9 Kelantan           24
## 10 Malacca            23
## 11 Sabah              21
## 12 Pahang             20
## 13 Terengganu         14
## 14 Perlis              2
## 15 Labuan              1
## 16 Putrajaya           1
#Comparison of mean values between Malaysia and Selangor data
#Selangor
colMeans(c[sapply(c,is.numeric)],na.rm=TRUE)
##               Age PClinicalBarriers   PAccessBarriers  TrustVaccineProc 
##          36.68610          17.61435          12.60987          15.17040 
##          TrustGov      PIndBenefits        Respondent 
##          14.87444          11.57399         923.40359
#Malaysia
colMeans(dt[sapply(dt,is.numeric)],na.rm=TRUE)
##               Age PClinicalBarriers   PAccessBarriers  TrustVaccineProc 
##          35.51975          17.75541          12.71465          15.34904 
##          TrustGov      PIndBenefits        Respondent 
##          15.87006          11.85860         802.49554

Data Visualization

#Boxplot of vaccination barriers against education (fill by age class)
bp1 <- ggplot(c, aes(x=Education, y = PClinicalBarriers, fill = AgeClass)) + 
  geom_boxplot() + 
  ggtitle("PClinicalBarriers vs Education Boxplot")

bp2 <- ggplot(c, aes(x=Education, y = PAccessBarriers, fill = AgeClass)) + 
  geom_boxplot() + 
  ggtitle("PAccessBarriers vs Education Boxplot")

#Boxplot of vaccination barriers against education (fill by education)
bp3 <- ggplot(c, aes(x=Education, y = PClinicalBarriers, fill = Education)) + 
  geom_boxplot() + 
  ggtitle("PClinicalBarriers vs Education Boxplot")

bp4 <- ggplot(c, aes(x=Education, y = PAccessBarriers, fill = Education)) + 
  geom_boxplot() + 
  ggtitle("PAccessBarriers vs Education Boxplot")

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
grid.arrange(bp3,bp4,bp1,bp2, ncol=2, nrow=2)

#Violin Plot of confidence level in COVID-19 prevention efforts towards age class
vp1 <- ggplot(c, aes(AgeClass, TrustVaccineProc, fill = AgeClass)) + 
  geom_violin(aes(color = AgeClass), trim = T) +
  geom_boxplot(width=0.1) +
  ggtitle("TrustVaccineProc vs AgeClass Violin Plot")  

vp2 <- ggplot(c, aes(AgeClass, TrustGov, fill = AgeClass)) + 
  geom_violin(aes(color = AgeClass), trim = T) +
  geom_boxplot(width=0.1) +
  theme(legend.position="right") +
  ggtitle("TrustGov vs AgeClass Violin Plot")

vp3 <- ggplot(c, aes(AgeClass, PIndBenefits, fill = AgeClass)) + 
  geom_violin(aes(color = AgeClass), trim = T) +
  geom_boxplot(width=0.1) +
  theme(legend.position="right") +
  ggtitle("PIndBenefits vs AgeClass Violin Plot")

grid.arrange(vp3,vp2,vp1, ncol=2, nrow=2)

#Correlation matrix of selected variables
library(PerformanceAnalytics)
## Warning: package 'PerformanceAnalytics' was built under R version 4.2.2
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
pca = c[,c(4:8)] 

chart.Correlation(pca, histogram = FALSE, method = "pearson") + 
mtext("Correlation Matrix of Variables Selected", side=3, line=1,cex=1.5)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## integer(0)

Principal Component Analysis

pca = c[,c(4:8)] 
pr.out =prcomp (pca, scale =TRUE) 

#PCA loading vectors
pr.out$rotation
##                          PC1       PC2         PC3        PC4         PC5
## PClinicalBarriers  0.3835140 0.6074780 -0.32855703  0.5490079  0.27299833
## PAccessBarriers    0.3792111 0.5571329  0.38399710 -0.6296009 -0.04416841
## TrustVaccineProc  -0.5773626 0.2164170 -0.04122352 -0.2340373  0.75056197
## TrustGov          -0.4658656 0.4278675 -0.50448816 -0.1702014 -0.56251378
## PIndBenefits      -0.3984189 0.3011088  0.69884821  0.4673874 -0.20917943
#Proportion of variance (PCA)
summary(pr.out)
## Importance of components:
##                          PC1    PC2    PC3    PC4     PC5
## Standard deviation     1.444 1.0110 0.9254 0.8035 0.62533
## Proportion of Variance 0.417 0.2044 0.1713 0.1291 0.07821
## Cumulative Proportion  0.417 0.6214 0.7927 0.9218 1.00000
#Scree plot
pr.var = pr.out$sdev^2  
pr.var 
## [1] 2.0850034 1.0220598 0.8563540 0.6455506 0.3910322
pve=pr.var/sum(pr.var ) 
pve
## [1] 0.41700068 0.20441196 0.17127079 0.12911013 0.07820644
#Scree plot
plot(pve , xlab=" Principal Component ", ylab=" Proportion of Variance Explained ", ylim=c(0,1) ,type="b",main = "Scree Plot",col=3) 

library(ggfortify)
## Warning: package 'ggfortify' was built under R version 4.2.2
#PCA grouped by education
autoplot(pr.out, data = c, colour = 'Education', shape = FALSE, label.size = 3, loadings = TRUE, loadings.colour = 'black', loadings.label = TRUE, loadings.label.size = 3, main="PCA Biplot filled by Education Level", xlim=c(-0.25,0.25)) + 
  ggeasy::easy_center_title() + 
  geom_vline(xintercept=0,linetype="dotted") + 
  geom_hline(yintercept=0,linetype="dotted")

#PCA grouped by age class
autoplot(pr.out, data = c, colour = 'AgeClass', shape = FALSE, label.size = 3, loadings = TRUE, loadings.colour = 'black', loadings.label = TRUE, loadings.label.size = 3, main ="PCA Biplot filled by Age Class", xlim=c(-0.25,0.25),legend.title="Age Class") +      
  ggeasy::easy_center_title() + 
  geom_vline(xintercept=0,linetype="dotted") + 
  geom_hline(yintercept=0,linetype="dotted") + 
  guides(colour = guide_legend(title = "Age Class"))

K-means Clustering

library(NbClust)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.2.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#Elbow method
fviz_nbclust(pr.out$x[, 1:2], kmeans, method = "wss") +
  geom_vline(xintercept = 4, linetype = 2) + 
  labs(subtitle = "Elbow method")

#K-means clustering
kmeans_4 = kmeans(pr.out$x[, 1:2],centers = 4, nstart = 20)
kmeans_4
## K-means clustering with 4 clusters of sizes 44, 56, 89, 34
## 
## Cluster means:
##          PC1         PC2
## 1  1.9315927 -0.39152347
## 2 -1.7583179 -0.51876959
## 3  0.2752355 -0.08588279
## 4 -0.3241246  1.58593230
## 
## Clustering vector:
##   61   84  106  146  155  164  168  191  226  230  240  333  337  614  644  667 
##    3    4    4    3    2    3    3    3    2    4    2    3    2    2    3    2 
## 1388 1391 1394 1414 1464 1511 1557 1584 1587 1589 1617 1773   19   22   28   34 
##    3    3    2    2    2    1    2    3    2    3    1    3    3    3    3    3 
##   46   59   60   77   82   86   93   94  101  109  114  123  134  140  145  150 
##    2    1    2    4    4    2    4    3    1    3    1    3    3    3    4    1 
##  154  159  174  177  181  185  189  192  197  200  204  205  208  212  214  223 
##    3    3    2    4    3    1    4    2    2    2    4    3    4    3    3    3 
##  225  238  244  249  255  264  268  270  275  276  277  279  293  303  308  334 
##    3    2    2    3    4    1    3    1    2    3    4    2    4    3    2    2 
##  348  352  353  359  360  364  367  371  372  377  381  387  391  393  394  396 
##    4    1    1    2    4    4    3    4    1    3    3    1    2    2    2    3 
##  397  399  405  423  433  435  439  442  451  455  458  459  462  467  471  478 
##    3    4    4    4    3    1    2    3    4    1    1    2    3    2    4    4 
##  486  488  492  496  504  506  507  509  511  512  519  522  525  527  529  553 
##    3    2    2    1    3    3    2    1    4    1    4    1    2    3    1    3 
##  557  568  580  608  609  612  628  630  631  648 1385 1405 1412 1415 1418 1426 
##    1    3    3    2    4    3    3    1    4    4    2    2    4    3    3    3 
## 1427 1443 1452 1486 1494 1507 1522 1527 1535 1539 1544 1546 1564 1574 1603 1619 
##    1    1    1    1    2    3    2    3    3    3    2    3    1    2    1    3 
## 1694 1697 1715 1720 1724 1725 1728 1733 1736 1737 1738 1739 1741 1743 1745 1746 
##    3    1    1    3    1    1    3    2    3    3    1    3    3    2    1    1 
## 1748 1752 1754 1755 1756 1758 1767 1768 1769 1770 1777 1789 1792 1795 1797 1800 
##    1    3    2    3    3    4    3    3    2    3    2    3    1    1    1    3 
## 1802 1803 1806 1811 1812 1832 1834 1841 1843 1850 1863 1864 1866 1867 1868 1869 
##    3    1    3    3    1    3    2    2    3    3    3    3    1    3    4    3 
## 1875 1882 1890 1893 1900 1903 1905 1906 1907 1908 1912 1913 1917 1921 1926 
##    4    3    3    2    3    3    2    2    3    2    2    2    4    1    2 
## 
## Within cluster sum of squares by cluster:
## [1] 59.97550 67.89986 58.50801 47.78194
##  (between_SS / total_SS =  66.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
kmeans_4$tot.withinss #quality of the partition
## [1] 234.1653
#K-means clustering plot
fviz_cluster(kmeans_4,pr.out$x[, 1:2], ylim = c(-3,5)) +
  ggtitle("K-means Clustering Plot") +
  ggeasy::easy_center_title() 

#Mean of each variable in every cluster
aggregate(pca, by=list(cluster = kmeans_4$cluster), mean)
##   cluster PClinicalBarriers PAccessBarriers TrustVaccineProc  TrustGov
## 1       1          19.65909       14.590909         11.20455  9.431818
## 2       2          12.89286        9.392857         17.98214 17.910714
## 3       3          17.85393       12.820225         14.60674 13.921348
## 4       4          22.11765       14.794118         17.14706 19.411765
##   PIndBenefits
## 1     9.363636
## 2    13.089286
## 3    11.280899
## 4    12.705882

Exploratory Factor Analysis

library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
#Convert to correlation matrix
mpca = as.data.frame(pca)
mmpca = cor(mpca)
View(mmpca)

#Kaiser-Meyer-Olkin (KMO) test  
KMO(mpca)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = mpca)
## Overall MSA =  0.62
## MSA for each item = 
## PClinicalBarriers   PAccessBarriers  TrustVaccineProc          TrustGov 
##              0.61              0.71              0.60              0.58 
##      PIndBenefits 
##              0.71
#Bartlett’s statistical test
cortest.bartlett(mmpca, n=223, diag = TRUE)
## $chisq
## [1] 170.134
## 
## $p.value
## [1] 2.602334e-31
## 
## $df
## [1] 10
#Oblique factor rotation technique 
factanal(mpca, factors = 2, n.obs = 223, rotation = "promax") 
## 
## Call:
## factanal(x = mpca, factors = 2, n.obs = 223, rotation = "promax")
## 
## Uniquenesses:
## PClinicalBarriers   PAccessBarriers  TrustVaccineProc          TrustGov 
##             0.005             0.861             0.153             0.647 
##      PIndBenefits 
##             0.824 
## 
## Loadings:
##                   Factor1 Factor2
## PClinicalBarriers          1.004 
## PAccessBarriers   -0.208   0.244 
## TrustVaccineProc   0.916         
## TrustGov           0.626   0.124 
## PIndBenefits       0.399         
## 
##                Factor1 Factor2
## SS loadings      1.433   1.085
## Proportion Var   0.287   0.217
## Cumulative Var   0.287   0.504
## 
## Factor Correlations:
##         Factor1 Factor2
## Factor1   1.000  -0.351
## Factor2  -0.351   1.000
## 
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 2.5 on 1 degree of freedom.
## The p-value is 0.114