library(readxl)
library(car)
## Loading required package: carData
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(ggplot2)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(ggpubr)
library(ggrepel)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some() masks car::some()
library(usmap)
library(FactoMineR)
This exercise uses Principal Component Analysis (PCA) to reduce the dimensionality of a large data set containing variables related tofertility rate. The selected variables for the exercise are age at first marriage, % of hispanics population, age at first birth, % voted for obama, % of female population, and % of population having bachelors degree measured at the U.S. state level. The data source is Junkins et al. (2021).
data2 <- read_excel("C:\\Users\\anami\\OneDrive\\Documents\\Stat ll\\Assignment 2\\Junkins Data.xlsx")
head(data2)
## # A tibble: 6 × 62
## STATE_ABBR statename state region division rz_ext rz_agr rz_cns rz_neu
## <chr> <chr> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 AK Alabama 1 3 6 -0.126 -0.105 -0.110 -0.0491
## 2 AL Alaska 2 4 9 0.0430 0.100 0.0806 -0.0152
## 3 AR Arizona 4 4 8 -0.00934 -0.0154 -0.0583 0.0765
## 4 AZ Arkansas 5 3 7 0.0122 -0.00702 0.00843 -0.0436
## 5 CA California 6 4 9 -0.0430 -0.0357 -0.0654 -0.0143
## 6 CO Colorado 8 4 8 -0.00489 -0.0658 -0.0221 -0.0451
## # … with 53 more variables: rz_opn <dbl>, mzextra <dbl>, mzagree <dbl>,
## # mzconsc <dbl>, mzneuro <dbl>, mzopen <dbl>, fzextra <dbl>, fzagree <dbl>,
## # fzconsc <dbl>, fzneuro <dbl>, fzopen <dbl>, E_LT30 <dbl>, A_LT30 <dbl>,
## # C_LT30 <dbl>, N_LT30 <dbl>, O_LT30 <dbl>, E_GT30 <dbl>, A_GT30 <dbl>,
## # C_GT30 <dbl>, N_GT30 <dbl>, O_GT30 <dbl>, GenD_E <dbl>, GenD_A <dbl>,
## # GenD_C <dbl>, GenD_N <dbl>, GenD_O <dbl>, AgeD_E <dbl>, AgeD_A <dbl>,
## # AgeD_C <dbl>, AgeD_N <dbl>, AgeD_O <dbl>, TFR <dbl>, alpha <dbl>, …
data2 <- as.data.frame(data2)
head(data2)
## STATE_ABBR statename state region division rz_ext rz_agr rz_cns
## 1 AK Alabama 1 3 6 -0.125762 -0.104588 -0.109966
## 2 AL Alaska 2 4 9 0.043042 0.100483 0.080650
## 3 AR Arizona 4 4 8 -0.009335 -0.015433 -0.058324
## 4 AZ Arkansas 5 3 7 0.012155 -0.007023 0.008430
## 5 CA California 6 4 9 -0.043034 -0.035667 -0.065377
## 6 CO Colorado 8 4 8 -0.004892 -0.065783 -0.022110
## rz_neu rz_opn mzextra mzagree mzconsc mzneuro mzopen fzextra
## 1 -0.049075 0.051630 -0.004242 0.412499 0.301710 -0.326062 0.427654 0.139469
## 2 -0.015238 -0.112345 0.112153 0.504055 0.357393 -0.292434 0.394611 0.250319
## 3 0.076501 -0.080592 0.090463 0.468918 0.313488 -0.242351 0.435278 0.200209
## 4 -0.043650 0.002879 0.100366 0.469782 0.343585 -0.307043 0.452290 0.219097
## 5 -0.014349 0.092209 0.058111 0.463655 0.294992 -0.271535 0.496399 0.187948
## 6 -0.045139 0.073840 0.084083 0.433223 0.310563 -0.315733 0.477640 0.211284
## fzagree fzconsc fzneuro fzopen E_LT30 A_LT30 C_LT30 N_LT30
## 1 0.534365 0.337558 -0.013555 0.419930 -0.140225 -0.133313 -0.142583 -0.041884
## 2 0.635946 0.440848 0.012200 0.300435 0.051194 0.140257 0.130334 -0.030951
## 3 0.574798 0.384941 0.061924 0.311068 -0.009714 -0.012163 -0.048157 0.072481
## 4 0.578078 0.413102 -0.013868 0.365238 -0.003936 0.007324 0.020526 -0.047188
## 5 0.557243 0.376993 -0.004257 0.411690 -0.070760 -0.023765 -0.059610 -0.013238
## 6 0.552301 0.406930 -0.010579 0.407388 -0.007388 -0.066719 -0.012239 -0.044091
## O_LT30 E_GT30 A_GT30 C_GT30 N_GT30 O_GT30 GenD_E
## 1 0.064560 -0.093362 -0.040240 -0.036899 -0.065184 0.022665 -0.143711
## 2 -0.109746 0.017092 -0.026130 -0.077508 0.034782 -0.120618 -0.138166
## 3 -0.036030 -0.008526 -0.022415 -0.080035 0.085087 -0.175739 -0.109746
## 4 0.014119 0.051829 -0.042397 -0.021394 -0.034928 -0.024835 -0.118731
## 5 0.082416 0.027033 -0.065745 -0.079949 -0.017157 0.116955 -0.129837
## 6 0.078307 0.001056 -0.063551 -0.045642 -0.047637 0.063192 -0.127201
## GenD_A GenD_C GenD_N GenD_O AgeD_E AgeD_A AgeD_C
## 1 -0.121866 -0.035848 -0.312507 0.007724 -0.046863 -0.093073 -0.105684
## 2 -0.131891 -0.083455 -0.304634 0.094176 0.034101 0.166388 0.207842
## 3 -0.105880 -0.071453 -0.304275 0.124210 -0.001189 0.010252 0.031878
## 4 -0.108296 -0.069517 -0.293175 0.087052 -0.055765 0.049721 0.041920
## 5 -0.093588 -0.082001 -0.267278 0.084709 -0.097794 0.041980 0.020339
## 6 -0.119078 -0.096367 -0.305154 0.070252 -0.008443 -0.003168 0.033403
## AgeD_N AgeD_O TFR alpha peak stop ageFB t_ageFM nevermar
## 1 0.023300 0.041896 2.3470 13.338400 23.90560 2.854100 24.3 25.85 0.316093
## 2 -0.065733 0.010872 1.8715 11.279618 24.63356 4.435501 23.6 26.60 0.291652
## 3 -0.012606 0.139709 2.0030 12.428397 23.17632 4.598583 23.0 25.65 0.263799
## 4 -0.012260 0.038953 2.0680 10.913909 25.19252 2.924860 24.0 26.80 0.316186
## 5 0.003918 -0.034539 1.9475 7.012688 28.85349 2.941833 25.6 28.30 0.360036
## 6 0.003546 0.015115 1.9240 6.950724 28.27968 3.288943 25.7 27.15 0.306618
## divorce cohabit abortion t_nmf unintprg famplnpw med_inc perAA perHisp
## 1 0.016276 8.2 12.0 31.9 53 147 64576 3.7 5.5
## 2 0.017909 4.8 12.0 45.0 55 147 40474 26.2 3.9
## 3 0.018978 5.3 8.7 39.1 56 152 38307 15.4 6.4
## 4 0.014391 7.7 15.2 41.3 51 151 46789 4.1 29.6
## 5 0.012347 8.0 27.6 33.9 56 245 57708 6.2 37.6
## 6 0.015716 8.1 15.7 29.8 48 80 54046 4.0 20.7
## perFem perBA perUrb voteO vryrel relcons
## 1 47.9 27.2 66.02 38.74 56.3 42.76
## 2 51.5 21.7 59.04 37.89 27.9 18.75
## 3 50.9 19.1 56.16 45.12 35.7 18.08
## 4 50.3 26.3 89.81 38.86 52.1 39.93
## 5 50.3 30.1 94.95 61.01 34.0 11.45
## 6 49.9 35.9 86.15 53.66 32.6 14.78
data2.pc <- prcomp(data2[,c("ageFB","perHisp","t_ageFM","voteO","perBA","perFem")], center = TRUE,scale. = TRUE,retx=T,rownames=junkinsdata[,1],rownames=data2[,1]
)
## Warning: In prcomp.default(data2[, c("ageFB", "perHisp", "t_ageFM", "voteO",
## "perBA", "perFem")], center = TRUE, scale. = TRUE, retx = T,
## rownames = junkinsdata[, 1], rownames = data2[, 1]) :
## extra arguments 'rownames', 'rownames' will be disregarded
summary(data2.pc)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.741 1.0757 0.9453 0.7756 0.5014 0.25736
## Proportion of Variance 0.505 0.1928 0.1489 0.1003 0.0419 0.01104
## Cumulative Proportion 0.505 0.6979 0.8468 0.9471 0.9890 1.00000
data2.pc$rotation
## PC1 PC2 PC3 PC4 PC5 PC6
## ageFB 0.52784156 -0.05702974 0.31694782 0.1843058 -0.12954141 0.75294447
## perHisp 0.08883248 -0.66968278 -0.70637345 0.1205503 0.05570787 0.16442191
## t_ageFM 0.50756882 0.16575556 -0.21542141 0.1626877 -0.68796132 -0.41077311
## voteO 0.40546972 -0.07512066 -0.03294675 -0.9041343 0.09998293 -0.03755438
## perBA 0.46929845 -0.29543208 0.32248691 0.3009644 0.52605426 -0.47028546
## perFem 0.26686932 0.65412113 -0.49909012 0.1303587 0.46914306 0.12135392
biplot(data2.pc, scale = 0,xlabs=data2[,1])
Biplot shows the importance of the relationship between different variables and principal components. Each state is plotted with its value for PC1 and PC2. The states closer to each other on the plot have similar data patterns regarding the variables in the original data set. A state with a higher value for PC1 generally has high age of first marriage and increased age at first birth but a lower % of the Hispanic population. While states having a high value for PC2 generally have a high % of female population but a low % of Hispanics. Moreover, these red arrows represent the eigenvector for each variable in the data set.
screeplot(data2.pc, type = "l", main = "Scree Plot")
abline(h=1)
The Screeplot shows the eigenvalue on the Y-axis and the number of factors on the X-axis. The plot above suggests that the analysis can generate two factors.
data2.pc$sdev^2
## [1] 3.03022920 1.15707982 0.89356111 0.60150696 0.25138880 0.06623411
data2.play <- PCA(data2[,c("ageFB","perHisp","t_ageFM","voteO","perBA","perFem")], scale.unit=T,graph=F)
eigenvalues<-data2.play$eig
head(eigenvalues[,1:2])
## eigenvalue percentage of variance
## comp 1 3.03022920 50.503820
## comp 2 1.15707982 19.284664
## comp 3 0.89356111 14.892685
## comp 4 0.60150696 10.025116
## comp 5 0.25138880 4.189813
## comp 6 0.06623411 1.103902
hist(data2.pc$x[,1])
hist(data2.pc$x[,2])
cor(data2.pc$x[,1],data2.pc$x[,2])
## [1] 6.82294e-18
scores<-data.frame(data2.pc$x)
data2<-cbind(data2,scores)
ggplot(data2, aes(PC1, PC2, col = region, fill = region)) +
stat_ellipse(geom = "polygon", col = "black", alpha = 0.5) +
geom_point(shape = 21, col = "black")
round(cor(data2[,c("ageFB","perHisp","t_ageFM","voteO","perBA","perFem","PC1","PC2")]), 3)
## ageFB perHisp t_ageFM voteO perBA perFem PC1 PC2
## ageFB 1.000 0.006 0.760 0.539 0.854 0.248 0.919 -0.061
## perHisp 0.006 1.000 0.142 0.124 0.176 -0.103 0.155 -0.720
## t_ageFM 0.760 0.142 1.000 0.511 0.554 0.560 0.884 0.178
## voteO 0.539 0.124 0.511 1.000 0.444 0.226 0.706 -0.081
## perBA 0.854 0.176 0.554 0.444 1.000 0.094 0.817 -0.318
## perFem 0.248 -0.103 0.560 0.226 0.094 1.000 0.465 0.704
## PC1 0.919 0.155 0.884 0.706 0.817 0.465 1.000 0.000
## PC2 -0.061 -0.720 0.178 -0.081 -0.318 0.704 0.000 1.000
The table shows the correlation of original variables with principal component scores. Variables- the age at first marriage, age at first birth, and % of the population having a bachelor’s degree are strongly positively related to PC1 of the analysis. At the same time, % of the female population is strongly positively correlated to PC2. And % of Hispanics are conversely associated with the PC2 component of the analysis.
The main purpose is to find clusters of States such that observations within each group are quite similar. In contrast, the observations in different clusters are quite different from each other.
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
data3 <- subset(data2, select = c("ageFB","perHisp","t_ageFM","voteO","perBA","perFem"))
datascaled <- scale(data3)
fviz_nbclust(datascaled, kmeans, method = "wss")
### Interpretation For the above plot, it appears that there is a bit of
an elbow or “bend” at k = 3 clusters.
gap_stat <- clusGap(datascaled,
FUN = kmeans,
nstart = 25,
K.max = 10,
B = 50)
#plot number of clusters vs. gap statistic
fviz_gap_stat(gap_stat)
set.seed(1)
km <- kmeans(datascaled, centers = 3, nstart = 25)
km
## K-means clustering with 3 clusters of sizes 6, 25, 19
##
## Cluster means:
## ageFB perHisp t_ageFM voteO perBA perFem
## 1 -0.4119258 2.27847649 0.1820592 0.07167521 -0.2644728 -0.3908453
## 2 -0.6781705 -0.47513349 -0.6665669 -0.61477732 -0.6280180 -0.2078586
## 3 1.0224114 -0.09434325 0.8195694 0.78628324 0.9098572 0.3969231
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 2 2 2 1 1 3 3 3 1 2 3 2 2 3 2 2 2 2 3 3 3 3 3 2 2 2
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## 2 2 2 3 3 1 1 3 2 2 3 3 3 2 2 2 1 2 3 3 3 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 17.67363 81.23874 49.45491
## (between_SS / total_SS = 49.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
From the output, we can see that 6 states are assigned to the first cluster. 25 states are assigned to the second cluster. 19 states are assigned to the third cluster.
fviz_cluster(km, data = datascaled)
### finding means of each cluster
aggregate(datascaled, by=list(cluster=km$cluster), mean)
## cluster ageFB perHisp t_ageFM voteO perBA perFem
## 1 1 -0.4119258 2.27847649 0.1820592 0.07167521 -0.2644728 -0.3908453
## 2 2 -0.6781705 -0.47513349 -0.6665669 -0.61477732 -0.6280180 -0.2078586
## 3 3 1.0224114 -0.09434325 0.8195694 0.78628324 0.9098572 0.3969231
data2 <- cbind(data2, cluster = km$cluster)
data2$newcluster <- car::recode(data2$cluster,"1=1;2=3;3=2")
boxplot(data2$PC1 ~ data2$newcluster,
col='steelblue',
xlab='Cluster',
ylab='Value on PC1')
boxplot(data2$PC2 ~ data2$newcluster,
col='steelblue',
xlab='Cluster',
ylab='Value on PC2')
### Comparing to Region’s classification
table(data2$region,data2$newcluster)
##
## 1 2 3
## 1 1 7 1
## 2 0 4 8
## 3 4 3 9
## 4 1 5 7
Comments
I should have focused more on the literature part and then the selection of variables for the analysis ( PCA and K-means clustering). Before starting to run codes, I should be familiar with the analysis’s main idea. First, I blindly ran the code and completed my assignment 2. But now I know the basics of PCA and clustering ( why am I doing these analyses).