Loading libraries

library(readxl)
library(car)
## Loading required package: carData
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(ggplot2)
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(ggpubr)
library(ggrepel)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ✔ purrr   0.3.4      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some()   masks car::some()
library(usmap)
library(FactoMineR)

Objective:

This exercise uses Principal Component Analysis (PCA) to reduce the dimensionality of a large data set containing variables related tofertility rate. The selected variables for the exercise are age at first marriage, % of hispanics population, age at first birth, % voted for obama, % of female population, and % of population having bachelors degree measured at the U.S. state level. The data source is Junkins et al. (2021).

Reading the Excel file

data2 <- read_excel("C:\\Users\\anami\\OneDrive\\Documents\\Stat ll\\Assignment 2\\Junkins Data.xlsx")
head(data2)
## # A tibble: 6 × 62
##   STATE_ABBR statename  state region division   rz_ext   rz_agr   rz_cns  rz_neu
##   <chr>      <chr>      <dbl> <chr>  <chr>       <dbl>    <dbl>    <dbl>   <dbl>
## 1 AK         Alabama        1 3      6        -0.126   -0.105   -0.110   -0.0491
## 2 AL         Alaska         2 4      9         0.0430   0.100    0.0806  -0.0152
## 3 AR         Arizona        4 4      8        -0.00934 -0.0154  -0.0583   0.0765
## 4 AZ         Arkansas       5 3      7         0.0122  -0.00702  0.00843 -0.0436
## 5 CA         California     6 4      9        -0.0430  -0.0357  -0.0654  -0.0143
## 6 CO         Colorado       8 4      8        -0.00489 -0.0658  -0.0221  -0.0451
## # … with 53 more variables: rz_opn <dbl>, mzextra <dbl>, mzagree <dbl>,
## #   mzconsc <dbl>, mzneuro <dbl>, mzopen <dbl>, fzextra <dbl>, fzagree <dbl>,
## #   fzconsc <dbl>, fzneuro <dbl>, fzopen <dbl>, E_LT30 <dbl>, A_LT30 <dbl>,
## #   C_LT30 <dbl>, N_LT30 <dbl>, O_LT30 <dbl>, E_GT30 <dbl>, A_GT30 <dbl>,
## #   C_GT30 <dbl>, N_GT30 <dbl>, O_GT30 <dbl>, GenD_E <dbl>, GenD_A <dbl>,
## #   GenD_C <dbl>, GenD_N <dbl>, GenD_O <dbl>, AgeD_E <dbl>, AgeD_A <dbl>,
## #   AgeD_C <dbl>, AgeD_N <dbl>, AgeD_O <dbl>, TFR <dbl>, alpha <dbl>, …
data2 <- as.data.frame(data2)

head(data2)
##   STATE_ABBR  statename state region division    rz_ext    rz_agr    rz_cns
## 1         AK    Alabama     1      3        6 -0.125762 -0.104588 -0.109966
## 2         AL     Alaska     2      4        9  0.043042  0.100483  0.080650
## 3         AR    Arizona     4      4        8 -0.009335 -0.015433 -0.058324
## 4         AZ   Arkansas     5      3        7  0.012155 -0.007023  0.008430
## 5         CA California     6      4        9 -0.043034 -0.035667 -0.065377
## 6         CO   Colorado     8      4        8 -0.004892 -0.065783 -0.022110
##      rz_neu    rz_opn   mzextra  mzagree  mzconsc   mzneuro   mzopen  fzextra
## 1 -0.049075  0.051630 -0.004242 0.412499 0.301710 -0.326062 0.427654 0.139469
## 2 -0.015238 -0.112345  0.112153 0.504055 0.357393 -0.292434 0.394611 0.250319
## 3  0.076501 -0.080592  0.090463 0.468918 0.313488 -0.242351 0.435278 0.200209
## 4 -0.043650  0.002879  0.100366 0.469782 0.343585 -0.307043 0.452290 0.219097
## 5 -0.014349  0.092209  0.058111 0.463655 0.294992 -0.271535 0.496399 0.187948
## 6 -0.045139  0.073840  0.084083 0.433223 0.310563 -0.315733 0.477640 0.211284
##    fzagree  fzconsc   fzneuro   fzopen    E_LT30    A_LT30    C_LT30    N_LT30
## 1 0.534365 0.337558 -0.013555 0.419930 -0.140225 -0.133313 -0.142583 -0.041884
## 2 0.635946 0.440848  0.012200 0.300435  0.051194  0.140257  0.130334 -0.030951
## 3 0.574798 0.384941  0.061924 0.311068 -0.009714 -0.012163 -0.048157  0.072481
## 4 0.578078 0.413102 -0.013868 0.365238 -0.003936  0.007324  0.020526 -0.047188
## 5 0.557243 0.376993 -0.004257 0.411690 -0.070760 -0.023765 -0.059610 -0.013238
## 6 0.552301 0.406930 -0.010579 0.407388 -0.007388 -0.066719 -0.012239 -0.044091
##      O_LT30    E_GT30    A_GT30    C_GT30    N_GT30    O_GT30    GenD_E
## 1  0.064560 -0.093362 -0.040240 -0.036899 -0.065184  0.022665 -0.143711
## 2 -0.109746  0.017092 -0.026130 -0.077508  0.034782 -0.120618 -0.138166
## 3 -0.036030 -0.008526 -0.022415 -0.080035  0.085087 -0.175739 -0.109746
## 4  0.014119  0.051829 -0.042397 -0.021394 -0.034928 -0.024835 -0.118731
## 5  0.082416  0.027033 -0.065745 -0.079949 -0.017157  0.116955 -0.129837
## 6  0.078307  0.001056 -0.063551 -0.045642 -0.047637  0.063192 -0.127201
##      GenD_A    GenD_C    GenD_N   GenD_O    AgeD_E    AgeD_A    AgeD_C
## 1 -0.121866 -0.035848 -0.312507 0.007724 -0.046863 -0.093073 -0.105684
## 2 -0.131891 -0.083455 -0.304634 0.094176  0.034101  0.166388  0.207842
## 3 -0.105880 -0.071453 -0.304275 0.124210 -0.001189  0.010252  0.031878
## 4 -0.108296 -0.069517 -0.293175 0.087052 -0.055765  0.049721  0.041920
## 5 -0.093588 -0.082001 -0.267278 0.084709 -0.097794  0.041980  0.020339
## 6 -0.119078 -0.096367 -0.305154 0.070252 -0.008443 -0.003168  0.033403
##      AgeD_N    AgeD_O    TFR     alpha     peak     stop ageFB t_ageFM nevermar
## 1  0.023300  0.041896 2.3470 13.338400 23.90560 2.854100  24.3   25.85 0.316093
## 2 -0.065733  0.010872 1.8715 11.279618 24.63356 4.435501  23.6   26.60 0.291652
## 3 -0.012606  0.139709 2.0030 12.428397 23.17632 4.598583  23.0   25.65 0.263799
## 4 -0.012260  0.038953 2.0680 10.913909 25.19252 2.924860  24.0   26.80 0.316186
## 5  0.003918 -0.034539 1.9475  7.012688 28.85349 2.941833  25.6   28.30 0.360036
## 6  0.003546  0.015115 1.9240  6.950724 28.27968 3.288943  25.7   27.15 0.306618
##    divorce cohabit abortion t_nmf unintprg famplnpw med_inc perAA perHisp
## 1 0.016276     8.2     12.0  31.9       53      147   64576   3.7     5.5
## 2 0.017909     4.8     12.0  45.0       55      147   40474  26.2     3.9
## 3 0.018978     5.3      8.7  39.1       56      152   38307  15.4     6.4
## 4 0.014391     7.7     15.2  41.3       51      151   46789   4.1    29.6
## 5 0.012347     8.0     27.6  33.9       56      245   57708   6.2    37.6
## 6 0.015716     8.1     15.7  29.8       48       80   54046   4.0    20.7
##   perFem perBA perUrb voteO vryrel relcons
## 1   47.9  27.2  66.02 38.74   56.3   42.76
## 2   51.5  21.7  59.04 37.89   27.9   18.75
## 3   50.9  19.1  56.16 45.12   35.7   18.08
## 4   50.3  26.3  89.81 38.86   52.1   39.93
## 5   50.3  30.1  94.95 61.01   34.0   11.45
## 6   49.9  35.9  86.15 53.66   32.6   14.78

PCA

data2.pc <- prcomp(data2[,c("ageFB","perHisp","t_ageFM","voteO","perBA","perFem")], center = TRUE,scale. = TRUE,retx=T,rownames=junkinsdata[,1],rownames=data2[,1]
)
## Warning: In prcomp.default(data2[, c("ageFB", "perHisp", "t_ageFM", "voteO", 
##     "perBA", "perFem")], center = TRUE, scale. = TRUE, retx = T, 
##     rownames = junkinsdata[, 1], rownames = data2[, 1]) :
##  extra arguments 'rownames', 'rownames' will be disregarded
summary(data2.pc)
## Importance of components:
##                          PC1    PC2    PC3    PC4    PC5     PC6
## Standard deviation     1.741 1.0757 0.9453 0.7756 0.5014 0.25736
## Proportion of Variance 0.505 0.1928 0.1489 0.1003 0.0419 0.01104
## Cumulative Proportion  0.505 0.6979 0.8468 0.9471 0.9890 1.00000

From the results, we can note the following:

  1. The standard deviation of PC1 and PC2 are greater than 1.
  2. The first Principal component explains 50.5% of the total variance in the data set. The second Principal component explains 19.3% of the total variance in the data set. The third Principal component explains 15% of the total variance in the data set. The fourth Principal component explains 10% of the total variance in the data set.. Thus, the first two principal components define 69.5% variance in the data.

Rotation

data2.pc$rotation
##                PC1         PC2         PC3        PC4         PC5         PC6
## ageFB   0.52784156 -0.05702974  0.31694782  0.1843058 -0.12954141  0.75294447
## perHisp 0.08883248 -0.66968278 -0.70637345  0.1205503  0.05570787  0.16442191
## t_ageFM 0.50756882  0.16575556 -0.21542141  0.1626877 -0.68796132 -0.41077311
## voteO   0.40546972 -0.07512066 -0.03294675 -0.9041343  0.09998293 -0.03755438
## perBA   0.46929845 -0.29543208  0.32248691  0.3009644  0.52605426 -0.47028546
## perFem  0.26686932  0.65412113 -0.49909012  0.1303587  0.46914306  0.12135392
biplot(data2.pc, scale = 0,xlabs=data2[,1])

Interpretation:

Biplot shows the importance of the relationship between different variables and principal components. Each state is plotted with its value for PC1 and PC2. The states closer to each other on the plot have similar data patterns regarding the variables in the original data set. A state with a higher value for PC1 generally has high age of first marriage and increased age at first birth but a lower % of the Hispanic population. While states having a high value for PC2 generally have a high % of female population but a low % of Hispanics. Moreover, these red arrows represent the eigenvector for each variable in the data set.

Scree Plot

screeplot(data2.pc, type = "l", main = "Scree Plot")
abline(h=1)

Interpretation:

The Screeplot shows the eigenvalue on the Y-axis and the number of factors on the X-axis. The plot above suggests that the analysis can generate two factors.

Obtaining Eigenvalue

data2.pc$sdev^2
## [1] 3.03022920 1.15707982 0.89356111 0.60150696 0.25138880 0.06623411
data2.play <- PCA(data2[,c("ageFB","perHisp","t_ageFM","voteO","perBA","perFem")], scale.unit=T,graph=F)
eigenvalues<-data2.play$eig
head(eigenvalues[,1:2])
##        eigenvalue percentage of variance
## comp 1 3.03022920              50.503820
## comp 2 1.15707982              19.284664
## comp 3 0.89356111              14.892685
## comp 4 0.60150696              10.025116
## comp 5 0.25138880               4.189813
## comp 6 0.06623411               1.103902
hist(data2.pc$x[,1])

hist(data2.pc$x[,2])

cor(data2.pc$x[,1],data2.pc$x[,2])
## [1] 6.82294e-18
scores<-data.frame(data2.pc$x)
data2<-cbind(data2,scores)

Plot

ggplot(data2, aes(PC1, PC2, col = region, fill = region)) +
  stat_ellipse(geom = "polygon", col = "black", alpha = 0.5) +
  geom_point(shape = 21, col = "black")

round(cor(data2[,c("ageFB","perHisp","t_ageFM","voteO","perBA","perFem","PC1","PC2")]), 3)
##          ageFB perHisp t_ageFM  voteO  perBA perFem   PC1    PC2
## ageFB    1.000   0.006   0.760  0.539  0.854  0.248 0.919 -0.061
## perHisp  0.006   1.000   0.142  0.124  0.176 -0.103 0.155 -0.720
## t_ageFM  0.760   0.142   1.000  0.511  0.554  0.560 0.884  0.178
## voteO    0.539   0.124   0.511  1.000  0.444  0.226 0.706 -0.081
## perBA    0.854   0.176   0.554  0.444  1.000  0.094 0.817 -0.318
## perFem   0.248  -0.103   0.560  0.226  0.094  1.000 0.465  0.704
## PC1      0.919   0.155   0.884  0.706  0.817  0.465 1.000  0.000
## PC2     -0.061  -0.720   0.178 -0.081 -0.318  0.704 0.000  1.000

Interpretation

The table shows the correlation of original variables with principal component scores. Variables- the age at first marriage, age at first birth, and % of the population having a bachelor’s degree are strongly positively related to PC1 of the analysis. At the same time, % of the female population is strongly positively correlated to PC2. And % of Hispanics are conversely associated with the PC2 component of the analysis.

Homework 3

K Means

Objective

The main purpose is to find clusters of States such that observations within each group are quite similar. In contrast, the observations in different clusters are quite different from each other.

Loading libraries

library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)

Preparing the data

data3 <- subset(data2, select = c("ageFB","perHisp","t_ageFM","voteO","perBA","perFem"))
datascaled <- scale(data3)

Finding the Optimal Number of Clusters

fviz_nbclust(datascaled, kmeans, method = "wss")

### Interpretation For the above plot, it appears that there is a bit of an elbow or “bend” at k = 3 clusters.

Calculating gap statistic based on number of clusters

gap_stat <- clusGap(datascaled,
                    FUN = kmeans,
                    nstart = 25,
                    K.max = 10,
                    B = 50)

#plot number of clusters vs. gap statistic
fviz_gap_stat(gap_stat)

Performing k-means clustering with k = 3 clusters

set.seed(1)
km <- kmeans(datascaled, centers = 3, nstart = 25)
km
## K-means clustering with 3 clusters of sizes 6, 25, 19
## 
## Cluster means:
##        ageFB     perHisp    t_ageFM       voteO      perBA     perFem
## 1 -0.4119258  2.27847649  0.1820592  0.07167521 -0.2644728 -0.3908453
## 2 -0.6781705 -0.47513349 -0.6665669 -0.61477732 -0.6280180 -0.2078586
## 3  1.0224114 -0.09434325  0.8195694  0.78628324  0.9098572  0.3969231
## 
## Clustering vector:
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
##  2  2  2  1  1  3  3  3  1  2  3  2  2  3  2  2  2  2  3  3  3  3  3  2  2  2 
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 
##  2  2  2  3  3  1  1  3  2  2  3  3  3  2  2  2  1  2  3  3  3  2  2  2 
## 
## Within cluster sum of squares by cluster:
## [1] 17.67363 81.23874 49.45491
##  (between_SS / total_SS =  49.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Interpretation

From the output, we can see that 6 states are assigned to the first cluster. 25 states are assigned to the second cluster. 19 states are assigned to the third cluster.

Plot results of final k-means model

fviz_cluster(km, data = datascaled)

### finding means of each cluster

aggregate(datascaled, by=list(cluster=km$cluster), mean)
##   cluster      ageFB     perHisp    t_ageFM       voteO      perBA     perFem
## 1       1 -0.4119258  2.27847649  0.1820592  0.07167521 -0.2644728 -0.3908453
## 2       2 -0.6781705 -0.47513349 -0.6665669 -0.61477732 -0.6280180 -0.2078586
## 3       3  1.0224114 -0.09434325  0.8195694  0.78628324  0.9098572  0.3969231

Adding cluster assigment to original data

data2 <- cbind(data2, cluster = km$cluster)

data2$newcluster <- car::recode(data2$cluster,"1=1;2=3;3=2")

boxplot(data2$PC1 ~ data2$newcluster,
                col='steelblue',
        xlab='Cluster',
        ylab='Value on PC1') 

boxplot(data2$PC2 ~ data2$newcluster,
                col='steelblue',
        xlab='Cluster',
        ylab='Value on PC2') 

### Comparing to Region’s classification

table(data2$region,data2$newcluster)
##    
##     1 2 3
##   1 1 7 1
##   2 0 4 8
##   3 4 3 9
##   4 1 5 7

Comments

I should have focused more on the literature part and then the selection of variables for the analysis ( PCA and K-means clustering). Before starting to run codes, I should be familiar with the analysis’s main idea. First, I blindly ran the code and completed my assignment 2. But now I know the basics of PCA and clustering ( why am I doing these analyses).