library(readxl)
library(car)
## Loading required package: carData
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(ggplot2)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(ggpubr)
library(ggrepel)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some() masks car::some()
library(usmap)
library(FactoMineR)
This exercise uses Principal Component Analysis (PCA) to reduce the dimensionality of a large data set containing variables related to regional differences in personality and total fertility rate. The selected variables for the exercise are variation in extraversion, agreeableness, conscientiousness, neuroticism, and total fertility rate measured at the U.S. state level. The data source is Junkins et al. (2021).
junkinsdata <- read_excel("C:\\Users\\anami\\OneDrive\\Documents\\Stat ll\\Assignment 2\\Junkins Data.xlsx")
head(junkinsdata)
## # A tibble: 6 × 62
## STATE_ABBR statename state region division rz_ext rz_agr rz_cns rz_neu
## <chr> <chr> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 AK Alabama 1 3 6 -0.126 -0.105 -0.110 -0.0491
## 2 AL Alaska 2 4 9 0.0430 0.100 0.0806 -0.0152
## 3 AR Arizona 4 4 8 -0.00934 -0.0154 -0.0583 0.0765
## 4 AZ Arkansas 5 3 7 0.0122 -0.00702 0.00843 -0.0436
## 5 CA California 6 4 9 -0.0430 -0.0357 -0.0654 -0.0143
## 6 CO Colorado 8 4 8 -0.00489 -0.0658 -0.0221 -0.0451
## # … with 53 more variables: rz_opn <dbl>, mzextra <dbl>, mzagree <dbl>,
## # mzconsc <dbl>, mzneuro <dbl>, mzopen <dbl>, fzextra <dbl>, fzagree <dbl>,
## # fzconsc <dbl>, fzneuro <dbl>, fzopen <dbl>, E_LT30 <dbl>, A_LT30 <dbl>,
## # C_LT30 <dbl>, N_LT30 <dbl>, O_LT30 <dbl>, E_GT30 <dbl>, A_GT30 <dbl>,
## # C_GT30 <dbl>, N_GT30 <dbl>, O_GT30 <dbl>, GenD_E <dbl>, GenD_A <dbl>,
## # GenD_C <dbl>, GenD_N <dbl>, GenD_O <dbl>, AgeD_E <dbl>, AgeD_A <dbl>,
## # AgeD_C <dbl>, AgeD_N <dbl>, AgeD_O <dbl>, TFR <dbl>, alpha <dbl>, …
junkinsdata <- as.data.frame(junkinsdata)
head(junkinsdata)
## STATE_ABBR statename state region division rz_ext rz_agr rz_cns
## 1 AK Alabama 1 3 6 -0.125762 -0.104588 -0.109966
## 2 AL Alaska 2 4 9 0.043042 0.100483 0.080650
## 3 AR Arizona 4 4 8 -0.009335 -0.015433 -0.058324
## 4 AZ Arkansas 5 3 7 0.012155 -0.007023 0.008430
## 5 CA California 6 4 9 -0.043034 -0.035667 -0.065377
## 6 CO Colorado 8 4 8 -0.004892 -0.065783 -0.022110
## rz_neu rz_opn mzextra mzagree mzconsc mzneuro mzopen fzextra
## 1 -0.049075 0.051630 -0.004242 0.412499 0.301710 -0.326062 0.427654 0.139469
## 2 -0.015238 -0.112345 0.112153 0.504055 0.357393 -0.292434 0.394611 0.250319
## 3 0.076501 -0.080592 0.090463 0.468918 0.313488 -0.242351 0.435278 0.200209
## 4 -0.043650 0.002879 0.100366 0.469782 0.343585 -0.307043 0.452290 0.219097
## 5 -0.014349 0.092209 0.058111 0.463655 0.294992 -0.271535 0.496399 0.187948
## 6 -0.045139 0.073840 0.084083 0.433223 0.310563 -0.315733 0.477640 0.211284
## fzagree fzconsc fzneuro fzopen E_LT30 A_LT30 C_LT30 N_LT30
## 1 0.534365 0.337558 -0.013555 0.419930 -0.140225 -0.133313 -0.142583 -0.041884
## 2 0.635946 0.440848 0.012200 0.300435 0.051194 0.140257 0.130334 -0.030951
## 3 0.574798 0.384941 0.061924 0.311068 -0.009714 -0.012163 -0.048157 0.072481
## 4 0.578078 0.413102 -0.013868 0.365238 -0.003936 0.007324 0.020526 -0.047188
## 5 0.557243 0.376993 -0.004257 0.411690 -0.070760 -0.023765 -0.059610 -0.013238
## 6 0.552301 0.406930 -0.010579 0.407388 -0.007388 -0.066719 -0.012239 -0.044091
## O_LT30 E_GT30 A_GT30 C_GT30 N_GT30 O_GT30 GenD_E
## 1 0.064560 -0.093362 -0.040240 -0.036899 -0.065184 0.022665 -0.143711
## 2 -0.109746 0.017092 -0.026130 -0.077508 0.034782 -0.120618 -0.138166
## 3 -0.036030 -0.008526 -0.022415 -0.080035 0.085087 -0.175739 -0.109746
## 4 0.014119 0.051829 -0.042397 -0.021394 -0.034928 -0.024835 -0.118731
## 5 0.082416 0.027033 -0.065745 -0.079949 -0.017157 0.116955 -0.129837
## 6 0.078307 0.001056 -0.063551 -0.045642 -0.047637 0.063192 -0.127201
## GenD_A GenD_C GenD_N GenD_O AgeD_E AgeD_A AgeD_C
## 1 -0.121866 -0.035848 -0.312507 0.007724 -0.046863 -0.093073 -0.105684
## 2 -0.131891 -0.083455 -0.304634 0.094176 0.034101 0.166388 0.207842
## 3 -0.105880 -0.071453 -0.304275 0.124210 -0.001189 0.010252 0.031878
## 4 -0.108296 -0.069517 -0.293175 0.087052 -0.055765 0.049721 0.041920
## 5 -0.093588 -0.082001 -0.267278 0.084709 -0.097794 0.041980 0.020339
## 6 -0.119078 -0.096367 -0.305154 0.070252 -0.008443 -0.003168 0.033403
## AgeD_N AgeD_O TFR alpha peak stop ageFB t_ageFM nevermar
## 1 0.023300 0.041896 2.3470 13.338400 23.90560 2.854100 24.3 25.85 0.316093
## 2 -0.065733 0.010872 1.8715 11.279618 24.63356 4.435501 23.6 26.60 0.291652
## 3 -0.012606 0.139709 2.0030 12.428397 23.17632 4.598583 23.0 25.65 0.263799
## 4 -0.012260 0.038953 2.0680 10.913909 25.19252 2.924860 24.0 26.80 0.316186
## 5 0.003918 -0.034539 1.9475 7.012688 28.85349 2.941833 25.6 28.30 0.360036
## 6 0.003546 0.015115 1.9240 6.950724 28.27968 3.288943 25.7 27.15 0.306618
## divorce cohabit abortion t_nmf unintprg famplnpw med_inc perAA perHisp
## 1 0.016276 8.2 12.0 31.9 53 147 64576 3.7 5.5
## 2 0.017909 4.8 12.0 45.0 55 147 40474 26.2 3.9
## 3 0.018978 5.3 8.7 39.1 56 152 38307 15.4 6.4
## 4 0.014391 7.7 15.2 41.3 51 151 46789 4.1 29.6
## 5 0.012347 8.0 27.6 33.9 56 245 57708 6.2 37.6
## 6 0.015716 8.1 15.7 29.8 48 80 54046 4.0 20.7
## perFem perBA perUrb voteO vryrel relcons
## 1 47.9 27.2 66.02 38.74 56.3 42.76
## 2 51.5 21.7 59.04 37.89 27.9 18.75
## 3 50.9 19.1 56.16 45.12 35.7 18.08
## 4 50.3 26.3 89.81 38.86 52.1 39.93
## 5 50.3 30.1 94.95 61.01 34.0 11.45
## 6 49.9 35.9 86.15 53.66 32.6 14.78
junkinsdata.pc <- prcomp(junkinsdata[,c("rz_ext","rz_agr","rz_cns","rz_neu","TFR")], center = TRUE,scale. = TRUE,retx=T,rownames=junkinsdata[,1])
## Warning: In prcomp.default(junkinsdata[, c("rz_ext", "rz_agr", "rz_cns",
## "rz_neu", "TFR")], center = TRUE, scale. = TRUE, retx = T,
## rownames = junkinsdata[, 1]) :
## extra argument 'rownames' will be disregarded
summary(junkinsdata.pc)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 1.6437 1.2042 0.62273 0.56706 0.37261
## Proportion of Variance 0.5404 0.2900 0.07756 0.06431 0.02777
## Cumulative Proportion 0.5404 0.8304 0.90792 0.97223 1.00000
The first Principal component explains 54% of the total variance in the data set. The second Principal component explains 29% of the total variance in the data set. The third Principal component explains 8% of the total variance in the data set. The fourth Principal component explains 6% of the total variance in the data set. The fifth Principal component explains 3% of the total variance in the data set. Thus, the first two principal components define most of the variance (83%) in the data.
junkinsdata.pc$rotation
## PC1 PC2 PC3 PC4 PC5
## rz_ext 0.4639894 -0.4229173 0.38631091 -0.4543772 0.50016003
## rz_agr 0.5280487 -0.1442345 -0.24189015 0.7568902 0.26261631
## rz_cns 0.5683446 -0.1498156 0.01394529 -0.1444446 -0.79591522
## rz_neu -0.3621230 -0.5587092 0.58525410 0.4086089 -0.21731851
## TFR 0.2274417 0.6824490 0.67047461 0.1811992 0.01281605
biplot(junkinsdata.pc, scale = 0,xlabs=junkinsdata[,1])
Biplot shows the importance of the relationship between different variables and principal components. Each state is plotted with its value for PC1 and PC2. The states closer to each other on the plot have similar data patterns regarding the variables in the original data set. A state with a higher value for PC1 generally has low levels of neuroticism but higher levels of extraversion, conscientiousness, and agreeableness with a moderate fertility rate. While states having a high value for PC2 are generally low in neuroticism but average levels of conscientiousness and agreeableness with higher levels of fertility rate. Also, conscientiousness and agreeableness coincide with each other in the plot; no real difference between the values of the variables.Moreover, these red arrows represent the eigen vector for each variable in the data set.
screeplot(junkinsdata.pc, type = "l", main = "Scree Plot")
abline(h=1)
The Screeplot shows the eigenvalue on the Y-axis and the number of factors on the X-axis. The plot above suggests that the analysis can generate two factors.
junkinsdata.pc$sdev^2
## [1] 2.7017908 1.4500251 0.3877923 0.3215561 0.1388356
junkinsdata.play <- PCA(junkinsdata[,c("rz_ext","rz_agr","rz_cns","rz_neu","TFR")], scale.unit=T,graph=F)
eigenvalues<-junkinsdata.play$eig
head(eigenvalues[,1:2])
## eigenvalue percentage of variance
## comp 1 2.7017908 54.035815
## comp 2 1.4500251 29.000503
## comp 3 0.3877923 7.755847
## comp 4 0.3215561 6.431123
## comp 5 0.1388356 2.776712
hist(junkinsdata.pc$x[,1])
hist(junkinsdata.pc$x[,2])
cor(junkinsdata.pc$x[,1],junkinsdata.pc$x[,2])
## [1] 2.228857e-16
scores<-data.frame(junkinsdata.pc$x)
junkinsdata<-cbind(junkinsdata,scores)
ggplot(junkinsdata, aes(PC1, PC2, PC3, col = region, fill = region)) +
stat_ellipse(geom = "polygon", col = "black", alpha = 0.5) +
geom_point(shape = 21, col = "black")
round(cor(junkinsdata[,c("rz_ext","rz_agr","rz_cns","rz_neu","TFR","PC1","PC2")]), 3)
## rz_ext rz_agr rz_cns rz_neu TFR PC1 PC2
## rz_ext 1.000 0.622 0.772 -0.098 -0.059 0.763 -0.509
## rz_agr 0.622 1.000 0.777 -0.363 0.163 0.868 -0.174
## rz_cns 0.772 0.777 1.000 -0.426 0.195 0.934 -0.180
## rz_neu -0.098 -0.363 -0.426 1.000 -0.600 -0.595 -0.673
## TFR -0.059 0.163 0.195 -0.600 1.000 0.374 0.822
## PC1 0.763 0.868 0.934 -0.595 0.374 1.000 0.000
## PC2 -0.509 -0.174 -0.180 -0.673 0.822 0.000 1.000
The table shows the correlation of original variables with principal component scores. Personality traits like extraversion, conscientiousness, and agreeableness are strongly positively related to PC1 of the analysis. Also, the fertility rate appears positively associated with the first principal component. At the same time, the total fertility rate is strongly positively correlated to PC2. And extraversion and neuroticism are negatively related to PC2 with moderate coefficients of correlation.