Assignment 2

Author

Jacob M. Souch

Assignment 2

library(janitor)

Attaching package: 'janitor'
The following objects are masked from 'package:stats':

    chisq.test, fisher.test
library(readxl)
library(car)
Loading required package: carData
library(ggplot2)
library(stargazer)

Please cite as: 
 Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
 R package version 5.2.3. https://CRAN.R-project.org/package=stargazer 
library(ggpubr)
library(ggrepel)
library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2
──
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.3.0      ✔ stringr 1.5.0 
✔ readr   2.1.3      ✔ forcats 0.5.2 
✔ purrr   1.0.1      
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
✖ dplyr::recode() masks car::recode()
✖ purrr::some()   masks car::some()
Junkins <- read_excel("C:/Users/jacob/OneDrive - University of Texas at San Antonio/Courses/Stats for Demographic Data 2/Homework 1/Junkins Data.xlsx") %>% as.data.frame()

Junkins$STATE_ABBR <- NULL


Junkins$statabb <- case_when(state.name == Junkins$statename ~ state.abb)

#Junkins <-filter(Junkins, STATEAB!="DC")
Policy.pc <- prcomp(Junkins[,c( "cohabit","relcons","abortion","famplnpw","perBA")], center = TRUE,scale. = TRUE,retx=T,rownames=Junkins[,62])
Warning: In prcomp.default(Junkins[, c("cohabit", "relcons", "abortion", 
    "famplnpw", "perBA")], center = TRUE, scale. = TRUE, retx = T, 
    rownames = Junkins[, 62]) :
 extra argument 'rownames' will be disregarded
plot(Policy.pc)

biplot(Policy.pc, scale = 0,xlabs=Junkins[,62])

Policy.pc
Standard deviations (1, .., p=5):
[1] 1.4387920 1.0312699 0.9018942 0.8171225 0.6206913

Rotation (n x k) = (5 x 5):
                 PC1         PC2         PC3        PC4         PC5
cohabit   0.47098683 -0.03012803  0.70152112  0.3240984  0.42437255
relcons  -0.42826473 -0.29362850  0.53860864 -0.6600198  0.06816326
abortion  0.50712785 -0.21560143 -0.41479365 -0.5195491  0.50433360
famplnpw  0.05113282 -0.92751848 -0.08018794  0.2920140 -0.21305551
perBA     0.57876378  0.07810319  0.19820438 -0.3226929 -0.71799490
Policy.pc$rotation
                 PC1         PC2         PC3        PC4         PC5
cohabit   0.47098683 -0.03012803  0.70152112  0.3240984  0.42437255
relcons  -0.42826473 -0.29362850  0.53860864 -0.6600198  0.06816326
abortion  0.50712785 -0.21560143 -0.41479365 -0.5195491  0.50433360
famplnpw  0.05113282 -0.92751848 -0.08018794  0.2920140 -0.21305551
perBA     0.57876378  0.07810319  0.19820438 -0.3226929 -0.71799490
summary(Policy.pc)
Importance of components:
                         PC1    PC2    PC3    PC4     PC5
Standard deviation     1.439 1.0313 0.9019 0.8171 0.62069
Proportion of Variance 0.414 0.2127 0.1627 0.1335 0.07705
Cumulative Proportion  0.414 0.6267 0.7894 0.9230 1.00000
centroid_labels <- usmapdata::centroid_labels("states")

Only PC1 and PC2 have SDs greater than 1. This means that only PC1 and PC2 are salient to subsequent analysis and interpretation.

plot(Policy.pc)

plot(Junkins[,40:50], pch=20)

p <- biplot(Policy.pc, scale = 0,xlabs=Junkins[,1])

#ggbiplot(Policy.pc, labels =  rownames(Junkins))


library(ggfortify)
ggplot2::autoplot(Policy.pc, label = TRUE, loadings.label = TRUE)

screeplot(Policy.pc, type = "l", main = "Scree Plot")
abline(h=1)

Interpretation

Principal Component Analysis (PCA) was used to analyze total fertility rate, divorce rate, unintentional pregnancy rate, total age at first marriage, and bachelor’s degree attainment at the state-level. The Scree plot suggests that two-components are sufficient to explain variation. PC1 and PC2 cumulatively explain 62.7% of the variation in the features of the data.

The density plot and ellipse plots suggest that the strongest divergence occurs between the Northeast and the South.

PC1

The loadings on PC1 are similar loading (all loadings are >0.4) with the exception of famplnpw (family planning expenditures) (0.05). PC1 appears to be dependent educational attainment (0.57) and cohabitation behaviors (0.47). Given these loading values, PC1 may be capturing a measure of a progressive lifestyle. It follows that non-coastal states like Missouri and Kentucky score highly in PC1.

scores<-as.data.frame(Policy.pc$x)

Policy<-cbind(scores,Junkins)

Policy$fips <- sprintf("%02d", as.numeric(as.character(Policy$state)))
ggplot(Policy, aes(PC1, PC2, col = Species, fill = region)) +
  stat_ellipse(geom = "polygon", col = "black", alpha = 0.5) +
  geom_point(shape = 21, col = "black")

ggplot(Policy, aes(PC1, PC2, col = region, fill = region))+
  stat_density_2d(geom = "contour")+
  geom_point()

library(usmap)
library(ggplot2)


plot_usmap(data =  Policy, values = "PC1", labels = TRUE)+
scale_fill_continuous(low = "blue", high ="orange")+
  ggtitle("PC1 at State-Level")

plot_usmap(data =  Policy, values = "PC2", labels = TRUE)+
scale_fill_continuous(low = "blue", high ="orange")+
  ggtitle("PC2 at State-Level")

PC2

The loadings on PC2 are more varied compared to PC1. Familyplanpw loads highly on to PC2 (-0.93), followed by relcons (-0.21), perBA (-0.29). PC2 appears to be capturing conservative family orientation (associated with a lack of family planning and abortion). It follows that states like California would score lowly on this component.

Assignment 3

str(Junkins)
'data.frame':   50 obs. of  62 variables:
 $ statename: chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
 $ state    : num  1 2 4 5 6 8 9 10 12 13 ...
 $ region   : chr  "3" "4" "4" "3" ...
 $ division : chr  "6" "9" "8" "7" ...
 $ rz_ext   : num  -0.12576 0.04304 -0.00933 0.01215 -0.04303 ...
 $ rz_agr   : num  -0.10459 0.10048 -0.01543 -0.00702 -0.03567 ...
 $ rz_cns   : num  -0.10997 0.08065 -0.05832 0.00843 -0.06538 ...
 $ rz_neu   : num  -0.0491 -0.0152 0.0765 -0.0437 -0.0143 ...
 $ rz_opn   : num  0.05163 -0.11235 -0.08059 0.00288 0.09221 ...
 $ mzextra  : num  -0.00424 0.11215 0.09046 0.10037 0.05811 ...
 $ mzagree  : num  0.412 0.504 0.469 0.47 0.464 ...
 $ mzconsc  : num  0.302 0.357 0.313 0.344 0.295 ...
 $ mzneuro  : num  -0.326 -0.292 -0.242 -0.307 -0.272 ...
 $ mzopen   : num  0.428 0.395 0.435 0.452 0.496 ...
 $ fzextra  : num  0.139 0.25 0.2 0.219 0.188 ...
 $ fzagree  : num  0.534 0.636 0.575 0.578 0.557 ...
 $ fzconsc  : num  0.338 0.441 0.385 0.413 0.377 ...
 $ fzneuro  : num  -0.01355 0.0122 0.06192 -0.01387 -0.00426 ...
 $ fzopen   : num  0.42 0.3 0.311 0.365 0.412 ...
 $ E_LT30   : num  -0.14022 0.05119 -0.00971 -0.00394 -0.07076 ...
 $ A_LT30   : num  -0.13331 0.14026 -0.01216 0.00732 -0.02377 ...
 $ C_LT30   : num  -0.1426 0.1303 -0.0482 0.0205 -0.0596 ...
 $ N_LT30   : num  -0.0419 -0.031 0.0725 -0.0472 -0.0132 ...
 $ O_LT30   : num  0.0646 -0.1097 -0.036 0.0141 0.0824 ...
 $ E_GT30   : num  -0.09336 0.01709 -0.00853 0.05183 0.02703 ...
 $ A_GT30   : num  -0.0402 -0.0261 -0.0224 -0.0424 -0.0657 ...
 $ C_GT30   : num  -0.0369 -0.0775 -0.08 -0.0214 -0.0799 ...
 $ N_GT30   : num  -0.0652 0.0348 0.0851 -0.0349 -0.0172 ...
 $ O_GT30   : num  0.0227 -0.1206 -0.1757 -0.0248 0.117 ...
 $ GenD_E   : num  -0.144 -0.138 -0.11 -0.119 -0.13 ...
 $ GenD_A   : num  -0.1219 -0.1319 -0.1059 -0.1083 -0.0936 ...
 $ GenD_C   : num  -0.0358 -0.0835 -0.0715 -0.0695 -0.082 ...
 $ GenD_N   : num  -0.313 -0.305 -0.304 -0.293 -0.267 ...
 $ GenD_O   : num  0.00772 0.09418 0.12421 0.08705 0.08471 ...
 $ AgeD_E   : num  -0.04686 0.0341 -0.00119 -0.05577 -0.09779 ...
 $ AgeD_A   : num  -0.0931 0.1664 0.0103 0.0497 0.042 ...
 $ AgeD_C   : num  -0.1057 0.2078 0.0319 0.0419 0.0203 ...
 $ AgeD_N   : num  0.0233 -0.06573 -0.01261 -0.01226 0.00392 ...
 $ AgeD_O   : num  0.0419 0.0109 0.1397 0.039 -0.0345 ...
 $ TFR      : num  2.35 1.87 2 2.07 1.95 ...
 $ alpha    : num  13.34 11.28 12.43 10.91 7.01 ...
 $ peak     : num  23.9 24.6 23.2 25.2 28.9 ...
 $ stop     : num  2.85 4.44 4.6 2.92 2.94 ...
 $ ageFB    : num  24.3 23.6 23 24 25.6 25.7 27.2 25 25 24.5 ...
 $ t_ageFM  : num  25.9 26.6 25.6 26.8 28.3 ...
 $ nevermar : num  0.316 0.292 0.264 0.316 0.36 ...
 $ divorce  : num  0.0163 0.0179 0.019 0.0144 0.0123 ...
 $ cohabit  : num  8.2 4.8 5.3 7.7 8 8.1 6.5 7 7.6 6.1 ...
 $ abortion : num  12 12 8.7 15.2 27.6 15.7 24.6 40 27.2 19.2 ...
 $ t_nmf    : num  31.9 45 39.1 41.3 33.9 29.8 32 42.5 40.5 39.6 ...
 $ unintprg : num  53 55 56 51 56 48 51 60 59 57 ...
 $ famplnpw : num  147 147 152 151 245 80 65 143 92 142 ...
 $ med_inc  : num  64576 40474 38307 46789 57708 ...
 $ perAA    : num  3.7 26.2 15.4 4.1 6.2 4 10.1 21.4 16 30.5 ...
 $ perHisp  : num  5.5 3.9 6.4 29.6 37.6 20.7 13.4 8.2 22.5 8.8 ...
 $ perFem   : num  47.9 51.5 50.9 50.3 50.3 49.9 51.3 51.6 51.1 51.2 ...
 $ perBA    : num  27.2 21.7 19.1 26.3 30.1 35.9 35.2 27.7 25.9 27.2 ...
 $ perUrb   : num  66 59 56.2 89.8 95 ...
 $ voteO    : num  38.7 37.9 45.1 38.9 61 ...
 $ vryrel   : num  56.3 27.9 35.7 52.1 34 32.6 31.4 34.1 39 48.5 ...
 $ relcons  : num  42.8 18.8 18.1 39.9 11.4 ...
 $ statabb  : chr  "AL" "AK" "AZ" "AR" ...

data_num <- Junkins_nums[ ,Junkins_nums]

Junkins_nums <- select_if(Junkins, is.numeric)  

par(mfcol=c(5,3),mai=c(0.5,0.5,0.5,0))

pairs(Junkins_nums[,1:20])

pairs(Junkins_nums[,21:30])

pairs(Junkins_nums[,31:40])

pairs(Junkins_nums[,41:50])

pairs(Junkins_nums[,51:58])

Junkins_vars <- Junkins_nums %>% select(relcons,perBA,famplnpw) %>% scale()


Junkins_zscaled <- Junkins_nums %>% scale()

Junkins_distmatx <- dist(Junkins_vars)

Junkins_distmatx %>% hclust() %>% plot(labels =Junkins$statename)

Junkins_distmatx %>% hclust(method ="median") %>% plot(labels = Junkins$statename)