library(ipumsr)
usa_00012 <- read_ipums_ddi("usa_00012.xml")
tx_00012 <- read_ipums_micro(usa_00012, data_file = ("usa_00012.dat.gz"), verbose = FALSE)
library(stringr)
names(tx_00012)<-tolower(names(tx_00012))
names(tx_00012)
## [1] "year" "multyear" "sample" "serial" "cbserial" "hhwt"
## [7] "cluster" "statefip" "puma" "strata" "gq" "ownershp"
## [13] "ownershpd" "mortgage" "multgen" "multgend" "pernum" "perwt"
## [19] "sex" "age" "fertyr" "race" "raced" "hispan"
## [25] "hispand" "hcovany" "educ" "educd" "empstat" "empstatd"
## [31] "labforce" "occ" "ind" "uhrswork" "inctot" "poverty"
## [37] "presgl" "migrate1" "migrate1d"
#CS Code
tx_00012<-zap_labels(tx_00012)
tx_00012$newpuma<- paste (str_pad(tx_00012$statefip, 2,"left", "0"), str_pad(tx_00012$puma,5,"left", "0") , sep="")
table(tx_00012$newpuma)
##
## 4800100 4800200 4800300 4800400 4800501 4800502 4800600 4800700 4800800 4800900
## 6381 2938 3739 4380 2621 4524 5101 3354 6057 4489
## 4801000 4801100 4801200 4801300 4801400 4801501 4801502 4801600 4801700 4801800
## 4426 2637 2415 2817 3090 1970 2426 2581 2920 3101
## 4801901 4801902 4801903 4801904 4801905 4801906 4801907 4802001 4802002 4802003
## 3612 3179 3034 2473 3209 4642 3587 3595 3345 4668
## 4802004 4802005 4802006 4802101 4802102 4802200 4802301 4802302 4802303 4802304
## 3673 2848 4038 4406 3719 3038 2270 2279 2590 1671
## 4802305 4802306 4802307 4802308 4802309 4802310 4802311 4802312 4802313 4802314
## 2108 2884 3100 2564 3091 3342 3424 3035 4266 2173
## 4802315 4802316 4802317 4802318 4802319 4802320 4802321 4802322 4802400 4802501
## 2684 1987 1874 2418 2391 2616 3044 3514 3299 2888
## 4802502 4802503 4802504 4802505 4802506 4802507 4802508 4802509 4802510 4802511
## 3471 2584 2013 2929 2260 3033 2171 3451 3473 2190
## 4802512 4802513 4802514 4802515 4802516 4802600 4802700 4802800 4802900 4803000
## 2223 3114 3393 2812 2613 6097 3779 3142 2657 3006
## 4803100 4803200 4803301 4803302 4803303 4803304 4803305 4803306 4803400 4803501
## 2886 2886 2848 3177 2353 1860 2383 2925 4351 3140
## 4803502 4803601 4803602 4803700 4803801 4803802 4803900 4804000 4804100 4804200
## 3842 3493 4563 5594 2245 4018 3165 3182 2051 2976
## 4804301 4804302 4804400 4804501 4804502 4804503 4804504 4804601 4804602 4804603
## 2204 2751 1889 2323 2470 1924 1827 2765 2281 3073
## 4804604 4804605 4804606 4804607 4804608 4804609 4804610 4804611 4804612 4804613
## 3979 2162 2241 1897 1953 2567 1943 1918 2713 2689
## 4804614 4804615 4804616 4804617 4804618 4804619 4804620 4804621 4804622 4804623
## 2334 2141 2623 1692 1934 1850 2126 2742 1973 1866
## 4804624 4804625 4804626 4804627 4804628 4804629 4804630 4804631 4804632 4804633
## 2262 1898 2089 2249 2327 2285 1993 2371 2583 1493
## 4804634 4804635 4804636 4804637 4804638 4804701 4804702 4804801 4804802 4804803
## 1554 2264 2217 2315 1849 3994 2852 1643 2355 2406
## 4804901 4804902 4804903 4804904 4804905 4805000 4805100 4805201 4805202 4805203
## 2226 2765 1943 1952 3148 3841 3473 3301 2823 2719
## 4805204 4805301 4805302 4805303 4805304 4805305 4805306 4805307 4805308 4805309
## 3497 2436 2954 3763 3180 2788 4288 3618 3272 3513
## 4805400 4805500 4805600 4805700 4805800 4805901 4805902 4805903 4805904 4805905
## 4131 4343 2567 3792 2730 2217 2511 2318 2407 2381
## 4805906 4805907 4805908 4805909 4805910 4805911 4805912 4805913 4805914 4805915
## 2179 2627 2287 2504 2874 2913 2590 2098 3360 2684
## 4805916 4806000 4806100 4806200 4806301 4806302 4806400 4806500 4806601 4806602
## 2401 2691 2216 2539 2960 2326 1790 2866 2721 2237
## 4806603 4806701 4806702 4806703 4806801 4806802 4806803 4806804 4806805 4806806
## 2820 1845 2625 2745 1612 1303 1600 1512 2270 1495
## 4806807 4806900
## 1323 1706
bordp<-readr::read_csv("C:/Users/codar/OneDrive/Documents/Stats II/Data/border_100mi_pumas_table.csv")
## Parsed with column specification:
## cols(
## fid = col_double(),
## STATEFP10 = col_double(),
## PUMACE10 = col_character(),
## AFFGEOID10 = col_character(),
## GEOID10 = col_double(),
## NAME10 = col_character(),
## LSAD10 = col_character(),
## ALAND10 = col_double(),
## AWATER10 = col_double()
## )
mdat<-merge(tx_00012, bordp, by.x="newpuma", by.y="GEOID10")
table(mdat$newpuma)
##
## 4802800 4803200 4803301 4803302 4803303 4803304 4803305 4803306 4806000 4806100
## 3142 2886 2848 3177 2353 1860 2383 2925 2691 2216
## 4806200 4806301 4806302 4806400 4806701 4806702 4806703 4806801 4806802 4806803
## 2539 2960 2326 1790 1845 2625 2745 1612 1303 1600
## 4806804 4806805 4806806 4806807 4806900
## 1512 2270 1495 1323 1706
library(dplyr)
tx_00012<-tx_00012%>%
filter(newpuma %in% c( "4802800", "4803200","4806000", "4806100", "4806200", "4806301", "4806302", "4806701", "4806702", "4806703", "4806900" ))
View(tx_00012)
names(tx_00012)
## [1] "year" "multyear" "sample" "serial" "cbserial" "hhwt"
## [7] "cluster" "statefip" "puma" "strata" "gq" "ownershp"
## [13] "ownershpd" "mortgage" "multgen" "multgend" "pernum" "perwt"
## [19] "sex" "age" "fertyr" "race" "raced" "hispan"
## [25] "hispand" "hcovany" "educ" "educd" "empstat" "empstatd"
## [31] "labforce" "occ" "ind" "uhrswork" "inctot" "poverty"
## [37] "presgl" "migrate1" "migrate1d" "newpuma"
describe(tx_00012$income)
## Warning: Unknown or uninitialised column: `income`.
## Warning in min(x, na.rm = T): no non-missing arguments to min; returning Inf
## Warning in max(x, na.rm = T): no non-missing arguments to max; returning -Inf
## [0 obs.]
## NULL: NULL
## min: Inf - max: -Inf - NAs: 0 (NaN%) - 0 unique values
##
## X0L X0L.1
## 1 0 0
summary(tx_00012$educ)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 6.000 6.000 6.722 8.000 11.000
class(tx_00012$educ)
## [1] "integer"
#Recodes
tx_hw7 <-tx_00012 %>%
mutate(sex=case_when(sex == 1~0,
sex == 2~ 1,
TRUE ~ NA_real_),
lfpart=case_when(labforce== 1 ~ 0,
labforce== 2 ~ 1,
TRUE ~ NA_real_),
edu=case_when(educ== 0 ~ 'none',
educ %in% 1:5 ~ 'hs incomplete',
educ %in% 6 ~ 'hs complete',
educ %in% 7:11 ~ 'college',
TRUE ~ NA_character_),
edu3=case_when(educ %in% 1:5 ~ 1,
educ %in% 6 ~ 2,
educ %in% 7:11 ~ 3,
TRUE~NA_real_),
race=case_when(race== 1 ~ 'white',
race== 2 ~ 'black',
# race== 3 ~'aian',
race %in% 4:5 ~ 'asian',
race== 6 ~ 'oapi',
race== 7 ~ 'other',
race== 8 ~ 'twomajor',
race== 9 ~ 'threemoremaj',
TRUE ~ NA_character_),
hisp= case_when(hispan !=0 ~ "Latino",
hispan==0 ~'NL',
hispan==9 ~ 'NL',
TRUE ~ NA_character_),
migrate1=case_when(migrate1==1 ~ 'same house',
migrate1==2 ~ 'movinstate',
migrate1==3 ~ 'abroad1yr',
TRUE ~ NA_character_),
fertyr=case_when(fertyr== 1 ~ 0,
fertyr== 2 ~ 1,
TRUE~ NA_real_ ),
poverty1=case_when(poverty==001 ~ "1% or less",
poverty ==501 ~ "501% or more",
TRUE~ NA_character_),
hcov=case_when(hcovany == 1 ~ 0,
hcovany == 2 ~ 1,
TRUE~NA_real_),
ownhome=case_when(ownershp==1 ~ 1,
ownershp==2 ~ 0,
TRUE ~ NA_real_),
multgen1=case_when(multgen==1 ~ 1,
multgen==2 ~ 2,
multgen==3 ~ 3,
TRUE~NA_real_))
# mgmt = if_else(occ %in% c(10:160) | occ %in% c(220:730), 1, 0)) #occupational prestige
# occ=case_when(occ %in% 10:160 ~ 'Mgmt/Biz',
# occ %in% 220:730 ~ 'Mgmt/Biz',
# occ %in% 800:950 ~ 'Finance',
# # occ %in% 1000:1240 ~ 'STEM',
# occ %in% 1300:1540 ~ 'Arch/Eng',
# occ %in% 1550:1560 ~ 'Technical',
# # occ %in% 1600:1760 ~ 'STEM',
# occ %in% 1800:1840 ~ 'SocSTEM',
# occ %in% 1900:1980 ~ 'Technical',
# occ %in% 2000:2060 ~ 'PublicServ',
# occ == 2100 ~ 'Law',
# occ %in% 2140:2150 ~ 'Technical',
# occ %in% 2200:2430 ~ 'Education',
# occ %in% 2440:2550 ~ 'Technical',
# occ %in% 2600:2910 ~ 'A&E/Sports/Media',
# occ == 2920 ~ 'Technical',
# occ %in% 3000:3500 ~ 'Health/Med',
# occ %in% 3510:3650 ~ 'Technical',
# occ %in% 3700:3950 ~ 'PublicServ',
# occ == 4000 ~ 'A&E/Sports/Media',
# occ %in% 4010:4965 ~ 'Sales/Service',
# occ %in% 5000: 5940 ~ 'Office/Admin',
# occ %in% 6200:8965 ~ 'SkilledTrade',
# occ %in% 9000:9750 ~ 'Transport',
# TRUE~ NA_character_))
sapply(tx_hw7, class)
## year multyear sample serial cbserial hhwt
## "integer" "numeric" "integer" "numeric" "numeric" "numeric"
## cluster statefip puma strata gq ownershp
## "numeric" "integer" "numeric" "numeric" "integer" "integer"
## ownershpd mortgage multgen multgend pernum perwt
## "integer" "integer" "integer" "integer" "numeric" "numeric"
## sex age fertyr race raced hispan
## "numeric" "integer" "numeric" "character" "integer" "integer"
## hispand hcovany educ educd empstat empstatd
## "integer" "integer" "integer" "integer" "integer" "integer"
## labforce occ ind uhrswork inctot poverty
## "integer" "numeric" "numeric" "integer" "numeric" "numeric"
## presgl migrate1 migrate1d newpuma lfpart edu
## "numeric" "character" "integer" "character" "numeric" "character"
## edu3 hisp poverty1 hcov ownhome multgen1
## "numeric" "character" "character" "numeric" "numeric" "numeric"
names(tx_hw7)
## [1] "year" "multyear" "sample" "serial" "cbserial" "hhwt"
## [7] "cluster" "statefip" "puma" "strata" "gq" "ownershp"
## [13] "ownershpd" "mortgage" "multgen" "multgend" "pernum" "perwt"
## [19] "sex" "age" "fertyr" "race" "raced" "hispan"
## [25] "hispand" "hcovany" "educ" "educd" "empstat" "empstatd"
## [31] "labforce" "occ" "ind" "uhrswork" "inctot" "poverty"
## [37] "presgl" "migrate1" "migrate1d" "newpuma" "lfpart" "edu"
## [43] "edu3" "hisp" "poverty1" "hcov" "ownhome" "multgen1"
#For this homework, you are to use the technique of Principal Components Analysis (PCA) to perform a variable reduction of at least 5 variables.If you have an idea for latent construct, state what you believe this is.Report the summary statistics and correlation matrix for your data. Report the results of the PCA, being sure to include the eigenvalues and corresponding vectors. Interpret your component(s) if possible. If deemed appropriate, conduct some testing of your index/components/latent variables.
#Latent construct: socioeconomic status
# I will use the following variables to help determine socioeconomic status: home ownership, education, employment status, income, occupational prestige, and health care coverage.
#Summary statistics and correlation matrix
tx_hw71<-tx_hw7%>%
filter(complete.cases(perwt, strata, newpuma, hcov,edu3, uhrswork, lfpart, ownhome,sex, race, inctot, presgl, empstat, hispan)) %>%
select(perwt, strata, newpuma, hcov,edu3, uhrswork, lfpart, ownhome,sex, race, inctot, presgl, empstat, hispan)%>%
mutate_at(vars(hcov,edu3, uhrswork, lfpart, ownhome,inctot, presgl), scale)
summary(tx_hw71)
## perwt strata newpuma hcov.V1
## Min. : 1.00 Min. :280048 Length:26778 Min. :-1.7569522
## 1st Qu.: 10.00 1st Qu.:600048 Class :character 1st Qu.: 0.5691462
## Median : 16.00 Median :630148 Mode :character Median : 0.5691462
## Mean : 21.12 Mean :566236 Mean : 0.0000000
## 3rd Qu.: 27.00 3rd Qu.:670248 3rd Qu.: 0.5691462
## Max. :237.00 Max. :690048 Max. : 0.5691462
##
## edu3.V1 uhrswork.V1 lfpart.V1 ownhome.V1
## Min. :-1.8318362 Min. :-2.943141 Min. : NA Min. :-1.6885440
## 1st Qu.:-0.4400669 1st Qu.:-0.151639 1st Qu.: NA 1st Qu.:-1.6885440
## Median :-0.4400669 Median :-0.000747 Median : NA Median : 0.5922041
## Mean : 0.0000000 Mean : 0.000000 Mean :NaN Mean : 0.0000000
## 3rd Qu.: 0.9517024 3rd Qu.: 0.376483 3rd Qu.: NA 3rd Qu.: 0.5922041
## Max. : 0.9517024 Max. : 4.450568 Max. : NA Max. : 0.5922041
## NA's :26778
## sex race inctot.V1 presgl.V1
## Min. :0.000 Length:26778 Min. :-0.962276 Min. :-2.7914275
## 1st Qu.:0.000 Class :character 1st Qu.:-0.528451 1st Qu.:-0.6065000
## Median :0.000 Mode :character Median :-0.244940 Median :-0.1963841
## Mean :0.468 Mean : 0.000000 Mean : 0.0000000
## 3rd Qu.:1.000 3rd Qu.: 0.199922 3rd Qu.: 0.7652668
## Max. :1.000 Max. :14.557583 Max. : 2.9714073
##
## empstat hispan
## Min. :1 Min. :0.0000
## 1st Qu.:1 1st Qu.:0.0000
## Median :1 Median :1.0000
## Mean :1 Mean :0.7902
## 3rd Qu.:1 3rd Qu.:1.0000
## Max. :1 Max. :4.0000
##
# tx_hw71$pwt <- tx_hw7$perwt/100
tx_pc<-princomp(~ownhome+hcov+edu3+uhrswork+empstat+inctot+presgl, #add employment and income and restrict to only those in lf
data=tx_hw71,
scores=T)
summary(tx_pc)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.4153143 1.0229421 0.9870530 0.8777319 0.8072633
## Proportion of Variance 0.3338649 0.1744083 0.1623850 0.1284070 0.1086164
## Cumulative Proportion 0.3338649 0.5082732 0.6706582 0.7990652 0.9076816
## Comp.6 Comp.7
## Standard deviation 0.74423763 6.715863e-09
## Proportion of Variance 0.09231839 7.517416e-18
## Cumulative Proportion 1.00000000 1.000000e+00
#pc1 explains roughly 33% of the variance and pc2 about 17%, cumulatively Comp.1 and 2 explain about 51% of the variance
#We want our eiganvalues to be above one. This occurs for the first two components only. This means I would only use those two in my index.
loadings(tx_pc)
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## ownhome 0.247 0.410 0.753 0.379 0.241
## hcov 0.410 0.250 0.201 -0.840 -0.137
## edu3 0.466 0.237 -0.469 0.302 0.638
## uhrswork 0.294 -0.743 0.193 -0.119 0.555
## empstat -1.000
## inctot 0.458 -0.374 0.143 0.248 -0.722 0.216
## presgl 0.507 0.149 -0.340 0.258 -0.733
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.143 0.143 0.143 0.143 0.143 0.143 0.143
## Cumulative Var 0.143 0.286 0.429 0.571 0.714 0.857 1.000
#Reviewing the loadings or eigenvectors, first I note that most of the variables in the first component are positively correlated with the first component. The exception is the employment status, which is suppressed. However, I suspect this may be happening because I restricted the data to only those that are employed in IPUMS. I didn't create a subset in R. What is the best approach here?
#I would also interpret the loadings to mean that higher values in owning a home (1=own, 2 = rent), health coverage (0=no, 1=yes), higher levels of education, higher levels of hours worked per week, higher levels of income, and higher levels of occupational prestige indicate better socioeconomic status.
#Screeplot
screeplot(tx_pc,
type = "l",
main = "Scree Plot")
abline(h=2)

plot(tx_pc)

screeplot(tx_pc, type = "line", main = "Scree Plot")

biplot(tx_pc)
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

scores<-data.frame(tx_pc$scores)
hist(scores$Comp.1)

hist(scores$Comp.2)

#calculate the correlation between the first 2 components to show they are orthogonal, i.e. correlation == 0
cor(scores[,1:2])
## Comp.1 Comp.2
## Comp.1 1.000000e+00 1.838385e-14
## Comp.2 1.838385e-14 1.000000e+00
tx_hw71c<-cbind(tx_hw71, scores)
names(tx_hw71c)
## [1] "perwt" "strata" "newpuma" "hcov" "edu3" "uhrswork"
## [7] "lfpart" "ownhome" "sex" "race" "inctot" "presgl"
## [13] "empstat" "hispan" "Comp.1" "Comp.2" "Comp.3" "Comp.4"
## [19] "Comp.5" "Comp.6" "Comp.7"
#to make a correlation matrix between the comps and the original values
# round(cor(tx_hw71c[,c(-1:-5, -17:-27)]), 3)
# round(cor(tx_hw71c[,c(-1:-5, -19:-27)]),3)
#Using the PCs in an analysis
options(survey.lonely.psu = "adjust")
tx_hw71c$pc1q<-cut(tx_hw71c$Comp.1,
breaks = quantile(tx_hw71c$Comp.1,
probs = c(0,.25,.5,.75,1) ,
na.rm=T),
include.lowest=T)
des<-svydesign(ids=~1,
strata=~strata,
weights=~perwt,
data=tx_hw71c)
library(ggplot2)
ggplot(aes(x=factor(sex), y=Comp.1),
data=tx_hw71c)+
geom_boxplot()

#The results show that the mean of component 1 aren't very different.
ggplot(aes(x=race, y=Comp.1),
data=tx_hw71c)+
geom_boxplot()

#According to the results, Blacks, other race, and those of two or more races have lower averages of component 1 than White, OAPI. This may signal that there is lower socioeconomic status among these groups. Latino/as are categorized as White, which does not give us an accurate picture. Hispan is not included in the index because it had a negative association with component 1, so I removed it.
fit.1<-svyglm(Comp.1~factor(sex)+race,
des,
family=gaussian)
summary(fit.1)
##
## Call:
## svyglm(formula = Comp.1 ~ factor(sex) + race, design = des, family = gaussian)
##
## Survey design:
## svydesign(ids = ~1, strata = ~strata, weights = ~perwt, data = tx_hw71c)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.69303 0.31220 2.220 0.026437 *
## factor(sex)1 -0.07656 0.02155 -3.553 0.000382 ***
## raceblack -0.77741 0.34078 -2.281 0.022540 *
## raceoapi -0.03757 0.35194 -0.107 0.914989
## raceother -1.17714 0.31438 -3.744 0.000181 ***
## racethreemoremaj -0.06851 0.42992 -0.159 0.873395
## racetwomajor -0.93643 0.32618 -2.871 0.004097 **
## racewhite -0.78570 0.31201 -2.518 0.011802 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1.954698)
##
## Number of Fisher Scoring iterations: 2
#Results indicate that lower values of sex (being a male) is associated with higher socioeconomic status.This makes sense. Sex is negatively correlated with socioeconomic status. This negative association is also true of race. However, there are certain races that have a stronger correlation to Comp.1, such as Black, other, two major, and white.