HW7

library(ipumsr)
usa_00012 <- read_ipums_ddi("usa_00012.xml")
tx_00012 <- read_ipums_micro(usa_00012, data_file = ("usa_00012.dat.gz"), verbose = FALSE)

library(stringr)
names(tx_00012)<-tolower(names(tx_00012))

names(tx_00012)

##  [1] "year"      "multyear"  "sample"    "serial"    "cbserial"  "hhwt"     
##  [7] "cluster"   "statefip"  "puma"      "strata"    "gq"        "ownershp" 
## [13] "ownershpd" "mortgage"  "multgen"   "multgend"  "pernum"    "perwt"    
## [19] "sex"       "age"       "fertyr"    "race"      "raced"     "hispan"   
## [25] "hispand"   "hcovany"   "educ"      "educd"     "empstat"   "empstatd" 
## [31] "labforce"  "occ"       "ind"       "uhrswork"  "inctot"    "poverty"  
## [37] "presgl"    "migrate1"  "migrate1d"

#CS Code
tx_00012<-zap_labels(tx_00012)
tx_00012$newpuma<- paste (str_pad(tx_00012$statefip, 2,"left", "0"), str_pad(tx_00012$puma,5,"left", "0") , sep="")
table(tx_00012$newpuma)

## 
## 4800100 4800200 4800300 4800400 4800501 4800502 4800600 4800700 4800800 4800900 
##    6381    2938    3739    4380    2621    4524    5101    3354    6057    4489 
## 4801000 4801100 4801200 4801300 4801400 4801501 4801502 4801600 4801700 4801800 
##    4426    2637    2415    2817    3090    1970    2426    2581    2920    3101 
## 4801901 4801902 4801903 4801904 4801905 4801906 4801907 4802001 4802002 4802003 
##    3612    3179    3034    2473    3209    4642    3587    3595    3345    4668 
## 4802004 4802005 4802006 4802101 4802102 4802200 4802301 4802302 4802303 4802304 
##    3673    2848    4038    4406    3719    3038    2270    2279    2590    1671 
## 4802305 4802306 4802307 4802308 4802309 4802310 4802311 4802312 4802313 4802314 
##    2108    2884    3100    2564    3091    3342    3424    3035    4266    2173 
## 4802315 4802316 4802317 4802318 4802319 4802320 4802321 4802322 4802400 4802501 
##    2684    1987    1874    2418    2391    2616    3044    3514    3299    2888 
## 4802502 4802503 4802504 4802505 4802506 4802507 4802508 4802509 4802510 4802511 
##    3471    2584    2013    2929    2260    3033    2171    3451    3473    2190 
## 4802512 4802513 4802514 4802515 4802516 4802600 4802700 4802800 4802900 4803000 
##    2223    3114    3393    2812    2613    6097    3779    3142    2657    3006 
## 4803100 4803200 4803301 4803302 4803303 4803304 4803305 4803306 4803400 4803501 
##    2886    2886    2848    3177    2353    1860    2383    2925    4351    3140 
## 4803502 4803601 4803602 4803700 4803801 4803802 4803900 4804000 4804100 4804200 
##    3842    3493    4563    5594    2245    4018    3165    3182    2051    2976 
## 4804301 4804302 4804400 4804501 4804502 4804503 4804504 4804601 4804602 4804603 
##    2204    2751    1889    2323    2470    1924    1827    2765    2281    3073 
## 4804604 4804605 4804606 4804607 4804608 4804609 4804610 4804611 4804612 4804613 
##    3979    2162    2241    1897    1953    2567    1943    1918    2713    2689 
## 4804614 4804615 4804616 4804617 4804618 4804619 4804620 4804621 4804622 4804623 
##    2334    2141    2623    1692    1934    1850    2126    2742    1973    1866 
## 4804624 4804625 4804626 4804627 4804628 4804629 4804630 4804631 4804632 4804633 
##    2262    1898    2089    2249    2327    2285    1993    2371    2583    1493 
## 4804634 4804635 4804636 4804637 4804638 4804701 4804702 4804801 4804802 4804803 
##    1554    2264    2217    2315    1849    3994    2852    1643    2355    2406 
## 4804901 4804902 4804903 4804904 4804905 4805000 4805100 4805201 4805202 4805203 
##    2226    2765    1943    1952    3148    3841    3473    3301    2823    2719 
## 4805204 4805301 4805302 4805303 4805304 4805305 4805306 4805307 4805308 4805309 
##    3497    2436    2954    3763    3180    2788    4288    3618    3272    3513 
## 4805400 4805500 4805600 4805700 4805800 4805901 4805902 4805903 4805904 4805905 
##    4131    4343    2567    3792    2730    2217    2511    2318    2407    2381 
## 4805906 4805907 4805908 4805909 4805910 4805911 4805912 4805913 4805914 4805915 
##    2179    2627    2287    2504    2874    2913    2590    2098    3360    2684 
## 4805916 4806000 4806100 4806200 4806301 4806302 4806400 4806500 4806601 4806602 
##    2401    2691    2216    2539    2960    2326    1790    2866    2721    2237 
## 4806603 4806701 4806702 4806703 4806801 4806802 4806803 4806804 4806805 4806806 
##    2820    1845    2625    2745    1612    1303    1600    1512    2270    1495 
## 4806807 4806900 
##    1323    1706

bordp<-readr::read_csv("C:/Users/codar/OneDrive/Documents/Stats II/Data/border_100mi_pumas_table.csv")

## Parsed with column specification:
## cols(
##   fid = col_double(),
##   STATEFP10 = col_double(),
##   PUMACE10 = col_character(),
##   AFFGEOID10 = col_character(),
##   GEOID10 = col_double(),
##   NAME10 = col_character(),
##   LSAD10 = col_character(),
##   ALAND10 = col_double(),
##   AWATER10 = col_double()
## )

mdat<-merge(tx_00012, bordp, by.x="newpuma", by.y="GEOID10")
table(mdat$newpuma)

## 
## 4802800 4803200 4803301 4803302 4803303 4803304 4803305 4803306 4806000 4806100 
##    3142    2886    2848    3177    2353    1860    2383    2925    2691    2216 
## 4806200 4806301 4806302 4806400 4806701 4806702 4806703 4806801 4806802 4806803 
##    2539    2960    2326    1790    1845    2625    2745    1612    1303    1600 
## 4806804 4806805 4806806 4806807 4806900 
##    1512    2270    1495    1323    1706

library(dplyr)
tx_00012<-tx_00012%>% 
filter(newpuma %in% c( "4802800", "4803200","4806000", "4806100", "4806200", "4806301", "4806302", "4806701", "4806702", "4806703", "4806900" ))
 View(tx_00012)
  names(tx_00012)

##  [1] "year"      "multyear"  "sample"    "serial"    "cbserial"  "hhwt"     
##  [7] "cluster"   "statefip"  "puma"      "strata"    "gq"        "ownershp" 
## [13] "ownershpd" "mortgage"  "multgen"   "multgend"  "pernum"    "perwt"    
## [19] "sex"       "age"       "fertyr"    "race"      "raced"     "hispan"   
## [25] "hispand"   "hcovany"   "educ"      "educd"     "empstat"   "empstatd" 
## [31] "labforce"  "occ"       "ind"       "uhrswork"  "inctot"    "poverty"  
## [37] "presgl"    "migrate1"  "migrate1d" "newpuma"

describe(tx_00012$income)

## Warning: Unknown or uninitialised column: `income`.

## Warning in min(x, na.rm = T): no non-missing arguments to min; returning Inf

## Warning in max(x, na.rm = T): no non-missing arguments to max; returning -Inf

## [0 obs.] 
## NULL: NULL
## min: Inf - max: -Inf - NAs: 0 (NaN%) - 0 unique values
## 
##   X0L X0L.1
## 1   0     0

summary(tx_00012$educ)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   6.000   6.000   6.722   8.000  11.000

class(tx_00012$educ)

## [1] "integer"

#Recodes
tx_hw7 <-tx_00012 %>%
  mutate(sex=case_when(sex == 1~0,
                       sex == 2~ 1,
                       TRUE ~ NA_real_),
        lfpart=case_when(labforce== 1 ~ 0,
                          labforce== 2 ~ 1,
                         TRUE ~ NA_real_),
         edu=case_when(educ== 0 ~ 'none',
                        educ %in% 1:5 ~ 'hs incomplete',
                        educ %in% 6 ~ 'hs complete',
                        educ %in% 7:11 ~ 'college',
                       TRUE ~ NA_character_),
        edu3=case_when(educ %in% 1:5 ~ 1,
                       educ %in% 6 ~ 2,
                       educ %in% 7:11 ~ 3,
                       TRUE~NA_real_),
         race=case_when(race== 1 ~ 'white',
                        race== 2 ~ 'black',
                        # race== 3 ~'aian',
                        race %in% 4:5 ~ 'asian',
                        race== 6 ~ 'oapi',
                        race== 7 ~ 'other',
                        race== 8 ~ 'twomajor',
                        race== 9 ~ 'threemoremaj',
                        TRUE ~ NA_character_),
         hisp= case_when(hispan !=0 ~ "Latino",
                         hispan==0 ~'NL',
                         hispan==9 ~ 'NL',
                         TRUE ~ NA_character_),
         migrate1=case_when(migrate1==1 ~ 'same house',
                            migrate1==2 ~ 'movinstate',
                            migrate1==3 ~ 'abroad1yr',
                            TRUE ~ NA_character_),
         fertyr=case_when(fertyr== 1 ~ 0, 
                          fertyr== 2 ~ 1,
                          TRUE~ NA_real_ ),
         poverty1=case_when(poverty==001 ~ "1% or less",
                           poverty ==501 ~ "501% or more",
                           TRUE~ NA_character_),
         hcov=case_when(hcovany == 1 ~ 0,
                        hcovany == 2 ~ 1,
                        TRUE~NA_real_),
         ownhome=case_when(ownershp==1 ~ 1,
                            ownershp==2 ~ 0,
                            TRUE ~ NA_real_),
        multgen1=case_when(multgen==1 ~ 1,
                           multgen==2 ~ 2,
                           multgen==3 ~ 3,
                           TRUE~NA_real_))
         # mgmt = if_else(occ %in% c(10:160) | occ %in% c(220:730), 1, 0))    #occupational prestige
        
         # occ=case_when(occ %in% 10:160 ~ 'Mgmt/Biz',
         #                    occ %in% 220:730 ~ 'Mgmt/Biz',
         #                    occ %in% 800:950 ~ 'Finance',
         #                    # occ %in% 1000:1240 ~ 'STEM',
         #                    occ %in% 1300:1540 ~ 'Arch/Eng',
         #                    occ %in% 1550:1560 ~ 'Technical',
         #                    # occ %in% 1600:1760 ~ 'STEM',
         #                    occ %in% 1800:1840 ~ 'SocSTEM',
         #                    occ %in% 1900:1980 ~ 'Technical',
         #                    occ %in% 2000:2060 ~ 'PublicServ',
         #                    occ == 2100 ~ 'Law',
         #                    occ %in% 2140:2150 ~ 'Technical',
         #                    occ %in% 2200:2430 ~ 'Education',
         #                    occ %in% 2440:2550 ~ 'Technical',
         #                    occ %in% 2600:2910 ~ 'A&E/Sports/Media',
         #                    occ == 2920 ~ 'Technical',
         #                    occ %in% 3000:3500 ~ 'Health/Med',
         #                    occ %in% 3510:3650 ~ 'Technical',
         #                    occ %in% 3700:3950 ~ 'PublicServ',
         #                    occ == 4000 ~ 'A&E/Sports/Media',
         #                    occ %in% 4010:4965 ~ 'Sales/Service',
         #                    occ %in% 5000: 5940 ~ 'Office/Admin',
         #                    occ %in% 6200:8965 ~ 'SkilledTrade',
         #                    occ %in% 9000:9750 ~ 'Transport',
         #                    TRUE~ NA_character_))

             

sapply(tx_hw7, class)

##        year    multyear      sample      serial    cbserial        hhwt 
##   "integer"   "numeric"   "integer"   "numeric"   "numeric"   "numeric" 
##     cluster    statefip        puma      strata          gq    ownershp 
##   "numeric"   "integer"   "numeric"   "numeric"   "integer"   "integer" 
##   ownershpd    mortgage     multgen    multgend      pernum       perwt 
##   "integer"   "integer"   "integer"   "integer"   "numeric"   "numeric" 
##         sex         age      fertyr        race       raced      hispan 
##   "numeric"   "integer"   "numeric" "character"   "integer"   "integer" 
##     hispand     hcovany        educ       educd     empstat    empstatd 
##   "integer"   "integer"   "integer"   "integer"   "integer"   "integer" 
##    labforce         occ         ind    uhrswork      inctot     poverty 
##   "integer"   "numeric"   "numeric"   "integer"   "numeric"   "numeric" 
##      presgl    migrate1   migrate1d     newpuma      lfpart         edu 
##   "numeric" "character"   "integer" "character"   "numeric" "character" 
##        edu3        hisp    poverty1        hcov     ownhome    multgen1 
##   "numeric" "character" "character"   "numeric"   "numeric"   "numeric"

names(tx_hw7)

##  [1] "year"      "multyear"  "sample"    "serial"    "cbserial"  "hhwt"     
##  [7] "cluster"   "statefip"  "puma"      "strata"    "gq"        "ownershp" 
## [13] "ownershpd" "mortgage"  "multgen"   "multgend"  "pernum"    "perwt"    
## [19] "sex"       "age"       "fertyr"    "race"      "raced"     "hispan"   
## [25] "hispand"   "hcovany"   "educ"      "educd"     "empstat"   "empstatd" 
## [31] "labforce"  "occ"       "ind"       "uhrswork"  "inctot"    "poverty"  
## [37] "presgl"    "migrate1"  "migrate1d" "newpuma"   "lfpart"    "edu"      
## [43] "edu3"      "hisp"      "poverty1"  "hcov"      "ownhome"   "multgen1"

#For this homework, you are to use the technique of Principal Components Analysis (PCA) to perform a variable reduction of at least 5 variables.If you have an idea for latent construct, state what you believe this is.Report the summary statistics and correlation matrix for your data. Report the results of the PCA, being sure to include the eigenvalues and corresponding vectors. Interpret your component(s) if possible. If deemed appropriate, conduct some testing of your index/components/latent variables.

#Latent construct: socioeconomic status 
# I will use the following variables to help determine socioeconomic status: home ownership, education, employment status, income, occupational prestige, and health care coverage.

#Summary statistics and correlation matrix 

tx_hw71<-tx_hw7%>%
  filter(complete.cases(perwt, strata, newpuma, hcov,edu3, uhrswork, lfpart, ownhome,sex, race, inctot, presgl, empstat, hispan)) %>%
  select(perwt, strata, newpuma, hcov,edu3, uhrswork, lfpart, ownhome,sex, race, inctot, presgl, empstat, hispan)%>%
  mutate_at(vars(hcov,edu3, uhrswork, lfpart, ownhome,inctot, presgl), scale)
summary(tx_hw71)

##      perwt            strata         newpuma                hcov.V1       
##  Min.   :  1.00   Min.   :280048   Length:26778       Min.   :-1.7569522  
##  1st Qu.: 10.00   1st Qu.:600048   Class :character   1st Qu.: 0.5691462  
##  Median : 16.00   Median :630148   Mode  :character   Median : 0.5691462  
##  Mean   : 21.12   Mean   :566236                      Mean   : 0.0000000  
##  3rd Qu.: 27.00   3rd Qu.:670248                      3rd Qu.: 0.5691462  
##  Max.   :237.00   Max.   :690048                      Max.   : 0.5691462  
##                                                                           
##        edu3.V1            uhrswork.V1        lfpart.V1         ownhome.V1     
##  Min.   :-1.8318362   Min.   :-2.943141   Min.   : NA     Min.   :-1.6885440  
##  1st Qu.:-0.4400669   1st Qu.:-0.151639   1st Qu.: NA     1st Qu.:-1.6885440  
##  Median :-0.4400669   Median :-0.000747   Median : NA     Median : 0.5922041  
##  Mean   : 0.0000000   Mean   : 0.000000   Mean   :NaN     Mean   : 0.0000000  
##  3rd Qu.: 0.9517024   3rd Qu.: 0.376483   3rd Qu.: NA     3rd Qu.: 0.5922041  
##  Max.   : 0.9517024   Max.   : 4.450568   Max.   : NA     Max.   : 0.5922041  
##                                           NA's   :26778                       
##       sex            race                inctot.V1           presgl.V1      
##  Min.   :0.000   Length:26778       Min.   :-0.962276   Min.   :-2.7914275  
##  1st Qu.:0.000   Class :character   1st Qu.:-0.528451   1st Qu.:-0.6065000  
##  Median :0.000   Mode  :character   Median :-0.244940   Median :-0.1963841  
##  Mean   :0.468                      Mean   : 0.000000   Mean   : 0.0000000  
##  3rd Qu.:1.000                      3rd Qu.: 0.199922   3rd Qu.: 0.7652668  
##  Max.   :1.000                      Max.   :14.557583   Max.   : 2.9714073  
##                                                                             
##     empstat      hispan      
##  Min.   :1   Min.   :0.0000  
##  1st Qu.:1   1st Qu.:0.0000  
##  Median :1   Median :1.0000  
##  Mean   :1   Mean   :0.7902  
##  3rd Qu.:1   3rd Qu.:1.0000  
##  Max.   :1   Max.   :4.0000  
##

# tx_hw71$pwt <- tx_hw7$perwt/100

tx_pc<-princomp(~ownhome+hcov+edu3+uhrswork+empstat+inctot+presgl,   #add employment and income and restrict to only those in lf
                   data=tx_hw71,
                   scores=T)

summary(tx_pc)

## Importance of components:
##                           Comp.1    Comp.2    Comp.3    Comp.4    Comp.5
## Standard deviation     1.4153143 1.0229421 0.9870530 0.8777319 0.8072633
## Proportion of Variance 0.3338649 0.1744083 0.1623850 0.1284070 0.1086164
## Cumulative Proportion  0.3338649 0.5082732 0.6706582 0.7990652 0.9076816
##                            Comp.6       Comp.7
## Standard deviation     0.74423763 6.715863e-09
## Proportion of Variance 0.09231839 7.517416e-18
## Cumulative Proportion  1.00000000 1.000000e+00

#pc1 explains roughly 33% of the variance and pc2 about 17%, cumulatively Comp.1 and 2 explain about 51% of the variance 
#We want our eiganvalues to be above one. This occurs for the first two components only. This means I would only use those two in my index. 
loadings(tx_pc)

## 
## Loadings:
##          Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## ownhome   0.247  0.410  0.753  0.379  0.241              
## hcov      0.410  0.250  0.201 -0.840 -0.137              
## edu3      0.466  0.237 -0.469         0.302  0.638       
## uhrswork  0.294 -0.743  0.193 -0.119  0.555              
## empstat                                            -1.000
## inctot    0.458 -0.374  0.143  0.248 -0.722  0.216       
## presgl    0.507  0.149 -0.340  0.258        -0.733       
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## SS loadings     1.000  1.000  1.000  1.000  1.000  1.000  1.000
## Proportion Var  0.143  0.143  0.143  0.143  0.143  0.143  0.143
## Cumulative Var  0.143  0.286  0.429  0.571  0.714  0.857  1.000

#Reviewing the loadings or eigenvectors, first I note that most of the variables in the first component are positively correlated with the first component. The exception is the employment status, which is suppressed. However, I suspect this may be happening because I restricted the data to only those that are employed in IPUMS. I didn't create a subset in R. What is the best approach here? 
#I would also interpret the loadings to mean that higher values in owning a home (1=own, 2 = rent), health coverage (0=no, 1=yes), higher levels of education, higher levels of hours worked per week, higher levels of income, and higher levels of occupational prestige indicate better socioeconomic status.

#Screeplot
screeplot(tx_pc,
          type = "l",
          main = "Scree Plot")
abline(h=2)

plot(tx_pc)

screeplot(tx_pc, type = "line", main = "Scree Plot")

biplot(tx_pc)

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

scores<-data.frame(tx_pc$scores)
hist(scores$Comp.1)

hist(scores$Comp.2)

#calculate the correlation between the first 2 components to show they are orthogonal, i.e. correlation == 0 
cor(scores[,1:2])

##              Comp.1       Comp.2
## Comp.1 1.000000e+00 1.838385e-14
## Comp.2 1.838385e-14 1.000000e+00

tx_hw71c<-cbind(tx_hw71, scores)
names(tx_hw71c)

##  [1] "perwt"    "strata"   "newpuma"  "hcov"     "edu3"     "uhrswork"
##  [7] "lfpart"   "ownhome"  "sex"      "race"     "inctot"   "presgl"  
## [13] "empstat"  "hispan"   "Comp.1"   "Comp.2"   "Comp.3"   "Comp.4"  
## [19] "Comp.5"   "Comp.6"   "Comp.7"

#to make a correlation matrix between the comps and the original values 
# round(cor(tx_hw71c[,c(-1:-5, -17:-27)]), 3)
# round(cor(tx_hw71c[,c(-1:-5, -19:-27)]),3)

#Using the PCs in an analysis 
options(survey.lonely.psu = "adjust")

tx_hw71c$pc1q<-cut(tx_hw71c$Comp.1,
                    breaks = quantile(tx_hw71c$Comp.1,
                                      probs = c(0,.25,.5,.75,1) ,
                                      na.rm=T), 
                    include.lowest=T)
des<-svydesign(ids=~1,
               strata=~strata,
               weights=~perwt,
               data=tx_hw71c)

library(ggplot2)
ggplot(aes(x=factor(sex), y=Comp.1),
       data=tx_hw71c)+
  geom_boxplot()

#The results show that the mean of component 1 aren't very different. 


ggplot(aes(x=race, y=Comp.1),  
       data=tx_hw71c)+
  geom_boxplot()

#According to the results, Blacks, other race, and those of two or more races have lower averages of component 1 than White, OAPI. This may signal that there is lower socioeconomic status among these groups. Latino/as are categorized as White, which does not give us an accurate picture. Hispan is not included in the index because it had a negative association with component 1, so I removed it.

fit.1<-svyglm(Comp.1~factor(sex)+race,
              des,
              family=gaussian)
summary(fit.1)

## 
## Call:
## svyglm(formula = Comp.1 ~ factor(sex) + race, design = des, family = gaussian)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~strata, weights = ~perwt, data = tx_hw71c)
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.69303    0.31220   2.220 0.026437 *  
## factor(sex)1     -0.07656    0.02155  -3.553 0.000382 ***
## raceblack        -0.77741    0.34078  -2.281 0.022540 *  
## raceoapi         -0.03757    0.35194  -0.107 0.914989    
## raceother        -1.17714    0.31438  -3.744 0.000181 ***
## racethreemoremaj -0.06851    0.42992  -0.159 0.873395    
## racetwomajor     -0.93643    0.32618  -2.871 0.004097 ** 
## racewhite        -0.78570    0.31201  -2.518 0.011802 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.954698)
## 
## Number of Fisher Scoring iterations: 2

#Results indicate that lower values of sex (being a male) is associated with higher socioeconomic status.This makes sense. Sex is negatively correlated with socioeconomic status.  This negative association is also true of race. However, there are certain races that have a stronger correlation to Comp.1, such as Black, other, two major, and white.

HW7

Coda Rayo-Garza

3/23/2021