# Packages used by the loading / identifier-building steps.
library(ipumsr)
library(stringr)

# Parse the extract's DDI codebook, then read the microdata file it describes.
usa_00013 <- read_ipums_ddi("usa_00013.xml")
tx_00013 <- read_ipums_micro(
  usa_00013,
  data_file = "usa_00013.dat.gz",
  verbose = FALSE
)

# Work with lower-case column names from here on.
tx_00013 <- setNames(tx_00013, tolower(names(tx_00013)))

names(tx_00013)
##  [1] "year"      "multyear"  "sample"    "serial"    "cbserial"  "hhwt"     
##  [7] "cluster"   "statefip"  "puma"      "strata"    "gq"        "ownershp" 
## [13] "ownershpd" "mortgage"  "multgen"   "multgend"  "pernum"    "perwt"    
## [19] "sex"       "age"       "fertyr"    "race"      "raced"     "hispan"   
## [25] "hispand"   "hcovany"   "educ"      "educd"     "empstat"   "empstatd" 
## [31] "labforce"  "occ"       "ind"       "uhrswork"  "inctot"    "poverty"  
## [37] "presgl"    "migrate1"  "migrate1d"
# CS Code
# Strip the value labels, then build a 7-character PUMA identifier by
# concatenating the state FIPS code (left-padded with "0" to width 2) and the
# PUMA code (left-padded with "0" to width 5).
tx_00013 <- zap_labels(tx_00013)
tx_00013$newpuma <- paste0(
  str_pad(tx_00013$statefip, width = 2, side = "left", pad = "0"),
  str_pad(tx_00013$puma, width = 5, side = "left", pad = "0")
)
# Record count per PUMA identifier.
table(tx_00013$newpuma)
## 
## 4800100 4800200 4800300 4800400 4800501 4800502 4800600 4800700 4800800 4800900 
##    8976    4362    4674    6757    3800    5870    7094    4974    8381    6331 
## 4801000 4801100 4801200 4801300 4801400 4801501 4801502 4801600 4801700 4801800 
##    6359    4275    3594    4239    4289    2823    3451    3754    4852    5583 
## 4801901 4801902 4801903 4801904 4801905 4801906 4801907 4802001 4802002 4802003 
##    4402    4245    3728    3195    4205    6061    4681    4436    4103    6084 
## 4802004 4802005 4802006 4802101 4802102 4802200 4802301 4802302 4802303 4802304 
##    4873    4049    5078    5882    5309    4226    2988    3015    3388    2921 
## 4802305 4802306 4802307 4802308 4802309 4802310 4802311 4802312 4802313 4802314 
##    3255    3718    3948    3274    3816    3944    4200    4436    5082    3627 
## 4802315 4802316 4802317 4802318 4802319 4802320 4802321 4802322 4802400 4802501 
##    3899    2974    2488    3253    3424    3566    3905    4536    4496    3737 
## 4802502 4802503 4802504 4802505 4802506 4802507 4802508 4802509 4802510 4802511 
##    4185    3696    2676    3742    3053    3873    3622    4406    4376    2785 
## 4802512 4802513 4802514 4802515 4802516 4802600 4802700 4802800 4802900 4803000 
##    3138    4605    4330    3849    3494    9343    5077    4847    3644    3894 
## 4803100 4803200 4803301 4803302 4803303 4803304 4803305 4803306 4803400 4803501 
##    4029    4379    4575    4536    3704    2996    3661    4159    6975    4733 
## 4803502 4803601 4803602 4803700 4803801 4803802 4803900 4804000 4804100 4804200 
##    5277    5462    7410    8434    3518    5522    6597    5112    3565    4323 
## 4804301 4804302 4804400 4804501 4804502 4804503 4804504 4804601 4804602 4804603 
##    3282    4948    3389    3376    3264    2591    2489    3635    3768    4783 
## 4804604 4804605 4804606 4804607 4804608 4804609 4804610 4804611 4804612 4804613 
##    4719    3325    3548    3001    3208    3701    2969    2641    3627    3319 
## 4804614 4804615 4804616 4804617 4804618 4804619 4804620 4804621 4804622 4804623 
##    3146    2923    3399    2564    2998    2644    2778    3648    2757    2811 
## 4804624 4804625 4804626 4804627 4804628 4804629 4804630 4804631 4804632 4804633 
##    3105    2733    3005    2988    3085    2941    2632    3105    3428    2002 
## 4804634 4804635 4804636 4804637 4804638 4804701 4804702 4804801 4804802 4804803 
##    2120    2990    2901    3075    2706    5956    3790    2101    3777    3715 
## 4804901 4804902 4804903 4804904 4804905 4805000 4805100 4805201 4805202 4805203 
##    3025    3696    2939    2716    4373    5532    5005    4511    3605    3445 
## 4805204 4805301 4805302 4805303 4805304 4805305 4805306 4805307 4805308 4805309 
##    4498    3153    3689    4928    4124    3561    6162    4390    4123    4819 
## 4805400 4805500 4805600 4805700 4805800 4805901 4805902 4805903 4805904 4805905 
##    5755    6315    3466    5226    3730    3654    3649    3516    3355    3693 
## 4805906 4805907 4805908 4805909 4805910 4805911 4805912 4805913 4805914 4805915 
##    3386    4084    3118    3324    3944    3566    3237    2997    4406    3570 
## 4805916 4806000 4806100 4806200 4806301 4806302 4806400 4806500 4806601 4806602 
##    3317    3527    3634    4059    4778    3826    3097    4814    3861    3349 
## 4806603 4806701 4806702 4806703 4806801 4806802 4806803 4806804 4806805 4806806 
##    3786    3088    4194    4537    2613    2169    2594    2336    3324    2414 
## 4806807 4806900 
##    2775    2869
# Read the PUMA lookup table used for the merge below.
# NOTE(review): this is an absolute, machine-specific path; the script will
# only run where this file exists at exactly this location.
bordp <- readr::read_csv(
  file = "C:/Users/codar/OneDrive/Documents/Stats II/Data/border_100mi_pumas_table.csv"
)
## Parsed with column specification:
## cols(
##   fid = col_double(),
##   STATEFP10 = col_double(),
##   PUMACE10 = col_character(),
##   AFFGEOID10 = col_character(),
##   GEOID10 = col_double(),
##   NAME10 = col_character(),
##   LSAD10 = col_character(),
##   ALAND10 = col_double(),
##   AWATER10 = col_double()
## )
# Join the microdata to the lookup table on the PUMA identifier. merge() keeps
# only rows with a match in both tables by default.
# NOTE(review): newpuma is character while GEOID10 was parsed as a double, so
# the match relies on merge()'s implicit coercion -- confirm this is intended.
mdat <- merge(
  x = tx_00013,
  y = bordp,
  by.x = "newpuma",
  by.y = "GEOID10"
)
# Record count per matched PUMA.
table(mdat$newpuma)
## 
## 4802800 4803200 4803301 4803302 4803303 4803304 4803305 4803306 4806000 4806100 
##    4847    4379    4575    4536    3704    2996    3661    4159    3527    3634 
## 4806200 4806301 4806302 4806400 4806701 4806702 4806703 4806801 4806802 4806803 
##    4059    4778    3826    3097    3088    4194    4537    2613    2169    2594 
## 4806804 4806805 4806806 4806807 4806900 
##    2336    3324    2414    2775    2869
library(dplyr)

# Restrict the microdata to the hand-picked set of PUMA identifiers below.
tx_00013 <- filter(
  tx_00013,
  newpuma %in% c(
    "4802800", "4803200",
    "4806000", "4806100", "4806200",
    "4806301", "4806302",
    "4806701", "4806702", "4806703",
    "4806900"
  )
)
View(tx_00013)
names(tx_00013)
##  [1] "year"      "multyear"  "sample"    "serial"    "cbserial"  "hhwt"     
##  [7] "cluster"   "statefip"  "puma"      "strata"    "gq"        "ownershp" 
## [13] "ownershpd" "mortgage"  "multgen"   "multgend"  "pernum"    "perwt"    
## [19] "sex"       "age"       "fertyr"    "race"      "raced"     "hispan"   
## [25] "hispand"   "hcovany"   "educ"      "educd"     "empstat"   "empstatd" 
## [31] "labforce"  "occ"       "ind"       "uhrswork"  "inctot"    "poverty"  
## [37] "presgl"    "migrate1"  "migrate1d" "newpuma"
# describe(tx_00012$income)
# summary(tx_00012$educ)
# class(tx_00012$educ)
# Recodes
# Build the analysis variables from the raw coded columns. Codes that are not
# listed in a case_when() fall through to the final TRUE branch and become NA.
#
# mutate() evaluates its arguments in order, and sex, race, migrate1 and fertyr
# are overwritten in place; anything that needs the ORIGINAL codes of one of
# those columns must therefore be computed before the overwrite.
tx_5 <- tx_00013 %>%
  mutate(
    # Text label for sex. Derived from the original 1/2 codes, so it has to
    # come before sex itself is recoded to 0/1 on the next lines.
    sexb = case_when(sex == 1 ~ "male",
                     sex == 2 ~ "female",
                     TRUE ~ NA_character_),
    # Indicator: 0 for original code 1, 1 for original code 2.
    sex = case_when(sex == 1 ~ 0,
                    sex == 2 ~ 1,
                    TRUE ~ NA_real_),
    # Labor force participation indicator: labforce 1 -> 0, 2 -> 1.
    lfpart = case_when(labforce == 1 ~ 0,
                       labforce == 2 ~ 1,
                       TRUE ~ NA_real_),
    # Education as a four-level label ...
    edu = case_when(educ == 0 ~ "none",
                    educ %in% 1:5 ~ "hs incomplete",
                    educ %in% 6 ~ "hs complete",
                    educ %in% 7:11 ~ "college",
                    TRUE ~ NA_character_),
    # ... and as a 1-3 score. educ == 0 is deliberately left NA here, unlike
    # in edu above.
    edu3 = case_when(educ %in% 1:5 ~ 1,
                     educ %in% 6 ~ 2,
                     educ %in% 7:11 ~ 3,
                     TRUE ~ NA_real_),
    # Race label (overwrites the numeric race column). Code 3 is intentionally
    # not mapped, so those records become NA.
    race = case_when(race == 1 ~ "white",
                     race == 2 ~ "black",
                     # race == 3 ~ "aian",
                     race %in% 4:5 ~ "asian",
                     race == 6 ~ "oapi",
                     race == 7 ~ "other",
                     race == 8 ~ "twomajor",
                     race == 9 ~ "threemoremaj",
                     TRUE ~ NA_character_),
    # Hispanic origin. The "NL" codes (0 and 9) are tested first: with the
    # catch-all `hispan != 0` in front, code 9 could never reach its own
    # branch and was labelled "Latino".
    hisp = case_when(hispan %in% c(0, 9) ~ "NL",
                     hispan != 0 ~ "Latino",
                     TRUE ~ NA_character_),
    # Migration status one year ago (overwrites migrate1). In the IPUMS
    # codebook 3 is "moved between states" and 4 is "abroad one year ago";
    # code 3 was previously mislabelled as abroad and code 4 dropped to NA.
    migrate1 = case_when(migrate1 == 1 ~ "same house",
                         migrate1 == 2 ~ "movinstate",
                         migrate1 == 3 ~ "movbtwstate",
                         migrate1 == 4 ~ "abroad1yr",
                         TRUE ~ NA_character_),
    # Fertility indicator (overwrites fertyr): 1 -> 0, 2 -> 1.
    fertyr = case_when(fertyr == 1 ~ 0,
                       fertyr == 2 ~ 1,
                       TRUE ~ NA_real_),
    # Only the two extreme poverty codes are labelled; everything else is NA.
    poverty1 = case_when(poverty == 1 ~ "1% or less",
                         poverty == 501 ~ "501% or more",
                         TRUE ~ NA_character_),
    # Health coverage indicator: hcovany 1 -> 0, 2 -> 1.
    hcov = case_when(hcovany == 1 ~ 0,
                     hcovany == 2 ~ 1,
                     TRUE ~ NA_real_),
    # Home ownership indicator: ownershp 1 -> 1, 2 -> 0.
    ownhome = case_when(ownershp == 1 ~ 1,
                        ownershp == 2 ~ 0,
                        TRUE ~ NA_real_),
    # Copy of multgen keeping only codes 1-3.
    multgen1 = case_when(multgen == 1 ~ 1,
                         multgen == 2 ~ 2,
                         multgen == 3 ~ 3,
                         TRUE ~ NA_real_))
         # mgmt = if_else(occ %in% c(10:160) | occ %in% c(220:730), 1, 0))    #occupational prestige
        
         # occ=case_when(occ %in% 10:160 ~ 'Mgmt/Biz',
         #                    occ %in% 220:730 ~ 'Mgmt/Biz',
         #                    occ %in% 800:950 ~ 'Finance',
         #                    # occ %in% 1000:1240 ~ 'STEM',
         #                    occ %in% 1300:1540 ~ 'Arch/Eng',
         #                    occ %in% 1550:1560 ~ 'Technical',
         #                    # occ %in% 1600:1760 ~ 'STEM',
         #                    occ %in% 1800:1840 ~ 'SocSTEM',
         #                    occ %in% 1900:1980 ~ 'Technical',
         #                    occ %in% 2000:2060 ~ 'PublicServ',
         #                    occ == 2100 ~ 'Law',
         #                    occ %in% 2140:2150 ~ 'Technical',
         #                    occ %in% 2200:2430 ~ 'Education',
         #                    occ %in% 2440:2550 ~ 'Technical',
         #                    occ %in% 2600:2910 ~ 'A&E/Sports/Media',
         #                    occ == 2920 ~ 'Technical',
         #                    occ %in% 3000:3500 ~ 'Health/Med',
         #                    occ %in% 3510:3650 ~ 'Technical',
         #                    occ %in% 3700:3950 ~ 'PublicServ',
         #                    occ == 4000 ~ 'A&E/Sports/Media',
         #                    occ %in% 4010:4965 ~ 'Sales/Service',
         #                    occ %in% 5000: 5940 ~ 'Office/Admin',
         #                    occ %in% 6200:8965 ~ 'SkilledTrade',
         #                    occ %in% 9000:9750 ~ 'Transport',
         #                    TRUE~ NA_character_))
View(tx_5)
# Summary statistics and correlation matrix

# Analysis subset: keep the thirteen analysis columns, then drop every row
# that is missing any of them except fertyr, which is allowed to stay NA.
tx_5a <- tx_5 %>%
  select(
    perwt, strata, newpuma, edu3, edu, lfpart, sex, race,
    presgl, empstat, fertyr, inctot, age
  ) %>%
  filter(
    complete.cases(
      perwt, strata, newpuma, edu3, edu, lfpart, sex, race,
      presgl, empstat, inctot, age
    )
  )
summary(tx_5a)
##      perwt            strata         newpuma               edu3      
##  Min.   :  1.00   Min.   :280048   Length:41392       Min.   :1.000  
##  1st Qu.: 10.00   1st Qu.:600048   Class :character   1st Qu.:2.000  
##  Median : 16.00   Median :630148   Mode  :character   Median :2.000  
##  Mean   : 20.98   Mean   :569347                      Mean   :2.144  
##  3rd Qu.: 27.00   3rd Qu.:670248                      3rd Qu.:3.000  
##  Max.   :331.00   Max.   :690048                      Max.   :3.000  
##                                                                      
##      edu                lfpart            sex             race          
##  Length:41392       Min.   :0.0000   Min.   :0.0000   Length:41392      
##  Class :character   1st Qu.:0.0000   1st Qu.:0.0000   Class :character  
##  Mode  :character   Median :1.0000   Median :0.0000   Mode  :character  
##                     Mean   :0.6412   Mean   :0.4969                     
##                     3rd Qu.:1.0000   3rd Qu.:1.0000                     
##                     Max.   :1.0000   Max.   :1.0000                     
##                                                                         
##      presgl         empstat          fertyr          inctot      
##  Min.   : 0.00   Min.   :1.000   Min.   :0.000   Min.   : -6600  
##  1st Qu.: 0.00   1st Qu.:1.000   1st Qu.:0.000   1st Qu.:  1058  
##  Median :32.10   Median :1.000   Median :0.000   Median : 15878  
##  Mean   :28.75   Mean   :1.755   Mean   :0.065   Mean   : 28905  
##  3rd Qu.:46.90   3rd Qu.:3.000   3rd Qu.:0.000   3rd Qu.: 39800  
##  Max.   :81.50   Max.   :3.000   Max.   :1.000   Max.   :816000  
##                                  NA's   :27330                   
##       age       
##  Min.   :16.00  
##  1st Qu.:27.00  
##  Median :40.00  
##  Mean   :39.99  
##  3rd Qu.:53.00  
##  Max.   :64.00  
## 
# Open the analysis subset in the data viewer for a visual check.
View(tx_5a)
# my_summary1 <-
#   list("Labor Force Participation" =
#        list("min"       = ~ min(lfpart),
#             "max"       = ~ max(lfpart),
#             "mean (sd)" = ~ qwraps2::mean_sd(lfpart)),
#        "Fertility" =
#        list("min"       = ~ min(fertyr),
#             "max"       = ~ max(fertyr),
#             "mean (sd)" = ~ qwraps2::mean_sd(fertyr)),
#        "Educational Attainment" =
#        list("min"       = ~ min(edu3),
#             "max"       = ~ max(edu3),
#             "mean (sd)" = ~ qwraps2::mean_sd(edu3))
#        )
# 
# 
# library(qwraps2)
# bysex <- summary_table(dplyr::group_by(tx_hw71, sex), my_summary1)
# print(bysex)


# Overall descriptive table for every column of the analysis subset.
# The call is namespace-qualified because tableone is not attached until
# further down the script.
tableone::CreateTableOne(data = tx_5a)
##                      
##                       Overall              
##   n                       41392            
##   perwt (mean (SD))       20.98 (17.79)    
##   strata (mean (SD))  569347.12 (143178.43)
##   newpuma (%)                              
##      4802800               4635 (11.2)     
##      4803200               4169 (10.1)     
##      4806000               3368 ( 8.1)     
##      4806100               3498 ( 8.5)     
##      4806200               3809 ( 9.2)     
##      4806301               4504 (10.9)     
##      4806302               3590 ( 8.7)     
##      4806701               2889 ( 7.0)     
##      4806702               3933 ( 9.5)     
##      4806703               4251 (10.3)     
##      4806900               2746 ( 6.6)     
##   edu3 (mean (SD))         2.14 (0.77)     
##   edu (%)                                  
##      college              15684 (37.9)     
##      hs complete          16005 (38.7)     
##      hs incomplete         9703 (23.4)     
##   lfpart (mean (SD))       0.64 (0.48)     
##   sex (mean (SD))          0.50 (0.50)     
##   race (%)                                 
##      asian                   55 ( 0.1)     
##      black                  718 ( 1.7)     
##      oapi                   275 ( 0.7)     
##      other                 2397 ( 5.8)     
##      threemoremaj            30 ( 0.1)     
##      twomajor               542 ( 1.3)     
##      white                37375 (90.3)     
##   presgl (mean (SD))      28.76 (20.62)    
##   empstat (mean (SD))      1.76 (0.95)     
##   fertyr (mean (SD))       0.06 (0.25)     
##   inctot (mean (SD))   28904.83 (45658.90) 
##   age (mean (SD))         39.99 (14.59)
# library(tableone)
# t2<-CreateTableOne(vars = c(edu3, fertyr,empstat, strata = "sex", test = T, data = tx_hw71)
# Research question: How does labor force participation among women living in border PUMAs vary by race, education, occupation, and fertility?
library(tableone)

# # Create a variable list which we want in Table 1
# listVars <- c("empstat", "fertyr", "edu", "edu3", "sex", "presgl", "inctot", "age")
# 
# # Define categorical variables
# catVars <- c("sex","edu","empstat", "fertyr")
# 
# table1 <- CreateTableOne(vars = listVars, data = tx_5a, factorVars = catVars, strata = c("sex"))
# 
# 
# a <- print(table1, quote = TRUE, noSpaces = TRUE)
# 
# as.data.frame(a)
# my_skim <- skim_with(
#   numeric = sfl(iqr = IQR, mad = mad, p99 = ~ quantile(., probs = .99)),
#   append = FALSE
# )
# my_skim(iris, Sepal.Length)


library(skimr)
## Warning: package 'skimr' was built under R version 4.0.4
# Per-column summary of the whole analysis subset.
skim(tx_5a)
Data summary
Name tx_5a
Number of rows 41392
Number of columns 13
_______________________
Column type frequency:
character 3
numeric 10
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
newpuma 0 1 7 7 0 11 0
edu 0 1 7 13 0 3 0
race 0 1 4 12 0 7 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
perwt 0 1.00 20.98 17.79 1 10 16.0 27.0 331.0 ▇▁▁▁▁
strata 0 1.00 569347.12 143178.43 280048 600048 630148.0 670248.0 690048.0 ▂▁▁▁▇
edu3 0 1.00 2.14 0.77 1 2 2.0 3.0 3.0 ▅▁▇▁▇
lfpart 0 1.00 0.64 0.48 0 0 1.0 1.0 1.0 ▅▁▁▁▇
sex 0 1.00 0.50 0.50 0 0 0.0 1.0 1.0 ▇▁▁▁▇
presgl 0 1.00 28.76 20.62 0 0 32.1 46.9 81.5 ▇▆▇▆▁
empstat 0 1.00 1.76 0.95 1 1 1.0 3.0 3.0 ▇▁▁▁▅
fertyr 27330 0.34 0.06 0.25 0 0 0.0 0.0 1.0 ▇▁▁▁▁
inctot 0 1.00 28904.83 45658.90 -6600 1058 15878.0 39800.0 816000.0 ▇▁▁▁▁
age 0 1.00 39.99 14.59 16 27 40.0 53.0 64.0 ▇▆▆▇▇
# Same per-column summary, computed separately for each value of sex.
tx_5a %>%
  dplyr::group_by(sex) %>%
  skim()
Data summary
Name Piped data
Number of rows 41392
Number of columns 13
_______________________
Column type frequency:
character 3
numeric 9
________________________
Group variables sex

Variable type: character

skim_variable sex n_missing complete_rate min max empty n_unique whitespace
newpuma 0 0 1 7 7 0 11 0
newpuma 1 0 1 7 7 0 11 0
edu 0 0 1 7 13 0 3 0
edu 1 0 1 7 13 0 3 0
race 0 0 1 4 12 0 7 0
race 1 0 1 4 12 0 7 0

Variable type: numeric

skim_variable sex n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
perwt 0 0 1.00 21.12 18.36 1 10.0 16.0 27.0 331.0 ▇▁▁▁▁
perwt 1 0 1.00 20.84 17.18 1 10.0 16.0 27.0 237.0 ▇▁▁▁▁
strata 0 0 1.00 561866.90 147650.07 280048 600048.0 630148.0 670248.0 690048.0 ▃▁▁▁▇
strata 1 0 1.00 576921.91 138094.51 280048 600048.0 630148.0 670248.0 690048.0 ▂▁▁▁▇
edu3 0 0 1.00 2.09 0.77 1 1.0 2.0 3.0 3.0 ▅▁▇▁▇
edu3 1 0 1.00 2.20 0.77 1 2.0 2.0 3.0 3.0 ▅▁▇▁▇
lfpart 0 0 1.00 0.68 0.47 0 0.0 1.0 1.0 1.0 ▃▁▁▁▇
lfpart 1 0 1.00 0.61 0.49 0 0.0 1.0 1.0 1.0 ▅▁▁▁▇
presgl 0 0 1.00 28.78 19.32 0 17.5 32.1 45.3 81.5 ▆▇▆▅▁
presgl 1 0 1.00 28.73 21.86 0 0.0 32.8 47.8 81.5 ▇▃▇▅▁
empstat 0 0 1.00 1.69 0.93 1 1.0 1.0 3.0 3.0 ▇▁▁▁▅
empstat 1 0 1.00 1.82 0.97 1 1.0 1.0 3.0 3.0 ▇▁▁▁▆
fertyr 0 20826 0.00 NaN NA NA NA NA NA NA
fertyr 1 6504 0.68 0.06 0.25 0 0.0 0.0 0.0 1.0 ▇▁▁▁▁
inctot 0 0 1.00 36630.61 55275.11 -6242 3176.0 21170.0 50000.0 816000.0 ▇▁▁▁▁
inctot 1 0 1.00 21081.38 31309.88 -6600 0.0 12296.0 30000.0 587116.0 ▇▁▁▁▁
age 0 0 1.00 39.30 14.59 16 26.0 39.0 52.0 64.0 ▇▆▆▆▇
age 1 0 1.00 40.69 14.56 16 28.0 41.0 54.0 64.0 ▇▆▆▇▇
# Skim only the listed columns instead of the whole data frame.
skim(tx_5a, sex, empstat, fertyr, edu, presgl, inctot, age)
Data summary
Name tx_5a
Number of rows 41392
Number of columns 13
_______________________
Column type frequency:
character 1
numeric 6
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
edu 0 1 7 13 0 3 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
sex 0 1.00 0.50 0.50 0 0 0.0 1.0 1.0 ▇▁▁▁▇
empstat 0 1.00 1.76 0.95 1 1 1.0 3.0 3.0 ▇▁▁▁▅
fertyr 27330 0.34 0.06 0.25 0 0 0.0 0.0 1.0 ▇▁▁▁▁
presgl 0 1.00 28.76 20.62 0 0 32.1 46.9 81.5 ▇▆▇▆▁
inctot 0 1.00 28904.83 45658.90 -6600 1058 15878.0 39800.0 816000.0 ▇▁▁▁▁
age 0 1.00 39.99 14.59 16 27 40.0 53.0 64.0 ▇▆▆▇▇
# Skim the rows with sex == 1 only, summarised separately for each
# employment status (empstat). No employment filter is applied: every
# empstat value present in the data gets its own set of rows in the output.
# sex is numeric, so it is compared against a numeric literal rather than
# the string "1".
tx_5a %>%
  dplyr::group_by(empstat) %>%
  filter(sex == 1) %>%
  skim()
Data summary
Name Piped data
Number of rows 20566
Number of columns 13
_______________________
Column type frequency:
character 3
numeric 9
________________________
Group variables empstat

Variable type: character

skim_variable empstat n_missing complete_rate min max empty n_unique whitespace
newpuma 1 0 1 7 7 0 11 0
newpuma 2 0 1 7 7 0 11 0
newpuma 3 0 1 7 7 0 11 0
edu 1 0 1 7 13 0 3 0
edu 2 0 1 7 13 0 3 0
edu 3 0 1 7 13 0 3 0
race 1 0 1 4 12 0 7 0
race 2 0 1 4 12 0 6 0
race 3 0 1 4 12 0 7 0

Variable type: numeric

skim_variable empstat n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
perwt 1 0 1.00 20.78 17.42 1 10.00 16.0 26.00 237.0 ▇▁▁▁▁
perwt 2 0 1.00 22.17 16.95 1 10.00 16.0 29.00 86.0 ▇▃▂▁▁
perwt 3 0 1.00 20.80 16.85 1 10.00 16.0 27.00 170.0 ▇▁▁▁▁
strata 1 0 1.00 573787.09 139793.96 280048 600048.00 630148.0 670248.00 690048.0 ▂▁▁▁▇
strata 2 0 1.00 596106.65 124566.60 280048 610048.00 630148.0 670248.00 690048.0 ▂▁▁▁▇
strata 3 0 1.00 579733.09 136584.15 280048 600048.00 630148.0 670248.00 690048.0 ▂▁▁▁▇
edu3 1 0 1.00 2.38 0.70 1 2.00 3.0 3.00 3.0 ▂▁▆▁▇
edu3 2 0 1.00 2.13 0.72 1 2.00 2.0 3.00 3.0 ▃▁▇▁▆
edu3 3 0 1.00 1.94 0.79 1 1.00 2.0 3.00 3.0 ▇▁▇▁▆
lfpart 1 0 1.00 1.00 0.00 1 1.00 1.0 1.00 1.0 ▁▁▇▁▁
lfpart 2 0 1.00 1.00 0.00 1 1.00 1.0 1.00 1.0 ▁▁▇▁▁
lfpart 3 0 1.00 0.00 0.00 0 0.00 0.0 0.00 0.0 ▁▁▇▁▁
sex 1 0 1.00 1.00 0.00 1 1.00 1.0 1.00 1.0 ▁▁▇▁▁
sex 2 0 1.00 1.00 0.00 1 1.00 1.0 1.00 1.0 ▁▁▇▁▁
sex 3 0 1.00 1.00 0.00 1 1.00 1.0 1.00 1.0 ▁▁▇▁▁
presgl 1 0 1.00 41.10 14.29 0 32.80 41.9 50.30 81.5 ▂▃▇▆▁
presgl 2 0 1.00 25.97 18.55 0 0.00 30.9 36.30 78.3 ▇▆▇▃▁
presgl 3 0 1.00 11.09 18.77 0 0.00 0.0 20.30 78.3 ▇▁▁▁▁
fertyr 1 3680 0.69 0.05 0.23 0 0.00 0.0 0.00 1.0 ▇▁▁▁▁
fertyr 2 96 0.87 0.06 0.23 0 0.00 0.0 0.00 1.0 ▇▁▁▁▁
fertyr 3 2728 0.66 0.08 0.27 0 0.00 0.0 0.00 1.0 ▇▁▁▁▁
inctot 1 0 1.00 32279.97 33676.35 -6600 13223.25 24333.0 43035.00 587116.0 ▇▁▁▁▁
inctot 2 0 1.00 7775.86 16956.35 -2618 0.00 1952.0 9545.25 328138.0 ▇▁▁▁▁
inctot 3 0 1.00 6081.29 19948.83 -5184 0.00 0.0 6284.00 437801.0 ▇▁▁▁▁
age 1 0 1.00 41.91 13.04 16 31.00 43.0 53.00 64.0 ▅▆▆▇▇
age 2 0 1.00 33.23 13.12 16 21.00 30.0 44.00 64.0 ▇▅▃▃▂
age 3 0 1.00 39.58 16.36 16 24.00 40.0 55.00 64.0 ▇▅▅▅▇
# Descriptive statistics by sex using psych.
# describeBy() replaces the deprecated describe.by().

psych::describeBy(tx_5a, group = tx_5a$sex, digits = 2)
## Warning: describe.by is deprecated. Please use the describeBy function
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## 
##  Descriptive statistics by group 
## group: 0
##          vars     n      mean        sd   median   trimmed      mad    min
## perwt       1 20826     21.12     18.36     16.0     17.98    11.86      1
## strata      2 20826 561866.90 147650.07 630148.0 581822.58 59304.00 280048
## newpuma*    3 20826      5.64      3.20      6.0      5.58     4.45      1
## edu3        4 20826      2.09      0.77      2.0      2.12     1.48      1
## edu*        5 20826      1.91      0.77      2.0      1.88     1.48      1
## lfpart      6 20826      0.68      0.47      1.0      0.72     0.00      0
## sex         7 20826      0.00      0.00      0.0      0.00     0.00      0
## race*       8 20826      6.64      1.11      7.0      6.99     0.00      1
## presgl      9 20826     28.78     19.32     32.1     28.63    21.65      0
## empstat    10 20826      1.69      0.93      1.0      1.61     0.00      1
## fertyr     11     0       NaN        NA       NA       NaN       NA    Inf
## inctot     12 20826  36630.61  55275.11  21170.0  26690.66 30772.10  -6242
## age        13 20826     39.30     14.59     39.0     39.22    19.27     16
##               max    range  skew kurtosis      se
## perwt       331.0    330.0  2.78    14.79    0.13
## strata   690048.0 410000.0 -1.17    -0.44 1023.13
## newpuma*     11.0     10.0  0.11    -1.23    0.02
## edu3          3.0      2.0 -0.16    -1.29    0.01
## edu*          3.0      2.0  0.16    -1.29    0.01
## lfpart        1.0      1.0 -0.75    -1.43    0.00
## sex           0.0      0.0   NaN      NaN    0.00
## race*         7.0      6.0 -3.09     8.52    0.01
## presgl       81.5     81.5 -0.08    -0.82    0.13
## empstat       3.0      2.0  0.66    -1.52    0.01
## fertyr       -Inf     -Inf    NA       NA      NA
## inctot   816000.0 822242.0  4.82    36.03  383.02
## age          64.0     48.0  0.02    -1.26    0.10
## ------------------------------------------------------------ 
## group: 1
##          vars     n      mean        sd   median   trimmed      mad    min
## perwt       1 20566     20.84     17.18     16.0     17.97    11.86      1
## strata      2 20566 576921.91 138094.51 630148.0 600764.15 59304.00 280048
## newpuma*    3 20566      5.94      3.13      6.0      5.97     4.45      1
## edu3        4 20566      2.20      0.77      2.0      2.24     1.48      1
## edu*        5 20566      1.80      0.77      2.0      1.76     1.48      1
## lfpart      6 20566      0.61      0.49      1.0      0.63     0.00      0
## sex         7 20566      1.00      0.00      1.0      1.00     0.00      1
## race*       8 20566      6.74      0.91      7.0      7.00     0.00      1
## presgl      9 20566     28.73     21.86     32.8     28.20    25.95      0
## empstat    10 20566      1.82      0.97      1.0      1.78     0.00      1
## fertyr     11 14062      0.06      0.25      0.0      0.00     0.00      0
## inctot     12 20566  21081.38  31309.88  12296.0  15685.76 18230.05  -6600
## age        13 20566     40.69     14.56     41.0     40.88    19.27     16
##               max    range  skew kurtosis     se
## perwt       237.0    236.0  2.38     9.69   0.12
## strata   690048.0 410000.0 -1.44     0.33 962.95
## newpuma*     11.0     10.0 -0.03    -1.21   0.02
## edu3          3.0      2.0 -0.35    -1.24   0.01
## edu*          3.0      2.0  0.35    -1.24   0.01
## lfpart        1.0      1.0 -0.43    -1.81   0.00
## sex           1.0      0.0   NaN      NaN   0.00
## race*         7.0      6.0 -3.59    12.33   0.01
## presgl       81.5     81.5 -0.08    -1.29   0.15
## empstat       3.0      2.0  0.36    -1.83   0.01
## fertyr        1.0      1.0  3.54    10.50   0.00
## inctot   587116.0 593716.0  5.50    57.65 218.33
## age          64.0     48.0 -0.10    -1.23   0.10
library(table1)
## Warning: package 'table1' was built under R version 4.0.4
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
#Frequency Table 
#random visualizations
# ggplot2 must be attached before the first plot: aes(), geom_bar(), xlab(),
# ylab(), theme() and element_text() are all called unqualified below.
library(ggplot2)

# Bar chart: number of records per employment status code.
ggplot(data = tx_5a, aes(x = empstat)) +
  geom_bar() +
  xlab("X-axis label") +
  ylab("Frequency")

# Bar chart: number of records per PUMA, with the tick labels rotated so the
# 7-character identifiers do not overlap.
ggplot(tx_5a, aes(x = newpuma)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Distribution of age (one bar per distinct age value).
ggplot(tx_5a, aes(x = age)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Class of every column as a named character vector. vapply() fixes the
# result type; a column carrying several classes is reported as "a/b" instead
# of silently turning the whole result into a list.
vapply(
  tx_5a,
  function(column) paste(class(column), collapse = "/"),
  FUN.VALUE = character(1)
)
##       perwt      strata     newpuma        edu3         edu      lfpart 
##   "numeric"   "numeric" "character"   "numeric" "character"   "numeric" 
##         sex        race      presgl     empstat      fertyr      inctot 
##   "numeric" "character"   "numeric"   "integer"   "numeric"   "numeric" 
##         age 
##   "integer"
# library(summarytools)
# summarytools::freq(tx_5a$Type, order = "freq")
# options(survey.lonely.psu = "adjust")
# 
# des<-svydesign(ids=~1, strata=~strata, weights=~perwt, data = tx_5a )

# Cross-tabulation of sex (rows) by education level (columns).
table(tx_5a[["sex"]], tx_5a[["edu"]])
##    
##     college hs complete hs incomplete
##   0    7198        8394          5234
##   1    8486        7611          4469