library(tidyverse)
library(readxl)
library(DT)
library(magrittr)
library(GGally)
library(fastDummies)

# Data source: https://www.nlsinfo.org/content/cohorts/nlsy79

# Code Book ----
# Both codebook sheets live in the same workbook, so the path is defined once
# instead of being repeated in each read.
codebook_path <- "G:/My Drive/homework/Lovely J/Codebook+NLSY1979.xlsx"

# Sheet "Data Set Description", shown as an interactive table (5 rows per page).
read_excel(codebook_path, sheet = "Data Set Description") %>%
  datatable(options = list(pageLength = 5), caption = "Codebook 1")

# Sheet "Variable List", shown the same way.
read_excel(codebook_path, sheet = "Variable List") %>%
  datatable(options = list(pageLength = 5), caption = "Codebook 2")
# Import Data
# Read the survey CSV into `nlsy`.
nlsy <- read_csv(file = "G:/My Drive/homework/Lovely J/NLSY1979_1990.csv")

# View Data
# Interactive preview: 5 rows per page, automatic column widths, and
# per-column filter boxes placed below the table.
datatable(
  nlsy,
  options = list(pageLength = 5, autoWidth = TRUE),
  filter = "bottom",
  caption = "National Longitudinal Survey of Youth 1979"
)
# Subset Variables
# Keep the four given columns plus the four chosen ones, then discard rows
# carrying the codes "-3" / "-5" (presumably survey missing-data codes --
# confirm against the codebook) and any row that still contains an NA.
nlsy <- nlsy %>%
  select(
    # given
    YEAR_OF_BIRTH, COUNTRY_OF_BIRTH, SAMPLE_RACE, SAMPLE_SEX,
    # chosen (EVER_IN_POVERTY is not selected)
    WHEN_IN_POVERTY, EDU_DEGREE, EVER_DIVORCED_, REGION_
  ) %>%
  # Conditions separated by commas are combined with AND.
  filter(
    COUNTRY_OF_BIRTH != "-3",
    EDU_DEGREE != "-5",
    EVER_DIVORCED_ != "-5",
    REGION_ != "-3"
  ) %>%
  drop_na()

# Replace YEAR_OF_BIRTH with Age, placed ahead of COUNTRY_OF_BIRTH.
# `.keep = "unused"` drops YEAR_OF_BIRTH because the Age expression consumes it.
# NOTE(review): 79 presumably stands for survey year 1979 with two-digit birth
# years -- confirm against the codebook.
nlsy <- mutate(
  nlsy,
  Age = 79 - YEAR_OF_BIRTH,
  .before = COUNTRY_OF_BIRTH,
  .keep = "unused"
)

# Level counts per column: convert every column to a factor, then summarise.
summary(
  mutate(nlsy, across(everything(), .fns = as_factor))
)
##       Age               COUNTRY_OF_BIRTH                  SAMPLE_RACE  
##  17     :1341   IN OTHER COUNTRY: 613    NON-BLACK, NON-HISPANIC:5366  
##  19     :1268   IN THE US       :8977    HISPANIC               :1612  
##  16     :1267                            BLACK                  :2612  
##  18     :1264                                                          
##  20     :1168                                                          
##  21     :1106                                                          
##  (Other):2176                                                          
##   SAMPLE_SEX               WHEN_IN_POVERTY
##  FEMALE:4929   ADULT ONLY          :4943  
##  MALE  :4661   NEVER               :3292  
##                CHILD ONLY          : 179  
##                BOTH CHILD AND ADULT:1176  
##                                           
##                                           
##                                           
##                                EDU_DEGREE   EVER_DIVORCED_
##  High school diploma (or equivalent):5261   No :8066      
##  None                               :1012   Yes:1524      
##  Bachelor of Science (BS)           : 972                 
##  Associate/Junior College (AA)      : 940                 
##  Master's Degree (MA, MBA, MS, MSW) : 549                 
##  Bachelor of Arts Degree (BA)       : 540                 
##  (Other)                            : 316                 
##              REGION_    
##  1: NORTHEAST    :1718  
##  4: WEST         :1920  
##  3: SOUTH        :3721  
##  2: NORTH CENTRAL:2231  
##                         
##                         
## 
# Pairwise plot matrix of all variables, with Age treated as a factor and
# axis labels suppressed.
nlsy %>%
  mutate(Age = as_factor(Age)) %>%
  ggpairs(axisLabels = "none")

# Reference: https://cran.r-project.org/web/packages/fastDummies/vignettes/making-dummy-variables.html
# Reference: https://stackoverflow.com/a/57998108

# Dummy-encode every column except the first (Age), omitting the most frequent
# level of each variable and removing the source columns. The result is only
# displayed (5 rows per page), not stored.
datatable(
  dummy_columns(
    nlsy,
    select_columns = names(nlsy)[-1],
    remove_most_frequent_dummy = TRUE,
    remove_selected_columns = TRUE
  ),
  options = list(pageLength = 5)
)
# List the distinct values of each column. lapply() always returns a named
# list, whereas sapply() would silently collapse the result to a matrix if
# every column happened to have the same number of distinct values.
lapply(nlsy, unique)
## $Age
## [1] 20 18 17 19 15 21 16 22
## 
## $COUNTRY_OF_BIRTH
## [1] "IN OTHER COUNTRY" "IN THE US"       
## 
## $SAMPLE_RACE
## [1] "NON-BLACK, NON-HISPANIC" "HISPANIC"               
## [3] "BLACK"                  
## 
## $SAMPLE_SEX
## [1] "FEMALE" "MALE"  
## 
## $WHEN_IN_POVERTY
## [1] "ADULT ONLY"           "NEVER"                "CHILD ONLY"          
## [4] "BOTH CHILD AND ADULT"
## 
## $EDU_DEGREE
## [1] "High school diploma (or equivalent)" "Bachelor of Science (BS)"           
## [3] "Associate/Junior College (AA)"       "Master's Degree (MA, MBA, MS, MSW)" 
## [5] "Bachelor of Arts Degree (BA)"        "None"                               
## [7] "Other (SPECIFY)"                     "Professional Degree (MD, LLD, DDS)" 
## [9] "Doctoral Degree (PhD)"              
## 
## $EVER_DIVORCED_
## [1] "No"  "Yes"
## 
## $REGION_
## [1] "1: NORTHEAST"     "4: WEST"          "3: SOUTH"         "2: NORTH CENTRAL"
# Jittered scatter of degree against sex; point shape encodes country of
# birth and point colour encodes divorce status.
ggplot(
  nlsy,
  aes(
    x = SAMPLE_SEX,
    y = EDU_DEGREE,
    shape = COUNTRY_OF_BIRTH,
    colour = EVER_DIVORCED_
  )
) +
  geom_jitter()

# Same degree-by-sex jitter plot, split into a grid of panels: one row per
# WHEN_IN_POVERTY value and one column per REGION_ value.
ggplot(nlsy, aes(x = SAMPLE_SEX, y = EDU_DEGREE)) +
  geom_jitter() +
  facet_grid(rows = vars(WHEN_IN_POVERTY), cols = vars(REGION_))