library(tidyverse)
library(readxl)
library(DT)
library(magrittr)
library(GGally)
library(fastDummies)
# NLSY79 cohort overview: https://www.nlsinfo.org/content/cohorts/nlsy79
# Codebook ----
# Show both sheets of the codebook workbook as interactive tables,
# five rows per page.

# Sheet "Data Set Description"
datatable(
  read_excel(
    "G:/My Drive/homework/Lovely J/Codebook+NLSY1979.xlsx",
    sheet = "Data Set Description"
  ),
  options = list(pageLength = 5),
  caption = "Codebook 1"
)

# Sheet "Variable List"
datatable(
  read_excel(
    "G:/My Drive/homework/Lovely J/Codebook+NLSY1979.xlsx",
    sheet = "Variable List"
  ),
  options = list(pageLength = 5),
  caption = "Codebook 2"
)
# Import Data ----
# Read the survey extract from CSV into `nlsy`.
# NOTE(review): the absolute path is machine-specific; adjust before running
# elsewhere.
nlsy <- read_csv(file = "G:/My Drive/homework/Lovely J/NLSY1979_1990.csv")

# View Data ----
# Interactive preview of the raw table: five rows per page, automatic column
# widths, and per-column filter boxes placed below the table.
datatable(
  nlsy,
  caption = "National Longitudinal Survey of Youth 1979",
  filter = "bottom",
  options = list(autoWidth = TRUE, pageLength = 5)
)
# Subset Variables ----
# Keep only the eight columns used in the rest of the script.
nlsy <- nlsy %>%
  select(
    # given
    YEAR_OF_BIRTH,
    COUNTRY_OF_BIRTH,
    SAMPLE_RACE,
    SAMPLE_SEX,
    # chosen
    WHEN_IN_POVERTY,
    EDU_DEGREE,
    EVER_DIVORCED_,
    REGION_
  )
# Remove Unusable Rows ----
# Drop rows whose COUNTRY_OF_BIRTH or REGION_ equals "-3", or whose EDU_DEGREE
# or EVER_DIVORCED_ equals "-5", then drop any row that still contains an NA.
# NOTE(review): "-3" / "-5" look like survey missing-response codes -- confirm
# their exact meaning against the codebook.
nlsy <- nlsy %>%
  filter(
    COUNTRY_OF_BIRTH != "-3",
    EDU_DEGREE != "-5",
    EVER_DIVORCED_ != "-5",
    REGION_ != "-3"
  ) %>%
  drop_na()
# Derive Age ----
# Replace YEAR_OF_BIRTH with Age = 79 - YEAR_OF_BIRTH, placed in front of
# COUNTRY_OF_BIRTH; `.keep = "unused"` removes the source column.
# NOTE(review): assumes YEAR_OF_BIRTH is a two-digit year and 79 stands for
# 1979 -- confirm against the codebook.
nlsy <- mutate(
  nlsy,
  Age = 79 - YEAR_OF_BIRTH,
  .before = COUNTRY_OF_BIRTH,
  .keep = "unused"
)
# Convert every column to a factor (on a temporary copy; `nlsy` itself is not
# modified) so that summary() reports a count per level.
summary(
  mutate(nlsy, across(everything(), .fns = as_factor))
)
## Age COUNTRY_OF_BIRTH SAMPLE_RACE
## 17 :1341 IN OTHER COUNTRY: 613 NON-BLACK, NON-HISPANIC:5366
## 19 :1268 IN THE US :8977 HISPANIC :1612
## 16 :1267 BLACK :2612
## 18 :1264
## 20 :1168
## 21 :1106
## (Other):2176
## SAMPLE_SEX WHEN_IN_POVERTY
## FEMALE:4929 ADULT ONLY :4943
## MALE :4661 NEVER :3292
## CHILD ONLY : 179
## BOTH CHILD AND ADULT:1176
##
##
##
## EDU_DEGREE EVER_DIVORCED_
## High school diploma (or equivalent):5261 No :8066
## None :1012 Yes:1524
## Bachelor of Science (BS) : 972
## Associate/Junior College (AA) : 940
## Master's Degree (MA, MBA, MS, MSW) : 549
## Bachelor of Arts Degree (BA) : 540
## (Other) : 316
## REGION_
## 1: NORTHEAST :1718
## 4: WEST :1920
## 3: SOUTH :3721
## 2: NORTH CENTRAL:2231
##
##
##
# Pairwise plot matrix of all variables. Age is converted to a factor for the
# plot only, and axis labels are switched off.
ggpairs(
  mutate(nlsy, Age = as_factor(Age)),
  axisLabels = "none"
)
# Reference: https://cran.r-project.org/web/packages/fastDummies/vignettes/making-dummy-variables.html
# Reference: https://stackoverflow.com/a/57998108
# Dummy-encode every column except the first (Age) and show the result as an
# interactive table. For each encoded variable the most frequent level's dummy
# is omitted, and the original columns are removed.
datatable(
  dummy_columns(
    nlsy,
    select_columns = names(nlsy)[-1],
    remove_most_frequent_dummy = TRUE,
    remove_selected_columns = TRUE
  ),
  options = list(pageLength = 5)
)
# List the distinct values of every column. lapply() always returns a named
# list; sapply() would silently collapse the result to a matrix or vector if
# every column happened to have the same number of distinct values.
lapply(nlsy, unique)
## $Age
## [1] 20 18 17 19 15 21 16 22
##
## $COUNTRY_OF_BIRTH
## [1] "IN OTHER COUNTRY" "IN THE US"
##
## $SAMPLE_RACE
## [1] "NON-BLACK, NON-HISPANIC" "HISPANIC"
## [3] "BLACK"
##
## $SAMPLE_SEX
## [1] "FEMALE" "MALE"
##
## $WHEN_IN_POVERTY
## [1] "ADULT ONLY" "NEVER" "CHILD ONLY"
## [4] "BOTH CHILD AND ADULT"
##
## $EDU_DEGREE
## [1] "High school diploma (or equivalent)" "Bachelor of Science (BS)"
## [3] "Associate/Junior College (AA)" "Master's Degree (MA, MBA, MS, MSW)"
## [5] "Bachelor of Arts Degree (BA)" "None"
## [7] "Other (SPECIFY)" "Professional Degree (MD, LLD, DDS)"
## [9] "Doctoral Degree (PhD)"
##
## $EVER_DIVORCED_
## [1] "No" "Yes"
##
## $REGION_
## [1] "1: NORTHEAST" "4: WEST" "3: SOUTH" "2: NORTH CENTRAL"
# Jittered scatter of degree against sex; point shape encodes country of
# birth and point colour encodes divorce status.
ggplot(
  nlsy,
  aes(
    x = SAMPLE_SEX,
    y = EDU_DEGREE,
    shape = COUNTRY_OF_BIRTH,
    colour = EVER_DIVORCED_
  )
) +
  geom_jitter()
# Jittered scatter of degree against sex, split into a grid of panels:
# one row per WHEN_IN_POVERTY value, one column per REGION_ value.
ggplot(nlsy, aes(x = SAMPLE_SEX, y = EDU_DEGREE)) +
  geom_jitter() +
  facet_grid(
    rows = vars(WHEN_IN_POVERTY),
    cols = vars(REGION_)
  )