library(tidyverse)
library(magrittr)
library(GGally)
library(corrplot)
library(moderndive)
library(DT)
library(readxl)
library(skimr)
library(fastDummies)
# library(scales) ?
# library(bloom) ?summary
# Import Data
baseline <- read_csv("G:/My Drive/homework/Lovely J/Baseline+survey.csv")
# Clean up Answer5
bad <-
baseline %>%
select(Answer5) %>%
distinct() %>%
filter(Answer5 != "1" &
Answer5 != "2" &
Answer5 != "3" &
Answer5 != "4" &
Answer5 != "5") %>%
pull()
good <-
c("1", "2", NA, "3", "1", "1", "1", "2", "3", "1", "2", "4", "3", "3", "1",
"3", "1", "1", "1", "1", "1", "1", "1", "1", "1", "3", "1")
for (i in 1:length(good)){
baseline$Answer5[baseline$Answer5 == bad[i]] <- good[i]
}
# Drop questions, change answer data types & names
baseline %<>%
select(-X1, -starts_with("Question")) %>%
mutate(Major = Answer1,
Age_Yrs = Answer2 %>% as.numeric(),
Work_Yrs = Answer3 %>% as.numeric(),
Stats_Yrs = Answer4 %>% as.numeric(),
R_Lvl = Answer5 %>% as.integer(.),
Cen_Tend_Lvl = Answer6 %>% as.integer(.),
Var_Lvl = Answer7 %>% as.integer(.),
Bivar_Lvl = Answer8 %>% as.integer(.),
Prob_Lvl = Answer9 %>% as.integer(.),
.keep = "unused")
# Replace NAs
# https://www.tutorialspoint.com/r/r_mean_median_mode.htm
getmode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
age_mode <- baseline$Age_Yrs %>% getmode(.)
baseline %<>% mutate(Age_Yrs = replace_na(Age_Yrs, age_mode))
rm(age_mode)
baseline %<>% mutate(Work_Yrs = round(Work_Yrs))
work_mode <- baseline$Work_Yrs %>% getmode(.)
baseline %<>% mutate(Work_Yrs = replace_na(Work_Yrs, work_mode))
rm(work_mode)
baseline %<>% mutate(Stats_Yrs = round(Stats_Yrs))
stats_mode <- baseline$Stats_Yrs %>% getmode(.)
baseline %<>% mutate(Stats_Yrs = replace_na(Stats_Yrs, stats_mode))
rm(stats_mode)
r_mode <- baseline$R_Lvl %>% getmode(.)
baseline %<>% mutate(R_Lvl = replace_na(R_Lvl, r_mode))
rm(r_mode)
cen_mode <- baseline$Cen_Tend_Lvl %>% getmode(.)
baseline %<>% mutate(Cen_Tend_Lvl = replace_na(Cen_Tend_Lvl, cen_mode))
baseline$Cen_Tend_Lvl[baseline$Cen_Tend_Lvl < 1] <- 1
baseline$Cen_Tend_Lvl[baseline$Cen_Tend_Lvl > 5] <- cen_mode
rm(cen_mode)
var_mode <- baseline$Var_Lvl %>% getmode(.)
baseline %<>% mutate(Var_Lvl = replace_na(Var_Lvl, var_mode))
baseline$Var_Lvl[baseline$Var_Lvl < 1] = 1
baseline$Var_Lvl[baseline$Var_Lvl > 5] = var_mode
rm(var_mode)
bivar_mode <- baseline$Bivar_Lvl %>% getmode(.)
baseline %<>% mutate(Bivar_Lvl = replace_na(Bivar_Lvl, bivar_mode))
baseline$Bivar_Lvl[baseline$Bivar_Lvl < 1] = 1
baseline$Bivar_Lvl[baseline$Bivar_Lvl > 5] = bivar_mode
rm(bivar_mode)
prob_mode <- baseline$Prob_Lvl %>% getmode(.)
baseline %<>% mutate(Prob_Lvl = replace_na(Prob_Lvl, prob_mode))
baseline$Prob_Lvl[baseline$Prob_Lvl < 1] = 1
baseline$Prob_Lvl[baseline$Prob_Lvl > 5] = prob_mode
rm(prob_mode)
rm(getmode)
rm(bad)
rm(good)
# Summaries
# baseline %>% glimpse()
# baseline %>% mutate(across(.cols = everything(), as.factor)) %>% summary()
baseline %>%
select(Age_Yrs, Work_Yrs, Stats_Yrs, R_Lvl, Cen_Tend_Lvl) %>% # arbitrary
mutate(across(.cols = everything(), as.factor)) %>%
ggpairs()
baseline %>%
select(Age_Yrs, Work_Yrs, Stats_Yrs, R_Lvl, Cen_Tend_Lvl) %>%
cor() %>%
corrplot.mixed(tl.pos = "lt")
model.fit <-
baseline %>%
select(Age_Yrs, Work_Yrs, Stats_Yrs, R_Lvl, Cen_Tend_Lvl) %>%
lm(Age_Yrs ~ ., data = .)
# model.fit %>% summary()
model.fit %>% get_regression_table(digits = 2) %>% datatable()
model.fit %>% get_regression_summaries() %>% round(1) %>% datatable()
baseline %>%
ggplot(aes(sample = Age_Yrs)) +
geom_qq() +
geom_qq_line() +
ggtitle("Response Variable (Age_Yrs) QQ Plot")
model.fit$residuals %>%
as.data.frame() %>%
ggplot(aes(sample = .)) +
geom_qq() +
geom_qq_line() +
ggtitle("Residuals QQ Plot")
model.fit$fitted.values %>%
as.data.frame() %>%
ggplot(aes(sample = .)) +
geom_qq() +
geom_qq_line() +
ggtitle("Fitted Values QQ Plot")
baseline %>%
select(Age_Yrs, Work_Yrs, Stats_Yrs, R_Lvl, Cen_Tend_Lvl) %>%
bind_cols(data.frame(Residuals = residuals(model.fit),
Fitted = fitted(model.fit)
)
) %>%
pivot_longer(!Residuals) %>%
ggplot(aes(Residuals, value)) +
geom_point() +
facet_wrap(vars(name)) #, scales = "free"
model.fit <-
baseline %>%
select(Age_Yrs, Work_Yrs, Stats_Yrs, R_Lvl, Cen_Tend_Lvl) %>%
mutate(across(.cols = c(Work_Yrs:Cen_Tend_Lvl), as.factor)) %>% # make categories
lm(Age_Yrs ~ Work_Yrs * Stats_Yrs * R_Lvl * Cen_Tend_Lvl, # make interactions
data = .)
model.fit %>%
get_regression_table(digits = 2) %>%
filter(complete.cases(.)) %>% # drops 984 NA interactions
filter(p_value < 0.1) %>% # drops 47 insignificant coefficient estimates
datatable((options = list(pageLength = 5)))
https://www.nlsinfo.org/content/cohorts/nlsy79
# Code Book
read_excel("G:/My Drive/homework/Lovely J/Codebook+NLSY1979.xlsx",
sheet = "Data Set Description") %>%
datatable(options = list(pageLength = 5), caption = 'Codebook 1')
# Code Book
read_excel("G:/My Drive/homework/Lovely J/Codebook+NLSY1979.xlsx",
sheet = "Variable List") %>%
datatable(options = list(pageLength = 5), caption = 'Codebook 2')
# Import Data
nlsy <- read_csv("G:/My Drive/homework/Lovely J/NLSY1979_1990.csv")
nlsy %<>%
select(-ID, -YEAR, -EDU_LOAN_) %>% # not usefull
select(-EVER_EDU_LOAN, -c(HOURS_WORKED_PER_WEEK_:TOTAL_EDU_LOAN)) # too many errors
nlsy %<>%
filter(COUNTRY_OF_BIRTH != "-3") %>% # 1
filter(C1DOB_Y != "-1" & C1DOB_Y != "-2" & C1DOB_Y != "-3") %>% # 7
filter(HAVING_HEALTHPLAN != "-2" & HAVING_HEALTHPLAN != "-3") %>% # 12
filter(TNFI_ >= 0) %>% #1 1742
filter(POVSTATUS_ != "-3") %>% # 1742
filter(REGION_ != "-3") %>% # 18
filter(MARSTAT_KEY_ != "-3") %>% # 1
filter(WKSUEMP_PCY_ >= 0) %>% # 465
filter(URBAN_RURAL_ != "-5" & URBAN_RURAL_ != "-4") %>% # 74
filter(JOBSNUM_ >= 0) %>% # 9
filter(!is.na(EVER_IN_POVERTY)) %>% # 151
filter(INCOME_ >= 0) %>% # 236
# filter(EVER_EDU_LOAN != "-4" & !is.na(EVER_EDU_LOAN)) %>% # 5203
filter(EDU_DEGREE != "-5") %>% # 648
filter(EVER_DIVORCED_ != "-5") # %>% # 60
# filter(HOURS_WORKED_PER_WEEK_ >= 0) %>% # 7111
# filter(MAJOR_1_ >= 0) %>% # 5267
# filter(MAJOR_2_ >= 0) %>% # 10352
# filter(EDU_MAJOR >= 0) %>% 4836
# filter(AMT_EDU_LOAN_ >= 0) %>% # 10236
# filter(TOTAL_EDU_LOAN >= 0) %>% # 7798
# View Data
nlsy %>%
datatable(options = list(pageLength = 5,
autoWidth = TRUE),
filter = 'bottom',
caption = 'National Longitudinal Survey of Youth 1979 Subset')
nlsy.cat <-
nlsy %>%
mutate(across(.cols = c(YEAR_OF_BIRTH:FAMSIZE_,
POVSTATUS_:WHEN_IN_POVERTY,
EDU_DEGREE:CAL_YEAR_JOBS_,
EMP_STATUS_),
as_factor))
# nlsy.cat %>% summary()
# nlsy.cat %>% select(where(is.factor)) %>% sapply(., levels)
model.full <-
nlsy %>%
lm(INCOME_ ~ ., data = .)
model.full %>%
get_regression_table(digits = 2) %>%
select(-statistic, -lower_ci, -upper_ci) %>%
datatable(options = list(pageLength = 5),
caption = 'Full Additive Model')
model.full %>% get_regression_summaries() %>% round(1) %>% datatable()