library(tidyverse)
library(magrittr)
library(GGally)
library(corrplot)
library(moderndive)
library(DT)
library(readxl)
library(skimr)
library(fastDummies)
# library(scales) ?
# library(bloom) ?summary

Module 5

Recycle/Reuse Code from Module 1

# Import Data
baseline <- read_csv("G:/My Drive/homework/Lovely J/Baseline+survey.csv")

# Clean up Answer5
bad <- 
  baseline %>%
  select(Answer5) %>%
  distinct() %>%
  filter(Answer5 != "1" &
         Answer5 != "2" &
         Answer5 != "3" &
         Answer5 != "4" &
         Answer5 != "5") %>%
  pull()

good <-
  c("1", "2", NA, "3", "1", "1", "1", "2", "3", "1", "2", "4", "3", "3", "1", 
    "3", "1", "1", "1", "1", "1", "1", "1", "1", "1", "3", "1")

for (i in 1:length(good)){
  baseline$Answer5[baseline$Answer5 == bad[i]] <- good[i]
}

# Drop questions, change answer data types & names
baseline %<>% 
  select(-X1, -starts_with("Question")) %>% 
  mutate(Major = Answer1,
         Age_Yrs = Answer2 %>% as.numeric(),
         Work_Yrs = Answer3 %>% as.numeric(),
         Stats_Yrs = Answer4 %>% as.numeric(),
         R_Lvl = Answer5 %>% as.integer(.),
         Cen_Tend_Lvl = Answer6 %>% as.integer(.),
         Var_Lvl = Answer7 %>% as.integer(.),
         Bivar_Lvl = Answer8 %>% as.integer(.),
         Prob_Lvl = Answer9 %>% as.integer(.),
         .keep = "unused")

# Replace NAs
# https://www.tutorialspoint.com/r/r_mean_median_mode.htm
getmode <- function(v) {
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}

age_mode <- baseline$Age_Yrs %>% getmode(.)
baseline %<>% mutate(Age_Yrs = replace_na(Age_Yrs, age_mode))
rm(age_mode)

baseline %<>% mutate(Work_Yrs = round(Work_Yrs))
work_mode <- baseline$Work_Yrs %>% getmode(.) 
baseline %<>% mutate(Work_Yrs = replace_na(Work_Yrs, work_mode))
rm(work_mode)

baseline %<>% mutate(Stats_Yrs = round(Stats_Yrs))
stats_mode <- baseline$Stats_Yrs %>% getmode(.)
baseline %<>% mutate(Stats_Yrs = replace_na(Stats_Yrs, stats_mode))
rm(stats_mode)

r_mode <- baseline$R_Lvl %>% getmode(.)
baseline %<>% mutate(R_Lvl = replace_na(R_Lvl, r_mode))
rm(r_mode)

cen_mode <- baseline$Cen_Tend_Lvl %>% getmode(.)
baseline %<>% mutate(Cen_Tend_Lvl = replace_na(Cen_Tend_Lvl, cen_mode))
baseline$Cen_Tend_Lvl[baseline$Cen_Tend_Lvl < 1] <- 1
baseline$Cen_Tend_Lvl[baseline$Cen_Tend_Lvl > 5] <- cen_mode
rm(cen_mode)

var_mode <- baseline$Var_Lvl %>% getmode(.)
baseline %<>% mutate(Var_Lvl = replace_na(Var_Lvl, var_mode))
baseline$Var_Lvl[baseline$Var_Lvl < 1] = 1
baseline$Var_Lvl[baseline$Var_Lvl > 5] = var_mode
rm(var_mode)

bivar_mode <- baseline$Bivar_Lvl %>% getmode(.)
baseline %<>% mutate(Bivar_Lvl = replace_na(Bivar_Lvl, bivar_mode))
baseline$Bivar_Lvl[baseline$Bivar_Lvl < 1] = 1
baseline$Bivar_Lvl[baseline$Bivar_Lvl > 5] = bivar_mode
rm(bivar_mode)

prob_mode <- baseline$Prob_Lvl %>% getmode(.)
baseline %<>% mutate(Prob_Lvl = replace_na(Prob_Lvl, prob_mode))
baseline$Prob_Lvl[baseline$Prob_Lvl < 1] = 1
baseline$Prob_Lvl[baseline$Prob_Lvl > 5] = prob_mode
rm(prob_mode)

rm(getmode)
rm(bad)
rm(good)

# Summaries
# baseline %>% glimpse()
# baseline %>% mutate(across(.cols = everything(), as.factor)) %>% summary()

Part (1)

baseline %>% 
  select(Age_Yrs, Work_Yrs, Stats_Yrs, R_Lvl, Cen_Tend_Lvl) %>% # arbitrary
  mutate(across(.cols = everything(), as.factor)) %>%
  ggpairs()

baseline %>% 
  select(Age_Yrs, Work_Yrs, Stats_Yrs, R_Lvl, Cen_Tend_Lvl) %>% 
  cor() %>%
  corrplot.mixed(tl.pos = "lt")

Part (2)

No Interaction or Categorical Data Model

model.fit <-
  baseline %>%
  select(Age_Yrs, Work_Yrs, Stats_Yrs, R_Lvl, Cen_Tend_Lvl) %>%
  lm(Age_Yrs ~ ., data = .)

# model.fit %>% summary()

model.fit %>% get_regression_table(digits = 2) %>% datatable()
model.fit %>% get_regression_summaries() %>% round(1) %>% datatable()
baseline %>%
  ggplot(aes(sample = Age_Yrs)) +
  geom_qq() +
  geom_qq_line() +
  ggtitle("Response Variable (Age_Yrs) QQ Plot")

model.fit$residuals %>%
  as.data.frame() %>%
  ggplot(aes(sample = .)) +
  geom_qq() +
  geom_qq_line() +
  ggtitle("Residuals QQ Plot")

model.fit$fitted.values %>%
  as.data.frame() %>%
  ggplot(aes(sample = .)) +
  geom_qq() +
  geom_qq_line() +
  ggtitle("Fitted Values QQ Plot")

baseline %>%
  select(Age_Yrs, Work_Yrs, Stats_Yrs, R_Lvl, Cen_Tend_Lvl) %>%
  bind_cols(data.frame(Residuals = residuals(model.fit),
                       Fitted = fitted(model.fit)
                       )
            ) %>%
  pivot_longer(!Residuals) %>%
  ggplot(aes(Residuals, value)) +
  geom_point() +
  facet_wrap(vars(name)) #, scales = "free"

Interaction & Categorical Data Model

model.fit <-
  baseline %>%
  select(Age_Yrs, Work_Yrs, Stats_Yrs, R_Lvl, Cen_Tend_Lvl) %>%
  mutate(across(.cols = c(Work_Yrs:Cen_Tend_Lvl), as.factor)) %>% # make categories
  lm(Age_Yrs ~ Work_Yrs * Stats_Yrs * R_Lvl * Cen_Tend_Lvl, # make interactions
     data = .)

model.fit %>%
  get_regression_table(digits = 2) %>%
  filter(complete.cases(.)) %>% # drops 984 NA interactions
  filter(p_value < 0.1) %>% # drops 47 insignificant coefficient estimates
  datatable((options = list(pageLength = 5))) 

Module 6

Recycle/Reuse Code from Module 3

https://www.nlsinfo.org/content/cohorts/nlsy79

# Code Book
read_excel("G:/My Drive/homework/Lovely J/Codebook+NLSY1979.xlsx",
           sheet = "Data Set Description") %>%
  datatable(options = list(pageLength = 5), caption = 'Codebook 1')
# Code Book
read_excel("G:/My Drive/homework/Lovely J/Codebook+NLSY1979.xlsx",
           sheet = "Variable List") %>%
  datatable(options = list(pageLength = 5), caption = 'Codebook 2')
# Import Data
nlsy <- read_csv("G:/My Drive/homework/Lovely J/NLSY1979_1990.csv")

nlsy %<>%
  select(-ID, -YEAR, -EDU_LOAN_) %>% # not usefull
  select(-EVER_EDU_LOAN, -c(HOURS_WORKED_PER_WEEK_:TOTAL_EDU_LOAN)) # too many errors

nlsy %<>%
  filter(COUNTRY_OF_BIRTH != "-3") %>% # 1
  filter(C1DOB_Y != "-1" & C1DOB_Y != "-2" & C1DOB_Y != "-3") %>% # 7
  filter(HAVING_HEALTHPLAN != "-2" & HAVING_HEALTHPLAN != "-3") %>% # 12
  filter(TNFI_ >= 0) %>% #1 1742
  filter(POVSTATUS_ != "-3") %>% # 1742
  filter(REGION_ != "-3") %>% # 18
  filter(MARSTAT_KEY_ != "-3") %>% # 1
  filter(WKSUEMP_PCY_ >= 0) %>% # 465
  filter(URBAN_RURAL_ != "-5" & URBAN_RURAL_ != "-4") %>% # 74
  filter(JOBSNUM_ >= 0) %>% # 9
  filter(!is.na(EVER_IN_POVERTY)) %>% # 151
  filter(INCOME_ >= 0) %>% # 236  
  # filter(EVER_EDU_LOAN != "-4" & !is.na(EVER_EDU_LOAN)) %>% # 5203
  filter(EDU_DEGREE != "-5") %>% # 648
  filter(EVER_DIVORCED_ != "-5") # %>% # 60
  # filter(HOURS_WORKED_PER_WEEK_ >= 0) %>% # 7111
  # filter(MAJOR_1_ >= 0) %>% # 5267
  # filter(MAJOR_2_ >= 0) %>% # 10352
  # filter(EDU_MAJOR >= 0) %>% 4836
  # filter(AMT_EDU_LOAN_ >= 0) %>% # 10236
  # filter(TOTAL_EDU_LOAN >= 0) %>% # 7798

# View Data
nlsy %>%
  datatable(options = list(pageLength = 5,
                                  autoWidth = TRUE),
                   filter = 'bottom',
                   caption = 'National Longitudinal Survey of Youth 1979 Subset')
nlsy.cat <-
  nlsy %>% 
  mutate(across(.cols = c(YEAR_OF_BIRTH:FAMSIZE_,
                          POVSTATUS_:WHEN_IN_POVERTY,
                          EDU_DEGREE:CAL_YEAR_JOBS_,
                          EMP_STATUS_),
                as_factor))

# nlsy.cat %>% summary()

# nlsy.cat %>% select(where(is.factor)) %>% sapply(., levels)

Part (1)

Full Additive Model w/o explict factors

model.full <-
  nlsy %>%
  lm(INCOME_ ~ ., data = .)

model.full %>%
  get_regression_table(digits = 2) %>% 
  select(-statistic, -lower_ci, -upper_ci) %>%
  datatable(options = list(pageLength = 5),
            caption = 'Full Additive Model')
model.full %>% get_regression_summaries() %>% round(1) %>% datatable()