Using a data set of your choice, do the following:
1) define an outcome, this can be of any form, but use an appropriate distribution for your outcome. Also define at least 1 continuous predictor.
My outcome variable will be a binary variable of education called College with the reference (0) being no college and 1 being those who have college education. The continuous predictor that will be used will be age.
2) Using the gam() function, estimate a model with only linear terms in your model
With only linear terms in the model age is not statistically signifant but Hispanics were with Non Hispanics having bigher educational attainment than Hispanics and those who were Born outside of the US tended to have lower educational attainment than those born in the US.
3) Repeat Step 2, but include a smooth term for at least one continuous variable
The same findings are also found in this model. When observing the variable of age itself as a smooth term it shows that it does not have a linear relationship with education being that it is statisitcally significant. So this relationship between these two variables is non linear.
4) Test if the model in step 3 is fitting the data better than the purely linear model.
After conducting the LRT Test it is shown with a statistical significane well below the .001 level that the model with the smooth term of age is a better model than the one with just linear terms.
5) Produce a plot of the smooth effect from the model in step 3
As it is seen in the scatterplot of the model of the smooth term it can be easily seen that the muliple knots within the age variable in its relationship with education has multiple knots with much data that will be missed if a linarity is assumed. This is further validation of usuage of splines assuming non linear relationships between the variables.
library(haven)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(scales)
library(sur)
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library(summarytools)
library(Rmisc)
## Loading required package: lattice
library(car)
## Loading required package: carData
##
## Attaching package: 'carData'
## The following objects are masked from 'package:sur':
##
## Anscombe, States
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(forcats)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.6 v purrr 0.3.4
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x plyr::arrange() masks dplyr::arrange()
## x readr::col_factor() masks scales::col_factor()
## x purrr::compact() masks plyr::compact()
## x plyr::count() masks dplyr::count()
## x purrr::discard() masks scales::discard()
## x plyr::failwith() masks dplyr::failwith()
## x dplyr::filter() masks stats::filter()
## x plyr::id() masks dplyr::id()
## x dplyr::lag() masks stats::lag()
## x plyr::mutate() masks dplyr::mutate()
## x car::recode() masks dplyr::recode()
## x plyr::rename() masks dplyr::rename()
## x purrr::some() masks car::some()
## x plyr::summarise() masks dplyr::summarise()
## x plyr::summarize() masks dplyr::summarize()
## x tibble::view() masks summarytools::view()
library(survey)
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(grid)
library(Matrix)
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
## The following object is masked from 'package:purrr':
##
## lift
library(mgcv)
## Warning: package 'mgcv' was built under R version 4.1.3
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.8-40. For overview type 'help("mgcv-package")'.
library(VGAM)
## Loading required package: stats4
## Loading required package: splines
##
## Attaching package: 'VGAM'
## The following object is masked from 'package:mgcv':
##
## s
## The following object is masked from 'package:caret':
##
## predictors
## The following object is masked from 'package:survey':
##
## calibrate
## The following object is masked from 'package:tidyr':
##
## fill
## The following object is masked from 'package:car':
##
## logit
library(svyVGAM)
library(gtsummary)
##
## Attaching package: 'gtsummary'
## The following object is masked from 'package:plyr':
##
## mutate
gss2021_ZERODraft<-read_dta("C:\\Users\\BTP\\Desktop\\STATS 2 FOLDER\\2021_sas\\gss2021.dta")
recode hispanic
gss2021_ZERODraft$subgrouphis <-Recode(gss2021_ZERODraft$hispanic, recodes="1 = 0; 2:50 = 1; else=NA", as.factor=T)
gss2021_ZERODraft %>%
tabyl(subgrouphis)
## subgrouphis n percent valid_percent
## 0 3544 0.87896825 0.8864432
## 1 454 0.11259921 0.1135568
## <NA> 34 0.00843254 NA
subgrouphis_1<-as.factor(ifelse(gss2021_ZERODraft$subgrouphis==1, "Hispanic", "Non Hispanic"))
tabyl(subgrouphis_1)
## subgrouphis_1 n percent valid_percent
## Hispanic 454 0.11259921 0.1135568
## Non Hispanic 3544 0.87896825 0.8864432
## <NA> 34 0.00843254 NA
recode education outcome variable for Multinomial model for those with less than high school, high school, and college
gss2021_ZERODraft$College <-Recode(gss2021_ZERODraft$educ, recodes="0:12 = 0; 13:20 = 1; else=NA", as.factor=T)
gss2021_ZERODraft$College<-relevel(gss2021_ZERODraft$College, ref = "1")
gss2021_ZERODraft %>%
tabyl(educ)
## educ n percent valid_percent
## 0 9 0.0022321429 0.0022692890
## 1 1 0.0002480159 0.0002521432
## 2 2 0.0004960317 0.0005042864
## 3 3 0.0007440476 0.0007564297
## 4 1 0.0002480159 0.0002521432
## 5 2 0.0004960317 0.0005042864
## 6 15 0.0037202381 0.0037821483
## 7 5 0.0012400794 0.0012607161
## 8 25 0.0062003968 0.0063035804
## 9 32 0.0079365079 0.0080685830
## 10 52 0.0128968254 0.0131114473
## 11 83 0.0205853175 0.0209278870
## 12 829 0.2056051587 0.2090267272
## 13 277 0.0687003968 0.0698436712
## 14 542 0.1344246032 0.1366616238
## 15 208 0.0515873016 0.0524457892
## 16 942 0.2336309524 0.2375189107
## 17 258 0.0639880952 0.0650529501
## 18 351 0.0870535714 0.0885022693
## 19 113 0.0280257937 0.0284921836
## 20 216 0.0535714286 0.0544629349
## NA 66 0.0163690476 NA
gss2021_ZERODraft %>%
tabyl(College)
## College n percent valid_percent
## 1 2907 0.72098214 0.7329803
## 0 1059 0.26264881 0.2670197
## <NA> 66 0.01636905 NA
sex
gss2021_ZERODraft$subgroupsex <-Recode(gss2021_ZERODraft$sex, recodes="1:1 = 0; 2:2 = 1; else=NA",)
gss2021_ZERODraft %>%
tabyl(subgroupsex)
## subgroupsex n percent valid_percent
## 0 1736 0.43055556 0.4406091
## 1 2204 0.54662698 0.5593909
## NA 92 0.02281746 NA
subgroupsex_1<-as.factor(ifelse(gss2021_ZERODraft$subgroupsex==1, "Women", "Men"))
tabyl(subgroupsex_1)
## subgroupsex_1 n percent valid_percent
## Men 1736 0.43055556 0.4406091
## Women 2204 0.54662698 0.5593909
## <NA> 92 0.02281746 NA
income of household of respondent when age 16
gss2021_ZERODraft$subgroupincom16 <-Recode(gss2021_ZERODraft$incom16, recodes="1:2 = 0; 3:5 = 1; else=NA",)
gss2021_ZERODraft %>%
tabyl(subgroupincom16)
## subgroupincom16 n percent valid_percent
## 0 1434 0.35565476 0.374804
## 1 2392 0.59325397 0.625196
## NA 206 0.05109127 NA
subgroupincom16_1<-as.factor(ifelse(gss2021_ZERODraft$subgroupincom16==1, "secure economic resources at 16", "insecure economic resources"))
tabyl(subgroupincom16_1)
## subgroupincom16_1 n percent valid_percent
## insecure economic resources 1434 0.35565476 0.374804
## secure economic resources at 16 2392 0.59325397 0.625196
## <NA> 206 0.05109127 NA
if respondent was born in the US
gss2021_ZERODraft$subgroupBorn <-Recode(gss2021_ZERODraft$born, recodes="1:1 = 1; 2:2 = 0; else=NA",)
gss2021_ZERODraft %>%
tabyl(subgroupBorn)
## subgroupBorn n percent valid_percent
## 0 444 0.11011905 0.1121212
## 1 3516 0.87202381 0.8878788
## NA 72 0.01785714 NA
subgroupborn_1<-as.factor(ifelse(gss2021_ZERODraft$subgroupBorn==1, "Born in US", "Not born in US"))
tabyl(subgroupborn_1)
## subgroupborn_1 n percent valid_percent
## Born in US 3516 0.87202381 0.8878788
## Not born in US 444 0.11011905 0.1121212
## <NA> 72 0.01785714 NA
if respondents ever been disrespected
gss2021_ZERODraft$subgroupdisrspct <-Recode(gss2021_ZERODraft$disrspct, recodes="1:5 = 1; 6:6 = 0; else=NA",)
gss2021_ZERODraft %>%
tabyl(subgroupdisrspct)
## subgroupdisrspct n percent valid_percent
## 0 554 0.1374008 0.212995
## 1 2047 0.5076885 0.787005
## NA 1431 0.3549107 NA
subgroupdisrspct_1<-as.factor(ifelse(gss2021_ZERODraft$subgroupdisrspct==1, "respondent has been disrespected", "Not being disrespected"))
tabyl(subgroupdisrspct_1)
## subgroupdisrspct_1 n percent valid_percent
## Not being disrespected 554 0.1374008 0.212995
## respondent has been disrespected 2047 0.5076885 0.787005
## <NA> 1431 0.3549107 NA
if respondents ever been called or treated like they were not smart.
gss2021_ZERODraft$subgroupnotsmart <-Recode(gss2021_ZERODraft$notsmart, recodes="1:5 = 1; 6:6 = 0; else=NA",)
gss2021_ZERODraft %>%
tabyl(subgroupnotsmart)
## subgroupnotsmart n percent valid_percent
## 0 867 0.2150298 0.3332052
## 1 1735 0.4303075 0.6667948
## NA 1430 0.3546627 NA
subgroupnotsmart_1<-as.factor(ifelse(gss2021_ZERODraft$subgroupnotsmart==1, "respondent was told or treated as if they are not smart", "Never experienced that sort of treatment of not being smart"))
tabyl(subgroupnotsmart_1)
## subgroupnotsmart_1 n percent
## Never experienced that sort of treatment of not being smart 867 0.2150298
## respondent was told or treated as if they are not smart 1735 0.4303075
## <NA> 1430 0.3546627
## valid_percent
## 0.3332052
## 0.6667948
## NA
age cut into intervals
age1<-cut(gss2021_ZERODraft$age,
breaks = c(0,24,39,59,79,99))
sub <- gss2021_ZERODraft %>%
select(College,subgrouphis,subgroupBorn,subgroupsex,subgroupincom16,subgroupdisrspct,subgroupnotsmart,age,educ,vstrat,wtssnrps) %>%
filter(complete.cases(.))
Weights introduced
library(srvyr)
##
## Attaching package: 'srvyr'
## The following objects are masked from 'package:plyr':
##
## mutate, rename, summarise, summarize
## The following object is masked from 'package:stats':
##
## filter
options(survey.lonely.psu = "adjust")
des1<-svydesign(ids=~1, strata=~vstrat, weights=~wtssnrps, data=gss2021_ZERODraft)
No Smooth Term
gssgam<-gam(educ ~ age+subgrouphis + subgroupBorn,
data=sub,
weights = wtssnrps,
family=gaussian)
summary(gssgam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## educ ~ age + subgrouphis + subgroupBorn
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.833441 0.223020 66.512 <2e-16 ***
## age -0.003644 0.003228 -1.129 0.2591
## subgrouphis1 -1.551443 0.176606 -8.785 <2e-16 ***
## subgroupBorn -0.333250 0.167924 -1.985 0.0473 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## R-sq.(adj) = 0.0302 Deviance explained = 3.21%
## GCV = 7.2293 Scale est. = 7.217 n = 2351
With Smooth Term
gssgamsmooth<-gam(educ ~ s(age)+subgrouphis + subgroupBorn,
data=sub,
weights = wtssnrps,
family=gaussian)
summary(gssgamsmooth)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## educ ~ s(age) + subgrouphis + subgroupBorn
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.6974 0.1607 91.485 < 2e-16 ***
## subgrouphis1 -1.4386 0.1758 -8.185 4.41e-16 ***
## subgroupBorn -0.3605 0.1664 -2.166 0.0304 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(age) 7.67 8.536 8.607 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0567 Deviance explained = 6.31%
## GCV = 7.0376 Scale est. = 7.0057 n = 2351
LRT Test
anova( gssgamsmooth,
gssgam,
test="Chisq")
## Analysis of Deviance Table
##
## Model 1: educ ~ s(age) + subgrouphis + subgroupBorn
## Model 2: educ ~ age + subgrouphis + subgroupBorn
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 2339.5 16396
## 2 2347.0 16938 -7.5361 -542.66 8.954e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Plots of Smooth
plot(gssgamsmooth)
