library(readxl)
## Warning: package 'readxl' was built under R version 4.1.3
library(rcompanion)
## Warning: package 'rcompanion' was built under R version 4.1.3
library(ordinal)
## Warning: package 'ordinal' was built under R version 4.1.3
library(foreign)
## Warning: package 'foreign' was built under R version 4.1.3
data <- read_excel( "C:/Users/gunde/Downloads/adult.xlsx")
data
## # A tibble: 27,825 x 21
## Age Workclass_code Workclass Education_code Education Bachelors
## <dbl> <dbl> <chr> <dbl> <chr> <dbl>
## 1 39 7 State-gov 11 Bachelors 1
## 2 50 6 Self-emp-not-inc 11 Bachelors 1
## 3 38 4 Private 9 HS-grad 0
## 4 53 4 Private 7 11th 0
## 5 28 4 Private 11 Bachelors 1
## 6 37 4 Private 12 Masters 1
## 7 49 4 Private 5 9th 0
## 8 52 6 Self-emp-not-inc 9 HS-grad 0
## 9 31 4 Private 12 Masters 1
## 10 42 4 Private 11 Bachelors 1
## # ... with 27,815 more rows, and 15 more variables: Education_yrs <dbl>,
## # Marital_status_code <dbl>, Marital_status <chr>, Occupation_code <dbl>,
## # Occupation <chr>, Family_position_code <dbl>, Family_position <chr>,
## # Race_code <dbl>, Race <chr>, Gender_code <dbl>, Gender <chr>,
## # Hrs_per_week <dbl>, Country <chr>, Income_greater_than_50k_code <dbl>,
## # Income_greater_than_50k <chr>
model_glm <- glm(Income_greater_than_50k_code ~ Bachelors, data = data)
summary(model_glm)
##
## Call:
## glm(formula = Income_greater_than_50k_code ~ Bachelors, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4703 -0.1563 -0.1563 -0.1563 0.8437
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.156344 0.002810 55.63 <2e-16 ***
## Bachelors 0.313952 0.005504 57.04 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1624751)
##
## Null deviance: 5049.2 on 27824 degrees of freedom
## Residual deviance: 4520.5 on 27823 degrees of freedom
## AIC: 28403
##
## Number of Fisher Scoring iterations: 2
nagelkerke(model_glm)
## $Models
##
## Model: "glm, Income_greater_than_50k_code ~ Bachelors, data"
## Null: "glm, Income_greater_than_50k_code ~ 1, data"
##
## $Pseudo.R.squared.for.model.vs.null
## Pseudo.R.squared
## McFadden 0.0977705
## Cox and Snell (ML) 0.1046990
## Nagelkerke (Cragg and Uhler) 0.1545720
##
## $Likelihood.ratio.test
## Df.diff LogLik.diff Chisq p.value
## -1 -1538.7 3077.3 0
##
## $Number.of.observations
##
## Model: 27825
## Null: 27825
##
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
##
## $Warnings
## [1] "None"
AIC(model_glm)
## [1] 28403.5
BIC(model_glm)
## [1] 28428.2
data$Income_greater_than_50k_code <- factor(data$Income_greater_than_50k_code, ordered=TRUE)
data$Education_code <- as.numeric(data$Education_code)
data$Bachelors <- as.numeric(data$Bachelors)
data$Education_yrs <- as.numeric(data$Education_yrs)
model_clm <- clm(Income_greater_than_50k_code ~ Education_code, data = data)
summary(model_clm)
## formula: Income_greater_than_50k_code ~ Education_code
## data: data
##
## link threshold nobs logLik AIC niter max.grad cond.H
## logit flexible 27825 -13782.36 27568.72 6(0) 3.36e-11 7.3e+03
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## Education_code 0.56232 0.01214 46.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Threshold coefficients:
## Estimate Std. Error z value
## 0|1 6.6819 0.1236 54.05
model_clm2 <- clm(Income_greater_than_50k_code ~ Bachelors, data = data)
summary(model_clm2)
## formula: Income_greater_than_50k_code ~ Bachelors
## data: data
##
## link threshold nobs logLik AIC niter max.grad cond.H
## logit flexible 27825 -13934.25 27872.51 5(0) 1.90e-11 6.0e+00
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## Bachelors 1.56673 0.03036 51.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Threshold coefficients:
## Estimate Std. Error z value
## 0|1 1.6857 0.0192 87.81
model_clm3 <- clm(Income_greater_than_50k_code ~ Education_yrs, data = data)
summary(model_clm3)
## formula: Income_greater_than_50k_code ~ Education_yrs
## data: data
##
## link threshold nobs logLik AIC niter max.grad cond.H
## logit flexible 27825 -13641.64 27287.28 5(0) 3.88e-08 3.1e+03
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## Education_yrs 0.351389 0.006673 52.66 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Threshold coefficients:
## Estimate Std. Error z value
## 0|1 4.8359 0.0741 65.26
nagelkerke(model_clm)
## $Models
##
## Model: "clm, Income_greater_than_50k_code ~ Education_code, data"
## Null: "clm, Income_greater_than_50k_code ~ 1, data"
##
## $Pseudo.R.squared.for.model.vs.null
## Pseudo.R.squared
## McFadden 0.0977722
## Cox and Snell (ML) 0.1017920
## Nagelkerke (Cragg and Uhler) 0.1527350
##
## $Likelihood.ratio.test
## Df.diff LogLik.diff Chisq p.value
## -1 -1493.6 2987.1 0
##
## $Number.of.observations
##
## Model: 27825
## Null: 27825
##
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
##
## $Warnings
## [1] "None"
AIC(model_clm)
## [1] 27568.72
BIC(model_clm)
## [1] 27585.18
nagelkerke(model_clm2)
## $Models
##
## Model: "clm, Income_greater_than_50k_code ~ Bachelors, data"
## Null: "clm, Income_greater_than_50k_code ~ 1, data"
##
## $Pseudo.R.squared.for.model.vs.null
## Pseudo.R.squared
## McFadden 0.0878287
## Cox and Snell (ML) 0.0919319
## Nagelkerke (Cragg and Uhler) 0.1379400
##
## $Likelihood.ratio.test
## Df.diff LogLik.diff Chisq p.value
## -1 -1341.7 2683.3 0
##
## $Number.of.observations
##
## Model: 27825
## Null: 27825
##
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
##
## $Warnings
## [1] "None"
AIC(model_clm2)
## [1] 27872.51
BIC(model_clm2)
## [1] 27888.98
nagelkerke(model_clm3)
## $Models
##
## Model: "clm, Income_greater_than_50k_code ~ Education_yrs, data"
## Null: "clm, Income_greater_than_50k_code ~ 1, data"
##
## $Pseudo.R.squared.for.model.vs.null
## Pseudo.R.squared
## McFadden 0.106984
## Cox and Snell (ML) 0.110831
## Nagelkerke (Cragg and Uhler) 0.166298
##
## $Likelihood.ratio.test
## Df.diff LogLik.diff Chisq p.value
## -1 -1634.3 3268.6 0
##
## $Number.of.observations
##
## Model: 27825
## Null: 27825
##
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
##
## $Warnings
## [1] "None"
AIC(model_clm3)
## [1] 27287.28
BIC(model_clm3)
## [1] 27303.74