library(readxl)
## Warning: package 'readxl' was built under R version 4.1.3
library(rcompanion)
## Warning: package 'rcompanion' was built under R version 4.1.3
library(ordinal)
## Warning: package 'ordinal' was built under R version 4.1.3
library(foreign)
## Warning: package 'foreign' was built under R version 4.1.3
data <- read_excel( "C:/Users/gunde/Downloads/adult.xlsx")
data
## # A tibble: 27,825 x 21
##      Age Workclass_code Workclass        Education_code Education Bachelors
##    <dbl>          <dbl> <chr>                     <dbl> <chr>         <dbl>
##  1    39              7 State-gov                    11 Bachelors         1
##  2    50              6 Self-emp-not-inc             11 Bachelors         1
##  3    38              4 Private                       9 HS-grad           0
##  4    53              4 Private                       7 11th              0
##  5    28              4 Private                      11 Bachelors         1
##  6    37              4 Private                      12 Masters           1
##  7    49              4 Private                       5 9th               0
##  8    52              6 Self-emp-not-inc              9 HS-grad           0
##  9    31              4 Private                      12 Masters           1
## 10    42              4 Private                      11 Bachelors         1
## # ... with 27,815 more rows, and 15 more variables: Education_yrs <dbl>,
## #   Marital_status_code <dbl>, Marital_status <chr>, Occupation_code <dbl>,
## #   Occupation <chr>, Family_position_code <dbl>, Family_position <chr>,
## #   Race_code <dbl>, Race <chr>, Gender_code <dbl>, Gender <chr>,
## #   Hrs_per_week <dbl>, Country <chr>, Income_greater_than_50k_code <dbl>,
## #   Income_greater_than_50k <chr>
model_glm <- glm(Income_greater_than_50k_code ~ Bachelors, data = data) 
summary(model_glm)
## 
## Call:
## glm(formula = Income_greater_than_50k_code ~ Bachelors, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4703  -0.1563  -0.1563  -0.1563   0.8437  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.156344   0.002810   55.63   <2e-16 ***
## Bachelors   0.313952   0.005504   57.04   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1624751)
## 
##     Null deviance: 5049.2  on 27824  degrees of freedom
## Residual deviance: 4520.5  on 27823  degrees of freedom
## AIC: 28403
## 
## Number of Fisher Scoring iterations: 2
nagelkerke(model_glm)
## $Models
##                                                             
## Model: "glm, Income_greater_than_50k_code ~ Bachelors, data"
## Null:  "glm, Income_greater_than_50k_code ~ 1, data"        
## 
## $Pseudo.R.squared.for.model.vs.null
##                              Pseudo.R.squared
## McFadden                            0.0977705
## Cox and Snell (ML)                  0.1046990
## Nagelkerke (Cragg and Uhler)        0.1545720
## 
## $Likelihood.ratio.test
##  Df.diff LogLik.diff  Chisq p.value
##       -1     -1538.7 3077.3       0
## 
## $Number.of.observations
##             
## Model: 27825
## Null:  27825
## 
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
## 
## $Warnings
## [1] "None"
AIC(model_glm)
## [1] 28403.5
BIC(model_glm)
## [1] 28428.2
data$Income_greater_than_50k_code <- factor(data$Income_greater_than_50k_code, ordered=TRUE)
data$Education_code <- as.numeric(data$Education_code)
data$Bachelors <- as.numeric(data$Bachelors)
data$Education_yrs <- as.numeric(data$Education_yrs)

model_clm <- clm(Income_greater_than_50k_code ~ Education_code, data = data) 
summary(model_clm)
## formula: Income_greater_than_50k_code ~ Education_code
## data:    data
## 
##  link  threshold nobs  logLik    AIC      niter max.grad cond.H 
##  logit flexible  27825 -13782.36 27568.72 6(0)  3.36e-11 7.3e+03
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## Education_code  0.56232    0.01214    46.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Threshold coefficients:
##     Estimate Std. Error z value
## 0|1   6.6819     0.1236   54.05
model_clm2 <- clm(Income_greater_than_50k_code ~ Bachelors, data = data) 
summary(model_clm2)
## formula: Income_greater_than_50k_code ~ Bachelors
## data:    data
## 
##  link  threshold nobs  logLik    AIC      niter max.grad cond.H 
##  logit flexible  27825 -13934.25 27872.51 5(0)  1.90e-11 6.0e+00
## 
## Coefficients:
##           Estimate Std. Error z value Pr(>|z|)    
## Bachelors  1.56673    0.03036    51.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Threshold coefficients:
##     Estimate Std. Error z value
## 0|1   1.6857     0.0192   87.81
model_clm3 <- clm(Income_greater_than_50k_code ~ Education_yrs, data = data) 
summary(model_clm3)
## formula: Income_greater_than_50k_code ~ Education_yrs
## data:    data
## 
##  link  threshold nobs  logLik    AIC      niter max.grad cond.H 
##  logit flexible  27825 -13641.64 27287.28 5(0)  3.88e-08 3.1e+03
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## Education_yrs 0.351389   0.006673   52.66   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Threshold coefficients:
##     Estimate Std. Error z value
## 0|1   4.8359     0.0741   65.26
nagelkerke(model_clm)
## $Models
##                                                                  
## Model: "clm, Income_greater_than_50k_code ~ Education_code, data"
## Null:  "clm, Income_greater_than_50k_code ~ 1, data"             
## 
## $Pseudo.R.squared.for.model.vs.null
##                              Pseudo.R.squared
## McFadden                            0.0977722
## Cox and Snell (ML)                  0.1017920
## Nagelkerke (Cragg and Uhler)        0.1527350
## 
## $Likelihood.ratio.test
##  Df.diff LogLik.diff  Chisq p.value
##       -1     -1493.6 2987.1       0
## 
## $Number.of.observations
##             
## Model: 27825
## Null:  27825
## 
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
## 
## $Warnings
## [1] "None"
AIC(model_clm)
## [1] 27568.72
BIC(model_clm)
## [1] 27585.18
nagelkerke(model_clm2)
## $Models
##                                                             
## Model: "clm, Income_greater_than_50k_code ~ Bachelors, data"
## Null:  "clm, Income_greater_than_50k_code ~ 1, data"        
## 
## $Pseudo.R.squared.for.model.vs.null
##                              Pseudo.R.squared
## McFadden                            0.0878287
## Cox and Snell (ML)                  0.0919319
## Nagelkerke (Cragg and Uhler)        0.1379400
## 
## $Likelihood.ratio.test
##  Df.diff LogLik.diff  Chisq p.value
##       -1     -1341.7 2683.3       0
## 
## $Number.of.observations
##             
## Model: 27825
## Null:  27825
## 
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
## 
## $Warnings
## [1] "None"
AIC(model_clm2)
## [1] 27872.51
BIC(model_clm2)
## [1] 27888.98
nagelkerke(model_clm3)
## $Models
##                                                                 
## Model: "clm, Income_greater_than_50k_code ~ Education_yrs, data"
## Null:  "clm, Income_greater_than_50k_code ~ 1, data"            
## 
## $Pseudo.R.squared.for.model.vs.null
##                              Pseudo.R.squared
## McFadden                             0.106984
## Cox and Snell (ML)                   0.110831
## Nagelkerke (Cragg and Uhler)         0.166298
## 
## $Likelihood.ratio.test
##  Df.diff LogLik.diff  Chisq p.value
##       -1     -1634.3 3268.6       0
## 
## $Number.of.observations
##             
## Model: 27825
## Null:  27825
## 
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
## 
## $Warnings
## [1] "None"
AIC(model_clm3)
## [1] 27287.28
BIC(model_clm3)
## [1] 27303.74