stats.R

library(readxl)

## Warning: package 'readxl' was built under R version 4.1.3

library(rcompanion)

## Warning: package 'rcompanion' was built under R version 4.1.3

library(ordinal)

## Warning: package 'ordinal' was built under R version 4.1.3

library(foreign)

## Warning: package 'foreign' was built under R version 4.1.3

data <- read_excel( "C:/Users/gunde/Downloads/adult.xlsx")
data

## # A tibble: 27,825 x 21
##      Age Workclass_code Workclass        Education_code Education Bachelors
##    <dbl>          <dbl> <chr>                     <dbl> <chr>         <dbl>
##  1    39              7 State-gov                    11 Bachelors         1
##  2    50              6 Self-emp-not-inc             11 Bachelors         1
##  3    38              4 Private                       9 HS-grad           0
##  4    53              4 Private                       7 11th              0
##  5    28              4 Private                      11 Bachelors         1
##  6    37              4 Private                      12 Masters           1
##  7    49              4 Private                       5 9th               0
##  8    52              6 Self-emp-not-inc              9 HS-grad           0
##  9    31              4 Private                      12 Masters           1
## 10    42              4 Private                      11 Bachelors         1
## # ... with 27,815 more rows, and 15 more variables: Education_yrs <dbl>,
## #   Marital_status_code <dbl>, Marital_status <chr>, Occupation_code <dbl>,
## #   Occupation <chr>, Family_position_code <dbl>, Family_position <chr>,
## #   Race_code <dbl>, Race <chr>, Gender_code <dbl>, Gender <chr>,
## #   Hrs_per_week <dbl>, Country <chr>, Income_greater_than_50k_code <dbl>,
## #   Income_greater_than_50k <chr>

model_glm <- glm(Income_greater_than_50k_code ~ Bachelors, data = data) 
summary(model_glm)

## 
## Call:
## glm(formula = Income_greater_than_50k_code ~ Bachelors, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4703  -0.1563  -0.1563  -0.1563   0.8437  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.156344   0.002810   55.63   <2e-16 ***
## Bachelors   0.313952   0.005504   57.04   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1624751)
## 
##     Null deviance: 5049.2  on 27824  degrees of freedom
## Residual deviance: 4520.5  on 27823  degrees of freedom
## AIC: 28403
## 
## Number of Fisher Scoring iterations: 2

nagelkerke(model_glm)

## $Models
##                                                             
## Model: "glm, Income_greater_than_50k_code ~ Bachelors, data"
## Null:  "glm, Income_greater_than_50k_code ~ 1, data"        
## 
## $Pseudo.R.squared.for.model.vs.null
##                              Pseudo.R.squared
## McFadden                            0.0977705
## Cox and Snell (ML)                  0.1046990
## Nagelkerke (Cragg and Uhler)        0.1545720
## 
## $Likelihood.ratio.test
##  Df.diff LogLik.diff  Chisq p.value
##       -1     -1538.7 3077.3       0
## 
## $Number.of.observations
##             
## Model: 27825
## Null:  27825
## 
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
## 
## $Warnings
## [1] "None"

AIC(model_glm)

## [1] 28403.5

BIC(model_glm)

## [1] 28428.2

data$Income_greater_than_50k_code <- factor(data$Income_greater_than_50k_code, ordered=TRUE)
data$Education_code <- as.numeric(data$Education_code)
data$Bachelors <- as.numeric(data$Bachelors)
data$Education_yrs <- as.numeric(data$Education_yrs)

model_clm <- clm(Income_greater_than_50k_code ~ Education_code, data = data) 
summary(model_clm)

## formula: Income_greater_than_50k_code ~ Education_code
## data:    data
## 
##  link  threshold nobs  logLik    AIC      niter max.grad cond.H 
##  logit flexible  27825 -13782.36 27568.72 6(0)  3.36e-11 7.3e+03
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## Education_code  0.56232    0.01214    46.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Threshold coefficients:
##     Estimate Std. Error z value
## 0|1   6.6819     0.1236   54.05

model_clm2 <- clm(Income_greater_than_50k_code ~ Bachelors, data = data) 
summary(model_clm2)

## formula: Income_greater_than_50k_code ~ Bachelors
## data:    data
## 
##  link  threshold nobs  logLik    AIC      niter max.grad cond.H 
##  logit flexible  27825 -13934.25 27872.51 5(0)  1.90e-11 6.0e+00
## 
## Coefficients:
##           Estimate Std. Error z value Pr(>|z|)    
## Bachelors  1.56673    0.03036    51.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Threshold coefficients:
##     Estimate Std. Error z value
## 0|1   1.6857     0.0192   87.81

model_clm3 <- clm(Income_greater_than_50k_code ~ Education_yrs, data = data) 
summary(model_clm3)

## formula: Income_greater_than_50k_code ~ Education_yrs
## data:    data
## 
##  link  threshold nobs  logLik    AIC      niter max.grad cond.H 
##  logit flexible  27825 -13641.64 27287.28 5(0)  3.88e-08 3.1e+03
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## Education_yrs 0.351389   0.006673   52.66   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Threshold coefficients:
##     Estimate Std. Error z value
## 0|1   4.8359     0.0741   65.26

nagelkerke(model_clm)

## $Models
##                                                                  
## Model: "clm, Income_greater_than_50k_code ~ Education_code, data"
## Null:  "clm, Income_greater_than_50k_code ~ 1, data"             
## 
## $Pseudo.R.squared.for.model.vs.null
##                              Pseudo.R.squared
## McFadden                            0.0977722
## Cox and Snell (ML)                  0.1017920
## Nagelkerke (Cragg and Uhler)        0.1527350
## 
## $Likelihood.ratio.test
##  Df.diff LogLik.diff  Chisq p.value
##       -1     -1493.6 2987.1       0
## 
## $Number.of.observations
##             
## Model: 27825
## Null:  27825
## 
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
## 
## $Warnings
## [1] "None"

AIC(model_clm)

## [1] 27568.72

BIC(model_clm)

## [1] 27585.18

nagelkerke(model_clm2)

## $Models
##                                                             
## Model: "clm, Income_greater_than_50k_code ~ Bachelors, data"
## Null:  "clm, Income_greater_than_50k_code ~ 1, data"        
## 
## $Pseudo.R.squared.for.model.vs.null
##                              Pseudo.R.squared
## McFadden                            0.0878287
## Cox and Snell (ML)                  0.0919319
## Nagelkerke (Cragg and Uhler)        0.1379400
## 
## $Likelihood.ratio.test
##  Df.diff LogLik.diff  Chisq p.value
##       -1     -1341.7 2683.3       0
## 
## $Number.of.observations
##             
## Model: 27825
## Null:  27825
## 
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
## 
## $Warnings
## [1] "None"

AIC(model_clm2)

## [1] 27872.51

BIC(model_clm2)

## [1] 27888.98

nagelkerke(model_clm3)

## $Models
##                                                                 
## Model: "clm, Income_greater_than_50k_code ~ Education_yrs, data"
## Null:  "clm, Income_greater_than_50k_code ~ 1, data"            
## 
## $Pseudo.R.squared.for.model.vs.null
##                              Pseudo.R.squared
## McFadden                             0.106984
## Cox and Snell (ML)                   0.110831
## Nagelkerke (Cragg and Uhler)         0.166298
## 
## $Likelihood.ratio.test
##  Df.diff LogLik.diff  Chisq p.value
##       -1     -1634.3 3268.6       0
## 
## $Number.of.observations
##             
## Model: 27825
## Null:  27825
## 
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
## 
## $Warnings
## [1] "None"

AIC(model_clm3)

## [1] 27287.28

BIC(model_clm3)

## [1] 27303.74

stats.R

gunde

2022-12-12