Logistic Regression

 options(repos = c(CRAN = "https://cloud.r-project.org"))
# The option above pins the CRAN mirror, so install.packages() never has to
# prompt for one.
# Packages to install.
# "car" is installed only when it is not already available, so re-running the
# script does not download and reinstall it every time.
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car")
}

## Installing package into 'C:/Users/chris/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)

## package 'car' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\chris\AppData\Local\Temp\Rtmpee2ldJ\downloaded_packages

# Install "rcompanion" only when it is not already available, so re-running
# the script does not reinstall it every time.
if (!requireNamespace("rcompanion", quietly = TRUE)) {
  install.packages("rcompanion")
}

## Installing package into 'C:/Users/chris/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)

## 
##   There is a binary version available but the source version is later:
##            binary source needs_compilation
## rcompanion  2.5.0  2.5.1             FALSE

## installing the source package 'rcompanion'

# Install "lmtest" only when it is not already available, so re-running the
# script does not reinstall it every time.
if (!requireNamespace("lmtest", quietly = TRUE)) {
  install.packages("lmtest")
}

## Installing package into 'C:/Users/chris/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)

## package 'lmtest' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\chris\AppData\Local\Temp\Rtmpee2ldJ\downloaded_packages

# Install "pastecs" only when it is not already available, so re-running the
# script does not reinstall it every time.
if (!requireNamespace("pastecs", quietly = TRUE)) {
  install.packages("pastecs")
}

## Installing package into 'C:/Users/chris/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)

## package 'pastecs' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\chris\AppData\Local\Temp\Rtmpee2ldJ\downloaded_packages

library("car")

## Warning: package 'car' was built under R version 4.5.2

## Loading required package: carData

library(rcompanion)
library(lmtest)

## Warning: package 'lmtest' was built under R version 4.5.2

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library("pastecs")

## Warning: package 'pastecs' was built under R version 4.5.2

# Getting the data ready using a CSV file.
# file.choose() opens an interactive file picker on every platform, whereas
# choose.files() is only available on Windows.
Data <- read.csv(file.choose(), header = TRUE)
# Put the columns of Data on the search path so they can be used by bare name.
attach(Data)
# Print numbers in fixed notation rather than scientific notation.
options(scipen = 999)
# Create data frame with assignment 4 variables
DataFrameLR <- Data[, c(
  "left",
  "satisfaction_level",
  "last_evaluation",
  "average_monthly_hours"
)]
# Descriptive statistics (mean and SD) and correlation
stat.desc(DataFrameLR)

##                         left satisfaction_level last_evaluation
## nbr.val      14999.000000000    14999.000000000 14999.000000000
## nbr.null     11428.000000000        0.000000000     0.000000000
## nbr.na           0.000000000        0.000000000     0.000000000
## min              0.000000000        0.090000000     0.360000000
## max              1.000000000        1.000000000     1.000000000
## range            1.000000000        0.910000000     0.640000000
## sum           3571.000000000     9191.890000000 10740.810000000
## median           0.000000000        0.640000000     0.720000000
## mean             0.238082539        0.612833522     0.716101740
## SE.mean          0.003477772        0.002030128     0.001397637
## CI.mean.0.95     0.006816857        0.003979300     0.002739538
## var              0.181411338        0.061817201     0.029298864
## std.dev          0.425924099        0.248630651     0.171169111
## coef.var         1.788976636        0.405706676     0.239029039
##              average_monthly_hours
## nbr.val              14999.0000000
## nbr.null                 0.0000000
## nbr.na                   0.0000000
## min                     96.0000000
## max                    310.0000000
## range                  214.0000000
## sum                3015554.0000000
## median                 200.0000000
## mean                   201.0503367
## SE.mean                  0.4077973
## CI.mean.0.95             0.7993325
## var                   2494.3131748
## std.dev                 49.9430994
## coef.var                 0.2484109

# Pairwise Pearson correlations between the outcome and the three predictors.
cor(DataFrameLR, method = "pearson")

##                              left satisfaction_level last_evaluation
## left                   1.00000000        -0.38837498      0.00656712
## satisfaction_level    -0.38837498         1.00000000      0.10502121
## last_evaluation        0.00656712         0.10502121      1.00000000
## average_monthly_hours  0.07128718        -0.02004811      0.33974180
##                       average_monthly_hours
## left                             0.07128718
## satisfaction_level              -0.02004811
## last_evaluation                  0.33974180
## average_monthly_hours            1.00000000

# Linearity assumption interactions: for each continuous predictor, add an
# x * log(x) term for the linearity-of-the-logit (Box-Tidwell style) check.
# Columns are read from Data explicitly, so the result does not depend on what
# is attached to the search path, and Data is not attached a second time
# (which would mask the earlier copy).
Data$Log_Sat <- log(Data$satisfaction_level) * Data$satisfaction_level
Data$Log_Eval <- log(Data$last_evaluation) * Data$last_evaluation
Data$Log_Avg_Hours <- log(Data$average_monthly_hours) *
  Data$average_monthly_hours

# Confirm the three new columns were added.
names(Data)

##  [1] "satisfaction_level"    "last_evaluation"       "number_project"       
##  [4] "average_monthly_hours" "time_spend_company"    "Work_accident"        
##  [7] "left"                  "promotion_last_5years" "department"           
## [10] "salary"                "Log_Sat"               "Log_Eval"             
## [13] "Log_Avg_Hours"

# Test of linearity assumption violations: fit a logistic model containing
# each predictor together with its Log_* companion term, then inspect the
# Log_* rows of the coefficient table.
model_assumption <- glm(
  left ~ satisfaction_level + last_evaluation + average_monthly_hours +
    Log_Sat + Log_Eval + Log_Avg_Hours,
  family = binomial(link = "logit"),
  data = Data
)
summary(model_assumption)

## 
## Call:
## glm(formula = left ~ satisfaction_level + last_evaluation + average_monthly_hours + 
##     Log_Sat + Log_Eval + Log_Avg_Hours, family = binomial(link = "logit"), 
##     data = Data)
## 
## Coefficients:
##                         Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)            30.979109   1.029126  30.102 < 0.0000000000000002 ***
## satisfaction_level     -3.906234   0.128646 -30.364 < 0.0000000000000002 ***
## last_evaluation       -15.837704   0.730589 -21.678 < 0.0000000000000002 ***
## average_monthly_hours  -0.426574   0.022868 -18.654 < 0.0000000000000002 ***
## Log_Sat                 2.108074   0.355440   5.931        0.00000000301 ***
## Log_Eval               25.001119   1.120784  22.307 < 0.0000000000000002 ***
## Log_Avg_Hours           0.068116   0.003643  18.700 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16465  on 14998  degrees of freedom
## Residual deviance: 13185  on 14992  degrees of freedom
## AIC: 13199
## 
## Number of Fisher Scoring iterations: 5

# LR model: binomial GLM with a logit link, regressing `left` on satisfaction
# level, last evaluation, and average monthly hours.
model <- glm(
  left ~ satisfaction_level + last_evaluation + average_monthly_hours,
  family = binomial(link = "logit"),
  data = Data
)
summary(model)

## 
## Call:
## glm(formula = left ~ satisfaction_level + last_evaluation + average_monthly_hours, 
##     family = binomial(link = "logit"), data = Data)
## 
## Coefficients:
##                         Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)            0.3992286  0.1104972   3.613             0.000303 ***
## satisfaction_level    -3.8139121  0.0879776 -43.351 < 0.0000000000000002 ***
## last_evaluation        0.2838230  0.1320874   2.149             0.031654 *  
## average_monthly_hours  0.0018329  0.0004414   4.153            0.0000329 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16465  on 14998  degrees of freedom
## Residual deviance: 14163  on 14995  degrees of freedom
## AIC: 14171
## 
## Number of Fisher Scoring iterations: 4

# Chi-squared difference significance.
# Model chi-square = null deviance minus residual deviance of the fitted model.
model_chi <- model[["null.deviance"]] - model[["deviance"]]
model_chi

## [1] 2302.049

# Degrees of freedom for the model chi-square: null df minus residual df.
chidf <- model[["df.null"]] - model[["df.residual"]]
chidf

## [1] 3

# Upper-tail p-value of the model chi-square. Requesting the upper tail from
# pchisq() directly avoids the floating-point cancellation in 1 - pchisq(...),
# which turns any p-value smaller than machine epsilon into exactly 0.
chisq.prob <- pchisq(model_chi, chidf, lower.tail = FALSE)
chisq.prob

## [1] 0

# Wald statistic: Type II Wald chi-square test for each predictor.
# The argument is spelled out in full as test.statistic; the short form
# `test` only works through partial argument matching.
Anova(model, type = "II", test.statistic = "Wald")

## Analysis of Deviance Table (Type II tests)
## 
## Response: left
##                       Df     Chisq            Pr(>Chisq)    
## satisfaction_level     1 1879.3047 < 0.00000000000000022 ***
## last_evaluation        1    4.6171               0.03165 *  
## average_monthly_hours  1   17.2440            0.00003287 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Odds ratios: exponentiate the fitted log-odds coefficients.
exp(coef(model))

##           (Intercept)    satisfaction_level       last_evaluation 
##             1.4906744             0.0220617             1.3281978 
## average_monthly_hours 
##             1.0018346

# Confidence limits for the coefficients, exponentiated onto the odds-ratio
# scale.
confint(model) |> exp()

## Waiting for profiling to be done...

##                            2.5 %     97.5 %
## (Intercept)           1.20028074 1.85101188
## satisfaction_level    0.01855121 0.02619142
## last_evaluation       1.02522851 1.72070465
## average_monthly_hours 1.00096846 1.00270196

# Pseudo-R-squared measures (McFadden, Cox and Snell, Nagelkerke) for the
# fitted model versus the null model, plus the likelihood ratio test.
nagelkerke(fit = model)

## $Models
##                                                                                                                    
## Model: "glm, left ~ satisfaction_level + last_evaluation + average_monthly_hours, binomial(link = \"logit\"), Data"
## Null:  "glm, left ~ 1, binomial(link = \"logit\"), Data"                                                           
## 
## $Pseudo.R.squared.for.model.vs.null
##                              Pseudo.R.squared
## McFadden                             0.139817
## Cox and Snell (ML)                   0.142282
## Nagelkerke (Cragg and Uhler)         0.213519
## 
## $Likelihood.ratio.test
##  Df.diff LogLik.diff Chisq p.value
##       -3       -1151  2302       0
## 
## $Number.of.observations
##             
## Model: 14999
## Null:  14999
## 
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
## 
## $Warnings
## [1] "None"

#Stay Odds Ratio (no code for this): compute 1 / odds ratio for each predictor; see the example walk-through for more details.

Logistic Regression

Christabel

2025-11-21