#Data manipulation
mydata <- read.table("./anketa.csv", header=TRUE, sep=";")

# Removing GDPR question 
mydata <- mydata[,-1]

# Remove "=" in all cells
mydata[] <- lapply(mydata, sub, pattern = "=", replacement = "")
# Filter the data by age
mydata <- mydata[mydata$Q1 >= 1995 & mydata$Q1 <= 2004,]
# Remove all rows, where people didn't finish the questionnaire
mydata <- replace_with_na_all(data = mydata, condition = ~.x == "-3") %>% drop_na()

# Adding ID to each row
mydata$ID <- seq.int(nrow(mydata))

# Moving ID column to the 1st place
mydata <- mydata[,c("ID",setdiff(names(mydata),"ID"))]

# Remove 4 units that did not finish the questionnaire
mydata <- mydata[!(mydata$ID %in% c(159, 178, 182, 216)),]

# Transform all string variables into numerical
mydata[, which(!grepl("text", colnames(mydata)))] <- lapply(mydata[, which(!grepl("text", colnames(mydata)))], as.integer)
# Subset of data for logistic regression

LRdata <- mydata[c(1,2,3,16,21,26,31,76:78,90)]

colnames(LRdata) <- c("ID", "Year", "Cash", "Security", "Acceptance", "Speed", "Control", "Gender", "Education", "Employment", "HomeTown")

LRdata$Employment[LRdata$Employment == 3] <- 1
LRdata = LRdata[LRdata$Employment != "4",]

head(LRdata)
## # A tibble: 6 × 11
##      ID  Year  Cash Secur…¹ Accep…² Speed Control Gender Educa…³ Emplo…⁴ HomeT…⁵
##   <int> <int> <int>   <int>   <int> <int>   <int>  <int>   <int>   <dbl>   <int>
## 1     1  1999     2       5       5     3       5      1       3       1       2
## 2     2  1999     2       4       4     5       2      2       4       1       3
## 3     3  1996     2       3       5     2       3      2       3       2       5
## 4     4  1998     2       5       5     5       5      1       3       1       1
## 5     5  2000     2       4       5     2       2      1       2       1       2
## 6     6  2000     2       4       3     3       2      2       3       1       3
## # … with abbreviated variable names ¹​Security, ²​Acceptance, ³​Education,
## #   ⁴​Employment, ⁵​HomeTown
# dependent variable: Q2, independent variables: Q6a, Q7a, Q8a, Q9a + all demographic 
LRdata$CashF <- factor(LRdata$Cash,
                       levels = c(2,1),
                       labels = c("Not_Using", "Using"))

LRdata$GenderF <- factor(LRdata$Gender,
                       levels = c(1,2),
                       labels = c("Male", "Female"))

LRdata$EducationF <- factor(LRdata$Education,
                       levels = c(1:4),
                       labels = c("Secondary", "Secondary", "Tertiary", "Tertiary"))

LRdata$EmploymentF <- factor(LRdata$Employment,
                       levels = c(1,2),
                       labels = c("Student", "Employed"))
                       
LRdata$HomeTownF <- factor(LRdata$HomeTown,
                       levels = c(1:5),
                       labels = c("Village", "Village", "Town", "Town", "Town"))

LRdata$Age <- abs(LRdata$Year-2022)



summary(LRdata[,c(-1,-2,-3,-8,-9,-10,-11)])
##     Security       Acceptance       Speed          Control           CashF    
##  Min.   :1.000   Min.   :1.00   Min.   :1.000   Min.   :1.00   Not_Using:195  
##  1st Qu.:3.000   1st Qu.:4.00   1st Qu.:2.000   1st Qu.:2.00   Using    : 30  
##  Median :4.000   Median :5.00   Median :3.000   Median :3.00                  
##  Mean   :3.613   Mean   :4.56   Mean   :2.933   Mean   :3.04                  
##  3rd Qu.:4.000   3rd Qu.:5.00   3rd Qu.:4.000   3rd Qu.:4.00                  
##  Max.   :5.000   Max.   :5.00   Max.   :5.000   Max.   :5.00                  
##    GenderF        EducationF    EmploymentF    HomeTownF        Age       
##  Male  : 92   Secondary:110   Student :192   Village: 85   Min.   :18.00  
##  Female:133   Tertiary :115   Employed: 33   Town   :140   1st Qu.:21.00  
##                                                            Median :22.00  
##                                                            Mean   :22.24  
##                                                            3rd Qu.:23.00  
##                                                            Max.   :27.00
# Intital regression model
INmodel <- glm(CashF ~ 1,
             family = binomial,
             data = LRdata)

#summary(INmodel)

# Calculating initial odds
exp(cbind(odds = INmodel$coefficients, confint.default(INmodel)))
##                  odds     2.5 %    97.5 %
## (Intercept) 0.1538462 0.1047496 0.2259544
Pseudo_R2_INmodel <- 197/227
Pseudo_R2_INmodel
## [1] 0.8678414
# Complex regression model

CLRmodel <- glm(CashF ~ Security + Acceptance + Speed + Control + GenderF + EducationF + EmploymentF + HomeTownF + Age,
               family = binomial,
               data = LRdata)

# Vif statistics for complex regression model
vif(CLRmodel)
##    Security  Acceptance       Speed     Control     GenderF  EducationF 
##    1.210901    1.070917    1.169424    1.038087    1.127684    2.256248 
## EmploymentF   HomeTownF         Age 
##    1.756353    1.196088    3.144763
mean(vif(CLRmodel))
## [1] 1.552274
# Calculating standardized residuals and Cook's distances
LRdata$StdResid <- rstandard(CLRmodel)
LRdata$CooksD <- cooks.distance(CLRmodel)

# Histogram of standardized residuals - no requirements for normal distribution
hist(LRdata$StdResid,
     main = "Histogram of standardized residuals",
     ylab = "Frequency",
     xlab = "Standardized residuals")

# Listing the highest Cook's distances
head(LRdata[order(-LRdata$CooksD), c("ID", "CooksD")], 20)
## # A tibble: 20 × 2
##       ID CooksD
##    <int>  <dbl>
##  1   118 0.0963
##  2    51 0.0795
##  3   135 0.0663
##  4   211 0.0651
##  5   188 0.0588
##  6    61 0.0512
##  7    75 0.0510
##  8   177 0.0414
##  9     4 0.0361
## 10   149 0.0350
## 11    18 0.0321
## 12   174 0.0319
## 13    59 0.0307
## 14    34 0.0300
## 15   199 0.0295
## 16   209 0.0284
## 17    67 0.0193
## 18    41 0.0192
## 19    40 0.0186
## 20    65 0.0151
# Removing influential units and outliers

#LRdata <- LRdata[!(LRdata$CooksD > (3*mean(LRdata$CooksD))),]

LRdata <- LRdata[!(LRdata$CooksD > 0.03),]

LRdata <- LRdata[!(LRdata$StdResid < -3),]
LRdata <- LRdata[!(LRdata$StdResid > 3),]
# Rerunning regression models

# Complex Model w/ demographics

CLRmodel <- glm(CashF ~ Security + Acceptance + Speed + Control + GenderF + EducationF + EmploymentF + HomeTownF + Age,
               family = binomial,
               data = LRdata)

# Simple Model w/out demographics

SLRmodel <- glm(CashF ~ Security + Acceptance + Speed + Control,
               family = binomial,
               data = LRdata)

# Comparision of models based on -2LL statistics
# H0: Simple model is better
# H1: Complex model is better

anova(SLRmodel, CLRmodel, test = "Chi")
## Analysis of Deviance Table
## 
## Model 1: CashF ~ Security + Acceptance + Speed + Control
## Model 2: CashF ~ Security + Acceptance + Speed + Control + GenderF + EducationF + 
##     EmploymentF + HomeTownF + Age
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)    
## 1       207     81.858                         
## 2       202     60.874  5   20.983 0.000816 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We can reject H0 at p < 0.001. Complex model is better.

# Results of logistic regression analysis
summary(CLRmodel)
## 
## Call:
## glm(formula = CashF ~ Security + Acceptance + Speed + Control + 
##     GenderF + EducationF + EmploymentF + HomeTownF + Age, family = binomial, 
##     data = LRdata)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.49153  -0.18704  -0.04458  -0.00980   2.56250  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -38.2769     9.8068  -3.903  9.5e-05 ***
## Security              1.1705     0.4817   2.430  0.01511 *  
## Acceptance           -0.1809     0.5372  -0.337  0.73637    
## Speed                 1.5826     0.4629   3.419  0.00063 ***
## Control               1.5973     0.4921   3.246  0.00117 ** 
## GenderFFemale         0.1738     0.7625   0.228  0.81968    
## EducationFTertiary   -4.3974     1.3726  -3.204  0.00136 ** 
## EmploymentFEmployed  -1.7136     1.4280  -1.200  0.23015    
## HomeTownFTown        -1.8582     0.8781  -2.116  0.03433 *  
## Age                   1.0465     0.3561   2.939  0.00329 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 132.485  on 211  degrees of freedom
## Residual deviance:  60.874  on 202  degrees of freedom
## AIC: 80.874
## 
## Number of Fisher Scoring iterations: 8
vif(CLRmodel)
##    Security  Acceptance       Speed     Control     GenderF  EducationF 
##    1.568883    1.099727    1.682955    1.410759    1.271184    4.176813 
## EmploymentF   HomeTownF         Age 
##    1.918685    1.795115    4.742666
mean(vif(CLRmodel))
## [1] 2.185199
# Calculating odds
exp(cbind(OR = CLRmodel$coefficients, confint.default(CLRmodel)))
##                               OR        2.5 %       97.5 %
## (Intercept)         2.379809e-17 1.069035e-25 5.297758e-09
## Security            3.223572e+00 1.253989e+00 8.286688e+00
## Acceptance          8.345454e-01 2.911704e-01 2.391953e+00
## Speed               4.867555e+00 1.964495e+00 1.206065e+01
## Control             4.939805e+00 1.882803e+00 1.296029e+01
## GenderFFemale       1.189851e+00 2.669447e-01 5.303515e+00
## EducationFTertiary  1.230908e-02 8.353599e-04 1.813752e-01
## EmploymentFEmployed 1.802149e-01 1.097071e-02 2.960375e+00
## HomeTownFTown       1.559499e-01 2.789601e-02 8.718231e-01
## Age                 2.847527e+00 1.417084e+00 5.721896e+00

EXPLAINATIONS OF RESULTS

Significant coeff.:

# Estimates of probabilities for Y=1: P(Y=1)
LRdata$EstProb <- fitted(CLRmodel)

#head(LRdata[c(1,12:20)],10)

# Classification
LRdata$Classification <- ifelse(test = LRdata$EstProb < 0.50, 
                                yes = "YES", 
                                no = "NO")

# If estimated probability is below 0.50, person is classified into a group of people preferring cash (YES), otherwise NO.

LRdata$ClassificationF <- factor(LRdata$Classification,
                                 levels = c("YES", "NO"), 
                                 labels = c("YES", "NO"))

ClassificationTable <- table(LRdata$CashF, LRdata$ClassificationF) #Classification table based on 2 categorical variables.

ClassificationTable
##            
##             YES  NO
##   Not_Using 188   4
##   Using       9  11

Rows - actual outcome

Column - predicted outcome

Pseudo_R2_fit2 <- (ClassificationTable[1, 1] + ClassificationTable[2, 2] )/ nrow(LRdata)
Pseudo_R2_fit2
## [1] 0.9386792

We correctly classified 93.9% units.