#Data manipulation
mydata <- read.table("./anketa.csv", header=TRUE, sep=";")
# Removing GDPR question
mydata <- mydata[,-1]
# Remove "=" in all cells
mydata[] <- lapply(mydata, sub, pattern = "=", replacement = "")
# Filter the data by age
mydata <- mydata[mydata$Q1 >= 1995 & mydata$Q1 <= 2004,]
# Remove all rows, where people didn't finish the questionnaire
mydata <- replace_with_na_all(data = mydata, condition = ~.x == "-3") %>% drop_na()
# Adding ID to each row
mydata$ID <- seq.int(nrow(mydata))
# Moving ID column to the 1st place
mydata <- mydata[,c("ID",setdiff(names(mydata),"ID"))]
# Remove 4 units that did not finish the questionnaire
mydata <- mydata[!(mydata$ID %in% c(159, 178, 182, 216)),]
# Transform all string variables into numerical
mydata[, which(!grepl("text", colnames(mydata)))] <- lapply(mydata[, which(!grepl("text", colnames(mydata)))], as.integer)
# Subset of data for logistic regression
LRdata <- mydata[c(1,2,3,16,21,26,31,76:78,90)]
colnames(LRdata) <- c("ID", "Year", "Cash", "Security", "Acceptance", "Speed", "Control", "Gender", "Education", "Employment", "HomeTown")
LRdata$Employment[LRdata$Employment == 3] <- 1
LRdata = LRdata[LRdata$Employment != "4",]
head(LRdata)
## # A tibble: 6 × 11
## ID Year Cash Secur…¹ Accep…² Speed Control Gender Educa…³ Emplo…⁴ HomeT…⁵
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <dbl> <int>
## 1 1 1999 2 5 5 3 5 1 3 1 2
## 2 2 1999 2 4 4 5 2 2 4 1 3
## 3 3 1996 2 3 5 2 3 2 3 2 5
## 4 4 1998 2 5 5 5 5 1 3 1 1
## 5 5 2000 2 4 5 2 2 1 2 1 2
## 6 6 2000 2 4 3 3 2 2 3 1 3
## # … with abbreviated variable names ¹Security, ²Acceptance, ³Education,
## # ⁴Employment, ⁵HomeTown
# dependent variable: Q2, independent variables: Q6a, Q7a, Q8a, Q9a + all demographic
LRdata$CashF <- factor(LRdata$Cash,
levels = c(2,1),
labels = c("Not_Using", "Using"))
LRdata$GenderF <- factor(LRdata$Gender,
levels = c(1,2),
labels = c("Male", "Female"))
LRdata$EducationF <- factor(LRdata$Education,
levels = c(1:4),
labels = c("Secondary", "Secondary", "Tertiary", "Tertiary"))
LRdata$EmploymentF <- factor(LRdata$Employment,
levels = c(1,2),
labels = c("Student", "Employed"))
LRdata$HomeTownF <- factor(LRdata$HomeTown,
levels = c(1:5),
labels = c("Village", "Village", "Town", "Town", "Town"))
LRdata$Age <- abs(LRdata$Year-2022)
summary(LRdata[,c(-1,-2,-3,-8,-9,-10,-11)])
## Security Acceptance Speed Control CashF
## Min. :1.000 Min. :1.00 Min. :1.000 Min. :1.00 Not_Using:195
## 1st Qu.:3.000 1st Qu.:4.00 1st Qu.:2.000 1st Qu.:2.00 Using : 30
## Median :4.000 Median :5.00 Median :3.000 Median :3.00
## Mean :3.613 Mean :4.56 Mean :2.933 Mean :3.04
## 3rd Qu.:4.000 3rd Qu.:5.00 3rd Qu.:4.000 3rd Qu.:4.00
## Max. :5.000 Max. :5.00 Max. :5.000 Max. :5.00
## GenderF EducationF EmploymentF HomeTownF Age
## Male : 92 Secondary:110 Student :192 Village: 85 Min. :18.00
## Female:133 Tertiary :115 Employed: 33 Town :140 1st Qu.:21.00
## Median :22.00
## Mean :22.24
## 3rd Qu.:23.00
## Max. :27.00
# Intital regression model
INmodel <- glm(CashF ~ 1,
family = binomial,
data = LRdata)
#summary(INmodel)
# Calculating initial odds
exp(cbind(odds = INmodel$coefficients, confint.default(INmodel)))
## odds 2.5 % 97.5 %
## (Intercept) 0.1538462 0.1047496 0.2259544
Pseudo_R2_INmodel <- 197/227
Pseudo_R2_INmodel
## [1] 0.8678414
# Complex regression model
CLRmodel <- glm(CashF ~ Security + Acceptance + Speed + Control + GenderF + EducationF + EmploymentF + HomeTownF + Age,
family = binomial,
data = LRdata)
# Vif statistics for complex regression model
vif(CLRmodel)
## Security Acceptance Speed Control GenderF EducationF
## 1.210901 1.070917 1.169424 1.038087 1.127684 2.256248
## EmploymentF HomeTownF Age
## 1.756353 1.196088 3.144763
mean(vif(CLRmodel))
## [1] 1.552274
# Calculating standardized residuals and Cook's distances
LRdata$StdResid <- rstandard(CLRmodel)
LRdata$CooksD <- cooks.distance(CLRmodel)
# Histogram of standardized residuals - no requirements for normal distribution
hist(LRdata$StdResid,
main = "Histogram of standardized residuals",
ylab = "Frequency",
xlab = "Standardized residuals")
# Listing the highest Cook's distances
head(LRdata[order(-LRdata$CooksD), c("ID", "CooksD")], 20)
## # A tibble: 20 × 2
## ID CooksD
## <int> <dbl>
## 1 118 0.0963
## 2 51 0.0795
## 3 135 0.0663
## 4 211 0.0651
## 5 188 0.0588
## 6 61 0.0512
## 7 75 0.0510
## 8 177 0.0414
## 9 4 0.0361
## 10 149 0.0350
## 11 18 0.0321
## 12 174 0.0319
## 13 59 0.0307
## 14 34 0.0300
## 15 199 0.0295
## 16 209 0.0284
## 17 67 0.0193
## 18 41 0.0192
## 19 40 0.0186
## 20 65 0.0151
# Removing influential units and outliers
#LRdata <- LRdata[!(LRdata$CooksD > (3*mean(LRdata$CooksD))),]
LRdata <- LRdata[!(LRdata$CooksD > 0.03),]
LRdata <- LRdata[!(LRdata$StdResid < -3),]
LRdata <- LRdata[!(LRdata$StdResid > 3),]
# Rerunning regression models
# Complex Model w/ demographics
CLRmodel <- glm(CashF ~ Security + Acceptance + Speed + Control + GenderF + EducationF + EmploymentF + HomeTownF + Age,
family = binomial,
data = LRdata)
# Simple Model w/out demographics
SLRmodel <- glm(CashF ~ Security + Acceptance + Speed + Control,
family = binomial,
data = LRdata)
# Comparision of models based on -2LL statistics
# H0: Simple model is better
# H1: Complex model is better
anova(SLRmodel, CLRmodel, test = "Chi")
## Analysis of Deviance Table
##
## Model 1: CashF ~ Security + Acceptance + Speed + Control
## Model 2: CashF ~ Security + Acceptance + Speed + Control + GenderF + EducationF +
## EmploymentF + HomeTownF + Age
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 207 81.858
## 2 202 60.874 5 20.983 0.000816 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We can reject H0 at p < 0.001. Complex model is better.
# Results of logistic regression analysis
summary(CLRmodel)
##
## Call:
## glm(formula = CashF ~ Security + Acceptance + Speed + Control +
## GenderF + EducationF + EmploymentF + HomeTownF + Age, family = binomial,
## data = LRdata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.49153 -0.18704 -0.04458 -0.00980 2.56250
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -38.2769 9.8068 -3.903 9.5e-05 ***
## Security 1.1705 0.4817 2.430 0.01511 *
## Acceptance -0.1809 0.5372 -0.337 0.73637
## Speed 1.5826 0.4629 3.419 0.00063 ***
## Control 1.5973 0.4921 3.246 0.00117 **
## GenderFFemale 0.1738 0.7625 0.228 0.81968
## EducationFTertiary -4.3974 1.3726 -3.204 0.00136 **
## EmploymentFEmployed -1.7136 1.4280 -1.200 0.23015
## HomeTownFTown -1.8582 0.8781 -2.116 0.03433 *
## Age 1.0465 0.3561 2.939 0.00329 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 132.485 on 211 degrees of freedom
## Residual deviance: 60.874 on 202 degrees of freedom
## AIC: 80.874
##
## Number of Fisher Scoring iterations: 8
vif(CLRmodel)
## Security Acceptance Speed Control GenderF EducationF
## 1.568883 1.099727 1.682955 1.410759 1.271184 4.176813
## EmploymentF HomeTownF Age
## 1.918685 1.795115 4.742666
mean(vif(CLRmodel))
## [1] 2.185199
# Calculating odds
exp(cbind(OR = CLRmodel$coefficients, confint.default(CLRmodel)))
## OR 2.5 % 97.5 %
## (Intercept) 2.379809e-17 1.069035e-25 5.297758e-09
## Security 3.223572e+00 1.253989e+00 8.286688e+00
## Acceptance 8.345454e-01 2.911704e-01 2.391953e+00
## Speed 4.867555e+00 1.964495e+00 1.206065e+01
## Control 4.939805e+00 1.882803e+00 1.296029e+01
## GenderFFemale 1.189851e+00 2.669447e-01 5.303515e+00
## EducationFTertiary 1.230908e-02 8.353599e-04 1.813752e-01
## EmploymentFEmployed 1.802149e-01 1.097071e-02 2.960375e+00
## HomeTownFTown 1.559499e-01 2.789601e-02 8.718231e-01
## Age 2.847527e+00 1.417084e+00 5.721896e+00
Significant coeff.:
# Estimates of probabilities for Y=1: P(Y=1)
LRdata$EstProb <- fitted(CLRmodel)
#head(LRdata[c(1,12:20)],10)
# Classification
LRdata$Classification <- ifelse(test = LRdata$EstProb < 0.50,
yes = "YES",
no = "NO")
# If estimated probability is below 0.50, person is classified into a group of people preferring cash (YES), otherwise NO.
LRdata$ClassificationF <- factor(LRdata$Classification,
levels = c("YES", "NO"),
labels = c("YES", "NO"))
ClassificationTable <- table(LRdata$CashF, LRdata$ClassificationF) #Classification table based on 2 categorical variables.
ClassificationTable
##
## YES NO
## Not_Using 188 4
## Using 9 11
Rows - actual outcome
Column - predicted outcome
Pseudo_R2_fit2 <- (ClassificationTable[1, 1] + ClassificationTable[2, 2] )/ nrow(LRdata)
Pseudo_R2_fit2
## [1] 0.9386792
We correctly classified 93.9% units.