#install.packages
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#install.packages("ggpubr")
library(ggpubr)
## Loading required package: ggplot2
#install.packages("DescTools")
library(DescTools)
mydata_excel1 <- read_excel("~/Desktop/NLB projekt/Logistična regresija/1ka data.xlsx")
mydata_excel2 <- read_excel("~/Desktop/NLB projekt/Logistična regresija/anketa_končni podatki.xlsx")
mydata_excel2 <- mydata_excel2[-1, ] #Delete first row in which the questions are written
mydata_excel2$ID <- seq(1,nrow(mydata_excel2))
mydata2 <- mydata_excel2[!(apply(mydata_excel2 == -3, 1, any)), ]
mydata2 <- subset(mydata2, select = -c(Q21:Q40))
mydata2$ID <- seq(1,nrow(mydata2))
mydata2$Q2 <- factor(mydata2$Q2,
levels = c(1, 2),
labels = c("Yes","No"))
mydata2$Q3 <- factor(mydata2$Q3,
levels = c(1, 2),
labels = c("Yes","No"))
mydata2$Q4 <- factor(mydata2$Q4,
levels = c(1, 2, 3),
labels = c("Yes","No", "I don't know"))
mydata2$Q5 <- factor(mydata2$Q5,
levels = c(1, 2),
labels = c("Yes","No"))
mydata2$Q7 <- factor(mydata2$Q7,
levels = c(1, 2),
labels = c("Yes","No"))
mydata2$Q16a <- factor(mydata2$Q16a,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q16b <- factor(mydata2$Q16b,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q16c <- factor(mydata2$Q16c,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q16d <- factor(mydata2$Q16d,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q16e <- factor(mydata2$Q16e,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q16f <- factor(mydata2$Q16f,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q16g <- factor(mydata2$Q16g,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q41 <- factor(mydata2$Q41,
levels = c(1, 2, 3),
labels = c("Female","Male", "I don't want to answer"))
mydata2$Q43a <- factor(mydata2$Q43a,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q43b <- factor(mydata2$Q43b,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q43c <- factor(mydata2$Q43c,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q43d <- factor(mydata2$Q43d,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q43e <- factor(mydata2$Q43e,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q43f <- factor(mydata2$Q43f,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q43g <- factor(mydata2$Q43g,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q43h <- factor(mydata2$Q43h,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q44 <- factor(mydata2$Q44,
levels = c(2, 6, 1, 4, 3, 5),
labels = c("1.000 – 5.000 habitants","More than 100.000 habitants", "Less than 1.000 habitants", "20.001 – 50.000 habitants", "5.001 – 20.000 habitants", "50.001 – 100.000 habitants"))
mydata2$Q45 <- factor(mydata2$Q45,
levels = c(3, 5, 1, 9, 12, 7, 10, 4, 11, 6),
labels = c("OTP banka d.d.","Banka Intesa Sanpaolo d.d.", "Nova Ljubljanska Banka d.d. (NLB)", "Gorenjska Banka d.d.", "Delavska Hranilnica d.d.", "Revolut", "Deželna Banka Slovenije d.d.", "Banka Sparkasse d.d.", "Addiko Bank d.d.", "UniCredit Banka Slovenija d.d."))
mydata2$Q46 <- factor(mydata2$Q46,
levels = c(1, 2, 3, 5, 6, 4),
labels = c("Študent/-ka","Redno zaposlen/-a", "Upokojen/-a", "Samozaposlen/-a", "Delno zaposlen/-a", "Brezposeln/-a"))
mydata2$Q47 <- factor(mydata2$Q47,
levels = c(1, 5, 3, 8, 2, 4, 6, 7),
labels = c("Pod 1.000€","3.001€ - 5.000€", "1.501€ - 2.000€", "I don't want to answer", "1.000€ - 1.500€", "2.001€ - 3.000€", "5.001€ - 10.000€", "Above 10.000€"))
mydata2$Q48 <- factor(mydata2$Q48,
levels = c(2, 6, 3, 5, 7, 4),
labels = c("Dokončana osnovna šola","Dokončana visokošolska strokovna univerzitetna izobrazba (tudi 2. bolonjska stopnja)", "Dokončana nižja ali srednja poklicna izobrazba", "Dokončana višješolska strokovna ali visokošolska strokovna izobrazba (tudi 1. bolonjska stopnja)", "Dokončana specializacija, znanstveni magisterij, doktorat", "Dokončana srednja strokovna ali splošna izobrazba"))
mydata2$Q49 <- factor(mydata2$Q49,
levels = c(1, 2, 3),
labels = c("Preko linka","Na tablici", "Na listu papirja"))
mydata2[c(6:14, 18:22, 25:39, 48:55, 58:62)] <- mydata2[c(6:14, 18:22, 25:39, 48:55, 58:62)] %>% mutate_all(as.numeric)
mydata2 <- mydata2 %>% mutate(across(where(is.numeric), ~ replace(., . == -2, mean(.[. != -2], na.rm = TRUE))))
mydata2 <- mydata2 %>% mutate(across(where(is.numeric), ~ replace(., is.na(.), mean(., na.rm = TRUE))))
mydata2 <- mydata2 %>%
filter(!ID %in% c(2, 16, 17))
mydata2$ID <- seq(1, nrow(mydata2))
summary(mydata2[c(-1, -15, -16, -23, -24, -47, -56, -57, -63, -64, -66, -75, -78, -83)])
## Q2 Q3 Q4 Q5 Q6a
## Yes :121 Yes :131 Yes :103 Yes : 25 Min. :1.000
## No : 36 No : 26 No : 28 No :132 1st Qu.:3.000
## NA's: 1 NA's: 1 I don't know: 26 NA's: 1 Median :3.000
## NA's : 1 Mean :3.433
## 3rd Qu.:4.000
## Max. :5.000
##
## Q6b Q6c Q6d Q6e
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:1.000
## Median :4.000 Median :4.000 Median :4.000 Median :2.000
## Mean :3.535 Mean :3.847 Mean :3.854 Mean :2.688
## 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
##
## Q6f Q6g Q6h Q6i Q7
## Min. :1.000 Min. :1.000 Min. :1.00 Min. :1.000 Yes :82
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:2.00 1st Qu.:3.000 No :75
## Median :4.000 Median :3.086 Median :3.00 Median :4.000 NA's: 1
## Mean :3.497 Mean :3.172 Mean :2.93 Mean :3.465
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.00 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.00 Max. :5.000
##
## Podpora in svetovanje
## Min. :1.00
## 1st Qu.:3.00
## Median :4.00
## Mean :3.79
## 3rd Qu.:4.00
## Max. :5.00
##
## Zagotavljanje varnosti in zaščita vaših finančnih podatkov ter transakcij
## Min. :1.000
## 1st Qu.:4.000
## Median :5.000
## Mean :4.382
## 3rd Qu.:5.000
## Max. :5.000
##
## Transparentnost delovanja banke in njenih storitev Priročnost
## Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:4.000
## Median :4.000 Median :4.000
## Mean :4.108 Mean :3.975
## 3rd Qu.:5.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000
##
## Hitrost in zanesljivost opravljene storitve Poslovalnica_Podpora in svetovanje
## Min. :1.000 Min. :2.000
## 1st Qu.:4.000 1st Qu.:4.000
## Median :4.000 Median :4.000
## Mean :4.185 Mean :4.006
## 3rd Qu.:5.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000
##
## Mobilna banka_Podpora in svetovanje
## Min. :1.00
## 1st Qu.:3.00
## Median :3.00
## Mean :3.14
## 3rd Qu.:4.00
## Max. :5.00
##
## Poslovalnica_Zagotavljanje varnosti in zaščita vaših finančnih podatkov ter transakcij
## Min. :2.000
## 1st Qu.:3.986
## Median :4.000
## Mean :3.981
## 3rd Qu.:5.000
## Max. :5.000
##
## Mobilna banka_Zagotavljanje varnosti in zaščita vaših finančnih podatkov ter transakcij
## Min. :1.000
## 1st Qu.:2.000
## Median :3.000
## Mean :3.032
## 3rd Qu.:4.000
## Max. :5.000
##
## Poslovalnica_Transparentnost delovanja banke in njenih storitev
## Min. :1.000
## 1st Qu.:3.000
## Median :4.000
## Mean :3.898
## 3rd Qu.:4.000
## Max. :5.000
##
## Mobilna banka_Transparentnost delovanja banke in njenih storitev
## Min. :1.000
## 1st Qu.:3.000
## Median :3.000
## Mean :3.248
## 3rd Qu.:4.000
## Max. :5.000
##
## Poslovalnica_Priročnost Mobilna banka_Priročnost
## Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000
## Median :4.000 Median :3.000
## Mean :3.752 Mean :3.389
## 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000
##
## Poslovalnica_Hitrost in zanesljivost opravljene storitve
## Min. :1.000
## 1st Qu.:3.000
## Median :4.000
## Mean :3.701
## 3rd Qu.:4.000
## Max. :5.000
##
## Mobilna banka_Hitrost in zanesljivost opravljene storitve Q15a
## Min. :1.000 Min. :1
## 1st Qu.:3.000 1st Qu.:4
## Median :4.000 Median :4
## Mean :3.586 Mean :4
## 3rd Qu.:4.000 3rd Qu.:5
## Max. :5.000 Max. :5
##
## Q15b Q15c Q15d Q15e
## Min. :2.000 Min. :1.000 Min. :1.000 Min. :2.000
## 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:3.000 1st Qu.:4.000
## Median :4.000 Median :4.000 Median :4.000 Median :4.000
## Mean :4.134 Mean :4.115 Mean :3.809 Mean :4.127
## 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
##
## Q16a Q16b Q16c Q16d
## Selected :98 Selected :66 Selected :68 Selected :79
## Not selected:59 Not selected:91 Not selected:89 Not selected:78
## NA's : 1 NA's : 1 NA's : 1 NA's : 1
##
##
##
##
## Q16e Q16f Q16g Q17a
## Selected : 49 Selected : 36 Selected : 9 Min. :0.000
## Not selected:108 Not selected:121 Not selected:148 1st Qu.:4.000
## NA's : 1 NA's : 1 NA's : 1 Median :4.000
## Mean :3.662
## 3rd Qu.:4.000
## Max. :5.000
##
## Q17b Q17c Q17d Q17e Q18a
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :1.00
## 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:2.000 1st Qu.:4.000 1st Qu.:3.00
## Median :4.000 Median :4.000 Median :4.000 Median :4.000 Median :4.00
## Mean :3.694 Mean :3.758 Mean :3.223 Mean :3.809 Mean :3.35
## 3rd Qu.:4.000 3rd Qu.:4.750 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.:4.00
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.00
##
## Q18b Q18c Q19a Q19b
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000
## Median :4.000 Median :4.000 Median :3.000 Median :4.000
## Mean :3.732 Mean :3.713 Mean :3.217 Mean :3.548
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
##
## Q19c Q19d Q19e Q41
## Min. :1.000 Min. :1.000 Min. :1.000 Female :78
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:2.000 Male :79
## Median :4.000 Median :4.000 Median :3.000 I don't want to answer: 0
## Mean :3.331 Mean :3.618 Mean :3.051 NA's : 1
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000
##
## Q43a Q43b Q43c Q43d
## Selected : 32 Selected : 12 Selected : 5 Selected :81
## Not selected:125 Not selected:145 Not selected:152 Not selected:76
## NA's : 1 NA's : 1 NA's : 1 NA's : 1
##
##
##
##
## Q43e Q43f Q43g Q43h
## Selected : 57 Selected : 1 Selected : 29 Selected : 0
## Not selected:100 Not selected:156 Not selected:128 Not selected:157
## NA's : 1 NA's : 1 NA's : 1 NA's : 1
##
##
##
##
## Q44 Q45
## 1.000 – 5.000 habitants :44 OTP banka d.d. :71
## More than 100.000 habitants:36 Nova Ljubljanska Banka d.d. (NLB):35
## Less than 1.000 habitants :30 Delavska Hranilnica d.d. :12
## 20.001 – 50.000 habitants :14 Banka Intesa Sanpaolo d.d. :10
## 5.001 – 20.000 habitants :22 Deželna Banka Slovenije d.d. : 8
## 50.001 – 100.000 habitants :11 (Other) :18
## NA's : 1 NA's : 4
## Q46 Q47
## Študent/-ka :22 Pod 1.000€ :46
## Redno zaposlen/-a:73 1.000€ - 1.500€:39
## Upokojen/-a :43 2.001€ - 3.000€:29
## Samozaposlen/-a :13 1.501€ - 2.000€:25
## Delno zaposlen/-a: 3 3.001€ - 5.000€: 6
## Brezposeln/-a : 3 (Other) : 4
## NA's : 1 NA's : 9
## Q48
## Dokončana osnovna šola : 8
## Dokončana visokošolska strokovna univerzitetna izobrazba (tudi 2. bolonjska stopnja) :35
## Dokončana nižja ali srednja poklicna izobrazba :23
## Dokončana višješolska strokovna ali visokošolska strokovna izobrazba (tudi 1. bolonjska stopnja):34
## Dokončana specializacija, znanstveni magisterij, doktorat : 5
## Dokončana srednja strokovna ali splošna izobrazba :50
## NA's : 3
## Q49
## Preko linka :84
## Na tablici :37
## Na listu papirja:17
## NA's :20
##
##
##
Q1: Ali uporabljate mobilno aplikacijo banke, kjer imate odprt primarni račun (npr. KlikIn, mBank@Net in podobno)? [1 - Da, 2 - Ne]
Q2: Ali dnevno uporabljate pametni telefon? [1 - Da, 2 - Ne]
Q3: Ali uporabljate internet vsaj nekajkrat na teden? [1 - Da, 2 - Ne]
Q4: Ali je kdo od vaših družinskih članov uporabnik mobilne banke? [1 - Da, 2 - Ne, 3 - Ne vem]
Q5: Ali imate skupni bančni račun, ki ga upravlja en član družine? [1 - Da, 2 - Ne]
Q6: Zakaj ne uporabljate mobilne banke (npr. KlikIn, mBank@Net, Addiko Mobil, …)? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti se ne strinjam niti se strinjam, 4 - Se strinjam, 5 - Popolnoma se strinjam]
Q7: Se vam zdi, da bi potrebovali dodatno izobraževanje, preden bi lahko uporabljali mobilno banko? [1 - Da, 2 - Ne]
Q8: Kako pomembni so naslednji dejavniki pri odločitvi med poslovalnico ali mobilno aplikacijo? [1 - Sploh ni pomembno, 2 - Ni pomembno, 3 - Niti ni pomembno niti je pomembno, 4 - Je pomembno, 5 - Zelo pomembno]
Q10: Kako zaznavate podporo in svetovanje v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]
Q11: Kako zaznavate zagotavljanje varnosti in zaščito vaših finančnih podatkov ter transakcij v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]
Q12: Kako zaznavate transparentnost delovanja banke in njenih storitev v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]
Q13: Kako zaznavate priročnost (npr. enostavna uporaba, dostopnost od kjerkoli, hitre transakcije) v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]
Q14: Kako zaznavate zanesljivost opravljene storitve v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]
Q15: Kako zaskrbljeni ste glede naslednjih tveganj med uporabo mobilnega bančništva? [1 - Sploh me ne skrbi, 2 - Me ne skrbi, 3 - Niti me ne skrbi niti me skrbi, 4 - Me skrbi, 5 - Zelo me skrbi]
Q16: Kateri izmed spodaj navedenih dejavnikov bi vam omogočili večje zaupanje v mobilno bančništvo? [Izbira več možnih odgovorov]
Q17: Kako koristne bi se vam zdele naslednje funkcije v aplikaciji za mobilno bančništvo? [1 - Sploh ni koristna, 2 - Ni koristna, 3 - Niti ni koristna niti je koristna, 4 - Je koristna, 5 - Izjemno koristna]
Q18: Kateri finančni spodbudni ukrepi bi vas spodbudili k uporabi mobilnega bančništva? [1 - Sploh me ne bi spodbudilo, 2 - Ne bi me spodbudilo, 3 - Niti niti, 4 - Bi me spodbudilo, 5 - Zelo bi me spodbudilo]
Q19: Kateri nefinančni spodbudni ukrepi bi vas spodbudili k uporabi mobilnega bančništva? [1 - Sploh me ne bi spodbudilo, 2 - Ne bi me spodbudilo, 3 - Niti niti, 4 - Bi me spodbudilo, 5 - Zelo bi me spodbudilo]
Q41: Spol [1 - Ženska, 2 - Moški, 3 - Ne želim odgovoriti]
Q42: Prosimo, vpišite leto rojstva. [Odprto besedilo]
Q43: S kom živite v istem gospodinjstvu? [Izbira več možnih odgovorov]
Q44: Koliko prebivalcev živi v kraju, kjer prebivate? [1 - Manj kot 1.000 prebivalcev, 2 - 1.000–5.000 prebivalcev, 3 - 5.001–20.000 prebivalcev, 4 - 20.001–50.000 prebivalcev, 5 - 50.001–100.000 prebivalcev, 6 - Več kot 100.000 prebivalcev]
Q45: Katera je vaša primarna banka? [1 - Nova Ljubljanska Banka d.d. (NLB), 2 - BKS Bank AG, Bančna podružnica, 3 - OTP banka d.d., 4 - Banka Sparkasse d.d., 5 - Banka Intesa Sanpaolo d.d., 6 - UniCredit Banka Slovenija d.d., 7 - Revolut, 8 - N26, 9 - Gorenjska Banka d.d., 10 - Deželna Banka Slovenije d.d., 11 - Addiko Bank d.d., 12 - Delavska Hranilnica d.d., 13 - Drugo]
Q46: Kakšna je vaša trenutna zaposlitev? [1 - Študent/ka, 2 - Redno zaposlen/a, 3 - Upokojen/a, 4 - Brezposeln/a, 5 - Samozaposlen/a, 6 - Delno zaposlen/a]
Q47: Kakšen je vaš mesečni neto prihodek? [1 - Pod 1.000€, 2 - 1.000€–1.500€, 3 - 1.501€–2.000€, 4 - 2.001€–3.000€, 5 - 3.001€–5.000€, 6 - 5.001€–10.000€, 7 - Nad 10.000€, 8 - Ne želim odgovoriti]
Q48: Kakšna je vaša stopnja izobrazbe? [1 - Nedokončana osnovna šola, 2 - Dokončana osnovna šola, 3 - Dokončana nižja ali srednja poklicna izobrazba, 4 - Dokončana srednja strokovna ali splošna izobrazba, 5 - Dokončana višješolska strokovna ali visokošolska strokovna izobrazba (tudi 1. bolonjska stopnja), 6 - Dokončana visokošolska strokovna ali univerzitetna izobrazba (tudi 2. bolonjska stopnja), 7 - Dokončana specializacija, znanstveni magisterij ali doktorat]
Q49: Kako ste rešili anketo? [1 - Preko linka, 2 - Na tablici, 3 - Na listu papirja]
# Keep only specific columns (replace with actual column names)
data1_log_reg <- mydata_excel1 %>% select(Q1, Q41, Q42, Q43a, Q43b, Q43c, Q43d, Q43e, Q43f, Q43g, Q43h, Q44, Q45, Q46, Q47, Q48)
data2_log_reg <- mydata_excel2 %>% select(Q1, Q41, Q42, Q43a, Q43b, Q43c, Q43d, Q43e, Q43f, Q43g, Q43h, Q44, Q45, Q46, Q47, Q48)
merged_data_log_reg <- rbind(data1_log_reg, data2_log_reg)
# Rename columns using colnames()
colnames(merged_data_log_reg) <- c("Ali uporabljate mobilno aplikacijo",
"Spol","Leto rojstva","Živim s starši",
"Živim s sorojenci","Živim s sorodniki",
"Živim s partnerko/jem", "Živim z otroki",
"Živim s skrbnikom", "Živim sam/a", "Drugo",
"Št. prebivalcev v kraju bivanja", "Primarna banka",
"Trenutna zaposlitev", "Mesečni neto prihodek", "Stopnja izobrazbe")
# Remove the first row
merged_data_log_reg1 <- merged_data_log_reg[-1, ]
categorical_vars <- c("Ali uporabljate mobilno aplikacijo", "Spol",
"Živim s starši", "Živim s sorojenci",
"Živim s sorodniki", "Živim s partnerko/jem",
"Živim z otroki", "Živim s skrbnikom",
"Živim sam/a", "Drugo", "Primarna banka",
"Trenutna zaposlitev", "Stopnja izobrazbe")
merged_data_log_reg1[categorical_vars] <- lapply(merged_data_log_reg1[categorical_vars], as.factor)
# Convert numerical variables to numeric
numeric_vars <- c("Leto rojstva", "Št. prebivalcev v kraju bivanja", "Mesečni neto prihodek")
merged_data_log_reg1[numeric_vars] <- lapply(merged_data_log_reg1[numeric_vars], as.numeric)
merged_data_log_reg1 <- as.data.frame(merged_data_log_reg1)
head(merged_data_log_reg1)
## Ali uporabljate mobilno aplikacijo Spol Leto rojstva Živim s starši
## 1 1 2 2000 1
## 2 1 2 1998 0
## 3 1 2 2001 1
## 4 1 2 1994 0
## 5 1 2 2000 1
## 6 1 2 2004 1
## Živim s sorojenci Živim s sorodniki Živim s partnerko/jem Živim z otroki
## 1 0 0 0 0
## 2 0 0 1 0
## 3 0 0 0 0
## 4 0 0 1 0
## 5 1 0 0 0
## 6 0 0 0 0
## Živim s skrbnikom Živim sam/a Drugo Št. prebivalcev v kraju bivanja
## 1 0 0 0 2
## 2 0 0 0 6
## 3 0 0 0 2
## 4 0 0 0 6
## 5 0 0 0 2
## 6 0 0 0 3
## Primarna banka Trenutna zaposlitev Mesečni neto prihodek Stopnja izobrazbe
## 1 3 2 2 2
## 2 3 2 3 4
## 3 1 2 2 3
## 4 12 5 6 5
## 5 1 2 3 6
## 6 1 1 8 4
merged_data_clean <- merged_data_log_reg1 %>%
filter(complete.cases(.)) # Removes rows with any NAs
# Automatically calculate age and assign age groups
merged_data_clean <- merged_data_clean %>%
mutate(
`Leto rojstva` = as.numeric(as.character(`Leto rojstva`)), # Convert to numeric (handles factors & characters)
age = as.numeric(format(Sys.Date(), "%Y")) - `Leto rojstva`, # Calculate age
age_group = case_when(
age <= 27 ~ "Young",
age > 27 & age <= 65 ~ "Professional",
age > 65 ~ "Older"))
# Ensure output is a data frame (not a tibble)
merged_data_clean <- as.data.frame(merged_data_clean)
head(merged_data_clean)
## Ali uporabljate mobilno aplikacijo Spol Leto rojstva Živim s starši
## 1 1 2 2000 1
## 2 1 2 1998 0
## 3 1 2 2001 1
## 4 1 2 1994 0
## 5 1 2 2000 1
## 6 1 2 2004 1
## Živim s sorojenci Živim s sorodniki Živim s partnerko/jem Živim z otroki
## 1 0 0 0 0
## 2 0 0 1 0
## 3 0 0 0 0
## 4 0 0 1 0
## 5 1 0 0 0
## 6 0 0 0 0
## Živim s skrbnikom Živim sam/a Drugo Št. prebivalcev v kraju bivanja
## 1 0 0 0 2
## 2 0 0 0 6
## 3 0 0 0 2
## 4 0 0 0 6
## 5 0 0 0 2
## 6 0 0 0 3
## Primarna banka Trenutna zaposlitev Mesečni neto prihodek Stopnja izobrazbe
## 1 3 2 2 2
## 2 3 2 3 4
## 3 1 2 2 3
## 4 12 5 6 5
## 5 1 2 3 6
## 6 1 1 8 4
## age age_group
## 1 25 Young
## 2 27 Young
## 3 24 Young
## 4 31 Professional
## 5 25 Young
## 6 21 Young
# Add an ID column (sequential numbering)
merged_data_clean <- merged_data_clean %>%
mutate(ID = row_number())
# Ensure it's a data frame (not a tibble)
merged_data_clean <- as.data.frame(merged_data_clean)
# Display the first 6 rows to check
head(merged_data_clean)
## Ali uporabljate mobilno aplikacijo Spol Leto rojstva Živim s starši
## 1 1 2 2000 1
## 2 1 2 1998 0
## 3 1 2 2001 1
## 4 1 2 1994 0
## 5 1 2 2000 1
## 6 1 2 2004 1
## Živim s sorojenci Živim s sorodniki Živim s partnerko/jem Živim z otroki
## 1 0 0 0 0
## 2 0 0 1 0
## 3 0 0 0 0
## 4 0 0 1 0
## 5 1 0 0 0
## 6 0 0 0 0
## Živim s skrbnikom Živim sam/a Drugo Št. prebivalcev v kraju bivanja
## 1 0 0 0 2
## 2 0 0 0 6
## 3 0 0 0 2
## 4 0 0 0 6
## 5 0 0 0 2
## 6 0 0 0 3
## Primarna banka Trenutna zaposlitev Mesečni neto prihodek Stopnja izobrazbe
## 1 3 2 2 2
## 2 3 2 3 4
## 3 1 2 2 3
## 4 12 5 6 5
## 5 1 2 3 6
## 6 1 1 8 4
## age age_group ID
## 1 25 Young 1
## 2 27 Young 2
## 3 24 Young 3
## 4 31 Professional 4
## 5 25 Young 5
## 6 21 Young 6
# Convert a specific column to numeric
merged_data_clean <- merged_data_clean %>%
mutate(across(c(`Živim s starši`, `Živim s sorojenci`, `Živim s sorodniki`, `Živim s partnerko/jem`, `Živim z otroki`, `Živim s skrbnikom`, `Živim sam/a`, `Drugo`, `age`),
~ as.numeric(as.character(.))))
merged_data_clean <- merged_data_clean %>%
mutate(`Ali uporabljate mobilno aplikacijo` = ifelse(`Ali uporabljate mobilno aplikacijo` == 1, 0, ifelse(!1 == 2, 1, V1)))
head(merged_data_clean)
## Ali uporabljate mobilno aplikacijo Spol Leto rojstva Živim s starši
## 1 0 2 2000 1
## 2 0 2 1998 0
## 3 0 2 2001 1
## 4 0 2 1994 0
## 5 0 2 2000 1
## 6 0 2 2004 1
## Živim s sorojenci Živim s sorodniki Živim s partnerko/jem Živim z otroki
## 1 0 0 0 0
## 2 0 0 1 0
## 3 0 0 0 0
## 4 0 0 1 0
## 5 1 0 0 0
## 6 0 0 0 0
## Živim s skrbnikom Živim sam/a Drugo Št. prebivalcev v kraju bivanja
## 1 0 0 0 2
## 2 0 0 0 6
## 3 0 0 0 2
## 4 0 0 0 6
## 5 0 0 0 2
## 6 0 0 0 3
## Primarna banka Trenutna zaposlitev Mesečni neto prihodek Stopnja izobrazbe
## 1 3 2 2 2
## 2 3 2 3 4
## 3 1 2 2 3
## 4 12 5 6 5
## 5 1 2 3 6
## 6 1 1 8 4
## age age_group ID
## 1 25 Young 1
## 2 27 Young 2
## 3 24 Young 3
## 4 31 Professional 4
## 5 25 Young 5
## 6 21 Young 6
fit0 <- glm(`Ali uporabljate mobilno aplikacijo` ~ 1, #Dependent and explanatory variables
family = binomial, #Binary logistic regression
data = merged_data_clean)
summary(fit0)
##
## Call:
## glm(formula = `Ali uporabljate mobilno aplikacijo` ~ 1, family = binomial,
## data = merged_data_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.33798 0.09521 -3.55 0.000385 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 616.6 on 453 degrees of freedom
## Residual deviance: 616.6 on 453 degrees of freedom
## AIC: 618.6
##
## Number of Fisher Scoring iterations: 4
exp(cbind(odds = fit0$coefficients, confint.default(fit0))) #Odds for Y=1
## odds 2.5 % 97.5 %
## (Intercept) 0.7132075 0.5917995 0.8595225
head(fitted(fit0)) #Estimated probability for Y=1
## 1 2 3 4 5 6
## 0.4162996 0.4162996 0.4162996 0.4162996 0.4162996 0.4162996
fit1 <- glm(`Ali uporabljate mobilno aplikacijo` ~ age_group,
family = binomial,
data = merged_data_clean)
summary(fit1)
##
## Call:
## glm(formula = `Ali uporabljate mobilno aplikacijo` ~ age_group,
## family = binomial, data = merged_data_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.5506 0.2942 5.270 1.36e-07 ***
## age_groupProfessional -1.6854 0.3254 -5.180 2.22e-07 ***
## age_groupYoung -3.2341 0.3635 -8.896 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 616.6 on 453 degrees of freedom
## Residual deviance: 505.7 on 451 degrees of freedom
## AIC: 511.7
##
## Number of Fisher Scoring iterations: 3
exp(cbind(odds = fit1$coefficients, confint.default(fit1))) #Odds for Y=1
## odds 2.5 % 97.5 %
## (Intercept) 4.71428558 2.64836954 8.39176261
## age_groupProfessional 0.18536719 0.09796180 0.35075912
## age_groupYoung 0.03939394 0.01931873 0.08033047
head(fitted(fit1)) #Estimated probability for Y=1
## 1 2 3 4 5 6
## 0.1566265 0.1566265 0.1566265 0.4663462 0.1566265 0.1566265
#Ifelse za vprašanje s kom živijo
merged_data_clean <- merged_data_clean %>%
mutate(
# Create a text column listing all people they live with
S_kom_živijo = paste0(
ifelse(`Živim s starši` == 1, "Živim s starši, ", ""),
ifelse(`Živim s sorojenci` == 1, "Živim s sorojenci, ", ""),
ifelse(`Živim s sorodniki` == 1, "Živim s sorodniki, ", ""),
ifelse(`Živim s partnerko/jem` == 1, "Živim s partnerko/jem, ", ""),
ifelse(`Živim z otroki` == 1, "Živim z otroki, ", ""),
ifelse(`Živim s skrbnikom` == 1, "Živim s skrbnikom, ", ""),
ifelse(`Drugo` == 1, "Drugo, ", "")
),
S_kom_živijo = sub(", $", "", S_kom_živijo), # Remove trailing comma
# Categorize into "Lives alone" or "Lives with others"
Kako_živijo = ifelse(`Živim sam/a` == 1, "Živi sam", "Živi z drugimi")
)
anova(fit0, fit1, test = "Chi") #Comparision of models based on -2LL statistics
## Analysis of Deviance Table
##
## Model 1: `Ali uporabljate mobilno aplikacijo` ~ 1
## Model 2: `Ali uporabljate mobilno aplikacijo` ~ age_group
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 453 616.6
## 2 451 505.7 2 110.89 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(cbind(OR = fit1$coefficients, confint.default(fit1))) #Odds ratio for Y=1 (with 95% CI)
## OR 2.5 % 97.5 %
## (Intercept) 4.71428558 2.64836954 8.39176261
## age_groupProfessional 0.18536719 0.09796180 0.35075912
## age_groupYoung 0.03939394 0.01931873 0.08033047
merged_data_clean <- merged_data_clean %>%
mutate(across(c(`Št. prebivalcev v kraju bivanja`, `Trenutna zaposlitev`, `Mesečni neto prihodek`, `Stopnja izobrazbe`),
~ as.numeric(as.character(.))))
str(merged_data_clean)
## 'data.frame': 454 obs. of 21 variables:
## $ Ali uporabljate mobilno aplikacijo: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Spol : Factor w/ 4 levels "-3","1","2","3": 3 3 3 3 3 3 2 3 3 2 ...
## $ Leto rojstva : num 2000 1998 2001 1994 2000 ...
## $ Živim s starši : num 1 0 1 0 1 1 0 0 0 0 ...
## $ Živim s sorojenci : num 0 0 0 0 1 0 0 0 0 0 ...
## $ Živim s sorodniki : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Živim s partnerko/jem : num 0 1 0 1 0 0 1 0 1 1 ...
## $ Živim z otroki : num 0 0 0 0 0 0 0 0 1 0 ...
## $ Živim s skrbnikom : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Živim sam/a : num 0 0 0 0 0 0 0 1 0 0 ...
## $ Drugo : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Št. prebivalcev v kraju bivanja : num 2 6 2 6 2 3 6 6 1 4 ...
## $ Primarna banka : Factor w/ 14 levels "-3","1","10",..: 8 8 2 5 2 2 11 7 2 7 ...
## $ Trenutna zaposlitev : num 2 2 2 5 2 1 2 1 2 1 ...
## $ Mesečni neto prihodek : num 2 3 2 6 3 8 4 6 4 2 ...
## $ Stopnja izobrazbe : num 2 4 3 5 6 4 6 1 6 5 ...
## $ age : num 25 27 24 31 25 21 28 23 39 23 ...
## $ age_group : chr "Young" "Young" "Young" "Professional" ...
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ S_kom_živijo : chr "Živim s starši" "Živim s partnerko/jem" "Živim s starši" "Živim s partnerko/jem" ...
## $ Kako_živijo : chr "Živi z drugimi" "Živi z drugimi" "Živi z drugimi" "Živi z drugimi" ...
fit2 <- glm(`Ali uporabljate mobilno aplikacijo` ~ age_group + `Trenutna zaposlitev` + `Stopnja izobrazbe`,
family = binomial,
data = merged_data_clean)
summary(fit2)
##
## Call:
## glm(formula = `Ali uporabljate mobilno aplikacijo` ~ age_group +
## `Trenutna zaposlitev` + `Stopnja izobrazbe`, family = binomial,
## data = merged_data_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.64298 0.50628 5.220 1.79e-07 ***
## age_groupProfessional -0.71711 0.36821 -1.948 0.0515 .
## age_groupYoung -2.51214 0.40758 -6.164 7.11e-10 ***
## `Trenutna zaposlitev` 0.07741 0.10220 0.757 0.4488
## `Stopnja izobrazbe` -0.44330 0.08969 -4.943 7.71e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 616.60 on 453 degrees of freedom
## Residual deviance: 475.35 on 449 degrees of freedom
## AIC: 485.35
##
## Number of Fisher Scoring iterations: 5
exp(cbind(odds = fit2$coefficients, confint.default(fit2))) #Odds for Y=1
## odds 2.5 % 97.5 %
## (Intercept) 14.05507453 5.2105773 37.9123286
## age_groupProfessional 0.48816000 0.2372152 1.0045737
## age_groupYoung 0.08109449 0.0364804 0.1802699
## `Trenutna zaposlitev` 1.08048242 0.8843581 1.3201013
## `Stopnja izobrazbe` 0.64191150 0.5384305 0.7652805
head(fitted(fit2)) #Estimated probability for Y=1
## 1 2 3 4 5 6
## 0.35412612 0.18428812 0.26032964 0.52407772 0.08516365 0.17293487
fit3 <- glm(`Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja`,
family = binomial,
data = merged_data_clean)
summary(fit3)
##
## Call:
## glm(formula = `Ali uporabljate mobilno aplikacijo` ~ age_group +
## `Št. prebivalcev v kraju bivanja`, family = binomial, data = merged_data_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.95622 0.33318 5.871 4.32e-09 ***
## age_groupProfessional -1.18740 0.34527 -3.439 0.000584 ***
## age_groupYoung -2.80997 0.37710 -7.452 9.23e-14 ***
## `Št. prebivalcev v kraju bivanja` -0.24141 0.05459 -4.422 9.76e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 616.60 on 453 degrees of freedom
## Residual deviance: 484.64 on 450 degrees of freedom
## AIC: 492.64
##
## Number of Fisher Scoring iterations: 4
exp(cbind(odds = fit0$coefficients, confint.default(fit3))) #Odds for Y=1
## odds 2.5 % 97.5 %
## (Intercept) 0.7132075 3.68101588 13.5888375
## age_groupProfessional 0.7132075 0.15503284 0.6000846
## age_groupYoung 0.7132075 0.02875141 0.1260762
## `Št. prebivalcev v kraju bivanja` 0.7132075 0.70581729 0.8742236
head(fitted(fit3)) #Estimated probability for Y=1
## 1 2 3 4 5 6
## 0.20807504 0.09094061 0.20807504 0.33634239 0.20807504 0.17108220
fit4 <- glm(`Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` + `Mesečni neto prihodek` + `Kako_živijo`,
family = binomial,
data = merged_data_clean)
head(merged_data_clean[, c("age_group", "Št. prebivalcev v kraju bivanja", "Mesečni neto prihodek", "Kako_živijo")])
## age_group Št. prebivalcev v kraju bivanja Mesečni neto prihodek
## 1 Young 2 2
## 2 Young 6 3
## 3 Young 2 2
## 4 Professional 6 6
## 5 Young 2 3
## 6 Young 3 8
## Kako_živijo
## 1 Živi z drugimi
## 2 Živi z drugimi
## 3 Živi z drugimi
## 4 Živi z drugimi
## 5 Živi z drugimi
## 6 Živi z drugimi
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:DescTools':
##
## Recode
## The following object is masked from 'package:dplyr':
##
## recode
vif(fit4)
## GVIF Df GVIF^(1/(2*Df))
## age_group 1.340321 2 1.075975
## `Št. prebivalcev v kraju bivanja` 1.059177 1 1.029163
## `Mesečni neto prihodek` 1.342169 1 1.158520
## Kako_živijo 1.036132 1 1.017906
All VIF values are close to 1, meaning there is no serious multicollinearity in our model. Our predictors are independent enough to be used together.
summary(fit4)
##
## Call:
## glm(formula = `Ali uporabljate mobilno aplikacijo` ~ age_group +
## `Št. prebivalcev v kraju bivanja` + `Mesečni neto prihodek` +
## Kako_živijo, family = binomial, data = merged_data_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.83274 0.52146 5.432 5.56e-08 ***
## age_groupProfessional -0.89742 0.38778 -2.314 0.0207 *
## age_groupYoung -3.25271 0.43764 -7.432 1.07e-13 ***
## `Št. prebivalcev v kraju bivanja` -0.14921 0.06377 -2.340 0.0193 *
## `Mesečni neto prihodek` -0.48453 0.08232 -5.886 3.96e-09 ***
## Kako_živijoŽivi z drugimi 0.06792 0.31249 0.217 0.8279
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 616.60 on 453 degrees of freedom
## Residual deviance: 437.01 on 448 degrees of freedom
## AIC: 449.01
##
## Number of Fisher Scoring iterations: 5
exp(cbind(odds = fit4$coefficients, confint.default(fit4))) #Odds for Y=1
## odds 2.5 % 97.5 %
## (Intercept) 16.99197228 6.1147486 47.21815095
## age_groupProfessional 0.40761833 0.1906221 0.87163395
## age_groupYoung 0.03866918 0.0164000 0.09117718
## `Št. prebivalcev v kraju bivanja` 0.86138858 0.7601860 0.97606417
## `Mesečni neto prihodek` 0.61598806 0.5242040 0.72384281
## Kako_živijoŽivi z drugimi 1.07027620 0.5801038 1.97463141
head(fitted(fit4)) #Estimated probability for Y=1
## 1 2 3 4 5 6
## 0.165269812 0.062920556 0.165269812 0.141949708 0.108703138 0.009231083
fit5 <- glm(`Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` + `Trenutna zaposlitev` + `Mesečni neto prihodek` + `Stopnja izobrazbe` + `Kako_živijo`,
family = binomial,
data = merged_data_clean)
summary(fit5)
##
## Call:
## glm(formula = `Ali uporabljate mobilno aplikacijo` ~ age_group +
## `Št. prebivalcev v kraju bivanja` + `Trenutna zaposlitev` +
## `Mesečni neto prihodek` + `Stopnja izobrazbe` + Kako_živijo,
## family = binomial, data = merged_data_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.36990 0.74444 4.527 5.99e-06 ***
## age_groupProfessional -0.67280 0.40141 -1.676 0.0937 .
## age_groupYoung -2.99739 0.48179 -6.221 4.93e-10 ***
## `Št. prebivalcev v kraju bivanja` -0.12621 0.06518 -1.936 0.0528 .
## `Trenutna zaposlitev` 0.06624 0.11807 0.561 0.5748
## `Mesečni neto prihodek` -0.43289 0.08371 -5.171 2.33e-07 ***
## `Stopnja izobrazbe` -0.22244 0.11139 -1.997 0.0458 *
## Kako_živijoŽivi z drugimi 0.01338 0.31648 0.042 0.9663
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 616.60 on 453 degrees of freedom
## Residual deviance: 432.72 on 446 degrees of freedom
## AIC: 448.72
##
## Number of Fisher Scoring iterations: 6
anova(fit0, fit1, test = "Chi") #Comparision of models based on -2LL statistics
## Analysis of Deviance Table
##
## Model 1: `Ali uporabljate mobilno aplikacijo` ~ 1
## Model 2: `Ali uporabljate mobilno aplikacijo` ~ age_group
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 453 616.6
## 2 451 505.7 2 110.89 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(fit1, fit2, test = "Chi") #Comparision of models based on -2LL statistics
## Analysis of Deviance Table
##
## Model 1: `Ali uporabljate mobilno aplikacijo` ~ age_group
## Model 2: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Trenutna zaposlitev` +
## `Stopnja izobrazbe`
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 451 505.70
## 2 449 475.35 2 30.347 2.571e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(fit2, fit3, test = "Chi") #Comparision of models based on -2LL statistics
## Analysis of Deviance Table
##
## Model 1: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Trenutna zaposlitev` +
## `Stopnja izobrazbe`
## Model 2: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja`
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 449 475.35
## 2 450 484.64 -1 -9.2898 0.002304 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(fit3, fit4, test = "Chi") #Comparision of models based on -2LL statistics
## Analysis of Deviance Table
##
## Model 1: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja`
## Model 2: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` +
## `Mesečni neto prihodek` + Kako_živijo
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 450 484.64
## 2 448 437.01 2 47.633 4.536e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(fit4, fit5, test = "Chi") #Comparision of models based on -2LL statistics
## Analysis of Deviance Table
##
## Model 1: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` +
## `Mesečni neto prihodek` + Kako_živijo
## Model 2: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` +
## `Trenutna zaposlitev` + `Mesečni neto prihodek` + `Stopnja izobrazbe` +
## Kako_živijo
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 448 437.01
## 2 446 432.72 2 4.2928 0.1169
Glede na teste je najboljši fit4:
Ali uporabljate mobilno
aplikacijo~age_group+Št. prebivalcev v kraju
bivanja+Mesečni neto prihodek` + Kako_živijo
# Extract standardized residuals and Cook's distance from fit4
merged_data_clean$StdResid <- rstandard(fit4) # Standardized residuals
merged_data_clean$Cook <- cooks.distance(fit4) # Cook's distance
merged_data_clean$StdResid <- rstandard(fit4)
merged_data_clean$CooksD <- cooks.distance(fit4)
library(ggplot2)
StdResid <- ggplot(merged_data_clean, aes(x=StdResid)) +
theme_linedraw() +
geom_histogram() +
xlab("Standardized residuals")
library(ggplot2)
Cook <- ggplot(merged_data_clean, aes(x=CooksD)) +
theme_linedraw() +
geom_histogram() +
xlab("Cook's distances")
ggarrange(StdResid, Cook,
ncol = 2, nrow = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
head(merged_data_clean[order(merged_data_clean$StdResid), c("ID", "StdResid")], 5)
## ID StdResid
## 191 191 -2.105705
## 232 232 -2.066293
## 240 240 -1.881493
## 264 264 -1.792185
## 254 254 -1.779406
head(merged_data_clean[order(-merged_data_clean$StdResid), c("ID", "StdResid")], 5)
## ID StdResid
## 440 440 2.916655
## 313 313 2.505523
## 297 297 2.358535
## 394 394 2.256942
## 446 446 2.194612
head(merged_data_clean[order(-merged_data_clean$CooksD), c("ID", "CooksD")], 10)
## ID CooksD
## 440 440 0.04439633
## 191 191 0.02805883
## 313 313 0.02379869
## 240 240 0.02354029
## 232 232 0.02054852
## 446 446 0.01904230
## 380 380 0.01837936
## 394 394 0.01827372
## 296 296 0.01789714
## 322 322 0.01789714
We have 3 high impact values and 1 potential outlier. So we dropped those values.
library(dplyr)
merged_data_clean <- merged_data_clean %>%
filter(ID != 440 & ID != 191 & ID != 313 & ID != 232)
Let’s run the fits again and check if they’re better.
fit4 <- glm(`Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` + `Mesečni neto prihodek` + `Kako_živijo`,
family = binomial,
data = merged_data_clean)
summary(fit4)
##
## Call:
## glm(formula = `Ali uporabljate mobilno aplikacijo` ~ age_group +
## `Št. prebivalcev v kraju bivanja` + `Mesečni neto prihodek` +
## Kako_živijo, family = binomial, data = merged_data_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.32703 0.58135 5.723 1.05e-08 ***
## age_groupProfessional -1.13377 0.42267 -2.682 0.00731 **
## age_groupYoung -3.75194 0.48745 -7.697 1.39e-14 ***
## `Št. prebivalcev v kraju bivanja` -0.15270 0.06632 -2.303 0.02130 *
## `Mesečni neto prihodek` -0.57067 0.09076 -6.287 3.23e-10 ***
## Kako_živijoŽivi z drugimi 0.10018 0.32367 0.309 0.75694
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 610.94 on 449 degrees of freedom
## Residual deviance: 412.17 on 444 degrees of freedom
## AIC: 424.17
##
## Number of Fisher Scoring iterations: 6
fit5 <- glm(`Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` + `Trenutna zaposlitev` + `Mesečni neto prihodek` + `Stopnja izobrazbe` + `Kako_živijo`,
family = binomial,
data = merged_data_clean)
summary(fit5)
##
## Call:
## glm(formula = `Ali uporabljate mobilno aplikacijo` ~ age_group +
## `Št. prebivalcev v kraju bivanja` + `Trenutna zaposlitev` +
## `Mesečni neto prihodek` + `Stopnja izobrazbe` + Kako_živijo,
## family = binomial, data = merged_data_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.99040 0.83581 4.774 1.80e-06 ***
## age_groupProfessional -0.90296 0.43243 -2.088 0.0368 *
## age_groupYoung -3.49294 0.53299 -6.553 5.62e-11 ***
## `Št. prebivalcev v kraju bivanja` -0.12915 0.06771 -1.907 0.0565 .
## `Trenutna zaposlitev` 0.07230 0.12380 0.584 0.5592
## `Mesečni neto prihodek` -0.51303 0.09142 -5.612 2.00e-08 ***
## `Stopnja izobrazbe` -0.25354 0.11856 -2.139 0.0325 *
## Kako_živijoŽivi z drugimi 0.03498 0.32861 0.106 0.9152
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 610.94 on 449 degrees of freedom
## Residual deviance: 407.18 on 442 degrees of freedom
## AIC: 423.18
##
## Number of Fisher Scoring iterations: 6
And let’s run anova again:
anova(fit4, fit5, test = "Chi") #Comparision of models based on -2LL statistics
## Analysis of Deviance Table
##
## Model 1: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` +
## `Mesečni neto prihodek` + Kako_živijo
## Model 2: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` +
## `Trenutna zaposlitev` + `Mesečni neto prihodek` + `Stopnja izobrazbe` +
## Kako_živijo
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 444 412.17
## 2 442 407.18 2 4.9891 0.08253 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Fit4 if still better and we fixed it a bit.
# A data frame for coefficients of fit4
coeff_data <- data.frame(
Variable = c("Intercept", "age_groupProfessional", "age_groupYoung",
"Št. prebivalcev v kraju bivanja", "Mesečni neto prihodek", "Kako_živijoŽivi z drugimi"),
Estimate = c(-3.74939, 1.27803, 4.01414, 0.15967, 0.63420, -0.03244),
Std_Error = c(0.62769, 0.44997, 0.52222, 0.06804, 0.09739, 0.33023),
Z_Value = c(-5.973, 2.840, 7.687, 2.347, 6.512, -0.098),
P_Value = c(2.32e-09, 0.00451, 1.51e-14, 0.01894, 7.40e-11, 0.92174))
head(coeff_data)
## Variable Estimate Std_Error Z_Value P_Value
## 1 Intercept -3.74939 0.62769 -5.973 2.3200e-09
## 2 age_groupProfessional 1.27803 0.44997 2.840 4.5100e-03
## 3 age_groupYoung 4.01414 0.52222 7.687 1.5100e-14
## 4 Št. prebivalcev v kraju bivanja 0.15967 0.06804 2.347 1.8940e-02
## 5 Mesečni neto prihodek 0.63420 0.09739 6.512 7.4000e-11
## 6 Kako_živijoŽivi z drugimi -0.03244 0.33023 -0.098 9.2174e-01
# Ensure the dependent variable is a factor (assuming LatePayF is the outcome variable)
merged_data_clean$`Ali uporabljate mobilno aplikacijo` <- as.factor(merged_data_clean$`Ali uporabljate mobilno aplikacijo`)
# Fit logistic regression model
fit4 <- glm(`Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` + `Mesečni neto prihodek` + `Kako_živijo`,
family = binomial,
data = merged_data_clean)
# Generate predicted probabilities (Estimated Probability)
merged_data_clean <- merged_data_clean %>%
mutate(EstProb = predict(fit4, type = "response"))
# Classify based on probability threshold (0.50)
merged_data_clean <- merged_data_clean %>%
mutate(Classification = ifelse(EstProb < 0.50, "NO", "YES"),
ClassificationF = factor(Classification, levels = c("NO", "YES")))
# Create the classification table
ClassificationTable <- table(merged_data_clean$`Ali uporabljate mobilno aplikacijo`, merged_data_clean$ClassificationF)
# Display classification table
print(ClassificationTable)
##
## NO YES
## 0 223 40
## 1 62 125
# Compute Pseudo R² (Proportion of correctly classified cases)
Pseudo_R2_fit4 <- sum(diag(ClassificationTable)) / nrow(merged_data_clean)
# Display Pseudo R²
Pseudo_R2_fit4
## [1] 0.7733333
Age: A positive and highly significant effect on mobile app usage (p < 0.001). This suggests that as age increases, the likelihood of using a mobile application also increases.
Population of Place of Residence: A negative and significant impact (p < 0.05), implying that individuals from larger population areas are less likely to use mobile applications.
Monthly Net Income: Also negatively associated with app usage (p < 0.001), indicating that individuals with higher incomes are less likely to use mobile applications.
Living Situation (“Kako živijo”): The variable was not statistically significant, meaning that whether someone lives alone or with others does not strongly predict mobile app usage.
Employment Status and Education Level: When added in Model 2 (fit5), these factors did not significantly improve model performance (p = 0.1458 in the Chi-square test), suggesting they may not be strong predictors.
Fit4 vs. Fit5: The simpler model (fit4) performed slightly better, and adding employment status and education level did not significantly improve the fit.
Classification Performance: The logistic regression model achieved 78.29% accuracy, indicating a good ability to distinguish between app users and non-users.
Potential Issues: The warning message (glm.fit: fitted probabilities numerically 0 or 1 occurred) suggests some separation in the data, meaning some observations were perfectly predicted, which could affect model reliability.
Since the intercept value is highly negative, it suggests that the baseline probability of using the app is quite low.
age_groupProfessional (1.27803, p = 0.00451): Professionals are significantly more likely to use the app compared to the reference group. The odds ratio (exp(1.27803) ≈ 3.59) means professionals are about 3.6 times more likely to use the app than the reference group.
age_groupYoung (4.01414, p < 0.001): Younger individuals are much more likely to use the app compared to the reference group. The odds ratio (exp(4.01414) ≈ 55.44) means young individuals are over 55 times more likely to use the app than the reference group.
Number of Residents in the Living Area (0.15967, p = 0.01894) A
positive coefficient suggests that people from larger settlements are
more likely to use the app. The odds ratio (exp(0.15967) ≈ 1.17) means
that for each unit increase in population size, the odds of using the
app
increase by 17%.
Monthly Net Income (0.63420, p < 0.001) Higher income is
strongly associated with app usage. The odds ratio (exp(0.63420) ≈ 1.89)
suggests that for each unit increase in income, the odds of using the
app nearly
double.
The final model (fit4) provides a solid explanation of mobile app usage patterns.
The strong role of age, population size, and income aligns with expectations—older individuals and those in smaller towns tend to use apps more, while higher-income individuals might be less dependent on them.
Addressing the perfect separation issue (perhaps by regularization techniques like ridge regression or handling outliers) could enhance the model’s robustness.
Younger individuals are overwhelmingly more likely to use the mobile app.
Professionals also have significantly higher odds of using the app compared to the reference age group.
Higher income and living in a larger settlement are both associated with greater app usage.
Living arrangement does not significantly impact app usage.
#and we take a bow