Cleaning data

#install.packages
library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#install.packages("ggpubr")
library(ggpubr)
## Loading required package: ggplot2
#install.packages("DescTools")
library(DescTools)

mydata_excel1 <- read_excel("~/Desktop/NLB projekt/Logistična regresija/1ka data.xlsx")

mydata_excel2 <- read_excel("~/Desktop/NLB projekt/Logistična regresija/anketa_končni podatki.xlsx")

mydata_excel2 <- mydata_excel2[-1, ] #Delete first row in which the questions are written

mydata_excel2$ID <- seq(1,nrow(mydata_excel2))
mydata2 <- mydata_excel2[!(apply(mydata_excel2 == -3, 1, any)), ]

mydata2 <- subset(mydata2, select = -c(Q21:Q40))

mydata2$ID <- seq(1,nrow(mydata2))
mydata2$Q2 <- factor(mydata2$Q2, 
                         levels = c(1, 2), 
                         labels = c("Yes","No"))

mydata2$Q3 <- factor(mydata2$Q3, 
                         levels = c(1, 2), 
                         labels = c("Yes","No"))

mydata2$Q4 <- factor(mydata2$Q4, 
                         levels = c(1, 2, 3), 
                         labels = c("Yes","No", "I don't know"))

mydata2$Q5 <- factor(mydata2$Q5, 
                         levels = c(1, 2), 
                         labels = c("Yes","No"))

mydata2$Q7 <- factor(mydata2$Q7, 
                         levels = c(1, 2), 
                         labels = c("Yes","No"))

mydata2$Q16a <- factor(mydata2$Q16a, 
                         levels = c(1, 0), 
                         labels = c("Selected","Not selected"))

mydata2$Q16b <- factor(mydata2$Q16b, 
                         levels = c(1, 0), 
                         labels = c("Selected","Not selected"))

mydata2$Q16c <- factor(mydata2$Q16c, 
                         levels = c(1, 0), 
                         labels = c("Selected","Not selected"))

mydata2$Q16d <- factor(mydata2$Q16d, 
                         levels = c(1, 0), 
                         labels = c("Selected","Not selected"))

mydata2$Q16e <- factor(mydata2$Q16e, 
                         levels = c(1, 0), 
                         labels = c("Selected","Not selected"))

mydata2$Q16f <- factor(mydata2$Q16f, 
                         levels = c(1, 0), 
                         labels = c("Selected","Not selected"))

mydata2$Q16g <- factor(mydata2$Q16g, 
                         levels = c(1, 0), 
                         labels = c("Selected","Not selected"))

mydata2$Q41 <- factor(mydata2$Q41, 
                         levels = c(1, 2, 3), 
                         labels = c("Female","Male", "I don't want to answer"))

mydata2$Q43a <- factor(mydata2$Q43a, 
                         levels = c(1, 0), 
                         labels = c("Selected","Not selected"))

mydata2$Q43b <- factor(mydata2$Q43b, 
                         levels = c(1, 0), 
                         labels = c("Selected","Not selected"))

mydata2$Q43c <- factor(mydata2$Q43c, 
                         levels = c(1, 0), 
                         labels = c("Selected","Not selected"))

mydata2$Q43d <- factor(mydata2$Q43d, 
                         levels = c(1, 0), 
                         labels = c("Selected","Not selected"))

mydata2$Q43e <- factor(mydata2$Q43e, 
                         levels = c(1, 0), 
                         labels = c("Selected","Not selected"))

mydata2$Q43f <- factor(mydata2$Q43f, 
                         levels = c(1, 0), 
                         labels = c("Selected","Not selected"))

mydata2$Q43g <- factor(mydata2$Q43g, 
                         levels = c(1, 0), 
                         labels = c("Selected","Not selected"))

mydata2$Q43h <- factor(mydata2$Q43h, 
                         levels = c(1, 0), 
                         labels = c("Selected","Not selected"))

mydata2$Q44 <- factor(mydata2$Q44, 
                         levels = c(2, 6, 1, 4, 3, 5), 
                         labels = c("1.000 – 5.000 habitants","More than 100.000 habitants", "Less than 1.000 habitants", "20.001 – 50.000 habitants", "5.001 – 20.000 habitants", "50.001 – 100.000 habitants"))

mydata2$Q45 <- factor(mydata2$Q45, 
                         levels = c(3, 5, 1, 9, 12, 7, 10, 4, 11, 6), 
                         labels = c("OTP banka d.d.","Banka Intesa Sanpaolo d.d.", "Nova Ljubljanska Banka d.d. (NLB)", "Gorenjska Banka d.d.", "Delavska Hranilnica d.d.", "Revolut", "Deželna Banka Slovenije d.d.", "Banka Sparkasse d.d.", "Addiko Bank d.d.", "UniCredit Banka Slovenija d.d."))

mydata2$Q46 <- factor(mydata2$Q46, 
                         levels = c(1, 2, 3, 5, 6, 4), 
                         labels = c("Študent/-ka","Redno zaposlen/-a", "Upokojen/-a", "Samozaposlen/-a", "Delno zaposlen/-a", "Brezposeln/-a"))

mydata2$Q47 <- factor(mydata2$Q47, 
                         levels = c(1, 5, 3, 8, 2, 4, 6, 7), 
                         labels = c("Pod 1.000€","3.001€ - 5.000€", "1.501€ - 2.000€", "I don't want to answer", "1.000€ - 1.500€", "2.001€ - 3.000€", "5.001€ - 10.000€", "Above 10.000€"))

mydata2$Q48 <- factor(mydata2$Q48, 
                         levels = c(2, 6, 3, 5, 7, 4), 
                         labels = c("Dokončana osnovna šola","Dokončana visokošolska strokovna univerzitetna izobrazba (tudi 2. bolonjska stopnja)", "Dokončana nižja ali srednja poklicna izobrazba", "Dokončana višješolska strokovna ali visokošolska strokovna izobrazba (tudi 1. bolonjska stopnja)", "Dokončana specializacija, znanstveni magisterij, doktorat", "Dokončana srednja strokovna ali splošna izobrazba"))

mydata2$Q49 <- factor(mydata2$Q49, 
                         levels = c(1, 2, 3), 
                         labels = c("Preko linka","Na tablici", "Na listu papirja"))

mydata2[c(6:14, 18:22, 25:39, 48:55, 58:62)] <- mydata2[c(6:14, 18:22, 25:39, 48:55, 58:62)] %>% mutate_all(as.numeric)

mydata2 <- mydata2 %>% mutate(across(where(is.numeric), ~ replace(., . == -2, mean(.[. != -2], na.rm = TRUE))))

mydata2 <- mydata2 %>% mutate(across(where(is.numeric), ~ replace(., is.na(.), mean(., na.rm = TRUE))))

mydata2 <- mydata2 %>%
  filter(!ID %in% c(2, 16, 17)) 

mydata2$ID <- seq(1, nrow(mydata2)) 

summary(mydata2[c(-1, -15, -16, -23, -24, -47, -56, -57, -63, -64, -66, -75, -78, -83)])
##     Q2         Q3                 Q4         Q5           Q6a       
##  Yes :121   Yes :131   Yes         :103   Yes : 25   Min.   :1.000  
##  No  : 36   No  : 26   No          : 28   No  :132   1st Qu.:3.000  
##  NA's:  1   NA's:  1   I don't know: 26   NA's:  1   Median :3.000  
##                        NA's        :  1              Mean   :3.433  
##                                                      3rd Qu.:4.000  
##                                                      Max.   :5.000  
##                                                                     
##       Q6b             Q6c             Q6d             Q6e       
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:3.000   1st Qu.:3.000   1st Qu.:1.000  
##  Median :4.000   Median :4.000   Median :4.000   Median :2.000  
##  Mean   :3.535   Mean   :3.847   Mean   :3.854   Mean   :2.688  
##  3rd Qu.:4.000   3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##                                                                 
##       Q6f             Q6g             Q6h            Q6i           Q7    
##  Min.   :1.000   Min.   :1.000   Min.   :1.00   Min.   :1.000   Yes :82  
##  1st Qu.:3.000   1st Qu.:2.000   1st Qu.:2.00   1st Qu.:3.000   No  :75  
##  Median :4.000   Median :3.086   Median :3.00   Median :4.000   NA's: 1  
##  Mean   :3.497   Mean   :3.172   Mean   :2.93   Mean   :3.465            
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.00   3rd Qu.:4.000            
##  Max.   :5.000   Max.   :5.000   Max.   :5.00   Max.   :5.000            
##                                                                          
##  Podpora in svetovanje
##  Min.   :1.00         
##  1st Qu.:3.00         
##  Median :4.00         
##  Mean   :3.79         
##  3rd Qu.:4.00         
##  Max.   :5.00         
##                       
##  Zagotavljanje varnosti in zaščita vaših finančnih podatkov ter transakcij
##  Min.   :1.000                                                            
##  1st Qu.:4.000                                                            
##  Median :5.000                                                            
##  Mean   :4.382                                                            
##  3rd Qu.:5.000                                                            
##  Max.   :5.000                                                            
##                                                                           
##  Transparentnost delovanja banke in njenih storitev   Priročnost   
##  Min.   :1.000                                      Min.   :1.000  
##  1st Qu.:4.000                                      1st Qu.:4.000  
##  Median :4.000                                      Median :4.000  
##  Mean   :4.108                                      Mean   :3.975  
##  3rd Qu.:5.000                                      3rd Qu.:5.000  
##  Max.   :5.000                                      Max.   :5.000  
##                                                                    
##  Hitrost in zanesljivost opravljene storitve Poslovalnica_Podpora in svetovanje
##  Min.   :1.000                               Min.   :2.000                     
##  1st Qu.:4.000                               1st Qu.:4.000                     
##  Median :4.000                               Median :4.000                     
##  Mean   :4.185                               Mean   :4.006                     
##  3rd Qu.:5.000                               3rd Qu.:5.000                     
##  Max.   :5.000                               Max.   :5.000                     
##                                                                                
##  Mobilna banka_Podpora in svetovanje
##  Min.   :1.00                       
##  1st Qu.:3.00                       
##  Median :3.00                       
##  Mean   :3.14                       
##  3rd Qu.:4.00                       
##  Max.   :5.00                       
##                                     
##  Poslovalnica_Zagotavljanje varnosti in zaščita vaših finančnih podatkov ter transakcij
##  Min.   :2.000                                                                         
##  1st Qu.:3.986                                                                         
##  Median :4.000                                                                         
##  Mean   :3.981                                                                         
##  3rd Qu.:5.000                                                                         
##  Max.   :5.000                                                                         
##                                                                                        
##  Mobilna banka_Zagotavljanje varnosti in zaščita vaših finančnih podatkov ter transakcij
##  Min.   :1.000                                                                          
##  1st Qu.:2.000                                                                          
##  Median :3.000                                                                          
##  Mean   :3.032                                                                          
##  3rd Qu.:4.000                                                                          
##  Max.   :5.000                                                                          
##                                                                                         
##  Poslovalnica_Transparentnost delovanja banke in njenih storitev
##  Min.   :1.000                                                  
##  1st Qu.:3.000                                                  
##  Median :4.000                                                  
##  Mean   :3.898                                                  
##  3rd Qu.:4.000                                                  
##  Max.   :5.000                                                  
##                                                                 
##  Mobilna banka_Transparentnost delovanja banke in njenih storitev
##  Min.   :1.000                                                   
##  1st Qu.:3.000                                                   
##  Median :3.000                                                   
##  Mean   :3.248                                                   
##  3rd Qu.:4.000                                                   
##  Max.   :5.000                                                   
##                                                                  
##  Poslovalnica_Priročnost Mobilna banka_Priročnost
##  Min.   :1.000           Min.   :1.000           
##  1st Qu.:3.000           1st Qu.:3.000           
##  Median :4.000           Median :3.000           
##  Mean   :3.752           Mean   :3.389           
##  3rd Qu.:4.000           3rd Qu.:4.000           
##  Max.   :5.000           Max.   :5.000           
##                                                  
##  Poslovalnica_Hitrost in zanesljivost opravljene storitve
##  Min.   :1.000                                           
##  1st Qu.:3.000                                           
##  Median :4.000                                           
##  Mean   :3.701                                           
##  3rd Qu.:4.000                                           
##  Max.   :5.000                                           
##                                                          
##  Mobilna banka_Hitrost in zanesljivost opravljene storitve      Q15a  
##  Min.   :1.000                                             Min.   :1  
##  1st Qu.:3.000                                             1st Qu.:4  
##  Median :4.000                                             Median :4  
##  Mean   :3.586                                             Mean   :4  
##  3rd Qu.:4.000                                             3rd Qu.:5  
##  Max.   :5.000                                             Max.   :5  
##                                                                       
##       Q15b            Q15c            Q15d            Q15e      
##  Min.   :2.000   Min.   :1.000   Min.   :1.000   Min.   :2.000  
##  1st Qu.:4.000   1st Qu.:4.000   1st Qu.:3.000   1st Qu.:4.000  
##  Median :4.000   Median :4.000   Median :4.000   Median :4.000  
##  Mean   :4.134   Mean   :4.115   Mean   :3.809   Mean   :4.127  
##  3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:4.000   3rd Qu.:5.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##                                                                 
##            Q16a              Q16b              Q16c              Q16d   
##  Selected    :98   Selected    :66   Selected    :68   Selected    :79  
##  Not selected:59   Not selected:91   Not selected:89   Not selected:78  
##  NA's        : 1   NA's        : 1   NA's        : 1   NA's        : 1  
##                                                                         
##                                                                         
##                                                                         
##                                                                         
##            Q16e               Q16f               Q16g          Q17a      
##  Selected    : 49   Selected    : 36   Selected    :  9   Min.   :0.000  
##  Not selected:108   Not selected:121   Not selected:148   1st Qu.:4.000  
##  NA's        :  1   NA's        :  1   NA's        :  1   Median :4.000  
##                                                           Mean   :3.662  
##                                                           3rd Qu.:4.000  
##                                                           Max.   :5.000  
##                                                                          
##       Q17b            Q17c            Q17d            Q17e            Q18a     
##  Min.   :0.000   Min.   :0.000   Min.   :0.000   Min.   :0.000   Min.   :1.00  
##  1st Qu.:4.000   1st Qu.:4.000   1st Qu.:2.000   1st Qu.:4.000   1st Qu.:3.00  
##  Median :4.000   Median :4.000   Median :4.000   Median :4.000   Median :4.00  
##  Mean   :3.694   Mean   :3.758   Mean   :3.223   Mean   :3.809   Mean   :3.35  
##  3rd Qu.:4.000   3rd Qu.:4.750   3rd Qu.:4.000   3rd Qu.:5.000   3rd Qu.:4.00  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.00  
##                                                                                
##       Q18b            Q18c            Q19a            Q19b      
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:3.000   1st Qu.:3.000   1st Qu.:3.000  
##  Median :4.000   Median :4.000   Median :3.000   Median :4.000  
##  Mean   :3.732   Mean   :3.713   Mean   :3.217   Mean   :3.548  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##                                                                 
##       Q19c            Q19d            Q19e                           Q41    
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Female                :78  
##  1st Qu.:3.000   1st Qu.:3.000   1st Qu.:2.000   Male                  :79  
##  Median :4.000   Median :4.000   Median :3.000   I don't want to answer: 0  
##  Mean   :3.331   Mean   :3.618   Mean   :3.051   NA's                  : 1  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000                              
##  Max.   :5.000   Max.   :5.000   Max.   :5.000                              
##                                                                             
##            Q43a               Q43b               Q43c               Q43d   
##  Selected    : 32   Selected    : 12   Selected    :  5   Selected    :81  
##  Not selected:125   Not selected:145   Not selected:152   Not selected:76  
##  NA's        :  1   NA's        :  1   NA's        :  1   NA's        : 1  
##                                                                            
##                                                                            
##                                                                            
##                                                                            
##            Q43e               Q43f               Q43g               Q43h    
##  Selected    : 57   Selected    :  1   Selected    : 29   Selected    :  0  
##  Not selected:100   Not selected:156   Not selected:128   Not selected:157  
##  NA's        :  1   NA's        :  1   NA's        :  1   NA's        :  1  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##                           Q44                                    Q45    
##  1.000 – 5.000 habitants    :44   OTP banka d.d.                   :71  
##  More than 100.000 habitants:36   Nova Ljubljanska Banka d.d. (NLB):35  
##  Less than 1.000 habitants  :30   Delavska Hranilnica d.d.         :12  
##  20.001 – 50.000 habitants  :14   Banka Intesa Sanpaolo d.d.       :10  
##  5.001 – 20.000 habitants   :22   Deželna Banka Slovenije d.d.     : 8  
##  50.001 – 100.000 habitants :11   (Other)                          :18  
##  NA's                       : 1   NA's                             : 4  
##                 Q46                  Q47    
##  Študent/-ka      :22   Pod 1.000€     :46  
##  Redno zaposlen/-a:73   1.000€ - 1.500€:39  
##  Upokojen/-a      :43   2.001€ - 3.000€:29  
##  Samozaposlen/-a  :13   1.501€ - 2.000€:25  
##  Delno zaposlen/-a: 3   3.001€ - 5.000€: 6  
##  Brezposeln/-a    : 3   (Other)        : 4  
##  NA's             : 1   NA's           : 9  
##                                                                                                Q48    
##  Dokončana osnovna šola                                                                          : 8  
##  Dokončana visokošolska strokovna univerzitetna izobrazba (tudi 2. bolonjska stopnja)            :35  
##  Dokončana nižja ali srednja poklicna izobrazba                                                  :23  
##  Dokončana višješolska strokovna ali visokošolska strokovna izobrazba (tudi 1. bolonjska stopnja):34  
##  Dokončana specializacija, znanstveni magisterij, doktorat                                       : 5  
##  Dokončana srednja strokovna ali splošna izobrazba                                               :50  
##  NA's                                                                                            : 3  
##                Q49    
##  Preko linka     :84  
##  Na tablici      :37  
##  Na listu papirja:17  
##  NA's            :20  
##                       
##                       
## 

Q1: Ali uporabljate mobilno aplikacijo banke, kjer imate odprt primarni račun (npr. KlikIn, in podobno)? [1 - Da, 2 - Ne]

Q2: Ali dnevno uporabljate pametni telefon? [1 - Da, 2 - Ne]

Q3: Ali uporabljate internet vsaj nekajkrat na teden? [1 - Da, 2 - Ne]

Q4: Ali je kdo od vaših družinskih članov uporabnik mobilne banke? [1 - Da, 2 - Ne, 3 - Ne vem]

Q5: Ali imate skupni bančni račun, ki ga upravlja en član družine? [1 - Da, 2 - Ne]

Q6: Zakaj ne uporabljate mobilne banke (npr. KlikIn, , Addiko Mobil, …)? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti se ne strinjam niti se strinjam, 4 - Se strinjam, 5 - Popolnoma se strinjam]

Q7: Se vam zdi, da bi potrebovali dodatno izobraževanje, preden bi lahko uporabljali mobilno banko? [1 - Da, 2 - Ne]

Q8: Kako pomembni so naslednji dejavniki pri odločitvi med poslovalnico ali mobilno aplikacijo? [1 - Sploh ni pomembno, 2 - Ni pomembno, 3 - Niti ni pomembno niti je pomembno, 4 - Je pomembno, 5 - Zelo pomembno]

Q10: Kako zaznavate podporo in svetovanje v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]

Q11: Kako zaznavate zagotavljanje varnosti in zaščito vaših finančnih podatkov ter transakcij v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]

Q12: Kako zaznavate transparentnost delovanja banke in njenih storitev v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]

Q13: Kako zaznavate priročnost (npr. enostavna uporaba, dostopnost od kjerkoli, hitre transakcije) v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]

Q14: Kako zaznavate zanesljivost opravljene storitve v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]

Q15: Kako zaskrbljeni ste glede naslednjih tveganj med uporabo mobilnega bančništva? [1 - Sploh me ne skrbi, 2 - Me ne skrbi, 3 - Niti me ne skrbi niti me skrbi, 4 - Me skrbi, 5 - Zelo me skrbi]

Q16: Kateri izmed spodaj navedenih dejavnikov bi vam omogočili večje zaupanje v mobilno bančništvo? [Izbira več možnih odgovorov]

Q17: Kako koristne bi se vam zdele naslednje funkcije v aplikaciji za mobilno bančništvo? [1 - Sploh ni koristna, 2 - Ni koristna, 3 - Niti ni koristna niti je koristna, 4 - Je koristna, 5 - Izjemno koristna]

Q18: Kateri finančni spodbudni ukrepi bi vas spodbudili k uporabi mobilnega bančništva? [1 - Sploh me ne bi spodbudilo, 2 - Ne bi me spodbudilo, 3 - Niti niti, 4 - Bi me spodbudilo, 5 - Zelo bi me spodbudilo]

Q19: Kateri nefinančni spodbudni ukrepi bi vas spodbudili k uporabi mobilnega bančništva? [1 - Sploh me ne bi spodbudilo, 2 - Ne bi me spodbudilo, 3 - Niti niti, 4 - Bi me spodbudilo, 5 - Zelo bi me spodbudilo]

Q41: Spol [1 - Ženska, 2 - Moški, 3 - Ne želim odgovoriti]

Q42: Prosimo, vpišite leto rojstva. [Odprto besedilo]

Q43: S kom živite v istem gospodinjstvu? [Izbira več možnih odgovorov]

Q44: Koliko prebivalcev živi v kraju, kjer prebivate? [1 - Manj kot 1.000 prebivalcev, 2 - 1.000–5.000 prebivalcev, 3 - 5.001–20.000 prebivalcev, 4 - 20.001–50.000 prebivalcev, 5 - 50.001–100.000 prebivalcev, 6 - Več kot 100.000 prebivalcev]

Q45: Katera je vaša primarna banka? [1 - Nova Ljubljanska Banka d.d. (NLB), 2 - BKS Bank AG, Bančna podružnica, 3 - OTP banka d.d., 4 - Banka Sparkasse d.d., 5 - Banka Intesa Sanpaolo d.d., 6 - UniCredit Banka Slovenija d.d., 7 - Revolut, 8 - N26, 9 - Gorenjska Banka d.d., 10 - Deželna Banka Slovenije d.d., 11 - Addiko Bank d.d., 12 - Delavska Hranilnica d.d., 13 - Drugo]

Q46: Kakšna je vaša trenutna zaposlitev? [1 - Študent/ka, 2 - Redno zaposlen/a, 3 - Upokojen/a, 4 - Brezposeln/a, 5 - Samozaposlen/a, 6 - Delno zaposlen/a]

Q47: Kakšen je vaš mesečni neto prihodek? [1 - Pod 1.000€, 2 - 1.000€–1.500€, 3 - 1.501€–2.000€, 4 - 2.001€–3.000€, 5 - 3.001€–5.000€, 6 - 5.001€–10.000€, 7 - Nad 10.000€, 8 - Ne želim odgovoriti]

Q48: Kakšna je vaša stopnja izobrazbe? [1 - Nedokončana osnovna šola, 2 - Dokončana osnovna šola, 3 - Dokončana nižja ali srednja poklicna izobrazba, 4 - Dokončana srednja strokovna ali splošna izobrazba, 5 - Dokončana višješolska strokovna ali visokošolska strokovna izobrazba (tudi 1. bolonjska stopnja), 6 - Dokončana visokošolska strokovna ali univerzitetna izobrazba (tudi 2. bolonjska stopnja), 7 - Dokončana specializacija, znanstveni magisterij ali doktorat]

Q49: Kako ste rešili anketo? [1 - Preko linka, 2 - Na tablici, 3 - Na listu papirja]

Merging data

# Keep only specific columns (replace with actual column names)

data1_log_reg <- mydata_excel1 %>% select(Q1, Q41, Q42, Q43a, Q43b, Q43c, Q43d, Q43e, Q43f, Q43g, Q43h, Q44, Q45, Q46, Q47, Q48)

data2_log_reg <- mydata_excel2 %>% select(Q1, Q41, Q42, Q43a, Q43b, Q43c, Q43d, Q43e, Q43f, Q43g, Q43h, Q44, Q45, Q46, Q47, Q48)
merged_data_log_reg <- rbind(data1_log_reg, data2_log_reg)

K-Means Clustering

# Rename columns using colnames()
colnames(merged_data_log_reg) <- c("Ali uporabljate mobilno aplikacijo",
                                   "Spol","Leto rojstva","Živim s starši",
                                   "Živim s sorojenci","Živim s sorodniki",
                                   "Živim s partnerko/jem", "Živim z otroki", 
                                   "Živim s skrbnikom", "Živim sam/a", "Drugo", 
                                   "Št. prebivalcev v kraju bivanja", "Primarna banka", 
                                   "Trenutna zaposlitev", "Mesečni neto prihodek", "Stopnja izobrazbe")

# Remove the first row
merged_data_log_reg1 <- merged_data_log_reg[-1, ]

categorical_vars <- c("Ali uporabljate mobilno aplikacijo", "Spol", 
                      "Živim s starši", "Živim s sorojenci", 
                      "Živim s sorodniki", "Živim s partnerko/jem", 
                      "Živim z otroki", "Živim s skrbnikom", 
                      "Živim sam/a", "Drugo", "Primarna banka", 
                      "Trenutna zaposlitev", "Stopnja izobrazbe")

merged_data_log_reg1[categorical_vars] <- lapply(merged_data_log_reg1[categorical_vars], as.factor)

# Convert numerical variables to numeric
numeric_vars <- c("Leto rojstva", "Št. prebivalcev v kraju bivanja", "Mesečni neto prihodek")
merged_data_log_reg1[numeric_vars] <- lapply(merged_data_log_reg1[numeric_vars], as.numeric)

merged_data_log_reg1 <- as.data.frame(merged_data_log_reg1)

head(merged_data_log_reg1)
##   Ali uporabljate mobilno aplikacijo Spol Leto rojstva Živim s starši
## 1                                  1    2         2000              1
## 2                                  1    2         1998              0
## 3                                  1    2         2001              1
## 4                                  1    2         1994              0
## 5                                  1    2         2000              1
## 6                                  1    2         2004              1
##   Živim s sorojenci Živim s sorodniki Živim s partnerko/jem Živim z otroki
## 1                 0                 0                     0              0
## 2                 0                 0                     1              0
## 3                 0                 0                     0              0
## 4                 0                 0                     1              0
## 5                 1                 0                     0              0
## 6                 0                 0                     0              0
##   Živim s skrbnikom Živim sam/a Drugo Št. prebivalcev v kraju bivanja
## 1                 0           0     0                               2
## 2                 0           0     0                               6
## 3                 0           0     0                               2
## 4                 0           0     0                               6
## 5                 0           0     0                               2
## 6                 0           0     0                               3
##   Primarna banka Trenutna zaposlitev Mesečni neto prihodek Stopnja izobrazbe
## 1              3                   2                     2                 2
## 2              3                   2                     3                 4
## 3              1                   2                     2                 3
## 4             12                   5                     6                 5
## 5              1                   2                     3                 6
## 6              1                   1                     8                 4
merged_data_clean <- merged_data_log_reg1 %>%
  filter(complete.cases(.))  # Removes rows with any NAs
# Automatically calculate age and assign age groups
merged_data_clean <- merged_data_clean %>%
  mutate(
    `Leto rojstva` = as.numeric(as.character(`Leto rojstva`)),  # Convert to numeric (handles factors & characters)
    age = as.numeric(format(Sys.Date(), "%Y")) - `Leto rojstva`, # Calculate age
    age_group = case_when(
      age <= 27 ~ "Young",
      age > 27 & age <= 65 ~ "Professional",
      age > 65 ~ "Older"))

# Ensure output is a data frame (not a tibble)
merged_data_clean <- as.data.frame(merged_data_clean)

head(merged_data_clean)
##   Ali uporabljate mobilno aplikacijo Spol Leto rojstva Živim s starši
## 1                                  1    2         2000              1
## 2                                  1    2         1998              0
## 3                                  1    2         2001              1
## 4                                  1    2         1994              0
## 5                                  1    2         2000              1
## 6                                  1    2         2004              1
##   Živim s sorojenci Živim s sorodniki Živim s partnerko/jem Živim z otroki
## 1                 0                 0                     0              0
## 2                 0                 0                     1              0
## 3                 0                 0                     0              0
## 4                 0                 0                     1              0
## 5                 1                 0                     0              0
## 6                 0                 0                     0              0
##   Živim s skrbnikom Živim sam/a Drugo Št. prebivalcev v kraju bivanja
## 1                 0           0     0                               2
## 2                 0           0     0                               6
## 3                 0           0     0                               2
## 4                 0           0     0                               6
## 5                 0           0     0                               2
## 6                 0           0     0                               3
##   Primarna banka Trenutna zaposlitev Mesečni neto prihodek Stopnja izobrazbe
## 1              3                   2                     2                 2
## 2              3                   2                     3                 4
## 3              1                   2                     2                 3
## 4             12                   5                     6                 5
## 5              1                   2                     3                 6
## 6              1                   1                     8                 4
##   age    age_group
## 1  25        Young
## 2  27        Young
## 3  24        Young
## 4  31 Professional
## 5  25        Young
## 6  21        Young
# Add an ID column (sequential numbering)
merged_data_clean <- merged_data_clean %>%
  mutate(ID = row_number())

# Ensure it's a data frame (not a tibble)
merged_data_clean <- as.data.frame(merged_data_clean)

# Display the first 6 rows to check
head(merged_data_clean)
##   Ali uporabljate mobilno aplikacijo Spol Leto rojstva Živim s starši
## 1                                  1    2         2000              1
## 2                                  1    2         1998              0
## 3                                  1    2         2001              1
## 4                                  1    2         1994              0
## 5                                  1    2         2000              1
## 6                                  1    2         2004              1
##   Živim s sorojenci Živim s sorodniki Živim s partnerko/jem Živim z otroki
## 1                 0                 0                     0              0
## 2                 0                 0                     1              0
## 3                 0                 0                     0              0
## 4                 0                 0                     1              0
## 5                 1                 0                     0              0
## 6                 0                 0                     0              0
##   Živim s skrbnikom Živim sam/a Drugo Št. prebivalcev v kraju bivanja
## 1                 0           0     0                               2
## 2                 0           0     0                               6
## 3                 0           0     0                               2
## 4                 0           0     0                               6
## 5                 0           0     0                               2
## 6                 0           0     0                               3
##   Primarna banka Trenutna zaposlitev Mesečni neto prihodek Stopnja izobrazbe
## 1              3                   2                     2                 2
## 2              3                   2                     3                 4
## 3              1                   2                     2                 3
## 4             12                   5                     6                 5
## 5              1                   2                     3                 6
## 6              1                   1                     8                 4
##   age    age_group ID
## 1  25        Young  1
## 2  27        Young  2
## 3  24        Young  3
## 4  31 Professional  4
## 5  25        Young  5
## 6  21        Young  6
# Convert a specific column to numeric

merged_data_clean <- merged_data_clean %>%
  mutate(across(c(`Živim s starši`, `Živim s sorojenci`, `Živim s sorodniki`, `Živim s partnerko/jem`, `Živim z otroki`, `Živim s skrbnikom`, `Živim sam/a`, `Drugo`, `age`), 
                ~ as.numeric(as.character(.))))

Logistic Regression

merged_data_clean <- merged_data_clean %>%
  mutate(`Ali uporabljate mobilno aplikacijo` = ifelse(`Ali uporabljate mobilno aplikacijo` == 1, 0, ifelse(!1 == 2, 1, V1)))

head(merged_data_clean)
##   Ali uporabljate mobilno aplikacijo Spol Leto rojstva Živim s starši
## 1                                  0    2         2000              1
## 2                                  0    2         1998              0
## 3                                  0    2         2001              1
## 4                                  0    2         1994              0
## 5                                  0    2         2000              1
## 6                                  0    2         2004              1
##   Živim s sorojenci Živim s sorodniki Živim s partnerko/jem Živim z otroki
## 1                 0                 0                     0              0
## 2                 0                 0                     1              0
## 3                 0                 0                     0              0
## 4                 0                 0                     1              0
## 5                 1                 0                     0              0
## 6                 0                 0                     0              0
##   Živim s skrbnikom Živim sam/a Drugo Št. prebivalcev v kraju bivanja
## 1                 0           0     0                               2
## 2                 0           0     0                               6
## 3                 0           0     0                               2
## 4                 0           0     0                               6
## 5                 0           0     0                               2
## 6                 0           0     0                               3
##   Primarna banka Trenutna zaposlitev Mesečni neto prihodek Stopnja izobrazbe
## 1              3                   2                     2                 2
## 2              3                   2                     3                 4
## 3              1                   2                     2                 3
## 4             12                   5                     6                 5
## 5              1                   2                     3                 6
## 6              1                   1                     8                 4
##   age    age_group ID
## 1  25        Young  1
## 2  27        Young  2
## 3  24        Young  3
## 4  31 Professional  4
## 5  25        Young  5
## 6  21        Young  6
fit0 <- glm(`Ali uporabljate mobilno aplikacijo` ~ 1, #Dependent and explanatory variables
            family = binomial, #Binary logistic regression
            data = merged_data_clean)

summary(fit0)
## 
## Call:
## glm(formula = `Ali uporabljate mobilno aplikacijo` ~ 1, family = binomial, 
##     data = merged_data_clean)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.33798    0.09521   -3.55 0.000385 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 616.6  on 453  degrees of freedom
## Residual deviance: 616.6  on 453  degrees of freedom
## AIC: 618.6
## 
## Number of Fisher Scoring iterations: 4
exp(cbind(odds = fit0$coefficients, confint.default(fit0))) #Odds for Y=1
##                  odds     2.5 %    97.5 %
## (Intercept) 0.7132075 0.5917995 0.8595225
head(fitted(fit0)) #Estimated probability for Y=1
##         1         2         3         4         5         6 
## 0.4162996 0.4162996 0.4162996 0.4162996 0.4162996 0.4162996
fit1 <- glm(`Ali uporabljate mobilno aplikacijo` ~ age_group,  
            family = binomial, 
            data = merged_data_clean)

summary(fit1)
## 
## Call:
## glm(formula = `Ali uporabljate mobilno aplikacijo` ~ age_group, 
##     family = binomial, data = merged_data_clean)
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             1.5506     0.2942   5.270 1.36e-07 ***
## age_groupProfessional  -1.6854     0.3254  -5.180 2.22e-07 ***
## age_groupYoung         -3.2341     0.3635  -8.896  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 616.6  on 453  degrees of freedom
## Residual deviance: 505.7  on 451  degrees of freedom
## AIC: 511.7
## 
## Number of Fisher Scoring iterations: 3
exp(cbind(odds = fit1$coefficients, confint.default(fit1))) #Odds for Y=1
##                             odds      2.5 %     97.5 %
## (Intercept)           4.71428558 2.64836954 8.39176261
## age_groupProfessional 0.18536719 0.09796180 0.35075912
## age_groupYoung        0.03939394 0.01931873 0.08033047
head(fitted(fit1)) #Estimated probability for Y=1
##         1         2         3         4         5         6 
## 0.1566265 0.1566265 0.1566265 0.4663462 0.1566265 0.1566265
#Ifelse za vprašanje s kom živijo

merged_data_clean <- merged_data_clean %>%
  mutate(
    # Create a text column listing all people they live with
    S_kom_živijo = paste0(
      ifelse(`Živim s starši` == 1, "Živim s starši, ", ""),
      ifelse(`Živim s sorojenci` == 1, "Živim s sorojenci, ", ""),
      ifelse(`Živim s sorodniki` == 1, "Živim s sorodniki, ", ""),
      ifelse(`Živim s partnerko/jem` == 1, "Živim s partnerko/jem, ", ""),
      ifelse(`Živim z otroki` == 1, "Živim z otroki, ", ""),
      ifelse(`Živim s skrbnikom` == 1, "Živim s skrbnikom, ", ""),
      ifelse(`Drugo` == 1, "Drugo, ", "")
    ),
    S_kom_živijo = sub(", $", "", S_kom_živijo), # Remove trailing comma

    # Categorize into "Lives alone" or "Lives with others"
    Kako_živijo = ifelse(`Živim sam/a` == 1, "Živi sam", "Živi z drugimi")
  )
anova(fit0, fit1, test = "Chi") #Comparision of models based on -2LL statistics
## Analysis of Deviance Table
## 
## Model 1: `Ali uporabljate mobilno aplikacijo` ~ 1
## Model 2: `Ali uporabljate mobilno aplikacijo` ~ age_group
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       453      616.6                          
## 2       451      505.7  2   110.89 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(cbind(OR = fit1$coefficients, confint.default(fit1))) #Odds ratio for Y=1 (with 95% CI)
##                               OR      2.5 %     97.5 %
## (Intercept)           4.71428558 2.64836954 8.39176261
## age_groupProfessional 0.18536719 0.09796180 0.35075912
## age_groupYoung        0.03939394 0.01931873 0.08033047
merged_data_clean <- merged_data_clean %>%
  mutate(across(c(`Št. prebivalcev v kraju bivanja`, `Trenutna zaposlitev`, `Mesečni neto prihodek`, `Stopnja izobrazbe`), 
                ~ as.numeric(as.character(.))))

str(merged_data_clean)
## 'data.frame':    454 obs. of  21 variables:
##  $ Ali uporabljate mobilno aplikacijo: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Spol                              : Factor w/ 4 levels "-3","1","2","3": 3 3 3 3 3 3 2 3 3 2 ...
##  $ Leto rojstva                      : num  2000 1998 2001 1994 2000 ...
##  $ Živim s starši                    : num  1 0 1 0 1 1 0 0 0 0 ...
##  $ Živim s sorojenci                 : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ Živim s sorodniki                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Živim s partnerko/jem             : num  0 1 0 1 0 0 1 0 1 1 ...
##  $ Živim z otroki                    : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ Živim s skrbnikom                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Živim sam/a                       : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ Drugo                             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Št. prebivalcev v kraju bivanja   : num  2 6 2 6 2 3 6 6 1 4 ...
##  $ Primarna banka                    : Factor w/ 14 levels "-3","1","10",..: 8 8 2 5 2 2 11 7 2 7 ...
##  $ Trenutna zaposlitev               : num  2 2 2 5 2 1 2 1 2 1 ...
##  $ Mesečni neto prihodek             : num  2 3 2 6 3 8 4 6 4 2 ...
##  $ Stopnja izobrazbe                 : num  2 4 3 5 6 4 6 1 6 5 ...
##  $ age                               : num  25 27 24 31 25 21 28 23 39 23 ...
##  $ age_group                         : chr  "Young" "Young" "Young" "Professional" ...
##  $ ID                                : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ S_kom_živijo                      : chr  "Živim s starši" "Živim s partnerko/jem" "Živim s starši" "Živim s partnerko/jem" ...
##  $ Kako_živijo                       : chr  "Živi z drugimi" "Živi z drugimi" "Živi z drugimi" "Živi z drugimi" ...
fit2 <- glm(`Ali uporabljate mobilno aplikacijo` ~ age_group + `Trenutna zaposlitev` + `Stopnja izobrazbe`,  
            family = binomial, 
            data = merged_data_clean)

summary(fit2)
## 
## Call:
## glm(formula = `Ali uporabljate mobilno aplikacijo` ~ age_group + 
##     `Trenutna zaposlitev` + `Stopnja izobrazbe`, family = binomial, 
##     data = merged_data_clean)
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            2.64298    0.50628   5.220 1.79e-07 ***
## age_groupProfessional -0.71711    0.36821  -1.948   0.0515 .  
## age_groupYoung        -2.51214    0.40758  -6.164 7.11e-10 ***
## `Trenutna zaposlitev`  0.07741    0.10220   0.757   0.4488    
## `Stopnja izobrazbe`   -0.44330    0.08969  -4.943 7.71e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 616.60  on 453  degrees of freedom
## Residual deviance: 475.35  on 449  degrees of freedom
## AIC: 485.35
## 
## Number of Fisher Scoring iterations: 5
exp(cbind(odds = fit2$coefficients, confint.default(fit2))) #Odds for Y=1
##                              odds     2.5 %     97.5 %
## (Intercept)           14.05507453 5.2105773 37.9123286
## age_groupProfessional  0.48816000 0.2372152  1.0045737
## age_groupYoung         0.08109449 0.0364804  0.1802699
## `Trenutna zaposlitev`  1.08048242 0.8843581  1.3201013
## `Stopnja izobrazbe`    0.64191150 0.5384305  0.7652805
head(fitted(fit2)) #Estimated probability for Y=1
##          1          2          3          4          5          6 
## 0.35412612 0.18428812 0.26032964 0.52407772 0.08516365 0.17293487
fit3 <- glm(`Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja`,  
            family = binomial, 
            data = merged_data_clean)

summary(fit3)
## 
## Call:
## glm(formula = `Ali uporabljate mobilno aplikacijo` ~ age_group + 
##     `Št. prebivalcev v kraju bivanja`, family = binomial, data = merged_data_clean)
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        1.95622    0.33318   5.871 4.32e-09 ***
## age_groupProfessional             -1.18740    0.34527  -3.439 0.000584 ***
## age_groupYoung                    -2.80997    0.37710  -7.452 9.23e-14 ***
## `Št. prebivalcev v kraju bivanja` -0.24141    0.05459  -4.422 9.76e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 616.60  on 453  degrees of freedom
## Residual deviance: 484.64  on 450  degrees of freedom
## AIC: 492.64
## 
## Number of Fisher Scoring iterations: 4
exp(cbind(odds = fit0$coefficients, confint.default(fit3))) #Odds for Y=1
##                                        odds      2.5 %     97.5 %
## (Intercept)                       0.7132075 3.68101588 13.5888375
## age_groupProfessional             0.7132075 0.15503284  0.6000846
## age_groupYoung                    0.7132075 0.02875141  0.1260762
## `Št. prebivalcev v kraju bivanja` 0.7132075 0.70581729  0.8742236
head(fitted(fit3)) #Estimated probability for Y=1
##          1          2          3          4          5          6 
## 0.20807504 0.09094061 0.20807504 0.33634239 0.20807504 0.17108220
fit4 <- glm(`Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` + `Mesečni neto prihodek` + `Kako_živijo`,
            family = binomial, 
            data = merged_data_clean)
head(merged_data_clean[, c("age_group", "Št. prebivalcev v kraju bivanja", "Mesečni neto prihodek", "Kako_živijo")])
##      age_group Št. prebivalcev v kraju bivanja Mesečni neto prihodek
## 1        Young                               2                     2
## 2        Young                               6                     3
## 3        Young                               2                     2
## 4 Professional                               6                     6
## 5        Young                               2                     3
## 6        Young                               3                     8
##      Kako_živijo
## 1 Živi z drugimi
## 2 Živi z drugimi
## 3 Živi z drugimi
## 4 Živi z drugimi
## 5 Živi z drugimi
## 6 Živi z drugimi
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:DescTools':
## 
##     Recode
## The following object is masked from 'package:dplyr':
## 
##     recode
vif(fit4)
##                                       GVIF Df GVIF^(1/(2*Df))
## age_group                         1.340321  2        1.075975
## `Št. prebivalcev v kraju bivanja` 1.059177  1        1.029163
## `Mesečni neto prihodek`           1.342169  1        1.158520
## Kako_živijo                       1.036132  1        1.017906

All VIF values are close to 1, meaning there is no serious multicollinearity in our model. Our predictors are independent enough to be used together.

summary(fit4)
## 
## Call:
## glm(formula = `Ali uporabljate mobilno aplikacijo` ~ age_group + 
##     `Št. prebivalcev v kraju bivanja` + `Mesečni neto prihodek` + 
##     Kako_živijo, family = binomial, data = merged_data_clean)
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        2.83274    0.52146   5.432 5.56e-08 ***
## age_groupProfessional             -0.89742    0.38778  -2.314   0.0207 *  
## age_groupYoung                    -3.25271    0.43764  -7.432 1.07e-13 ***
## `Št. prebivalcev v kraju bivanja` -0.14921    0.06377  -2.340   0.0193 *  
## `Mesečni neto prihodek`           -0.48453    0.08232  -5.886 3.96e-09 ***
## Kako_živijoŽivi z drugimi          0.06792    0.31249   0.217   0.8279    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 616.60  on 453  degrees of freedom
## Residual deviance: 437.01  on 448  degrees of freedom
## AIC: 449.01
## 
## Number of Fisher Scoring iterations: 5
exp(cbind(odds = fit4$coefficients, confint.default(fit4))) #Odds for Y=1
##                                          odds     2.5 %      97.5 %
## (Intercept)                       16.99197228 6.1147486 47.21815095
## age_groupProfessional              0.40761833 0.1906221  0.87163395
## age_groupYoung                     0.03866918 0.0164000  0.09117718
## `Št. prebivalcev v kraju bivanja`  0.86138858 0.7601860  0.97606417
## `Mesečni neto prihodek`            0.61598806 0.5242040  0.72384281
## Kako_živijoŽivi z drugimi          1.07027620 0.5801038  1.97463141
head(fitted(fit4)) #Estimated probability for Y=1
##           1           2           3           4           5           6 
## 0.165269812 0.062920556 0.165269812 0.141949708 0.108703138 0.009231083
fit5 <- glm(`Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` + `Trenutna zaposlitev` + `Mesečni neto prihodek` + `Stopnja izobrazbe` + `Kako_živijo`,
            family = binomial, 
            data = merged_data_clean)

summary(fit5)
## 
## Call:
## glm(formula = `Ali uporabljate mobilno aplikacijo` ~ age_group + 
##     `Št. prebivalcev v kraju bivanja` + `Trenutna zaposlitev` + 
##     `Mesečni neto prihodek` + `Stopnja izobrazbe` + Kako_živijo, 
##     family = binomial, data = merged_data_clean)
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        3.36990    0.74444   4.527 5.99e-06 ***
## age_groupProfessional             -0.67280    0.40141  -1.676   0.0937 .  
## age_groupYoung                    -2.99739    0.48179  -6.221 4.93e-10 ***
## `Št. prebivalcev v kraju bivanja` -0.12621    0.06518  -1.936   0.0528 .  
## `Trenutna zaposlitev`              0.06624    0.11807   0.561   0.5748    
## `Mesečni neto prihodek`           -0.43289    0.08371  -5.171 2.33e-07 ***
## `Stopnja izobrazbe`               -0.22244    0.11139  -1.997   0.0458 *  
## Kako_živijoŽivi z drugimi          0.01338    0.31648   0.042   0.9663    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 616.60  on 453  degrees of freedom
## Residual deviance: 432.72  on 446  degrees of freedom
## AIC: 448.72
## 
## Number of Fisher Scoring iterations: 6
anova(fit0, fit1, test = "Chi") #Comparision of models based on -2LL statistics
## Analysis of Deviance Table
## 
## Model 1: `Ali uporabljate mobilno aplikacijo` ~ 1
## Model 2: `Ali uporabljate mobilno aplikacijo` ~ age_group
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       453      616.6                          
## 2       451      505.7  2   110.89 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(fit1, fit2, test = "Chi") #Comparision of models based on -2LL statistics
## Analysis of Deviance Table
## 
## Model 1: `Ali uporabljate mobilno aplikacijo` ~ age_group
## Model 2: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Trenutna zaposlitev` + 
##     `Stopnja izobrazbe`
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       451     505.70                          
## 2       449     475.35  2   30.347 2.571e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(fit2, fit3, test = "Chi") #Comparision of models based on -2LL statistics
## Analysis of Deviance Table
## 
## Model 1: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Trenutna zaposlitev` + 
##     `Stopnja izobrazbe`
## Model 2: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja`
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)   
## 1       449     475.35                        
## 2       450     484.64 -1  -9.2898 0.002304 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(fit3, fit4, test = "Chi") #Comparision of models based on -2LL statistics
## Analysis of Deviance Table
## 
## Model 1: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja`
## Model 2: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` + 
##     `Mesečni neto prihodek` + Kako_živijo
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       450     484.64                          
## 2       448     437.01  2   47.633 4.536e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(fit4, fit5, test = "Chi") #Comparision of models based on -2LL statistics
## Analysis of Deviance Table
## 
## Model 1: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` + 
##     `Mesečni neto prihodek` + Kako_živijo
## Model 2: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` + 
##     `Trenutna zaposlitev` + `Mesečni neto prihodek` + `Stopnja izobrazbe` + 
##     Kako_živijo
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       448     437.01                     
## 2       446     432.72  2   4.2928   0.1169

Interpretation:

Glede na teste je najboljši fit4:

Ali uporabljate mobilno aplikacijo~age_group+Št. prebivalcev v kraju bivanja+Mesečni neto prihodek` + Kako_živijo

# Extract standardized residuals and Cook's distance from fit4
merged_data_clean$StdResid <- rstandard(fit4)   # Standardized residuals
merged_data_clean$Cook <- cooks.distance(fit4)  # Cook's distance
merged_data_clean$StdResid <- rstandard(fit4)
merged_data_clean$CooksD <- cooks.distance(fit4)

library(ggplot2)
StdResid <- ggplot(merged_data_clean, aes(x=StdResid)) +
             theme_linedraw() +
             geom_histogram() +
             xlab("Standardized residuals")

library(ggplot2)
Cook <- ggplot(merged_data_clean, aes(x=CooksD)) +
         theme_linedraw() +
         geom_histogram() +
         xlab("Cook's distances")

ggarrange(StdResid, Cook,
          ncol = 2, nrow = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

head(merged_data_clean[order(merged_data_clean$StdResid), c("ID", "StdResid")], 5)
##      ID  StdResid
## 191 191 -2.105705
## 232 232 -2.066293
## 240 240 -1.881493
## 264 264 -1.792185
## 254 254 -1.779406
head(merged_data_clean[order(-merged_data_clean$StdResid), c("ID", "StdResid")], 5)
##      ID StdResid
## 440 440 2.916655
## 313 313 2.505523
## 297 297 2.358535
## 394 394 2.256942
## 446 446 2.194612
head(merged_data_clean[order(-merged_data_clean$CooksD), c("ID", "CooksD")], 10)
##      ID     CooksD
## 440 440 0.04439633
## 191 191 0.02805883
## 313 313 0.02379869
## 240 240 0.02354029
## 232 232 0.02054852
## 446 446 0.01904230
## 380 380 0.01837936
## 394 394 0.01827372
## 296 296 0.01789714
## 322 322 0.01789714

We have 3 high impact values and 1 potential outlier. So we dropped those values.

library(dplyr)
merged_data_clean <- merged_data_clean %>%
  filter(ID != 440 & ID != 191 & ID != 313 & ID != 232)

Let’s run the fits again and check if they’re better.

fit4 <- glm(`Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` + `Mesečni neto prihodek` + `Kako_živijo`,
            family = binomial, 
            data = merged_data_clean)

summary(fit4)
## 
## Call:
## glm(formula = `Ali uporabljate mobilno aplikacijo` ~ age_group + 
##     `Št. prebivalcev v kraju bivanja` + `Mesečni neto prihodek` + 
##     Kako_živijo, family = binomial, data = merged_data_clean)
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        3.32703    0.58135   5.723 1.05e-08 ***
## age_groupProfessional             -1.13377    0.42267  -2.682  0.00731 ** 
## age_groupYoung                    -3.75194    0.48745  -7.697 1.39e-14 ***
## `Št. prebivalcev v kraju bivanja` -0.15270    0.06632  -2.303  0.02130 *  
## `Mesečni neto prihodek`           -0.57067    0.09076  -6.287 3.23e-10 ***
## Kako_živijoŽivi z drugimi          0.10018    0.32367   0.309  0.75694    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 610.94  on 449  degrees of freedom
## Residual deviance: 412.17  on 444  degrees of freedom
## AIC: 424.17
## 
## Number of Fisher Scoring iterations: 6
fit5 <- glm(`Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` + `Trenutna zaposlitev` + `Mesečni neto prihodek` + `Stopnja izobrazbe` + `Kako_živijo`,
            family = binomial, 
            data = merged_data_clean)

summary(fit5)
## 
## Call:
## glm(formula = `Ali uporabljate mobilno aplikacijo` ~ age_group + 
##     `Št. prebivalcev v kraju bivanja` + `Trenutna zaposlitev` + 
##     `Mesečni neto prihodek` + `Stopnja izobrazbe` + Kako_živijo, 
##     family = binomial, data = merged_data_clean)
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        3.99040    0.83581   4.774 1.80e-06 ***
## age_groupProfessional             -0.90296    0.43243  -2.088   0.0368 *  
## age_groupYoung                    -3.49294    0.53299  -6.553 5.62e-11 ***
## `Št. prebivalcev v kraju bivanja` -0.12915    0.06771  -1.907   0.0565 .  
## `Trenutna zaposlitev`              0.07230    0.12380   0.584   0.5592    
## `Mesečni neto prihodek`           -0.51303    0.09142  -5.612 2.00e-08 ***
## `Stopnja izobrazbe`               -0.25354    0.11856  -2.139   0.0325 *  
## Kako_živijoŽivi z drugimi          0.03498    0.32861   0.106   0.9152    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 610.94  on 449  degrees of freedom
## Residual deviance: 407.18  on 442  degrees of freedom
## AIC: 423.18
## 
## Number of Fisher Scoring iterations: 6

And let’s run anova again:

anova(fit4, fit5, test = "Chi") #Comparision of models based on -2LL statistics
## Analysis of Deviance Table
## 
## Model 1: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` + 
##     `Mesečni neto prihodek` + Kako_živijo
## Model 2: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` + 
##     `Trenutna zaposlitev` + `Mesečni neto prihodek` + `Stopnja izobrazbe` + 
##     Kako_živijo
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       444     412.17                       
## 2       442     407.18  2   4.9891  0.08253 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Interpretation:

Fit4 if still better and we fixed it a bit.

# A data frame for coefficients of fit4

coeff_data <- data.frame(
  Variable = c("Intercept", "age_groupProfessional", "age_groupYoung",
               "Št. prebivalcev v kraju bivanja", "Mesečni neto prihodek", "Kako_živijoŽivi z drugimi"),
  Estimate = c(-3.74939, 1.27803, 4.01414, 0.15967, 0.63420, -0.03244),
  Std_Error = c(0.62769, 0.44997, 0.52222, 0.06804, 0.09739, 0.33023),
  Z_Value = c(-5.973, 2.840, 7.687, 2.347, 6.512, -0.098),
  P_Value = c(2.32e-09, 0.00451, 1.51e-14, 0.01894, 7.40e-11, 0.92174))

head(coeff_data)
##                          Variable Estimate Std_Error Z_Value    P_Value
## 1                       Intercept -3.74939   0.62769  -5.973 2.3200e-09
## 2           age_groupProfessional  1.27803   0.44997   2.840 4.5100e-03
## 3                  age_groupYoung  4.01414   0.52222   7.687 1.5100e-14
## 4 Št. prebivalcev v kraju bivanja  0.15967   0.06804   2.347 1.8940e-02
## 5           Mesečni neto prihodek  0.63420   0.09739   6.512 7.4000e-11
## 6       Kako_živijoŽivi z drugimi -0.03244   0.33023  -0.098 9.2174e-01
# Ensure the dependent variable is a factor (assuming LatePayF is the outcome variable)
merged_data_clean$`Ali uporabljate mobilno aplikacijo` <- as.factor(merged_data_clean$`Ali uporabljate mobilno aplikacijo`)

# Fit logistic regression model
fit4 <- glm(`Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` + `Mesečni neto prihodek` + `Kako_živijo`,
            family = binomial, 
            data = merged_data_clean)

# Generate predicted probabilities (Estimated Probability)
merged_data_clean <- merged_data_clean %>%
  mutate(EstProb = predict(fit4, type = "response"))

# Classify based on probability threshold (0.50)
merged_data_clean <- merged_data_clean %>%
  mutate(Classification = ifelse(EstProb < 0.50, "NO", "YES"),
         ClassificationF = factor(Classification, levels = c("NO", "YES")))

# Create the classification table
ClassificationTable <- table(merged_data_clean$`Ali uporabljate mobilno aplikacijo`, merged_data_clean$ClassificationF)

# Display classification table
print(ClassificationTable)
##    
##      NO YES
##   0 223  40
##   1  62 125
# Compute Pseudo R² (Proportion of correctly classified cases)
Pseudo_R2_fit4 <- sum(diag(ClassificationTable)) / nrow(merged_data_clean)

# Display Pseudo R²
Pseudo_R2_fit4
## [1] 0.7733333

Conclusion Based on Our Analysis

Significant Predictors

  • Age: A positive and highly significant effect on mobile app usage (p < 0.001). This suggests that as age increases, the likelihood of using a mobile application also increases.

  • Population of Place of Residence: A negative and significant impact (p < 0.05), implying that individuals from larger population areas are less likely to use mobile applications.

  • Monthly Net Income: Also negatively associated with app usage (p < 0.001), indicating that individuals with higher incomes are less likely to use mobile applications.

Non-Significant Predictors

  • Living Situation (“Kako živijo”): The variable was not statistically significant, meaning that whether someone lives alone or with others does not strongly predict mobile app usage.

  • Employment Status and Education Level: When added in Model 2 (fit5), these factors did not significantly improve model performance (p = 0.1458 in the Chi-square test), suggesting they may not be strong predictors.

Model Comparison and Performance

  • Fit4 vs. Fit5: The simpler model (fit4) performed slightly better, and adding employment status and education level did not significantly improve the fit.

  • Classification Performance: The logistic regression model achieved 78.29% accuracy, indicating a good ability to distinguish between app users and non-users.

  • Potential Issues: The warning message (glm.fit: fitted probabilities numerically 0 or 1 occurred) suggests some separation in the data, meaning some observations were perfectly predicted, which could affect model reliability.

Coefficient interpretations of fit4

  • Since the intercept value is highly negative, it suggests that the baseline probability of using the app is quite low.

  • age_groupProfessional (1.27803, p = 0.00451): Professionals are significantly more likely to use the app compared to the reference group. The odds ratio (exp(1.27803) ≈ 3.59) means professionals are about 3.6 times more likely to use the app than the reference group.

  • age_groupYoung (4.01414, p < 0.001): Younger individuals are much more likely to use the app compared to the reference group. The odds ratio (exp(4.01414) ≈ 55.44) means young individuals are over 55 times more likely to use the app than the reference group.

  • Number of Residents in the Living Area (0.15967, p = 0.01894) A positive coefficient suggests that people from larger settlements are more likely to use the app. The odds ratio (exp(0.15967) ≈ 1.17) means that for each unit increase in population size, the odds of using the app
    increase by 17%.

  • Monthly Net Income (0.63420, p < 0.001) Higher income is strongly associated with app usage. The odds ratio (exp(0.63420) ≈ 1.89) suggests that for each unit increase in income, the odds of using the app nearly
    double.

Final Thoughts

  • The final model (fit4) provides a solid explanation of mobile app usage patterns.

  • The strong role of age, population size, and income aligns with expectations—older individuals and those in smaller towns tend to use apps more, while higher-income individuals might be less dependent on them.

  • Addressing the perfect separation issue (perhaps by regularization techniques like ridge regression or handling outliers) could enhance the model’s robustness.

  • Younger individuals are overwhelmingly more likely to use the mobile app.

  • Professionals also have significantly higher odds of using the app compared to the reference age group.

  • Higher income and living in a larger settlement are both associated with greater app usage.

  • Living arrangement does not significantly impact app usage.

#and we take a bow