Cleaning data
#install.packages
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
mydata_excel2 <- read_excel("~/Desktop/NLB projekt/Logistična regresija/anketa_končni podatki.xlsx")
mydata_excel2 <- mydata_excel2[-1, ] #Delete first row in which the questions are written
mydata_excel2$ID <- seq(1,nrow(mydata_excel2))
head(mydata_excel2)
## # A tibble: 6 × 129
## Q1 Q2 Q3 Q4 Q5 Q6a Q6b Q6c Q6d Q6e Q6f Q6g Q6h
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2 1 1 1 1 2 1 1 2 1 2 1 1
## 2 2 1 1 2 -2 -2 -2 -2 -2 -2 -2 -2 -2
## 3 2 1 1 1 2 4 5 5 4 3 5 5 4
## 4 2 1 1 1 2 -3 -3 -3 -3 -3 -3 -3 -3
## 5 2 1 1 1 2 2 3 4 4 1 2 2 3
## 6 2 1 1 1 2 3 2 5 1 1 1 1 1
## # ℹ 116 more variables: Q6i <chr>, Q6j <chr>, Q6j_text <chr>, Q7 <chr>,
## # `Podpora in svetovanje` <chr>,
## # `Zagotavljanje varnosti in zaščita vaših finančnih podatkov ter transakcij` <chr>,
## # `Transparentnost delovanja banke in njenih storitev` <chr>,
## # Priročnost <chr>, `Hitrost in zanesljivost opravljene storitve` <chr>,
## # Q8f <chr>, Q8f_text <chr>, `Poslovalnica_Podpora in svetovanje` <chr>,
## # `Mobilna banka_Podpora in svetovanje` <chr>, …
mydata2 <- mydata_excel2[!(apply(mydata_excel2 == -3, 1, any)), ]
mydata2 <- subset(mydata2, select = -c(Q21:Q40))
mydata2$ID <- seq(1,nrow(mydata2))
head(mydata2)
## # A tibble: 6 × 83
## Q1 Q2 Q3 Q4 Q5 Q6a Q6b Q6c Q6d Q6e Q6f Q6g Q6h
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2 1 1 1 1 2 1 1 2 1 2 1 1
## 2 2 1 1 2 -2 -2 -2 -2 -2 -2 -2 -2 -2
## 3 2 1 1 1 2 4 5 5 4 3 5 5 4
## 4 2 1 1 1 2 2 3 4 4 1 2 2 3
## 5 2 1 1 1 2 3 2 5 1 1 1 1 1
## 6 2 2 2 1 2 5 4 4 5 4 3 5 5
## # ℹ 70 more variables: Q6i <chr>, Q6j <chr>, Q6j_text <chr>, Q7 <chr>,
## # `Podpora in svetovanje` <chr>,
## # `Zagotavljanje varnosti in zaščita vaših finančnih podatkov ter transakcij` <chr>,
## # `Transparentnost delovanja banke in njenih storitev` <chr>,
## # Priročnost <chr>, `Hitrost in zanesljivost opravljene storitve` <chr>,
## # Q8f <chr>, Q8f_text <chr>, `Poslovalnica_Podpora in svetovanje` <chr>,
## # `Mobilna banka_Podpora in svetovanje` <chr>, …
mydata2$Q2 <- factor(mydata2$Q2,
levels = c(1, 2),
labels = c("Yes","No"))
mydata2$Q3 <- factor(mydata2$Q3,
levels = c(1, 2),
labels = c("Yes","No"))
mydata2$Q4 <- factor(mydata2$Q4,
levels = c(1, 2, 3),
labels = c("Yes","No", "I don't know"))
mydata2$Q5 <- factor(mydata2$Q5,
levels = c(1, 2),
labels = c("Yes","No"))
mydata2$Q7 <- factor(mydata2$Q7,
levels = c(1, 2),
labels = c("Yes","No"))
mydata2$Q16a <- factor(mydata2$Q16a,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q16b <- factor(mydata2$Q16b,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q16c <- factor(mydata2$Q16c,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q16d <- factor(mydata2$Q16d,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q16e <- factor(mydata2$Q16e,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q16f <- factor(mydata2$Q16f,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q16g <- factor(mydata2$Q16g,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q41 <- factor(mydata2$Q41,
levels = c(1, 2, 3),
labels = c("Female","Male", "I don't want to answer"))
mydata2$Q43a <- factor(mydata2$Q43a,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q43b <- factor(mydata2$Q43b,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q43c <- factor(mydata2$Q43c,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q43d <- factor(mydata2$Q43d,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q43e <- factor(mydata2$Q43e,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q43f <- factor(mydata2$Q43f,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q43g <- factor(mydata2$Q43g,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q43h <- factor(mydata2$Q43h,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata2$Q44 <- factor(mydata2$Q44,
levels = c(2, 6, 1, 4, 3, 5),
labels = c("1.000 – 5.000 habitants","More than 100.000 habitants", "Less than 1.000 habitants", "20.001 – 50.000 habitants", "5.001 – 20.000 habitants", "50.001 – 100.000 habitants"))
mydata2$Q45 <- factor(mydata2$Q45,
levels = c(3, 5, 1, 9, 12, 7, 10, 4, 11, 6),
labels = c("OTP banka d.d.","Banka Intesa Sanpaolo d.d.", "Nova Ljubljanska Banka d.d. (NLB)", "Gorenjska Banka d.d.", "Delavska Hranilnica d.d.", "Revolut", "Deželna Banka Slovenije d.d.", "Banka Sparkasse d.d.", "Addiko Bank d.d.", "UniCredit Banka Slovenija d.d."))
mydata2$Q46 <- factor(mydata2$Q46,
levels = c(1, 2, 3, 5, 6, 4),
labels = c("Študent/-ka","Redno zaposlen/-a", "Upokojen/-a", "Samozaposlen/-a", "Delno zaposlen/-a", "Brezposeln/-a"))
mydata2$Q47 <- factor(mydata2$Q47,
levels = c(1, 5, 3, 8, 2, 4, 6, 7),
labels = c("Pod 1.000€","3.001€ - 5.000€", "1.501€ - 2.000€", "I don't want to answer", "1.000€ - 1.500€", "2.001€ - 3.000€", "5.001€ - 10.000€", "Above 10.000€"))
mydata2$Q48 <- factor(mydata2$Q48,
levels = c(2, 6, 3, 5, 7, 4),
labels = c("Dokončana osnovna šola","Dokončana visokošolska strokovna univerzitetna izobrazba (tudi 2. bolonjska stopnja)", "Dokončana nižja ali srednja poklicna izobrazba", "Dokončana višješolska strokovna ali visokošolska strokovna izobrazba (tudi 1. bolonjska stopnja)", "Dokončana specializacija, znanstveni magisterij, doktorat", "Dokončana srednja strokovna ali splošna izobrazba"))
mydata2$Q49 <- factor(mydata2$Q49,
levels = c(1, 2, 3),
labels = c("Preko linka","Na tablici", "Na listu papirja"))
mydata2[c(6:14, 18:22, 25:39, 48:55, 58:62)] <- mydata2[c(6:14, 18:22, 25:39, 48:55, 58:62)] %>% mutate_all(as.numeric)
mydata2 <- mydata2 %>% mutate(across(where(is.numeric), ~ replace(., . == -2, mean(.[. != -2], na.rm = TRUE))))
mydata2 <- mydata2 %>% mutate(across(where(is.numeric), ~ replace(., is.na(.), mean(., na.rm = TRUE))))
mydata2 <- mydata2 %>%
filter(!ID %in% c(2, 16, 17))
mydata2$ID <- seq(1, nrow(mydata2))
summary(mydata2[c(-1, -15, -16, -23, -24, -47, -56, -57, -63, -64, -66, -75, -78, -83)])
## Q2 Q3 Q4 Q5 Q6a
## Yes :121 Yes :131 Yes :103 Yes : 25 Min. :1.000
## No : 36 No : 26 No : 28 No :132 1st Qu.:3.000
## NA's: 1 NA's: 1 I don't know: 26 NA's: 1 Median :3.000
## NA's : 1 Mean :3.433
## 3rd Qu.:4.000
## Max. :5.000
##
## Q6b Q6c Q6d Q6e
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:1.000
## Median :4.000 Median :4.000 Median :4.000 Median :2.000
## Mean :3.535 Mean :3.847 Mean :3.854 Mean :2.688
## 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
##
## Q6f Q6g Q6h Q6i Q7
## Min. :1.000 Min. :1.000 Min. :1.00 Min. :1.000 Yes :82
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:2.00 1st Qu.:3.000 No :75
## Median :4.000 Median :3.086 Median :3.00 Median :4.000 NA's: 1
## Mean :3.497 Mean :3.172 Mean :2.93 Mean :3.465
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.00 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.00 Max. :5.000
##
## Podpora in svetovanje
## Min. :1.00
## 1st Qu.:3.00
## Median :4.00
## Mean :3.79
## 3rd Qu.:4.00
## Max. :5.00
##
## Zagotavljanje varnosti in zaščita vaših finančnih podatkov ter transakcij
## Min. :1.000
## 1st Qu.:4.000
## Median :5.000
## Mean :4.382
## 3rd Qu.:5.000
## Max. :5.000
##
## Transparentnost delovanja banke in njenih storitev Priročnost
## Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:4.000
## Median :4.000 Median :4.000
## Mean :4.108 Mean :3.975
## 3rd Qu.:5.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000
##
## Hitrost in zanesljivost opravljene storitve Poslovalnica_Podpora in svetovanje
## Min. :1.000 Min. :2.000
## 1st Qu.:4.000 1st Qu.:4.000
## Median :4.000 Median :4.000
## Mean :4.185 Mean :4.006
## 3rd Qu.:5.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000
##
## Mobilna banka_Podpora in svetovanje
## Min. :1.00
## 1st Qu.:3.00
## Median :3.00
## Mean :3.14
## 3rd Qu.:4.00
## Max. :5.00
##
## Poslovalnica_Zagotavljanje varnosti in zaščita vaših finančnih podatkov ter transakcij
## Min. :2.000
## 1st Qu.:3.986
## Median :4.000
## Mean :3.981
## 3rd Qu.:5.000
## Max. :5.000
##
## Mobilna banka_Zagotavljanje varnosti in zaščita vaših finančnih podatkov ter transakcij
## Min. :1.000
## 1st Qu.:2.000
## Median :3.000
## Mean :3.032
## 3rd Qu.:4.000
## Max. :5.000
##
## Poslovalnica_Transparentnost delovanja banke in njenih storitev
## Min. :1.000
## 1st Qu.:3.000
## Median :4.000
## Mean :3.898
## 3rd Qu.:4.000
## Max. :5.000
##
## Mobilna banka_Transparentnost delovanja banke in njenih storitev
## Min. :1.000
## 1st Qu.:3.000
## Median :3.000
## Mean :3.248
## 3rd Qu.:4.000
## Max. :5.000
##
## Poslovalnica_Priročnost Mobilna banka_Priročnost
## Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000
## Median :4.000 Median :3.000
## Mean :3.752 Mean :3.389
## 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000
##
## Poslovalnica_Hitrost in zanesljivost opravljene storitve
## Min. :1.000
## 1st Qu.:3.000
## Median :4.000
## Mean :3.701
## 3rd Qu.:4.000
## Max. :5.000
##
## Mobilna banka_Hitrost in zanesljivost opravljene storitve Q15a
## Min. :1.000 Min. :1
## 1st Qu.:3.000 1st Qu.:4
## Median :4.000 Median :4
## Mean :3.586 Mean :4
## 3rd Qu.:4.000 3rd Qu.:5
## Max. :5.000 Max. :5
##
## Q15b Q15c Q15d Q15e
## Min. :2.000 Min. :1.000 Min. :1.000 Min. :2.000
## 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:3.000 1st Qu.:4.000
## Median :4.000 Median :4.000 Median :4.000 Median :4.000
## Mean :4.134 Mean :4.115 Mean :3.809 Mean :4.127
## 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
##
## Q16a Q16b Q16c Q16d
## Selected :98 Selected :66 Selected :68 Selected :79
## Not selected:59 Not selected:91 Not selected:89 Not selected:78
## NA's : 1 NA's : 1 NA's : 1 NA's : 1
##
##
##
##
## Q16e Q16f Q16g Q17a
## Selected : 49 Selected : 36 Selected : 9 Min. :0.000
## Not selected:108 Not selected:121 Not selected:148 1st Qu.:4.000
## NA's : 1 NA's : 1 NA's : 1 Median :4.000
## Mean :3.662
## 3rd Qu.:4.000
## Max. :5.000
##
## Q17b Q17c Q17d Q17e Q18a
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :1.00
## 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:2.000 1st Qu.:4.000 1st Qu.:3.00
## Median :4.000 Median :4.000 Median :4.000 Median :4.000 Median :4.00
## Mean :3.694 Mean :3.758 Mean :3.223 Mean :3.809 Mean :3.35
## 3rd Qu.:4.000 3rd Qu.:4.750 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.:4.00
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.00
##
## Q18b Q18c Q19a Q19b
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000
## Median :4.000 Median :4.000 Median :3.000 Median :4.000
## Mean :3.732 Mean :3.713 Mean :3.217 Mean :3.548
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
##
## Q19c Q19d Q19e Q41
## Min. :1.000 Min. :1.000 Min. :1.000 Female :78
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:2.000 Male :79
## Median :4.000 Median :4.000 Median :3.000 I don't want to answer: 0
## Mean :3.331 Mean :3.618 Mean :3.051 NA's : 1
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000
##
## Q43a Q43b Q43c Q43d
## Selected : 32 Selected : 12 Selected : 5 Selected :81
## Not selected:125 Not selected:145 Not selected:152 Not selected:76
## NA's : 1 NA's : 1 NA's : 1 NA's : 1
##
##
##
##
## Q43e Q43f Q43g Q43h
## Selected : 57 Selected : 1 Selected : 29 Selected : 0
## Not selected:100 Not selected:156 Not selected:128 Not selected:157
## NA's : 1 NA's : 1 NA's : 1 NA's : 1
##
##
##
##
## Q44 Q45
## 1.000 – 5.000 habitants :44 OTP banka d.d. :71
## More than 100.000 habitants:36 Nova Ljubljanska Banka d.d. (NLB):35
## Less than 1.000 habitants :30 Delavska Hranilnica d.d. :12
## 20.001 – 50.000 habitants :14 Banka Intesa Sanpaolo d.d. :10
## 5.001 – 20.000 habitants :22 Deželna Banka Slovenije d.d. : 8
## 50.001 – 100.000 habitants :11 (Other) :18
## NA's : 1 NA's : 4
## Q46 Q47
## Študent/-ka :22 Pod 1.000€ :46
## Redno zaposlen/-a:73 1.000€ - 1.500€:39
## Upokojen/-a :43 2.001€ - 3.000€:29
## Samozaposlen/-a :13 1.501€ - 2.000€:25
## Delno zaposlen/-a: 3 3.001€ - 5.000€: 6
## Brezposeln/-a : 3 (Other) : 4
## NA's : 1 NA's : 9
## Q48
## Dokončana osnovna šola : 8
## Dokončana visokošolska strokovna univerzitetna izobrazba (tudi 2. bolonjska stopnja) :35
## Dokončana nižja ali srednja poklicna izobrazba :23
## Dokončana višješolska strokovna ali visokošolska strokovna izobrazba (tudi 1. bolonjska stopnja):34
## Dokončana specializacija, znanstveni magisterij, doktorat : 5
## Dokončana srednja strokovna ali splošna izobrazba :50
## NA's : 3
## Q49
## Preko linka :84
## Na tablici :37
## Na listu papirja:17
## NA's :20
##
##
##
data2 <- mydata_excel2 %>% select(Q41, Q42, Q43a, Q43b, Q43c, Q43d, Q43e, Q43f, Q43g, Q43h, Q44, Q45, Q46, Q47, Q48)
# Rename columns using colnames()
colnames(data2) <- c("Spol","Leto rojstva","Živim s starši","Živim s sorojenci","Živim s sorodniki","Živim s partnerko/jem", "Živim z otroki", "Živim s skrbnikom", "Živim sam/a", "Drugo", "Št. prebivalcev v kraju bivanja", "Primarna banka", "Trenutna zaposlitev", "Mesečni neto prihodek", "Stopnja izobrazbe")
# View first few rows
head(data2)
## # A tibble: 6 × 15
## Spol `Leto rojstva` `Živim s starši` `Živim s sorojenci` `Živim s sorodniki`
## <chr> <chr> <chr> <chr> <chr>
## 1 2 2006 1 0 0
## 2 2 1988 0 0 0
## 3 2 1963 0 0 0
## 4 -3 -3 -3 -3 -3
## 5 2 2004 1 0 0
## 6 2 2000 1 1 0
## # ℹ 10 more variables: `Živim s partnerko/jem` <chr>, `Živim z otroki` <chr>,
## # `Živim s skrbnikom` <chr>, `Živim sam/a` <chr>, Drugo <chr>,
## # `Št. prebivalcev v kraju bivanja` <chr>, `Primarna banka` <chr>,
## # `Trenutna zaposlitev` <chr>, `Mesečni neto prihodek` <chr>,
## # `Stopnja izobrazbe` <chr>
# Automatically calculate age and assign age groups
data2 <- data2 %>%
mutate(
`Leto rojstva` = as.numeric(as.character(`Leto rojstva`)), # Convert to numeric (handles factors & characters)
age = as.numeric(format(Sys.Date(), "%Y")) - `Leto rojstva`, # Calculate age
age_group = case_when(
age <= 40 ~ "Young",
age > 40 & age <= 60 ~ "Professional",
age > 60 ~ "Older"))
# View first few rows
head(data2)
## # A tibble: 6 × 17
## Spol `Leto rojstva` `Živim s starši` `Živim s sorojenci` `Živim s sorodniki`
## <chr> <dbl> <chr> <chr> <chr>
## 1 2 2006 1 0 0
## 2 2 1988 0 0 0
## 3 2 1963 0 0 0
## 4 -3 -3 -3 -3 -3
## 5 2 2004 1 0 0
## 6 2 2000 1 1 0
## # ℹ 12 more variables: `Živim s partnerko/jem` <chr>, `Živim z otroki` <chr>,
## # `Živim s skrbnikom` <chr>, `Živim sam/a` <chr>, Drugo <chr>,
## # `Št. prebivalcev v kraju bivanja` <chr>, `Primarna banka` <chr>,
## # `Trenutna zaposlitev` <chr>, `Mesečni neto prihodek` <chr>,
## # `Stopnja izobrazbe` <chr>, age <dbl>, age_group <chr>
data2clean <- data2 %>%
filter(complete.cases(.)) # Removes rows with any NAs
head(data2)
## # A tibble: 6 × 17
## Spol `Leto rojstva` `Živim s starši` `Živim s sorojenci` `Živim s sorodniki`
## <chr> <dbl> <chr> <chr> <chr>
## 1 2 2006 1 0 0
## 2 2 1988 0 0 0
## 3 2 1963 0 0 0
## 4 -3 -3 -3 -3 -3
## 5 2 2004 1 0 0
## 6 2 2000 1 1 0
## # ℹ 12 more variables: `Živim s partnerko/jem` <chr>, `Živim z otroki` <chr>,
## # `Živim s skrbnikom` <chr>, `Živim sam/a` <chr>, Drugo <chr>,
## # `Št. prebivalcev v kraju bivanja` <chr>, `Primarna banka` <chr>,
## # `Trenutna zaposlitev` <chr>, `Mesečni neto prihodek` <chr>,
## # `Stopnja izobrazbe` <chr>, age <dbl>, age_group <chr>
summary(data2)
## Spol Leto rojstva Živim s starši Živim s sorojenci
## Length:190 Min. : -3 Length:190 Length:190
## Class :character 1st Qu.:1951 Class :character Class :character
## Mode :character Median :1967 Mode :character Mode :character
## Mean :1662
## 3rd Qu.:1986
## Max. :2007
## Živim s sorodniki Živim s partnerko/jem Živim z otroki Živim s skrbnikom
## Length:190 Length:190 Length:190 Length:190
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Živim sam/a Drugo Št. prebivalcev v kraju bivanja
## Length:190 Length:190 Length:190
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## Primarna banka Trenutna zaposlitev Mesečni neto prihodek
## Length:190 Length:190 Length:190
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## Stopnja izobrazbe age age_group
## Length:190 Min. : 18.00 Length:190
## Class :character 1st Qu.: 39.25 Class :character
## Mode :character Median : 58.00 Mode :character
## Mean : 362.84
## 3rd Qu.: 73.75
## Max. :2028.00