DataCleaning

Úvodná informácia o súbore

# Size of the input file in bytes
print(file.size("udaje/Questionary.csv"))

## [1] 19660

# Import the questionnaire data. read.csv2() already defaults to sep = ";"
# (and dec = ","), so the separator does not need to be passed explicitly.
# The workspace is deliberately not wiped with rm(list = ls()): that call
# deletes the user's objects but does not reset loaded packages or options,
# so it does not give a clean state. Run the script in a fresh R session
# instead.
udaje <- read.csv2("udaje/Questionary.csv")

Názvy premenných

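Funkcia clean_names() z balíka janitor dokáže automaticky previesť názvy stĺpcov na malé písmená, odstrániť medzery a nealfanumerické znaky a zabezpečiť, aby bol každý názov stĺpca jedinečný. Poznámka: nasledujúci kód funkciu clean_names() nevolá – pomocou remove_empty() iba odstraňuje úplne prázdne riadky a stĺpce.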

library(janitor)
# Drop the rows that consist entirely of missing values.
udaje <- remove_empty(udaje, which = "rows")
# Keep a separate copy with the entirely-missing columns dropped as well;
# udaje itself retains all of its columns.
udaje.tmp <- remove_empty(udaje, which = "cols")

Skrátenie názvov premenných

library(stringr)

# Truncate every column name to its first 10 characters. Truncation can
# produce duplicates, so make.unique() appends ".1", ".2", ... to the
# repeated names.
short_names <- make.unique(str_sub(names(udaje), start = 1, end = 10))
names(udaje) <- short_names

# Preview the first rows with the shortened names.
head(udaje)

##   My_culture Other_cult I_m_not_in Most_peopl People_fro I_have_lit
## 1          2         NA          1          1          3          1
## 2          3          3          1          3          1          3
## 3          2          1          2          2          3          1
## 4          2          2          1          4          4          1
## 5          3          3          1          1          2          1
## 6          2          2          2          4          2          4
##   Most_peopl.1 People_in_ Generally_ I_like_to_ I_am_calm_ I_have_no_
## 1            1          2          3          3          3          3
## 2            4          4          5          4          4          4
## 3            2          3          4          4          4          4
## 4            2          2          4          4          4          3
## 5            3          3          3          4          4          4
## 6            5          2          4          4          3          4
##   While_conv I_face_the I_enjoy_in In_a_sense Although_a When_I_see The_fact_t
## 1          2          2          3          3          3          3          3
## 2          4          5          5          4          4          5          5
## 3          4          4          4          4          4          4          4
## 4          3          3          4          5          5          5          5
## 5          4          4          4          3          4          4          4
## 6          3          3          4          4          4          4          3
##   I_would_su I_believe_ People_sho Interactin I_doubt_th I_am_afrai My_fellow_
## 1          2          1          3          1          2          1          1
## 2          5          4          4          3          3          2          1
## 3          3          3          3          2          1          2          2
## 4          3          2          4          3          3          2          2
## 5          2          2          4          2          3          2          2
## 6          4          1          4          2          4          2          2
##   There_is_a My_fellow_.1 I_keep_my_ Gender          Age  Marital.St
## 1          1            1          1      0      30 - 40     Married
## 2          3            3          2      1      30 - 40 Not Married
## 3          2            2          2      0 More than 40     Married
## 4          3            2          2      0      30 - 40     Married
## 5          2            4          3      0      30 - 40     Unknown
## 6          2            3          2      1      30 - 40     Married
##    Nationalit                   Education Employment                 Position.l
## 1      Indian                    Master's  Full time     Professional/Technical
## 2  Srilankan  Less than Bachelor's Degree  Full time    Middle/Lower Management
## 3      Indian           Bachelor's Degree  Full time Operational Level Employee
## 4      Indian           Bachelor's Degree  Full time Operational Level Employee
## 5   Filipino            Bachelor's Degree  Full time    Middle/Lower Management
## 6  Sri lankan           Bachelor's Degree  Full time     Professional/Technical
##                   Country.of        Length.of.
## 1 United Arab Emirates (UAE) More than 4 Years
## 2 United Arab Emirates (UAE) More than 4 Years
## 3 United Arab Emirates (UAE) More than 4 Years
## 4 United Arab Emirates (UAE) More than 4 Years
## 5 United Arab Emirates (UAE) More than 4 Years
## 6 United Arab Emirates (UAE) More than 4 Years
##                                                                                      Category.o
## 1 Self Initiated Expatriate (found the job by yourself before or after arriving at the country)
## 2 Self Initiated Expatriate (found the job by yourself before or after arriving at the country)
## 3 Self Initiated Expatriate (found the job by yourself before or after arriving at the country)
## 4 Self Initiated Expatriate (found the job by yourself before or after arriving at the country)
## 5 Self Initiated Expatriate (found the job by yourself before or after arriving at the country)
## 6 Self Initiated Expatriate (found the job by yourself before or after arriving at the country)

# List the variable names after shortening
names(udaje)

##  [1] "My_culture"   "Other_cult"   "I_m_not_in"   "Most_peopl"   "People_fro"  
##  [6] "I_have_lit"   "Most_peopl.1" "People_in_"   "Generally_"   "I_like_to_"  
## [11] "I_am_calm_"   "I_have_no_"   "While_conv"   "I_face_the"   "I_enjoy_in"  
## [16] "In_a_sense"   "Although_a"   "When_I_see"   "The_fact_t"   "I_would_su"  
## [21] "I_believe_"   "People_sho"   "Interactin"   "I_doubt_th"   "I_am_afrai"  
## [26] "My_fellow_"   "There_is_a"   "My_fellow_.1" "I_keep_my_"   "Gender"      
## [31] "Age"          "Marital.St"   "Nationalit"   "Education"    "Employment"  
## [36] "Position.l"   "Country.of"   "Length.of."   "Category.o"

Čistenie obsahu databázy

Chýbajúce údaje a základné imputácie

library(Amelia)

# Load your data
#data <- read.csv("your_file.csv")

Čistenie prázdnych riadkov a stĺpcov

# Remove rows that are entirely empty from udaje, and keep a copy
# (udaje.tmp) that also has the entirely empty columns removed.
# The argument is spelled out as `which` (no partial matching), and plain
# `<-` is used because this is a top-level assignment.
udaje <- udaje %>% remove_empty(which = "rows")
udaje.tmp <- udaje %>% remove_empty(which = "cols")

# NOTE(review): attach() puts a *copy* of udaje on the search path. Changes
# made to udaje afterwards (e.g. imputation) are not reflected in the
# attached copy. Nothing in the visible code relies on it; it is kept only
# in case later code refers to bare column names. Prefer udaje$col or
# with(udaje, ...).
attach(udaje)

# Scatterplot matrix of the first ten variables: a graphical overview of
# the pairwise relations among them.
pairs(udaje[, 1:10])

## Missing data and basic imputations

library(Amelia)

# Load your data
#data <- read.csv("your_file.csv")

# Draw the missingness map of the data set: the first colour (yellow) marks
# missing cells, the second (black) marks observed cells.
missmap(
  udaje,
  col = c("yellow", "black"),
  legend = TRUE
)

# Identify the numeric-like columns and convert them.
# A column qualifies when it is already numeric, or when it is a factor or
# character column whose non-missing values are all single digits 1-5.
is_numeric_like <- function(x) {
  if (is.numeric(x)) {
    return(TRUE)
  }
  (is.factor(x) || is.character(x)) &&
    all(grepl("^[1-5]$", x[!is.na(x)]))
}

# Keep only the names of the qualifying columns. vapply() (rather than
# sapply()) guarantees a logical vector even when udaje has no columns.
numeric_cols <- names(udaje)[vapply(udaje, is_numeric_like, logical(1))]

# Mean imputation of the missing values (numeric-like columns only).
for (col in numeric_cols) {
  values <- udaje[[col]]
  if (is.factor(values)) {
    # as.numeric() on a factor returns the internal level codes, not the
    # values written in the labels, so convert through character first.
    values <- as.character(values)
  }
  values <- as.numeric(values)
  is_missing <- is.na(values)
  # Impute only when something is missing and something is observed; the
  # mean of an all-NA column is NaN and would be written into every cell.
  if (any(is_missing) && !all(is_missing)) {
    values[is_missing] <- mean(values, na.rm = TRUE)
  }
  udaje[[col]] <- values
}

# Interactive table of the cleaned data: 10 rows per page, horizontal
# scrolling, no row names. The fixedColumns option only takes effect when
# the FixedColumns extension is enabled, so it is requested explicitly;
# without it the first column would not stay pinned while scrolling.
datatable(
  udaje,
  extensions = "FixedColumns",
  options = list(
    pageLength = 10,
    scrollX = TRUE,
    fixedColumns = list(leftColumns = 1)
  ),
  rownames = FALSE,
  class = "stripe hover compact"
)