# Size of the raw questionnaire export, in bytes
print(file.size("udaje/Questionary.csv"))
## [1] 19660
# NOTE(review): the former rm(list = ls()) was removed. It silently deletes
# whatever the caller has in the workspace, yet does not give a clean session
# (attached packages, options and hidden objects all survive it).
# Import the data; read.csv2() already uses sep = ";" and dec = "," by default
udaje <- read.csv2("udaje/Questionary.csv")
Funkcia clean_names() z balíka janitor dokáže automaticky previesť názvy stĺpcov na malé písmená, odstrániť medzery a nealfanumerické znaky a zabezpečiť, aby bol každý názov stĺpca jedinečný.
library(janitor)
library(stringr)
# Drop rows that are entirely empty; udaje.tmp additionally has the entirely
# empty columns removed, while udaje itself keeps all of its columns.
udaje <- remove_empty(udaje, which = "rows")
udaje.tmp <- remove_empty(udaje, which = "cols")
# Truncate every column name to its first 10 characters and let make.unique()
# append numeric suffixes (.1, .2, ...) to names that collide after truncation.
short_names <- make.unique(str_sub(names(udaje), start = 1, end = 10))
names(udaje) <- short_names
# Preview the first rows under the shortened names
head(udaje)
## My_culture Other_cult I_m_not_in Most_peopl People_fro I_have_lit
## 1 2 NA 1 1 3 1
## 2 3 3 1 3 1 3
## 3 2 1 2 2 3 1
## 4 2 2 1 4 4 1
## 5 3 3 1 1 2 1
## 6 2 2 2 4 2 4
## Most_peopl.1 People_in_ Generally_ I_like_to_ I_am_calm_ I_have_no_
## 1 1 2 3 3 3 3
## 2 4 4 5 4 4 4
## 3 2 3 4 4 4 4
## 4 2 2 4 4 4 3
## 5 3 3 3 4 4 4
## 6 5 2 4 4 3 4
## While_conv I_face_the I_enjoy_in In_a_sense Although_a When_I_see The_fact_t
## 1 2 2 3 3 3 3 3
## 2 4 5 5 4 4 5 5
## 3 4 4 4 4 4 4 4
## 4 3 3 4 5 5 5 5
## 5 4 4 4 3 4 4 4
## 6 3 3 4 4 4 4 3
## I_would_su I_believe_ People_sho Interactin I_doubt_th I_am_afrai My_fellow_
## 1 2 1 3 1 2 1 1
## 2 5 4 4 3 3 2 1
## 3 3 3 3 2 1 2 2
## 4 3 2 4 3 3 2 2
## 5 2 2 4 2 3 2 2
## 6 4 1 4 2 4 2 2
## There_is_a My_fellow_.1 I_keep_my_ Gender Age Marital.St
## 1 1 1 1 0 30 - 40 Married
## 2 3 3 2 1 30 - 40 Not Married
## 3 2 2 2 0 More than 40 Married
## 4 3 2 2 0 30 - 40 Married
## 5 2 4 3 0 30 - 40 Unknown
## 6 2 3 2 1 30 - 40 Married
## Nationalit Education Employment Position.l
## 1 Indian Master's Full time Professional/Technical
## 2 Srilankan Less than Bachelor's Degree Full time Middle/Lower Management
## 3 Indian Bachelor's Degree Full time Operational Level Employee
## 4 Indian Bachelor's Degree Full time Operational Level Employee
## 5 Filipino Bachelor's Degree Full time Middle/Lower Management
## 6 Sri lankan Bachelor's Degree Full time Professional/Technical
## Country.of Length.of.
## 1 United Arab Emirates (UAE) More than 4 Years
## 2 United Arab Emirates (UAE) More than 4 Years
## 3 United Arab Emirates (UAE) More than 4 Years
## 4 United Arab Emirates (UAE) More than 4 Years
## 5 United Arab Emirates (UAE) More than 4 Years
## 6 United Arab Emirates (UAE) More than 4 Years
## Category.o
## 1 Self Initiated Expatriate (found the job by yourself before or after arriving at the country)
## 2 Self Initiated Expatriate (found the job by yourself before or after arriving at the country)
## 3 Self Initiated Expatriate (found the job by yourself before or after arriving at the country)
## 4 Self Initiated Expatriate (found the job by yourself before or after arriving at the country)
## 5 Self Initiated Expatriate (found the job by yourself before or after arriving at the country)
## 6 Self Initiated Expatriate (found the job by yourself before or after arriving at the country)
# List the column names of the data frame after shortening
names(udaje)
## [1] "My_culture" "Other_cult" "I_m_not_in" "Most_peopl" "People_fro"
## [6] "I_have_lit" "Most_peopl.1" "People_in_" "Generally_" "I_like_to_"
## [11] "I_am_calm_" "I_have_no_" "While_conv" "I_face_the" "I_enjoy_in"
## [16] "In_a_sense" "Although_a" "When_I_see" "The_fact_t" "I_would_su"
## [21] "I_believe_" "People_sho" "Interactin" "I_doubt_th" "I_am_afrai"
## [26] "My_fellow_" "There_is_a" "My_fellow_.1" "I_keep_my_" "Gender"
## [31] "Age" "Marital.St" "Nationalit" "Education" "Employment"
## [36] "Position.l" "Country.of" "Length.of." "Category.o"
library(Amelia)
# Drop rows that are completely empty, and keep a copy (udaje.tmp) from which
# completely empty columns are removed as well. The argument is spelled out as
# `which` instead of relying on partial matching of `whic`, and plain `<-`
# replaces `<<-`, which has no purpose at the top level of a script.
udaje <- remove_empty(udaje, which = "rows")
udaje.tmp <- remove_empty(udaje, which = "cols")
# NOTE(review): attach() is kept only because code outside this section may
# refer to the columns by bare name. It places a snapshot of the columns on
# the search path, so later modifications of `udaje` are not visible through
# the attached names. Prefer udaje$col or with(udaje, ...) and drop this call.
attach(udaje)
# Scatterplot matrix showing the pairwise relations among the first ten columns
pairs(udaje[, 1:10])
## Missing data and basic imputations
library(Amelia)
# Draw the missingness map of the whole data frame, with a legend.
# NOTE(review): per the Amelia documentation the first colour should mark
# missing cells and the second observed cells - confirm against the plot.
missmap(
  obj = udaje,
  legend = TRUE,
  col = c("yellow", "black")
)
# Identify the columns to treat as numeric and mean-impute their gaps.
# A column qualifies when it is already numeric, or when it is a factor /
# character column whose non-missing entries are all single digits 1-5.
# vapply() is used instead of sapply() so the result is always a logical
# vector, even for a data frame with zero columns.
numeric_cols <- vapply(udaje, function(x) {
  if (is.numeric(x)) {
    return(TRUE)
  }
  if (!is.factor(x) && !is.character(x)) {
    return(FALSE)
  }
  all(grepl("^[1-5]$", x[!is.na(x)]))
}, logical(1))
# Keep only the names of the qualifying columns
numeric_cols <- names(udaje)[numeric_cols]
# Mean imputation of missing values, restricted to the columns selected above.
# NOTE(review): every numeric column is imputed, including coded categorical
# ones (e.g. a 0/1 indicator), and the imputed values are generally not whole
# numbers - confirm this is intended.
for (col in numeric_cols) {
  raw_values <- udaje[[col]]
  # as.numeric() applied directly to a factor returns the internal level
  # codes, not the labelled values, so factors are converted via as.character()
  col_values <- if (is.factor(raw_values)) {
    as.numeric(as.character(raw_values))
  } else {
    as.numeric(raw_values)
  }
  is_missing <- is.na(col_values)
  # A column without any observed value has no mean (mean() would give NaN),
  # so it is left as NA instead of being filled with NaN
  if (any(is_missing) && !all(is_missing)) {
    col_values[is_missing] <- mean(col_values, na.rm = TRUE)
  }
  udaje[[col]] <- col_values
}
# Interactive table of the data: 10 rows per page, horizontal scrolling, no
# row names, and the first column meant to stay fixed while scrolling.
# The fixedColumns option only takes effect when the FixedColumns extension is
# enabled, so it is switched on explicitly. The call is namespace-qualified
# because no library(DT) is visible in this part of the script.
DT::datatable(
  udaje,
  extensions = "FixedColumns",
  options = list(
    pageLength = 10,
    scrollX = TRUE,
    fixedColumns = list(leftColumns = 1)
  ),
  rownames = FALSE,
  class = "stripe hover compact"
)