# Replace the placeholder values that mark invalid entries with NA.
# Each entry maps a column name to the marker value used in that column.
invalid_markers <- list(
  age = "InvalidAge",
  occupation = "InvalidOccupation",
  capital_loss = -999,
  capital_gain = -999,
  race = "InvalidRace",
  education = "InvalidEducation"
)
for (marker_column in names(invalid_markers)) {
  # %in% never returns NA, so values that are already missing are left alone.
  is_invalid <-
    sample_dirty_dataset[[marker_column]] %in% invalid_markers[[marker_column]]
  # Assign in place instead of rebuilding the column with ifelse(): ifelse()
  # strips attributes, so the column keeps its original type this way.
  sample_dirty_dataset[[marker_column]][is_invalid] <- NA
}
# Inject "InvalidEducation" markers.
# The mask is computed before the random overwrite, so it reflects the
# education values present at this point.
inconsistencies_education <- is.element(
  sample_dirty_dataset$education,
  c("Bachelors", "11th")
)
# First overwrite 50 randomly drawn rows, then every row flagged by the mask.
sample_dirty_dataset$education <- replace(
  sample_dirty_dataset$education,
  sample(1:n, 50),
  "InvalidEducation"
)
sample_dirty_dataset$education <- replace(
  sample_dirty_dataset$education,
  inconsistencies_education,
  "InvalidEducation"
)

# Inject "InvalidRace" markers the same way: mask first, then 50 random rows,
# then the masked rows.
inconsistencies_race <- is.element(sample_dirty_dataset$race, c("Other"))
sample_dirty_dataset$race <- replace(
  sample_dirty_dataset$race,
  sample(1:n, 50),
  "InvalidRace"
)
sample_dirty_dataset$race <- replace(
  sample_dirty_dataset$race,
  inconsistencies_race,
  "InvalidRace"
)
# Missing values per variable (shown as percentages) before any changes.
sample_dirty_dataset %>%
  gg_miss_var(show_pct = TRUE)
# Where in the data set the missing values are located.
sample_dirty_dataset %>%
  vis_miss()
### Handling missing numeric data
# Impute missing data with mice, generating a single imputed data set
# (m = 1), and keep the completed data. The recorded log below lists the
# columns that were imputed.
mice_imputation <- mice(sample_dirty_dataset, m = 1)
sample_dirty_dataset <- complete(mice_imputation)
##
## iter imp variable
## 1 1 capital_gain capital_loss hours_per_week
## 2 1 capital_gain capital_loss hours_per_week
## 3 1 capital_gain capital_loss hours_per_week
## 4 1 capital_gain capital_loss hours_per_week
## 5 1 capital_gain capital_loss hours_per_week
## Warning: Number of logged events: 10
# Now there is no missing data in the imputed columns.
sample_dirty_dataset %>%
  vis_miss()
# Sanity check: for each imputed column, the mean after imputation is close
# to the mean of the true data (see the recorded outputs).
mean(sample_dirty_dataset[["capital_gain"]])
## [1] 50122.07
mean(capital_gain_true)
## [1] 49815.04
mean(sample_dirty_dataset[["capital_loss"]])
## [1] 48416.53
mean(capital_loss_true)
## [1] 48587.17
mean(sample_dirty_dataset[["hours_per_week"]])
## [1] 48.449
mean(hours_per_week_true)
## [1] 48.434
# Fill missing categorical values with a fixed default category per column.
# coalesce() keeps every existing value and substitutes the default only
# where the column is NA.
# The relationship default was previously misspelled "Other-realtive", which
# introduced a category that matches no existing level.
sample_dirty_dataset <- sample_dirty_dataset %>%
  mutate(
    workclass = coalesce(workclass, "Private"),
    relationship = coalesce(relationship, "Other-relative"),
    occupation = coalesce(occupation, "Other-service"),
    race = coalesce(race, "Other")
  )
# Now there is no missing data in these columns.
vis_miss(sample_dirty_dataset)
# Infer missing `sex` values from other columns. Only rows where `sex` is NA
# are touched; `missing = sex` keeps the current value when the condition
# itself evaluates to NA.
# The three rules are applied in order, each seeing the result of the
# previous one.
sample_dirty_dataset <- sample_dirty_dataset %>%
  mutate(
    # A wife is female and a husband is male (the original mapping had
    # these two swapped).
    sex = if_else(
      is.na(sex) & relationship == "Wife",
      "Female", sex,
      missing = sex
    ),
    sex = if_else(
      is.na(sex) & relationship == "Husband",
      "Male", sex,
      missing = sex
    ),
    # NOTE(review): treating every widowed person as female is a heuristic,
    # not a deduction -- confirm this is intended.
    sex = if_else(
      is.na(sex) & marital_status == "Widowed",
      "Female", sex,
      missing = sex
    )
  )
# Now there is less missing data in the column.
vis_miss(sample_dirty_dataset)
# Drop every row that still contains a missing value; these values
# cannot be imputed.
sample_dirty_dataset <- sample_dirty_dataset %>%
  na.omit()
# Missing values per variable after the removal.
sample_dirty_dataset %>%
  gg_miss_var(show_pct = TRUE)