Load all the necessary libraries.
library(mice)
library(VIM)
Load the dataset.
# Read the comma-separated census file into the `salary` data frame.
#
# header = FALSE: the column names are supplied here rather than taken from
# the file, so the first line must be read as data. With read.csv()'s default
# header = TRUE the first record would be consumed as a header and dropped.
# NOTE(review): this assumes adult.txt has no header line - confirm against
# the raw file.
#
# na.strings: "?" (with or without surrounding blanks) marks a missing value.
# stringsAsFactors = TRUE: text columns are loaded as factors.
salary <- read.csv(
  "adult.txt",
  header = FALSE,
  sep = ",",
  na.strings = c(" ? ", " ?", "?"),
  stringsAsFactors = TRUE,
  col.names = c(
    "age", "workclass", "fnlwgt", "education", "education_num", "marital",
    "occupation", "relationship",
    "race", "sex", "capital_gain", "capital_loss", "hours_per_week",
    "nat_country", "income"
  )
)
Percentages of missing values in each column:
# Percentage (0-100) of the elements of `x` that are NA.
#   x: a vector (e.g. one column of a data frame).
# Returns a single numeric value; NaN when `x` has length zero (0 / 0).
pMiss <- function(x) {
  n_missing <- sum(is.na(x))
  n_missing / length(x) * 100
}
# Percentage of missing values per column, as a named numeric vector.
# vapply() walks the data frame's columns directly; apply() would first
# coerce the whole data frame to a character matrix.
vapply(salary, pMiss, numeric(1))
## age workclass fnlwgt education education_num
## 0.000000 5.638821 0.000000 0.000000 0.000000
## marital occupation relationship race sex
## 0.000000 5.660319 0.000000 0.000000 0.000000
## capital_gain capital_loss hours_per_week nat_country income
## 0.000000 0.000000 0.000000 1.790541 0.000000
Total number of missing values in each column:
# Number of NA elements in `x`.
#   x: a vector (e.g. one column of a data frame).
# Returns a single integer count.
Miss <- function(x) {
  is_missing <- is.na(x)
  sum(is_missing)
}
# Count of missing values per column, as a named integer vector.
# vapply() walks the data frame's columns directly; apply() would first
# coerce the whole data frame to a character matrix.
vapply(salary, Miss, integer(1))
## age workclass fnlwgt education education_num
## 0 1836 0 0 0
## marital occupation relationship race sex
## 0 1843 0 0 0
## capital_gain capital_loss hours_per_week nat_country income
## 0 0 0 583 0
Percentage of incomplete cases (rows with at least one missing value):
# Percentage of rows that contain at least one NA.
# complete.cases() is base R, so this does not depend on mice::icn().
# NOTE(review): icn() appears to have been superseded by nic() in later mice
# releases - confirm against the installed version if icn() is preferred.
sum(!complete.cases(salary)) / nrow(salary) * 100
## [1] 7.367936
Patterns of missing values across all the rows.
# Tabulate the distinct missing-data patterns in `salary` (mice).
# In the printed table each row is one pattern: the left margin is the number
# of rows showing it, 1 marks an observed column and 0 a missing one, and the
# last column counts the missing variables in that pattern. The bottom row
# gives the number of missing values per column.
md.pattern(x = salary)
## age fnlwgt education education_num marital relationship race sex
## 30161 1 1 1 1 1 1 1 1
## 7 1 1 1 1 1 1 1 1
## 556 1 1 1 1 1 1 1 1
## 1809 1 1 1 1 1 1 1 1
## 27 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0
## capital_gain capital_loss hours_per_week income nat_country
## 30161 1 1 1 1 1
## 7 1 1 1 1 1
## 556 1 1 1 1 0
## 1809 1 1 1 1 1
## 27 1 1 1 1 0
## 0 0 0 0 583
## workclass occupation
## 30161 1 1 0
## 7 1 0 1
## 556 1 1 1
## 1809 0 0 2
## 27 0 0 3
## 1836 1843 4262
Patterns of missing values across all the columns:
# Summarise missingness per variable and per combination of variables (VIM).
# sortVars = TRUE orders the variables by their share of missing values and
# prints the "Variables sorted by number of missings" table.
# The result is kept in `aggr_plot`.
aggr_plot <- aggr(
  salary,
  col = c("orange", "red"),
  bars = FALSE,
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(salary),
  cex.axis = 0.7,
  gap = 3,
  ylab = c("Histogram of missing data", "Pattern")
)
##
## Variables sorted by number of missings:
## Variable Count
## occupation 0.05660319
## workclass 0.05638821
## nat_country 0.01790541
## age 0.00000000
## fnlwgt 0.00000000
## education 0.00000000
## education_num 0.00000000
## marital 0.00000000
## relationship 0.00000000
## race 0.00000000
## sex 0.00000000
## capital_gain 0.00000000
## capital_loss 0.00000000
## hours_per_week 0.00000000
## income 0.00000000