Sample data
# Set seed for reproducibility
set.seed(44100)
# Generate sample data
n <- 1000 # Number of observations
# Age between 18 and 90
age <- sample(18:90, n, replace = TRUE)
# Introduce errors in age column
age[sample(1:n, 50)] <- "InvalidAge"
# Workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked
workclass <- sample(c("Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov", "Local-gov", "State-gov", "Without-pay", "Never-worked"), n, replace = TRUE)
# Introduce missing values in workclass column
workclass[sample(1:n, 50)] <- NA
# Education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool
education <- sample(c("Bachelors", "Some-college", "11th", "HS-grad", "Prof-school", "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters", "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"), n, replace = TRUE)
# Introduce inconsistencies in education column
education[sample(1:n, 50)] <- "InvalidEducation"
# Education Number: 1-16
education_num <- sample(1:16, n, replace = TRUE)
# Marital Status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse
marital_status <- sample(c("Married-civ-spouse", "Divorced", "Never-married", "Separated", "Widowed", "Married-spouse-absent", "Married-AF-spouse"), n, replace = TRUE)
# Introduce missing values in marital_status column
marital_status[sample(1:n, 50)] <- NA
# Occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces
occupation <- sample(c("Tech-support", "Craft-repair", "Other-service", "Sales", "Exec-managerial", "Prof-specialty", "Handlers-cleaners", "Machine-op-inspct", "Adm-clerical", "Farming-fishing", "Transport-moving", "Priv-house-serv", "Protective-serv", "Armed-Forces"), n, replace = TRUE)
# Introduce errors in occupation column
occupation[sample(1:n, 50)] <- "InvalidOccupation"
# Relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried
relationship <- sample(c("Wife", "Own-child", "Husband", "Not-in-family", "Other-relative", "Unmarried"), n, replace = TRUE)
# Introduce missing values in relationship column
relationship[sample(1:n, 50)] <- NA
# Race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black
race <- sample(c("White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"), n, replace = TRUE)
# Introduce inconsistencies in race column
race[sample(1:n, 50)] <- "InvalidRace"
# Sex: Female, Male
sex <- sample(c("Female", "Male"), n, replace = TRUE)
# Introduce missing values in sex column
sex[sample(1:n, 50)] <- NA
# Capital Gain: 0-99999
capital_gain <- sample(0:99999, n, replace = TRUE)
# Introduce errors in capital_gain column
capital_gain[sample(1:n, 50)] <- -999
# Capital Loss: 0-99999
capital_loss <- sample(0:99999, n, replace = TRUE)
# Introduce errors in capital_loss column
capital_loss[sample(1:n, 50)] <- -999
# Hours per week: 1-99
hours_per_week <- sample(1:99, n, replace = TRUE)
# Introduce missing values in hours_per_week column
hours_per_week[sample(1:n, 50)] <- NA
# Native country: United-States, Cambodia, England, ...
native_country <- sample(c("United-States", "Cambodia", "England", "Puerto-Rico", "Canada", "Germany", "Outlying-US(Guam-USVI-etc)", "India", "Japan", "Greece", "South", "China", "Cuba", "Iran", "Honduras", "Philippines", "Italy", "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal", "Ireland", "France", "Dominican-Republic", "Laos", "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary", "Guatemala", "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador", "Trinadad&Tobago", "Peru", "Hong", "Holand-Netherlands"), n, replace = TRUE)
# Introduce missing values in native_country column
native_country[sample(1:n, 50)] <- NA
# Income: <=50K, >50K
income <- sample(c("<=50K", ">50K"), n, replace = TRUE, prob = c(0.75, 0.25))
# Create dataframe
sample_dirty_dataset <- data.frame(age, workclass, education, education_num, marital_status, occupation, relationship, race, sex, capital_gain, capital_loss, hours_per_week, native_country, income)
# Print first few rows of the dataset
head(sample_dirty_dataset)
## age workclass education education_num marital_status occupation
## 1 46 Never-worked Masters 11 Divorced Adm-clerical
## 2 54 Without-pay 11th 10 Widowed Armed-Forces
## 3 53 Private Bachelors 16 Separated Machine-op-inspct
## 4 70 Private 9th 14 Separated Farming-fishing
## 5 78 Local-gov 7th-8th 14 Married-AF-spouse Handlers-cleaners
## 6 71 Self-emp-inc Assoc-voc 14 Married-AF-spouse InvalidOccupation
## relationship race sex capital_gain capital_loss
## 1 Unmarried Asian-Pac-Islander Male 81712 69387
## 2 Own-child Other Female 25014 23069
## 3 Not-in-family Other Male 65305 57731
## 4 Not-in-family InvalidRace Male 7401 41746
## 5 Own-child Amer-Indian-Eskimo Female 45980 99713
## 6 Other-relative Asian-Pac-Islander Male -999 81896
## hours_per_week native_country income
## 1 49 Poland >50K
## 2 13 <NA> >50K
## 3 8 United-States <=50K
## 4 26 Trinadad&Tobago <=50K
## 5 71 Nicaragua >50K
## 6 16 Guatemala <=50K
Visualizing missing data, firstly generating diagnose and EDA report, also replacing invalid results in some columns with the “NA”, to make them easier to work with:
library(naniar)
library(dlookr)
diagnose_web_report(sample_dirty_dataset, browse = FALSE,title = "Diagnostical report", author = "Evgenei Khodar",create_date = Sys.time())
##
|
| | 0%
|
|. | 1%
|
|. | 3% [setup]
|
|.. | 4%
|
|... | 6% [load_packages]
|
|... | 7%
|
|.... | 9% [get-parameters]
|
|..... | 10%
|
|..... | 12% [unnamed-chunk-3]
|
|...... | 13%
|
|....... | 15% [diagose]
|
|....... | 16%
|
|........ | 18% [create-overview]
|
|......... | 19%
|
|......... | 21% [overview]
|
|.......... | 22%
|
|........... | 24% [overview-datastructure]
|
|........... | 25%
|
|............ | 27% [overview-pre]
|
|............. | 28%
|
|............. | 30% [overview-warnings]
|
|.............. | 31%
|
|............... | 33% [warnings_summary]
|
|............... | 34%
|
|................ | 36% [warnings]
|
|................. | 37%
|
|................. | 39% [overview-variables]
|
|.................. | 40%
|
|................... | 42% [variables]
|
|................... | 43%
|
|.................... | 45% [missing]
|
|..................... | 46%
|
|..................... | 48% [missing-list]
|
|...................... | 49%
|
|....................... | 51% [missing-data]
|
|........................ | 52%
|
|........................ | 54% [missing-visualization]
|
|......................... | 55%
|
|.......................... | 57% [missing-viz2]
|
|.......................... | 58%
|
|........................... | 60% [unique]
|
|............................ | 61%
|
|............................ | 63% [unique-categorical]
|
|............................. | 64%
|
|.............................. | 66% [unique-date-category]
|
|.............................. | 67%
|
|............................... | 69% [unique-numerical]
|
|................................ | 70%
|
|................................ | 72% [unique-data-numeric]
|
|................................. | 73%
|
|.................................. | 75% [outliers]
|
|.................................. | 76%
|
|................................... | 78% [outliers-list]
|
|.................................... | 79%
|
|.................................... | 81% [samples]
|
|..................................... | 82%
|
|...................................... | 84% [duplicated]
|
|...................................... | 85%
|
|....................................... | 87% [duplicated-list]
|
|........................................ | 88%
|
|........................................ | 90% [heades]
|
|......................................... | 91%
|
|.......................................... | 93% [sample-head]
|
|.......................................... | 94%
|
|........................................... | 96% [tails]
|
|............................................ | 97%
|
|............................................ | 99% [sample-tail]
|
|.............................................| 100%
## "C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/pandoc" +RTS -K512m -RTS diagnosis_temp.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc53105da07df.html --lua-filter "C:\Users\evgro\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\pagebreak.lua" --lua-filter "C:\Users\evgro\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\latex-div.lua" --embed-resources --standalone --variable bs3=TRUE --section-divs --template "C:\Users\evgro\AppData\Local\R\win-library\4.3\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable theme=bootstrap --css "C:/Users/evgro/AppData/Local/R/win-library/4.3/dlookr/resources/dlookr-bootstrap.css" --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\evgro\AppData\Local\Temp\RtmpKOBefl\rmarkdown-str53104b8c7073.html" --variable code_folding=show --variable code_menu=1 --include-in-header header_temp.html --include-after-body "C:\Users\evgro\AppData\Local\R\win-library\4.3\dlookr\resources\footer.html"
eda_web_report(sample_dirty_dataset, browse = FALSE,title = "EDA report",author = "Evgenei Khodar",create_date = Sys.time())
##
|
| | 0%
|
|. | 2%
|
|.. | 3% [setup]
|
|.. | 5%
|
|... | 6% [load_packages]
|
|.... | 8%
|
|..... | 10% [unnamed-chunk-1]
|
|...... | 11%
|
|...... | 13% [udf]
|
|....... | 14%
|
|........ | 16% [check_variables]
|
|......... | 17%
|
|.......... | 19% [create-overview]
|
|.......... | 21%
|
|........... | 22% [overview]
|
|............ | 24%
|
|............. | 25% [overview-pre]
|
|............. | 27%
|
|.............. | 29% [unnamed-chunk-2]
|
|............... | 30%
|
|................ | 32% [unnamed-chunk-3]
|
|................. | 33%
|
|................. | 35% [variables]
|
|.................. | 37%
|
|................... | 38% [normality]
|
|.................... | 40%
|
|..................... | 41% [normality-list]
|
|..................... | 43%
|
|...................... | 44% [unnamed-chunk-4]
|
|....................... | 46%
|
|........................ | 48% [unnamed-chunk-5]
|
|......................... | 49%
|
|......................... | 51% [compare_numerical]
|
|.......................... | 52%
|
|........................... | 54% [unnamed-chunk-6]
|
|............................ | 56%
|
|............................. | 57% [compare-category]
|
|............................. | 59%
|
|.............................. | 60% [unnamed-chunk-7]
|
|............................... | 62%
|
|................................ | 63% [unnamed-chunk-8]
|
|................................. | 65%
|
|................................. | 67% [unnamed-chunk-9]
|
|.................................. | 68%
|
|................................... | 70% [correlation]
|
|.................................... | 71%
|
|..................................... | 73% [unnamed-chunk-10]
|
|..................................... | 75%
|
|...................................... | 76% [plot-correlation]
|
|....................................... | 78%
|
|........................................ | 79% [unnamed-chunk-11]
|
|........................................ | 81%
|
|......................................... | 83% [unnamed-chunk-12]
|
|.......................................... | 84%
|
|........................................... | 86% [group-numerical]
|
|............................................ | 87%
|
|............................................ | 89% [unnamed-chunk-13]
|
|............................................. | 90%
|
|.............................................. | 92% [group-categorical]
|
|............................................... | 94%
|
|................................................ | 95% [unnamed-chunk-14]
|
|................................................ | 97%
|
|................................................. | 98% [group-correlation]
|
|..................................................| 100%
## "C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/pandoc" +RTS -K512m -RTS eda_temp.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc53101f434d4.html --lua-filter "C:\Users\evgro\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\pagebreak.lua" --lua-filter "C:\Users\evgro\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\latex-div.lua" --embed-resources --standalone --variable bs3=TRUE --section-divs --template "C:\Users\evgro\AppData\Local\R\win-library\4.3\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable theme=bootstrap --css "C:/Users/evgro/AppData/Local/R/win-library/4.3/dlookr/resources/dlookr-bootstrap.css" --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\evgro\AppData\Local\Temp\RtmpKOBefl\rmarkdown-str53105b6b719e.html" --variable code_folding=show --variable code_menu=1 --include-in-header header_temp.html --include-after-body "C:\Users\evgro\AppData\Local\R\win-library\4.3\dlookr\resources\footer.html"
sample_dirty_dataset$age[sample_dirty_dataset$age=="InvalidAge"] <- NA
sample_dirty_dataset$education[sample_dirty_dataset$education=="InvalidEducation"] <- NA
sample_dirty_dataset$race[sample_dirty_dataset$race=="InvalidRace"] <- NA
sample_dirty_dataset$occupation[sample_dirty_dataset$occupation=="InvalidOccupation"] <- NA
Then visualizing missing cases table, and statistics of complete and incomplete observations:
library(naniar)
library(dlookr)
vis_miss(sample_dirty_dataset, cluster = TRUE)
sample_dirty_dataset%>%
miss_case_table()
## # A tibble: 4 × 3
## n_miss_in_case n_cases pct_cases
## <int> <int> <dbl>
## 1 0 589 58.9
## 2 1 335 33.5
## 3 2 63 6.3
## 4 3 13 1.3
And visualization of numerical variables to find outliers:
library(rrcov)
library(naniar)
library(dlookr)
plot(sample_dirty_dataset$capital_gain, pch = ifelse(sample_dirty_dataset$capital_gain < 0, 19, 1), col = ifelse(sample_dirty_dataset$capital_gain < 0, "red", "blue"),
xlab = "Capital gain", ylab = "Values", main = "Graphic of capital_gain variable values")
plot(sample_dirty_dataset$capital_loss, pch = ifelse(sample_dirty_dataset$capital_loss < 0, 19, 1), col = ifelse(sample_dirty_dataset$capital_loss < 0, "red", "blue"),
xlab = "Capital loss", ylab = "Values", main = "Graphic of capital_loss variable values")
library(rrcov)
par(mfrow=c(2,2))
plot(covMcd(sample_dirty_dataset$capital_gain))
Although only 3.6% of values are missing, if we delete all cases, that have “NA”, it would remove more than 40% of all observations.Also, there are not many cases, where many variables are missing at the same time. Dealing with missing values of qualitative columns, the most logical way would be to remove such observations, where inputting seems not a good option. Such candidates are workclass, marital_status, relationship, sex, native_country.
library(naniar)
library(DescTools)
repaired_dataset <- sample_dirty_dataset[complete.cases(sample_dirty_dataset[c("workclass", "marital_status", "relationship", "sex", "native_country")]), ]
median_value <- median(repaired_dataset[["hours_per_week"]], na.rm = TRUE)
repaired_dataset[["hours_per_week"]][is.na(repaired_dataset[["hours_per_week"]])] <- median_value
repaired_dataset$age <- as.integer(repaired_dataset$age)
mean_age<-mean(repaired_dataset$age, na.rm = TRUE)
mean_age <- as.integer(mean_age)
repaired_dataset$age[is.na(repaired_dataset$age)] <- mean_age
repaired_dataset <- repaired_dataset[, -which(names(repaired_dataset) == "education_num")]
mode_edu<-Mode(repaired_dataset$education,na.rm = TRUE)
repaired_dataset$education[is.na(repaired_dataset$education)] <- mode_edu
repaired_dataset$race[is.na(repaired_dataset$race)] <- "Other"
repaired_dataset$occupation[is.na(repaired_dataset$occupation)] <- "Other-service"
head(repaired_dataset)
## age workclass education marital_status occupation relationship
## 1 46 Never-worked Masters Divorced Adm-clerical Unmarried
## 3 53 Private Bachelors Separated Machine-op-inspct Not-in-family
## 4 70 Private 9th Separated Farming-fishing Not-in-family
## 5 78 Local-gov 7th-8th Married-AF-spouse Handlers-cleaners Own-child
## 6 71 Self-emp-inc Assoc-voc Married-AF-spouse Other-service Other-relative
## 8 52 Federal-gov Masters Never-married Other-service Husband
## race sex capital_gain capital_loss hours_per_week
## 1 Asian-Pac-Islander Male 81712 69387 49
## 3 Other Male 65305 57731 8
## 4 Other Male 7401 41746 26
## 5 Amer-Indian-Eskimo Female 45980 99713 71
## 6 Asian-Pac-Islander Male -999 81896 16
## 8 White Male 51768 53002 53
## native_country income
## 1 Poland >50K
## 3 United-States <=50K
## 4 Trinadad&Tobago <=50K
## 5 Nicaragua >50K
## 6 Guatemala <=50K
## 8 Taiwan <=50K
vis_miss(repaired_dataset, cluster = TRUE)
Variable age was transformed to int data type, which seems more suitable for it, and then “NA” were changed to the mean value. Missing cases in “hours_per_week” variable were replaced with the median value. Education variable was input with the mode value. race and occupation variables were adjusted with the values “Other” and “Other-service” respectively.
As we can see from the table, we got rid of “NA” values.
library(DescTools)
boxplot(repaired_dataset$capital_gain)
mean_gain <- repaired_dataset$capital_gain %>%
mean(na.rm = TRUE) %>%
as.integer()
repaired_dataset$capital_gain[repaired_dataset$capital_gain < 0] <- mean_gain
plot(repaired_dataset$capital_gain, pch = ifelse(repaired_dataset$capital_gain < 0, 19, 1), col = ifelse(repaired_dataset$capital_gain < 0, "red", "blue"),
xlab = "Index", ylab = "Values", main = "Point Graphic with Values less than Zero Marked")
library(DescTools)
library(dlookr)
mean_loss <- repaired_dataset$capital_loss %>%
mean(na.rm = TRUE) %>%
as.integer()
repaired_dataset$capital_loss[repaired_dataset$capital_loss < 0] <- mean_loss
plot(repaired_dataset$capital_loss, pch = ifelse(repaired_dataset$capital_loss < 0, 19, 1), col = ifelse(repaired_dataset$capital_loss < 0, "red", "blue"),
xlab = "Index", ylab = "Values", main = "Point Graphic with Values less than Zero Marked")
transformation_web_report(repaired_dataset,browse = FALSE,title = "Transformation report",author = "Evgenei Khodar",create_date = Sys.time())
##
|
| | 0%
|
|. | 3%
|
|... | 5% [setup]
|
|.... | 8%
|
|..... | 10% [load_packages]
|
|....... | 13%
|
|........ | 15% [unnamed-chunk-6]
|
|......... | 18%
|
|.......... | 21% [udf]
|
|............ | 23%
|
|............. | 26% [create-overview]
|
|.............. | 28%
|
|................ | 31% [overview]
|
|................. | 33%
|
|.................. | 36% [overview-pre]
|
|.................... | 38%
|
|..................... | 41% [unnamed-chunk-7]
|
|...................... | 44%
|
|........................ | 46% [unnamed-chunk-8]
|
|......................... | 49%
|
|.......................... | 51% [unnamed-chunk-9]
|
|........................... | 54%
|
|............................. | 56% [nalist]
|
|.............................. | 59%
|
|............................... | 62% [unnamed-chunk-10]
|
|................................. | 64%
|
|.................................. | 67% [outlist]
|
|................................... | 69%
|
|..................................... | 72% [unnamed-chunk-11]
|
|...................................... | 74%
|
|....................................... | 77% [skweness]
|
|......................................... | 79%
|
|.......................................... | 82% [unnamed-chunk-12]
|
|........................................... | 85%
|
|............................................ | 87% [binning]
|
|.............................................. | 90%
|
|............................................... | 92% [unnamed-chunk-13]
|
|................................................ | 95%
|
|.................................................. | 97% [optimal-binning]
|
|...................................................| 100%
## "C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/pandoc" +RTS -K512m -RTS transformation_temp.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc53101f551d0.html --lua-filter "C:\Users\evgro\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\pagebreak.lua" --lua-filter "C:\Users\evgro\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\latex-div.lua" --embed-resources --standalone --variable bs3=TRUE --section-divs --template "C:\Users\evgro\AppData\Local\R\win-library\4.3\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable theme=bootstrap --css "C:/Users/evgro/AppData/Local/R/win-library/4.3/dlookr/resources/dlookr-bootstrap.css" --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\evgro\AppData\Local\Temp\RtmpKOBefl\rmarkdown-str53101d3a632a.html" --variable code_folding=show --variable code_menu=1 --include-in-header header_temp.html --include-after-body "C:\Users\evgro\AppData\Local\R\win-library\4.3\dlookr\resources\footer.html"
To numerical variables “capital_gain” and “capital_loss” the same method was applied in order to deal with outliers, values less then zero were replaced with the mean value. As it could be observed from the graphics, variables now are in better condition.