Authors: Evgenei Khodar, Marta Szulca
library(rrcov)
# Warning: package 'rrcov' was built under R version 4.3.3
# Loading required package: robustbase
# Warning: package 'robustbase' was built under R version 4.3.3
# Scalable Robust Estimators with High Breakdown Point (version 1.7-5)
library(naniar)
# Warning: package 'naniar' was built under R version 4.3.3
library(dlookr)
# Warning: package 'dlookr' was built under R version 4.3.3
# Registered S3 methods overwritten by 'dlookr':
# method from
# plot.transform scales
# print.transform scalesFirst task - load the dataset
# Load necessary libraries
library(tidyverse)
# Warning: package 'tidyverse' was built under R version 4.3.3
# ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
# ✔ dplyr 1.1.4 ✔ readr 2.1.5
# ✔ forcats 1.0.0 ✔ stringr 1.5.1
# ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
# ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
# ✔ purrr 1.0.2
# Set seed for reproducibility
set.seed(15)
# Generate sample data
n <- 1000 # Number of observations
# Age between 18 and 90
age <- sample(18:90, n, replace = TRUE)
# Introduce errors in age column
age[sample(1:n, 50)] <- "InvalidAge"
# Workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked
workclass <- sample(c("Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov", "Local-gov", "State-gov", "Without-pay", "Never-worked"), n, replace = TRUE)
# Introduce missing values in workclass column
workclass[sample(1:n, 50)] <- NA
# Education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool
education <- sample(c("Bachelors", "Some-college", "11th", "HS-grad", "Prof-school", "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters", "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"), n, replace = TRUE)
# Introduce inconsistencies in education column
education[sample(1:n, 50)] <- "InvalidEducation"
# Education Number: 1-16
education_num <- sample(1:16, n, replace = TRUE)
# Marital Status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse
marital_status <- sample(c("Married-civ-spouse", "Divorced", "Never-married", "Separated", "Widowed", "Married-spouse-absent", "Married-AF-spouse"), n, replace = TRUE)
# Introduce missing values in marital_status column
marital_status[sample(1:n, 50)] <- NA
# Occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces
occupation <- sample(c("Tech-support", "Craft-repair", "Other-service", "Sales", "Exec-managerial", "Prof-specialty", "Handlers-cleaners", "Machine-op-inspct", "Adm-clerical", "Farming-fishing", "Transport-moving", "Priv-house-serv", "Protective-serv", "Armed-Forces"), n, replace = TRUE)
# Introduce errors in occupation column
occupation[sample(1:n, 50)] <- "InvalidOccupation"
# Relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried
relationship <- sample(c("Wife", "Own-child", "Husband", "Not-in-family", "Other-relative", "Unmarried"), n, replace = TRUE)
# Introduce missing values in relationship column
relationship[sample(1:n, 50)] <- NA
# Race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black
race <- sample(c("White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"), n, replace = TRUE)
# Introduce inconsistencies in race column
race[sample(1:n, 50)] <- "InvalidRace"
# Sex: Female, Male
sex <- sample(c("Female", "Male"), n, replace = TRUE)
# Introduce missing values in sex column
sex[sample(1:n, 50)] <- NA
# Capital Gain: 0-99999
capital_gain <- sample(0:99999, n, replace = TRUE)
# Introduce errors in capital_gain column
capital_gain[sample(1:n, 50)] <- -999
# Capital Loss: 0-99999
capital_loss <- sample(0:99999, n, replace = TRUE)
# Introduce errors in capital_loss column
capital_loss[sample(1:n, 50)] <- -999
# Hours per week: 1-99
hours_per_week <- sample(1:99, n, replace = TRUE)
# Introduce missing values in hours_per_week column
hours_per_week[sample(1:n, 50)] <- NA
# Native country: United-States, Cambodia, England, ...
native_country <- sample(c("United-States", "Cambodia", "England", "Puerto-Rico", "Canada", "Germany", "Outlying-US(Guam-USVI-etc)", "India", "Japan", "Greece", "South", "China", "Cuba", "Iran", "Honduras", "Philippines", "Italy", "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal", "Ireland", "France", "Dominican-Republic", "Laos", "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary", "Guatemala", "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador", "Trinadad&Tobago", "Peru", "Hong", "Holand-Netherlands"), n, replace = TRUE)
# Introduce missing values in native_country column
native_country[sample(1:n, 50)] <- NA
# Income: <=50K, >50K
income <- sample(c("<=50K", ">50K"), n, replace = TRUE, prob = c(0.75, 0.25))
# Create dataframe
sample_dirty_dataset <- data.frame(age, workclass, education, education_num, marital_status, occupation, relationship, race, sex, capital_gain, capital_loss, hours_per_week, native_country, income)
# Print first few rows of the dataset
head(sample_dirty_dataset)
# age workclass education education_num marital_status
# 1 54 State-gov Prof-school 6 Married-civ-spouse
# 2 51 Local-gov Assoc-acdm 9 Divorced
# 3 55 Local-gov Assoc-acdm 4 Never-married
# 4 66 Self-emp-not-inc HS-grad 13 Widowed
# 5 22 Local-gov Prof-school 10 Widowed
# 6 82 Federal-gov HS-grad 4 Married-civ-spouse
# occupation relationship race sex capital_gain
# 1 Machine-op-inspct Own-child Amer-Indian-Eskimo Female 50151
# 2 InvalidOccupation Other-relative Asian-Pac-Islander Male 42486
# 3 Other-service Other-relative Amer-Indian-Eskimo Male 2927
# 4 Farming-fishing Other-relative Amer-Indian-Eskimo Female 57887
# 5 Sales <NA> White Male 48687
# 6 Adm-clerical Husband Amer-Indian-Eskimo <NA> 86926
# capital_loss hours_per_week native_country income
# 1 58652 26 Ireland <=50K
# 2 91507 64 Puerto-Rico >50K
# 3 75615 34 Japan <=50K
# 4 90593 43 Italy >50K
# 5 29546 22 Mexico <=50K
# 6 68771 99 Vietnam <=50KSecond task - find problems with the dataset
library(tidyverse)
library(dlookr)
diagnose(sample_dirty_dataset)
# # A tibble: 14 × 6
# variables types missing_count missing_percent unique_count unique_rate
# <chr> <chr> <int> <dbl> <int> <dbl>
# 1 age charac… 0 0 74 0.074
# 2 workclass charac… 50 5 9 0.009
# 3 education charac… 0 0 17 0.017
# 4 education_num integer 0 0 16 0.016
# 5 marital_status charac… 50 5 8 0.008
# 6 occupation charac… 0 0 15 0.015
# 7 relationship charac… 50 5 7 0.007
# 8 race charac… 0 0 6 0.006
# 9 sex charac… 50 5 3 0.003
# 10 capital_gain numeric 0 0 944 0.944
# 11 capital_loss numeric 0 0 949 0.949
# 12 hours_per_week integer 50 5 100 0.1
# 13 native_country charac… 50 5 42 0.042
# 14 income charac… 0 0 2 0.002Missing values in: work class, sex, relationship, native country, marital status and hours per week.
Third task - correct erroneous values
sample_dirty_dataset$age[sample_dirty_dataset$age=="InvalidAge"] <- NA
sample_dirty_dataset$education[sample_dirty_dataset$education=="InvalidEducation"] <- NAFourth task - deal with data errors
sample_dirty_dataset$race[sample_dirty_dataset$race=="InvalidRace"] <- NA
sample_dirty_dataset$occupation[sample_dirty_dataset$occupation=="InvalidOccupation"] <- NAFifth task - address missing values
sample_dirty_dataset%>%
miss_case_table()
# # A tibble: 4 × 3
# n_miss_in_case n_cases pct_cases
# <int> <int> <dbl>
# 1 0 591 59.1
# 2 1 328 32.8
# 3 2 71 7.1
# 4 3 10 1library(naniar)
library(DescTools)
# Warning: package 'DescTools' was built under R version 4.3.3
repaired_dataset <- sample_dirty_dataset[complete.cases(sample_dirty_dataset[c("workclass", "marital_status", "relationship", "sex", "native_country")]), ]
median_value <- median(repaired_dataset[["hours_per_week"]], na.rm = TRUE)
repaired_dataset[["hours_per_week"]][is.na(repaired_dataset[["hours_per_week"]])] <- median_value
repaired_dataset$age <- as.integer(repaired_dataset$age)
mean_age<-mean(repaired_dataset$age, na.rm = TRUE)
mean_age <- as.integer(mean_age)
repaired_dataset$age[is.na(repaired_dataset$age)] <- mean_age
repaired_dataset <- repaired_dataset[, -which(names(repaired_dataset) == "education_num")]
mode_edu<-Mode(repaired_dataset$education,na.rm = TRUE)
repaired_dataset$education[is.na(repaired_dataset$education)] <- mode_edu
repaired_dataset$race[is.na(repaired_dataset$race)] <- "Other"
repaired_dataset$occupation[is.na(repaired_dataset$occupation)] <- "Other-service"
head(repaired_dataset)
# age workclass education marital_status occupation
# 1 54 State-gov Prof-school Married-civ-spouse Machine-op-inspct
# 2 51 Local-gov Assoc-acdm Divorced Other-service
# 3 55 Local-gov Assoc-acdm Never-married Other-service
# 4 66 Self-emp-not-inc HS-grad Widowed Farming-fishing
# 7 29 Self-emp-not-inc Doctorate Separated Prof-specialty
# 8 54 Local-gov Doctorate Never-married Adm-clerical
# relationship race sex capital_gain capital_loss
# 1 Own-child Amer-Indian-Eskimo Female 50151 58652
# 2 Other-relative Asian-Pac-Islander Male 42486 91507
# 3 Other-relative Amer-Indian-Eskimo Male 2927 75615
# 4 Other-relative Amer-Indian-Eskimo Female 57887 90593
# 7 Wife Asian-Pac-Islander Male 75882 83140
# 8 Wife Asian-Pac-Islander Female 83072 11312
# hours_per_week native_country income
# 1 26 Ireland <=50K
# 2 64 Puerto-Rico >50K
# 3 34 Japan <=50K
# 4 43 Italy >50K
# 7 55 Canada >50K
# 8 21 Jamaica <=50KSixth task - detect possible outliers
plot(repaired_dataset$capital_gain, pch = ifelse(repaired_dataset$capital_gain < 0, 19, 1), col = ifelse(repaired_dataset$capital_gain < 0, "red", "blue"),
xlab = "Capital gain", ylab = "Values", main = "Graphic of capital_gain variable values")
plot(repaired_dataset$capital_loss, pch = ifelse(repaired_dataset$capital_loss < 0, 19, 1), col = ifelse(repaired_dataset$capital_loss < 0, "red", "blue"),
xlab = "Capital loss", ylab = "Values", main = "Graphic of capital_loss variable values")# numeric(0)
Seventh task - transform variables
mean_gain <- repaired_dataset$capital_gain %>%
mean(na.rm = TRUE) %>%
as.integer()
repaired_dataset$capital_gain[repaired_dataset$capital_gain < 0] <- mean_gain
plot(repaired_dataset$capital_gain, pch = ifelse(repaired_dataset$capital_gain < 0, 19, 1), col = ifelse(repaired_dataset$capital_gain < 0, "red", "blue"),
xlab = "Index", ylab = "Values", main = "Graphic of the capital_gain variable values after transformation")library(DescTools)
library(dlookr)
mean_loss <- repaired_dataset$capital_loss %>%
mean(na.rm = TRUE) %>%
as.integer()
repaired_dataset$capital_loss[repaired_dataset$capital_loss < 0] <- mean_loss
plot(repaired_dataset$capital_loss, pch = ifelse(repaired_dataset$capital_loss < 0, 19, 1), col = ifelse(repaired_dataset$capital_loss < 0, "red", "blue"),
xlab = "Index", ylab = "Values", main = "Graphic of the capital_loss variable values after transformation")Eight task - explore
Ninth task - summary:
At first, we have checked where was the missing data, then replaced invalid values in some columns with the “NA”, to make them easier to work with and then visualized all of the the missing data and missing cases table.
Although only 3.6% of values were missing, if we would delete all cases with “NA” values, it would remove more than 40% of all observations. Also, there were not many cases, where many variables were missing at the same time.
Dealing with missing values of qualitative columns, the most logical way would be to remove such observations, where inputting seems not a good option. Such candidates are workclass, marital_status, relationship, sex, native_country.
Variable age was transformed to int data type, which seems more suitable for it, and then “NA” values were changed to the mean value. Missing cases in “hours_per_week” variable were replaced with the median value. Education variable was input with the mode value. race and occupation variables were adjusted with the values “Other” and “Other-service” respectively.
Then, we have visualized the numerical values to find outliers.
To numerical variables “capital_gain” and “capital_loss” the same method was applied in order to deal with outliers, values less then zero were replaced with the mean value.
As we can see from the tables in task 7 and 8, we got rid of all “NA” values and outliers, the variables now are in better condition.
As we can see, data cleaning helps a lot with its visualization and understanding.
Tenth task - generate reports
diagnose_web_report(repaired_dataset, title = "Diagnostical report")
# processing file: diagnosis_temp.Rmd
# Error in parse_block(g[-1], g[1], params.src, markdown_mode): Duplicate chunk label 'setup', which has been used for the chunk:
# knitr::opts_chunk$set(echo = TRUE)
#
# library(conflicted)
# conflicts_prefer(dplyr::filter)
# conflicts_prefer(dplyr::lag)eda_web_report(repaired_dataset, title = "EDA report")
# processing file: eda_temp.Rmd
# Error in parse_block(g[-1], g[1], params.src, markdown_mode): Duplicate chunk label 'setup', which has been used for the chunk:
# knitr::opts_chunk$set(echo = TRUE)
#
# library(conflicted)
# conflicts_prefer(dplyr::filter)
# conflicts_prefer(dplyr::lag)transformation_web_report(repaired_dataset,title = "Static report")
# processing file: transformation_temp.Rmd
# Error in parse_block(g[-1], g[1], params.src, markdown_mode): Duplicate chunk label 'setup', which has been used for the chunk:
# knitr::opts_chunk$set(echo = TRUE)
#
# library(conflicted)
# conflicts_prefer(dplyr::filter)
# conflicts_prefer(dplyr::lag)sessionInfo()
# R version 4.3.2 (2023-10-31 ucrt)
# Platform: x86_64-w64-mingw32/x64 (64-bit)
# Running under: Windows 10 x64 (build 19045)
#
# Matrix products: default
#
#
# locale:
# [1] LC_COLLATE=Russian_Belarus.utf8 LC_CTYPE=Russian_Belarus.utf8
# [3] LC_MONETARY=Russian_Belarus.utf8 LC_NUMERIC=C
# [5] LC_TIME=Russian_Belarus.utf8
#
# time zone: Europe/Warsaw
# tzcode source: internal
#
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] DescTools_0.99.54 lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1
# [5] dplyr_1.1.4 purrr_1.0.2 readr_2.1.5 tidyr_1.3.1
# [9] tibble_3.2.1 ggplot2_3.4.4 tidyverse_2.0.0 dlookr_0.6.3
# [13] naniar_1.1.0 rrcov_1.7-5 robustbase_0.99-2 conflicted_1.2.0
#
# loaded via a namespace (and not attached):
# [1] gridExtra_2.3 gld_2.6.6 readxl_1.4.3
# [4] rlang_1.1.3 magrittr_2.0.3 hrbrthemes_0.8.7
# [7] e1071_1.7-14 compiler_4.3.2 systemfonts_1.0.5
# [10] vctrs_0.6.5 sysfonts_0.8.9 httpcode_0.3.0
# [13] pkgconfig_2.0.3 crayon_1.5.2 fastmap_1.1.1
# [16] labeling_0.4.3 utf8_1.2.4 promises_1.2.1
# [19] rmarkdown_2.25 tzdb_0.4.0 visdat_0.6.0
# [22] UpSetR_1.4.0 xfun_0.42 reprex_2.1.0
# [25] showtext_0.9-7 cachem_1.0.8 jsonlite_1.8.8
# [28] highr_0.10 later_1.3.2 R6_2.5.1
# [31] bslib_0.6.1 stringi_1.8.3 pagedown_0.20
# [34] boot_1.3-28.1 extrafontdb_1.0 jquerylib_0.1.4
# [37] cellranger_1.1.0 Rcpp_1.0.12 knitr_1.45
# [40] extrafont_0.19 httpuv_1.6.15 Matrix_1.6-1.1
# [43] timechange_0.3.0 tidyselect_1.2.0 rstudioapi_0.15.0
# [46] yaml_2.3.8 curl_5.2.0 lattice_0.21-9
# [49] plyr_1.8.9 shiny_1.8.1 withr_3.0.0
# [52] evaluate_0.23 proxy_0.4-27 xml2_1.3.6
# [55] pillar_1.9.0 stats4_4.3.2 pcaPP_2.0-4
# [58] generics_0.1.3 hms_1.1.3 rootSolve_1.8.2.4
# [61] munsell_0.5.0 scales_1.3.0 xtable_1.8-4
# [64] class_7.3-22 glue_1.7.0 gdtools_0.3.7
# [67] lmom_3.0 tools_4.3.2 gfonts_0.2.0
# [70] data.table_1.15.0 Exact_3.2 reactable_0.4.4
# [73] fs_1.6.3 mvtnorm_1.2-4 grid_4.3.2
# [76] Rttf2pt1_1.3.12 colorspace_2.1-0 showtextdb_3.0
# [79] cli_3.6.2 kableExtra_1.4.0 fontBitstreamVera_0.1.1
# [82] fansi_1.0.6 expm_0.999-9 viridisLite_0.4.2
# [85] svglite_2.1.3 gtable_0.3.4 DEoptimR_1.1-3
# [88] sass_0.4.8 digest_0.6.34 fontquiver_0.2.1
# [91] crul_1.4.0 htmlwidgets_1.6.4 farver_2.1.1
# [94] memoise_2.0.1 htmltools_0.5.7 lifecycle_1.0.4
# [97] httr_1.4.7 mime_0.12 fontLiberation_0.1.0
# [100] MASS_7.3-60