Authors: Evgenei Khodar, Marta Szulca

library(rrcov)
# Warning: package 'rrcov' was built under R version 4.3.3
# Loading required package: robustbase
# Warning: package 'robustbase' was built under R version 4.3.3
# Scalable Robust Estimators with High Breakdown Point (version 1.7-5)
library(naniar)
# Warning: package 'naniar' was built under R version 4.3.3
library(dlookr)
# Warning: package 'dlookr' was built under R version 4.3.3
# Registered S3 methods overwritten by 'dlookr':
#   method          from  
#   plot.transform  scales
#   print.transform scales

First task - load the dataset

# Load necessary libraries
library(tidyverse)
# Warning: package 'tidyverse' was built under R version 4.3.3
# ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
# ✔ dplyr     1.1.4     ✔ readr     2.1.5
# ✔ forcats   1.0.0     ✔ stringr   1.5.1
# ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
# ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
# ✔ purrr     1.0.2

# Set seed for reproducibility
set.seed(15)

# Generate sample data
n <- 1000  # Number of observations

# Age between 18 and 90
age <- sample(18:90, n, replace = TRUE)

# Introduce errors in age column
age[sample(1:n, 50)] <- "InvalidAge"

# Workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked
workclass <- sample(c("Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov", "Local-gov", "State-gov", "Without-pay", "Never-worked"), n, replace = TRUE)

# Introduce missing values in workclass column
workclass[sample(1:n, 50)] <- NA

# Education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool
education <- sample(c("Bachelors", "Some-college", "11th", "HS-grad", "Prof-school", "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters", "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"), n, replace = TRUE)

# Introduce inconsistencies in education column
education[sample(1:n, 50)] <- "InvalidEducation"

# Education Number: 1-16
education_num <- sample(1:16, n, replace = TRUE)

# Marital Status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse
marital_status <- sample(c("Married-civ-spouse", "Divorced", "Never-married", "Separated", "Widowed", "Married-spouse-absent", "Married-AF-spouse"), n, replace = TRUE)

# Introduce missing values in marital_status column
marital_status[sample(1:n, 50)] <- NA

# Occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces
occupation <- sample(c("Tech-support", "Craft-repair", "Other-service", "Sales", "Exec-managerial", "Prof-specialty", "Handlers-cleaners", "Machine-op-inspct", "Adm-clerical", "Farming-fishing", "Transport-moving", "Priv-house-serv", "Protective-serv", "Armed-Forces"), n, replace = TRUE)

# Introduce errors in occupation column
occupation[sample(1:n, 50)] <- "InvalidOccupation"

# Relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried
relationship <- sample(c("Wife", "Own-child", "Husband", "Not-in-family", "Other-relative", "Unmarried"), n, replace = TRUE)

# Introduce missing values in relationship column
relationship[sample(1:n, 50)] <- NA

# Race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black
race <- sample(c("White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"), n, replace = TRUE)

# Introduce inconsistencies in race column
race[sample(1:n, 50)] <- "InvalidRace"

# Sex: Female, Male
sex <- sample(c("Female", "Male"), n, replace = TRUE)

# Introduce missing values in sex column
sex[sample(1:n, 50)] <- NA

# Capital Gain: 0-99999
capital_gain <- sample(0:99999, n, replace = TRUE)

# Introduce errors in capital_gain column
capital_gain[sample(1:n, 50)] <- -999

# Capital Loss: 0-99999
capital_loss <- sample(0:99999, n, replace = TRUE)

# Introduce errors in capital_loss column
capital_loss[sample(1:n, 50)] <- -999

# Hours per week: 1-99
hours_per_week <- sample(1:99, n, replace = TRUE)

# Introduce missing values in hours_per_week column
hours_per_week[sample(1:n, 50)] <- NA

# Native country: United-States, Cambodia, England, ...
native_country <- sample(c("United-States", "Cambodia", "England", "Puerto-Rico", "Canada", "Germany", "Outlying-US(Guam-USVI-etc)", "India", "Japan", "Greece", "South", "China", "Cuba", "Iran", "Honduras", "Philippines", "Italy", "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal", "Ireland", "France", "Dominican-Republic", "Laos", "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary", "Guatemala", "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador", "Trinadad&Tobago", "Peru", "Hong", "Holand-Netherlands"), n, replace = TRUE)

# Introduce missing values in native_country column
native_country[sample(1:n, 50)] <- NA

# Income: <=50K, >50K
income <- sample(c("<=50K", ">50K"), n, replace = TRUE, prob = c(0.75, 0.25))

# Create dataframe
sample_dirty_dataset <- data.frame(age, workclass, education, education_num, marital_status, occupation, relationship, race, sex, capital_gain, capital_loss, hours_per_week, native_country, income)

# Print first few rows of the dataset
head(sample_dirty_dataset)
#   age        workclass   education education_num     marital_status
# 1  54        State-gov Prof-school             6 Married-civ-spouse
# 2  51        Local-gov  Assoc-acdm             9           Divorced
# 3  55        Local-gov  Assoc-acdm             4      Never-married
# 4  66 Self-emp-not-inc     HS-grad            13            Widowed
# 5  22        Local-gov Prof-school            10            Widowed
# 6  82      Federal-gov     HS-grad             4 Married-civ-spouse
#          occupation   relationship               race    sex capital_gain
# 1 Machine-op-inspct      Own-child Amer-Indian-Eskimo Female        50151
# 2 InvalidOccupation Other-relative Asian-Pac-Islander   Male        42486
# 3     Other-service Other-relative Amer-Indian-Eskimo   Male         2927
# 4   Farming-fishing Other-relative Amer-Indian-Eskimo Female        57887
# 5             Sales           <NA>              White   Male        48687
# 6      Adm-clerical        Husband Amer-Indian-Eskimo   <NA>        86926
#   capital_loss hours_per_week native_country income
# 1        58652             26        Ireland  <=50K
# 2        91507             64    Puerto-Rico   >50K
# 3        75615             34          Japan  <=50K
# 4        90593             43          Italy   >50K
# 5        29546             22         Mexico  <=50K
# 6        68771             99        Vietnam  <=50K

Second task - find problems with the dataset

library(tidyverse)
library(dlookr)
diagnose(sample_dirty_dataset)
# # A tibble: 14 × 6
#    variables      types   missing_count missing_percent unique_count unique_rate
#    <chr>          <chr>           <int>           <dbl>        <int>       <dbl>
#  1 age            charac…             0               0           74       0.074
#  2 workclass      charac…            50               5            9       0.009
#  3 education      charac…             0               0           17       0.017
#  4 education_num  integer             0               0           16       0.016
#  5 marital_status charac…            50               5            8       0.008
#  6 occupation     charac…             0               0           15       0.015
#  7 relationship   charac…            50               5            7       0.007
#  8 race           charac…             0               0            6       0.006
#  9 sex            charac…            50               5            3       0.003
# 10 capital_gain   numeric             0               0          944       0.944
# 11 capital_loss   numeric             0               0          949       0.949
# 12 hours_per_week integer            50               5          100       0.1  
# 13 native_country charac…            50               5           42       0.042
# 14 income         charac…             0               0            2       0.002

Missing values in: work class, sex, relationship, native country, marital status and hours per week.

Third task - correct erroneous values

sample_dirty_dataset$age[sample_dirty_dataset$age=="InvalidAge"] <- NA
sample_dirty_dataset$education[sample_dirty_dataset$education=="InvalidEducation"] <- NA

Fourth task - deal with data errors

sample_dirty_dataset$race[sample_dirty_dataset$race=="InvalidRace"] <- NA
sample_dirty_dataset$occupation[sample_dirty_dataset$occupation=="InvalidOccupation"] <- NA

Fifth task - address missing values

vis_miss(sample_dirty_dataset, cluster = TRUE)

sample_dirty_dataset%>%
  miss_case_table()
# # A tibble: 4 × 3
#   n_miss_in_case n_cases pct_cases
#            <int>   <int>     <dbl>
# 1              0     591      59.1
# 2              1     328      32.8
# 3              2      71       7.1
# 4              3      10       1
gg_miss_upset(sample_dirty_dataset, 
              nsets = 6)

library(naniar)
library(DescTools)
# Warning: package 'DescTools' was built under R version 4.3.3
repaired_dataset <- sample_dirty_dataset[complete.cases(sample_dirty_dataset[c("workclass", "marital_status", "relationship", "sex", "native_country")]), ]

median_value <- median(repaired_dataset[["hours_per_week"]], na.rm = TRUE) 

repaired_dataset[["hours_per_week"]][is.na(repaired_dataset[["hours_per_week"]])] <- median_value 

repaired_dataset$age <- as.integer(repaired_dataset$age)

mean_age<-mean(repaired_dataset$age, na.rm = TRUE)
 mean_age <- as.integer(mean_age)

repaired_dataset$age[is.na(repaired_dataset$age)] <- mean_age


repaired_dataset <- repaired_dataset[, -which(names(repaired_dataset) == "education_num")]

mode_edu<-Mode(repaired_dataset$education,na.rm = TRUE)
 
repaired_dataset$education[is.na(repaired_dataset$education)] <- mode_edu


repaired_dataset$race[is.na(repaired_dataset$race)] <- "Other"


repaired_dataset$occupation[is.na(repaired_dataset$occupation)] <- "Other-service"

head(repaired_dataset)
#   age        workclass   education     marital_status        occupation
# 1  54        State-gov Prof-school Married-civ-spouse Machine-op-inspct
# 2  51        Local-gov  Assoc-acdm           Divorced     Other-service
# 3  55        Local-gov  Assoc-acdm      Never-married     Other-service
# 4  66 Self-emp-not-inc     HS-grad            Widowed   Farming-fishing
# 7  29 Self-emp-not-inc   Doctorate          Separated    Prof-specialty
# 8  54        Local-gov   Doctorate      Never-married      Adm-clerical
#     relationship               race    sex capital_gain capital_loss
# 1      Own-child Amer-Indian-Eskimo Female        50151        58652
# 2 Other-relative Asian-Pac-Islander   Male        42486        91507
# 3 Other-relative Amer-Indian-Eskimo   Male         2927        75615
# 4 Other-relative Amer-Indian-Eskimo Female        57887        90593
# 7           Wife Asian-Pac-Islander   Male        75882        83140
# 8           Wife Asian-Pac-Islander Female        83072        11312
#   hours_per_week native_country income
# 1             26        Ireland  <=50K
# 2             64    Puerto-Rico   >50K
# 3             34          Japan  <=50K
# 4             43          Italy   >50K
# 7             55         Canada   >50K
# 8             21        Jamaica  <=50K

Sixth task - detect possible outliers

plot(repaired_dataset$capital_gain, pch = ifelse(repaired_dataset$capital_gain < 0, 19, 1), col = ifelse(repaired_dataset$capital_gain < 0, "red", "blue"),
     xlab = "Capital gain", ylab = "Values", main = "Graphic of capital_gain variable values")



plot(repaired_dataset$capital_loss, pch = ifelse(repaired_dataset$capital_loss < 0, 19, 1), col = ifelse(repaired_dataset$capital_loss < 0, "red", "blue"),
     xlab = "Capital loss", ylab = "Values", main = "Graphic of capital_loss variable values")

library(DescTools)
boxplot(repaired_dataset$hours_per_week)$out

# numeric(0)
par(mfrow=c(2,2))
plot(covMcd(repaired_dataset$capital_gain))

Seventh task - transform variables

mean_gain <- repaired_dataset$capital_gain %>%
  mean(na.rm = TRUE) %>%
  as.integer()

repaired_dataset$capital_gain[repaired_dataset$capital_gain < 0] <- mean_gain


plot(repaired_dataset$capital_gain, pch = ifelse(repaired_dataset$capital_gain < 0, 19, 1), col = ifelse(repaired_dataset$capital_gain < 0, "red", "blue"),
     xlab = "Index", ylab = "Values", main = "Graphic of the capital_gain variable values after transformation")

library(DescTools)

library(dlookr)

mean_loss <- repaired_dataset$capital_loss %>%
  mean(na.rm = TRUE) %>%
  as.integer()

repaired_dataset$capital_loss[repaired_dataset$capital_loss < 0] <- mean_loss


plot(repaired_dataset$capital_loss, pch = ifelse(repaired_dataset$capital_loss < 0, 19, 1), col = ifelse(repaired_dataset$capital_loss < 0, "red", "blue"),
     xlab = "Index", ylab = "Values", main = "Graphic of the capital_loss variable values after transformation")

Eight task - explore

vis_miss(repaired_dataset, cluster = TRUE)

plot_outlier(repaired_dataset)

Ninth task - summary:

At first, we have checked where was the missing data, then replaced invalid values in some columns with the “NA”, to make them easier to work with and then visualized all of the the missing data and missing cases table.

Although only 3.6% of values were missing, if we would delete all cases with “NA” values, it would remove more than 40% of all observations. Also, there were not many cases, where many variables were missing at the same time.

Dealing with missing values of qualitative columns, the most logical way would be to remove such observations, where inputting seems not a good option. Such candidates are workclass, marital_status, relationship, sex, native_country.

Variable age was transformed to int data type, which seems more suitable for it, and then “NA” values were changed to the mean value. Missing cases in “hours_per_week” variable were replaced with the median value. Education variable was input with the mode value. race and occupation variables were adjusted with the values “Other” and “Other-service” respectively.

Then, we have visualized the numerical values to find outliers.

To numerical variables “capital_gain” and “capital_loss” the same method was applied in order to deal with outliers, values less then zero were replaced with the mean value.

As we can see from the tables in task 7 and 8, we got rid of all “NA” values and outliers, the variables now are in better condition.

As we can see, data cleaning helps a lot with its visualization and understanding.

Tenth task - generate reports

diagnose_web_report(repaired_dataset, title = "Diagnostical report")
# processing file: diagnosis_temp.Rmd
# Error in parse_block(g[-1], g[1], params.src, markdown_mode): Duplicate chunk label 'setup', which has been used for the chunk:
# knitr::opts_chunk$set(echo = TRUE)
# 
# library(conflicted)
# conflicts_prefer(dplyr::filter)
# conflicts_prefer(dplyr::lag)
eda_web_report(repaired_dataset, title = "EDA report")
# processing file: eda_temp.Rmd
# Error in parse_block(g[-1], g[1], params.src, markdown_mode): Duplicate chunk label 'setup', which has been used for the chunk:
# knitr::opts_chunk$set(echo = TRUE)
# 
# library(conflicted)
# conflicts_prefer(dplyr::filter)
# conflicts_prefer(dplyr::lag)
transformation_web_report(repaired_dataset,title = "Static report")
# processing file: transformation_temp.Rmd
# Error in parse_block(g[-1], g[1], params.src, markdown_mode): Duplicate chunk label 'setup', which has been used for the chunk:
# knitr::opts_chunk$set(echo = TRUE)
# 
# library(conflicted)
# conflicts_prefer(dplyr::filter)
# conflicts_prefer(dplyr::lag)
Standard output and standard error
✖ Install the styler package in order to use `style = TRUE`.
Session info
sessionInfo()
# R version 4.3.2 (2023-10-31 ucrt)
# Platform: x86_64-w64-mingw32/x64 (64-bit)
# Running under: Windows 10 x64 (build 19045)
# 
# Matrix products: default
# 
# 
# locale:
# [1] LC_COLLATE=Russian_Belarus.utf8  LC_CTYPE=Russian_Belarus.utf8   
# [3] LC_MONETARY=Russian_Belarus.utf8 LC_NUMERIC=C                    
# [5] LC_TIME=Russian_Belarus.utf8    
# 
# time zone: Europe/Warsaw
# tzcode source: internal
# 
# attached base packages:
# [1] stats     graphics  grDevices utils     datasets  methods   base     
# 
# other attached packages:
#  [1] DescTools_0.99.54 lubridate_1.9.3   forcats_1.0.0     stringr_1.5.1    
#  [5] dplyr_1.1.4       purrr_1.0.2       readr_2.1.5       tidyr_1.3.1      
#  [9] tibble_3.2.1      ggplot2_3.4.4     tidyverse_2.0.0   dlookr_0.6.3     
# [13] naniar_1.1.0      rrcov_1.7-5       robustbase_0.99-2 conflicted_1.2.0 
# 
# loaded via a namespace (and not attached):
#   [1] gridExtra_2.3           gld_2.6.6               readxl_1.4.3           
#   [4] rlang_1.1.3             magrittr_2.0.3          hrbrthemes_0.8.7       
#   [7] e1071_1.7-14            compiler_4.3.2          systemfonts_1.0.5      
#  [10] vctrs_0.6.5             sysfonts_0.8.9          httpcode_0.3.0         
#  [13] pkgconfig_2.0.3         crayon_1.5.2            fastmap_1.1.1          
#  [16] labeling_0.4.3          utf8_1.2.4              promises_1.2.1         
#  [19] rmarkdown_2.25          tzdb_0.4.0              visdat_0.6.0           
#  [22] UpSetR_1.4.0            xfun_0.42               reprex_2.1.0           
#  [25] showtext_0.9-7          cachem_1.0.8            jsonlite_1.8.8         
#  [28] highr_0.10              later_1.3.2             R6_2.5.1               
#  [31] bslib_0.6.1             stringi_1.8.3           pagedown_0.20          
#  [34] boot_1.3-28.1           extrafontdb_1.0         jquerylib_0.1.4        
#  [37] cellranger_1.1.0        Rcpp_1.0.12             knitr_1.45             
#  [40] extrafont_0.19          httpuv_1.6.15           Matrix_1.6-1.1         
#  [43] timechange_0.3.0        tidyselect_1.2.0        rstudioapi_0.15.0      
#  [46] yaml_2.3.8              curl_5.2.0              lattice_0.21-9         
#  [49] plyr_1.8.9              shiny_1.8.1             withr_3.0.0            
#  [52] evaluate_0.23           proxy_0.4-27            xml2_1.3.6             
#  [55] pillar_1.9.0            stats4_4.3.2            pcaPP_2.0-4            
#  [58] generics_0.1.3          hms_1.1.3               rootSolve_1.8.2.4      
#  [61] munsell_0.5.0           scales_1.3.0            xtable_1.8-4           
#  [64] class_7.3-22            glue_1.7.0              gdtools_0.3.7          
#  [67] lmom_3.0                tools_4.3.2             gfonts_0.2.0           
#  [70] data.table_1.15.0       Exact_3.2               reactable_0.4.4        
#  [73] fs_1.6.3                mvtnorm_1.2-4           grid_4.3.2             
#  [76] Rttf2pt1_1.3.12         colorspace_2.1-0        showtextdb_3.0         
#  [79] cli_3.6.2               kableExtra_1.4.0        fontBitstreamVera_0.1.1
#  [82] fansi_1.0.6             expm_0.999-9            viridisLite_0.4.2      
#  [85] svglite_2.1.3           gtable_0.3.4            DEoptimR_1.1-3         
#  [88] sass_0.4.8              digest_0.6.34           fontquiver_0.2.1       
#  [91] crul_1.4.0              htmlwidgets_1.6.4       farver_2.1.1           
#  [94] memoise_2.0.1           htmltools_0.5.7         lifecycle_1.0.4        
#  [97] httr_1.4.7              mime_0.12               fontLiberation_0.1.0   
# [100] MASS_7.3-60