Data_Cleansing

Evgenei Khodar

Sample data

# Set seed for reproducibility
set.seed(44100)
# Generate sample data
n <- 1000  # Number of observations
# Age between 18 and 90
age <- sample(18:90, n, replace = TRUE)
# Introduce errors in age column
age[sample(1:n, 50)] <- "InvalidAge"
# Workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked
workclass <- sample(c("Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov", "Local-gov", "State-gov", "Without-pay", "Never-worked"), n, replace = TRUE)
# Introduce missing values in workclass column
workclass[sample(1:n, 50)] <- NA
# Education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool
education <- sample(c("Bachelors", "Some-college", "11th", "HS-grad", "Prof-school", "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters", "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"), n, replace = TRUE)
# Introduce inconsistencies in education column
education[sample(1:n, 50)] <- "InvalidEducation"
# Education Number: 1-16
education_num <- sample(1:16, n, replace = TRUE)
# Marital Status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse
marital_status <- sample(c("Married-civ-spouse", "Divorced", "Never-married", "Separated", "Widowed", "Married-spouse-absent", "Married-AF-spouse"), n, replace = TRUE)
# Introduce missing values in marital_status column
marital_status[sample(1:n, 50)] <- NA
# Occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces
occupation <- sample(c("Tech-support", "Craft-repair", "Other-service", "Sales", "Exec-managerial", "Prof-specialty", "Handlers-cleaners", "Machine-op-inspct", "Adm-clerical", "Farming-fishing", "Transport-moving", "Priv-house-serv", "Protective-serv", "Armed-Forces"), n, replace = TRUE)
# Introduce errors in occupation column
occupation[sample(1:n, 50)] <- "InvalidOccupation"
# Relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried
relationship <- sample(c("Wife", "Own-child", "Husband", "Not-in-family", "Other-relative", "Unmarried"), n, replace = TRUE)
# Introduce missing values in relationship column
relationship[sample(1:n, 50)] <- NA
# Race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black
race <- sample(c("White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"), n, replace = TRUE)
# Introduce inconsistencies in race column
race[sample(1:n, 50)] <- "InvalidRace"
# Sex: Female, Male
sex <- sample(c("Female", "Male"), n, replace = TRUE)
# Introduce missing values in sex column
sex[sample(1:n, 50)] <- NA
# Capital Gain: 0-99999
capital_gain <- sample(0:99999, n, replace = TRUE)
# Introduce errors in capital_gain column
capital_gain[sample(1:n, 50)] <- -999
# Capital Loss: 0-99999
capital_loss <- sample(0:99999, n, replace = TRUE)
# Introduce errors in capital_loss column
capital_loss[sample(1:n, 50)] <- -999
# Hours per week: 1-99
hours_per_week <- sample(1:99, n, replace = TRUE)
# Introduce missing values in hours_per_week column
hours_per_week[sample(1:n, 50)] <- NA
# Native country: United-States, Cambodia, England, ...
native_country <- sample(c("United-States", "Cambodia", "England", "Puerto-Rico", "Canada", "Germany", "Outlying-US(Guam-USVI-etc)", "India", "Japan", "Greece", "South", "China", "Cuba", "Iran", "Honduras", "Philippines", "Italy", "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal", "Ireland", "France", "Dominican-Republic", "Laos", "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary", "Guatemala", "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador", "Trinadad&Tobago", "Peru", "Hong", "Holand-Netherlands"), n, replace = TRUE)
# Introduce missing values in native_country column
native_country[sample(1:n, 50)] <- NA
# Income: <=50K, >50K
income <- sample(c("<=50K", ">50K"), n, replace = TRUE, prob = c(0.75, 0.25))
# Create dataframe
sample_dirty_dataset <- data.frame(age, workclass, education, education_num, marital_status, occupation, relationship, race, sex, capital_gain, capital_loss, hours_per_week, native_country, income)
# Print first few rows of the dataset
head(sample_dirty_dataset)
##   age    workclass education education_num    marital_status        occupation
## 1  46 Never-worked   Masters            11          Divorced      Adm-clerical
## 2  54  Without-pay      11th            10           Widowed      Armed-Forces
## 3  53      Private Bachelors            16         Separated Machine-op-inspct
## 4  70      Private       9th            14         Separated   Farming-fishing
## 5  78    Local-gov   7th-8th            14 Married-AF-spouse Handlers-cleaners
## 6  71 Self-emp-inc Assoc-voc            14 Married-AF-spouse InvalidOccupation
##     relationship               race    sex capital_gain capital_loss
## 1      Unmarried Asian-Pac-Islander   Male        81712        69387
## 2      Own-child              Other Female        25014        23069
## 3  Not-in-family              Other   Male        65305        57731
## 4  Not-in-family        InvalidRace   Male         7401        41746
## 5      Own-child Amer-Indian-Eskimo Female        45980        99713
## 6 Other-relative Asian-Pac-Islander   Male         -999        81896
##   hours_per_week  native_country income
## 1             49          Poland   >50K
## 2             13            <NA>   >50K
## 3              8   United-States  <=50K
## 4             26 Trinadad&Tobago  <=50K
## 5             71       Nicaragua   >50K
## 6             16       Guatemala  <=50K

Visualizing missing data, firstly generating diagnose and EDA report, also replacing invalid results in some columns with the “NA”, to make them easier to work with:

library(naniar)
library(dlookr)



diagnose_web_report(sample_dirty_dataset,  browse = FALSE,title = "Diagnostical report", author = "Evgenei Khodar",create_date = Sys.time())
## 
  |                                                   
  |                                             |   0%
  |                                                   
  |.                                            |   1%                         
  |                                                   
  |.                                            |   3% [setup]                 
  |                                                   
  |..                                           |   4%                         
  |                                                   
  |...                                          |   6% [load_packages]         
  |                                                   
  |...                                          |   7%                         
  |                                                   
  |....                                         |   9% [get-parameters]        
  |                                                   
  |.....                                        |  10%                         
  |                                                   
  |.....                                        |  12% [unnamed-chunk-3]       
  |                                                   
  |......                                       |  13%                         
  |                                                   
  |.......                                      |  15% [diagose]               
  |                                                   
  |.......                                      |  16%                         
  |                                                   
  |........                                     |  18% [create-overview]       
  |                                                   
  |.........                                    |  19%                         
  |                                                   
  |.........                                    |  21% [overview]              
  |                                                   
  |..........                                   |  22%                         
  |                                                   
  |...........                                  |  24% [overview-datastructure]
  |                                                   
  |...........                                  |  25%                         
  |                                                   
  |............                                 |  27% [overview-pre]          
  |                                                   
  |.............                                |  28%                         
  |                                                   
  |.............                                |  30% [overview-warnings]     
  |                                                   
  |..............                               |  31%                         
  |                                                   
  |...............                              |  33% [warnings_summary]      
  |                                                   
  |...............                              |  34%                         
  |                                                   
  |................                             |  36% [warnings]              
  |                                                   
  |.................                            |  37%                         
  |                                                   
  |.................                            |  39% [overview-variables]    
  |                                                   
  |..................                           |  40%                         
  |                                                   
  |...................                          |  42% [variables]             
  |                                                   
  |...................                          |  43%                         
  |                                                   
  |....................                         |  45% [missing]               
  |                                                   
  |.....................                        |  46%                         
  |                                                   
  |.....................                        |  48% [missing-list]          
  |                                                   
  |......................                       |  49%                         
  |                                                   
  |.......................                      |  51% [missing-data]          
  |                                                   
  |........................                     |  52%                         
  |                                                   
  |........................                     |  54% [missing-visualization] 
  |                                                   
  |.........................                    |  55%                         
  |                                                   
  |..........................                   |  57% [missing-viz2]          
  |                                                   
  |..........................                   |  58%                         
  |                                                   
  |...........................                  |  60% [unique]                
  |                                                   
  |............................                 |  61%                         
  |                                                   
  |............................                 |  63% [unique-categorical]    
  |                                                   
  |.............................                |  64%                         
  |                                                   
  |..............................               |  66% [unique-date-category]  
  |                                                   
  |..............................               |  67%                         
  |                                                   
  |...............................              |  69% [unique-numerical]      
  |                                                   
  |................................             |  70%                         
  |                                                   
  |................................             |  72% [unique-data-numeric]   
  |                                                   
  |.................................            |  73%                         
  |                                                   
  |..................................           |  75% [outliers]              
  |                                                   
  |..................................           |  76%                         
  |                                                   
  |...................................          |  78% [outliers-list]         
  |                                                   
  |....................................         |  79%                         
  |                                                   
  |....................................         |  81% [samples]               
  |                                                   
  |.....................................        |  82%                         
  |                                                   
  |......................................       |  84% [duplicated]            
  |                                                   
  |......................................       |  85%                         
  |                                                   
  |.......................................      |  87% [duplicated-list]       
  |                                                   
  |........................................     |  88%                         
  |                                                   
  |........................................     |  90% [heades]                
  |                                                   
  |.........................................    |  91%                         
  |                                                   
  |..........................................   |  93% [sample-head]           
  |                                                   
  |..........................................   |  94%                         
  |                                                   
  |...........................................  |  96% [tails]                 
  |                                                   
  |............................................ |  97%                         
  |                                                   
  |............................................ |  99% [sample-tail]           
  |                                                   
  |.............................................| 100%                         
                                                                                                                   
## "C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/pandoc" +RTS -K512m -RTS diagnosis_temp.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc53105da07df.html --lua-filter "C:\Users\evgro\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\pagebreak.lua" --lua-filter "C:\Users\evgro\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\latex-div.lua" --embed-resources --standalone --variable bs3=TRUE --section-divs --template "C:\Users\evgro\AppData\Local\R\win-library\4.3\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable theme=bootstrap --css "C:/Users/evgro/AppData/Local/R/win-library/4.3/dlookr/resources/dlookr-bootstrap.css" --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\evgro\AppData\Local\Temp\RtmpKOBefl\rmarkdown-str53104b8c7073.html" --variable code_folding=show --variable code_menu=1 --include-in-header header_temp.html --include-after-body "C:\Users\evgro\AppData\Local\R\win-library\4.3\dlookr\resources\footer.html"
eda_web_report(sample_dirty_dataset,  browse = FALSE,title = "EDA report",author = "Evgenei Khodar",create_date = Sys.time())
## 
  |                                                        
  |                                                  |   0%
  |                                                        
  |.                                                 |   2%                    
  |                                                        
  |..                                                |   3% [setup]            
  |                                                        
  |..                                                |   5%                    
  |                                                        
  |...                                               |   6% [load_packages]    
  |                                                        
  |....                                              |   8%                    
  |                                                        
  |.....                                             |  10% [unnamed-chunk-1]  
  |                                                        
  |......                                            |  11%                    
  |                                                        
  |......                                            |  13% [udf]              
  |                                                        
  |.......                                           |  14%                    
  |                                                        
  |........                                          |  16% [check_variables]  
  |                                                        
  |.........                                         |  17%                    
  |                                                        
  |..........                                        |  19% [create-overview]  
  |                                                        
  |..........                                        |  21%                    
  |                                                        
  |...........                                       |  22% [overview]         
  |                                                        
  |............                                      |  24%                    
  |                                                        
  |.............                                     |  25% [overview-pre]     
  |                                                        
  |.............                                     |  27%                    
  |                                                        
  |..............                                    |  29% [unnamed-chunk-2]  
  |                                                        
  |...............                                   |  30%                    
  |                                                        
  |................                                  |  32% [unnamed-chunk-3]  
  |                                                        
  |.................                                 |  33%                    
  |                                                        
  |.................                                 |  35% [variables]        
  |                                                        
  |..................                                |  37%                    
  |                                                        
  |...................                               |  38% [normality]        
  |                                                        
  |....................                              |  40%                    
  |                                                        
  |.....................                             |  41% [normality-list]   
  |                                                        
  |.....................                             |  43%                    
  |                                                        
  |......................                            |  44% [unnamed-chunk-4]  
  |                                                        
  |.......................                           |  46%                    
  |                                                        
  |........................                          |  48% [unnamed-chunk-5]  
  |                                                        
  |.........................                         |  49%                    
  |                                                        
  |.........................                         |  51% [compare_numerical]
  |                                                        
  |..........................                        |  52%                    
  |                                                        
  |...........................                       |  54% [unnamed-chunk-6]  
  |                                                        
  |............................                      |  56%                    
  |                                                        
  |.............................                     |  57% [compare-category] 
  |                                                        
  |.............................                     |  59%                    
  |                                                        
  |..............................                    |  60% [unnamed-chunk-7]  
  |                                                        
  |...............................                   |  62%                    
  |                                                        
  |................................                  |  63% [unnamed-chunk-8]  
  |                                                        
  |.................................                 |  65%                    
  |                                                        
  |.................................                 |  67% [unnamed-chunk-9]  
  |                                                        
  |..................................                |  68%                    
  |                                                        
  |...................................               |  70% [correlation]      
  |                                                        
  |....................................              |  71%                    
  |                                                        
  |.....................................             |  73% [unnamed-chunk-10] 
  |                                                        
  |.....................................             |  75%                    
  |                                                        
  |......................................            |  76% [plot-correlation] 
  |                                                        
  |.......................................           |  78%                    
  |                                                        
  |........................................          |  79% [unnamed-chunk-11] 
  |                                                        
  |........................................          |  81%                    
  |                                                        
  |.........................................         |  83% [unnamed-chunk-12] 
  |                                                        
  |..........................................        |  84%                    
  |                                                        
  |...........................................       |  86% [group-numerical]  
  |                                                        
  |............................................      |  87%                    
  |                                                        
  |............................................      |  89% [unnamed-chunk-13] 
  |                                                        
  |.............................................     |  90%                    
  |                                                        
  |..............................................    |  92% [group-categorical]
  |                                                        
  |...............................................   |  94%                    
  |                                                        
  |................................................  |  95% [unnamed-chunk-14] 
  |                                                        
  |................................................  |  97%                    
  |                                                        
  |................................................. |  98% [group-correlation]
  |                                                        
  |..................................................| 100%                    
                                                                                                              
## "C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/pandoc" +RTS -K512m -RTS eda_temp.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc53101f434d4.html --lua-filter "C:\Users\evgro\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\pagebreak.lua" --lua-filter "C:\Users\evgro\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\latex-div.lua" --embed-resources --standalone --variable bs3=TRUE --section-divs --template "C:\Users\evgro\AppData\Local\R\win-library\4.3\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable theme=bootstrap --css "C:/Users/evgro/AppData/Local/R/win-library/4.3/dlookr/resources/dlookr-bootstrap.css" --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\evgro\AppData\Local\Temp\RtmpKOBefl\rmarkdown-str53105b6b719e.html" --variable code_folding=show --variable code_menu=1 --include-in-header header_temp.html --include-after-body "C:\Users\evgro\AppData\Local\R\win-library\4.3\dlookr\resources\footer.html"
sample_dirty_dataset$age[sample_dirty_dataset$age=="InvalidAge"] <- NA
sample_dirty_dataset$education[sample_dirty_dataset$education=="InvalidEducation"] <- NA
sample_dirty_dataset$race[sample_dirty_dataset$race=="InvalidRace"] <- NA
sample_dirty_dataset$occupation[sample_dirty_dataset$occupation=="InvalidOccupation"] <- NA

Then visualizing missing cases table, and statistics of complete and incomplete observations:

library(naniar)
library(dlookr)
vis_miss(sample_dirty_dataset, cluster = TRUE)

sample_dirty_dataset%>%
  miss_case_table()
## # A tibble: 4 × 3
##   n_miss_in_case n_cases pct_cases
##            <int>   <int>     <dbl>
## 1              0     589      58.9
## 2              1     335      33.5
## 3              2      63       6.3
## 4              3      13       1.3

And visualization of numerical variables to find outliers:

library(rrcov)
library(naniar)
library(dlookr)
plot(sample_dirty_dataset$capital_gain, pch = ifelse(sample_dirty_dataset$capital_gain < 0, 19, 1), col = ifelse(sample_dirty_dataset$capital_gain < 0, "red", "blue"),
     xlab = "Capital gain", ylab = "Values", main = "Graphic of capital_gain variable values")

plot(sample_dirty_dataset$capital_loss, pch = ifelse(sample_dirty_dataset$capital_loss < 0, 19, 1), col = ifelse(sample_dirty_dataset$capital_loss < 0, "red", "blue"),
     xlab = "Capital loss", ylab = "Values", main = "Graphic of capital_loss variable values")

library(rrcov)
par(mfrow=c(2,2))
plot(covMcd(sample_dirty_dataset$capital_gain))

Although only 3.6% of values are missing, if we delete all cases, that have “NA”, it would remove more than 40% of all observations.Also, there are not many cases, where many variables are missing at the same time. Dealing with missing values of qualitative columns, the most logical way would be to remove such observations, where inputting seems not a good option. Such candidates are workclass, marital_status, relationship, sex, native_country.

library(naniar)
library(DescTools)
repaired_dataset <- sample_dirty_dataset[complete.cases(sample_dirty_dataset[c("workclass", "marital_status", "relationship", "sex", "native_country")]), ]

median_value <- median(repaired_dataset[["hours_per_week"]], na.rm = TRUE) 

repaired_dataset[["hours_per_week"]][is.na(repaired_dataset[["hours_per_week"]])] <- median_value 

repaired_dataset$age <- as.integer(repaired_dataset$age)

mean_age<-mean(repaired_dataset$age, na.rm = TRUE)
 mean_age <- as.integer(mean_age)

repaired_dataset$age[is.na(repaired_dataset$age)] <- mean_age


repaired_dataset <- repaired_dataset[, -which(names(repaired_dataset) == "education_num")]

mode_edu<-Mode(repaired_dataset$education,na.rm = TRUE)
 
repaired_dataset$education[is.na(repaired_dataset$education)] <- mode_edu


repaired_dataset$race[is.na(repaired_dataset$race)] <- "Other"


repaired_dataset$occupation[is.na(repaired_dataset$occupation)] <- "Other-service"

head(repaired_dataset)
##   age    workclass education    marital_status        occupation   relationship
## 1  46 Never-worked   Masters          Divorced      Adm-clerical      Unmarried
## 3  53      Private Bachelors         Separated Machine-op-inspct  Not-in-family
## 4  70      Private       9th         Separated   Farming-fishing  Not-in-family
## 5  78    Local-gov   7th-8th Married-AF-spouse Handlers-cleaners      Own-child
## 6  71 Self-emp-inc Assoc-voc Married-AF-spouse     Other-service Other-relative
## 8  52  Federal-gov   Masters     Never-married     Other-service        Husband
##                 race    sex capital_gain capital_loss hours_per_week
## 1 Asian-Pac-Islander   Male        81712        69387             49
## 3              Other   Male        65305        57731              8
## 4              Other   Male         7401        41746             26
## 5 Amer-Indian-Eskimo Female        45980        99713             71
## 6 Asian-Pac-Islander   Male         -999        81896             16
## 8              White   Male        51768        53002             53
##    native_country income
## 1          Poland   >50K
## 3   United-States  <=50K
## 4 Trinadad&Tobago  <=50K
## 5       Nicaragua   >50K
## 6       Guatemala  <=50K
## 8          Taiwan  <=50K
vis_miss(repaired_dataset, cluster = TRUE)

Variable age was transformed to int data type, which seems more suitable for it, and then “NA” were changed to the mean value. Missing cases in “hours_per_week” variable were replaced with the median value. Education variable was input with the mode value. race and occupation variables were adjusted with the values “Other” and “Other-service” respectively.

As we can see from the table, we got rid of “NA” values.

library(DescTools)


boxplot(repaired_dataset$capital_gain)

mean_gain <- repaired_dataset$capital_gain %>%
  mean(na.rm = TRUE) %>%
  as.integer()

repaired_dataset$capital_gain[repaired_dataset$capital_gain < 0] <- mean_gain


plot(repaired_dataset$capital_gain, pch = ifelse(repaired_dataset$capital_gain < 0, 19, 1), col = ifelse(repaired_dataset$capital_gain < 0, "red", "blue"),
     xlab = "Index", ylab = "Values", main = "Point Graphic with Values less than Zero Marked")

library(DescTools)

library(dlookr)

mean_loss <- repaired_dataset$capital_loss %>%
  mean(na.rm = TRUE) %>%
  as.integer()

repaired_dataset$capital_loss[repaired_dataset$capital_loss < 0] <- mean_loss


plot(repaired_dataset$capital_loss, pch = ifelse(repaired_dataset$capital_loss < 0, 19, 1), col = ifelse(repaired_dataset$capital_loss < 0, "red", "blue"),
     xlab = "Index", ylab = "Values", main = "Point Graphic with Values less than Zero Marked")

transformation_web_report(repaired_dataset,browse = FALSE,title = "Transformation report",author = "Evgenei Khodar",create_date = Sys.time())
## 
  |                                                         
  |                                                   |   0%
  |                                                         
  |.                                                  |   3%                   
  |                                                         
  |...                                                |   5% [setup]           
  |                                                         
  |....                                               |   8%                   
  |                                                         
  |.....                                              |  10% [load_packages]   
  |                                                         
  |.......                                            |  13%                   
  |                                                         
  |........                                           |  15% [unnamed-chunk-6] 
  |                                                         
  |.........                                          |  18%                   
  |                                                         
  |..........                                         |  21% [udf]             
  |                                                         
  |............                                       |  23%                   
  |                                                         
  |.............                                      |  26% [create-overview] 
  |                                                         
  |..............                                     |  28%                   
  |                                                         
  |................                                   |  31% [overview]        
  |                                                         
  |.................                                  |  33%                   
  |                                                         
  |..................                                 |  36% [overview-pre]    
  |                                                         
  |....................                               |  38%                   
  |                                                         
  |.....................                              |  41% [unnamed-chunk-7] 
  |                                                         
  |......................                             |  44%                   
  |                                                         
  |........................                           |  46% [unnamed-chunk-8] 
  |                                                         
  |.........................                          |  49%                   
  |                                                         
  |..........................                         |  51% [unnamed-chunk-9] 
  |                                                         
  |...........................                        |  54%                   
  |                                                         
  |.............................                      |  56% [nalist]          
  |                                                         
  |..............................                     |  59%                   
  |                                                         
  |...............................                    |  62% [unnamed-chunk-10]
  |                                                         
  |.................................                  |  64%                   
  |                                                         
  |..................................                 |  67% [outlist]         
  |                                                         
  |...................................                |  69%                   
  |                                                         
  |.....................................              |  72% [unnamed-chunk-11]
  |                                                         
  |......................................             |  74%                   
  |                                                         
  |.......................................            |  77% [skweness]        
  |                                                         
  |.........................................          |  79%                   
  |                                                         
  |..........................................         |  82% [unnamed-chunk-12]
  |                                                         
  |...........................................        |  85%                   
  |                                                         
  |............................................       |  87% [binning]         
  |                                                         
  |..............................................     |  90%                   
  |                                                         
  |...............................................    |  92% [unnamed-chunk-13]
  |                                                         
  |................................................   |  95%                   
  |                                                         
  |.................................................. |  97% [optimal-binning] 
  |                                                         
  |...................................................| 100%                   
                                                                                                             
## "C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/pandoc" +RTS -K512m -RTS transformation_temp.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc53101f551d0.html --lua-filter "C:\Users\evgro\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\pagebreak.lua" --lua-filter "C:\Users\evgro\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\latex-div.lua" --embed-resources --standalone --variable bs3=TRUE --section-divs --template "C:\Users\evgro\AppData\Local\R\win-library\4.3\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable theme=bootstrap --css "C:/Users/evgro/AppData/Local/R/win-library/4.3/dlookr/resources/dlookr-bootstrap.css" --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\evgro\AppData\Local\Temp\RtmpKOBefl\rmarkdown-str53101d3a632a.html" --variable code_folding=show --variable code_menu=1 --include-in-header header_temp.html --include-after-body "C:\Users\evgro\AppData\Local\R\win-library\4.3\dlookr\resources\footer.html"

To numerical variables “capital_gain” and “capital_loss” the same method was applied in order to deal with outliers, values less then zero were replaced with the mean value. As it could be observed from the graphics, variables now are in better condition.