# Sample data

# Load necessary libraries
library(tidyverse)
#;-) Warning: package 'tidyverse' was built under R version 4.3.3
#;-) Warning: package 'ggplot2' was built under R version 4.3.3
#;-) Warning: package 'tidyr' was built under R version 4.3.3
#;-) Warning: package 'readr' was built under R version 4.3.3
#;-) Warning: package 'purrr' was built under R version 4.3.3
#;-) Warning: package 'forcats' was built under R version 4.3.3
#;-) Warning: package 'lubridate' was built under R version 4.3.3
#;-) ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#;-) ✔ dplyr     1.1.4     ✔ readr     2.1.5
#;-) ✔ forcats   1.0.0     ✔ stringr   1.5.1
#;-) ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
#;-) ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
#;-) ✔ purrr     1.0.2     
#;-) ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#;-) ✖ dplyr::filter() masks stats::filter()
#;-) ✖ dplyr::lag()    masks stats::lag()
#;-) ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Fix the RNG state so the generated dataset is identical on every run.
# NOTE: the order of the sample() calls below determines the data; do not
# reorder them.
set.seed(123)

# Number of observations to generate
n <- 1000

# age: integers from 18 to 90 ...
age <- sample(18:90, size = n, replace = TRUE)
# ... with 50 entries overwritten by a text marker (this turns the whole
# vector into character)
age[sample(seq_len(n), 50)] <- "InvalidAge"

# workclass: eight employment categories, 50 entries set to missing
workclass <- sample(
  c(
    "Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov",
    "Local-gov", "State-gov", "Without-pay", "Never-worked"
  ),
  size = n,
  replace = TRUE
)
workclass[sample(seq_len(n), 50)] <- NA

# education: sixteen attainment levels, 50 entries replaced by an invalid label
education <- sample(
  c(
    "Bachelors", "Some-college", "11th", "HS-grad", "Prof-school",
    "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters",
    "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"
  ),
  size = n,
  replace = TRUE
)
education[sample(seq_len(n), 50)] <- "InvalidEducation"

# education_num: integers from 1 to 16 (left clean)
education_num <- sample(1:16, size = n, replace = TRUE)

# marital_status: seven categories, 50 entries set to missing
marital_status <- sample(
  c(
    "Married-civ-spouse", "Divorced", "Never-married", "Separated",
    "Widowed", "Married-spouse-absent", "Married-AF-spouse"
  ),
  size = n,
  replace = TRUE
)
marital_status[sample(seq_len(n), 50)] <- NA

# occupation: fourteen categories, 50 entries replaced by an invalid label
occupation <- sample(
  c(
    "Tech-support", "Craft-repair", "Other-service", "Sales",
    "Exec-managerial", "Prof-specialty", "Handlers-cleaners",
    "Machine-op-inspct", "Adm-clerical", "Farming-fishing",
    "Transport-moving", "Priv-house-serv", "Protective-serv", "Armed-Forces"
  ),
  size = n,
  replace = TRUE
)
occupation[sample(seq_len(n), 50)] <- "InvalidOccupation"

# relationship: six categories, 50 entries set to missing
relationship <- sample(
  c(
    "Wife", "Own-child", "Husband", "Not-in-family", "Other-relative",
    "Unmarried"
  ),
  size = n,
  replace = TRUE
)
relationship[sample(seq_len(n), 50)] <- NA

# race: five categories, 50 entries replaced by an invalid label
race <- sample(
  c("White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"),
  size = n,
  replace = TRUE
)
race[sample(seq_len(n), 50)] <- "InvalidRace"

# sex: two categories, 50 entries set to missing
sex <- sample(c("Female", "Male"), size = n, replace = TRUE)
sex[sample(seq_len(n), 50)] <- NA

# capital_gain: integers from 0 to 99999, 50 entries overwritten with -999
capital_gain <- sample(0:99999, size = n, replace = TRUE)
capital_gain[sample(seq_len(n), 50)] <- -999

# capital_loss: integers from 0 to 99999, 50 entries overwritten with -999
capital_loss <- sample(0:99999, size = n, replace = TRUE)
capital_loss[sample(seq_len(n), 50)] <- -999

# hours_per_week: integers from 1 to 99, 50 entries set to missing
hours_per_week <- sample(1:99, size = n, replace = TRUE)
hours_per_week[sample(seq_len(n), 50)] <- NA

# native_country: 41 country labels, 50 entries set to missing
native_country <- sample(
  c(
    "United-States", "Cambodia", "England", "Puerto-Rico", "Canada",
    "Germany", "Outlying-US(Guam-USVI-etc)", "India", "Japan", "Greece",
    "South", "China", "Cuba", "Iran", "Honduras", "Philippines", "Italy",
    "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal", "Ireland",
    "France", "Dominican-Republic", "Laos", "Ecuador", "Taiwan", "Haiti",
    "Columbia", "Hungary", "Guatemala", "Nicaragua", "Scotland", "Thailand",
    "Yugoslavia", "El-Salvador", "Trinadad&Tobago", "Peru", "Hong",
    "Holand-Netherlands"
  ),
  size = n,
  replace = TRUE
)
native_country[sample(seq_len(n), 50)] <- NA

# income: "<=50K" drawn with probability 0.75, ">50K" with 0.25 (left clean)
income <- sample(
  c("<=50K", ">50K"),
  size = n,
  replace = TRUE,
  prob = c(0.75, 0.25)
)

# Assemble the fourteen vectors into one data frame; column names are taken
# from the variable names.
sample_dirty_dataset <- data.frame(
  age,
  workclass,
  education,
  education_num,
  marital_status,
  occupation,
  relationship,
  race,
  sex,
  capital_gain,
  capital_loss,
  hours_per_week,
  native_country,
  income
)

# Print first few rows of the dataset
#head(sample_dirty_dataset)

##############################################
# My code

# Step 4
# Repair data
#
# Turn every injected error marker into NA so that later steps treat it as a
# missing value. Each marker is looked up only in the column it belongs to,
# rather than by comparing the whole data frame against a string.

# Capital amounts cannot be negative; the generator used -999 as a marker.
# which() keeps the assignment safe if the column already contains NA.
for (amount_column in c("capital_gain", "capital_loss")) {
  negative_rows <- which(sample_dirty_dataset[[amount_column]] < 0)
  sample_dirty_dataset[[amount_column]][negative_rows] <- NA
}

# Text markers, keyed by the column in which they were injected.
invalid_markers <- c(
  race = "InvalidRace",
  occupation = "InvalidOccupation",
  education = "InvalidEducation",
  age = "InvalidAge"
)
for (marker_column in names(invalid_markers)) {
  # %in% never returns NA, so rows that are already missing are left alone.
  marker_rows <-
    sample_dirty_dataset[[marker_column]] %in% invalid_markers[[marker_column]]
  sample_dirty_dataset[[marker_column]][marker_rows] <- NA
}

# The "InvalidAge" marker turned age into a character vector. With the marker
# gone, every remaining value is a whole number, so restore the numeric type;
# otherwise age is excluded from numeric imputation and outlier checks.
sample_dirty_dataset$age <- as.integer(as.character(sample_dirty_dataset$age))

# Step 5
# Missing data
# Impute missing values with the mice algorithm (single imputation, m = 1).
# NAs can occur in workclass, marital_status, relationship, sex,
# hours_per_week and native_country (injected as NA by the generator) and in
# age, education, occupation, race, capital_gain and capital_loss (error
# markers blanked in Step 4).

library(mice)
#;-) Warning: package 'mice' was built under R version 4.3.3
#;-) Warning in check_dep_version(): ABI version mismatch: 
#;-) lme4 was built with Matrix ABI version 1
#;-) Current Matrix ABI version is 0
#;-) Please re-install lme4 from source or restore original 'Matrix' package
#;-) 
#;-) Attaching package: 'mice'
#;-) 
#;-) The following object is masked from 'package:stats':
#;-) 
#;-)     filter
#;-) 
#;-) The following objects are masked from 'package:base':
#;-) 
#;-)     cbind, rbind

# In the earlier run mice imputed only capital_gain, capital_loss and
# hours_per_week: the categorical columns were plain character vectors, which
# mice does not impute. Converting them to factors lets mice impute them too,
# so rows no longer have to be discarded just because a category is missing.
categorical_columns <- c(
  "workclass", "education", "marital_status", "occupation",
  "relationship", "race", "sex", "native_country", "income"
)
sample_dirty_dataset[categorical_columns] <- lapply(
  sample_dirty_dataset[categorical_columns],
  factor
)

# native_country has 41 levels, so its polytomous regression model needs more
# weights than the default nnet limit used by mice; raise the limit.
# NOTE(review): confirm 5000 is sufficient if more levels/columns are added.
imputation <- mice(
  sample_dirty_dataset,
  m = 1,
  printFlag = FALSE,
  nnet.MaxNWts = 5000
)
sample_dirty_dataset <- complete(imputation)

# Safety net: drop any row that still contains NA (for example in a column
# that is still character and was therefore skipped by mice).
sample_dirty_dataset <- na.omit(sample_dirty_dataset)

# Step 6
# Detect outliers

# Numeric columns screened by Methods 1 and 2.
outlier_columns <- c(
  "education_num", "capital_gain", "capital_loss", "hours_per_week"
)

# Method 1
# Box plots, one per column; the title identifies the column being shown.
for (column in outlier_columns) {
  boxplot(sample_dirty_dataset[[column]], main = column)
}

# Method 2
# Absolute z-scores, one plot per column. Each observation is drawn as its
# position in the data; anything above the red line is more than
# `zscore_cutoff` standard deviations away from the column mean.
zscore_cutoff <- 2
for (column in outlier_columns) {
  zscore <- abs(scale(sample_dirty_dataset[[column]]))
  # type = "n" sets up the axes only; the points are added as text labels.
  plot(zscore, type = "n", main = column)
  abline(h = zscore_cutoff, col = "red")
  text(seq_along(zscore), zscore)
}


# Method 3
# Robust outlier detection: fit covMcd() to each numeric column on its own
# and draw the diagnostic plots of the fit. covMcd() comes from robustbase,
# which is attached as a dependency of rrcov (see the load messages below).
library(rrcov)
#;-) Warning: package 'rrcov' was built under R version 4.3.3
#;-) Loading required package: robustbase
#;-) Warning: package 'robustbase' was built under R version 4.3.3
#;-) 
#;-) Attaching package: 'robustbase'
#;-) 
#;-) The following object is masked _by_ '.GlobalEnv':
#;-) 
#;-)     education
#;-) 
#;-) Scalable Robust Estimators with High Breakdown Point (version 1.7-5)

# Arrange the following base-graphics plots in a 2 x 2 grid.
# NOTE(review): the previous par() settings are not restored afterwards.
par(mfrow=c(2,2))
# Years-of-education code
plot(covMcd(sample_dirty_dataset$education_num))

# Capital gain
plot(covMcd(sample_dirty_dataset$capital_gain))

# Capital loss
plot(covMcd(sample_dirty_dataset$capital_loss))

# Weekly working hours
plot(covMcd(sample_dirty_dataset$hours_per_week))


# Step 7
# Do any variables need standardization?
#   Original answer: the input values do not differ much from each other and
#   share the same unit, so standardization is not needed.
#   NOTE(review): the generated ranges differ by orders of magnitude
#   (education_num 1-16, hours_per_week 1-99, capital_gain and capital_loss
#   up to 99999), so revisit this before using any scale-sensitive method.
# Do any variables need normalization? Unsure, but probably not.
# Do any variables need binning? Yes.

library(dlookr)
#;-) Warning: package 'dlookr' was built under R version 4.3.3
#;-) Registered S3 methods overwritten by 'dlookr':
#;-)   method          from  
#;-)   plot.transform  scales
#;-)   print.transform scales
#;-) 
#;-) Attaching package: 'dlookr'
#;-) 
#;-) The following object is masked from 'package:tidyr':
#;-) 
#;-)     extract
#;-) 
#;-) The following object is masked from 'package:base':
#;-) 
#;-)     transform

# Bin education_num using binning()'s default settings, then print the
# per-bin frequency table and plot the result.
bin <- binning(sample_dirty_dataset$education_num)
summary(bin)
#;-)     levels freq       rate
#;-) 1    [1,2]   92 0.14743590
#;-) 2    (2,3]   27 0.04326923
#;-) 3    (3,5]   89 0.14262821
#;-) 4    (5,6]   33 0.05288462
#;-) 5    (6,8]   65 0.10416667
#;-) 6    (8,9]   39 0.06250000
#;-) 7   (9,11]   92 0.14743590
#;-) 8  (11,12]   28 0.04487179
#;-) 9  (12,14]   84 0.13461538
#;-) 10 (14,15]   38 0.06089744
#;-) 11 (15,16]   37 0.05929487
plot(bin)
#;-) Don't know how to automatically pick scale for object of type <table>.
#;-) Defaulting to continuous.


# Step 10
# Generate the dlookr HTML reports for the cleaned dataset.

# Each *_web_report() call renders its own R Markdown document. When this
# script is itself being rendered through knitr (e.g. by reprex), the nested
# render can reuse an automatically generated chunk label and stop with
# "Duplicate chunk label 'unnamed-chunk-1'", which is what happened to the
# EDA and transformation reports. Allow duplicate labels while the reports
# run; the previous option value is restored afterwards.
old_options <- options(knitr.duplicate.label = "allow")

# 1 Diagnostic Report
sample_dirty_dataset %>% diagnose_web_report(theme = "blue")
#;-) processing file: diagnosis_temp.Rmd
#;-) output file: diagnosis_temp.knit.md
#;-) "C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/pandoc" +RTS -K512m -RTS diagnosis_temp.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc559c4918c83.html --lua-filter "C:\Users\Michal\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\pagebreak.lua" --lua-filter "C:\Users\Michal\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\latex-div.lua" --embed-resources --standalone --variable bs3=TRUE --section-divs --template "C:\Users\Michal\AppData\Local\R\win-library\4.3\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable theme=bootstrap --css "C:/Users/Michal/AppData/Local/R/win-library/4.3/dlookr/resources/dlookr-bootstrap.css" --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\Michal\AppData\Local\Temp\RtmpOuLjNx\rmarkdown-str559c13734ba9.html" --variable code_folding=show --variable code_menu=1 --include-in-header header_temp.html --include-after-body "C:\Users\Michal\AppData\Local\R\win-library\4.3\dlookr\resources\footer.html"
#;-) 
#;-) Output created: C:\Users\Michal\AppData\Local\Temp\RtmpOuLjNx/Diagnosis_Report.html

# 2 EDA Report
sample_dirty_dataset %>% eda_web_report(theme = "blue")

# 3 Transformation Report
sample_dirty_dataset %>% transformation_web_report(theme = "blue")

# Put the knitr option back to whatever it was before the reports.
options(old_options)
# Standard output and standard error
# ✖ Install the styler package in order to use `style = TRUE`.
# Session info
# Print the R version, platform, locale and attached/loaded package versions
# used for this run, so the results can be reproduced.
sessionInfo()
#;-) R version 4.3.2 (2023-10-31 ucrt)
#;-) Platform: x86_64-w64-mingw32/x64 (64-bit)
#;-) Running under: Windows 10 x64 (build 19045)
#;-) 
#;-) Matrix products: default
#;-) 
#;-) 
#;-) locale:
#;-) [1] LC_COLLATE=English_United States.utf8 
#;-) [2] LC_CTYPE=English_United States.utf8   
#;-) [3] LC_MONETARY=English_United States.utf8
#;-) [4] LC_NUMERIC=C                          
#;-) [5] LC_TIME=English_United States.utf8    
#;-) 
#;-) time zone: Europe/Warsaw
#;-) tzcode source: internal
#;-) 
#;-) attached base packages:
#;-) [1] stats     graphics  grDevices utils     datasets  methods   base     
#;-) 
#;-) other attached packages:
#;-)  [1] htmltools_0.5.7   reactable_0.4.4   kableExtra_1.4.0  dlookr_0.6.3     
#;-)  [5] rrcov_1.7-5       robustbase_0.99-2 mice_3.16.0       lubridate_1.9.3  
#;-)  [9] forcats_1.0.0     stringr_1.5.1     dplyr_1.1.4       purrr_1.0.2      
#;-) [13] readr_2.1.5       tidyr_1.3.1       tibble_3.2.1      ggplot2_3.5.0    
#;-) [17] tidyverse_2.0.0  
#;-) 
#;-) loaded via a namespace (and not attached):
#;-)   [1] gridExtra_2.3           rlang_1.1.3             magrittr_2.0.3         
#;-)   [4] hrbrthemes_0.8.7        compiler_4.3.2          systemfonts_1.0.6      
#;-)   [7] vctrs_0.6.5             sysfonts_0.8.9          httpcode_0.3.0         
#;-)  [10] pkgconfig_2.0.3         shape_1.4.6.1           crayon_1.5.2           
#;-)  [13] fastmap_1.1.1           ellipsis_0.3.2          backports_1.4.1        
#;-)  [16] fontawesome_0.5.2       labeling_0.4.3          utf8_1.2.4             
#;-)  [19] promises_1.2.1          rmarkdown_2.25          tzdb_0.4.0             
#;-)  [22] nloptr_2.0.3            xfun_0.42               glmnet_4.1-8           
#;-)  [25] jomo_2.7-6              reprex_2.1.0            cachem_1.0.8           
#;-)  [28] showtext_0.9-7          jsonlite_1.8.8          highr_0.10             
#;-)  [31] later_1.3.2             pan_1.9                 broom_1.0.5            
#;-)  [34] R6_2.5.1                bslib_0.6.1             stringi_1.8.3          
#;-)  [37] pagedown_0.20           boot_1.3-28.1           extrafontdb_1.0        
#;-)  [40] rpart_4.1.21            jquerylib_0.1.4         Rcpp_1.0.12            
#;-)  [43] iterators_1.0.14        knitr_1.45              base64enc_0.1-3        
#;-)  [46] extrafont_0.19          httpuv_1.6.15           Matrix_1.6-1.1         
#;-)  [49] splines_4.3.2           nnet_7.3-19             timechange_0.3.0       
#;-)  [52] tidyselect_1.2.1        rstudioapi_0.15.0       yaml_2.3.8             
#;-)  [55] codetools_0.2-19        curl_5.2.1              lattice_0.21-9         
#;-)  [58] shiny_1.8.1             withr_3.0.0             evaluate_0.23          
#;-)  [61] survival_3.5-7          xml2_1.3.6              pillar_1.9.0           
#;-)  [64] foreach_1.5.2           stats4_4.3.2            pcaPP_2.0-4            
#;-)  [67] generics_0.1.3          hms_1.1.3               munsell_0.5.0          
#;-)  [70] scales_1.3.0            minqa_1.2.6             xtable_1.8-4           
#;-)  [73] glue_1.7.0              gdtools_0.3.7           tools_4.3.2            
#;-)  [76] gfonts_0.2.0            lme4_1.1-35.2           fs_1.6.3               
#;-)  [79] mvtnorm_1.2-4           grid_4.3.2              Rttf2pt1_1.3.12        
#;-)  [82] colorspace_2.1-0        nlme_3.1-163            showtextdb_3.0         
#;-)  [85] cli_3.6.2               fontBitstreamVera_0.1.1 fansi_1.0.6            
#;-)  [88] viridisLite_0.4.2       svglite_2.1.3           gtable_0.3.4           
#;-)  [91] DEoptimR_1.1-3          reactR_0.5.0            sass_0.4.8             
#;-)  [94] digest_0.6.34           fontquiver_0.2.1        crul_1.4.0             
#;-)  [97] farver_2.1.1            htmlwidgets_1.6.4       lifecycle_1.0.4        
#;-) [100] mitml_0.4-5             mime_0.12               fontLiberation_0.1.0   
#;-) [103] MASS_7.3-60