author: “Aleksandra Templin & Ksawery Raupuk”

Sample data

# Load necessary libraries
library(tidyverse)
#;-) Warning: pakiet 'tidyverse' został zbudowany w wersji R 4.3.3
#;-) ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#;-) ✔ dplyr     1.1.4     ✔ readr     2.1.5
#;-) ✔ forcats   1.0.0     ✔ stringr   1.5.1
#;-) ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
#;-) ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
#;-) ✔ purrr     1.0.2     
#;-) ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#;-) ✖ dplyr::filter() masks stats::filter()
#;-) ✖ dplyr::lag()    masks stats::lag()
#;-) ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dlookr)
#;-) Warning: pakiet 'dlookr' został zbudowany w wersji R 4.3.3
#;-) Registered S3 methods overwritten by 'dlookr':
#;-)   method          from  
#;-)   plot.transform  scales
#;-)   print.transform scales
#;-) 
#;-) Dołączanie pakietu: 'dlookr'
#;-) 
#;-) Następujący obiekt został zakryty z 'package:tidyr':
#;-) 
#;-)     extract
#;-) 
#;-) Następujący obiekt został zakryty z 'package:base':
#;-) 
#;-)     transform
library(naniar)
#;-) Warning: pakiet 'naniar' został zbudowany w wersji R 4.3.3
library(dplyr)

# Set seed for reproducibility
set.seed(7)

# Generate sample data
n <- 1000  # Number of observations

# Age between 18 and 90
age <- sample(18:90, n, replace = TRUE)

# Introduce errors in age column
age[sample(1:n, 50)] <- "InvalidAge"

# Workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked
workclass <- sample(c("Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov", "Local-gov", "State-gov", "Without-pay", "Never-worked"), n, replace = TRUE)

# Introduce missing values in workclass column
workclass[sample(1:n, 50)] <- NA

# Education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool
education <- sample(c("Bachelors", "Some-college", "11th", "HS-grad", "Prof-school", "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters", "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"), n, replace = TRUE)

# Introduce inconsistencies in education column
education[sample(1:n, 50)] <- "InvalidEducation"

# Education Number: 1-16
education_num <- sample(1:16, n, replace = TRUE)

# Marital Status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse
marital_status <- sample(c("Married-civ-spouse", "Divorced", "Never-married", "Separated", "Widowed", "Married-spouse-absent", "Married-AF-spouse"), n, replace = TRUE)

# Introduce missing values in marital_status column
marital_status[sample(1:n, 50)] <- NA

# Occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces
occupation <- sample(c("Tech-support", "Craft-repair", "Other-service", "Sales", "Exec-managerial", "Prof-specialty", "Handlers-cleaners", "Machine-op-inspct", "Adm-clerical", "Farming-fishing", "Transport-moving", "Priv-house-serv", "Protective-serv", "Armed-Forces"), n, replace = TRUE)

# Introduce errors in occupation column
occupation[sample(1:n, 50)] <- "InvalidOccupation"

# Relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried
relationship <- sample(c("Wife", "Own-child", "Husband", "Not-in-family", "Other-relative", "Unmarried"), n, replace = TRUE)

# Introduce missing values in relationship column
relationship[sample(1:n, 50)] <- NA

# Race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black
race <- sample(c("White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"), n, replace = TRUE)

# Introduce inconsistencies in race column
race[sample(1:n, 50)] <- "InvalidRace"

# Sex: Female, Male
sex <- sample(c("Female", "Male"), n, replace = TRUE)

# Introduce missing values in sex column
sex[sample(1:n, 50)] <- NA

# Capital Gain: 0-99999
capital_gain <- sample(0:99999, n, replace = TRUE)

# Introduce errors in capital_gain column
capital_gain[sample(1:n, 50)] <- -999

# Capital Loss: 0-99999
capital_loss <- sample(0:99999, n, replace = TRUE)

# Introduce errors in capital_loss column
capital_loss[sample(1:n, 50)] <- -999

# Hours per week: 1-99
hours_per_week <- sample(1:99, n, replace = TRUE)

# Introduce missing values in hours_per_week column
hours_per_week[sample(1:n, 50)] <- NA

# Native country: United-States, Cambodia, England, ...
native_country <- sample(c("United-States", "Cambodia", "England", "Puerto-Rico", "Canada", "Germany", "Outlying-US(Guam-USVI-etc)", "India", "Japan", "Greece", "South", "China", "Cuba", "Iran", "Honduras", "Philippines", "Italy", "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal", "Ireland", "France", "Dominican-Republic", "Laos", "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary", "Guatemala", "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador", "Trinadad&Tobago", "Peru", "Hong", "Holand-Netherlands"), n, replace = TRUE)

# Introduce missing values in native_country column
native_country[sample(1:n, 50)] <- NA

# Income: <=50K, >50K
income <- sample(c("<=50K", ">50K"), n, replace = TRUE, prob = c(0.75, 0.25))

# Create dataframe
sample_dirty_dataset <- data.frame(age, workclass, education, education_num, marital_status, occupation, relationship, race, sex, capital_gain, capital_loss, hours_per_week, native_country, income)

# Print first few rows of the dataset
head(sample_dirty_dataset)
#;-)   age    workclass   education education_num     marital_status
#;-) 1  59  Without-pay        10th            15          Separated
#;-) 2  48    State-gov  Assoc-acdm             6 Married-civ-spouse
#;-) 3  83    State-gov   Bachelors             1 Married-civ-spouse
#;-) 4  32 Never-worked Prof-school            16  Married-AF-spouse
#;-) 5  25      Private     Masters             9 Married-civ-spouse
#;-) 6  84  Without-pay     1st-4th             9            Widowed
#;-)          occupation   relationship               race    sex capital_gain
#;-) 1   Priv-house-serv           Wife              Black   Male        35050
#;-) 2 Machine-op-inspct Other-relative Amer-Indian-Eskimo Female         -999
#;-) 3   Priv-house-serv           <NA>              Black   Male        84587
#;-) 4  Transport-moving           <NA> Asian-Pac-Islander Female        24715
#;-) 5 InvalidOccupation Other-relative              Other Female        16599
#;-) 6      Adm-clerical      Unmarried              White   Male         1505
#;-)   capital_loss hours_per_week native_country income
#;-) 1        96892             34         Poland  <=50K
#;-) 2        20459             40           Iran  <=50K
#;-) 3        76544             52         Poland   >50K
#;-) 4        92227              8        Vietnam  <=50K
#;-) 5         -999             85           Hong  <=50K
#;-) 6        33344             43        Ireland  <=50K

Steps 3&4 - identify issues

# Detecting errors - where do we have 'NA'? - using naniar

sample_dirty_dataset[sample_dirty_dataset == "InvalidAge" | sample_dirty_dataset == "InvalidEducation" | sample_dirty_dataset == "InvalidOccupation" | sample_dirty_dataset == "InvalidRace"] <- NA
sample_dirty_dataset$age <- as.numeric(sample_dirty_dataset$age)

sample_dirty_dataset$capital_gain[sample_dirty_dataset$capital_gain < 0] <- NA

sample_dirty_dataset$capital_loss[sample_dirty_dataset$capital_loss < 0] <- NA

vis_miss(sample_dirty_dataset, cluster = TRUE, sort_miss = TRUE)

Step 5 - impute/remove missing values

# Imputing missing values for numerical variables with mean
numerical_vars <- c("capital_gain", "capital_loss", "hours_per_week", "age")

for (var in numerical_vars) {
  sample_dirty_dataset[[var]] <- ifelse(is.na(sample_dirty_dataset[[var]]),mean(sample_dirty_dataset[[var]],na.rm = TRUE),sample_dirty_dataset[[var]])
}

# Removing values that were not imputed
sample_dirty_dataset <- sample_dirty_dataset[complete.cases(sample_dirty_dataset), ]

head(sample_dirty_dataset)
#;-)    age    workclass  education education_num        marital_status
#;-) 1   59  Without-pay       10th            15             Separated
#;-) 2   48    State-gov Assoc-acdm             6    Married-civ-spouse
#;-) 6   84  Without-pay    1st-4th             9               Widowed
#;-) 10  25 Never-worked    Masters            12         Never-married
#;-) 11  76  Without-pay       11th            10 Married-spouse-absent
#;-) 12  29  Federal-gov  Preschool             5               Widowed
#;-)           occupation   relationship               race    sex capital_gain
#;-) 1    Priv-house-serv           Wife              Black   Male     35050.00
#;-) 2  Machine-op-inspct Other-relative Amer-Indian-Eskimo Female     49411.71
#;-) 6       Adm-clerical      Unmarried              White   Male      1505.00
#;-) 10   Protective-serv  Not-in-family Amer-Indian-Eskimo   Male     60313.00
#;-) 11   Farming-fishing           Wife Asian-Pac-Islander   Male     10088.00
#;-) 12 Handlers-cleaners  Not-in-family              Other Female     48526.00
#;-)    capital_loss hours_per_week  native_country income
#;-) 1         96892             34          Poland  <=50K
#;-) 2         20459             40            Iran  <=50K
#;-) 6         33344             43         Ireland  <=50K
#;-) 10        73126             44            Cuba  <=50K
#;-) 11         7797             31         Germany  <=50K
#;-) 12        73226             31 Trinadad&Tobago  <=50K
vis_miss(sample_dirty_dataset, cluster = TRUE, sort_miss = TRUE)

Step 6 - detection and visualization of outliers

# Histogram
hist(sample_dirty_dataset$age, main = "Histogram of Age", xlab = "Age")


# Boxplot
boxplot(sample_dirty_dataset$age, main = "Boxplot of Age")


# Z-score
z_scores <- scale(sample_dirty_dataset$age)
hist(z_scores)


# it looks like we don't have much outliers in 'age' column, so we can leave them

Step 7 - Standardization, normalization or binning

library(dlookr)

bin <- binning(sample_dirty_dataset$education_num)
summary(bin)
#;-)     levels freq       rate
#;-) 1    [1,2]   81 0.12235650
#;-) 2    (2,3]   46 0.06948640
#;-) 3    (3,5]   86 0.12990937
#;-) 4    (5,6]   57 0.08610272
#;-) 5    (6,7]   38 0.05740181
#;-) 6    (7,9]   81 0.12235650
#;-) 7   (9,10]   43 0.06495468
#;-) 8  (10,12]   68 0.10271903
#;-) 9  (12,13]   47 0.07099698
#;-) 10 (13,15]   84 0.12688822
#;-) 11 (15,16]   31 0.04682779
plot(bin)

Step 10

sample_dirty_dataset %>% diagnose_web_report(theme = "blue")
#;-) processing file: diagnosis_temp.Rmd
#;-) output file: diagnosis_temp.knit.md
#;-) "C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/pandoc" +RTS -K512m -RTS diagnosis_temp.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc49a06bcc15e3.html --lua-filter "C:\Users\aleks\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\pagebreak.lua" --lua-filter "C:\Users\aleks\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\latex-div.lua" --embed-resources --standalone --variable bs3=TRUE --section-divs --template "C:\Users\aleks\AppData\Local\R\win-library\4.3\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable theme=bootstrap --css "C:/Users/aleks/AppData/Local/R/win-library/4.3/dlookr/resources/dlookr-bootstrap.css" --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\aleks\AppData\Local\Temp\Rtmp0CP0yZ\rmarkdown-str49a03a4e6bda.html" --variable code_folding=show --variable code_menu=1 --include-in-header header_temp.html --include-after-body "C:\Users\aleks\AppData\Local\R\win-library\4.3\dlookr\resources\footer.html"
#;-) 
#;-) Output created: C:\Users\aleks\AppData\Local\Temp\Rtmp0CP0yZ/Diagnosis_Report.html

sample_dirty_dataset %>% eda_web_report(theme = "blue")
#;-) processing file: eda_temp.Rmd
#;-) Error in parse_block(g[-1], g[1], params.src, markdown_mode): Duplicate chunk label 'unnamed-chunk-1', which has been used for the chunk:
#;-) cat(sprintf("*Local `.Rprofile` detected at `%s`*", normalizePath(".Rprofile")))

sample_dirty_dataset %>% transformation_web_report(theme = "blue")
#;-) processing file: transformation_temp.Rmd
#;-) Error in parse_block(g[-1], g[1], params.src, markdown_mode): Duplicate chunk label 'unnamed-chunk-1', which has been used for the chunk:
#;-) cat(sprintf("*Local `.Rprofile` detected at `%s`*", normalizePath(".Rprofile")))
Standard output and standard error
 Install the styler package in order to use `style = TRUE`.
Session info
sessionInfo()
#;-) R version 4.3.2 (2023-10-31 ucrt)
#;-) Platform: x86_64-w64-mingw32/x64 (64-bit)
#;-) Running under: Windows 11 x64 (build 22631)
#;-) 
#;-) Matrix products: default
#;-) 
#;-) 
#;-) locale:
#;-) [1] LC_COLLATE=Polish_Poland.utf8  LC_CTYPE=Polish_Poland.utf8   
#;-) [3] LC_MONETARY=Polish_Poland.utf8 LC_NUMERIC=C                  
#;-) [5] LC_TIME=Polish_Poland.utf8    
#;-) 
#;-) time zone: Europe/Warsaw
#;-) tzcode source: internal
#;-) 
#;-) attached base packages:
#;-) [1] stats     graphics  grDevices utils     datasets  methods   base     
#;-) 
#;-) other attached packages:
#;-)  [1] htmltools_0.5.7  reactable_0.4.4  kableExtra_1.4.0 naniar_1.1.0    
#;-)  [5] dlookr_0.6.3     lubridate_1.9.3  forcats_1.0.0    stringr_1.5.1   
#;-)  [9] dplyr_1.1.4      purrr_1.0.2      readr_2.1.5      tidyr_1.3.1     
#;-) [13] tibble_3.2.1     ggplot2_3.4.4    tidyverse_2.0.0 
#;-) 
#;-) loaded via a namespace (and not attached):
#;-)  [1] tidyselect_1.2.0        viridisLite_0.4.2       farver_2.1.1           
#;-)  [4] fastmap_1.1.1           pagedown_0.20           fontquiver_0.2.1       
#;-)  [7] promises_1.2.1          reprex_2.1.0            digest_0.6.34          
#;-) [10] timechange_0.3.0        mime_0.12               lifecycle_1.0.4        
#;-) [13] gfonts_0.2.0            ellipsis_0.3.2          magrittr_2.0.3         
#;-) [16] compiler_4.3.2          rlang_1.1.3             sass_0.4.8             
#;-) [19] tools_4.3.2             utf8_1.2.4              yaml_2.3.8             
#;-) [22] knitr_1.45              labeling_0.4.3          htmlwidgets_1.6.4      
#;-) [25] curl_5.2.0              xml2_1.3.6              showtextdb_3.0         
#;-) [28] httpcode_0.3.0          withr_3.0.0             grid_4.3.2             
#;-) [31] fansi_1.0.6             sysfonts_0.8.9          gdtools_0.3.7          
#;-) [34] xtable_1.8-4            colorspace_2.1-0        extrafontdb_1.0        
#;-) [37] scales_1.3.0            crul_1.4.0              cli_3.6.2              
#;-) [40] rmarkdown_2.25          crayon_1.5.2            generics_0.1.3         
#;-) [43] rstudioapi_0.15.0       tzdb_0.4.0              cachem_1.0.8           
#;-) [46] base64enc_0.1-3         vctrs_0.6.5             jsonlite_1.8.8         
#;-) [49] fontBitstreamVera_0.1.1 hms_1.1.3               visdat_0.6.0           
#;-) [52] systemfonts_1.0.5       fontawesome_0.5.2       jquerylib_0.1.4        
#;-) [55] glue_1.7.0              reactR_0.5.0            stringi_1.8.3          
#;-) [58] gtable_0.3.4            later_1.3.2             extrafont_0.19         
#;-) [61] munsell_0.5.0           pillar_1.9.0            showtext_0.9-7         
#;-) [64] R6_2.5.1                evaluate_0.23           shiny_1.8.1.1          
#;-) [67] highr_0.10              fontLiberation_0.1.0    httpuv_1.6.15          
#;-) [70] bslib_0.6.1             hrbrthemes_0.8.7        Rcpp_1.0.12            
#;-) [73] svglite_2.1.3           gridExtra_2.3           Rttf2pt1_1.3.12        
#;-) [76] xfun_0.42               fs_1.6.3                pkgconfig_2.0.3