Sample data
# Load necessary libraries
library(tidyverse)
## Warning: package 'tidyr' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dlookr)
## Warning: package 'dlookr' was built under R version 4.3.3
## Registered S3 methods overwritten by 'dlookr':
## method from
## plot.transform scales
## print.transform scales
##
## Attaching package: 'dlookr'
##
## The following object is masked from 'package:tidyr':
##
## extract
##
## The following object is masked from 'package:base':
##
## transform
library(rrcov)
## Warning: package 'rrcov' was built under R version 4.3.3
## Zorunlu paket yükleniyor: robustbase
## Warning: package 'robustbase' was built under R version 4.3.3
## Scalable Robust Estimators with High Breakdown Point (version 1.7-5)
# Fix the RNG seed so the synthetic data set is identical on every run.
# NOTE: the order of the sample() calls below determines the generated values.
set.seed(31)

# ---- Synthetic "dirty" census-style data ----
n <- 1000 # number of observations
n_dirty <- 50 # number of entries corrupted in each affected column

# age: whole years from 18 to 90
age <- sample(18:90, size = n, replace = TRUE)
# corrupt some ages; assigning a string turns the whole vector into character
age[sample(seq_len(n), size = n_dirty)] <- "InvalidAge"

# workclass: employment sector
workclass <- sample(
  c(
    "Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov",
    "Local-gov", "State-gov", "Without-pay", "Never-worked"
  ),
  size = n, replace = TRUE
)
# blank out some workclass entries
workclass[sample(seq_len(n), size = n_dirty)] <- NA

# education: highest level attained
education <- sample(
  c(
    "Bachelors", "Some-college", "11th", "HS-grad", "Prof-school",
    "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters",
    "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"
  ),
  size = n, replace = TRUE
)
# plant an invalid category label in some education entries
education[sample(seq_len(n), size = n_dirty)] <- "InvalidEducation"

# education_num: integer code from 1 to 16 (left clean)
education_num <- sample(1:16, size = n, replace = TRUE)

# marital_status
marital_status <- sample(
  c(
    "Married-civ-spouse", "Divorced", "Never-married", "Separated",
    "Widowed", "Married-spouse-absent", "Married-AF-spouse"
  ),
  size = n, replace = TRUE
)
# blank out some marital_status entries
marital_status[sample(seq_len(n), size = n_dirty)] <- NA

# occupation
occupation <- sample(
  c(
    "Tech-support", "Craft-repair", "Other-service", "Sales",
    "Exec-managerial", "Prof-specialty", "Handlers-cleaners",
    "Machine-op-inspct", "Adm-clerical", "Farming-fishing",
    "Transport-moving", "Priv-house-serv", "Protective-serv", "Armed-Forces"
  ),
  size = n, replace = TRUE
)
# plant an invalid category label in some occupation entries
occupation[sample(seq_len(n), size = n_dirty)] <- "InvalidOccupation"

# relationship
relationship <- sample(
  c(
    "Wife", "Own-child", "Husband", "Not-in-family", "Other-relative",
    "Unmarried"
  ),
  size = n, replace = TRUE
)
# blank out some relationship entries
relationship[sample(seq_len(n), size = n_dirty)] <- NA

# race
race <- sample(
  c("White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"),
  size = n, replace = TRUE
)
# plant an invalid category label in some race entries
race[sample(seq_len(n), size = n_dirty)] <- "InvalidRace"

# sex
sex <- sample(c("Female", "Male"), size = n, replace = TRUE)
# blank out some sex entries
sex[sample(seq_len(n), size = n_dirty)] <- NA

# capital_gain: 0 to 99999, with -999 planted as an error sentinel
capital_gain <- sample(0:99999, size = n, replace = TRUE)
capital_gain[sample(seq_len(n), size = n_dirty)] <- -999

# capital_loss: 0 to 99999, with -999 planted as an error sentinel
capital_loss <- sample(0:99999, size = n, replace = TRUE)
capital_loss[sample(seq_len(n), size = n_dirty)] <- -999

# hours_per_week: 1 to 99, with some entries blanked out
hours_per_week <- sample(1:99, size = n, replace = TRUE)
hours_per_week[sample(seq_len(n), size = n_dirty)] <- NA

# native_country, with some entries blanked out
native_country <- sample(
  c(
    "United-States", "Cambodia", "England", "Puerto-Rico", "Canada",
    "Germany", "Outlying-US(Guam-USVI-etc)", "India", "Japan", "Greece",
    "South", "China", "Cuba", "Iran", "Honduras", "Philippines", "Italy",
    "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal", "Ireland",
    "France", "Dominican-Republic", "Laos", "Ecuador", "Taiwan", "Haiti",
    "Columbia", "Hungary", "Guatemala", "Nicaragua", "Scotland", "Thailand",
    "Yugoslavia", "El-Salvador", "Trinadad&Tobago", "Peru", "Hong",
    "Holand-Netherlands"
  ),
  size = n, replace = TRUE
)
native_country[sample(seq_len(n), size = n_dirty)] <- NA

# income: "<=50K" drawn with probability 0.75, ">50K" with 0.25 (left clean)
income <- sample(
  c("<=50K", ">50K"),
  size = n, replace = TRUE, prob = c(0.75, 0.25)
)

# Assemble the columns; each column is named after its vector
sample_dirty_dataset <- data.frame(
  age, workclass, education, education_num, marital_status, occupation,
  relationship, race, sex, capital_gain, capital_loss, hours_per_week,
  native_country, income
)

# Show the first rows of the data set
head(sample_dirty_dataset)
## age workclass education education_num marital_status
## 1 62 Without-pay Masters 5 Separated
## 2 66 Without-pay 1st-4th 6 Married-civ-spouse
## 3 57 Never-worked Some-college 13 Married-civ-spouse
## 4 81 State-gov Bachelors 2 Married-civ-spouse
## 5 60 Self-emp-inc 1st-4th 5 Never-married
## 6 72 Self-emp-inc HS-grad 15 Separated
## occupation relationship race sex capital_gain
## 1 Priv-house-serv Wife Amer-Indian-Eskimo Female 70407
## 2 Other-service Own-child Black Female 42541
## 3 Other-service Own-child White Male 70503
## 4 InvalidOccupation Own-child White Female 51598
## 5 Farming-fishing Not-in-family InvalidRace Female 8284
## 6 Machine-op-inspct <NA> White Male 88182
## capital_loss hours_per_week native_country income
## 1 -999 4 South <=50K
## 2 37405 59 Outlying-US(Guam-USVI-etc) <=50K
## 3 78767 34 India <=50K
## 4 -999 61 Ireland <=50K
## 5 63324 20 Ecuador <=50K
## 6 -999 11 Cuba <=50K
Our code
# Recode every invalid marker / sentinel value as NA so that all bad entries
# are represented the same way before rows with missing values are dropped.
# `%in%` and `which()` never produce NA, so the selections stay well defined
# even when a column already contains missing values.
sample_dirty_dataset$age[sample_dirty_dataset$age %in% "InvalidAge"] <- NA
# The "InvalidAge" strings forced the whole age column to character. Once they
# are gone, convert it back to integer so age is analysed as a numeric variable.
sample_dirty_dataset$age <- as.integer(sample_dirty_dataset$age)

sample_dirty_dataset$education[
  sample_dirty_dataset$education %in% "InvalidEducation"
] <- NA
sample_dirty_dataset$occupation[
  sample_dirty_dataset$occupation %in% "InvalidOccupation"
] <- NA
sample_dirty_dataset$race[sample_dirty_dataset$race %in% "InvalidRace"] <- NA

# Negative capital amounts are error sentinels (-999), not real values.
sample_dirty_dataset$capital_gain[
  which(sample_dirty_dataset$capital_gain < 0)
] <- NA
sample_dirty_dataset$capital_loss[
  which(sample_dirty_dataset$capital_loss < 0)
] <- NA
# Markers that flag an invalid entry, and the columns that are scanned for them.
invalid_values <- c(
  "InvalidEducation", "InvalidOccupation", "InvalidRelationship", "InvalidRace"
)
columns_to_clean <- c("education", "occupation", "relationship", "race")

# Replace invalid entries with randomly drawn valid ones, then drop incomplete
# rows.
#
# Despite its name the function does not modify its argument; it returns a
# cleaned copy.
#
# Arguments:
#   dataset          data frame to clean.
#   columns_to_clean character vector of column names to scan; names that are
#                    not present in `dataset` are skipped.
#   invalid_values   vector of values that count as invalid in those columns.
#
# Returns:
#   `dataset` with each invalid entry replaced by a value sampled (with
#   replacement) from the valid, non-missing values of the same column, and
#   with every row that still contains an NA removed. Row names are reset.
clean_data_inplace <- function(dataset, columns_to_clean, invalid_values) {
  for (column in columns_to_clean) {
    values <- dataset[[column]]
    is_invalid <- values %in% invalid_values
    n_invalid <- sum(is_invalid)
    if (n_invalid == 0L) {
      next
    }

    # NA must not be part of the replacement pool: otherwise an invalid entry
    # could be "fixed" to NA and its row silently dropped below.
    valid_entries <- unique(values[!is_invalid & !is.na(values)])

    if (length(valid_entries) == 0L) {
      # Nothing valid to draw from: mark the entries as missing instead of
      # failing, so the affected rows are removed with the other NA rows.
      dataset[[column]][is_invalid] <- NA
    } else {
      # Draw positions rather than values: sample() would treat a single
      # numeric value x as the range 1:x.
      picks <- sample.int(length(valid_entries), n_invalid, replace = TRUE)
      dataset[[column]][is_invalid] <- valid_entries[picks]
    }
  }

  # Keep only complete rows (complete.cases() needs at least one column).
  if (ncol(dataset) > 0L) {
    dataset <- dataset[stats::complete.cases(dataset), , drop = FALSE]
    rownames(dataset) <- NULL
  }
  dataset
}
# Replace any remaining invalid category markers and drop incomplete rows.
# clean_data_inplace() already removes every row containing an NA before it
# returns, so a second drop_na() pass over its result would be a no-op.
sample_dirty_dataset <- clean_data_inplace(
  sample_dirty_dataset,
  columns_to_clean,
  invalid_values
)
# Method 1: box plots of the cleaned numeric columns
# capital_gain and capital_loss are drawn side by side in one figure
boxplot(sample_dirty_dataset[c("capital_gain", "capital_loss")])

# hours_per_week gets a figure of its own
boxplot(sample_dirty_dataset[["hours_per_week"]])

# Method 2: diagnostic plots of a covMcd() fit, one variable at a time.
# The columns are taken from the cleaned `sample_dirty_dataset` (as in the
# other methods) rather than from the raw generator vectors, which still
# contain the -999 sentinels and the injected NAs.
mcd_columns <- c("capital_loss", "capital_gain", "hours_per_week")
old_mfrow <- par("mfrow")
for (column in mcd_columns) {
  # start a fresh 2 x 2 page for the diagnostic plots of each variable
  par(mfrow = c(2, 2))
  plot(covMcd(sample_dirty_dataset[[column]]))
}
# restore the previous layout so later plots are not squeezed into the grid
par(mfrow = old_mfrow)

# Method 3: z-scores of capital_gain
# Absolute standardised values: (x - mean) / sd, computed once and reused for
# both plots below.
zscore <- abs(scale(sample_dirty_dataset$capital_gain))

# Plot every observation as its row index, with a reference line at |z| = 2.
plot(zscore, type = "n")
abline(h = 2, col = "red")
text(seq_along(zscore), zscore)

# Stricter variant: treat every observation with |z| > 1 as an outlier and
# label only those points.
outlier_threshold <- 1
# Positions of the observations above the threshold
outliers <- which(zscore > outlier_threshold)

plot(zscore, type = "n", main = "Z-scores Plot")
abline(h = outlier_threshold, col = "red")
# Guard the labelling so an empty outlier set just leaves the plot unlabelled.
if (length(outliers) > 0) {
  text(outliers, zscore[outliers], labels = outliers, pos = 4, col = "blue")
}

# Render dlookr's data diagnosis report for the cleaned data set; the log that
# follows shows it being knitted to an HTML file (Diagnosis_Report.html).
diagnose_web_report(sample_dirty_dataset)
##
##
## processing file: diagnosis_temp.Rmd
##
|
| | 0%
|
|. | 1%
|
|. | 3% [setup]
|
|.. | 4%
|
|... | 6% [load_packages]
|
|... | 7%
|
|.... | 9% [get-parameters]
|
|..... | 10%
|
|..... | 12% [unnamed-chunk-4]
|
|...... | 13%
|
|....... | 15% [diagose]
|
|....... | 16%
|
|........ | 18% [create-overview]
|
|......... | 19%
|
|......... | 21% [overview]
|
|.......... | 22%
|
|........... | 24% [overview-datastructure]
|
|........... | 25%
|
|............ | 27% [overview-pre]
|
|............. | 28%
|
|............. | 30% [overview-warnings]
|
|.............. | 31%
|
|............... | 33% [warnings_summary]
|
|............... | 34%
|
|................ | 36% [warnings]
|
|................. | 37%
|
|................. | 39% [overview-variables]
|
|.................. | 40%
|
|................... | 42% [variables]
|
|................... | 43%
|
|.................... | 45% [missing]
|
|..................... | 46%
|
|..................... | 48% [missing-list]
|
|...................... | 49%
|
|....................... | 51% [missing-data]
|
|........................ | 52%
|
|........................ | 54% [missing-visualization]
|
|......................... | 55%
|
|.......................... | 57% [missing-viz2]
|
|.......................... | 58%
|
|........................... | 60% [unique]
|
|............................ | 61%
|
|............................ | 63% [unique-categorical]
|
|............................. | 64%
|
|.............................. | 66% [unique-date-category]
|
|.............................. | 67%
|
|............................... | 69% [unique-numerical]
|
|................................ | 70%
|
|................................ | 72% [unique-data-numeric]
|
|................................. | 73%
|
|.................................. | 75% [outliers]
|
|.................................. | 76%
|
|................................... | 78% [outliers-list]
|
|.................................... | 79%
|
|.................................... | 81% [samples]
|
|..................................... | 82%
|
|...................................... | 84% [duplicated]
|
|...................................... | 85%
|
|....................................... | 87% [duplicated-list]
|
|........................................ | 88%
|
|........................................ | 90% [heades]
|
|......................................... | 91%
|
|.......................................... | 93% [sample-head]
|
|.......................................... | 94%
|
|........................................... | 96% [tails]
|
|............................................ | 97%
|
|............................................ | 99% [sample-tail]
|
|.............................................| 100%
## output file: diagnosis_temp.knit.md
## "C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/pandoc" +RTS -K512m -RTS diagnosis_temp.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc1190342a4a5e.html --lua-filter "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\pagebreak.lua" --lua-filter "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\latex-div.lua" --embed-resources --standalone --variable bs3=TRUE --section-divs --template "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable theme=bootstrap --css "C:/Users/emrea/AppData/Local/R/win-library/4.3/dlookr/resources/dlookr-bootstrap.css" --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\emrea\AppData\Local\Temp\Rtmp6R3M25\rmarkdown-str1190436b4d6a.html" --variable code_folding=show --variable code_menu=1 --include-in-header header_temp.html --include-after-body "C:\Users\emrea\AppData\Local\R\win-library\4.3\dlookr\resources\footer.html"
##
## Output created: C:\Users\emrea\AppData\Local\Temp\Rtmp6R3M25/Diagnosis_Report.html
# Render dlookr's EDA report for the cleaned data set; the log that follows
# shows it being knitted to an HTML file (EDA_Report.html).
eda_web_report(sample_dirty_dataset)
##
##
## processing file: eda_temp.Rmd
##
|
| | 0%
|
|. | 2%
|
|.. | 3% [setup]
|
|.. | 5%
|
|... | 6% [load_packages]
|
|.... | 8%
|
|..... | 10% [unnamed-chunk-1]
|
|...... | 11%
|
|...... | 13% [udf]
|
|....... | 14%
|
|........ | 16% [check_variables]
|
|......... | 17%
|
|.......... | 19% [create-overview]
|
|.......... | 21%
|
|........... | 22% [overview]
|
|............ | 24%
|
|............. | 25% [overview-pre]
|
|............. | 27%
|
|.............. | 29% [unnamed-chunk-2]
|
|............... | 30%
|
|................ | 32% [unnamed-chunk-3]
|
|................. | 33%
|
|................. | 35% [variables]
|
|.................. | 37%
|
|................... | 38% [normality]
|
|.................... | 40%
|
|..................... | 41% [normality-list]
|
|..................... | 43%
|
|...................... | 44% [unnamed-chunk-4]
|
|....................... | 46%
|
|........................ | 48% [unnamed-chunk-5]
|
|......................... | 49%
|
|......................... | 51% [compare_numerical]
|
|.......................... | 52%
|
|........................... | 54% [unnamed-chunk-6]
|
|............................ | 56%
|
|............................. | 57% [compare-category]
|
|............................. | 59%
|
|.............................. | 60% [unnamed-chunk-7]
|
|............................... | 62%
|
|................................ | 63% [unnamed-chunk-8]
|
|................................. | 65%
|
|................................. | 67% [unnamed-chunk-9]
|
|.................................. | 68%
|
|................................... | 70% [correlation]
|
|.................................... | 71%
|
|..................................... | 73% [unnamed-chunk-10]
|
|..................................... | 75%
|
|...................................... | 76% [plot-correlation]
|
|....................................... | 78%
|
|........................................ | 79% [unnamed-chunk-11]
|
|........................................ | 81%
|
|......................................... | 83% [unnamed-chunk-12]
|
|.......................................... | 84%
|
|........................................... | 86% [group-numerical]
|
|............................................ | 87%
|
|............................................ | 89% [unnamed-chunk-13]
|
|............................................. | 90%
|
|.............................................. | 92% [group-categorical]
|
|............................................... | 94%
|
|................................................ | 95% [unnamed-chunk-14]
|
|................................................ | 97%
|
|................................................. | 98% [group-correlation]
|
|..................................................| 100%
## output file: eda_temp.knit.md
## "C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/pandoc" +RTS -K512m -RTS eda_temp.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc119017be151e.html --lua-filter "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\pagebreak.lua" --lua-filter "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\latex-div.lua" --embed-resources --standalone --variable bs3=TRUE --section-divs --template "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable theme=bootstrap --css "C:/Users/emrea/AppData/Local/R/win-library/4.3/dlookr/resources/dlookr-bootstrap.css" --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\emrea\AppData\Local\Temp\Rtmp6R3M25\rmarkdown-str11907cb46451.html" --variable code_folding=show --variable code_menu=1 --include-in-header header_temp.html --include-after-body "C:\Users\emrea\AppData\Local\R\win-library\4.3\dlookr\resources\footer.html"
##
## Output created: C:\Users\emrea\AppData\Local\Temp\Rtmp6R3M25/EDA_Report.html
# Render dlookr's transformation report for the cleaned data set; the log that
# follows shows it being knitted to an HTML file (Transformation_Report.html).
transformation_web_report(sample_dirty_dataset)
##
##
## processing file: transformation_temp.Rmd
##
|
| | 0%
|
|. | 3%
|
|... | 5% [setup]
|
|.... | 8%
|
|..... | 10% [load_packages]
|
|....... | 13%
|
|........ | 15% [unnamed-chunk-1]
|
|......... | 18%
|
|........... | 21% [udf]
|
|............ | 23%
|
|............. | 26% [create-overview]
|
|............... | 28%
|
|................ | 31% [overview]
|
|................. | 33%
|
|................... | 36% [overview-pre]
|
|.................... | 38%
|
|..................... | 41% [unnamed-chunk-2]
|
|....................... | 44%
|
|........................ | 46% [unnamed-chunk-3]
|
|......................... | 49%
|
|........................... | 51% [unnamed-chunk-4]
|
|............................ | 54%
|
|............................. | 56% [nalist]
|
|............................... | 59%
|
|................................ | 62% [unnamed-chunk-5]
|
|................................. | 64%
|
|................................... | 67% [outlist]
|
|.................................... | 69%
|
|..................................... | 72% [unnamed-chunk-6]
|
|....................................... | 74%
|
|........................................ | 77% [skweness]
|
|......................................... | 79%
|
|........................................... | 82% [unnamed-chunk-7]
|
|............................................ | 85%
|
|............................................. | 87% [binning]
|
|............................................... | 90%
|
|................................................ | 92% [unnamed-chunk-8]
|
|................................................. | 95%
|
|................................................... | 97% [optimal-binning]
|
|....................................................| 100%
## output file: transformation_temp.knit.md
## "C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/pandoc" +RTS -K512m -RTS transformation_temp.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc11904c07de7.html --lua-filter "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\pagebreak.lua" --lua-filter "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\latex-div.lua" --embed-resources --standalone --variable bs3=TRUE --section-divs --template "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable theme=bootstrap --css "C:/Users/emrea/AppData/Local/R/win-library/4.3/dlookr/resources/dlookr-bootstrap.css" --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\emrea\AppData\Local\Temp\Rtmp6R3M25\rmarkdown-str11905fbf26a1.html" --variable code_folding=show --variable code_menu=1 --include-in-header header_temp.html --include-after-body "C:\Users\emrea\AppData\Local\R\win-library\4.3\dlookr\resources\footer.html"
##
## Output created: C:\Users\emrea\AppData\Local\Temp\Rtmp6R3M25/Transformation_Report.html