# Load necessary libraries
library(tidyverse)
#;-) Warning: pakiet 'tidyr' został zbudowany w wersji R 4.3.3
#;-) ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#;-) ✔ dplyr     1.1.4     ✔ readr     2.1.5
#;-) ✔ forcats   1.0.0     ✔ stringr   1.5.1
#;-) ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
#;-) ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
#;-) ✔ purrr     1.0.2     
#;-) ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#;-) ✖ dplyr::filter() masks stats::filter()
#;-) ✖ dplyr::lag()    masks stats::lag()
#;-) ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dlookr)
#;-) Warning: pakiet 'dlookr' został zbudowany w wersji R 4.3.3
#;-) Registered S3 methods overwritten by 'dlookr':
#;-)   method          from  
#;-)   plot.transform  scales
#;-)   print.transform scales
#;-) 
#;-) Dołączanie pakietu: 'dlookr'
#;-) 
#;-) Następujący obiekt został zakryty z 'package:tidyr':
#;-) 
#;-)     extract
#;-) 
#;-) Następujący obiekt został zakryty z 'package:base':
#;-) 
#;-)     transform
library(rrcov)
#;-) Warning: pakiet 'rrcov' został zbudowany w wersji R 4.3.3
#;-) Ładowanie wymaganego pakietu: robustbase
#;-) Warning: pakiet 'robustbase' został zbudowany w wersji R 4.3.3
#;-) Scalable Robust Estimators with High Breakdown Point (version 1.7-5)

# Fix the RNG state so every run generates the same dataset
set.seed(777)

# Every column below is drawn at random and then deliberately corrupted
# (invalid text markers, NAs or negative sentinel values).
n <- 1000  # rows to generate

# age: 18-90, then 50 entries overwritten with the text "InvalidAge"
# (mixing in text turns the whole vector into character)
age <- sample(18:90, n, replace = TRUE)
age[sample(seq_len(n), 50)] <- "InvalidAge"

# workclass: one of eight employment classes, then 50 entries set to NA
workclass_levels <- c(
  "Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov",
  "Local-gov", "State-gov", "Without-pay", "Never-worked"
)
workclass <- sample(workclass_levels, n, replace = TRUE)
workclass[sample(seq_len(n), 50)] <- NA

# education: one of sixteen levels
education_levels <- c(
  "Bachelors", "Some-college", "11th", "HS-grad", "Prof-school",
  "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters",
  "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"
)
education <- sample(education_levels, n, replace = TRUE)

# Remember which rows drew "Bachelors" or "11th" before anything is overwritten
inconsistent_indices <- education %in% c("Bachelors", "11th")

# Mark 50 random rows, plus every remembered row, as "InvalidEducation"
education[sample(seq_len(n), 50)] <- "InvalidEducation"
education[inconsistent_indices] <- "InvalidEducation"

# education_num: integer code 1-16, left clean
education_num <- sample(1:16, n, replace = TRUE)

# marital_status: one of seven statuses, then 50 entries set to NA
marital_status_levels <- c(
  "Married-civ-spouse", "Divorced", "Never-married", "Separated",
  "Widowed", "Married-spouse-absent", "Married-AF-spouse"
)
marital_status <- sample(marital_status_levels, n, replace = TRUE)
marital_status[sample(seq_len(n), 50)] <- NA

# occupation: one of fourteen jobs, then 50 entries set to "InvalidOccupation"
occupation_levels <- c(
  "Tech-support", "Craft-repair", "Other-service", "Sales",
  "Exec-managerial", "Prof-specialty", "Handlers-cleaners",
  "Machine-op-inspct", "Adm-clerical", "Farming-fishing",
  "Transport-moving", "Priv-house-serv", "Protective-serv", "Armed-Forces"
)
occupation <- sample(occupation_levels, n, replace = TRUE)
occupation[sample(seq_len(n), 50)] <- "InvalidOccupation"

# relationship: one of six roles; "Own-child" and "Other-relative" are turned
# into "InvalidRelationship", then 50 entries are set to NA
relationship_levels <- c(
  "Wife", "Own-child", "Husband", "Not-in-family", "Other-relative",
  "Unmarried"
)
relationship <- sample(relationship_levels, n, replace = TRUE)
inconsistent_relation <- relationship %in% c("Own-child", "Other-relative")
relationship[inconsistent_relation] <- "InvalidRelationship"
relationship[sample(seq_len(n), 50)] <- NA

# race: one of five groups; two of them are turned into "InvalidRace",
# then 50 more random entries get the same marker
race_levels <- c(
  "White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"
)
race <- sample(race_levels, n, replace = TRUE)
inconsistent_race <- race %in% c("Asian-Pac-Islander", "Amer-Indian-Eskimo")
race[inconsistent_race] <- "InvalidRace"
race[sample(seq_len(n), 50)] <- "InvalidRace"

# sex: Female or Male, then 50 entries set to NA
sex <- sample(c("Female", "Male"), n, replace = TRUE)
sex[sample(seq_len(n), 50)] <- NA

# capital_gain: 0-99999, then 50 entries replaced by the sentinel -999
capital_gain <- sample(0:99999, n, replace = TRUE)
capital_gain[sample(seq_len(n), 50)] <- -999

# capital_loss: 0-99999, then 50 entries replaced by the sentinel -999
capital_loss <- sample(0:99999, n, replace = TRUE)
capital_loss[sample(seq_len(n), 50)] <- -999

# hours_per_week: 1-99, then 50 entries set to NA
hours_per_week <- sample(1:99, n, replace = TRUE)
hours_per_week[sample(seq_len(n), 50)] <- NA

# native_country: one of the listed countries, then 50 entries set to NA
native_country_levels <- c(
  "United-States", "Cambodia", "England", "Puerto-Rico", "Canada",
  "Germany", "Outlying-US(Guam-USVI-etc)", "India", "Japan", "Greece",
  "South", "China", "Cuba", "Iran", "Honduras", "Philippines", "Italy",
  "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal", "Ireland",
  "France", "Dominican-Republic", "Laos", "Ecuador", "Taiwan", "Haiti",
  "Columbia", "Hungary", "Guatemala", "Nicaragua", "Scotland", "Thailand",
  "Yugoslavia", "El-Salvador", "Trinadad&Tobago", "Peru", "Hong",
  "Holand-Netherlands"
)
native_country <- sample(native_country_levels, n, replace = TRUE)
native_country[sample(seq_len(n), 50)] <- NA

# income: "<=50K" with probability 0.75, ">50K" with probability 0.25
income <- sample(
  c("<=50K", ">50K"),
  n,
  replace = TRUE,
  prob = c(0.75, 0.25)
)

# Assemble all columns into one data frame
sample_dirty_dataset <- data.frame(
  age = age,
  workclass = workclass,
  education = education,
  education_num = education_num,
  marital_status = marital_status,
  occupation = occupation,
  relationship = relationship,
  race = race,
  sex = sex,
  capital_gain = capital_gain,
  capital_loss = capital_loss,
  hours_per_week = hours_per_week,
  native_country = native_country,
  income = income
)


# 3.STEP
# Repair the numeric columns: age, capital_gain and capital_loss.
# `age` is character at this point because the "InvalidAge" marker was mixed
# into it; ifelse() on a character column would keep it character (and store
# the imputed median as text), so it is converted to numeric explicitly.
sample_dirty_dataset <- sample_dirty_dataset %>%
  mutate(
    # Blank out the marker first so as.numeric() converts without a
    # coercion warning; %in% never yields NA, so existing NAs are untouched.
    age = as.numeric(replace(age, age %in% "InvalidAge", NA)),
    # Impute missing ages with the median of the observed ages
    age = replace(age, is.na(age), median(age, na.rm = TRUE)),
    # Negative capital values are clamped to 0
    capital_gain = pmax(capital_gain, 0),
    capital_loss = pmax(capital_loss, 0)
  )

# Flag the rows whose occupation carries the "InvalidOccupation" marker
occupation_is_invalid <- sample_dirty_dataset$occupation == "InvalidOccupation"

# Collect the distinct occupations found on the remaining rows
valid_occupations <- unique(
  sample_dirty_dataset$occupation[!occupation_is_invalid]
)

# Overwrite each flagged row with an occupation drawn at random
# (with replacement) from the valid ones
sample_dirty_dataset$occupation <- replace(
  sample_dirty_dataset$occupation,
  occupation_is_invalid,
  sample(valid_occupations, sum(occupation_is_invalid), replace = TRUE)
)

# 5.STEP
# Columns that may still hold "Invalid*" marker values
columns_to_clean <- c("education", "occupation", "relationship", "race")

# Marker strings that flag an invalid entry (the same set for every column)
invalid_markers <- c(
  "InvalidEducation", "InvalidOccupation", "InvalidRelationship", "InvalidRace"
)

# Replace the markers in each column with randomly sampled valid values
for (column in columns_to_clean) {
  values <- sample_dirty_dataset[[column]]
  is_invalid <- values %in% invalid_markers

  # Valid entries are observed values that are neither a marker nor missing.
  # NA has to be excluded: otherwise it can be drawn as a "valid" replacement
  # and the repaired row is thrown away again by drop_na() below.
  valid_entries <- unique(values[!is_invalid & !is.na(values)])

  # Column is already clean: nothing to replace
  if (!any(is_invalid)) {
    next
  }

  # Fail with a clear message instead of a cryptic sample() error
  if (length(valid_entries) == 0) {
    stop(
      "No valid entries to sample from in column '", column, "'",
      call. = FALSE
    )
  }

  values[is_invalid] <- sample(valid_entries, sum(is_invalid), replace = TRUE)
  sample_dirty_dataset[[column]] <- values
}

# Remove every row that still has a missing value in any column
sample_dirty_dataset <- sample_dirty_dataset %>%
  drop_na()

# 6.STEP - OUTLIERS
# First method: boxplot. Drawing the plot also returns its statistics; the
# `out` element holds the values lying beyond the whiskers.
capital_gain_boxplot <- boxplot(sample_dirty_dataset$capital_gain)

# Row positions whose capital_gain is one of those outlying values
which(sample_dirty_dataset$capital_gain %in% capital_gain_boxplot$out)




# Preview the top of the dataset (6 rows, which is head()'s default)
head(sample_dirty_dataset, n = 6L)
#;-)   age        workclass   education education_num     marital_status
#;-) 1  41 Self-emp-not-inc     7th-8th             5 Married-civ-spouse
#;-) 2  77        State-gov   Assoc-voc             6            Widowed
#;-) 3  42     Never-worked     7th-8th            10  Married-AF-spouse
#;-) 4  26        Local-gov   Assoc-voc             1          Separated
#;-) 5  33        Local-gov   Preschool            16 Married-civ-spouse
#;-) 6  52      Without-pay Prof-school             7 Married-civ-spouse
#;-)          occupation  relationship  race    sex capital_gain capital_loss
#;-) 1 Handlers-cleaners       Husband White   Male        73933        88732
#;-) 2 Handlers-cleaners       Husband Other   Male         8053        61667
#;-) 3      Armed-Forces Not-in-family White Female        62910        46573
#;-) 4   Farming-fishing     Unmarried White Female        89640        34015
#;-) 5 Machine-op-inspct       Husband White   Male        92799        23554
#;-) 6   Farming-fishing          Wife Black Female        26938           67
#;-)   hours_per_week     native_country income
#;-) 1             11 Holand-Netherlands   >50K
#;-) 2             23            Ecuador  <=50K
#;-) 3             21             Mexico  <=50K
#;-) 4             58               Peru  <=50K
#;-) 5             45           Thailand  <=50K
#;-) 6             99            Vietnam   >50K

# Generate the dlookr diagnosis report for the dataset; the captured output
# below shows it rendered through pandoc into Diagnosis_Report.html in a
# temporary directory.
diagnose_web_report(sample_dirty_dataset)
#;-) processing file: diagnosis_temp.Rmd
#;-) output file: diagnosis_temp.knit.md
#;-) "C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/pandoc" +RTS -K512m -RTS diagnosis_temp.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc29801df57082.html --lua-filter "C:\Users\huber\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\pagebreak.lua" --lua-filter "C:\Users\huber\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\latex-div.lua" --embed-resources --standalone --variable bs3=TRUE --section-divs --template "C:\Users\huber\AppData\Local\R\win-library\4.3\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable theme=bootstrap --css "C:/Users/huber/AppData/Local/R/win-library/4.3/dlookr/resources/dlookr-bootstrap.css" --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\huber\AppData\Local\Temp\RtmpsJZDna\rmarkdown-str29801b276656.html" --variable code_folding=show --variable code_menu=1 --include-in-header header_temp.html --include-after-body "C:\Users\huber\AppData\Local\R\win-library\4.3\dlookr\resources\footer.html"
#;-) 
#;-) Output created: C:\Users\huber\AppData\Local\Temp\RtmpsJZDna/Diagnosis_Report.html

# Sample data