# Sample data

# Load necessary libraries
library(tidyverse)
#;-) ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#;-) ✔ dplyr     1.1.4     ✔ readr     2.1.5
#;-) ✔ forcats   1.0.0     ✔ stringr   1.5.1
#;-) ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
#;-) ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
#;-) ✔ purrr     1.0.2     
#;-) ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#;-) ✖ dplyr::filter() masks stats::filter()
#;-) ✖ dplyr::lag()    masks stats::lag()
#;-) ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Build a deliberately dirty, census-style sample dataset ----
#
# Each column is drawn at random and most of them then have `n_dirty` entries
# overwritten with an invalid marker or `NA`, giving data-cleaning code
# something to find. All randomness comes from one seeded stream, so the
# statements below must stay in this order to reproduce the same data.

# Set seed for reproducibility
set.seed(123)

n <- 1000      # Number of observations
n_dirty <- 50  # Entries overwritten in each dirty column

# Draw `size` values from `values` with replacement; `...` is passed on to
# `sample()` (e.g. `prob`).
draw <- function(values, size = n, ...) {
  sample(values, size, replace = TRUE, ...)
}

# Return `x` with `count` randomly chosen positions overwritten by `dirt`.
# `dirt` may be of a different type than `x`: a string written into a numeric
# vector turns the whole vector into character.
with_dirt <- function(x, dirt, count = n_dirty) {
  force(x)  # Draw the column before the positions to keep the RNG order
  x[sample(seq_along(x), count)] <- dirt
  x
}

# Age between 18 and 90, with invalid string entries (column becomes character)
age <- with_dirt(draw(18:90), "InvalidAge")

# Workclass, with missing values
workclass <- with_dirt(
  draw(c(
    "Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov",
    "Local-gov", "State-gov", "Without-pay", "Never-worked"
  )),
  NA
)

# Education, with an inconsistent level
education <- with_dirt(
  draw(c(
    "Bachelors", "Some-college", "11th", "HS-grad", "Prof-school",
    "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters",
    "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"
  )),
  "InvalidEducation"
)

# Education Number: 1-16 (left clean)
education_num <- draw(1:16)

# Marital Status, with missing values
marital_status <- with_dirt(
  draw(c(
    "Married-civ-spouse", "Divorced", "Never-married", "Separated",
    "Widowed", "Married-spouse-absent", "Married-AF-spouse"
  )),
  NA
)

# Occupation, with an invalid level
occupation <- with_dirt(
  draw(c(
    "Tech-support", "Craft-repair", "Other-service", "Sales",
    "Exec-managerial", "Prof-specialty", "Handlers-cleaners",
    "Machine-op-inspct", "Adm-clerical", "Farming-fishing",
    "Transport-moving", "Priv-house-serv", "Protective-serv", "Armed-Forces"
  )),
  "InvalidOccupation"
)

# Relationship, with missing values
relationship <- with_dirt(
  draw(c(
    "Wife", "Own-child", "Husband", "Not-in-family", "Other-relative",
    "Unmarried"
  )),
  NA
)

# Race, with an inconsistent level
race <- with_dirt(
  draw(c("White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black")),
  "InvalidRace"
)

# Sex, with missing values
sex <- with_dirt(draw(c("Female", "Male")), NA)

# Capital Gain: 0-99999, with -999 written in as an error code
capital_gain <- with_dirt(draw(0:99999), -999)

# Capital Loss: 0-99999, with -999 written in as an error code
capital_loss <- with_dirt(draw(0:99999), -999)

# Hours per week: 1-99, with missing values
hours_per_week <- with_dirt(draw(1:99), NA)

# Native country, with missing values
native_country <- with_dirt(
  draw(c(
    "United-States", "Cambodia", "England", "Puerto-Rico", "Canada",
    "Germany", "Outlying-US(Guam-USVI-etc)", "India", "Japan", "Greece",
    "South", "China", "Cuba", "Iran", "Honduras", "Philippines", "Italy",
    "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal", "Ireland",
    "France", "Dominican-Republic", "Laos", "Ecuador", "Taiwan", "Haiti",
    "Columbia", "Hungary", "Guatemala", "Nicaragua", "Scotland", "Thailand",
    "Yugoslavia", "El-Salvador", "Trinadad&Tobago", "Peru", "Hong",
    "Holand-Netherlands"
  )),
  NA
)

# Income: <=50K drawn with probability 0.75, >50K with 0.25 (left clean)
income <- draw(c("<=50K", ">50K"), prob = c(0.75, 0.25))

# Create dataframe; column names are taken from the variable names
sample_dirty_dataset <- data.frame(
  age, workclass, education, education_num, marital_status, occupation,
  relationship, race, sex, capital_gain, capital_loss, hours_per_week,
  native_country, income
)

# Print first few rows of the dataset
head(sample_dirty_dataset)
#;-)   age        workclass        education education_num        marital_status
#;-) 1  48        Local-gov          7th-8th             1                  <NA>
#;-) 2  68             <NA>      Prof-school             1              Divorced
#;-) 3  31        State-gov        Bachelors             2 Married-spouse-absent
#;-) 4  84     Never-worked          7th-8th            10             Separated
#;-) 5  59          Private             12th            16    Married-civ-spouse
#;-) 6  67 Self-emp-not-inc InvalidEducation            12 Married-spouse-absent
#;-)          occupation   relationship               race    sex capital_gain
#;-) 1    Prof-specialty Other-relative              Black   Male         7826
#;-) 2   Priv-house-serv      Unmarried Asian-Pac-Islander Female        84532
#;-) 3   Protective-serv           Wife              Black   Male        35297
#;-) 4 Handlers-cleaners Other-relative Asian-Pac-Islander   Male        27074
#;-) 5 Machine-op-inspct Other-relative              White Female        73785
#;-) 6      Adm-clerical        Husband              Black   Male         2553
#;-)   capital_loss hours_per_week native_country income
#;-) 1        77249             60         Poland   >50K
#;-) 2        60981             73           <NA>  <=50K
#;-) 3        15093             98        Hungary  <=50K
#;-) 4        40203             NA          South  <=50K
#;-) 5         2056              4          Haiti  <=50K
#;-) 6        17031             70       Scotland  <=50K
# Standard output and standard error
#  Install the styler package in order to use `style = TRUE`.
# Session info
# Print session details (R version, platform, package versions) for this run
sessioninfo::session_info()
#;-) ─ Session info ───────────────────────────────────────────────────────────────
#;-)  setting  value
#;-)  version  R version 4.3.2 (2023-10-31)
#;-)  os       macOS Sonoma 14.4
#;-)  system   aarch64, darwin20
#;-)  ui       X11
#;-)  language (EN)
#;-)  collate  en_US.UTF-8
#;-)  ctype    en_US.UTF-8
#;-)  tz       Europe/Warsaw
#;-)  date     2024-03-19
#;-)  pandoc   3.1.1 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
#;-) 
#;-) ─ Packages ───────────────────────────────────────────────────────────────────
#;-)  package     * version date (UTC) lib source
#;-)  cli           3.6.2   2023-12-11 [2] CRAN (R 4.3.1)
#;-)  colorspace    2.1-0   2023-01-23 [2] CRAN (R 4.3.0)
#;-)  digest        0.6.34  2024-01-11 [2] CRAN (R 4.3.1)
#;-)  dplyr       * 1.1.4   2023-11-17 [2] CRAN (R 4.3.1)
#;-)  evaluate      0.23    2023-11-01 [2] CRAN (R 4.3.1)
#;-)  fansi         1.0.6   2023-12-08 [2] CRAN (R 4.3.1)
#;-)  fastmap       1.1.1   2023-02-24 [2] CRAN (R 4.3.0)
#;-)  forcats     * 1.0.0   2023-01-29 [2] CRAN (R 4.3.0)
#;-)  fs            1.6.3   2023-07-20 [2] CRAN (R 4.3.0)
#;-)  generics      0.1.3   2022-07-05 [2] CRAN (R 4.3.0)
#;-)  ggplot2     * 3.4.4   2023-10-12 [2] CRAN (R 4.3.1)
#;-)  glue          1.7.0   2024-01-09 [2] CRAN (R 4.3.1)
#;-)  gtable        0.3.4   2023-08-21 [2] CRAN (R 4.3.0)
#;-)  hms           1.1.3   2023-03-21 [2] CRAN (R 4.3.0)
#;-)  htmltools     0.5.7   2023-11-03 [2] CRAN (R 4.3.1)
#;-)  knitr         1.45    2023-10-30 [2] CRAN (R 4.3.1)
#;-)  lifecycle     1.0.4   2023-11-07 [2] CRAN (R 4.3.1)
#;-)  lubridate   * 1.9.3   2023-09-27 [2] CRAN (R 4.3.1)
#;-)  magrittr      2.0.3   2022-03-30 [2] CRAN (R 4.3.0)
#;-)  munsell       0.5.0   2018-06-12 [2] CRAN (R 4.3.0)
#;-)  pillar        1.9.0   2023-03-22 [2] CRAN (R 4.3.0)
#;-)  pkgconfig     2.0.3   2019-09-22 [2] CRAN (R 4.3.0)
#;-)  purrr       * 1.0.2   2023-08-10 [2] CRAN (R 4.3.0)
#;-)  R6            2.5.1   2021-08-19 [2] CRAN (R 4.3.0)
#;-)  readr       * 2.1.5   2024-01-10 [2] CRAN (R 4.3.1)
#;-)  reprex        2.1.0   2024-01-11 [2] CRAN (R 4.3.1)
#;-)  rlang         1.1.3   2024-01-10 [2] CRAN (R 4.3.1)
#;-)  rmarkdown     2.25    2023-09-18 [2] CRAN (R 4.3.1)
#;-)  rstudioapi    0.15.0  2023-07-07 [2] CRAN (R 4.3.0)
#;-)  scales        1.3.0   2023-11-28 [2] CRAN (R 4.3.1)
#;-)  sessioninfo   1.2.2   2021-12-06 [2] CRAN (R 4.3.0)
#;-)  stringi       1.8.3   2023-12-11 [2] CRAN (R 4.3.1)
#;-)  stringr     * 1.5.1   2023-11-14 [2] CRAN (R 4.3.1)
#;-)  tibble      * 3.2.1   2023-03-20 [2] CRAN (R 4.3.0)
#;-)  tidyr       * 1.3.0   2023-01-24 [2] CRAN (R 4.3.0)
#;-)  tidyselect    1.2.0   2022-10-10 [2] CRAN (R 4.3.0)
#;-)  tidyverse   * 2.0.0   2023-02-22 [2] CRAN (R 4.3.0)
#;-)  timechange    0.3.0   2024-01-18 [1] CRAN (R 4.3.1)
#;-)  tzdb          0.4.0   2023-05-12 [2] CRAN (R 4.3.0)
#;-)  utf8          1.2.4   2023-10-22 [2] CRAN (R 4.3.1)
#;-)  vctrs         0.6.5   2023-12-01 [2] CRAN (R 4.3.1)
#;-)  withr         3.0.0   2024-01-16 [2] CRAN (R 4.3.1)
#;-)  xfun          0.41    2023-11-01 [2] CRAN (R 4.3.1)
#;-)  yaml          2.3.8   2023-12-11 [2] CRAN (R 4.3.1)
#;-) 
#;-)  [1] /Users/Karol_1/Library/R/arm64/4.3/library
#;-)  [2] /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library
#;-) 
#;-) ──────────────────────────────────────────────────────────────────────────────