# Load necessary libraries
library(tidyverse)
#;-) ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#;-) ✔ dplyr 1.1.4 ✔ readr 2.1.5
#;-) ✔ forcats 1.0.0 ✔ stringr 1.5.1
#;-) ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
#;-) ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
#;-) ✔ purrr 1.0.2
#;-) ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#;-) ✖ dplyr::filter() masks stats::filter()
#;-) ✖ dplyr::lag() masks stats::lag()
#;-) ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Set seed for reproducibility
set.seed(123)

# Generate sample data ----
# Builds a synthetic census-style data set and deliberately contaminates
# most columns with invalid markers, negative sentinels or missing values.
# NOTE: the order of the sample() calls fixes the random stream; reordering
# them changes the generated data.
n <- 1000 # Number of observations
# Number of cells corrupted (or blanked) in each dirty column; capped at n so
# that sampling without replacement cannot fail for small n.
n_corrupt <- min(50, n)

# Age between 18 and 90
age <- sample(18:90, n, replace = TRUE)
# Introduce errors in age column. Assigning a string coerces the whole vector
# to character, so age must be converted back to numeric during cleaning.
age[sample(seq_len(n), n_corrupt)] <- "InvalidAge"

# Workclass
workclass <- sample(
  c(
    "Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov",
    "Local-gov", "State-gov", "Without-pay", "Never-worked"
  ),
  n,
  replace = TRUE
)
# Introduce missing values in workclass column
workclass[sample(seq_len(n), n_corrupt)] <- NA

# Education
education <- sample(
  c(
    "Bachelors", "Some-college", "11th", "HS-grad", "Prof-school",
    "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters",
    "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"
  ),
  n,
  replace = TRUE
)
# Introduce inconsistencies in education column
education[sample(seq_len(n), n_corrupt)] <- "InvalidEducation"

# Education Number: 1-16 (left clean)
education_num <- sample(1:16, n, replace = TRUE)

# Marital Status
marital_status <- sample(
  c(
    "Married-civ-spouse", "Divorced", "Never-married", "Separated",
    "Widowed", "Married-spouse-absent", "Married-AF-spouse"
  ),
  n,
  replace = TRUE
)
# Introduce missing values in marital_status column
marital_status[sample(seq_len(n), n_corrupt)] <- NA

# Occupation
occupation <- sample(
  c(
    "Tech-support", "Craft-repair", "Other-service", "Sales",
    "Exec-managerial", "Prof-specialty", "Handlers-cleaners",
    "Machine-op-inspct", "Adm-clerical", "Farming-fishing",
    "Transport-moving", "Priv-house-serv", "Protective-serv", "Armed-Forces"
  ),
  n,
  replace = TRUE
)
# Introduce errors in occupation column
occupation[sample(seq_len(n), n_corrupt)] <- "InvalidOccupation"

# Relationship
relationship <- sample(
  c(
    "Wife", "Own-child", "Husband", "Not-in-family", "Other-relative",
    "Unmarried"
  ),
  n,
  replace = TRUE
)
# Introduce missing values in relationship column
relationship[sample(seq_len(n), n_corrupt)] <- NA

# Race
race <- sample(
  c("White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"),
  n,
  replace = TRUE
)
# Introduce inconsistencies in race column
race[sample(seq_len(n), n_corrupt)] <- "InvalidRace"

# Sex: Female, Male
sex <- sample(c("Female", "Male"), n, replace = TRUE)
# Introduce missing values in sex column
sex[sample(seq_len(n), n_corrupt)] <- NA

# Capital Gain: 0-99999
capital_gain <- sample(0:99999, n, replace = TRUE)
# Introduce errors in capital_gain column (-999 is the invalid sentinel)
capital_gain[sample(seq_len(n), n_corrupt)] <- -999

# Capital Loss: 0-99999
capital_loss <- sample(0:99999, n, replace = TRUE)
# Introduce errors in capital_loss column (-999 is the invalid sentinel)
capital_loss[sample(seq_len(n), n_corrupt)] <- -999

# Hours per week: 1-99
hours_per_week <- sample(1:99, n, replace = TRUE)
# Introduce missing values in hours_per_week column
hours_per_week[sample(seq_len(n), n_corrupt)] <- NA

# Native country
native_country <- sample(
  c(
    "United-States", "Cambodia", "England", "Puerto-Rico", "Canada",
    "Germany", "Outlying-US(Guam-USVI-etc)", "India", "Japan", "Greece",
    "South", "China", "Cuba", "Iran", "Honduras", "Philippines", "Italy",
    "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal", "Ireland",
    "France", "Dominican-Republic", "Laos", "Ecuador", "Taiwan", "Haiti",
    "Columbia", "Hungary", "Guatemala", "Nicaragua", "Scotland", "Thailand",
    "Yugoslavia", "El-Salvador", "Trinadad&Tobago", "Peru", "Hong",
    "Holand-Netherlands"
  ),
  n,
  replace = TRUE
)
# Introduce missing values in native_country column
native_country[sample(seq_len(n), n_corrupt)] <- NA

# Income: <=50K (75%), >50K (25%); left clean
income <- sample(c("<=50K", ">50K"), n, replace = TRUE, prob = c(0.75, 0.25))

# Create dataframe
sample_dirty_dataset <- data.frame(
  age, workclass, education, education_num, marital_status, occupation,
  relationship, race, sex, capital_gain, capital_loss, hours_per_week,
  native_country, income
)
# Print first few rows of the dataset
head(sample_dirty_dataset)
#;-) age workclass education education_num marital_status
#;-) 1 48 Local-gov 7th-8th 1 <NA>
#;-) 2 68 <NA> Prof-school 1 Divorced
#;-) 3 31 State-gov Bachelors 2 Married-spouse-absent
#;-) 4 84 Never-worked 7th-8th 10 Separated
#;-) 5 59 Private 12th 16 Married-civ-spouse
#;-) 6 67 Self-emp-not-inc InvalidEducation 12 Married-spouse-absent
#;-) occupation relationship race sex capital_gain
#;-) 1 Prof-specialty Other-relative Black Male 7826
#;-) 2 Priv-house-serv Unmarried Asian-Pac-Islander Female 84532
#;-) 3 Protective-serv Wife Black Male 35297
#;-) 4 Handlers-cleaners Other-relative Asian-Pac-Islander Male 27074
#;-) 5 Machine-op-inspct Other-relative White Female 73785
#;-) 6 Adm-clerical Husband Black Male 2553
#;-) capital_loss hours_per_week native_country income
#;-) 1 77249 60 Poland >50K
#;-) 2 60981 73 <NA> <=50K
#;-) 3 15093 98 Hungary <=50K
#;-) 4 40203 NA South <=50K
#;-) 5 2056 4 Haiti <=50K
#;-) 6 17031 70 Scotland <=50K
# Identify erroneous values ----
# Replace the placeholder markers planted in the dirty data with NA so that
# all bad cells are represented the same way before imputation/filtering.

# Text columns: each carries its own "Invalid..." marker string.
invalid_labels <- c(
  age = "InvalidAge",
  occupation = "InvalidOccupation",
  education = "InvalidEducation",
  race = "InvalidRace"
)
for (column in names(invalid_labels)) {
  # %in% never yields NA, so cells that are already missing are left alone.
  is_invalid <- sample_dirty_dataset[[column]] %in% invalid_labels[[column]]
  sample_dirty_dataset[[column]][is_invalid] <- NA
}

# Monetary columns: negative amounts are invalid.
for (column in c("capital_gain", "capital_loss")) {
  # which() drops NA comparisons, keeping the assignment index NA-free.
  is_negative <- which(sample_dirty_dataset[[column]] < 0)
  sample_dirty_dataset[[column]][is_negative] <- NA
}

head(sample_dirty_dataset)
#;-) age workclass education education_num marital_status
#;-) 1 48 Local-gov 7th-8th 1 <NA>
#;-) 2 68 <NA> Prof-school 1 Divorced
#;-) 3 31 State-gov Bachelors 2 Married-spouse-absent
#;-) 4 84 Never-worked 7th-8th 10 Separated
#;-) 5 59 Private 12th 16 Married-civ-spouse
#;-) 6 67 Self-emp-not-inc <NA> 12 Married-spouse-absent
#;-) occupation relationship race sex capital_gain
#;-) 1 Prof-specialty Other-relative Black Male 7826
#;-) 2 Priv-house-serv Unmarried Asian-Pac-Islander Female 84532
#;-) 3 Protective-serv Wife Black Male 35297
#;-) 4 Handlers-cleaners Other-relative Asian-Pac-Islander Male 27074
#;-) 5 Machine-op-inspct Other-relative White Female 73785
#;-) 6 Adm-clerical Husband Black Male 2553
#;-) capital_loss hours_per_week native_country income
#;-) 1 77249 60 Poland >50K
#;-) 2 60981 73 <NA> <=50K
#;-) 3 15093 98 Hungary <=50K
#;-) 4 40203 NA South <=50K
#;-) 5 2056 4 Haiti <=50K
#;-) 6 17031 70 Scotland <=50K
# Change missing numeric values to the column mean; remove incomplete rows ----

# Convert the data-frame column itself (not the free-standing `age` vector):
# the column has already had "InvalidAge" replaced by NA, so the conversion
# is warning-free and stays in step with the rows of the data frame.
sample_dirty_dataset$age <- as.numeric(sample_dirty_dataset$age)

# Mean-impute the numeric columns that contain missing values. Integer
# columns become double because the mean is generally not a whole number.
impute_cols <- c("age", "capital_gain", "capital_loss", "hours_per_week")
sample_dirty_dataset[impute_cols] <- lapply(
  sample_dirty_dataset[impute_cols],
  function(values) {
    values[is.na(values)] <- mean(values, na.rm = TRUE)
    values
  }
)

# Flag rows that have an NA in any character column.
is_text_col <- vapply(sample_dirty_dataset, is.character, logical(1))
rows_with_na <- !complete.cases(sample_dirty_dataset[is_text_col])
# Subset the dataset to remove rows with NA values in character columns
cleaned_dataset <- sample_dirty_dataset[!rows_with_na, ]
head(cleaned_dataset)
#;-) age workclass education education_num marital_status
#;-) 3 31 State-gov Bachelors 2 Married-spouse-absent
#;-) 4 84 Never-worked 7th-8th 10 Separated
#;-) 5 59 Private 12th 16 Married-civ-spouse
#;-) 7 60 Never-worked Prof-school 11 Married-spouse-absent
#;-) 8 31 Self-emp-not-inc 5th-6th 13 Married-AF-spouse
#;-) 10 86 Local-gov Masters 12 Widowed
#;-) occupation relationship race sex capital_gain
#;-) 3 Protective-serv Wife Black Male 35297
#;-) 4 Handlers-cleaners Other-relative Asian-Pac-Islander Male 27074
#;-) 5 Machine-op-inspct Other-relative White Female 73785
#;-) 7 Adm-clerical Other-relative Other Female 19616
#;-) 8 Handlers-cleaners Husband Asian-Pac-Islander Male 91705
#;-) 10 Prof-specialty Not-in-family Asian-Pac-Islander Female 21279
#;-) capital_loss hours_per_week native_country income
#;-) 3 15093 98.00000 Hungary <=50K
#;-) 4 40203 48.37684 South <=50K
#;-) 5 2056 4.00000 Haiti <=50K
#;-) 7 51961 15.00000 Cambodia <=50K
#;-) 8 46300 96.00000 Philippines <=50K
#;-) 10 52751 11.00000 Thailand <=50K
library(ggplot2)

# Box plot of capital gain: a single box (constant x) for spotting outliers.
cleaned_dataset %>%
  ggplot(aes(x = 1, y = capital_gain)) +
  geom_boxplot() +
  labs(x = "Distribution of Capital Gain", y = "Capital Gain")

# Absolute z-scores of age, drawn in row order; the red line marks |z| = 2.
zscore <- abs(scale(cleaned_dataset$age))
plot(
  zscore,
  type = "l",
  main = "Z-scores for Age",
  xlab = "Observation",
  ylab = "Z-score",
  col = "blue"
)
abline(h = 2, col = "red")
library(rrcov)
#;-) Warning: package 'rrcov' was built under R version 4.3.3
#;-) Loading required package: robustbase
#;-) Warning: package 'robustbase' was built under R version 4.3.3
#;-)
#;-) Attaching package: 'robustbase'
#;-) The following object is masked _by_ '.GlobalEnv':
#;-)
#;-) education
#;-) Scalable Robust Estimators with High Breakdown Point (version 1.7-5)
# Diagnostic plots of the MCD estimate for hours_per_week on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(covMcd(cleaned_dataset$hours_per_week))

# Automated reports ----
# In the recorded run every call below failed with
# 'could not find function "..."': none of the packages attached by this
# script provides them.
# NOTE(review): these presumably come from the dlookr package -- confirm and
# attach it next to the other library() calls.
# Until then the calls are skipped instead of aborting the script.
report_fns <- c(
  "diagnose_web_report", "eda_web_report", "transformation_paged_report"
)
reports_available <- all(vapply(
  report_fns,
  function(fn) exists(fn, mode = "function"),
  logical(1)
))

if (reports_available) {
  diagnose_web_report(cleaned_dataset)
  eda_web_report(cleaned_dataset)
  cleaned_dataset %>%
    transformation_paged_report(
      target = "hours_per_week",
      subtitle = "cleaned_dataset",
      output_dir = "./",
      output_file = "transformation.pdf",
      theme = "blue"
    )
} else {
  message(
    "Skipping report generation; function(s) not found: ",
    paste(report_fns, collapse = ", ")
  )
}

sessionInfo()
#;-) R version 4.3.2 (2023-10-31 ucrt)
#;-) Platform: x86_64-w64-mingw32/x64 (64-bit)
#;-) Running under: Windows 11 x64 (build 22631)
#;-)
#;-) Matrix products: default
#;-)
#;-)
#;-) locale:
#;-) [1] LC_COLLATE=English_United States.utf8
#;-) [2] LC_CTYPE=English_United States.utf8
#;-) [3] LC_MONETARY=English_United States.utf8
#;-) [4] LC_NUMERIC=C
#;-) [5] LC_TIME=English_United States.utf8
#;-)
#;-) time zone: Europe/Warsaw
#;-) tzcode source: internal
#;-)
#;-) attached base packages:
#;-) [1] stats graphics grDevices utils datasets methods base
#;-)
#;-) other attached packages:
#;-) [1] rrcov_1.7-5 robustbase_0.99-2 lubridate_1.9.3 forcats_1.0.0
#;-) [5] stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2 readr_2.1.5
#;-) [9] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.4.4 tidyverse_2.0.0
#;-)
#;-) loaded via a namespace (and not attached):
#;-) [1] utf8_1.2.4 generics_0.1.3 xml2_1.3.6 stringi_1.8.3
#;-) [5] lattice_0.21-9 hms_1.1.3 digest_0.6.34 magrittr_2.0.3
#;-) [9] evaluate_0.23 grid_4.3.2 timechange_0.3.0 mvtnorm_1.2-4
#;-) [13] fastmap_1.1.1 fansi_1.0.6 scales_1.3.0 cli_3.6.2
#;-) [17] rlang_1.1.3 munsell_0.5.0 reprex_2.1.0 withr_3.0.0
#;-) [21] yaml_2.3.8 tools_4.3.2 tzdb_0.4.0 colorspace_2.1-0
#;-) [25] curl_5.2.0 vctrs_0.6.5 R6_2.5.1 stats4_4.3.2
#;-) [29] lifecycle_1.0.4 fs_1.6.3 pcaPP_2.0-4 pkgconfig_2.0.3
#;-) [33] pillar_1.9.0 gtable_0.3.4 glue_1.7.0 DEoptimR_1.1-3
#;-) [37] xfun_0.42 tidyselect_1.2.0 highr_0.10 rstudioapi_0.15.0
#;-) [41] knitr_1.45 farver_2.1.1 htmltools_0.5.7 rmarkdown_2.25
#;-) [45] labeling_0.4.3 compiler_4.3.2