# Load necessary libraries
library(tidyverse)
# Set seed for reproducibility
set.seed(123)
# Generate sample data
n <- 1000 # Number of observations
# Age between 18 and 90
age <- sample(18:90, n, replace = TRUE)
# Introduce errors in age column
age[sample(1:n, 50)] <- "InvalidAge"
# Workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked
workclass <- sample(c("Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov", "Local-gov", "State-gov", "Without-pay", "Never-worked"), n, replace = TRUE)
# Introduce missing values in workclass column
workclass[sample(1:n, 50)] <- NA
# Education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool
education <- sample(c("Bachelors", "Some-college", "11th", "HS-grad", "Prof-school", "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters", "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"), n, replace = TRUE)
# Introduce inconsistencies in education column
education[sample(1:n, 50)] <- "InvalidEducation"
# Education Number: 1-16
education_num <- sample(1:16, n, replace = TRUE)
# Marital Status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse
marital_status <- sample(c("Married-civ-spouse", "Divorced", "Never-married", "Separated", "Widowed", "Married-spouse-absent", "Married-AF-spouse"), n, replace = TRUE)
# Introduce missing values in marital_status column
marital_status[sample(1:n, 50)] <- NA
# Occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces
occupation <- sample(c("Tech-support", "Craft-repair", "Other-service", "Sales", "Exec-managerial", "Prof-specialty", "Handlers-cleaners", "Machine-op-inspct", "Adm-clerical", "Farming-fishing", "Transport-moving", "Priv-house-serv", "Protective-serv", "Armed-Forces"), n, replace = TRUE)
# Introduce errors in occupation column
occupation[sample(1:n, 50)] <- "InvalidOccupation"
# Relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried
relationship <- sample(c("Wife", "Own-child", "Husband", "Not-in-family", "Other-relative", "Unmarried"), n, replace = TRUE)
# Introduce missing values in relationship column
relationship[sample(1:n, 50)] <- NA
# Race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black
race <- sample(c("White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"), n, replace = TRUE)
# Introduce inconsistencies in race column
race[sample(1:n, 50)] <- "InvalidRace"
# Sex: Female, Male
sex <- sample(c("Female", "Male"), n, replace = TRUE)
# Introduce missing values in sex column
sex[sample(1:n, 50)] <- NA
# Capital Gain: 0-99999
capital_gain <- sample(0:99999, n, replace = TRUE)
# Introduce errors in capital_gain column
capital_gain[sample(1:n, 50)] <- -999
# Capital Loss: 0-99999
capital_loss <- sample(0:99999, n, replace = TRUE)
# Introduce errors in capital_loss column
capital_loss[sample(1:n, 50)] <- -999
# Hours per week: 1-99
hours_per_week <- sample(1:99, n, replace = TRUE)
# Introduce missing values in hours_per_week column
hours_per_week[sample(1:n, 50)] <- NA
# Native country: United-States, Cambodia, England, ...
native_country <- sample(c("United-States", "Cambodia", "England", "Puerto-Rico", "Canada", "Germany", "Outlying-US(Guam-USVI-etc)", "India", "Japan", "Greece", "South", "China", "Cuba", "Iran", "Honduras", "Philippines", "Italy", "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal", "Ireland", "France", "Dominican-Republic", "Laos", "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary", "Guatemala", "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador", "Trinadad&Tobago", "Peru", "Hong", "Holand-Netherlands"), n, replace = TRUE)
# Introduce missing values in native_country column
native_country[sample(1:n, 50)] <- NA
# Income: <=50K, >50K
income <- sample(c("<=50K", ">50K"), n, replace = TRUE, prob = c(0.75, 0.25))
# Create dataframe
sample_dirty_dataset <- data.frame(age, workclass, education, education_num, marital_status, occupation, relationship, race, sex, capital_gain, capital_loss, hours_per_week, native_country, income)
kable(head(sample_dirty_dataset))
| age | workclass | education | education_num | marital_status | occupation | relationship | race | sex | capital_gain | capital_loss | hours_per_week | native_country | income |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 48 | Local-gov | 7th-8th | 1 | NA | Prof-specialty | Other-relative | Black | Male | 7826 | 77249 | 60 | Poland | >50K |
| 68 | NA | Prof-school | 1 | Divorced | Priv-house-serv | Unmarried | Asian-Pac-Islander | Female | 84532 | 60981 | 73 | NA | <=50K |
| 31 | State-gov | Bachelors | 2 | Married-spouse-absent | Protective-serv | Wife | Black | Male | 35297 | 15093 | 98 | Hungary | <=50K |
| 84 | Never-worked | 7th-8th | 10 | Separated | Handlers-cleaners | Other-relative | Asian-Pac-Islander | Male | 27074 | 40203 | NA | South | <=50K |
| 59 | Private | 12th | 16 | Married-civ-spouse | Machine-op-inspct | Other-relative | White | Female | 73785 | 2056 | 4 | Haiti | <=50K |
| 67 | Self-emp-not-inc | InvalidEducation | 12 | Married-spouse-absent | Adm-clerical | Husband | Black | Male | 2553 | 17031 | 70 | Scotland | <=50K |
diagnose_category(sample_dirty_dataset) %>% flextable()
variables | levels | N | freq | ratio | rank |
|---|---|---|---|---|---|
age | InvalidAge | 1,000 | 50 | 5.0 | 1 |
age | 89 | 1,000 | 21 | 2.1 | 2 |
age | 28 | 1,000 | 20 | 2.0 | 3 |
age | 52 | 1,000 | 20 | 2.0 | 3 |
age | 42 | 1,000 | 19 | 1.9 | 5 |
age | 84 | 1,000 | 19 | 1.9 | 5 |
age | 34 | 1,000 | 18 | 1.8 | 7 |
age | 56 | 1,000 | 18 | 1.8 | 7 |
age | 88 | 1,000 | 18 | 1.8 | 7 |
age | 26 | 1,000 | 17 | 1.7 | 10 |
workclass | Self-emp-inc | 1,000 | 134 | 13.4 | 1 |
workclass | Private | 1,000 | 130 | 13.0 | 2 |
workclass | Never-worked | 1,000 | 124 | 12.4 | 3 |
workclass | State-gov | 1,000 | 116 | 11.6 | 4 |
workclass | Without-pay | 1,000 | 115 | 11.5 | 5 |
workclass | Self-emp-not-inc | 1,000 | 114 | 11.4 | 6 |
workclass | Local-gov | 1,000 | 111 | 11.1 | 7 |
workclass | Federal-gov | 1,000 | 106 | 10.6 | 8 |
workclass | 1,000 | 50 | 5.0 | 9 | |
education | 5th-6th | 1,000 | 78 | 7.8 | 1 |
education | Masters | 1,000 | 68 | 6.8 | 2 |
education | 12th | 1,000 | 66 | 6.6 | 3 |
education | HS-grad | 1,000 | 66 | 6.6 | 3 |
education | Prof-school | 1,000 | 63 | 6.3 | 5 |
education | 9th | 1,000 | 62 | 6.2 | 6 |
education | 10th | 1,000 | 60 | 6.0 | 7 |
education | 1st-4th | 1,000 | 60 | 6.0 | 7 |
education | Some-college | 1,000 | 58 | 5.8 | 9 |
education | Assoc-acdm | 1,000 | 57 | 5.7 | 10 |
marital_status | Divorced | 1,000 | 155 | 15.5 | 1 |
marital_status | Married-spouse-absent | 1,000 | 139 | 13.9 | 2 |
marital_status | Married-AF-spouse | 1,000 | 136 | 13.6 | 3 |
marital_status | Separated | 1,000 | 135 | 13.5 | 4 |
marital_status | Married-civ-spouse | 1,000 | 129 | 12.9 | 5 |
marital_status | Never-married | 1,000 | 129 | 12.9 | 5 |
marital_status | Widowed | 1,000 | 127 | 12.7 | 7 |
marital_status | 1,000 | 50 | 5.0 | 8 | |
occupation | Tech-support | 1,000 | 87 | 8.7 | 1 |
occupation | Machine-op-inspct | 1,000 | 80 | 8.0 | 2 |
occupation | Armed-Forces | 1,000 | 79 | 7.9 | 3 |
occupation | Craft-repair | 1,000 | 75 | 7.5 | 4 |
occupation | Handlers-cleaners | 1,000 | 69 | 6.9 | 5 |
occupation | Adm-clerical | 1,000 | 67 | 6.7 | 6 |
occupation | Exec-managerial | 1,000 | 67 | 6.7 | 6 |
occupation | Sales | 1,000 | 64 | 6.4 | 8 |
occupation | Protective-serv | 1,000 | 63 | 6.3 | 9 |
occupation | Transport-moving | 1,000 | 63 | 6.3 | 9 |
relationship | Wife | 1,000 | 167 | 16.7 | 1 |
relationship | Not-in-family | 1,000 | 162 | 16.2 | 2 |
relationship | Husband | 1,000 | 160 | 16.0 | 3 |
relationship | Unmarried | 1,000 | 159 | 15.9 | 4 |
relationship | Other-relative | 1,000 | 152 | 15.2 | 5 |
relationship | Own-child | 1,000 | 150 | 15.0 | 6 |
relationship | 1,000 | 50 | 5.0 | 7 | |
race | White | 1,000 | 194 | 19.4 | 1 |
race | Amer-Indian-Eskimo | 1,000 | 193 | 19.3 | 2 |
race | Other | 1,000 | 189 | 18.9 | 3 |
race | Asian-Pac-Islander | 1,000 | 187 | 18.7 | 4 |
race | Black | 1,000 | 187 | 18.7 | 4 |
race | InvalidRace | 1,000 | 50 | 5.0 | 6 |
sex | Female | 1,000 | 499 | 49.9 | 1 |
sex | Male | 1,000 | 451 | 45.1 | 2 |
sex | 1,000 | 50 | 5.0 | 3 | |
native_country | 1,000 | 50 | 5.0 | 1 | |
native_country | Dominican-Republic | 1,000 | 32 | 3.2 | 2 |
native_country | Hungary | 1,000 | 32 | 3.2 | 2 |
native_country | Laos | 1,000 | 31 | 3.1 | 4 |
native_country | Portugal | 1,000 | 31 | 3.1 | 4 |
native_country | India | 1,000 | 30 | 3.0 | 6 |
native_country | Cambodia | 1,000 | 27 | 2.7 | 7 |
native_country | Ecuador | 1,000 | 27 | 2.7 | 7 |
native_country | Hong | 1,000 | 27 | 2.7 | 7 |
native_country | Jamaica | 1,000 | 27 | 2.7 | 7 |
income | <=50K | 1,000 | 776 | 77.6 | 1 |
income | >50K | 1,000 | 224 | 22.4 | 2 |
diagnose_numeric(sample_dirty_dataset) %>% flextable()
variables | min | Q1 | mean | median | Q3 | max | zero | minus | outlier |
|---|---|---|---|---|---|---|---|---|---|
education_num | 1 | 4.0 | 8.34500 | 9.0 | 12.00 | 16 | 0 | 0 | 0 |
capital_gain | -999 | 22,805.0 | 47,118.50900 | 47,667.0 | 72,652.25 | 99,960 | 0 | 50 | 0 |
capital_loss | -999 | 19,684.5 | 46,222.49000 | 45,288.5 | 71,802.25 | 99,944 | 0 | 50 | 0 |
hours_per_week | 1 | 23.0 | 48.37684 | 48.0 | 72.00 | 99 | 0 | 0 | 0 |
sample_dirty_dataset$education[sample_dirty_dataset$education=="9th"] <- "InvalidEducation"
sample_dirty_dataset$education[sample_dirty_dataset$education=="10th"] <- "InvalidEducation"
sample_dirty_dataset$education[sample_dirty_dataset$education=="11th"] <- "InvalidEducation"
sample_dirty_dataset <- replace_with_na(sample_dirty_dataset, replace = list(age = "InvalidAge", race = "InvalidRace" , capital_gain = "-999", capital_loss = "-999", education = "InvalidEducation"))
diagnose(sample_dirty_dataset) %>% flextable()
variables | types | missing_count | missing_percent | unique_count | unique_rate |
|---|---|---|---|---|---|
age | character | 50 | 5.0 | 74 | 0.074 |
workclass | character | 50 | 5.0 | 9 | 0.009 |
education | character | 218 | 21.8 | 14 | 0.014 |
education_num | integer | 0 | 0.0 | 16 | 0.016 |
marital_status | character | 50 | 5.0 | 8 | 0.008 |
occupation | character | 0 | 0.0 | 15 | 0.015 |
relationship | character | 50 | 5.0 | 7 | 0.007 |
race | character | 50 | 5.0 | 6 | 0.006 |
sex | character | 50 | 5.0 | 3 | 0.003 |
capital_gain | numeric | 50 | 5.0 | 947 | 0.947 |
capital_loss | numeric | 50 | 5.0 | 948 | 0.948 |
hours_per_week | integer | 50 | 5.0 | 100 | 0.100 |
native_country | character | 50 | 5.0 | 42 | 0.042 |
income | character | 0 | 0.0 | 2 | 0.002 |
diagnose_category(sample_dirty_dataset) %>% flextable()
variables | levels | N | freq | ratio | rank |
|---|---|---|---|---|---|
age | 1,000 | 50 | 5.0 | 1 | |
age | 89 | 1,000 | 21 | 2.1 | 2 |
age | 28 | 1,000 | 20 | 2.0 | 3 |
age | 52 | 1,000 | 20 | 2.0 | 3 |
age | 42 | 1,000 | 19 | 1.9 | 5 |
age | 84 | 1,000 | 19 | 1.9 | 5 |
age | 34 | 1,000 | 18 | 1.8 | 7 |
age | 56 | 1,000 | 18 | 1.8 | 7 |
age | 88 | 1,000 | 18 | 1.8 | 7 |
age | 26 | 1,000 | 17 | 1.7 | 10 |
workclass | Self-emp-inc | 1,000 | 134 | 13.4 | 1 |
workclass | Private | 1,000 | 130 | 13.0 | 2 |
workclass | Never-worked | 1,000 | 124 | 12.4 | 3 |
workclass | State-gov | 1,000 | 116 | 11.6 | 4 |
workclass | Without-pay | 1,000 | 115 | 11.5 | 5 |
workclass | Self-emp-not-inc | 1,000 | 114 | 11.4 | 6 |
workclass | Local-gov | 1,000 | 111 | 11.1 | 7 |
workclass | Federal-gov | 1,000 | 106 | 10.6 | 8 |
workclass | 1,000 | 50 | 5.0 | 9 | |
education | 1,000 | 218 | 21.8 | 1 | |
education | 5th-6th | 1,000 | 78 | 7.8 | 2 |
education | Masters | 1,000 | 68 | 6.8 | 3 |
education | 12th | 1,000 | 66 | 6.6 | 4 |
education | HS-grad | 1,000 | 66 | 6.6 | 4 |
education | Prof-school | 1,000 | 63 | 6.3 | 6 |
education | 1st-4th | 1,000 | 60 | 6.0 | 7 |
education | Some-college | 1,000 | 58 | 5.8 | 8 |
education | Assoc-acdm | 1,000 | 57 | 5.7 | 9 |
education | Assoc-voc | 1,000 | 57 | 5.7 | 9 |
marital_status | Divorced | 1,000 | 155 | 15.5 | 1 |
marital_status | Married-spouse-absent | 1,000 | 139 | 13.9 | 2 |
marital_status | Married-AF-spouse | 1,000 | 136 | 13.6 | 3 |
marital_status | Separated | 1,000 | 135 | 13.5 | 4 |
marital_status | Married-civ-spouse | 1,000 | 129 | 12.9 | 5 |
marital_status | Never-married | 1,000 | 129 | 12.9 | 5 |
marital_status | Widowed | 1,000 | 127 | 12.7 | 7 |
marital_status | 1,000 | 50 | 5.0 | 8 | |
occupation | Tech-support | 1,000 | 87 | 8.7 | 1 |
occupation | Machine-op-inspct | 1,000 | 80 | 8.0 | 2 |
occupation | Armed-Forces | 1,000 | 79 | 7.9 | 3 |
occupation | Craft-repair | 1,000 | 75 | 7.5 | 4 |
occupation | Handlers-cleaners | 1,000 | 69 | 6.9 | 5 |
occupation | Adm-clerical | 1,000 | 67 | 6.7 | 6 |
occupation | Exec-managerial | 1,000 | 67 | 6.7 | 6 |
occupation | Sales | 1,000 | 64 | 6.4 | 8 |
occupation | Protective-serv | 1,000 | 63 | 6.3 | 9 |
occupation | Transport-moving | 1,000 | 63 | 6.3 | 9 |
relationship | Wife | 1,000 | 167 | 16.7 | 1 |
relationship | Not-in-family | 1,000 | 162 | 16.2 | 2 |
relationship | Husband | 1,000 | 160 | 16.0 | 3 |
relationship | Unmarried | 1,000 | 159 | 15.9 | 4 |
relationship | Other-relative | 1,000 | 152 | 15.2 | 5 |
relationship | Own-child | 1,000 | 150 | 15.0 | 6 |
relationship | 1,000 | 50 | 5.0 | 7 | |
race | White | 1,000 | 194 | 19.4 | 1 |
race | Amer-Indian-Eskimo | 1,000 | 193 | 19.3 | 2 |
race | Other | 1,000 | 189 | 18.9 | 3 |
race | Asian-Pac-Islander | 1,000 | 187 | 18.7 | 4 |
race | Black | 1,000 | 187 | 18.7 | 4 |
race | 1,000 | 50 | 5.0 | 6 | |
sex | Female | 1,000 | 499 | 49.9 | 1 |
sex | Male | 1,000 | 451 | 45.1 | 2 |
sex | 1,000 | 50 | 5.0 | 3 | |
native_country | 1,000 | 50 | 5.0 | 1 | |
native_country | Dominican-Republic | 1,000 | 32 | 3.2 | 2 |
native_country | Hungary | 1,000 | 32 | 3.2 | 2 |
native_country | Laos | 1,000 | 31 | 3.1 | 4 |
native_country | Portugal | 1,000 | 31 | 3.1 | 4 |
native_country | India | 1,000 | 30 | 3.0 | 6 |
native_country | Cambodia | 1,000 | 27 | 2.7 | 7 |
native_country | Ecuador | 1,000 | 27 | 2.7 | 7 |
native_country | Hong | 1,000 | 27 | 2.7 | 7 |
native_country | Jamaica | 1,000 | 27 | 2.7 | 7 |
income | <=50K | 1,000 | 776 | 77.6 | 1 |
income | >50K | 1,000 | 224 | 22.4 | 2 |
diagnose_numeric(sample_dirty_dataset) %>% flextable()
variables | min | Q1 | mean | median | Q3 | max | zero | minus | outlier |
|---|---|---|---|---|---|---|---|---|---|
education_num | 1 | 4.00 | 8.34500 | 9.0 | 12 | 16 | 0 | 0 | 0 |
capital_gain | 20 | 25,714.75 | 49,651.00947 | 50,639.0 | 74,052 | 99,960 | 0 | 0 | 0 |
capital_loss | 72 | 23,732.25 | 48,707.83158 | 48,896.5 | 73,255 | 99,944 | 0 | 0 | 0 |
hours_per_week | 1 | 23.00 | 48.37684 | 48.0 | 72 | 99 | 0 | 0 | 0 |
diagnose_outlier(sample_dirty_dataset) %>% flextable()
variables | outliers_cnt | outliers_ratio | outliers_mean | with_mean | without_mean |
|---|---|---|---|---|---|
education_num | 0 | 0 | 8.34500 | 8.34500 | |
capital_gain | 0 | 0 | 49,651.00947 | 49,651.00947 | |
capital_loss | 0 | 0 | 48,707.83158 | 48,707.83158 | |
hours_per_week | 0 | 0 | 48.37684 | 48.37684 |
plot_na_pareto(sample_dirty_dataset)
plot_na_hclust(sample_dirty_dataset)
#Detecting missing values
sample_dirty_dataset %>%
summarise(count = sum(is.na(sample_dirty_dataset)))
## count
## 1 718
which(is.na(sample_dirty_dataset))
## [1] 60 98 132 145 163 167 179 202 213 230 256 261
## [13] 287 290 306 309 310 323 352 366 374 384 407 424
## [25] 467 468 477 484 492 504 529 583 589 603 608 611
## [37] 645 659 683 706 737 751 752 857 862 882 912 915
## [49] 925 933 1002 1037 1087 1108 1133 1179 1184 1192 1224 1253
## [61] 1275 1289 1320 1340 1367 1369 1374 1456 1470 1473 1497 1504
## [73] 1520 1543 1569 1585 1666 1675 1677 1686 1695 1728 1744 1768
## [85] 1773 1780 1781 1785 1818 1820 1827 1828 1829 1858 1863 1899
## [97] 1905 1949 1974 1993 2006 2009 2019 2025 2028 2031 2033 2035
## [109] 2037 2040 2043 2047 2054 2056 2064 2065 2067 2069 2079 2089
## [121] 2092 2093 2099 2102 2108 2109 2113 2115 2116 2118 2119 2121
## [133] 2124 2129 2131 2134 2137 2138 2144 2146 2166 2169 2173 2183
## [145] 2187 2188 2190 2197 2199 2203 2206 2215 2217 2222 2227 2251
## [157] 2254 2260 2269 2291 2293 2295 2305 2308 2312 2313 2316 2323
## [169] 2326 2334 2344 2345 2350 2356 2361 2362 2369 2373 2385 2386
## [181] 2387 2396 2397 2400 2410 2412 2418 2420 2421 2422 2429 2436
## [193] 2441 2452 2456 2457 2458 2459 2471 2477 2480 2489 2490 2492
## [205] 2494 2496 2504 2506 2509 2514 2520 2523 2530 2535 2539 2544
## [217] 2546 2559 2568 2570 2577 2579 2581 2582 2584 2603 2611 2613
## [229] 2617 2624 2625 2647 2654 2658 2659 2660 2663 2667 2669 2677
## [241] 2678 2680 2681 2684 2692 2697 2702 2703 2708 2710 2712 2716
## [253] 2719 2726 2729 2732 2742 2744 2746 2750 2752 2762 2768 2770
## [265] 2783 2784 2785 2788 2789 2799 2800 2801 2804 2805 2807 2811
## [277] 2813 2822 2827 2831 2839 2840 2843 2848 2851 2859 2866 2872
## [289] 2880 2883 2886 2892 2894 2896 2900 2920 2925 2926 2931 2936
## [301] 2939 2940 2941 2942 2945 2946 2959 2960 2972 2979 2980 2983
## [313] 2984 2990 2991 2993 2998 2999 4001 4016 4022 4038 4044 4056
## [325] 4057 4115 4141 4149 4160 4189 4249 4273 4291 4292 4293 4346
## [337] 4361 4365 4366 4387 4465 4466 4522 4524 4533 4534 4554 4574
## [349] 4576 4589 4591 4600 4610 4691 4711 4724 4730 4747 4781 4821
## [361] 4894 4907 4922 4931 4943 4963 4964 4990 6028 6075 6079 6100
## [373] 6135 6139 6164 6172 6190 6245 6248 6266 6268 6277 6288 6310
## [385] 6315 6336 6338 6341 6391 6439 6444 6484 6517 6537 6625 6633
## [397] 6636 6657 6677 6691 6720 6726 6727 6754 6774 6782 6786 6788
## [409] 6796 6801 6831 6835 6836 6885 6919 6938 6943 6994 7013 7014
## [421] 7030 7033 7103 7110 7117 7135 7145 7163 7178 7244 7272 7277
## [433] 7283 7294 7298 7312 7323 7324 7380 7395 7461 7488 7491 7513
## [445] 7522 7541 7564 7595 7604 7619 7635 7709 7711 7713 7714 7739
## [457] 7754 7787 7789 7823 7837 7862 7883 7924 7964 7973 7980 7986
## [469] 8031 8074 8078 8099 8123 8138 8173 8174 8217 8224 8291 8324
## [481] 8330 8342 8360 8382 8410 8416 8419 8440 8466 8475 8476 8549
## [493] 8580 8624 8635 8640 8666 8684 8737 8744 8750 8758 8791 8814
## [505] 8822 8841 8853 8868 8876 8877 8913 8918 8925 8973 8975 8978
## [517] 8986 9000 9038 9098 9115 9122 9129 9169 9176 9192 9214 9270
## [529] 9278 9289 9328 9346 9385 9391 9417 9422 9424 9431 9438 9457
## [541] 9466 9472 9500 9508 9538 9568 9571 9574 9600 9601 9607 9657
## [553] 9678 9744 9758 9777 9804 9814 9833 9881 9915 9919 9925 9946
## [565] 9947 9948 9956 9958 10060 10133 10136 10147 10157 10171 10186 10217
## [577] 10230 10233 10249 10259 10268 10285 10299 10308 10329 10332 10366 10373
## [589] 10375 10378 10393 10400 10457 10481 10531 10551 10619 10657 10660 10662
## [601] 10684 10708 10728 10734 10752 10763 10768 10770 10789 10806 10817 10836
## [613] 10848 10867 10924 10925 10952 10958 11004 11018 11022 11037 11116 11123
## [625] 11129 11149 11169 11170 11177 11185 11203 11222 11228 11235 11245 11321
## [637] 11340 11375 11385 11412 11483 11502 11532 11549 11573 11579 11608 11629
## [649] 11639 11669 11685 11704 11723 11730 11738 11742 11745 11791 11796 11816
## [661] 11832 11855 11870 11876 11917 11943 11983 11999 12002 12014 12020 12037
## [673] 12039 12053 12082 12095 12119 12179 12192 12194 12198 12202 12233 12249
## [685] 12250 12252 12271 12274 12345 12354 12362 12366 12367 12446 12466 12508
## [697] 12511 12571 12580 12604 12636 12642 12697 12698 12743 12771 12791 12792
## [709] 12800 12803 12871 12880 12881 12935 12946 12976 12987 12996
#Imputing (only columns with numerical values)
sample_dirty_dataset$age[is.na(sample_dirty_dataset$age)] <- as.integer(mean(sample_dirty_dataset$age, na.rm = TRUE))
## Warning in mean.default(sample_dirty_dataset$age, na.rm = TRUE): argument nie
## jest wartością liczbową ani logiczną: zwracanie wartości NA
sample_dirty_dataset$capital_gain[is.na(sample_dirty_dataset$capital_gain)] <- as.integer(mean(sample_dirty_dataset$capital_gain, na.rm = TRUE))
sample_dirty_dataset$capital_loss[is.na(sample_dirty_dataset$capital_loss)] <- as.integer(mean(sample_dirty_dataset$capital_loss, na.rm = TRUE))
#Removing rows with missing values (which are not numerical)
sample_dirty_dataset <- sample_dirty_dataset %>% drop_na()
#Methods for Detection - Using zscore
sample_dirty_dataset$age <- as.numeric(as.character(sample_dirty_dataset$age))
zscore <- abs(scale(sample_dirty_dataset$age))
plot(zscore, type = "n")
abline(h = 1.0, col = "blue")
text(1:length(sample_dirty_dataset$age), zscore)
#Methods for Detection - Using boxplot
boxplot(sample_dirty_dataset$capital_gain)
boxplot(sample_dirty_dataset$capital_gain)$out
## numeric(0)
which(sample_dirty_dataset$capital_gain %in% boxplot(sample_dirty_dataset$capital_gain)$out)
## integer(0)
#Methods for Detection - Using Mahalanobis Charts
par(mfrow=c(2,2))
plot(covMcd(sample_dirty_dataset$hours_per_week))
#sample_dirty_dataset %>%
# diagnose_web_report(subtitle = "diagnose",
# output_file = "Diagn.html", theme = "blue")
#sample_dirty_dataset %>%
#eda_web_report( subtitle = "eda", output_file = "EDA.html", theme = "blue")
#sample_dirty_dataset %>%
#transformation_web_report(subtitle = "trans", output_file = "transformation.html",
#theme = "blue")