Data Clensing Report

Sample data

# Load necessary libraries
library(tidyverse)

# Set seed for reproducibility
set.seed(123)

# Generate sample data
n <- 1000  # Number of observations

# Age between 18 and 90
age <- sample(18:90, n, replace = TRUE)

# Introduce errors in age column
age[sample(1:n, 50)] <- "InvalidAge"

# Workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked
workclass <- sample(c("Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov", "Local-gov", "State-gov", "Without-pay", "Never-worked"), n, replace = TRUE)

# Introduce missing values in workclass column
workclass[sample(1:n, 50)] <- NA

# Education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool
education <- sample(c("Bachelors", "Some-college", "11th", "HS-grad", "Prof-school", "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters", "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"), n, replace = TRUE)

# Introduce inconsistencies in education column
education[sample(1:n, 50)] <- "InvalidEducation"

# Education Number: 1-16
education_num <- sample(1:16, n, replace = TRUE)

# Marital Status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse
marital_status <- sample(c("Married-civ-spouse", "Divorced", "Never-married", "Separated", "Widowed", "Married-spouse-absent", "Married-AF-spouse"), n, replace = TRUE)

# Introduce missing values in marital_status column
marital_status[sample(1:n, 50)] <- NA

# Occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces
occupation <- sample(c("Tech-support", "Craft-repair", "Other-service", "Sales", "Exec-managerial", "Prof-specialty", "Handlers-cleaners", "Machine-op-inspct", "Adm-clerical", "Farming-fishing", "Transport-moving", "Priv-house-serv", "Protective-serv", "Armed-Forces"), n, replace = TRUE)

# Introduce errors in occupation column
occupation[sample(1:n, 50)] <- "InvalidOccupation"

# Relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried
relationship <- sample(c("Wife", "Own-child", "Husband", "Not-in-family", "Other-relative", "Unmarried"), n, replace = TRUE)

# Introduce missing values in relationship column
relationship[sample(1:n, 50)] <- NA

# Race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black
race <- sample(c("White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"), n, replace = TRUE)

# Introduce inconsistencies in race column
race[sample(1:n, 50)] <- "InvalidRace"

# Sex: Female, Male
sex <- sample(c("Female", "Male"), n, replace = TRUE)

# Introduce missing values in sex column
sex[sample(1:n, 50)] <- NA

# Capital Gain: 0-99999
capital_gain <- sample(0:99999, n, replace = TRUE)

# Introduce errors in capital_gain column
capital_gain[sample(1:n, 50)] <- -999

# Capital Loss: 0-99999
capital_loss <- sample(0:99999, n, replace = TRUE)

# Introduce errors in capital_loss column
capital_loss[sample(1:n, 50)] <- -999

# Hours per week: 1-99
hours_per_week <- sample(1:99, n, replace = TRUE)

# Introduce missing values in hours_per_week column
hours_per_week[sample(1:n, 50)] <- NA

# Native country: United-States, Cambodia, England, ...
native_country <- sample(c("United-States", "Cambodia", "England", "Puerto-Rico", "Canada", "Germany", "Outlying-US(Guam-USVI-etc)", "India", "Japan", "Greece", "South", "China", "Cuba", "Iran", "Honduras", "Philippines", "Italy", "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal", "Ireland", "France", "Dominican-Republic", "Laos", "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary", "Guatemala", "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador", "Trinadad&Tobago", "Peru", "Hong", "Holand-Netherlands"), n, replace = TRUE)

# Introduce missing values in native_country column
native_country[sample(1:n, 50)] <- NA

# Income: <=50K, >50K
income <- sample(c("<=50K", ">50K"), n, replace = TRUE, prob = c(0.75, 0.25))

# Create dataframe
sample_dirty_dataset <- data.frame(age, workclass, education, education_num, marital_status, occupation, relationship, race, sex, capital_gain, capital_loss, hours_per_week, native_country, income)

kable(head(sample_dirty_dataset))

age	workclass	education	education_num	marital_status	occupation	relationship	race	sex	capital_gain	capital_loss	hours_per_week	native_country	income
48	Local-gov	7th-8th	1	NA	Prof-specialty	Other-relative	Black	Male	7826	77249	60	Poland	>50K
68	NA	Prof-school	1	Divorced	Priv-house-serv	Unmarried	Asian-Pac-Islander	Female	84532	60981	73	NA	<=50K
31	State-gov	Bachelors	2	Married-spouse-absent	Protective-serv	Wife	Black	Male	35297	15093	98	Hungary	<=50K
84	Never-worked	7th-8th	10	Separated	Handlers-cleaners	Other-relative	Asian-Pac-Islander	Male	27074	40203	NA	South	<=50K
59	Private	12th	16	Married-civ-spouse	Machine-op-inspct	Other-relative	White	Female	73785	2056	4	Haiti	<=50K
67	Self-emp-not-inc	InvalidEducation	12	Married-spouse-absent	Adm-clerical	Husband	Black	Male	2553	17031	70	Scotland	<=50K

Diagnose category 1

diagnose_category(sample_dirty_dataset) %>% flextable()

variables	levels	N	freq	ratio	rank
age	InvalidAge	1,000	50	5.0	1
age	89	1,000	21	2.1	2
age	28	1,000	20	2.0	3
age	52	1,000	20	2.0	3
age	42	1,000	19	1.9	5
age	84	1,000	19	1.9	5
age	34	1,000	18	1.8	7
age	56	1,000	18	1.8	7
age	88	1,000	18	1.8	7
age	26	1,000	17	1.7	10
workclass	Self-emp-inc	1,000	134	13.4	1
workclass	Private	1,000	130	13.0	2
workclass	Never-worked	1,000	124	12.4	3
workclass	State-gov	1,000	116	11.6	4
workclass	Without-pay	1,000	115	11.5	5
workclass	Self-emp-not-inc	1,000	114	11.4	6
workclass	Local-gov	1,000	111	11.1	7
workclass	Federal-gov	1,000	106	10.6	8
workclass		1,000	50	5.0	9
education	5th-6th	1,000	78	7.8	1
education	Masters	1,000	68	6.8	2
education	12th	1,000	66	6.6	3
education	HS-grad	1,000	66	6.6	3
education	Prof-school	1,000	63	6.3	5
education	9th	1,000	62	6.2	6
education	10th	1,000	60	6.0	7
education	1st-4th	1,000	60	6.0	7
education	Some-college	1,000	58	5.8	9
education	Assoc-acdm	1,000	57	5.7	10
marital_status	Divorced	1,000	155	15.5	1
marital_status	Married-spouse-absent	1,000	139	13.9	2
marital_status	Married-AF-spouse	1,000	136	13.6	3
marital_status	Separated	1,000	135	13.5	4
marital_status	Married-civ-spouse	1,000	129	12.9	5
marital_status	Never-married	1,000	129	12.9	5
marital_status	Widowed	1,000	127	12.7	7
marital_status		1,000	50	5.0	8
occupation	Tech-support	1,000	87	8.7	1
occupation	Machine-op-inspct	1,000	80	8.0	2
occupation	Armed-Forces	1,000	79	7.9	3
occupation	Craft-repair	1,000	75	7.5	4
occupation	Handlers-cleaners	1,000	69	6.9	5
occupation	Adm-clerical	1,000	67	6.7	6
occupation	Exec-managerial	1,000	67	6.7	6
occupation	Sales	1,000	64	6.4	8
occupation	Protective-serv	1,000	63	6.3	9
occupation	Transport-moving	1,000	63	6.3	9
relationship	Wife	1,000	167	16.7	1
relationship	Not-in-family	1,000	162	16.2	2
relationship	Husband	1,000	160	16.0	3
relationship	Unmarried	1,000	159	15.9	4
relationship	Other-relative	1,000	152	15.2	5
relationship	Own-child	1,000	150	15.0	6
relationship		1,000	50	5.0	7
race	White	1,000	194	19.4	1
race	Amer-Indian-Eskimo	1,000	193	19.3	2
race	Other	1,000	189	18.9	3
race	Asian-Pac-Islander	1,000	187	18.7	4
race	Black	1,000	187	18.7	4
race	InvalidRace	1,000	50	5.0	6
sex	Female	1,000	499	49.9	1
sex	Male	1,000	451	45.1	2
sex		1,000	50	5.0	3
native_country		1,000	50	5.0	1
native_country	Dominican-Republic	1,000	32	3.2	2
native_country	Hungary	1,000	32	3.2	2
native_country	Laos	1,000	31	3.1	4
native_country	Portugal	1,000	31	3.1	4
native_country	India	1,000	30	3.0	6
native_country	Cambodia	1,000	27	2.7	7
native_country	Ecuador	1,000	27	2.7	7
native_country	Hong	1,000	27	2.7	7
native_country	Jamaica	1,000	27	2.7	7
income	<=50K	1,000	776	77.6	1
income	>50K	1,000	224	22.4	2

diagnose_numeric(sample_dirty_dataset) %>% flextable()

variables	min	Q1	mean	median	Q3	max	minus
education_num	1	4.0	8.34500	9.0	12.00	16	0
capital_gain	-999	22,805.0	47,118.50900	47,667.0	72,652.25	99,960	50
capital_loss	-999	19,684.5	46,222.49000	45,288.5	71,802.25	99,944	50
hours_per_week	1	23.0	48.37684	48.0	72.00	99	0

Solving inconsistensies

sample_dirty_dataset$education[sample_dirty_dataset$education=="9th"] <- "InvalidEducation"
sample_dirty_dataset$education[sample_dirty_dataset$education=="10th"] <- "InvalidEducation"
sample_dirty_dataset$education[sample_dirty_dataset$education=="11th"] <- "InvalidEducation"

Invalid values change

sample_dirty_dataset <- replace_with_na(sample_dirty_dataset, replace = list(age = "InvalidAge", race = "InvalidRace" , capital_gain = "-999", capital_loss = "-999", education = "InvalidEducation"))

Diagnose category 2

diagnose(sample_dirty_dataset) %>% flextable()

variables	types	missing_count	missing_percent	unique_count	unique_rate
age	character	50	5.0	74	0.074
workclass	character	50	5.0	9	0.009
education	character	218	21.8	14	0.014
education_num	integer	0	0.0	16	0.016
marital_status	character	50	5.0	8	0.008
occupation	character	0	0.0	15	0.015
relationship	character	50	5.0	7	0.007
race	character	50	5.0	6	0.006
sex	character	50	5.0	3	0.003
capital_gain	numeric	50	5.0	947	0.947
capital_loss	numeric	50	5.0	948	0.948
hours_per_week	integer	50	5.0	100	0.100
native_country	character	50	5.0	42	0.042
income	character	0	0.0	2	0.002

diagnose_category(sample_dirty_dataset) %>% flextable()

variables	levels	N	freq	ratio	rank
age		1,000	50	5.0	1
age	89	1,000	21	2.1	2
age	28	1,000	20	2.0	3
age	52	1,000	20	2.0	3
age	42	1,000	19	1.9	5
age	84	1,000	19	1.9	5
age	34	1,000	18	1.8	7
age	56	1,000	18	1.8	7
age	88	1,000	18	1.8	7
age	26	1,000	17	1.7	10
workclass	Self-emp-inc	1,000	134	13.4	1
workclass	Private	1,000	130	13.0	2
workclass	Never-worked	1,000	124	12.4	3
workclass	State-gov	1,000	116	11.6	4
workclass	Without-pay	1,000	115	11.5	5
workclass	Self-emp-not-inc	1,000	114	11.4	6
workclass	Local-gov	1,000	111	11.1	7
workclass	Federal-gov	1,000	106	10.6	8
workclass		1,000	50	5.0	9
education		1,000	218	21.8	1
education	5th-6th	1,000	78	7.8	2
education	Masters	1,000	68	6.8	3
education	12th	1,000	66	6.6	4
education	HS-grad	1,000	66	6.6	4
education	Prof-school	1,000	63	6.3	6
education	1st-4th	1,000	60	6.0	7
education	Some-college	1,000	58	5.8	8
education	Assoc-acdm	1,000	57	5.7	9
education	Assoc-voc	1,000	57	5.7	9
marital_status	Divorced	1,000	155	15.5	1
marital_status	Married-spouse-absent	1,000	139	13.9	2
marital_status	Married-AF-spouse	1,000	136	13.6	3
marital_status	Separated	1,000	135	13.5	4
marital_status	Married-civ-spouse	1,000	129	12.9	5
marital_status	Never-married	1,000	129	12.9	5
marital_status	Widowed	1,000	127	12.7	7
marital_status		1,000	50	5.0	8
occupation	Tech-support	1,000	87	8.7	1
occupation	Machine-op-inspct	1,000	80	8.0	2
occupation	Armed-Forces	1,000	79	7.9	3
occupation	Craft-repair	1,000	75	7.5	4
occupation	Handlers-cleaners	1,000	69	6.9	5
occupation	Adm-clerical	1,000	67	6.7	6
occupation	Exec-managerial	1,000	67	6.7	6
occupation	Sales	1,000	64	6.4	8
occupation	Protective-serv	1,000	63	6.3	9
occupation	Transport-moving	1,000	63	6.3	9
relationship	Wife	1,000	167	16.7	1
relationship	Not-in-family	1,000	162	16.2	2
relationship	Husband	1,000	160	16.0	3
relationship	Unmarried	1,000	159	15.9	4
relationship	Other-relative	1,000	152	15.2	5
relationship	Own-child	1,000	150	15.0	6
relationship		1,000	50	5.0	7
race	White	1,000	194	19.4	1
race	Amer-Indian-Eskimo	1,000	193	19.3	2
race	Other	1,000	189	18.9	3
race	Asian-Pac-Islander	1,000	187	18.7	4
race	Black	1,000	187	18.7	4
race		1,000	50	5.0	6
sex	Female	1,000	499	49.9	1
sex	Male	1,000	451	45.1	2
sex		1,000	50	5.0	3
native_country		1,000	50	5.0	1
native_country	Dominican-Republic	1,000	32	3.2	2
native_country	Hungary	1,000	32	3.2	2
native_country	Laos	1,000	31	3.1	4
native_country	Portugal	1,000	31	3.1	4
native_country	India	1,000	30	3.0	6
native_country	Cambodia	1,000	27	2.7	7
native_country	Ecuador	1,000	27	2.7	7
native_country	Hong	1,000	27	2.7	7
native_country	Jamaica	1,000	27	2.7	7
income	<=50K	1,000	776	77.6	1
income	>50K	1,000	224	22.4	2

diagnose_numeric(sample_dirty_dataset) %>% flextable()

variables	min	Q1	mean	median	Q3	max
education_num	1	4.00	8.34500	9.0	12	16
capital_gain	20	25,714.75	49,651.00947	50,639.0	74,052	99,960
capital_loss	72	23,732.25	48,707.83158	48,896.5	73,255	99,944
hours_per_week	1	23.00	48.37684	48.0	72	99

diagnose_outlier(sample_dirty_dataset) %>% flextable()

variables	with_mean	without_mean
education_num	8.34500	8.34500
capital_gain	49,651.00947	49,651.00947
capital_loss	48,707.83158	48,707.83158
hours_per_week	48.37684	48.37684

plot_na_pareto(sample_dirty_dataset)

plot_na_hclust(sample_dirty_dataset)

Missing Values

#Detecting missing values
sample_dirty_dataset %>%
summarise(count = sum(is.na(sample_dirty_dataset)))

##   count
## 1   718

which(is.na(sample_dirty_dataset))

##   [1]    60    98   132   145   163   167   179   202   213   230   256   261
##  [13]   287   290   306   309   310   323   352   366   374   384   407   424
##  [25]   467   468   477   484   492   504   529   583   589   603   608   611
##  [37]   645   659   683   706   737   751   752   857   862   882   912   915
##  [49]   925   933  1002  1037  1087  1108  1133  1179  1184  1192  1224  1253
##  [61]  1275  1289  1320  1340  1367  1369  1374  1456  1470  1473  1497  1504
##  [73]  1520  1543  1569  1585  1666  1675  1677  1686  1695  1728  1744  1768
##  [85]  1773  1780  1781  1785  1818  1820  1827  1828  1829  1858  1863  1899
##  [97]  1905  1949  1974  1993  2006  2009  2019  2025  2028  2031  2033  2035
## [109]  2037  2040  2043  2047  2054  2056  2064  2065  2067  2069  2079  2089
## [121]  2092  2093  2099  2102  2108  2109  2113  2115  2116  2118  2119  2121
## [133]  2124  2129  2131  2134  2137  2138  2144  2146  2166  2169  2173  2183
## [145]  2187  2188  2190  2197  2199  2203  2206  2215  2217  2222  2227  2251
## [157]  2254  2260  2269  2291  2293  2295  2305  2308  2312  2313  2316  2323
## [169]  2326  2334  2344  2345  2350  2356  2361  2362  2369  2373  2385  2386
## [181]  2387  2396  2397  2400  2410  2412  2418  2420  2421  2422  2429  2436
## [193]  2441  2452  2456  2457  2458  2459  2471  2477  2480  2489  2490  2492
## [205]  2494  2496  2504  2506  2509  2514  2520  2523  2530  2535  2539  2544
## [217]  2546  2559  2568  2570  2577  2579  2581  2582  2584  2603  2611  2613
## [229]  2617  2624  2625  2647  2654  2658  2659  2660  2663  2667  2669  2677
## [241]  2678  2680  2681  2684  2692  2697  2702  2703  2708  2710  2712  2716
## [253]  2719  2726  2729  2732  2742  2744  2746  2750  2752  2762  2768  2770
## [265]  2783  2784  2785  2788  2789  2799  2800  2801  2804  2805  2807  2811
## [277]  2813  2822  2827  2831  2839  2840  2843  2848  2851  2859  2866  2872
## [289]  2880  2883  2886  2892  2894  2896  2900  2920  2925  2926  2931  2936
## [301]  2939  2940  2941  2942  2945  2946  2959  2960  2972  2979  2980  2983
## [313]  2984  2990  2991  2993  2998  2999  4001  4016  4022  4038  4044  4056
## [325]  4057  4115  4141  4149  4160  4189  4249  4273  4291  4292  4293  4346
## [337]  4361  4365  4366  4387  4465  4466  4522  4524  4533  4534  4554  4574
## [349]  4576  4589  4591  4600  4610  4691  4711  4724  4730  4747  4781  4821
## [361]  4894  4907  4922  4931  4943  4963  4964  4990  6028  6075  6079  6100
## [373]  6135  6139  6164  6172  6190  6245  6248  6266  6268  6277  6288  6310
## [385]  6315  6336  6338  6341  6391  6439  6444  6484  6517  6537  6625  6633
## [397]  6636  6657  6677  6691  6720  6726  6727  6754  6774  6782  6786  6788
## [409]  6796  6801  6831  6835  6836  6885  6919  6938  6943  6994  7013  7014
## [421]  7030  7033  7103  7110  7117  7135  7145  7163  7178  7244  7272  7277
## [433]  7283  7294  7298  7312  7323  7324  7380  7395  7461  7488  7491  7513
## [445]  7522  7541  7564  7595  7604  7619  7635  7709  7711  7713  7714  7739
## [457]  7754  7787  7789  7823  7837  7862  7883  7924  7964  7973  7980  7986
## [469]  8031  8074  8078  8099  8123  8138  8173  8174  8217  8224  8291  8324
## [481]  8330  8342  8360  8382  8410  8416  8419  8440  8466  8475  8476  8549
## [493]  8580  8624  8635  8640  8666  8684  8737  8744  8750  8758  8791  8814
## [505]  8822  8841  8853  8868  8876  8877  8913  8918  8925  8973  8975  8978
## [517]  8986  9000  9038  9098  9115  9122  9129  9169  9176  9192  9214  9270
## [529]  9278  9289  9328  9346  9385  9391  9417  9422  9424  9431  9438  9457
## [541]  9466  9472  9500  9508  9538  9568  9571  9574  9600  9601  9607  9657
## [553]  9678  9744  9758  9777  9804  9814  9833  9881  9915  9919  9925  9946
## [565]  9947  9948  9956  9958 10060 10133 10136 10147 10157 10171 10186 10217
## [577] 10230 10233 10249 10259 10268 10285 10299 10308 10329 10332 10366 10373
## [589] 10375 10378 10393 10400 10457 10481 10531 10551 10619 10657 10660 10662
## [601] 10684 10708 10728 10734 10752 10763 10768 10770 10789 10806 10817 10836
## [613] 10848 10867 10924 10925 10952 10958 11004 11018 11022 11037 11116 11123
## [625] 11129 11149 11169 11170 11177 11185 11203 11222 11228 11235 11245 11321
## [637] 11340 11375 11385 11412 11483 11502 11532 11549 11573 11579 11608 11629
## [649] 11639 11669 11685 11704 11723 11730 11738 11742 11745 11791 11796 11816
## [661] 11832 11855 11870 11876 11917 11943 11983 11999 12002 12014 12020 12037
## [673] 12039 12053 12082 12095 12119 12179 12192 12194 12198 12202 12233 12249
## [685] 12250 12252 12271 12274 12345 12354 12362 12366 12367 12446 12466 12508
## [697] 12511 12571 12580 12604 12636 12642 12697 12698 12743 12771 12791 12792
## [709] 12800 12803 12871 12880 12881 12935 12946 12976 12987 12996

#Imputing (only columns with numerical values)
sample_dirty_dataset$age[is.na(sample_dirty_dataset$age)] <- as.integer(mean(sample_dirty_dataset$age, na.rm = TRUE))

## Warning in mean.default(sample_dirty_dataset$age, na.rm = TRUE): argument nie
## jest wartością liczbową ani logiczną: zwracanie wartości NA

sample_dirty_dataset$capital_gain[is.na(sample_dirty_dataset$capital_gain)] <- as.integer(mean(sample_dirty_dataset$capital_gain, na.rm = TRUE))
sample_dirty_dataset$capital_loss[is.na(sample_dirty_dataset$capital_loss)] <- as.integer(mean(sample_dirty_dataset$capital_loss, na.rm = TRUE))

#Removing rows with missing values (which are not numerical)
sample_dirty_dataset <- sample_dirty_dataset %>% drop_na()

Outliers

#Methods for Detection - Using zscore
sample_dirty_dataset$age <- as.numeric(as.character(sample_dirty_dataset$age))
zscore <- abs(scale(sample_dirty_dataset$age))
plot(zscore, type = "n")
abline(h = 1.0, col = "blue")
text(1:length(sample_dirty_dataset$age), zscore)

#Methods for Detection - Using boxplot
boxplot(sample_dirty_dataset$capital_gain)
boxplot(sample_dirty_dataset$capital_gain)$out

## numeric(0)

which(sample_dirty_dataset$capital_gain %in% boxplot(sample_dirty_dataset$capital_gain)$out)

## integer(0)

#Methods for Detection - Using Mahalanobis Charts
par(mfrow=c(2,2))
plot(covMcd(sample_dirty_dataset$hours_per_week))

#sample_dirty_dataset %>%
 # diagnose_web_report(subtitle = "diagnose", 
                     # output_file = "Diagn.html", theme = "blue")

#sample_dirty_dataset %>%
  #eda_web_report( subtitle = "eda",  output_file = "EDA.html", theme = "blue")

#sample_dirty_dataset %>%
  #transformation_web_report(subtitle = "trans", output_file = "transformation.html", 
                            #theme = "blue")

Data Clensing Report

Alicja(197772), Hubert(197740), Yaren(201924)

Last edited: 4.03.2024

Sample data

Diagnose category 1

Solving inconsistensies

Invalid values change

Diagnose category 2

Missing Values

Outliers