Sample data

# Load necessary libraries
library(tidyverse)

# Set seed for reproducibility
set.seed(123)

# Generate sample data
n <- 1000  # Number of observations

# Age between 18 and 90
age <- sample(18:90, n, replace = TRUE)

# Introduce errors in age column
age[sample(1:n, 50)] <- "InvalidAge"

# Workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked
workclass <- sample(c("Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov", "Local-gov", "State-gov", "Without-pay", "Never-worked"), n, replace = TRUE)

# Introduce missing values in workclass column
workclass[sample(1:n, 50)] <- NA

# Education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool
education <- sample(c("Bachelors", "Some-college", "11th", "HS-grad", "Prof-school", "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters", "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"), n, replace = TRUE)

# Introduce inconsistencies in education column
education[sample(1:n, 50)] <- "InvalidEducation"

# Education Number: 1-16
education_num <- sample(1:16, n, replace = TRUE)

# Marital Status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse
marital_status <- sample(c("Married-civ-spouse", "Divorced", "Never-married", "Separated", "Widowed", "Married-spouse-absent", "Married-AF-spouse"), n, replace = TRUE)

# Introduce missing values in marital_status column
marital_status[sample(1:n, 50)] <- NA

# Occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces
occupation <- sample(c("Tech-support", "Craft-repair", "Other-service", "Sales", "Exec-managerial", "Prof-specialty", "Handlers-cleaners", "Machine-op-inspct", "Adm-clerical", "Farming-fishing", "Transport-moving", "Priv-house-serv", "Protective-serv", "Armed-Forces"), n, replace = TRUE)

# Introduce errors in occupation column
occupation[sample(1:n, 50)] <- "InvalidOccupation"

# Relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried
relationship <- sample(c("Wife", "Own-child", "Husband", "Not-in-family", "Other-relative", "Unmarried"), n, replace = TRUE)

# Introduce missing values in relationship column
relationship[sample(1:n, 50)] <- NA

# Race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black
race <- sample(c("White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"), n, replace = TRUE)

# Introduce inconsistencies in race column
race[sample(1:n, 50)] <- "InvalidRace"

# Sex: Female, Male
sex <- sample(c("Female", "Male"), n, replace = TRUE)

# Introduce missing values in sex column
sex[sample(1:n, 50)] <- NA

# Capital Gain: 0-99999
capital_gain <- sample(0:99999, n, replace = TRUE)

# Introduce errors in capital_gain column
capital_gain[sample(1:n, 50)] <- -999

# Capital Loss: 0-99999
capital_loss <- sample(0:99999, n, replace = TRUE)

# Introduce errors in capital_loss column
capital_loss[sample(1:n, 50)] <- -999

# Hours per week: 1-99
hours_per_week <- sample(1:99, n, replace = TRUE)

# Introduce missing values in hours_per_week column
hours_per_week[sample(1:n, 50)] <- NA

# Native country: United-States, Cambodia, England, ...
native_country <- sample(c("United-States", "Cambodia", "England", "Puerto-Rico", "Canada", "Germany", "Outlying-US(Guam-USVI-etc)", "India", "Japan", "Greece", "South", "China", "Cuba", "Iran", "Honduras", "Philippines", "Italy", "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal", "Ireland", "France", "Dominican-Republic", "Laos", "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary", "Guatemala", "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador", "Trinadad&Tobago", "Peru", "Hong", "Holand-Netherlands"), n, replace = TRUE)

# Introduce missing values in native_country column
native_country[sample(1:n, 50)] <- NA

# Income: <=50K, >50K
income <- sample(c("<=50K", ">50K"), n, replace = TRUE, prob = c(0.75, 0.25))

# Create dataframe
sample_dirty_dataset <- data.frame(age, workclass, education, education_num, marital_status, occupation, relationship, race, sex, capital_gain, capital_loss, hours_per_week, native_country, income)
kable(head(sample_dirty_dataset))
age workclass education education_num marital_status occupation relationship race sex capital_gain capital_loss hours_per_week native_country income
48 Local-gov 7th-8th 1 NA Prof-specialty Other-relative Black Male 7826 77249 60 Poland >50K
68 NA Prof-school 1 Divorced Priv-house-serv Unmarried Asian-Pac-Islander Female 84532 60981 73 NA <=50K
31 State-gov Bachelors 2 Married-spouse-absent Protective-serv Wife Black Male 35297 15093 98 Hungary <=50K
84 Never-worked 7th-8th 10 Separated Handlers-cleaners Other-relative Asian-Pac-Islander Male 27074 40203 NA South <=50K
59 Private 12th 16 Married-civ-spouse Machine-op-inspct Other-relative White Female 73785 2056 4 Haiti <=50K
67 Self-emp-not-inc InvalidEducation 12 Married-spouse-absent Adm-clerical Husband Black Male 2553 17031 70 Scotland <=50K

Diagnose category 1

diagnose_category(sample_dirty_dataset) %>% flextable()

variables

levels

N

freq

ratio

rank

age

InvalidAge

1,000

50

5.0

1

age

89

1,000

21

2.1

2

age

28

1,000

20

2.0

3

age

52

1,000

20

2.0

3

age

42

1,000

19

1.9

5

age

84

1,000

19

1.9

5

age

34

1,000

18

1.8

7

age

56

1,000

18

1.8

7

age

88

1,000

18

1.8

7

age

26

1,000

17

1.7

10

workclass

Self-emp-inc

1,000

134

13.4

1

workclass

Private

1,000

130

13.0

2

workclass

Never-worked

1,000

124

12.4

3

workclass

State-gov

1,000

116

11.6

4

workclass

Without-pay

1,000

115

11.5

5

workclass

Self-emp-not-inc

1,000

114

11.4

6

workclass

Local-gov

1,000

111

11.1

7

workclass

Federal-gov

1,000

106

10.6

8

workclass

1,000

50

5.0

9

education

5th-6th

1,000

78

7.8

1

education

Masters

1,000

68

6.8

2

education

12th

1,000

66

6.6

3

education

HS-grad

1,000

66

6.6

3

education

Prof-school

1,000

63

6.3

5

education

9th

1,000

62

6.2

6

education

10th

1,000

60

6.0

7

education

1st-4th

1,000

60

6.0

7

education

Some-college

1,000

58

5.8

9

education

Assoc-acdm

1,000

57

5.7

10

marital_status

Divorced

1,000

155

15.5

1

marital_status

Married-spouse-absent

1,000

139

13.9

2

marital_status

Married-AF-spouse

1,000

136

13.6

3

marital_status

Separated

1,000

135

13.5

4

marital_status

Married-civ-spouse

1,000

129

12.9

5

marital_status

Never-married

1,000

129

12.9

5

marital_status

Widowed

1,000

127

12.7

7

marital_status

1,000

50

5.0

8

occupation

Tech-support

1,000

87

8.7

1

occupation

Machine-op-inspct

1,000

80

8.0

2

occupation

Armed-Forces

1,000

79

7.9

3

occupation

Craft-repair

1,000

75

7.5

4

occupation

Handlers-cleaners

1,000

69

6.9

5

occupation

Adm-clerical

1,000

67

6.7

6

occupation

Exec-managerial

1,000

67

6.7

6

occupation

Sales

1,000

64

6.4

8

occupation

Protective-serv

1,000

63

6.3

9

occupation

Transport-moving

1,000

63

6.3

9

relationship

Wife

1,000

167

16.7

1

relationship

Not-in-family

1,000

162

16.2

2

relationship

Husband

1,000

160

16.0

3

relationship

Unmarried

1,000

159

15.9

4

relationship

Other-relative

1,000

152

15.2

5

relationship

Own-child

1,000

150

15.0

6

relationship

1,000

50

5.0

7

race

White

1,000

194

19.4

1

race

Amer-Indian-Eskimo

1,000

193

19.3

2

race

Other

1,000

189

18.9

3

race

Asian-Pac-Islander

1,000

187

18.7

4

race

Black

1,000

187

18.7

4

race

InvalidRace

1,000

50

5.0

6

sex

Female

1,000

499

49.9

1

sex

Male

1,000

451

45.1

2

sex

1,000

50

5.0

3

native_country

1,000

50

5.0

1

native_country

Dominican-Republic

1,000

32

3.2

2

native_country

Hungary

1,000

32

3.2

2

native_country

Laos

1,000

31

3.1

4

native_country

Portugal

1,000

31

3.1

4

native_country

India

1,000

30

3.0

6

native_country

Cambodia

1,000

27

2.7

7

native_country

Ecuador

1,000

27

2.7

7

native_country

Hong

1,000

27

2.7

7

native_country

Jamaica

1,000

27

2.7

7

income

<=50K

1,000

776

77.6

1

income

>50K

1,000

224

22.4

2

diagnose_numeric(sample_dirty_dataset) %>% flextable()

variables

min

Q1

mean

median

Q3

max

zero

minus

outlier

education_num

1

4.0

8.34500

9.0

12.00

16

0

0

0

capital_gain

-999

22,805.0

47,118.50900

47,667.0

72,652.25

99,960

0

50

0

capital_loss

-999

19,684.5

46,222.49000

45,288.5

71,802.25

99,944

0

50

0

hours_per_week

1

23.0

48.37684

48.0

72.00

99

0

0

0

Solving inconsistensies

sample_dirty_dataset$education[sample_dirty_dataset$education=="9th"] <- "InvalidEducation"
sample_dirty_dataset$education[sample_dirty_dataset$education=="10th"] <- "InvalidEducation"
sample_dirty_dataset$education[sample_dirty_dataset$education=="11th"] <- "InvalidEducation"

Invalid values change

sample_dirty_dataset <- replace_with_na(sample_dirty_dataset, replace = list(age = "InvalidAge", race = "InvalidRace" , capital_gain = "-999", capital_loss = "-999", education = "InvalidEducation"))

Diagnose category 2

diagnose(sample_dirty_dataset) %>% flextable()

variables

types

missing_count

missing_percent

unique_count

unique_rate

age

character

50

5.0

74

0.074

workclass

character

50

5.0

9

0.009

education

character

218

21.8

14

0.014

education_num

integer

0

0.0

16

0.016

marital_status

character

50

5.0

8

0.008

occupation

character

0

0.0

15

0.015

relationship

character

50

5.0

7

0.007

race

character

50

5.0

6

0.006

sex

character

50

5.0

3

0.003

capital_gain

numeric

50

5.0

947

0.947

capital_loss

numeric

50

5.0

948

0.948

hours_per_week

integer

50

5.0

100

0.100

native_country

character

50

5.0

42

0.042

income

character

0

0.0

2

0.002

diagnose_category(sample_dirty_dataset) %>% flextable()

variables

levels

N

freq

ratio

rank

age

1,000

50

5.0

1

age

89

1,000

21

2.1

2

age

28

1,000

20

2.0

3

age

52

1,000

20

2.0

3

age

42

1,000

19

1.9

5

age

84

1,000

19

1.9

5

age

34

1,000

18

1.8

7

age

56

1,000

18

1.8

7

age

88

1,000

18

1.8

7

age

26

1,000

17

1.7

10

workclass

Self-emp-inc

1,000

134

13.4

1

workclass

Private

1,000

130

13.0

2

workclass

Never-worked

1,000

124

12.4

3

workclass

State-gov

1,000

116

11.6

4

workclass

Without-pay

1,000

115

11.5

5

workclass

Self-emp-not-inc

1,000

114

11.4

6

workclass

Local-gov

1,000

111

11.1

7

workclass

Federal-gov

1,000

106

10.6

8

workclass

1,000

50

5.0

9

education

1,000

218

21.8

1

education

5th-6th

1,000

78

7.8

2

education

Masters

1,000

68

6.8

3

education

12th

1,000

66

6.6

4

education

HS-grad

1,000

66

6.6

4

education

Prof-school

1,000

63

6.3

6

education

1st-4th

1,000

60

6.0

7

education

Some-college

1,000

58

5.8

8

education

Assoc-acdm

1,000

57

5.7

9

education

Assoc-voc

1,000

57

5.7

9

marital_status

Divorced

1,000

155

15.5

1

marital_status

Married-spouse-absent

1,000

139

13.9

2

marital_status

Married-AF-spouse

1,000

136

13.6

3

marital_status

Separated

1,000

135

13.5

4

marital_status

Married-civ-spouse

1,000

129

12.9

5

marital_status

Never-married

1,000

129

12.9

5

marital_status

Widowed

1,000

127

12.7

7

marital_status

1,000

50

5.0

8

occupation

Tech-support

1,000

87

8.7

1

occupation

Machine-op-inspct

1,000

80

8.0

2

occupation

Armed-Forces

1,000

79

7.9

3

occupation

Craft-repair

1,000

75

7.5

4

occupation

Handlers-cleaners

1,000

69

6.9

5

occupation

Adm-clerical

1,000

67

6.7

6

occupation

Exec-managerial

1,000

67

6.7

6

occupation

Sales

1,000

64

6.4

8

occupation

Protective-serv

1,000

63

6.3

9

occupation

Transport-moving

1,000

63

6.3

9

relationship

Wife

1,000

167

16.7

1

relationship

Not-in-family

1,000

162

16.2

2

relationship

Husband

1,000

160

16.0

3

relationship

Unmarried

1,000

159

15.9

4

relationship

Other-relative

1,000

152

15.2

5

relationship

Own-child

1,000

150

15.0

6

relationship

1,000

50

5.0

7

race

White

1,000

194

19.4

1

race

Amer-Indian-Eskimo

1,000

193

19.3

2

race

Other

1,000

189

18.9

3

race

Asian-Pac-Islander

1,000

187

18.7

4

race

Black

1,000

187

18.7

4

race

1,000

50

5.0

6

sex

Female

1,000

499

49.9

1

sex

Male

1,000

451

45.1

2

sex

1,000

50

5.0

3

native_country

1,000

50

5.0

1

native_country

Dominican-Republic

1,000

32

3.2

2

native_country

Hungary

1,000

32

3.2

2

native_country

Laos

1,000

31

3.1

4

native_country

Portugal

1,000

31

3.1

4

native_country

India

1,000

30

3.0

6

native_country

Cambodia

1,000

27

2.7

7

native_country

Ecuador

1,000

27

2.7

7

native_country

Hong

1,000

27

2.7

7

native_country

Jamaica

1,000

27

2.7

7

income

<=50K

1,000

776

77.6

1

income

>50K

1,000

224

22.4

2

diagnose_numeric(sample_dirty_dataset) %>% flextable()

variables

min

Q1

mean

median

Q3

max

zero

minus

outlier

education_num

1

4.00

8.34500

9.0

12

16

0

0

0

capital_gain

20

25,714.75

49,651.00947

50,639.0

74,052

99,960

0

0

0

capital_loss

72

23,732.25

48,707.83158

48,896.5

73,255

99,944

0

0

0

hours_per_week

1

23.00

48.37684

48.0

72

99

0

0

0

diagnose_outlier(sample_dirty_dataset) %>% flextable()

variables

outliers_cnt

outliers_ratio

outliers_mean

with_mean

without_mean

education_num

0

0

8.34500

8.34500

capital_gain

0

0

49,651.00947

49,651.00947

capital_loss

0

0

48,707.83158

48,707.83158

hours_per_week

0

0

48.37684

48.37684

plot_na_pareto(sample_dirty_dataset)

plot_na_hclust(sample_dirty_dataset)

Missing Values

#Detecting missing values
sample_dirty_dataset %>%
summarise(count = sum(is.na(sample_dirty_dataset)))
##   count
## 1   718
which(is.na(sample_dirty_dataset))
##   [1]    60    98   132   145   163   167   179   202   213   230   256   261
##  [13]   287   290   306   309   310   323   352   366   374   384   407   424
##  [25]   467   468   477   484   492   504   529   583   589   603   608   611
##  [37]   645   659   683   706   737   751   752   857   862   882   912   915
##  [49]   925   933  1002  1037  1087  1108  1133  1179  1184  1192  1224  1253
##  [61]  1275  1289  1320  1340  1367  1369  1374  1456  1470  1473  1497  1504
##  [73]  1520  1543  1569  1585  1666  1675  1677  1686  1695  1728  1744  1768
##  [85]  1773  1780  1781  1785  1818  1820  1827  1828  1829  1858  1863  1899
##  [97]  1905  1949  1974  1993  2006  2009  2019  2025  2028  2031  2033  2035
## [109]  2037  2040  2043  2047  2054  2056  2064  2065  2067  2069  2079  2089
## [121]  2092  2093  2099  2102  2108  2109  2113  2115  2116  2118  2119  2121
## [133]  2124  2129  2131  2134  2137  2138  2144  2146  2166  2169  2173  2183
## [145]  2187  2188  2190  2197  2199  2203  2206  2215  2217  2222  2227  2251
## [157]  2254  2260  2269  2291  2293  2295  2305  2308  2312  2313  2316  2323
## [169]  2326  2334  2344  2345  2350  2356  2361  2362  2369  2373  2385  2386
## [181]  2387  2396  2397  2400  2410  2412  2418  2420  2421  2422  2429  2436
## [193]  2441  2452  2456  2457  2458  2459  2471  2477  2480  2489  2490  2492
## [205]  2494  2496  2504  2506  2509  2514  2520  2523  2530  2535  2539  2544
## [217]  2546  2559  2568  2570  2577  2579  2581  2582  2584  2603  2611  2613
## [229]  2617  2624  2625  2647  2654  2658  2659  2660  2663  2667  2669  2677
## [241]  2678  2680  2681  2684  2692  2697  2702  2703  2708  2710  2712  2716
## [253]  2719  2726  2729  2732  2742  2744  2746  2750  2752  2762  2768  2770
## [265]  2783  2784  2785  2788  2789  2799  2800  2801  2804  2805  2807  2811
## [277]  2813  2822  2827  2831  2839  2840  2843  2848  2851  2859  2866  2872
## [289]  2880  2883  2886  2892  2894  2896  2900  2920  2925  2926  2931  2936
## [301]  2939  2940  2941  2942  2945  2946  2959  2960  2972  2979  2980  2983
## [313]  2984  2990  2991  2993  2998  2999  4001  4016  4022  4038  4044  4056
## [325]  4057  4115  4141  4149  4160  4189  4249  4273  4291  4292  4293  4346
## [337]  4361  4365  4366  4387  4465  4466  4522  4524  4533  4534  4554  4574
## [349]  4576  4589  4591  4600  4610  4691  4711  4724  4730  4747  4781  4821
## [361]  4894  4907  4922  4931  4943  4963  4964  4990  6028  6075  6079  6100
## [373]  6135  6139  6164  6172  6190  6245  6248  6266  6268  6277  6288  6310
## [385]  6315  6336  6338  6341  6391  6439  6444  6484  6517  6537  6625  6633
## [397]  6636  6657  6677  6691  6720  6726  6727  6754  6774  6782  6786  6788
## [409]  6796  6801  6831  6835  6836  6885  6919  6938  6943  6994  7013  7014
## [421]  7030  7033  7103  7110  7117  7135  7145  7163  7178  7244  7272  7277
## [433]  7283  7294  7298  7312  7323  7324  7380  7395  7461  7488  7491  7513
## [445]  7522  7541  7564  7595  7604  7619  7635  7709  7711  7713  7714  7739
## [457]  7754  7787  7789  7823  7837  7862  7883  7924  7964  7973  7980  7986
## [469]  8031  8074  8078  8099  8123  8138  8173  8174  8217  8224  8291  8324
## [481]  8330  8342  8360  8382  8410  8416  8419  8440  8466  8475  8476  8549
## [493]  8580  8624  8635  8640  8666  8684  8737  8744  8750  8758  8791  8814
## [505]  8822  8841  8853  8868  8876  8877  8913  8918  8925  8973  8975  8978
## [517]  8986  9000  9038  9098  9115  9122  9129  9169  9176  9192  9214  9270
## [529]  9278  9289  9328  9346  9385  9391  9417  9422  9424  9431  9438  9457
## [541]  9466  9472  9500  9508  9538  9568  9571  9574  9600  9601  9607  9657
## [553]  9678  9744  9758  9777  9804  9814  9833  9881  9915  9919  9925  9946
## [565]  9947  9948  9956  9958 10060 10133 10136 10147 10157 10171 10186 10217
## [577] 10230 10233 10249 10259 10268 10285 10299 10308 10329 10332 10366 10373
## [589] 10375 10378 10393 10400 10457 10481 10531 10551 10619 10657 10660 10662
## [601] 10684 10708 10728 10734 10752 10763 10768 10770 10789 10806 10817 10836
## [613] 10848 10867 10924 10925 10952 10958 11004 11018 11022 11037 11116 11123
## [625] 11129 11149 11169 11170 11177 11185 11203 11222 11228 11235 11245 11321
## [637] 11340 11375 11385 11412 11483 11502 11532 11549 11573 11579 11608 11629
## [649] 11639 11669 11685 11704 11723 11730 11738 11742 11745 11791 11796 11816
## [661] 11832 11855 11870 11876 11917 11943 11983 11999 12002 12014 12020 12037
## [673] 12039 12053 12082 12095 12119 12179 12192 12194 12198 12202 12233 12249
## [685] 12250 12252 12271 12274 12345 12354 12362 12366 12367 12446 12466 12508
## [697] 12511 12571 12580 12604 12636 12642 12697 12698 12743 12771 12791 12792
## [709] 12800 12803 12871 12880 12881 12935 12946 12976 12987 12996
#Imputing (only columns with numerical values)
sample_dirty_dataset$age[is.na(sample_dirty_dataset$age)] <- as.integer(mean(sample_dirty_dataset$age, na.rm = TRUE))
## Warning in mean.default(sample_dirty_dataset$age, na.rm = TRUE): argument nie
## jest wartością liczbową ani logiczną: zwracanie wartości NA
sample_dirty_dataset$capital_gain[is.na(sample_dirty_dataset$capital_gain)] <- as.integer(mean(sample_dirty_dataset$capital_gain, na.rm = TRUE))
sample_dirty_dataset$capital_loss[is.na(sample_dirty_dataset$capital_loss)] <- as.integer(mean(sample_dirty_dataset$capital_loss, na.rm = TRUE))

#Removing rows with missing values (which are not numerical)
sample_dirty_dataset <- sample_dirty_dataset %>% drop_na()

Outliers

#Methods for Detection - Using zscore
sample_dirty_dataset$age <- as.numeric(as.character(sample_dirty_dataset$age))
zscore <- abs(scale(sample_dirty_dataset$age))
plot(zscore, type = "n")
abline(h = 1.0, col = "blue")
text(1:length(sample_dirty_dataset$age), zscore)

#Methods for Detection - Using boxplot
boxplot(sample_dirty_dataset$capital_gain)
boxplot(sample_dirty_dataset$capital_gain)$out

## numeric(0)
which(sample_dirty_dataset$capital_gain %in% boxplot(sample_dirty_dataset$capital_gain)$out)
## integer(0)
#Methods for Detection - Using Mahalanobis Charts
par(mfrow=c(2,2))
plot(covMcd(sample_dirty_dataset$hours_per_week))

#sample_dirty_dataset %>%
 # diagnose_web_report(subtitle = "diagnose", 
                     # output_file = "Diagn.html", theme = "blue")
#sample_dirty_dataset %>%
  #eda_web_report( subtitle = "eda",  output_file = "EDA.html", theme = "blue")
#sample_dirty_dataset %>%
  #transformation_web_report(subtitle = "trans", output_file = "transformation.html", 
                            #theme = "blue")