Cleansing_Report

Our Dirty Dataframe

head(sample_dirty_dataset)

##   age        workclass        education education_num        marital_status
## 1  48        Local-gov          7th-8th             1                  <NA>
## 2  68             <NA>      Prof-school             1              Divorced
## 3  31        State-gov        Bachelors             2 Married-spouse-absent
## 4  84     Never-worked          7th-8th            10             Separated
## 5  59          Private             12th            16    Married-civ-spouse
## 6  67 Self-emp-not-inc InvalidEducation            12 Married-spouse-absent
##          occupation   relationship               race    sex capital_gain
## 1    Prof-specialty Other-relative              Black   Male         7826
## 2   Priv-house-serv      Unmarried Asian-Pac-Islander Female        84532
## 3   Protective-serv           Wife              Black   Male        35297
## 4 Handlers-cleaners Other-relative Asian-Pac-Islander   Male        27074
## 5 Machine-op-inspct Other-relative              White Female        73785
## 6      Adm-clerical        Husband              Black   Male         2553
##   capital_loss hours_per_week native_country income
## 1        77249             60         Poland   >50K
## 2        60981             73           <NA>  <=50K
## 3        15093             98        Hungary  <=50K
## 4        40203             NA          South  <=50K
## 5         2056              4          Haiti  <=50K
## 6        17031             70       Scotland  <=50K

Diagnose Missing Values

diagnose(sample_dirty_dataset)

## # A tibble: 14 × 6
##    variables      types   missing_count missing_percent unique_count unique_rate
##    <chr>          <chr>           <int>           <dbl>        <int>       <dbl>
##  1 age            charac…             0               0           74       0.074
##  2 workclass      charac…            50               5            9       0.009
##  3 education      charac…             0               0           17       0.017
##  4 education_num  integer             0               0           16       0.016
##  5 marital_status charac…            50               5            8       0.008
##  6 occupation     charac…             0               0           15       0.015
##  7 relationship   charac…            50               5            7       0.007
##  8 race           charac…             0               0            6       0.006
##  9 sex            charac…            50               5            3       0.003
## 10 capital_gain   numeric             0               0          947       0.947
## 11 capital_loss   numeric             0               0          948       0.948
## 12 hours_per_week integer            50               5          100       0.1  
## 13 native_country charac…            50               5           42       0.042
## 14 income         charac…             0               0            2       0.002

diagnose_category(sample_dirty_dataset)

##         variables                levels    N freq ratio rank
## 1             age            InvalidAge 1000   50   5.0    1
## 2             age                    89 1000   21   2.1    2
## 3             age                    28 1000   20   2.0    3
## 4             age                    52 1000   20   2.0    3
## 5             age                    42 1000   19   1.9    5
## 6             age                    84 1000   19   1.9    5
## 7             age                    34 1000   18   1.8    7
## 8             age                    56 1000   18   1.8    7
## 9             age                    88 1000   18   1.8    7
## 10            age                    26 1000   17   1.7   10
## 11      workclass          Self-emp-inc 1000  134  13.4    1
## 12      workclass               Private 1000  130  13.0    2
## 13      workclass          Never-worked 1000  124  12.4    3
## 14      workclass             State-gov 1000  116  11.6    4
## 15      workclass           Without-pay 1000  115  11.5    5
## 16      workclass      Self-emp-not-inc 1000  114  11.4    6
## 17      workclass             Local-gov 1000  111  11.1    7
## 18      workclass           Federal-gov 1000  106  10.6    8
## 19      workclass                  <NA> 1000   50   5.0    9
## 20      education               5th-6th 1000   78   7.8    1
## 21      education               Masters 1000   68   6.8    2
## 22      education                  12th 1000   66   6.6    3
## 23      education               HS-grad 1000   66   6.6    3
## 24      education           Prof-school 1000   63   6.3    5
## 25      education                   9th 1000   62   6.2    6
## 26      education                  10th 1000   60   6.0    7
## 27      education               1st-4th 1000   60   6.0    7
## 28      education          Some-college 1000   58   5.8    9
## 29      education            Assoc-acdm 1000   57   5.7   10
## 30 marital_status              Divorced 1000  155  15.5    1
## 31 marital_status Married-spouse-absent 1000  139  13.9    2
## 32 marital_status     Married-AF-spouse 1000  136  13.6    3
## 33 marital_status             Separated 1000  135  13.5    4
## 34 marital_status    Married-civ-spouse 1000  129  12.9    5
## 35 marital_status         Never-married 1000  129  12.9    5
## 36 marital_status               Widowed 1000  127  12.7    7
## 37 marital_status                  <NA> 1000   50   5.0    8
## 38     occupation          Tech-support 1000   87   8.7    1
## 39     occupation     Machine-op-inspct 1000   80   8.0    2
## 40     occupation          Armed-Forces 1000   79   7.9    3
## 41     occupation          Craft-repair 1000   75   7.5    4
## 42     occupation     Handlers-cleaners 1000   69   6.9    5
## 43     occupation          Adm-clerical 1000   67   6.7    6
## 44     occupation       Exec-managerial 1000   67   6.7    6
## 45     occupation                 Sales 1000   64   6.4    8
## 46     occupation       Protective-serv 1000   63   6.3    9
## 47     occupation      Transport-moving 1000   63   6.3    9
## 48   relationship                  Wife 1000  167  16.7    1
## 49   relationship         Not-in-family 1000  162  16.2    2
## 50   relationship               Husband 1000  160  16.0    3
## 51   relationship             Unmarried 1000  159  15.9    4
## 52   relationship        Other-relative 1000  152  15.2    5
## 53   relationship             Own-child 1000  150  15.0    6
## 54   relationship                  <NA> 1000   50   5.0    7
## 55           race                 White 1000  194  19.4    1
## 56           race    Amer-Indian-Eskimo 1000  193  19.3    2
## 57           race                 Other 1000  189  18.9    3
## 58           race    Asian-Pac-Islander 1000  187  18.7    4
## 59           race                 Black 1000  187  18.7    4
## 60           race           InvalidRace 1000   50   5.0    6
## 61            sex                Female 1000  499  49.9    1
## 62            sex                  Male 1000  451  45.1    2
## 63            sex                  <NA> 1000   50   5.0    3
## 64 native_country                  <NA> 1000   50   5.0    1
## 65 native_country    Dominican-Republic 1000   32   3.2    2
## 66 native_country               Hungary 1000   32   3.2    2
## 67 native_country                  Laos 1000   31   3.1    4
## 68 native_country              Portugal 1000   31   3.1    4
## 69 native_country                 India 1000   30   3.0    6
## 70 native_country              Cambodia 1000   27   2.7    7
## 71 native_country               Ecuador 1000   27   2.7    7
## 72 native_country                  Hong 1000   27   2.7    7
## 73 native_country               Jamaica 1000   27   2.7    7
## 74         income                 <=50K 1000  776  77.6    1
## 75         income                  >50K 1000  224  22.4    2

diagnose_numeric(sample_dirty_dataset)

##        variables  min      Q1        mean  median       Q3   max zero minus
## 1  education_num    1     4.0     8.34500     9.0    12.00    16    0     0
## 2   capital_gain -999 22805.0 47118.50900 47667.0 72652.25 99960    0    50
## 3   capital_loss -999 19684.5 46222.49000 45288.5 71802.25 99944    0    50
## 4 hours_per_week    1    23.0    48.37684    48.0    72.00    99    0     0
##   outlier
## 1       0
## 2       0
## 3       0
## 4       0

And Observe If There Is Any Correlation

plot_na_pareto(sample_dirty_dataset)

plot_na_hclust(sample_dirty_dataset)

plot_na_intersect(sample_dirty_dataset)

Change Invalid Values To NA’s

First, create rules for each variable

## 
## Data model:
## dat1 : education %in% c('10th', '11th', '12th', '1st-4th', '5th-6th', '7th-8th', '9th', 'Assoc-acdm', 'Assoc-voc', 'Bachelors', 'Doctorate', 'HS-grad', 'Masters', 'Preschool', 'Prof-school', 'Some-college')
## dat2 : income %in% c('<=50K', '>50K')
## dat3 : marital_status %in% c('Divorced', 'Married-AF-spouse', 'Married-civ-spouse', 'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed')
## dat4 : native_country %in% c('Cambodia', 'Canada', 'China', 'Columbia', 'Cuba', 'Dominican-Republic', 'Ecuador', 'El-Salvador', 'England', 'France', 'Germany', 'Greece', 'Guatemala', 'Haiti', 'Holand-Netherlands', 'Honduras', 'Hong', 'Hungary', 'India', 'Iran', 'Ireland', 'Italy', 'Jamaica', 'Japan', 'Laos', 'Mexico', 'Nicaragua', 'Outlying-US(Guam-USVI-etc)', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Puerto-Rico', 'Scotland', 'South', 'Taiwan', 'Thailand', 'Trinadad&Tobago', 'United-States', 'Vietnam', 'Yugoslavia')
## dat5 : occupation %in% c('Adm-clerical', 'Armed-Forces', 'Craft-repair', 'Exec-managerial', 'Farming-fishing', 'Handlers-cleaners', 'Machine-op-inspct', 'Other-service', 'Priv-house-serv', 'Prof-specialty', 'Protective-serv', 'Sales', 'Tech-support', 'Transport-moving')
## dat6 : race %in% c('Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White')
## dat7 : relationship %in% c('Husband', 'Not-in-family', 'Other-relative', 'Own-child', 'Unmarried', 'Wife')
## dat8 : sex %in% c('Female', 'Male')
## dat9 : workclass %in% c('Federal-gov', 'Local-gov', 'Never-worked', 'Private', 'Self-emp-inc', 'Self-emp-not-inc', 'State-gov', 'Without-pay') 
## 
## Edit set:
## num1  : 18 <= age
## num2  : age <= 90
## num3  : 1 <= education_num
## num4  : education_num <= 16
## num5  : 0 <= capital_gain
## num6  : capital_gain <= 99999
## num7  : 0 <= capital_loss
## num8  : capital_loss <= 99999
## num9  : 0 <= hours_per_week
## num10 : hours_per_week <= 999

And see how many rules have been violated

summary(violated)

## Edit violations, 1000 observations, 0 completely missing (0%):
## 
##  editname freq rel
##      num2   50  5%
##      num5   50  5%
##      num7   50  5%
##      dat1   50  5%
##      dat3   50  5%
##      dat4   50  5%
##      dat5   50  5%
##      dat6   50  5%
##      dat7   50  5%
##      dat8   50  5%
##      dat9   50  5%
## 
## Edit violations per record:
## 
##  errors freq   rel
##       0  542 54.2%
##       1  307 30.7%
##       2  118 11.8%
##       3   25  2.5%
##       4    8  0.8%

Change violated records to NA’s and analize dataset with deleted invalid values

diagnose(sample_data_with_NA)

## # A tibble: 14 × 6
##    variables      types   missing_count missing_percent unique_count unique_rate
##    <chr>          <chr>           <int>           <dbl>        <int>       <dbl>
##  1 age            numeric            50               5           74       0.074
##  2 workclass      charac…            50               5            9       0.009
##  3 education      charac…            50               5           17       0.017
##  4 education_num  integer             0               0           16       0.016
##  5 marital_status charac…            50               5            8       0.008
##  6 occupation     charac…            50               5           15       0.015
##  7 relationship   charac…            50               5            7       0.007
##  8 race           charac…            50               5            6       0.006
##  9 sex            charac…            50               5            3       0.003
## 10 capital_gain   numeric            50               5          947       0.947
## 11 capital_loss   numeric            50               5          948       0.948
## 12 hours_per_week integer            50               5          100       0.1  
## 13 native_country charac…            50               5           42       0.042
## 14 income         charac…             0               0            2       0.002

diagnose_category(sample_data_with_NA)

##         variables                levels    N freq ratio rank
## 1       workclass          Self-emp-inc 1000  134  13.4    1
## 2       workclass               Private 1000  130  13.0    2
## 3       workclass          Never-worked 1000  124  12.4    3
## 4       workclass             State-gov 1000  116  11.6    4
## 5       workclass           Without-pay 1000  115  11.5    5
## 6       workclass      Self-emp-not-inc 1000  114  11.4    6
## 7       workclass             Local-gov 1000  111  11.1    7
## 8       workclass           Federal-gov 1000  106  10.6    8
## 9       workclass                  <NA> 1000   50   5.0    9
## 10      education               5th-6th 1000   78   7.8    1
## 11      education               Masters 1000   68   6.8    2
## 12      education                  12th 1000   66   6.6    3
## 13      education               HS-grad 1000   66   6.6    3
## 14      education           Prof-school 1000   63   6.3    5
## 15      education                   9th 1000   62   6.2    6
## 16      education                  10th 1000   60   6.0    7
## 17      education               1st-4th 1000   60   6.0    7
## 18      education          Some-college 1000   58   5.8    9
## 19      education            Assoc-acdm 1000   57   5.7   10
## 20 marital_status              Divorced 1000  155  15.5    1
## 21 marital_status Married-spouse-absent 1000  139  13.9    2
## 22 marital_status     Married-AF-spouse 1000  136  13.6    3
## 23 marital_status             Separated 1000  135  13.5    4
## 24 marital_status    Married-civ-spouse 1000  129  12.9    5
## 25 marital_status         Never-married 1000  129  12.9    5
## 26 marital_status               Widowed 1000  127  12.7    7
## 27 marital_status                  <NA> 1000   50   5.0    8
## 28     occupation          Tech-support 1000   87   8.7    1
## 29     occupation     Machine-op-inspct 1000   80   8.0    2
## 30     occupation          Armed-Forces 1000   79   7.9    3
## 31     occupation          Craft-repair 1000   75   7.5    4
## 32     occupation     Handlers-cleaners 1000   69   6.9    5
## 33     occupation          Adm-clerical 1000   67   6.7    6
## 34     occupation       Exec-managerial 1000   67   6.7    6
## 35     occupation                 Sales 1000   64   6.4    8
## 36     occupation       Protective-serv 1000   63   6.3    9
## 37     occupation      Transport-moving 1000   63   6.3    9
## 38   relationship                  Wife 1000  167  16.7    1
## 39   relationship         Not-in-family 1000  162  16.2    2
## 40   relationship               Husband 1000  160  16.0    3
## 41   relationship             Unmarried 1000  159  15.9    4
## 42   relationship        Other-relative 1000  152  15.2    5
## 43   relationship             Own-child 1000  150  15.0    6
## 44   relationship                  <NA> 1000   50   5.0    7
## 45           race                 White 1000  194  19.4    1
## 46           race    Amer-Indian-Eskimo 1000  193  19.3    2
## 47           race                 Other 1000  189  18.9    3
## 48           race    Asian-Pac-Islander 1000  187  18.7    4
## 49           race                 Black 1000  187  18.7    4
## 50           race                  <NA> 1000   50   5.0    6
## 51            sex                Female 1000  499  49.9    1
## 52            sex                  Male 1000  451  45.1    2
## 53            sex                  <NA> 1000   50   5.0    3
## 54 native_country                  <NA> 1000   50   5.0    1
## 55 native_country    Dominican-Republic 1000   32   3.2    2
## 56 native_country               Hungary 1000   32   3.2    2
## 57 native_country                  Laos 1000   31   3.1    4
## 58 native_country              Portugal 1000   31   3.1    4
## 59 native_country                 India 1000   30   3.0    6
## 60 native_country              Cambodia 1000   27   2.7    7
## 61 native_country               Ecuador 1000   27   2.7    7
## 62 native_country                  Hong 1000   27   2.7    7
## 63 native_country               Jamaica 1000   27   2.7    7
## 64         income                 <=50K 1000  776  77.6    1
## 65         income                  >50K 1000  224  22.4    2

diagnose_numeric(sample_data_with_NA)

##        variables min       Q1        mean  median    Q3   max zero minus
## 1            age  18    36.00    53.81684    53.0    71    90    0     0
## 2  education_num   1     4.00     8.34500     9.0    12    16    0     0
## 3   capital_gain  20 25714.75 49651.00947 50639.0 74052 99960    0     0
## 4   capital_loss  72 23732.25 48707.83158 48896.5 73255 99944    0     0
## 5 hours_per_week   1    23.00    48.37684    48.0    72    99    0     0
##   outlier
## 1       0
## 2       0
## 3       0
## 4       0
## 5       0

plot_na_pareto(sample_data_with_NA)

plot_na_hclust(sample_data_with_NA)

plot_na_intersect(sample_data_with_NA)

 #diagnose_paged_report(sample_data_with_NA)
# this report is in First diagnosis.pdf
 #eda_paged_report(sample_data_with_NA)
# this report is in First eda.pdf

NA’S In Numeric Variables

#how many observations contain NA's in numeric variables

numerical <-sample_data_with_NA %>%
  select_if(is.numeric) 

numerical %>%
  miss_case_table()

## # A tibble: 4 × 3
##   n_miss_in_case n_cases pct_cases
##            <int>   <int>     <dbl>
## 1              0     817      81.7
## 2              1     167      16.7
## 3              2      15       1.5
## 4              3       1       0.1

howmany <- numerical %>% 
  find_na(index = TRUE) %>%
    length()

gg_miss_upset(numerical, nsets = howmany)

Based on variable age check how different methods of imputation work

#preview different methods on age variable


data_mean <- numerical %>%
  imputate_na(age, method="mean")

data_mode <- numerical %>%
  imputate_na(age, method="mode")

data_median <- numerical %>%
  imputate_na(age, method="median")

data_mice <- numerical %>%
  imputate_na(age, c("hours_per_week", "capital_gain", "capital_loss"), method="mice")

## 
##  iter imp variable
##   1   1  age
##   1   2  age
##   1   3  age
##   1   4  age
##   1   5  age
##   2   1  age
##   2   2  age
##   2   3  age
##   2   4  age
##   2   5  age
##   3   1  age
##   3   2  age
##   3   3  age
##   3   4  age
##   3   5  age
##   4   1  age
##   4   2  age
##   4   3  age
##   4   4  age
##   4   5  age
##   5   1  age
##   5   2  age
##   5   3  age
##   5   4  age
##   5   5  age

data_rpart <- numerical %>%
  imputate_na(age, c("hours_per_week", "capital_gain", "capital_loss"), method="rpart")

#check different inputting missing values methods and choose most fitting one

diagnose_numeric(numerical)

##        variables min       Q1        mean  median    Q3   max zero minus
## 1            age  18    36.00    53.81684    53.0    71    90    0     0
## 2  education_num   1     4.00     8.34500     9.0    12    16    0     0
## 3   capital_gain  20 25714.75 49651.00947 50639.0 74052 99960    0     0
## 4   capital_loss  72 23732.25 48707.83158 48896.5 73255 99944    0     0
## 5 hours_per_week   1    23.00    48.37684    48.0    72    99    0     0
##   outlier
## 1       0
## 2       0
## 3       0
## 4       0
## 5       0

#mode
data_mode <- numerical %>%
  impute_mode()

#mean
data_mean <- numerical %>%
  impute_mean_all()

#median
data_median <- numerical %>%
  impute_median_all()

#mice method
data_mice <- numerical %>%
  mice(print=FALSE) 


data_mice <- complete(data_mice)

##        variables min       Q1      mean  median       Q3   max zero minus
## 1            age  18    35.75    53.814    53.0    71.00    90    0     0
## 2  education_num   1     4.00     8.345     9.0    12.00    16    0     0
## 3   capital_gain  20 25628.25 49262.297 50309.5 73141.75 99960    0     0
## 4   capital_loss  72 24544.75 48940.852 49095.0 73355.25 99944    0     0
## 5 hours_per_week   1    23.00    48.365    47.0    72.00    99    0     0
##   outlier
## 1       0
## 2       0
## 3       0
## 4       0
## 5       0

##        variables min       Q1        mean      median       Q3   max zero minus
## 1            age  18    37.00    53.81684    53.81684    70.00    90    0     0
## 2  education_num   1     4.00     8.34500     9.00000    12.00    16    0     0
## 3   capital_gain  20 27438.75 49651.00947 49651.00947 72652.25 99960    0     0
## 4   capital_loss  72 25649.50 48707.83158 48707.83158 71802.25 99944    0     0
## 5 hours_per_week   1    24.75    48.37684    48.37684    71.00    99    0     0
##   outlier
## 1       0
## 2       0
## 3       0
## 4       0
## 5       0

##        variables min       Q1      mean  median       Q3   max zero minus
## 1            age  18    37.00    53.776    53.0    70.00    90    0     0
## 2  education_num   1     4.00     8.345     9.0    12.00    16    0     0
## 3   capital_gain  20 27438.75 49700.409 50639.0 72652.25 99960    0     0
## 4   capital_loss  72 25649.50 48717.265 48896.5 71802.25 99944    0     0
## 5 hours_per_week   1    24.75    48.358    48.0    71.00    99    0     0
##   outlier
## 1       0
## 2       0
## 3       0
## 4       0
## 5       0

##        variables min       Q1        mean  median       Q3        max zero
## 1            age  18    37.00    56.39257    55.0    75.00   105.3315    0
## 2  education_num   1     4.00     8.34500     9.0    12.00    16.0000    0
## 3   capital_gain  20 22805.00 47173.72557 47667.0 72652.25 99960.0000    0
## 4   capital_loss  72 19684.50 46277.70657 45288.5 71802.25 99944.0000    0
## 5 hours_per_week   1    24.75    51.22457    50.0    76.00   105.3315    0
##   minus outlier
## 1     0       0
## 2     0       0
## 3     0       0
## 4     0       0
## 5     0       0

NA’s In Categorical Variables

category <- sample_data_with_some_NA %>%
  select_if(is.character)

how_many <- category %>% 
  find_na(index = TRUE) %>%
    length()

diagnose_category(category)

##         variables                levels    N freq ratio rank
## 1       workclass          Self-emp-inc 1000  134  13.4    1
## 2       workclass               Private 1000  130  13.0    2
## 3       workclass          Never-worked 1000  124  12.4    3
## 4       workclass             State-gov 1000  116  11.6    4
## 5       workclass           Without-pay 1000  115  11.5    5
## 6       workclass      Self-emp-not-inc 1000  114  11.4    6
## 7       workclass             Local-gov 1000  111  11.1    7
## 8       workclass           Federal-gov 1000  106  10.6    8
## 9       workclass                  <NA> 1000   50   5.0    9
## 10      education               5th-6th 1000   78   7.8    1
## 11      education               Masters 1000   68   6.8    2
## 12      education                  12th 1000   66   6.6    3
## 13      education               HS-grad 1000   66   6.6    3
## 14      education           Prof-school 1000   63   6.3    5
## 15      education                   9th 1000   62   6.2    6
## 16      education                  10th 1000   60   6.0    7
## 17      education               1st-4th 1000   60   6.0    7
## 18      education          Some-college 1000   58   5.8    9
## 19      education            Assoc-acdm 1000   57   5.7   10
## 20 marital_status              Divorced 1000  155  15.5    1
## 21 marital_status Married-spouse-absent 1000  139  13.9    2
## 22 marital_status     Married-AF-spouse 1000  136  13.6    3
## 23 marital_status             Separated 1000  135  13.5    4
## 24 marital_status    Married-civ-spouse 1000  129  12.9    5
## 25 marital_status         Never-married 1000  129  12.9    5
## 26 marital_status               Widowed 1000  127  12.7    7
## 27 marital_status                  <NA> 1000   50   5.0    8
## 28     occupation          Tech-support 1000   87   8.7    1
## 29     occupation     Machine-op-inspct 1000   80   8.0    2
## 30     occupation          Armed-Forces 1000   79   7.9    3
## 31     occupation          Craft-repair 1000   75   7.5    4
## 32     occupation     Handlers-cleaners 1000   69   6.9    5
## 33     occupation          Adm-clerical 1000   67   6.7    6
## 34     occupation       Exec-managerial 1000   67   6.7    6
## 35     occupation                 Sales 1000   64   6.4    8
## 36     occupation       Protective-serv 1000   63   6.3    9
## 37     occupation      Transport-moving 1000   63   6.3    9
## 38   relationship                  Wife 1000  167  16.7    1
## 39   relationship         Not-in-family 1000  162  16.2    2
## 40   relationship               Husband 1000  160  16.0    3
## 41   relationship             Unmarried 1000  159  15.9    4
## 42   relationship        Other-relative 1000  152  15.2    5
## 43   relationship             Own-child 1000  150  15.0    6
## 44   relationship                  <NA> 1000   50   5.0    7
## 45           race                 White 1000  194  19.4    1
## 46           race    Amer-Indian-Eskimo 1000  193  19.3    2
## 47           race                 Other 1000  189  18.9    3
## 48           race    Asian-Pac-Islander 1000  187  18.7    4
## 49           race                 Black 1000  187  18.7    4
## 50           race                  <NA> 1000   50   5.0    6
## 51            sex                Female 1000  499  49.9    1
## 52            sex                  Male 1000  451  45.1    2
## 53            sex                  <NA> 1000   50   5.0    3
## 54 native_country                  <NA> 1000   50   5.0    1
## 55 native_country    Dominican-Republic 1000   32   3.2    2
## 56 native_country               Hungary 1000   32   3.2    2
## 57 native_country                  Laos 1000   31   3.1    4
## 58 native_country              Portugal 1000   31   3.1    4
## 59 native_country                 India 1000   30   3.0    6
## 60 native_country              Cambodia 1000   27   2.7    7
## 61 native_country               Ecuador 1000   27   2.7    7
## 62 native_country                  Hong 1000   27   2.7    7
## 63 native_country               Jamaica 1000   27   2.7    7
## 64         income                 <=50K 1000  776  77.6    1
## 65         income                  >50K 1000  224  22.4    2

category %>%
  miss_case_table()

## # A tibble: 4 × 3
##   n_miss_in_case n_cases pct_cases
##            <int>   <int>     <dbl>
## 1              0     657      65.7
## 2              1     288      28.8
## 3              2      53       5.3
## 4              3       2       0.2

gg_miss_upset(category, nsets = how_many)

Impute using k-nearest neighbours method

clean_data_ <- kNN(sample_data_with_some_NA)

clean_data <- clean_data_ %>%
  select(where(~ !is.logical(.)))

diagnose_category(clean_data)

##         variables                levels    N freq ratio rank
## 1       workclass               Private 1000  138  13.8    1
## 2       workclass          Self-emp-inc 1000  138  13.8    1
## 3       workclass          Never-worked 1000  132  13.2    3
## 4       workclass             State-gov 1000  123  12.3    4
## 5       workclass             Local-gov 1000  119  11.9    5
## 6       workclass      Self-emp-not-inc 1000  119  11.9    5
## 7       workclass           Without-pay 1000  119  11.9    5
## 8       workclass           Federal-gov 1000  112  11.2    8
## 9       education               5th-6th 1000   83   8.3    1
## 10      education               Masters 1000   73   7.3    2
## 11      education                  12th 1000   71   7.1    3
## 12      education                   9th 1000   67   6.7    4
## 13      education               HS-grad 1000   67   6.7    4
## 14      education               1st-4th 1000   66   6.6    6
## 15      education           Prof-school 1000   65   6.5    7
## 16      education                  10th 1000   64   6.4    8
## 17      education          Some-college 1000   62   6.2    9
## 18      education            Assoc-acdm 1000   59   5.9   10
## 19 marital_status              Divorced 1000  170  17.0    1
## 20 marital_status Married-spouse-absent 1000  147  14.7    2
## 21 marital_status     Married-AF-spouse 1000  141  14.1    3
## 22 marital_status             Separated 1000  140  14.0    4
## 23 marital_status    Married-civ-spouse 1000  138  13.8    5
## 24 marital_status         Never-married 1000  134  13.4    6
## 25 marital_status               Widowed 1000  130  13.0    7
## 26     occupation          Tech-support 1000   96   9.6    1
## 27     occupation     Machine-op-inspct 1000   87   8.7    2
## 28     occupation          Armed-Forces 1000   86   8.6    3
## 29     occupation          Craft-repair 1000   79   7.9    4
## 30     occupation     Handlers-cleaners 1000   70   7.0    5
## 31     occupation          Adm-clerical 1000   69   6.9    6
## 32     occupation                 Sales 1000   69   6.9    6
## 33     occupation       Exec-managerial 1000   68   6.8    8
## 34     occupation       Protective-serv 1000   65   6.5    9
## 35     occupation      Transport-moving 1000   65   6.5    9
## 36   relationship               Husband 1000  172  17.2    1
## 37   relationship         Not-in-family 1000  172  17.2    1
## 38   relationship                  Wife 1000  171  17.1    3
## 39   relationship             Unmarried 1000  167  16.7    4
## 40   relationship        Other-relative 1000  162  16.2    5
## 41   relationship             Own-child 1000  156  15.6    6
## 42           race                 White 1000  211  21.1    1
## 43           race    Amer-Indian-Eskimo 1000  201  20.1    2
## 44           race                 Other 1000  199  19.9    3
## 45           race    Asian-Pac-Islander 1000  195  19.5    4
## 46           race                 Black 1000  194  19.4    5
## 47            sex                Female 1000  525  52.5    1
## 48            sex                  Male 1000  475  47.5    2
## 49 native_country    Dominican-Republic 1000   35   3.5    1
## 50 native_country               Hungary 1000   35   3.5    1
## 51 native_country              Portugal 1000   35   3.5    1
## 52 native_country                  Laos 1000   34   3.4    4
## 53 native_country                 India 1000   32   3.2    5
## 54 native_country                  Hong 1000   31   3.1    6
## 55 native_country                  Peru 1000   30   3.0    7
## 56 native_country               Jamaica 1000   29   2.9    8
## 57 native_country              Cambodia 1000   28   2.8    9
## 58 native_country               Ecuador 1000   28   2.8    9
## 59         income                 <=50K 1000  776  77.6    1
## 60         income                  >50K 1000  224  22.4    2

Imputing cathegory variables in Python

import pandas as pd import matplotlib.pyplot as plt

Read CSV file and get some basic info about it

df = pd.read_csv(“output.csv”) df.head()

Lets see the columns where we have missing values df.head().isna().sum()

WORKCLASS

workclass_counts = df[‘workclass’].value_counts(dropna=False) workclass_counts

def plot_data(data, title, xlabel):

# Plotting plt.figure(figsize=(8, 4)) # Set width and height here colors = plt.cm.tab10(range(len(data))) data.plot(kind=‘bar’, color=colors)

plt.xlabel(xlabel) plt.ylabel(‘Count’) plt.title(title) plt.legend()

plt.show()

plot_data(workclass_counts, “Workclass”, “workclass”)

Filling missing workclass values

In order to fill missing values we will use mode.

Calculate the mode of the ‘workclass’ column

workclass_mode = df[‘workclass’].mode()[0]

mode() returns a Series, [0] gets the first value

Fill missing values with the mode df[‘workclass’].fillna(workclass_mode, inplace=True)

workclass_counts = df[‘workclass’].value_counts() plot_data(workclass_counts, “Workclass”, “workclass”)

MARTIAL STATUS

marital_counts = df[‘marital_status’].value_counts(dropna=False) marital_counts

plot_data(marital_counts, “Marital Status”, “marital status”)

Filling missing marital status values

Calculate the mode of the ‘workclass’ column

marital_mode = df[‘marital_status’].mode()[0]

mode() returns a Series, [0] gets the first value

Fill missing values with the mode

df[‘marital_status’].fillna(marital_mode, inplace=True)

marital_counts = df[‘marital_status’].value_counts() plot_data(marital_counts, “Marital Status”, “marital status”)

HOURS PER WEEK

print(f”Missing values = {df[‘hours_per_week’].isna().sum()}“)

Filling missing hours_per_week We will use median to fill missing values

hours_per_week_median = df[‘hours_per_week’].median()

Fill missing values with the median

df[‘hours_per_week’].fillna(hours_per_week_median, inplace=True)

print(f”Missing values = {df[‘hours_per_week’].isna().sum()}“)

NATIVE COUNTRY

print(f”Missing = {df[‘native_country’].isna().sum()}“)

capital_counts = df[‘native_country’].value_counts(dropna=False) plot_data(capital_counts, “Native Country”, “native country”)

Filling native_country

country_mode = df[‘native_country’].mode()[0] # mode() returns a Series, [0] gets the first value

Fill missing values with the mode df[‘native_country’].fillna(country_mode, inplace=True)

Now you can proceed with your analysis Assuming ‘workclass’ is the name of the column you’re interested in

country_counts = df[‘native_country’].value_counts()

country_counts = df[‘native_country’].value_counts(dropna=False) plot_data(capital_counts, “Native Country”, “native country”)

FINAL DATASET

df.head() df.isna().sum()

JASON FILE WITH THIS CODE IS ATTACHED ON ENAUCZANIE

Clean Dataset

miss_case_table(clean_data)

## # A tibble: 1 × 3
##   n_miss_in_case n_cases pct_cases
##            <int>   <int>     <dbl>
## 1              0    1000       100

head(clean_data)

##   age        workclass   education education_num        marital_status
## 1  48        Local-gov     7th-8th             1         Never-married
## 2  68     Never-worked Prof-school             1              Divorced
## 3  31        State-gov   Bachelors             2 Married-spouse-absent
## 4  84     Never-worked     7th-8th            10             Separated
## 5  59          Private        12th            16    Married-civ-spouse
## 6  67 Self-emp-not-inc        10th            12 Married-spouse-absent
##          occupation   relationship               race    sex capital_gain
## 1    Prof-specialty Other-relative              Black   Male         7826
## 2   Priv-house-serv      Unmarried Asian-Pac-Islander Female        84532
## 3   Protective-serv           Wife              Black   Male        35297
## 4 Handlers-cleaners Other-relative Asian-Pac-Islander   Male        27074
## 5 Machine-op-inspct Other-relative              White Female        73785
## 6      Adm-clerical        Husband              Black   Male         2553
##   capital_loss hours_per_week native_country income
## 1        77249             60         Poland   >50K
## 2        60981             73        Jamaica  <=50K
## 3        15093             98        Hungary  <=50K
## 4        40203             32          South  <=50K
## 5         2056              4          Haiti  <=50K
## 6        17031             70       Scotland  <=50K

 #diagnose_paged_report(clean_data)
# this report is in Second diagnosis.pdf
 #eda_paged_report(clean_data) 
# this eda report is in Second report.pdf