Cleansing Data Report

Diagnosis

diagnose(sample_dirty_dataset)

## # A tibble: 14 × 6
##    variables      types   missing_count missing_percent unique_count unique_rate
##    <chr>          <chr>           <int>           <dbl>        <int>       <dbl>
##  1 age            charac…             0               0           74       0.074
##  2 workclass      charac…            50               5            9       0.009
##  3 education      charac…             0               0           17       0.017
##  4 education_num  integer             0               0           16       0.016
##  5 marital_status charac…            50               5            8       0.008
##  6 occupation     charac…             0               0           15       0.015
##  7 relationship   charac…            50               5            7       0.007
##  8 race           charac…             0               0            6       0.006
##  9 sex            charac…            50               5            3       0.003
## 10 capital_gain   numeric             0               0          947       0.947
## 11 capital_loss   numeric             0               0          948       0.948
## 12 hours_per_week integer            50               5          100       0.1  
## 13 native_country charac…            50               5           42       0.042
## 14 income         charac…             0               0            2       0.002

diagnose_category(sample_dirty_dataset)

##         variables                levels    N freq ratio rank
## 1             age            InvalidAge 1000   50   5.0    1
## 2             age                    89 1000   21   2.1    2
## 3             age                    28 1000   20   2.0    3
## 4             age                    52 1000   20   2.0    3
## 5             age                    42 1000   19   1.9    5
## 6             age                    84 1000   19   1.9    5
## 7             age                    34 1000   18   1.8    7
## 8             age                    56 1000   18   1.8    7
## 9             age                    88 1000   18   1.8    7
## 10            age                    26 1000   17   1.7   10
## 11      workclass          Self-emp-inc 1000  134  13.4    1
## 12      workclass               Private 1000  130  13.0    2
## 13      workclass          Never-worked 1000  124  12.4    3
## 14      workclass             State-gov 1000  116  11.6    4
## 15      workclass           Without-pay 1000  115  11.5    5
## 16      workclass      Self-emp-not-inc 1000  114  11.4    6
## 17      workclass             Local-gov 1000  111  11.1    7
## 18      workclass           Federal-gov 1000  106  10.6    8
## 19      workclass                  <NA> 1000   50   5.0    9
## 20      education               5th-6th 1000   78   7.8    1
## 21      education               Masters 1000   68   6.8    2
## 22      education                  12th 1000   66   6.6    3
## 23      education               HS-grad 1000   66   6.6    3
## 24      education           Prof-school 1000   63   6.3    5
## 25      education                   9th 1000   62   6.2    6
## 26      education                  10th 1000   60   6.0    7
## 27      education               1st-4th 1000   60   6.0    7
## 28      education          Some-college 1000   58   5.8    9
## 29      education            Assoc-acdm 1000   57   5.7   10
## 30 marital_status              Divorced 1000  155  15.5    1
## 31 marital_status Married-spouse-absent 1000  139  13.9    2
## 32 marital_status     Married-AF-spouse 1000  136  13.6    3
## 33 marital_status             Separated 1000  135  13.5    4
## 34 marital_status    Married-civ-spouse 1000  129  12.9    5
## 35 marital_status         Never-married 1000  129  12.9    5
## 36 marital_status               Widowed 1000  127  12.7    7
## 37 marital_status                  <NA> 1000   50   5.0    8
## 38     occupation          Tech-support 1000   87   8.7    1
## 39     occupation     Machine-op-inspct 1000   80   8.0    2
## 40     occupation          Armed-Forces 1000   79   7.9    3
## 41     occupation          Craft-repair 1000   75   7.5    4
## 42     occupation     Handlers-cleaners 1000   69   6.9    5
## 43     occupation          Adm-clerical 1000   67   6.7    6
## 44     occupation       Exec-managerial 1000   67   6.7    6
## 45     occupation                 Sales 1000   64   6.4    8
## 46     occupation       Protective-serv 1000   63   6.3    9
## 47     occupation      Transport-moving 1000   63   6.3    9
## 48   relationship                  Wife 1000  167  16.7    1
## 49   relationship         Not-in-family 1000  162  16.2    2
## 50   relationship               Husband 1000  160  16.0    3
## 51   relationship             Unmarried 1000  159  15.9    4
## 52   relationship        Other-relative 1000  152  15.2    5
## 53   relationship             Own-child 1000  150  15.0    6
## 54   relationship                  <NA> 1000   50   5.0    7
## 55           race                 White 1000  194  19.4    1
## 56           race    Amer-Indian-Eskimo 1000  193  19.3    2
## 57           race                 Other 1000  189  18.9    3
## 58           race    Asian-Pac-Islander 1000  187  18.7    4
## 59           race                 Black 1000  187  18.7    4
## 60           race           InvalidRace 1000   50   5.0    6
## 61            sex                Female 1000  499  49.9    1
## 62            sex                  Male 1000  451  45.1    2
## 63            sex                  <NA> 1000   50   5.0    3
## 64 native_country                  <NA> 1000   50   5.0    1
## 65 native_country    Dominican-Republic 1000   32   3.2    2
## 66 native_country               Hungary 1000   32   3.2    2
## 67 native_country                  Laos 1000   31   3.1    4
## 68 native_country              Portugal 1000   31   3.1    4
## 69 native_country                 India 1000   30   3.0    6
## 70 native_country              Cambodia 1000   27   2.7    7
## 71 native_country               Ecuador 1000   27   2.7    7
## 72 native_country                  Hong 1000   27   2.7    7
## 73 native_country               Jamaica 1000   27   2.7    7
## 74         income                 <=50K 1000  776  77.6    1
## 75         income                  >50K 1000  224  22.4    2

diagnose_numeric(sample_dirty_dataset)

##        variables  min      Q1        mean  median       Q3   max zero minus
## 1  education_num    1     4.0     8.34500     9.0    12.00    16    0     0
## 2   capital_gain -999 22805.0 47118.50900 47667.0 72652.25 99960    0    50
## 3   capital_loss -999 19684.5 46222.49000 45288.5 71802.25 99944    0    50
## 4 hours_per_week    1    23.0    48.37684    48.0    72.00    99    0     0
##   outlier
## 1       0
## 2       0
## 3       0
## 4       0

plot_na_pareto(sample_dirty_dataset)

plot_na_hclust(sample_dirty_dataset)

plot_na_intersect(sample_dirty_dataset)

#diagnose_web_report(sample_dirty_dataset)

Here is the link to first diagnosis report There are some variables with missing values and both capital gain and loss have negative values. Also age is not considered as numerical value. Some variables have values “Invalid…” wchich are not visible in diagnose report. So let’s create rules and see how many variables are violating them to have all needed data about our dataset.

workclass_enum <- c("Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov", "Local-gov", "State-gov", "Without-pay", "Never-worked")
education_enum <- c("Bachelors", "Some-college", "11th", "HS-grad", "Prof-school", "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters", "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool")
marital_status_enum <- c("Married-civ-spouse", "Divorced", "Never-married", "Separated", "Widowed", "Married-spouse-absent", "Married-AF-spouse")
occupation_enum <- c("Tech-support", "Craft-repair", "Other-service", "Sales", "Exec-managerial", "Prof-specialty", "Handlers-cleaners", "Machine-op-inspct", "Adm-clerical", "Farming-fishing", "Transport-moving", "Priv-house-serv", "Protective-serv", "Armed-Forces")
relationship_enum <- c("Wife", "Own-child", "Husband", "Not-in-family", "Other-relative", "Unmarried")
race_enum <- c("White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black")
sex_enum <- c("Female", "Male")
native_country_enum <- c("United-States", "Cambodia", "England", "Puerto-Rico", "Canada", "Germany", "Outlying-US(Guam-USVI-etc)", "India", "Japan", "Greece", "South", "China", "Cuba", "Iran", "Honduras", "Philippines", "Italy", "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal", "Ireland", "France", "Dominican-Republic", "Laos", "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary", "Guatemala", "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador", "Trinadad&Tobago", "Peru", "Hong", "Holand-Netherlands")
income_enum <- c("<=50K", ">50K")
RULE <- editset(c("age >= 18", "age <= 90",
                  "workclass %in% workclass_enum",
                  "education %in% education_enum",
                  "education_num >= 1", "education_num <= 16",
                  "marital_status %in% marital_status_enum",
                  "occupation %in% occupation_enum",
                  "relationship %in% relationship_enum",
                  "race %in% race_enum",
                  "sex %in% sex_enum",
                  "capital_gain >= 0", "capital_gain <= 99999",
                  "capital_loss >= 0", "capital_loss <= 99999",
                  "hours_per_week >= 0", "hours_per_week <= 999",
                  "native_country %in% native_country_enum",
                  "income %in% income_enum"))
RULE

## 
## Data model:
## dat1 : education %in% c('10th', '11th', '12th', '1st-4th', '5th-6th', '7th-8th', '9th', 'Assoc-acdm', 'Assoc-voc', 'Bachelors', 'Doctorate', 'HS-grad', 'Masters', 'Preschool', 'Prof-school', 'Some-college')
## dat2 : income %in% c('<=50K', '>50K')
## dat3 : marital_status %in% c('Divorced', 'Married-AF-spouse', 'Married-civ-spouse', 'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed')
## dat4 : native_country %in% c('Cambodia', 'Canada', 'China', 'Columbia', 'Cuba', 'Dominican-Republic', 'Ecuador', 'El-Salvador', 'England', 'France', 'Germany', 'Greece', 'Guatemala', 'Haiti', 'Holand-Netherlands', 'Honduras', 'Hong', 'Hungary', 'India', 'Iran', 'Ireland', 'Italy', 'Jamaica', 'Japan', 'Laos', 'Mexico', 'Nicaragua', 'Outlying-US(Guam-USVI-etc)', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Puerto-Rico', 'Scotland', 'South', 'Taiwan', 'Thailand', 'Trinadad&Tobago', 'United-States', 'Vietnam', 'Yugoslavia')
## dat5 : occupation %in% c('Adm-clerical', 'Armed-Forces', 'Craft-repair', 'Exec-managerial', 'Farming-fishing', 'Handlers-cleaners', 'Machine-op-inspct', 'Other-service', 'Priv-house-serv', 'Prof-specialty', 'Protective-serv', 'Sales', 'Tech-support', 'Transport-moving')
## dat6 : race %in% c('Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White')
## dat7 : relationship %in% c('Husband', 'Not-in-family', 'Other-relative', 'Own-child', 'Unmarried', 'Wife')
## dat8 : sex %in% c('Female', 'Male')
## dat9 : workclass %in% c('Federal-gov', 'Local-gov', 'Never-worked', 'Private', 'Self-emp-inc', 'Self-emp-not-inc', 'State-gov', 'Without-pay') 
## 
## Edit set:
## num1  : 18 <= age
## num2  : age <= 90
## num3  : 1 <= education_num
## num4  : education_num <= 16
## num5  : 0 <= capital_gain
## num6  : capital_gain <= 99999
## num7  : 0 <= capital_loss
## num8  : capital_loss <= 99999
## num9  : 0 <= hours_per_week
## num10 : hours_per_week <= 999

violated <- violatedEdits(RULE, sample_dirty_dataset)
summary(violated)

## Edit violations, 1000 observations, 0 completely missing (0%):
## 
##  editname freq rel
##      num2   50  5%
##      num5   50  5%
##      num7   50  5%
##      dat1   50  5%
##      dat3   50  5%
##      dat4   50  5%
##      dat5   50  5%
##      dat6   50  5%
##      dat7   50  5%
##      dat8   50  5%
##      dat9   50  5%
## 
## Edit violations per record:
## 
##  errors freq   rel
##       0  542 54.2%
##       1  307 30.7%
##       2  118 11.8%
##       3   25  2.5%
##       4    8  0.8%

Changing into NA

Now we can see that more variables are violating rules than it was shown in diagnosis report. Let’s change all values which violate the rules to NA and change age variable to numeric type.

sample_data_with_NA <- sample_dirty_dataset
sample_data_with_NA$age[sample_data_with_NA$age == "InvalidAge"] <- NA
sample_data_with_NA$age <- as.numeric(sample_data_with_NA$age)

sample_data_with_NA$education[sample_data_with_NA$education == "InvalidEducation"] <- NA

sample_data_with_NA$occupation[sample_data_with_NA$occupation == "InvalidOccupation"] <- NA

sample_data_with_NA$race[sample_data_with_NA$race == "InvalidRace"] <- NA

sample_data_with_NA$capital_gain[sample_data_with_NA$capital_gain < 0] <- NA

sample_data_with_NA$capital_loss[sample_data_with_NA$capital_loss < 0] <- NA
summary(sample_data_with_NA)

##       age         workclass          education         education_num   
##  Min.   :18.00   Length:1000        Length:1000        Min.   : 1.000  
##  1st Qu.:36.00   Class :character   Class :character   1st Qu.: 4.000  
##  Median :53.00   Mode  :character   Mode  :character   Median : 9.000  
##  Mean   :53.82                                         Mean   : 8.345  
##  3rd Qu.:71.00                                         3rd Qu.:12.000  
##  Max.   :90.00                                         Max.   :16.000  
##  NA's   :50                                                            
##  marital_status      occupation        relationship           race          
##  Length:1000        Length:1000        Length:1000        Length:1000       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##      sex             capital_gain    capital_loss   hours_per_week 
##  Length:1000        Min.   :   20   Min.   :   72   Min.   : 1.00  
##  Class :character   1st Qu.:25715   1st Qu.:23732   1st Qu.:23.00  
##  Mode  :character   Median :50639   Median :48897   Median :48.00  
##                     Mean   :49651   Mean   :48708   Mean   :48.38  
##                     3rd Qu.:74052   3rd Qu.:73255   3rd Qu.:72.00  
##                     Max.   :99960   Max.   :99944   Max.   :99.00  
##                     NA's   :50      NA's   :50      NA's   :50     
##  native_country        income         
##  Length:1000        Length:1000       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
##

Diagnosis

Let’s now run diagnosis on our dataset.

diagnose(sample_data_with_NA)

## # A tibble: 14 × 6
##    variables      types   missing_count missing_percent unique_count unique_rate
##    <chr>          <chr>           <int>           <dbl>        <int>       <dbl>
##  1 age            numeric            50               5           74       0.074
##  2 workclass      charac…            50               5            9       0.009
##  3 education      charac…            50               5           17       0.017
##  4 education_num  integer             0               0           16       0.016
##  5 marital_status charac…            50               5            8       0.008
##  6 occupation     charac…            50               5           15       0.015
##  7 relationship   charac…            50               5            7       0.007
##  8 race           charac…            50               5            6       0.006
##  9 sex            charac…            50               5            3       0.003
## 10 capital_gain   numeric            50               5          947       0.947
## 11 capital_loss   numeric            50               5          948       0.948
## 12 hours_per_week integer            50               5          100       0.1  
## 13 native_country charac…            50               5           42       0.042
## 14 income         charac…             0               0            2       0.002

diagnose_category(sample_data_with_NA)

##         variables                levels    N freq ratio rank
## 1       workclass          Self-emp-inc 1000  134  13.4    1
## 2       workclass               Private 1000  130  13.0    2
## 3       workclass          Never-worked 1000  124  12.4    3
## 4       workclass             State-gov 1000  116  11.6    4
## 5       workclass           Without-pay 1000  115  11.5    5
## 6       workclass      Self-emp-not-inc 1000  114  11.4    6
## 7       workclass             Local-gov 1000  111  11.1    7
## 8       workclass           Federal-gov 1000  106  10.6    8
## 9       workclass                  <NA> 1000   50   5.0    9
## 10      education               5th-6th 1000   78   7.8    1
## 11      education               Masters 1000   68   6.8    2
## 12      education                  12th 1000   66   6.6    3
## 13      education               HS-grad 1000   66   6.6    3
## 14      education           Prof-school 1000   63   6.3    5
## 15      education                   9th 1000   62   6.2    6
## 16      education                  10th 1000   60   6.0    7
## 17      education               1st-4th 1000   60   6.0    7
## 18      education          Some-college 1000   58   5.8    9
## 19      education            Assoc-acdm 1000   57   5.7   10
## 20 marital_status              Divorced 1000  155  15.5    1
## 21 marital_status Married-spouse-absent 1000  139  13.9    2
## 22 marital_status     Married-AF-spouse 1000  136  13.6    3
## 23 marital_status             Separated 1000  135  13.5    4
## 24 marital_status    Married-civ-spouse 1000  129  12.9    5
## 25 marital_status         Never-married 1000  129  12.9    5
## 26 marital_status               Widowed 1000  127  12.7    7
## 27 marital_status                  <NA> 1000   50   5.0    8
## 28     occupation          Tech-support 1000   87   8.7    1
## 29     occupation     Machine-op-inspct 1000   80   8.0    2
## 30     occupation          Armed-Forces 1000   79   7.9    3
## 31     occupation          Craft-repair 1000   75   7.5    4
## 32     occupation     Handlers-cleaners 1000   69   6.9    5
## 33     occupation          Adm-clerical 1000   67   6.7    6
## 34     occupation       Exec-managerial 1000   67   6.7    6
## 35     occupation                 Sales 1000   64   6.4    8
## 36     occupation       Protective-serv 1000   63   6.3    9
## 37     occupation      Transport-moving 1000   63   6.3    9
## 38   relationship                  Wife 1000  167  16.7    1
## 39   relationship         Not-in-family 1000  162  16.2    2
## 40   relationship               Husband 1000  160  16.0    3
## 41   relationship             Unmarried 1000  159  15.9    4
## 42   relationship        Other-relative 1000  152  15.2    5
## 43   relationship             Own-child 1000  150  15.0    6
## 44   relationship                  <NA> 1000   50   5.0    7
## 45           race                 White 1000  194  19.4    1
## 46           race    Amer-Indian-Eskimo 1000  193  19.3    2
## 47           race                 Other 1000  189  18.9    3
## 48           race    Asian-Pac-Islander 1000  187  18.7    4
## 49           race                 Black 1000  187  18.7    4
## 50           race                  <NA> 1000   50   5.0    6
## 51            sex                Female 1000  499  49.9    1
## 52            sex                  Male 1000  451  45.1    2
## 53            sex                  <NA> 1000   50   5.0    3
## 54 native_country                  <NA> 1000   50   5.0    1
## 55 native_country    Dominican-Republic 1000   32   3.2    2
## 56 native_country               Hungary 1000   32   3.2    2
## 57 native_country                  Laos 1000   31   3.1    4
## 58 native_country              Portugal 1000   31   3.1    4
## 59 native_country                 India 1000   30   3.0    6
## 60 native_country              Cambodia 1000   27   2.7    7
## 61 native_country               Ecuador 1000   27   2.7    7
## 62 native_country                  Hong 1000   27   2.7    7
## 63 native_country               Jamaica 1000   27   2.7    7
## 64         income                 <=50K 1000  776  77.6    1
## 65         income                  >50K 1000  224  22.4    2

diagnose_numeric(sample_data_with_NA)

##        variables min       Q1        mean  median    Q3   max zero minus
## 1            age  18    36.00    53.81684    53.0    71    90    0     0
## 2  education_num   1     4.00     8.34500     9.0    12    16    0     0
## 3   capital_gain  20 25714.75 49651.00947 50639.0 74052 99960    0     0
## 4   capital_loss  72 23732.25 48707.83158 48896.5 73255 99944    0     0
## 5 hours_per_week   1    23.00    48.37684    48.0    72    99    0     0
##   outlier
## 1       0
## 2       0
## 3       0
## 4       0
## 5       0

plot_na_pareto(sample_data_with_NA)

plot_na_hclust(sample_data_with_NA)

plot_na_intersect(sample_data_with_NA)

#diagnose_paged_report(sample_data_with_NA)

Here is the link to second diagnosis report Now we see how many missing values there are in each variable and we can start imputing issing values.

NUMERICAL VARIABLES

How many observations contain NA’s in numeric variables

numerical <-sample_data_with_NA %>%
  select_if(is.numeric) 

numerical %>%
  miss_case_table()

## # A tibble: 4 × 3
##   n_miss_in_case n_cases pct_cases
##            <int>   <int>     <dbl>
## 1              0     817      81.7
## 2              1     167      16.7
## 3              2      15       1.5
## 4              3       1       0.1

howmany <- numerical %>% 
  find_na(index = TRUE) %>%
    length()

gg_miss_upset(numerical, nsets = howmany)

## Preview different methods on age variable

data_mean <- numerical %>%
  imputate_na(age, method="mean")

data_mode <- numerical %>%
  imputate_na(age, method="mode")

data_median <- numerical %>%
  imputate_na(age, method="median")

data_mice <- numerical %>%
  imputate_na(age, c("hours_per_week", "capital_gain", "capital_loss"), method="mice")

## 
##  iter imp variable
##   1   1  age
##   1   2  age
##   1   3  age
##   1   4  age
##   1   5  age
##   2   1  age
##   2   2  age
##   2   3  age
##   2   4  age
##   2   5  age
##   3   1  age
##   3   2  age
##   3   3  age
##   3   4  age
##   3   5  age
##   4   1  age
##   4   2  age
##   4   3  age
##   4   4  age
##   4   5  age
##   5   1  age
##   5   2  age
##   5   3  age
##   5   4  age
##   5   5  age

data_rpart <- numerical %>%
  imputate_na(age, c("hours_per_week", "capital_gain", "capital_loss"), method="rpart")


plot(data_mean)

plot(data_mode)

plot(data_median)

plot(data_mice)

plot(data_rpart)

#check different inputting missing values methods and choose most fitting one

diagnose_numeric(numerical)

##        variables min       Q1        mean  median    Q3   max zero minus
## 1            age  18    36.00    53.81684    53.0    71    90    0     0
## 2  education_num   1     4.00     8.34500     9.0    12    16    0     0
## 3   capital_gain  20 25714.75 49651.00947 50639.0 74052 99960    0     0
## 4   capital_loss  72 23732.25 48707.83158 48896.5 73255 99944    0     0
## 5 hours_per_week   1    23.00    48.37684    48.0    72    99    0     0
##   outlier
## 1       0
## 2       0
## 3       0
## 4       0
## 5       0

#mode
data_mode <- numerical %>%
  impute_mode()

#mean
data_mean <- numerical %>%
  impute_mean_all()

#median
data_median <- numerical %>%
  impute_median_all()

#mice method
data_mice <- numerical %>%
  mice(print=FALSE) 


data_mice <- complete(data_mice)

diagnose_numeric(data_mice)

##        variables min       Q1      mean  median      Q3   max zero minus
## 1            age  18    35.00    53.844    53.0    71.0    90    0     0
## 2  education_num   1     4.00     8.345     9.0    12.0    16    0     0
## 3   capital_gain  20 25690.50 49607.551 50666.0 73745.5 99960    0     0
## 4   capital_loss  72 23768.25 48418.579 48569.5 72827.5 99944    0     0
## 5 hours_per_week   1    23.00    48.585    48.0    73.0    99    0     0
##   outlier
## 1       0
## 2       0
## 3       0
## 4       0
## 5       0

diagnose_numeric(data_mean)

##        variables min       Q1        mean      median       Q3   max zero minus
## 1            age  18    37.00    53.81684    53.81684    70.00    90    0     0
## 2  education_num   1     4.00     8.34500     9.00000    12.00    16    0     0
## 3   capital_gain  20 27438.75 49651.00947 49651.00947 72652.25 99960    0     0
## 4   capital_loss  72 25649.50 48707.83158 48707.83158 71802.25 99944    0     0
## 5 hours_per_week   1    24.75    48.37684    48.37684    71.00    99    0     0
##   outlier
## 1       0
## 2       0
## 3       0
## 4       0
## 5       0

diagnose_numeric(data_median)

##        variables min       Q1      mean  median       Q3   max zero minus
## 1            age  18    37.00    53.776    53.0    70.00    90    0     0
## 2  education_num   1     4.00     8.345     9.0    12.00    16    0     0
## 3   capital_gain  20 27438.75 49700.409 50639.0 72652.25 99960    0     0
## 4   capital_loss  72 25649.50 48717.265 48896.5 71802.25 99944    0     0
## 5 hours_per_week   1    24.75    48.358    48.0    71.00    99    0     0
##   outlier
## 1       0
## 2       0
## 3       0
## 4       0
## 5       0

diagnose_numeric(data_mode)

##        variables min       Q1        mean  median       Q3        max zero
## 1            age  18    37.00    56.39257    55.0    75.00   105.3315    0
## 2  education_num   1     4.00     8.34500     9.0    12.00    16.0000    0
## 3   capital_gain  20 22805.00 47173.72557 47667.0 72652.25 99960.0000    0
## 4   capital_loss  72 19684.50 46277.70657 45288.5 71802.25 99944.0000    0
## 5 hours_per_week   1    24.75    51.22457    50.0    76.00   105.3315    0
##   minus outlier
## 1     0       0
## 2     0       0
## 3     0       0
## 4     0       0
## 5     0       0

Now let’s impute missing numerical values with new ones.

sample_data_with_some_NA <- sample_data_with_NA

sample_data_with_some_NA$age <-data_mice$age
sample_data_with_some_NA$education_num <- data_mice$education_num
sample_data_with_some_NA$capital_gain <- data_mice$capital_gain
sample_data_with_some_NA$capital_loss <- data_mice$capital_loss
sample_data_with_some_NA$hours_per_week <- data_mice$hours_per_week

summary(sample_data_with_some_NA)

##       age         workclass          education         education_num   
##  Min.   :18.00   Length:1000        Length:1000        Min.   : 1.000  
##  1st Qu.:35.00   Class :character   Class :character   1st Qu.: 4.000  
##  Median :53.00   Mode  :character   Mode  :character   Median : 9.000  
##  Mean   :53.84                                         Mean   : 8.345  
##  3rd Qu.:71.00                                         3rd Qu.:12.000  
##  Max.   :90.00                                         Max.   :16.000  
##  marital_status      occupation        relationship           race          
##  Length:1000        Length:1000        Length:1000        Length:1000       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      sex             capital_gain    capital_loss   hours_per_week 
##  Length:1000        Min.   :   20   Min.   :   72   Min.   : 1.00  
##  Class :character   1st Qu.:25691   1st Qu.:23768   1st Qu.:23.00  
##  Mode  :character   Median :50666   Median :48570   Median :48.00  
##                     Mean   :49608   Mean   :48419   Mean   :48.59  
##                     3rd Qu.:73746   3rd Qu.:72828   3rd Qu.:73.00  
##                     Max.   :99960   Max.   :99944   Max.   :99.00  
##  native_country        income         
##  Length:1000        Length:1000       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##

We can see now that no numerical variable has NA.

CATEGORICAL VARIABLES

import pandas as pd
import matplotlib.pyplot as plt

write.csv(sample_data_with_some_NA, file = "output.csv", row.names = FALSE)

Read CSV file and get some basic info about it

df = pd.read_csv("output.csv")
df.head()

##    age     workclass    education  ...  hours_per_week native_country income
## 0   48     Local-gov      7th-8th  ...              60         Poland   >50K
## 1   68           NaN  Prof-school  ...              73            NaN  <=50K
## 2   31     State-gov    Bachelors  ...              98        Hungary  <=50K
## 3   84  Never-worked      7th-8th  ...              62          South  <=50K
## 4   59       Private         12th  ...               4          Haiti  <=50K
## 
## [5 rows x 14 columns]

# Lets see the columns where we have missing values
df.head().isna().sum()

## age               0
## workclass         1
## education         0
## education_num     0
## marital_status    1
## occupation        0
## relationship      0
## race              0
## sex               0
## capital_gain      0
## capital_loss      0
## hours_per_week    0
## native_country    1
## income            0
## dtype: int64

Workclass

workclass_counts = df['workclass'].value_counts(dropna=False)
workclass_counts

## workclass
## Self-emp-inc        134
## Private             130
## Never-worked        124
## State-gov           116
## Without-pay         115
## Self-emp-not-inc    114
## Local-gov           111
## Federal-gov         106
## NaN                  50
## Name: count, dtype: int64

def plot_data(data, title, xlabel):

  # Plotting
  plt.figure(figsize=(8, 4))  # Set width and height here
  colors = plt.cm.tab10(range(len(data)))
  data.plot(kind='bar', color=colors)

  plt.xlabel(xlabel)
  plt.ylabel('Count')
  plt.title(title)
  plt.legend()

  plt.show()

plot_data(workclass_counts, "Workclass", "workclass")

Filling missing workclass values

In order to fill missing values we will use mode.

# Calculate the mode of the 'workclass' column
workclass_mode = df['workclass'].mode()[0]  # mode() returns a Series, [0] gets the first value

# Fill missing values with the mode
df['workclass'].fillna(workclass_mode, inplace=True)

## <string>:3: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
## The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
## 
## For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.

workclass_counts = df['workclass'].value_counts()
plot_data(workclass_counts, "Workclass", "workclass")

Marital Status

marital_counts = df['marital_status'].value_counts(dropna=False)
marital_counts

## marital_status
## Divorced                 155
## Married-spouse-absent    139
## Married-AF-spouse        136
## Separated                135
## Married-civ-spouse       129
## Never-married            129
## Widowed                  127
## NaN                       50
## Name: count, dtype: int64

plot_data(marital_counts, "Marital Status", "marital status")

Filling missing marital status values

# Calculate the mode of the 'workclass' column
marital_mode = df['marital_status'].mode()[0]  # mode() returns a Series, [0] gets the first value

# Fill missing values with the mode
df['marital_status'].fillna(marital_mode, inplace=True)

## <string>:3: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
## The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
## 
## For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.

marital_counts = df['marital_status'].value_counts()
plot_data(marital_counts, "Marital Status", "marital status")

Hours per week

print(f"Missing values = {df['hours_per_week'].isna().sum()}")

## Missing values = 0

Fliing missing hours_per_week

We Will use median to fill missing values

hours_per_week_median = df['hours_per_week'].median()

# Fill missing values with the median
df['hours_per_week'].fillna(hours_per_week_median, inplace=True)

## <string>:3: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
## The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
## 
## For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.

print(f"Missing values = {df['hours_per_week'].isna().sum()}")

## Missing values = 0

Native country

print(f"Missing = {df['native_country'].isna().sum()}")

## Missing = 50

capital_counts = df['native_country'].value_counts(dropna=False)
plot_data(capital_counts, "Native Country", "native country")

Filling native_country

country_mode = df['native_country'].mode()[0]  # mode() returns a Series, [0] gets the first value

# Fill missing values with the mode
df['native_country'].fillna(country_mode, inplace=True)

## <string>:3: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
## The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
## 
## For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


# Now you can proceed with your analysis
# Assuming 'workclass' is the name of the column you're interested in
country_counts = df['native_country'].value_counts()

country_counts = df['native_country'].value_counts(dropna=False)
plot_data(capital_counts, "Native Country", "native country")

Final Dataset

df.head()

##    age     workclass    education  ...  hours_per_week      native_country income
## 0   48     Local-gov      7th-8th  ...              60              Poland   >50K
## 1   68  Self-emp-inc  Prof-school  ...              73  Dominican-Republic  <=50K
## 2   31     State-gov    Bachelors  ...              98             Hungary  <=50K
## 3   84  Never-worked      7th-8th  ...              62               South  <=50K
## 4   59       Private         12th  ...               4               Haiti  <=50K
## 
## [5 rows x 14 columns]