Data Cleaning and Transformation Report

## Sample data

# Load necessary libraries
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Set seed for reproducibility
set.seed(2)

# Generate sample data
n <- 1000  # Number of observations

# Age between 18 and 90
age <- sample(18:90, n, replace = TRUE)

# Introduce errors in age column
age[sample(1:n, 50)] <- "InvalidAge"

# Workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked
workclass <- sample(c("Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov", "Local-gov", "State-gov", "Without-pay", "Never-worked"), n, replace = TRUE)

# Introduce missing values in workclass column
workclass[sample(1:n, 50)] <- NA

# Education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool
education <- sample(c("Bachelors", "Some-college", "11th", "HS-grad", "Prof-school", "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters", "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"), n, replace = TRUE)

# Introduce inconsistencies in education column
education[sample(1:n, 50)] <- "InvalidEducation"

# Education Number: 1-16
education_num <- sample(1:16, n, replace = TRUE)

# Marital Status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse
marital_status <- sample(c("Married-civ-spouse", "Divorced", "Never-married", "Separated", "Widowed", "Married-spouse-absent", "Married-AF-spouse"), n, replace = TRUE)

# Introduce missing values in marital_status column
marital_status[sample(1:n, 50)] <- NA

# Occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces
occupation <- sample(c("Tech-support", "Craft-repair", "Other-service", "Sales", "Exec-managerial", "Prof-specialty", "Handlers-cleaners", "Machine-op-inspct", "Adm-clerical", "Farming-fishing", "Transport-moving", "Priv-house-serv", "Protective-serv", "Armed-Forces"), n, replace = TRUE)

# Introduce errors in occupation column
occupation[sample(1:n, 50)] <- "InvalidOccupation"

# Relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried
relationship <- sample(c("Wife", "Own-child", "Husband", "Not-in-family", "Other-relative", "Unmarried"), n, replace = TRUE)

# Introduce missing values in relationship column
relationship[sample(1:n, 50)] <- NA

# Race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black
race <- sample(c("White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"), n, replace = TRUE)

# Introduce inconsistencies in race column
race[sample(1:n, 50)] <- "InvalidRace"

# Sex: Female, Male
sex <- sample(c("Female", "Male"), n, replace = TRUE)

# Introduce missing values in sex column
sex[sample(1:n, 50)] <- NA

# Capital Gain: 0-99999
capital_gain <- sample(0:99999, n, replace = TRUE)

# Introduce errors in capital_gain column
capital_gain[sample(1:n, 50)] <- -999

# Capital Loss: 0-99999
capital_loss <- sample(0:99999, n, replace = TRUE)

# Introduce errors in capital_loss column
capital_loss[sample(1:n, 50)] <- -999

# Hours per week: 1-99
hours_per_week <- sample(1:99, n, replace = TRUE)

# Introduce missing values in hours_per_week column
hours_per_week[sample(1:n, 50)] <- NA

# Native country: United-States, Cambodia, England, ...
native_country <- sample(c("United-States", "Cambodia", "England", "Puerto-Rico", "Canada", "Germany", "Outlying-US(Guam-USVI-etc)", "India", "Japan", "Greece", "South", "China", "Cuba", "Iran", "Honduras", "Philippines", "Italy", "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal", "Ireland", "France", "Dominican-Republic", "Laos", "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary", "Guatemala", "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador", "Trinadad&Tobago", "Peru", "Hong", "Holand-Netherlands"), n, replace = TRUE)

# Introduce missing values in native_country column
native_country[sample(1:n, 50)] <- NA

# Income: <=50K, >50K
income <- sample(c("<=50K", ">50K"), n, replace = TRUE, prob = c(0.75, 0.25))

# Create dataframe
sample_dirty_dataset <- data.frame(age, workclass, education, education_num, marital_status, occupation, relationship, race, sex, capital_gain, capital_loss, hours_per_week, native_country, income)
# Step 1
# Print first few rows of the dataset
head(sample_dirty_dataset)

##          age        workclass   education education_num     marital_status
## 1         87             <NA>        10th             6           Divorced
## 2         23        Local-gov Prof-school             3            Widowed
## 3 InvalidAge        Local-gov     Masters             9          Separated
## 4         25 Self-emp-not-inc     Masters             8 Married-civ-spouse
## 5         34     Self-emp-inc     5th-6th            14          Separated
## 6         58     Self-emp-inc Prof-school             6            Widowed
##          occupation  relationship               race    sex capital_gain
## 1 InvalidOccupation       Husband              Black Female        57594
## 2 InvalidOccupation Not-in-family              Black   Male        49733
## 3 Handlers-cleaners          Wife Amer-Indian-Eskimo Female        87495
## 4      Tech-support     Unmarried              White   Male        58211
## 5      Armed-Forces     Unmarried              Black Female        46277
## 6      Armed-Forces     Own-child Asian-Pac-Islander   Male        38858
##   capital_loss hours_per_week  native_country income
## 1        73012             31        Columbia   >50K
## 2         9818             91 Trinadad&Tobago  <=50K
## 3        14342              6         Jamaica  <=50K
## 4        65478             76     El-Salvador   >50K
## 5        19906             38          Taiwan  <=50K
## 6        87676             74         Ireland  <=50K

Step 1, 2

# Load necessary libraries
library(tidyverse)

# Load the sample dataset
data <- sample_dirty_dataset

# Examine variables and their data types
str(data)

## 'data.frame':    1000 obs. of  14 variables:
##  $ age           : chr  "87" "23" "InvalidAge" "25" ...
##  $ workclass     : chr  NA "Local-gov" "Local-gov" "Self-emp-not-inc" ...
##  $ education     : chr  "10th" "Prof-school" "Masters" "Masters" ...
##  $ education_num : int  6 3 9 8 14 6 1 4 10 14 ...
##  $ marital_status: chr  "Divorced" "Widowed" "Separated" "Married-civ-spouse" ...
##  $ occupation    : chr  "InvalidOccupation" "InvalidOccupation" "Handlers-cleaners" "Tech-support" ...
##  $ relationship  : chr  "Husband" "Not-in-family" "Wife" "Unmarried" ...
##  $ race          : chr  "Black" "Black" "Amer-Indian-Eskimo" "White" ...
##  $ sex           : chr  "Female" "Male" "Female" "Male" ...
##  $ capital_gain  : num  57594 49733 87495 58211 46277 ...
##  $ capital_loss  : num  73012 9818 14342 65478 19906 ...
##  $ hours_per_week: int  31 91 6 76 38 74 26 12 56 51 ...
##  $ native_country: chr  "Columbia" "Trinadad&Tobago" "Jamaica" "El-Salvador" ...
##  $ income        : chr  ">50K" "<=50K" "<=50K" ">50K" ...

# Check for missing values
missing_values <- colSums(is.na(data))
variables_with_missing <- names(missing_values[missing_values > 0])

# Identify variables with errors or inconsistencies
variables_with_errors <- c("age", "education", "occupation", "race")

# Print variables with missing values and errors
print("Variables with Missing Values:")

## [1] "Variables with Missing Values:"

print(variables_with_missing)

## [1] "workclass"      "marital_status" "relationship"   "sex"           
## [5] "hours_per_week" "native_country"

print("Variables with Errors or Inconsistencies:")

## [1] "Variables with Errors or Inconsistencies:"

print(variables_with_errors)

## [1] "age"        "education"  "occupation" "race"

Steps 3,4,5

library(mice)

## 
## Dołączanie pakietu: 'mice'

## Następujący obiekt został zakryty z 'package:stats':
## 
##     filter

## Następujące obiekty zostały zakryte z 'package:base':
## 
##     cbind, rbind

library(dplyr)
library(naniar)

data <- sample_dirty_dataset

# Correct invalid ages
data$age <- ifelse(data$age == "InvalidAge", NA, as.integer(data$age))

## Warning in ifelse(data$age == "InvalidAge", NA, as.integer(data$age)): pojawiły
## się wartości NA na skutek przekształcenia

# Correct negative capital gains or losses
data$capital_gain <- ifelse(data$capital_gain < 0, NA, data$capital_gain)
data$capital_loss <- ifelse(data$capital_loss < 0, NA, data$capital_loss)

# Correct incorrectly spelled or invalid values in categorical variables 
data$occupation <- ifelse(data$occupation == "InvalidOccupation", NA, data$occupation)
data$education <- ifelse(data$education == "InvalidEducation", NA, data$education)

# Step 4 
data$education <- ifelse(data$education %in% c("11th", "12th", "1st-4th", "5th-6th", "7th-8th", "9th", "10th", "Preschool"),
                         "InvalidEducation", data$education)
data$race <- ifelse(data$race == "InvalidRace", "InvalidRace", data$race)

# changing from invalid to NA
data$education <- ifelse(data$education == "InvalidEducation", NA, data$education)
data$race <- ifelse(data$race == "InvalidRace", NA, data$race)

#Step 5

# Recognize missingness patterns
missing_pattern <- md.pattern(data)

print("Missingness Patterns:")

## [1] "Missingness Patterns:"

print(missing_pattern)

##     education_num income age workclass marital_status occupation relationship
## 278             1      1   1         1              1          1            1
## 286             1      1   1         1              1          1            1
## 13              1      1   1         1              1          1            1
## 16              1      1   1         1              1          1            1
## 16              1      1   1         1              1          1            1
## 18              1      1   1         1              1          1            1
## 1               1      1   1         1              1          1            1
## 16              1      1   1         1              1          1            1
## 19              1      1   1         1              1          1            1
## 1               1      1   1         1              1          1            1
## 17              1      1   1         1              1          1            1
## 14              1      1   1         1              1          1            1
## 1               1      1   1         1              1          1            1
## 1               1      1   1         1              1          1            1
## 2               1      1   1         1              1          1            1
## 1               1      1   1         1              1          1            1
## 1               1      1   1         1              1          1            1
## 12              1      1   1         1              1          1            1
## 13              1      1   1         1              1          1            1
## 1               1      1   1         1              1          1            1
## 1               1      1   1         1              1          1            1
## 2               1      1   1         1              1          1            1
## 1               1      1   1         1              1          1            1
## 1               1      1   1         1              1          1            1
## 19              1      1   1         1              1          1            1
## 13              1      1   1         1              1          1            1
## 1               1      1   1         1              1          1            1
## 1               1      1   1         1              1          1            1
## 2               1      1   1         1              1          1            1
## 1               1      1   1         1              1          1            1
## 1               1      1   1         1              1          1            1
## 1               1      1   1         1              1          1            1
## 1               1      1   1         1              1          1            1
## 11              1      1   1         1              1          1            0
## 19              1      1   1         1              1          1            0
## 1               1      1   1         1              1          1            0
## 2               1      1   1         1              1          1            0
## 1               1      1   1         1              1          1            0
## 2               1      1   1         1              1          1            0
## 3               1      1   1         1              1          1            0
## 1               1      1   1         1              1          1            0
## 2               1      1   1         1              1          1            0
## 1               1      1   1         1              1          1            0
## 1               1      1   1         1              1          1            0
## 14              1      1   1         1              1          0            1
## 16              1      1   1         1              1          0            1
## 3               1      1   1         1              1          0            1
## 3               1      1   1         1              1          0            1
## 1               1      1   1         1              1          0            1
## 1               1      1   1         1              1          0            1
## 1               1      1   1         1              1          0            1
## 1               1      1   1         1              1          0            1
## 1               1      1   1         1              1          0            1
## 1               1      1   1         1              1          0            0
## 1               1      1   1         1              1          0            0
## 12              1      1   1         1              0          1            1
## 11              1      1   1         1              0          1            1
## 1               1      1   1         1              0          1            1
## 2               1      1   1         1              0          1            1
## 3               1      1   1         1              0          1            1
## 1               1      1   1         1              0          1            1
## 3               1      1   1         1              0          1            1
## 1               1      1   1         1              0          1            1
## 1               1      1   1         1              0          1            1
## 1               1      1   1         1              0          1            1
## 2               1      1   1         1              0          1            1
## 1               1      1   1         1              0          1            0
## 1               1      1   1         1              0          1            0
## 1               1      1   1         1              0          1            0
## 1               1      1   1         1              0          0            1
## 2               1      1   1         1              0          0            1
## 15              1      1   1         0              1          1            1
## 12              1      1   1         0              1          1            1
## 2               1      1   1         0              1          1            1
## 1               1      1   1         0              1          1            1
## 1               1      1   1         0              1          1            1
## 2               1      1   1         0              1          1            1
## 1               1      1   1         0              1          1            1
## 1               1      1   1         0              1          1            1
## 1               1      1   1         0              1          1            1
## 2               1      1   1         0              1          1            1
## 1               1      1   1         0              1          1            1
## 1               1      1   1         0              1          1            1
## 1               1      1   1         0              1          1            0
## 2               1      1   1         0              1          0            1
## 1               1      1   1         0              1          0            1
## 1               1      1   1         0              0          1            1
## 2               1      1   1         0              0          1            1
## 21              1      1   0         1              1          1            1
## 17              1      1   0         1              1          1            1
## 1               1      1   0         1              1          1            1
## 1               1      1   0         1              1          1            1
## 1               1      1   0         1              1          1            1
## 1               1      1   0         1              1          1            1
## 1               1      1   0         1              1          1            1
## 1               1      1   0         1              1          0            1
## 2               1      1   0         1              0          1            1
## 1               1      1   0         1              0          1            1
## 1               1      1   0         0              1          1            1
## 1               1      1   0         0              1          1            1
## 1               1      1   0         0              1          1            1
##                 0      0  50        50             50         50           50
##     race sex capital_gain capital_loss hours_per_week native_country education
## 278    1   1            1            1              1              1         1
## 286    1   1            1            1              1              1         0
## 13     1   1            1            1              1              0         1
## 16     1   1            1            1              1              0         0
## 16     1   1            1            1              0              1         1
## 18     1   1            1            1              0              1         0
## 1      1   1            1            1              0              0         0
## 16     1   1            1            0              1              1         1
## 19     1   1            1            0              1              1         0
## 1      1   1            1            0              1              0         1
## 17     1   1            0            1              1              1         1
## 14     1   1            0            1              1              1         0
## 1      1   1            0            1              1              0         0
## 1      1   1            0            1              0              1         1
## 2      1   1            0            1              0              1         0
## 1      1   1            0            0              1              1         1
## 1      1   1            0            0              1              1         0
## 12     1   0            1            1              1              1         1
## 13     1   0            1            1              1              1         0
## 1      1   0            1            1              0              1         1
## 1      1   0            1            0              1              1         1
## 2      1   0            1            0              1              1         0
## 1      1   0            0            1              1              1         1
## 1      1   0            0            1              1              1         0
## 19     0   1            1            1              1              1         1
## 13     0   1            1            1              1              1         0
## 1      0   1            1            1              1              0         1
## 1      0   1            1            1              1              0         0
## 2      0   1            1            0              1              1         0
## 1      0   1            0            1              1              1         1
## 1      0   1            0            1              1              1         0
## 1      0   0            1            1              1              1         1
## 1      0   0            1            1              1              1         0
## 11     1   1            1            1              1              1         1
## 19     1   1            1            1              1              1         0
## 1      1   1            1            1              1              0         1
## 2      1   1            1            1              0              1         0
## 1      1   1            1            0              1              1         1
## 2      1   1            1            0              1              1         0
## 3      1   1            0            1              1              1         1
## 1      1   0            1            1              1              1         1
## 2      1   0            1            1              1              1         0
## 1      0   1            1            1              1              1         1
## 1      0   1            1            1              1              1         0
## 14     1   1            1            1              1              1         1
## 16     1   1            1            1              1              1         0
## 3      1   1            1            1              1              0         1
## 3      1   1            1            1              1              0         0
## 1      1   1            0            1              1              0         0
## 1      1   0            1            1              1              1         1
## 1      1   0            1            1              1              0         0
## 1      0   1            1            1              1              1         1
## 1      0   1            1            1              1              1         0
## 1      1   1            1            1              1              1         0
## 1      1   1            1            1              1              0         1
## 12     1   1            1            1              1              1         1
## 11     1   1            1            1              1              1         0
## 1      1   1            1            1              1              0         1
## 2      1   1            1            1              0              1         1
## 3      1   1            1            1              0              1         0
## 1      1   1            1            1              0              0         0
## 3      1   0            1            1              1              1         1
## 1      1   0            1            1              1              1         0
## 1      1   0            1            0              1              1         0
## 1      0   1            1            1              1              1         1
## 2      0   0            1            1              1              1         1
## 1      1   1            1            1              1              1         1
## 1      1   1            1            1              1              1         0
## 1      0   1            1            1              1              0         1
## 1      1   1            1            1              1              1         1
## 2      1   1            1            1              1              1         0
## 15     1   1            1            1              1              1         1
## 12     1   1            1            1              1              1         0
## 2      1   1            1            1              1              0         1
## 1      1   1            1            1              0              1         0
## 1      1   1            1            0              1              1         1
## 2      1   1            1            0              1              1         0
## 1      1   1            0            1              1              1         1
## 1      1   1            0            1              1              1         0
## 1      1   0            1            1              1              1         1
## 2      1   0            1            1              1              1         0
## 1      0   1            1            1              1              1         1
## 1      0   1            1            1              1              1         0
## 1      1   1            0            1              1              1         0
## 2      1   1            1            1              1              1         1
## 1      1   1            1            1              1              1         0
## 1      1   1            1            1              1              1         1
## 2      1   1            1            1              1              1         0
## 21     1   1            1            1              1              1         1
## 17     1   1            1            1              1              1         0
## 1      1   1            1            1              1              0         1
## 1      1   1            1            1              0              1         0
## 1      1   1            0            1              1              1         1
## 1      1   0            1            1              1              1         1
## 1      1   0            1            1              1              1         0
## 1      1   1            1            1              0              1         1
## 2      1   1            1            1              1              1         1
## 1      1   1            1            1              1              1         0
## 1      1   1            1            1              1              1         1
## 1      1   1            1            1              1              1         0
## 1      1   1            0            1              1              1         0
##       50  50           50           50             50             50       506
##         
## 278    0
## 286    1
## 13     1
## 16     2
## 16     1
## 18     2
## 1      3
## 16     1
## 19     2
## 1      2
## 17     1
## 14     2
## 1      3
## 1      2
## 2      3
## 1      2
## 1      3
## 12     1
## 13     2
## 1      2
## 1      2
## 2      3
## 1      2
## 1      3
## 19     1
## 13     2
## 1      2
## 1      3
## 2      3
## 1      2
## 1      3
## 1      2
## 1      3
## 11     1
## 19     2
## 1      2
## 2      3
## 1      2
## 2      3
## 3      2
## 1      2
## 2      3
## 1      2
## 1      3
## 14     1
## 16     2
## 3      2
## 3      3
## 1      4
## 1      2
## 1      4
## 1      2
## 1      3
## 1      3
## 1      3
## 12     1
## 11     2
## 1      2
## 2      2
## 3      3
## 1      4
## 3      2
## 1      3
## 1      4
## 1      2
## 2      3
## 1      2
## 1      3
## 1      4
## 1      2
## 2      3
## 15     1
## 12     2
## 2      2
## 1      3
## 1      2
## 2      3
## 1      2
## 1      3
## 1      2
## 2      3
## 1      2
## 1      3
## 1      4
## 2      2
## 1      3
## 1      2
## 2      3
## 21     1
## 17     2
## 1      2
## 1      3
## 1      2
## 1      2
## 1      3
## 1      3
## 2      2
## 1      3
## 1      2
## 1      3
## 1      4
##     1056

# Impute missing values for numerical variables with mean
data <- data %>%
  mutate(across(c(age, capital_gain, capital_loss, hours_per_week), ~ifelse(is.na(.), mean(., na.rm = TRUE), .)))

# Remove observations with missing values for categorical variables 
data <- data %>%
  drop_na(occupation, education, race)

STEP 6

# step 6 outliers for Capial_loss using z_scores, capital_gain using IQR method, and for age using boxplot.
#There are no outliers using standard method so we fixed the value for z_score threshold to show that it works.
  #Load necessary libraries
library(tidyverse)
print("outliers for Capial_loss using z_scores, capital_gain using IQR method, and for age using boxplot. There are no outliers using standard method so we fixed the value for z_score threshold to show that it works.")

## [1] "outliers for Capial_loss using z_scores, capital_gain using IQR method, and for age using boxplot. There are no outliers using standard method so we fixed the value for z_score threshold to show that it works."

  #Examine variables and their data types
str(data)

## 'data.frame':    442 obs. of  14 variables:
##  $ age           : num  54.2 25 58 54.2 80 ...
##  $ workclass     : chr  "Local-gov" "Self-emp-not-inc" "Self-emp-inc" "Private" ...
##  $ education     : chr  "Masters" "Masters" "Prof-school" "Bachelors" ...
##  $ education_num : int  9 8 6 14 12 6 7 8 13 8 ...
##  $ marital_status: chr  "Separated" "Married-civ-spouse" "Widowed" "Divorced" ...
##  $ occupation    : chr  "Handlers-cleaners" "Tech-support" "Armed-Forces" "Priv-house-serv" ...
##  $ relationship  : chr  "Wife" "Unmarried" "Own-child" "Other-relative" ...
##  $ race          : chr  "Amer-Indian-Eskimo" "White" "Asian-Pac-Islander" "White" ...
##  $ sex           : chr  "Female" "Male" "Male" "Female" ...
##  $ capital_gain  : num  87495 58211 38858 19832 74960 ...
##  $ capital_loss  : num  14342 65478 87676 66456 18386 ...
##  $ hours_per_week: num  6 76 74 51 21 33 40 44 6 51 ...
##  $ native_country: chr  "Jamaica" "El-Salvador" "Ireland" "Outlying-US(Guam-USVI-etc)" ...
##  $ income        : chr  "<=50K" ">50K" "<=50K" "<=50K" ...

  # calculate Z-score for capital_loss
z_scores <- scale(data$capital_loss)

# define the threshold
threshold <- 1.5

# identify outliers
capital_loss_outliers <- data$capital_loss[abs(z_scores) > threshold]

# impute outliers for capital_loss using mean imputation
mean_capital_loss <- mean(data$capital_loss, na.rm = TRUE)
data$capital_loss[abs(z_scores) > threshold] <- mean_capital_loss

# print identified and imputed capital_loss outliers
print("Identified and Imputed Capital Loss Outliers using Z-score:")

## [1] "Identified and Imputed Capital Loss Outliers using Z-score:"

print(capital_loss_outliers)

##  [1]  6465 95205  2143 97420  2169  4597 98429 97963  5721  4722  8914  1781
## [13] 95572 95876   134 95258  5933  1482  2593  4073  4603  2809 98513  2766
## [25] 98079 94838 94906 97879  2853  3481  2718  5506  5774 94868  2661  7457
## [37] 98711  2336 97737  7293  7082   357 94946 99084  4788  1108   903    61
## [49]  3443   293  7513  2218 99601 99538 95307  7871  3386   401 94637   368
## [61]  2735  9153 98749

# calculate IQR
Q1 <- quantile(data$capital_gain, 0.25)
Q3 <- quantile(data$capital_gain, 0.75)
IQR <- Q3 - Q1
print("Q1 and Q3")

## [1] "Q1 and Q3"

print(Q1)

##      25% 
## 23997.25

print(Q3)

##      75% 
## 73262.25

# bounds
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
# printing bounds to check
print(lower_bound)

##       25% 
## -49900.25

print(upper_bound)

##      75% 
## 147159.8

# identify outliers
capital_gain_outliers <- data$capital_gain[data$capital_gain < lower_bound | data$capital_gain > upper_bound]

# impute outliers for capital_gain using mean imputation
mean_capital_gain <- mean(data$capital_gain)
data$capital_gain[data$capital_gain < lower_bound | data$capital_gain > upper_bound] <- mean_capital_gain

# print identified and imputed capital_gain outliers
print("Identified and Imputed Capital Gain Outliers using IQR:")

## [1] "Identified and Imputed Capital Gain Outliers using IQR:"

print(capital_gain_outliers)

## numeric(0)

# boxplot visualization for age
ggplot(data, aes(x = "", y = age)) +
  geom_boxplot() +
  labs(title = "Boxplot of Age") +
  theme_minimal()

library(dlookr)

## Registered S3 methods overwritten by 'dlookr':
##   method          from  
##   plot.transform  scales
##   print.transform scales

## 
## Dołączanie pakietu: 'dlookr'

## Następujący obiekt został zakryty z 'package:tidyr':
## 
##     extract

## Następujący obiekt został zakryty z 'package:base':
## 
##     transform

Step 7

#step 7 - stadardization normalziation and bvinng the dataset 
# manual standardization
manual_standardize <- function(x) {
  (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}
# Apply manual standardization to numeric columns in the dataset
data_standardized <- data
numeric_columns <- sapply(data, is.numeric)
data_standardized[, numeric_columns] <- lapply(data_standardized[, numeric_columns], manual_standardize)

# View the standardized dataset
print("standarized data")

## [1] "standarized data"

head(data_standardized)

##           age        workclass   education education_num     marital_status
## 1 -0.01451842        Local-gov     Masters    0.04416864          Separated
## 2 -1.45318295 Self-emp-not-inc     Masters   -0.17767841 Married-civ-spouse
## 3  0.17446531     Self-emp-inc Prof-school   -0.62137252            Widowed
## 4 -0.01451842          Private   Bachelors    1.15340391           Divorced
## 5  1.25956414     Never-worked     HS-grad    0.70970981  Married-AF-spouse
## 6  0.27311065      Federal-gov   Bachelors   -0.62137252  Married-AF-spouse
##          occupation   relationship               race    sex capital_gain
## 1 Handlers-cleaners           Wife Amer-Indian-Eskimo Female   1.33829201
## 2      Tech-support      Unmarried              White   Male   0.31008346
## 3      Armed-Forces      Own-child Asian-Pac-Islander   Male  -0.36943163
## 4   Priv-house-serv Other-relative              White Female  -1.03746523
## 5   Exec-managerial  Not-in-family              White   Male   0.89816791
## 6      Tech-support        Husband Asian-Pac-Islander Female   0.01942951
##   capital_loss hours_per_week             native_country income
## 1   -1.8000696    -1.66421635                    Jamaica  <=50K
## 2    0.5253476     0.89005484                El-Salvador   >50K
## 3    1.5348049     0.81707566                    Ireland  <=50K
## 4    0.5698222    -0.02218487 Outlying-US(Guam-USVI-etc)  <=50K
## 5   -1.6161681    -1.11687252              United-States  <=50K
## 6    1.6155687    -0.67899746                El-Salvador  <=50K

# manual normalization
manual_normalize <- function(x) {
  (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
}
data_normalized <- data
numeric_columns <- sapply(data, is.numeric)
data_normalized[, numeric_columns] <- lapply(data_normalized[, numeric_columns], manual_normalize)

# View the normalized dataset
print("normalzied data")

## [1] "normalzied data"

head(data_normalized)

##          age        workclass   education education_num     marital_status
## 1 0.50233918        Local-gov     Masters     0.5333333          Separated
## 2 0.09722222 Self-emp-not-inc     Masters     0.4666667 Married-civ-spouse
## 3 0.55555556     Self-emp-inc Prof-school     0.3333333            Widowed
## 4 0.50233918          Private   Bachelors     0.8666667           Divorced
## 5 0.86111111     Never-worked     HS-grad     0.7333333  Married-AF-spouse
## 6 0.58333333      Federal-gov   Bachelors     0.3333333  Married-AF-spouse
##          occupation   relationship               race    sex capital_gain
## 1 Handlers-cleaners           Wife Amer-Indian-Eskimo Female    0.8751953
## 2      Tech-support      Unmarried              White   Male    0.5819212
## 3      Armed-Forces      Own-child Asian-Pac-Islander   Male    0.3881044
## 4   Priv-house-serv Other-relative              White Female    0.1975624
## 5   Exec-managerial  Not-in-family              White   Male    0.7496595
## 6      Tech-support        Husband Asian-Pac-Islander Female    0.4990185
##   capital_loss hours_per_week             native_country income
## 1    0.0595338     0.05102041                    Jamaica  <=50K
## 2    0.6636854     0.76530612                El-Salvador   >50K
## 3    0.9259461     0.74489796                    Ireland  <=50K
## 4    0.6752401     0.51020408 Outlying-US(Guam-USVI-etc)  <=50K
## 5    0.1073121     0.20408163              United-States  <=50K
## 6    0.9469288     0.32653061                El-Salvador  <=50K

# Define the breaks for age groups
age_breaks <- c(0, 20, 30, 40, 50, 60, 70, 80, 90, Inf)

# Define the labels for age groups
age_labels <- c("0-20", "21-30", "31-40", "41-50", "51-60", "61-70", "71-80", "81-90", "90+")

# Binning the 'age' variable
data_binned <- data
data_binned$age_group <- cut(data_binned$age, breaks = age_breaks, labels = age_labels, include.lowest = TRUE)

# View the binned dataset
print("binned data")

## [1] "binned data"

head(data_binned)

##        age        workclass   education education_num     marital_status
## 1 54.16842        Local-gov     Masters             9          Separated
## 2 25.00000 Self-emp-not-inc     Masters             8 Married-civ-spouse
## 3 58.00000     Self-emp-inc Prof-school             6            Widowed
## 4 54.16842          Private   Bachelors            14           Divorced
## 5 80.00000     Never-worked     HS-grad            12  Married-AF-spouse
## 6 60.00000      Federal-gov   Bachelors             6  Married-AF-spouse
##          occupation   relationship               race    sex capital_gain
## 1 Handlers-cleaners           Wife Amer-Indian-Eskimo Female        87495
## 2      Tech-support      Unmarried              White   Male        58211
## 3      Armed-Forces      Own-child Asian-Pac-Islander   Male        38858
## 4   Priv-house-serv Other-relative              White Female        19832
## 5   Exec-managerial  Not-in-family              White   Male        74960
## 6      Tech-support        Husband Asian-Pac-Islander Female        49933
##   capital_loss hours_per_week             native_country income age_group
## 1        14342              6                    Jamaica  <=50K     51-60
## 2        65478             76                El-Salvador   >50K     21-30
## 3        87676             74                    Ireland  <=50K     51-60
## 4        66456             51 Outlying-US(Guam-USVI-etc)  <=50K     51-60
## 5        18386             21              United-States  <=50K     71-80
## 6        89452             33                El-Salvador  <=50K     51-60

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)

##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.