## Sample data
# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Set seed for reproducibility
set.seed(2)
# Generate sample data
n <- 1000 # Number of observations
# Age between 18 and 90
age <- sample(18:90, n, replace = TRUE)
# Introduce errors in age column
age[sample(1:n, 50)] <- "InvalidAge"
# Workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked
workclass <- sample(c("Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov", "Local-gov", "State-gov", "Without-pay", "Never-worked"), n, replace = TRUE)
# Introduce missing values in workclass column
workclass[sample(1:n, 50)] <- NA
# Education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool
education <- sample(c("Bachelors", "Some-college", "11th", "HS-grad", "Prof-school", "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters", "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"), n, replace = TRUE)
# Introduce inconsistencies in education column
education[sample(1:n, 50)] <- "InvalidEducation"
# Education Number: 1-16
education_num <- sample(1:16, n, replace = TRUE)
# Marital Status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse
marital_status <- sample(c("Married-civ-spouse", "Divorced", "Never-married", "Separated", "Widowed", "Married-spouse-absent", "Married-AF-spouse"), n, replace = TRUE)
# Introduce missing values in marital_status column
marital_status[sample(1:n, 50)] <- NA
# Occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces
occupation <- sample(c("Tech-support", "Craft-repair", "Other-service", "Sales", "Exec-managerial", "Prof-specialty", "Handlers-cleaners", "Machine-op-inspct", "Adm-clerical", "Farming-fishing", "Transport-moving", "Priv-house-serv", "Protective-serv", "Armed-Forces"), n, replace = TRUE)
# Introduce errors in occupation column
occupation[sample(1:n, 50)] <- "InvalidOccupation"
# Relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried
relationship <- sample(c("Wife", "Own-child", "Husband", "Not-in-family", "Other-relative", "Unmarried"), n, replace = TRUE)
# Introduce missing values in relationship column
relationship[sample(1:n, 50)] <- NA
# Race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black
race <- sample(c("White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"), n, replace = TRUE)
# Introduce inconsistencies in race column
race[sample(1:n, 50)] <- "InvalidRace"
# Sex: Female, Male
sex <- sample(c("Female", "Male"), n, replace = TRUE)
# Introduce missing values in sex column
sex[sample(1:n, 50)] <- NA
# Capital Gain: 0-99999
capital_gain <- sample(0:99999, n, replace = TRUE)
# Introduce errors in capital_gain column
capital_gain[sample(1:n, 50)] <- -999
# Capital Loss: 0-99999
capital_loss <- sample(0:99999, n, replace = TRUE)
# Introduce errors in capital_loss column
capital_loss[sample(1:n, 50)] <- -999
# Hours per week: 1-99
hours_per_week <- sample(1:99, n, replace = TRUE)
# Introduce missing values in hours_per_week column
hours_per_week[sample(1:n, 50)] <- NA
# Native country: United-States, Cambodia, England, ...
native_country <- sample(c("United-States", "Cambodia", "England", "Puerto-Rico", "Canada", "Germany", "Outlying-US(Guam-USVI-etc)", "India", "Japan", "Greece", "South", "China", "Cuba", "Iran", "Honduras", "Philippines", "Italy", "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal", "Ireland", "France", "Dominican-Republic", "Laos", "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary", "Guatemala", "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador", "Trinadad&Tobago", "Peru", "Hong", "Holand-Netherlands"), n, replace = TRUE)
# Introduce missing values in native_country column
native_country[sample(1:n, 50)] <- NA
# Income: <=50K, >50K
income <- sample(c("<=50K", ">50K"), n, replace = TRUE, prob = c(0.75, 0.25))
# Create dataframe
sample_dirty_dataset <- data.frame(age, workclass, education, education_num, marital_status, occupation, relationship, race, sex, capital_gain, capital_loss, hours_per_week, native_country, income)
# Step 1
# Print first few rows of the dataset
head(sample_dirty_dataset)
## age workclass education education_num marital_status
## 1 87 <NA> 10th 6 Divorced
## 2 23 Local-gov Prof-school 3 Widowed
## 3 InvalidAge Local-gov Masters 9 Separated
## 4 25 Self-emp-not-inc Masters 8 Married-civ-spouse
## 5 34 Self-emp-inc 5th-6th 14 Separated
## 6 58 Self-emp-inc Prof-school 6 Widowed
## occupation relationship race sex capital_gain
## 1 InvalidOccupation Husband Black Female 57594
## 2 InvalidOccupation Not-in-family Black Male 49733
## 3 Handlers-cleaners Wife Amer-Indian-Eskimo Female 87495
## 4 Tech-support Unmarried White Male 58211
## 5 Armed-Forces Unmarried Black Female 46277
## 6 Armed-Forces Own-child Asian-Pac-Islander Male 38858
## capital_loss hours_per_week native_country income
## 1 73012 31 Columbia >50K
## 2 9818 91 Trinadad&Tobago <=50K
## 3 14342 6 Jamaica <=50K
## 4 65478 76 El-Salvador >50K
## 5 19906 38 Taiwan <=50K
## 6 87676 74 Ireland <=50K
# Load necessary libraries
library(tidyverse)
# Load the sample dataset
data <- sample_dirty_dataset
# Examine variables and their data types
str(data)
## 'data.frame': 1000 obs. of 14 variables:
## $ age : chr "87" "23" "InvalidAge" "25" ...
## $ workclass : chr NA "Local-gov" "Local-gov" "Self-emp-not-inc" ...
## $ education : chr "10th" "Prof-school" "Masters" "Masters" ...
## $ education_num : int 6 3 9 8 14 6 1 4 10 14 ...
## $ marital_status: chr "Divorced" "Widowed" "Separated" "Married-civ-spouse" ...
## $ occupation : chr "InvalidOccupation" "InvalidOccupation" "Handlers-cleaners" "Tech-support" ...
## $ relationship : chr "Husband" "Not-in-family" "Wife" "Unmarried" ...
## $ race : chr "Black" "Black" "Amer-Indian-Eskimo" "White" ...
## $ sex : chr "Female" "Male" "Female" "Male" ...
## $ capital_gain : num 57594 49733 87495 58211 46277 ...
## $ capital_loss : num 73012 9818 14342 65478 19906 ...
## $ hours_per_week: int 31 91 6 76 38 74 26 12 56 51 ...
## $ native_country: chr "Columbia" "Trinadad&Tobago" "Jamaica" "El-Salvador" ...
## $ income : chr ">50K" "<=50K" "<=50K" ">50K" ...
# Check for missing values
missing_values <- colSums(is.na(data))
variables_with_missing <- names(missing_values[missing_values > 0])
# Identify variables with errors or inconsistencies
variables_with_errors <- c("age", "education", "occupation", "race")
# Print variables with missing values and errors
print("Variables with Missing Values:")
## [1] "Variables with Missing Values:"
print(variables_with_missing)
## [1] "workclass" "marital_status" "relationship" "sex"
## [5] "hours_per_week" "native_country"
print("Variables with Errors or Inconsistencies:")
## [1] "Variables with Errors or Inconsistencies:"
print(variables_with_errors)
## [1] "age" "education" "occupation" "race"
library(mice)
##
## Dołączanie pakietu: 'mice'
## Następujący obiekt został zakryty z 'package:stats':
##
## filter
## Następujące obiekty zostały zakryte z 'package:base':
##
## cbind, rbind
library(dplyr)
library(naniar)
data <- sample_dirty_dataset
# Correct invalid ages
data$age <- ifelse(data$age == "InvalidAge", NA, as.integer(data$age))
## Warning in ifelse(data$age == "InvalidAge", NA, as.integer(data$age)): pojawiły
## się wartości NA na skutek przekształcenia
# Correct negative capital gains or losses
data$capital_gain <- ifelse(data$capital_gain < 0, NA, data$capital_gain)
data$capital_loss <- ifelse(data$capital_loss < 0, NA, data$capital_loss)
# Correct incorrectly spelled or invalid values in categorical variables
data$occupation <- ifelse(data$occupation == "InvalidOccupation", NA, data$occupation)
data$education <- ifelse(data$education == "InvalidEducation", NA, data$education)
# Step 4
data$education <- ifelse(data$education %in% c("11th", "12th", "1st-4th", "5th-6th", "7th-8th", "9th", "10th", "Preschool"),
"InvalidEducation", data$education)
data$race <- ifelse(data$race == "InvalidRace", "InvalidRace", data$race)
# changing from invalid to NA
data$education <- ifelse(data$education == "InvalidEducation", NA, data$education)
data$race <- ifelse(data$race == "InvalidRace", NA, data$race)
#Step 5
# Recognize missingness patterns
missing_pattern <- md.pattern(data)
print("Missingness Patterns:")
## [1] "Missingness Patterns:"
print(missing_pattern)
## education_num income age workclass marital_status occupation relationship
## 278 1 1 1 1 1 1 1
## 286 1 1 1 1 1 1 1
## 13 1 1 1 1 1 1 1
## 16 1 1 1 1 1 1 1
## 16 1 1 1 1 1 1 1
## 18 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 16 1 1 1 1 1 1 1
## 19 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 17 1 1 1 1 1 1 1
## 14 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 12 1 1 1 1 1 1 1
## 13 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 19 1 1 1 1 1 1 1
## 13 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 11 1 1 1 1 1 1 0
## 19 1 1 1 1 1 1 0
## 1 1 1 1 1 1 1 0
## 2 1 1 1 1 1 1 0
## 1 1 1 1 1 1 1 0
## 2 1 1 1 1 1 1 0
## 3 1 1 1 1 1 1 0
## 1 1 1 1 1 1 1 0
## 2 1 1 1 1 1 1 0
## 1 1 1 1 1 1 1 0
## 1 1 1 1 1 1 1 0
## 14 1 1 1 1 1 0 1
## 16 1 1 1 1 1 0 1
## 3 1 1 1 1 1 0 1
## 3 1 1 1 1 1 0 1
## 1 1 1 1 1 1 0 1
## 1 1 1 1 1 1 0 1
## 1 1 1 1 1 1 0 1
## 1 1 1 1 1 1 0 1
## 1 1 1 1 1 1 0 1
## 1 1 1 1 1 1 0 0
## 1 1 1 1 1 1 0 0
## 12 1 1 1 1 0 1 1
## 11 1 1 1 1 0 1 1
## 1 1 1 1 1 0 1 1
## 2 1 1 1 1 0 1 1
## 3 1 1 1 1 0 1 1
## 1 1 1 1 1 0 1 1
## 3 1 1 1 1 0 1 1
## 1 1 1 1 1 0 1 1
## 1 1 1 1 1 0 1 1
## 1 1 1 1 1 0 1 1
## 2 1 1 1 1 0 1 1
## 1 1 1 1 1 0 1 0
## 1 1 1 1 1 0 1 0
## 1 1 1 1 1 0 1 0
## 1 1 1 1 1 0 0 1
## 2 1 1 1 1 0 0 1
## 15 1 1 1 0 1 1 1
## 12 1 1 1 0 1 1 1
## 2 1 1 1 0 1 1 1
## 1 1 1 1 0 1 1 1
## 1 1 1 1 0 1 1 1
## 2 1 1 1 0 1 1 1
## 1 1 1 1 0 1 1 1
## 1 1 1 1 0 1 1 1
## 1 1 1 1 0 1 1 1
## 2 1 1 1 0 1 1 1
## 1 1 1 1 0 1 1 1
## 1 1 1 1 0 1 1 1
## 1 1 1 1 0 1 1 0
## 2 1 1 1 0 1 0 1
## 1 1 1 1 0 1 0 1
## 1 1 1 1 0 0 1 1
## 2 1 1 1 0 0 1 1
## 21 1 1 0 1 1 1 1
## 17 1 1 0 1 1 1 1
## 1 1 1 0 1 1 1 1
## 1 1 1 0 1 1 1 1
## 1 1 1 0 1 1 1 1
## 1 1 1 0 1 1 1 1
## 1 1 1 0 1 1 1 1
## 1 1 1 0 1 1 0 1
## 2 1 1 0 1 0 1 1
## 1 1 1 0 1 0 1 1
## 1 1 1 0 0 1 1 1
## 1 1 1 0 0 1 1 1
## 1 1 1 0 0 1 1 1
## 0 0 50 50 50 50 50
## race sex capital_gain capital_loss hours_per_week native_country education
## 278 1 1 1 1 1 1 1
## 286 1 1 1 1 1 1 0
## 13 1 1 1 1 1 0 1
## 16 1 1 1 1 1 0 0
## 16 1 1 1 1 0 1 1
## 18 1 1 1 1 0 1 0
## 1 1 1 1 1 0 0 0
## 16 1 1 1 0 1 1 1
## 19 1 1 1 0 1 1 0
## 1 1 1 1 0 1 0 1
## 17 1 1 0 1 1 1 1
## 14 1 1 0 1 1 1 0
## 1 1 1 0 1 1 0 0
## 1 1 1 0 1 0 1 1
## 2 1 1 0 1 0 1 0
## 1 1 1 0 0 1 1 1
## 1 1 1 0 0 1 1 0
## 12 1 0 1 1 1 1 1
## 13 1 0 1 1 1 1 0
## 1 1 0 1 1 0 1 1
## 1 1 0 1 0 1 1 1
## 2 1 0 1 0 1 1 0
## 1 1 0 0 1 1 1 1
## 1 1 0 0 1 1 1 0
## 19 0 1 1 1 1 1 1
## 13 0 1 1 1 1 1 0
## 1 0 1 1 1 1 0 1
## 1 0 1 1 1 1 0 0
## 2 0 1 1 0 1 1 0
## 1 0 1 0 1 1 1 1
## 1 0 1 0 1 1 1 0
## 1 0 0 1 1 1 1 1
## 1 0 0 1 1 1 1 0
## 11 1 1 1 1 1 1 1
## 19 1 1 1 1 1 1 0
## 1 1 1 1 1 1 0 1
## 2 1 1 1 1 0 1 0
## 1 1 1 1 0 1 1 1
## 2 1 1 1 0 1 1 0
## 3 1 1 0 1 1 1 1
## 1 1 0 1 1 1 1 1
## 2 1 0 1 1 1 1 0
## 1 0 1 1 1 1 1 1
## 1 0 1 1 1 1 1 0
## 14 1 1 1 1 1 1 1
## 16 1 1 1 1 1 1 0
## 3 1 1 1 1 1 0 1
## 3 1 1 1 1 1 0 0
## 1 1 1 0 1 1 0 0
## 1 1 0 1 1 1 1 1
## 1 1 0 1 1 1 0 0
## 1 0 1 1 1 1 1 1
## 1 0 1 1 1 1 1 0
## 1 1 1 1 1 1 1 0
## 1 1 1 1 1 1 0 1
## 12 1 1 1 1 1 1 1
## 11 1 1 1 1 1 1 0
## 1 1 1 1 1 1 0 1
## 2 1 1 1 1 0 1 1
## 3 1 1 1 1 0 1 0
## 1 1 1 1 1 0 0 0
## 3 1 0 1 1 1 1 1
## 1 1 0 1 1 1 1 0
## 1 1 0 1 0 1 1 0
## 1 0 1 1 1 1 1 1
## 2 0 0 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 0
## 1 0 1 1 1 1 0 1
## 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 0
## 15 1 1 1 1 1 1 1
## 12 1 1 1 1 1 1 0
## 2 1 1 1 1 1 0 1
## 1 1 1 1 1 0 1 0
## 1 1 1 1 0 1 1 1
## 2 1 1 1 0 1 1 0
## 1 1 1 0 1 1 1 1
## 1 1 1 0 1 1 1 0
## 1 1 0 1 1 1 1 1
## 2 1 0 1 1 1 1 0
## 1 0 1 1 1 1 1 1
## 1 0 1 1 1 1 1 0
## 1 1 1 0 1 1 1 0
## 2 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 0
## 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 0
## 21 1 1 1 1 1 1 1
## 17 1 1 1 1 1 1 0
## 1 1 1 1 1 1 0 1
## 1 1 1 1 1 0 1 0
## 1 1 1 0 1 1 1 1
## 1 1 0 1 1 1 1 1
## 1 1 0 1 1 1 1 0
## 1 1 1 1 1 0 1 1
## 2 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 0
## 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 0
## 1 1 1 0 1 1 1 0
## 50 50 50 50 50 50 506
##
## 278 0
## 286 1
## 13 1
## 16 2
## 16 1
## 18 2
## 1 3
## 16 1
## 19 2
## 1 2
## 17 1
## 14 2
## 1 3
## 1 2
## 2 3
## 1 2
## 1 3
## 12 1
## 13 2
## 1 2
## 1 2
## 2 3
## 1 2
## 1 3
## 19 1
## 13 2
## 1 2
## 1 3
## 2 3
## 1 2
## 1 3
## 1 2
## 1 3
## 11 1
## 19 2
## 1 2
## 2 3
## 1 2
## 2 3
## 3 2
## 1 2
## 2 3
## 1 2
## 1 3
## 14 1
## 16 2
## 3 2
## 3 3
## 1 4
## 1 2
## 1 4
## 1 2
## 1 3
## 1 3
## 1 3
## 12 1
## 11 2
## 1 2
## 2 2
## 3 3
## 1 4
## 3 2
## 1 3
## 1 4
## 1 2
## 2 3
## 1 2
## 1 3
## 1 4
## 1 2
## 2 3
## 15 1
## 12 2
## 2 2
## 1 3
## 1 2
## 2 3
## 1 2
## 1 3
## 1 2
## 2 3
## 1 2
## 1 3
## 1 4
## 2 2
## 1 3
## 1 2
## 2 3
## 21 1
## 17 2
## 1 2
## 1 3
## 1 2
## 1 2
## 1 3
## 1 3
## 2 2
## 1 3
## 1 2
## 1 3
## 1 4
## 1056
# Impute missing values for numerical variables with mean
data <- data %>%
mutate(across(c(age, capital_gain, capital_loss, hours_per_week), ~ifelse(is.na(.), mean(., na.rm = TRUE), .)))
# Remove observations with missing values for categorical variables
data <- data %>%
drop_na(occupation, education, race)
# step 6 outliers for Capial_loss using z_scores, capital_gain using IQR method, and for age using boxplot.
#There are no outliers using standard method so we fixed the value for z_score threshold to show that it works.
#Load necessary libraries
library(tidyverse)
print("outliers for Capial_loss using z_scores, capital_gain using IQR method, and for age using boxplot. There are no outliers using standard method so we fixed the value for z_score threshold to show that it works.")
## [1] "outliers for Capial_loss using z_scores, capital_gain using IQR method, and for age using boxplot. There are no outliers using standard method so we fixed the value for z_score threshold to show that it works."
#Examine variables and their data types
str(data)
## 'data.frame': 442 obs. of 14 variables:
## $ age : num 54.2 25 58 54.2 80 ...
## $ workclass : chr "Local-gov" "Self-emp-not-inc" "Self-emp-inc" "Private" ...
## $ education : chr "Masters" "Masters" "Prof-school" "Bachelors" ...
## $ education_num : int 9 8 6 14 12 6 7 8 13 8 ...
## $ marital_status: chr "Separated" "Married-civ-spouse" "Widowed" "Divorced" ...
## $ occupation : chr "Handlers-cleaners" "Tech-support" "Armed-Forces" "Priv-house-serv" ...
## $ relationship : chr "Wife" "Unmarried" "Own-child" "Other-relative" ...
## $ race : chr "Amer-Indian-Eskimo" "White" "Asian-Pac-Islander" "White" ...
## $ sex : chr "Female" "Male" "Male" "Female" ...
## $ capital_gain : num 87495 58211 38858 19832 74960 ...
## $ capital_loss : num 14342 65478 87676 66456 18386 ...
## $ hours_per_week: num 6 76 74 51 21 33 40 44 6 51 ...
## $ native_country: chr "Jamaica" "El-Salvador" "Ireland" "Outlying-US(Guam-USVI-etc)" ...
## $ income : chr "<=50K" ">50K" "<=50K" "<=50K" ...
# calculate Z-score for capital_loss
z_scores <- scale(data$capital_loss)
# define the threshold
threshold <- 1.5
# identify outliers
capital_loss_outliers <- data$capital_loss[abs(z_scores) > threshold]
# impute outliers for capital_loss using mean imputation
mean_capital_loss <- mean(data$capital_loss, na.rm = TRUE)
data$capital_loss[abs(z_scores) > threshold] <- mean_capital_loss
# print identified and imputed capital_loss outliers
print("Identified and Imputed Capital Loss Outliers using Z-score:")
## [1] "Identified and Imputed Capital Loss Outliers using Z-score:"
print(capital_loss_outliers)
## [1] 6465 95205 2143 97420 2169 4597 98429 97963 5721 4722 8914 1781
## [13] 95572 95876 134 95258 5933 1482 2593 4073 4603 2809 98513 2766
## [25] 98079 94838 94906 97879 2853 3481 2718 5506 5774 94868 2661 7457
## [37] 98711 2336 97737 7293 7082 357 94946 99084 4788 1108 903 61
## [49] 3443 293 7513 2218 99601 99538 95307 7871 3386 401 94637 368
## [61] 2735 9153 98749
# calculate IQR
Q1 <- quantile(data$capital_gain, 0.25)
Q3 <- quantile(data$capital_gain, 0.75)
IQR <- Q3 - Q1
print("Q1 and Q3")
## [1] "Q1 and Q3"
print(Q1)
## 25%
## 23997.25
print(Q3)
## 75%
## 73262.25
# bounds
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
# printing bounds to check
print(lower_bound)
## 25%
## -49900.25
print(upper_bound)
## 75%
## 147159.8
# identify outliers
capital_gain_outliers <- data$capital_gain[data$capital_gain < lower_bound | data$capital_gain > upper_bound]
# impute outliers for capital_gain using mean imputation
mean_capital_gain <- mean(data$capital_gain)
data$capital_gain[data$capital_gain < lower_bound | data$capital_gain > upper_bound] <- mean_capital_gain
# print identified and imputed capital_gain outliers
print("Identified and Imputed Capital Gain Outliers using IQR:")
## [1] "Identified and Imputed Capital Gain Outliers using IQR:"
print(capital_gain_outliers)
## numeric(0)
# boxplot visualization for age
ggplot(data, aes(x = "", y = age)) +
geom_boxplot() +
labs(title = "Boxplot of Age") +
theme_minimal()
library(dlookr)
## Registered S3 methods overwritten by 'dlookr':
## method from
## plot.transform scales
## print.transform scales
##
## Dołączanie pakietu: 'dlookr'
## Następujący obiekt został zakryty z 'package:tidyr':
##
## extract
## Następujący obiekt został zakryty z 'package:base':
##
## transform
#step 7 - stadardization normalziation and bvinng the dataset
# manual standardization
manual_standardize <- function(x) {
(x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}
# Apply manual standardization to numeric columns in the dataset
data_standardized <- data
numeric_columns <- sapply(data, is.numeric)
data_standardized[, numeric_columns] <- lapply(data_standardized[, numeric_columns], manual_standardize)
# View the standardized dataset
print("standarized data")
## [1] "standarized data"
head(data_standardized)
## age workclass education education_num marital_status
## 1 -0.01451842 Local-gov Masters 0.04416864 Separated
## 2 -1.45318295 Self-emp-not-inc Masters -0.17767841 Married-civ-spouse
## 3 0.17446531 Self-emp-inc Prof-school -0.62137252 Widowed
## 4 -0.01451842 Private Bachelors 1.15340391 Divorced
## 5 1.25956414 Never-worked HS-grad 0.70970981 Married-AF-spouse
## 6 0.27311065 Federal-gov Bachelors -0.62137252 Married-AF-spouse
## occupation relationship race sex capital_gain
## 1 Handlers-cleaners Wife Amer-Indian-Eskimo Female 1.33829201
## 2 Tech-support Unmarried White Male 0.31008346
## 3 Armed-Forces Own-child Asian-Pac-Islander Male -0.36943163
## 4 Priv-house-serv Other-relative White Female -1.03746523
## 5 Exec-managerial Not-in-family White Male 0.89816791
## 6 Tech-support Husband Asian-Pac-Islander Female 0.01942951
## capital_loss hours_per_week native_country income
## 1 -1.8000696 -1.66421635 Jamaica <=50K
## 2 0.5253476 0.89005484 El-Salvador >50K
## 3 1.5348049 0.81707566 Ireland <=50K
## 4 0.5698222 -0.02218487 Outlying-US(Guam-USVI-etc) <=50K
## 5 -1.6161681 -1.11687252 United-States <=50K
## 6 1.6155687 -0.67899746 El-Salvador <=50K
# manual normalization
manual_normalize <- function(x) {
(x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
}
data_normalized <- data
numeric_columns <- sapply(data, is.numeric)
data_normalized[, numeric_columns] <- lapply(data_normalized[, numeric_columns], manual_normalize)
# View the normalized dataset
print("normalzied data")
## [1] "normalzied data"
head(data_normalized)
## age workclass education education_num marital_status
## 1 0.50233918 Local-gov Masters 0.5333333 Separated
## 2 0.09722222 Self-emp-not-inc Masters 0.4666667 Married-civ-spouse
## 3 0.55555556 Self-emp-inc Prof-school 0.3333333 Widowed
## 4 0.50233918 Private Bachelors 0.8666667 Divorced
## 5 0.86111111 Never-worked HS-grad 0.7333333 Married-AF-spouse
## 6 0.58333333 Federal-gov Bachelors 0.3333333 Married-AF-spouse
## occupation relationship race sex capital_gain
## 1 Handlers-cleaners Wife Amer-Indian-Eskimo Female 0.8751953
## 2 Tech-support Unmarried White Male 0.5819212
## 3 Armed-Forces Own-child Asian-Pac-Islander Male 0.3881044
## 4 Priv-house-serv Other-relative White Female 0.1975624
## 5 Exec-managerial Not-in-family White Male 0.7496595
## 6 Tech-support Husband Asian-Pac-Islander Female 0.4990185
## capital_loss hours_per_week native_country income
## 1 0.0595338 0.05102041 Jamaica <=50K
## 2 0.6636854 0.76530612 El-Salvador >50K
## 3 0.9259461 0.74489796 Ireland <=50K
## 4 0.6752401 0.51020408 Outlying-US(Guam-USVI-etc) <=50K
## 5 0.1073121 0.20408163 United-States <=50K
## 6 0.9469288 0.32653061 El-Salvador <=50K
# Define the breaks for age groups
age_breaks <- c(0, 20, 30, 40, 50, 60, 70, 80, 90, Inf)
# Define the labels for age groups
age_labels <- c("0-20", "21-30", "31-40", "41-50", "51-60", "61-70", "71-80", "81-90", "90+")
# Binning the 'age' variable
data_binned <- data
data_binned$age_group <- cut(data_binned$age, breaks = age_breaks, labels = age_labels, include.lowest = TRUE)
# View the binned dataset
print("binned data")
## [1] "binned data"
head(data_binned)
## age workclass education education_num marital_status
## 1 54.16842 Local-gov Masters 9 Separated
## 2 25.00000 Self-emp-not-inc Masters 8 Married-civ-spouse
## 3 58.00000 Self-emp-inc Prof-school 6 Widowed
## 4 54.16842 Private Bachelors 14 Divorced
## 5 80.00000 Never-worked HS-grad 12 Married-AF-spouse
## 6 60.00000 Federal-gov Bachelors 6 Married-AF-spouse
## occupation relationship race sex capital_gain
## 1 Handlers-cleaners Wife Amer-Indian-Eskimo Female 87495
## 2 Tech-support Unmarried White Male 58211
## 3 Armed-Forces Own-child Asian-Pac-Islander Male 38858
## 4 Priv-house-serv Other-relative White Female 19832
## 5 Exec-managerial Not-in-family White Male 74960
## 6 Tech-support Husband Asian-Pac-Islander Female 49933
## capital_loss hours_per_week native_country income age_group
## 1 14342 6 Jamaica <=50K 51-60
## 2 65478 76 El-Salvador >50K 21-30
## 3 87676 74 Ireland <=50K 51-60
## 4 66456 51 Outlying-US(Guam-USVI-etc) <=50K 51-60
## 5 18386 21 United-States <=50K 71-80
## 6 89452 33 El-Salvador <=50K 51-60
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.