Cleansing2

Sample data

In this part of the code, we create the dataset that we will use in the rest of our report. The set contains 1000 observations with the following variables: age, work class (salary type), education, education number, marital status, occupation, relationship status, race, gender, capital gain and capital loss (family financial situation: income-expense), weekly working hours, nationality (native country), and salary (income). Several of these columns deliberately contain invalid or missing data.

# Load necessary libraries
library(tidyverse)

# Set seed for reproducibility. Every sample() call below draws from the same
# random stream, so reordering, adding or removing calls changes the data.
set.seed(31)

# Generate sample data
n <- 1000  # Number of observations

# Age between 18 and 90
age <- sample(18:90, n, replace = TRUE)

# Introduce errors in age column: 50 random positions receive a text marker.
# Assigning a string coerces the whole vector from integer to character.
age[sample(1:n, 50)] <- "InvalidAge"

# Workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked
workclass <- sample(c("Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov", "Local-gov", "State-gov", "Without-pay", "Never-worked"), n, replace = TRUE)

# Introduce missing values in workclass column (50 random positions)
workclass[sample(1:n, 50)] <- NA

# Education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool
education <- sample(c("Bachelors", "Some-college", "11th", "HS-grad", "Prof-school", "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters", "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"), n, replace = TRUE)

# Introduce inconsistencies in education column (a label outside the valid set)
education[sample(1:n, 50)] <- "InvalidEducation"

# Education Number: 1-16 (drawn independently of the education label above)
education_num <- sample(1:16, n, replace = TRUE)

# Marital Status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse
marital_status <- sample(c("Married-civ-spouse", "Divorced", "Never-married", "Separated", "Widowed", "Married-spouse-absent", "Married-AF-spouse"), n, replace = TRUE)

# Introduce missing values in marital_status column
marital_status[sample(1:n, 50)] <- NA

# Occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces
occupation <- sample(c("Tech-support", "Craft-repair", "Other-service", "Sales", "Exec-managerial", "Prof-specialty", "Handlers-cleaners", "Machine-op-inspct", "Adm-clerical", "Farming-fishing", "Transport-moving", "Priv-house-serv", "Protective-serv", "Armed-Forces"), n, replace = TRUE)

# Introduce errors in occupation column
occupation[sample(1:n, 50)] <- "InvalidOccupation"

# Relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried
relationship <- sample(c("Wife", "Own-child", "Husband", "Not-in-family", "Other-relative", "Unmarried"), n, replace = TRUE)

# Introduce missing values in relationship column
relationship[sample(1:n, 50)] <- NA

# Race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black
race <- sample(c("White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"), n, replace = TRUE)

# Introduce inconsistencies in race column
race[sample(1:n, 50)] <- "InvalidRace"

# Sex: Female, Male
sex <- sample(c("Female", "Male"), n, replace = TRUE)

# Introduce missing values in sex column
sex[sample(1:n, 50)] <- NA

# Capital Gain: 0-99999
capital_gain <- sample(0:99999, n, replace = TRUE)

# Introduce errors in capital_gain column: -999 lies below the 0-99999 range
capital_gain[sample(1:n, 50)] <- -999

# Capital Loss: 0-99999
capital_loss <- sample(0:99999, n, replace = TRUE)

# Introduce errors in capital_loss column: -999 lies below the 0-99999 range
capital_loss[sample(1:n, 50)] <- -999

# Hours per week: 1-99
hours_per_week <- sample(1:99, n, replace = TRUE)

# Introduce missing values in hours_per_week column
hours_per_week[sample(1:n, 50)] <- NA

# Native country: United-States, Cambodia, England, ...
native_country <- sample(c("United-States", "Cambodia", "England", "Puerto-Rico", "Canada", "Germany", "Outlying-US(Guam-USVI-etc)", "India", "Japan", "Greece", "South", "China", "Cuba", "Iran", "Honduras", "Philippines", "Italy", "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal", "Ireland", "France", "Dominican-Republic", "Laos", "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary", "Guatemala", "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador", "Trinadad&Tobago", "Peru", "Hong", "Holand-Netherlands"), n, replace = TRUE)

# Introduce missing values in native_country column
native_country[sample(1:n, 50)] <- NA

# Income: <=50K, >50K (drawn with probabilities 0.75 and 0.25; left without errors)
income <- sample(c("<=50K", ">50K"), n, replace = TRUE, prob = c(0.75, 0.25))

# Create dataframe: one column per vector above, named after the vector
sample_dirty_dataset <- data.frame(age, workclass, education, education_num, marital_status, occupation, relationship, race, sex, capital_gain, capital_loss, hours_per_week, native_country, income)

OUR CODES

2nd Task

It contains two separate functions to determine invalid and missing values in the data set. Performing such pre-processing before performing data analysis increases the accuracy of the analysis results and prevents misleading results.

# Detecting invalid values for all columns in the data set
#
# Arguments:
#   dataset  A data frame. The columns capital_gain and capital_loss are
#            range-checked when present; every column is scanned for the text
#            "Invalid".
# Returns:
#   The names of the columns holding at least one invalid value, without
#   duplicates: range failures first, then text markers in column order.
#   NULL when nothing invalid is found.
check_invalid_values <- function(dataset) {
  invalid_columns <- c()

  # Range check (valid values are 0 to 99999). na.rm = TRUE is required:
  # without it any() returns NA for a column that has missing values but no
  # out-of-range ones, and if (NA) stops with an error. [[ ]] is used instead
  # of $ so that a column name is never partially matched.
  for (col_name in c("capital_gain", "capital_loss")) {
    values <- dataset[[col_name]]
    if (any(values < 0 | values > 99999, na.rm = TRUE)) {
      invalid_columns <- c(invalid_columns, col_name)
    }
  }

  # Text check: grepl() returns FALSE for NA, so no na.rm is needed here
  for (col_name in names(dataset)) {
    if (any(grepl("Invalid", dataset[[col_name]], ignore.case = TRUE))) {
      invalid_columns <- c(invalid_columns, col_name)
    }
  }

  unique(invalid_columns)
}

# Function to find missing values in the data set
# Returns the names of the columns of `dataset` that contain at least one NA.
find_na_columns <- function(dataset) {
  # Number of missing cells in each column
  missing_per_column <- colSums(is.na(dataset))
  names(missing_per_column)[missing_per_column > 0]
}

# Run functions on dataset 'sample_dirty_dataset'
# Both results are vectors of column names kept as top-level variables
invalid_columns <- check_invalid_values(sample_dirty_dataset)
na_columns <- find_na_columns(sample_dirty_dataset)

# Print the results
# cat() writes the label followed by the column names, separated by spaces
cat("Columns containing invalid values:", invalid_columns, "\n")

## Columns containing invalid values: capital_gain capital_loss age education occupation race

cat("Columns with missing values:", na_columns, "\n")

## Columns with missing values: workclass marital_status relationship sex hours_per_week native_country

check_invalid_values Function: This function detects invalid values (values not between 0 and 99999) in the capital_gain and capital_loss columns. Additionally, it loops through all columns and finds values that contain the word “Invalid”. This process identifies specific erroneous data by looking at each column in the data set.

find_na_columns Function: This function counts the missing (NA) values in every column of the data set. It identifies the columns that contain at least one missing value and returns the names of these columns as a character vector.

The check_invalid_values and find_na_columns functions are run on the data set named sample_dirty_dataset to determine the columns containing invalid and missing values, respectively.

The results are printed on the screen and it is reported which columns have problems.

3rd Task

This R code performs data cleaning operations to resolve the issues identified in the previous step. It contains two main functions and one auxiliary function used to clean invalid and erroneous data in the data set.

# Purge invalid values for specific columns in the dataset
# Every entry of `column` that is below `lower_bound` or above `upper_bound`
# is set to NA; the modified copy of `dataset` is returned.
clean_invalid_values <- function(dataset, column, lower_bound, upper_bound) {
  values <- dataset[[column]]
  too_low <- values < lower_bound
  too_high <- values > upper_bound
  # Entries that are already NA give an NA mask value and are left untouched
  dataset[[column]] <- replace(values, too_low | too_high, NA)
  dataset
}

# Clear data containing 'Invalid' in all columns in the dataset
# Any entry whose text contains "Invalid" (in any letter case) is set to NA;
# the modified copy of `dataset` is returned.
clean_invalid_entries <- function(dataset) {
  for (field in colnames(dataset)) {
    values <- dataset[[field]]
    flagged <- grepl("Invalid", values, ignore.case = TRUE)
    dataset[[field]] <- replace(values, flagged, NA)
  }
  dataset
}

# Runs every cleaning step on `dataset` and returns the cleaned copy:
# first the 0-99999 range check on the two capital columns, then the removal
# of entries containing "Invalid" from all columns.
apply_cleaning <- function(dataset) {
  for (bounded_column in c("capital_gain", "capital_loss")) {
    dataset <- clean_invalid_values(dataset, bounded_column, 0, 99999)
  }
  clean_invalid_entries(dataset)
}

# Clean the data set in place (the dirty version is overwritten), then preview
# the first rows of the result
sample_dirty_dataset <- apply_cleaning(sample_dirty_dataset)
head(sample_dirty_dataset)

##   age    workclass    education education_num     marital_status
## 1  62  Without-pay      Masters             5          Separated
## 2  66  Without-pay      1st-4th             6 Married-civ-spouse
## 3  57 Never-worked Some-college            13 Married-civ-spouse
## 4  81    State-gov    Bachelors             2 Married-civ-spouse
## 5  60 Self-emp-inc      1st-4th             5      Never-married
## 6  72 Self-emp-inc      HS-grad            15          Separated
##          occupation  relationship               race    sex capital_gain
## 1   Priv-house-serv          Wife Amer-Indian-Eskimo Female        70407
## 2     Other-service     Own-child              Black Female        42541
## 3     Other-service     Own-child              White   Male        70503
## 4              <NA>     Own-child              White Female        51598
## 5   Farming-fishing Not-in-family               <NA> Female         8284
## 6 Machine-op-inspct          <NA>              White   Male        88182
##   capital_loss hours_per_week             native_country income
## 1           NA              4                      South  <=50K
## 2        37405             59 Outlying-US(Guam-USVI-etc)  <=50K
## 3        78767             34                      India  <=50K
## 4           NA             61                    Ireland  <=50K
## 5        63324             20                    Ecuador  <=50K
## 6           NA             11                       Cuba  <=50K

clean_invalid_values Function: This function replaces values that fall outside the specified lower and upper limits in a given column with NA (missing value). For example, in the capital_gain and capital_loss columns, values that are not between 0 and 99999 are considered invalid and replaced with NA.

clean_invalid_entries Function: This function detects values containing “Invalid” in all columns in the data set and replaces these values with NA. This process traverses all columns in the data set, providing general cleaning and removing specialized invalid entries.

apply_cleaning Function: This helper function applies the cleaning functions defined above to the data set, one after the other. First, invalid value cleaning is performed for the capital_gain and capital_loss columns. The invalid entries in all columns are then purged. Finally, the data set with all cleaning operations applied is returned.

The functions used in this process are aimed at improving data quality by effectively handling incorrect and invalid entries in the data set.

4th Task

This piece of R code performs data cleaning by checking certain columns (education, race, marital_status, and occupation) in the data set named sample_dirty_dataset and replacing values that are outside the defined valid categories with NA (missing value).

# Keep only the known categories in four text columns; anything else becomes NA.
# local() keeps the lookup table and helper out of the global workspace.
sample_dirty_dataset <- local({
  # Accepted values for each checked column
  allowed <- list(
    education = c(
      "Bachelors", "Some-college", "11th", "HS-grad", "Prof-school",
      "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters",
      "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"
    ),
    race = c("White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"),
    marital_status = c(
      "Married-civ-spouse", "Divorced", "Never-married", "Separated",
      "Widowed", "Married-spouse-absent", "Married-AF-spouse"
    ),
    occupation = c(
      "Tech-support", "Craft-repair", "Other-service", "Sales",
      "Exec-managerial", "Prof-specialty", "Handlers-cleaners",
      "Machine-op-inspct", "Adm-clerical", "Farming-fishing",
      "Transport-moving", "Priv-house-serv", "Protective-serv", "Armed-Forces"
    )
  )

  # Value is kept when it is listed for `field`, otherwise replaced with NA
  keep_allowed <- function(values, field) {
    ifelse(values %in% allowed[[field]], values, NA)
  }

  mutate(
    sample_dirty_dataset,
    education = keep_allowed(education, "education"),
    race = keep_allowed(race, "race"),
    marital_status = keep_allowed(marital_status, "marital_status"),
    occupation = keep_allowed(occupation, "occupation")
  )
})

# Preview the four re-validated columns
select(head(sample_dirty_dataset), education, race, marital_status, occupation)

##      education               race     marital_status        occupation
## 1      Masters Amer-Indian-Eskimo          Separated   Priv-house-serv
## 2      1st-4th              Black Married-civ-spouse     Other-service
## 3 Some-college              White Married-civ-spouse     Other-service
## 4    Bachelors              White Married-civ-spouse              <NA>
## 5      1st-4th               <NA>      Never-married   Farming-fishing
## 6      HS-grad              White          Separated Machine-op-inspct

mutate(): This function is used to transform existing columns or add new columns. Here it is used with the ifelse() function to make conditional changes to specific columns.

ifelse(): Used to replace values outside the valid categories specified for each column with NA. For example, for the education column only certain education levels are accepted; all other values are replaced with NA.

Education: Only specified education levels are accepted (e.g. ‘Bachelors’, ‘HS-grad’). Values other than these are replaced with NA.

Race: Only the defined races (such as ‘White’ and ‘Asian-Pac-Islander’) are accepted; all other values are replaced with NA.

Marital Status (marital_status): Only certain marital statuses, such as ‘Married-civ-spouse’ and ‘Divorced’, are accepted; others are replaced with NA.

Occupation: Certain occupations are accepted (such as ‘Tech-support’, ‘Sales’), other values are replaced with NA.

head(): Shows the first six rows of the cleaned data set. This is used to observe the results of the cleaning process.

select(): Used to select the columns to be displayed. The cleared education, race, marital_status, and occupation columns are shown here.

5th Task

This piece of R code fills in missing values in numeric columns in the sample_dirty_dataset dataset and then removes rows with any missing values.

# Fill the gaps in every numeric column with that column's median (the median
# is generally preferred because it is less affected by outliers), then remove
# each row that still has a missing value in any remaining column.
# NOTE(review): only columns that are numeric at this point are imputed; a
# column stored as text (age appears to be one until it is converted in the
# next task) keeps its NAs and those rows are dropped - confirm this is intended.
sample_dirty_dataset <- sample_dirty_dataset %>%
  mutate(across(
    where(is.numeric),
    function(column) ifelse(is.na(column), median(column, na.rm = TRUE), column)
  )) %>%
  drop_na()

across(where(is.numeric), ...) selects all columns that are numeric and applies the specified operation.

median(): Calculates the median value for numeric data. The na.rm = TRUE parameter ensures that missing values are ignored when calculating the median.

drop_na(): Removes all rows with missing values from the data set. This process is important for those who prefer to work only on the full data set in the analysis.

6th Task

This piece of R code is used to detect outliers by performing z-score analysis on the age column in the sample_dirty_dataset dataset.

# METHOD 1

# Convert age to numeric before doing arithmetic on it (the column may still be
# stored as text after the invalid entries were removed)
sample_dirty_dataset$age <- as.numeric(sample_dirty_dataset$age)
# Absolute z-score: distance of each age from the mean in standard deviations
zscore <- abs(scale(sample_dirty_dataset$age))
# type = "n" sets up the axes only; the values are drawn as text labels below
plot(zscore, type = "n", main = "Z-score for Age", xlab = "Index", ylab = "Z-score")

# Draw threshold value (2 standard deviations)
abline(h = 2, col = "red")

# Mark values above the threshold with red
# seq_along() rather than 1:length(): it stays empty for an empty column
# instead of producing the positions c(1, 0)
text(seq_along(sample_dirty_dataset$age), zscore, labels = round(zscore, 2), col = ifelse(zscore > 2, "red", "black"))

as.numeric(): Converts the values in the age column to numeric type. This may be necessary after clearing the data set of invalid values such as “InvalidAge” because these values may not be in the appropriate format for numerical operations.

scale(): Performs standardization and calculates how far each age value is from the mean in terms of standard deviation. The values obtained as a result of this process are known as z-score.

abs(): The absolute value function prevents z-scores from being negative and makes it easier to use in analysis.

plot(): Plots a plot of Z-scores. The type=“n” parameter plots only the axes (index vs. z-score) of the chart, not the points.

abline(): Draws a red horizontal line indicating a specific z-score threshold (2 standard deviations in this example). Values above this line can be considered outliers.

text(): Adds each z-score value as a label to the graph. Z-score values exceeding the threshold are marked in red, others in black.

This snippet provides an effective method to identify outliers in the age column. Identifying outliers increases the accuracy of analyses, especially on important demographic variables such as age.

This piece of R code performs analysis by drawing a boxplot for the education_num column in the sample_dirty_dataset dataset and identifying outliers in this column.

# METHOD 2

# boxplot() draws the chart and also returns the statistics it computed
boxplot_education <- boxplot(sample_dirty_dataset$education_num, main = "Boxplot for Education Number", ylab = "Education Number")

# $out holds the values that the box plot flagged as outliers
outliers_education <- boxplot_education$out

cat("Outliers for Education Number:\n")

## Outliers for Education Number:

print(outliers_education)

## numeric(0)

# Row positions whose education_num equals one of the flagged values
outlier_indices <- which(sample_dirty_dataset$education_num %in% outliers_education)
cat("Indices of outliers for Education Number:\n")

## Indices of outliers for Education Number:

print(outlier_indices)

## integer(0)

boxplot(): This function plots a boxplot for the education_num column. The box plot shows the central tendency and spread of the data distribution and is also used to visually identify outliers. The chart shows the lower and upper bounds of the box (Q1 and Q3), the median, and outliers.

$out: The out component of the object returned by the boxplot() function contains the values determined as outliers in the graph.

cat() and print(): These functions are used to print outliers and their indices to the screen.

This piece of code provides an effective method for outlier detection in a numeric variable such as the education number. Identifying outliers is an important part of finding potential errors in the data set or special cases that need to be considered.

This piece of R code identifies outliers in the hours_per_week column of the sample_dirty_dataset dataset using robust statistical methods. The code calculates a robust covariance estimate with the Minimum Covariance Determinant (MCD) method using the rrcov library and presents the result graphically.

# METHOD 3
library(rrcov)
# par() returns the previous settings; they are kept and restored afterwards so
# the 2 x 2 layout does not leak into plots drawn later on the same device
old_par <- par(mfrow = c(2, 2))
plot(covMcd(sample_dirty_dataset$hours_per_week))
par(old_par)

par(mfrow=c(2,2)): It is used to display multiple graphs side by side in the same graph window by dividing the output area into four sections (2 rows and 2 columns).

covMcd(): Applies the Minimum Covariance Determinant method on the given data set. This method reduces the effect of outliers in a way that does not disrupt the main structure of the data set. Calculates the covariance matrix of the data set.

plot(): Plots various summary plots of the robust covariance matrix calculated by the covMcd() function. These graphs provide important information about the distribution of the dataset and outliers.

This method is extremely useful for performing outlier analysis on continuous variables such as working hours, especially in large data sets or in fields such as economics and social sciences. Robust methods help prevent possible errors due to outliers in the data set, thus allowing more reliable results to be obtained. These analyses are critical for better understanding the overall structure of the data set and for identifying potential data entry errors or measurement errors.

7th Task

This R code snippet manually standardizes numeric columns in a dataset using a custom function and applies it to sample_dirty_dataset.

# Manual standardization function
# Converts a numeric vector to z-scores: each value minus the mean, divided by
# the standard deviation. Missing values are ignored when the mean and the
# standard deviation are computed and stay NA in the result.
manual_standardize <- function(x) {
  center <- mean(x, na.rm = TRUE)
  spread <- sd(x, na.rm = TRUE)
  (x - center) / spread
}

# Work on a copy so the cleaned data set itself is not modified
data_standardized <- sample_dirty_dataset
# Logical flag per column. vapply() always returns a logical vector, whereas
# sapply() returns a list for a data frame that has no columns.
numeric_columns <- vapply(data_standardized, is.numeric, logical(1))
if (!any(numeric_columns)) {
  stop("No numeric columns found in 'data_standardized'.")
}
# Replace every numeric column with its standardized version; the other
# columns are left as they are
data_standardized[numeric_columns] <- lapply(data_standardized[numeric_columns], manual_standardize)

head(data_standardized)

##          age        workclass    education education_num        marital_status
## 1  0.3941790      Without-pay      Masters    -0.6793678             Separated
## 2  0.5862427      Without-pay      1st-4th    -0.4675769    Married-civ-spouse
## 3  0.1540994     Never-worked Some-college     1.0149591    Married-civ-spouse
## 4 -0.4701076          Private    Preschool     1.4385408             Separated
## 5 -1.2383623        State-gov         11th    -0.6793678               Widowed
## 6 -0.3740757 Self-emp-not-inc    Doctorate     1.6503317 Married-spouse-absent
##          occupation relationship               race    sex capital_gain
## 1   Priv-house-serv         Wife Amer-Indian-Eskimo Female   0.80234026
## 2     Other-service    Own-child              Black Female  -0.20198266
## 3     Other-service    Own-child              White   Male   0.80580021
## 4      Tech-support    Unmarried Asian-Pac-Islander Female   0.01905776
## 5 Machine-op-inspct    Own-child              White   Male  -1.38989924
## 6  Transport-moving    Own-child              White   Male   0.59358987
##   capital_loss hours_per_week             native_country income
## 1 -0.001034116     -1.6716729                      South  <=50K
## 2 -0.423978454      0.3222502 Outlying-US(Guam-USVI-etc)  <=50K
## 3  1.058611803     -0.5840785                      India  <=50K
## 4 -1.532713087     -1.6716729                   Scotland  <=50K
## 5  1.704274448     -1.3816477                  Nicaragua   >50K
## 6  0.793615193     -1.0553694                   Cambodia   >50K

manual_standardize function: Standardizes a numeric vector by subtracting its mean and dividing by its standard deviation. This process is often called z-score normalization. For any numeric vector x, it calculates (x - mean(x)) / sd(x), where mean(x, na.rm = TRUE) is the mean of x and sd(x, na.rm = TRUE) is its standard deviation, both computed while ignoring NA values.

numeric_columns: Determines the numeric columns in data_standardized by applying is.numeric to every column. The result is a logical vector that is TRUE for each numeric column.

This R code snippet defines a function for manually normalizing numeric columns in a dataset (sample_dirty_dataset) and applies this transformation.

# Define manual normalization function
# Rescales a numeric vector to the range 0 to 1 (min-max normalization).
# Missing values are ignored when the minimum and maximum are determined and
# stay NA in the result.
manual_normalize <- function(x) {
  offset <- x - min(x, na.rm = TRUE)
  span <- max(x, na.rm = TRUE) - min(x, na.rm = TRUE)
  offset / span
}
# Work on a copy so the cleaned data set itself is not modified
data_normalized <- sample_dirty_dataset
# Logical flag per column. vapply() always returns a logical vector, whereas
# sapply() returns a list for a data frame that has no columns.
numeric_columns <- vapply(data_normalized, is.numeric, logical(1))
if (!any(numeric_columns)) {
  stop("No numeric columns found in 'data_normalized'.")
}
# Replace every numeric column with its normalized version; the other columns
# are left as they are
data_normalized[numeric_columns] <- lapply(data_normalized[numeric_columns], manual_normalize)

head(data_normalized)

##         age        workclass    education education_num        marital_status
## 1 0.6111111      Without-pay      Masters     0.2666667             Separated
## 2 0.6666667      Without-pay      1st-4th     0.3333333    Married-civ-spouse
## 3 0.5416667     Never-worked Some-college     0.8000000    Married-civ-spouse
## 4 0.3611111          Private    Preschool     0.9333333             Separated
## 5 0.1388889        State-gov         11th     0.2666667               Widowed
## 6 0.3888889 Self-emp-not-inc    Doctorate     1.0000000 Married-spouse-absent
##          occupation relationship               race    sex capital_gain
## 1   Priv-house-serv         Wife Amer-Indian-Eskimo Female   0.70447875
## 2     Other-service    Own-child              Black Female   0.42533583
## 3     Other-service    Own-child              White   Male   0.70544041
## 4      Tech-support    Unmarried Asian-Pac-Islander Female   0.48677212
## 5 Machine-op-inspct    Own-child              White   Male   0.09516463
## 6  Transport-moving    Own-child              White   Male   0.64645837
##   capital_loss hours_per_week             native_country income
## 1   0.49150327     0.03061224                      South  <=50K
## 2   0.37334522     0.59183673 Outlying-US(Guam-USVI-etc)  <=50K
## 3   0.78753680     0.33673469                      India  <=50K
## 4   0.06359776     0.03061224                   Scotland  <=50K
## 5   0.96791572     0.11224490                  Nicaragua   >50K
## 6   0.71350464     0.20408163                   Cambodia   >50K

manual_normalize function: Normalizes a numeric vector to a range between 0 and 1. This is done by subtracting the minimum value of the vector and dividing by the range (maximum - minimum) of the vector. For any given numeric vector x, it computes (x - min(x)) / (max(x) - min(x)), where min(x, na.rm = TRUE) and max(x, na.rm = TRUE) calculate the minimum and maximum of x, ignoring NA values.

Conditional check: If there is no numeric column in the data set (!any(numeric_columns)), the process is stopped and an error message is given. This ensures that no further incorrect operations are made.

The normalization process is critical to avoid biases that may arise from differences in scale between variables in the data set.

This piece of R code divides age data in a data frame (my_dataset) into categorical groups and assigns labels based on age ranges.

# Small demonstration data set; it includes one missing age
my_dataset <- data.frame(
  age = c(25, 30, 35, NA, 40, 82, 95),
  gender = c("male", "female", "female", "male", "female", "male", "female")
)

# Bin edges and the label of each bin (one label fewer than there are edges)
age_breaks <- c(0, 20, 30, 40, 50, 60, 70, 80, 90, Inf)
age_labels <- c(
  "0-20", "20-30", "30-40", "40-50", "50-60",
  "60-70", "70-80", "80-90", "90+"
)

# right = FALSE closes each bin on the left, so an age of 30 is labelled
# "30-40"; a missing age gives a missing group
my_dataset_binned <- cbind(
  my_dataset,
  age_group = cut(
    my_dataset$age,
    breaks = age_breaks,
    labels = age_labels,
    right = FALSE
  )
)

head(my_dataset_binned)

##   age gender age_group
## 1  25   male     20-30
## 2  30 female     30-40
## 3  35 female     30-40
## 4  NA   male      <NA>
## 5  40 female     40-50
## 6  82   male     80-90

age_breaks: A set of boundaries used to determine age ranges. These limits start from 0 and go up to infinity (Inf) for values above 90.

age_labels: Contains the labels to be used for each age range, for example “0-20”, “20-30”.

This code converts a continuous variable such as age into categorical groups that are more useful for analysis and visualization. This process facilitates demographic analysis by age groups and can also handle observations with missing values (NA). The grouping process makes it possible to examine the distribution of different age groups in the data set and to evaluate the characteristics specific to these groups. As a result, this approach can provide in-depth insight into data analysis and support decision-making processes.

10th Task

This piece of R code produces various data analysis reports on the dataset.

# Load necessary library
library(dlookr)
# Generate dlookr's diagnostic report for the data set; the progress lines
# recorded below are the console output of this call
diagnose_web_report(sample_dirty_dataset)

## 
  |                                                   
  |                                             |   0%
  |                                                   
  |.                                            |   1%                         
  |                                                   
  |.                                            |   3% [setup]                 
  |                                                   
  |..                                           |   4%                         
  |                                                   
  |...                                          |   6% [load_packages]         
  |                                                   
  |...                                          |   7%                         
  |                                                   
  |....                                         |   9% [get-parameters]        
  |                                                   
  |.....                                        |  10%                         
  |                                                   
  |.....                                        |  12% [unnamed-chunk-14]      
  |                                                   
  |......                                       |  13%                         
  |                                                   
  |.......                                      |  15% [diagose]               
  |                                                   
  |.......                                      |  16%                         
  |                                                   
  |........                                     |  18% [create-overview]       
  |                                                   
  |.........                                    |  19%                         
  |                                                   
  |.........                                    |  21% [overview]              
  |                                                   
  |..........                                   |  22%                         
  |                                                   
  |...........                                  |  24% [overview-datastructure]
  |                                                   
  |...........                                  |  25%                         
  |                                                   
  |............                                 |  27% [overview-pre]          
  |                                                   
  |.............                                |  28%                         
  |                                                   
  |.............                                |  30% [overview-warnings]     
  |                                                   
  |..............                               |  31%                         
  |                                                   
  |...............                              |  33% [warnings_summary]      
  |                                                   
  |...............                              |  34%                         
  |                                                   
  |................                             |  36% [warnings]              
  |                                                   
  |.................                            |  37%                         
  |                                                   
  |.................                            |  39% [overview-variables]    
  |                                                   
  |..................                           |  40%                         
  |                                                   
  |...................                          |  42% [variables]             
  |                                                   
  |...................                          |  43%                         
  |                                                   
  |....................                         |  45% [missing]               
  |                                                   
  |.....................                        |  46%                         
  |                                                   
  |.....................                        |  48% [missing-list]          
  |                                                   
  |......................                       |  49%                         
  |                                                   
  |.......................                      |  51% [missing-data]          
  |                                                   
  |........................                     |  52%                         
  |                                                   
  |........................                     |  54% [missing-visualization] 
  |                                                   
  |.........................                    |  55%                         
  |                                                   
  |..........................                   |  57% [missing-viz2]          
  |                                                   
  |..........................                   |  58%                         
  |                                                   
  |...........................                  |  60% [unique]                
  |                                                   
  |............................                 |  61%                         
  |                                                   
  |............................                 |  63% [unique-categorical]    
  |                                                   
  |.............................                |  64%                         
  |                                                   
  |..............................               |  66% [unique-date-category]  
  |                                                   
  |..............................               |  67%                         
  |                                                   
  |...............................              |  69% [unique-numerical]      
  |                                                   
  |................................             |  70%                         
  |                                                   
  |................................             |  72% [unique-data-numeric]   
  |                                                   
  |.................................            |  73%                         
  |                                                   
  |..................................           |  75% [outliers]              
  |                                                   
  |..................................           |  76%                         
  |                                                   
  |...................................          |  78% [outliers-list]         
  |                                                   
  |....................................         |  79%                         
  |                                                   
  |....................................         |  81% [samples]               
  |                                                   
  |.....................................        |  82%                         
  |                                                   
  |......................................       |  84% [duplicated]            
  |                                                   
  |......................................       |  85%                         
  |                                                   
  |.......................................      |  87% [duplicated-list]       
  |                                                   
  |........................................     |  88%                         
  |                                                   
  |........................................     |  90% [heades]                
  |                                                   
  |.........................................    |  91%                         
  |                                                   
  |..........................................   |  93% [sample-head]           
  |                                                   
  |..........................................   |  94%                         
  |                                                   
  |...........................................  |  96% [tails]                 
  |                                                   
  |............................................ |  97%                         
  |                                                   
  |............................................ |  99% [sample-tail]           
  |                                                   
  |.............................................| 100%                         
                                                                                                                   
## "C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/pandoc" +RTS -K512m -RTS diagnosis_temp.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc21518499460c.html --lua-filter "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\pagebreak.lua" --lua-filter "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\latex-div.lua" --embed-resources --standalone --variable bs3=TRUE --section-divs --template "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable theme=bootstrap --css "C:/Users/emrea/AppData/Local/R/win-library/4.3/dlookr/resources/dlookr-bootstrap.css" --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\emrea\AppData\Local\Temp\Rtmp2Zexf5\rmarkdown-str2151853c02eff.html" --variable code_folding=show --variable code_menu=1 --include-in-header header_temp.html --include-after-body "C:\Users\emrea\AppData\Local\R\win-library\4.3\dlookr\resources\footer.html"

eda_web_report(sample_dirty_dataset)

## 
  |                                                        
  |                                                  |   0%
  |                                                        
  |.                                                 |   2%                    
  |                                                        
  |..                                                |   3% [setup]            
  |                                                        
  |..                                                |   5%                    
  |                                                        
  |...                                               |   6% [load_packages]    
  |                                                        
  |....                                              |   8%                    
  |                                                        
  |.....                                             |  10% [unnamed-chunk-1]  
  |                                                        
  |......                                            |  11%                    
  |                                                        
  |......                                            |  13% [udf]              
  |                                                        
  |.......                                           |  14%                    
  |                                                        
  |........                                          |  16% [check_variables]  
  |                                                        
  |.........                                         |  17%                    
  |                                                        
  |..........                                        |  19% [create-overview]  
  |                                                        
  |..........                                        |  21%                    
  |                                                        
  |...........                                       |  22% [overview]         
  |                                                        
  |............                                      |  24%                    
  |                                                        
  |.............                                     |  25% [overview-pre]     
  |                                                        
  |.............                                     |  27%                    
  |                                                        
  |..............                                    |  29% [unnamed-chunk-2]  
  |                                                        
  |...............                                   |  30%                    
  |                                                        
  |................                                  |  32% [unnamed-chunk-3]  
  |                                                        
  |.................                                 |  33%                    
  |                                                        
  |.................                                 |  35% [variables]        
  |                                                        
  |..................                                |  37%                    
  |                                                        
  |...................                               |  38% [normality]        
  |                                                        
  |....................                              |  40%                    
  |                                                        
  |.....................                             |  41% [normality-list]   
  |                                                        
  |.....................                             |  43%                    
  |                                                        
  |......................                            |  44% [unnamed-chunk-4]  
  |                                                        
  |.......................                           |  46%                    
  |                                                        
  |........................                          |  48% [unnamed-chunk-5]  
  |                                                        
  |.........................                         |  49%                    
  |                                                        
  |.........................                         |  51% [compare_numerical]
  |                                                        
  |..........................                        |  52%                    
  |                                                        
  |...........................                       |  54% [unnamed-chunk-6]  
  |                                                        
  |............................                      |  56%                    
  |                                                        
  |.............................                     |  57% [compare-category] 
  |                                                        
  |.............................                     |  59%                    
  |                                                        
  |..............................                    |  60% [unnamed-chunk-7]  
  |                                                        
  |...............................                   |  62%                    
  |                                                        
  |................................                  |  63% [unnamed-chunk-8]  
  |                                                        
  |.................................                 |  65%                    
  |                                                        
  |.................................                 |  67% [unnamed-chunk-9]  
  |                                                        
  |..................................                |  68%                    
  |                                                        
  |...................................               |  70% [correlation]      
  |                                                        
  |....................................              |  71%                    
  |                                                        
  |.....................................             |  73% [unnamed-chunk-10] 
  |                                                        
  |.....................................             |  75%                    
  |                                                        
  |......................................            |  76% [plot-correlation] 
  |                                                        
  |.......................................           |  78%                    
  |                                                        
  |........................................          |  79% [unnamed-chunk-11] 
  |                                                        
  |........................................          |  81%                    
  |                                                        
  |.........................................         |  83% [unnamed-chunk-12] 
  |                                                        
  |..........................................        |  84%                    
  |                                                        
  |...........................................       |  86% [group-numerical]  
  |                                                        
  |............................................      |  87%                    
  |                                                        
  |............................................      |  89% [unnamed-chunk-13] 
  |                                                        
  |.............................................     |  90%                    
  |                                                        
  |..............................................    |  92% [group-categorical]
  |                                                        
  |...............................................   |  94%                    
  |                                                        
  |................................................  |  95% [unnamed-chunk-14] 
  |                                                        
  |................................................  |  97%                    
  |                                                        
  |................................................. |  98% [group-correlation]
  |                                                        
  |..................................................| 100%                    
                                                                                                              
## "C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/pandoc" +RTS -K512m -RTS eda_temp.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc2151827173d6c.html --lua-filter "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\pagebreak.lua" --lua-filter "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\latex-div.lua" --embed-resources --standalone --variable bs3=TRUE --section-divs --template "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable theme=bootstrap --css "C:/Users/emrea/AppData/Local/R/win-library/4.3/dlookr/resources/dlookr-bootstrap.css" --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\emrea\AppData\Local\Temp\Rtmp2Zexf5\rmarkdown-str21518fa011e5.html" --variable code_folding=show --variable code_menu=1 --include-in-header header_temp.html --include-after-body "C:\Users\emrea\AppData\Local\R\win-library\4.3\dlookr\resources\footer.html"

transformation_web_report(sample_dirty_dataset)

## 
  |                                                          
  |                                                    |   0%
  |                                                          
  |.                                                   |   3%                  
  |                                                          
  |...                                                 |   5% [setup]          
  |                                                          
  |....                                                |   8%                  
  |                                                          
  |.....                                               |  10% [load_packages]  
  |                                                          
  |.......                                             |  13%                  
  |                                                          
  |........                                            |  15% [unnamed-chunk-1]
  |                                                          
  |.........                                           |  18%                  
  |                                                          
  |...........                                         |  21% [udf]            
  |                                                          
  |............                                        |  23%                  
  |                                                          
  |.............                                       |  26% [create-overview]
  |                                                          
  |...............                                     |  28%                  
  |                                                          
  |................                                    |  31% [overview]       
  |                                                          
  |.................                                   |  33%                  
  |                                                          
  |...................                                 |  36% [overview-pre]   
  |                                                          
  |....................                                |  38%                  
  |                                                          
  |.....................                               |  41% [unnamed-chunk-2]
  |                                                          
  |.......................                             |  44%                  
  |                                                          
  |........................                            |  46% [unnamed-chunk-3]
  |                                                          
  |.........................                           |  49%                  
  |                                                          
  |...........................                         |  51% [unnamed-chunk-4]
  |                                                          
  |............................                        |  54%                  
  |                                                          
  |.............................                       |  56% [nalist]         
  |                                                          
  |...............................                     |  59%                  
  |                                                          
  |................................                    |  62% [unnamed-chunk-5]
  |                                                          
  |.................................                   |  64%                  
  |                                                          
  |...................................                 |  67% [outlist]        
  |                                                          
  |....................................                |  69%                  
  |                                                          
  |.....................................               |  72% [unnamed-chunk-6]
  |                                                          
  |.......................................             |  74%                  
  |                                                          
  |........................................            |  77% [skweness]       
  |                                                          
  |.........................................           |  79%                  
  |                                                          
  |...........................................         |  82% [unnamed-chunk-7]
  |                                                          
  |............................................        |  85%                  
  |                                                          
  |.............................................       |  87% [binning]        
  |                                                          
  |...............................................     |  90%                  
  |                                                          
  |................................................    |  92% [unnamed-chunk-8]
  |                                                          
  |.................................................   |  95%                  
  |                                                          
  |................................................... |  97% [optimal-binning]
  |                                                          
  |....................................................| 100%                  
                                                                                                            
## "C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/pandoc" +RTS -K512m -RTS transformation_temp.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc2151810957b2c.html --lua-filter "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\pagebreak.lua" --lua-filter "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\latex-div.lua" --embed-resources --standalone --variable bs3=TRUE --section-divs --template "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable theme=bootstrap --css "C:/Users/emrea/AppData/Local/R/win-library/4.3/dlookr/resources/dlookr-bootstrap.css" --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\emrea\AppData\Local\Temp\Rtmp2Zexf5\rmarkdown-str2151871239be.html" --variable code_folding=show --variable code_menu=1 --include-in-header header_temp.html --include-after-body "C:\Users\emrea\AppData\Local\R\win-library\4.3\dlookr\resources\footer.html"

diagnose_web_report(): Identifies potential problems in the data set and produces a web-based report to help resolve them. The report covers issues such as missing values, outliers, and distribution distortions. It is an interactive report that can be opened in the browser; it evaluates the overall health of the data set and recommends cleaning or preprocessing steps.

eda_web_report(): Creates an exploratory data analysis (EDA) report for the data set. This report presents visual and statistical summaries of the data set’s structural properties, distributions, and relationships between variables. It provides an interactive web report with scatter plots of variables, correlation analyses, and other exploratory analyses.

transformation_web_report(): Analyzes data transformation techniques and shows the effects of these transformations on the data set. This report suggests optimal transformations that can be applied prior to the modeling process, and it is produced as a web-based report showing the results of the suggested transformations. With this report, you can evaluate which transformations are suitable for the data set.

The web reports produced provide a user-friendly and interactive interface, helping to examine the data set in detail.

Cleansing2

Emre Aydin, Semih Elmas

2024-04-30

Sample data