# Show each chunk's source code in the knitted report by default.
knitr::opts_chunk$set(echo = TRUE)
In this part of the code, we create the dataset that we will use in the rest of our report. The set contains 1000 observations with the following fields: age, work class (salary type), education, marital status, occupation, relationship status, race, gender, family financial situation (capital gain and loss), weekly working hours, nationality (native country), and salary (income bracket). It also deliberately contains some missing and invalid data.
# Build a synthetic, deliberately "dirty" census-style data set.
# Every column is drawn with sample(); most columns then get 50 randomly
# chosen positions overwritten with a marker string, a -999 sentinel, or NA.
# NOTE: the result depends on the exact order of the sample() calls below,
# because they all consume the random stream started by set.seed().
# Load necessary libraries
library(tidyverse)
# Set seed for reproducibility
set.seed(31)
# Generate sample data
n <- 1000 # Number of observations (rows) to generate
# Age between 18 and 90
age <- sample(18:90, n, replace = TRUE)
# Introduce errors in age column
# (assigning a string coerces the whole age vector to character)
age[sample(1:n, 50)] <- "InvalidAge"
# Workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked
workclass <- sample(c("Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov", "Local-gov", "State-gov", "Without-pay", "Never-worked"), n, replace = TRUE)
# Introduce missing values in workclass column
workclass[sample(1:n, 50)] <- NA
# Education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool
education <- sample(c("Bachelors", "Some-college", "11th", "HS-grad", "Prof-school", "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters", "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"), n, replace = TRUE)
# Introduce inconsistencies in education column
education[sample(1:n, 50)] <- "InvalidEducation"
# Education Number: 1-16
# (drawn independently of the education label above; left without errors)
education_num <- sample(1:16, n, replace = TRUE)
# Marital Status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse
marital_status <- sample(c("Married-civ-spouse", "Divorced", "Never-married", "Separated", "Widowed", "Married-spouse-absent", "Married-AF-spouse"), n, replace = TRUE)
# Introduce missing values in marital_status column
marital_status[sample(1:n, 50)] <- NA
# Occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces
occupation <- sample(c("Tech-support", "Craft-repair", "Other-service", "Sales", "Exec-managerial", "Prof-specialty", "Handlers-cleaners", "Machine-op-inspct", "Adm-clerical", "Farming-fishing", "Transport-moving", "Priv-house-serv", "Protective-serv", "Armed-Forces"), n, replace = TRUE)
# Introduce errors in occupation column
occupation[sample(1:n, 50)] <- "InvalidOccupation"
# Relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried
relationship <- sample(c("Wife", "Own-child", "Husband", "Not-in-family", "Other-relative", "Unmarried"), n, replace = TRUE)
# Introduce missing values in relationship column
relationship[sample(1:n, 50)] <- NA
# Race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black
race <- sample(c("White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"), n, replace = TRUE)
# Introduce inconsistencies in race column
race[sample(1:n, 50)] <- "InvalidRace"
# Sex: Female, Male
sex <- sample(c("Female", "Male"), n, replace = TRUE)
# Introduce missing values in sex column
sex[sample(1:n, 50)] <- NA
# Capital Gain: 0-99999
capital_gain <- sample(0:99999, n, replace = TRUE)
# Introduce errors in capital_gain column
# (-999 lies outside the 0-99999 range and acts as the error sentinel)
capital_gain[sample(1:n, 50)] <- -999
# Capital Loss: 0-99999
capital_loss <- sample(0:99999, n, replace = TRUE)
# Introduce errors in capital_loss column (same -999 sentinel)
capital_loss[sample(1:n, 50)] <- -999
# Hours per week: 1-99
hours_per_week <- sample(1:99, n, replace = TRUE)
# Introduce missing values in hours_per_week column
hours_per_week[sample(1:n, 50)] <- NA
# Native country: United-States, Cambodia, England, ...
native_country <- sample(c("United-States", "Cambodia", "England", "Puerto-Rico", "Canada", "Germany", "Outlying-US(Guam-USVI-etc)", "India", "Japan", "Greece", "South", "China", "Cuba", "Iran", "Honduras", "Philippines", "Italy", "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal", "Ireland", "France", "Dominican-Republic", "Laos", "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary", "Guatemala", "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador", "Trinadad&Tobago", "Peru", "Hong", "Holand-Netherlands"), n, replace = TRUE)
# Introduce missing values in native_country column
native_country[sample(1:n, 50)] <- NA
# Income: <=50K, >50K
# (drawn with probabilities 0.75 and 0.25; left without errors)
income <- sample(c("<=50K", ">50K"), n, replace = TRUE, prob = c(0.75, 0.25))
# Create dataframe: one column per vector above, n rows
sample_dirty_dataset <- data.frame(age, workclass, education, education_num, marital_status, occupation, relationship, race, sex, capital_gain, capital_loss, hours_per_week, native_country, income)
OUR CODES
2nd Task
It contains two separate functions to determine invalid and missing values in the data set. Performing such pre-processing before performing data analysis increases the accuracy of the analysis results and prevents misleading results.
# Detect which columns of a data set contain invalid values.
#
# A column is reported when either
#   * it is `capital_gain` or `capital_loss` and holds at least one value
#     outside the range 0..99999, or
#   * at least one of its values contains the text "Invalid"
#     (matched case-insensitively).
#
# Missing values are not treated as invalid here; they are skipped by the
# range check so that the function also works on partially cleaned data.
#
# @param dataset A data frame (or named list of columns) to inspect.
# @return The unique names of the offending columns, range-checked columns
#   first; an empty result (NULL) when nothing invalid is found.
check_invalid_values <- function(dataset) {
  invalid_columns <- c()

  # Range check. `[[` is used instead of `$` so a column is only looked up
  # by its exact name, and `na.rm = TRUE` keeps any() from returning NA
  # (which would make `if` fail) when a column contains NA but no violation.
  for (col_name in c("capital_gain", "capital_loss")) {
    values <- dataset[[col_name]]
    if (any(values < 0 | values > 99999, na.rm = TRUE)) {
      invalid_columns <- c(invalid_columns, col_name)
    }
  }

  # Marker check on every column; grepl() yields FALSE for NA entries.
  for (col_name in names(dataset)) {
    if (any(grepl("Invalid", dataset[[col_name]], ignore.case = TRUE))) {
      invalid_columns <- c(invalid_columns, col_name)
    }
  }

  unique(invalid_columns)
}
# List the columns of a data set that contain at least one missing value.
#
# @param dataset A data frame to inspect.
# @return A character vector with the names of the columns holding NA,
#   in column order.
find_na_columns <- function(dataset) {
  # is.na() on a data frame gives a logical matrix; summing each column
  # counts its missing entries.
  has_missing <- colSums(is.na(dataset)) > 0
  names(has_missing)[has_missing]
}
# Run both detectors on 'sample_dirty_dataset' and keep the returned
# column-name vectors for reporting.
invalid_columns <- check_invalid_values(sample_dirty_dataset)
na_columns <- find_na_columns(sample_dirty_dataset)
# Print the results; cat() writes the column names separated by spaces.
cat("Columns containing invalid values:", invalid_columns, "\n")
## Columns containing invalid values: capital_gain capital_loss age education occupation race
cat("Columns with missing values:", na_columns, "\n")
## Columns with missing values: workclass marital_status relationship sex hours_per_week native_country
check_invalid_values Function: This function detects
invalid values (values not between 0 and 99999) in the capital_gain and
capital_loss columns. Additionally, it loops through all columns and
finds values that contain the word “Invalid”. This process identifies
specific erroneous data by looking at each column in the data set.
find_na_columns Function: This function counts the
missing (NA) values in every column of the data set in one vectorized
step. It identifies the columns that contain missing values and
returns the names of these columns as a character vector.
The check_invalid_values and
find_na_columns functions are run on the data set named
sample_dirty_dataset to determine the columns containing
invalid and missing values, respectively.
The results are printed on the screen and it is reported which columns have problems.
3rd Task
This R code performs data cleaning operations to resolve the issues identified in the previous step. It contains two main functions and one auxiliary function used to clean invalid and erroneous data in the data set.
# Replace out-of-range values in one column with NA.
#
# @param dataset A data frame.
# @param column Name of the column to clean (character scalar).
# @param lower_bound Smallest value that is still considered valid.
# @param upper_bound Largest value that is still considered valid.
# @return `dataset` with every value of `column` below `lower_bound` or
#   above `upper_bound` set to NA.
clean_invalid_values <- function(dataset, column, lower_bound, upper_bound) {
  values <- dataset[[column]]
  too_low <- values < lower_bound
  too_high <- values > upper_bound
  values[too_low | too_high] <- NA
  dataset[[column]] <- values
  dataset
}
# Blank out marker entries in every column of a data set.
#
# Any value whose text contains "Invalid" (matched case-insensitively) is
# replaced with NA; all other values are left untouched.
#
# @param dataset A data frame.
# @return `dataset` with the marker entries set to NA.
clean_invalid_entries <- function(dataset) {
  for (field in colnames(dataset)) {
    column_values <- dataset[[field]]
    is_marker <- grepl("Invalid", column_values, ignore.case = TRUE)
    column_values[is_marker] <- NA
    dataset[[field]] <- column_values
  }
  dataset
}
# Apply all cleaning steps to a data set.
#
# The capital_gain and capital_loss columns are range-checked against
# 0..99999 first; afterwards entries containing "Invalid" are cleared in
# every column.
#
# @param dataset A data frame with capital_gain and capital_loss columns.
# @return The cleaned data frame.
apply_cleaning <- function(dataset) {
  for (money_column in c("capital_gain", "capital_loss")) {
    dataset <- clean_invalid_values(dataset, money_column, 0, 99999)
  }
  clean_invalid_entries(dataset)
}
# Overwrite the working data set with its cleaned version, then preview
# the first rows.
sample_dirty_dataset <- apply_cleaning(sample_dirty_dataset)
head(sample_dirty_dataset)
## age workclass education education_num marital_status
## 1 62 Without-pay Masters 5 Separated
## 2 66 Without-pay 1st-4th 6 Married-civ-spouse
## 3 57 Never-worked Some-college 13 Married-civ-spouse
## 4 81 State-gov Bachelors 2 Married-civ-spouse
## 5 60 Self-emp-inc 1st-4th 5 Never-married
## 6 72 Self-emp-inc HS-grad 15 Separated
## occupation relationship race sex capital_gain
## 1 Priv-house-serv Wife Amer-Indian-Eskimo Female 70407
## 2 Other-service Own-child Black Female 42541
## 3 Other-service Own-child White Male 70503
## 4 <NA> Own-child White Female 51598
## 5 Farming-fishing Not-in-family <NA> Female 8284
## 6 Machine-op-inspct <NA> White Male 88182
## capital_loss hours_per_week native_country income
## 1 NA 4 South <=50K
## 2 37405 59 Outlying-US(Guam-USVI-etc) <=50K
## 3 78767 34 India <=50K
## 4 NA 61 Ireland <=50K
## 5 63324 20 Ecuador <=50K
## 6 NA 11 Cuba <=50K
clean_invalid_values Function: This function replaces
values that fall outside the specified lower and upper limits in a given
column with NA (missing value). For example, in the capital_gain and
capital_loss columns, values that are not between 0 and 99999 are
considered invalid and replaced with NA.
clean_invalid_entries Function: This function detects
values containing “Invalid” in all columns in the data set and replaces
these values with NA. This process traverses all columns in the data
set, providing general cleaning and removing specialized invalid
entries.
apply_cleaning Function: This helper function applies
the cleaning functions defined above to the data set in sequence.
First, invalid value cleaning is performed for the capital_gain and
capital_loss columns. The invalid entries in all columns are then
purged. Finally, the data set with all cleaning operations applied
is returned.
The functions used in this process are aimed at improving data quality by effectively handling incorrect and invalid entries in the data set.
4th Task
This piece of R code performs data cleaning by checking certain
columns (education, race, marital_status, and occupation) in the data
set named sample_dirty_dataset and replacing values that
are outside the defined valid categories with NA (missing value).
# Keep only whitelisted categories in four text columns; every value that
# is not in the corresponding list (including NA) becomes NA.
sample_dirty_dataset <- mutate(
  sample_dirty_dataset,
  education = ifelse(
    education %in% c(
      "Bachelors", "Some-college", "11th", "HS-grad", "Prof-school",
      "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters",
      "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"
    ),
    education,
    NA
  ),
  race = ifelse(
    race %in% c(
      "White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"
    ),
    race,
    NA
  ),
  marital_status = ifelse(
    marital_status %in% c(
      "Married-civ-spouse", "Divorced", "Never-married", "Separated",
      "Widowed", "Married-spouse-absent", "Married-AF-spouse"
    ),
    marital_status,
    NA
  ),
  occupation = ifelse(
    occupation %in% c(
      "Tech-support", "Craft-repair", "Other-service", "Sales",
      "Exec-managerial", "Prof-specialty", "Handlers-cleaners",
      "Machine-op-inspct", "Adm-clerical", "Farming-fishing",
      "Transport-moving", "Priv-house-serv", "Protective-serv",
      "Armed-Forces"
    ),
    occupation,
    NA
  )
)
# Preview the four checked columns for the first rows.
select(head(sample_dirty_dataset), education, race, marital_status, occupation)
## education race marital_status occupation
## 1 Masters Amer-Indian-Eskimo Separated Priv-house-serv
## 2 1st-4th Black Married-civ-spouse Other-service
## 3 Some-college White Married-civ-spouse Other-service
## 4 Bachelors White Married-civ-spouse <NA>
## 5 1st-4th <NA> Never-married Farming-fishing
## 6 HS-grad White Separated Machine-op-inspct
mutate(): This function is used to transform existing
columns or add new columns. Here it is used with the
ifelse() function to make conditional changes to specific
columns.
ifelse(): Used to replace values outside the valid
categories specified for each column with NA. For example, for the
education column, only certain education statuses are accepted; all
other values are replaced with NA.
Education: Only specified education levels are accepted (e.g. ‘Bachelors’, ‘HS-grad’). Values other than these are replaced with NA.
Race: Only the defined races (such as ‘White’ and ‘Asian-Pac-Islander’) are accepted; all other values are replaced with NA.
Marital Status (marital_status): Only certain marital statuses, such as ‘Married-civ-spouse’ and ‘Divorced’, are accepted; others are replaced with NA.
Occupation: Certain occupations are accepted (such as ‘Tech-support’, ‘Sales’), other values are replaced with NA.
head(): Shows the first six rows of the cleaned data
set. This is used to observe the results of the cleaning process.
select(): Used to select the columns to be displayed.
The cleared education, race, marital_status, and occupation columns are
shown here.
5th Task
This piece of R code fills in missing values in numeric columns in
the sample_dirty_dataset dataset and then removes rows with
any missing values.
# Fill gaps in every numeric column with that column's median (the median
# is generally preferred because it is less affected by outliers), then
# discard all rows that still contain a missing value in any column.
sample_dirty_dataset <- drop_na(
  mutate(
    sample_dirty_dataset,
    across(
      where(is.numeric),
      function(x) ifelse(is.na(x), median(x, na.rm = TRUE), x)
    )
  )
)
across(where(is.numeric), ...) selects all columns that
are numeric and applies the specified operation.
median(): Calculates the median value for numeric data.
The na.rm = TRUE parameter ensures that missing values are ignored when
calculating the median.
drop_na(): Removes all rows with missing values from the
data set. This process is important for those who prefer to work only on
the full data set in the analysis.
6th Task
This piece of R code is used to detect outliers by performing z-score
analysis on the age column in the sample_dirty_dataset
dataset.
# METHOD 1
# Convert age to numeric before computing z-scores (the column was built
# with text markers mixed in, so it may still be stored as character).
sample_dirty_dataset$age <- as.numeric(sample_dirty_dataset$age)
# Absolute z-score: distance from the mean in standard deviations.
zscore <- abs(scale(sample_dirty_dataset$age))
# Set up empty axes only; the values are drawn as text labels below.
plot(zscore, type = "n", main = "Z-score for Age", xlab = "Index", ylab = "Z-score")
# Draw threshold value (2 standard deviations)
abline(h = 2, col = "red")
# Mark values above the threshold with red.
# seq_along() is used instead of 1:length() so that an empty column yields
# no x positions rather than the sequence c(1, 0).
text(seq_along(zscore), zscore, labels = round(zscore, 2), col = ifelse(zscore > 2, "red", "black"))
as.numeric(): Converts the values in the age column to
numeric type. This may be necessary after clearing the data set of
invalid values such as “InvalidAge” because these values may not be in
the appropriate format for numerical operations.
scale(): Performs standardization and calculates how far
each age value is from the mean in terms of standard deviation. The
values obtained as a result of this process are known as z-score.
abs(): The absolute value function prevents z-scores
from being negative and makes it easier to use in analysis.
plot(): Plots a plot of Z-scores. The type=“n” parameter
plots only the axes (index vs. z-score) of the chart, not the
points.
abline(): Draws a red horizontal line at the z-score
threshold (2 standard deviations in this example).
Values whose z-score lies above this line can be considered
outliers.
text(): Adds each z-score value as a label to the graph.
Z-score values exceeding the threshold are marked in red, others in
black.
This snippet provides an effective method to identify outliers in the age column. Identifying outliers increases the accuracy of analyses, especially on important demographic variables such as age.
This piece of R code performs analysis by drawing a boxplot for the
education_num column in the sample_dirty_dataset dataset
and identifying outliers in this column.
# METHOD 2
# boxplot() draws the plot and its return value is kept; the `out`
# component of that value is read below as the outlier values.
boxplot_education <- boxplot(sample_dirty_dataset$education_num, main = "Boxplot for Education Number", ylab = "Education Number")
outliers_education <- boxplot_education$out
cat("Outliers for Education Number:\n")
## Outliers for Education Number:
print(outliers_education)
## numeric(0)
# Map the outlier values back to their row positions in the column.
outlier_indices <- which(sample_dirty_dataset$education_num %in% outliers_education)
cat("Indices of outliers for Education Number:\n")
## Indices of outliers for Education Number:
print(outlier_indices)
## integer(0)
boxplot(): This function plots a boxplot for the
education_num column. The box plot shows the central
tendency and spread of the data distribution and is also used to
visually identify outliers. The chart shows the lower and upper bounds
of the box (Q1 and Q3), the median, and outliers.
$out: The out component of the object returned by the
boxplot() function contains the values determined as
outliers in the graph.
cat() and print(): These functions are used
to print outliers and their indices to the screen.
This piece of code provides an effective method for outlier detection in a numeric variable such as education number. Identifying outliers is an important part of considering potential errors in the data set or special cases that need to be considered.
This piece of R code performs an analysis specifically for
identifying outliers using robust statistical methods for the
hours_per_week column in the
sample_dirty_dataset dataset. The code calculates a robust
covariance matrix with the Minimum Covariance Determinant (MCD) method
using the rrcov library and presents this matrix graphically.
# METHOD 3
library(rrcov)
# Switch to a 2 x 2 plotting grid for the MCD diagnostic plots. par()
# returns the previous settings, which are restored afterwards so the grid
# layout does not leak into later plots.
old_par <- par(mfrow = c(2, 2))
plot(covMcd(sample_dirty_dataset$hours_per_week))
par(old_par)
par(mfrow=c(2,2)): It is used to display multiple graphs
side by side in the same graph window by dividing the output area into
four sections (2 rows and 2 columns).
covMcd(): Applies the Minimum Covariance Determinant
method on the given data set. This method reduces the effect of outliers
in a way that does not disrupt the main structure of the data set.
Calculates the covariance matrix of the data set.
plot(): Plots various summary plots of the robust
covariance matrix calculated by the covMcd() function. These graphs
provide important information about the distribution of the dataset and
outliers.
This method is extremely useful for performing outlier analysis on continuous variables such as working hours, especially in large data sets or in fields such as economics and social sciences. Robust methods help prevent possible errors due to outliers in the data set, thus allowing more reliable results to be obtained. These analyses are critical to better understand the overall structure of the data set and identify potential data entry errors or measurement errors.
7th Task
This R code snippet manually standardizes numeric columns in a
dataset using a custom function and applies it to
sample_dirty_dataset.
# Z-score standardization of a numeric vector.
#
# @param x A numeric vector; NA entries are ignored when computing the
#   mean and standard deviation and stay NA in the result.
# @return `x` centered on its mean and divided by its standard deviation.
manual_standardize <- function(x) {
  center <- mean(x, na.rm = TRUE)
  spread <- sd(x, na.rm = TRUE)
  (x - center) / spread
}
# Standardize every numeric column of a copy of the working data set;
# non-numeric columns are left unchanged.
data_standardized <- sample_dirty_dataset
# vapply() guarantees a logical vector (one flag per column) regardless of
# the input, whereas sapply()'s return type depends on what it is given.
numeric_columns <- vapply(data_standardized, is.numeric, logical(1))
if (!any(numeric_columns)) {
  stop("No numeric columns found in 'data_standardized'.")
}
# drop = FALSE keeps a data frame even when only one column is numeric.
data_standardized[, numeric_columns] <- lapply(data_standardized[, numeric_columns, drop = FALSE], manual_standardize)
head(data_standardized)
## age workclass education education_num marital_status
## 1 0.3941790 Without-pay Masters -0.6793678 Separated
## 2 0.5862427 Without-pay 1st-4th -0.4675769 Married-civ-spouse
## 3 0.1540994 Never-worked Some-college 1.0149591 Married-civ-spouse
## 4 -0.4701076 Private Preschool 1.4385408 Separated
## 5 -1.2383623 State-gov 11th -0.6793678 Widowed
## 6 -0.3740757 Self-emp-not-inc Doctorate 1.6503317 Married-spouse-absent
## occupation relationship race sex capital_gain
## 1 Priv-house-serv Wife Amer-Indian-Eskimo Female 0.80234026
## 2 Other-service Own-child Black Female -0.20198266
## 3 Other-service Own-child White Male 0.80580021
## 4 Tech-support Unmarried Asian-Pac-Islander Female 0.01905776
## 5 Machine-op-inspct Own-child White Male -1.38989924
## 6 Transport-moving Own-child White Male 0.59358987
## capital_loss hours_per_week native_country income
## 1 -0.001034116 -1.6716729 South <=50K
## 2 -0.423978454 0.3222502 Outlying-US(Guam-USVI-etc) <=50K
## 3 1.058611803 -0.5840785 India <=50K
## 4 -1.532713087 -1.6716729 Scotland <=50K
## 5 1.704274448 -1.3816477 Nicaragua >50K
## 6 0.793615193 -1.0553694 Cambodia >50K
manual_standardize function: Standardizes a numerical
vector by subtracting its mean and dividing by the standard deviation.
This process is often called Z-score normalization. For any numeric
vector x, it calculates (x - mean(x)) / sd(x), where
mean(x, na.rm = TRUE) is the mean of x and
sd(x, na.rm = TRUE) is its standard deviation, both computed
while ignoring NA values.
numeric_columns: Determines the numeric columns in
data_standardized using sapply() and
is.numeric. This results in a logical vector that is TRUE
for each numeric column and FALSE for every other column.
This R code snippet defines a function for manually normalizing
numeric columns in a dataset (sample_dirty_dataset) and
applies this transformation.
# Min-max normalization of a numeric vector.
#
# @param x A numeric vector; NA entries are ignored when finding the
#   minimum and maximum and stay NA in the result.
# @return `x` shifted by its minimum and divided by its range
#   (maximum - minimum).
manual_normalize <- function(x) {
  lowest <- min(x, na.rm = TRUE)
  highest <- max(x, na.rm = TRUE)
  (x - lowest) / (highest - lowest)
}
# Min-max normalize every numeric column of a copy of the working data
# set; non-numeric columns are left unchanged.
data_normalized <- sample_dirty_dataset
# vapply() guarantees a logical vector (one flag per column) regardless of
# the input, whereas sapply()'s return type depends on what it is given.
numeric_columns <- vapply(data_normalized, is.numeric, logical(1))
if (!any(numeric_columns)) {
  stop("No numeric columns found in 'data_normalized'.")
}
# drop = FALSE keeps a data frame even when only one column is numeric.
data_normalized[, numeric_columns] <- lapply(data_normalized[, numeric_columns, drop = FALSE], manual_normalize)
head(data_normalized)
## age workclass education education_num marital_status
## 1 0.6111111 Without-pay Masters 0.2666667 Separated
## 2 0.6666667 Without-pay 1st-4th 0.3333333 Married-civ-spouse
## 3 0.5416667 Never-worked Some-college 0.8000000 Married-civ-spouse
## 4 0.3611111 Private Preschool 0.9333333 Separated
## 5 0.1388889 State-gov 11th 0.2666667 Widowed
## 6 0.3888889 Self-emp-not-inc Doctorate 1.0000000 Married-spouse-absent
## occupation relationship race sex capital_gain
## 1 Priv-house-serv Wife Amer-Indian-Eskimo Female 0.70447875
## 2 Other-service Own-child Black Female 0.42533583
## 3 Other-service Own-child White Male 0.70544041
## 4 Tech-support Unmarried Asian-Pac-Islander Female 0.48677212
## 5 Machine-op-inspct Own-child White Male 0.09516463
## 6 Transport-moving Own-child White Male 0.64645837
## capital_loss hours_per_week native_country income
## 1 0.49150327 0.03061224 South <=50K
## 2 0.37334522 0.59183673 Outlying-US(Guam-USVI-etc) <=50K
## 3 0.78753680 0.33673469 India <=50K
## 4 0.06359776 0.03061224 Scotland <=50K
## 5 0.96791572 0.11224490 Nicaragua >50K
## 6 0.71350464 0.20408163 Cambodia >50K
manual_normalize function: Normalizes a numeric
vector to a range between 0 and 1. This is done by subtracting the
minimum value of the vector and dividing by the range (maximum -
minimum) of the vector. For any given numeric vector x, it
computes (x - min(x)) / (max(x) - min(x)), where min(x, na.rm = TRUE)
and max(x, na.rm = TRUE) calculate the minimum and maximum
of x, ignoring NA values.
Conditional check: If there is no numeric column in the data set
(!any(numeric_columns)), the process is stopped and an
error message is given. This ensures that no further incorrect
operations are made.
The normalization process is critical to avoid biases that may arise from differences in scale between variables in the data set.
This piece of R code divides age data in a data frame
(my_dataset) into categorical groups and assigns labels
based on age ranges.
# Small example data set used to demonstrate age binning.
my_dataset <- data.frame(
  age = c(25, 30, 35, NA, 40, 82, 95),
  gender = c("male", "female", "female", "male", "female", "male", "female")
)
# Bin edges and one label per bin (there is one more edge than labels).
age_breaks <- c(0, 20, 30, 40, 50, 60, 70, 80, 90, Inf)
age_labels <- c(
  "0-20", "20-30", "30-40", "40-50", "50-60",
  "60-70", "70-80", "80-90", "90+"
)
my_dataset_binned <- my_dataset
# right = FALSE makes each bin include its lower edge and exclude its
# upper edge, so an age of exactly 30 falls into "30-40".
my_dataset_binned[["age_group"]] <- cut(
  my_dataset_binned[["age"]],
  breaks = age_breaks,
  labels = age_labels,
  right = FALSE
)
head(my_dataset_binned)
## age gender age_group
## 1 25 male 20-30
## 2 30 female 30-40
## 3 35 female 30-40
## 4 NA male <NA>
## 5 40 female 40-50
## 6 82 male 80-90
age_breaks: A set of boundaries used to determine age
ranges. These limits start from 0 and go up to infinity (Inf) for values
above 90.
age_labels: Contains the labels to be used for each age
range, for example “0-20”, “20-30”.
This code converts a continuous variable such as age into categorical groups that are more useful for analysis and visualization. This process facilitates demographic analysis by age groups and can also handle observations with missing values (NA). The grouping process provides the opportunity to examine the distribution of different age groups in the data set and evaluate the characteristics specific to these groups. As a result, this approach can provide in-depth insight into data analysis and support decision-making processes.
10th Task
This piece of R code produces various data analysis reports on the dataset.
# Load necessary library
library(dlookr)
# Build the diagnosis report for the cleaned data set. Judging by the knit
# log that follows, it covers missing values, unique values, outliers and
# duplicates and is rendered to HTML through pandoc.
diagnose_web_report(sample_dirty_dataset)
##
|
| | 0%
|
|. | 1%
|
|. | 3% [setup]
|
|.. | 4%
|
|... | 6% [load_packages]
|
|... | 7%
|
|.... | 9% [get-parameters]
|
|..... | 10%
|
|..... | 12% [unnamed-chunk-14]
|
|...... | 13%
|
|....... | 15% [diagose]
|
|....... | 16%
|
|........ | 18% [create-overview]
|
|......... | 19%
|
|......... | 21% [overview]
|
|.......... | 22%
|
|........... | 24% [overview-datastructure]
|
|........... | 25%
|
|............ | 27% [overview-pre]
|
|............. | 28%
|
|............. | 30% [overview-warnings]
|
|.............. | 31%
|
|............... | 33% [warnings_summary]
|
|............... | 34%
|
|................ | 36% [warnings]
|
|................. | 37%
|
|................. | 39% [overview-variables]
|
|.................. | 40%
|
|................... | 42% [variables]
|
|................... | 43%
|
|.................... | 45% [missing]
|
|..................... | 46%
|
|..................... | 48% [missing-list]
|
|...................... | 49%
|
|....................... | 51% [missing-data]
|
|........................ | 52%
|
|........................ | 54% [missing-visualization]
|
|......................... | 55%
|
|.......................... | 57% [missing-viz2]
|
|.......................... | 58%
|
|........................... | 60% [unique]
|
|............................ | 61%
|
|............................ | 63% [unique-categorical]
|
|............................. | 64%
|
|.............................. | 66% [unique-date-category]
|
|.............................. | 67%
|
|............................... | 69% [unique-numerical]
|
|................................ | 70%
|
|................................ | 72% [unique-data-numeric]
|
|................................. | 73%
|
|.................................. | 75% [outliers]
|
|.................................. | 76%
|
|................................... | 78% [outliers-list]
|
|.................................... | 79%
|
|.................................... | 81% [samples]
|
|..................................... | 82%
|
|...................................... | 84% [duplicated]
|
|...................................... | 85%
|
|....................................... | 87% [duplicated-list]
|
|........................................ | 88%
|
|........................................ | 90% [heades]
|
|......................................... | 91%
|
|.......................................... | 93% [sample-head]
|
|.......................................... | 94%
|
|........................................... | 96% [tails]
|
|............................................ | 97%
|
|............................................ | 99% [sample-tail]
|
|.............................................| 100%
## "C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/pandoc" +RTS -K512m -RTS diagnosis_temp.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc21518499460c.html --lua-filter "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\pagebreak.lua" --lua-filter "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\latex-div.lua" --embed-resources --standalone --variable bs3=TRUE --section-divs --template "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable theme=bootstrap --css "C:/Users/emrea/AppData/Local/R/win-library/4.3/dlookr/resources/dlookr-bootstrap.css" --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\emrea\AppData\Local\Temp\Rtmp2Zexf5\rmarkdown-str2151853c02eff.html" --variable code_folding=show --variable code_menu=1 --include-in-header header_temp.html --include-after-body "C:\Users\emrea\AppData\Local\R\win-library\4.3\dlookr\resources\footer.html"
# Build the exploratory-data-analysis report for the same data set. Per the
# knit log that follows, it includes normality, comparison, correlation and
# grouped sections and is rendered to HTML through pandoc.
eda_web_report(sample_dirty_dataset)
##
|
| | 0%
|
|. | 2%
|
|.. | 3% [setup]
|
|.. | 5%
|
|... | 6% [load_packages]
|
|.... | 8%
|
|..... | 10% [unnamed-chunk-1]
|
|...... | 11%
|
|...... | 13% [udf]
|
|....... | 14%
|
|........ | 16% [check_variables]
|
|......... | 17%
|
|.......... | 19% [create-overview]
|
|.......... | 21%
|
|........... | 22% [overview]
|
|............ | 24%
|
|............. | 25% [overview-pre]
|
|............. | 27%
|
|.............. | 29% [unnamed-chunk-2]
|
|............... | 30%
|
|................ | 32% [unnamed-chunk-3]
|
|................. | 33%
|
|................. | 35% [variables]
|
|.................. | 37%
|
|................... | 38% [normality]
|
|.................... | 40%
|
|..................... | 41% [normality-list]
|
|..................... | 43%
|
|...................... | 44% [unnamed-chunk-4]
|
|....................... | 46%
|
|........................ | 48% [unnamed-chunk-5]
|
|......................... | 49%
|
|......................... | 51% [compare_numerical]
|
|.......................... | 52%
|
|........................... | 54% [unnamed-chunk-6]
|
|............................ | 56%
|
|............................. | 57% [compare-category]
|
|............................. | 59%
|
|.............................. | 60% [unnamed-chunk-7]
|
|............................... | 62%
|
|................................ | 63% [unnamed-chunk-8]
|
|................................. | 65%
|
|................................. | 67% [unnamed-chunk-9]
|
|.................................. | 68%
|
|................................... | 70% [correlation]
|
|.................................... | 71%
|
|..................................... | 73% [unnamed-chunk-10]
|
|..................................... | 75%
|
|...................................... | 76% [plot-correlation]
|
|....................................... | 78%
|
|........................................ | 79% [unnamed-chunk-11]
|
|........................................ | 81%
|
|......................................... | 83% [unnamed-chunk-12]
|
|.......................................... | 84%
|
|........................................... | 86% [group-numerical]
|
|............................................ | 87%
|
|............................................ | 89% [unnamed-chunk-13]
|
|............................................. | 90%
|
|.............................................. | 92% [group-categorical]
|
|............................................... | 94%
|
|................................................ | 95% [unnamed-chunk-14]
|
|................................................ | 97%
|
|................................................. | 98% [group-correlation]
|
|..................................................| 100%
## "C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/pandoc" +RTS -K512m -RTS eda_temp.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc2151827173d6c.html --lua-filter "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\pagebreak.lua" --lua-filter "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\latex-div.lua" --embed-resources --standalone --variable bs3=TRUE --section-divs --template "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable theme=bootstrap --css "C:/Users/emrea/AppData/Local/R/win-library/4.3/dlookr/resources/dlookr-bootstrap.css" --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\emrea\AppData\Local\Temp\Rtmp2Zexf5\rmarkdown-str21518fa011e5.html" --variable code_folding=show --variable code_menu=1 --include-in-header header_temp.html --include-after-body "C:\Users\emrea\AppData\Local\R\win-library\4.3\dlookr\resources\footer.html"
transformation_web_report(sample_dirty_dataset)
##
|
| | 0%
|
|. | 3%
|
|... | 5% [setup]
|
|.... | 8%
|
|..... | 10% [load_packages]
|
|....... | 13%
|
|........ | 15% [unnamed-chunk-1]
|
|......... | 18%
|
|........... | 21% [udf]
|
|............ | 23%
|
|............. | 26% [create-overview]
|
|............... | 28%
|
|................ | 31% [overview]
|
|................. | 33%
|
|................... | 36% [overview-pre]
|
|.................... | 38%
|
|..................... | 41% [unnamed-chunk-2]
|
|....................... | 44%
|
|........................ | 46% [unnamed-chunk-3]
|
|......................... | 49%
|
|........................... | 51% [unnamed-chunk-4]
|
|............................ | 54%
|
|............................. | 56% [nalist]
|
|............................... | 59%
|
|................................ | 62% [unnamed-chunk-5]
|
|................................. | 64%
|
|................................... | 67% [outlist]
|
|.................................... | 69%
|
|..................................... | 72% [unnamed-chunk-6]
|
|....................................... | 74%
|
|........................................ | 77% [skweness]
|
|......................................... | 79%
|
|........................................... | 82% [unnamed-chunk-7]
|
|............................................ | 85%
|
|............................................. | 87% [binning]
|
|............................................... | 90%
|
|................................................ | 92% [unnamed-chunk-8]
|
|................................................. | 95%
|
|................................................... | 97% [optimal-binning]
|
|....................................................| 100%
## "C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/pandoc" +RTS -K512m -RTS transformation_temp.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc2151810957b2c.html --lua-filter "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\pagebreak.lua" --lua-filter "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmarkdown\lua\latex-div.lua" --embed-resources --standalone --variable bs3=TRUE --section-divs --template "C:\Users\emrea\AppData\Local\R\win-library\4.3\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable theme=bootstrap --css "C:/Users/emrea/AppData/Local/R/win-library/4.3/dlookr/resources/dlookr-bootstrap.css" --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\emrea\AppData\Local\Temp\Rtmp2Zexf5\rmarkdown-str2151871239be.html" --variable code_folding=show --variable code_menu=1 --include-in-header header_temp.html --include-after-body "C:\Users\emrea\AppData\Local\R\win-library\4.3\dlookr\resources\footer.html"
diagnose_web_report(): Identifies potential problems
in the data set and produces a web-based report to help resolve these
problems. The report covers issues such as missing values, outliers,
and distribution distortions. It is an interactive report that can be
opened in the browser; it evaluates the overall health of the
data set and recommends cleaning or preprocessing steps.
eda_web_report(): Creates an exploratory data analysis
(EDA) report for the data set. This report presents visual and
statistical summaries of the data set’s structural properties,
distributions, and relationships between variables. It provides an
interactive web report with scatter plots of variables, correlation
analyses, and other exploratory analyses.
transformation_web_report(): Analyzes data
transformation techniques and shows the effects of these transformations
on the data set. This report suggests optimal transformations that can
be applied prior to the modeling process. It produces a web-based report
showing the results of the suggested transformations. With this report,
you can evaluate which transformations are suitable for the data set.
The resulting web reports provide a user-friendly, interactive interface that helps you examine the data set in detail.