# read file here
df <- diamonds[sample(1:nrow(diamonds),5000),]
# Set target/dependent variable, if any
target_var <- "price"
# No target variable
#target_var <- NULL
If a numeric variable has just a few unique values, it might make more sense to convert it into categorical variable. If the total number of unique values is less than 5% of the total rows, convert.
# Iterate over each column of the data frame
for (i in seq_along(df)) {
# Check if the column is numeric and has less than 5 unique values
if (is.numeric(df[[i]]) && length(unique(df[[i]])) / nrow(df) < 0.05 && length(unique(df[[i]])) < 13) {
# Convert the column to a factor
df[[i]] <- as.factor(df[[i]])
cat("\nColumn ", colnames(df)[i], "was converted to a factor.")
}
}
If a non-numeric variable has too many unique values, it might be names or IDs that is not useful in analysis.
cutoff <- 0.8
# Initialize a vector to store the names of columns to be removed
cols_to_remove <- c()
# Loop through each column
for (col_name in names(df)) {
# Check if column is non-numeric and has unique values > 80% of total rows
if (!is.numeric(df[[col_name]]) && length(unique(df[[col_name]])) > cutoff * nrow(df)) {
cols_to_remove <- c(cols_to_remove, col_name)
cat("\nColumn", cols_to_remove, " was excluded from analysis.")
}
}
# Remove the identified columns
df <- df %>% select(-one_of(cols_to_remove))
If target variable is specified, bin this variable into another variable called target_bin.
# Check if the target_var is in the dataframe and is not NA
if (!is.na(target_var) && target_var %in% names(df)) {
# Check if the target variable is numerical
if (is.numeric(df[[target_var]])) {
# Create bins for the numerical target variable
target_bin <- cut(df[[target_var]],
breaks = 5,
labels = c("low", "low_mid", "mid", "upper_mid", "high"),
include.lowest = TRUE
)
print(paste("Binned target variable", target_var, " as", "target_bin"))
} else {
print(paste("The target variable, ", target_var, ", is a categorical variable."))
}
} else {
print("No valid target variable selected. Proceeding with general EDA.")
target_var <- NULL
}
## [1] "Binned target variable price as target_bin"
str(df)
## tibble [5,000 × 10] (S3: tbl_df/tbl/data.frame)
## $ carat : num [1:5000] 0.72 0.2 1.01 0.33 0.36 0.26 1.22 0.64 1.02 0.32 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 1 5 5 5 5 3 3 4 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 4 2 5 1 3 2 7 5 1 4 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 6 4 6 4 5 7 3 6 4 6 ...
## $ depth : num [1:5000] 62 61.1 65 60.3 61.5 61.7 60.6 60 62.7 60.8 ...
## $ table : num [1:5000] 55 59 55 55 55 56 56 59 58 59 ...
## $ price : int [1:5000] 3352 367 4355 667 803 635 5586 2060 6974 730 ...
## $ x : num [1:5000] 5.72 3.81 6.31 4.5 4.57 4.09 6.96 5.59 6.35 4.41 ...
## $ y : num [1:5000] 5.76 3.78 6.27 4.52 4.61 4.11 6.95 5.65 6.44 4.44 ...
## $ z : num [1:5000] 3.56 2.32 4.09 2.72 2.82 2.53 4.22 3.37 4.01 2.69 ...
summary(df)
## carat cut color clarity depth
## Min. :0.2000 Fair : 151 D: 623 SI1 :1196 Min. :43.00
## 1st Qu.:0.4000 Good : 475 E: 935 VS2 :1175 1st Qu.:61.10
## Median :0.7000 Very Good:1114 F: 861 SI2 : 833 Median :61.80
## Mean :0.7962 Premium :1272 G:1058 VS1 : 722 Mean :61.76
## 3rd Qu.:1.0400 Ideal :1988 H: 754 VVS2 : 488 3rd Qu.:62.50
## Max. :2.7500 I: 495 VVS1 : 346 Max. :79.00
## J: 274 (Other): 240
## table price x y
## Min. :52.00 Min. : 335.0 Min. :0.000 Min. : 0.000
## 1st Qu.:56.00 1st Qu.: 976.8 1st Qu.:4.740 1st Qu.: 4.750
## Median :57.00 Median : 2389.5 Median :5.690 Median : 5.710
## Mean :57.46 Mean : 3948.0 Mean :5.729 Mean : 5.744
## 3rd Qu.:59.00 3rd Qu.: 5269.0 3rd Qu.:6.530 3rd Qu.: 6.530
## Max. :73.00 Max. :18806.0 Max. :9.040 Max. :58.900
##
## z
## Min. :0.00
## 1st Qu.:2.93
## Median :3.52
## Mean :3.54
## 3rd Qu.:4.03
## Max. :8.06
##
# Calculate the total number of missing values per column
missing_values <- sapply(df, function(x) sum(is.na(x)))
# Calculate the number of cases with at least one missing value
cases_with_missing <- sum(apply(df, 1, function(x) any(is.na(x))))
# Check if there are any missing values
if (all(missing_values == 0)) {
print("There is no missing data in any columns.")
} else {
# Create a data frame for plotting
missing_data_df <- data.frame(
Column = c(names(missing_values), "At Least One Missing"),
MissingValues = c(missing_values, cases_with_missing)
)
# Calculate the percentage of missing values per column
# missing_percentage <- (missing_values / nrow(df)) * 100
# Plot the number of missing values for all columns with labels
ggplot(missing_data_df, aes(x = Column, y = MissingValues, fill = Column)) +
geom_bar(stat = "identity") +
geom_text(aes(label = sprintf("%.0f%%", MissingValues / nrow(df) * 100)), hjust = -0.3) + # Add labels to the bars
# geom_text(aes(label = sprintf("%.2f%%", MissingPercentage)), hjust = -0.3) +
coord_flip() + # Makes the bars horizontal
labs(title = "Number of Missing Values by Column", x = "Column", y = "Number of Missing Values") +
scale_fill_brewer(palette = "Set3") + # Use a color palette for different bars
theme(legend.position = "none", axis.title.y = element_blank()) + # Remove the legend
scale_y_continuous(expand = expansion(mult = c(0, 0.2))) # Extend the y-axis limits by 10%
}
## [1] "There is no missing data in any columns."
detect_outliers_mad <- function(df, accuracy = 0.99) {
# Function to calculate MAD
mad_function <- function(x) {
median(abs(x - median(x)))
}
# Calculate z-score equivalent for the given accuracy
z_threshold <- qnorm(accuracy + (1 - accuracy) / 2)
# Calculate MAD threshold
mad_threshold <- z_threshold
# Initialize a list to store outlier indices for each numeric column
outliers_list <- list()
# Initialize a vector to keep track of rows with outliers
rows_with_outliers <- rep(FALSE, nrow(df))
# Loop through each column in the dataframe
for (col_name in names(df)) {
# Check if the column is numeric
if (is.numeric(df[[col_name]])) {
# Calculate MAD and median for the column
mad_value <- mad_function(df[[col_name]])
median_value <- median(df[[col_name]])
# Calculate the deviation scores (using a modified z-score)
deviation_scores <- 0.6745 * (df[[col_name]] - median_value) / mad_value
# Identify indices of outliers
outlier_indices <- which(abs(deviation_scores) > mad_threshold)
# Store the indices in the list
outliers_list[[col_name]] <- outlier_indices
# Update rows with outliers
rows_with_outliers[outlier_indices] <- TRUE
}
}
# Calculate the number of outliers in each column
num_outliers_each_col <- sapply(outliers_list, length)
# Calculate the number of rows with at least one outlier
num_rows_with_outliers <- sum(rows_with_outliers)
# Combine the results into one vector
combined_results <- c(num_outliers_each_col, "Rows w/ Outliers" = num_rows_with_outliers)
# Return the combined results
return(combined_results)
}
# Detect outliers using the previously defined function
outliers_info <- detect_outliers_mad(df)
# Check if there are any outliers
if (all(outliers_info == 0)) {
print("There are no outliers in any columns.")
} else {
# Create a data frame for plotting
outliers_data_df <- data.frame(
Column = names(outliers_info),
Outliers = outliers_info,
OutlierPercentage = (outliers_info / nrow(df)) * 100 # Calculate the percentage of outliers
)
# Plot the number of outliers for all columns with labels
ggplot(outliers_data_df, aes(x = Column, y = Outliers, fill = Column)) +
geom_bar(stat = "identity") +
geom_text(aes(label = sprintf("%.2f%%", OutlierPercentage)), hjust = -0.3, vjust = 0) + # Add labels to the bars
coord_flip() + # Makes the bars horizontal
labs(title = "Number of Outliers by Column", x = "Column", y = "Number of Outliers") +
scale_fill_brewer(palette = "Set3") + # Use a color palette for different bars
theme(legend.position = "none", axis.title.y = element_blank()) + # Remove the legend
scale_y_continuous(expand = expansion(mult = c(0, 0.2))) # Extend the y-axis limits
}
library(gridExtra)
library(e1071) # for skewness
# Function to check if the variable is highly skewed
is_highly_skewed <- function(x) {
# Remove missing values before computing skewness
x <- na.omit(x)
abs(e1071::skewness(x)) > 1
}
create_plots <- function(df, var) {
skewness_value <- round(e1071::skewness(na.omit(df[[var]])), 2)
# Histogram
p1 <- ggplot(df, aes_string(x = var)) +
geom_histogram(bins = 15, fill = "skyblue", color = "black") +
ggtitle(paste("Histogram of", var)) +
annotate("text", x = Inf, y = Inf, label = paste("Skewness:", skewness_value), hjust = 1.1, vjust = 1.1)
# QQ Plot with reference line
p2 <- ggplot(df, aes_string(sample = var)) +
stat_qq() +
stat_qq_line(color = "red") +
ggtitle(paste("QQ Plot of", var))
if (is_highly_skewed(df[[var]])) {
df$log_transformed <- log(df[[var]] + 1)
skewness_log_value <- round(e1071::skewness(na.omit(df$log_transformed)), 2)
# Histogram after log transformation
p3 <- ggplot(df, aes(x = log_transformed)) +
geom_histogram(bins = 15, fill = "lightgreen", color = "black") +
ggtitle(paste("Log-transformed", var)) +
annotate("text", x = Inf, y = Inf, label = paste("Skewness:", skewness_log_value), hjust = 1.1, vjust = 1.1)
# QQ Plot after log transformation
p4 <- ggplot(df, aes(sample = log_transformed)) +
stat_qq() +
stat_qq_line(color = "red") +
ggtitle(paste("log-transformed", var))
return(grid.arrange(p1, p2, p3, p4, ncol = 2))
} else {
return(grid.arrange(p1, p2, ncol = 2))
}
}
# If the dataframe has more than 2000 rows, sample 2000 rows randomly
df_reduced <- df
if (nrow(df) > max_data_points_eda) {
set.seed(123) # Set a random seed for reproducibility
df_reduced <- df_reduced[sample(nrow(df_reduced), 2000), ]
cat("Since there are too many rows, ", max_data_points_eda," randomly selected rows are shown scatter plots.")
}
## Since there are too many rows, 2000 randomly selected rows are shown scatter plots.
# Apply the function to each numeric variable in the dataframe
lapply(names(df_reduced)[sapply(df_reduced, is.numeric)], function(var) create_plots(df, var))
## [[1]]
## TableGrob (2 x 2) "arrange": 4 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
##
## [[2]]
## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
##
## [[3]]
## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
##
## [[4]]
## TableGrob (2 x 2) "arrange": 4 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
##
## [[5]]
## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
##
## [[6]]
## TableGrob (2 x 2) "arrange": 4 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
##
## [[7]]
## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
# Function to create bar plots
create_bar_plot <- function(df, column_name) {
# Checking if the column is a factor
if (!is.numeric(df[[column_name]])) {
df[[column_name]] <- as.factor(df[[column_name]])
factor_levels <- levels(df[[column_name]])
} else {
factor_levels <- NULL
}
# Modify the data for plotting
plot_data <- df %>%
count(!!sym(column_name)) %>%
arrange(desc(n)) %>%
mutate(value = ifelse(row_number() > 12, "Other", as.character(!!sym(column_name)))) %>%
group_by(value) %>%
summarize(n = sum(n))
# If the column is a factor, adjust the value names
if (!is.null(factor_levels)) {
plot_data$value <- factor(plot_data$value, levels = unique(c(factor_levels, "Other")))
}
# Finding the maximum value for adjusting y-axis
max_value <- max(plot_data$n)
# Creating the bar plot
p <- ggplot(plot_data, aes(x = value, y = n, fill = value)) +
geom_bar(stat = "identity") +
geom_text(aes(label = paste0(round(n / sum(n) * 100, 1), "%")),
vjust = -0.5
) +
labs(title = paste("Bar Plot for", column_name), y = "Count") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
axis.title.x = element_blank()
) +
theme(legend.position = "none") + # Remove the legend
scale_y_continuous(limits = c(0, max_value * 1.2)) # Extend y-axis by 20%
print(p) # Explicitly print the plot
}
# Iterating over each column
for (column_name in names(df)) {
if (!is.numeric(df[[column_name]])) { # Checking if the column is non-numeric
unique_values <- length(unique(df[[column_name]]))
if (unique_values != nrow(df)) { # Ignoring columns with all unique values
create_bar_plot(df, column_name)
}
}
}
data_for_heatmap <- df
if(!is.null(target_bin)) {
data_for_heatmap$target_bin <- target_bin
}
numeric_cols <- sapply(data_for_heatmap, is.numeric)
# Prepare data for the heatmap
# Filter out rows with missing values
data_for_heatmap <- na.omit(data_for_heatmap)
# more than 2 numeric columns
if (sum(numeric_cols) > 1) {
# If the dataframe has more than 2000 rows, sample 2000 rows randomly
if (nrow(data_for_heatmap) > max_data_points_eda) {
set.seed(123) # Set a random seed for reproducibility
data_for_heatmap <- data_for_heatmap[sample(nrow(data_for_heatmap), 2000), ]
cat("Since there are too many rows, ", max_data_points_eda," randomly selected rows are shown in this heatmap")
}
# Determine whether to include row names based on the number of rows
show_row_names <- nrow(data_for_heatmap) <= 100
# Define a color palette from yellow to blue
my_palette <- colorRampPalette(c("yellow", "blue"))(n = 299)
# Initialize the RowSideColors parameter as NULL
row_side_colors <- NULL
dist_pearson <- function(x, ...)
as.dist(1-cor(t(x), method="pearson"))
# Check if target_var is provided and not null
if (!is.null(target_var) && target_var %in% names(data_for_heatmap)) {
# Use the 'target_bin' column if it exists, otherwise use the column specified by target_var
target_col <- ifelse("target_bin" %in% names(data_for_heatmap), 'target_bin', target_var)
# define groups of rows
groups <- data_for_heatmap[[target_col]]
groups_colors <- as.numeric(factor(groups))
unique_groups <- unique(groups)
row_side_colors <- rainbow(length(unique_groups))[groups_colors]
# Create the heatmap with clustering trees and color bar
gplots::heatmap.2(
as.matrix(data_for_heatmap[numeric_cols]),
scale = "column", # Data is already scaled, so no scaling is needed here
distfun = dist_pearson, # for distance between rows
hclustfun = function(x) hclust(x, method = "average"), # for hierarchical clustering
dendrogram = "both", # only row dendrograms
trace = "none", # turns off trace lines inside the heatmap
density.info = "none", # turns off density plot inside color legend
margins = c(8, 8), # adjusts the margins around the plot
Colv = TRUE, # cluster columns
Rowv = TRUE, # always cluster rows
labRow = if(show_row_names) rownames(data_for_heatmap) else NA, # show/hide row names
key = TRUE, # whether to show the color key
keysize = 1, # size of the color key
symbreaks = FALSE , # whether to make color breaks symmetrical around zero
col = my_palette, # yellow and blue
RowSideColors = row_side_colors, # add side color bar if side_colors is not NULL
cexCol = 0.9, # Make column labels smaller
srtCol = 45, # Rotate column labels 45 degrees
adjCol = c(1,0) # Adjust the position of the column labels
)
legend("topright", legend = unique_groups, fill = rainbow(length(unique_groups)), title = target_var)
} else { # RowSideColors = NULL gives errors.
# Create the heatmap without the color bar
gplots::heatmap.2(
as.matrix(data_for_heatmap[numeric_cols]),
scale = "column", # Data is already scaled, so no scaling is needed here
distfun = dist_pearson, # for distance between rows
hclustfun = function(x) hclust(x, method = "average"), # for hierarchical clustering
dendrogram = "both", # only row dendrograms
trace = "none", # turns off trace lines inside the heatmap
density.info = "none", # turns off density plot inside color legend
margins = c(8, 8), # adjusts the margins around the plot
Colv = TRUE, # cluster columns
Rowv = TRUE, # always cluster rows
labRow = if(show_row_names) rownames(data_for_heatmap) else NA, # show/hide row names
key = TRUE, # whether to show the color key
keysize = 1, # size of the color key
symbreaks = FALSE , # whether to make color breaks symmetrical around zero
col = my_palette, # yellow and blue
cexCol = 0.9, # Make column labels smaller
srtCol = 45, # Rotate column labels 45 degrees
adjCol = c(1,0) # Adjust the position of the column labels
)
}
}
## Since there are too many rows, 2000 randomly selected rows are shown in this heatmap
library(corrplot)
df2 <- df[sapply(df, is.numeric)]
df2 <- na.omit(df2)
M <- cor(df2)
testRes <- cor.mtest(df2, conf.level = 0.95)
## leave blank on non-significant coefficient
## add significant correlation coefficients
corrplot(M,
p.mat = testRes$p, method = "circle", type = "lower", insig = "blank",
addCoef.col = "black", number.cex = 0.8, order = "AOE", diag = FALSE
)
library(GGally)
library(hexbin)
num_cols <- names(df)[sapply(df, is.numeric)]
if(length(num_cols) > 1) {
num_col_pairs <- combn(num_cols, 2, simplify = FALSE)
for (pair in num_col_pairs) {
col_x <- pair[1]
col_y <- pair[2]
# Perform correlation test
corr_test <- cor.test(df[[col_x]], df[[col_y]], method = "pearson")
if (corr_test$p.value < 0.01 && abs(corr_test$estimate) > 0.1) {
corr_label <- paste("R =", round(corr_test$estimate, 2), "\nP =", format(corr_test$p.value, scientific = TRUE, digits = 2))
if (nrow(df) > 5000) {
# Use hexbin plot
p <- ggplot(df, aes_string(x = col_x, y = col_y)) +
labs(title = paste(col_x, "vs", col_y), x = col_x, y = col_y) +
geom_hex(bins = 70) +
scale_fill_continuous(type = "viridis")
} else {
# Use scatter plot
if (!is.null(target_var) && !is.na(target_var)) {
if(target_var != col_x && target_var != col_y){
df_ggplot <- df
if(!is.null(target_bin)) {
df_ggplot$target_bin <- target_bin
}
color_var <- ifelse(!is.numeric(df[[target_var]]), target_var, "target_bin")
p <- ggplot(df_ggplot, aes_string(x = col_x, y = col_y, color = color_var)) +
geom_point(alpha = 0.7) +
labs(title = paste(col_x, "vs", col_y), x = col_x, y = col_y) +
guides(color = guide_legend(title = color_var))
}
} else {
p <- ggplot(df, aes_string(x = col_x, y = col_y)) +
geom_point(alpha = 0.7) +
labs(title = paste(col_x, "vs", col_y), x = col_x, y = col_y)
}
}
p <- p + annotate("text", x = Inf, y = Inf, label = corr_label, hjust = 1.1, vjust = 1.1, size = 4)
print(p)
}
}
} else {
cat("Not applicable.")
}
num_cols <- sapply(df, is.numeric)
cat_cols <- sapply(df, is.factor)
if(length(num_cols) > 1 & length(cat_cols) > 1) {
# Perform ANOVA and create violin plots for significant cases
for (num_var in names(df)[num_cols]) {
for (cat_var in names(df)[cat_cols]) {
if (cat_var != "target_bin") {
anova_result <- aov(df[[num_var]] ~ df[[cat_var]], data = df)
p_value <- summary(anova_result)[[1]]$"Pr(>F)"[1]
if (p_value < 0.01) {
plot <- ggplot(df, aes_string(x = cat_var, y = num_var)) +
geom_violin(trim = FALSE, fill = "lightblue", color = "black") +
geom_boxplot(width = 0.2, fill = "white", color = "black") +
labs(title = paste(num_var, "by", cat_var, "(ANOVA P =", format(p_value, scientific = TRUE, digits = 2), ")"), x = cat_var, y = num_var)
if (nrow(df) < 300) {
plot <- plot + geom_jitter(width = 0.2, color = "black")
}
print(plot)
}
}
}
}
} else {
cat("Not applicable.")
}
cat_cols <- !(sapply(df, is.numeric)) & names(df) != "target_bin"
cat_var_names <- names(df)[cat_cols]
if(length(cat_var_names) > 1) {
# Perform chi-squared tests and create stacked bar plots if p-value < 0.01
for (i in 1:(length(cat_var_names) - 1)) {
for (j in (i + 1):length(cat_var_names)) {
tab <- table(df[[cat_var_names[i]]], df[[cat_var_names[j]]])
chi_test <- chisq.test(tab)
if (is.na(chi_test$p.value)) next
if (chi_test$p.value < 0.01) {
p <- ggplot(df, aes_string(x = cat_var_names[i], fill = cat_var_names[j])) +
geom_bar(position = "fill") +
labs(
title = paste(cat_var_names[i], "vs", cat_var_names[j], "(Chisq P =", format(chi_test$p.value, scientific = TRUE, digits = 2), ")"),
x = paste(cat_var_names[i]),
y = "Proportion"
) +
coord_flip()
print(p)
}
}
}
} else {
cat("There is one or zero categorical variable")
}
This RMarkdown document was written by Steven Ge, assisted heavily by GPT-4 (who wrote 90% of the code). Source code on GitHub. No guarantees. No right reserved.