# Load Library
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.3
library(readr)
## Warning: package 'readr' was built under R version 4.3.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.3.3
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(mice)
## Warning: package 'mice' was built under R version 4.3.3
## Warning in check_dep_version(): ABI version mismatch:
## lme4 was built with Matrix ABI version 1
## Current Matrix ABI version is 0
## Please re-install lme4 from source or restore original 'Matrix' package
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(VIM)
## Warning: package 'VIM' was built under R version 4.3.3
## Loading required package: colorspace
## Warning: package 'colorspace' was built under R version 4.3.3
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(readr)
data <- read_csv("C:/Users/asus/Downloads/Life Expectancy Data.csv")
head(data)
## # A tibble: 6 × 22
## Country Year Status `Life expectancy` `Adult Mortality` `infant deaths`
## <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 Afghanistan 2015 Develop… 65 263 62
## 2 Afghanistan 2014 Develop… 59.9 271 64
## 3 Afghanistan 2013 Develop… 59.9 268 66
## 4 Afghanistan 2012 Develop… 59.5 272 69
## 5 Afghanistan 2011 Develop… 59.2 275 71
## 6 Afghanistan 2010 Develop… 58.8 279 74
## # ℹ 16 more variables: Alcohol <dbl>, `percentage expenditure` <dbl>,
## # `Hepatitis B` <dbl>, Measles <dbl>, BMI <dbl>, `under-five deaths` <dbl>,
## # Polio <dbl>, `Total expenditure` <dbl>, Diphtheria <dbl>, `HIV/AIDS` <dbl>,
## # GDP <dbl>, Population <dbl>, `thinness 1-19 years` <dbl>,
## # `thinness 5-9 years` <dbl>, `Income composition of resources` <dbl>,
## # Schooling <dbl>
summary(data)
## Country Year Status Life expectancy
## Length:2938 Min. :2000 Length:2938 Min. :36.30
## Class :character 1st Qu.:2004 Class :character 1st Qu.:63.10
## Mode :character Median :2008 Mode :character Median :72.10
## Mean :2008 Mean :69.22
## 3rd Qu.:2012 3rd Qu.:75.70
## Max. :2015 Max. :89.00
## NA's :10
## Adult Mortality infant deaths Alcohol percentage expenditure
## Min. : 1.0 Min. : 0.0 Min. : 0.0100 Min. : 0.000
## 1st Qu.: 74.0 1st Qu.: 0.0 1st Qu.: 0.8775 1st Qu.: 4.685
## Median :144.0 Median : 3.0 Median : 3.7550 Median : 64.913
## Mean :164.8 Mean : 30.3 Mean : 4.6029 Mean : 738.251
## 3rd Qu.:228.0 3rd Qu.: 22.0 3rd Qu.: 7.7025 3rd Qu.: 441.534
## Max. :723.0 Max. :1800.0 Max. :17.8700 Max. :19479.912
## NA's :10 NA's :194
## Hepatitis B Measles BMI under-five deaths
## Min. : 1.00 Min. : 0.0 Min. : 1.00 Min. : 0.00
## 1st Qu.:77.00 1st Qu.: 0.0 1st Qu.:19.30 1st Qu.: 0.00
## Median :92.00 Median : 17.0 Median :43.50 Median : 4.00
## Mean :80.94 Mean : 2419.6 Mean :38.32 Mean : 42.04
## 3rd Qu.:97.00 3rd Qu.: 360.2 3rd Qu.:56.20 3rd Qu.: 28.00
## Max. :99.00 Max. :212183.0 Max. :87.30 Max. :2500.00
## NA's :553 NA's :34
## Polio Total expenditure Diphtheria HIV/AIDS
## Min. : 3.00 Min. : 0.370 Min. : 2.00 Min. : 0.100
## 1st Qu.:78.00 1st Qu.: 4.260 1st Qu.:78.00 1st Qu.: 0.100
## Median :93.00 Median : 5.755 Median :93.00 Median : 0.100
## Mean :82.55 Mean : 5.938 Mean :82.32 Mean : 1.742
## 3rd Qu.:97.00 3rd Qu.: 7.492 3rd Qu.:97.00 3rd Qu.: 0.800
## Max. :99.00 Max. :17.600 Max. :99.00 Max. :50.600
## NA's :19 NA's :226 NA's :19
## GDP Population thinness 1-19 years
## Min. : 1.68 Min. :3.400e+01 Min. : 0.10
## 1st Qu.: 463.94 1st Qu.:1.958e+05 1st Qu.: 1.60
## Median : 1766.95 Median :1.387e+06 Median : 3.30
## Mean : 7483.16 Mean :1.275e+07 Mean : 4.84
## 3rd Qu.: 5910.81 3rd Qu.:7.420e+06 3rd Qu.: 7.20
## Max. :119172.74 Max. :1.294e+09 Max. :27.70
## NA's :448 NA's :652 NA's :34
## thinness 5-9 years Income composition of resources Schooling
## Min. : 0.10 Min. :0.0000 Min. : 0.00
## 1st Qu.: 1.50 1st Qu.:0.4930 1st Qu.:10.10
## Median : 3.30 Median :0.6770 Median :12.30
## Mean : 4.87 Mean :0.6276 Mean :11.99
## 3rd Qu.: 7.20 3rd Qu.:0.7790 3rd Qu.:14.30
## Max. :28.60 Max. :0.9480 Max. :20.70
## NA's :34 NA's :167 NA's :163
sapply(data, class)
## Country Year
## "character" "numeric"
## Status Life expectancy
## "character" "numeric"
## Adult Mortality infant deaths
## "numeric" "numeric"
## Alcohol percentage expenditure
## "numeric" "numeric"
## Hepatitis B Measles
## "numeric" "numeric"
## BMI under-five deaths
## "numeric" "numeric"
## Polio Total expenditure
## "numeric" "numeric"
## Diphtheria HIV/AIDS
## "numeric" "numeric"
## GDP Population
## "numeric" "numeric"
## thinness 1-19 years thinness 5-9 years
## "numeric" "numeric"
## Income composition of resources Schooling
## "numeric" "numeric"
names(data) <- make.names(names(data))
numeric_cols <- data[sapply(data, is.numeric)]
# Perform imputation
# imputed_data <- mice(numeric_cols,m=5,maxit=50,meth='cart',seed=42)
numeric_cols <- data[sapply(data, is.numeric)]
# Set up the plotting grid (3 rows x 3 columns in this case)
par(mfrow = c(4, 5), mar = c(4, 4, 2, 1)) # Adjust margins for better spacing
# Create boxplots for each numeric column
for (col_name in names(numeric_cols)) {
boxplot(numeric_cols[[col_name]],
main = col_name,
xlab = "",
ylab = "Values",
col = "lightblue",
border = "blue")
}

par(mfrow = c(4, 5), mar = c(4, 4, 2, 1)) # Adjust margins for better spacing
# Create density plots for each numeric column
for (col_name in names(numeric_cols)) {
# Ensure there are enough unique values for a meaningful density plot
if (length(unique(numeric_cols[[col_name]])) > 1) {
plot(density(numeric_cols[[col_name]], na.rm = TRUE),
main = col_name,
xlab = "Values",
ylab = "Density",
col = "blue",
lwd = 2)
} else {
plot(1, type = "n", xlab = "", ylab = "", main = paste(col_name, "\n(No density)"))
text(1, 1, "Not enough data", cex = 1.2)
}
}

log_transform <- function(x) {
# Check for NA and create a temporary vector that removes NAs
temp_x <- na.omit(x)
# Apply transformation only if there are no non-positive values
if (any(temp_x <= 0)) {
# Shift values by adding 1 to avoid log(0) and apply log
return(ifelse(x > 0, log(x + 1), NA)) # Keep NAs for non-positive values
} else {
return(log(x))
}
}
# Function to apply Box-Cox transformation and handle NA values
apply_boxcox <- function(x) {
# Remove NA values for the Box-Cox calculation
x_non_na <- x[!is.na(x)]
# Apply Box-Cox transformation if there are enough data points
if (length(x_non_na) > 1 && all(x_non_na > 0)) {
lambda <- boxcox(x_non_na ~ 1, lambda = seq(-2, 2, 0.1))$x[which.max(boxcox(x_non_na ~ 1, lambda = seq(-2, 2, 0.1))$y)]
transformed <- (x^lambda - 1) / lambda # Box-Cox transformation
return(ifelse(is.na(x), NA, transformed)) # Retain NA values
} else {
return(x) # Return original data if conditions not met
}
}
numeric_cols <- data[sapply(data, is.numeric)]
# Initialize an outlier counts dataframe
outlier_counts <- data.frame(Column = character(), OriginalCount = integer(), LogCount = integer(), BoxcoxCount = integer(), stringsAsFactors = FALSE)
# Function to calculate outliers and counts
calculate_outliers <- function(column) {
Q1 <- quantile(column, 0.25, na.rm = TRUE)
Q3 <- quantile(column, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
# Determine lower and upper bounds for outliers
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
# Count outliers, excluding NA values
outlier_count <- sum(column < lower_bound | column > upper_bound, na.rm = TRUE)
return(outlier_count)
}
# Loop through numeric columns to calculate outlier counts
for (col_name in names(numeric_cols)) {
# Count outliers in original data
original_count <- calculate_outliers(numeric_cols[[col_name]])
# Apply log transformation and count outliers
log_transformed_col <- log_transform(numeric_cols[[col_name]])
log_count <- calculate_outliers(log_transformed_col)
# Apply Box-Cox transformation and count outliers
boxcox_transformed_col <- apply_boxcox(numeric_cols[[col_name]])
boxcox_count <- calculate_outliers(boxcox_transformed_col)
# Append results to outlier_counts dataframe
outlier_counts <- rbind(outlier_counts, data.frame(Column = col_name, OriginalCount = original_count, LogCount = log_count, BoxcoxCount = boxcox_count))
}














# Assuming outlier_counts is your outlier summary dataframe
# Find the minimum outlier counts across the three sets (original, log, Box-Cox)
outlier_counts$MinCount <- pmin(outlier_counts$OriginalCount, outlier_counts$LogCount, outlier_counts$BoxcoxCount)
# Sort the columns by the minimum outlier count
sorted_outliers <- outlier_counts[order(outlier_counts$MinCount), ]
# Select columns with the least outliers (e.g., top 5 columns with the least outliers)
least_outliers <- head(sorted_outliers, n = 20)
# Print the columns with the least outliers
print(least_outliers)
## Column OriginalCount LogCount BoxcoxCount MinCount
## 1 Year 0 0 0 0
## 5 Alcohol 0 315 0 0
## 6 percentage.expenditure 389 0 389 0
## 8 Measles 542 0 542 0
## 9 BMI 0 103 0 0
## 14 HIV.AIDS 542 65 0 0
## 17 thinness..1.19.years 89 28 0 0
## 18 thinness.5.9.years 96 37 0 0
## 19 Income.composition.of.resources 130 0 130 0
## 2 Life.expectancy 10 77 1 1
## 15 GDP 365 5 4 4
## 10 under.five.deaths 394 8 394 8
## 16 Population 294 30 9 9
## 4 infant.deaths 315 10 315 10
## 3 Adult.Mortality 82 166 11 11
## 12 Total.expenditure 32 116 27 27
## 20 Schooling 44 122 44 44
## 7 Hepatitis.B 254 293 186 186
## 11 Polio 279 332 186 186
## 13 Diphtheria 298 332 202 202
# Identify categorical columns
categorical_cols <- data[sapply(data, is.factor) | sapply(data, is.character)]
# Initialize a new dataframe to hold transformed numeric columns
best_transformed_data <- data.frame(matrix(ncol = ncol(numeric_cols), nrow = nrow(numeric_cols)))
# Set column names with transformations
colnames(best_transformed_data) <- names(numeric_cols)
# Loop through the outlier_counts to apply the best transformation
for (row in 1:nrow(outlier_counts)) {
col_name <- outlier_counts$Column[row]
if (outlier_counts$MinCount[row] == outlier_counts$OriginalCount[row]) {
best_transformed_data[[col_name]] <- numeric_cols[[col_name]]
colnames(best_transformed_data)[row] <- paste(col_name, "Original", sep = ".")
} else if (outlier_counts$MinCount[row] == outlier_counts$LogCount[row]) {
best_transformed_data[[col_name]] <- log_transform(as.numeric(numeric_cols[[col_name]]))
colnames(best_transformed_data)[row] <- paste(col_name, "Log", sep = ".")
} else {
best_transformed_data[[col_name]] <- apply_boxcox(numeric_cols[[col_name]])
colnames(best_transformed_data)[row] <- paste(col_name, "BoxCox", sep = ".")
}
}











# Combine with categorical columns
transformed_data <- cbind(best_transformed_data, categorical_cols)
# Display the final dataframe with transformed numeric and original categorical columns
print(head(transformed_data))
## Year.Original Life.expectancy.BoxCox Adult.Mortality.BoxCox infant.deaths.Log
## 1 2015 2112.000 22.70590 4.143135
## 2 2014 1793.505 23.02654 4.174387
## 3 2013 1793.505 22.90695 4.204693
## 4 2012 1769.625 23.06624 4.248495
## 5 2011 1751.820 23.18482 4.276666
## 6 2010 1728.220 23.34178 4.317488
## Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log
## 1 0.01 4.280542 2112.0 7.051856
## 2 0.01 4.311116 1921.5 6.200509
## 3 0.01 4.307023 2047.5 6.066108
## 4 0.01 4.371777 2244.0 7.933080
## 5 0.01 2.091507 2311.5 8.011023
## 6 0.01 4.390483 2177.5 7.595890
## BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox
## 1 19.1 4.430817 17.5 4.131972
## 2 18.6 4.465908 1681.5 4.140352
## 3 18.1 4.499810 1921.5 4.119386
## 4 17.6 4.543295 2244.0 4.281537
## 5 17.2 4.584967 2311.5 4.009491
## 6 16.7 4.634729 2177.5 4.557087
## Diphtheria.BoxCox HIV.AIDS.BoxCox GDP.BoxCox Population.BoxCox
## 1 2112.0 -5.462383 6.798422 30.67723
## 2 1921.5 -5.462383 6.852500 19.12425
## 3 2047.5 -5.462383 6.887365 30.50238
## 4 2244.0 -5.462383 6.954307 24.76056
## 5 2311.5 -5.462383 4.330698 24.22380
## 6 2177.5 -5.462383 6.736593 24.14351
## thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox
## 1 3.967874 3.978790
## 2 4.000475 4.000475
## 3 4.021968 4.021968
## 4 4.043273 4.053856
## 5 4.074886 4.074886
## 6 4.095737 4.095737
## Income.composition.of.resources.Log Schooling.Original Country Status
## 1 0.3913662 10.1 Afghanistan Developing
## 2 0.3893357 10.0 Afghanistan Developing
## 3 0.3852624 9.9 Afghanistan Developing
## 4 0.3804891 9.8 Afghanistan Developing
## 5 0.3743184 9.5 Afghanistan Developing
## 6 0.3701833 9.2 Afghanistan Developing
transformed_data$Life.expectancy.BoxCox <- data$Life.expectancy
names(transformed_data)[names(transformed_data) == "Life.expectancy.BoxCox"] <- "Life.expectancy"
summary(transformed_data)
## Year.Original Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log
## Min. :2000 Min. :36.30 Min. : 0.00 Min. :0.6931
## 1st Qu.:2004 1st Qu.:63.10 1st Qu.:12.28 1st Qu.:1.0986
## Median :2008 Median :72.10 Median :17.05 Median :2.3026
## Mean :2008 Mean :69.22 Mean :16.59 Mean :2.4862
## 3rd Qu.:2012 3rd Qu.:75.70 3rd Qu.:21.23 3rd Qu.:3.5264
## Max. :2015 Max. :89.00 Max. :36.13 Max. :7.4961
## NA's :10 NA's :10 NA's :848
## Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox
## Min. : 0.0100 Min. :0.0952 Min. : 0
## 1st Qu.: 0.8775 1st Qu.:3.6175 1st Qu.:2964
## Median : 3.7550 Median :5.0511 Median :4232
## Mean : 4.6029 Mean :5.0330 Mean :3589
## 3rd Qu.: 7.7025 3rd Qu.:6.4477 3rd Qu.:4704
## Max. :17.8700 Max. :9.8772 Max. :4900
## NA's :194 NA's :611 NA's :553
## Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox
## Min. : 0.6931 Min. : 1.00 Min. :0.6931 Min. : 4
## 1st Qu.: 2.8904 1st Qu.:19.30 1st Qu.:1.3863 1st Qu.:3042
## Median : 4.8598 Median :43.50 Median :2.4849 Median :4324
## Mean : 5.0450 Mean :38.32 Mean :2.6475 Mean :3681
## 3rd Qu.: 7.0992 3rd Qu.:56.20 3rd Qu.:3.8501 3rd Qu.:4704
## Max. :12.2652 Max. :87.30 Max. :7.8244 Max. :4900
## NA's :983 NA's :34 NA's :785 NA's :19
## Total.expenditure.BoxCox Diphtheria.BoxCox HIV.AIDS.BoxCox GDP.BoxCox
## Min. :-0.7536 Min. : 1.5 Min. :-5.4624 Min. : 0.5223
## 1st Qu.: 2.2829 1st Qu.:3041.5 1st Qu.:-5.4624 1st Qu.: 6.5368
## Median : 3.0518 Median :4324.0 Median :-5.4624 Median : 8.0713
## Mean : 3.0301 Mean :3669.3 Mean :-3.3728 Mean : 8.1009
## 3rd Qu.: 3.8472 3rd Qu.:4704.0 3rd Qu.:-0.2406 3rd Qu.: 9.4929
## Max. : 7.4532 Max. :4900.0 Max. : 1.3904 Max. :13.1836
## NA's :226 NA's :19 NA's :448
## Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox
## Min. : 3.932 Min. :-1.8023 Min. :-1.8023
## 1st Qu.:18.030 1st Qu.: 0.4954 1st Qu.: 0.4243
## Median :22.380 Median : 1.3673 Median : 1.3673
## Mean :22.282 Mean : 1.4341 Mean : 1.4273
## 3rd Qu.:26.540 3rd Qu.: 2.4780 3rd Qu.: 2.4780
## Max. :42.346 Max. : 4.9138 Max. : 4.9809
## NA's :652 NA's :34 NA's :34
## Income.composition.of.resources.Log Schooling.Original Country
## Min. :0.2255 Min. : 0.00 Length:2938
## 1st Qu.:0.4207 1st Qu.:10.10 Class :character
## Median :0.5230 Median :12.30 Mode :character
## Mean :0.5009 Mean :11.99
## 3rd Qu.:0.5789 3rd Qu.:14.30
## Max. :0.6668 Max. :20.70
## NA's :297 NA's :163
## Status
## Length:2938
## Class :character
## Mode :character
##
##
##
##
replace_outliers <- function(column) {
Q1 <- quantile(column, 0.25, na.rm = TRUE)
Q3 <- quantile(column, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
# Replace outliers
column[column < lower_bound] <- lower_bound
column[column > upper_bound] <- upper_bound
return(column)
}
# List of columns to check for and replace outliers
cols_to_check <- c("GDP.BoxCox", "under.five.deaths.Log", "Population.BoxCox",
"infant.deaths.Log", "Adult.Mortality.BoxCox",
"Total.expenditure.BoxCox", "Schooling.Original",
"Hepatitis.B.BoxCox", "Polio.BoxCox", "Diphtheria.BoxCox")
# Loop through each specified column to replace outliers
for (col in cols_to_check) {
if (col %in% names(transformed_data)) {
transformed_data[[col]] <- replace_outliers(transformed_data[[col]])
} else {
cat("Warning: Column", col, "not found in data frame.\n")
}
}
# Display the cleaned data frame
View(transformed_data)
numeric_cols_names <- c(
"Life.expectancy",
"Adult.Mortality.BoxCox",
"infant.deaths.Log",
"Alcohol.Original",
"percentage.expenditure.Log",
"Hepatitis.B.BoxCox",
"Measles.Log",
"BMI.Original",
"under.five.deaths.Log",
"Polio.BoxCox",
"Total.expenditure.BoxCox",
"Diphtheria.BoxCox",
"HIV.AIDS.BoxCox",
"GDP.BoxCox",
"Population.BoxCox",
"thinness..1.19.years.BoxCox",
"thinness.5.9.years.BoxCox",
"Income.composition.of.resources.Log",
"Schooling.Original"
)
# Extract numeric columns based on the specified names
numeric_cols <- transformed_data[numeric_cols_names]
# Check the structure of the resulting numeric columns
str(numeric_cols)
## 'data.frame': 2938 obs. of 19 variables:
## $ Life.expectancy : num 65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
## $ Adult.Mortality.BoxCox : num 22.7 23 22.9 23.1 23.2 ...
## $ infant.deaths.Log : num 4.14 4.17 4.2 4.25 4.28 ...
## $ Alcohol.Original : num 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
## $ percentage.expenditure.Log : num 4.28 4.31 4.31 4.37 2.09 ...
## $ Hepatitis.B.BoxCox : num 2112 1922 2048 2244 2312 ...
## $ Measles.Log : num 7.05 6.2 6.07 7.93 8.01 ...
## $ BMI.Original : num 19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
## $ under.five.deaths.Log : num 4.43 4.47 4.5 4.54 4.58 ...
## $ Polio.BoxCox : num 548 1682 1922 2244 2312 ...
## $ Total.expenditure.BoxCox : num 4.13 4.14 4.12 4.28 4.01 ...
## $ Diphtheria.BoxCox : num 2112 1922 2048 2244 2312 ...
## $ HIV.AIDS.BoxCox : num -5.46 -5.46 -5.46 -5.46 -5.46 ...
## $ GDP.BoxCox : num 6.8 6.85 6.89 6.95 4.33 ...
## $ Population.BoxCox : num 30.7 19.1 30.5 24.8 24.2 ...
## $ thinness..1.19.years.BoxCox : num 3.97 4 4.02 4.04 4.07 ...
## $ thinness.5.9.years.BoxCox : num 3.98 4 4.02 4.05 4.07 ...
## $ Income.composition.of.resources.Log: num 0.391 0.389 0.385 0.38 0.374 ...
## $ Schooling.Original : num 10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
# Set up the plotting grid (adjust as needed based on the number of numeric columns)
par(mfrow = c(4, 5), mar = c(4, 4, 2, 1)) # 4 rows x 5 columns
# Create boxplots for each numeric column
for (col_name in numeric_cols_names) {
boxplot(numeric_cols[[col_name]],
main = col_name,
xlab = "",
ylab = "Values",
col = "lightblue",
border = "blue",
outline = TRUE) # outline = TRUE will show outliers
}

# Create the aggregation plot again
aggr_plot <- aggr(transformed_data,
col = c('navyblue', 'red'),
numbers = TRUE,
sortVars = TRUE,
labels = names(transformed_data), # Use the original labels
cex.axis = 0.7, # Increase text size slightly
gap = 3,
ylab = c("Histogram of missing data", "Pattern"),
prop = T,
combined = F,
oma = c(20,5,5,3))

##
## Variables sorted by number of missings:
## Variable Count
## Measles.Log 0.334581348
## infant.deaths.Log 0.288631722
## under.five.deaths.Log 0.267188564
## Population.BoxCox 0.221919673
## percentage.expenditure.Log 0.207964602
## Hepatitis.B.BoxCox 0.188223281
## GDP.BoxCox 0.152484683
## Income.composition.of.resources.Log 0.101089176
## Total.expenditure.BoxCox 0.076923077
## Alcohol.Original 0.066031314
## Schooling.Original 0.055479918
## BMI.Original 0.011572498
## thinness..1.19.years.BoxCox 0.011572498
## thinness.5.9.years.BoxCox 0.011572498
## Polio.BoxCox 0.006466984
## Diphtheria.BoxCox 0.006466984
## Life.expectancy 0.003403676
## Adult.Mortality.BoxCox 0.003403676
## Year.Original 0.000000000
## HIV.AIDS.BoxCox 0.000000000
## Country 0.000000000
## Status 0.000000000
impute_data <- mice(transformed_data, m = 5, method = 'pmm', seed = 123)
##
## iter imp variable
## 1 1 Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox Diphtheria.BoxCox GDP.BoxCox Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox Income.composition.of.resources.Log Schooling.Original
## 1 2 Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox Diphtheria.BoxCox GDP.BoxCox Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox Income.composition.of.resources.Log Schooling.Original
## 1 3 Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox Diphtheria.BoxCox GDP.BoxCox Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox Income.composition.of.resources.Log Schooling.Original
## 1 4 Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox Diphtheria.BoxCox GDP.BoxCox Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox Income.composition.of.resources.Log Schooling.Original
## 1 5 Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox Diphtheria.BoxCox GDP.BoxCox Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox Income.composition.of.resources.Log Schooling.Original
## 2 1 Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox Diphtheria.BoxCox GDP.BoxCox Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox Income.composition.of.resources.Log Schooling.Original
## 2 2 Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox Diphtheria.BoxCox GDP.BoxCox Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox Income.composition.of.resources.Log Schooling.Original
## 2 3 Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox Diphtheria.BoxCox GDP.BoxCox Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox Income.composition.of.resources.Log Schooling.Original
## 2 4 Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox Diphtheria.BoxCox GDP.BoxCox Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox Income.composition.of.resources.Log Schooling.Original
## 2 5 Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox Diphtheria.BoxCox GDP.BoxCox Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox Income.composition.of.resources.Log Schooling.Original
## 3 1 Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox Diphtheria.BoxCox GDP.BoxCox Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox Income.composition.of.resources.Log Schooling.Original
## 3 2 Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox Diphtheria.BoxCox GDP.BoxCox Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox Income.composition.of.resources.Log Schooling.Original
## 3 3 Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox Diphtheria.BoxCox GDP.BoxCox Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox Income.composition.of.resources.Log Schooling.Original
## 3 4 Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox Diphtheria.BoxCox GDP.BoxCox Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox Income.composition.of.resources.Log Schooling.Original
## 3 5 Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox Diphtheria.BoxCox GDP.BoxCox Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox Income.composition.of.resources.Log Schooling.Original
## 4 1 Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox Diphtheria.BoxCox GDP.BoxCox Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox Income.composition.of.resources.Log Schooling.Original
## 4 2 Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox Diphtheria.BoxCox GDP.BoxCox Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox Income.composition.of.resources.Log Schooling.Original
## 4 3 Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox Diphtheria.BoxCox GDP.BoxCox Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox Income.composition.of.resources.Log Schooling.Original
## 4 4 Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox Diphtheria.BoxCox GDP.BoxCox Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox Income.composition.of.resources.Log Schooling.Original
## 4 5 Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox Diphtheria.BoxCox GDP.BoxCox Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox Income.composition.of.resources.Log Schooling.Original
## 5 1 Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox Diphtheria.BoxCox GDP.BoxCox Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox Income.composition.of.resources.Log Schooling.Original
## 5 2 Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox Diphtheria.BoxCox GDP.BoxCox Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox Income.composition.of.resources.Log Schooling.Original
## 5 3 Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox Diphtheria.BoxCox GDP.BoxCox Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox Income.composition.of.resources.Log Schooling.Original
## 5 4 Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox Diphtheria.BoxCox GDP.BoxCox Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox Income.composition.of.resources.Log Schooling.Original
## 5 5 Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox Diphtheria.BoxCox GDP.BoxCox Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox Income.composition.of.resources.Log Schooling.Original
imputed_data <- complete(impute_data, 1)
aggr_plot <- aggr(imputed_data,
col = c('navyblue', 'red'),
numbers = TRUE,
sortVars = TRUE,
labels = names(imputed_data), # Use the original labels
cex.axis = 0.7, # Increase text size slightly
gap = 3,
ylab = c("Histogram of missing data", "Pattern"),
prop = T,
combined = F,
oma = c(20,5,5,3))

##
## Variables sorted by number of missings:
## Variable Count
## Year.Original 0
## Life.expectancy 0
## Adult.Mortality.BoxCox 0
## infant.deaths.Log 0
## Alcohol.Original 0
## percentage.expenditure.Log 0
## Hepatitis.B.BoxCox 0
## Measles.Log 0
## BMI.Original 0
## under.five.deaths.Log 0
## Polio.BoxCox 0
## Total.expenditure.BoxCox 0
## Diphtheria.BoxCox 0
## HIV.AIDS.BoxCox 0
## GDP.BoxCox 0
## Population.BoxCox 0
## thinness..1.19.years.BoxCox 0
## thinness.5.9.years.BoxCox 0
## Income.composition.of.resources.Log 0
## Schooling.Original 0
## Country 0
## Status 0
scaled_data <- imputed_data
columns_to_scale <- setdiff(names(scaled_data), c("Year.Original", "Life.expectancy", "Status", "Country"))
min_max_scale <- function(x) {
return((x - min(x)) / (max(x) - min(x)))
}
# Apply Min-Max scaling to the selected columns
scaled_data[, columns_to_scale] <- lapply(scaled_data[, columns_to_scale], min_max_scale)
# Display the scaled DataFrame
print(head(scaled_data))
## Year.Original Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log
## 1 2015 65.0 0.6550187 0.5328301
## 2 2014 59.9 0.6642686 0.5376569
## 3 2013 59.9 0.6608185 0.5423374
## 4 2012 59.5 0.6654137 0.5491024
## 5 2011 59.2 0.6688346 0.5534533
## 6 2010 58.8 0.6733626 0.5597580
## Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log
## 1 0 0.4278624 0.3867136 0.5494879
## 2 0 0.4309879 0.3448086 0.4759188
## 3 0 0.4305695 0.3725253 0.4643045
## 4 0 0.4371892 0.4157501 0.6256389
## 5 0 0.2040803 0.4305983 0.6323744
## 6 0 0.4391015 0.4011219 0.5965007
## BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox
## 1 0.2097335 0.5454238 0.0000000 0.6705082
## 2 0.2039397 0.5505446 0.2604974 0.6718475
## 3 0.1981460 0.5554917 0.3156413 0.6684968
## 4 0.1923523 0.5618373 0.3897409 0.6944110
## 5 0.1877173 0.5679184 0.4052502 0.6509339
## 6 0.1819235 0.5751799 0.3744615 0.7384481
## Diphtheria.BoxCox HIV.AIDS.BoxCox GDP.BoxCox Population.BoxCox
## 1 0.3594118 0 0.4237765 0.7465382
## 2 0.3156413 0 0.4286566 0.4071410
## 3 0.3445919 0 0.4318030 0.7414018
## 4 0.3897409 0 0.4378441 0.5727215
## 5 0.4052502 0 0.2010809 0.5569528
## 6 0.3744615 0 0.4181968 0.5545941
## thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox
## 1 0.8591598 0.8522667
## 2 0.8640139 0.8554636
## 3 0.8672142 0.8586322
## 4 0.8703864 0.8633333
## 5 0.8750935 0.8664336
## 6 0.8781981 0.8695075
## Income.composition.of.resources.Log Schooling.Original Country Status
## 1 0.3757978 0.3750000 Afghanistan Developing
## 2 0.3711964 0.3690476 Afghanistan Developing
## 3 0.3619653 0.3630952 Afghanistan Developing
## 4 0.3511480 0.3571429 Afghanistan Developing
## 5 0.3371637 0.3392857 Afghanistan Developing
## 6 0.3277927 0.3214286 Afghanistan Developing
pca_data <- scaled_data[, !(names(scaled_data) %in% c("Country", "Status", "Life.expectancy", "Year.Original"))]
# Step 3: Perform PCA
pca_result <- prcomp(pca_data, center = TRUE, scale. = TRUE)
# Step 4: Summary of PCA (to see variance explained by principal components)
pca_summary <- summary(pca_result)
pca_summary
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.7878 1.3691 1.24961 1.08407 1.02726 0.88909 0.83200
## Proportion of Variance 0.4318 0.1041 0.08675 0.06529 0.05863 0.04392 0.03846
## Cumulative Proportion 0.4318 0.5359 0.62266 0.68795 0.74657 0.79049 0.82895
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.82321 0.73053 0.67439 0.63867 0.58154 0.53208 0.39885
## Proportion of Variance 0.03765 0.02965 0.02527 0.02266 0.01879 0.01573 0.00884
## Cumulative Proportion 0.86659 0.89624 0.92151 0.94417 0.96296 0.97869 0.98752
## PC15 PC16 PC17 PC18
## Standard deviation 0.26260 0.24465 0.22216 0.21544
## Proportion of Variance 0.00383 0.00333 0.00274 0.00258
## Cumulative Proportion 0.99135 0.99468 0.99742 1.00000
cumulative_variance <- pca_summary$importance[3, ]
num_components <- which(cumulative_variance >= 0.80)[1]
pca_scores <- as.data.frame(pca_result$x[, 1:num_components])
colnames(pca_scores) <- paste("PC", 1:num_components, sep = "")
print(head(pca_scores))
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## 1 -4.092536 0.55521256 -1.2772015 0.4993711 0.1909291 0.8034938 1.4975367
## 2 -3.783353 0.04342838 -0.1960483 0.7462391 0.1059007 1.6860901 0.9120533
## 3 -3.887147 0.09138994 -1.2887966 0.4034574 0.3602460 0.7117525 1.5857698
## 4 -3.846082 -0.14040545 -1.1403556 0.4141983 0.2869167 1.5244913 0.9451510
## 5 -4.517501 -0.61236990 -0.6349960 -0.5793528 -0.3421912 1.2223389 0.4466460
## 6 -3.999892 -0.11577365 -0.9764242 0.3106258 0.4120894 1.6822497 1.0762155
ols_data <- cbind(Life.expectancy = scaled_data$Life.expectancy, pca_scores)
# Step 2: Fit the OLS model
ols_model <- lm(Life.expectancy ~ ., data = ols_data)
# Step 3: Summary of the OLS model
summary(ols_model)
##
## Call:
## lm(formula = Life.expectancy ~ ., data = ols_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.4262 -2.3465 0.1761 2.6690 14.1678
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 69.23642 0.07814 886.078 < 2e-16 ***
## PC1 2.88268 0.02803 102.830 < 2e-16 ***
## PC2 -0.14684 0.05708 -2.572 0.0101 *
## PC3 -1.34359 0.06254 -21.483 < 2e-16 ***
## PC4 0.09457 0.07209 1.312 0.1897
## PC5 -2.12554 0.07608 -27.939 < 2e-16 ***
## PC6 0.70945 0.08790 8.071 1.01e-15 ***
## PC7 -0.01607 0.09393 -0.171 0.8642
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.235 on 2930 degrees of freedom
## Multiple R-squared: 0.8023, Adjusted R-squared: 0.8018
## F-statistic: 1699 on 7 and 2930 DF, p-value: < 2.2e-16
# Step 4: Optional - Diagnostics of the OLS model
par(mfrow = c(2, 2)) # Set up a 2x2 plotting area
plot(ols_model) # Residual diagnostics plots

# Step 1: Loadings and model coefficients
pca_loadings <- pca_result$rotation[, 1:num_components] # Retained loadings
coef <- ols_model$coefficients[-1] # Exclude intercept
# Step 2: Influence of original features on the target
feature_influence <- pca_loadings %*% coef
# Step 3: Create a data frame for interpretation
influence_df <- data.frame(Feature = rownames(pca_loadings), Influence = feature_influence)
influence_df <- influence_df[order(abs(influence_df$Influence), decreasing = TRUE), ]
# Step 4: View results
library(ggplot2)
# Create a bar plot for influence_df
ggplot(influence_df, aes(x = reorder(Feature, Influence), y = Influence)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() + # Flip coordinates for better readability
labs(title = "Influence of Features on Life Expectancy",
x = "Features",
y = "Influence") +
theme_minimal()
