# Load Library
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.3
library(readr)
## Warning: package 'readr' was built under R version 4.3.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.3.3
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(mice)
## Warning: package 'mice' was built under R version 4.3.3
## Warning in check_dep_version(): ABI version mismatch: 
## lme4 was built with Matrix ABI version 1
## Current Matrix ABI version is 0
## Please re-install lme4 from source or restore original 'Matrix' package
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(VIM)
## Warning: package 'VIM' was built under R version 4.3.3
## Loading required package: colorspace
## Warning: package 'colorspace' was built under R version 4.3.3
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(readr)
data <- read_csv("C:/Users/asus/Downloads/Life Expectancy Data.csv")

head(data)
## # A tibble: 6 × 22
##   Country      Year Status   `Life expectancy` `Adult Mortality` `infant deaths`
##   <chr>       <dbl> <chr>                <dbl>             <dbl>           <dbl>
## 1 Afghanistan  2015 Develop…              65                 263              62
## 2 Afghanistan  2014 Develop…              59.9               271              64
## 3 Afghanistan  2013 Develop…              59.9               268              66
## 4 Afghanistan  2012 Develop…              59.5               272              69
## 5 Afghanistan  2011 Develop…              59.2               275              71
## 6 Afghanistan  2010 Develop…              58.8               279              74
## # ℹ 16 more variables: Alcohol <dbl>, `percentage expenditure` <dbl>,
## #   `Hepatitis B` <dbl>, Measles <dbl>, BMI <dbl>, `under-five deaths` <dbl>,
## #   Polio <dbl>, `Total expenditure` <dbl>, Diphtheria <dbl>, `HIV/AIDS` <dbl>,
## #   GDP <dbl>, Population <dbl>, `thinness  1-19 years` <dbl>,
## #   `thinness 5-9 years` <dbl>, `Income composition of resources` <dbl>,
## #   Schooling <dbl>
summary(data)
##    Country               Year         Status          Life expectancy
##  Length:2938        Min.   :2000   Length:2938        Min.   :36.30  
##  Class :character   1st Qu.:2004   Class :character   1st Qu.:63.10  
##  Mode  :character   Median :2008   Mode  :character   Median :72.10  
##                     Mean   :2008                      Mean   :69.22  
##                     3rd Qu.:2012                      3rd Qu.:75.70  
##                     Max.   :2015                      Max.   :89.00  
##                                                       NA's   :10     
##  Adult Mortality infant deaths       Alcohol        percentage expenditure
##  Min.   :  1.0   Min.   :   0.0   Min.   : 0.0100   Min.   :    0.000     
##  1st Qu.: 74.0   1st Qu.:   0.0   1st Qu.: 0.8775   1st Qu.:    4.685     
##  Median :144.0   Median :   3.0   Median : 3.7550   Median :   64.913     
##  Mean   :164.8   Mean   :  30.3   Mean   : 4.6029   Mean   :  738.251     
##  3rd Qu.:228.0   3rd Qu.:  22.0   3rd Qu.: 7.7025   3rd Qu.:  441.534     
##  Max.   :723.0   Max.   :1800.0   Max.   :17.8700   Max.   :19479.912     
##  NA's   :10                       NA's   :194                             
##   Hepatitis B       Measles              BMI        under-five deaths
##  Min.   : 1.00   Min.   :     0.0   Min.   : 1.00   Min.   :   0.00  
##  1st Qu.:77.00   1st Qu.:     0.0   1st Qu.:19.30   1st Qu.:   0.00  
##  Median :92.00   Median :    17.0   Median :43.50   Median :   4.00  
##  Mean   :80.94   Mean   :  2419.6   Mean   :38.32   Mean   :  42.04  
##  3rd Qu.:97.00   3rd Qu.:   360.2   3rd Qu.:56.20   3rd Qu.:  28.00  
##  Max.   :99.00   Max.   :212183.0   Max.   :87.30   Max.   :2500.00  
##  NA's   :553                        NA's   :34                       
##      Polio       Total expenditure   Diphtheria       HIV/AIDS     
##  Min.   : 3.00   Min.   : 0.370    Min.   : 2.00   Min.   : 0.100  
##  1st Qu.:78.00   1st Qu.: 4.260    1st Qu.:78.00   1st Qu.: 0.100  
##  Median :93.00   Median : 5.755    Median :93.00   Median : 0.100  
##  Mean   :82.55   Mean   : 5.938    Mean   :82.32   Mean   : 1.742  
##  3rd Qu.:97.00   3rd Qu.: 7.492    3rd Qu.:97.00   3rd Qu.: 0.800  
##  Max.   :99.00   Max.   :17.600    Max.   :99.00   Max.   :50.600  
##  NA's   :19      NA's   :226       NA's   :19                      
##       GDP              Population        thinness  1-19 years
##  Min.   :     1.68   Min.   :3.400e+01   Min.   : 0.10       
##  1st Qu.:   463.94   1st Qu.:1.958e+05   1st Qu.: 1.60       
##  Median :  1766.95   Median :1.387e+06   Median : 3.30       
##  Mean   :  7483.16   Mean   :1.275e+07   Mean   : 4.84       
##  3rd Qu.:  5910.81   3rd Qu.:7.420e+06   3rd Qu.: 7.20       
##  Max.   :119172.74   Max.   :1.294e+09   Max.   :27.70       
##  NA's   :448         NA's   :652         NA's   :34          
##  thinness 5-9 years Income composition of resources   Schooling    
##  Min.   : 0.10      Min.   :0.0000                  Min.   : 0.00  
##  1st Qu.: 1.50      1st Qu.:0.4930                  1st Qu.:10.10  
##  Median : 3.30      Median :0.6770                  Median :12.30  
##  Mean   : 4.87      Mean   :0.6276                  Mean   :11.99  
##  3rd Qu.: 7.20      3rd Qu.:0.7790                  3rd Qu.:14.30  
##  Max.   :28.60      Max.   :0.9480                  Max.   :20.70  
##  NA's   :34         NA's   :167                     NA's   :163
sapply(data, class)
##                         Country                            Year 
##                     "character"                       "numeric" 
##                          Status                 Life expectancy 
##                     "character"                       "numeric" 
##                 Adult Mortality                   infant deaths 
##                       "numeric"                       "numeric" 
##                         Alcohol          percentage expenditure 
##                       "numeric"                       "numeric" 
##                     Hepatitis B                         Measles 
##                       "numeric"                       "numeric" 
##                             BMI               under-five deaths 
##                       "numeric"                       "numeric" 
##                           Polio               Total expenditure 
##                       "numeric"                       "numeric" 
##                      Diphtheria                        HIV/AIDS 
##                       "numeric"                       "numeric" 
##                             GDP                      Population 
##                       "numeric"                       "numeric" 
##            thinness  1-19 years              thinness 5-9 years 
##                       "numeric"                       "numeric" 
## Income composition of resources                       Schooling 
##                       "numeric"                       "numeric"
names(data) <- make.names(names(data))
numeric_cols <- data[sapply(data, is.numeric)]
# Perform imputation
# imputed_data <- mice(numeric_cols,m=5,maxit=50,meth='cart',seed=42)
numeric_cols <- data[sapply(data, is.numeric)]

# Set up the plotting grid (3 rows x 3 columns in this case)
par(mfrow = c(4, 5), mar = c(4, 4, 2, 1))  # Adjust margins for better spacing

# Create boxplots for each numeric column
for (col_name in names(numeric_cols)) {
  boxplot(numeric_cols[[col_name]],
          main = col_name,
          xlab = "",
          ylab = "Values",
          col = "lightblue",
          border = "blue")
}

par(mfrow = c(4, 5), mar = c(4, 4, 2, 1))  # Adjust margins for better spacing

# Create density plots for each numeric column
for (col_name in names(numeric_cols)) {
  # Ensure there are enough unique values for a meaningful density plot
  if (length(unique(numeric_cols[[col_name]])) > 1) {
    plot(density(numeric_cols[[col_name]], na.rm = TRUE),
         main = col_name,
         xlab = "Values",
         ylab = "Density",
         col = "blue",
         lwd = 2)
  } else {
    plot(1, type = "n", xlab = "", ylab = "", main = paste(col_name, "\n(No density)"))
    text(1, 1, "Not enough data", cex = 1.2)
  }
}

log_transform <- function(x) {
  # Check for NA and create a temporary vector that removes NAs
  temp_x <- na.omit(x)
  
  # Apply transformation only if there are no non-positive values
  if (any(temp_x <= 0)) {
    # Shift values by adding 1 to avoid log(0) and apply log
    return(ifelse(x > 0, log(x + 1), NA))  # Keep NAs for non-positive values
  } else {
    return(log(x))
  }
}
# Function to apply Box-Cox transformation and handle NA values
apply_boxcox <- function(x) {
  # Remove NA values for the Box-Cox calculation
  x_non_na <- x[!is.na(x)]
  
  # Apply Box-Cox transformation if there are enough data points
  if (length(x_non_na) > 1 && all(x_non_na > 0)) {
    lambda <- boxcox(x_non_na ~ 1, lambda = seq(-2, 2, 0.1))$x[which.max(boxcox(x_non_na ~ 1, lambda = seq(-2, 2, 0.1))$y)]
    transformed <- (x^lambda - 1) / lambda  # Box-Cox transformation
    return(ifelse(is.na(x), NA, transformed))  # Retain NA values
  } else {
    return(x)  # Return original data if conditions not met
  }
}
numeric_cols <- data[sapply(data, is.numeric)]

# Initialize an outlier counts dataframe
outlier_counts <- data.frame(Column = character(), OriginalCount = integer(), LogCount = integer(), BoxcoxCount = integer(), stringsAsFactors = FALSE)

# Function to calculate outliers and counts
calculate_outliers <- function(column) {
  Q1 <- quantile(column, 0.25, na.rm = TRUE)
  Q3 <- quantile(column, 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  
  # Determine lower and upper bounds for outliers
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR
  
  # Count outliers, excluding NA values
  outlier_count <- sum(column < lower_bound | column > upper_bound, na.rm = TRUE)
  
  return(outlier_count)
}

# Loop through numeric columns to calculate outlier counts
for (col_name in names(numeric_cols)) {
  # Count outliers in original data
  original_count <- calculate_outliers(numeric_cols[[col_name]])
  
  # Apply log transformation and count outliers
  log_transformed_col <- log_transform(numeric_cols[[col_name]])
  log_count <- calculate_outliers(log_transformed_col)
  
  # Apply Box-Cox transformation and count outliers
  boxcox_transformed_col <- apply_boxcox(numeric_cols[[col_name]])
  boxcox_count <- calculate_outliers(boxcox_transformed_col)
  
  # Append results to outlier_counts dataframe
  outlier_counts <- rbind(outlier_counts, data.frame(Column = col_name, OriginalCount = original_count, LogCount = log_count, BoxcoxCount = boxcox_count))
}

# Assuming outlier_counts is your outlier summary dataframe

# Find the minimum outlier counts across the three sets (original, log, Box-Cox)
outlier_counts$MinCount <- pmin(outlier_counts$OriginalCount, outlier_counts$LogCount, outlier_counts$BoxcoxCount)

# Sort the columns by the minimum outlier count
sorted_outliers <- outlier_counts[order(outlier_counts$MinCount), ]

# Select columns with the least outliers (e.g., top 5 columns with the least outliers)
least_outliers <- head(sorted_outliers, n = 20)

# Print the columns with the least outliers
print(least_outliers)
##                             Column OriginalCount LogCount BoxcoxCount MinCount
## 1                             Year             0        0           0        0
## 5                          Alcohol             0      315           0        0
## 6           percentage.expenditure           389        0         389        0
## 8                          Measles           542        0         542        0
## 9                              BMI             0      103           0        0
## 14                        HIV.AIDS           542       65           0        0
## 17            thinness..1.19.years            89       28           0        0
## 18              thinness.5.9.years            96       37           0        0
## 19 Income.composition.of.resources           130        0         130        0
## 2                  Life.expectancy            10       77           1        1
## 15                             GDP           365        5           4        4
## 10               under.five.deaths           394        8         394        8
## 16                      Population           294       30           9        9
## 4                    infant.deaths           315       10         315       10
## 3                  Adult.Mortality            82      166          11       11
## 12               Total.expenditure            32      116          27       27
## 20                       Schooling            44      122          44       44
## 7                      Hepatitis.B           254      293         186      186
## 11                           Polio           279      332         186      186
## 13                      Diphtheria           298      332         202      202
# Identify categorical columns
categorical_cols <- data[sapply(data, is.factor) | sapply(data, is.character)]

# Initialize a new dataframe to hold transformed numeric columns
best_transformed_data <- data.frame(matrix(ncol = ncol(numeric_cols), nrow = nrow(numeric_cols)))

# Set column names with transformations
colnames(best_transformed_data) <- names(numeric_cols)

# Loop through the outlier_counts to apply the best transformation
for (row in 1:nrow(outlier_counts)) {
  col_name <- outlier_counts$Column[row]
  
  if (outlier_counts$MinCount[row] == outlier_counts$OriginalCount[row]) {
    best_transformed_data[[col_name]] <- numeric_cols[[col_name]]
    colnames(best_transformed_data)[row] <- paste(col_name, "Original", sep = ".")
  } else if (outlier_counts$MinCount[row] == outlier_counts$LogCount[row]) {
    best_transformed_data[[col_name]] <- log_transform(as.numeric(numeric_cols[[col_name]]))
    colnames(best_transformed_data)[row] <- paste(col_name, "Log", sep = ".")
  } else {
    best_transformed_data[[col_name]] <- apply_boxcox(numeric_cols[[col_name]])
    colnames(best_transformed_data)[row] <- paste(col_name, "BoxCox", sep = ".")
  }
}

# Combine with categorical columns
transformed_data <- cbind(best_transformed_data, categorical_cols)

# Display the final dataframe with transformed numeric and original categorical columns
print(head(transformed_data))
##   Year.Original Life.expectancy.BoxCox Adult.Mortality.BoxCox infant.deaths.Log
## 1          2015               2112.000               22.70590          4.143135
## 2          2014               1793.505               23.02654          4.174387
## 3          2013               1793.505               22.90695          4.204693
## 4          2012               1769.625               23.06624          4.248495
## 5          2011               1751.820               23.18482          4.276666
## 6          2010               1728.220               23.34178          4.317488
##   Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log
## 1             0.01                   4.280542             2112.0    7.051856
## 2             0.01                   4.311116             1921.5    6.200509
## 3             0.01                   4.307023             2047.5    6.066108
## 4             0.01                   4.371777             2244.0    7.933080
## 5             0.01                   2.091507             2311.5    8.011023
## 6             0.01                   4.390483             2177.5    7.595890
##   BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox
## 1         19.1              4.430817         17.5                 4.131972
## 2         18.6              4.465908       1681.5                 4.140352
## 3         18.1              4.499810       1921.5                 4.119386
## 4         17.6              4.543295       2244.0                 4.281537
## 5         17.2              4.584967       2311.5                 4.009491
## 6         16.7              4.634729       2177.5                 4.557087
##   Diphtheria.BoxCox HIV.AIDS.BoxCox GDP.BoxCox Population.BoxCox
## 1            2112.0       -5.462383   6.798422          30.67723
## 2            1921.5       -5.462383   6.852500          19.12425
## 3            2047.5       -5.462383   6.887365          30.50238
## 4            2244.0       -5.462383   6.954307          24.76056
## 5            2311.5       -5.462383   4.330698          24.22380
## 6            2177.5       -5.462383   6.736593          24.14351
##   thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox
## 1                    3.967874                  3.978790
## 2                    4.000475                  4.000475
## 3                    4.021968                  4.021968
## 4                    4.043273                  4.053856
## 5                    4.074886                  4.074886
## 6                    4.095737                  4.095737
##   Income.composition.of.resources.Log Schooling.Original     Country     Status
## 1                           0.3913662               10.1 Afghanistan Developing
## 2                           0.3893357               10.0 Afghanistan Developing
## 3                           0.3852624                9.9 Afghanistan Developing
## 4                           0.3804891                9.8 Afghanistan Developing
## 5                           0.3743184                9.5 Afghanistan Developing
## 6                           0.3701833                9.2 Afghanistan Developing
transformed_data$Life.expectancy.BoxCox <- data$Life.expectancy
names(transformed_data)[names(transformed_data) == "Life.expectancy.BoxCox"] <- "Life.expectancy"

summary(transformed_data)
##  Year.Original  Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log
##  Min.   :2000   Min.   :36.30   Min.   : 0.00          Min.   :0.6931   
##  1st Qu.:2004   1st Qu.:63.10   1st Qu.:12.28          1st Qu.:1.0986   
##  Median :2008   Median :72.10   Median :17.05          Median :2.3026   
##  Mean   :2008   Mean   :69.22   Mean   :16.59          Mean   :2.4862   
##  3rd Qu.:2012   3rd Qu.:75.70   3rd Qu.:21.23          3rd Qu.:3.5264   
##  Max.   :2015   Max.   :89.00   Max.   :36.13          Max.   :7.4961   
##                 NA's   :10      NA's   :10             NA's   :848      
##  Alcohol.Original  percentage.expenditure.Log Hepatitis.B.BoxCox
##  Min.   : 0.0100   Min.   :0.0952             Min.   :   0      
##  1st Qu.: 0.8775   1st Qu.:3.6175             1st Qu.:2964      
##  Median : 3.7550   Median :5.0511             Median :4232      
##  Mean   : 4.6029   Mean   :5.0330             Mean   :3589      
##  3rd Qu.: 7.7025   3rd Qu.:6.4477             3rd Qu.:4704      
##  Max.   :17.8700   Max.   :9.8772             Max.   :4900      
##  NA's   :194       NA's   :611                NA's   :553       
##   Measles.Log       BMI.Original   under.five.deaths.Log  Polio.BoxCox 
##  Min.   : 0.6931   Min.   : 1.00   Min.   :0.6931        Min.   :   4  
##  1st Qu.: 2.8904   1st Qu.:19.30   1st Qu.:1.3863        1st Qu.:3042  
##  Median : 4.8598   Median :43.50   Median :2.4849        Median :4324  
##  Mean   : 5.0450   Mean   :38.32   Mean   :2.6475        Mean   :3681  
##  3rd Qu.: 7.0992   3rd Qu.:56.20   3rd Qu.:3.8501        3rd Qu.:4704  
##  Max.   :12.2652   Max.   :87.30   Max.   :7.8244        Max.   :4900  
##  NA's   :983       NA's   :34      NA's   :785           NA's   :19    
##  Total.expenditure.BoxCox Diphtheria.BoxCox HIV.AIDS.BoxCox     GDP.BoxCox     
##  Min.   :-0.7536          Min.   :   1.5    Min.   :-5.4624   Min.   : 0.5223  
##  1st Qu.: 2.2829          1st Qu.:3041.5    1st Qu.:-5.4624   1st Qu.: 6.5368  
##  Median : 3.0518          Median :4324.0    Median :-5.4624   Median : 8.0713  
##  Mean   : 3.0301          Mean   :3669.3    Mean   :-3.3728   Mean   : 8.1009  
##  3rd Qu.: 3.8472          3rd Qu.:4704.0    3rd Qu.:-0.2406   3rd Qu.: 9.4929  
##  Max.   : 7.4532          Max.   :4900.0    Max.   : 1.3904   Max.   :13.1836  
##  NA's   :226              NA's   :19                          NA's   :448      
##  Population.BoxCox thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox
##  Min.   : 3.932    Min.   :-1.8023             Min.   :-1.8023          
##  1st Qu.:18.030    1st Qu.: 0.4954             1st Qu.: 0.4243          
##  Median :22.380    Median : 1.3673             Median : 1.3673          
##  Mean   :22.282    Mean   : 1.4341             Mean   : 1.4273          
##  3rd Qu.:26.540    3rd Qu.: 2.4780             3rd Qu.: 2.4780          
##  Max.   :42.346    Max.   : 4.9138             Max.   : 4.9809          
##  NA's   :652       NA's   :34                  NA's   :34               
##  Income.composition.of.resources.Log Schooling.Original   Country         
##  Min.   :0.2255                      Min.   : 0.00      Length:2938       
##  1st Qu.:0.4207                      1st Qu.:10.10      Class :character  
##  Median :0.5230                      Median :12.30      Mode  :character  
##  Mean   :0.5009                      Mean   :11.99                        
##  3rd Qu.:0.5789                      3rd Qu.:14.30                        
##  Max.   :0.6668                      Max.   :20.70                        
##  NA's   :297                         NA's   :163                          
##     Status         
##  Length:2938       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
replace_outliers <- function(column) {
  Q1 <- quantile(column, 0.25, na.rm = TRUE)
  Q3 <- quantile(column, 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR
  
  # Replace outliers
  column[column < lower_bound] <- lower_bound
  column[column > upper_bound] <- upper_bound
  
  return(column)
}

# List of columns to check for and replace outliers
cols_to_check <- c("GDP.BoxCox", "under.five.deaths.Log", "Population.BoxCox", 
                   "infant.deaths.Log", "Adult.Mortality.BoxCox", 
                   "Total.expenditure.BoxCox", "Schooling.Original", 
                   "Hepatitis.B.BoxCox", "Polio.BoxCox", "Diphtheria.BoxCox")

# Loop through each specified column to replace outliers
for (col in cols_to_check) {
  if (col %in% names(transformed_data)) {
    transformed_data[[col]] <- replace_outliers(transformed_data[[col]])
  } else {
    cat("Warning: Column", col, "not found in data frame.\n")
  }
}

# Display the cleaned data frame
View(transformed_data)
numeric_cols_names <- c(
  "Life.expectancy",
  "Adult.Mortality.BoxCox",
  "infant.deaths.Log",
  "Alcohol.Original",
  "percentage.expenditure.Log",
  "Hepatitis.B.BoxCox",
  "Measles.Log",
  "BMI.Original",
  "under.five.deaths.Log",
  "Polio.BoxCox",
  "Total.expenditure.BoxCox",
  "Diphtheria.BoxCox",
  "HIV.AIDS.BoxCox",
  "GDP.BoxCox",
  "Population.BoxCox",
  "thinness..1.19.years.BoxCox",
  "thinness.5.9.years.BoxCox",
  "Income.composition.of.resources.Log",
  "Schooling.Original"
)

# Extract numeric columns based on the specified names
numeric_cols <- transformed_data[numeric_cols_names]

# Check the structure of the resulting numeric columns
str(numeric_cols)
## 'data.frame':    2938 obs. of  19 variables:
##  $ Life.expectancy                    : num  65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
##  $ Adult.Mortality.BoxCox             : num  22.7 23 22.9 23.1 23.2 ...
##  $ infant.deaths.Log                  : num  4.14 4.17 4.2 4.25 4.28 ...
##  $ Alcohol.Original                   : num  0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
##  $ percentage.expenditure.Log         : num  4.28 4.31 4.31 4.37 2.09 ...
##  $ Hepatitis.B.BoxCox                 : num  2112 1922 2048 2244 2312 ...
##  $ Measles.Log                        : num  7.05 6.2 6.07 7.93 8.01 ...
##  $ BMI.Original                       : num  19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
##  $ under.five.deaths.Log              : num  4.43 4.47 4.5 4.54 4.58 ...
##  $ Polio.BoxCox                       : num  548 1682 1922 2244 2312 ...
##  $ Total.expenditure.BoxCox           : num  4.13 4.14 4.12 4.28 4.01 ...
##  $ Diphtheria.BoxCox                  : num  2112 1922 2048 2244 2312 ...
##  $ HIV.AIDS.BoxCox                    : num  -5.46 -5.46 -5.46 -5.46 -5.46 ...
##  $ GDP.BoxCox                         : num  6.8 6.85 6.89 6.95 4.33 ...
##  $ Population.BoxCox                  : num  30.7 19.1 30.5 24.8 24.2 ...
##  $ thinness..1.19.years.BoxCox        : num  3.97 4 4.02 4.04 4.07 ...
##  $ thinness.5.9.years.BoxCox          : num  3.98 4 4.02 4.05 4.07 ...
##  $ Income.composition.of.resources.Log: num  0.391 0.389 0.385 0.38 0.374 ...
##  $ Schooling.Original                 : num  10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
# Set up the plotting grid (adjust as needed based on the number of numeric columns)
par(mfrow = c(4, 5), mar = c(4, 4, 2, 1))  # 4 rows x 5 columns

# Create boxplots for each numeric column
for (col_name in numeric_cols_names) {
  boxplot(numeric_cols[[col_name]],
          main = col_name,
          xlab = "",
          ylab = "Values",
          col = "lightblue",
          border = "blue",
          outline = TRUE)  # outline = TRUE will show outliers
}

# Create the aggregation plot again
aggr_plot <- aggr(transformed_data, 
                  col = c('navyblue', 'red'), 
                  numbers = TRUE, 
                  sortVars = TRUE, 
                  labels = names(transformed_data),  # Use the original labels
                  cex.axis = 0.7,  # Increase text size slightly
                  gap = 3, 
                  ylab = c("Histogram of missing data", "Pattern"),
                  prop = T,
                  combined = F, 
                  oma = c(20,5,5,3))

## 
##  Variables sorted by number of missings: 
##                             Variable       Count
##                          Measles.Log 0.334581348
##                    infant.deaths.Log 0.288631722
##                under.five.deaths.Log 0.267188564
##                    Population.BoxCox 0.221919673
##           percentage.expenditure.Log 0.207964602
##                   Hepatitis.B.BoxCox 0.188223281
##                           GDP.BoxCox 0.152484683
##  Income.composition.of.resources.Log 0.101089176
##             Total.expenditure.BoxCox 0.076923077
##                     Alcohol.Original 0.066031314
##                   Schooling.Original 0.055479918
##                         BMI.Original 0.011572498
##          thinness..1.19.years.BoxCox 0.011572498
##            thinness.5.9.years.BoxCox 0.011572498
##                         Polio.BoxCox 0.006466984
##                    Diphtheria.BoxCox 0.006466984
##                      Life.expectancy 0.003403676
##               Adult.Mortality.BoxCox 0.003403676
##                        Year.Original 0.000000000
##                      HIV.AIDS.BoxCox 0.000000000
##                              Country 0.000000000
##                               Status 0.000000000
impute_data <- mice(transformed_data, m = 5, method = 'pmm', seed = 123)
## 
##  iter imp variable
##   1   1  Life.expectancy  Adult.Mortality.BoxCox  infant.deaths.Log  Alcohol.Original  percentage.expenditure.Log  Hepatitis.B.BoxCox  Measles.Log  BMI.Original  under.five.deaths.Log  Polio.BoxCox  Total.expenditure.BoxCox  Diphtheria.BoxCox  GDP.BoxCox  Population.BoxCox  thinness..1.19.years.BoxCox  thinness.5.9.years.BoxCox  Income.composition.of.resources.Log  Schooling.Original
##   1   2  Life.expectancy  Adult.Mortality.BoxCox  infant.deaths.Log  Alcohol.Original  percentage.expenditure.Log  Hepatitis.B.BoxCox  Measles.Log  BMI.Original  under.five.deaths.Log  Polio.BoxCox  Total.expenditure.BoxCox  Diphtheria.BoxCox  GDP.BoxCox  Population.BoxCox  thinness..1.19.years.BoxCox  thinness.5.9.years.BoxCox  Income.composition.of.resources.Log  Schooling.Original
##   1   3  Life.expectancy  Adult.Mortality.BoxCox  infant.deaths.Log  Alcohol.Original  percentage.expenditure.Log  Hepatitis.B.BoxCox  Measles.Log  BMI.Original  under.five.deaths.Log  Polio.BoxCox  Total.expenditure.BoxCox  Diphtheria.BoxCox  GDP.BoxCox  Population.BoxCox  thinness..1.19.years.BoxCox  thinness.5.9.years.BoxCox  Income.composition.of.resources.Log  Schooling.Original
##   1   4  Life.expectancy  Adult.Mortality.BoxCox  infant.deaths.Log  Alcohol.Original  percentage.expenditure.Log  Hepatitis.B.BoxCox  Measles.Log  BMI.Original  under.five.deaths.Log  Polio.BoxCox  Total.expenditure.BoxCox  Diphtheria.BoxCox  GDP.BoxCox  Population.BoxCox  thinness..1.19.years.BoxCox  thinness.5.9.years.BoxCox  Income.composition.of.resources.Log  Schooling.Original
##   1   5  Life.expectancy  Adult.Mortality.BoxCox  infant.deaths.Log  Alcohol.Original  percentage.expenditure.Log  Hepatitis.B.BoxCox  Measles.Log  BMI.Original  under.five.deaths.Log  Polio.BoxCox  Total.expenditure.BoxCox  Diphtheria.BoxCox  GDP.BoxCox  Population.BoxCox  thinness..1.19.years.BoxCox  thinness.5.9.years.BoxCox  Income.composition.of.resources.Log  Schooling.Original
##   2   1  Life.expectancy  Adult.Mortality.BoxCox  infant.deaths.Log  Alcohol.Original  percentage.expenditure.Log  Hepatitis.B.BoxCox  Measles.Log  BMI.Original  under.five.deaths.Log  Polio.BoxCox  Total.expenditure.BoxCox  Diphtheria.BoxCox  GDP.BoxCox  Population.BoxCox  thinness..1.19.years.BoxCox  thinness.5.9.years.BoxCox  Income.composition.of.resources.Log  Schooling.Original
##   2   2  Life.expectancy  Adult.Mortality.BoxCox  infant.deaths.Log  Alcohol.Original  percentage.expenditure.Log  Hepatitis.B.BoxCox  Measles.Log  BMI.Original  under.five.deaths.Log  Polio.BoxCox  Total.expenditure.BoxCox  Diphtheria.BoxCox  GDP.BoxCox  Population.BoxCox  thinness..1.19.years.BoxCox  thinness.5.9.years.BoxCox  Income.composition.of.resources.Log  Schooling.Original
##   2   3  Life.expectancy  Adult.Mortality.BoxCox  infant.deaths.Log  Alcohol.Original  percentage.expenditure.Log  Hepatitis.B.BoxCox  Measles.Log  BMI.Original  under.five.deaths.Log  Polio.BoxCox  Total.expenditure.BoxCox  Diphtheria.BoxCox  GDP.BoxCox  Population.BoxCox  thinness..1.19.years.BoxCox  thinness.5.9.years.BoxCox  Income.composition.of.resources.Log  Schooling.Original
##   2   4  Life.expectancy  Adult.Mortality.BoxCox  infant.deaths.Log  Alcohol.Original  percentage.expenditure.Log  Hepatitis.B.BoxCox  Measles.Log  BMI.Original  under.five.deaths.Log  Polio.BoxCox  Total.expenditure.BoxCox  Diphtheria.BoxCox  GDP.BoxCox  Population.BoxCox  thinness..1.19.years.BoxCox  thinness.5.9.years.BoxCox  Income.composition.of.resources.Log  Schooling.Original
##   2   5  Life.expectancy  Adult.Mortality.BoxCox  infant.deaths.Log  Alcohol.Original  percentage.expenditure.Log  Hepatitis.B.BoxCox  Measles.Log  BMI.Original  under.five.deaths.Log  Polio.BoxCox  Total.expenditure.BoxCox  Diphtheria.BoxCox  GDP.BoxCox  Population.BoxCox  thinness..1.19.years.BoxCox  thinness.5.9.years.BoxCox  Income.composition.of.resources.Log  Schooling.Original
##   3   1  Life.expectancy  Adult.Mortality.BoxCox  infant.deaths.Log  Alcohol.Original  percentage.expenditure.Log  Hepatitis.B.BoxCox  Measles.Log  BMI.Original  under.five.deaths.Log  Polio.BoxCox  Total.expenditure.BoxCox  Diphtheria.BoxCox  GDP.BoxCox  Population.BoxCox  thinness..1.19.years.BoxCox  thinness.5.9.years.BoxCox  Income.composition.of.resources.Log  Schooling.Original
##   3   2  Life.expectancy  Adult.Mortality.BoxCox  infant.deaths.Log  Alcohol.Original  percentage.expenditure.Log  Hepatitis.B.BoxCox  Measles.Log  BMI.Original  under.five.deaths.Log  Polio.BoxCox  Total.expenditure.BoxCox  Diphtheria.BoxCox  GDP.BoxCox  Population.BoxCox  thinness..1.19.years.BoxCox  thinness.5.9.years.BoxCox  Income.composition.of.resources.Log  Schooling.Original
##   3   3  Life.expectancy  Adult.Mortality.BoxCox  infant.deaths.Log  Alcohol.Original  percentage.expenditure.Log  Hepatitis.B.BoxCox  Measles.Log  BMI.Original  under.five.deaths.Log  Polio.BoxCox  Total.expenditure.BoxCox  Diphtheria.BoxCox  GDP.BoxCox  Population.BoxCox  thinness..1.19.years.BoxCox  thinness.5.9.years.BoxCox  Income.composition.of.resources.Log  Schooling.Original
##   3   4  Life.expectancy  Adult.Mortality.BoxCox  infant.deaths.Log  Alcohol.Original  percentage.expenditure.Log  Hepatitis.B.BoxCox  Measles.Log  BMI.Original  under.five.deaths.Log  Polio.BoxCox  Total.expenditure.BoxCox  Diphtheria.BoxCox  GDP.BoxCox  Population.BoxCox  thinness..1.19.years.BoxCox  thinness.5.9.years.BoxCox  Income.composition.of.resources.Log  Schooling.Original
##   3   5  Life.expectancy  Adult.Mortality.BoxCox  infant.deaths.Log  Alcohol.Original  percentage.expenditure.Log  Hepatitis.B.BoxCox  Measles.Log  BMI.Original  under.five.deaths.Log  Polio.BoxCox  Total.expenditure.BoxCox  Diphtheria.BoxCox  GDP.BoxCox  Population.BoxCox  thinness..1.19.years.BoxCox  thinness.5.9.years.BoxCox  Income.composition.of.resources.Log  Schooling.Original
##   4   1  Life.expectancy  Adult.Mortality.BoxCox  infant.deaths.Log  Alcohol.Original  percentage.expenditure.Log  Hepatitis.B.BoxCox  Measles.Log  BMI.Original  under.five.deaths.Log  Polio.BoxCox  Total.expenditure.BoxCox  Diphtheria.BoxCox  GDP.BoxCox  Population.BoxCox  thinness..1.19.years.BoxCox  thinness.5.9.years.BoxCox  Income.composition.of.resources.Log  Schooling.Original
##   4   2  Life.expectancy  Adult.Mortality.BoxCox  infant.deaths.Log  Alcohol.Original  percentage.expenditure.Log  Hepatitis.B.BoxCox  Measles.Log  BMI.Original  under.five.deaths.Log  Polio.BoxCox  Total.expenditure.BoxCox  Diphtheria.BoxCox  GDP.BoxCox  Population.BoxCox  thinness..1.19.years.BoxCox  thinness.5.9.years.BoxCox  Income.composition.of.resources.Log  Schooling.Original
##   4   3  Life.expectancy  Adult.Mortality.BoxCox  infant.deaths.Log  Alcohol.Original  percentage.expenditure.Log  Hepatitis.B.BoxCox  Measles.Log  BMI.Original  under.five.deaths.Log  Polio.BoxCox  Total.expenditure.BoxCox  Diphtheria.BoxCox  GDP.BoxCox  Population.BoxCox  thinness..1.19.years.BoxCox  thinness.5.9.years.BoxCox  Income.composition.of.resources.Log  Schooling.Original
##   4   4  Life.expectancy  Adult.Mortality.BoxCox  infant.deaths.Log  Alcohol.Original  percentage.expenditure.Log  Hepatitis.B.BoxCox  Measles.Log  BMI.Original  under.five.deaths.Log  Polio.BoxCox  Total.expenditure.BoxCox  Diphtheria.BoxCox  GDP.BoxCox  Population.BoxCox  thinness..1.19.years.BoxCox  thinness.5.9.years.BoxCox  Income.composition.of.resources.Log  Schooling.Original
##   4   5  Life.expectancy  Adult.Mortality.BoxCox  infant.deaths.Log  Alcohol.Original  percentage.expenditure.Log  Hepatitis.B.BoxCox  Measles.Log  BMI.Original  under.five.deaths.Log  Polio.BoxCox  Total.expenditure.BoxCox  Diphtheria.BoxCox  GDP.BoxCox  Population.BoxCox  thinness..1.19.years.BoxCox  thinness.5.9.years.BoxCox  Income.composition.of.resources.Log  Schooling.Original
##   5   1  Life.expectancy  Adult.Mortality.BoxCox  infant.deaths.Log  Alcohol.Original  percentage.expenditure.Log  Hepatitis.B.BoxCox  Measles.Log  BMI.Original  under.five.deaths.Log  Polio.BoxCox  Total.expenditure.BoxCox  Diphtheria.BoxCox  GDP.BoxCox  Population.BoxCox  thinness..1.19.years.BoxCox  thinness.5.9.years.BoxCox  Income.composition.of.resources.Log  Schooling.Original
##   5   2  Life.expectancy  Adult.Mortality.BoxCox  infant.deaths.Log  Alcohol.Original  percentage.expenditure.Log  Hepatitis.B.BoxCox  Measles.Log  BMI.Original  under.five.deaths.Log  Polio.BoxCox  Total.expenditure.BoxCox  Diphtheria.BoxCox  GDP.BoxCox  Population.BoxCox  thinness..1.19.years.BoxCox  thinness.5.9.years.BoxCox  Income.composition.of.resources.Log  Schooling.Original
##   5   3  Life.expectancy  Adult.Mortality.BoxCox  infant.deaths.Log  Alcohol.Original  percentage.expenditure.Log  Hepatitis.B.BoxCox  Measles.Log  BMI.Original  under.five.deaths.Log  Polio.BoxCox  Total.expenditure.BoxCox  Diphtheria.BoxCox  GDP.BoxCox  Population.BoxCox  thinness..1.19.years.BoxCox  thinness.5.9.years.BoxCox  Income.composition.of.resources.Log  Schooling.Original
##   5   4  Life.expectancy  Adult.Mortality.BoxCox  infant.deaths.Log  Alcohol.Original  percentage.expenditure.Log  Hepatitis.B.BoxCox  Measles.Log  BMI.Original  under.five.deaths.Log  Polio.BoxCox  Total.expenditure.BoxCox  Diphtheria.BoxCox  GDP.BoxCox  Population.BoxCox  thinness..1.19.years.BoxCox  thinness.5.9.years.BoxCox  Income.composition.of.resources.Log  Schooling.Original
##   5   5  Life.expectancy  Adult.Mortality.BoxCox  infant.deaths.Log  Alcohol.Original  percentage.expenditure.Log  Hepatitis.B.BoxCox  Measles.Log  BMI.Original  under.five.deaths.Log  Polio.BoxCox  Total.expenditure.BoxCox  Diphtheria.BoxCox  GDP.BoxCox  Population.BoxCox  thinness..1.19.years.BoxCox  thinness.5.9.years.BoxCox  Income.composition.of.resources.Log  Schooling.Original
imputed_data <- complete(impute_data, 1)
aggr_plot <- aggr(imputed_data, 
                  col = c('navyblue', 'red'), 
                  numbers = TRUE, 
                  sortVars = TRUE, 
                  labels = names(imputed_data),  # Use the original labels
                  cex.axis = 0.7,  # Increase text size slightly
                  gap = 3, 
                  ylab = c("Histogram of missing data", "Pattern"),
                  prop = T,
                  combined = F, 
                  oma = c(20,5,5,3))

## 
##  Variables sorted by number of missings: 
##                             Variable Count
##                        Year.Original     0
##                      Life.expectancy     0
##               Adult.Mortality.BoxCox     0
##                    infant.deaths.Log     0
##                     Alcohol.Original     0
##           percentage.expenditure.Log     0
##                   Hepatitis.B.BoxCox     0
##                          Measles.Log     0
##                         BMI.Original     0
##                under.five.deaths.Log     0
##                         Polio.BoxCox     0
##             Total.expenditure.BoxCox     0
##                    Diphtheria.BoxCox     0
##                      HIV.AIDS.BoxCox     0
##                           GDP.BoxCox     0
##                    Population.BoxCox     0
##          thinness..1.19.years.BoxCox     0
##            thinness.5.9.years.BoxCox     0
##  Income.composition.of.resources.Log     0
##                   Schooling.Original     0
##                              Country     0
##                               Status     0
scaled_data <- imputed_data
columns_to_scale <- setdiff(names(scaled_data), c("Year.Original", "Life.expectancy", "Status", "Country"))

min_max_scale <- function(x) {
  return((x - min(x)) / (max(x) - min(x)))
}

# Apply Min-Max scaling to the selected columns
scaled_data[, columns_to_scale] <- lapply(scaled_data[, columns_to_scale], min_max_scale)

# Display the scaled DataFrame
print(head(scaled_data))
##   Year.Original Life.expectancy Adult.Mortality.BoxCox infant.deaths.Log
## 1          2015            65.0              0.6550187         0.5328301
## 2          2014            59.9              0.6642686         0.5376569
## 3          2013            59.9              0.6608185         0.5423374
## 4          2012            59.5              0.6654137         0.5491024
## 5          2011            59.2              0.6688346         0.5534533
## 6          2010            58.8              0.6733626         0.5597580
##   Alcohol.Original percentage.expenditure.Log Hepatitis.B.BoxCox Measles.Log
## 1                0                  0.4278624          0.3867136   0.5494879
## 2                0                  0.4309879          0.3448086   0.4759188
## 3                0                  0.4305695          0.3725253   0.4643045
## 4                0                  0.4371892          0.4157501   0.6256389
## 5                0                  0.2040803          0.4305983   0.6323744
## 6                0                  0.4391015          0.4011219   0.5965007
##   BMI.Original under.five.deaths.Log Polio.BoxCox Total.expenditure.BoxCox
## 1    0.2097335             0.5454238    0.0000000                0.6705082
## 2    0.2039397             0.5505446    0.2604974                0.6718475
## 3    0.1981460             0.5554917    0.3156413                0.6684968
## 4    0.1923523             0.5618373    0.3897409                0.6944110
## 5    0.1877173             0.5679184    0.4052502                0.6509339
## 6    0.1819235             0.5751799    0.3744615                0.7384481
##   Diphtheria.BoxCox HIV.AIDS.BoxCox GDP.BoxCox Population.BoxCox
## 1         0.3594118               0  0.4237765         0.7465382
## 2         0.3156413               0  0.4286566         0.4071410
## 3         0.3445919               0  0.4318030         0.7414018
## 4         0.3897409               0  0.4378441         0.5727215
## 5         0.4052502               0  0.2010809         0.5569528
## 6         0.3744615               0  0.4181968         0.5545941
##   thinness..1.19.years.BoxCox thinness.5.9.years.BoxCox
## 1                   0.8591598                 0.8522667
## 2                   0.8640139                 0.8554636
## 3                   0.8672142                 0.8586322
## 4                   0.8703864                 0.8633333
## 5                   0.8750935                 0.8664336
## 6                   0.8781981                 0.8695075
##   Income.composition.of.resources.Log Schooling.Original     Country     Status
## 1                           0.3757978          0.3750000 Afghanistan Developing
## 2                           0.3711964          0.3690476 Afghanistan Developing
## 3                           0.3619653          0.3630952 Afghanistan Developing
## 4                           0.3511480          0.3571429 Afghanistan Developing
## 5                           0.3371637          0.3392857 Afghanistan Developing
## 6                           0.3277927          0.3214286 Afghanistan Developing
pca_data <- scaled_data[, !(names(scaled_data) %in% c("Country", "Status", "Life.expectancy", "Year.Original"))]

# Step 3: Perform PCA
pca_result <- prcomp(pca_data, center = TRUE, scale. = TRUE)

# Step 4: Summary of PCA (to see variance explained by principal components)
pca_summary <- summary(pca_result)
pca_summary
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.7878 1.3691 1.24961 1.08407 1.02726 0.88909 0.83200
## Proportion of Variance 0.4318 0.1041 0.08675 0.06529 0.05863 0.04392 0.03846
## Cumulative Proportion  0.4318 0.5359 0.62266 0.68795 0.74657 0.79049 0.82895
##                            PC8     PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     0.82321 0.73053 0.67439 0.63867 0.58154 0.53208 0.39885
## Proportion of Variance 0.03765 0.02965 0.02527 0.02266 0.01879 0.01573 0.00884
## Cumulative Proportion  0.86659 0.89624 0.92151 0.94417 0.96296 0.97869 0.98752
##                           PC15    PC16    PC17    PC18
## Standard deviation     0.26260 0.24465 0.22216 0.21544
## Proportion of Variance 0.00383 0.00333 0.00274 0.00258
## Cumulative Proportion  0.99135 0.99468 0.99742 1.00000
cumulative_variance <- pca_summary$importance[3, ]
num_components <- which(cumulative_variance >= 0.80)[1]

pca_scores <- as.data.frame(pca_result$x[, 1:num_components])
colnames(pca_scores) <- paste("PC", 1:num_components, sep = "")
print(head(pca_scores))
##         PC1         PC2        PC3        PC4        PC5       PC6       PC7
## 1 -4.092536  0.55521256 -1.2772015  0.4993711  0.1909291 0.8034938 1.4975367
## 2 -3.783353  0.04342838 -0.1960483  0.7462391  0.1059007 1.6860901 0.9120533
## 3 -3.887147  0.09138994 -1.2887966  0.4034574  0.3602460 0.7117525 1.5857698
## 4 -3.846082 -0.14040545 -1.1403556  0.4141983  0.2869167 1.5244913 0.9451510
## 5 -4.517501 -0.61236990 -0.6349960 -0.5793528 -0.3421912 1.2223389 0.4466460
## 6 -3.999892 -0.11577365 -0.9764242  0.3106258  0.4120894 1.6822497 1.0762155
ols_data <- cbind(Life.expectancy = scaled_data$Life.expectancy, pca_scores)

# Step 2: Fit the OLS model
ols_model <- lm(Life.expectancy ~ ., data = ols_data)

# Step 3: Summary of the OLS model
summary(ols_model)
## 
## Call:
## lm(formula = Life.expectancy ~ ., data = ols_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.4262  -2.3465   0.1761   2.6690  14.1678 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 69.23642    0.07814 886.078  < 2e-16 ***
## PC1          2.88268    0.02803 102.830  < 2e-16 ***
## PC2         -0.14684    0.05708  -2.572   0.0101 *  
## PC3         -1.34359    0.06254 -21.483  < 2e-16 ***
## PC4          0.09457    0.07209   1.312   0.1897    
## PC5         -2.12554    0.07608 -27.939  < 2e-16 ***
## PC6          0.70945    0.08790   8.071 1.01e-15 ***
## PC7         -0.01607    0.09393  -0.171   0.8642    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.235 on 2930 degrees of freedom
## Multiple R-squared:  0.8023, Adjusted R-squared:  0.8018 
## F-statistic:  1699 on 7 and 2930 DF,  p-value: < 2.2e-16
# Step 4: Optional - Diagnostics of the OLS model
par(mfrow = c(2, 2))  # Set up a 2x2 plotting area
plot(ols_model)       # Residual diagnostics plots

# Step 1: Loadings and model coefficients
pca_loadings <- pca_result$rotation[, 1:num_components]  # Retained loadings
coef <- ols_model$coefficients[-1]  # Exclude intercept

# Step 2: Influence of original features on the target
feature_influence <- pca_loadings %*% coef

# Step 3: Create a data frame for interpretation
influence_df <- data.frame(Feature = rownames(pca_loadings), Influence = feature_influence)
influence_df <- influence_df[order(abs(influence_df$Influence), decreasing = TRUE), ]

# Step 4: View results
library(ggplot2)

# Create a bar plot for influence_df
ggplot(influence_df, aes(x = reorder(Feature, Influence), y = Influence)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +  # Flip coordinates for better readability
    labs(title = "Influence of Features on Life Expectancy",
      x = "Features",
       y = "Influence") +
  theme_minimal()