1. Setting up the Environment

Import necessary libraries and load the dataset:

library(tidyverse)
library(lubridate)
library(skimr)
library(dplyr)
library(car)
# Load the raw data. read.csv() uses check.names = TRUE by default, so the CSV
# headers are converted to syntactic R names (hence Life.expectancy, HIV.AIDS, ...).
life_exp <- read.csv("data/Life Expectancy Data.csv")

2. Initial Exploration of the Dataset

Explore the structure and summary of the dataset:

str(life_exp)  # Compact overview: dimensions, plus each column's type and first values
## 'data.frame':    2938 obs. of  22 variables:
##  $ Country                        : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Year                           : int  2015 2014 2013 2012 2011 2010 2009 2008 2007 2006 ...
##  $ Status                         : chr  "Developing" "Developing" "Developing" "Developing" ...
##  $ Life.expectancy                : num  65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
##  $ Adult.Mortality                : int  263 271 268 272 275 279 281 287 295 295 ...
##  $ infant.deaths                  : int  62 64 66 69 71 74 77 80 82 84 ...
##  $ Alcohol                        : num  0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
##  $ percentage.expenditure         : num  71.3 73.5 73.2 78.2 7.1 ...
##  $ Hepatitis.B                    : int  65 62 64 67 68 66 63 64 63 64 ...
##  $ Measles                        : int  1154 492 430 2787 3013 1989 2861 1599 1141 1990 ...
##  $ BMI                            : num  19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
##  $ under.five.deaths              : int  83 86 89 93 97 102 106 110 113 116 ...
##  $ Polio                          : int  6 58 62 67 68 66 63 64 63 58 ...
##  $ Total.expenditure              : num  8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
##  $ Diphtheria                     : int  65 62 64 67 68 66 63 64 63 58 ...
##  $ HIV.AIDS                       : num  0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
##  $ GDP                            : num  584.3 612.7 631.7 670 63.5 ...
##  $ Population                     : num  33736494 327582 31731688 3696958 2978599 ...
##  $ thinness..1.19.years           : num  17.2 17.5 17.7 17.9 18.2 18.4 18.6 18.8 19 19.2 ...
##  $ thinness.5.9.years             : num  17.3 17.5 17.7 18 18.2 18.4 18.7 18.9 19.1 19.3 ...
##  $ Income.composition.of.resources: num  0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
##  $ Schooling                      : num  10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
head(life_exp)  # Preview the first rows of the raw data
summary(life_exp)  # Per-column summary statistics, including NA counts for numeric columns
##    Country               Year         Status          Life.expectancy
##  Length:2938        Min.   :2000   Length:2938        Min.   :36.30  
##  Class :character   1st Qu.:2004   Class :character   1st Qu.:63.10  
##  Mode  :character   Median :2008   Mode  :character   Median :72.10  
##                     Mean   :2008                      Mean   :69.22  
##                     3rd Qu.:2012                      3rd Qu.:75.70  
##                     Max.   :2015                      Max.   :89.00  
##                                                       NA's   :10     
##  Adult.Mortality infant.deaths       Alcohol        percentage.expenditure
##  Min.   :  1.0   Min.   :   0.0   Min.   : 0.0100   Min.   :    0.000     
##  1st Qu.: 74.0   1st Qu.:   0.0   1st Qu.: 0.8775   1st Qu.:    4.685     
##  Median :144.0   Median :   3.0   Median : 3.7550   Median :   64.913     
##  Mean   :164.8   Mean   :  30.3   Mean   : 4.6029   Mean   :  738.251     
##  3rd Qu.:228.0   3rd Qu.:  22.0   3rd Qu.: 7.7025   3rd Qu.:  441.534     
##  Max.   :723.0   Max.   :1800.0   Max.   :17.8700   Max.   :19479.912     
##  NA's   :10                       NA's   :194                             
##   Hepatitis.B       Measles              BMI        under.five.deaths
##  Min.   : 1.00   Min.   :     0.0   Min.   : 1.00   Min.   :   0.00  
##  1st Qu.:77.00   1st Qu.:     0.0   1st Qu.:19.30   1st Qu.:   0.00  
##  Median :92.00   Median :    17.0   Median :43.50   Median :   4.00  
##  Mean   :80.94   Mean   :  2419.6   Mean   :38.32   Mean   :  42.04  
##  3rd Qu.:97.00   3rd Qu.:   360.2   3rd Qu.:56.20   3rd Qu.:  28.00  
##  Max.   :99.00   Max.   :212183.0   Max.   :87.30   Max.   :2500.00  
##  NA's   :553                        NA's   :34                       
##      Polio       Total.expenditure   Diphtheria       HIV.AIDS     
##  Min.   : 3.00   Min.   : 0.370    Min.   : 2.00   Min.   : 0.100  
##  1st Qu.:78.00   1st Qu.: 4.260    1st Qu.:78.00   1st Qu.: 0.100  
##  Median :93.00   Median : 5.755    Median :93.00   Median : 0.100  
##  Mean   :82.55   Mean   : 5.938    Mean   :82.32   Mean   : 1.742  
##  3rd Qu.:97.00   3rd Qu.: 7.492    3rd Qu.:97.00   3rd Qu.: 0.800  
##  Max.   :99.00   Max.   :17.600    Max.   :99.00   Max.   :50.600  
##  NA's   :19      NA's   :226       NA's   :19                      
##       GDP              Population        thinness..1.19.years
##  Min.   :     1.68   Min.   :3.400e+01   Min.   : 0.10       
##  1st Qu.:   463.94   1st Qu.:1.958e+05   1st Qu.: 1.60       
##  Median :  1766.95   Median :1.387e+06   Median : 3.30       
##  Mean   :  7483.16   Mean   :1.275e+07   Mean   : 4.84       
##  3rd Qu.:  5910.81   3rd Qu.:7.420e+06   3rd Qu.: 7.20       
##  Max.   :119172.74   Max.   :1.294e+09   Max.   :27.70       
##  NA's   :448         NA's   :652         NA's   :34          
##  thinness.5.9.years Income.composition.of.resources   Schooling    
##  Min.   : 0.10      Min.   :0.0000                  Min.   : 0.00  
##  1st Qu.: 1.50      1st Qu.:0.4930                  1st Qu.:10.10  
##  Median : 3.30      Median :0.6770                  Median :12.30  
##  Mean   : 4.87      Mean   :0.6276                  Mean   :11.99  
##  3rd Qu.: 7.20      3rd Qu.:0.7790                  3rd Qu.:14.30  
##  Max.   :28.60      Max.   :0.9480                  Max.   :20.70  
##  NA's   :34         NA's   :167                     NA's   :163
skim(life_exp)  # skimr overview: missingness, quantiles, and an inline histogram per column
Data summary
Name life_exp
Number of rows 2938
Number of columns 22
_______________________
Column type frequency:
character 2
numeric 20
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Country 0 1 4 52 0 193 0
Status 0 1 9 10 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Year 0 1.00 2007.52 4.61 2000.00 2004.00 2008.00 2012.00 2.015000e+03 ▇▆▆▆▆
Life.expectancy 10 1.00 69.22 9.52 36.30 63.10 72.10 75.70 8.900000e+01 ▁▂▃▇▂
Adult.Mortality 10 1.00 164.80 124.29 1.00 74.00 144.00 228.00 7.230000e+02 ▇▆▂▁▁
infant.deaths 0 1.00 30.30 117.93 0.00 0.00 3.00 22.00 1.800000e+03 ▇▁▁▁▁
Alcohol 194 0.93 4.60 4.05 0.01 0.88 3.76 7.70 1.787000e+01 ▇▃▃▂▁
percentage.expenditure 0 1.00 738.25 1987.91 0.00 4.69 64.91 441.53 1.947991e+04 ▇▁▁▁▁
Hepatitis.B 553 0.81 80.94 25.07 1.00 77.00 92.00 97.00 9.900000e+01 ▁▁▁▂▇
Measles 0 1.00 2419.59 11467.27 0.00 0.00 17.00 360.25 2.121830e+05 ▇▁▁▁▁
BMI 34 0.99 38.32 20.04 1.00 19.30 43.50 56.20 8.730000e+01 ▅▅▅▇▁
under.five.deaths 0 1.00 42.04 160.45 0.00 0.00 4.00 28.00 2.500000e+03 ▇▁▁▁▁
Polio 19 0.99 82.55 23.43 3.00 78.00 93.00 97.00 9.900000e+01 ▁▁▁▂▇
Total.expenditure 226 0.92 5.94 2.50 0.37 4.26 5.76 7.49 1.760000e+01 ▃▇▃▁▁
Diphtheria 19 0.99 82.32 23.72 2.00 78.00 93.00 97.00 9.900000e+01 ▁▁▁▂▇
HIV.AIDS 0 1.00 1.74 5.08 0.10 0.10 0.10 0.80 5.060000e+01 ▇▁▁▁▁
GDP 448 0.85 7483.16 14270.17 1.68 463.94 1766.95 5910.81 1.191727e+05 ▇▁▁▁▁
Population 652 0.78 12753375.12 61012096.51 34.00 195793.25 1386542.00 7420359.00 1.293859e+09 ▇▁▁▁▁
thinness..1.19.years 34 0.99 4.84 4.42 0.10 1.60 3.30 7.20 2.770000e+01 ▇▃▁▁▁
thinness.5.9.years 34 0.99 4.87 4.51 0.10 1.50 3.30 7.20 2.860000e+01 ▇▃▁▁▁
Income.composition.of.resources 167 0.94 0.63 0.21 0.00 0.49 0.68 0.78 9.500000e-01 ▁▁▅▇▆
Schooling 163 0.94 11.99 3.36 0.00 10.10 12.30 14.30 2.070000e+01 ▁▂▇▇▁

Check for missing data:

# Count the NA cells in every column of the raw data ...
missing_data <- life_exp %>%
  is.na() %>%
  colSums()

# ... and express each count as a share (in percent) of all rows.
missing_percent <- missing_data / nrow(life_exp) * 100

# One row per variable; the names carried by missing_percent also become the
# row names of the data frame.
missing_df <- data.frame(
  variable = names(missing_percent),
  missing_percent = missing_percent
)

Visualize missing data:

# Horizontal bar chart of the missing-value share per variable, with the bars
# ordered by that share.
ggplot(
  data = missing_df,
  mapping = aes(x = reorder(variable, missing_percent), y = missing_percent)
) +
  geom_col() +  # bar heights are taken directly from missing_percent
  labs(
    title = "Percentage of Missing Values by Variable",
    x = "Variables",
    y = "Missing Percentage"
  ) +
  coord_flip() +
  theme_minimal()

3. Data Cleaning and Transformation

Convert the Year column to Date format, create new variables, and drop rows with missing life expectancy:

# Build the analysis table from the raw data:
#   * keep only rows with an observed Life.expectancy
#   * turn the integer Year into a Date pinned to 1 January of that year
#   * derive Development_Status from Status ("Developing" stays as is, any
#     other value becomes "Developed")
#   * derive GDP_per_capita as GDP divided by Population
# NOTE(review): in the str() output GDP is 584.3 for a population of roughly
# 33.7 million, which suggests GDP may already be a per-capita figure. If so,
# dividing by Population again is unintended -- confirm against the dataset
# documentation before relying on GDP_per_capita.
life_clean <- life_exp %>%
  filter(!is.na(Life.expectancy)) %>%
  mutate(
    Year = as.Date(paste(Year, "01-01", sep = "-")),
    Development_Status = ifelse(Status == "Developing", "Developing", "Developed"),
    GDP_per_capita = GDP / Population
  )

Group by Country and Development Status to calculate summary statistics:

# Per-country averages (one row per Country / Development_Status pair), sorted
# from highest to lowest mean life expectancy.
#
# Every predictor average uses na.rm = TRUE. GDP_per_capita inherits the NAs of
# GDP and Population, so without it a single missing year would turn a
# country's whole average into NA. Groups with no observed value at all yield
# NaN. Life.expectancy needs no na.rm because life_clean already excludes its
# missing rows.
#
# The result stays grouped by Country: summarise() drops only the last grouping
# level, which is what the message it prints refers to.
country_stats <- life_clean %>%
    group_by(Country, Development_Status) %>%
    summarise(
        Avg_Life_Exp = mean(Life.expectancy),
        Avg_GDP_per_capita = mean(GDP_per_capita, na.rm = TRUE),
        Avg_Schooling = mean(Schooling, na.rm = TRUE),
        Avg_BMI = mean(BMI, na.rm = TRUE))  %>%
    arrange(desc(Avg_Life_Exp))
## `summarise()` has grouped output by 'Country'. You can override using the
## `.groups` argument.

4. Data Visualization

Time Trend Plot (Life Expectancy by Development Status):

# Yearly averages per development status, used for the time-trend plot.
#
# Every predictor average uses na.rm = TRUE. GDP_per_capita inherits the NAs of
# GDP and Population, so without it one missing value in a Year / status group
# would make that group's average NA. Life.expectancy needs no na.rm because
# life_clean already excludes its missing rows.
#
# ungroup() removes the remaining Year grouping so later steps see a plain table.
yearly_trends <- life_clean %>%
    group_by(Year, Development_Status) %>%
    summarise(
        Avg_Life_Exp = mean(Life.expectancy),
        Avg_GDP_per_capita = mean(GDP_per_capita, na.rm = TRUE),
        Avg_Alcohol = mean(Alcohol, na.rm = TRUE),
        Avg_BMI = mean(BMI, na.rm = TRUE)
    ) %>%
    ungroup()
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
# Line chart of mean life expectancy per year, one coloured series per
# development status, with the legend placed underneath the panel.
time_plot <- ggplot(
  data = yearly_trends,
  mapping = aes(x = Year, y = Avg_Life_Exp, color = Development_Status)
) +
  labs(
    title = "Life Expectancy Trends Over Time",
    subtitle = "Comparing Developed vs Developing Countries",
    x = "Year",
    y = "Average Life Expectancy",
    color = "Development Status"
  ) +
  geom_line(linewidth = 1) +
  geom_point() +
  theme_minimal() +
  theme(legend.position = "bottom")  # must follow theme_minimal() to take effect
print(time_plot)

Scatter Plot (Life Expectancy vs GDP per Capita):

# Life expectancy against GDP per capita on a log10 x-axis, coloured by
# development status, with one straight-line fit per status and no
# confidence band.
scatter_plot <- ggplot(
  data = life_clean,
  mapping = aes(
    x = GDP_per_capita,
    y = Life.expectancy,
    color = Development_Status
  )
) +
  labs(
    title = "Life Expectancy vs GDP per Capita",
    x = "GDP per Capita (log scale)",
    y = "Life Expectancy",
    color = "Development Status"
  ) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_x_log10() +
  theme_minimal()
print(scatter_plot)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 666 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 666 rows containing missing values or values outside the scale range
## (`geom_point()`).

Box Plot (Life Expectancy by Development Status):

# Box plot of life expectancy for each development status, with the individual
# observations overlaid as faint jittered points.
box_plot <- ggplot(
  data = life_clean,
  mapping = aes(
    x = Development_Status,
    y = Life.expectancy,
    fill = Development_Status
  )
) +
  labs(
    title = "Life Expectancy Distribution by Development Status",
    x = "Development Status",
    y = "Life Expectancy"
  ) +
  geom_boxplot() +
  geom_jitter(alpha = 0.1) +  # drawn after the boxes so the points sit on top
  theme_minimal()
print(box_plot)

5. Statistical Analysis: t-Test

Perform a Welch two-sample t-test comparing mean life expectancy between developed and developing countries:

# Two-sided Welch t-test (unequal variances) of mean life expectancy between
# the two development-status groups. The arguments below are t.test()'s
# defaults, written out so the choice of test is explicit.
t_test_result <- t.test(
  Life.expectancy ~ Development_Status,
  data = life_clean,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
print(t_test_result)
## 
##  Welch Two Sample t-test
## 
## data:  Life.expectancy by Development_Status
## t = 47.868, df = 1807, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Developed and group Developing is not equal to 0
## 95 percent confidence interval:
##  11.59118 12.58159
## sample estimates:
##  mean in group Developed mean in group Developing 
##                 79.19785                 67.11147

6. Correlation and Regression Analysis

Correlation Matrix:

# Pairwise Pearson correlations between life expectancy and the three
# predictors, computed only on rows where all four columns are observed.
correlation_matrix <- cor(
  life_clean[c("Life.expectancy", "GDP_per_capita", "Schooling", "BMI")],
  use = "complete.obs"
)
print(correlation_matrix)
##                 Life.expectancy GDP_per_capita  Schooling         BMI
## Life.expectancy      1.00000000     0.05894331 0.76965029  0.59280888
## GDP_per_capita       0.05894331     1.00000000 0.03054593 -0.02118835
## Schooling            0.76965029     0.03054593 1.00000000  0.58803256
## BMI                  0.59280888    -0.02118835 0.58803256  1.00000000

Linear Regression Model: Predict life expectancy using GDP per capita, schooling, and BMI:

# Ordinary least-squares fit of life expectancy on GDP per capita, schooling
# and BMI. lm() drops every row with an NA in any model variable, so the fit
# uses fewer rows than life_clean (the summary reports how many were deleted).
model <- lm(Life.expectancy ~ GDP_per_capita + Schooling + BMI, data = life_clean)
summary(model)
## 
## Call:
## lm(formula = Life.expectancy ~ GDP_per_capita + Schooling + BMI, 
##     data = life_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -26.8045  -3.1638   0.6539   3.9186  27.4558 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    42.035693   0.479364  87.691  < 2e-16 ***
## GDP_per_capita  0.084972   0.025101   3.385 0.000724 ***
## Schooling       1.898314   0.047567  39.908  < 2e-16 ***
## BMI             0.107314   0.007939  13.518  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.023 on 2234 degrees of freedom
##   (690 observations deleted due to missingness)
## Multiple R-squared:  0.6243, Adjusted R-squared:  0.6238 
## F-statistic:  1238 on 3 and 2234 DF,  p-value: < 2.2e-16

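Added-variable (partial regression) plots for the fitted model: each panel shows life expectancy against one predictor after both have been adjusted for the other predictors, with the corresponding regression line.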

# Added-variable (partial regression) plots from the car package, one panel
# per predictor in the model.
avPlots(model)