# Load Required Libraries
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Location of the life-expectancy CSV on the author's machine.
# NOTE(review): absolute, user-specific path - adjust before running elsewhere.
csv_path <- "C:/Users/anmol/OneDrive/Desktop/Anmol Project/Life Expectancy Data.csv"
data <- read_csv(file = csv_path)
## Rows: 2938 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): Country, Status
## dbl (20): Year, Life expectancy, Adult Mortality, infant deaths, Alcohol, pe...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows to confirm the file was parsed as expected.
head(data, n = 6)
## # A tibble: 6 × 22
##   Country      Year Status   `Life expectancy` `Adult Mortality` `infant deaths`
##   <chr>       <dbl> <chr>                <dbl>             <dbl>           <dbl>
## 1 Afghanistan  2015 Develop…              65                 263              62
## 2 Afghanistan  2014 Develop…              59.9               271              64
## 3 Afghanistan  2013 Develop…              59.9               268              66
## 4 Afghanistan  2012 Develop…              59.5               272              69
## 5 Afghanistan  2011 Develop…              59.2               275              71
## 6 Afghanistan  2010 Develop…              58.8               279              74
## # ℹ 16 more variables: Alcohol <dbl>, `percentage expenditure` <dbl>,
## #   `Hepatitis B` <dbl>, Measles <dbl>, BMI <dbl>, `under-five deaths` <dbl>,
## #   Polio <dbl>, `Total expenditure` <dbl>, Diphtheria <dbl>, `HIV/AIDS` <dbl>,
## #   GDP <dbl>, Population <dbl>, `thinness  1-19 years` <dbl>,
## #   `thinness 5-9 years` <dbl>, `Income composition of resources` <dbl>,
## #   Schooling <dbl>

#================ # Check Structure #================

# Compact listing of every column with its type and first few values.
utils::str(data)
## spc_tbl_ [2,938 × 22] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Country                        : chr [1:2938] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Year                           : num [1:2938] 2015 2014 2013 2012 2011 ...
##  $ Status                         : chr [1:2938] "Developing" "Developing" "Developing" "Developing" ...
##  $ Life expectancy                : num [1:2938] 65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
##  $ Adult Mortality                : num [1:2938] 263 271 268 272 275 279 281 287 295 295 ...
##  $ infant deaths                  : num [1:2938] 62 64 66 69 71 74 77 80 82 84 ...
##  $ Alcohol                        : num [1:2938] 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
##  $ percentage expenditure         : num [1:2938] 71.3 73.5 73.2 78.2 7.1 ...
##  $ Hepatitis B                    : num [1:2938] 65 62 64 67 68 66 63 64 63 64 ...
##  $ Measles                        : num [1:2938] 1154 492 430 2787 3013 ...
##  $ BMI                            : num [1:2938] 19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
##  $ under-five deaths              : num [1:2938] 83 86 89 93 97 102 106 110 113 116 ...
##  $ Polio                          : num [1:2938] 6 58 62 67 68 66 63 64 63 58 ...
##  $ Total expenditure              : num [1:2938] 8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
##  $ Diphtheria                     : num [1:2938] 65 62 64 67 68 66 63 64 63 58 ...
##  $ HIV/AIDS                       : num [1:2938] 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
##  $ GDP                            : num [1:2938] 584.3 612.7 631.7 670 63.5 ...
##  $ Population                     : num [1:2938] 33736494 327582 31731688 3696958 2978599 ...
##  $ thinness  1-19 years           : num [1:2938] 17.2 17.5 17.7 17.9 18.2 18.4 18.6 18.8 19 19.2 ...
##  $ thinness 5-9 years             : num [1:2938] 17.3 17.5 17.7 18 18.2 18.4 18.7 18.9 19.1 19.3 ...
##  $ Income composition of resources: num [1:2938] 0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
##  $ Schooling                      : num [1:2938] 10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Country = col_character(),
##   ..   Year = col_double(),
##   ..   Status = col_character(),
##   ..   `Life expectancy` = col_double(),
##   ..   `Adult Mortality` = col_double(),
##   ..   `infant deaths` = col_double(),
##   ..   Alcohol = col_double(),
##   ..   `percentage expenditure` = col_double(),
##   ..   `Hepatitis B` = col_double(),
##   ..   Measles = col_double(),
##   ..   BMI = col_double(),
##   ..   `under-five deaths` = col_double(),
##   ..   Polio = col_double(),
##   ..   `Total expenditure` = col_double(),
##   ..   Diphtheria = col_double(),
##   ..   `HIV/AIDS` = col_double(),
##   ..   GDP = col_double(),
##   ..   Population = col_double(),
##   ..   `thinness  1-19 years` = col_double(),
##   ..   `thinness 5-9 years` = col_double(),
##   ..   `Income composition of resources` = col_double(),
##   ..   Schooling = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

# Summary Statistics

# Per-column quartiles, mean and NA count (length/class for character columns).
summary(object = data)
##    Country               Year         Status          Life expectancy
##  Length:2938        Min.   :2000   Length:2938        Min.   :36.30  
##  Class :character   1st Qu.:2004   Class :character   1st Qu.:63.10  
##  Mode  :character   Median :2008   Mode  :character   Median :72.10  
##                     Mean   :2008                      Mean   :69.22  
##                     3rd Qu.:2012                      3rd Qu.:75.70  
##                     Max.   :2015                      Max.   :89.00  
##                                                       NA's   :10     
##  Adult Mortality infant deaths       Alcohol        percentage expenditure
##  Min.   :  1.0   Min.   :   0.0   Min.   : 0.0100   Min.   :    0.000     
##  1st Qu.: 74.0   1st Qu.:   0.0   1st Qu.: 0.8775   1st Qu.:    4.685     
##  Median :144.0   Median :   3.0   Median : 3.7550   Median :   64.913     
##  Mean   :164.8   Mean   :  30.3   Mean   : 4.6029   Mean   :  738.251     
##  3rd Qu.:228.0   3rd Qu.:  22.0   3rd Qu.: 7.7025   3rd Qu.:  441.534     
##  Max.   :723.0   Max.   :1800.0   Max.   :17.8700   Max.   :19479.912     
##  NA's   :10                       NA's   :194                             
##   Hepatitis B       Measles              BMI        under-five deaths
##  Min.   : 1.00   Min.   :     0.0   Min.   : 1.00   Min.   :   0.00  
##  1st Qu.:77.00   1st Qu.:     0.0   1st Qu.:19.30   1st Qu.:   0.00  
##  Median :92.00   Median :    17.0   Median :43.50   Median :   4.00  
##  Mean   :80.94   Mean   :  2419.6   Mean   :38.32   Mean   :  42.04  
##  3rd Qu.:97.00   3rd Qu.:   360.2   3rd Qu.:56.20   3rd Qu.:  28.00  
##  Max.   :99.00   Max.   :212183.0   Max.   :87.30   Max.   :2500.00  
##  NA's   :553                        NA's   :34                       
##      Polio       Total expenditure   Diphtheria       HIV/AIDS     
##  Min.   : 3.00   Min.   : 0.370    Min.   : 2.00   Min.   : 0.100  
##  1st Qu.:78.00   1st Qu.: 4.260    1st Qu.:78.00   1st Qu.: 0.100  
##  Median :93.00   Median : 5.755    Median :93.00   Median : 0.100  
##  Mean   :82.55   Mean   : 5.938    Mean   :82.32   Mean   : 1.742  
##  3rd Qu.:97.00   3rd Qu.: 7.492    3rd Qu.:97.00   3rd Qu.: 0.800  
##  Max.   :99.00   Max.   :17.600    Max.   :99.00   Max.   :50.600  
##  NA's   :19      NA's   :226       NA's   :19                      
##       GDP              Population        thinness  1-19 years
##  Min.   :     1.68   Min.   :3.400e+01   Min.   : 0.10       
##  1st Qu.:   463.94   1st Qu.:1.958e+05   1st Qu.: 1.60       
##  Median :  1766.95   Median :1.387e+06   Median : 3.30       
##  Mean   :  7483.16   Mean   :1.275e+07   Mean   : 4.84       
##  3rd Qu.:  5910.81   3rd Qu.:7.420e+06   3rd Qu.: 7.20       
##  Max.   :119172.74   Max.   :1.294e+09   Max.   :27.70       
##  NA's   :448         NA's   :652         NA's   :34          
##  thinness 5-9 years Income composition of resources   Schooling    
##  Min.   : 0.10      Min.   :0.0000                  Min.   : 0.00  
##  1st Qu.: 1.50      1st Qu.:0.4930                  1st Qu.:10.10  
##  Median : 3.30      Median :0.6770                  Median :12.30  
##  Mean   : 4.87      Mean   :0.6276                  Mean   :11.99  
##  3rd Qu.: 7.20      3rd Qu.:0.7790                  3rd Qu.:14.30  
##  Max.   :28.60      Max.   :0.9480                  Max.   :20.70  
##  NA's   :34         NA's   :167                     NA's   :163

#=================== # Dataset Dimensions #===================

# Number of rows followed by number of columns.
c(nrow(data), ncol(data))
## [1] 2938   22

#===================== # Find Missing Values #=====================

# Count NAs per column once, then derive the overall total from those counts.
na_per_column <- colSums(is.na(data))
sum(na_per_column)  # total number of missing cells
## [1] 2563
na_per_column  # missing cells in each column
##                         Country                            Year 
##                               0                               0 
##                          Status                 Life expectancy 
##                               0                              10 
##                 Adult Mortality                   infant deaths 
##                              10                               0 
##                         Alcohol          percentage expenditure 
##                             194                               0 
##                     Hepatitis B                         Measles 
##                             553                               0 
##                             BMI               under-five deaths 
##                              34                               0 
##                           Polio               Total expenditure 
##                              19                             226 
##                      Diphtheria                        HIV/AIDS 
##                              19                               0 
##                             GDP                      Population 
##                             448                             652 
##            thinness  1-19 years              thinness 5-9 years 
##                              34                              34 
## Income composition of resources                       Schooling 
##                             167                             163

#============================ # Clean Data (Remove NA rows) #============================

# Listwise deletion: drop every row that has an NA in ANY of the 22 columns.
# This overwrites `data`, so all later sections work on the reduced table
# (the regressions below report 1647 residual df with one predictor, i.e.
# 1,649 of the original 2,938 rows remain).
# NOTE(review): rows are discarded even when the missing value is in a column
# a given question never uses (e.g. Population, Hepatitis B) - confirm that
# this is intended for the filtering/ranking questions.
data <- na.omit(data)

#================================================ # Q1: Countries with life expectancy above 75 #================================================

# Countries with a life expectancy above 75 in at least one year; distinct()
# collapses the matching country-year rows to one row per country.
high_life_exp <- distinct(
  filter(data, `Life expectancy` > 75),
  Country
)
print(high_life_exp)
## # A tibble: 53 × 1
##    Country               
##    <chr>                 
##  1 Albania               
##  2 Algeria               
##  3 Argentina             
##  4 Australia             
##  5 Austria               
##  6 Azerbaijan            
##  7 Bangladesh            
##  8 Belgium               
##  9 Bosnia and Herzegovina
## 10 Cabo Verde            
## # ℹ 43 more rows

#=================================================== # Q2: Countries with adult mortality more than 300 #==================================================

# Countries whose adult mortality exceeded 300 in at least one year,
# listed once each.
high_adult_mortality <- distinct(
  filter(data, `Adult Mortality` > 300),
  Country
)
print(high_adult_mortality)
## # A tibble: 25 × 1
##    Country                 
##    <chr>                   
##  1 Afghanistan             
##  2 Angola                  
##  3 Bhutan                  
##  4 Botswana                
##  5 Burundi                 
##  6 Cameroon                
##  7 Central African Republic
##  8 Chad                    
##  9 Eritrea                 
## 10 Kenya                   
## # ℹ 15 more rows

#============================================================ # Q3: Developing countries with life expectancy less than 60 #=============================================================

# Developing countries that fell below a life expectancy of 60 in at least
# one year; both conditions must hold on the same row.
low_life_exp_dev <- distinct(
  filter(data, Status == "Developing" & `Life expectancy` < 60),
  Country
)
print(low_life_exp_dev)
## # A tibble: 37 × 1
##    Country                 
##    <chr>                   
##  1 Afghanistan             
##  2 Angola                  
##  3 Benin                   
##  4 Botswana                
##  5 Burkina Faso            
##  6 Burundi                 
##  7 Cameroon                
##  8 Central African Republic
##  9 Chad                    
## 10 Comoros                 
## # ℹ 27 more rows

#===================================== # Level 3: Grouping & Summarization #=====================================

#============================================= # Q1: Average life expectancy for each country #=============================================

library(dplyr)
# Mean life expectancy per country over all of its rows, sorted from the
# highest average to the lowest.
avg_life_exp_country <- data %>%
  group_by(Country) %>%
  summarise(
    Average_Life_Expectancy = mean(`Life expectancy`, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(Average_Life_Expectancy))

# Display the per-country averages.
print(avg_life_exp_country)
## # A tibble: 133 × 2
##    Country     Average_Life_Expectancy
##    <chr>                         <dbl>
##  1 Ireland                        83.4
##  2 Canada                         82.2
##  3 France                         82.2
##  4 Italy                          82.2
##  5 Spain                          82.0
##  6 Australia                      81.9
##  7 Sweden                         81.9
##  8 Austria                        81.5
##  9 Netherlands                    81.3
## 10 Greece                         81.2
## # ℹ 123 more rows

#============================================================ # Q2: Average schooling in developed vs. developing countries #============================================================

# Mean years of schooling within each Status group.
avg_schooling_status <- data %>%
  group_by(Status) %>%
  summarise(
    Average_Schooling = mean(Schooling, na.rm = TRUE),
    .groups = "drop"
  )

# Display one row per status.
print(avg_schooling_status)
## # A tibble: 2 × 2
##   Status     Average_Schooling
##   <chr>                  <dbl>
## 1 Developed               15.6
## 2 Developing              11.5

#================================================= # Q3: Year with highest average life expectancy #================================================

# Mean life expectancy per calendar year, best year first.
avg_life_exp_year <- data %>%
  group_by(Year) %>%
  summarise(
    Average_Life_Expectancy = mean(`Life expectancy`, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(Average_Life_Expectancy))

# The first row after sorting is the year with the highest average.
print(head(avg_life_exp_year, n = 1))
## # A tibble: 1 × 2
##    Year Average_Life_Expectancy
##   <dbl>                   <dbl>
## 1  2015                    71.4

# Level 4: Ranking & Comparison

#============================================================== # Q1: Rank countries by average life expectancy (high to low) #=============================================================

# Rank countries by their mean life expectancy (highest first).
country_rank_life <- data %>%
  group_by(Country) %>%
  summarise(
    Average_Life_Expectancy = mean(`Life expectancy`, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(Average_Life_Expectancy))

# Ten highest-ranked countries.
head(country_rank_life, n = 10)
## # A tibble: 10 × 2
##    Country     Average_Life_Expectancy
##    <chr>                         <dbl>
##  1 Ireland                        83.4
##  2 Canada                         82.2
##  3 France                         82.2
##  4 Italy                          82.2
##  5 Spain                          82.0
##  6 Australia                      81.9
##  7 Sweden                         81.9
##  8 Austria                        81.5
##  9 Netherlands                    81.3
## 10 Greece                         81.2

#====================================================== # Q2: Find top 5 countries with highest infant deaths #=====================================================

# Add up the `infant deaths` column over all rows of each country, then sort
# so the largest totals come first.
top_infant_deaths <- data %>%
  group_by(Country) %>%
  summarise(
    Total_Infant_Deaths = sum(`infant deaths`, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(Total_Infant_Deaths))

# Five countries with the largest totals.
head(top_infant_deaths, n = 5)
## # A tibble: 5 × 2
##   Country   Total_Infant_Deaths
##   <chr>                   <dbl>
## 1 India                   13957
## 2 Nigeria                  5237
## 3 China                    4561
## 4 Pakistan                 4402
## 5 Indonesia                2305

#============================================================= # Q3: Which year had the highest total alcohol consumption? #===========================================================

# Sum the Alcohol column across all countries within each year and sort the
# years from the largest total to the smallest.
yearly_alcohol <- data %>%
  group_by(Year) %>%
  summarise(
    Total_Alcohol = sum(Alcohol, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(Total_Alcohol))

# Year with the largest total.
head(yearly_alcohol, n = 1)
## # A tibble: 1 × 2
##    Year Total_Alcohol
##   <dbl>         <dbl>
## 1  2011          613.

#Level 5: Creating New Insights

# Bucket each row into an income band based on GDP. case_when() evaluates the
# conditions top to bottom and uses the first match, so each band only needs
# its upper bound; rows where GDP is NA match nothing and become "Unknown".
data <- data %>%
  mutate(
    Development_Status = case_when(
      GDP < 1000 ~ "Low Income",
      GDP < 4000 ~ "Lower Middle Income",
      GDP < 12000 ~ "Upper Middle Income",
      GDP >= 12000 ~ "High Income",
      TRUE ~ "Unknown"
    )
  )
# Spot-check the new column on the first ten rows.
head(select(data, Country, Year, GDP, Development_Status), n = 10)
## # A tibble: 10 × 4
##    Country      Year   GDP Development_Status
##    <chr>       <dbl> <dbl> <chr>             
##  1 Afghanistan  2015 584.  Low Income        
##  2 Afghanistan  2014 613.  Low Income        
##  3 Afghanistan  2013 632.  Low Income        
##  4 Afghanistan  2012 670.  Low Income        
##  5 Afghanistan  2011  63.5 Low Income        
##  6 Afghanistan  2010 553.  Low Income        
##  7 Afghanistan  2009 446.  Low Income        
##  8 Afghanistan  2008 373.  Low Income        
##  9 Afghanistan  2007 370.  Low Income        
## 10 Afghanistan  2006 273.  Low Income
# Derive a per-person spending figure by dividing `percentage expenditure` by
# Population. Rows with a non-positive population get NA instead of Inf/NaN.
# NOTE(review): confirm that `percentage expenditure` is an absolute amount;
# if it is already a per-capita or percentage figure, this ratio is not a
# meaningful "per capita" value.
data <- data %>%
  mutate(
    Health_Spending_Per_Capita = if_else(
      Population > 0,
      `percentage expenditure` / Population,
      NA_real_
    )
  )
# Spot-check the inputs and the derived column on the first ten rows.
head(
  select(
    data, Country, Year, `percentage expenditure`, Population,
    Health_Spending_Per_Capita
  ),
  n = 10
)
## # A tibble: 10 × 5
##    Country      Year `percentage expenditure` Population Health_Spending_Per_C…¹
##    <chr>       <dbl>                    <dbl>      <dbl>                   <dbl>
##  1 Afghanistan  2015                    71.3    33736494             0.00000211 
##  2 Afghanistan  2014                    73.5      327582             0.000224   
##  3 Afghanistan  2013                    73.2    31731688             0.00000231 
##  4 Afghanistan  2012                    78.2     3696958             0.0000211  
##  5 Afghanistan  2011                     7.10    2978599             0.00000238 
##  6 Afghanistan  2010                    79.7     2883167             0.0000276  
##  7 Afghanistan  2009                    56.8      284331             0.000200   
##  8 Afghanistan  2008                    25.9     2729431             0.00000948 
##  9 Afghanistan  2007                    10.9    26616792             0.000000410
## 10 Afghanistan  2006                    17.2     2589345             0.00000663 
## # ℹ abbreviated name: ¹​Health_Spending_Per_Capita

#===================================================================== # Q1. Predict Life expectancy based on GDP (Simple Linear Regression) #====================================================================

# Column names that contain spaces (such as `Life expectancy`) have to be
# wrapped in backticks when used inside a formula.

# Simple linear regression of life expectancy on GDP.
model1 <- lm(formula = `Life expectancy` ~ GDP, data = data)

# Print a heading, then the coefficient table and fit statistics.
cat("Summary of Simple Linear Regression (Life expectancy vs GDP):\n")
## Summary of Simple Linear Regression (Life expectancy vs GDP):
summary(model1)
## 
## Call:
## lm(formula = `Life expectancy` ~ GDP, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -26.084  -3.924   1.675   5.314  21.440 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6.742e+01  2.161e-01  311.94   <2e-16 ***
## GDP         3.383e-04  1.695e-05   19.96   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.896 on 1647 degrees of freedom
## Multiple R-squared:  0.1948, Adjusted R-squared:  0.1943 
## F-statistic: 398.4 on 1 and 1647 DF,  p-value: < 2.2e-16
# Base-graphics scatter of life expectancy (y) against GDP (x).
plot(
  x = data$GDP,
  y = data$`Life expectancy`,
  main = "Life Expectancy vs GDP",
  xlab = "GDP",
  ylab = "Life Expectancy",
  pch = 16,
  col = "blue"
)

# Overlay the straight line fitted in model1.
abline(model1, lwd = 2, col = "red")

#==================================================================================== #Q2.Predict Life expectancy based on Alcohol consumption (Simple Linear Regression) #====================================================================================

# Simple linear regression of life expectancy on alcohol consumption.
model_alcohol <- lm(formula = `Life expectancy` ~ Alcohol, data = data)

# Print a heading, then the coefficient table and fit statistics.
cat("Summary of Simple Linear Regression (Life expectancy vs Alcohol):\n")
## Summary of Simple Linear Regression (Life expectancy vs Alcohol):
summary(model_alcohol)
## 
## Call:
## lm(formula = `Life expectancy` ~ Alcohol, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.678  -3.925   1.827   5.663  17.675 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 65.31651    0.29861  218.73   <2e-16 ***
## Alcohol      0.87925    0.04924   17.86   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.054 on 1647 degrees of freedom
## Multiple R-squared:  0.1622, Adjusted R-squared:  0.1617 
## F-statistic: 318.8 on 1 and 1647 DF,  p-value: < 2.2e-16
# Base-graphics scatter of life expectancy (y) against alcohol consumption (x).
plot(
  x = data$Alcohol,
  y = data$`Life expectancy`,
  main = "Life Expectancy vs Alcohol Consumption",
  xlab = "Alcohol Consumption (litres per capita)",
  ylab = "Life Expectancy",
  pch = 16,
  col = "skyblue"
)

# Overlay the straight line fitted in model_alcohol.
abline(model_alcohol, lwd = 2, col = "red")

#=============================================================================================== #Q3.Predict Life expectancy based on GDP, BMI, and Adult Mortality (Multiple Linear Regression) #================================================================================================

# Multiple linear regression: life expectancy explained jointly by GDP, BMI
# and adult mortality.
model_mlr <- lm(
  formula = `Life expectancy` ~ GDP + BMI + `Adult Mortality`,
  data = data
)

# Print a heading, then the coefficient table and fit statistics.
cat("Summary of Multiple Linear Regression (Life Expectancy vs GDP, BMI, Adult Mortality):\n")
## Summary of Multiple Linear Regression (Life Expectancy vs GDP, BMI, Adult Mortality):
summary(model_mlr)
## 
## Call:
## lm(formula = `Life expectancy` ~ GDP + BMI + `Adult Mortality`, 
##     data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -27.8265  -2.2861   0.4486   2.9917  20.8803 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        6.980e+01  4.022e-01  173.54   <2e-16 ***
## GDP                1.728e-04  1.198e-05   14.42   <2e-16 ***
## BMI                1.297e-01  7.189e-03   18.05   <2e-16 ***
## `Adult Mortality` -3.809e-02  1.130e-03  -33.72   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.293 on 1645 degrees of freedom
## Multiple R-squared:  0.6386, Adjusted R-squared:  0.6379 
## F-statistic: 968.9 on 3 and 1645 DF,  p-value: < 2.2e-16
# One scatter plot per predictor of the multiple regression, side by side.
# par() returns the previous values of the settings it changes; keep them so
# the caller's layout can be restored exactly instead of assuming c(1, 1).
old_par <- par(mfrow = c(1, 3))  # 1 row, 3 columns layout

# Panel 1: GDP vs life expectancy, with its own single-predictor fit line.
plot(data$GDP, data$`Life expectancy`,
     col = "blue",
     pch = 16,
     main = "Life Expectancy vs GDP",
     xlab = "GDP",
     ylab = "Life Expectancy")
abline(lm(`Life expectancy` ~ GDP, data = data), col = "red", lwd = 2)

# Panel 2: BMI vs life expectancy, with its own single-predictor fit line.
plot(data$BMI, data$`Life expectancy`,
     col = "green",
     pch = 16,
     main = "Life Expectancy vs BMI",
     xlab = "BMI",
     ylab = "Life Expectancy")
abline(lm(`Life expectancy` ~ BMI, data = data), col = "red", lwd = 2)

# Panel 3: adult mortality vs life expectancy, with its own fit line.
plot(data$`Adult Mortality`, data$`Life expectancy`,
     col = "purple",
     pch = 16,
     main = "Life Expectancy vs Adult Mortality",
     xlab = "Adult Mortality",
     ylab = "Life Expectancy")
abline(lm(`Life expectancy` ~ `Adult Mortality`, data = data), col = "red", lwd = 2)

# Restore whatever plot layout was active before this section.
par(old_par)

#======================================================================= #Q4. predict Life Expectancy based on GDP using Polynomial Regression? #=======================================================================

# Quadratic (degree-2) regression of life expectancy on GDP; I() makes ^ mean
# arithmetic squaring rather than formula crossing.
model_poly <- lm(formula = `Life expectancy` ~ GDP + I(GDP^2), data = data)

# Print a heading, then the coefficient table and fit statistics.
cat("Summary of Polynomial Regression (Life expectancy vs GDP):\n")
## Summary of Polynomial Regression (Life expectancy vs GDP):
summary(model_poly)
## 
## Call:
## lm(formula = `Life expectancy` ~ GDP + I(GDP^2), data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -22.738  -4.128   1.055   5.215  22.318 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.640e+01  2.247e-01  295.55   <2e-16 ***
## GDP          6.803e-04  3.314e-05   20.53   <2e-16 ***
## I(GDP^2)    -5.437e-09  4.589e-10  -11.85   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.582 on 1646 degrees of freedom
## Multiple R-squared:  0.258,  Adjusted R-squared:  0.2571 
## F-statistic: 286.2 on 2 and 1646 DF,  p-value: < 2.2e-16
# Scatter of life expectancy (y) against GDP (x) as the backdrop for the
# fitted quadratic curve.
plot(data$GDP, data$`Life expectancy`,
     col = "lightblue",
     pch = 16,
     main = "Polynomial Regression: Life Expectancy vs GDP",
     xlab = "GDP",
     ylab = "Life Expectancy")

# Evaluate model_poly on 100 evenly spaced GDP values spanning the observed
# range and draw the predictions as a curve. `length.out` is spelled out in
# full rather than relying on partial matching of `length`.
x_seq <- seq(
  from = min(data$GDP, na.rm = TRUE),
  to = max(data$GDP, na.rm = TRUE),
  length.out = 100
)
y_pred <- predict(model_poly, newdata = data.frame(GDP = x_seq))
lines(x_seq, y_pred, col = "red", lwd = 2)

#============================================================================ # Q1. Correlation between Life Expectancy and GDP per capita #============================================================================

# Pearson correlation test between life expectancy and GDP; prints the
# estimate, its 95% confidence interval and the p-value.
cor.test(data$`Life expectancy`, data$GDP, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  data$`Life expectancy` and data$GDP
## t = 19.959, df = 1647, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4016054 0.4793813
## sample estimates:
##       cor 
## 0.4413218
# Scatter of life expectancy against GDP, coloured by Status: blue for
# "Developed" rows, red for every other row.
plot(
  x = data$GDP,
  y = data$`Life expectancy`,
  col = ifelse(data$Status == "Developed", "blue", "red"),
  main = "Life Expectancy vs GDP",
  xlab = "GDP per capita",
  ylab = "Life Expectancy (years)"
)
# Colour key; pch = 1 matches the open circles plot() draws by default.
legend(
  "bottomright",
  legend = c("Developed", "Developing"),
  pch = 1,
  col = c("blue", "red")
)

#================================================================ # Q2. Correlation between Adult Mortality and Alcohol Consumption #================================================================

# Pearson correlation test between adult mortality and alcohol consumption;
# prints the estimate, its 95% confidence interval and the p-value.
cor.test(data$`Adult Mortality`, data$Alcohol, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  data$`Adult Mortality` and data$Alcohol
## t = -7.2361, df = 1647, p-value = 7.048e-13
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2219266 -0.1283506
## sample estimates:
##        cor 
## -0.1755351
# Scatter of adult mortality (y) against alcohol consumption (x).
plot(
  x = data$Alcohol,
  y = data$`Adult Mortality`,
  xlab = "Alcohol Consumption (liters per capita)",
  ylab = "Adult Mortality (per 1000 population)",
  main = "Adult Mortality vs Alcohol Consumption"
)
# Least-squares line of adult mortality on alcohol, fitted inline.
abline(lm(formula = `Adult Mortality` ~ Alcohol, data = data), col = "red")

#================================================ # Q3. Correlation between BMI and Life Expectancy #===============================================

# Pearson correlation test between BMI and life expectancy; prints the
# estimate, its 95% confidence interval and the p-value.
cor.test(data$BMI, data$`Life expectancy`, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  data$BMI and data$`Life expectancy`
## t = 26.177, df = 1647, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5070364 0.5752616
## sample estimates:
##       cor 
## 0.5420416
# Side-by-side boxplots of BMI for each Status group.
# NOTE(review): this shows BMI by Status, not BMI against life expectancy as
# the section heading asks - confirm this is the intended figure.
boxplot(
  BMI ~ Status,
  data = data,
  col = c("lightblue", "lightgreen"),
  main = "BMI Distribution by Country Status",
  xlab = "Country Status",
  ylab = "BMI"
)

#====================================================== # Q4. Correlation between Schooling and Life Expectancy #======================================================

# Pearson correlation test between schooling and life expectancy; prints the
# estimate, its 95% confidence interval and the p-value.
cor.test(data$Schooling, data$`Life expectancy`, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  data$Schooling and data$`Life expectancy`
## t = 43.048, df = 1647, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7040885 0.7495739
## sample estimates:
##     cor 
## 0.72763
# Scatter plot with color by Status
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
# Points coloured by Status, with a separate straight-line fit (no confidence
# band) drawn for each Status group.
ggplot(
  data = data,
  mapping = aes(x = Schooling, y = `Life expectancy`, colour = Status)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  theme_minimal() +
  labs(
    title = "Life Expectancy vs Schooling",
    x = "Average Years of Schooling",
    y = "Life Expectancy (years)"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Working copy of the cleaned table used by the ggplot2 sections below.
df <- data

#======================================= # Q1. Scatter Plot: GDP vs Life Expectancy #=======================================

# Keep only the two plotted columns and drop any row with a missing value.
df_scatter1 <- na.omit(select(df, GDP, `Life expectancy`))
# Semi-transparent points so dense regions remain readable.
ggplot(data = df_scatter1, mapping = aes(x = GDP, y = `Life expectancy`)) +
  geom_point(alpha = 0.6, colour = "steelblue") +
  theme_minimal() +
  labs(
    title = "GDP vs Life Expectancy",
    x = "GDP",
    y = "Life Expectancy"
  )

#====================================================== # Q2. Scatter Plot: Adult Mortality vs Life Expectancy #======================================================

# Keep only the two plotted columns and drop any row with a missing value.
df_scatter2 <- na.omit(select(df, `Adult Mortality`, `Life expectancy`))
# Semi-transparent points so dense regions remain readable.
ggplot(
  data = df_scatter2,
  mapping = aes(x = `Adult Mortality`, y = `Life expectancy`)
) +
  geom_point(alpha = 0.6, colour = "tomato") +
  theme_minimal() +
  labs(
    title = "Adult Mortality vs Life Expectancy",
    x = "Adult Mortality",
    y = "Life Expectancy"
  )

#=================================== # Boxplot: Life Expectancy by Status #===================================

# Keep only the response and the grouping column, dropping incomplete rows.
df_boxplot1 <- na.omit(select(df, `Life expectancy`, Status))

# One box per Status value; fill colours are assigned to the two statuses in
# alphabetical order (Developed, then Developing).
ggplot(
  data = df_boxplot1,
  mapping = aes(x = Status, y = `Life expectancy`, fill = Status)
) +
  geom_boxplot() +
  labs(
    title = "Life Expectancy: Developed vs Developing",
    x = "Status",
    y = "Life Expectancy"
  ) +
  theme_minimal() +
  scale_fill_manual(values = c(Developed = "#00BFC4", Developing = "#F8766D"))

#======================================================= # ANOVA: Check if Life Expectancy differs by Status #=======================================================

# One-way ANOVA of life expectancy with Status as the only factor; the
# summary prints the F statistic and its p-value.
anova_result <- aov(formula = `Life expectancy` ~ Status, data = df)
summary(anova_result)
##               Df Sum Sq Mean Sq F value Pr(>F)    
## Status         1  25005   25005   401.7 <2e-16 ***
## Residuals   1647 102525      62                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#=========================================== # Boxplot: Life Expectancy by Status #===========================================

# Keep only the response and the grouping column, dropping incomplete rows.
df_boxplot <- na.omit(select(df, `Life expectancy`, Status))

# One box per Status value; fill colours are assigned to the two statuses in
# alphabetical order (Developed, then Developing).
ggplot(
  data = df_boxplot,
  mapping = aes(x = Status, y = `Life expectancy`, fill = Status)
) +
  geom_boxplot() +
  labs(
    title = "Life Expectancy by Status",
    x = "Status",
    y = "Life Expectancy"
  ) +
  theme_minimal() +
  scale_fill_manual(values = c(Developed = "#00BFC4", Developing = "#F8766D"))

# Subset of ten numeric variables plus Status (kept for colouring) that feeds
# the pairwise plot matrix.
df_numeric <- select(
  df,
  `Life expectancy`, GDP, `Adult Mortality`, BMI, `percentage expenditure`,
  `Hepatitis B`, Measles, Polio, `Income composition of resources`,
  Schooling, Status
)

# Create colorful ggpairs plot
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Pairwise plot matrix of df_numeric, coloured by Status.
# NOTE(review): alpha is given inside aes(), so it is treated as a mapped
# aesthetic rather than a fixed 0.7 opacity - confirm this is intended.
ggpairs(
  data = df_numeric,
  mapping = aes(colour = Status, alpha = 0.7)
) +
  theme_minimal() +
  scale_colour_manual(values = c("#00BFC4", "#F8766D"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Conclusion: The “Life Expectancy” dataset contains 2,938 records with 22 columns, covering various factors affecting life expectancy across countries. After cleaning the data by removing rows with missing values, the dataset is ready for analysis. Some columns have missing values, such as “Hepatitis B” and “GDP.” The dataset includes key variables like “Life expectancy,” “Adult Mortality,” “Alcohol consumption,” “GDP,” and others. The summary statistics show a wide range in values, indicating significant variation across countries and years. Further analysis can uncover trends and relationships between life expectancy and other health, economic, and social factors.