# Load Required Libraries
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the life-expectancy CSV into a tibble named `data`.
# NOTE(review): the path is absolute and specific to one machine; the script
# will not run elsewhere until it is pointed at a local copy of the file.
data <- readr::read_csv(
  file = "C:/Users/anmol/OneDrive/Desktop/Anmol Project/Life Expectancy Data.csv"
)
## Rows: 2938 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Country, Status
## dbl (20): Year, Life expectancy, Adult Mortality, infant deaths, Alcohol, pe...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows of the raw data.
head(x = data, n = 6L)
## # A tibble: 6 × 22
## Country Year Status `Life expectancy` `Adult Mortality` `infant deaths`
## <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 Afghanistan 2015 Develop… 65 263 62
## 2 Afghanistan 2014 Develop… 59.9 271 64
## 3 Afghanistan 2013 Develop… 59.9 268 66
## 4 Afghanistan 2012 Develop… 59.5 272 69
## 5 Afghanistan 2011 Develop… 59.2 275 71
## 6 Afghanistan 2010 Develop… 58.8 279 74
## # ℹ 16 more variables: Alcohol <dbl>, `percentage expenditure` <dbl>,
## # `Hepatitis B` <dbl>, Measles <dbl>, BMI <dbl>, `under-five deaths` <dbl>,
## # Polio <dbl>, `Total expenditure` <dbl>, Diphtheria <dbl>, `HIV/AIDS` <dbl>,
## # GDP <dbl>, Population <dbl>, `thinness 1-19 years` <dbl>,
## # `thinness 5-9 years` <dbl>, `Income composition of resources` <dbl>,
## # Schooling <dbl>
#================
# Check Structure
#================
# Show each column's type and its first few values.
str(object = data)
## spc_tbl_ [2,938 × 22] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Country : chr [1:2938] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Year : num [1:2938] 2015 2014 2013 2012 2011 ...
## $ Status : chr [1:2938] "Developing" "Developing" "Developing" "Developing" ...
## $ Life expectancy : num [1:2938] 65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
## $ Adult Mortality : num [1:2938] 263 271 268 272 275 279 281 287 295 295 ...
## $ infant deaths : num [1:2938] 62 64 66 69 71 74 77 80 82 84 ...
## $ Alcohol : num [1:2938] 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
## $ percentage expenditure : num [1:2938] 71.3 73.5 73.2 78.2 7.1 ...
## $ Hepatitis B : num [1:2938] 65 62 64 67 68 66 63 64 63 64 ...
## $ Measles : num [1:2938] 1154 492 430 2787 3013 ...
## $ BMI : num [1:2938] 19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
## $ under-five deaths : num [1:2938] 83 86 89 93 97 102 106 110 113 116 ...
## $ Polio : num [1:2938] 6 58 62 67 68 66 63 64 63 58 ...
## $ Total expenditure : num [1:2938] 8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
## $ Diphtheria : num [1:2938] 65 62 64 67 68 66 63 64 63 58 ...
## $ HIV/AIDS : num [1:2938] 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
## $ GDP : num [1:2938] 584.3 612.7 631.7 670 63.5 ...
## $ Population : num [1:2938] 33736494 327582 31731688 3696958 2978599 ...
## $ thinness 1-19 years : num [1:2938] 17.2 17.5 17.7 17.9 18.2 18.4 18.6 18.8 19 19.2 ...
## $ thinness 5-9 years : num [1:2938] 17.3 17.5 17.7 18 18.2 18.4 18.7 18.9 19.1 19.3 ...
## $ Income composition of resources: num [1:2938] 0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
## $ Schooling : num [1:2938] 10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
## - attr(*, "spec")=
## .. cols(
## .. Country = col_character(),
## .. Year = col_double(),
## .. Status = col_character(),
## .. `Life expectancy` = col_double(),
## .. `Adult Mortality` = col_double(),
## .. `infant deaths` = col_double(),
## .. Alcohol = col_double(),
## .. `percentage expenditure` = col_double(),
## .. `Hepatitis B` = col_double(),
## .. Measles = col_double(),
## .. BMI = col_double(),
## .. `under-five deaths` = col_double(),
## .. Polio = col_double(),
## .. `Total expenditure` = col_double(),
## .. Diphtheria = col_double(),
## .. `HIV/AIDS` = col_double(),
## .. GDP = col_double(),
## .. Population = col_double(),
## .. `thinness 1-19 years` = col_double(),
## .. `thinness 5-9 years` = col_double(),
## .. `Income composition of resources` = col_double(),
## .. Schooling = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Per-column summary statistics (five-number summary, mean and NA count).
summary(object = data)
## Country Year Status Life expectancy
## Length:2938 Min. :2000 Length:2938 Min. :36.30
## Class :character 1st Qu.:2004 Class :character 1st Qu.:63.10
## Mode :character Median :2008 Mode :character Median :72.10
## Mean :2008 Mean :69.22
## 3rd Qu.:2012 3rd Qu.:75.70
## Max. :2015 Max. :89.00
## NA's :10
## Adult Mortality infant deaths Alcohol percentage expenditure
## Min. : 1.0 Min. : 0.0 Min. : 0.0100 Min. : 0.000
## 1st Qu.: 74.0 1st Qu.: 0.0 1st Qu.: 0.8775 1st Qu.: 4.685
## Median :144.0 Median : 3.0 Median : 3.7550 Median : 64.913
## Mean :164.8 Mean : 30.3 Mean : 4.6029 Mean : 738.251
## 3rd Qu.:228.0 3rd Qu.: 22.0 3rd Qu.: 7.7025 3rd Qu.: 441.534
## Max. :723.0 Max. :1800.0 Max. :17.8700 Max. :19479.912
## NA's :10 NA's :194
## Hepatitis B Measles BMI under-five deaths
## Min. : 1.00 Min. : 0.0 Min. : 1.00 Min. : 0.00
## 1st Qu.:77.00 1st Qu.: 0.0 1st Qu.:19.30 1st Qu.: 0.00
## Median :92.00 Median : 17.0 Median :43.50 Median : 4.00
## Mean :80.94 Mean : 2419.6 Mean :38.32 Mean : 42.04
## 3rd Qu.:97.00 3rd Qu.: 360.2 3rd Qu.:56.20 3rd Qu.: 28.00
## Max. :99.00 Max. :212183.0 Max. :87.30 Max. :2500.00
## NA's :553 NA's :34
## Polio Total expenditure Diphtheria HIV/AIDS
## Min. : 3.00 Min. : 0.370 Min. : 2.00 Min. : 0.100
## 1st Qu.:78.00 1st Qu.: 4.260 1st Qu.:78.00 1st Qu.: 0.100
## Median :93.00 Median : 5.755 Median :93.00 Median : 0.100
## Mean :82.55 Mean : 5.938 Mean :82.32 Mean : 1.742
## 3rd Qu.:97.00 3rd Qu.: 7.492 3rd Qu.:97.00 3rd Qu.: 0.800
## Max. :99.00 Max. :17.600 Max. :99.00 Max. :50.600
## NA's :19 NA's :226 NA's :19
## GDP Population thinness 1-19 years
## Min. : 1.68 Min. :3.400e+01 Min. : 0.10
## 1st Qu.: 463.94 1st Qu.:1.958e+05 1st Qu.: 1.60
## Median : 1766.95 Median :1.387e+06 Median : 3.30
## Mean : 7483.16 Mean :1.275e+07 Mean : 4.84
## 3rd Qu.: 5910.81 3rd Qu.:7.420e+06 3rd Qu.: 7.20
## Max. :119172.74 Max. :1.294e+09 Max. :27.70
## NA's :448 NA's :652 NA's :34
## thinness 5-9 years Income composition of resources Schooling
## Min. : 0.10 Min. :0.0000 Min. : 0.00
## 1st Qu.: 1.50 1st Qu.:0.4930 1st Qu.:10.10
## Median : 3.30 Median :0.6770 Median :12.30
## Mean : 4.87 Mean :0.6276 Mean :11.99
## 3rd Qu.: 7.20 3rd Qu.:0.7790 3rd Qu.:14.30
## Max. :28.60 Max. :0.9480 Max. :20.70
## NA's :34 NA's :167 NA's :163
#===================
# Dataset Dimensions
#===================
# Number of rows and columns in the raw data.
data %>% dim()
## [1] 2938 22
#=====================
# Find Missing Values
#=====================
data %>% is.na() %>% sum() # Total missing values
## [1] 2563
data %>% is.na() %>% colSums() # Column-wise missing values
## Country Year
## 0 0
## Status Life expectancy
## 0 10
## Adult Mortality infant deaths
## 10 0
## Alcohol percentage expenditure
## 194 0
## Hepatitis B Measles
## 553 0
## BMI under-five deaths
## 34 0
## Polio Total expenditure
## 19 226
## Diphtheria HIV/AIDS
## 19 0
## GDP Population
## 448 652
## thinness 1-19 years thinness 5-9 years
## 34 34
## Income composition of resources Schooling
## 167 163
#============================
# Clean Data (Remove NA rows)
#============================
# Drop every row that has a missing value in ANY of the 22 columns.
# NOTE(review): the raw file has 2938 rows, while the models fitted later in
# this script report 1647 residual degrees of freedom (1649 rows), so this
# step discards a large share of the data -- confirm that is intended.
data <- data %>%
  na.omit()
#================================================
# Q1: Countries with life expectancy above 75
#================================================
# Keep the country-year rows above 75, then reduce them to one row per
# country.
high_life_exp <- distinct(
  filter(data, `Life expectancy` > 75),
  Country
)
print(high_life_exp)
## # A tibble: 53 × 1
## Country
## <chr>
## 1 Albania
## 2 Algeria
## 3 Argentina
## 4 Australia
## 5 Austria
## 6 Azerbaijan
## 7 Bangladesh
## 8 Belgium
## 9 Bosnia and Herzegovina
## 10 Cabo Verde
## # ℹ 43 more rows
#===================================================
# Q2: Countries with adult mortality more than 300
#===================================================
# A country is listed if any of its remaining rows exceeds 300.
high_adult_mortality <- distinct(
  filter(data, `Adult Mortality` > 300),
  Country
)
print(high_adult_mortality)
## # A tibble: 25 × 1
## Country
## <chr>
## 1 Afghanistan
## 2 Angola
## 3 Bhutan
## 4 Botswana
## 5 Burundi
## 6 Cameroon
## 7 Central African Republic
## 8 Chad
## 9 Eritrea
## 10 Kenya
## # ℹ 15 more rows
#============================================================
# Q3: Developing countries with life expectancy less than 60
#============================================================
# Both conditions must hold on the same country-year row.
low_life_exp_dev <- data %>%
  filter(Status == "Developing" & `Life expectancy` < 60) %>%
  distinct(Country)
print(low_life_exp_dev)
## # A tibble: 37 × 1
## Country
## <chr>
## 1 Afghanistan
## 2 Angola
## 3 Benin
## 4 Botswana
## 5 Burkina Faso
## 6 Burundi
## 7 Cameroon
## 8 Central African Republic
## 9 Chad
## 10 Comoros
## # ℹ 27 more rows
#=====================================
# Level 3: Grouping & Summarization
#=====================================
#=============================================
# Q1: Average life expectancy for each country
#=============================================
# dplyr is already attached at the top of the script, so it is not loaded
# again here.
# One row per country with its mean life expectancy over the available
# years, sorted from highest to lowest.
avg_life_exp_country <- data %>%
  group_by(Country) %>%
  summarise(
    Average_Life_Expectancy = mean(`Life expectancy`, na.rm = TRUE)
  ) %>%
  arrange(desc(Average_Life_Expectancy))
# Show result
print(avg_life_exp_country)
## # A tibble: 133 × 2
## Country Average_Life_Expectancy
## <chr> <dbl>
## 1 Ireland 83.4
## 2 Canada 82.2
## 3 France 82.2
## 4 Italy 82.2
## 5 Spain 82.0
## 6 Australia 81.9
## 7 Sweden 81.9
## 8 Austria 81.5
## 9 Netherlands 81.3
## 10 Greece 81.2
## # ℹ 123 more rows
#============================================================
# Q2: Average schooling in developed vs. developing countries
#============================================================
# Mean of the Schooling column within each Status group.
avg_schooling_status <- summarise(
  group_by(data, Status),
  Average_Schooling = mean(Schooling, na.rm = TRUE)
)
# Show result
print(avg_schooling_status)
## # A tibble: 2 × 2
## Status Average_Schooling
## <chr> <dbl>
## 1 Developed 15.6
## 2 Developing 11.5
#=================================================
# Q3: Year with highest average life expectancy
#=================================================
# Average across all rows of each year, best year first.
avg_life_exp_year <- data %>%
  group_by(Year) %>%
  summarise(
    Average_Life_Expectancy = mean(`Life expectancy`, na.rm = TRUE)
  ) %>%
  arrange(desc(Average_Life_Expectancy))
# Show the top year
avg_life_exp_year %>%
  head(1) %>%
  print()
## # A tibble: 1 × 2
## Year Average_Life_Expectancy
## <dbl> <dbl>
## 1 2015 71.4
#==============================================================
# Q1: Rank countries by average life expectancy (high to low)
#==============================================================
# Same per-country mean as computed for avg_life_exp_country, kept as its own
# object for the ranking section.
country_rank_life <- arrange(
  summarise(
    group_by(data, Country),
    Average_Life_Expectancy = mean(`Life expectancy`, na.rm = TRUE)
  ),
  desc(Average_Life_Expectancy)
)
# Show top 10 ranked countries
head(country_rank_life, n = 10)
## # A tibble: 10 × 2
## Country Average_Life_Expectancy
## <chr> <dbl>
## 1 Ireland 83.4
## 2 Canada 82.2
## 3 France 82.2
## 4 Italy 82.2
## 5 Spain 82.0
## 6 Australia 81.9
## 7 Sweden 81.9
## 8 Austria 81.5
## 9 Netherlands 81.3
## 10 Greece 81.2
#======================================================
# Q2: Find top 5 countries with highest infant deaths
#======================================================
# Sum the `infant deaths` column over each country's rows and sort
# descending.
top_infant_deaths <- data %>%
  group_by(Country) %>%
  summarise(
    Total_Infant_Deaths = sum(`infant deaths`, na.rm = TRUE)
  ) %>%
  arrange(desc(Total_Infant_Deaths))
# Show top 5 countries
head(top_infant_deaths, n = 5)
## # A tibble: 5 × 2
## Country Total_Infant_Deaths
## <chr> <dbl>
## 1 India 13957
## 2 Nigeria 5237
## 3 China 4561
## 4 Pakistan 4402
## 5 Indonesia 2305
#=============================================================
# Q3: Which year had the highest total alcohol consumption?
#=============================================================
# Sum the Alcohol column over all rows of each year, largest total first.
yearly_alcohol <- arrange(
  summarise(
    group_by(data, Year),
    Total_Alcohol = sum(Alcohol, na.rm = TRUE)
  ),
  desc(Total_Alcohol)
)
# Show the top year
head(yearly_alcohol, n = 1)
## # A tibble: 1 × 2
## Year Total_Alcohol
## <dbl> <dbl>
## 1 2011 613.
# Level 5: Creating New Insights
# Bucket each row by GDP. case_when() uses the first matching condition, so
# each upper bound only needs to be checked once; a missing GDP matches none
# of the comparisons and falls through to "Unknown".
data <- data %>%
  mutate(
    Development_Status = case_when(
      GDP < 1000 ~ "Low Income",
      GDP < 4000 ~ "Lower Middle Income",
      GDP < 12000 ~ "Upper Middle Income",
      GDP >= 12000 ~ "High Income",
      TRUE ~ "Unknown"
    )
  )
# Check sample output
data %>%
  select(Country, Year, GDP, Development_Status) %>%
  head(10)
## # A tibble: 10 × 4
## Country Year GDP Development_Status
## <chr> <dbl> <dbl> <chr>
## 1 Afghanistan 2015 584. Low Income
## 2 Afghanistan 2014 613. Low Income
## 3 Afghanistan 2013 632. Low Income
## 4 Afghanistan 2012 670. Low Income
## 5 Afghanistan 2011 63.5 Low Income
## 6 Afghanistan 2010 553. Low Income
## 7 Afghanistan 2009 446. Low Income
## 8 Afghanistan 2008 373. Low Income
## 9 Afghanistan 2007 370. Low Income
## 10 Afghanistan 2006 273. Low Income
# Add a derived column: `percentage expenditure` divided by Population.
# Assignment uses `<-`, matching the rest of the script.
# NOTE(review): `percentage expenditure` is presumably a percentage rather
# than a currency amount -- verify that dividing it by Population yields the
# intended "per capita" measure.
# NOTE(review): in the sample output recorded in this file, Afghanistan's
# Population is 33736494 for 2015 but 327582 for 2014, so this ratio inherits
# that inconsistency -- confirm the Population column before interpreting it.
data <- data %>%
  mutate(Health_Spending_Per_Capita = `percentage expenditure` / Population)
# Check sample output
head(
  data %>%
    select(
      Country, Year, `percentage expenditure`, Population,
      Health_Spending_Per_Capita
    ),
  10
)
## # A tibble: 10 × 5
## Country Year `percentage expenditure` Population Health_Spending_Per_C…¹
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan 2015 71.3 33736494 0.00000211
## 2 Afghanistan 2014 73.5 327582 0.000224
## 3 Afghanistan 2013 73.2 31731688 0.00000231
## 4 Afghanistan 2012 78.2 3696958 0.0000211
## 5 Afghanistan 2011 7.10 2978599 0.00000238
## 6 Afghanistan 2010 79.7 2883167 0.0000276
## 7 Afghanistan 2009 56.8 284331 0.000200
## 8 Afghanistan 2008 25.9 2729431 0.00000948
## 9 Afghanistan 2007 10.9 26616792 0.000000410
## 10 Afghanistan 2006 17.2 2589345 0.00000663
## # ℹ abbreviated name: ¹Health_Spending_Per_Capita
#=====================================================================
# Q1. Predict Life expectancy based on GDP (Simple Linear Regression)
#=====================================================================
# The response column name contains a space, so it is written in backticks
# inside the formula.
# Simple Linear Regression: Predict Life expectancy based on GDP
model1 <- lm(formula = `Life expectancy` ~ GDP, data = data)
# Summary of the model
cat("Summary of Simple Linear Regression (Life expectancy vs GDP):\n")
## Summary of Simple Linear Regression (Life expectancy vs GDP):
summary(object = model1)
##
## Call:
## lm(formula = `Life expectancy` ~ GDP, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.084 -3.924 1.675 5.314 21.440
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.742e+01 2.161e-01 311.94 <2e-16 ***
## GDP 3.383e-04 1.695e-05 19.96 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.896 on 1647 degrees of freedom
## Multiple R-squared: 0.1948, Adjusted R-squared: 0.1943
## F-statistic: 398.4 on 1 and 1647 DF, p-value: < 2.2e-16
# Plot GDP vs Life expectancy with regression line
plot(
  x = data$GDP,
  y = data$`Life expectancy`,
  main = "Life Expectancy vs GDP",
  xlab = "GDP",
  ylab = "Life Expectancy",
  pch = 16,
  col = "blue"
)
# Overlay the fitted line from model1
abline(reg = model1, col = "red", lwd = 2)
#====================================================================================
# Q2. Predict Life expectancy based on Alcohol consumption (Simple Linear Regression)
#====================================================================================
# Simple Linear Regression: Predict Life Expectancy based on Alcohol
model_alcohol <- lm(formula = `Life expectancy` ~ Alcohol, data = data)
# Summary of the model
cat("Summary of Simple Linear Regression (Life expectancy vs Alcohol):\n")
## Summary of Simple Linear Regression (Life expectancy vs Alcohol):
summary(object = model_alcohol)
##
## Call:
## lm(formula = `Life expectancy` ~ Alcohol, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.678 -3.925 1.827 5.663 17.675
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 65.31651 0.29861 218.73 <2e-16 ***
## Alcohol 0.87925 0.04924 17.86 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.054 on 1647 degrees of freedom
## Multiple R-squared: 0.1622, Adjusted R-squared: 0.1617
## F-statistic: 318.8 on 1 and 1647 DF, p-value: < 2.2e-16
# Scatter plot: Life Expectancy vs Alcohol consumption
plot(
  x = data$Alcohol,
  y = data$`Life expectancy`,
  main = "Life Expectancy vs Alcohol Consumption",
  xlab = "Alcohol Consumption (litres per capita)",
  ylab = "Life Expectancy",
  pch = 16,
  col = "skyblue"
)
# Overlay the fitted line from model_alcohol
abline(reg = model_alcohol, col = "red", lwd = 2)
#===============================================================================================
# Q3. Predict Life expectancy based on GDP, BMI, and Adult Mortality (Multiple Linear Regression)
#===============================================================================================
# Multiple Linear Regression with three predictors entered additively
model_mlr <- lm(
  formula = `Life expectancy` ~ GDP + BMI + `Adult Mortality`,
  data = data
)
# Summary
cat("Summary of Multiple Linear Regression (Life Expectancy vs GDP, BMI, Adult Mortality):\n")
## Summary of Multiple Linear Regression (Life Expectancy vs GDP, BMI, Adult Mortality):
summary(object = model_mlr)
##
## Call:
## lm(formula = `Life expectancy` ~ GDP + BMI + `Adult Mortality`,
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.8265 -2.2861 0.4486 2.9917 20.8803
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.980e+01 4.022e-01 173.54 <2e-16 ***
## GDP 1.728e-04 1.198e-05 14.42 <2e-16 ***
## BMI 1.297e-01 7.189e-03 18.05 <2e-16 ***
## `Adult Mortality` -3.809e-02 1.130e-03 -33.72 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.293 on 1645 degrees of freedom
## Multiple R-squared: 0.6386, Adjusted R-squared: 0.6379
## F-statistic: 968.9 on 3 and 1645 DF, p-value: < 2.2e-16
# Scatter plots for each predictor vs Life Expectancy
# par() returns the previous values of the settings it changes; keep them so
# the layout can be put back exactly as it was, instead of assuming it was
# 1 x 1 before this section ran.
old_par <- par(mfrow = c(1, 3)) # 1 row, 3 columns layout
# Each panel overlays a single-predictor fit (not the multiple regression).
# Scatter plot 1: GDP vs Life Expectancy
plot(data$GDP, data$`Life expectancy`,
  col = "blue",
  pch = 16,
  main = "Life Expectancy vs GDP",
  xlab = "GDP",
  ylab = "Life Expectancy"
)
abline(lm(`Life expectancy` ~ GDP, data = data), col = "red", lwd = 2)
# Scatter plot 2: BMI vs Life Expectancy
plot(data$BMI, data$`Life expectancy`,
  col = "green",
  pch = 16,
  main = "Life Expectancy vs BMI",
  xlab = "BMI",
  ylab = "Life Expectancy"
)
abline(lm(`Life expectancy` ~ BMI, data = data), col = "red", lwd = 2)
# Scatter plot 3: Adult Mortality vs Life Expectancy
plot(data$`Adult Mortality`, data$`Life expectancy`,
  col = "purple",
  pch = 16,
  main = "Life Expectancy vs Adult Mortality",
  xlab = "Adult Mortality",
  ylab = "Life Expectancy"
)
abline(
  lm(`Life expectancy` ~ `Adult Mortality`, data = data),
  col = "red", lwd = 2
)
# Restore the plot layout that was active before this section
par(old_par)
#=======================================================================
# Q4. Predict Life Expectancy based on GDP using Polynomial Regression
#=======================================================================
# Polynomial Regression Model (Degree 2: Quadratic)
# I() makes `^` mean arithmetic squaring inside the formula.
model_poly <- lm(
  formula = `Life expectancy` ~ GDP + I(GDP^2),
  data = data
)
# Summary of the model
cat("Summary of Polynomial Regression (Life expectancy vs GDP):\n")
## Summary of Polynomial Regression (Life expectancy vs GDP):
summary(object = model_poly)
##
## Call:
## lm(formula = `Life expectancy` ~ GDP + I(GDP^2), data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.738 -4.128 1.055 5.215 22.318
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.640e+01 2.247e-01 295.55 <2e-16 ***
## GDP 6.803e-04 3.314e-05 20.53 <2e-16 ***
## I(GDP^2) -5.437e-09 4.589e-10 -11.85 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.582 on 1646 degrees of freedom
## Multiple R-squared: 0.258, Adjusted R-squared: 0.2571
## F-statistic: 286.2 on 2 and 1646 DF, p-value: < 2.2e-16
# Scatter plot of GDP vs Life Expectancy
plot(data$GDP, data$`Life expectancy`,
  col = "lightblue",
  pch = 16,
  main = "Polynomial Regression: Life Expectancy vs GDP",
  xlab = "GDP",
  ylab = "Life Expectancy"
)
# Add the polynomial regression curve: predict on an evenly spaced grid of
# 100 GDP values spanning the observed range, then connect the predictions.
# `length.out` is spelled out in full rather than relying on partial
# argument matching.
x_seq <- seq(
  min(data$GDP, na.rm = TRUE),
  max(data$GDP, na.rm = TRUE),
  length.out = 100
)
y_pred <- predict(model_poly, newdata = data.frame(GDP = x_seq))
lines(x_seq, y_pred, col = "red", lwd = 2)
#============================================================================
# Q1. Correlation between Life Expectancy and GDP per capita
#============================================================================
# Pearson correlation test between the two columns.
cor.test(
  x = data$`Life expectancy`,
  y = data$GDP,
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: data$`Life expectancy` and data$GDP
## t = 19.959, df = 1647, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4016054 0.4793813
## sample estimates:
## cor
## 0.4413218
# Scatter plot, points coloured by Status (blue = Developed, red = otherwise)
plot(
  x = data$GDP,
  y = data$`Life expectancy`,
  col = ifelse(data$Status == "Developed", "blue", "red"),
  main = "Life Expectancy vs GDP",
  xlab = "GDP per capita",
  ylab = "Life Expectancy (years)"
)
legend(
  "bottomright",
  legend = c("Developed", "Developing"),
  col = c("blue", "red"),
  pch = 1
)
#================================================================
# Q2. Correlation between Adult Mortality and Alcohol Consumption
#================================================================
# Pearson correlation test between the two columns.
cor.test(
  x = data$`Adult Mortality`,
  y = data$Alcohol,
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: data$`Adult Mortality` and data$Alcohol
## t = -7.2361, df = 1647, p-value = 7.048e-13
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2219266 -0.1283506
## sample estimates:
## cor
## -0.1755351
# Scatter plot with regression line (Adult Mortality regressed on Alcohol)
plot(
  x = data$Alcohol,
  y = data$`Adult Mortality`,
  main = "Adult Mortality vs Alcohol Consumption",
  xlab = "Alcohol Consumption (liters per capita)",
  ylab = "Adult Mortality (per 1000 population)"
)
abline(
  reg = lm(`Adult Mortality` ~ Alcohol, data = data),
  col = "red"
)
#================================================
# Q3. Correlation between BMI and Life Expectancy
#================================================
# Pearson correlation test between the two columns.
cor.test(
  x = data$BMI,
  y = data$`Life expectancy`,
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: data$BMI and data$`Life expectancy`
## t = 26.177, df = 1647, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5070364 0.5752616
## sample estimates:
## cor
## 0.5420416
# Boxplot by Status: distribution of BMI within each Status group
boxplot(
  BMI ~ Status,
  data = data,
  col = c("lightblue", "lightgreen"),
  main = "BMI Distribution by Country Status",
  xlab = "Country Status",
  ylab = "BMI"
)
#======================================================
# Q4. Correlation between Schooling and Life Expectancy
#======================================================
# Pearson correlation test between the two columns.
cor.test(
  x = data$Schooling,
  y = data$`Life expectancy`,
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: data$Schooling and data$`Life expectancy`
## t = 43.048, df = 1647, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7040885 0.7495739
## sample estimates:
## cor
## 0.72763
# Scatter plot with color by Status, plus one straight-line fit per Status
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
ggplot(
  data = data,
  mapping = aes(x = Schooling, y = `Life expectancy`, color = Status)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Life Expectancy vs Schooling",
    x = "Average Years of Schooling",
    y = "Life Expectancy (years)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# `df` is the cleaned data under a second name, used by the ggplot sections.
df <- data
#=======================================
# Q1. Scatter Plot: GDP vs Life Expectancy
#=======================================
# Clean data: keep the two plotted columns and drop any incomplete rows
df_scatter1 <- na.omit(
  select(df, GDP, `Life expectancy`)
)
# Create scatter plot
ggplot(
  data = df_scatter1,
  mapping = aes(x = GDP, y = `Life expectancy`)
) +
  geom_point(color = "steelblue", alpha = 0.6) +
  labs(
    title = "GDP vs Life Expectancy",
    x = "GDP",
    y = "Life Expectancy"
  ) +
  theme_minimal()
#======================================================
# Q2. Scatter Plot: Adult Mortality vs Life Expectancy
#======================================================
# Clean data: keep the two plotted columns and drop any incomplete rows
df_scatter2 <- na.omit(
  select(df, `Adult Mortality`, `Life expectancy`)
)
# Create scatter plot
ggplot(
  data = df_scatter2,
  mapping = aes(x = `Adult Mortality`, y = `Life expectancy`)
) +
  geom_point(color = "tomato", alpha = 0.6) +
  labs(
    title = "Adult Mortality vs Life Expectancy",
    x = "Adult Mortality",
    y = "Life Expectancy"
  ) +
  theme_minimal()
#===================================
# Boxplot: Life Expectancy by Status
#===================================
# Clean data: keep the two plotted columns and drop any incomplete rows
df_boxplot1 <- na.omit(
  select(df, `Life expectancy`, Status)
)
# Create boxplot, one box per Status with manually chosen fill colours
ggplot(
  data = df_boxplot1,
  mapping = aes(x = Status, y = `Life expectancy`, fill = Status)
) +
  geom_boxplot() +
  scale_fill_manual(values = c("#00BFC4", "#F8766D")) +
  labs(
    title = "Life Expectancy: Developed vs Developing",
    x = "Status",
    y = "Life Expectancy"
  ) +
  theme_minimal()
#=======================================================
# ANOVA: Check if Life Expectancy differs by Status
#=======================================================
# One-way ANOVA with Status as the only factor.
anova_result <- aov(formula = `Life expectancy` ~ Status, data = df)
summary(object = anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## Status 1 25005 25005 401.7 <2e-16 ***
## Residuals 1647 102525 62
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#===========================================
# Boxplot: Life Expectancy by Status
#===========================================
# NOTE(review): this repeats the earlier Life Expectancy by Status boxplot
# with a different title -- confirm both are wanted.
# Clean data: keep the two plotted columns and drop any incomplete rows
df_boxplot <- na.omit(
  select(df, `Life expectancy`, Status)
)
# Create boxplot
ggplot(
  data = df_boxplot,
  mapping = aes(x = Status, y = `Life expectancy`, fill = Status)
) +
  geom_boxplot() +
  scale_fill_manual(values = c("#00BFC4", "#F8766D")) +
  labs(
    title = "Life Expectancy by Status",
    x = "Status",
    y = "Life Expectancy"
  ) +
  theme_minimal()
# Columns for the pairs plot: ten numeric indicators plus `Status`, which is
# only used to colour the panels.
df_numeric <- df %>%
  select(
    `Life expectancy`, GDP, `Adult Mortality`, BMI,
    `percentage expenditure`, `Hepatitis B`, Measles, Polio,
    `Income composition of resources`, Schooling, Status
  )
# Create colorful ggpairs plot
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
ggpairs(
  df_numeric,
  mapping = aes(color = Status, alpha = 0.7)
) +
  theme_minimal() +
  scale_color_manual(values = c("#00BFC4", "#F8766D"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Conclusion: The “Life Expectancy” dataset contains 2,938 records with 22
# columns, covering various factors affecting life expectancy across countries.
# After cleaning the data by removing rows with missing values, the dataset is
# ready for analysis. In the raw data, some columns have missing values, such
# as “Hepatitis B” and “GDP.” The dataset includes key variables like “Life
# expectancy,” “Adult Mortality,” “Alcohol consumption,” “GDP,” and others.
# The summary statistics show a wide range in values, indicating significant
# variation across countries and years. Further analysis can uncover trends
# and relationships between life expectancy and other health, economic, and
# social factors.