In this exploratory analysis, I use the “Life Expectancy Data” data set from the Kaggle collection “World Health Statistics 2020|Complete|Geo-Analysis”. The data can be downloaded from Kaggle at https://www.kaggle.com/datasets/utkarshxy/who-worldhealth-statistics-2020-complete.
install.packages("ggplot2", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5m/4f5rvwrn5rngf6j4gpl2mc9w0000gn/T//RtmprbeLkA/downloaded_packages
library("ggplot2")
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("tidyverse")
## Warning: package 'tidyr' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
install.packages("Hmisc", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5m/4f5rvwrn5rngf6j4gpl2mc9w0000gn/T//RtmprbeLkA/downloaded_packages
library("Hmisc")
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, units
install.packages("corrplot", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5m/4f5rvwrn5rngf6j4gpl2mc9w0000gn/T//RtmprbeLkA/downloaded_packages
library("corrplot")
## corrplot 0.92 loaded
install.packages("lubridate", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5m/4f5rvwrn5rngf6j4gpl2mc9w0000gn/T//RtmprbeLkA/downloaded_packages
library("lubridate")
# Loading the data into a data frame.
# read.csv() already returns a data frame, so no data.frame() wrapper is
# needed around it.
# NOTE(review): this absolute path is machine-specific; anyone else running
# the analysis has to point it at their own copy of the CSV.
life_expectancy_data <- read.csv(
  "/Users/ursulapodosenin/Desktop/Life_Expectancy_Data.csv"
)
# Previewing the data: one line per column with its type and leading values
life_expectancy_data |> glimpse()
## Rows: 1,649
## Columns: 22
## $ Country <chr> "Afghanistan", "Afghanistan", "Afghani…
## $ Year <int> 2015, 2014, 2013, 2012, 2011, 2010, 20…
## $ Status <chr> "Developing", "Developing", "Developin…
## $ Life.expectancy <dbl> 65.0, 59.9, 59.9, 59.5, 59.2, 58.8, 58…
## $ Adult.Mortality <int> 263, 271, 268, 272, 275, 279, 281, 287…
## $ infant.deaths <int> 62, 64, 66, 69, 71, 74, 77, 80, 82, 84…
## $ Alcohol <dbl> 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.…
## $ percentage.expenditure <dbl> 71.279624, 73.523582, 73.219243, 78.18…
## $ Hepatitis.B <int> 65, 62, 64, 67, 68, 66, 63, 64, 63, 64…
## $ Measles <int> 1154, 492, 430, 2787, 3013, 1989, 2861…
## $ BMI <dbl> 19.1, 18.6, 18.1, 17.6, 17.2, 16.7, 16…
## $ under.five.deaths <int> 83, 86, 89, 93, 97, 102, 106, 110, 113…
## $ Polio <int> 6, 58, 62, 67, 68, 66, 63, 64, 63, 58,…
## $ Total.expenditure <dbl> 8.16, 8.18, 8.13, 8.52, 7.87, 9.20, 9.…
## $ Diphtheria <int> 65, 62, 64, 67, 68, 66, 63, 64, 63, 58…
## $ HIV.AIDS <dbl> 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1…
## $ GDP <dbl> 584.25921, 612.69651, 631.74498, 669.9…
## $ Population <dbl> 33736494, 327582, 31731688, 3696958, 2…
## $ thinness..1.19.years <dbl> 17.2, 17.5, 17.7, 17.9, 18.2, 18.4, 18…
## $ thinness.5.9.years <dbl> 17.3, 17.5, 17.7, 18.0, 18.2, 18.4, 18…
## $ Income.composition.of.resources <dbl> 0.479, 0.476, 0.470, 0.463, 0.454, 0.4…
## $ Schooling <dbl> 10.1, 10.0, 9.9, 9.8, 9.5, 9.2, 8.9, 8…
# Compact structure summary of the same data frame
life_expectancy_data |> str()
## 'data.frame': 1649 obs. of 22 variables:
## $ Country : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Year : int 2015 2014 2013 2012 2011 2010 2009 2008 2007 2006 ...
## $ Status : chr "Developing" "Developing" "Developing" "Developing" ...
## $ Life.expectancy : num 65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
## $ Adult.Mortality : int 263 271 268 272 275 279 281 287 295 295 ...
## $ infant.deaths : int 62 64 66 69 71 74 77 80 82 84 ...
## $ Alcohol : num 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
## $ percentage.expenditure : num 71.3 73.5 73.2 78.2 7.1 ...
## $ Hepatitis.B : int 65 62 64 67 68 66 63 64 63 64 ...
## $ Measles : int 1154 492 430 2787 3013 1989 2861 1599 1141 1990 ...
## $ BMI : num 19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
## $ under.five.deaths : int 83 86 89 93 97 102 106 110 113 116 ...
## $ Polio : int 6 58 62 67 68 66 63 64 63 58 ...
## $ Total.expenditure : num 8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
## $ Diphtheria : int 65 62 64 67 68 66 63 64 63 58 ...
## $ HIV.AIDS : num 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
## $ GDP : num 584.3 612.7 631.7 670 63.5 ...
## $ Population : num 33736494 327582 31731688 3696958 2978599 ...
## $ thinness..1.19.years : num 17.2 17.5 17.7 17.9 18.2 18.4 18.6 18.8 19 19.2 ...
## $ thinness.5.9.years : num 17.3 17.5 17.7 18 18.2 18.4 18.7 18.9 19.1 19.3 ...
## $ Income.composition.of.resources: num 0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
## $ Schooling : num 10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
# Total number of missing cells across every row and column
life_expectancy_data |>
  is.na() |>
  sum()
## [1] 0
# Grouping the data into four consecutive chunks of rows so that each plot
# only has to show a manageable number of countries.
# NOTE(review): the chunks are cut by row position, not by country, so a
# country whose rows straddle a boundary would appear in two chunks.
trimmed_data_one <- life_expectancy_data[1:400, ]
trimmed_data_two <- life_expectancy_data[401:800, ]
trimmed_data_three <- life_expectancy_data[801:1200, ]
# Everything after the first 1200 rows: a negative `n` drops that many rows
# from the front, so the chunk no longer depends on a hard-coded row count.
trimmed_data_four <- tail(life_expectancy_data, -1200)
# Graphing life expectancy by country (first chunk of rows).
# Life expectancy is on the x-axis and the country names are on the y-axis.
ggplot(data = trimmed_data_one, aes(x = Life.expectancy, y = Country)) +
  # `linewidth` sets line thickness; using `size` for lines was deprecated
  # in ggplot2 3.4.0.
  geom_line(color = "blue", linewidth = 1) +
  # Aesthetic names are case-sensitive: the axis label must be given as `y`.
  labs(x = "Life Expectancy", y = "Country") +
  # Fixed limits keep the x-axis comparable across the four chunk plots.
  xlim(c(40, 100)) +
  theme_bw()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Life expectancy by country for the second, third and fourth chunks of rows.
# Each plot uses the same layers and the same fixed x-axis limits so the
# chunks can be compared side by side. Two details matter in every call:
#   * `linewidth` (not `size`) sets line thickness since ggplot2 3.4.0;
#   * the axis label is given as lower-case `y`, because aesthetic names are
#     case-sensitive.
ggplot(data = trimmed_data_two, aes(x = Life.expectancy, y = Country)) +
  geom_line(color = "blue", linewidth = 1) +
  labs(x = "Life Expectancy", y = "Country") +
  xlim(c(40, 100)) +
  theme_bw()
ggplot(data = trimmed_data_three, aes(x = Life.expectancy, y = Country)) +
  geom_line(color = "blue", linewidth = 1) +
  labs(x = "Life Expectancy", y = "Country") +
  xlim(c(40, 100)) +
  theme_bw()
ggplot(data = trimmed_data_four, aes(x = Life.expectancy, y = Country)) +
  geom_line(color = "blue", linewidth = 1) +
  labs(x = "Life Expectancy", y = "Country") +
  xlim(c(40, 100)) +
  theme_bw()
# Deriving a new column on the original data frame: the ratio of the
# infant.deaths column to the Adult.Mortality column, row by row
life_expectancy_data[["infant_deaths_relative_to_adults"]] <- with(
  life_expectancy_data,
  infant.deaths / Adult.Mortality
)
# Plotting infant deaths relative to adults by country status.
# One point per row of the data, placed over its Status category.
ggplot(
  data = life_expectancy_data,
  aes(x = Status, y = infant_deaths_relative_to_adults)
) +
  geom_point(color = "purple", size = 2.5) +
  labs(x = "Country Status", y = "Infant Deaths Relative to Adults") +
  # A single complete theme is enough: a theme_minimal() placed before
  # theme_bw() would be replaced by it and have no visible effect.
  theme_bw()
# Keeping only the three columns used in the correlation analysis
subset_one_life_expectancy_data <- life_expectancy_data[
  c("Life.expectancy", "Alcohol", "BMI")
]
# Obtaining the p-values.
# rcorr() needs a matrix; the printed result shows the correlation matrix,
# the number of observations, and the matrix of p-values.
p_values <- subset_one_life_expectancy_data |>
  as.matrix() |>
  rcorr()
print(p_values)
## Life.expectancy Alcohol BMI
## Life.expectancy 1.00 0.40 0.54
## Alcohol 0.40 1.00 0.35
## BMI 0.54 0.35 1.00
##
## n= 1649
##
##
## P
## Life.expectancy Alcohol BMI
## Life.expectancy 0 0
## Alcohol 0 0
## BMI 0 0
# Creating a visual correlation matrix; first, a look at the leading rows
subset_one_life_expectancy_data |> head()
## Life.expectancy Alcohol BMI
## 1 65.0 0.01 19.1
## 2 59.9 0.01 18.6
## 3 59.9 0.01 18.1
## 4 59.5 0.01 17.6
## 5 59.2 0.01 17.2
## 6 58.8 0.01 16.7
# Pairwise correlations of the three columns, drawn as ellipses
M <- subset_one_life_expectancy_data |> cor()
M |> corrplot(method = "ellipse")
# Creating a numeric correlation matrix; the first six rows are shown again
head(subset_one_life_expectancy_data, n = 6L)
## Life.expectancy Alcohol BMI
## 1 65.0 0.01 19.1
## 2 59.9 0.01 18.6
## 3 59.9 0.01 18.1
## 4 59.5 0.01 17.6
## 5 59.2 0.01 17.2
## 6 58.8 0.01 16.7
# Same correlation matrix, this time displayed as numbers
M <- cor(x = subset_one_life_expectancy_data)
corrplot(corr = M, method = "number")
# New data frame holding the country name plus the health-condition columns
subset_two_life_expectancy_data <- life_expectancy_data[
  c(
    "Country",
    "Hepatitis.B",
    "Measles",
    "Polio",
    "Diphtheria",
    "HIV.AIDS"
  )
]
# Obtaining some summary statistics for some health conditions in each
# country. The result has exactly one row per country.
# NOTE(review): the "patients" wording comes from the original column names;
# confirm against the data dictionary what each column actually measures
# (HIV.AIDS, for example, holds values such as 0.1).
subset_two_life_expectancy_data |>
  group_by(Country) |>
  summarise(
    average_number_of_hepb_patients = mean(Hepatitis.B),
    # range() returns two values per group, which yields two unlabeled rows
    # per country; separate min/max columns keep one labeled row instead.
    min_number_of_measles_patients = min(Measles),
    max_number_of_measles_patients = max(Measles),
    min_number_of_polio_patients = min(Polio),
    max_number_of_polio_patients = max(Polio),
    average_number_of_diphtheria_patients = mean(Diphtheria),
    # Own column name for HIV.AIDS: reusing the diphtheria name would
    # overwrite the diphtheria mean computed on the line above.
    average_hiv_aids = mean(HIV.AIDS),
    .groups = "drop"
  )
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'Country'. You can override using the
## `.groups` argument.
## # A tibble: 266 × 5
## # Groups: Country [133]
## Country average_number_of_he…¹ range_of_measles_pat…² range_of_polio_patie…³
## <chr> <dbl> <int> <int>
## 1 Afghani… 64.6 430 5
## 2 Afghani… 64.6 8762 68
## 3 Albania 98 0 97
## 4 Albania 98 662 99
## 5 Algeria 76.5 0 86
## 6 Algeria 76.5 3289 95
## 7 Angola 71 265 63
## 8 Angola 71 11699 81
## 9 Argenti… 80.3 0 91
## 10 Argenti… 80.3 17 99
## # ℹ 256 more rows
## # ℹ abbreviated names: ¹average_number_of_hepb_patients,
## # ²range_of_measles_patients, ³range_of_polio_patients
## # ℹ 1 more variable: average_number_of_diphtheria_patients <dbl>
# Creating a data frame with the subset of data that is going to be used.
# Rows are selected by country name rather than by row position, so the
# subset stays correct even if the row order or row count of the source
# data changes.
Afghanistan_data <- life_expectancy_data[
  life_expectancy_data$Country == "Afghanistan",
]
Afghanistan_data
## Country Year Status Life.expectancy Adult.Mortality infant.deaths
## 1 Afghanistan 2015 Developing 65.0 263 62
## 2 Afghanistan 2014 Developing 59.9 271 64
## 3 Afghanistan 2013 Developing 59.9 268 66
## 4 Afghanistan 2012 Developing 59.5 272 69
## 5 Afghanistan 2011 Developing 59.2 275 71
## 6 Afghanistan 2010 Developing 58.8 279 74
## 7 Afghanistan 2009 Developing 58.6 281 77
## 8 Afghanistan 2008 Developing 58.1 287 80
## 9 Afghanistan 2007 Developing 57.5 295 82
## 10 Afghanistan 2006 Developing 57.3 295 84
## 11 Afghanistan 2005 Developing 57.3 291 85
## 12 Afghanistan 2004 Developing 57.0 293 87
## 13 Afghanistan 2003 Developing 56.7 295 87
## 14 Afghanistan 2002 Developing 56.2 3 88
## 15 Afghanistan 2001 Developing 55.3 316 88
## 16 Afghanistan 2000 Developing 54.8 321 88
## Alcohol percentage.expenditure Hepatitis.B Measles BMI under.five.deaths
## 1 0.01 71.279624 65 1154 19.1 83
## 2 0.01 73.523582 62 492 18.6 86
## 3 0.01 73.219243 64 430 18.1 89
## 4 0.01 78.184215 67 2787 17.6 93
## 5 0.01 7.097109 68 3013 17.2 97
## 6 0.01 79.679367 66 1989 16.7 102
## 7 0.01 56.762217 63 2861 16.2 106
## 8 0.03 25.873925 64 1599 15.7 110
## 9 0.02 10.910156 63 1141 15.2 113
## 10 0.03 17.171518 64 1990 14.7 116
## 11 0.02 1.388648 66 1296 14.2 118
## 12 0.02 15.296066 67 466 13.8 120
## 13 0.01 11.089053 65 798 13.4 122
## 14 0.01 16.887351 64 2486 13.0 122
## 15 0.01 10.574728 63 8762 12.6 122
## 16 0.01 10.424960 62 6532 12.2 122
## Polio Total.expenditure Diphtheria HIV.AIDS GDP Population
## 1 6 8.16 65 0.1 584.25921 33736494
## 2 58 8.18 62 0.1 612.69651 327582
## 3 62 8.13 64 0.1 631.74498 31731688
## 4 67 8.52 67 0.1 669.95900 3696958
## 5 68 7.87 68 0.1 63.53723 2978599
## 6 66 9.20 66 0.1 553.32894 2883167
## 7 63 9.42 63 0.1 445.89330 284331
## 8 64 8.33 64 0.1 373.36112 2729431
## 9 63 6.73 63 0.1 369.83580 26616792
## 10 58 7.43 58 0.1 272.56377 2589345
## 11 58 8.70 58 0.1 25.29413 257798
## 12 5 8.79 5 0.1 219.14135 24118979
## 13 41 8.82 41 0.1 198.72854 2364851
## 14 36 7.76 36 0.1 187.84595 21979923
## 15 35 7.80 33 0.1 117.49698 2966463
## 16 24 8.20 24 0.1 114.56000 293756
## thinness..1.19.years thinness.5.9.years Income.composition.of.resources
## 1 17.2 17.3 0.479
## 2 17.5 17.5 0.476
## 3 17.7 17.7 0.470
## 4 17.9 18.0 0.463
## 5 18.2 18.2 0.454
## 6 18.4 18.4 0.448
## 7 18.6 18.7 0.434
## 8 18.8 18.9 0.433
## 9 19.0 19.1 0.415
## 10 19.2 19.3 0.405
## 11 19.3 19.5 0.396
## 12 19.5 19.7 0.381
## 13 19.7 19.9 0.373
## 14 19.9 2.2 0.341
## 15 2.1 2.4 0.340
## 16 2.3 2.5 0.338
## Schooling infant_deaths_relative_to_adults
## 1 10.1 0.2357414
## 2 10.0 0.2361624
## 3 9.9 0.2462687
## 4 9.8 0.2536765
## 5 9.5 0.2581818
## 6 9.2 0.2652330
## 7 8.9 0.2740214
## 8 8.7 0.2787456
## 9 8.4 0.2779661
## 10 8.1 0.2847458
## 11 7.9 0.2920962
## 12 6.8 0.2969283
## 13 6.5 0.2949153
## 14 6.2 29.3333333
## 15 5.9 0.2784810
## 16 5.5 0.2741433
# Creating a graph of time series data: yearly Measles values for
# Afghanistan.
# The data hold one row per year, listed from the most recent year
# backwards, so the rows are first put in chronological order.
# NOTE(review): ts() assumes evenly spaced observations; this relies on the
# years being consecutive with no gaps.
afghanistan_by_year <- Afghanistan_data[order(Afghanistan_data$Year), ]
# frequency = 1 declares one observation per year. A monthly frequency would
# recycle the yearly values as if they were months.
myts <- ts(
  afghanistan_by_year$Measles,
  start = min(afghanistan_by_year$Year),
  frequency = 1
)
# The full annual series is what gets plotted; the `myts2` name is kept in
# case it is referred to elsewhere.
myts2 <- myts
# Axis limits are passed to plot() directly: ggplot2's xlim()/ylim() cannot
# be added to a base graphics plot.
plot(
  myts2,
  xlab = "Year",
  ylab = "Number of People",
  main = "Prevalence of Measles in Afghanistan",
  col.main = "darkgreen",
  ylim = c(0, 10000)
)
## NULL