This exploratory analysis uses the “Life Expectancy Data” data set from the Kaggle collection “World Health Statistics 2020|Complete|Geo-Analysis.” The data is available on Kaggle at https://www.kaggle.com/datasets/utkarshxy/who-worldhealth-statistics-2020-complete.

install.packages("ggplot2", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/5m/4f5rvwrn5rngf6j4gpl2mc9w0000gn/T//RtmprbeLkA/downloaded_packages
library("ggplot2")
library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("tidyverse")
## Warning: package 'tidyr' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
install.packages("Hmisc", repos = "http://cran.us.r-project.org") 
## 
## The downloaded binary packages are in
##  /var/folders/5m/4f5rvwrn5rngf6j4gpl2mc9w0000gn/T//RtmprbeLkA/downloaded_packages
library("Hmisc")
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
install.packages("corrplot", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/5m/4f5rvwrn5rngf6j4gpl2mc9w0000gn/T//RtmprbeLkA/downloaded_packages
library("corrplot")
## corrplot 0.92 loaded
install.packages("lubridate", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/5m/4f5rvwrn5rngf6j4gpl2mc9w0000gn/T//RtmprbeLkA/downloaded_packages
library("lubridate") 
# Loading the data into a data frame
# read.csv() already returns a data.frame, so no data.frame() wrapper is needed.
# NOTE(review): the path is absolute and machine-specific; confirm it exists
# on the machine running this script.
life_expectancy_data <- read.csv("/Users/ursulapodosenin/Desktop/Life_Expectancy_Data.csv")

# Previewing the columns, their types, and the first few values of each
life_expectancy_data |>
  glimpse()
## Rows: 1,649
## Columns: 22
## $ Country                         <chr> "Afghanistan", "Afghanistan", "Afghani…
## $ Year                            <int> 2015, 2014, 2013, 2012, 2011, 2010, 20…
## $ Status                          <chr> "Developing", "Developing", "Developin…
## $ Life.expectancy                 <dbl> 65.0, 59.9, 59.9, 59.5, 59.2, 58.8, 58…
## $ Adult.Mortality                 <int> 263, 271, 268, 272, 275, 279, 281, 287…
## $ infant.deaths                   <int> 62, 64, 66, 69, 71, 74, 77, 80, 82, 84…
## $ Alcohol                         <dbl> 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.…
## $ percentage.expenditure          <dbl> 71.279624, 73.523582, 73.219243, 78.18…
## $ Hepatitis.B                     <int> 65, 62, 64, 67, 68, 66, 63, 64, 63, 64…
## $ Measles                         <int> 1154, 492, 430, 2787, 3013, 1989, 2861…
## $ BMI                             <dbl> 19.1, 18.6, 18.1, 17.6, 17.2, 16.7, 16…
## $ under.five.deaths               <int> 83, 86, 89, 93, 97, 102, 106, 110, 113…
## $ Polio                           <int> 6, 58, 62, 67, 68, 66, 63, 64, 63, 58,…
## $ Total.expenditure               <dbl> 8.16, 8.18, 8.13, 8.52, 7.87, 9.20, 9.…
## $ Diphtheria                      <int> 65, 62, 64, 67, 68, 66, 63, 64, 63, 58…
## $ HIV.AIDS                        <dbl> 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1…
## $ GDP                             <dbl> 584.25921, 612.69651, 631.74498, 669.9…
## $ Population                      <dbl> 33736494, 327582, 31731688, 3696958, 2…
## $ thinness..1.19.years            <dbl> 17.2, 17.5, 17.7, 17.9, 18.2, 18.4, 18…
## $ thinness.5.9.years              <dbl> 17.3, 17.5, 17.7, 18.0, 18.2, 18.4, 18…
## $ Income.composition.of.resources <dbl> 0.479, 0.476, 0.470, 0.463, 0.454, 0.4…
## $ Schooling                       <dbl> 10.1, 10.0, 9.9, 9.8, 9.5, 9.2, 8.9, 8…
# Inspecting the structure of the data frame
life_expectancy_data |>
  str()
## 'data.frame':    1649 obs. of  22 variables:
##  $ Country                        : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Year                           : int  2015 2014 2013 2012 2011 2010 2009 2008 2007 2006 ...
##  $ Status                         : chr  "Developing" "Developing" "Developing" "Developing" ...
##  $ Life.expectancy                : num  65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
##  $ Adult.Mortality                : int  263 271 268 272 275 279 281 287 295 295 ...
##  $ infant.deaths                  : int  62 64 66 69 71 74 77 80 82 84 ...
##  $ Alcohol                        : num  0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
##  $ percentage.expenditure         : num  71.3 73.5 73.2 78.2 7.1 ...
##  $ Hepatitis.B                    : int  65 62 64 67 68 66 63 64 63 64 ...
##  $ Measles                        : int  1154 492 430 2787 3013 1989 2861 1599 1141 1990 ...
##  $ BMI                            : num  19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
##  $ under.five.deaths              : int  83 86 89 93 97 102 106 110 113 116 ...
##  $ Polio                          : int  6 58 62 67 68 66 63 64 63 58 ...
##  $ Total.expenditure              : num  8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
##  $ Diphtheria                     : int  65 62 64 67 68 66 63 64 63 58 ...
##  $ HIV.AIDS                       : num  0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
##  $ GDP                            : num  584.3 612.7 631.7 670 63.5 ...
##  $ Population                     : num  33736494 327582 31731688 3696958 2978599 ...
##  $ thinness..1.19.years           : num  17.2 17.5 17.7 17.9 18.2 18.4 18.6 18.8 19 19.2 ...
##  $ thinness.5.9.years             : num  17.3 17.5 17.7 18 18.2 18.4 18.7 18.9 19.1 19.3 ...
##  $ Income.composition.of.resources: num  0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
##  $ Schooling                      : num  10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
# Counting the missing values across the whole data frame
life_expectancy_data |>
  is.na() |>
  sum()
## [1] 0
# Grouping the data: split the rows into four consecutive chunks that are
# plotted separately below.
# NOTE(review): the cut-offs are fixed row positions, so one country's rows may
# be split across two chunks; confirm this is acceptable for the plots.
trimmed_data_one <- life_expectancy_data[1:400, ]
trimmed_data_two <- life_expectancy_data[401:800, ]
trimmed_data_three <- life_expectancy_data[801:1200, ]
# The last chunk runs to the final row of the data instead of a hard-coded 1649
trimmed_data_four <- life_expectancy_data[1201:nrow(life_expectancy_data), ]
# Graphing life expectancy by country (rows 1-400 of the data)
ggplot(data = trimmed_data_one, aes(x = Life.expectancy, y = Country)) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0)
  geom_line(color = "blue", linewidth = 1) +
  # Aesthetic names are case-sensitive: the axis label argument is `y`, not `Y`
  labs(x = "Life Expectancy", y = "Country") +
  xlim(c(40, 100)) +
  theme_bw()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Graphing life expectancy by country (rows 401-800 of the data)
ggplot(data = trimmed_data_two, aes(x = Life.expectancy, y = Country)) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0)
  geom_line(color = "blue", linewidth = 1) +
  # Aesthetic names are case-sensitive: the axis label argument is `y`, not `Y`
  labs(x = "Life Expectancy", y = "Country") +
  xlim(c(40, 100)) +
  theme_bw()

# Graphing life expectancy by country (rows 801-1200 of the data)
ggplot(data = trimmed_data_three, aes(x = Life.expectancy, y = Country)) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0)
  geom_line(color = "blue", linewidth = 1) +
  # Aesthetic names are case-sensitive: the axis label argument is `y`, not `Y`
  labs(x = "Life Expectancy", y = "Country") +
  xlim(c(40, 100)) +
  theme_bw()

# Graphing life expectancy by country (the final chunk of rows, from 1201 on)
ggplot(data = trimmed_data_four, aes(x = Life.expectancy, y = Country)) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0)
  geom_line(color = "blue", linewidth = 1) +
  # Aesthetic names are case-sensitive: the axis label argument is `y`, not `Y`
  labs(x = "Life Expectancy", y = "Country") +
  xlim(c(40, 100)) +
  theme_bw()

# Adding a derived column to the original data frame: the ratio of the
# infant.deaths column to the Adult.Mortality column, row by row
life_expectancy_data[["infant_deaths_relative_to_adults"]] <- with(
  life_expectancy_data,
  infant.deaths / Adult.Mortality
)

# Plotting infant deaths relative to adults by country status
ggplot(
  data = life_expectancy_data,
  aes(x = Status, y = infant_deaths_relative_to_adults)
) +
  geom_point(color = "purple", size = 2.5) +
  labs(x = "Country Status", y = "Infant Deaths Relative to Adults") +
  # theme_bw() is a complete theme and replaces any complete theme added
  # before it, so the earlier theme_minimal() call had no effect
  theme_bw()

# Keeping only the three columns used in the correlation analysis below
subset_one_life_expectancy_data <- life_expectancy_data[
  c("Life.expectancy", "Alcohol", "BMI")
]

# Running rcorr() on the subset as a matrix; the printed result shows the
# pairwise correlations, the number of observations (n), and the p-values (P)
p_values <- subset_one_life_expectancy_data |>
  as.matrix() |>
  rcorr()
print(p_values)
##                 Life.expectancy Alcohol  BMI
## Life.expectancy            1.00    0.40 0.54
## Alcohol                    0.40    1.00 0.35
## BMI                        0.54    0.35 1.00
## 
## n= 1649 
## 
## 
## P
##                 Life.expectancy Alcohol BMI
## Life.expectancy                  0       0 
## Alcohol          0                       0 
## BMI              0               0
# Previewing the first rows of the subset before drawing the visual
# correlation matrix
subset_one_life_expectancy_data |>
  head()
##   Life.expectancy Alcohol  BMI
## 1            65.0    0.01 19.1
## 2            59.9    0.01 18.6
## 3            59.9    0.01 18.1
## 4            59.5    0.01 17.6
## 5            59.2    0.01 17.2
## 6            58.8    0.01 16.7
# Correlation matrix of the subset, drawn with the "ellipse" method
M <- subset_one_life_expectancy_data |>
  cor()
corrplot(M, method = "ellipse")

# Previewing the first rows of the subset before drawing the numeric
# correlation matrix
subset_one_life_expectancy_data |>
  head()
##   Life.expectancy Alcohol  BMI
## 1            65.0    0.01 19.1
## 2            59.9    0.01 18.6
## 3            59.9    0.01 18.1
## 4            59.5    0.01 17.6
## 5            59.2    0.01 17.2
## 6            58.8    0.01 16.7
# Correlation matrix of the subset, drawn with the "number" method
M <- subset_one_life_expectancy_data |>
  cor()
corrplot(M, method = "number")

# Building a second data frame that keeps the country plus the disease columns
subset_two_life_expectancy_data <- life_expectancy_data[
  c(
    "Country",
    "Hepatitis.B",
    "Measles",
    "Polio",
    "Diphtheria",
    "HIV.AIDS"
  )
]

# Obtaining some summary statistics for some health conditions in each country
# (one row per country).
# NOTE(review): the column names keep the original "patients" wording; confirm
# that these variables are patient counts rather than rates or percentages.
subset_two_life_expectancy_data |>
  group_by(Country) |>
  summarise(
    average_number_of_hepb_patients = mean(Hepatitis.B),
    # range() returns two values (min, max), which made summarise() emit two
    # rows per country; report them as separate columns instead
    min_number_of_measles_patients = min(Measles),
    max_number_of_measles_patients = max(Measles),
    min_number_of_polio_patients = min(Polio),
    max_number_of_polio_patients = max(Polio),
    average_number_of_diphtheria_patients = mean(Diphtheria),
    # Previously this reused the Diphtheria column name, so the HIV.AIDS mean
    # overwrote the Diphtheria mean in the result
    average_number_of_hiv_aids_patients = mean(HIV.AIDS),
    .groups = "drop"
  )
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'Country'. You can override using the
## `.groups` argument.
## # A tibble: 266 × 5
## # Groups:   Country [133]
##    Country  average_number_of_he…¹ range_of_measles_pat…² range_of_polio_patie…³
##    <chr>                     <dbl>                  <int>                  <int>
##  1 Afghani…                   64.6                    430                      5
##  2 Afghani…                   64.6                   8762                     68
##  3 Albania                    98                        0                     97
##  4 Albania                    98                      662                     99
##  5 Algeria                    76.5                      0                     86
##  6 Algeria                    76.5                   3289                     95
##  7 Angola                     71                      265                     63
##  8 Angola                     71                    11699                     81
##  9 Argenti…                   80.3                      0                     91
## 10 Argenti…                   80.3                     17                     99
## # ℹ 256 more rows
## # ℹ abbreviated names: ¹​average_number_of_hepb_patients,
## #   ²​range_of_measles_patients, ³​range_of_polio_patients
## # ℹ 1 more variable: average_number_of_diphtheria_patients <dbl>
# Creating a data frame with the subset of data that is going to be used:
# every row whose Country is "Afghanistan", selected by value rather than by
# the hard-coded row positions 1:16 so it does not depend on the row order
Afghanistan_data <- life_expectancy_data[
  life_expectancy_data$Country == "Afghanistan",
]
Afghanistan_data
##        Country Year     Status Life.expectancy Adult.Mortality infant.deaths
## 1  Afghanistan 2015 Developing            65.0             263            62
## 2  Afghanistan 2014 Developing            59.9             271            64
## 3  Afghanistan 2013 Developing            59.9             268            66
## 4  Afghanistan 2012 Developing            59.5             272            69
## 5  Afghanistan 2011 Developing            59.2             275            71
## 6  Afghanistan 2010 Developing            58.8             279            74
## 7  Afghanistan 2009 Developing            58.6             281            77
## 8  Afghanistan 2008 Developing            58.1             287            80
## 9  Afghanistan 2007 Developing            57.5             295            82
## 10 Afghanistan 2006 Developing            57.3             295            84
## 11 Afghanistan 2005 Developing            57.3             291            85
## 12 Afghanistan 2004 Developing            57.0             293            87
## 13 Afghanistan 2003 Developing            56.7             295            87
## 14 Afghanistan 2002 Developing            56.2               3            88
## 15 Afghanistan 2001 Developing            55.3             316            88
## 16 Afghanistan 2000 Developing            54.8             321            88
##    Alcohol percentage.expenditure Hepatitis.B Measles  BMI under.five.deaths
## 1     0.01              71.279624          65    1154 19.1                83
## 2     0.01              73.523582          62     492 18.6                86
## 3     0.01              73.219243          64     430 18.1                89
## 4     0.01              78.184215          67    2787 17.6                93
## 5     0.01               7.097109          68    3013 17.2                97
## 6     0.01              79.679367          66    1989 16.7               102
## 7     0.01              56.762217          63    2861 16.2               106
## 8     0.03              25.873925          64    1599 15.7               110
## 9     0.02              10.910156          63    1141 15.2               113
## 10    0.03              17.171518          64    1990 14.7               116
## 11    0.02               1.388648          66    1296 14.2               118
## 12    0.02              15.296066          67     466 13.8               120
## 13    0.01              11.089053          65     798 13.4               122
## 14    0.01              16.887351          64    2486 13.0               122
## 15    0.01              10.574728          63    8762 12.6               122
## 16    0.01              10.424960          62    6532 12.2               122
##    Polio Total.expenditure Diphtheria HIV.AIDS       GDP Population
## 1      6              8.16         65      0.1 584.25921   33736494
## 2     58              8.18         62      0.1 612.69651     327582
## 3     62              8.13         64      0.1 631.74498   31731688
## 4     67              8.52         67      0.1 669.95900    3696958
## 5     68              7.87         68      0.1  63.53723    2978599
## 6     66              9.20         66      0.1 553.32894    2883167
## 7     63              9.42         63      0.1 445.89330     284331
## 8     64              8.33         64      0.1 373.36112    2729431
## 9     63              6.73         63      0.1 369.83580   26616792
## 10    58              7.43         58      0.1 272.56377    2589345
## 11    58              8.70         58      0.1  25.29413     257798
## 12     5              8.79          5      0.1 219.14135   24118979
## 13    41              8.82         41      0.1 198.72854    2364851
## 14    36              7.76         36      0.1 187.84595   21979923
## 15    35              7.80         33      0.1 117.49698    2966463
## 16    24              8.20         24      0.1 114.56000     293756
##    thinness..1.19.years thinness.5.9.years Income.composition.of.resources
## 1                  17.2               17.3                           0.479
## 2                  17.5               17.5                           0.476
## 3                  17.7               17.7                           0.470
## 4                  17.9               18.0                           0.463
## 5                  18.2               18.2                           0.454
## 6                  18.4               18.4                           0.448
## 7                  18.6               18.7                           0.434
## 8                  18.8               18.9                           0.433
## 9                  19.0               19.1                           0.415
## 10                 19.2               19.3                           0.405
## 11                 19.3               19.5                           0.396
## 12                 19.5               19.7                           0.381
## 13                 19.7               19.9                           0.373
## 14                 19.9                2.2                           0.341
## 15                  2.1                2.4                           0.340
## 16                  2.3                2.5                           0.338
##    Schooling infant_deaths_relative_to_adults
## 1       10.1                        0.2357414
## 2       10.0                        0.2361624
## 3        9.9                        0.2462687
## 4        9.8                        0.2536765
## 5        9.5                        0.2581818
## 6        9.2                        0.2652330
## 7        8.9                        0.2740214
## 8        8.7                        0.2787456
## 9        8.4                        0.2779661
## 10       8.1                        0.2847458
## 11       7.9                        0.2920962
## 12       6.8                        0.2969283
## 13       6.5                        0.2949153
## 14       6.2                       29.3333333
## 15       5.9                        0.2784810
## 16       5.5                        0.2741433
# Creating a graph of time series data
# The Afghanistan rows are yearly observations listed from 2015 down to 2000,
# so order the Measles values chronologically and declare one observation per
# year (frequency = 1). Declaring frequency = 12 treated the yearly values as
# monthly ones and recycled them to fill the 2000-2015 span.
myts <- with(
  Afghanistan_data,
  ts(Measles[order(Year)], start = min(Year), frequency = 1)
)
# Keep the 2014-2015 span that the original axis limits asked for
myts2 <- window(myts, start = 2014, end = 2015)
# Base plot() returns NULL, so ggplot2's xlim()/ylim() cannot be added to it
# with `+`; the axis limits are passed as plot() arguments instead
plot(
  myts2,
  xlab = "Year",
  ylab = "Number of People",
  main = "Prevalence of Measles in Afghanistan",
  col.main = "darkgreen",
  xlim = c(2014, 2015),
  ylim = c(0, 10000)
)

## NULL