options(repos = "https://cran.rstudio.com/")
# Set the CRAN mirror
options(repos = "https://cran.rstudio.com/")

# Install the 'tidyverse' package
install.packages("tidyverse")
## Installing package into 'C:/Users/meebo/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\meebo\AppData\Local\Temp\Rtmp0ARLH6\downloaded_packages

The goal of the project is to clean dataset. In more detail, initial dataset look like this:

And the cleaned dataset should be like this:

---

install.packages("tidyverse")
## Installing package into 'C:/Users/meebo/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\meebo\AppData\Local\Temp\Rtmp0ARLH6\downloaded_packages
install.packages("janitor")
## Installing package into 'C:/Users/meebo/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'janitor' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\meebo\AppData\Local\Temp\Rtmp0ARLH6\downloaded_packages
install.packages("here")
## Installing package into 'C:/Users/meebo/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'here' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\meebo\AppData\Local\Temp\Rtmp0ARLH6\downloaded_packages
install.packages("skimr")
## Installing package into 'C:/Users/meebo/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'skimr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\meebo\AppData\Local\Temp\Rtmp0ARLH6\downloaded_packages
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(ggplot2)
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(here)
## here() starts at C:/Users/meebo/Downloads
library(skimr)
library(stringr)
death_cause <- read_csv("Cause of death.csv",show_col_types = FALSE)
death_cause
## # A tibble: 41,528 × 7
##    `Indicator Name`   `Indicator Code` `Country Name` `Country Code`  Year Value
##    <chr>              <chr>            <chr>          <chr>          <dbl> <dbl>
##  1 Cause of death, b… SH.DTH.COMM.FE.… Africa Easter… AFE             2019  50.0
##  2 Cause of death, b… SH.DTH.COMM.FE.… Africa Easter… AFE             2015  54.2
##  3 Cause of death, b… SH.DTH.COMM.FE.… Africa Easter… AFE             2010  61.2
##  4 Cause of death, b… SH.DTH.COMM.FE.… Africa Easter… AFE             2000  70.0
##  5 Cause of death, b… SH.DTH.COMM.FE.… Africa Wester… AFW             2019  60.0
##  6 Cause of death, b… SH.DTH.COMM.FE.… Africa Wester… AFW             2015  62.8
##  7 Cause of death, b… SH.DTH.COMM.FE.… Africa Wester… AFW             2010  66.5
##  8 Cause of death, b… SH.DTH.COMM.FE.… Africa Wester… AFW             2000  72.8
##  9 Cause of death, b… SH.DTH.COMM.FE.… Arab World     ARB             2019  19.2
## 10 Cause of death, b… SH.DTH.COMM.FE.… Arab World     ARB             2015  20.4
## # ℹ 41,518 more rows
## # ℹ 1 more variable: Disaggregation <chr>

1. Inspect data

skim_without_charts(death_cause)
Data summary
Name death_cause
Number of rows 41528
Number of columns 7
_______________________
Column type frequency:
character 5
numeric 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Indicator Name 0 1 38 144 0 45 0
Indicator Code 0 1 14 22 0 45 0
Country Name 0 1 3 50 0 231 0
Country Code 0 1 3 3 0 231 0
Disaggregation 0 1 17 87 0 45 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
Year 0 1 2011.00 7.11 2000 2000.00 2010.00 2019.00 2019
Value 0 1 33.38 28.83 0 7.66 23.21 56.21 100
glimpse(death_cause)
## Rows: 41,528
## Columns: 7
## $ `Indicator Name` <chr> "Cause of death, by communicable diseases and materna…
## $ `Indicator Code` <chr> "SH.DTH.COMM.FE.ZS", "SH.DTH.COMM.FE.ZS", "SH.DTH.COM…
## $ `Country Name`   <chr> "Africa Eastern and Southern", "Africa Eastern and So…
## $ `Country Code`   <chr> "AFE", "AFE", "AFE", "AFE", "AFW", "AFW", "AFW", "AFW…
## $ Year             <dbl> 2019, 2015, 2010, 2000, 2019, 2015, 2010, 2000, 2019,…
## $ Value            <dbl> 49.954751, 54.176359, 61.197361, 70.024576, 59.977799…
## $ Disaggregation   <chr> "female, All Ages, Communicable diseases and maternal…
colnames(death_cause)
## [1] "Indicator Name" "Indicator Code" "Country Name"   "Country Code"  
## [5] "Year"           "Value"          "Disaggregation"
dim(death_cause)
## [1] 41528     7
str(death_cause)
## spc_tbl_ [41,528 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Indicator Name: chr [1:41528] "Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, female (% of female population)" "Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, female (% of female population)" "Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, female (% of female population)" "Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, female (% of female population)" ...
##  $ Indicator Code: chr [1:41528] "SH.DTH.COMM.FE.ZS" "SH.DTH.COMM.FE.ZS" "SH.DTH.COMM.FE.ZS" "SH.DTH.COMM.FE.ZS" ...
##  $ Country Name  : chr [1:41528] "Africa Eastern and Southern" "Africa Eastern and Southern" "Africa Eastern and Southern" "Africa Eastern and Southern" ...
##  $ Country Code  : chr [1:41528] "AFE" "AFE" "AFE" "AFE" ...
##  $ Year          : num [1:41528] 2019 2015 2010 2000 2019 ...
##  $ Value         : num [1:41528] 50 54.2 61.2 70 60 ...
##  $ Disaggregation: chr [1:41528] "female, All Ages, Communicable diseases and maternal, prenatal and nutrition conditions" "female, All Ages, Communicable diseases and maternal, prenatal and nutrition conditions" "female, All Ages, Communicable diseases and maternal, prenatal and nutrition conditions" "female, All Ages, Communicable diseases and maternal, prenatal and nutrition conditions" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Indicator Name` = col_character(),
##   ..   `Indicator Code` = col_character(),
##   ..   `Country Name` = col_character(),
##   ..   `Country Code` = col_character(),
##   ..   Year = col_double(),
##   ..   Value = col_double(),
##   ..   Disaggregation = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

2. Cleaning process

reduced_death_cause <- death_cause %>% select(-`Indicator Code`,-Disaggregation)
reduced_death_cause
## # A tibble: 41,528 × 5
##    `Indicator Name`                    `Country Name` `Country Code`  Year Value
##    <chr>                               <chr>          <chr>          <dbl> <dbl>
##  1 Cause of death, by communicable di… Africa Easter… AFE             2019  50.0
##  2 Cause of death, by communicable di… Africa Easter… AFE             2015  54.2
##  3 Cause of death, by communicable di… Africa Easter… AFE             2010  61.2
##  4 Cause of death, by communicable di… Africa Easter… AFE             2000  70.0
##  5 Cause of death, by communicable di… Africa Wester… AFW             2019  60.0
##  6 Cause of death, by communicable di… Africa Wester… AFW             2015  62.8
##  7 Cause of death, by communicable di… Africa Wester… AFW             2010  66.5
##  8 Cause of death, by communicable di… Africa Wester… AFW             2000  72.8
##  9 Cause of death, by communicable di… Arab World     ARB             2019  19.2
## 10 Cause of death, by communicable di… Arab World     ARB             2015  20.4
## # ℹ 41,518 more rows
class(reduced_death_cause)
## [1] "tbl_df"     "tbl"        "data.frame"
filtered_death_cause <- reduced_death_cause %>%
  filter(grepl("female.*age",`Indicator Name`, ignore.case = TRUE)|
         grepl("male.*age",`Indicator Name`, ignore.case = TRUE))
filtered_death_cause
## # A tibble: 22,131 × 5
##    `Indicator Name`                    `Country Name` `Country Code`  Year Value
##    <chr>                               <chr>          <chr>          <dbl> <dbl>
##  1 Cause of death, by communicable di… Africa Easter… AFE             2019  84.2
##  2 Cause of death, by communicable di… Africa Easter… AFE             2015  84.9
##  3 Cause of death, by communicable di… Africa Easter… AFE             2010  87.5
##  4 Cause of death, by communicable di… Africa Easter… AFE             2000  91.2
##  5 Cause of death, by communicable di… Africa Wester… AFW             2019  86.3
##  6 Cause of death, by communicable di… Africa Wester… AFW             2015  87.2
##  7 Cause of death, by communicable di… Africa Wester… AFW             2010  90.0
##  8 Cause of death, by communicable di… Africa Wester… AFW             2000  93.0
##  9 Cause of death, by communicable di… Arab World     ARB             2019  77.3
## 10 Cause of death, by communicable di… Arab World     ARB             2015  75.0
## # ℹ 22,121 more rows
num_unique_values <- n_distinct(filtered_death_cause$`Indicator Name`)
print(num_unique_values)
## [1] 24
# Count the occurrences of each unique cause of death
cause_of_death_counts <- table(filtered_death_cause$`Indicator Name`)

# Display the unique cause of death names and their counts
unique_causes <- unique(filtered_death_cause$`Indicator Name`)
for (cause in unique_causes) {
  cat(cause, ": ", cause_of_death_counts[cause], "\n")
}
## Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, ages 0-4, female (% of female population ages 0-4) :  924 
## Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, ages 0-4, male (% of male population ages 0-4) :  924 
## Cause of death, by injury, ages 0-4, female (% of female population ages 0-4) :  924 
## Cause of death, by injury, ages 0-4, male (% of male population ages 0-4) :  924 
## Cause of death, by non-communicable diseases, ages 0-4, female (% of female population ages 0-4) :  924 
## Cause of death, by non-communicable diseases, ages 0-4, male (% of male population ages 0-4) :  924 
## Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, ages 5-14, female (% of female population ages 5-14) :  902 
## Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, ages 5-14, male (% of male population ages 5-14) :  908 
## Cause of death, by injury, ages 5-14, female (% of female population ages 5-14) :  924 
## Cause of death, by injury, ages 5-14, male (% of male population ages 5-14) :  924 
## Cause of death, by non-communicable diseases, ages 5-14, female (% of female population ages 5-14) :  923 
## Cause of death, by non-communicable diseases, ages 5-14, male (% of male population ages 5-14) :  923 
## Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, ages 15-59, female (% of female population ages 15-59) :  924 
## Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, ages 15-59, male (% of male population ages 15-59) :  919 
## Cause of death, by injury, ages 15-59, female (% of female population ages 15-59) :  924 
## Cause of death, by injury, ages 15-59, male (% of male population ages 15-59) :  924 
## Cause of death, by non-communicable diseases, ages 15-59, female (% of female population ages 15-59) :  924 
## Cause of death, by non-communicable diseases, ages 15-59, male (% of male population ages 15-59) :  924 
## Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, ages 60+, female (% of female population ages 60+) :  924 
## Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, ages 60+, male (% of male population ages 60+) :  924 
## Cause of death, by injury, ages 60+, female (% of female population ages 60+) :  924 
## Cause of death, by injury, ages 60+, male (% of male population ages 60+) :  924 
## Cause of death, by non-communicable diseases, ages 60+, female (% of female population ages 60+) :  924 
## Cause of death, by non-communicable diseases, ages 60+, male (% of male population ages 60+) :  924

Look like Indicator Name has 02 structure string (24 unique string values), one with 5 parts and one with 4 parts that are separated by comma. We just need to extract 03 parts that have cause of death, age, sex (male/female)

filtered_death_cause <- filtered_death_cause %>%
  rename(indicator_name = `Indicator Name`, country_name = `Country Name`, country_code = `Country Code`, year = Year, death_rate = Value)
# Split data in Indicator Name column into age_bracket, gender, cause_name
filtered_death_cause$ages_bracket <- str_extract(filtered_death_cause$indicator_name, "ages[\\s0-9+-]+")
filtered_death_cause$gender <- str_extract(filtered_death_cause$indicator_name, "male|female")
filtered_death_cause$cause_name <- str_extract(filtered_death_cause$indicator_name, "(?<=by ).*(?=, ages)")
num_missing <- sum(is.na(filtered_death_cause))
print(num_missing)
## [1] 0
# Final cleaned df
cleaned_death_cause <- filtered_death_cause %>% select(-indicator_name)
cleaned_death_cause
## # A tibble: 22,131 × 7
##    country_name     country_code  year death_rate ages_bracket gender cause_name
##    <chr>            <chr>        <dbl>      <dbl> <chr>        <chr>  <chr>     
##  1 Africa Eastern … AFE           2019       84.2 ages 0-4     female communica…
##  2 Africa Eastern … AFE           2015       84.9 ages 0-4     female communica…
##  3 Africa Eastern … AFE           2010       87.5 ages 0-4     female communica…
##  4 Africa Eastern … AFE           2000       91.2 ages 0-4     female communica…
##  5 Africa Western … AFW           2019       86.3 ages 0-4     female communica…
##  6 Africa Western … AFW           2015       87.2 ages 0-4     female communica…
##  7 Africa Western … AFW           2010       90.0 ages 0-4     female communica…
##  8 Africa Western … AFW           2000       93.0 ages 0-4     female communica…
##  9 Arab World       ARB           2019       77.3 ages 0-4     female communica…
## 10 Arab World       ARB           2015       75.0 ages 0-4     female communica…
## # ℹ 22,121 more rows

3. Analyze process and visualization

avg_by_cause_year <- cleaned_death_cause %>%
  group_by(cause_name, year) %>%
  summarize(avg_death_rate = mean(death_rate))
## `summarise()` has grouped output by 'cause_name'. You can override using the
## `.groups` argument.
ggplot(avg_by_cause_year, aes(x = year, y = avg_death_rate, color = cause_name)) +
  geom_line() +
  labs(title = "Trend of Death Rates by Year and Cause of death",
       x = "Year",
       y = "Average Death Rate",
       color = "Cause of death") +
  theme_minimal()

avg_by_country <- cleaned_death_cause %>%
  filter(year == 2019) %>%
  group_by(country_name) %>%
  summarize(avg_death_rate1 = mean(death_rate)) %>%
  arrange(desc(avg_death_rate1)) %>%
  head(5)

ggplot(avg_by_country, aes(x = country_name, y = avg_death_rate1)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(title = "Top 5 Countries with Highest Death Rates in 2019",
       x = "Country",
       y = "Average Death Rate") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

avg_by_age <- cleaned_death_cause %>%
  filter(year == 2019) %>%
  group_by(ages_bracket) %>%
  summarize(avg_death_rate2 = mean(death_rate))


ggplot(avg_by_age, aes(x = ages_bracket, y = avg_death_rate2)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  labs(title = "Death Rate Distribution by Age Bracket in 2019",
       x = "Age Bracket",
       y = "Average Death Rate") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))