options(repos = "https://cran.rstudio.com/")
# Set the CRAN mirror
options(repos = "https://cran.rstudio.com/")
# Install the 'tidyverse' package
install.packages("tidyverse")
## Installing package into 'C:/Users/meebo/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\meebo\AppData\Local\Temp\Rtmp0ARLH6\downloaded_packages
The goal of the project is to clean dataset. In more detail, initial dataset look like this:
And the cleaned dataset should be like this:
---
install.packages("tidyverse")
## Installing package into 'C:/Users/meebo/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\meebo\AppData\Local\Temp\Rtmp0ARLH6\downloaded_packages
install.packages("janitor")
## Installing package into 'C:/Users/meebo/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'janitor' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\meebo\AppData\Local\Temp\Rtmp0ARLH6\downloaded_packages
install.packages("here")
## Installing package into 'C:/Users/meebo/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'here' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\meebo\AppData\Local\Temp\Rtmp0ARLH6\downloaded_packages
install.packages("skimr")
## Installing package into 'C:/Users/meebo/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'skimr' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\meebo\AppData\Local\Temp\Rtmp0ARLH6\downloaded_packages
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(ggplot2)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(here)
## here() starts at C:/Users/meebo/Downloads
library(skimr)
library(stringr)
death_cause <- read_csv("Cause of death.csv",show_col_types = FALSE)
death_cause
## # A tibble: 41,528 × 7
## `Indicator Name` `Indicator Code` `Country Name` `Country Code` Year Value
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 Cause of death, b… SH.DTH.COMM.FE.… Africa Easter… AFE 2019 50.0
## 2 Cause of death, b… SH.DTH.COMM.FE.… Africa Easter… AFE 2015 54.2
## 3 Cause of death, b… SH.DTH.COMM.FE.… Africa Easter… AFE 2010 61.2
## 4 Cause of death, b… SH.DTH.COMM.FE.… Africa Easter… AFE 2000 70.0
## 5 Cause of death, b… SH.DTH.COMM.FE.… Africa Wester… AFW 2019 60.0
## 6 Cause of death, b… SH.DTH.COMM.FE.… Africa Wester… AFW 2015 62.8
## 7 Cause of death, b… SH.DTH.COMM.FE.… Africa Wester… AFW 2010 66.5
## 8 Cause of death, b… SH.DTH.COMM.FE.… Africa Wester… AFW 2000 72.8
## 9 Cause of death, b… SH.DTH.COMM.FE.… Arab World ARB 2019 19.2
## 10 Cause of death, b… SH.DTH.COMM.FE.… Arab World ARB 2015 20.4
## # ℹ 41,518 more rows
## # ℹ 1 more variable: Disaggregation <chr>
skim_without_charts(death_cause)
| Name | death_cause |
| Number of rows | 41528 |
| Number of columns | 7 |
| _______________________ | |
| Column type frequency: | |
| character | 5 |
| numeric | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Indicator Name | 0 | 1 | 38 | 144 | 0 | 45 | 0 |
| Indicator Code | 0 | 1 | 14 | 22 | 0 | 45 | 0 |
| Country Name | 0 | 1 | 3 | 50 | 0 | 231 | 0 |
| Country Code | 0 | 1 | 3 | 3 | 0 | 231 | 0 |
| Disaggregation | 0 | 1 | 17 | 87 | 0 | 45 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| Year | 0 | 1 | 2011.00 | 7.11 | 2000 | 2000.00 | 2010.00 | 2019.00 | 2019 |
| Value | 0 | 1 | 33.38 | 28.83 | 0 | 7.66 | 23.21 | 56.21 | 100 |
glimpse(death_cause)
## Rows: 41,528
## Columns: 7
## $ `Indicator Name` <chr> "Cause of death, by communicable diseases and materna…
## $ `Indicator Code` <chr> "SH.DTH.COMM.FE.ZS", "SH.DTH.COMM.FE.ZS", "SH.DTH.COM…
## $ `Country Name` <chr> "Africa Eastern and Southern", "Africa Eastern and So…
## $ `Country Code` <chr> "AFE", "AFE", "AFE", "AFE", "AFW", "AFW", "AFW", "AFW…
## $ Year <dbl> 2019, 2015, 2010, 2000, 2019, 2015, 2010, 2000, 2019,…
## $ Value <dbl> 49.954751, 54.176359, 61.197361, 70.024576, 59.977799…
## $ Disaggregation <chr> "female, All Ages, Communicable diseases and maternal…
colnames(death_cause)
## [1] "Indicator Name" "Indicator Code" "Country Name" "Country Code"
## [5] "Year" "Value" "Disaggregation"
dim(death_cause)
## [1] 41528 7
str(death_cause)
## spc_tbl_ [41,528 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Indicator Name: chr [1:41528] "Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, female (% of female population)" "Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, female (% of female population)" "Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, female (% of female population)" "Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, female (% of female population)" ...
## $ Indicator Code: chr [1:41528] "SH.DTH.COMM.FE.ZS" "SH.DTH.COMM.FE.ZS" "SH.DTH.COMM.FE.ZS" "SH.DTH.COMM.FE.ZS" ...
## $ Country Name : chr [1:41528] "Africa Eastern and Southern" "Africa Eastern and Southern" "Africa Eastern and Southern" "Africa Eastern and Southern" ...
## $ Country Code : chr [1:41528] "AFE" "AFE" "AFE" "AFE" ...
## $ Year : num [1:41528] 2019 2015 2010 2000 2019 ...
## $ Value : num [1:41528] 50 54.2 61.2 70 60 ...
## $ Disaggregation: chr [1:41528] "female, All Ages, Communicable diseases and maternal, prenatal and nutrition conditions" "female, All Ages, Communicable diseases and maternal, prenatal and nutrition conditions" "female, All Ages, Communicable diseases and maternal, prenatal and nutrition conditions" "female, All Ages, Communicable diseases and maternal, prenatal and nutrition conditions" ...
## - attr(*, "spec")=
## .. cols(
## .. `Indicator Name` = col_character(),
## .. `Indicator Code` = col_character(),
## .. `Country Name` = col_character(),
## .. `Country Code` = col_character(),
## .. Year = col_double(),
## .. Value = col_double(),
## .. Disaggregation = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
reduced_death_cause <- death_cause %>% select(-`Indicator Code`,-Disaggregation)
reduced_death_cause
## # A tibble: 41,528 × 5
## `Indicator Name` `Country Name` `Country Code` Year Value
## <chr> <chr> <chr> <dbl> <dbl>
## 1 Cause of death, by communicable di… Africa Easter… AFE 2019 50.0
## 2 Cause of death, by communicable di… Africa Easter… AFE 2015 54.2
## 3 Cause of death, by communicable di… Africa Easter… AFE 2010 61.2
## 4 Cause of death, by communicable di… Africa Easter… AFE 2000 70.0
## 5 Cause of death, by communicable di… Africa Wester… AFW 2019 60.0
## 6 Cause of death, by communicable di… Africa Wester… AFW 2015 62.8
## 7 Cause of death, by communicable di… Africa Wester… AFW 2010 66.5
## 8 Cause of death, by communicable di… Africa Wester… AFW 2000 72.8
## 9 Cause of death, by communicable di… Arab World ARB 2019 19.2
## 10 Cause of death, by communicable di… Arab World ARB 2015 20.4
## # ℹ 41,518 more rows
class(reduced_death_cause)
## [1] "tbl_df" "tbl" "data.frame"
filtered_death_cause <- reduced_death_cause %>%
filter(grepl("female.*age",`Indicator Name`, ignore.case = TRUE)|
grepl("male.*age",`Indicator Name`, ignore.case = TRUE))
filtered_death_cause
## # A tibble: 22,131 × 5
## `Indicator Name` `Country Name` `Country Code` Year Value
## <chr> <chr> <chr> <dbl> <dbl>
## 1 Cause of death, by communicable di… Africa Easter… AFE 2019 84.2
## 2 Cause of death, by communicable di… Africa Easter… AFE 2015 84.9
## 3 Cause of death, by communicable di… Africa Easter… AFE 2010 87.5
## 4 Cause of death, by communicable di… Africa Easter… AFE 2000 91.2
## 5 Cause of death, by communicable di… Africa Wester… AFW 2019 86.3
## 6 Cause of death, by communicable di… Africa Wester… AFW 2015 87.2
## 7 Cause of death, by communicable di… Africa Wester… AFW 2010 90.0
## 8 Cause of death, by communicable di… Africa Wester… AFW 2000 93.0
## 9 Cause of death, by communicable di… Arab World ARB 2019 77.3
## 10 Cause of death, by communicable di… Arab World ARB 2015 75.0
## # ℹ 22,121 more rows
num_unique_values <- n_distinct(filtered_death_cause$`Indicator Name`)
print(num_unique_values)
## [1] 24
# Count the occurrences of each unique cause of death
cause_of_death_counts <- table(filtered_death_cause$`Indicator Name`)
# Display the unique cause of death names and their counts
unique_causes <- unique(filtered_death_cause$`Indicator Name`)
for (cause in unique_causes) {
cat(cause, ": ", cause_of_death_counts[cause], "\n")
}
## Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, ages 0-4, female (% of female population ages 0-4) : 924
## Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, ages 0-4, male (% of male population ages 0-4) : 924
## Cause of death, by injury, ages 0-4, female (% of female population ages 0-4) : 924
## Cause of death, by injury, ages 0-4, male (% of male population ages 0-4) : 924
## Cause of death, by non-communicable diseases, ages 0-4, female (% of female population ages 0-4) : 924
## Cause of death, by non-communicable diseases, ages 0-4, male (% of male population ages 0-4) : 924
## Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, ages 5-14, female (% of female population ages 5-14) : 902
## Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, ages 5-14, male (% of male population ages 5-14) : 908
## Cause of death, by injury, ages 5-14, female (% of female population ages 5-14) : 924
## Cause of death, by injury, ages 5-14, male (% of male population ages 5-14) : 924
## Cause of death, by non-communicable diseases, ages 5-14, female (% of female population ages 5-14) : 923
## Cause of death, by non-communicable diseases, ages 5-14, male (% of male population ages 5-14) : 923
## Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, ages 15-59, female (% of female population ages 15-59) : 924
## Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, ages 15-59, male (% of male population ages 15-59) : 919
## Cause of death, by injury, ages 15-59, female (% of female population ages 15-59) : 924
## Cause of death, by injury, ages 15-59, male (% of male population ages 15-59) : 924
## Cause of death, by non-communicable diseases, ages 15-59, female (% of female population ages 15-59) : 924
## Cause of death, by non-communicable diseases, ages 15-59, male (% of male population ages 15-59) : 924
## Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, ages 60+, female (% of female population ages 60+) : 924
## Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, ages 60+, male (% of male population ages 60+) : 924
## Cause of death, by injury, ages 60+, female (% of female population ages 60+) : 924
## Cause of death, by injury, ages 60+, male (% of male population ages 60+) : 924
## Cause of death, by non-communicable diseases, ages 60+, female (% of female population ages 60+) : 924
## Cause of death, by non-communicable diseases, ages 60+, male (% of male population ages 60+) : 924
Look like Indicator Name has 02 structure string (24 unique string values), one with 5 parts and one with 4 parts that are separated by comma. We just need to extract 03 parts that have cause of death, age, sex (male/female)
filtered_death_cause <- filtered_death_cause %>%
rename(indicator_name = `Indicator Name`, country_name = `Country Name`, country_code = `Country Code`, year = Year, death_rate = Value)
# Split data in Indicator Name column into age_bracket, gender, cause_name
filtered_death_cause$ages_bracket <- str_extract(filtered_death_cause$indicator_name, "ages[\\s0-9+-]+")
filtered_death_cause$gender <- str_extract(filtered_death_cause$indicator_name, "male|female")
filtered_death_cause$cause_name <- str_extract(filtered_death_cause$indicator_name, "(?<=by ).*(?=, ages)")
num_missing <- sum(is.na(filtered_death_cause))
print(num_missing)
## [1] 0
# Final cleaned df
cleaned_death_cause <- filtered_death_cause %>% select(-indicator_name)
cleaned_death_cause
## # A tibble: 22,131 × 7
## country_name country_code year death_rate ages_bracket gender cause_name
## <chr> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 Africa Eastern … AFE 2019 84.2 ages 0-4 female communica…
## 2 Africa Eastern … AFE 2015 84.9 ages 0-4 female communica…
## 3 Africa Eastern … AFE 2010 87.5 ages 0-4 female communica…
## 4 Africa Eastern … AFE 2000 91.2 ages 0-4 female communica…
## 5 Africa Western … AFW 2019 86.3 ages 0-4 female communica…
## 6 Africa Western … AFW 2015 87.2 ages 0-4 female communica…
## 7 Africa Western … AFW 2010 90.0 ages 0-4 female communica…
## 8 Africa Western … AFW 2000 93.0 ages 0-4 female communica…
## 9 Arab World ARB 2019 77.3 ages 0-4 female communica…
## 10 Arab World ARB 2015 75.0 ages 0-4 female communica…
## # ℹ 22,121 more rows
avg_by_cause_year <- cleaned_death_cause %>%
group_by(cause_name, year) %>%
summarize(avg_death_rate = mean(death_rate))
## `summarise()` has grouped output by 'cause_name'. You can override using the
## `.groups` argument.
ggplot(avg_by_cause_year, aes(x = year, y = avg_death_rate, color = cause_name)) +
geom_line() +
labs(title = "Trend of Death Rates by Year and Cause of death",
x = "Year",
y = "Average Death Rate",
color = "Cause of death") +
theme_minimal()
avg_by_country <- cleaned_death_cause %>%
filter(year == 2019) %>%
group_by(country_name) %>%
summarize(avg_death_rate1 = mean(death_rate)) %>%
arrange(desc(avg_death_rate1)) %>%
head(5)
ggplot(avg_by_country, aes(x = country_name, y = avg_death_rate1)) +
geom_bar(stat = "identity", fill = "steelblue") +
labs(title = "Top 5 Countries with Highest Death Rates in 2019",
x = "Country",
y = "Average Death Rate") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
avg_by_age <- cleaned_death_cause %>%
filter(year == 2019) %>%
group_by(ages_bracket) %>%
summarize(avg_death_rate2 = mean(death_rate))
ggplot(avg_by_age, aes(x = ages_bracket, y = avg_death_rate2)) +
geom_bar(stat = "identity", fill = "skyblue") +
labs(title = "Death Rate Distribution by Age Bracket in 2019",
x = "Age Bracket",
y = "Average Death Rate") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))