# 1. Exploratory Data Analysis (EDA)
library(tidyverse)
## Warning: 程辑包'tidyverse'是用R版本4.2.3 来建造的
## Warning: 程辑包'ggplot2'是用R版本4.2.3 来建造的
## Warning: 程辑包'tibble'是用R版本4.2.3 来建造的
## Warning: 程辑包'tidyr'是用R版本4.2.3 来建造的
## Warning: 程辑包'readr'是用R版本4.2.3 来建造的
## Warning: 程辑包'purrr'是用R版本4.2.3 来建造的
## Warning: 程辑包'dplyr'是用R版本4.2.3 来建造的
## Warning: 程辑包'stringr'是用R版本4.2.3 来建造的
## Warning: 程辑包'forcats'是用R版本4.2.3 来建造的
## Warning: 程辑包'lubridate'是用R版本4.2.3 来建造的
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the death counts by risk factor (one row per Entity and Year).
# NOTE(review): the name `data` shadows utils::data(); it is kept because the
# rest of the script refers to it.
data <- read.csv(file = "Countries and death causes.csv")
# Show each column's type together with a preview of its first values.
str(object = data)
## 'data.frame': 6840 obs. of 31 variables:
## $ Entity : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Code : chr "AFG" "AFG" "AFG" "AFG" ...
## $ Year : int 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 ...
## $ Outdoor.air.pollution : int 3169 3222 3395 3623 3788 3869 3943 4024 4040 4042 ...
## $ High.systolic.blood.pressure : int 25633 25872 26309 26961 27658 28090 28587 29021 29349 29712 ...
## $ Diet.high.in.sodium : int 1045 1055 1075 1103 1134 1154 1178 1202 1222 1242 ...
## $ Diet.low.in.whole.grains : int 7077 7149 7297 7499 7698 7807 7943 8075 8173 8265 ...
## $ Alochol.use : int 356 364 376 389 399 406 413 420 425 426 ...
## $ Diet.low.in.fruits : int 3185 3248 3351 3480 3610 3703 3819 3938 4038 4127 ...
## $ Unsafe.water.source : int 3702 4309 5356 7152 7192 8378 8487 9348 9788 9931 ...
## $ Secondhand.smoke : int 4794 4921 5279 5734 6050 6167 6298 6425 6402 6323 ...
## $ Low.birth.weight : int 16135 17924 21200 23795 24866 25534 25997 26246 25805 25080 ...
## $ Child.wasting : int 19546 20334 22895 27002 29205 30943 31628 32736 32760 32271 ...
## $ Unsafe.sex : int 351 361 378 395 410 422 435 448 458 469 ...
## $ Diet.low.in.nuts.and.seeds : int 2319 2449 2603 2771 2932 3049 3173 3298 3401 3482 ...
## $ Household.air.pollution.from.solid.fuels: int 34372 35392 38065 41154 43153 44024 45005 46017 46055 45681 ...
## $ Diet.low.in.Vegetables : int 3679 3732 3827 3951 4075 4153 4247 4339 4409 4473 ...
## $ Low.physical.activity : int 2637 2652 2688 2744 2805 2839 2878 2914 2944 2976 ...
## $ Smoking : int 5174 5247 5363 5522 5689 5801 5934 6066 6178 6288 ...
## $ High.fasting.plasma.glucose : int 11449 11811 12265 12821 13400 13871 14413 14970 15502 16058 ...
## $ Air.pollution : int 37231 38315 41172 44488 46634 47566 48617 49703 49746 49349 ...
## $ High.body.mass.index : int 9518 9489 9528 9611 9675 9608 9503 9286 9024 8857 ...
## $ Unsafe.sanitation : int 2798 3254 4042 5392 5418 6313 6393 7038 7366 7468 ...
## $ No.access.to.handwashing.facility : int 4825 5127 5889 7007 7421 7896 8098 8507 8560 8424 ...
## $ Drug.use : int 174 188 211 232 247 260 274 287 295 302 ...
## $ Low.bone.mineral.density : int 389 389 393 411 413 417 423 425 429 428 ...
## $ Vitamin.A.deficiency : int 2016 2056 2100 2316 2665 3070 3214 3228 3413 3662 ...
## $ Child.stunting : int 7686 7886 8568 9875 11031 11973 12426 12805 13011 13052 ...
## $ Discontinued.breastfeeding : int 107 121 150 204 204 233 233 255 264 263 ...
## $ Non.exclusive.breastfeeding : int 2216 2501 3053 3726 3833 4124 4183 4393 4417 4326 ...
## $ Iron.deficiency : int 564 611 700 773 812 848 883 914 924 909 ...
# Per-column summary: min, quartiles, mean and max for the numeric columns,
# length/class/mode for the character columns.
summary(data)
## Entity Code Year Outdoor.air.pollution
## Length:6840 Length:6840 Min. :1990 Min. : 0
## Class :character Class :character 1st Qu.:1997 1st Qu.: 434
## Mode :character Mode :character Median :2004 Median : 2101
## Mean :2004 Mean : 84582
## 3rd Qu.:2012 3rd Qu.: 11810
## Max. :2019 Max. :4506193
## High.systolic.blood.pressure Diet.high.in.sodium Diet.low.in.whole.grains
## Min. : 2 Min. : 0.0 Min. : 0.0
## 1st Qu.: 1828 1st Qu.: 137.0 1st Qu.: 273.8
## Median : 8770 Median : 969.5 Median : 1444.0
## Mean : 224225 Mean : 40497.2 Mean : 38691.3
## 3rd Qu.: 40356 3rd Qu.: 5169.8 3rd Qu.: 6773.2
## Max. :10845595 Max. :1885356.0 Max. :1844836.0
## Alochol.use Diet.low.in.fruits Unsafe.water.source Secondhand.smoke
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 1
## 1st Qu.: 263.8 1st Qu.: 144.0 1st Qu.: 7.0 1st Qu.: 209
## Median : 1780.5 Median : 834.5 Median : 182.5 Median : 994
## Mean : 54848.6 Mean : 23957.8 Mean : 44086.4 Mean : 30364
## 3rd Qu.: 8368.0 3rd Qu.: 3104.8 3rd Qu.: 5599.2 3rd Qu.: 4348
## Max. :2441973.0 Max. :1046015.0 Max. :2450944.0 Max. :1304318
## Low.birth.weight Child.wasting Unsafe.sex
## Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 123 1st Qu.: 26 1st Qu.: 97
## Median : 1057 Median : 504 Median : 619
## Mean : 59126 Mean : 49924 Mean : 27646
## 3rd Qu.: 10903 3rd Qu.: 9765 3rd Qu.: 4492
## Max. :3033425 Max. :3430422 Max. :1664813
## Diet.low.in.nuts.and.seeds Household.air.pollution.from.solid.fuels
## Min. : 0 Min. : 0
## 1st Qu.: 27 1st Qu.: 32
## Median : 252 Median : 821
## Mean : 12996 Mean : 83641
## 3rd Qu.: 1998 3rd Qu.: 10870
## Max. :575139 Max. :4358214
## Diet.low.in.Vegetables Low.physical.activity Smoking
## Min. : 0.0 Min. : 0.0 Min. : 1
## 1st Qu.: 109.0 1st Qu.: 92.0 1st Qu.: 894
## Median : 590.5 Median : 521.5 Median : 4987
## Mean : 11982.5 Mean : 16489.1 Mean : 181958
## 3rd Qu.: 2101.8 3rd Qu.: 2820.2 3rd Qu.: 23994
## Max. :529381.0 Max. :831502.0 Max. :7693368
## High.fasting.plasma.glucose Air.pollution High.body.mass.index
## Min. : 3 Min. : 0 Min. : 2
## 1st Qu.: 1178 1st Qu.: 816 1st Qu.: 918
## Median : 4966 Median : 5748 Median : 3917
## Mean : 117554 Mean : 164752 Mean : 89870
## 3rd Qu.: 21639 3rd Qu.: 25050 3rd Qu.: 17968
## Max. :6501398 Max. :6671740 Max. :5019360
## Unsafe.sanitation No.access.to.handwashing.facility Drug.use
## Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 3 1st Qu.: 19 1st Qu.: 31
## Median : 102 Median : 221 Median : 222
## Mean : 31522 Mean : 21800 Mean : 10285
## 3rd Qu.: 3854 3rd Qu.: 3954 3rd Qu.: 1224
## Max. :1842275 Max. :1200349 Max. :494492
## Low.bone.mineral.density Vitamin.A.deficiency Child.stunting
## Min. : 0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 43 1st Qu.: 0.0 1st Qu.: 1.0
## Median : 277 Median : 2.0 Median : 41.5
## Mean : 8182 Mean : 2471.6 Mean : 11164.3
## 3rd Qu.: 1232 3rd Qu.: 230.2 3rd Qu.: 1563.2
## Max. :437884 Max. :207555.0 Max. :833449.0
## Discontinued.breastfeeding Non.exclusive.breastfeeding Iron.deficiency
## Min. : 0.00 Min. : 0.0 Min. : 0
## 1st Qu.: 0.00 1st Qu.: 3.0 1st Qu.: 1
## Median : 4.00 Median : 60.5 Median : 12
## Mean : 431.46 Mean : 7171.9 Mean : 1421
## 3rd Qu.: 71.25 3rd Qu.: 1315.5 3rd Qu.: 238
## Max. :33106.00 Max. :505470.0 Max. :73461
# Preview the first rows of the raw data.
head(data)
## Entity Code Year Outdoor.air.pollution High.systolic.blood.pressure
## 1 Afghanistan AFG 1990 3169 25633
## 2 Afghanistan AFG 1991 3222 25872
## 3 Afghanistan AFG 1992 3395 26309
## 4 Afghanistan AFG 1993 3623 26961
## 5 Afghanistan AFG 1994 3788 27658
## 6 Afghanistan AFG 1995 3869 28090
## Diet.high.in.sodium Diet.low.in.whole.grains Alochol.use Diet.low.in.fruits
## 1 1045 7077 356 3185
## 2 1055 7149 364 3248
## 3 1075 7297 376 3351
## 4 1103 7499 389 3480
## 5 1134 7698 399 3610
## 6 1154 7807 406 3703
## Unsafe.water.source Secondhand.smoke Low.birth.weight Child.wasting
## 1 3702 4794 16135 19546
## 2 4309 4921 17924 20334
## 3 5356 5279 21200 22895
## 4 7152 5734 23795 27002
## 5 7192 6050 24866 29205
## 6 8378 6167 25534 30943
## Unsafe.sex Diet.low.in.nuts.and.seeds
## 1 351 2319
## 2 361 2449
## 3 378 2603
## 4 395 2771
## 5 410 2932
## 6 422 3049
## Household.air.pollution.from.solid.fuels Diet.low.in.Vegetables
## 1 34372 3679
## 2 35392 3732
## 3 38065 3827
## 4 41154 3951
## 5 43153 4075
## 6 44024 4153
## Low.physical.activity Smoking High.fasting.plasma.glucose Air.pollution
## 1 2637 5174 11449 37231
## 2 2652 5247 11811 38315
## 3 2688 5363 12265 41172
## 4 2744 5522 12821 44488
## 5 2805 5689 13400 46634
## 6 2839 5801 13871 47566
## High.body.mass.index Unsafe.sanitation No.access.to.handwashing.facility
## 1 9518 2798 4825
## 2 9489 3254 5127
## 3 9528 4042 5889
## 4 9611 5392 7007
## 5 9675 5418 7421
## 6 9608 6313 7896
## Drug.use Low.bone.mineral.density Vitamin.A.deficiency Child.stunting
## 1 174 389 2016 7686
## 2 188 389 2056 7886
## 3 211 393 2100 8568
## 4 232 411 2316 9875
## 5 247 413 2665 11031
## 6 260 417 3070 11973
## Discontinued.breastfeeding Non.exclusive.breastfeeding Iron.deficiency
## 1 107 2216 564
## 2 121 2501 611
## 3 150 3053 700
## 4 204 3726 773
## 5 204 3833 812
## 6 233 4124 848
# Number of distinct `Entity` values in the data
n_distinct(data$Entity)
## [1] 228
# Number of distinct years covered by the data
n_distinct(data$Year)
## [1] 30
# Visualize some basic data
library(ggplot2)
library(dplyr)
# Line plot: yearly trend of deaths attributed to alcohol use in Afghanistan.
# Keep only the Afghanistan rows.
afghanistan_data <- data %>% filter(Entity == "Afghanistan")
# `Alochol.use` is the column name exactly as it is spelled in the source CSV,
# so the misspelling must stay in the aesthetic mapping.
ggplot(afghanistan_data, aes(x = Year, y = Alochol.use)) +
  # `linewidth` replaces `size` for lines (`size` is deprecated for lines
  # since ggplot2 3.4.0 and triggers a lifecycle warning).
  geom_line(color = "blue", linewidth = 1) +
  ggtitle("Yearly Trend of Deaths from Alcohol Use in Afghanistan") +
  xlab("Year") +
  ylab("Deaths from Alcohol Use") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Notes: Yearly Trend of Deaths from Alcohol Use in Afghanistan: This line plot shows the trend of deaths caused by alcohol use over the years. We can observe how the number of deaths in Afghanistan has fluctuated from 1990 onward.
# Bar chart: deaths from secondhand smoke in 1990 for a handful of countries
# Countries to compare
countries <- c("Afghanistan", "India", "China", "United States", "Germany")
# Keep the 1990 rows, then narrow them down to the selected countries
subset_data <- data %>%
  filter(Year == 1990) %>%
  filter(Entity %in% countries)
# One bar per country; bar height is the value stored in the data
subset_data %>%
  ggplot(aes(x = Entity, y = Secondhand.smoke, fill = Entity)) +
  geom_col() +
  labs(
    title = "Comparison of Deaths from Secondhand Smoke (1990)",
    x = "Country",
    y = "Deaths from Secondhand Smoke"
  ) +
  theme_minimal()

##notes:Comparison of Deaths from Secondhand Smoke (1990): This bar chart compares deaths from secondhand smoke across five different countries in 1990. It helps in understanding how this risk factor impacts different nations.
# Faceted plot: total deaths per year for the five largest countries.
# Assumption: only the columns listed in `death_cols` (defined after this
# listing) contribute to 'total deaths'.
# Re-inspect the available columns before choosing them.
str(data)
## 'data.frame': 6840 obs. of 31 variables:
## $ Entity : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Code : chr "AFG" "AFG" "AFG" "AFG" ...
## $ Year : int 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 ...
## $ Outdoor.air.pollution : int 3169 3222 3395 3623 3788 3869 3943 4024 4040 4042 ...
## $ High.systolic.blood.pressure : int 25633 25872 26309 26961 27658 28090 28587 29021 29349 29712 ...
## $ Diet.high.in.sodium : int 1045 1055 1075 1103 1134 1154 1178 1202 1222 1242 ...
## $ Diet.low.in.whole.grains : int 7077 7149 7297 7499 7698 7807 7943 8075 8173 8265 ...
## $ Alochol.use : int 356 364 376 389 399 406 413 420 425 426 ...
## $ Diet.low.in.fruits : int 3185 3248 3351 3480 3610 3703 3819 3938 4038 4127 ...
## $ Unsafe.water.source : int 3702 4309 5356 7152 7192 8378 8487 9348 9788 9931 ...
## $ Secondhand.smoke : int 4794 4921 5279 5734 6050 6167 6298 6425 6402 6323 ...
## $ Low.birth.weight : int 16135 17924 21200 23795 24866 25534 25997 26246 25805 25080 ...
## $ Child.wasting : int 19546 20334 22895 27002 29205 30943 31628 32736 32760 32271 ...
## $ Unsafe.sex : int 351 361 378 395 410 422 435 448 458 469 ...
## $ Diet.low.in.nuts.and.seeds : int 2319 2449 2603 2771 2932 3049 3173 3298 3401 3482 ...
## $ Household.air.pollution.from.solid.fuels: int 34372 35392 38065 41154 43153 44024 45005 46017 46055 45681 ...
## $ Diet.low.in.Vegetables : int 3679 3732 3827 3951 4075 4153 4247 4339 4409 4473 ...
## $ Low.physical.activity : int 2637 2652 2688 2744 2805 2839 2878 2914 2944 2976 ...
## $ Smoking : int 5174 5247 5363 5522 5689 5801 5934 6066 6178 6288 ...
## $ High.fasting.plasma.glucose : int 11449 11811 12265 12821 13400 13871 14413 14970 15502 16058 ...
## $ Air.pollution : int 37231 38315 41172 44488 46634 47566 48617 49703 49746 49349 ...
## $ High.body.mass.index : int 9518 9489 9528 9611 9675 9608 9503 9286 9024 8857 ...
## $ Unsafe.sanitation : int 2798 3254 4042 5392 5418 6313 6393 7038 7366 7468 ...
## $ No.access.to.handwashing.facility : int 4825 5127 5889 7007 7421 7896 8098 8507 8560 8424 ...
## $ Drug.use : int 174 188 211 232 247 260 274 287 295 302 ...
## $ Low.bone.mineral.density : int 389 389 393 411 413 417 423 425 429 428 ...
## $ Vitamin.A.deficiency : int 2016 2056 2100 2316 2665 3070 3214 3228 3413 3662 ...
## $ Child.stunting : int 7686 7886 8568 9875 11031 11973 12426 12805 13011 13052 ...
## $ Discontinued.breastfeeding : int 107 121 150 204 204 233 233 255 264 263 ...
## $ Non.exclusive.breastfeeding : int 2216 2501 3053 3726 3833 4124 4183 4393 4417 4326 ...
## $ Iron.deficiency : int 564 611 700 773 812 848 883 914 924 909 ...
# Columns that are summed into `Total_Deaths`.
# NOTE: only these four columns are included, so `Total_Deaths` is not the sum
# over every cause-of-death column in the dataset.
death_cols <- c("Alochol.use", "Secondhand.smoke", "Smoking", "Drug.use")
# Add the per-row total. rowSums() is vectorised, which avoids the per-row
# overhead of rowwise() + c_across(). as_tibble() and as.integer() keep the
# result a tibble with an integer `Total_Deaths` column, as before.
data <- data %>%
  as_tibble() %>%
  mutate(
    Total_Deaths = as.integer(
      rowSums(pick(all_of(death_cols)), na.rm = TRUE)
    )
  )
# The five countries to plot
biggest_countries <- c("China", "India", "United States", "Indonesia", "Brazil")
# Keep only the rows for those countries
filtered_data <- data %>%
  filter(Entity %in% biggest_countries)
# One panel per country; `scales = "free_y"` gives every panel its own y range.
# NOTE(review): `Total_Deaths` is the sum of the columns in `death_cols` only,
# so the "Total Deaths" labels describe that subset, not all causes.
ggplot(filtered_data, aes(x = Year, y = Total_Deaths)) +
  # `linewidth` replaces `size` for lines (`size` is deprecated for lines
  # since ggplot2 3.4.0).
  geom_line(color = "blue", linewidth = 1) +
  facet_wrap(~Entity, scales = "free_y") +
  ggtitle("Total Deaths Per Year for the Five Largest Countries") +
  xlab("Year") +
  ylab("Total Deaths") +
  theme_minimal()

## Notes: The facet plot shows a consistent upward trend in total deaths across Brazil, China, India, Indonesia, and the United States from 1990 to 2019, reflecting increasing health burdens in these populous nations. Note that "total deaths" here is the sum of only four risk factors (alcohol use, secondhand smoke, smoking, and drug use), not deaths from all causes. Brazil, China, India, and Indonesia exhibit a steady rise, with notable acceleration after 2010, likely driven by factors such as population growth and rising exposure to these risk factors. In contrast, the United States displays a peak around 2005, followed by a decline and then a resurgence toward 2019, possibly reflecting healthcare improvements and later emerging health challenges. These trends underscore the growing need for targeted health interventions to address key risk factors and improve overall public health outcomes in these countries.
#Heatmap for Correlation Between Various Causes of Death
library(reshape2)
## Warning: 程辑包'reshape2'是用R版本4.2.3 来建造的
##
## 载入程辑包:'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(corrplot)
## corrplot 0.94 loaded
# Correlation matrix over the cause-of-death columns.
# Drop the identifier columns and the derived `Total_Deaths` column: it is a
# sum of other columns rather than a cause of death, so it does not belong in
# a cause-vs-cause heatmap. any_of() makes the exclusion a no-op when
# `Total_Deaths` has not been added yet.
numeric_data <- data %>%
  select(-Entity, -Code, -Year, -any_of("Total_Deaths"))
# Only rows without missing values enter the correlation.
cor_matrix <- cor(numeric_data, use = "complete.obs")
# Draw the matrix as a colour-coded heatmap
corrplot(cor_matrix, method = "color", tl.cex = 0.7, number.cex = 0.7,
         title = "Correlation Between Various Causes of Death",
         mar = c(0, 0, 1, 0))

##notes:Correlation Between Various Causes of Death: The heatmap shows the correlation between different causes of death. Strong correlations may indicate that certain factors are linked and could influence each other in terms of health outcomes.
# b. Data cleaning and transformation:
# Count the missing values in every column
vapply(data, function(column) sum(is.na(column)), integer(1))
## Entity
## 0
## Code
## 0
## Year
## 0
## Outdoor.air.pollution
## 0
## High.systolic.blood.pressure
## 0
## Diet.high.in.sodium
## 0
## Diet.low.in.whole.grains
## 0
## Alochol.use
## 0
## Diet.low.in.fruits
## 0
## Unsafe.water.source
## 0
## Secondhand.smoke
## 0
## Low.birth.weight
## 0
## Child.wasting
## 0
## Unsafe.sex
## 0
## Diet.low.in.nuts.and.seeds
## 0
## Household.air.pollution.from.solid.fuels
## 0
## Diet.low.in.Vegetables
## 0
## Low.physical.activity
## 0
## Smoking
## 0
## High.fasting.plasma.glucose
## 0
## Air.pollution
## 0
## High.body.mass.index
## 0
## Unsafe.sanitation
## 0
## No.access.to.handwashing.facility
## 0
## Drug.use
## 0
## Low.bone.mineral.density
## 0
## Vitamin.A.deficiency
## 0
## Child.stunting
## 0
## Discontinued.breastfeeding
## 0
## Non.exclusive.breastfeeding
## 0
## Iron.deficiency
## 0
## Total_Deaths
## 0
# Distinct per-column NA counts (a single 0 means no column has missing values)
unique(vapply(data, function(column) sum(is.na(column)), integer(1)))
## [1] 0
# Handle missing values by dropping every row that contains an NA.
# The per-column NA counts printed above are all 0, so no rows are expected
# to be removed here.
data_clean <- na.omit(data)
# Structure of the original data, for comparison with `data_clean`.
str(data)
## tibble [6,840 × 32] (S3: tbl_df/tbl/data.frame)
## $ Entity : chr [1:6840] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Code : chr [1:6840] "AFG" "AFG" "AFG" "AFG" ...
## $ Year : int [1:6840] 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 ...
## $ Outdoor.air.pollution : int [1:6840] 3169 3222 3395 3623 3788 3869 3943 4024 4040 4042 ...
## $ High.systolic.blood.pressure : int [1:6840] 25633 25872 26309 26961 27658 28090 28587 29021 29349 29712 ...
## $ Diet.high.in.sodium : int [1:6840] 1045 1055 1075 1103 1134 1154 1178 1202 1222 1242 ...
## $ Diet.low.in.whole.grains : int [1:6840] 7077 7149 7297 7499 7698 7807 7943 8075 8173 8265 ...
## $ Alochol.use : int [1:6840] 356 364 376 389 399 406 413 420 425 426 ...
## $ Diet.low.in.fruits : int [1:6840] 3185 3248 3351 3480 3610 3703 3819 3938 4038 4127 ...
## $ Unsafe.water.source : int [1:6840] 3702 4309 5356 7152 7192 8378 8487 9348 9788 9931 ...
## $ Secondhand.smoke : int [1:6840] 4794 4921 5279 5734 6050 6167 6298 6425 6402 6323 ...
## $ Low.birth.weight : int [1:6840] 16135 17924 21200 23795 24866 25534 25997 26246 25805 25080 ...
## $ Child.wasting : int [1:6840] 19546 20334 22895 27002 29205 30943 31628 32736 32760 32271 ...
## $ Unsafe.sex : int [1:6840] 351 361 378 395 410 422 435 448 458 469 ...
## $ Diet.low.in.nuts.and.seeds : int [1:6840] 2319 2449 2603 2771 2932 3049 3173 3298 3401 3482 ...
## $ Household.air.pollution.from.solid.fuels: int [1:6840] 34372 35392 38065 41154 43153 44024 45005 46017 46055 45681 ...
## $ Diet.low.in.Vegetables : int [1:6840] 3679 3732 3827 3951 4075 4153 4247 4339 4409 4473 ...
## $ Low.physical.activity : int [1:6840] 2637 2652 2688 2744 2805 2839 2878 2914 2944 2976 ...
## $ Smoking : int [1:6840] 5174 5247 5363 5522 5689 5801 5934 6066 6178 6288 ...
## $ High.fasting.plasma.glucose : int [1:6840] 11449 11811 12265 12821 13400 13871 14413 14970 15502 16058 ...
## $ Air.pollution : int [1:6840] 37231 38315 41172 44488 46634 47566 48617 49703 49746 49349 ...
## $ High.body.mass.index : int [1:6840] 9518 9489 9528 9611 9675 9608 9503 9286 9024 8857 ...
## $ Unsafe.sanitation : int [1:6840] 2798 3254 4042 5392 5418 6313 6393 7038 7366 7468 ...
## $ No.access.to.handwashing.facility : int [1:6840] 4825 5127 5889 7007 7421 7896 8098 8507 8560 8424 ...
## $ Drug.use : int [1:6840] 174 188 211 232 247 260 274 287 295 302 ...
## $ Low.bone.mineral.density : int [1:6840] 389 389 393 411 413 417 423 425 429 428 ...
## $ Vitamin.A.deficiency : int [1:6840] 2016 2056 2100 2316 2665 3070 3214 3228 3413 3662 ...
## $ Child.stunting : int [1:6840] 7686 7886 8568 9875 11031 11973 12426 12805 13011 13052 ...
## $ Discontinued.breastfeeding : int [1:6840] 107 121 150 204 204 233 233 255 264 263 ...
## $ Non.exclusive.breastfeeding : int [1:6840] 2216 2501 3053 3726 3833 4124 4183 4393 4417 4326 ...
## $ Iron.deficiency : int [1:6840] 564 611 700 773 812 848 883 914 924 909 ...
## $ Total_Deaths : int [1:6840] 10498 10720 11229 11877 12385 12634 12919 13198 13300 13339 ...
# Structure of the data after na.omit().
str(data_clean)
## tibble [6,840 × 32] (S3: tbl_df/tbl/data.frame)
## $ Entity : chr [1:6840] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Code : chr [1:6840] "AFG" "AFG" "AFG" "AFG" ...
## $ Year : int [1:6840] 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 ...
## $ Outdoor.air.pollution : int [1:6840] 3169 3222 3395 3623 3788 3869 3943 4024 4040 4042 ...
## $ High.systolic.blood.pressure : int [1:6840] 25633 25872 26309 26961 27658 28090 28587 29021 29349 29712 ...
## $ Diet.high.in.sodium : int [1:6840] 1045 1055 1075 1103 1134 1154 1178 1202 1222 1242 ...
## $ Diet.low.in.whole.grains : int [1:6840] 7077 7149 7297 7499 7698 7807 7943 8075 8173 8265 ...
## $ Alochol.use : int [1:6840] 356 364 376 389 399 406 413 420 425 426 ...
## $ Diet.low.in.fruits : int [1:6840] 3185 3248 3351 3480 3610 3703 3819 3938 4038 4127 ...
## $ Unsafe.water.source : int [1:6840] 3702 4309 5356 7152 7192 8378 8487 9348 9788 9931 ...
## $ Secondhand.smoke : int [1:6840] 4794 4921 5279 5734 6050 6167 6298 6425 6402 6323 ...
## $ Low.birth.weight : int [1:6840] 16135 17924 21200 23795 24866 25534 25997 26246 25805 25080 ...
## $ Child.wasting : int [1:6840] 19546 20334 22895 27002 29205 30943 31628 32736 32760 32271 ...
## $ Unsafe.sex : int [1:6840] 351 361 378 395 410 422 435 448 458 469 ...
## $ Diet.low.in.nuts.and.seeds : int [1:6840] 2319 2449 2603 2771 2932 3049 3173 3298 3401 3482 ...
## $ Household.air.pollution.from.solid.fuels: int [1:6840] 34372 35392 38065 41154 43153 44024 45005 46017 46055 45681 ...
## $ Diet.low.in.Vegetables : int [1:6840] 3679 3732 3827 3951 4075 4153 4247 4339 4409 4473 ...
## $ Low.physical.activity : int [1:6840] 2637 2652 2688 2744 2805 2839 2878 2914 2944 2976 ...
## $ Smoking : int [1:6840] 5174 5247 5363 5522 5689 5801 5934 6066 6178 6288 ...
## $ High.fasting.plasma.glucose : int [1:6840] 11449 11811 12265 12821 13400 13871 14413 14970 15502 16058 ...
## $ Air.pollution : int [1:6840] 37231 38315 41172 44488 46634 47566 48617 49703 49746 49349 ...
## $ High.body.mass.index : int [1:6840] 9518 9489 9528 9611 9675 9608 9503 9286 9024 8857 ...
## $ Unsafe.sanitation : int [1:6840] 2798 3254 4042 5392 5418 6313 6393 7038 7366 7468 ...
## $ No.access.to.handwashing.facility : int [1:6840] 4825 5127 5889 7007 7421 7896 8098 8507 8560 8424 ...
## $ Drug.use : int [1:6840] 174 188 211 232 247 260 274 287 295 302 ...
## $ Low.bone.mineral.density : int [1:6840] 389 389 393 411 413 417 423 425 429 428 ...
## $ Vitamin.A.deficiency : int [1:6840] 2016 2056 2100 2316 2665 3070 3214 3228 3413 3662 ...
## $ Child.stunting : int [1:6840] 7686 7886 8568 9875 11031 11973 12426 12805 13011 13052 ...
## $ Discontinued.breastfeeding : int [1:6840] 107 121 150 204 204 233 233 255 264 263 ...
## $ Non.exclusive.breastfeeding : int [1:6840] 2216 2501 3053 3726 3833 4124 4183 4393 4417 4326 ...
## $ Iron.deficiency : int [1:6840] 564 611 700 773 812 848 883 914 924 909 ...
## $ Total_Deaths : int [1:6840] 10498 10720 11229 11877 12385 12634 12919 13198 13300 13339 ...
# Alternatively, missing numeric values could be filled with the column mean; this is not applicable here because the dataset has no missing values.
# Data distribution (skewness) visualization
# Histogram of `Total_Deaths` to check how the combined death counts are
# distributed.
ggplot(data_clean, aes(x = Total_Deaths)) +
  # 30 bins is what stat_bin() falls back to; stating it explicitly keeps the
  # same plot and silences the "Pick better value with `binwidth`" message.
  geom_histogram(bins = 30, fill = "blue", color = "black") +
  ggtitle("Distribution of Total Deaths from Alcohol Use, Smoking, Secondhand Smoking, and Drug Use") +
  xlab("Total deaths from alcohol use, smoking, secondhand smoking, and drug use") +
  ylab("Frequency") +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Visualizing outliers
# Single boxplot of `Total_Deaths` over all rows (no x grouping).
ggplot(data_clean, aes(y = Total_Deaths)) +
  geom_boxplot() +
  ggtitle("Boxplot for Total Deaths from Alcohol Use, Smoking, Secondhand Smoking, and Drug Use") +
  ylab("Total Deaths")

# Need to normalize or scale the data for some algorithms
library(dplyr)
# Keep the measurement columns by dropping the identifiers 'Entity', 'Code',
# and 'Year'
numeric_columns <- select(data_clean, !c(Entity, Code, Year))
# Min-max normalization: rescale a numeric vector to the range [0, 1].
#
# Arguments:
#   x      Numeric vector to rescale.
#   na.rm  If TRUE (the default), NA values are ignored when the range is
#          computed and stay NA in the result. If FALSE, any NA in `x` makes
#          the whole result NA.
#
# Returns a numeric vector of the same length as `x`.
normalize <- function(x, na.rm = TRUE) {
  # Empty or all-NA input has no range to scale by; return it unchanged
  # (as numeric) instead of emitting min()/max() warnings.
  if (all(is.na(x))) {
    return(as.numeric(x))
  }
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  # A constant vector has zero range; map it to 0 instead of NaN (0 / 0).
  if (!is.na(span) && span == 0) {
    return(x - rng[1])
  }
  (x - rng[1]) / span
}
# Apply normalize() to every measurement column and keep the identifier
# columns (Entity, Code, Year) unchanged at the front.
# Everything is derived from `data_clean` (not `data`), so the identifiers
# stay row-aligned with the normalized values even if na.omit() dropped rows.
data_clean_normalized <- data_clean %>%
  mutate(across(!c(Entity, Code, Year), normalize)) %>%
  as.data.frame()
# Preview the first rows of the normalized data
head(data_clean_normalized)
## Entity Code Year Outdoor.air.pollution High.systolic.blood.pressure
## 1 Afghanistan AFG 1990 0.0007032544 0.002363264
## 2 Afghanistan AFG 1991 0.0007150160 0.002385301
## 3 Afghanistan AFG 1992 0.0007534076 0.002425594
## 4 Afghanistan AFG 1993 0.0008040046 0.002485710
## 5 Afghanistan AFG 1994 0.0008406209 0.002549976
## 6 Afghanistan AFG 1995 0.0008585962 0.002589808
## Diet.high.in.sodium Diet.low.in.whole.grains Alochol.use Diet.low.in.fruits
## 1 0.0005542720 0.003836113 0.0001457838 0.003044889
## 2 0.0005595760 0.003875141 0.0001490598 0.003105118
## 3 0.0005701841 0.003955365 0.0001539739 0.003203587
## 4 0.0005850354 0.004064860 0.0001592974 0.003326912
## 5 0.0006014779 0.004172729 0.0001633925 0.003451193
## 6 0.0006120860 0.004231812 0.0001662590 0.003540102
## Unsafe.water.source Secondhand.smoke Low.birth.weight Child.wasting
## 1 0.001510438 0.003674720 0.005319070 0.005697841
## 2 0.001758098 0.003772089 0.005908832 0.005927551
## 3 0.002185280 0.004046562 0.006988800 0.006674106
## 4 0.002918059 0.004395404 0.007844268 0.007871335
## 5 0.002934380 0.004637676 0.008197335 0.008513530
## 6 0.003418275 0.004727378 0.008417548 0.009020173
## Unsafe.sex Diet.low.in.nuts.and.seeds
## 1 0.0002108345 0.004032069
## 2 0.0002168412 0.004258101
## 3 0.0002270525 0.004525862
## 4 0.0002372639 0.004817966
## 5 0.0002462739 0.005097898
## 6 0.0002534819 0.005301327
## Household.air.pollution.from.solid.fuels Diet.low.in.Vegetables
## 1 0.007886717 0.006949626
## 2 0.008120758 0.007049743
## 3 0.008734082 0.007229198
## 4 0.009442859 0.007463434
## 5 0.009901533 0.007697670
## 6 0.010101386 0.007845011
## Low.physical.activity Smoking High.fasting.plasma.glucose Air.pollution
## 1 0.003171369 0.0006723974 0.001760545 0.005580403
## 2 0.003189409 0.0006818861 0.001816226 0.005742880
## 3 0.003232704 0.0006969640 0.001886057 0.006171104
## 4 0.003300052 0.0007176312 0.001971577 0.006668126
## 5 0.003373413 0.0007393382 0.002060635 0.006989781
## 6 0.003414303 0.0007538962 0.002133081 0.007129474
## High.body.mass.index Unsafe.sanitation No.access.to.handwashing.facility
## 1 0.001895860 0.001518774 0.004019664
## 2 0.001890082 0.001766294 0.004271258
## 3 0.001897852 0.002194026 0.004906073
## 4 0.001914388 0.002926816 0.005837469
## 5 0.001927139 0.002940929 0.006182369
## 6 0.001913791 0.003426741 0.006578087
## Drug.use Low.bone.mineral.density Vitamin.A.deficiency Child.stunting
## 1 0.0003518763 0.0008883631 0.009713088 0.009221920
## 2 0.0003801882 0.0008883631 0.009905808 0.009461887
## 3 0.0004267005 0.0008974980 0.010117800 0.010280173
## 4 0.0004691684 0.0009386047 0.011158488 0.011848355
## 5 0.0004995025 0.0009431722 0.012839970 0.013235363
## 6 0.0005257921 0.0009523070 0.014791260 0.014365606
## Discontinued.breastfeeding Non.exclusive.breastfeeding Iron.deficiency
## 1 0.003232043 0.004384039 0.007677543
## 2 0.003654927 0.004947870 0.008317338
## 3 0.004530901 0.006039923 0.009528866
## 4 0.006162025 0.007371357 0.010522590
## 5 0.006162025 0.007583042 0.011053484
## 6 0.007037999 0.008158743 0.011543540
## Total_Deaths
## 1 0.0008794930
## 2 0.0008980950
## 3 0.0009407458
## 4 0.0009950437
## 5 0.0010376106
## 6 0.0010584751
# Outlier check on a normalized column
# Single boxplot of the normalized `Outdoor.air.pollution` values
ggplot(data = data_clean_normalized, mapping = aes(y = Outdoor.air.pollution)) +
  geom_boxplot() +
  labs(
    title = "Boxplot for Outdoor Air Pollution Deaths",
    y = "Outdoor Air Pollution Deaths"
  )

# Load GDP per capita, the socio-economic indicator that the death rates will
# be compared against
GDPpC <- read.csv(file = "API_NY.GDP.PCAP.CD_DS2_en_csv_v2_31681.csv")
# Inspect the columns: country/indicator identifiers plus one column per year
str(object = GDPpC)
## 'data.frame': 266 obs. of 68 variables:
## $ Country.Name : chr "Aruba" "Africa Eastern and Southern" "Afghanistan" "Africa Western and Central" ...
## $ Country.Code : chr "ABW" "AFE" "AFG" "AFW" ...
## $ Indicator.Name: chr "GDP per capita (current US$)" "GDP per capita (current US$)" "GDP per capita (current US$)" "GDP per capita (current US$)" ...
## $ Indicator.Code: chr "NY.GDP.PCAP.CD" "NY.GDP.PCAP.CD" "NY.GDP.PCAP.CD" "NY.GDP.PCAP.CD" ...
## $ X1960 : num NA 162 NA 122 NA ...
## $ X1961 : num NA 166 NA 128 NA ...
## $ X1962 : num NA 172 NA 134 NA ...
## $ X1963 : num NA 182 NA 139 NA ...
## $ X1964 : num NA 193 NA 149 NA ...
## $ X1965 : num NA 203 NA 156 NA ...
## $ X1966 : num NA 215 NA 163 NA ...
## $ X1967 : num NA 227 NA 146 NA ...
## $ X1968 : num NA 238 NA 147 NA ...
## $ X1969 : num NA 256 NA 163 NA ...
## $ X1970 : num NA 253 NA 220 NA ...
## $ X1971 : num NA 268 NA 197 NA ...
## $ X1972 : num NA 283 NA 232 NA ...
## $ X1973 : num NA 354 NA 283 NA ...
## $ X1974 : num NA 422 NA 371 NA ...
## $ X1975 : num NA 436 NA 416 NA ...
## $ X1976 : num NA 430 NA 484 NA ...
## $ X1977 : num NA 468 NA 495 NA ...
## $ X1978 : num NA 509 NA 528 NA ...
## $ X1979 : num NA 579 NA 630 NA ...
## $ X1980 : num NA 728 NA 764 NA ...
## $ X1981 : num NA 747 NA 1336 NA ...
## $ X1982 : num NA 690 NA 1173 NA ...
## $ X1983 : num NA 712 NA 881 NA ...
## $ X1984 : num NA 639 NA 745 NA ...
## $ X1985 : num NA 535 NA 762 NA ...
## $ X1986 : num 6283 561 NA 588 NA ...
## $ X1987 : num 7567 653 NA 589 NA ...
## $ X1988 : num 9275 690 NA 568 NA ...
## $ X1989 : num 10767 712 NA 516 NA ...
## $ X1990 : num 11639 811 NA 598 NA ...
## $ X1991 : num 12850 859 NA 612 NA ...
## $ X1992 : num 13658 732 NA 572 NA ...
## $ X1993 : num 14970 716 NA 580 451 ...
## $ X1994 : num 16675 707 NA 587 329 ...
## $ X1995 : num 17140 774 NA 878 398 ...
## $ X1996 : num 17375 751 NA 1084 454 ...
## $ X1997 : num 18713 775 NA 1109 516 ...
## $ X1998 : num 19742 704 NA 1159 423 ...
## $ X1999 : num 19834 678 NA 532 388 ...
## $ X2000 : num 21026 715 180 527 557 ...
## $ X2001 : num 20911 633 143 539 527 ...
## $ X2002 : num 21375 634 182 627 873 ...
## $ X2003 : num 22051 820 200 706 983 ...
## $ X2004 : num 24106 994 222 850 1255 ...
## $ X2005 : num 24978 1130 254 1008 1901 ...
## $ X2006 : num 25833 1236 274 1246 2598 ...
## $ X2007 : num 27665 1380 376 1421 3121 ...
## $ X2008 : num 29012 1439 383 1686 4082 ...
## $ X2009 : num 25741 1405 453 1468 3124 ...
## $ X2010 : num 24453 1623 562 1680 3587 ...
## $ X2011 : num 26043 1758 609 1862 4608 ...
## $ X2012 : num 25611 1724 653 1958 5084 ...
## $ X2013 : num 26515 1696 639 2154 5061 ...
## $ X2014 : num 26940 1679 627 2249 5012 ...
## $ X2015 : num 28419 1499 567 1883 3217 ...
## $ X2016 : num 28450 1346 523 1649 1810 ...
## $ X2017 : num 29329 1486 526 1591 2439 ...
## $ X2018 : num 30918 1559 492 1735 2541 ...
## $ X2019 : num 31903 1508 498 1814 2191 ...
## $ X2020 : num 24008 1356 512 1688 1451 ...
## $ X2021 : num 29128 1546 356 1769 1927 ...
## $ X2022 : num 33301 1642 353 1789 2933 ...
## $ X2023 : num NA 1673 NA 1584 2310 ...
# Reshaping the dataset using melt function
library(reshape2)
# Keep the country code and the yearly columns; drop the descriptive columns
GDPpC_adj <- select(GDPpC, !c(Country.Name, Indicator.Name, Indicator.Code))
# Confirm which columns remain
str(object = GDPpC_adj)
## 'data.frame': 266 obs. of 65 variables:
## $ Country.Code: chr "ABW" "AFE" "AFG" "AFW" ...
## $ X1960 : num NA 162 NA 122 NA ...
## $ X1961 : num NA 166 NA 128 NA ...
## $ X1962 : num NA 172 NA 134 NA ...
## $ X1963 : num NA 182 NA 139 NA ...
## $ X1964 : num NA 193 NA 149 NA ...
## $ X1965 : num NA 203 NA 156 NA ...
## $ X1966 : num NA 215 NA 163 NA ...
## $ X1967 : num NA 227 NA 146 NA ...
## $ X1968 : num NA 238 NA 147 NA ...
## $ X1969 : num NA 256 NA 163 NA ...
## $ X1970 : num NA 253 NA 220 NA ...
## $ X1971 : num NA 268 NA 197 NA ...
## $ X1972 : num NA 283 NA 232 NA ...
## $ X1973 : num NA 354 NA 283 NA ...
## $ X1974 : num NA 422 NA 371 NA ...
## $ X1975 : num NA 436 NA 416 NA ...
## $ X1976 : num NA 430 NA 484 NA ...
## $ X1977 : num NA 468 NA 495 NA ...
## $ X1978 : num NA 509 NA 528 NA ...
## $ X1979 : num NA 579 NA 630 NA ...
## $ X1980 : num NA 728 NA 764 NA ...
## $ X1981 : num NA 747 NA 1336 NA ...
## $ X1982 : num NA 690 NA 1173 NA ...
## $ X1983 : num NA 712 NA 881 NA ...
## $ X1984 : num NA 639 NA 745 NA ...
## $ X1985 : num NA 535 NA 762 NA ...
## $ X1986 : num 6283 561 NA 588 NA ...
## $ X1987 : num 7567 653 NA 589 NA ...
## $ X1988 : num 9275 690 NA 568 NA ...
## $ X1989 : num 10767 712 NA 516 NA ...
## $ X1990 : num 11639 811 NA 598 NA ...
## $ X1991 : num 12850 859 NA 612 NA ...
## $ X1992 : num 13658 732 NA 572 NA ...
## $ X1993 : num 14970 716 NA 580 451 ...
## $ X1994 : num 16675 707 NA 587 329 ...
## $ X1995 : num 17140 774 NA 878 398 ...
## $ X1996 : num 17375 751 NA 1084 454 ...
## $ X1997 : num 18713 775 NA 1109 516 ...
## $ X1998 : num 19742 704 NA 1159 423 ...
## $ X1999 : num 19834 678 NA 532 388 ...
## $ X2000 : num 21026 715 180 527 557 ...
## $ X2001 : num 20911 633 143 539 527 ...
## $ X2002 : num 21375 634 182 627 873 ...
## $ X2003 : num 22051 820 200 706 983 ...
## $ X2004 : num 24106 994 222 850 1255 ...
## $ X2005 : num 24978 1130 254 1008 1901 ...
## $ X2006 : num 25833 1236 274 1246 2598 ...
## $ X2007 : num 27665 1380 376 1421 3121 ...
## $ X2008 : num 29012 1439 383 1686 4082 ...
## $ X2009 : num 25741 1405 453 1468 3124 ...
## $ X2010 : num 24453 1623 562 1680 3587 ...
## $ X2011 : num 26043 1758 609 1862 4608 ...
## $ X2012 : num 25611 1724 653 1958 5084 ...
## $ X2013 : num 26515 1696 639 2154 5061 ...
## $ X2014 : num 26940 1679 627 2249 5012 ...
## $ X2015 : num 28419 1499 567 1883 3217 ...
## $ X2016 : num 28450 1346 523 1649 1810 ...
## $ X2017 : num 29329 1486 526 1591 2439 ...
## $ X2018 : num 30918 1559 492 1735 2541 ...
## $ X2019 : num 31903 1508 498 1814 2191 ...
## $ X2020 : num 24008 1356 512 1688 1451 ...
## $ X2021 : num 29128 1546 356 1769 1927 ...
## $ X2022 : num 33301 1642 353 1789 2933 ...
## $ X2023 : num NA 1673 NA 1584 2310 ...
# Wide -> long: one row per (Country.Code, Year) pair.
GDPpC_long <- melt(
  GDPpC_adj,
  id.vars = "Country.Code",        # identifier column kept as-is
  variable.name = "Year",          # former column names (X1960, X1961, ...)
  value.name = "GDP_per_Capita"    # the cell values
)
str(GDPpC_long)
## 'data.frame': 17024 obs. of 3 variables:
## $ Country.Code : chr "ABW" "AFE" "AFG" "AFW" ...
## $ Year : Factor w/ 64 levels "X1960","X1961",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ GDP_per_Capita: num NA 162 NA 122 NA ...
# Year is a factor of labels such as "X1960": strip the "X" and store the
# remaining digits as a numeric year.
year_labels <- as.character(GDPpC_long$Year)
GDPpC_long$Year <- as.numeric(gsub("X", "", year_labels, fixed = TRUE))
rm(year_labels)
head(GDPpC_long)
## Country.Code Year GDP_per_Capita
## 1 ABW 1960 NA
## 2 AFE 1960 162.3425
## 3 AFG 1960 NA
## 4 AFW 1960 122.1939
## 5 AGO 1960 NA
## 6 ALB 1960 NA
# Join the death-cause data with GDP per capita on country code and year.
# merge() keeps only the keys present in both tables (all = FALSE by default).
library(dplyr)
merged_data <- merge(
  x = data_clean,
  y = GDPpC_long,
  by.x = c("Code", "Year"),
  by.y = c("Country.Code", "Year")
)
str(merged_data)
## 'data.frame': 6000 obs. of 33 variables:
## $ Code : chr "AFG" "AFG" "AFG" "AFG" ...
## $ Year : int 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 ...
## $ Entity : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Outdoor.air.pollution : int 3169 3222 3395 3623 3788 3869 3943 4024 4040 4042 ...
## $ High.systolic.blood.pressure : int 25633 25872 26309 26961 27658 28090 28587 29021 29349 29712 ...
## $ Diet.high.in.sodium : int 1045 1055 1075 1103 1134 1154 1178 1202 1222 1242 ...
## $ Diet.low.in.whole.grains : int 7077 7149 7297 7499 7698 7807 7943 8075 8173 8265 ...
## $ Alochol.use : int 356 364 376 389 399 406 413 420 425 426 ...
## $ Diet.low.in.fruits : int 3185 3248 3351 3480 3610 3703 3819 3938 4038 4127 ...
## $ Unsafe.water.source : int 3702 4309 5356 7152 7192 8378 8487 9348 9788 9931 ...
## $ Secondhand.smoke : int 4794 4921 5279 5734 6050 6167 6298 6425 6402 6323 ...
## $ Low.birth.weight : int 16135 17924 21200 23795 24866 25534 25997 26246 25805 25080 ...
## $ Child.wasting : int 19546 20334 22895 27002 29205 30943 31628 32736 32760 32271 ...
## $ Unsafe.sex : int 351 361 378 395 410 422 435 448 458 469 ...
## $ Diet.low.in.nuts.and.seeds : int 2319 2449 2603 2771 2932 3049 3173 3298 3401 3482 ...
## $ Household.air.pollution.from.solid.fuels: int 34372 35392 38065 41154 43153 44024 45005 46017 46055 45681 ...
## $ Diet.low.in.Vegetables : int 3679 3732 3827 3951 4075 4153 4247 4339 4409 4473 ...
## $ Low.physical.activity : int 2637 2652 2688 2744 2805 2839 2878 2914 2944 2976 ...
## $ Smoking : int 5174 5247 5363 5522 5689 5801 5934 6066 6178 6288 ...
## $ High.fasting.plasma.glucose : int 11449 11811 12265 12821 13400 13871 14413 14970 15502 16058 ...
## $ Air.pollution : int 37231 38315 41172 44488 46634 47566 48617 49703 49746 49349 ...
## $ High.body.mass.index : int 9518 9489 9528 9611 9675 9608 9503 9286 9024 8857 ...
## $ Unsafe.sanitation : int 2798 3254 4042 5392 5418 6313 6393 7038 7366 7468 ...
## $ No.access.to.handwashing.facility : int 4825 5127 5889 7007 7421 7896 8098 8507 8560 8424 ...
## $ Drug.use : int 174 188 211 232 247 260 274 287 295 302 ...
## $ Low.bone.mineral.density : int 389 389 393 411 413 417 423 425 429 428 ...
## $ Vitamin.A.deficiency : int 2016 2056 2100 2316 2665 3070 3214 3228 3413 3662 ...
## $ Child.stunting : int 7686 7886 8568 9875 11031 11973 12426 12805 13011 13052 ...
## $ Discontinued.breastfeeding : int 107 121 150 204 204 233 233 255 264 263 ...
## $ Non.exclusive.breastfeeding : int 2216 2501 3053 3726 3833 4124 4183 4393 4417 4326 ...
## $ Iron.deficiency : int 564 611 700 773 812 848 883 914 924 909 ...
## $ Total_Deaths : int 10498 10720 11229 11877 12385 12634 12919 13198 13300 13339 ...
## $ GDP_per_Capita : num NA NA NA NA NA NA NA NA NA NA ...
# Preview the first six rows of the merged table.
head(merged_data, n = 6L)
## Code Year Entity Outdoor.air.pollution High.systolic.blood.pressure
## 1 AFG 1990 Afghanistan 3169 25633
## 2 AFG 1991 Afghanistan 3222 25872
## 3 AFG 1992 Afghanistan 3395 26309
## 4 AFG 1993 Afghanistan 3623 26961
## 5 AFG 1994 Afghanistan 3788 27658
## 6 AFG 1995 Afghanistan 3869 28090
## Diet.high.in.sodium Diet.low.in.whole.grains Alochol.use Diet.low.in.fruits
## 1 1045 7077 356 3185
## 2 1055 7149 364 3248
## 3 1075 7297 376 3351
## 4 1103 7499 389 3480
## 5 1134 7698 399 3610
## 6 1154 7807 406 3703
## Unsafe.water.source Secondhand.smoke Low.birth.weight Child.wasting
## 1 3702 4794 16135 19546
## 2 4309 4921 17924 20334
## 3 5356 5279 21200 22895
## 4 7152 5734 23795 27002
## 5 7192 6050 24866 29205
## 6 8378 6167 25534 30943
## Unsafe.sex Diet.low.in.nuts.and.seeds
## 1 351 2319
## 2 361 2449
## 3 378 2603
## 4 395 2771
## 5 410 2932
## 6 422 3049
## Household.air.pollution.from.solid.fuels Diet.low.in.Vegetables
## 1 34372 3679
## 2 35392 3732
## 3 38065 3827
## 4 41154 3951
## 5 43153 4075
## 6 44024 4153
## Low.physical.activity Smoking High.fasting.plasma.glucose Air.pollution
## 1 2637 5174 11449 37231
## 2 2652 5247 11811 38315
## 3 2688 5363 12265 41172
## 4 2744 5522 12821 44488
## 5 2805 5689 13400 46634
## 6 2839 5801 13871 47566
## High.body.mass.index Unsafe.sanitation No.access.to.handwashing.facility
## 1 9518 2798 4825
## 2 9489 3254 5127
## 3 9528 4042 5889
## 4 9611 5392 7007
## 5 9675 5418 7421
## 6 9608 6313 7896
## Drug.use Low.bone.mineral.density Vitamin.A.deficiency Child.stunting
## 1 174 389 2016 7686
## 2 188 389 2056 7886
## 3 211 393 2100 8568
## 4 232 411 2316 9875
## 5 247 413 2665 11031
## 6 260 417 3070 11973
## Discontinued.breastfeeding Non.exclusive.breastfeeding Iron.deficiency
## 1 107 2216 564
## 2 121 2501 611
## 3 150 3053 700
## 4 204 3726 773
## 5 204 3833 812
## 6 233 4124 848
## Total_Deaths GDP_per_Capita
## 1 10498 NA
## 2 10720 NA
## 3 11229 NA
## 4 11877 NA
## 5 12385 NA
## 6 12634 NA
# Count the missing values in every column before dropping incomplete rows.
vapply(merged_data, function(column) sum(is.na(column)), integer(1))
## Code
## 0
## Year
## 0
## Entity
## 0
## Outdoor.air.pollution
## 0
## High.systolic.blood.pressure
## 0
## Diet.high.in.sodium
## 0
## Diet.low.in.whole.grains
## 0
## Alochol.use
## 0
## Diet.low.in.fruits
## 0
## Unsafe.water.source
## 0
## Secondhand.smoke
## 0
## Low.birth.weight
## 0
## Child.wasting
## 0
## Unsafe.sex
## 0
## Diet.low.in.nuts.and.seeds
## 0
## Household.air.pollution.from.solid.fuels
## 0
## Diet.low.in.Vegetables
## 0
## Low.physical.activity
## 0
## Smoking
## 0
## High.fasting.plasma.glucose
## 0
## Air.pollution
## 0
## High.body.mass.index
## 0
## Unsafe.sanitation
## 0
## No.access.to.handwashing.facility
## 0
## Drug.use
## 0
## Low.bone.mineral.density
## 0
## Vitamin.A.deficiency
## 0
## Child.stunting
## 0
## Discontinued.breastfeeding
## 0
## Non.exclusive.breastfeeding
## 0
## Iron.deficiency
## 0
## Total_Deaths
## 0
## GDP_per_Capita
## 172
# Distinct per-column NA counts (a compact summary of the table above).
unique(vapply(merged_data, function(column) sum(is.na(column)), integer(1)))
## [1] 0 172
# Keep only the complete rows; na.omit() records the dropped row indices in
# the "na.action" attribute of the result.
merged_data_clean <- merged_data %>% na.omit()
str(merged_data_clean)
## 'data.frame': 5828 obs. of 33 variables:
## $ Code : chr "AFG" "AFG" "AFG" "AFG" ...
## $ Year : int 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 ...
## $ Entity : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Outdoor.air.pollution : int 4021 4014 3961 4116 4176 4176 4232 4480 4767 5038 ...
## $ High.systolic.blood.pressure : int 29999 30421 30189 30157 30225 30089 30075 30080 30219 30280 ...
## $ Diet.high.in.sodium : int 1260 1282 1275 1277 1281 1276 1270 1263 1261 1254 ...
## $ Diet.low.in.whole.grains : int 8328 8440 8383 8398 8433 8415 8418 8426 8474 8488 ...
## $ Alochol.use : int 427 432 432 437 445 452 456 465 478 485 ...
## $ Diet.low.in.fruits : int 4174 4226 4184 4179 4188 4166 4142 4108 4088 4050 ...
## $ Unsafe.water.source : int 9942 10052 10004 10841 10761 10118 9081 8168 7245 6437 ...
## $ Secondhand.smoke : int 6227 6214 6103 6341 6383 6272 6153 6010 5868 5752 ...
## $ Low.birth.weight : int 24549 24859 25441 25617 25459 25149 24354 23778 23533 23232 ...
## $ Child.wasting : int 31559 30938 29619 29098 28167 26669 24987 23100 21624 20264 ...
## $ Unsafe.sex : int 482 500 507 518 533 541 549 557 568 576 ...
## $ Diet.low.in.nuts.and.seeds : int 3516 3543 3487 3457 3430 3381 3325 3251 3183 3102 ...
## $ Household.air.pollution.from.solid.fuels: int 45132 45028 44137 44953 44534 43280 41778 40047 38320 36675 ...
## $ Diet.low.in.Vegetables : int 4501 4539 4483 4471 4475 4449 4432 4417 4422 4407 ...
## $ Low.physical.activity : int 3013 3068 3078 3104 3137 3153 3178 3214 3267 3311 ...
## $ Smoking : int 6378 6505 6488 6528 6594 6636 6720 6849 7026 7181 ...
## $ High.fasting.plasma.glucose : int 16545 17144 17467 17921 18397 18652 18917 19183 19498 19730 ...
## $ Air.pollution : int 48763 48660 47732 48687 48337 47107 45691 44161 42696 41348 ...
## $ High.body.mass.index : int 8829 9047 9151 9410 9759 10051 10519 11218 12106 12982 ...
## $ Unsafe.sanitation : int 7472 7547 7497 8106 8028 7532 6740 6050 5350 4732 ...
## $ No.access.to.handwashing.facility : int 8233 8144 7915 8482 8440 8033 7481 6900 6291 5793 ...
## $ Drug.use : int 312 328 337 356 376 390 401 414 430 445 ...
## $ Low.bone.mineral.density : int 431 436 437 433 440 454 454 465 478 493 ...
## $ Vitamin.A.deficiency : int 3855 3944 3798 1902 1330 1243 1231 1226 1555 1704 ...
## $ Child.stunting : int 12965 12732 12069 9862 8928 8380 7885 7303 7148 6867 ...
## $ Discontinued.breastfeeding : int 256 250 242 263 255 233 204 182 165 158 ...
## $ Non.exclusive.breastfeeding : int 4195 4094 3947 3953 3774 3519 3216 2992 2832 2706 ...
## $ Iron.deficiency : int 897 919 937 972 985 1007 992 1008 1008 1001 ...
## $ Total_Deaths : int 13344 13479 13360 13662 13798 13750 13730 13738 13802 13863 ...
## $ GDP_per_Capita : num 180 143 182 200 222 ...
## - attr(*, "na.action")= 'omit' Named int [1:172] 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "names")= chr [1:172] "1" "2" "3" "4" ...
# Structure of the unfiltered merge, for comparison with merged_data_clean.
str(object = merged_data)
## 'data.frame': 6000 obs. of 33 variables:
## $ Code : chr "AFG" "AFG" "AFG" "AFG" ...
## $ Year : int 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 ...
## $ Entity : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Outdoor.air.pollution : int 3169 3222 3395 3623 3788 3869 3943 4024 4040 4042 ...
## $ High.systolic.blood.pressure : int 25633 25872 26309 26961 27658 28090 28587 29021 29349 29712 ...
## $ Diet.high.in.sodium : int 1045 1055 1075 1103 1134 1154 1178 1202 1222 1242 ...
## $ Diet.low.in.whole.grains : int 7077 7149 7297 7499 7698 7807 7943 8075 8173 8265 ...
## $ Alochol.use : int 356 364 376 389 399 406 413 420 425 426 ...
## $ Diet.low.in.fruits : int 3185 3248 3351 3480 3610 3703 3819 3938 4038 4127 ...
## $ Unsafe.water.source : int 3702 4309 5356 7152 7192 8378 8487 9348 9788 9931 ...
## $ Secondhand.smoke : int 4794 4921 5279 5734 6050 6167 6298 6425 6402 6323 ...
## $ Low.birth.weight : int 16135 17924 21200 23795 24866 25534 25997 26246 25805 25080 ...
## $ Child.wasting : int 19546 20334 22895 27002 29205 30943 31628 32736 32760 32271 ...
## $ Unsafe.sex : int 351 361 378 395 410 422 435 448 458 469 ...
## $ Diet.low.in.nuts.and.seeds : int 2319 2449 2603 2771 2932 3049 3173 3298 3401 3482 ...
## $ Household.air.pollution.from.solid.fuels: int 34372 35392 38065 41154 43153 44024 45005 46017 46055 45681 ...
## $ Diet.low.in.Vegetables : int 3679 3732 3827 3951 4075 4153 4247 4339 4409 4473 ...
## $ Low.physical.activity : int 2637 2652 2688 2744 2805 2839 2878 2914 2944 2976 ...
## $ Smoking : int 5174 5247 5363 5522 5689 5801 5934 6066 6178 6288 ...
## $ High.fasting.plasma.glucose : int 11449 11811 12265 12821 13400 13871 14413 14970 15502 16058 ...
## $ Air.pollution : int 37231 38315 41172 44488 46634 47566 48617 49703 49746 49349 ...
## $ High.body.mass.index : int 9518 9489 9528 9611 9675 9608 9503 9286 9024 8857 ...
## $ Unsafe.sanitation : int 2798 3254 4042 5392 5418 6313 6393 7038 7366 7468 ...
## $ No.access.to.handwashing.facility : int 4825 5127 5889 7007 7421 7896 8098 8507 8560 8424 ...
## $ Drug.use : int 174 188 211 232 247 260 274 287 295 302 ...
## $ Low.bone.mineral.density : int 389 389 393 411 413 417 423 425 429 428 ...
## $ Vitamin.A.deficiency : int 2016 2056 2100 2316 2665 3070 3214 3228 3413 3662 ...
## $ Child.stunting : int 7686 7886 8568 9875 11031 11973 12426 12805 13011 13052 ...
## $ Discontinued.breastfeeding : int 107 121 150 204 204 233 233 255 264 263 ...
## $ Non.exclusive.breastfeeding : int 2216 2501 3053 3726 3833 4124 4183 4393 4417 4326 ...
## $ Iron.deficiency : int 564 611 700 773 812 848 883 914 924 909 ...
## $ Total_Deaths : int 10498 10720 11229 11877 12385 12634 12919 13198 13300 13339 ...
## $ GDP_per_Capita : num NA NA NA NA NA NA NA NA NA NA ...
# c. Data transformation: formulate the task as a binary classification problem
library(dplyr)
# Ratio of total deaths to GDP per capita. Despite the name this is a
# deaths-per-GDP ratio, not a population-normalised death rate.
merged_data_clean_use <- mutate(
  merged_data_clean,
  Death_Rate_vs_GDP = Total_Deaths / GDP_per_Capita
)
str(merged_data_clean_use)
## 'data.frame': 5828 obs. of 34 variables:
## $ Code : chr "AFG" "AFG" "AFG" "AFG" ...
## $ Year : int 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 ...
## $ Entity : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Outdoor.air.pollution : int 4021 4014 3961 4116 4176 4176 4232 4480 4767 5038 ...
## $ High.systolic.blood.pressure : int 29999 30421 30189 30157 30225 30089 30075 30080 30219 30280 ...
## $ Diet.high.in.sodium : int 1260 1282 1275 1277 1281 1276 1270 1263 1261 1254 ...
## $ Diet.low.in.whole.grains : int 8328 8440 8383 8398 8433 8415 8418 8426 8474 8488 ...
## $ Alochol.use : int 427 432 432 437 445 452 456 465 478 485 ...
## $ Diet.low.in.fruits : int 4174 4226 4184 4179 4188 4166 4142 4108 4088 4050 ...
## $ Unsafe.water.source : int 9942 10052 10004 10841 10761 10118 9081 8168 7245 6437 ...
## $ Secondhand.smoke : int 6227 6214 6103 6341 6383 6272 6153 6010 5868 5752 ...
## $ Low.birth.weight : int 24549 24859 25441 25617 25459 25149 24354 23778 23533 23232 ...
## $ Child.wasting : int 31559 30938 29619 29098 28167 26669 24987 23100 21624 20264 ...
## $ Unsafe.sex : int 482 500 507 518 533 541 549 557 568 576 ...
## $ Diet.low.in.nuts.and.seeds : int 3516 3543 3487 3457 3430 3381 3325 3251 3183 3102 ...
## $ Household.air.pollution.from.solid.fuels: int 45132 45028 44137 44953 44534 43280 41778 40047 38320 36675 ...
## $ Diet.low.in.Vegetables : int 4501 4539 4483 4471 4475 4449 4432 4417 4422 4407 ...
## $ Low.physical.activity : int 3013 3068 3078 3104 3137 3153 3178 3214 3267 3311 ...
## $ Smoking : int 6378 6505 6488 6528 6594 6636 6720 6849 7026 7181 ...
## $ High.fasting.plasma.glucose : int 16545 17144 17467 17921 18397 18652 18917 19183 19498 19730 ...
## $ Air.pollution : int 48763 48660 47732 48687 48337 47107 45691 44161 42696 41348 ...
## $ High.body.mass.index : int 8829 9047 9151 9410 9759 10051 10519 11218 12106 12982 ...
## $ Unsafe.sanitation : int 7472 7547 7497 8106 8028 7532 6740 6050 5350 4732 ...
## $ No.access.to.handwashing.facility : int 8233 8144 7915 8482 8440 8033 7481 6900 6291 5793 ...
## $ Drug.use : int 312 328 337 356 376 390 401 414 430 445 ...
## $ Low.bone.mineral.density : int 431 436 437 433 440 454 454 465 478 493 ...
## $ Vitamin.A.deficiency : int 3855 3944 3798 1902 1330 1243 1231 1226 1555 1704 ...
## $ Child.stunting : int 12965 12732 12069 9862 8928 8380 7885 7303 7148 6867 ...
## $ Discontinued.breastfeeding : int 256 250 242 263 255 233 204 182 165 158 ...
## $ Non.exclusive.breastfeeding : int 4195 4094 3947 3953 3774 3519 3216 2992 2832 2706 ...
## $ Iron.deficiency : int 897 919 937 972 985 1007 992 1008 1008 1001 ...
## $ Total_Deaths : int 13344 13479 13360 13662 13798 13750 13730 13738 13802 13863 ...
## $ GDP_per_Capita : num 180 143 182 200 222 ...
## $ Death_Rate_vs_GDP : num 74.1 94.3 73.3 68.4 62.2 ...
## - attr(*, "na.action")= 'omit' Named int [1:172] 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "names")= chr [1:172] "1" "2" "3" "4" ...
# Threshold for the binary target: the median of Death_Rate_vs_GDP
# (the deaths / GDP-per-capita ratio, not the raw total deaths).
threshold <- median(merged_data_clean_use$Death_Rate_vs_GDP, na.rm = TRUE)
# High_Deaths is 1 when the ratio is at or above the threshold and 0 otherwise,
# stored as a factor with exactly the two levels "0" and "1".
merged_data_clean_use$High_Deaths <- factor(
  ifelse(merged_data_clean_use$Death_Rate_vs_GDP >= threshold, 1, 0),
  levels = c(0, 1)
)
# Rebuild as a plain data.frame before inspecting the structure.
merged_data_clean_use <- data.frame(merged_data_clean_use)
str(merged_data_clean_use)
## 'data.frame': 5828 obs. of 35 variables:
## $ Code : chr "AFG" "AFG" "AFG" "AFG" ...
## $ Year : int 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 ...
## $ Entity : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Outdoor.air.pollution : int 4021 4014 3961 4116 4176 4176 4232 4480 4767 5038 ...
## $ High.systolic.blood.pressure : int 29999 30421 30189 30157 30225 30089 30075 30080 30219 30280 ...
## $ Diet.high.in.sodium : int 1260 1282 1275 1277 1281 1276 1270 1263 1261 1254 ...
## $ Diet.low.in.whole.grains : int 8328 8440 8383 8398 8433 8415 8418 8426 8474 8488 ...
## $ Alochol.use : int 427 432 432 437 445 452 456 465 478 485 ...
## $ Diet.low.in.fruits : int 4174 4226 4184 4179 4188 4166 4142 4108 4088 4050 ...
## $ Unsafe.water.source : int 9942 10052 10004 10841 10761 10118 9081 8168 7245 6437 ...
## $ Secondhand.smoke : int 6227 6214 6103 6341 6383 6272 6153 6010 5868 5752 ...
## $ Low.birth.weight : int 24549 24859 25441 25617 25459 25149 24354 23778 23533 23232 ...
## $ Child.wasting : int 31559 30938 29619 29098 28167 26669 24987 23100 21624 20264 ...
## $ Unsafe.sex : int 482 500 507 518 533 541 549 557 568 576 ...
## $ Diet.low.in.nuts.and.seeds : int 3516 3543 3487 3457 3430 3381 3325 3251 3183 3102 ...
## $ Household.air.pollution.from.solid.fuels: int 45132 45028 44137 44953 44534 43280 41778 40047 38320 36675 ...
## $ Diet.low.in.Vegetables : int 4501 4539 4483 4471 4475 4449 4432 4417 4422 4407 ...
## $ Low.physical.activity : int 3013 3068 3078 3104 3137 3153 3178 3214 3267 3311 ...
## $ Smoking : int 6378 6505 6488 6528 6594 6636 6720 6849 7026 7181 ...
## $ High.fasting.plasma.glucose : int 16545 17144 17467 17921 18397 18652 18917 19183 19498 19730 ...
## $ Air.pollution : int 48763 48660 47732 48687 48337 47107 45691 44161 42696 41348 ...
## $ High.body.mass.index : int 8829 9047 9151 9410 9759 10051 10519 11218 12106 12982 ...
## $ Unsafe.sanitation : int 7472 7547 7497 8106 8028 7532 6740 6050 5350 4732 ...
## $ No.access.to.handwashing.facility : int 8233 8144 7915 8482 8440 8033 7481 6900 6291 5793 ...
## $ Drug.use : int 312 328 337 356 376 390 401 414 430 445 ...
## $ Low.bone.mineral.density : int 431 436 437 433 440 454 454 465 478 493 ...
## $ Vitamin.A.deficiency : int 3855 3944 3798 1902 1330 1243 1231 1226 1555 1704 ...
## $ Child.stunting : int 12965 12732 12069 9862 8928 8380 7885 7303 7148 6867 ...
## $ Discontinued.breastfeeding : int 256 250 242 263 255 233 204 182 165 158 ...
## $ Non.exclusive.breastfeeding : int 4195 4094 3947 3953 3774 3519 3216 2992 2832 2706 ...
## $ Iron.deficiency : int 897 919 937 972 985 1007 992 1008 1008 1001 ...
## $ Total_Deaths : int 13344 13479 13360 13662 13798 13750 13730 13738 13802 13863 ...
## $ GDP_per_Capita : num 180 143 182 200 222 ...
## $ Death_Rate_vs_GDP : num 74.1 94.3 73.3 68.4 62.2 ...
## $ High_Deaths : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
# 2. Classification Modeling
# a. Split the data into training and test sets:
# Keep only the modelling variables: drop the country code and the columns the
# target was derived from (GDP_per_Capita, Death_Rate_vs_GDP, Total_Deaths).
# NOTE(review): Entity (country name) is still kept as a predictor, so the
# models can key on country identity -- confirm this is intended.
data_use <- select(
  merged_data_clean_use,
  -c(Code, GDP_per_Capita, Death_Rate_vs_GDP, Total_Deaths)
)
str(data_use)
## 'data.frame': 5828 obs. of 31 variables:
## $ Year : int 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 ...
## $ Entity : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Outdoor.air.pollution : int 4021 4014 3961 4116 4176 4176 4232 4480 4767 5038 ...
## $ High.systolic.blood.pressure : int 29999 30421 30189 30157 30225 30089 30075 30080 30219 30280 ...
## $ Diet.high.in.sodium : int 1260 1282 1275 1277 1281 1276 1270 1263 1261 1254 ...
## $ Diet.low.in.whole.grains : int 8328 8440 8383 8398 8433 8415 8418 8426 8474 8488 ...
## $ Alochol.use : int 427 432 432 437 445 452 456 465 478 485 ...
## $ Diet.low.in.fruits : int 4174 4226 4184 4179 4188 4166 4142 4108 4088 4050 ...
## $ Unsafe.water.source : int 9942 10052 10004 10841 10761 10118 9081 8168 7245 6437 ...
## $ Secondhand.smoke : int 6227 6214 6103 6341 6383 6272 6153 6010 5868 5752 ...
## $ Low.birth.weight : int 24549 24859 25441 25617 25459 25149 24354 23778 23533 23232 ...
## $ Child.wasting : int 31559 30938 29619 29098 28167 26669 24987 23100 21624 20264 ...
## $ Unsafe.sex : int 482 500 507 518 533 541 549 557 568 576 ...
## $ Diet.low.in.nuts.and.seeds : int 3516 3543 3487 3457 3430 3381 3325 3251 3183 3102 ...
## $ Household.air.pollution.from.solid.fuels: int 45132 45028 44137 44953 44534 43280 41778 40047 38320 36675 ...
## $ Diet.low.in.Vegetables : int 4501 4539 4483 4471 4475 4449 4432 4417 4422 4407 ...
## $ Low.physical.activity : int 3013 3068 3078 3104 3137 3153 3178 3214 3267 3311 ...
## $ Smoking : int 6378 6505 6488 6528 6594 6636 6720 6849 7026 7181 ...
## $ High.fasting.plasma.glucose : int 16545 17144 17467 17921 18397 18652 18917 19183 19498 19730 ...
## $ Air.pollution : int 48763 48660 47732 48687 48337 47107 45691 44161 42696 41348 ...
## $ High.body.mass.index : int 8829 9047 9151 9410 9759 10051 10519 11218 12106 12982 ...
## $ Unsafe.sanitation : int 7472 7547 7497 8106 8028 7532 6740 6050 5350 4732 ...
## $ No.access.to.handwashing.facility : int 8233 8144 7915 8482 8440 8033 7481 6900 6291 5793 ...
## $ Drug.use : int 312 328 337 356 376 390 401 414 430 445 ...
## $ Low.bone.mineral.density : int 431 436 437 433 440 454 454 465 478 493 ...
## $ Vitamin.A.deficiency : int 3855 3944 3798 1902 1330 1243 1231 1226 1555 1704 ...
## $ Child.stunting : int 12965 12732 12069 9862 8928 8380 7885 7303 7148 6867 ...
## $ Discontinued.breastfeeding : int 256 250 242 263 255 233 204 182 165 158 ...
## $ Non.exclusive.breastfeeding : int 4195 4094 3947 3953 3774 3519 3216 2992 2832 2706 ...
## $ Iron.deficiency : int 897 919 937 972 985 1007 992 1008 1008 1001 ...
## $ High_Deaths : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
# Verify that no modelling column contains missing values.
vapply(data_use, function(column) sum(is.na(column)), integer(1))
## Year
## 0
## Entity
## 0
## Outdoor.air.pollution
## 0
## High.systolic.blood.pressure
## 0
## Diet.high.in.sodium
## 0
## Diet.low.in.whole.grains
## 0
## Alochol.use
## 0
## Diet.low.in.fruits
## 0
## Unsafe.water.source
## 0
## Secondhand.smoke
## 0
## Low.birth.weight
## 0
## Child.wasting
## 0
## Unsafe.sex
## 0
## Diet.low.in.nuts.and.seeds
## 0
## Household.air.pollution.from.solid.fuels
## 0
## Diet.low.in.Vegetables
## 0
## Low.physical.activity
## 0
## Smoking
## 0
## High.fasting.plasma.glucose
## 0
## Air.pollution
## 0
## High.body.mass.index
## 0
## Unsafe.sanitation
## 0
## No.access.to.handwashing.facility
## 0
## Drug.use
## 0
## Low.bone.mineral.density
## 0
## Vitamin.A.deficiency
## 0
## Child.stunting
## 0
## Discontinued.breastfeeding
## 0
## Non.exclusive.breastfeeding
## 0
## Iron.deficiency
## 0
## High_Deaths
## 0
# Distinct per-column NA counts for the modelling table.
unique(vapply(data_use, function(column) sum(is.na(column)), integer(1)))
## [1] 0
library(caTools)
## Warning: 程辑包'caTools'是用R版本4.2.3 来建造的
# Fix the RNG seed so the split below is reproducible.
set.seed(123)
# 80/20 train/test split; sample.split() is given the class labels and
# returns a logical vector (TRUE = training row).
split <- sample.split(data_use$High_Deaths, SplitRatio = 0.8)
train_set <- data_use[split, ]
test_set <- data_use[!split, ]
str(train_set)
## 'data.frame': 4662 obs. of 31 variables:
## $ Year : int 2000 2001 2002 2005 2006 2008 2009 2011 2012 2013 ...
## $ Entity : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Outdoor.air.pollution : int 4021 4014 3961 4176 4232 4767 5038 5824 6516 7273 ...
## $ High.systolic.blood.pressure : int 29999 30421 30189 30089 30075 30219 30280 30684 31090 31462 ...
## $ Diet.high.in.sodium : int 1260 1282 1275 1276 1270 1261 1254 1255 1264 1270 ...
## $ Diet.low.in.whole.grains : int 8328 8440 8383 8415 8418 8474 8488 8620 8753 8854 ...
## $ Alochol.use : int 427 432 432 452 456 478 485 496 502 508 ...
## $ Diet.low.in.fruits : int 4174 4226 4184 4166 4142 4088 4050 4033 4052 4060 ...
## $ Unsafe.water.source : int 9942 10052 10004 10118 9081 7245 6437 5600 5243 5220 ...
## $ Secondhand.smoke : int 6227 6214 6103 6272 6153 5868 5752 5686 5676 5739 ...
## $ Low.birth.weight : int 24549 24859 25441 25149 24354 23533 23232 23181 23172 23176 ...
## $ Child.wasting : int 31559 30938 29619 26669 24987 21624 20264 18163 17368 16573 ...
## $ Unsafe.sex : int 482 500 507 541 549 568 576 597 609 621 ...
## $ Diet.low.in.nuts.and.seeds : int 3516 3543 3487 3381 3325 3183 3102 3008 2982 2947 ...
## $ Household.air.pollution.from.solid.fuels: int 45132 45028 44137 43280 41778 38320 36675 34380 33223 32344 ...
## $ Diet.low.in.Vegetables : int 4501 4539 4483 4449 4432 4422 4407 4432 4476 4506 ...
## $ Low.physical.activity : int 3013 3068 3078 3153 3178 3267 3311 3403 3468 3525 ...
## $ Smoking : int 6378 6505 6488 6636 6720 7026 7181 7572 7856 8124 ...
## $ High.fasting.plasma.glucose : int 16545 17144 17467 18652 18917 19498 19730 20216 20593 20961 ...
## $ Air.pollution : int 48763 48660 47732 47107 45691 42696 41348 39893 39363 39187 ...
## $ High.body.mass.index : int 8829 9047 9151 10051 10519 12106 12982 14666 15603 16511 ...
## $ Unsafe.sanitation : int 7472 7547 7497 7532 6740 5350 4732 4069 3784 3739 ...
## $ No.access.to.handwashing.facility : int 8233 8144 7915 8033 7481 6291 5793 5275 5045 5041 ...
## $ Drug.use : int 312 328 337 390 401 430 445 482 502 522 ...
## $ Low.bone.mineral.density : int 431 436 437 454 454 478 493 529 547 581 ...
## $ Vitamin.A.deficiency : int 3855 3944 3798 1243 1231 1555 1704 1342 1327 800 ...
## $ Child.stunting : int 12965 12732 12069 8380 7885 7148 6867 5800 5541 4710 ...
## $ Discontinued.breastfeeding : int 256 250 242 233 204 165 158 199 239 297 ...
## $ Non.exclusive.breastfeeding : int 4195 4094 3947 3519 3216 2832 2706 2591 2511 2492 ...
## $ Iron.deficiency : int 897 919 937 1007 992 1008 1001 958 928 900 ...
## $ High_Deaths : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
# b. Decision Tree:
library(rpart)
# Fit a classification tree on all remaining columns of the training set.
tree_model <- rpart(
  formula = High_Deaths ~ .,
  data = train_set,
  method = "class"
)
# Hard class predictions for the held-out rows.
pred_tree <- predict(object = tree_model, newdata = test_set, type = "class")
# Confusion matrix: rows are the actual classes, columns the predicted ones.
conf_matrix <- table(test_set$High_Deaths, pred_tree)
print(conf_matrix)
## pred_tree
## 0 1
## 0 570 13
## 1 16 567
# Draw the fitted decision tree.
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.3
rpart.plot(x = tree_model)

library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(pROC)
## Warning: package 'pROC' was built under R version 4.2.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Evaluation metrics for the decision tree, read off conf_matrix
# (rows = observed class, columns = predicted class; the second level, "1",
# is treated as the positive class).

# Accuracy: correctly classified rows (the diagonal) over all rows.
accuracy_tree <- sum(diag(conf_matrix)) / sum(conf_matrix)
cat("Accuracy:", accuracy_tree, "\n")
## Accuracy: 0.9751286
# Precision: true positives over everything predicted positive (column 2).
precision_tree <- conf_matrix[2, 2] / colSums(conf_matrix)[[2]]
cat("Precision:", precision_tree, "\n")
## Precision: 0.9775862
# Recall (sensitivity): true positives over everything observed positive (row 2).
recall_tree <- conf_matrix[2, 2] / rowSums(conf_matrix)[[2]]
cat("Recall:", recall_tree, "\n")
## Recall: 0.9725557
# F1 score: harmonic mean of precision and recall.
f1_score_tree <- (2 * precision_tree * recall_tree) /
  (precision_tree + recall_tree)
cat("F1 Score:", f1_score_tree, "\n")
## F1 Score: 0.9750645
library(pROC)
# Class-membership probabilities for test_set; type = "prob" is requested
# instead of hard labels so the ROC curve can be traced over thresholds.
pred_prob <- predict(object = tree_model, newdata = test_set, type = "prob")
# ROC curve scored on the second probability column
# (presumably class "1", the second factor level - confirm).
roc_curve <- roc(response = test_set$High_Deaths, predictor = pred_prob[, 2])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Draw the curve.
plot(roc_curve, main = "ROC Curve for Decision Tree Model", col = "blue")

# Area under the ROC curve.
auc_value <- auc(roc_curve)
cat("AUC:", auc_value, "\n")
## AUC: 0.9901041
# d. Logistic regression model
# Binomial GLM of High_Deaths on all remaining columns of train_set.
logit_model <- glm(
  formula = High_Deaths ~ .,
  family = binomial,
  data = train_set
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# NOTE(review): these two warnings usually point to (quasi-)complete
# separation of the classes by the predictors, so the coefficients and the
# probabilities printed below should be read with caution - confirm.
# Predicted probabilities for the rows of test_set.
pred_logit <- predict(
  object = logit_model,
  newdata = test_set,
  type = "response"
)
print(pred_logit)
## 14 15 18 21 26 30
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 34 37 44 45 63 66
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 72 74 87 101 105 112
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 115 120 126 136 138 145
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 146 155 157 158 177 178
## 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 179 194 196 197 199 201
## 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16
## 203 208 227 228 229 235
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 237 244 245 246 248 258
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 268 271 274 278 279 286
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 289 290 294 299 305 309
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 320 325 327 334 337 341
## 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00
## 349 355 359 364 366 372
## 1.000000e+00 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00
## 378 392 397 398 400 404
## 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 407 415 430 436 438 446
## 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 447 450 452 459 463 476
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 477 479 487 495 497 505
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 506 517 518 519 521 528
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 534 542 546 557 568 571
## 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 574 576 588 589 593 611
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00
## 613 614 617 625 627 629
## 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16
## 647 648 651 652 658 661
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16
## 663 665 677 679 685 686
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 688 689 697 700 701 708
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 709 715 717 718 721 725
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00
## 731 738 749 751 755 759
## 1.000000e+00 1.000000e+00 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00
## 762 772 775 779 781 783
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16
## 791 794 799 808 811 815
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 822 827 829 833 836 837
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 842 843 845 847 853 858
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 861 868 870 872 875 881
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 890 905 910 919 920 922
## 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 931 936 941 944 949 950
## 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16
## 952 954 958 972 973 993
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00
## 998 999 1002 1011 1016 1017
## 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16
## 1028 1039 1040 1044 1053 1055
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 1065 1068 1073 1074 1079 1092
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 1096 1103 1110 1114 1116 1119
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 1123 1126 1128 1131 1133 1134
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 1139 1152 1157 1164 1165 1166
## 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 1176 1178 1180 1183 1189 1190
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 1191 1193 1197 1201 1204 1210
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16
## 1217 1218 1236 1237 1239 1244
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 1249 1252 1253 1258 1261 1263
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 1273 1275 1278 1281 1282 1291
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00
## 1294 1297 1304 1306 1312 1315
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 1317 1319 1320 1321 1322 1333
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16
## 1335 1343 1344 1346 1356 1360
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00
## 1365 1378 1382 1385 1386 1395
## 1.000000e+00 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 1399 1400 1404 1405 1410 1417
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16
## 1425 1428 1432 1433 1437 1441
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00
## 1446 1456 1457 1459 1464 1466
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 1469 1470 1471 1485 1487 1495
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 1500 1509 1512 1521 1526 1527
## 2.220446e-16 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 1529 1532 1538 1539 1540 1556
## 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 1559 1574 1578 1580 1588 1590
## 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 1592 1595 1600 1602 1605 1606
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 1609 1618 1628 1641 1642 1652
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 1.000000e+00
## 1658 1660 1663 1668 1669 1673
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 1692 1694 1696 1698 1701 1702
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 1703 1706 1707 1714 1722 1732
## 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00
## 1743 1745 1754 1756 1758 1761
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 1766 1767 1779 1781 1785 1791
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 1796 1809 1815 1821 1825 1828
## 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 1833 1840 1841 1853 1858 1860
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 1862 1865 1870 1871 1875 1889
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 1903 1907 1910 1914 1923 1929
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 1931 1936 1937 1941 1947 1955
## 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 1.000000e+00
## 1958 1965 1969 1972 1973 1975
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 1987 1990 1999 2007 2012 2013
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16
## 2023 2024 2025 2028 2032 2035
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 2036 2037 2038 2041 2046 2054
## 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 2.220446e-16 2.220446e-16
## 2055 2058 2059 2070 2076 2079
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 2081 2082 2085 2086 2090 2093
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 2104 2106 2111 2115 2117 2120
## 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16
## 2123 2125 2129 2130 2144 2149
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 2156 2158 2163 2170 2188 2189
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 2190 2201 2209 2210 2215 2233
## 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16
## 2235 2247 2253 2254 2256 2259
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 2269 2270 2271 2274 2275 2296
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00
## 2312 2320 2321 2323 2326 2332
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16
## 2341 2357 2360 2361 2362 2377
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 2379 2385 2389 2393 2396 2397
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16
## 2402 2408 2417 2422 2431 2434
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 2441 2444 2448 2460 2463 2464
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16
## 2470 2480 2482 2496 2506 2507
## 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00
## 2519 2521 2522 2524 2525 2529
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 2535 2539 2541 2543 2547 2548
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 2553 2567 2577 2581 2598 2603
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 2612 2622 2628 2634 2639 2641
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16
## 2646 2648 2649 2650 2653 2655
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 2665 2666 2669 2671 2680 2702
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00
## 2711 2713 2718 2725 2734 2737
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 2742 2747 2756 2764 2766 2768
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 2782 2789 2801 2806 2819 2821
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 2826 2828 2831 2834 2839 2851
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 2852 2853 2856 2865 2881 2891
## 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 2.220446e-16
## 2903 2906 2909 2910 2911 2913
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00
## 2915 2924 2927 2931 2932 2934
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16
## 2939 2941 2944 2948 2949 2950
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 2965 2966 2972 2976 2997 2999
## 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 3001 3005 3008 3009 3014 3016
## 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 3023 3031 3032 3043 3044 3045
## 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 3048 3049 3050 3055 3058 3063
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16
## 3064 3076 3080 3090 3102 3103
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 3104 3108 3121 3122 3126 3127
## 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 3137 3138 3156 3158 3164 3175
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 3176 3190 3193 3220 3231 3237
## 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16
## 3246 3252 3263 3267 3272 3274
## 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00
## 3289 3296 3314 3316 3318 3320
## 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 3324 3338 3339 3341 3348 3349
## 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 3361 3365 3370 3371 3377 3379
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 3381 3383 3391 3392 3397 3401
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16
## 3409 3418 3420 3423 3426 3432
## 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00
## 3448 3452 3457 3460 3465 3467
## 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 3472 3473 3487 3491 3492 3493
## 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16
## 3494 3499 3502 3503 3506 3510
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 3517 3518 3522 3527 3529 3532
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 3534 3547 3565 3566 3567 3574
## 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00
## 3577 3578 3581 3582 3583 3585
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 3590 3592 3594 3595 3598 3600
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 3619 3620 3626 3627 3633 3639
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00
## 3640 3643 3657 3674 3679 3681
## 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 3682 3686 3687 3701 3704 3708
## 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00
## 3712 3716 3726 3731 3732 3733
## 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 3734 3746 3747 3752 3774 3776
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 3778 3781 3783 3795 3798 3800
## 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 3802 3805 3807 3813 3825 3826
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 3831 3834 3838 3855 3857 3863
## 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16
## 3866 3872 3876 3880 3883 3885
## 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 3896 3899 3902 3908 3923 3927
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 3934 3952 3965 3966 3974 3986
## 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 3993 3994 4005 4006 4007 4011
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 4028 4036 4039 4059 4067 4068
## 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16
## 4078 4092 4094 4095 4104 4106
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 4107 4114 4120 4126 4130 4135
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 4141 4149 4152 4159 4175 4177
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16
## 4178 4179 4182 4184 4185 4191
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 4199 4203 4206 4222 4223 4225
## 2.220446e-16 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16
## 4226 4228 4229 4232 4236 4242
## 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00
## 4244 4259 4260 4262 4265 4266
## 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16
## 4270 4274 4278 4286 4289 4296
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00
## 4297 4298 4304 4306 4313 4315
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 4320 4321 4323 4330 4339 4348
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 4351 4355 4365 4370 4377 4413
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00
## 4418 4423 4427 4428 4433 4435
## 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 4436 4440 4443 4448 4451 4456
## 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16
## 4463 4467 4477 4501 4502 4505
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 4507 4514 4516 4517 4523 4524
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 4527 4531 4541 4542 4546 4548
## 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 4551 4559 4560 4573 4579 4582
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 4587 4589 4596 4599 4602 4608
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 4615 4616 4624 4626 4634 4637
## 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 4639 4647 4652 4674 4676 4680
## 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 4683 4688 4691 4695 4696 4697
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 4702 4703 4711 4719 4722 4723
## 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16
## 4735 4737 4739 4762 4768 4771
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00
## 4773 4779 4786 4787 4788 4793
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 4803 4806 4808 4823 4826 4845
## 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16
## 4861 4864 4866 4868 4875 4882
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 4907 4908 4909 4912 4913 4939
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16
## 4943 4953 4954 4958 4962 4963
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 4964 4972 4975 4977 4980 4982
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 4998 5006 5024 5030 5037 5041
## 2.220446e-16 2.220446e-16 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16
## 5042 5045 5048 5050 5055 5058
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 5065 5067 5074 5077 5079 5088
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 5091 5096 5101 5109 5111 5119
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 5122 5128 5130 5134 5138 5140
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 5143 5155 5157 5161 5168 5174
## 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00
## 5177 5180 5182 5192 5193 5197
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 5201 5202 5208 5212 5219 5225
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 5243 5248 5252 5253 5260 5276
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 5282 5291 5293 5294 5298 5306
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 5323 5327 5328 5330 5334 5337
## 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 5346 5347 5350 5352 5353 5354
## 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 5366 5383 5392 5396 5402 5405
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 5408 5415 5425 5427 5434 5435
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00
## 5436 5441 5443 5450 5457 5461
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 5465 5478 5482 5483 5495 5509
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16
## 5512 5513 5521 5524 5529 5545
## 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 5546 5562 5567 5570 5571 5572
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 5576 5577 5578 5598 5603 5608
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 5609 5617 5630 5633 5636 5638
## 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 5642 5644 5650 5652 5654 5660
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 5661 5662 5663 5665 5670 5675
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 5678 5679 5680 5687 5689 5694
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 5699 5701 5714 5717 5723 5724
## 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 5728 5742 5784 5786 5789 5802
## 2.220446e-16 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00
## 5810 5811 5813 5816 5828 5829
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16
## 5830 5836 5845 5849 5854 5857
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
## 5862 5865 5867 5873 5876 5881
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00
## 5886 5894 5896 5897 5900 5901
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 5904 5910 5920 5923 5926 5929
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 5937 5941 5944 5954 5962 5966
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 5969 5972 5976 5978 5987 5991
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## 5993 5999
## 1.000000e+00 1.000000e+00
# Convert the predicted probabilities to hard 0/1 labels at a 0.5 cut-off.
pred_logit_class <- ifelse(pred_logit > 0.5, 1, 0)
# Confusion matrix (rows = observed class, columns = predicted class).
# The predictions are tabulated as a factor with both levels fixed so the
# table stays 2 x 2 even if every row is predicted as the same class;
# without that, the conf_matrix[2, 2] lookups used for the metrics would fail
# with "subscript out of bounds". dnn spells out the dimension labels that
# table() derives on its own when it is given the bare pred_logit_class
# variable, so the printed matrix is unchanged.
conf_matrix <- table(
  test_set$High_Deaths,
  factor(pred_logit_class, levels = c(0, 1)),
  dnn = c("", "pred_logit_class")
)
print(conf_matrix)
## pred_logit_class
## 0 1
## 0 558 25
## 1 17 566
library(caret)
library(pROC)
# Evaluation metrics for the logistic regression, read off conf_matrix
# (rows = observed class, columns = predicted class; the second level, "1",
# is treated as the positive class).

# Accuracy: correctly classified rows (the diagonal) over all rows.
accuracy_logi <- sum(diag(conf_matrix)) / sum(conf_matrix)
cat("Accuracy:", accuracy_logi, "\n")
## Accuracy: 0.9639794
# Precision: true positives over everything predicted positive (column 2).
precision_logi <- conf_matrix[2, 2] / colSums(conf_matrix)[[2]]
cat("Precision:", precision_logi, "\n")
## Precision: 0.9576988
# Recall (sensitivity): true positives over everything observed positive (row 2).
recall_logi <- conf_matrix[2, 2] / rowSums(conf_matrix)[[2]]
cat("Recall:", recall_logi, "\n")
## Recall: 0.9708405
# F1 score: harmonic mean of precision and recall.
f1_score_logi <- (2 * precision_logi * recall_logi) /
  (precision_logi + recall_logi)
cat("F1 Score:", f1_score_logi, "\n")
## F1 Score: 0.9642249
# ROC analysis for the logistic model, scored on the predicted probabilities.
roc_obj <- roc(response = test_set$High_Deaths, predictor = pred_logit)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Area under the curve. This reuses (and overwrites) the auc_value variable
# that held the decision tree's AUC.
auc_value <- auc(roc_obj)
cat("AUC:", auc_value, "\n")
## AUC: 0.9639794
# NOTE(review): this AUC is identical to the accuracy printed for this model.
# The predicted probabilities are all numerically 0 or 1, so the ROC curve
# presumably has a single interior point and the AUC collapses to the mean of
# sensitivity and specificity - confirm before comparing it with the tree.
# Draw the curve.
plot(roc_obj, main = "ROC Curve for Logistic Regression", col = "blue")

# 3. Clustering
library(caTools)
# Fix the random seed so later steps that draw random numbers are reproducible.
set.seed(seed = 123)
# Show the column types and leading values of data_clean.
str(object = data_clean)
## tibble [6,840 × 32] (S3: tbl_df/tbl/data.frame)
## $ Entity : chr [1:6840] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Code : chr [1:6840] "AFG" "AFG" "AFG" "AFG" ...
## $ Year : int [1:6840] 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 ...
## $ Outdoor.air.pollution : int [1:6840] 3169 3222 3395 3623 3788 3869 3943 4024 4040 4042 ...
## $ High.systolic.blood.pressure : int [1:6840] 25633 25872 26309 26961 27658 28090 28587 29021 29349 29712 ...
## $ Diet.high.in.sodium : int [1:6840] 1045 1055 1075 1103 1134 1154 1178 1202 1222 1242 ...
## $ Diet.low.in.whole.grains : int [1:6840] 7077 7149 7297 7499 7698 7807 7943 8075 8173 8265 ...
## $ Alochol.use : int [1:6840] 356 364 376 389 399 406 413 420 425 426 ...
## $ Diet.low.in.fruits : int [1:6840] 3185 3248 3351 3480 3610 3703 3819 3938 4038 4127 ...
## $ Unsafe.water.source : int [1:6840] 3702 4309 5356 7152 7192 8378 8487 9348 9788 9931 ...
## $ Secondhand.smoke : int [1:6840] 4794 4921 5279 5734 6050 6167 6298 6425 6402 6323 ...
## $ Low.birth.weight : int [1:6840] 16135 17924 21200 23795 24866 25534 25997 26246 25805 25080 ...
## $ Child.wasting : int [1:6840] 19546 20334 22895 27002 29205 30943 31628 32736 32760 32271 ...
## $ Unsafe.sex : int [1:6840] 351 361 378 395 410 422 435 448 458 469 ...
## $ Diet.low.in.nuts.and.seeds : int [1:6840] 2319 2449 2603 2771 2932 3049 3173 3298 3401 3482 ...
## $ Household.air.pollution.from.solid.fuels: int [1:6840] 34372 35392 38065 41154 43153 44024 45005 46017 46055 45681 ...
## $ Diet.low.in.Vegetables : int [1:6840] 3679 3732 3827 3951 4075 4153 4247 4339 4409 4473 ...
## $ Low.physical.activity : int [1:6840] 2637 2652 2688 2744 2805 2839 2878 2914 2944 2976 ...
## $ Smoking : int [1:6840] 5174 5247 5363 5522 5689 5801 5934 6066 6178 6288 ...
## $ High.fasting.plasma.glucose : int [1:6840] 11449 11811 12265 12821 13400 13871 14413 14970 15502 16058 ...
## $ Air.pollution : int [1:6840] 37231 38315 41172 44488 46634 47566 48617 49703 49746 49349 ...
## $ High.body.mass.index : int [1:6840] 9518 9489 9528 9611 9675 9608 9503 9286 9024 8857 ...
## $ Unsafe.sanitation : int [1:6840] 2798 3254 4042 5392 5418 6313 6393 7038 7366 7468 ...
## $ No.access.to.handwashing.facility : int [1:6840] 4825 5127 5889 7007 7421 7896 8098 8507 8560 8424 ...
## $ Drug.use : int [1:6840] 174 188 211 232 247 260 274 287 295 302 ...
## $ Low.bone.mineral.density : int [1:6840] 389 389 393 411 413 417 423 425 429 428 ...
## $ Vitamin.A.deficiency : int [1:6840] 2016 2056 2100 2316 2665 3070 3214 3228 3413 3662 ...
## $ Child.stunting : int [1:6840] 7686 7886 8568 9875 11031 11973 12426 12805 13011 13052 ...
## $ Discontinued.breastfeeding : int [1:6840] 107 121 150 204 204 233 233 255 264 263 ...
## $ Non.exclusive.breastfeeding : int [1:6840] 2216 2501 3053 3726 3833 4124 4183 4393 4417 4326 ...
## $ Iron.deficiency : int [1:6840] 564 611 700 773 812 848 883 914 924 909 ...
## $ Total_Deaths : int [1:6840] 10498 10720 11229 11877 12385 12634 12919 13198 13300 13339 ...
library(dplyr)
# Collapse data_clean to one row per country code: every numeric column is
# averaged over the rows that share a code, ignoring missing values.
# na.rm is supplied through an anonymous function because forwarding extra
# arguments via across(...) is deprecated as of dplyr 1.1.0; the previous form
# across(where(is.numeric), mean, na.rm = TRUE) raised that deprecation warning.
country_level_data <- data_clean %>%
  group_by(Code) %>%
  summarise(across(where(is.numeric), function(x) mean(x, na.rm = TRUE)))
str(country_level_data)
## tibble [206 × 31] (S3: tbl_df/tbl/data.frame)
## $ Code : chr [1:206] "" "AFG" "AGO" "ALB" ...
## $ Year : num [1:206] 2004 2004 2004 2004 2004 ...
## $ Outdoor.air.pollution : num [1:206] 554194.6 5178 2853.2 1190.7 13.9 ...
## $ High.systolic.blood.pressure : num [1:206] 1.48e+06 3.02e+04 1.18e+04 5.46e+03 6.95e+01 ...
## $ Diet.high.in.sodium : num [1:206] 2.66e+05 1.24e+03 8.39e+02 1.81e+03 4.23 ...
## $ Diet.low.in.whole.grains : num [1:206] 254974.9 8460.8 1332.2 1037.5 14.2 ...
## $ Alochol.use : num [1:206] 362617.1 459.3 7689.9 559.6 32.4 ...
## $ Diet.low.in.fruits : num [1:206] 157106.8 4004 1271.4 266.1 3.6 ...
## $ Unsafe.water.source : num [1:206] 277264.8 7174.6 23526.3 15.6 0 ...
## $ Secondhand.smoke : num [1:206] 1.99e+05 5.93e+03 1.70e+03 7.08e+02 6.47 ...
## $ Low.birth.weight : num [1:206] 3.72e+05 2.36e+04 1.81e+04 3.75e+02 7.33e-01 ...
## $ Child.wasting : num [1:206] 309658 23388 30602 254 0 ...
## $ Unsafe.sex : num [1:206] 172391.1 532.3 7531.2 57.1 4.7 ...
## $ Diet.low.in.nuts.and.seeds : num [1:206] 85270.3 3121.6 474.2 314.6 1.8 ...
## $ Household.air.pollution.from.solid.fuels: num [1:206] 530919 38546 16643 1281 0 ...
## $ Diet.low.in.Vegetables : num [1:206] 78076.44 4411.6 1060.13 95.47 2.27 ...
## $ Low.physical.activity : num [1:206] 1.10e+05 3.22e+03 4.48e+02 1.83e+02 8.53 ...
## $ Smoking : num [1:206] 1.21e+06 7.04e+03 5.77e+03 3.16e+03 8.62e+01 ...
## $ High.fasting.plasma.glucose : num [1:206] 776159.1 18008.6 6578 1344.6 31.4 ...
## $ Air.pollution : num [1:206] 1.06e+06 4.34e+04 1.94e+04 2.46e+03 1.38e+01 ...
## $ High.body.mass.index : num [1:206] 594427.9 12552 3000.5 2060.9 38.1 ...
## $ Unsafe.sanitation : num [1:206] 1.98e+05 5.31e+03 1.63e+04 8.77 0.00 ...
## $ No.access.to.handwashing.facility : num [1:206] 135995.4 6498.1 13052.4 24.8 0 ...
## $ Drug.use : num [1:206] 68113.78 398.5 258.83 95.27 3.93 ...
## $ Low.bone.mineral.density : num [1:206] 54349.5 483 475.6 52.6 7.2 ...
## $ Vitamin.A.deficiency : num [1:206] 15169.12 1928.67 2073.8 1.73 0 ...
## $ Child.stunting : num [1:206] 69087 8245 6997 102 0 ...
## $ Discontinued.breastfeeding : num [1:206] 2668.617 238.467 345.033 0.433 0 ...
## $ Non.exclusive.breastfeeding : num [1:206] 44388.4 3164.7 5147.1 50.1 0 ...
## $ Iron.deficiency : num [1:206] 8902.3 875.4 428.4 1.2 0 ...
## $ Total_Deaths : num [1:206] 1839838 13833 15420 4520 129 ...
# Drop the rows that are not individual countries: the group whose code is the
# empty string and the "OWID_WRL" entry (presumably the world aggregate -
# confirm). Rows are selected with a logical mask rather than by position:
# `[-1, ]` only works while the empty-code group happens to sort first, and
# `x[-grep(...), ]` returns zero rows when the pattern matches nothing.
# %in% and grepl() both yield FALSE (never NA) for a missing code.
country_level_data <- country_level_data[
  !(country_level_data$Code %in% "" |
      grepl("OWID_WRL", country_level_data$Code, fixed = TRUE)),
]
str(country_level_data)
## tibble [204 × 31] (S3: tbl_df/tbl/data.frame)
## $ Code : chr [1:204] "AFG" "AGO" "ALB" "AND" ...
## $ Year : num [1:204] 2004 2004 2004 2004 2004 ...
## $ Outdoor.air.pollution : num [1:204] 5178 2853.2 1190.7 13.9 1484.1 ...
## $ High.systolic.blood.pressure : num [1:204] 30207 11753.5 5464.9 69.5 2555.1 ...
## $ Diet.high.in.sodium : num [1:204] 1241.73 839.37 1814.53 4.23 119.53 ...
## $ Diet.low.in.whole.grains : num [1:204] 8460.8 1332.2 1037.5 14.2 611.9 ...
## $ Alochol.use : num [1:204] 459.3 7689.9 559.6 32.4 291.7 ...
## $ Diet.low.in.fruits : num [1:204] 4004 1271.4 266.1 3.6 116.4 ...
## $ Unsafe.water.source : num [1:204] 7174.6 23526.3 15.6 0 17.1 ...
## $ Secondhand.smoke : num [1:204] 5931.47 1700.97 707.57 6.47 394.07 ...
## $ Low.birth.weight : num [1:204] 2.36e+04 1.81e+04 3.75e+02 7.33e-01 1.52e+02 ...
## $ Child.wasting : num [1:204] 23388.5 30602.3 254.2 0 18.6 ...
## $ Unsafe.sex : num [1:204] 532.3 7531.2 57.1 4.7 65 ...
## $ Diet.low.in.nuts.and.seeds : num [1:204] 3121.6 474.2 314.6 1.8 10.7 ...
## $ Household.air.pollution.from.solid.fuels: num [1:204] 3.85e+04 1.66e+04 1.28e+03 0.00 6.67e-01 ...
## $ Diet.low.in.Vegetables : num [1:204] 4411.6 1060.13 95.47 2.27 88.87 ...
## $ Low.physical.activity : num [1:204] 3215.9 447.93 182.93 8.53 472.77 ...
## $ Smoking : num [1:204] 7043.3 5770.6 3158 86.2 1491.8 ...
## $ High.fasting.plasma.glucose : num [1:204] 18008.6 6578 1344.6 31.4 1771.6 ...
## $ Air.pollution : num [1:204] 43384.1 19412.9 2457.7 13.8 1466.7 ...
## $ High.body.mass.index : num [1:204] 12552 3000.5 2060.9 38.1 2925.6 ...
## $ Unsafe.sanitation : num [1:204] 5311.47 16253.77 8.77 0 1.37 ...
## $ No.access.to.handwashing.facility : num [1:204] 6498.1 13052.4 24.8 0 16.5 ...
## $ Drug.use : num [1:204] 398.5 258.83 95.27 3.93 410.43 ...
## $ Low.bone.mineral.density : num [1:204] 483 475.6 52.6 7.2 144.5 ...
## $ Vitamin.A.deficiency : num [1:204] 1.93e+03 2.07e+03 1.73 0.00 3.33e-02 ...
## $ Child.stunting : num [1:204] 8245.2 6997.2 102.4 0 2.8 ...
## $ Discontinued.breastfeeding : num [1:204] 238.467 345.033 0.433 0 0 ...
## $ Non.exclusive.breastfeeding : num [1:204] 3164.7 5147.1 50.1 0 2.07 ...
## $ Iron.deficiency : num [1:204] 875.4 428.37 1.2 0 1.43 ...
## $ Total_Deaths : num [1:204] 13833 15420 4520 129 2588 ...
# Show the last six rows to check the end of the cleaned table.
tail(country_level_data, n = 6L)
## # A tibble: 6 × 31
## Code Year Outdoor.air.pollution High.systolic.blood.pr…¹ Diet.high.in.sodium
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 VUT 2004. 29.7 371. 53.3
## 2 WSM 2004. 32.3 260. 15.9
## 3 YEM 2004. 6025. 19767. 818.
## 4 ZAF 2004. 21862. 54454. 3253.
## 5 ZMB 2004. 1810. 6566 1379.
## 6 ZWE 2004. 2162. 9282. 856.
## # ℹ abbreviated name: ¹High.systolic.blood.pressure
## # ℹ 26 more variables: Diet.low.in.whole.grains <dbl>, Alochol.use <dbl>,
## # Diet.low.in.fruits <dbl>, Unsafe.water.source <dbl>,
## # Secondhand.smoke <dbl>, Low.birth.weight <dbl>, Child.wasting <dbl>,
## # Unsafe.sex <dbl>, Diet.low.in.nuts.and.seeds <dbl>,
## # Household.air.pollution.from.solid.fuels <dbl>,
## # Diet.low.in.Vegetables <dbl>, Low.physical.activity <dbl>, Smoking <dbl>, …
# Step 1: Selecting Feature Variables
# Review the available columns before choosing the clustering features.
country_level_data %>% str()
## tibble [204 × 31] (S3: tbl_df/tbl/data.frame)
## $ Code : chr [1:204] "AFG" "AGO" "ALB" "AND" ...
## $ Year : num [1:204] 2004 2004 2004 2004 2004 ...
## $ Outdoor.air.pollution : num [1:204] 5178 2853.2 1190.7 13.9 1484.1 ...
## $ High.systolic.blood.pressure : num [1:204] 30207 11753.5 5464.9 69.5 2555.1 ...
## $ Diet.high.in.sodium : num [1:204] 1241.73 839.37 1814.53 4.23 119.53 ...
## $ Diet.low.in.whole.grains : num [1:204] 8460.8 1332.2 1037.5 14.2 611.9 ...
## $ Alochol.use : num [1:204] 459.3 7689.9 559.6 32.4 291.7 ...
## $ Diet.low.in.fruits : num [1:204] 4004 1271.4 266.1 3.6 116.4 ...
## $ Unsafe.water.source : num [1:204] 7174.6 23526.3 15.6 0 17.1 ...
## $ Secondhand.smoke : num [1:204] 5931.47 1700.97 707.57 6.47 394.07 ...
## $ Low.birth.weight : num [1:204] 2.36e+04 1.81e+04 3.75e+02 7.33e-01 1.52e+02 ...
## $ Child.wasting : num [1:204] 23388.5 30602.3 254.2 0 18.6 ...
## $ Unsafe.sex : num [1:204] 532.3 7531.2 57.1 4.7 65 ...
## $ Diet.low.in.nuts.and.seeds : num [1:204] 3121.6 474.2 314.6 1.8 10.7 ...
## $ Household.air.pollution.from.solid.fuels: num [1:204] 3.85e+04 1.66e+04 1.28e+03 0.00 6.67e-01 ...
## $ Diet.low.in.Vegetables : num [1:204] 4411.6 1060.13 95.47 2.27 88.87 ...
## $ Low.physical.activity : num [1:204] 3215.9 447.93 182.93 8.53 472.77 ...
## $ Smoking : num [1:204] 7043.3 5770.6 3158 86.2 1491.8 ...
## $ High.fasting.plasma.glucose : num [1:204] 18008.6 6578 1344.6 31.4 1771.6 ...
## $ Air.pollution : num [1:204] 43384.1 19412.9 2457.7 13.8 1466.7 ...
## $ High.body.mass.index : num [1:204] 12552 3000.5 2060.9 38.1 2925.6 ...
## $ Unsafe.sanitation : num [1:204] 5311.47 16253.77 8.77 0 1.37 ...
## $ No.access.to.handwashing.facility : num [1:204] 6498.1 13052.4 24.8 0 16.5 ...
## $ Drug.use : num [1:204] 398.5 258.83 95.27 3.93 410.43 ...
## $ Low.bone.mineral.density : num [1:204] 483 475.6 52.6 7.2 144.5 ...
## $ Vitamin.A.deficiency : num [1:204] 1.93e+03 2.07e+03 1.73 0.00 3.33e-02 ...
## $ Child.stunting : num [1:204] 8245.2 6997.2 102.4 0 2.8 ...
## $ Discontinued.breastfeeding : num [1:204] 238.467 345.033 0.433 0 0 ...
## $ Non.exclusive.breastfeeding : num [1:204] 3164.7 5147.1 50.1 0 2.07 ...
## $ Iron.deficiency : num [1:204] 875.4 428.37 1.2 0 1.43 ...
## $ Total_Deaths : num [1:204] 13833 15420 4520 129 2588 ...
# Keep only the four substance-related risk-factor columns for clustering.
# ("Alochol.use" is the column name as spelled in the data.)
clustering_data <- select(
  country_level_data,
  all_of(c("Alochol.use", "Secondhand.smoke", "Smoking", "Drug.use"))
)
# Centre each column to mean 0 and scale it to unit standard deviation so that
# no single feature dominates the distance computation.
clustering_data_scale <- scale(clustering_data, center = TRUE, scale = TRUE)
head(clustering_data_scale)
## Alochol.use Secondhand.smoke Smoking Drug.use
## [1,] -0.25331543 0.005321049 -0.16598429 -0.17860798
## [2,] -0.06540833 -0.133598577 -0.17398764 -0.19513870
## [3,] -0.25070798 -0.166219487 -0.19041777 -0.21449818
## [4,] -0.26440965 -0.189241955 -0.20973573 -0.22530825
## [5,] -0.25767013 -0.176514086 -0.20089639 -0.17719557
## [6,] 0.21437372 0.067359009 0.08578723 0.03621244
# Step 2: Apply K-Means Clustering
# Fix the RNG seed so the random starting centres are reproducible.
set.seed(123)
# Fit k-means on the standardized features with an arbitrarily chosen k = 3;
# nstart = 25 tries 25 random initialisations and keeps the best one.
kmeans_model <- kmeans(x = clustering_data_scale, centers = 3, nstart = 25)
print(kmeans_model)
## K-means clustering with 3 clusters of sizes 200, 3, 1
##
## Cluster means:
## Alochol.use Secondhand.smoke Smoking Drug.use
## 1 -0.1211628 -0.1038789 -0.1109928 -0.1280741
## 2 4.4850983 2.7382539 3.2297423 5.1957899
## 3 10.7772713 12.5610152 12.5093297 10.0274451
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 38.08730 36.98749 0.00000
## (between_SS / total_SS = 90.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Store each row's k-means cluster label (as a factor) on the country table.
country_level_data[["Cluster"]] <- as.factor(kmeans_model[["cluster"]])
head(country_level_data, n = 6L)
## # A tibble: 6 × 32
## Code Year Outdoor.air.pollution High.systolic.blood.pr…¹ Diet.high.in.sodium
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 AFG 2004. 5178. 30207. 1242.
## 2 AGO 2004. 2853. 11754. 839.
## 3 ALB 2004. 1191. 5465. 1815.
## 4 AND 2004. 13.9 69.5 4.23
## 5 ARE 2004. 1484. 2555. 120.
## 6 ARG 2004. 12004. 50664. 6633.
## # ℹ abbreviated name: ¹High.systolic.blood.pressure
## # ℹ 27 more variables: Diet.low.in.whole.grains <dbl>, Alochol.use <dbl>,
## # Diet.low.in.fruits <dbl>, Unsafe.water.source <dbl>,
## # Secondhand.smoke <dbl>, Low.birth.weight <dbl>, Child.wasting <dbl>,
## # Unsafe.sex <dbl>, Diet.low.in.nuts.and.seeds <dbl>,
## # Household.air.pollution.from.solid.fuels <dbl>,
## # Diet.low.in.Vegetables <dbl>, Low.physical.activity <dbl>, Smoking <dbl>, …
# Step 3: Visualize the Clustering Results
library(ggplot2)
# Use PCA to reduce the dimensionality for visualization.
# The PCA is run on the standardized matrix, i.e. the same input that was
# given to kmeans(), so the plotted axes live in the space in which the
# clusters were actually fitted. Running it on the raw counts would let the
# largest-magnitude columns dominate the components.
# NOTE: the str() output recorded below was produced with the unscaled input
# and will differ after re-running.
pca <- prcomp(clustering_data_scale)
str(pca$x)
## num [1:204, 1:4] -27462 -27853 -32116 -35304 -33812 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
# Assemble the plotting frame: the first two principal-component scores for
# every row, plus the cluster label stored on country_level_data.
plot_data <- data.frame(
  PC1 = pca$x[, "PC1"],
  PC2 = pca$x[, "PC2"],
  Cluster = country_level_data$Cluster
)
# Scatter plot of PC1 against PC2, coloured by cluster.
ggplot(data = plot_data, mapping = aes(x = PC1, y = PC2, colour = Cluster)) +
  geom_point() +
  labs(title = "K-Means Clustering Results (k = 3)") +
  theme_minimal()

# Build a plain data.frame of the clustering features annotated with a country
# identifier and the cluster label. `Country` holds the values of the `Code`
# column of country_level_data.
clustering_data_with_countries <- clustering_data %>%
  as.data.frame() %>%
  mutate(
    Country = country_level_data$Code,
    Cluster = as.factor(kmeans_model$cluster)
  )
# Visualize clustering
library(factoextra)
## Warning: 程辑包'factoextra'是用R版本4.2.3 来建造的
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Cluster plot of the k-means result on the standardized features, with each
# point annotated by its country code.
fviz_cluster(
  kmeans_model,
  data = clustering_data_scale,
  show.clust.cent = FALSE,  # do not draw the cluster centres
  labelsize = 0,            # suppress factoextra's own labels
  geom = "point"            # draw observations as points only
) +
  # Labels come from an external vector, so they are matched to the points by
  # row position.
  geom_text(
    aes(label = clustering_data_with_countries$Country),
    color = "black",  # fixed label colour, independent of cluster
    size = 3,
    hjust = 0.5,      # centre the label horizontally on the point
    vjust = -0.5      # nudge the label above the point
  )
