# 1. Exploratory Data Analysis (EDA)

library(tidyverse)
## Warning: 程辑包'tidyverse'是用R版本4.2.3 来建造的
## Warning: 程辑包'ggplot2'是用R版本4.2.3 来建造的
## Warning: 程辑包'tibble'是用R版本4.2.3 来建造的
## Warning: 程辑包'tidyr'是用R版本4.2.3 来建造的
## Warning: 程辑包'readr'是用R版本4.2.3 来建造的
## Warning: 程辑包'purrr'是用R版本4.2.3 来建造的
## Warning: 程辑包'dplyr'是用R版本4.2.3 来建造的
## Warning: 程辑包'stringr'是用R版本4.2.3 来建造的
## Warning: 程辑包'forcats'是用R版本4.2.3 来建造的
## Warning: 程辑包'lubridate'是用R版本4.2.3 来建造的
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the dataset from a CSV file located in the current working directory.
data <- read.csv(file = "Countries and death causes.csv")
# Print a compact overview of every column: name, type and first values.
str(object = data)
## 'data.frame':    6840 obs. of  31 variables:
##  $ Entity                                  : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Code                                    : chr  "AFG" "AFG" "AFG" "AFG" ...
##  $ Year                                    : int  1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 ...
##  $ Outdoor.air.pollution                   : int  3169 3222 3395 3623 3788 3869 3943 4024 4040 4042 ...
##  $ High.systolic.blood.pressure            : int  25633 25872 26309 26961 27658 28090 28587 29021 29349 29712 ...
##  $ Diet.high.in.sodium                     : int  1045 1055 1075 1103 1134 1154 1178 1202 1222 1242 ...
##  $ Diet.low.in.whole.grains                : int  7077 7149 7297 7499 7698 7807 7943 8075 8173 8265 ...
##  $ Alochol.use                             : int  356 364 376 389 399 406 413 420 425 426 ...
##  $ Diet.low.in.fruits                      : int  3185 3248 3351 3480 3610 3703 3819 3938 4038 4127 ...
##  $ Unsafe.water.source                     : int  3702 4309 5356 7152 7192 8378 8487 9348 9788 9931 ...
##  $ Secondhand.smoke                        : int  4794 4921 5279 5734 6050 6167 6298 6425 6402 6323 ...
##  $ Low.birth.weight                        : int  16135 17924 21200 23795 24866 25534 25997 26246 25805 25080 ...
##  $ Child.wasting                           : int  19546 20334 22895 27002 29205 30943 31628 32736 32760 32271 ...
##  $ Unsafe.sex                              : int  351 361 378 395 410 422 435 448 458 469 ...
##  $ Diet.low.in.nuts.and.seeds              : int  2319 2449 2603 2771 2932 3049 3173 3298 3401 3482 ...
##  $ Household.air.pollution.from.solid.fuels: int  34372 35392 38065 41154 43153 44024 45005 46017 46055 45681 ...
##  $ Diet.low.in.Vegetables                  : int  3679 3732 3827 3951 4075 4153 4247 4339 4409 4473 ...
##  $ Low.physical.activity                   : int  2637 2652 2688 2744 2805 2839 2878 2914 2944 2976 ...
##  $ Smoking                                 : int  5174 5247 5363 5522 5689 5801 5934 6066 6178 6288 ...
##  $ High.fasting.plasma.glucose             : int  11449 11811 12265 12821 13400 13871 14413 14970 15502 16058 ...
##  $ Air.pollution                           : int  37231 38315 41172 44488 46634 47566 48617 49703 49746 49349 ...
##  $ High.body.mass.index                    : int  9518 9489 9528 9611 9675 9608 9503 9286 9024 8857 ...
##  $ Unsafe.sanitation                       : int  2798 3254 4042 5392 5418 6313 6393 7038 7366 7468 ...
##  $ No.access.to.handwashing.facility       : int  4825 5127 5889 7007 7421 7896 8098 8507 8560 8424 ...
##  $ Drug.use                                : int  174 188 211 232 247 260 274 287 295 302 ...
##  $ Low.bone.mineral.density                : int  389 389 393 411 413 417 423 425 429 428 ...
##  $ Vitamin.A.deficiency                    : int  2016 2056 2100 2316 2665 3070 3214 3228 3413 3662 ...
##  $ Child.stunting                          : int  7686 7886 8568 9875 11031 11973 12426 12805 13011 13052 ...
##  $ Discontinued.breastfeeding              : int  107 121 150 204 204 233 233 255 264 263 ...
##  $ Non.exclusive.breastfeeding             : int  2216 2501 3053 3726 3833 4124 4183 4393 4417 4326 ...
##  $ Iron.deficiency                         : int  564 611 700 773 812 848 883 914 924 909 ...
# Per-column summary: length/class for the character columns and
# min, quartiles, mean and max for the numeric columns.
summary(data)
##     Entity              Code                Year      Outdoor.air.pollution
##  Length:6840        Length:6840        Min.   :1990   Min.   :      0      
##  Class :character   Class :character   1st Qu.:1997   1st Qu.:    434      
##  Mode  :character   Mode  :character   Median :2004   Median :   2101      
##                                        Mean   :2004   Mean   :  84582      
##                                        3rd Qu.:2012   3rd Qu.:  11810      
##                                        Max.   :2019   Max.   :4506193      
##  High.systolic.blood.pressure Diet.high.in.sodium Diet.low.in.whole.grains
##  Min.   :       2             Min.   :      0.0   Min.   :      0.0       
##  1st Qu.:    1828             1st Qu.:    137.0   1st Qu.:    273.8       
##  Median :    8770             Median :    969.5   Median :   1444.0       
##  Mean   :  224225             Mean   :  40497.2   Mean   :  38691.3       
##  3rd Qu.:   40356             3rd Qu.:   5169.8   3rd Qu.:   6773.2       
##  Max.   :10845595             Max.   :1885356.0   Max.   :1844836.0       
##   Alochol.use        Diet.low.in.fruits  Unsafe.water.source Secondhand.smoke 
##  Min.   :      0.0   Min.   :      0.0   Min.   :      0.0   Min.   :      1  
##  1st Qu.:    263.8   1st Qu.:    144.0   1st Qu.:      7.0   1st Qu.:    209  
##  Median :   1780.5   Median :    834.5   Median :    182.5   Median :    994  
##  Mean   :  54848.6   Mean   :  23957.8   Mean   :  44086.4   Mean   :  30364  
##  3rd Qu.:   8368.0   3rd Qu.:   3104.8   3rd Qu.:   5599.2   3rd Qu.:   4348  
##  Max.   :2441973.0   Max.   :1046015.0   Max.   :2450944.0   Max.   :1304318  
##  Low.birth.weight  Child.wasting       Unsafe.sex     
##  Min.   :      0   Min.   :      0   Min.   :      0  
##  1st Qu.:    123   1st Qu.:     26   1st Qu.:     97  
##  Median :   1057   Median :    504   Median :    619  
##  Mean   :  59126   Mean   :  49924   Mean   :  27646  
##  3rd Qu.:  10903   3rd Qu.:   9765   3rd Qu.:   4492  
##  Max.   :3033425   Max.   :3430422   Max.   :1664813  
##  Diet.low.in.nuts.and.seeds Household.air.pollution.from.solid.fuels
##  Min.   :     0             Min.   :      0                         
##  1st Qu.:    27             1st Qu.:     32                         
##  Median :   252             Median :    821                         
##  Mean   : 12996             Mean   :  83641                         
##  3rd Qu.:  1998             3rd Qu.:  10870                         
##  Max.   :575139             Max.   :4358214                         
##  Diet.low.in.Vegetables Low.physical.activity    Smoking       
##  Min.   :     0.0       Min.   :     0.0      Min.   :      1  
##  1st Qu.:   109.0       1st Qu.:    92.0      1st Qu.:    894  
##  Median :   590.5       Median :   521.5      Median :   4987  
##  Mean   : 11982.5       Mean   : 16489.1      Mean   : 181958  
##  3rd Qu.:  2101.8       3rd Qu.:  2820.2      3rd Qu.:  23994  
##  Max.   :529381.0       Max.   :831502.0      Max.   :7693368  
##  High.fasting.plasma.glucose Air.pollution     High.body.mass.index
##  Min.   :      3             Min.   :      0   Min.   :      2     
##  1st Qu.:   1178             1st Qu.:    816   1st Qu.:    918     
##  Median :   4966             Median :   5748   Median :   3917     
##  Mean   : 117554             Mean   : 164752   Mean   :  89870     
##  3rd Qu.:  21639             3rd Qu.:  25050   3rd Qu.:  17968     
##  Max.   :6501398             Max.   :6671740   Max.   :5019360     
##  Unsafe.sanitation No.access.to.handwashing.facility    Drug.use     
##  Min.   :      0   Min.   :      0                   Min.   :     0  
##  1st Qu.:      3   1st Qu.:     19                   1st Qu.:    31  
##  Median :    102   Median :    221                   Median :   222  
##  Mean   :  31522   Mean   :  21800                   Mean   : 10285  
##  3rd Qu.:   3854   3rd Qu.:   3954                   3rd Qu.:  1224  
##  Max.   :1842275   Max.   :1200349                   Max.   :494492  
##  Low.bone.mineral.density Vitamin.A.deficiency Child.stunting    
##  Min.   :     0           Min.   :     0.0     Min.   :     0.0  
##  1st Qu.:    43           1st Qu.:     0.0     1st Qu.:     1.0  
##  Median :   277           Median :     2.0     Median :    41.5  
##  Mean   :  8182           Mean   :  2471.6     Mean   : 11164.3  
##  3rd Qu.:  1232           3rd Qu.:   230.2     3rd Qu.:  1563.2  
##  Max.   :437884           Max.   :207555.0     Max.   :833449.0  
##  Discontinued.breastfeeding Non.exclusive.breastfeeding Iron.deficiency
##  Min.   :    0.00           Min.   :     0.0            Min.   :    0  
##  1st Qu.:    0.00           1st Qu.:     3.0            1st Qu.:    1  
##  Median :    4.00           Median :    60.5            Median :   12  
##  Mean   :  431.46           Mean   :  7171.9            Mean   : 1421  
##  3rd Qu.:   71.25           3rd Qu.:  1315.5            3rd Qu.:  238  
##  Max.   :33106.00           Max.   :505470.0            Max.   :73461
# Preview the first six rows of the dataset.
head(data)
##        Entity Code Year Outdoor.air.pollution High.systolic.blood.pressure
## 1 Afghanistan  AFG 1990                  3169                        25633
## 2 Afghanistan  AFG 1991                  3222                        25872
## 3 Afghanistan  AFG 1992                  3395                        26309
## 4 Afghanistan  AFG 1993                  3623                        26961
## 5 Afghanistan  AFG 1994                  3788                        27658
## 6 Afghanistan  AFG 1995                  3869                        28090
##   Diet.high.in.sodium Diet.low.in.whole.grains Alochol.use Diet.low.in.fruits
## 1                1045                     7077         356               3185
## 2                1055                     7149         364               3248
## 3                1075                     7297         376               3351
## 4                1103                     7499         389               3480
## 5                1134                     7698         399               3610
## 6                1154                     7807         406               3703
##   Unsafe.water.source Secondhand.smoke Low.birth.weight Child.wasting
## 1                3702             4794            16135         19546
## 2                4309             4921            17924         20334
## 3                5356             5279            21200         22895
## 4                7152             5734            23795         27002
## 5                7192             6050            24866         29205
## 6                8378             6167            25534         30943
##   Unsafe.sex Diet.low.in.nuts.and.seeds
## 1        351                       2319
## 2        361                       2449
## 3        378                       2603
## 4        395                       2771
## 5        410                       2932
## 6        422                       3049
##   Household.air.pollution.from.solid.fuels Diet.low.in.Vegetables
## 1                                    34372                   3679
## 2                                    35392                   3732
## 3                                    38065                   3827
## 4                                    41154                   3951
## 5                                    43153                   4075
## 6                                    44024                   4153
##   Low.physical.activity Smoking High.fasting.plasma.glucose Air.pollution
## 1                  2637    5174                       11449         37231
## 2                  2652    5247                       11811         38315
## 3                  2688    5363                       12265         41172
## 4                  2744    5522                       12821         44488
## 5                  2805    5689                       13400         46634
## 6                  2839    5801                       13871         47566
##   High.body.mass.index Unsafe.sanitation No.access.to.handwashing.facility
## 1                 9518              2798                              4825
## 2                 9489              3254                              5127
## 3                 9528              4042                              5889
## 4                 9611              5392                              7007
## 5                 9675              5418                              7421
## 6                 9608              6313                              7896
##   Drug.use Low.bone.mineral.density Vitamin.A.deficiency Child.stunting
## 1      174                      389                 2016           7686
## 2      188                      389                 2056           7886
## 3      211                      393                 2100           8568
## 4      232                      411                 2316           9875
## 5      247                      413                 2665          11031
## 6      260                      417                 3070          11973
##   Discontinued.breastfeeding Non.exclusive.breastfeeding Iron.deficiency
## 1                        107                        2216             564
## 2                        121                        2501             611
## 3                        150                        3053             700
## 4                        204                        3726             773
## 5                        204                        3833             812
## 6                        233                        4124             848
# Number of distinct entities, then number of distinct years, in the dataset
n_distinct(data$Entity)
## [1] 228
n_distinct(data$Year)
## [1] 30
# Visualize some basic data
library(ggplot2)
library(dplyr)

# Line plot: yearly trend of deaths attributed to alcohol use in Afghanistan.
# Keep only the Afghanistan rows.
afghanistan_data <- data %>% filter(Entity == "Afghanistan")
# The column is named `Alochol.use` (sic) in the loaded data, so that spelling
# must be kept when referring to it; only the human-readable labels are fixed.
ggplot(afghanistan_data, aes(x = Year, y = Alochol.use)) +
  # `linewidth` replaces `size` for lines; `size` is deprecated for line
  # geoms since ggplot2 3.4.0 and triggers a lifecycle warning.
  geom_line(color = "blue", linewidth = 1) +
  ggtitle("Yearly Trend of Deaths from Alcohol Use in Afghanistan") +
  xlab("Year") +
  ylab("Deaths from Alcohol Use") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

##notes: Yearly Trend of Deaths from Alcohol Use in Afghanistan: This line plot shows the trend in deaths attributed to alcohol use in Afghanistan over the years. We can observe how the number of deaths has changed from 1990 onward.
# Bar plot: deaths from secondhand smoke in 1990 for a handful of countries
# Countries to compare
countries <- c("Afghanistan", "India", "China", "United States", "Germany")
# Keep the 1990 rows that belong to those countries
subset_data <- filter(data, Entity %in% countries & Year == 1990)
# One bar per country; bar height is taken directly from the data values
ggplot(subset_data, aes(x = Entity, y = Secondhand.smoke, fill = Entity)) +
  geom_col() +
  labs(
    title = "Comparison of Deaths from Secondhand Smoke (1990)",
    x = "Country",
    y = "Deaths from Secondhand Smoke"
  ) +
  theme_minimal()

##notes:Comparison of Deaths from Secondhand Smoke (1990): This bar chart compares deaths from secondhand smoke across five different countries in 1990. It helps in understanding how this risk factor impacts different nations.
# Faceted plot: total deaths per year for the five largest countries
# Only a subset of the columns is summed into 'total deaths' (chosen further
# below); re-print the structure to see the available column names.
str(data)
## 'data.frame':    6840 obs. of  31 variables:
##  $ Entity                                  : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Code                                    : chr  "AFG" "AFG" "AFG" "AFG" ...
##  $ Year                                    : int  1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 ...
##  $ Outdoor.air.pollution                   : int  3169 3222 3395 3623 3788 3869 3943 4024 4040 4042 ...
##  $ High.systolic.blood.pressure            : int  25633 25872 26309 26961 27658 28090 28587 29021 29349 29712 ...
##  $ Diet.high.in.sodium                     : int  1045 1055 1075 1103 1134 1154 1178 1202 1222 1242 ...
##  $ Diet.low.in.whole.grains                : int  7077 7149 7297 7499 7698 7807 7943 8075 8173 8265 ...
##  $ Alochol.use                             : int  356 364 376 389 399 406 413 420 425 426 ...
##  $ Diet.low.in.fruits                      : int  3185 3248 3351 3480 3610 3703 3819 3938 4038 4127 ...
##  $ Unsafe.water.source                     : int  3702 4309 5356 7152 7192 8378 8487 9348 9788 9931 ...
##  $ Secondhand.smoke                        : int  4794 4921 5279 5734 6050 6167 6298 6425 6402 6323 ...
##  $ Low.birth.weight                        : int  16135 17924 21200 23795 24866 25534 25997 26246 25805 25080 ...
##  $ Child.wasting                           : int  19546 20334 22895 27002 29205 30943 31628 32736 32760 32271 ...
##  $ Unsafe.sex                              : int  351 361 378 395 410 422 435 448 458 469 ...
##  $ Diet.low.in.nuts.and.seeds              : int  2319 2449 2603 2771 2932 3049 3173 3298 3401 3482 ...
##  $ Household.air.pollution.from.solid.fuels: int  34372 35392 38065 41154 43153 44024 45005 46017 46055 45681 ...
##  $ Diet.low.in.Vegetables                  : int  3679 3732 3827 3951 4075 4153 4247 4339 4409 4473 ...
##  $ Low.physical.activity                   : int  2637 2652 2688 2744 2805 2839 2878 2914 2944 2976 ...
##  $ Smoking                                 : int  5174 5247 5363 5522 5689 5801 5934 6066 6178 6288 ...
##  $ High.fasting.plasma.glucose             : int  11449 11811 12265 12821 13400 13871 14413 14970 15502 16058 ...
##  $ Air.pollution                           : int  37231 38315 41172 44488 46634 47566 48617 49703 49746 49349 ...
##  $ High.body.mass.index                    : int  9518 9489 9528 9611 9675 9608 9503 9286 9024 8857 ...
##  $ Unsafe.sanitation                       : int  2798 3254 4042 5392 5418 6313 6393 7038 7366 7468 ...
##  $ No.access.to.handwashing.facility       : int  4825 5127 5889 7007 7421 7896 8098 8507 8560 8424 ...
##  $ Drug.use                                : int  174 188 211 232 247 260 274 287 295 302 ...
##  $ Low.bone.mineral.density                : int  389 389 393 411 413 417 423 425 429 428 ...
##  $ Vitamin.A.deficiency                    : int  2016 2056 2100 2316 2665 3070 3214 3228 3413 3662 ...
##  $ Child.stunting                          : int  7686 7886 8568 9875 11031 11973 12426 12805 13011 13052 ...
##  $ Discontinued.breastfeeding              : int  107 121 150 204 204 233 233 255 264 263 ...
##  $ Non.exclusive.breastfeeding             : int  2216 2501 3053 3726 3833 4124 4183 4393 4417 4326 ...
##  $ Iron.deficiency                         : int  564 611 700 773 812 848 883 914 924 909 ...
# Columns that are added up into `Total_Deaths`. Note that this is a subset of
# the risk-factor columns, not every cause in the dataset.
death_cols <- c("Alochol.use", "Secondhand.smoke", 
                "Smoking", "Drug.use")
# Add a per-row total of the selected columns. rowSums() is vectorised, so a
# rowwise()/c_across() pass over every row is not needed. as_tibble() and
# as.integer() keep `data` a tibble with an integer `Total_Deaths` column,
# which is what the rowwise() version produced.
data <- data %>%
  as_tibble() %>%
  mutate(
    Total_Deaths = as.integer(rowSums(pick(all_of(death_cols)), na.rm = TRUE))
  )
# Countries shown in the faceted plot
biggest_countries <- c("China", "India", "United States", "Indonesia", "Brazil")
# Keep only the rows for those countries
filtered_data <- data %>%
  filter(Entity %in% biggest_countries)
# One panel per country; each panel gets its own y-axis range (free_y)
ggplot(filtered_data, aes(x = Year, y = Total_Deaths)) +
  # `linewidth` replaces `size`, which is deprecated for lines since
  # ggplot2 3.4.0.
  geom_line(color = "blue", linewidth = 1) +
  facet_wrap(~Entity, scales = "free_y") +
  ggtitle("Total Deaths Per Year for the Five Largest Countries") +
  xlab("Year") +
  ylab("Total Deaths") +
  theme_minimal()

##notes: The facet plot shows total deaths per year (here, the sum of deaths attributed to alcohol use, smoking, secondhand smoke, and drug use) for Brazil, China, India, Indonesia, and the United States from 1990 to 2019. Brazil, China, India, and Indonesia exhibit a steady rise, with notable acceleration after 2010, reflecting increasing health burdens in these populous nations; this is likely driven by population growth, possibly alongside broader factors such as pollution and rising rates of health-related issues like high blood pressure, which are not themselves included in this total. In contrast, the United States displays a peak around 2005, followed by a decline and then a resurgence toward the end of the period, possibly reflecting healthcare improvements and later emerging health challenges. These trends underscore the growing need for targeted health interventions to address key risk factors and improve overall public health outcomes in these countries.
#Heatmap for Correlation Between Various Causes of Death
library(reshape2)
## Warning: 程辑包'reshape2'是用R版本4.2.3 来建造的
## 
## 载入程辑包:'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(corrplot)
## corrplot 0.94 loaded
# Correlation matrix of the individual cause-of-death columns.
# The identifier columns (Entity, Code, Year) are dropped. The derived
# Total_Deaths column is dropped as well when it is present: it is a sum of
# other columns, so it would show up as an extra "cause" in the heatmap.
# any_of() makes this a no-op if the column has not been created.
numeric_data <- data %>%
  select(-Entity, -Code, -Year, -any_of("Total_Deaths"))
cor_matrix <- cor(numeric_data, use = "complete.obs")
# Draw the matrix as a colour-coded heatmap
corrplot(cor_matrix, method = "color", tl.cex = 0.7, number.cex = 0.7, 
         title = "Correlation Between Various Causes of Death", mar = c(0,0,1,0))

##notes:Correlation Between Various Causes of Death: The heatmap shows the correlation between different causes of death. Strong correlations may indicate that certain factors are linked and could influence each other in terms of health outcomes.
# b. Data cleaning and transformation:
# Count the NA entries in each column (one integer per column)
vapply(data, function(col) sum(is.na(col)), integer(1))
##                                   Entity 
##                                        0 
##                                     Code 
##                                        0 
##                                     Year 
##                                        0 
##                    Outdoor.air.pollution 
##                                        0 
##             High.systolic.blood.pressure 
##                                        0 
##                      Diet.high.in.sodium 
##                                        0 
##                 Diet.low.in.whole.grains 
##                                        0 
##                              Alochol.use 
##                                        0 
##                       Diet.low.in.fruits 
##                                        0 
##                      Unsafe.water.source 
##                                        0 
##                         Secondhand.smoke 
##                                        0 
##                         Low.birth.weight 
##                                        0 
##                            Child.wasting 
##                                        0 
##                               Unsafe.sex 
##                                        0 
##               Diet.low.in.nuts.and.seeds 
##                                        0 
## Household.air.pollution.from.solid.fuels 
##                                        0 
##                   Diet.low.in.Vegetables 
##                                        0 
##                    Low.physical.activity 
##                                        0 
##                                  Smoking 
##                                        0 
##              High.fasting.plasma.glucose 
##                                        0 
##                            Air.pollution 
##                                        0 
##                     High.body.mass.index 
##                                        0 
##                        Unsafe.sanitation 
##                                        0 
##        No.access.to.handwashing.facility 
##                                        0 
##                                 Drug.use 
##                                        0 
##                 Low.bone.mineral.density 
##                                        0 
##                     Vitamin.A.deficiency 
##                                        0 
##                           Child.stunting 
##                                        0 
##               Discontinued.breastfeeding 
##                                        0 
##              Non.exclusive.breastfeeding 
##                                        0 
##                          Iron.deficiency 
##                                        0 
##                             Total_Deaths 
##                                        0
# Collapse the per-column NA counts to their distinct values
unique(vapply(data, function(col) sum(is.na(col)), integer(1)))
## [1] 0
# Missing-value handling: drop every row that contains an NA
data_clean <- na.omit(object = data)
# Structure of the data before NA removal
str(object = data)
## tibble [6,840 × 32] (S3: tbl_df/tbl/data.frame)
##  $ Entity                                  : chr [1:6840] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Code                                    : chr [1:6840] "AFG" "AFG" "AFG" "AFG" ...
##  $ Year                                    : int [1:6840] 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 ...
##  $ Outdoor.air.pollution                   : int [1:6840] 3169 3222 3395 3623 3788 3869 3943 4024 4040 4042 ...
##  $ High.systolic.blood.pressure            : int [1:6840] 25633 25872 26309 26961 27658 28090 28587 29021 29349 29712 ...
##  $ Diet.high.in.sodium                     : int [1:6840] 1045 1055 1075 1103 1134 1154 1178 1202 1222 1242 ...
##  $ Diet.low.in.whole.grains                : int [1:6840] 7077 7149 7297 7499 7698 7807 7943 8075 8173 8265 ...
##  $ Alochol.use                             : int [1:6840] 356 364 376 389 399 406 413 420 425 426 ...
##  $ Diet.low.in.fruits                      : int [1:6840] 3185 3248 3351 3480 3610 3703 3819 3938 4038 4127 ...
##  $ Unsafe.water.source                     : int [1:6840] 3702 4309 5356 7152 7192 8378 8487 9348 9788 9931 ...
##  $ Secondhand.smoke                        : int [1:6840] 4794 4921 5279 5734 6050 6167 6298 6425 6402 6323 ...
##  $ Low.birth.weight                        : int [1:6840] 16135 17924 21200 23795 24866 25534 25997 26246 25805 25080 ...
##  $ Child.wasting                           : int [1:6840] 19546 20334 22895 27002 29205 30943 31628 32736 32760 32271 ...
##  $ Unsafe.sex                              : int [1:6840] 351 361 378 395 410 422 435 448 458 469 ...
##  $ Diet.low.in.nuts.and.seeds              : int [1:6840] 2319 2449 2603 2771 2932 3049 3173 3298 3401 3482 ...
##  $ Household.air.pollution.from.solid.fuels: int [1:6840] 34372 35392 38065 41154 43153 44024 45005 46017 46055 45681 ...
##  $ Diet.low.in.Vegetables                  : int [1:6840] 3679 3732 3827 3951 4075 4153 4247 4339 4409 4473 ...
##  $ Low.physical.activity                   : int [1:6840] 2637 2652 2688 2744 2805 2839 2878 2914 2944 2976 ...
##  $ Smoking                                 : int [1:6840] 5174 5247 5363 5522 5689 5801 5934 6066 6178 6288 ...
##  $ High.fasting.plasma.glucose             : int [1:6840] 11449 11811 12265 12821 13400 13871 14413 14970 15502 16058 ...
##  $ Air.pollution                           : int [1:6840] 37231 38315 41172 44488 46634 47566 48617 49703 49746 49349 ...
##  $ High.body.mass.index                    : int [1:6840] 9518 9489 9528 9611 9675 9608 9503 9286 9024 8857 ...
##  $ Unsafe.sanitation                       : int [1:6840] 2798 3254 4042 5392 5418 6313 6393 7038 7366 7468 ...
##  $ No.access.to.handwashing.facility       : int [1:6840] 4825 5127 5889 7007 7421 7896 8098 8507 8560 8424 ...
##  $ Drug.use                                : int [1:6840] 174 188 211 232 247 260 274 287 295 302 ...
##  $ Low.bone.mineral.density                : int [1:6840] 389 389 393 411 413 417 423 425 429 428 ...
##  $ Vitamin.A.deficiency                    : int [1:6840] 2016 2056 2100 2316 2665 3070 3214 3228 3413 3662 ...
##  $ Child.stunting                          : int [1:6840] 7686 7886 8568 9875 11031 11973 12426 12805 13011 13052 ...
##  $ Discontinued.breastfeeding              : int [1:6840] 107 121 150 204 204 233 233 255 264 263 ...
##  $ Non.exclusive.breastfeeding             : int [1:6840] 2216 2501 3053 3726 3833 4124 4183 4393 4417 4326 ...
##  $ Iron.deficiency                         : int [1:6840] 564 611 700 773 812 848 883 914 924 909 ...
##  $ Total_Deaths                            : int [1:6840] 10498 10720 11229 11877 12385 12634 12919 13198 13300 13339 ...
# Structure of the NA-filtered copy, for comparison with str(data)
str(data_clean)
## tibble [6,840 × 32] (S3: tbl_df/tbl/data.frame)
##  $ Entity                                  : chr [1:6840] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Code                                    : chr [1:6840] "AFG" "AFG" "AFG" "AFG" ...
##  $ Year                                    : int [1:6840] 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 ...
##  $ Outdoor.air.pollution                   : int [1:6840] 3169 3222 3395 3623 3788 3869 3943 4024 4040 4042 ...
##  $ High.systolic.blood.pressure            : int [1:6840] 25633 25872 26309 26961 27658 28090 28587 29021 29349 29712 ...
##  $ Diet.high.in.sodium                     : int [1:6840] 1045 1055 1075 1103 1134 1154 1178 1202 1222 1242 ...
##  $ Diet.low.in.whole.grains                : int [1:6840] 7077 7149 7297 7499 7698 7807 7943 8075 8173 8265 ...
##  $ Alochol.use                             : int [1:6840] 356 364 376 389 399 406 413 420 425 426 ...
##  $ Diet.low.in.fruits                      : int [1:6840] 3185 3248 3351 3480 3610 3703 3819 3938 4038 4127 ...
##  $ Unsafe.water.source                     : int [1:6840] 3702 4309 5356 7152 7192 8378 8487 9348 9788 9931 ...
##  $ Secondhand.smoke                        : int [1:6840] 4794 4921 5279 5734 6050 6167 6298 6425 6402 6323 ...
##  $ Low.birth.weight                        : int [1:6840] 16135 17924 21200 23795 24866 25534 25997 26246 25805 25080 ...
##  $ Child.wasting                           : int [1:6840] 19546 20334 22895 27002 29205 30943 31628 32736 32760 32271 ...
##  $ Unsafe.sex                              : int [1:6840] 351 361 378 395 410 422 435 448 458 469 ...
##  $ Diet.low.in.nuts.and.seeds              : int [1:6840] 2319 2449 2603 2771 2932 3049 3173 3298 3401 3482 ...
##  $ Household.air.pollution.from.solid.fuels: int [1:6840] 34372 35392 38065 41154 43153 44024 45005 46017 46055 45681 ...
##  $ Diet.low.in.Vegetables                  : int [1:6840] 3679 3732 3827 3951 4075 4153 4247 4339 4409 4473 ...
##  $ Low.physical.activity                   : int [1:6840] 2637 2652 2688 2744 2805 2839 2878 2914 2944 2976 ...
##  $ Smoking                                 : int [1:6840] 5174 5247 5363 5522 5689 5801 5934 6066 6178 6288 ...
##  $ High.fasting.plasma.glucose             : int [1:6840] 11449 11811 12265 12821 13400 13871 14413 14970 15502 16058 ...
##  $ Air.pollution                           : int [1:6840] 37231 38315 41172 44488 46634 47566 48617 49703 49746 49349 ...
##  $ High.body.mass.index                    : int [1:6840] 9518 9489 9528 9611 9675 9608 9503 9286 9024 8857 ...
##  $ Unsafe.sanitation                       : int [1:6840] 2798 3254 4042 5392 5418 6313 6393 7038 7366 7468 ...
##  $ No.access.to.handwashing.facility       : int [1:6840] 4825 5127 5889 7007 7421 7896 8098 8507 8560 8424 ...
##  $ Drug.use                                : int [1:6840] 174 188 211 232 247 260 274 287 295 302 ...
##  $ Low.bone.mineral.density                : int [1:6840] 389 389 393 411 413 417 423 425 429 428 ...
##  $ Vitamin.A.deficiency                    : int [1:6840] 2016 2056 2100 2316 2665 3070 3214 3228 3413 3662 ...
##  $ Child.stunting                          : int [1:6840] 7686 7886 8568 9875 11031 11973 12426 12805 13011 13052 ...
##  $ Discontinued.breastfeeding              : int [1:6840] 107 121 150 204 204 233 233 255 264 263 ...
##  $ Non.exclusive.breastfeeding             : int [1:6840] 2216 2501 3053 3726 3833 4124 4183 4393 4417 4326 ...
##  $ Iron.deficiency                         : int [1:6840] 564 611 700 773 812 848 883 914 924 909 ...
##  $ Total_Deaths                            : int [1:6840] 10498 10720 11229 11877 12385 12634 12919 13198 13300 13339 ...
# Alternatively, fill missing values with the column mean (numeric columns only) -- not needed for this dataset

# Data Distribution (Skewness) Visualization
# Histogram of Total_Deaths (the combined alcohol use, smoking, secondhand
# smoke and drug use deaths) to check how skewed the distribution is.
# The number of bins is left at the stat_bin() default.
ggplot(data_clean, aes(x = Total_Deaths)) +
  geom_histogram(fill = "blue", color = "black") +
  ggtitle("Distribution of Total Deaths from Alcohol Use, Smoking, Secondhand Smoking, and Drug Use") +
  xlab("Total deaths from alcohol use, smoking, secondhand smoking, and drug use") +
  ylab("Frequency") +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Visualizing Outliers
# Single boxplot of Total_Deaths; no grouping variable is mapped to x, so
# the x aesthetic is simply omitted instead of being passed as an empty
# argument.
ggplot(data_clean, aes(y = Total_Deaths)) +
  geom_boxplot() +
  ggtitle("Boxplot for Total Deaths from Alcohol Use, Smoking, Secondhand Smoking, and Drug Use") +
  ylab("Total Deaths")

# Some algorithms expect features on a comparable scale, so the measure
# columns are separated out here for rescaling.
library(dplyr)
# Drop the identifier columns 'Entity', 'Code', and 'Year'; everything that
# remains is passed on to the normalization step.
numeric_columns <- select(data_clean, -c(Entity, Code, Year))
# Min-max normalization: rescale a numeric vector linearly onto [0, 1].
#
# Arguments:
#   x      Numeric vector to rescale.
#   na.rm  If TRUE (default), NAs are ignored when finding the range and
#          stay NA in the result. If FALSE, any NA makes the whole result
#          NA, as plain min()/max() would.
#
# Returns a numeric vector of the same length as x. A vector whose values
# are all equal has zero range; it is mapped to 0 instead of the NaN that
# 0 / 0 would produce.
normalize <- function(x, na.rm = TRUE) {
  # Nothing to scale: avoids range() warning on empty / all-NA input.
  if (length(x) == 0L || all(is.na(x))) {
    return(as.numeric(x))
  }
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  if (is.na(span)) {
    # Range is undefined (e.g. na.rm = FALSE with NAs present).
    return(rep(NA_real_, length(x)))
  }
  if (span == 0) {
    # Constant vector: every non-missing value sits at the minimum.
    return(as.numeric(x - rng[1]))
  }
  (x - rng[1]) / span
}
# Rescale every measure column with normalize(), then re-attach the
# identifier columns. The identifiers are taken from data_clean -- the same
# frame numeric_columns was derived from -- so each row keeps its own
# Entity/Code/Year even if the cleaning step drops or reorders rows.
# Year is carried over unscaled.
data_clean_normalized <- as.data.frame(lapply(numeric_columns, normalize))
data_clean_normalized <- cbind(
  data_clean %>% select(Entity, Code, Year),
  data_clean_normalized
)
head(data_clean_normalized)
##        Entity Code Year Outdoor.air.pollution High.systolic.blood.pressure
## 1 Afghanistan  AFG 1990          0.0007032544                  0.002363264
## 2 Afghanistan  AFG 1991          0.0007150160                  0.002385301
## 3 Afghanistan  AFG 1992          0.0007534076                  0.002425594
## 4 Afghanistan  AFG 1993          0.0008040046                  0.002485710
## 5 Afghanistan  AFG 1994          0.0008406209                  0.002549976
## 6 Afghanistan  AFG 1995          0.0008585962                  0.002589808
##   Diet.high.in.sodium Diet.low.in.whole.grains  Alochol.use Diet.low.in.fruits
## 1        0.0005542720              0.003836113 0.0001457838        0.003044889
## 2        0.0005595760              0.003875141 0.0001490598        0.003105118
## 3        0.0005701841              0.003955365 0.0001539739        0.003203587
## 4        0.0005850354              0.004064860 0.0001592974        0.003326912
## 5        0.0006014779              0.004172729 0.0001633925        0.003451193
## 6        0.0006120860              0.004231812 0.0001662590        0.003540102
##   Unsafe.water.source Secondhand.smoke Low.birth.weight Child.wasting
## 1         0.001510438      0.003674720      0.005319070   0.005697841
## 2         0.001758098      0.003772089      0.005908832   0.005927551
## 3         0.002185280      0.004046562      0.006988800   0.006674106
## 4         0.002918059      0.004395404      0.007844268   0.007871335
## 5         0.002934380      0.004637676      0.008197335   0.008513530
## 6         0.003418275      0.004727378      0.008417548   0.009020173
##     Unsafe.sex Diet.low.in.nuts.and.seeds
## 1 0.0002108345                0.004032069
## 2 0.0002168412                0.004258101
## 3 0.0002270525                0.004525862
## 4 0.0002372639                0.004817966
## 5 0.0002462739                0.005097898
## 6 0.0002534819                0.005301327
##   Household.air.pollution.from.solid.fuels Diet.low.in.Vegetables
## 1                              0.007886717            0.006949626
## 2                              0.008120758            0.007049743
## 3                              0.008734082            0.007229198
## 4                              0.009442859            0.007463434
## 5                              0.009901533            0.007697670
## 6                              0.010101386            0.007845011
##   Low.physical.activity      Smoking High.fasting.plasma.glucose Air.pollution
## 1           0.003171369 0.0006723974                 0.001760545   0.005580403
## 2           0.003189409 0.0006818861                 0.001816226   0.005742880
## 3           0.003232704 0.0006969640                 0.001886057   0.006171104
## 4           0.003300052 0.0007176312                 0.001971577   0.006668126
## 5           0.003373413 0.0007393382                 0.002060635   0.006989781
## 6           0.003414303 0.0007538962                 0.002133081   0.007129474
##   High.body.mass.index Unsafe.sanitation No.access.to.handwashing.facility
## 1          0.001895860       0.001518774                       0.004019664
## 2          0.001890082       0.001766294                       0.004271258
## 3          0.001897852       0.002194026                       0.004906073
## 4          0.001914388       0.002926816                       0.005837469
## 5          0.001927139       0.002940929                       0.006182369
## 6          0.001913791       0.003426741                       0.006578087
##       Drug.use Low.bone.mineral.density Vitamin.A.deficiency Child.stunting
## 1 0.0003518763             0.0008883631          0.009713088    0.009221920
## 2 0.0003801882             0.0008883631          0.009905808    0.009461887
## 3 0.0004267005             0.0008974980          0.010117800    0.010280173
## 4 0.0004691684             0.0009386047          0.011158488    0.011848355
## 5 0.0004995025             0.0009431722          0.012839970    0.013235363
## 6 0.0005257921             0.0009523070          0.014791260    0.014365606
##   Discontinued.breastfeeding Non.exclusive.breastfeeding Iron.deficiency
## 1                0.003232043                 0.004384039     0.007677543
## 2                0.003654927                 0.004947870     0.008317338
## 3                0.004530901                 0.006039923     0.009528866
## 4                0.006162025                 0.007371357     0.010522590
## 5                0.006162025                 0.007583042     0.011053484
## 6                0.007037999                 0.008158743     0.011543540
##   Total_Deaths
## 1 0.0008794930
## 2 0.0008980950
## 3 0.0009407458
## 4 0.0009950437
## 5 0.0010376106
## 6 0.0010584751
# Visualizing Outliers
# Boxplot of one normalized column (Outdoor.air.pollution). The values are
# min-max scaled to [0, 1], not raw death counts, so the labels say so.
# No grouping variable is mapped to x, so the x aesthetic is omitted.
ggplot(data_clean_normalized, aes(y = Outdoor.air.pollution)) +
  geom_boxplot() +
  ggtitle("Boxplot for Normalized Outdoor Air Pollution Deaths") +
  ylab("Outdoor Air Pollution Deaths (min-max normalized)")

# Load GDP per capita (current US$) per country and year, to relate the
# death figures to each country's socio-economic status.
# NOTE(review): the file name looks like a World Bank indicator export --
# confirm the source.
GDPpC <- read.csv(file = "API_NY.GDP.PCAP.CD_DS2_en_csv_v2_31681.csv")
str(object = GDPpC)
## 'data.frame':    266 obs. of  68 variables:
##  $ Country.Name  : chr  "Aruba" "Africa Eastern and Southern" "Afghanistan" "Africa Western and Central" ...
##  $ Country.Code  : chr  "ABW" "AFE" "AFG" "AFW" ...
##  $ Indicator.Name: chr  "GDP per capita (current US$)" "GDP per capita (current US$)" "GDP per capita (current US$)" "GDP per capita (current US$)" ...
##  $ Indicator.Code: chr  "NY.GDP.PCAP.CD" "NY.GDP.PCAP.CD" "NY.GDP.PCAP.CD" "NY.GDP.PCAP.CD" ...
##  $ X1960         : num  NA 162 NA 122 NA ...
##  $ X1961         : num  NA 166 NA 128 NA ...
##  $ X1962         : num  NA 172 NA 134 NA ...
##  $ X1963         : num  NA 182 NA 139 NA ...
##  $ X1964         : num  NA 193 NA 149 NA ...
##  $ X1965         : num  NA 203 NA 156 NA ...
##  $ X1966         : num  NA 215 NA 163 NA ...
##  $ X1967         : num  NA 227 NA 146 NA ...
##  $ X1968         : num  NA 238 NA 147 NA ...
##  $ X1969         : num  NA 256 NA 163 NA ...
##  $ X1970         : num  NA 253 NA 220 NA ...
##  $ X1971         : num  NA 268 NA 197 NA ...
##  $ X1972         : num  NA 283 NA 232 NA ...
##  $ X1973         : num  NA 354 NA 283 NA ...
##  $ X1974         : num  NA 422 NA 371 NA ...
##  $ X1975         : num  NA 436 NA 416 NA ...
##  $ X1976         : num  NA 430 NA 484 NA ...
##  $ X1977         : num  NA 468 NA 495 NA ...
##  $ X1978         : num  NA 509 NA 528 NA ...
##  $ X1979         : num  NA 579 NA 630 NA ...
##  $ X1980         : num  NA 728 NA 764 NA ...
##  $ X1981         : num  NA 747 NA 1336 NA ...
##  $ X1982         : num  NA 690 NA 1173 NA ...
##  $ X1983         : num  NA 712 NA 881 NA ...
##  $ X1984         : num  NA 639 NA 745 NA ...
##  $ X1985         : num  NA 535 NA 762 NA ...
##  $ X1986         : num  6283 561 NA 588 NA ...
##  $ X1987         : num  7567 653 NA 589 NA ...
##  $ X1988         : num  9275 690 NA 568 NA ...
##  $ X1989         : num  10767 712 NA 516 NA ...
##  $ X1990         : num  11639 811 NA 598 NA ...
##  $ X1991         : num  12850 859 NA 612 NA ...
##  $ X1992         : num  13658 732 NA 572 NA ...
##  $ X1993         : num  14970 716 NA 580 451 ...
##  $ X1994         : num  16675 707 NA 587 329 ...
##  $ X1995         : num  17140 774 NA 878 398 ...
##  $ X1996         : num  17375 751 NA 1084 454 ...
##  $ X1997         : num  18713 775 NA 1109 516 ...
##  $ X1998         : num  19742 704 NA 1159 423 ...
##  $ X1999         : num  19834 678 NA 532 388 ...
##  $ X2000         : num  21026 715 180 527 557 ...
##  $ X2001         : num  20911 633 143 539 527 ...
##  $ X2002         : num  21375 634 182 627 873 ...
##  $ X2003         : num  22051 820 200 706 983 ...
##  $ X2004         : num  24106 994 222 850 1255 ...
##  $ X2005         : num  24978 1130 254 1008 1901 ...
##  $ X2006         : num  25833 1236 274 1246 2598 ...
##  $ X2007         : num  27665 1380 376 1421 3121 ...
##  $ X2008         : num  29012 1439 383 1686 4082 ...
##  $ X2009         : num  25741 1405 453 1468 3124 ...
##  $ X2010         : num  24453 1623 562 1680 3587 ...
##  $ X2011         : num  26043 1758 609 1862 4608 ...
##  $ X2012         : num  25611 1724 653 1958 5084 ...
##  $ X2013         : num  26515 1696 639 2154 5061 ...
##  $ X2014         : num  26940 1679 627 2249 5012 ...
##  $ X2015         : num  28419 1499 567 1883 3217 ...
##  $ X2016         : num  28450 1346 523 1649 1810 ...
##  $ X2017         : num  29329 1486 526 1591 2439 ...
##  $ X2018         : num  30918 1559 492 1735 2541 ...
##  $ X2019         : num  31903 1508 498 1814 2191 ...
##  $ X2020         : num  24008 1356 512 1688 1451 ...
##  $ X2021         : num  29128 1546 356 1769 1927 ...
##  $ X2022         : num  33301 1642 353 1789 2933 ...
##  $ X2023         : num  NA 1673 NA 1584 2310 ...
# Prepare the wide GDP table for reshaping with reshape2::melt():
# keep Country.Code plus the yearly X<year> columns and drop the
# descriptive columns that are not needed for the join.
library(reshape2)
GDPpC_adj <- select(GDPpC, -c(Country.Name, Indicator.Name, Indicator.Code))
str(object = GDPpC_adj)
## 'data.frame':    266 obs. of  65 variables:
##  $ Country.Code: chr  "ABW" "AFE" "AFG" "AFW" ...
##  $ X1960       : num  NA 162 NA 122 NA ...
##  $ X1961       : num  NA 166 NA 128 NA ...
##  $ X1962       : num  NA 172 NA 134 NA ...
##  $ X1963       : num  NA 182 NA 139 NA ...
##  $ X1964       : num  NA 193 NA 149 NA ...
##  $ X1965       : num  NA 203 NA 156 NA ...
##  $ X1966       : num  NA 215 NA 163 NA ...
##  $ X1967       : num  NA 227 NA 146 NA ...
##  $ X1968       : num  NA 238 NA 147 NA ...
##  $ X1969       : num  NA 256 NA 163 NA ...
##  $ X1970       : num  NA 253 NA 220 NA ...
##  $ X1971       : num  NA 268 NA 197 NA ...
##  $ X1972       : num  NA 283 NA 232 NA ...
##  $ X1973       : num  NA 354 NA 283 NA ...
##  $ X1974       : num  NA 422 NA 371 NA ...
##  $ X1975       : num  NA 436 NA 416 NA ...
##  $ X1976       : num  NA 430 NA 484 NA ...
##  $ X1977       : num  NA 468 NA 495 NA ...
##  $ X1978       : num  NA 509 NA 528 NA ...
##  $ X1979       : num  NA 579 NA 630 NA ...
##  $ X1980       : num  NA 728 NA 764 NA ...
##  $ X1981       : num  NA 747 NA 1336 NA ...
##  $ X1982       : num  NA 690 NA 1173 NA ...
##  $ X1983       : num  NA 712 NA 881 NA ...
##  $ X1984       : num  NA 639 NA 745 NA ...
##  $ X1985       : num  NA 535 NA 762 NA ...
##  $ X1986       : num  6283 561 NA 588 NA ...
##  $ X1987       : num  7567 653 NA 589 NA ...
##  $ X1988       : num  9275 690 NA 568 NA ...
##  $ X1989       : num  10767 712 NA 516 NA ...
##  $ X1990       : num  11639 811 NA 598 NA ...
##  $ X1991       : num  12850 859 NA 612 NA ...
##  $ X1992       : num  13658 732 NA 572 NA ...
##  $ X1993       : num  14970 716 NA 580 451 ...
##  $ X1994       : num  16675 707 NA 587 329 ...
##  $ X1995       : num  17140 774 NA 878 398 ...
##  $ X1996       : num  17375 751 NA 1084 454 ...
##  $ X1997       : num  18713 775 NA 1109 516 ...
##  $ X1998       : num  19742 704 NA 1159 423 ...
##  $ X1999       : num  19834 678 NA 532 388 ...
##  $ X2000       : num  21026 715 180 527 557 ...
##  $ X2001       : num  20911 633 143 539 527 ...
##  $ X2002       : num  21375 634 182 627 873 ...
##  $ X2003       : num  22051 820 200 706 983 ...
##  $ X2004       : num  24106 994 222 850 1255 ...
##  $ X2005       : num  24978 1130 254 1008 1901 ...
##  $ X2006       : num  25833 1236 274 1246 2598 ...
##  $ X2007       : num  27665 1380 376 1421 3121 ...
##  $ X2008       : num  29012 1439 383 1686 4082 ...
##  $ X2009       : num  25741 1405 453 1468 3124 ...
##  $ X2010       : num  24453 1623 562 1680 3587 ...
##  $ X2011       : num  26043 1758 609 1862 4608 ...
##  $ X2012       : num  25611 1724 653 1958 5084 ...
##  $ X2013       : num  26515 1696 639 2154 5061 ...
##  $ X2014       : num  26940 1679 627 2249 5012 ...
##  $ X2015       : num  28419 1499 567 1883 3217 ...
##  $ X2016       : num  28450 1346 523 1649 1810 ...
##  $ X2017       : num  29329 1486 526 1591 2439 ...
##  $ X2018       : num  30918 1559 492 1735 2541 ...
##  $ X2019       : num  31903 1508 498 1814 2191 ...
##  $ X2020       : num  24008 1356 512 1688 1451 ...
##  $ X2021       : num  29128 1546 356 1769 1927 ...
##  $ X2022       : num  33301 1642 353 1789 2933 ...
##  $ X2023       : num  NA 1673 NA 1584 2310 ...
# Wide -> long: one row per (country code, year column).
#   id.vars        Country.Code is kept as the row identifier
#   variable.name  the former column names (X1960, ...) go into "Year"
#   value.name     the cell values go into "GDP_per_Capita"
GDPpC_long <- melt(
  GDPpC_adj,
  id.vars = "Country.Code",
  variable.name = "Year",
  value.name = "GDP_per_Capita"
)

str(object = GDPpC_long)
## 'data.frame':    17024 obs. of  3 variables:
##  $ Country.Code  : chr  "ABW" "AFE" "AFG" "AFW" ...
##  $ Year          : Factor w/ 64 levels "X1960","X1961",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ GDP_per_Capita: num  NA 162 NA 122 NA ...
# Year currently holds labels such as "X1960" (as a factor); strip the
# literal "X" and store the result as a number so it can be matched
# against the Year column of the death data.
GDPpC_long[["Year"]] <- as.numeric(
  gsub("X", "", as.character(GDPpC_long[["Year"]]), fixed = TRUE)
)

head(x = GDPpC_long, n = 6L)
##   Country.Code Year GDP_per_Capita
## 1          ABW 1960             NA
## 2          AFE 1960       162.3425
## 3          AFG 1960             NA
## 4          AFW 1960       122.1939
## 5          AGO 1960             NA
## 6          ALB 1960             NA
# Join the death data with GDP per capita on country code and year
# (Code/Year on the left, Country.Code/Year on the right).
# merge() keeps only rows with a match in both tables by default, so
# entries without a matching code/year pair are dropped.
library(dplyr)
merged_data <- merge(
  x = data_clean,
  y = GDPpC_long,
  by.x = c("Code", "Year"),
  by.y = c("Country.Code", "Year")
)
str(object = merged_data)
## 'data.frame':    6000 obs. of  33 variables:
##  $ Code                                    : chr  "AFG" "AFG" "AFG" "AFG" ...
##  $ Year                                    : int  1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 ...
##  $ Entity                                  : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Outdoor.air.pollution                   : int  3169 3222 3395 3623 3788 3869 3943 4024 4040 4042 ...
##  $ High.systolic.blood.pressure            : int  25633 25872 26309 26961 27658 28090 28587 29021 29349 29712 ...
##  $ Diet.high.in.sodium                     : int  1045 1055 1075 1103 1134 1154 1178 1202 1222 1242 ...
##  $ Diet.low.in.whole.grains                : int  7077 7149 7297 7499 7698 7807 7943 8075 8173 8265 ...
##  $ Alochol.use                             : int  356 364 376 389 399 406 413 420 425 426 ...
##  $ Diet.low.in.fruits                      : int  3185 3248 3351 3480 3610 3703 3819 3938 4038 4127 ...
##  $ Unsafe.water.source                     : int  3702 4309 5356 7152 7192 8378 8487 9348 9788 9931 ...
##  $ Secondhand.smoke                        : int  4794 4921 5279 5734 6050 6167 6298 6425 6402 6323 ...
##  $ Low.birth.weight                        : int  16135 17924 21200 23795 24866 25534 25997 26246 25805 25080 ...
##  $ Child.wasting                           : int  19546 20334 22895 27002 29205 30943 31628 32736 32760 32271 ...
##  $ Unsafe.sex                              : int  351 361 378 395 410 422 435 448 458 469 ...
##  $ Diet.low.in.nuts.and.seeds              : int  2319 2449 2603 2771 2932 3049 3173 3298 3401 3482 ...
##  $ Household.air.pollution.from.solid.fuels: int  34372 35392 38065 41154 43153 44024 45005 46017 46055 45681 ...
##  $ Diet.low.in.Vegetables                  : int  3679 3732 3827 3951 4075 4153 4247 4339 4409 4473 ...
##  $ Low.physical.activity                   : int  2637 2652 2688 2744 2805 2839 2878 2914 2944 2976 ...
##  $ Smoking                                 : int  5174 5247 5363 5522 5689 5801 5934 6066 6178 6288 ...
##  $ High.fasting.plasma.glucose             : int  11449 11811 12265 12821 13400 13871 14413 14970 15502 16058 ...
##  $ Air.pollution                           : int  37231 38315 41172 44488 46634 47566 48617 49703 49746 49349 ...
##  $ High.body.mass.index                    : int  9518 9489 9528 9611 9675 9608 9503 9286 9024 8857 ...
##  $ Unsafe.sanitation                       : int  2798 3254 4042 5392 5418 6313 6393 7038 7366 7468 ...
##  $ No.access.to.handwashing.facility       : int  4825 5127 5889 7007 7421 7896 8098 8507 8560 8424 ...
##  $ Drug.use                                : int  174 188 211 232 247 260 274 287 295 302 ...
##  $ Low.bone.mineral.density                : int  389 389 393 411 413 417 423 425 429 428 ...
##  $ Vitamin.A.deficiency                    : int  2016 2056 2100 2316 2665 3070 3214 3228 3413 3662 ...
##  $ Child.stunting                          : int  7686 7886 8568 9875 11031 11973 12426 12805 13011 13052 ...
##  $ Discontinued.breastfeeding              : int  107 121 150 204 204 233 233 255 264 263 ...
##  $ Non.exclusive.breastfeeding             : int  2216 2501 3053 3726 3833 4124 4183 4393 4417 4326 ...
##  $ Iron.deficiency                         : int  564 611 700 773 812 848 883 914 924 909 ...
##  $ Total_Deaths                            : int  10498 10720 11229 11877 12385 12634 12919 13198 13300 13339 ...
##  $ GDP_per_Capita                          : num  NA NA NA NA NA NA NA NA NA NA ...
# Preview the first six rows of the merged table.
head(x = merged_data, n = 6L)
##   Code Year      Entity Outdoor.air.pollution High.systolic.blood.pressure
## 1  AFG 1990 Afghanistan                  3169                        25633
## 2  AFG 1991 Afghanistan                  3222                        25872
## 3  AFG 1992 Afghanistan                  3395                        26309
## 4  AFG 1993 Afghanistan                  3623                        26961
## 5  AFG 1994 Afghanistan                  3788                        27658
## 6  AFG 1995 Afghanistan                  3869                        28090
##   Diet.high.in.sodium Diet.low.in.whole.grains Alochol.use Diet.low.in.fruits
## 1                1045                     7077         356               3185
## 2                1055                     7149         364               3248
## 3                1075                     7297         376               3351
## 4                1103                     7499         389               3480
## 5                1134                     7698         399               3610
## 6                1154                     7807         406               3703
##   Unsafe.water.source Secondhand.smoke Low.birth.weight Child.wasting
## 1                3702             4794            16135         19546
## 2                4309             4921            17924         20334
## 3                5356             5279            21200         22895
## 4                7152             5734            23795         27002
## 5                7192             6050            24866         29205
## 6                8378             6167            25534         30943
##   Unsafe.sex Diet.low.in.nuts.and.seeds
## 1        351                       2319
## 2        361                       2449
## 3        378                       2603
## 4        395                       2771
## 5        410                       2932
## 6        422                       3049
##   Household.air.pollution.from.solid.fuels Diet.low.in.Vegetables
## 1                                    34372                   3679
## 2                                    35392                   3732
## 3                                    38065                   3827
## 4                                    41154                   3951
## 5                                    43153                   4075
## 6                                    44024                   4153
##   Low.physical.activity Smoking High.fasting.plasma.glucose Air.pollution
## 1                  2637    5174                       11449         37231
## 2                  2652    5247                       11811         38315
## 3                  2688    5363                       12265         41172
## 4                  2744    5522                       12821         44488
## 5                  2805    5689                       13400         46634
## 6                  2839    5801                       13871         47566
##   High.body.mass.index Unsafe.sanitation No.access.to.handwashing.facility
## 1                 9518              2798                              4825
## 2                 9489              3254                              5127
## 3                 9528              4042                              5889
## 4                 9611              5392                              7007
## 5                 9675              5418                              7421
## 6                 9608              6313                              7896
##   Drug.use Low.bone.mineral.density Vitamin.A.deficiency Child.stunting
## 1      174                      389                 2016           7686
## 2      188                      389                 2056           7886
## 3      211                      393                 2100           8568
## 4      232                      411                 2316           9875
## 5      247                      413                 2665          11031
## 6      260                      417                 3070          11973
##   Discontinued.breastfeeding Non.exclusive.breastfeeding Iron.deficiency
## 1                        107                        2216             564
## 2                        121                        2501             611
## 3                        150                        3053             700
## 4                        204                        3726             773
## 5                        204                        3833             812
## 6                        233                        4124             848
##   Total_Deaths GDP_per_Capita
## 1        10498             NA
## 2        10720             NA
## 3        11229             NA
## 4        11877             NA
## 5        12385             NA
## 6        12634             NA
# Count the missing values in every column of the merged table
# (done before deciding which rows to drop).
vapply(merged_data, function(column) sum(is.na(column)), integer(1))
##                                     Code 
##                                        0 
##                                     Year 
##                                        0 
##                                   Entity 
##                                        0 
##                    Outdoor.air.pollution 
##                                        0 
##             High.systolic.blood.pressure 
##                                        0 
##                      Diet.high.in.sodium 
##                                        0 
##                 Diet.low.in.whole.grains 
##                                        0 
##                              Alochol.use 
##                                        0 
##                       Diet.low.in.fruits 
##                                        0 
##                      Unsafe.water.source 
##                                        0 
##                         Secondhand.smoke 
##                                        0 
##                         Low.birth.weight 
##                                        0 
##                            Child.wasting 
##                                        0 
##                               Unsafe.sex 
##                                        0 
##               Diet.low.in.nuts.and.seeds 
##                                        0 
## Household.air.pollution.from.solid.fuels 
##                                        0 
##                   Diet.low.in.Vegetables 
##                                        0 
##                    Low.physical.activity 
##                                        0 
##                                  Smoking 
##                                        0 
##              High.fasting.plasma.glucose 
##                                        0 
##                            Air.pollution 
##                                        0 
##                     High.body.mass.index 
##                                        0 
##                        Unsafe.sanitation 
##                                        0 
##        No.access.to.handwashing.facility 
##                                        0 
##                                 Drug.use 
##                                        0 
##                 Low.bone.mineral.density 
##                                        0 
##                     Vitamin.A.deficiency 
##                                        0 
##                           Child.stunting 
##                                        0 
##               Discontinued.breastfeeding 
##                                        0 
##              Non.exclusive.breastfeeding 
##                                        0 
##                          Iron.deficiency 
##                                        0 
##                             Total_Deaths 
##                                        0 
##                           GDP_per_Capita 
##                                      172
# Distinct per-column NA counts: a compact check of how many different
# levels of missingness occur across the columns.
unique(vapply(merged_data, function(column) sum(is.na(column)), integer(1)))
## [1]   0 172
# Drop every row that contains a missing value; na.omit() records the
# removed row indices in the "na.action" attribute of the result.
merged_data_clean <- na.omit(object = merged_data)
str(object = merged_data_clean)
## 'data.frame':    5828 obs. of  33 variables:
##  $ Code                                    : chr  "AFG" "AFG" "AFG" "AFG" ...
##  $ Year                                    : int  2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 ...
##  $ Entity                                  : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Outdoor.air.pollution                   : int  4021 4014 3961 4116 4176 4176 4232 4480 4767 5038 ...
##  $ High.systolic.blood.pressure            : int  29999 30421 30189 30157 30225 30089 30075 30080 30219 30280 ...
##  $ Diet.high.in.sodium                     : int  1260 1282 1275 1277 1281 1276 1270 1263 1261 1254 ...
##  $ Diet.low.in.whole.grains                : int  8328 8440 8383 8398 8433 8415 8418 8426 8474 8488 ...
##  $ Alochol.use                             : int  427 432 432 437 445 452 456 465 478 485 ...
##  $ Diet.low.in.fruits                      : int  4174 4226 4184 4179 4188 4166 4142 4108 4088 4050 ...
##  $ Unsafe.water.source                     : int  9942 10052 10004 10841 10761 10118 9081 8168 7245 6437 ...
##  $ Secondhand.smoke                        : int  6227 6214 6103 6341 6383 6272 6153 6010 5868 5752 ...
##  $ Low.birth.weight                        : int  24549 24859 25441 25617 25459 25149 24354 23778 23533 23232 ...
##  $ Child.wasting                           : int  31559 30938 29619 29098 28167 26669 24987 23100 21624 20264 ...
##  $ Unsafe.sex                              : int  482 500 507 518 533 541 549 557 568 576 ...
##  $ Diet.low.in.nuts.and.seeds              : int  3516 3543 3487 3457 3430 3381 3325 3251 3183 3102 ...
##  $ Household.air.pollution.from.solid.fuels: int  45132 45028 44137 44953 44534 43280 41778 40047 38320 36675 ...
##  $ Diet.low.in.Vegetables                  : int  4501 4539 4483 4471 4475 4449 4432 4417 4422 4407 ...
##  $ Low.physical.activity                   : int  3013 3068 3078 3104 3137 3153 3178 3214 3267 3311 ...
##  $ Smoking                                 : int  6378 6505 6488 6528 6594 6636 6720 6849 7026 7181 ...
##  $ High.fasting.plasma.glucose             : int  16545 17144 17467 17921 18397 18652 18917 19183 19498 19730 ...
##  $ Air.pollution                           : int  48763 48660 47732 48687 48337 47107 45691 44161 42696 41348 ...
##  $ High.body.mass.index                    : int  8829 9047 9151 9410 9759 10051 10519 11218 12106 12982 ...
##  $ Unsafe.sanitation                       : int  7472 7547 7497 8106 8028 7532 6740 6050 5350 4732 ...
##  $ No.access.to.handwashing.facility       : int  8233 8144 7915 8482 8440 8033 7481 6900 6291 5793 ...
##  $ Drug.use                                : int  312 328 337 356 376 390 401 414 430 445 ...
##  $ Low.bone.mineral.density                : int  431 436 437 433 440 454 454 465 478 493 ...
##  $ Vitamin.A.deficiency                    : int  3855 3944 3798 1902 1330 1243 1231 1226 1555 1704 ...
##  $ Child.stunting                          : int  12965 12732 12069 9862 8928 8380 7885 7303 7148 6867 ...
##  $ Discontinued.breastfeeding              : int  256 250 242 263 255 233 204 182 165 158 ...
##  $ Non.exclusive.breastfeeding             : int  4195 4094 3947 3953 3774 3519 3216 2992 2832 2706 ...
##  $ Iron.deficiency                         : int  897 919 937 972 985 1007 992 1008 1008 1001 ...
##  $ Total_Deaths                            : int  13344 13479 13360 13662 13798 13750 13730 13738 13802 13863 ...
##  $ GDP_per_Capita                          : num  180 143 182 200 222 ...
##  - attr(*, "na.action")= 'omit' Named int [1:172] 1 2 3 4 5 6 7 8 9 10 ...
##   ..- attr(*, "names")= chr [1:172] "1" "2" "3" "4" ...
# Structure of the unfiltered merge, for comparison with merged_data_clean.
str(object = merged_data)
## 'data.frame':    6000 obs. of  33 variables:
##  $ Code                                    : chr  "AFG" "AFG" "AFG" "AFG" ...
##  $ Year                                    : int  1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 ...
##  $ Entity                                  : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Outdoor.air.pollution                   : int  3169 3222 3395 3623 3788 3869 3943 4024 4040 4042 ...
##  $ High.systolic.blood.pressure            : int  25633 25872 26309 26961 27658 28090 28587 29021 29349 29712 ...
##  $ Diet.high.in.sodium                     : int  1045 1055 1075 1103 1134 1154 1178 1202 1222 1242 ...
##  $ Diet.low.in.whole.grains                : int  7077 7149 7297 7499 7698 7807 7943 8075 8173 8265 ...
##  $ Alochol.use                             : int  356 364 376 389 399 406 413 420 425 426 ...
##  $ Diet.low.in.fruits                      : int  3185 3248 3351 3480 3610 3703 3819 3938 4038 4127 ...
##  $ Unsafe.water.source                     : int  3702 4309 5356 7152 7192 8378 8487 9348 9788 9931 ...
##  $ Secondhand.smoke                        : int  4794 4921 5279 5734 6050 6167 6298 6425 6402 6323 ...
##  $ Low.birth.weight                        : int  16135 17924 21200 23795 24866 25534 25997 26246 25805 25080 ...
##  $ Child.wasting                           : int  19546 20334 22895 27002 29205 30943 31628 32736 32760 32271 ...
##  $ Unsafe.sex                              : int  351 361 378 395 410 422 435 448 458 469 ...
##  $ Diet.low.in.nuts.and.seeds              : int  2319 2449 2603 2771 2932 3049 3173 3298 3401 3482 ...
##  $ Household.air.pollution.from.solid.fuels: int  34372 35392 38065 41154 43153 44024 45005 46017 46055 45681 ...
##  $ Diet.low.in.Vegetables                  : int  3679 3732 3827 3951 4075 4153 4247 4339 4409 4473 ...
##  $ Low.physical.activity                   : int  2637 2652 2688 2744 2805 2839 2878 2914 2944 2976 ...
##  $ Smoking                                 : int  5174 5247 5363 5522 5689 5801 5934 6066 6178 6288 ...
##  $ High.fasting.plasma.glucose             : int  11449 11811 12265 12821 13400 13871 14413 14970 15502 16058 ...
##  $ Air.pollution                           : int  37231 38315 41172 44488 46634 47566 48617 49703 49746 49349 ...
##  $ High.body.mass.index                    : int  9518 9489 9528 9611 9675 9608 9503 9286 9024 8857 ...
##  $ Unsafe.sanitation                       : int  2798 3254 4042 5392 5418 6313 6393 7038 7366 7468 ...
##  $ No.access.to.handwashing.facility       : int  4825 5127 5889 7007 7421 7896 8098 8507 8560 8424 ...
##  $ Drug.use                                : int  174 188 211 232 247 260 274 287 295 302 ...
##  $ Low.bone.mineral.density                : int  389 389 393 411 413 417 423 425 429 428 ...
##  $ Vitamin.A.deficiency                    : int  2016 2056 2100 2316 2665 3070 3214 3228 3413 3662 ...
##  $ Child.stunting                          : int  7686 7886 8568 9875 11031 11973 12426 12805 13011 13052 ...
##  $ Discontinued.breastfeeding              : int  107 121 150 204 204 233 233 255 264 263 ...
##  $ Non.exclusive.breastfeeding             : int  2216 2501 3053 3726 3833 4124 4183 4393 4417 4326 ...
##  $ Iron.deficiency                         : int  564 611 700 773 812 848 883 914 924 909 ...
##  $ Total_Deaths                            : int  10498 10720 11229 11877 12385 12634 12919 13198 13300 13339 ...
##  $ GDP_per_Capita                          : num  NA NA NA NA NA NA NA NA NA NA ...
# c. Data transformation: formulate the task as a binary classification problem
library(dplyr)

# Add the ratio of total deaths to GDP per capita. This ratio is the quantity
# that is thresholded further down to build the binary target.
# NOTE(review): a GDP_per_Capita of 0 would produce Inf here; confirm the
# cleaned data contains no zero values.
merged_data_clean_use <- mutate(
  merged_data_clean,
  Death_Rate_vs_GDP = Total_Deaths / GDP_per_Capita
)
str(merged_data_clean_use)
## 'data.frame':    5828 obs. of  34 variables:
##  $ Code                                    : chr  "AFG" "AFG" "AFG" "AFG" ...
##  $ Year                                    : int  2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 ...
##  $ Entity                                  : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Outdoor.air.pollution                   : int  4021 4014 3961 4116 4176 4176 4232 4480 4767 5038 ...
##  $ High.systolic.blood.pressure            : int  29999 30421 30189 30157 30225 30089 30075 30080 30219 30280 ...
##  $ Diet.high.in.sodium                     : int  1260 1282 1275 1277 1281 1276 1270 1263 1261 1254 ...
##  $ Diet.low.in.whole.grains                : int  8328 8440 8383 8398 8433 8415 8418 8426 8474 8488 ...
##  $ Alochol.use                             : int  427 432 432 437 445 452 456 465 478 485 ...
##  $ Diet.low.in.fruits                      : int  4174 4226 4184 4179 4188 4166 4142 4108 4088 4050 ...
##  $ Unsafe.water.source                     : int  9942 10052 10004 10841 10761 10118 9081 8168 7245 6437 ...
##  $ Secondhand.smoke                        : int  6227 6214 6103 6341 6383 6272 6153 6010 5868 5752 ...
##  $ Low.birth.weight                        : int  24549 24859 25441 25617 25459 25149 24354 23778 23533 23232 ...
##  $ Child.wasting                           : int  31559 30938 29619 29098 28167 26669 24987 23100 21624 20264 ...
##  $ Unsafe.sex                              : int  482 500 507 518 533 541 549 557 568 576 ...
##  $ Diet.low.in.nuts.and.seeds              : int  3516 3543 3487 3457 3430 3381 3325 3251 3183 3102 ...
##  $ Household.air.pollution.from.solid.fuels: int  45132 45028 44137 44953 44534 43280 41778 40047 38320 36675 ...
##  $ Diet.low.in.Vegetables                  : int  4501 4539 4483 4471 4475 4449 4432 4417 4422 4407 ...
##  $ Low.physical.activity                   : int  3013 3068 3078 3104 3137 3153 3178 3214 3267 3311 ...
##  $ Smoking                                 : int  6378 6505 6488 6528 6594 6636 6720 6849 7026 7181 ...
##  $ High.fasting.plasma.glucose             : int  16545 17144 17467 17921 18397 18652 18917 19183 19498 19730 ...
##  $ Air.pollution                           : int  48763 48660 47732 48687 48337 47107 45691 44161 42696 41348 ...
##  $ High.body.mass.index                    : int  8829 9047 9151 9410 9759 10051 10519 11218 12106 12982 ...
##  $ Unsafe.sanitation                       : int  7472 7547 7497 8106 8028 7532 6740 6050 5350 4732 ...
##  $ No.access.to.handwashing.facility       : int  8233 8144 7915 8482 8440 8033 7481 6900 6291 5793 ...
##  $ Drug.use                                : int  312 328 337 356 376 390 401 414 430 445 ...
##  $ Low.bone.mineral.density                : int  431 436 437 433 440 454 454 465 478 493 ...
##  $ Vitamin.A.deficiency                    : int  3855 3944 3798 1902 1330 1243 1231 1226 1555 1704 ...
##  $ Child.stunting                          : int  12965 12732 12069 9862 8928 8380 7885 7303 7148 6867 ...
##  $ Discontinued.breastfeeding              : int  256 250 242 263 255 233 204 182 165 158 ...
##  $ Non.exclusive.breastfeeding             : int  4195 4094 3947 3953 3774 3519 3216 2992 2832 2706 ...
##  $ Iron.deficiency                         : int  897 919 937 972 985 1007 992 1008 1008 1001 ...
##  $ Total_Deaths                            : int  13344 13479 13360 13662 13798 13750 13730 13738 13802 13863 ...
##  $ GDP_per_Capita                          : num  180 143 182 200 222 ...
##  $ Death_Rate_vs_GDP                       : num  74.1 94.3 73.3 68.4 62.2 ...
##  - attr(*, "na.action")= 'omit' Named int [1:172] 1 2 3 4 5 6 7 8 9 10 ...
##   ..- attr(*, "names")= chr [1:172] "1" "2" "3" "4" ...
# Classification threshold: the median of Death_Rate_vs_GDP (deaths per unit
# of GDP per capita), ignoring missing values.
threshold <- median(merged_data_clean_use$Death_Rate_vs_GDP, na.rm = TRUE)

# Binary target 'High_Deaths': 1 when the ratio is at or above the threshold,
# 0 when it is below. Stored directly as a factor with exactly the two
# levels 0 and 1.
merged_data_clean_use$High_Deaths <- factor(
  as.integer(merged_data_clean_use$Death_Rate_vs_GDP >= threshold),
  levels = c(0, 1)
)

# Rebuild as a plain data.frame; the str() output after this call no longer
# lists the na.action attribute that the earlier str() output showed.
merged_data_clean_use <- data.frame(merged_data_clean_use)
str(merged_data_clean_use)
## 'data.frame':    5828 obs. of  35 variables:
##  $ Code                                    : chr  "AFG" "AFG" "AFG" "AFG" ...
##  $ Year                                    : int  2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 ...
##  $ Entity                                  : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Outdoor.air.pollution                   : int  4021 4014 3961 4116 4176 4176 4232 4480 4767 5038 ...
##  $ High.systolic.blood.pressure            : int  29999 30421 30189 30157 30225 30089 30075 30080 30219 30280 ...
##  $ Diet.high.in.sodium                     : int  1260 1282 1275 1277 1281 1276 1270 1263 1261 1254 ...
##  $ Diet.low.in.whole.grains                : int  8328 8440 8383 8398 8433 8415 8418 8426 8474 8488 ...
##  $ Alochol.use                             : int  427 432 432 437 445 452 456 465 478 485 ...
##  $ Diet.low.in.fruits                      : int  4174 4226 4184 4179 4188 4166 4142 4108 4088 4050 ...
##  $ Unsafe.water.source                     : int  9942 10052 10004 10841 10761 10118 9081 8168 7245 6437 ...
##  $ Secondhand.smoke                        : int  6227 6214 6103 6341 6383 6272 6153 6010 5868 5752 ...
##  $ Low.birth.weight                        : int  24549 24859 25441 25617 25459 25149 24354 23778 23533 23232 ...
##  $ Child.wasting                           : int  31559 30938 29619 29098 28167 26669 24987 23100 21624 20264 ...
##  $ Unsafe.sex                              : int  482 500 507 518 533 541 549 557 568 576 ...
##  $ Diet.low.in.nuts.and.seeds              : int  3516 3543 3487 3457 3430 3381 3325 3251 3183 3102 ...
##  $ Household.air.pollution.from.solid.fuels: int  45132 45028 44137 44953 44534 43280 41778 40047 38320 36675 ...
##  $ Diet.low.in.Vegetables                  : int  4501 4539 4483 4471 4475 4449 4432 4417 4422 4407 ...
##  $ Low.physical.activity                   : int  3013 3068 3078 3104 3137 3153 3178 3214 3267 3311 ...
##  $ Smoking                                 : int  6378 6505 6488 6528 6594 6636 6720 6849 7026 7181 ...
##  $ High.fasting.plasma.glucose             : int  16545 17144 17467 17921 18397 18652 18917 19183 19498 19730 ...
##  $ Air.pollution                           : int  48763 48660 47732 48687 48337 47107 45691 44161 42696 41348 ...
##  $ High.body.mass.index                    : int  8829 9047 9151 9410 9759 10051 10519 11218 12106 12982 ...
##  $ Unsafe.sanitation                       : int  7472 7547 7497 8106 8028 7532 6740 6050 5350 4732 ...
##  $ No.access.to.handwashing.facility       : int  8233 8144 7915 8482 8440 8033 7481 6900 6291 5793 ...
##  $ Drug.use                                : int  312 328 337 356 376 390 401 414 430 445 ...
##  $ Low.bone.mineral.density                : int  431 436 437 433 440 454 454 465 478 493 ...
##  $ Vitamin.A.deficiency                    : int  3855 3944 3798 1902 1330 1243 1231 1226 1555 1704 ...
##  $ Child.stunting                          : int  12965 12732 12069 9862 8928 8380 7885 7303 7148 6867 ...
##  $ Discontinued.breastfeeding              : int  256 250 242 263 255 233 204 182 165 158 ...
##  $ Non.exclusive.breastfeeding             : int  4195 4094 3947 3953 3774 3519 3216 2992 2832 2706 ...
##  $ Iron.deficiency                         : int  897 919 937 972 985 1007 992 1008 1008 1001 ...
##  $ Total_Deaths                            : int  13344 13479 13360 13662 13798 13750 13730 13738 13802 13863 ...
##  $ GDP_per_Capita                          : num  180 143 182 200 222 ...
##  $ Death_Rate_vs_GDP                       : num  74.1 94.3 73.3 68.4 62.2 ...
##  $ High_Deaths                             : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
# 2. Classification modeling
#   a. Split the data into training and test sets
# Keep only the variables that go into the model. Code is an identifier, and
# GDP_per_Capita, Total_Deaths and Death_Rate_vs_GDP are the columns the
# target High_Deaths was derived from, so they are removed.
# NOTE(review): Entity (country name, character) is still kept and will be
# treated as a predictor by `High_Deaths ~ .`; confirm this is intended, as
# it may let models identify countries rather than learn from risk factors.
data_use <- select(
  merged_data_clean_use,
  -c(Code, GDP_per_Capita, Death_Rate_vs_GDP, Total_Deaths)
)
str(data_use)
## 'data.frame':    5828 obs. of  31 variables:
##  $ Year                                    : int  2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 ...
##  $ Entity                                  : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Outdoor.air.pollution                   : int  4021 4014 3961 4116 4176 4176 4232 4480 4767 5038 ...
##  $ High.systolic.blood.pressure            : int  29999 30421 30189 30157 30225 30089 30075 30080 30219 30280 ...
##  $ Diet.high.in.sodium                     : int  1260 1282 1275 1277 1281 1276 1270 1263 1261 1254 ...
##  $ Diet.low.in.whole.grains                : int  8328 8440 8383 8398 8433 8415 8418 8426 8474 8488 ...
##  $ Alochol.use                             : int  427 432 432 437 445 452 456 465 478 485 ...
##  $ Diet.low.in.fruits                      : int  4174 4226 4184 4179 4188 4166 4142 4108 4088 4050 ...
##  $ Unsafe.water.source                     : int  9942 10052 10004 10841 10761 10118 9081 8168 7245 6437 ...
##  $ Secondhand.smoke                        : int  6227 6214 6103 6341 6383 6272 6153 6010 5868 5752 ...
##  $ Low.birth.weight                        : int  24549 24859 25441 25617 25459 25149 24354 23778 23533 23232 ...
##  $ Child.wasting                           : int  31559 30938 29619 29098 28167 26669 24987 23100 21624 20264 ...
##  $ Unsafe.sex                              : int  482 500 507 518 533 541 549 557 568 576 ...
##  $ Diet.low.in.nuts.and.seeds              : int  3516 3543 3487 3457 3430 3381 3325 3251 3183 3102 ...
##  $ Household.air.pollution.from.solid.fuels: int  45132 45028 44137 44953 44534 43280 41778 40047 38320 36675 ...
##  $ Diet.low.in.Vegetables                  : int  4501 4539 4483 4471 4475 4449 4432 4417 4422 4407 ...
##  $ Low.physical.activity                   : int  3013 3068 3078 3104 3137 3153 3178 3214 3267 3311 ...
##  $ Smoking                                 : int  6378 6505 6488 6528 6594 6636 6720 6849 7026 7181 ...
##  $ High.fasting.plasma.glucose             : int  16545 17144 17467 17921 18397 18652 18917 19183 19498 19730 ...
##  $ Air.pollution                           : int  48763 48660 47732 48687 48337 47107 45691 44161 42696 41348 ...
##  $ High.body.mass.index                    : int  8829 9047 9151 9410 9759 10051 10519 11218 12106 12982 ...
##  $ Unsafe.sanitation                       : int  7472 7547 7497 8106 8028 7532 6740 6050 5350 4732 ...
##  $ No.access.to.handwashing.facility       : int  8233 8144 7915 8482 8440 8033 7481 6900 6291 5793 ...
##  $ Drug.use                                : int  312 328 337 356 376 390 401 414 430 445 ...
##  $ Low.bone.mineral.density                : int  431 436 437 433 440 454 454 465 478 493 ...
##  $ Vitamin.A.deficiency                    : int  3855 3944 3798 1902 1330 1243 1231 1226 1555 1704 ...
##  $ Child.stunting                          : int  12965 12732 12069 9862 8928 8380 7885 7303 7148 6867 ...
##  $ Discontinued.breastfeeding              : int  256 250 242 263 255 233 204 182 165 158 ...
##  $ Non.exclusive.breastfeeding             : int  4195 4094 3947 3953 3774 3519 3216 2992 2832 2706 ...
##  $ Iron.deficiency                         : int  897 919 937 972 985 1007 992 1008 1008 1001 ...
##  $ High_Deaths                             : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
# Count missing values per column (type-stable: always one integer per column)
vapply(data_use, function(column) sum(is.na(column)), integer(1))
##                                     Year 
##                                        0 
##                                   Entity 
##                                        0 
##                    Outdoor.air.pollution 
##                                        0 
##             High.systolic.blood.pressure 
##                                        0 
##                      Diet.high.in.sodium 
##                                        0 
##                 Diet.low.in.whole.grains 
##                                        0 
##                              Alochol.use 
##                                        0 
##                       Diet.low.in.fruits 
##                                        0 
##                      Unsafe.water.source 
##                                        0 
##                         Secondhand.smoke 
##                                        0 
##                         Low.birth.weight 
##                                        0 
##                            Child.wasting 
##                                        0 
##                               Unsafe.sex 
##                                        0 
##               Diet.low.in.nuts.and.seeds 
##                                        0 
## Household.air.pollution.from.solid.fuels 
##                                        0 
##                   Diet.low.in.Vegetables 
##                                        0 
##                    Low.physical.activity 
##                                        0 
##                                  Smoking 
##                                        0 
##              High.fasting.plasma.glucose 
##                                        0 
##                            Air.pollution 
##                                        0 
##                     High.body.mass.index 
##                                        0 
##                        Unsafe.sanitation 
##                                        0 
##        No.access.to.handwashing.facility 
##                                        0 
##                                 Drug.use 
##                                        0 
##                 Low.bone.mineral.density 
##                                        0 
##                     Vitamin.A.deficiency 
##                                        0 
##                           Child.stunting 
##                                        0 
##               Discontinued.breastfeeding 
##                                        0 
##              Non.exclusive.breastfeeding 
##                                        0 
##                          Iron.deficiency 
##                                        0 
##                              High_Deaths 
##                                        0
# Distinct per-column NA counts; a single 0 means no column has missing values
unique(vapply(data_use, function(column) sum(is.na(column)), integer(1)))
## [1] 0
library(caTools)
## Warning: 程辑包'caTools'是用R版本4.2.3 来建造的
# Fix the RNG seed so the random split below is reproducible
set.seed(123)

# 80/20 split. sample.split() returns a logical vector (TRUE = training row)
# and is given the target so the class ratio is kept in both parts.
# NOTE(review): the variable name `split` shadows base::split(); it is kept
# because later code may refer to it.
split <- sample.split(data_use$High_Deaths, SplitRatio = 0.8)
train_set <- data_use[split, ]
test_set <- data_use[!split, ]
str(train_set)
## 'data.frame':    4662 obs. of  31 variables:
##  $ Year                                    : int  2000 2001 2002 2005 2006 2008 2009 2011 2012 2013 ...
##  $ Entity                                  : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Outdoor.air.pollution                   : int  4021 4014 3961 4176 4232 4767 5038 5824 6516 7273 ...
##  $ High.systolic.blood.pressure            : int  29999 30421 30189 30089 30075 30219 30280 30684 31090 31462 ...
##  $ Diet.high.in.sodium                     : int  1260 1282 1275 1276 1270 1261 1254 1255 1264 1270 ...
##  $ Diet.low.in.whole.grains                : int  8328 8440 8383 8415 8418 8474 8488 8620 8753 8854 ...
##  $ Alochol.use                             : int  427 432 432 452 456 478 485 496 502 508 ...
##  $ Diet.low.in.fruits                      : int  4174 4226 4184 4166 4142 4088 4050 4033 4052 4060 ...
##  $ Unsafe.water.source                     : int  9942 10052 10004 10118 9081 7245 6437 5600 5243 5220 ...
##  $ Secondhand.smoke                        : int  6227 6214 6103 6272 6153 5868 5752 5686 5676 5739 ...
##  $ Low.birth.weight                        : int  24549 24859 25441 25149 24354 23533 23232 23181 23172 23176 ...
##  $ Child.wasting                           : int  31559 30938 29619 26669 24987 21624 20264 18163 17368 16573 ...
##  $ Unsafe.sex                              : int  482 500 507 541 549 568 576 597 609 621 ...
##  $ Diet.low.in.nuts.and.seeds              : int  3516 3543 3487 3381 3325 3183 3102 3008 2982 2947 ...
##  $ Household.air.pollution.from.solid.fuels: int  45132 45028 44137 43280 41778 38320 36675 34380 33223 32344 ...
##  $ Diet.low.in.Vegetables                  : int  4501 4539 4483 4449 4432 4422 4407 4432 4476 4506 ...
##  $ Low.physical.activity                   : int  3013 3068 3078 3153 3178 3267 3311 3403 3468 3525 ...
##  $ Smoking                                 : int  6378 6505 6488 6636 6720 7026 7181 7572 7856 8124 ...
##  $ High.fasting.plasma.glucose             : int  16545 17144 17467 18652 18917 19498 19730 20216 20593 20961 ...
##  $ Air.pollution                           : int  48763 48660 47732 47107 45691 42696 41348 39893 39363 39187 ...
##  $ High.body.mass.index                    : int  8829 9047 9151 10051 10519 12106 12982 14666 15603 16511 ...
##  $ Unsafe.sanitation                       : int  7472 7547 7497 7532 6740 5350 4732 4069 3784 3739 ...
##  $ No.access.to.handwashing.facility       : int  8233 8144 7915 8033 7481 6291 5793 5275 5045 5041 ...
##  $ Drug.use                                : int  312 328 337 390 401 430 445 482 502 522 ...
##  $ Low.bone.mineral.density                : int  431 436 437 454 454 478 493 529 547 581 ...
##  $ Vitamin.A.deficiency                    : int  3855 3944 3798 1243 1231 1555 1704 1342 1327 800 ...
##  $ Child.stunting                          : int  12965 12732 12069 8380 7885 7148 6867 5800 5541 4710 ...
##  $ Discontinued.breastfeeding              : int  256 250 242 233 204 165 158 199 239 297 ...
##  $ Non.exclusive.breastfeeding             : int  4195 4094 3947 3519 3216 2832 2706 2591 2511 2492 ...
##  $ Iron.deficiency                         : int  897 919 937 1007 992 1008 1001 958 928 900 ...
##  $ High_Deaths                             : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
# b. Decision tree
library(rpart)

# Fit a classification tree using every other column of train_set as predictor
tree_model <- rpart(High_Deaths ~ ., data = train_set, method = "class")

# Hard class labels for the held-out rows
pred_tree <- predict(tree_model, test_set, type = "class")

# Confusion matrix: rows are the actual classes, columns the predicted ones
conf_matrix <- table(test_set[["High_Deaths"]], pred_tree)
print(conf_matrix)
##    pred_tree
##       0   1
##   0 570  13
##   1  16 567
# Decision tree plotting
library(rpart.plot)
## Warning: 程辑包'rpart.plot'是用R版本4.2.3 来建造的
# Draw the fitted tree stored in tree_model using rpart.plot's defaults
rpart.plot(tree_model)

library(caret)
## Warning: 程辑包'caret'是用R版本4.2.3 来建造的
## 载入需要的程辑包:lattice
## 
## 载入程辑包:'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(pROC)
## Warning: 程辑包'pROC'是用R版本4.2.3 来建造的
## Type 'citation("pROC")' for a citation.
## 
## 载入程辑包:'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Evaluation metrics for the decision tree, all derived from conf_matrix.
# conf_matrix was built as table(actual, predicted): rows are the true
# classes, columns the predicted classes, and "1" is the positive class.

# Accuracy: correctly classified rows (the diagonal) over all rows
accuracy_tree <- sum(diag(conf_matrix)) / sum(conf_matrix)
cat("Accuracy:", accuracy_tree, "\n")
## Accuracy: 0.9751286
# Precision: true positives over everything predicted as "1"
precision_tree <- conf_matrix["1", "1"] / sum(conf_matrix[, "1"])
cat("Precision:", precision_tree, "\n")
## Precision: 0.9775862
# Recall (sensitivity): true positives over everything that is actually "1"
recall_tree <- conf_matrix["1", "1"] / sum(conf_matrix["1", ])
cat("Recall:", recall_tree, "\n")
## Recall: 0.9725557
# F1 score: harmonic mean of precision and recall
f1_score_tree <-
  2 * (precision_tree * recall_tree) / (precision_tree + recall_tree)
cat("F1 Score:", f1_score_tree, "\n")
## F1 Score: 0.9750645
library(pROC)

# Class-membership probabilities for the test set (type = "prob" returns one
# column per class instead of hard labels)
pred_prob <- predict(tree_model, newdata = test_set, type = "prob")

# ROC curve: true labels against the predicted probability of class "1"
roc_curve <- roc(
  response = test_set$High_Deaths,
  predictor = pred_prob[, "1"]
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Draw the ROC curve
plot(roc_curve, main = "ROC Curve for Decision Tree Model", col = "blue")

# Area under the ROC curve
auc_value <- auc(roc_curve)
cat("AUC:", auc_value, "\n")
## AUC: 0.9901041
# d. Logistic regression model
# Fit a binomial GLM of High_Deaths on every other column of train_set.
# NOTE(review): the recorded warnings for this call report that the algorithm
# did not converge and that fitted probabilities are numerically 0 or 1.
# The character column Entity is still among the predictors, which may be
# what lets the model separate the classes perfectly; confirm before relying
# on the coefficients or on the predicted probabilities.
logit_model <- glm(High_Deaths ~ ., data = train_set, family = binomial)
## Warning: glm.fit:算法没有聚合
## Warning: glm.fit:拟合機率算出来是数值零或一
# Predicted probability of class "1" for each held-out row
# (type = "response" returns probabilities rather than log-odds).
# NOTE(review): printing the whole vector dumps one value per test row;
# consider head(pred_logit) or summary(pred_logit) if the full list is
# not needed.
pred_logit <- predict(logit_model, test_set, type = "response")
print(pred_logit)
##           14           15           18           21           26           30 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##           34           37           44           45           63           66 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##           72           74           87          101          105          112 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##          115          120          126          136          138          145 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##          146          155          157          158          177          178 
## 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##          179          194          196          197          199          201 
## 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 
##          203          208          227          228          229          235 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##          237          244          245          246          248          258 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##          268          271          274          278          279          286 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##          289          290          294          299          305          309 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##          320          325          327          334          337          341 
## 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 
##          349          355          359          364          366          372 
## 1.000000e+00 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 
##          378          392          397          398          400          404 
## 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##          407          415          430          436          438          446 
## 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##          447          450          452          459          463          476 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##          477          479          487          495          497          505 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##          506          517          518          519          521          528 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##          534          542          546          557          568          571 
## 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##          574          576          588          589          593          611 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 
##          613          614          617          625          627          629 
## 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 
##          647          648          651          652          658          661 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 
##          663          665          677          679          685          686 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##          688          689          697          700          701          708 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##          709          715          717          718          721          725 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 
##          731          738          749          751          755          759 
## 1.000000e+00 1.000000e+00 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 
##          762          772          775          779          781          783 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 
##          791          794          799          808          811          815 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##          822          827          829          833          836          837 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##          842          843          845          847          853          858 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##          861          868          870          872          875          881 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##          890          905          910          919          920          922 
## 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##          931          936          941          944          949          950 
## 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 
##          952          954          958          972          973          993 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 
##          998          999         1002         1011         1016         1017 
## 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 
##         1028         1039         1040         1044         1053         1055 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         1065         1068         1073         1074         1079         1092 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         1096         1103         1110         1114         1116         1119 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         1123         1126         1128         1131         1133         1134 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         1139         1152         1157         1164         1165         1166 
## 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         1176         1178         1180         1183         1189         1190 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         1191         1193         1197         1201         1204         1210 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 
##         1217         1218         1236         1237         1239         1244 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         1249         1252         1253         1258         1261         1263 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         1273         1275         1278         1281         1282         1291 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 
##         1294         1297         1304         1306         1312         1315 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         1317         1319         1320         1321         1322         1333 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 
##         1335         1343         1344         1346         1356         1360 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 
##         1365         1378         1382         1385         1386         1395 
## 1.000000e+00 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         1399         1400         1404         1405         1410         1417 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 
##         1425         1428         1432         1433         1437         1441 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 
##         1446         1456         1457         1459         1464         1466 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         1469         1470         1471         1485         1487         1495 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         1500         1509         1512         1521         1526         1527 
## 2.220446e-16 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         1529         1532         1538         1539         1540         1556 
## 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         1559         1574         1578         1580         1588         1590 
## 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         1592         1595         1600         1602         1605         1606 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         1609         1618         1628         1641         1642         1652 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 1.000000e+00 
##         1658         1660         1663         1668         1669         1673 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         1692         1694         1696         1698         1701         1702 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         1703         1706         1707         1714         1722         1732 
## 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 
##         1743         1745         1754         1756         1758         1761 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         1766         1767         1779         1781         1785         1791 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         1796         1809         1815         1821         1825         1828 
## 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         1833         1840         1841         1853         1858         1860 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         1862         1865         1870         1871         1875         1889 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         1903         1907         1910         1914         1923         1929 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         1931         1936         1937         1941         1947         1955 
## 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 1.000000e+00 
##         1958         1965         1969         1972         1973         1975 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         1987         1990         1999         2007         2012         2013 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 
##         2023         2024         2025         2028         2032         2035 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         2036         2037         2038         2041         2046         2054 
## 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 2.220446e-16 2.220446e-16 
##         2055         2058         2059         2070         2076         2079 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         2081         2082         2085         2086         2090         2093 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         2104         2106         2111         2115         2117         2120 
## 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 
##         2123         2125         2129         2130         2144         2149 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         2156         2158         2163         2170         2188         2189 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         2190         2201         2209         2210         2215         2233 
## 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 
##         2235         2247         2253         2254         2256         2259 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         2269         2270         2271         2274         2275         2296 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 
##         2312         2320         2321         2323         2326         2332 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 
##         2341         2357         2360         2361         2362         2377 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         2379         2385         2389         2393         2396         2397 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 
##         2402         2408         2417         2422         2431         2434 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         2441         2444         2448         2460         2463         2464 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 
##         2470         2480         2482         2496         2506         2507 
## 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 
##         2519         2521         2522         2524         2525         2529 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         2535         2539         2541         2543         2547         2548 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         2553         2567         2577         2581         2598         2603 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         2612         2622         2628         2634         2639         2641 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 
##         2646         2648         2649         2650         2653         2655 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         2665         2666         2669         2671         2680         2702 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 
##         2711         2713         2718         2725         2734         2737 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         2742         2747         2756         2764         2766         2768 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         2782         2789         2801         2806         2819         2821 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         2826         2828         2831         2834         2839         2851 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         2852         2853         2856         2865         2881         2891 
## 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 2.220446e-16 
##         2903         2906         2909         2910         2911         2913 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 
##         2915         2924         2927         2931         2932         2934 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 
##         2939         2941         2944         2948         2949         2950 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         2965         2966         2972         2976         2997         2999 
## 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         3001         3005         3008         3009         3014         3016 
## 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         3023         3031         3032         3043         3044         3045 
## 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         3048         3049         3050         3055         3058         3063 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 
##         3064         3076         3080         3090         3102         3103 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         3104         3108         3121         3122         3126         3127 
## 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         3137         3138         3156         3158         3164         3175 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         3176         3190         3193         3220         3231         3237 
## 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 
##         3246         3252         3263         3267         3272         3274 
## 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 
##         3289         3296         3314         3316         3318         3320 
## 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         3324         3338         3339         3341         3348         3349 
## 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         3361         3365         3370         3371         3377         3379 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         3381         3383         3391         3392         3397         3401 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 
##         3409         3418         3420         3423         3426         3432 
## 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 
##         3448         3452         3457         3460         3465         3467 
## 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         3472         3473         3487         3491         3492         3493 
## 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 
##         3494         3499         3502         3503         3506         3510 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         3517         3518         3522         3527         3529         3532 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         3534         3547         3565         3566         3567         3574 
## 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 
##         3577         3578         3581         3582         3583         3585 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         3590         3592         3594         3595         3598         3600 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         3619         3620         3626         3627         3633         3639 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 
##         3640         3643         3657         3674         3679         3681 
## 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         3682         3686         3687         3701         3704         3708 
## 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 
##         3712         3716         3726         3731         3732         3733 
## 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         3734         3746         3747         3752         3774         3776 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         3778         3781         3783         3795         3798         3800 
## 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         3802         3805         3807         3813         3825         3826 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         3831         3834         3838         3855         3857         3863 
## 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 
##         3866         3872         3876         3880         3883         3885 
## 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         3896         3899         3902         3908         3923         3927 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         3934         3952         3965         3966         3974         3986 
## 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         3993         3994         4005         4006         4007         4011 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         4028         4036         4039         4059         4067         4068 
## 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 
##         4078         4092         4094         4095         4104         4106 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         4107         4114         4120         4126         4130         4135 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         4141         4149         4152         4159         4175         4177 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 
##         4178         4179         4182         4184         4185         4191 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         4199         4203         4206         4222         4223         4225 
## 2.220446e-16 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 
##         4226         4228         4229         4232         4236         4242 
## 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 
##         4244         4259         4260         4262         4265         4266 
## 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 
##         4270         4274         4278         4286         4289         4296 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 
##         4297         4298         4304         4306         4313         4315 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         4320         4321         4323         4330         4339         4348 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         4351         4355         4365         4370         4377         4413 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 
##         4418         4423         4427         4428         4433         4435 
## 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         4436         4440         4443         4448         4451         4456 
## 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 
##         4463         4467         4477         4501         4502         4505 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         4507         4514         4516         4517         4523         4524 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         4527         4531         4541         4542         4546         4548 
## 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         4551         4559         4560         4573         4579         4582 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         4587         4589         4596         4599         4602         4608 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         4615         4616         4624         4626         4634         4637 
## 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         4639         4647         4652         4674         4676         4680 
## 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         4683         4688         4691         4695         4696         4697 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         4702         4703         4711         4719         4722         4723 
## 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 
##         4735         4737         4739         4762         4768         4771 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 
##         4773         4779         4786         4787         4788         4793 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         4803         4806         4808         4823         4826         4845 
## 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 
##         4861         4864         4866         4868         4875         4882 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         4907         4908         4909         4912         4913         4939 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 
##         4943         4953         4954         4958         4962         4963 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         4964         4972         4975         4977         4980         4982 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         4998         5006         5024         5030         5037         5041 
## 2.220446e-16 2.220446e-16 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 
##         5042         5045         5048         5050         5055         5058 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         5065         5067         5074         5077         5079         5088 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         5091         5096         5101         5109         5111         5119 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         5122         5128         5130         5134         5138         5140 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         5143         5155         5157         5161         5168         5174 
## 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 
##         5177         5180         5182         5192         5193         5197 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         5201         5202         5208         5212         5219         5225 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         5243         5248         5252         5253         5260         5276 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         5282         5291         5293         5294         5298         5306 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         5323         5327         5328         5330         5334         5337 
## 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         5346         5347         5350         5352         5353         5354 
## 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         5366         5383         5392         5396         5402         5405 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         5408         5415         5425         5427         5434         5435 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 
##         5436         5441         5443         5450         5457         5461 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         5465         5478         5482         5483         5495         5509 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 
##         5512         5513         5521         5524         5529         5545 
## 2.220446e-16 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         5546         5562         5567         5570         5571         5572 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         5576         5577         5578         5598         5603         5608 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         5609         5617         5630         5633         5636         5638 
## 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         5642         5644         5650         5652         5654         5660 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         5661         5662         5663         5665         5670         5675 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         5678         5679         5680         5687         5689         5694 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         5699         5701         5714         5717         5723         5724 
## 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         5728         5742         5784         5786         5789         5802 
## 2.220446e-16 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 
##         5810         5811         5813         5816         5828         5829 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 
##         5830         5836         5845         5849         5854         5857 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 
##         5862         5865         5867         5873         5876         5881 
## 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 
##         5886         5894         5896         5897         5900         5901 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         5904         5910         5920         5923         5926         5929 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         5937         5941         5944         5954         5962         5966 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         5969         5972         5976         5978         5987         5991 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 
##         5993         5999 
## 1.000000e+00 1.000000e+00
# Turn the predicted scores (printed above, all within [0, 1]) into hard
# class labels using a 0.5 cut-off: 1 when the score exceeds 0.5, else 0.
pred_logit_class <- ifelse(pred_logit > 0.5, 1, 0)

# Confusion Matrix (rows = observed High_Deaths, columns = predicted class).
# Both margins are coerced to factors with explicit 0/1 levels so the table is
# always 2 x 2. With plain vectors, table() drops a class that never occurs,
# and the positional indexing used for the metrics below (conf_matrix[2, 2])
# would then fail with "subscript out of bounds".
# The second argument is named so the printed column header stays
# "pred_logit_class".
conf_matrix <- table(
  factor(test_set$High_Deaths, levels = c(0, 1)),
  pred_logit_class = factor(pred_logit_class, levels = c(0, 1))
)
print(conf_matrix)
##    pred_logit_class
##       0   1
##   0 558  25
##   1  17 566
library(caret)
library(pROC)

# Accuracy: rows on the main diagonal (correct predictions) over all rows
accuracy_logi <- sum(conf_matrix[row(conf_matrix) == col(conf_matrix)]) /
  sum(conf_matrix)
cat("Accuracy:", accuracy_logi, "\n")
## Accuracy: 0.9639794
# Precision: true positives over everything predicted as class 1 (column 2)
precision_logi <- conf_matrix[2, 2] / colSums(conf_matrix)[[2]]
cat("Precision:", precision_logi, "\n")
## Precision: 0.9576988
# Recall (Sensitivity): true positives over every observed class-1 row (row 2)
recall_logi <- conf_matrix[2, 2] / rowSums(conf_matrix)[[2]]
cat("Recall:", recall_logi, "\n")
## Recall: 0.9708405
# F1 Score: harmonic mean of precision and recall
f1_score_logi <- local({
  twice_product <- 2 * (precision_logi * recall_logi)
  twice_product / (precision_logi + recall_logi)
})
cat("F1 Score:", f1_score_logi, "\n")
## F1 Score: 0.9642249
# ROC Curve and AUC
# `levels` (control = 0, case = 1) and `direction` (controls score lower than
# cases) are pinned explicitly instead of being auto-detected. With
# direction = "auto", pROC picks whichever orientation gives the larger AUC,
# which can hide a model whose scores are inverted.
# NOTE(review): the predictions printed above are all either 1 or
# 2.220446e-16, so the ROC curve has a single interior operating point and the
# AUC coincides with the accuracy reported earlier (0.9639794). This looks
# like complete separation in the fitted model -- confirm against the model
# fit.
roc_obj <- roc(
  test_set$High_Deaths,
  pred_logit,
  levels = c(0, 1),
  direction = "<"
)
auc_value <- auc(roc_obj)
cat("AUC:", auc_value, "\n")
## AUC: 0.9639794
# Plot ROC curve
plot(roc_obj, main = "ROC Curve for Logistic Regression", col = "blue")

# 3. Clustering

library(caTools)
# Fix the random-number seed so that any randomised step run after this point
# gives the same result on every run (presumably for the clustering below,
# which is not visible in this part of the file -- TODO confirm).
set.seed(123)

# Print the column names and types of data_clean before it is aggregated.
str(data_clean)
## tibble [6,840 × 32] (S3: tbl_df/tbl/data.frame)
##  $ Entity                                  : chr [1:6840] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Code                                    : chr [1:6840] "AFG" "AFG" "AFG" "AFG" ...
##  $ Year                                    : int [1:6840] 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 ...
##  $ Outdoor.air.pollution                   : int [1:6840] 3169 3222 3395 3623 3788 3869 3943 4024 4040 4042 ...
##  $ High.systolic.blood.pressure            : int [1:6840] 25633 25872 26309 26961 27658 28090 28587 29021 29349 29712 ...
##  $ Diet.high.in.sodium                     : int [1:6840] 1045 1055 1075 1103 1134 1154 1178 1202 1222 1242 ...
##  $ Diet.low.in.whole.grains                : int [1:6840] 7077 7149 7297 7499 7698 7807 7943 8075 8173 8265 ...
##  $ Alochol.use                             : int [1:6840] 356 364 376 389 399 406 413 420 425 426 ...
##  $ Diet.low.in.fruits                      : int [1:6840] 3185 3248 3351 3480 3610 3703 3819 3938 4038 4127 ...
##  $ Unsafe.water.source                     : int [1:6840] 3702 4309 5356 7152 7192 8378 8487 9348 9788 9931 ...
##  $ Secondhand.smoke                        : int [1:6840] 4794 4921 5279 5734 6050 6167 6298 6425 6402 6323 ...
##  $ Low.birth.weight                        : int [1:6840] 16135 17924 21200 23795 24866 25534 25997 26246 25805 25080 ...
##  $ Child.wasting                           : int [1:6840] 19546 20334 22895 27002 29205 30943 31628 32736 32760 32271 ...
##  $ Unsafe.sex                              : int [1:6840] 351 361 378 395 410 422 435 448 458 469 ...
##  $ Diet.low.in.nuts.and.seeds              : int [1:6840] 2319 2449 2603 2771 2932 3049 3173 3298 3401 3482 ...
##  $ Household.air.pollution.from.solid.fuels: int [1:6840] 34372 35392 38065 41154 43153 44024 45005 46017 46055 45681 ...
##  $ Diet.low.in.Vegetables                  : int [1:6840] 3679 3732 3827 3951 4075 4153 4247 4339 4409 4473 ...
##  $ Low.physical.activity                   : int [1:6840] 2637 2652 2688 2744 2805 2839 2878 2914 2944 2976 ...
##  $ Smoking                                 : int [1:6840] 5174 5247 5363 5522 5689 5801 5934 6066 6178 6288 ...
##  $ High.fasting.plasma.glucose             : int [1:6840] 11449 11811 12265 12821 13400 13871 14413 14970 15502 16058 ...
##  $ Air.pollution                           : int [1:6840] 37231 38315 41172 44488 46634 47566 48617 49703 49746 49349 ...
##  $ High.body.mass.index                    : int [1:6840] 9518 9489 9528 9611 9675 9608 9503 9286 9024 8857 ...
##  $ Unsafe.sanitation                       : int [1:6840] 2798 3254 4042 5392 5418 6313 6393 7038 7366 7468 ...
##  $ No.access.to.handwashing.facility       : int [1:6840] 4825 5127 5889 7007 7421 7896 8098 8507 8560 8424 ...
##  $ Drug.use                                : int [1:6840] 174 188 211 232 247 260 274 287 295 302 ...
##  $ Low.bone.mineral.density                : int [1:6840] 389 389 393 411 413 417 423 425 429 428 ...
##  $ Vitamin.A.deficiency                    : int [1:6840] 2016 2056 2100 2316 2665 3070 3214 3228 3413 3662 ...
##  $ Child.stunting                          : int [1:6840] 7686 7886 8568 9875 11031 11973 12426 12805 13011 13052 ...
##  $ Discontinued.breastfeeding              : int [1:6840] 107 121 150 204 204 233 233 255 264 263 ...
##  $ Non.exclusive.breastfeeding             : int [1:6840] 2216 2501 3053 3726 3833 4124 4183 4393 4417 4326 ...
##  $ Iron.deficiency                         : int [1:6840] 564 611 700 773 812 848 883 914 924 909 ...
##  $ Total_Deaths                            : int [1:6840] 10498 10720 11229 11877 12385 12634 12919 13198 13300 13339 ...
library(dplyr)

# Group data by country and calculate the average of all numeric columns.
# The summary function is passed as an anonymous function because supplying
# extra arguments through `...` of across() (e.g. `across(x, mean, na.rm =
# TRUE)`) is deprecated as of dplyr 1.1.0.
# group_by() is used rather than summarise(.by = ) because it returns the
# groups sorted by Code.
# Note: Year is numeric as well, so it is averaged along with the death
# counts.
country_level_data <- data_clean %>%
  group_by(Code) %>%  # Group by country code
  summarise(across(where(is.numeric), \(x) mean(x, na.rm = TRUE)))
str(country_level_data)
## tibble [206 × 31] (S3: tbl_df/tbl/data.frame)
##  $ Code                                    : chr [1:206] "" "AFG" "AGO" "ALB" ...
##  $ Year                                    : num [1:206] 2004 2004 2004 2004 2004 ...
##  $ Outdoor.air.pollution                   : num [1:206] 554194.6 5178 2853.2 1190.7 13.9 ...
##  $ High.systolic.blood.pressure            : num [1:206] 1.48e+06 3.02e+04 1.18e+04 5.46e+03 6.95e+01 ...
##  $ Diet.high.in.sodium                     : num [1:206] 2.66e+05 1.24e+03 8.39e+02 1.81e+03 4.23 ...
##  $ Diet.low.in.whole.grains                : num [1:206] 254974.9 8460.8 1332.2 1037.5 14.2 ...
##  $ Alochol.use                             : num [1:206] 362617.1 459.3 7689.9 559.6 32.4 ...
##  $ Diet.low.in.fruits                      : num [1:206] 157106.8 4004 1271.4 266.1 3.6 ...
##  $ Unsafe.water.source                     : num [1:206] 277264.8 7174.6 23526.3 15.6 0 ...
##  $ Secondhand.smoke                        : num [1:206] 1.99e+05 5.93e+03 1.70e+03 7.08e+02 6.47 ...
##  $ Low.birth.weight                        : num [1:206] 3.72e+05 2.36e+04 1.81e+04 3.75e+02 7.33e-01 ...
##  $ Child.wasting                           : num [1:206] 309658 23388 30602 254 0 ...
##  $ Unsafe.sex                              : num [1:206] 172391.1 532.3 7531.2 57.1 4.7 ...
##  $ Diet.low.in.nuts.and.seeds              : num [1:206] 85270.3 3121.6 474.2 314.6 1.8 ...
##  $ Household.air.pollution.from.solid.fuels: num [1:206] 530919 38546 16643 1281 0 ...
##  $ Diet.low.in.Vegetables                  : num [1:206] 78076.44 4411.6 1060.13 95.47 2.27 ...
##  $ Low.physical.activity                   : num [1:206] 1.10e+05 3.22e+03 4.48e+02 1.83e+02 8.53 ...
##  $ Smoking                                 : num [1:206] 1.21e+06 7.04e+03 5.77e+03 3.16e+03 8.62e+01 ...
##  $ High.fasting.plasma.glucose             : num [1:206] 776159.1 18008.6 6578 1344.6 31.4 ...
##  $ Air.pollution                           : num [1:206] 1.06e+06 4.34e+04 1.94e+04 2.46e+03 1.38e+01 ...
##  $ High.body.mass.index                    : num [1:206] 594427.9 12552 3000.5 2060.9 38.1 ...
##  $ Unsafe.sanitation                       : num [1:206] 1.98e+05 5.31e+03 1.63e+04 8.77 0.00 ...
##  $ No.access.to.handwashing.facility       : num [1:206] 135995.4 6498.1 13052.4 24.8 0 ...
##  $ Drug.use                                : num [1:206] 68113.78 398.5 258.83 95.27 3.93 ...
##  $ Low.bone.mineral.density                : num [1:206] 54349.5 483 475.6 52.6 7.2 ...
##  $ Vitamin.A.deficiency                    : num [1:206] 15169.12 1928.67 2073.8 1.73 0 ...
##  $ Child.stunting                          : num [1:206] 69087 8245 6997 102 0 ...
##  $ Discontinued.breastfeeding              : num [1:206] 2668.617 238.467 345.033 0.433 0 ...
##  $ Non.exclusive.breastfeeding             : num [1:206] 44388.4 3164.7 5147.1 50.1 0 ...
##  $ Iron.deficiency                         : num [1:206] 8902.3 875.4 428.4 1.2 0 ...
##  $ Total_Deaths                            : num [1:206] 1839838 13833 15420 4520 129 ...
# Keep only rows that carry a real country code:
#   * Code == ""             -> entities without a code (the first group of
#                               the summary; presumably regional aggregates)
#   * Code contains OWID_WRL -> presumably the world total -- TODO confirm
# A logical mask replaces the positional `[-1, ]` / `[-grep(...), ]` pair.
# `[-1, ]` removes whatever row happens to be first, and `-grep()` with no
# match evaluates to `-integer(0)`, which selects zero rows and would silently
# empty the table.
country_level_data <- country_level_data[
  nzchar(country_level_data$Code) &
    !grepl("OWID_WRL", country_level_data$Code, fixed = TRUE),
]
str(country_level_data)
## tibble [204 × 31] (S3: tbl_df/tbl/data.frame)
##  $ Code                                    : chr [1:204] "AFG" "AGO" "ALB" "AND" ...
##  $ Year                                    : num [1:204] 2004 2004 2004 2004 2004 ...
##  $ Outdoor.air.pollution                   : num [1:204] 5178 2853.2 1190.7 13.9 1484.1 ...
##  $ High.systolic.blood.pressure            : num [1:204] 30207 11753.5 5464.9 69.5 2555.1 ...
##  $ Diet.high.in.sodium                     : num [1:204] 1241.73 839.37 1814.53 4.23 119.53 ...
##  $ Diet.low.in.whole.grains                : num [1:204] 8460.8 1332.2 1037.5 14.2 611.9 ...
##  $ Alochol.use                             : num [1:204] 459.3 7689.9 559.6 32.4 291.7 ...
##  $ Diet.low.in.fruits                      : num [1:204] 4004 1271.4 266.1 3.6 116.4 ...
##  $ Unsafe.water.source                     : num [1:204] 7174.6 23526.3 15.6 0 17.1 ...
##  $ Secondhand.smoke                        : num [1:204] 5931.47 1700.97 707.57 6.47 394.07 ...
##  $ Low.birth.weight                        : num [1:204] 2.36e+04 1.81e+04 3.75e+02 7.33e-01 1.52e+02 ...
##  $ Child.wasting                           : num [1:204] 23388.5 30602.3 254.2 0 18.6 ...
##  $ Unsafe.sex                              : num [1:204] 532.3 7531.2 57.1 4.7 65 ...
##  $ Diet.low.in.nuts.and.seeds              : num [1:204] 3121.6 474.2 314.6 1.8 10.7 ...
##  $ Household.air.pollution.from.solid.fuels: num [1:204] 3.85e+04 1.66e+04 1.28e+03 0.00 6.67e-01 ...
##  $ Diet.low.in.Vegetables                  : num [1:204] 4411.6 1060.13 95.47 2.27 88.87 ...
##  $ Low.physical.activity                   : num [1:204] 3215.9 447.93 182.93 8.53 472.77 ...
##  $ Smoking                                 : num [1:204] 7043.3 5770.6 3158 86.2 1491.8 ...
##  $ High.fasting.plasma.glucose             : num [1:204] 18008.6 6578 1344.6 31.4 1771.6 ...
##  $ Air.pollution                           : num [1:204] 43384.1 19412.9 2457.7 13.8 1466.7 ...
##  $ High.body.mass.index                    : num [1:204] 12552 3000.5 2060.9 38.1 2925.6 ...
##  $ Unsafe.sanitation                       : num [1:204] 5311.47 16253.77 8.77 0 1.37 ...
##  $ No.access.to.handwashing.facility       : num [1:204] 6498.1 13052.4 24.8 0 16.5 ...
##  $ Drug.use                                : num [1:204] 398.5 258.83 95.27 3.93 410.43 ...
##  $ Low.bone.mineral.density                : num [1:204] 483 475.6 52.6 7.2 144.5 ...
##  $ Vitamin.A.deficiency                    : num [1:204] 1.93e+03 2.07e+03 1.73 0.00 3.33e-02 ...
##  $ Child.stunting                          : num [1:204] 8245.2 6997.2 102.4 0 2.8 ...
##  $ Discontinued.breastfeeding              : num [1:204] 238.467 345.033 0.433 0 0 ...
##  $ Non.exclusive.breastfeeding             : num [1:204] 3164.7 5147.1 50.1 0 2.07 ...
##  $ Iron.deficiency                         : num [1:204] 875.4 428.37 1.2 0 1.43 ...
##  $ Total_Deaths                            : num [1:204] 13833 15420 4520 129 2588 ...
# Preview the last six rows of the filtered table.
tail(country_level_data, n = 6L)
## # A tibble: 6 × 31
##   Code   Year Outdoor.air.pollution High.systolic.blood.pr…¹ Diet.high.in.sodium
##   <chr> <dbl>                 <dbl>                    <dbl>               <dbl>
## 1 VUT   2004.                  29.7                     371.                53.3
## 2 WSM   2004.                  32.3                     260.                15.9
## 3 YEM   2004.                6025.                    19767.               818. 
## 4 ZAF   2004.               21862.                    54454.              3253. 
## 5 ZMB   2004.                1810.                     6566               1379. 
## 6 ZWE   2004.                2162.                     9282.               856. 
## # ℹ abbreviated name: ¹​High.systolic.blood.pressure
## # ℹ 26 more variables: Diet.low.in.whole.grains <dbl>, Alochol.use <dbl>,
## #   Diet.low.in.fruits <dbl>, Unsafe.water.source <dbl>,
## #   Secondhand.smoke <dbl>, Low.birth.weight <dbl>, Child.wasting <dbl>,
## #   Unsafe.sex <dbl>, Diet.low.in.nuts.and.seeds <dbl>,
## #   Household.air.pollution.from.solid.fuels <dbl>,
## #   Diet.low.in.Vegetables <dbl>, Low.physical.activity <dbl>, Smoking <dbl>, …
# Step 1: Selecting Feature Variables
# List the available columns before choosing the features for clustering
str(country_level_data)
## tibble [204 × 31] (S3: tbl_df/tbl/data.frame)
##  $ Code                                    : chr [1:204] "AFG" "AGO" "ALB" "AND" ...
##  $ Year                                    : num [1:204] 2004 2004 2004 2004 2004 ...
##  $ Outdoor.air.pollution                   : num [1:204] 5178 2853.2 1190.7 13.9 1484.1 ...
##  $ High.systolic.blood.pressure            : num [1:204] 30207 11753.5 5464.9 69.5 2555.1 ...
##  $ Diet.high.in.sodium                     : num [1:204] 1241.73 839.37 1814.53 4.23 119.53 ...
##  $ Diet.low.in.whole.grains                : num [1:204] 8460.8 1332.2 1037.5 14.2 611.9 ...
##  $ Alochol.use                             : num [1:204] 459.3 7689.9 559.6 32.4 291.7 ...
##  $ Diet.low.in.fruits                      : num [1:204] 4004 1271.4 266.1 3.6 116.4 ...
##  $ Unsafe.water.source                     : num [1:204] 7174.6 23526.3 15.6 0 17.1 ...
##  $ Secondhand.smoke                        : num [1:204] 5931.47 1700.97 707.57 6.47 394.07 ...
##  $ Low.birth.weight                        : num [1:204] 2.36e+04 1.81e+04 3.75e+02 7.33e-01 1.52e+02 ...
##  $ Child.wasting                           : num [1:204] 23388.5 30602.3 254.2 0 18.6 ...
##  $ Unsafe.sex                              : num [1:204] 532.3 7531.2 57.1 4.7 65 ...
##  $ Diet.low.in.nuts.and.seeds              : num [1:204] 3121.6 474.2 314.6 1.8 10.7 ...
##  $ Household.air.pollution.from.solid.fuels: num [1:204] 3.85e+04 1.66e+04 1.28e+03 0.00 6.67e-01 ...
##  $ Diet.low.in.Vegetables                  : num [1:204] 4411.6 1060.13 95.47 2.27 88.87 ...
##  $ Low.physical.activity                   : num [1:204] 3215.9 447.93 182.93 8.53 472.77 ...
##  $ Smoking                                 : num [1:204] 7043.3 5770.6 3158 86.2 1491.8 ...
##  $ High.fasting.plasma.glucose             : num [1:204] 18008.6 6578 1344.6 31.4 1771.6 ...
##  $ Air.pollution                           : num [1:204] 43384.1 19412.9 2457.7 13.8 1466.7 ...
##  $ High.body.mass.index                    : num [1:204] 12552 3000.5 2060.9 38.1 2925.6 ...
##  $ Unsafe.sanitation                       : num [1:204] 5311.47 16253.77 8.77 0 1.37 ...
##  $ No.access.to.handwashing.facility       : num [1:204] 6498.1 13052.4 24.8 0 16.5 ...
##  $ Drug.use                                : num [1:204] 398.5 258.83 95.27 3.93 410.43 ...
##  $ Low.bone.mineral.density                : num [1:204] 483 475.6 52.6 7.2 144.5 ...
##  $ Vitamin.A.deficiency                    : num [1:204] 1.93e+03 2.07e+03 1.73 0.00 3.33e-02 ...
##  $ Child.stunting                          : num [1:204] 8245.2 6997.2 102.4 0 2.8 ...
##  $ Discontinued.breastfeeding              : num [1:204] 238.467 345.033 0.433 0 0 ...
##  $ Non.exclusive.breastfeeding             : num [1:204] 3164.7 5147.1 50.1 0 2.07 ...
##  $ Iron.deficiency                         : num [1:204] 875.4 428.37 1.2 0 1.43 ...
##  $ Total_Deaths                            : num [1:204] 13833 15420 4520 129 2588 ...
# Keep only the four columns that are used as clustering features.
clustering_data <- select(
  country_level_data,
  all_of(c("Alochol.use", "Secondhand.smoke", "Smoking", "Drug.use"))
)

# Centre every feature and divide it by its standard deviation so the columns
# are comparable before distances are computed.
clustering_data_scale <- scale(x = clustering_data)

# Preview the first rows of the standardized matrix.
head(clustering_data_scale, n = 6L)
##      Alochol.use Secondhand.smoke     Smoking    Drug.use
## [1,] -0.25331543      0.005321049 -0.16598429 -0.17860798
## [2,] -0.06540833     -0.133598577 -0.17398764 -0.19513870
## [3,] -0.25070798     -0.166219487 -0.19041777 -0.21449818
## [4,] -0.26440965     -0.189241955 -0.20973573 -0.22530825
## [5,] -0.25767013     -0.176514086 -0.20089639 -0.17719557
## [6,]  0.21437372      0.067359009  0.08578723  0.03621244
# Step 2: Apply K-Means Clustering
# Fix the RNG seed so the random starting centres are reproducible.
set.seed(123)

# Fit k-means on the standardized features with k = 3 (an arbitrary first
# choice), keeping the best result out of 25 random starts.
kmeans_model <- kmeans(
  x = clustering_data_scale,
  centers = 3,
  nstart = 25
)
print(kmeans_model)
## K-means clustering with 3 clusters of sizes 200, 3, 1
## 
## Cluster means:
##   Alochol.use Secondhand.smoke    Smoking   Drug.use
## 1  -0.1211628       -0.1038789 -0.1109928 -0.1280741
## 2   4.4850983        2.7382539  3.2297423  5.1957899
## 3  10.7772713       12.5610152 12.5093297 10.0274451
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 38.08730 36.98749  0.00000
##  (between_SS / total_SS =  90.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Store each row's k-means cluster label on the country-level table as a
# factor column named `Cluster`.
country_level_data <- mutate(
  country_level_data,
  Cluster = factor(kmeans_model$cluster)
)

head(country_level_data, n = 6L)
## # A tibble: 6 × 32
##   Code   Year Outdoor.air.pollution High.systolic.blood.pr…¹ Diet.high.in.sodium
##   <chr> <dbl>                 <dbl>                    <dbl>               <dbl>
## 1 AFG   2004.                5178.                   30207.              1242.  
## 2 AGO   2004.                2853.                   11754.               839.  
## 3 ALB   2004.                1191.                    5465.              1815.  
## 4 AND   2004.                  13.9                     69.5                4.23
## 5 ARE   2004.                1484.                    2555.               120.  
## 6 ARG   2004.               12004.                   50664.              6633.  
## # ℹ abbreviated name: ¹​High.systolic.blood.pressure
## # ℹ 27 more variables: Diet.low.in.whole.grains <dbl>, Alochol.use <dbl>,
## #   Diet.low.in.fruits <dbl>, Unsafe.water.source <dbl>,
## #   Secondhand.smoke <dbl>, Low.birth.weight <dbl>, Child.wasting <dbl>,
## #   Unsafe.sex <dbl>, Diet.low.in.nuts.and.seeds <dbl>,
## #   Household.air.pollution.from.solid.fuels <dbl>,
## #   Diet.low.in.Vegetables <dbl>, Low.physical.activity <dbl>, Smoking <dbl>, …
# Step 3: Visualize the Clustering Results
library(ggplot2)

# Use PCA to reduce the dimensionality for visualization.
# Run it on the standardized matrix that k-means was fitted on: the raw
# columns are on very different scales, so a PCA of the unscaled data is
# dominated by the largest-variance column and does not show the space in
# which the clusters were actually formed.
# NOTE(review): the str() output recorded below came from the unscaled fit;
# re-knit the document to refresh it.
pca <- prcomp(clustering_data_scale)

str(pca$x)
##  num [1:204, 1:4] -27462 -27853 -32116 -35304 -33812 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
# Collect the first two principal-component scores together with each row's
# cluster label.
plot_data <- data.frame(
  pca$x[, c("PC1", "PC2")],
  Cluster = country_level_data$Cluster
)

# Scatter the rows in PC1/PC2 space, coloured by cluster.
ggplot(data = plot_data, mapping = aes(x = PC1, y = PC2, colour = Cluster)) +
  geom_point() +
  labs(title = "K-Means Clustering Results (k = 3)") +
  theme_minimal()

# Pair the unscaled clustering features with each row's country code (taken
# from the `Code` column) and its k-means cluster label.
clustering_data_with_countries <- data.frame(
  clustering_data,
  Country = country_level_data$Code,
  Cluster = factor(kmeans_model$cluster)
)
# Visualize clustering
library(factoextra)
## Warning: 程辑包'factoextra'是用R版本4.2.3 来建造的
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Plot the k-means result with fviz_cluster (points only, default labels
# suppressed, centroids hidden), then overlay each row's country code in black
# just above its point.
fviz_cluster(
  object = kmeans_model,
  data = clustering_data_scale,
  geom = "point",
  show.clust.cent = FALSE,
  labelsize = 0
) +
  geom_text(
    mapping = aes(label = clustering_data_with_countries$Country),
    colour = "black",
    size = 3,
    hjust = 0.5,
    vjust = -0.5
  )

library(shiny)
## Warning: 程辑包'shiny'是用R版本4.2.3 来建造的
# Define UI for application: a navbar page with one tab per analysis stage.
# `data_clean` must already exist in the session; it supplies the variable and
# country choices below.
ui <- navbarPage(
  title = "Interactive Data Analysis App",

  # EDA panel: one variable at a time, shown as a plot plus its summary()
  tabPanel(
    "EDA",
    sidebarLayout(
      sidebarPanel(
        selectInput(
          "eda_var", "Select Variable for EDA",
          choices = names(data_clean),
          selected = "Alochol.use"
        ),

        # Dropdown to select the type of plot
        selectInput(
          "plot_type", "Select Plot Type",
          choices = c("Histogram", "Boxplot"),
          selected = "Histogram"
        )
      ),
      mainPanel(
        plotOutput("edaPlot"),
        verbatimTextOutput("summaryOutput")
      )
    )
  ),

  # Facet line plot panel: selected causes over time, one facet per country
  tabPanel(
    "Specific Variables EDA",
    sidebarLayout(
      sidebarPanel(
        checkboxGroupInput(
          "facet_vars", "Select Variables",
          choices = c("Alochol.use", "Smoking", "Secondhand.smoke", "Drug.use"),
          selected = c("Alochol.use", "Smoking", "Secondhand.smoke", "Drug.use")
        ),
        selectInput(
          "facet_countries", "Select Countries",
          choices = unique(data_clean$Entity),
          # Default to the first five countries; head() avoids NA entries
          # when fewer than five are available.
          selected = head(unique(data_clean$Entity), 5),
          multiple = TRUE
        )
      ),
      mainPanel(
        plotOutput("facetPlot")
      )
    )
  ),

  # Classification panel: ROC curve and metrics for the chosen model
  tabPanel(
    "Classification",
    sidebarLayout(
      sidebarPanel(
        selectInput(
          "model", "Select Model",
          choices = c("Decision Tree", "Logistic Regression")
        )
      ),
      mainPanel(
        plotOutput("rocPlot"),
        # Preformatted output: the server separates the metrics with "\n",
        # which a plain textOutput() would collapse onto a single line.
        verbatimTextOutput("modelPerformance")
      )
    )
  ),

  # Clustering panel: k-means plot for a user-chosen number of clusters
  tabPanel(
    "Clustering",
    sidebarLayout(
      sidebarPanel(
        sliderInput(
          "clusters", "Number of Clusters:",
          min = 2, max = 5, value = 3
        )
      ),
      mainPanel(
        # plotOutput("pcaPlot"),  # disabled: the server-side output is
        #                         # commented out as well
        # No trailing comma after the last argument: an empty trailing
        # argument is an error on older htmltools versions.
        plotOutput("clusterPlot")
      )
    )
  )
)

# Define server logic.
#
# Renders the outputs declared in `ui`. Relies on objects created earlier in
# the script: `data_clean`, `test_set`, `pred_logit`, `pred_prob`, the
# accuracy/precision/recall/F1 scalars for both models, `clustering_data_scale`
# and `country_level_data`.
#
# @param input  Shiny input list (eda_var, plot_type, facet_vars,
#               facet_countries, model, clusters).
# @param output Shiny output list that the render functions are assigned to.
server <- function(input, output) {

  # EDA Panel Logic
  output$edaPlot <- renderPlot({
    req(input$eda_var, input$plot_type)
    var <- input$eda_var

    # `.data[[var]]` replaces the deprecated aes_string(); the axis label is
    # set explicitly so it still shows the column name.
    if (is.numeric(data_clean[[var]])) {
      if (input$plot_type == "Histogram") {
        # Histogram for numeric variables
        ggplot(data_clean, aes(x = .data[[var]])) +
          geom_histogram(fill = "blue", color = "black", bins = 30) +
          labs(x = var) +
          ggtitle(paste("Histogram of", var)) +
          theme_minimal()
      } else if (input$plot_type == "Boxplot") {
        # Boxplot for numeric variables
        ggplot(data_clean, aes(y = .data[[var]])) +
          geom_boxplot(fill = "blue", color = "black") +
          labs(y = var) +
          ggtitle(paste("Boxplot of", var)) +
          theme_minimal()
      }
    } else {
      # Bar chart for categorical variables
      ggplot(data_clean, aes(x = .data[[var]])) +
        geom_bar(fill = "blue", color = "black") +
        labs(x = var) +
        ggtitle(paste("Bar Chart of", var)) +
        # Rotate and shrink the tick labels so long category names fit
        theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8)) +
        # Show every second label to reduce overlap
        scale_x_discrete(breaks = function(x) x[seq(1, length(x), by = 2)])
    }
  })

  # EDA Summary Output
  output$summaryOutput <- renderPrint({
    req(input$eda_var)
    summary(data_clean[[input$eda_var]])
  })

  # Facet line plot: selected causes over time, one facet per country
  output$facetPlot <- renderPlot({
    # Wait until at least one variable and one country are selected; without
    # this, clearing either input makes the reshape/plot below raise an error.
    req(input$facet_vars, input$facet_countries)

    # Keep the selected countries and reshape the selected causes to long
    # format (one row per Year/Entity/Cause).
    facet_data <- data_clean %>%
      filter(Entity %in% input$facet_countries) %>%
      select(Year, Entity, all_of(input$facet_vars)) %>%
      pivot_longer(
        cols = all_of(input$facet_vars),
        names_to = "Cause",
        values_to = "Deaths"
      )

    ggplot(facet_data, aes(x = Year, y = Deaths, color = Cause)) +
      geom_line() +
      facet_wrap(~ Entity, scales = "free_y") +
      theme_minimal() +
      ggtitle("Facet Plot of Deaths for Selected Causes") +
      xlab("Year") +
      ylab("Number of Deaths")
  })

  # Classification Panel Logic (ROC and Performance)
  output$rocPlot <- renderPlot({
    req(input$model)
    if (input$model == "Logistic Regression") {
      roc_obj <- roc(test_set$High_Deaths, pred_logit)
      plot(roc_obj, main = "ROC Curve for Logistic Regression", col = "blue")
    } else if (input$model == "Decision Tree") {
      # Column 2 of pred_prob is used as the score.
      # NOTE(review): presumably the positive-class probability -- confirm.
      roc_curve <- roc(test_set$High_Deaths, pred_prob[, 2])
      plot(roc_curve, main = "ROC Curve for Decision Tree", col = "red")
    }
  })

  # Classification Performance Metrics (newline-separated text)
  output$modelPerformance <- renderText({
    req(input$model)
    if (input$model == "Logistic Regression") {
      paste("Accuracy:", accuracy_logi, "\nPrecision:", precision_logi, "\nRecall:", recall_logi, "\nF1 Score:", f1_score_logi)
    } else if (input$model == "Decision Tree") {
      paste("Accuracy:", accuracy_tree, "\nPrecision:", precision_tree, "\nRecall:", recall_tree, "\nF1 Score:", f1_score_tree)
    }
  })

  # Clustering Panel Logic
  output$clusterPlot <- renderPlot({
    req(input$clusters)

    # Re-fit k-means for the requested k; the fixed seed keeps the result
    # stable across redraws.
    set.seed(123)
    kmeans_model <- kmeans(
      clustering_data_scale,
      centers = input$clusters,
      nstart = 25
    )

    # Country codes (the `Code` column) used as point labels.
    # NOTE(review): assumes fviz_cluster keeps the rows in input order.
    country_codes <- country_level_data$Code

    # Cluster plot without default labels or centroids, with the country code
    # drawn in black just above each point.
    fviz_cluster(kmeans_model,
                 data = clustering_data_scale,
                 geom = "point",
                 labelsize = 0,
                 show.clust.cent = FALSE) +
      geom_text(aes(label = country_codes),
                size = 3,
                vjust = -0.5,
                hjust = 0.5,
                color = "black")
  })

  # PCA for Clustering Visualization (disabled)
  # output$pcaPlot <- renderPlot({
  #   pca <- prcomp(clustering_data)
  #   plot_data <- data.frame(PC1 = pca$x[, 1], PC2 = pca$x[, 2],
  #                           Cluster = kmeans_model$cluster)
  #
  #   ggplot(plot_data, aes(x = PC1, y = PC2, color = Cluster)) +
  #     geom_point() +
  #     ggtitle("PCA Plot for Clusters")
  # })
}

# Build the Shiny app object from the UI definition and server function;
# printing it at top level launches the app.
shinyApp(ui, server)
## PhantomJS not found. You can install it with webshot::install_phantomjs(). If it is installed, please make sure the phantomjs executable can be found via the PATH variable.
## Shiny applications not supported in static R Markdown documents