The aim of this project is to analyze and predict if the other causes of deaths can predict if blood pressure death ratio is high or low, mainly ones that affects the respiratory system.
# Load libraries
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: lattice
library(rpart)
library(rpart.plot)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(lime)
##
## Attaching package: 'lime'
## The following object is masked from 'package:dplyr':
##
## explain
library(knitr)
library(rmarkdown)
The dataset is inputted into the program, and str(), summary(), and head() is used to find the structure, statistical summary, and first rows of the dataset.
df <- read.csv("C:\\Users\\Ralf\\Documents\\(Uni)\\Data Analysis\\Assignment\\Countries and death causes.csv")
str(df)
## 'data.frame': 6840 obs. of 31 variables:
## $ Entity : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Code : chr "AFG" "AFG" "AFG" "AFG" ...
## $ Year : int 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 ...
## $ Outdoor.air.pollution : int 3169 3222 3395 3623 3788 3869 3943 4024 4040 4042 ...
## $ High.systolic.blood.pressure : int 25633 25872 26309 26961 27658 28090 28587 29021 29349 29712 ...
## $ Diet.high.in.sodium : int 1045 1055 1075 1103 1134 1154 1178 1202 1222 1242 ...
## $ Diet.low.in.whole.grains : int 7077 7149 7297 7499 7698 7807 7943 8075 8173 8265 ...
## $ Alochol.use : int 356 364 376 389 399 406 413 420 425 426 ...
## $ Diet.low.in.fruits : int 3185 3248 3351 3480 3610 3703 3819 3938 4038 4127 ...
## $ Unsafe.water.source : int 3702 4309 5356 7152 7192 8378 8487 9348 9788 9931 ...
## $ Secondhand.smoke : int 4794 4921 5279 5734 6050 6167 6298 6425 6402 6323 ...
## $ Low.birth.weight : int 16135 17924 21200 23795 24866 25534 25997 26246 25805 25080 ...
## $ Child.wasting : int 19546 20334 22895 27002 29205 30943 31628 32736 32760 32271 ...
## $ Unsafe.sex : int 351 361 378 395 410 422 435 448 458 469 ...
## $ Diet.low.in.nuts.and.seeds : int 2319 2449 2603 2771 2932 3049 3173 3298 3401 3482 ...
## $ Household.air.pollution.from.solid.fuels: int 34372 35392 38065 41154 43153 44024 45005 46017 46055 45681 ...
## $ Diet.low.in.Vegetables : int 3679 3732 3827 3951 4075 4153 4247 4339 4409 4473 ...
## $ Low.physical.activity : int 2637 2652 2688 2744 2805 2839 2878 2914 2944 2976 ...
## $ Smoking : int 5174 5247 5363 5522 5689 5801 5934 6066 6178 6288 ...
## $ High.fasting.plasma.glucose : int 11449 11811 12265 12821 13400 13871 14413 14970 15502 16058 ...
## $ Air.pollution : int 37231 38315 41172 44488 46634 47566 48617 49703 49746 49349 ...
## $ High.body.mass.index : int 9518 9489 9528 9611 9675 9608 9503 9286 9024 8857 ...
## $ Unsafe.sanitation : int 2798 3254 4042 5392 5418 6313 6393 7038 7366 7468 ...
## $ No.access.to.handwashing.facility : int 4825 5127 5889 7007 7421 7896 8098 8507 8560 8424 ...
## $ Drug.use : int 174 188 211 232 247 260 274 287 295 302 ...
## $ Low.bone.mineral.density : int 389 389 393 411 413 417 423 425 429 428 ...
## $ Vitamin.A.deficiency : int 2016 2056 2100 2316 2665 3070 3214 3228 3413 3662 ...
## $ Child.stunting : int 7686 7886 8568 9875 11031 11973 12426 12805 13011 13052 ...
## $ Discontinued.breastfeeding : int 107 121 150 204 204 233 233 255 264 263 ...
## $ Non.exclusive.breastfeeding : int 2216 2501 3053 3726 3833 4124 4183 4393 4417 4326 ...
## $ Iron.deficiency : int 564 611 700 773 812 848 883 914 924 909 ...
summary(df)
## Entity Code Year Outdoor.air.pollution
## Length:6840 Length:6840 Min. :1990 Min. : 0
## Class :character Class :character 1st Qu.:1997 1st Qu.: 434
## Mode :character Mode :character Median :2004 Median : 2101
## Mean :2004 Mean : 84582
## 3rd Qu.:2012 3rd Qu.: 11810
## Max. :2019 Max. :4506193
## High.systolic.blood.pressure Diet.high.in.sodium Diet.low.in.whole.grains
## Min. : 2 Min. : 0.0 Min. : 0.0
## 1st Qu.: 1828 1st Qu.: 137.0 1st Qu.: 273.8
## Median : 8770 Median : 969.5 Median : 1444.0
## Mean : 224225 Mean : 40497.2 Mean : 38691.3
## 3rd Qu.: 40356 3rd Qu.: 5169.8 3rd Qu.: 6773.2
## Max. :10845595 Max. :1885356.0 Max. :1844836.0
## Alochol.use Diet.low.in.fruits Unsafe.water.source Secondhand.smoke
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 1
## 1st Qu.: 263.8 1st Qu.: 144.0 1st Qu.: 7.0 1st Qu.: 209
## Median : 1780.5 Median : 834.5 Median : 182.5 Median : 994
## Mean : 54848.6 Mean : 23957.8 Mean : 44086.4 Mean : 30364
## 3rd Qu.: 8368.0 3rd Qu.: 3104.8 3rd Qu.: 5599.2 3rd Qu.: 4348
## Max. :2441973.0 Max. :1046015.0 Max. :2450944.0 Max. :1304318
## Low.birth.weight Child.wasting Unsafe.sex
## Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 123 1st Qu.: 26 1st Qu.: 97
## Median : 1057 Median : 504 Median : 619
## Mean : 59126 Mean : 49924 Mean : 27646
## 3rd Qu.: 10903 3rd Qu.: 9765 3rd Qu.: 4492
## Max. :3033425 Max. :3430422 Max. :1664813
## Diet.low.in.nuts.and.seeds Household.air.pollution.from.solid.fuels
## Min. : 0 Min. : 0
## 1st Qu.: 27 1st Qu.: 32
## Median : 252 Median : 821
## Mean : 12996 Mean : 83641
## 3rd Qu.: 1998 3rd Qu.: 10870
## Max. :575139 Max. :4358214
## Diet.low.in.Vegetables Low.physical.activity Smoking
## Min. : 0.0 Min. : 0.0 Min. : 1
## 1st Qu.: 109.0 1st Qu.: 92.0 1st Qu.: 894
## Median : 590.5 Median : 521.5 Median : 4987
## Mean : 11982.5 Mean : 16489.1 Mean : 181958
## 3rd Qu.: 2101.8 3rd Qu.: 2820.2 3rd Qu.: 23994
## Max. :529381.0 Max. :831502.0 Max. :7693368
## High.fasting.plasma.glucose Air.pollution High.body.mass.index
## Min. : 3 Min. : 0 Min. : 2
## 1st Qu.: 1178 1st Qu.: 816 1st Qu.: 918
## Median : 4966 Median : 5748 Median : 3917
## Mean : 117554 Mean : 164752 Mean : 89870
## 3rd Qu.: 21639 3rd Qu.: 25050 3rd Qu.: 17968
## Max. :6501398 Max. :6671740 Max. :5019360
## Unsafe.sanitation No.access.to.handwashing.facility Drug.use
## Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 3 1st Qu.: 19 1st Qu.: 31
## Median : 102 Median : 221 Median : 222
## Mean : 31522 Mean : 21800 Mean : 10285
## 3rd Qu.: 3854 3rd Qu.: 3954 3rd Qu.: 1224
## Max. :1842275 Max. :1200349 Max. :494492
## Low.bone.mineral.density Vitamin.A.deficiency Child.stunting
## Min. : 0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 43 1st Qu.: 0.0 1st Qu.: 1.0
## Median : 277 Median : 2.0 Median : 41.5
## Mean : 8182 Mean : 2471.6 Mean : 11164.3
## 3rd Qu.: 1232 3rd Qu.: 230.2 3rd Qu.: 1563.2
## Max. :437884 Max. :207555.0 Max. :833449.0
## Discontinued.breastfeeding Non.exclusive.breastfeeding Iron.deficiency
## Min. : 0.00 Min. : 0.0 Min. : 0
## 1st Qu.: 0.00 1st Qu.: 3.0 1st Qu.: 1
## Median : 4.00 Median : 60.5 Median : 12
## Mean : 431.46 Mean : 7171.9 Mean : 1421
## 3rd Qu.: 71.25 3rd Qu.: 1315.5 3rd Qu.: 238
## Max. :33106.00 Max. :505470.0 Max. :73461
head(df)
From the summaries, it can be seen that the dataset includes two character strings for the first two collumns that identify which country the death number is attributed to, with the rest of the collumns showing how many deaths were caused by each individual cause.Another thing to note is that there were 228 unique countries (Entity), but only 205 unique codes (Code), meaning that the dataset does need to be cleaned.
Now, plots are generated to help visualization of the dataset. But, as the entity and code share the same purpose of identifying which country the data is attached to, it is better to discard one of them. The “Code” collumn should be omitted from the analysis.
# Remove the "Code" collumn
df <- df[, !names(df) %in% "Code"]
# Check is "Code" is removed or not
head(df)
As shown above, the Code collumn is removed.
Now to visualize which cause of death is the most common one, a boxplot can be plotted using the geom_boxplot function. Removing any null values is also done to delete any anomalies in the dataset.
#remove non-numerical data from the dataset
causes_df <- df[, !(names(df) %in% c("Entity", "Code", "Year"))]
#omit any null values in the dataset
omit_causes_df <- na.omit(causes_df)
# Reshapes the dataset. Converts each death cause collumn into a row to help visualize the individual cause of death (I used ChatGPT 4o)
long_df <- data.frame(
Cause = rep(names(omit_causes_df), each = nrow(omit_causes_df)),
Deaths = as.vector(as.matrix(omit_causes_df))
)
# Create the boxplot for the different causes of deaths
ggplot(long_df, aes(x = Cause, y = Deaths)) +
geom_bar(stat = "identity", , color = "black") +
ggtitle("Distribution of Deaths by Different Causes") +
xlab("Cause of Death") +
ylab("Number of Deaths") +
theme_minimal() +
coord_flip()
It can be seen that high blood pressure is the leading cause of death
around the world.
Create a new collumn that sums the total deaths for each row
# Only include number of deaths
death_causes_columns <- df[, !(names(df) %in% c("Entity", "Year", "Code"))]
# Create new dataset that combines the number of deaths per row
new_df <- df %>%
mutate(total_deaths =rowSums(death_causes_columns, na.rm = TRUE))
To classify whether the number of deaths caused by blood pressure for a given country for a given year is considered high or low, the mean value needs to be established. A ratio that compares the blood pressure related deaths and the total deaths is used to get the average value. A new collumn “BPDeathRatio” is that new ratio.
The aim of the project now is to see if smoking, secondhand smoke, and outdoor air pollution numbers can predict a high blood pressure ratio death. The target variable is then the BPDeathRatio, and the feature variables would be the Smoking and secondhand smoke ratio collumns.
# Select smoking and secondhand smoke data as features
model_df <- new_df[, c("Entity", "Year", "High.systolic.blood.pressure", "total_deaths", "Smoking", "Secondhand.smoke", "Outdoor.air.pollution")]
# Create BPDeathRatio and classify as high/low (convert to binary classification)
model_df <- model_df %>%
mutate(BPDeathRatio = High.systolic.blood.pressure / total_deaths) %>%
mutate(SmokingDeathRatio = Smoking / total_deaths) %>%
mutate(SecondhandSmokeDeathRatio = Secondhand.smoke / total_deaths) %>%
mutate(PollutionDeathRatio = Outdoor.air.pollution / total_deaths) %>%
mutate(Class_BP = ifelse
(BPDeathRatio > mean(BPDeathRatio)
, 1
, 0)
)
# Calculate the mean of BPDeathRatio
mean_valueBP <- mean(model_df$BPDeathRatio)
# Print the median value
paste("The median value of BPDeathRatio is:", mean_valueBP)
## [1] "The median value of BPDeathRatio is: 0.166748675306498"
# View data
head(model_df)
A binary classification is shown in the “class_BP” collumn, 0 for lower, 1 for higher compared to the median value of the blood pressure death ratio
We can also show the trend of this ratio year by year
ggplot(model_df, aes(x = Year, y = BPDeathRatio)) +
geom_smooth(color = "red") +
labs(title = "Year-by-Year Trend of Blood Pressure Death Ratio",
x = "Year",
y = "Blood Pressure Death Ratio")+
theme_minimal()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
It can be seen that the ratio does indeed increase, signaling a higher
percentage of people dying due to high blood pressure in more recent
years.
Next step is to split the dataset using a 80/20 ratio
# Set a seed for reproducibility
set.seed(11111)
# Split the data (80% training, 20% test)
intrain <- runif(nrow(model_df)) < 0.8
# Create the training and test datasets
train <- model_df[intrain,]
test <- model_df[!intrain,]
paste(
"Training set size:",
nrow(train),
"Test set size:",
nrow(test)
)
## [1] "Training set size: 5478 Test set size: 1362"
Now train the model using decision tree classifier, with the other ratios as the feature variables
tree_model <- rpart(Class_BP ~ SmokingDeathRatio + SecondhandSmokeDeathRatio + PollutionDeathRatio, data = train, method = "class")
# Visualize the decision tree
rpart.plot(tree_model)
# Predict on the test set
predictions <- predict(tree_model, newdata = test, type = "class")
# Show confusion matrix
confusion_matrix <- table(test$Class_BP, predictions, dnn = c("Actual", "Predicted"))
print(confusion_matrix)
## Predicted
## Actual 0 1
## 0 419 139
## 1 76 728
# Calculate accuracy
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
paste("Decision Tree Model Accuracy:", round(accuracy * 100, 2), "%")
## [1] "Decision Tree Model Accuracy: 84.21 %"
# Get predicted probabilities for ROC curve
dt_pred_probs <- predict(tree_model, newdata = test, type = "prob")[,2]
# Plot ROC curve and calculate AUC
dt_roc_curve <- roc(test$Class_BP, dt_pred_probs, quiet=TRUE)
plot(dt_roc_curve, col = "blue", lwd = 3, main = "ROC Curve for Decision Tree Model")
dt_auc_value <- auc(dt_roc_curve)
paste("Decision Tree Model AUC:", round(dt_auc_value,4))
## [1] "Decision Tree Model AUC: 0.8701"
Next step is to compare the decision tree model with a new model, in this case using a logistic regression model
# Train the logistic regression model
logistic_model <- glm(Class_BP ~ SmokingDeathRatio + SecondhandSmokeDeathRatio + PollutionDeathRatio,
data = train, family = "binomial")
# Predict on the test set
logistic_predictions <- predict(logistic_model, newdata = test, type = "response")
logistic_class <- ifelse(logistic_predictions > 0.5, 1, 0)
# Confusion Matrix for logistic regression
logistic_confusion_matrix <- table(test$Class_BP, logistic_class, dnn = c("Actual", "Predicted"))
print(logistic_confusion_matrix)
## Predicted
## Actual 0 1
## 0 417 141
## 1 141 663
# Calculate accuracy
logistic_accuracy <- sum(diag(logistic_confusion_matrix)) / sum(logistic_confusion_matrix)
paste("Logistic Regression Model Accuracy:", round(logistic_accuracy * 100, 2), "%")
## [1] "Logistic Regression Model Accuracy: 79.3 %"
# Plot ROC curve and calculate AUC
logistic_roc_curve <- roc(test$Class_BP, logistic_predictions, quiet=TRUE)
plot(logistic_roc_curve, col = "red", lwd = 3, main = "ROC Curve for Logistic Regression Model")
logistic_auc_value <- auc(logistic_roc_curve)
paste("Logistic Regression Model AUC:", round(logistic_auc_value, 4))
## [1] "Logistic Regression Model AUC: 0.8676"
We can then compare the result of the two models
# Plot the ROC curve for the decision tree model
plot(dt_roc_curve, col="blue", lwd = 3, main = "ROC Curves for Decision Tree and Logistic Regression Models")
# Add the ROC curve for the logistic regression model
lines(logistic_roc_curve, col="red", lwd = 3)
# Add a legend to distinguish between the two models
legend("bottomright", legend = c("Decision Tree, AUC = 0.870", "Logistic Regression, AUC = 0.868"),
col = c("blue", "red"), lwd = 3)
It can be seen that the two models perform relatively similarly, with a
similar AUC value; but it can be seen that the Decision Tree model does
have a higher accuracy compared to the Logistic Regression model,
meaning that it has a stronger predictive power for this particular
case.
If we were to compare the accuracy of these models with a null model, in this case the null model gives the data with the most results of that particular collumn (class_BP).
# Check the proportion of 1/0s in the training set
prop.table(table(train$Class_BP))
##
## 0 1
## 0.424425 0.575575
# Null model prediction (predicting all 1s since it's the majority class)
null_predictions <- rep(1, nrow(test))
# Confusion matrix for the null model
null_confusion_matrix <- table(test$Class_BP, null_predictions, dnn = c("Actual", "Predicted"))
print(null_confusion_matrix)
## Predicted
## Actual 1
## 0 558
## 1 804
# Calculate accuracy of the null model
null_accuracy <- sum(diag(null_confusion_matrix)) / sum(null_confusion_matrix)
paste("Null Model Accuracy:", round(null_accuracy * 100, 2), "%")
## [1] "Null Model Accuracy: 40.97 %"
# Create a vector of probabilities for the ROC curve (all predicted as 1)
null_pred_probs <- rep(1, nrow(test))
# Plot ROC curve for the null model
null_roc_curve <- roc(test$Class_BP, null_pred_probs, quiet=TRUE)
plot(null_roc_curve, col = "green", lwd = 3, main = "ROC Curves for Decision Tree, Logistic Regression, and Null Models")
# Add ROC curve for decision tree and logistic regression (reuse from before)
lines(dt_roc_curve, col = "blue", lwd = 3)
lines(logistic_roc_curve, col = "red", lwd = 3)
# Add legend
legend("bottomright", legend = c("Decision Tree", "Logistic Regression", "Null Model"),
col = c("blue", "red", "green"), lwd = 3)
null_auc_value <- auc(null_roc_curve)
paste("Null Model AUC:", round(null_auc_value, 4))
## [1] "Null Model AUC: 0.5"
It can be seen that the null model performed poorly compared to the more advanced models
Now that it is established that the Decision Tree model is more accurate, a more exploratory analysis can be done to analyse which of the three feature variables performs the best.
Three different decision trees can be made by using similar steps previously
tree_model_smoking <- rpart(Class_BP ~ SmokingDeathRatio, data = train, method = "class")
rpart.plot(tree_model_smoking)
tree_model_secondhand <- rpart(Class_BP ~ SecondhandSmokeDeathRatio, data = train, method = "class")
rpart.plot(tree_model_secondhand)
tree_model_pollution <- rpart(Class_BP ~ PollutionDeathRatio, data = train, method = "class")
rpart.plot(tree_model_pollution)
The accuracy of each model can also be shown
predictions_smoking <- predict(tree_model_smoking, newdata = test, type = "class")
confusion_matrix_smoking <- table(test$Class_BP, predictions_smoking)
accuracy_smoking <- sum(diag(confusion_matrix_smoking)) / sum(confusion_matrix_smoking)
paste("SmokingDeathRatio Model Accuracy:", round(accuracy_smoking * 100, 2), "%")
## [1] "SmokingDeathRatio Model Accuracy: 80.32 %"
predictions_secondhand <- predict(tree_model_secondhand, newdata = test, type = "class")
confusion_matrix_secondhand <- table(test$Class_BP, predictions_secondhand)
accuracy_secondhand <- sum(diag(confusion_matrix_secondhand)) / sum(confusion_matrix_secondhand)
paste("SecondhandSmokeDeathRatio Model Accuracy:", round(accuracy_secondhand * 100, 2), "%")
## [1] "SecondhandSmokeDeathRatio Model Accuracy: 74.96 %"
predictions_pollution <- predict(tree_model_pollution, newdata = test, type = "class")
confusion_matrix_pollution <- table(test$Class_BP, predictions_pollution)
accuracy_pollution <- sum(diag(confusion_matrix_pollution)) / sum(confusion_matrix_pollution)
paste("PollutionDeathRatio Model Accuracy:", round(accuracy_pollution * 100, 2), "%")
## [1] "PollutionDeathRatio Model Accuracy: 74.08 %"
Finally, the AUC values for the feature variables can be calculated to find which variable performs the best
smoking_pred_probs <- predict(tree_model_smoking, newdata = test, type = "prob")[,2]
smoking_roc_curve <- roc(test$Class_BP, smoking_pred_probs, quiet=TRUE)
plot(smoking_roc_curve, col = "blue", lwd = 3, main = "ROC Curve for SmokingDeathRatio Model")
secondhand_pred_probs <- predict(tree_model_secondhand, newdata = test, type = "prob")[,2]
secondhand_roc_curve <- roc(test$Class_BP, secondhand_pred_probs, quiet=TRUE)
lines(secondhand_roc_curve, col = "red", lwd = 3)
pollution_pred_probs <- predict(tree_model_pollution, newdata = test, type = "prob")[,2]
pollution_roc_curve <- roc(test$Class_BP, pollution_pred_probs, quiet=TRUE)
lines(pollution_roc_curve, col = "green", lwd = 3)
smoking_auc_value <- auc(smoking_roc_curve)
paste("SmokingDeathRatio AUC:", round(smoking_auc_value, 4))
## [1] "SmokingDeathRatio AUC: 0.8291"
secondhand_auc_value <- auc(secondhand_roc_curve)
paste("SecondhandSmokeDeathRatio AUC:", round(secondhand_auc_value, 4))
## [1] "SecondhandSmokeDeathRatio AUC: 0.7051"
pollution_auc_value <- auc(pollution_roc_curve)
paste("PollutionDeathRatio AUC:", round(pollution_auc_value, 4))
## [1] "PollutionDeathRatio AUC: 0.7213"
The smoking ratio is the highest, meaning that it has the most similarity with the blood pressure ratio. This makes sense, as it is known that “blood pressure and heart rate increase during smoking” (Omvik,1996), this model just confirms that statement. The combination of the variables do increase the accuracy of the model however, indicating that multiple risk factors can together better predict complex health outcomes.