#import libraries
#install.packages("openxlsx") # run once
library(openxlsx)
## Warning: package 'openxlsx' was built under R version 4.5.2
library(survival)
## Warning: package 'survival' was built under R version 4.5.2
#read the dataset uonhealthinform.xlsx file # Ensure both the dataset and the notebook file are in the same directory
df <- read.xlsx(file.choose())
attach(df)
#view data
View(df)
#check structure of data
str(df)
## 'data.frame': 11 obs. of 12 variables:
## $ Time : num 10 15 20 25 30 35 40 45 50 55 ...
## $ Event : num 1 1 0 1 1 0 1 1 0 0 ...
## $ Patient_ID : num 2356 4589 3 37 1213 ...
## $ Age : num 23 68 97 45 63 47 89 90 27 34 ...
## $ Gender : chr "M" "F" "F" "M" ...
## $ Height_cm : num 234 123 345 267 876 356 321 768 986 246 ...
## $ Weight_kg : num 56 76 86 79 34 56 29 55 89 43 ...
## $ Blood_Pressure : num 124 89 100 78 129 117 115 111 85 120 ...
## $ Disease_category: chr "Stroke" "TB" "Malaria" "Malaria" ...
## $ Region : chr "Urban" "Urban" "Rural" "Urban" ...
## $ County : chr "Nakuru" "Isiolo" "Mombasa" "Kisumu" ...
## $ Education_level : chr "Primary" "Secondary" "College" "University" ...
#check first variables rows
head(df)
## Time Event Patient_ID Age Gender Height_cm Weight_kg Blood_Pressure
## 1 10 1 2356 23 M 234 56 124
## 2 15 1 4589 68 F 123 76 89
## 3 20 0 3 97 F 345 86 100
## 4 25 1 37 45 M 267 79 78
## 5 30 1 1213 63 M 876 34 129
## 6 35 0 2390 47 M 356 56 117
## Disease_category Region County Education_level
## 1 Stroke Urban Nakuru Primary
## 2 TB Urban Isiolo Secondary
## 3 Malaria Rural Mombasa College
## 4 Malaria Urban Kisumu University
## 5 HIV Rural Machakos None
## 6 Cholera Urban Garissa Primary
#create A simple linear regression model without predictors/covariances
#we use ~1 if there is no predictor variables/covariance
model0 =lm(Blood_Pressure~1, data=df)
#check model summary
summary(model0)
##
## Call:
## lm(formula = Blood_Pressure ~ 1, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.727 -13.727 5.273 12.773 23.273
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 105.727 5.175 20.43 1.74e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.16 on 10 degrees of freedom
#Create multiple linear regression Model with predictors/covariances
#we use ~1 if there is no predictor variables/covariance
model1 =lm(Blood_Pressure~Age+factor(Gender)+Weight_kg)
#check model summary
summary(model1)
##
## Call:
## lm(formula = Blood_Pressure ~ Age + factor(Gender) + Weight_kg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.116 -7.252 1.834 6.130 14.699
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 149.46407 17.62397 8.481 6.27e-05 ***
## Age -0.05405 0.14994 -0.361 0.72910
## factor(Gender)M 1.20910 7.53639 0.160 0.87707
## Weight_kg -0.68512 0.19272 -3.555 0.00928 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.94 on 7 degrees of freedom
## Multiple R-squared: 0.6615, Adjusted R-squared: 0.5164
## F-statistic: 4.559 on 3 and 7 DF, p-value: 0.04508
#observation: A multiple linear regression model showed that weight was significantly associated with blood pressure (coefficient = −0.69, p = 0.009), while age and gender were not significant predictors having a p=0.73. The model explained approximately 66%(0.6615) of the variance in blood pressure (adjusted R-squared = 0.52% or (0.5164)) and was statistically significant overall (F(3,7) = 4.56, p = 0.045).
The multiple linear regression model perfomed better than the simple linear regression model, as indicated by the higher R-squared value (0.6615 vs. 0.5234) and significant F-statistic (p = 0.045).
#checking the 2 Model comparison (ANOVA)
anova(model0, model1)
## Analysis of Variance Table
##
## Model 1: Blood_Pressure ~ 1
## Model 2: Blood_Pressure ~ Age + factor(Gender) + Weight_kg
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 10 2946.18
## 2 7 997.41 3 1948.8 4.559 0.04508 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#observation: f-statistic = 4.56, p-value = 0.045 The ANOVA results indicate that the multiple linear regression model (model1) is significantly better at explaining the variance in blood pressure compared to the simple model (model0) that includes only the intercept. The p-value of 0.045 suggests that adding the predictors; age, gender and Weight_kg, significantly improves the model fit.
#Checking for linear models
#1. Linearity we assume each predictor to have a lienear relationship with the outcome variable(blood-pressure)
plot(model1, which = 1)
#observation No clear pattern or curves, we escecpt residual to be
scattered around zero.However this is a small dataset so we may not see
a clear pattern.
#2. Normality of Residuals
#plot
plot(model1, which = 2)
#shapiro-wilk test
shapiro.test(residuals(model1))
##
## Shapiro-Wilk normality test
##
## data: residuals(model1)
## W = 0.96933, p-value = 0.8797
#observation * The Q-Q plot shows that the residuals generally follow a straight line, this is acceptable normality. * The Shapiro-Wilk test yields a p-value of 0.8, this is >0.5, we fail to reject the null hypothesis and conclude that the residuals are normally distributed.
#3. checking for Homoscedasticity
plot(model1, which = 3)
observation: There’s no funnel shape, this means the variance of
residuals is constant across all levels of fitted values, indicating
homoscedasticity.
#4. Checking for independence of Residuals Data were collected from different individuals, so we assume independence of residuals.
#5. Checking for Multicollinearity
#Checking for multicollinearity
library(car)
## Warning: package 'car' was built under R version 4.5.2
## Loading required package: carData
vif(model1)
## Age factor(Gender) Weight_kg
## 1.055340 1.087127 1.073281
#observation: All VIF values are below 5, indicating no significant multicollinearity among the predictors.
# conclusion
Compared to the model0, the multiple linear regression model1 including age, gender, and weight provided a significantly better fit to the data (F(3,7) = 4.56, p = 0.045). Among the predictors, weight was significantly associated with blood pressure, while age and gender were not. Model diagnostics indicated no major violations of linear regression assumptions, though interpretation is limited by the small sample size.
Survival Analysis
#Fit the cox model
#call survival libary
library(survival)
#create survival object
surv_object <- Surv(Time, Event)
#fit cox proportional hazards model
cox_model2 <- coxph(surv_object ~ Age + factor(Gender) + Weight_kg)
#view model summary
summary(cox_model2)
## Call:
## coxph(formula = surv_object ~ Age + factor(Gender) + Weight_kg)
##
## n= 11, number of events= 7
##
## coef exp(coef) se(coef) z Pr(>|z|)
## Age 0.004015 1.004023 0.019940 0.201 0.840
## factor(Gender)M 0.547043 1.728136 0.903612 0.605 0.545
## Weight_kg -0.005610 0.994406 0.026076 -0.215 0.830
##
## exp(coef) exp(-coef) lower .95 upper .95
## Age 1.0040 0.9960 0.9655 1.044
## factor(Gender)M 1.7281 0.5787 0.2941 10.156
## Weight_kg 0.9944 1.0056 0.9449 1.047
##
## Concordance= 0.538 (se = 0.136 )
## Likelihood ratio test= 0.65 on 3 df, p=0.9
## Wald test = 0.59 on 3 df, p=0.9
## Score (logrank) test = 0.61 on 3 df, p=0.9
#observation: The fit assesses how Age, Gender, and Weight affect the hazard (risk) of experiencing the event over time.
Age: exp(coef) = Hazard Ratio (HR), HR= 1.004 at 95% CI (0.980-1.029) for Age, indicating each additional year of age is associated with a 0.4% increase in hazard, adjusting for gender and weight, but not statistically significant (p=0.84); This means Age is not significantly associated with the risk of the event.
Gender (Male vs Female): HR= 1.73 at 95% CI: 0.29 – 10.16,p-value: 0.545, indicating that Males have an estimated 73% higher hazard of experiencing the event compared to females, controlling for age and weight. However, this is not statistically significant (p=0.545). wide CI substantial uncertainty.
Weight (kg): HR = 0.994, 95% CI: 0.94 – 1.05, p-value: 0.830, Each 1 kg increase in weight is associated with a 0.6% reduction in hazard, holding age and gender constant. This means Weight is not a significant predictor of event risk.
Overal lmodel performance
Likelihood ratio test: p = 0.90, Wald test: p = 0.90, Score (log-rank) test: p = 0.90. This means the model1 does not provide a significantly better fit than a model0 with no covariates.
Model discrimination: Concordance (C-index): 0.538, indicating poor ability to distinguish between individuals who experience the event earlier versus later.
Standard error: 0.136: indicates some uncertainty in the C-index estimate.The model has poor discriminatory ability, performing only slightly better than random chance (0.5).
#Checking proportional hazards assumption in a Cox regression model
#check proportional hazards assumption
cox.zph(cox_model2)
## chisq df p
## Age 4.5664 1 0.033
## factor(Gender) 0.0123 1 0.912
## Weight_kg 1.9451 1 0.163
## GLOBAL 5.6990 3 0.127
#observation:
Cox-model assumes the effect of each covariate on the hazard is constant over time.
*GLOBAL:p_value= 0.127 this is >0.05, we fail to reject the null hypothesis and conclude that overall, the proportional hazards assumption holds for the model.Taken together, the model does not show strong evidence of a global violation.
#Visualizing Blood Pressure Incidence on Kenya Map
#load kenya counties shapefile
#install.packages("sf") # run once
library(sf)
## Warning: package 'sf' was built under R version 4.5.2
## Linking to GEOS 3.13.1, GDAL 3.11.4, PROJ 9.7.0; sf_use_s2() is TRUE
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(rnaturalearth)
## Warning: package 'rnaturalearth' was built under R version 4.5.2
#geretate .sh file
Sys.setenv(SHAPE_RESTORE_SHX = "YES")
#read the shapefile # dowload and make sure it is in the same directory as this notebook
kenya_counties <- st_read("County.shp")
## Reading layer `County' from data source
## `C:\Users\DELL\OneDrive\Desktop\msc_year2\Health_informatics\County.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 47 features and 0 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 33.91182 ymin: -4.702271 xmax: 41.90626 ymax: 5.430648
## CRS: NA
names(kenya_counties)
## [1] "geometry"
#create the dataframe to match the counties with blood pressure incidence data
#Sample blood pressure incidence data for selected cities in Kenya
bp_data <- data.frame(
county = c("Nairobi", "Mombasa", "Kisumu", "Nakuru", "Eldoret"),
lon = c(36.8219, 39.6682, 34.7617, 36.0667, 35.2698),
lat = c(-1.2921, -4.0435, -0.0917, -0.3031, 0.5143),
incidence = c(28.4, 25.1, 30.2, 27.0, 26.3)
)
bp_sf <- st_as_sf(
bp_data,
coords = c("lon", "lat"),
crs = 4326
)
# Load Kenya boundary (sf object)
kenya <- ne_countries(country = "Kenya", returnclass = "sf")
# Plot map with BP incidence points
ggplot() +
geom_sf(data = kenya, fill = "gray95", color = "gray40") +
geom_sf(
data = bp_sf,
aes(size = incidence, color = incidence),
alpha = 0.8
) +
scale_color_viridis_c(name = "BP Incidence (%)") +
scale_size_continuous(name = "BP Incidence (%)") +
labs(
title = "Blood Pressure Incidence by City in Kenya",
subtitle = "Selected Counties",
caption = "Source: Your dataset"
) +
theme_minimal()