Load necessary libraries
library(tidyverse)
library(irr) # For Cohen's kappa
library(knitr)
library(readr)
Load Dataset
# Read the tab-separated lancet_malaria.txt file directly from the HackBio
# public_datasets repository; the first line of the file supplies the headers.
malaria_data <- read.table(
  file = "https://raw.githubusercontent.com/HackBio-Internship/public_datasets/main/R/lancet_malaria.txt",
  sep = "\t",
  header = TRUE
)

# Print the first rows as a quick sanity check of the import.
head(malaria_data)
Rename the columns
# Replace the imported headers with analysis-friendly names. The names are
# assigned purely by position, so this vector must list every column of the
# file in its original order.
new_column_names <- c(
  "Review Found", "Author", "Title", "Year", "Region", "Country", "Location",
  "PCR_N_Tested", "PCR_N_Positive", "PCR_Percent",
  "Microscopy_N_Tested", "Microscopy_N_Positive", "Microscopy_Percent",
  "Historical_Transmission", "Current_Transmission",
  "Setting_20", "Setting_15", "Setting_10", "Setting_5",
  "PCR_Method", "Microscopy_Fields", "Sampling_Season", "Notes"
)

# Fail fast if the file layout changed: assigning fewer names than there are
# columns would silently leave the remaining columns with NA names.
stopifnot(ncol(malaria_data) == length(new_column_names))
colnames(malaria_data) <- new_column_names

# Preview the renamed data.
head(malaria_data)
Compute the percentage of infections missed by microscopy
# Percentage of PCR-positive infections that microscopy did not detect,
# computed per row as (PCR positives - microscopy positives) / PCR positives.
# Rows without any PCR positives have no defined percentage: they are set to
# NA rather than the NaN/Inf that dividing by zero would produce, because Inf
# is not removed by `na.rm = TRUE` and would distort later means and plots.
malaria_data <- malaria_data %>%
  mutate(
    Missed_pct = if_else(
      PCR_N_Positive > 0,
      (PCR_N_Positive - Microscopy_N_Positive) / PCR_N_Positive * 100,
      NA_real_
    )
  )

# Boxplot of the missed-infection percentage, one box per region.
ggplot(malaria_data, aes(x = Region, y = Missed_pct, fill = Region)) +
  geom_boxplot() +
  labs(
    title = "Percentage of Infections Missed by Microscopy",
    x = "Region",
    y = "Missed Infections (%)"
  ) +
  theme_minimal()

Agreement Analysis (Cohen’s Kappa)
# Flag each row as positive (1) or negative (0) for each method, where
# "positive" means the method reported at least one positive sample.
# NOTE(review): these flags are per row of the dataset, not per tested
# individual; the recorded run below reports Kappa = 0, so confirm that this
# row-level dichotomy is the intended agreement measure.
malaria_data <- malaria_data %>%
  mutate(
    Microscopy_binary = as.numeric(Microscopy_N_Positive > 0),
    PCR_binary = as.numeric(PCR_N_Positive > 0)
  )

# Unweighted Cohen's kappa, treating the two flag columns as two raters.
kappa_result <- kappa2(malaria_data[c("Microscopy_binary", "PCR_binary")])
kappa_result
## Cohen's Kappa for 2 Raters (Weights: unweighted)
##
## Subjects = 387
## Raters = 2
## Kappa = 0
##
## z = 0
## p-value = 1
Sensitivity, Specificity, PPV, NPV
# Cells of the 2x2 confusion table, taking the PCR flag as the reference and
# the microscopy flag as the test under evaluation.
TP <- with(malaria_data, sum(Microscopy_binary == 1 & PCR_binary == 1))
FP <- with(malaria_data, sum(Microscopy_binary == 1 & PCR_binary == 0))
TN <- with(malaria_data, sum(Microscopy_binary == 0 & PCR_binary == 0))
FN <- with(malaria_data, sum(Microscopy_binary == 0 & PCR_binary == 1))

# Diagnostic accuracy measures derived from the four cells. A measure whose
# denominator is zero evaluates to NaN.
sensitivity <- TP / (TP + FN)
specificity <- TN / (TN + FP)
ppv <- TP / (TP + FP)
npv <- TN / (TN + FN)

# Collect the four measures into a two-column table and render it.
metrics <- data.frame(
  Metric = c("Sensitivity", "Specificity", "PPV", "NPV"),
  Value = c(sensitivity, specificity, ppv, npv)
)
kable(metrics)
| Metric      |     Value |
|:------------|----------:|
| Sensitivity | 0.8914729 |
| Specificity |       NaN |
| PPV         | 1.0000000 |
| NPV         | 0.0000000 |
Stratified Summaries by Region
# Prevalence ratio per row: microscopy positives divided by PCR positives.
# Rows without any PCR positives have no defined ratio: they are set to NA
# rather than the NaN/Inf that dividing by zero would produce, because Inf is
# not removed by `na.rm = TRUE` and would turn a whole region's mean into Inf.
malaria_data <- malaria_data %>%
  mutate(
    Prevalence_Ratio = if_else(
      PCR_N_Positive > 0,
      Microscopy_N_Positive / PCR_N_Positive,
      NA_real_
    )
  )

# Per-region means of the prevalence ratio and of the missed-infection
# percentage (Missed_pct is computed earlier in the script); missing values
# are ignored and the result is returned ungrouped.
region_summary <- malaria_data %>%
  group_by(Region) %>%
  summarise(
    Avg_Prevalence_Ratio = mean(Prevalence_Ratio, na.rm = TRUE),
    Avg_Missed_pct = mean(Missed_pct, na.rm = TRUE),
    .groups = "drop"
  )
kable(region_summary)
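| Region        | Avg_Prevalence_Ratio | Avg_Missed_pct |
|:--------------|---------------------:|---------------:|
| Asia&Oceania  |            0.5355473 |       46.44527 |
| East Africa   |            0.5222043 |       47.77957 |
| South America |            0.1560791 |       84.39209 |
| West Africa   |            0.5840086 |       41.59914 |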
# Distribution of the microscopy-to-PCR prevalence ratio, one box per region,
# with each box filled by its region.
ggplot(
  data = malaria_data,
  mapping = aes(x = Region, y = Prevalence_Ratio, fill = Region)
) +
  geom_boxplot() +
  labs(
    title = "Prevalence Ratio by Region",
    x = "Region",
    y = "Prevalence Ratio (Microscopy / PCR)"
  ) +
  theme_minimal()
