Load necessary libraries

library(tidyverse)
library(irr)       # For Cohen's kappa
library(knitr)
library(readr)

Load Dataset

# Read the tab-separated malaria dataset; the first row of the file supplies
# the column names
malaria_data <- read.table(
  file = "https://raw.githubusercontent.com/HackBio-Internship/public_datasets/main/R/lancet_malaria.txt",
  sep = "\t",
  header = TRUE
)

# Show the first rows as a quick sanity check of the import
head(malaria_data)

Rename the columns

# Replace the imported headers with analysis-friendly names. The assignment is
# positional, so the order below must match the column order of the file.
new_names <- c(
  "Review Found", "Author", "Title", "Year", "Region", "Country", "Location",
  "PCR_N_Tested", "PCR_N_Positive", "PCR_Percent",
  "Microscopy_N_Tested", "Microscopy_N_Positive", "Microscopy_Percent",
  "Historical_Transmission", "Current_Transmission",
  "Setting_20", "Setting_15", "Setting_10", "Setting_5",
  "PCR_Method", "Microscopy_Fields", "Sampling_Season", "Notes"
)

# Fail loudly on a width mismatch: assigning too few names to a data frame
# would otherwise leave the trailing columns silently named NA.
stopifnot(length(new_names) == ncol(malaria_data))
colnames(malaria_data) <- new_names

# Preview the renamed data
head(malaria_data)

Compute the percentage of infections missed by microscopy

# Share of PCR-positive results that microscopy did not report, in percent.
# Rows with zero PCR positives (or a missing count) get NA rather than the
# NaN / Inf / -Inf that dividing by zero would produce; infinite values are
# not removed by na.rm = TRUE and would distort any later average.
malaria_data <- malaria_data %>%
  mutate(
    Missed_pct = if_else(
      PCR_N_Positive > 0,
      (PCR_N_Positive - Microscopy_N_Positive) / PCR_N_Positive * 100,
      NA_real_
    )
  )

# Boxplot of missed infections by region
ggplot(malaria_data, aes(x = Region, y = Missed_pct, fill = Region)) +
  geom_boxplot() +
  labs(title = "Percentage of Infections Missed by Microscopy",
       y = "Missed Infections (%)", x = "Region") +
  theme_minimal()

Agreement Analysis (Cohen’s Kappa)

# Flag each row per method: 1 when its positive count is above zero, 0
# otherwise (a missing count stays NA)
malaria_data <- malaria_data %>%
  mutate(
    Microscopy_binary = as.numeric(Microscopy_N_Positive > 0),
    PCR_binary = as.numeric(PCR_N_Positive > 0)
  )

# Unweighted Cohen's kappa between the two row-level flags.
# NOTE(review): the flags come from per-row positive counts, so they mark
# "at least one positive in this row" rather than an individual test result.
# Confirm that row-level agreement is the quantity intended here.
kappa_result <- malaria_data %>%
  select(Microscopy_binary, PCR_binary) %>%
  kappa2()
kappa_result
##  Cohen's Kappa for 2 Raters (Weights: unweighted)
## 
##  Subjects = 387 
##    Raters = 2 
##     Kappa = 0 
## 
##         z = 0 
##   p-value = 1

Sensitivity, Specificity, PPV, NPV

# Cells of the 2x2 table, with the microscopy flag as the test result and the
# PCR flag as the reference
TP <- with(malaria_data, sum(Microscopy_binary == 1 & PCR_binary == 1))
FP <- with(malaria_data, sum(Microscopy_binary == 1 & PCR_binary == 0))
TN <- with(malaria_data, sum(Microscopy_binary == 0 & PCR_binary == 0))
FN <- with(malaria_data, sum(Microscopy_binary == 0 & PCR_binary == 1))

# Derived rates; a rate whose denominator is zero evaluates to NaN (0 / 0)
sensitivity <- TP / (TP + FN)
specificity <- TN / (TN + FP)
ppv <- TP / (TP + FP)
npv <- TN / (TN + FN)

# Collect the four rates into a two-column table and render it
metrics <- data.frame(
  Metric = c("Sensitivity", "Specificity", "PPV", "NPV"),
  Value = c(sensitivity, specificity, ppv, npv)
)
kable(metrics)
Metric Value
Sensitivity 0.8914729
Specificity NaN
PPV 1.0000000
NPV 0.0000000

Stratified Summaries by Region

# Ratio of microscopy positives to PCR positives for each row. Rows with zero
# PCR positives (or a missing count) get NA instead of NaN / Inf from a
# division by zero.
malaria_data <- malaria_data %>%
  mutate(
    Prevalence_Ratio = if_else(
      PCR_N_Positive > 0,
      Microscopy_N_Positive / PCR_N_Positive,
      NA_real_
    )
  )

# Regional means of the prevalence ratio and of the missed-infection share.
# Only finite values are averaged: na.rm = TRUE alone would drop NA / NaN but
# keep Inf, so a single zero-denominator row could make a regional mean
# infinite.
region_summary <- malaria_data %>%
  group_by(Region) %>%
  summarise(
    Avg_Prevalence_Ratio = mean(Prevalence_Ratio[is.finite(Prevalence_Ratio)]),
    Avg_Missed_pct = mean(Missed_pct[is.finite(Missed_pct)]),
    .groups = "drop"
  )

kable(region_summary)
Region Avg_Prevalence_Ratio Avg_Missed_pct
Asia&Oceania 0.5355473 46.44527
East Africa 0.5222043 47.77957
South America 0.1560791 84.39209
West Africa 0.5840086 41.59914
# Distribution of the microscopy-to-PCR prevalence ratio within each region,
# one filled box per region
ggplot(
  data = malaria_data,
  mapping = aes(x = Region, y = Prevalence_Ratio, fill = Region)
) +
  geom_boxplot() +
  ggtitle("Prevalence Ratio by Region") +
  xlab("Region") +
  ylab("Prevalence Ratio (Microscopy / PCR)") +
  theme_minimal()