# Descriptive Statistics Code probset1

#CLEANED DATASET: data_imp.csv

# countrycode is used to map the countries in "data_imp.csv" to regions and
# continents for better interpretation (grouping).
# Install it only when it is missing, so that re-running the script does not
# reinstall the package (or need network access) on every run.
if (!requireNamespace("countrycode", quietly = TRUE)) {
  install.packages("countrycode")
}
library(countrycode)
library(dplyr)
library(tidyverse)

# Read the cleaned + imputed + merged dataset (data_imp.csv) into a data frame
data_imp <- read.csv(file = "data_imp.csv")

# Quick sanity check of the loaded data:
#   str()     shows each column's type, to confirm the variables are numeric
#   summary() gives first-pass descriptive statistics per column
#             (min, quartiles, median, mean and max for numeric columns)
str(data_imp)
summary(data_imp)

# Overall mean, median and sd of the 2 main variables:
#   che_gdp  - current health expenditure as % of GDP
#   lab_prod - labor productivity proxy (rgdpo / emp)
# across() applies the same three statistics to both columns; the .names
# pattern produces mean_che_gdp, median_che_gdp, sd_che_gdp, mean_lab_prod,
# median_lab_prod and sd_lab_prod, in that order.
# na.rm = TRUE ignores missing values, even though the dataset is already clean.
overall_stats <- data_imp %>%
  summarise(
    across(
      c(che_gdp, lab_prod),
      list(
        mean   = function(x) mean(x, na.rm = TRUE),
        median = function(x) median(x, na.rm = TRUE),
        sd     = function(x) sd(x, na.rm = TRUE)
      ),
      .names = "{.fn}_{.col}"
    )
  )

overall_stats

# Add a region and a continent column to every row via mutate().
# countrycode() reads the "country" column as country names and translates
# each one into the package's "region" and "continent" classifications.
# NOTE(review): names that countrycode cannot match presumably come back as NA
# (with a warning) and would then form their own NA group in the grouped
# tables further down - confirm every country was matched.
data_imp <- mutate(
  data_imp,
  region = countrycode(
    sourcevar = country,
    origin = "country.name",
    destination = "region"
  ),
  continent = countrycode(
    sourcevar = country,
    origin = "country.name",
    destination = "continent"
  )
)

# Mean, median and sd of health expenditure (che_gdp) and labor productivity
# (lab_prod) for each level of a grouping column.
# The region and continent tables are built by the same pipeline, so it lives
# in one helper instead of two copies that could drift apart.
#   data      - data frame with che_gdp, lab_prod and the grouping column
#   group_var - unquoted name of the grouping column (e.g. region)
# Returns one row per group: n (number of rows in the group) plus the six
# statistics, ordered by descending mean labor productivity for easier
# interpretation.
# na.rm = TRUE ignores missing values, even though the dataset is already clean.
summarise_by_group <- function(data, group_var) {
  data %>%
    group_by({{ group_var }}) %>%
    summarise(
      n = n(),
      mean_che = mean(che_gdp, na.rm = TRUE),
      median_che = median(che_gdp, na.rm = TRUE),
      sd_che = sd(che_gdp, na.rm = TRUE),

      mean_prod = mean(lab_prod, na.rm = TRUE),
      median_prod = median(lab_prod, na.rm = TRUE),
      sd_prod = sd(lab_prod, na.rm = TRUE)
    ) %>%
    arrange(desc(mean_prod))
}

# Descriptive statistics by REGION
region_stats <- summarise_by_group(data_imp, region)

region_stats

# Descriptive statistics by CONTINENT, as another way of interpretation
continent_stats <- summarise_by_group(data_imp, continent)

continent_stats