Import Data

# excel file

# Read the spreadsheet; here() resolves the path relative to the project root.
data <- here("01_module4/data/myData.xlsx") %>%
    read_excel()

# Show the imported tibble.
print(data)
## # A tibble: 193 × 9
##    hdi_rank_2023 country           human_development_in…¹ life_expectancy_at_b…²
##            <dbl> <chr>                              <dbl>                  <dbl>
##  1             1 Iceland                            0.972                   82.7
##  2             2 Norway                             0.97                    83.3
##  3             2 Switzerland                        0.97                    84.0
##  4             4 Denmark                            0.962                   81.9
##  5             5 Germany                            0.959                   81.4
##  6             5 Sweden                             0.959                   83.3
##  7             7 Australia                          0.958                   83.9
##  8             8 Hong Kong, China…                  0.955                   85.5
##  9             8 Netherlands                        0.955                   82.2
## 10            10 Belgium                            0.951                   82.1
## # ℹ 183 more rows
## # ℹ abbreviated names: ¹​human_development_index_hdi, ²​life_expectancy_at_birth
## # ℹ 5 more variables: expected_years_of_schooling <dbl>,
## #   mean_years_of_schooling <dbl>, gross_national_income_gni_per_capita <dbl>,
## #   gni_per_capita_rank_minus_hdi_rank <dbl>, hdi_rank_2022 <chr>

Introduction

Questions

Variation

# Add a `continent` column by converting each country name with countrycode().
data1 <- mutate(
    data,
    continent = countrycode(
        sourcevar = country,
        origin = "country.name",
        destination = "continent"
    )
)

# Bar chart: number of rows (countries) per continent.
ggplot(data1, aes(x = continent)) +
    geom_bar()

Visualizing distributions

# Histogram of HDI across all countries, in bins of width 0.05.
ggplot(data, aes(x = human_development_index_hdi)) +
    geom_histogram(binwidth = 0.05)

# Zoom in on countries with HDI above 0.8, using narrower 0.01-wide bins.
data %>%
    filter(human_development_index_hdi > 0.8) %>%
    ggplot() +
    geom_histogram(
        mapping = aes(x = human_development_index_hdi),
        binwidth = 0.01
    )

# Overlaid frequency polygons of HDI, one coloured line per continent.
ggplot(data1, aes(x = human_development_index_hdi)) +
    geom_freqpoly(aes(color = continent), binwidth = 0.05)

Typical values

# Keep only countries whose HDI is strictly above 0.4 (0.4 itself is dropped),
# then draw a fine-grained histogram with 0.01-wide bins.
hdi_above_0_4 <- filter(data1, human_development_index_hdi > 0.4)
ggplot(hdi_above_0_4, aes(x = human_development_index_hdi)) +
    geom_histogram(binwidth = 0.01)

Unusual values

# Histogram of GNI per capita with the default binning; ggplot2 prints a
# message suggesting an explicit `binwidth`.
ggplot(data1, aes(x = gross_national_income_gni_per_capita)) +
    geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

# Same GNI histogram, with the y-axis view limited to 0-30 via
# coord_cartesian() so that short bars are visible.
ggplot(data1, aes(x = gross_national_income_gni_per_capita)) +
    geom_histogram() +
    coord_cartesian(ylim = c(0, 30))
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

Missing Values

# Replace GNI per capita values below 1000 or above 120000 with NA, then plot
# GNI against HDI. geom_point() drops the NA rows and warns about it.
data1 %>%
    mutate(
        gross_national_income_gni_per_capita = ifelse(
            gross_national_income_gni_per_capita >= 1000 &
                gross_national_income_gni_per_capita <= 120000,
            gross_national_income_gni_per_capita,
            NA
        )
    ) %>%
    ggplot(aes(
        x = human_development_index_hdi,
        y = gross_national_income_gni_per_capita
    )) +
    geom_point()
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).

Covariation

A categorical and continuous variable

# Boxplot of HDI for each continent.
ggplot(data1, aes(continent, human_development_index_hdi)) +
    geom_boxplot()

Two continuous variables

# Scatter plot of GNI per capita against HDI; points are semi-transparent so
# that overlapping points show up darker.
ggplot(
    data1,
    aes(
        x = human_development_index_hdi,
        y = gross_national_income_gni_per_capita
    )
) +
    geom_point(alpha = 1/5)

# geom_hex() relies on the hexbin package.
library(hexbin)

# Hexagonal 2-D binning of GNI per capita against HDI.
ggplot(data1) +
    geom_hex(aes(
        x = human_development_index_hdi,
        y = gross_national_income_gni_per_capita
    ))

# Slice HDI into 0.05-wide bins with cut_width() and draw one GNI boxplot per
# bin. Both axes are continuous, so ggplot2 warns that it is defaulting to
# orientation 'x'.
ggplot(
    data1,
    aes(
        x = human_development_index_hdi,
        y = gross_national_income_gni_per_capita,
        group = cut_width(human_development_index_hdi, width = 0.05)
    )
) +
    geom_boxplot()
## Warning: Orientation is not uniquely specified when both the x and y aesthetics are
## continuous. Picking default orientation 'x'.

Two categorical variables

 # Bucket HDI into the four human-development tiers:
 #   Low < 0.55, Medium [0.55, 0.70), High [0.70, 0.80), Very High >= 0.80.
 # cut() builds right-closed intervals by default, which would place a country
 # sitting exactly on a threshold (e.g. HDI == 0.80) in the lower tier.
 # `right = FALSE` makes each interval left-closed so thresholds belong to the
 # upper tier; `include.lowest = TRUE` keeps an HDI of exactly 1 in the top bin.
 data1 <- data1 %>%
    mutate(
        hdi_category = cut(
            human_development_index_hdi,
            breaks = c(0, 0.55, 0.70, 0.80, 1),
            labels = c("Low", "Medium", "High", "Very High"),
            right = FALSE,
            include.lowest = TRUE
        )
    )
    

# Side-by-side bars: country counts per continent, split by HDI category.
ggplot(data1, aes(x = continent)) +
    geom_bar(aes(fill = hdi_category), position = "dodge")