Import Data
# Read the Excel workbook; here() builds the path to the file
data <- read_excel(
  path = here("01_module4/data/myData.xlsx")
)
# Print the imported tibble for a quick look at its columns
data
## # A tibble: 193 × 9
## hdi_rank_2023 country human_development_in…¹ life_expectancy_at_b…²
## <dbl> <chr> <dbl> <dbl>
## 1 1 Iceland 0.972 82.7
## 2 2 Norway 0.97 83.3
## 3 2 Switzerland 0.97 84.0
## 4 4 Denmark 0.962 81.9
## 5 5 Germany 0.959 81.4
## 6 5 Sweden 0.959 83.3
## 7 7 Australia 0.958 83.9
## 8 8 Hong Kong, China… 0.955 85.5
## 9 8 Netherlands 0.955 82.2
## 10 10 Belgium 0.951 82.1
## # ℹ 183 more rows
## # ℹ abbreviated names: ¹​human_development_index_hdi, ²​life_expectancy_at_birth
## # ℹ 5 more variables: expected_years_of_schooling <dbl>,
## # mean_years_of_schooling <dbl>, gross_national_income_gni_per_capita <dbl>,
## # gni_per_capita_rank_minus_hdi_rank <dbl>, hdi_rank_2022 <chr>
Introduction
Questions
Variation
# Derive a `continent` column by looking up each country name with countrycode()
data1 <- mutate(
  data,
  continent = countrycode(
    sourcevar = country,
    origin = "country.name",
    destination = "continent"
  )
)
# Bar chart: number of rows (countries) in each continent
ggplot(data = data1, mapping = aes(x = continent)) +
  geom_bar()

Visualizing distributions
# Histogram of HDI for every country, using bins 0.05 wide
ggplot(data = data) +
  geom_histogram(
    mapping = aes(x = human_development_index_hdi),
    binwidth = 0.05
  )

# Zoom in on countries with HDI above 0.8, with finer 0.01-wide bins
filter(data, human_development_index_hdi > 0.8) %>%
  ggplot() +
  geom_histogram(
    mapping = aes(x = human_development_index_hdi),
    binwidth = 0.01
  )

# Overlaid frequency polygons of HDI, one coloured line per continent
ggplot(data = data1) +
  geom_freqpoly(
    mapping = aes(x = human_development_index_hdi, color = continent),
    binwidth = 0.05
  )

Typical values
# Keep only countries whose HDI exceeds 0.4, then draw a fine-grained
# (0.01-wide bins) histogram of what remains
filter(data1, human_development_index_hdi > 0.4) %>%
  ggplot() +
  geom_histogram(
    mapping = aes(x = human_development_index_hdi),
    binwidth = 0.01
  )

Unusual values
# Histogram of GNI per capita; no binwidth is given, so stat_bin() falls back
# to its default of 30 bins
ggplot(
  data = data1,
  mapping = aes(x = gross_national_income_gni_per_capita)
) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

# Same GNI histogram with the y-axis view limited to 0-30 so that sparsely
# populated bins become visible; coord_cartesian() zooms without dropping data
ggplot(
  data = data1,
  mapping = aes(x = gross_national_income_gni_per_capita)
) +
  geom_histogram() +
  coord_cartesian(ylim = c(0, 30))
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

Missing Values
# Rather than filtering rows out, keep GNI per capita only when it lies within
# 1000-120000 (inclusive) and set it to NA otherwise, then plot GNI against HDI
data1 %>%
  mutate(
    gross_national_income_gni_per_capita = ifelse(
      gross_national_income_gni_per_capita >= 1000 &
        gross_national_income_gni_per_capita <= 120000,
      gross_national_income_gni_per_capita,
      NA
    )
  ) %>%
  ggplot(
    mapping = aes(
      x = human_development_index_hdi,
      y = gross_national_income_gni_per_capita
    )
  ) +
  geom_point()
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).

Covariation
A categorical and continuous variable
# One boxplot of HDI per continent
ggplot(data = data1) +
  geom_boxplot(
    mapping = aes(x = continent, y = human_development_index_hdi)
  )

Two continuous variables
# Scatter plot of GNI per capita against HDI; points are drawn at 20% opacity
# so that overlapping points show up as darker regions
ggplot(
  data = data1,
  mapping = aes(
    x = human_development_index_hdi,
    y = gross_national_income_gni_per_capita
  )
) +
  geom_point(alpha = 1 / 5)

# geom_hex() relies on the hexbin package for its hexagonal binning
library(hexbin)
# Hexagonal heat map of GNI per capita against HDI
ggplot(data = data1) +
  geom_hex(
    mapping = aes(
      x = human_development_index_hdi,
      y = gross_national_income_gni_per_capita
    )
  )

# Slice HDI into 0.05-wide bins and draw one GNI boxplot per bin
ggplot(
  data = data1,
  mapping = aes(
    x = human_development_index_hdi,
    y = gross_national_income_gni_per_capita
  )
) +
  geom_boxplot(
    mapping = aes(group = cut_width(human_development_index_hdi, width = 0.05))
  )
## Warning: Orientation is not uniquely specified when both the x and y aesthetics are
## continuous. Picking default orientation 'x'.

Two categorical variables
# Bin HDI into the four development tiers.
# cut() closes intervals on the right by default, which would put a country
# sitting exactly on a cut-off (0.55, 0.70 or 0.80) into the lower tier.
# The tiers are meant to be left-closed (e.g. "Very High" is HDI >= 0.80), so
# use right = FALSE; include.lowest = TRUE keeps an HDI of exactly 1 inside
# the top tier instead of turning it into NA.
data1 <- data1 %>%
  mutate(
    hdi_category = cut(
      human_development_index_hdi,
      breaks = c(0, 0.55, 0.70, 0.80, 1),
      labels = c("Low", "Medium", "High", "Very High"),
      right = FALSE,
      include.lowest = TRUE
    )
  )
# Side-by-side bars: number of countries in each HDI tier within each continent
ggplot(data = data1) +
  geom_bar(
    mapping = aes(x = continent, fill = hdi_category),
    position = "dodge"
  )
