1. Chung

1.1. Packages

library("readxl")
library("knitr") # tạo bảng
library("tidyverse") # dyplyr, tidyr, ggplot2,...
library("grid")
library("gridExtra") # vẽ nhiều đồ thị cùng lúc
library("kableExtra") # tạo bảng
library("naniar") # cho dữ liệu missing
library("mice") # điền các giá trị missing
library("rmarkdown")

# bipartite graph
library("chorddiag")
library("reshape2")

# world map
library("maps")


# bar chart race
library("gganimate")
library("ggflags")
library("countrycode")

# K medoids
library("cluster")
library("factoextra")

# Panel regression
library("plm")
library("tseries")
library("lmtest") # heteroskedasticity test

set.seed(123) # set starting random number

1.2. Nhập dữ liệu

dat <- read.csv("./athlete_events.csv")
region <- read.csv("./noc_regions.csv")
continent <- read.csv("./region.csv")

paged_table(head(dat, n = 15))

Các trường dữ liệu trong tập data trên là:

ID - ID unique của từng vận động viên
Name - Tên của vận động viên
Sex - Giới tính. M: Nam, F: Nữ
Age - Tuổi
Height - Chiều cao (Đơn vị cm)
Weight - Cân nặng (Đơn vị Kg)
Team - Tên đội tuyển (Tên quốc gia)
NOC - Đoàn Olympics tham gia (Viết tắt 3 chữ cái của quốc gia)
Games - Năm diễn ra và Mùa (Thế vận đội mùa đông và mùa hè)
Year - Năm diễn ra
Season - Summer: Mùa hè, Winter: Mùa đông
City - Thành phố đăng cai
Sport - Môn thi đấu
Event - Mục thi đấu cụ thể (nằm bên trong môn thi đấu)
Medal - Huy chương. Gold: vàng, Silver: bạc, Bronze: đồng

1.3. Kiểm tra dữ liệu khuyết thiếu

gg_miss_var(dat)

## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.

2. Tổng quan về Olympics

2.1. Ở Olympics người ta thi gì?

# Số lượng mô thi đấu ở từng thế vận hội
dat %>% 
  # Lấy các giá trị unique (không trùng lặp) của 3 biến Year, Season, Sport
  distinct(Year, Season, Sport) %>% 
  # Tính số lượng môn thi đấu `Sport` theo năm `Year` và mùa thi đấu `Season`
  group_by(Year, Season) %>% 
  summarise(count = n()) %>% 
  # Vẽ đồ thị
  ggplot() + 
    aes(x = Year, y = count, fill = Season) + 
    geom_col() + # Đồ thị cột
    facet_grid(Season ~ .) + # Tách đồ thị làm 2 dòng theo `Season`
    theme_bw() + # Đổi theme có nền trắng
    
    # Note các giai đoạn của 2 lần thế chiến (World war I và World war II)
    annotate("rect", xmin = 1914, xmax = 1918, ymin = 0, ymax = 35, alpha = 0.2) +  # Vẽ chữ nhật màu xám cho WWI
    annotate("rect", xmin = 1939, xmax = 1945, ymin = 0, ymax = 35, alpha = 0.2) +  # Vẽ chữ nhật màu xám cho WWII
    
    # Thêm text "WWI" và "WWII"
    annotate("text", x = 1942, y = 20, label = "WWII") + 
    annotate("text", x = 1916, y = 20, label = "WWI") + 
    
    # Custom lại đồ thị cho đẹp
      # chỉnh các khoảng cách các điểm ở trục x sao cho cách nhau 8 năm (1896, )
      scale_x_continuous(breaks = seq(1896, 2016, 8)) + 
      
      # đổi màu cách cột sang dạng màu pastel (màu nhạt nhẹ)
      scale_fill_brewer(palette = "Pastel1") +
      
      # đổi cách label của trục x, y và title
      labs(
        x = "Năm",
        y = "Số lượng môn thi đấu",
        title = "Số lượng môn thi đấu ở từng thế vận hội"
      ) +
    # Format lại các trục
    theme(
      # cho các năm nằm gốc 45 độ
      axis.text.x = element_text(angle = 45, vjust = 0.5),
      
      # cho label trục y nằm cách xa trục
      axis.title.y = element_text(margin = margin(r = 15)),
      
      # cho label trục x nằm cách xa trục
      axis.title.x = element_text(margin = margin(t = 15)), 
      
      # cho title của bảng to lên, in đậm, và nằm ra chính giữa
      plot.title = element_text(face = "bold", hjust = 0.5, margin = margin(b = 20), size = 15) 
    )

2.2. Số năm môn thi đâu được xuất hiện (trong bộ dữ liệu), và số nội dung trong một môn thi đấu

2.2.1. Đồ thị

count_sport <- dat %>% 
  distinct(Year, Sport) %>% 
  group_by(Sport) %>% 
  summarise(count = n()) %>% 
  arrange(desc(count)) %>% 
  mutate(top5 = case_when(
    row_number() <= 5 ~ TRUE,
    TRUE ~ FALSE
  )) %>% 
  slice(1:15) %>% 
  ggplot() +
    aes(x = reorder(Sport, count), y = count, fill = top5) +
    geom_col(show.legend = F) + 
    scale_fill_manual(values = c("gray", "red")) +
    coord_flip() +
    labs(
      x = "Môn thi đấu",
      y = "Số năm xuất hiện",
      title = "Số năm môn thể thao được thi đấu"
    ) + 
  theme_bw(base_size = 9) + 
  theme(
    axis.title.x = element_text(size = 12, margin = margin(t = 10)), 
    axis.title.y = element_text(size = 12, margin = margin(r = 10)),
    plot.title = element_text(margin = margin(b = 10), face = "bold", hjust = 0.5, size = 11)
  ) 
count_sport

event_plot <- dat %>% 
  filter(Year == 2016) %>% 
  distinct(Sport, Event) %>% 
  group_by(Sport) %>% 
  summarise(count = n()) %>% 
  arrange(desc(count)) %>% 
  mutate(top5 = case_when(
    row_number() <= 5 ~ TRUE,
    TRUE ~ FALSE
  )) %>% 
  slice(1:15) %>% 
  ggplot() +
    aes(x = reorder(Sport, count), y = count, fill = top5) + 
    geom_col(show.legend = F) + 
    coord_flip() +
    scale_fill_manual(values = c("grey", "red")) + 
    labs(
      title = "Số nội dung thi đấu trong từng môn", x = "Môn thể thao",
      y = "Số nội dung thi đấu của môn thể thao"
    ) +
    theme_bw(base_size = 9) + 
    theme(
      axis.title.x = element_text(size = 12, margin = margin(t = 10)), 
      axis.title.y = element_text(size = 12, margin = margin(r = 10)),
      plot.title = element_text(margin = margin(b = 10), face = "bold", hjust = 0.5, size = 11)
    ) 
event_plot

grid.arrange(count_sport, event_plot, ncol = 2)

rm(list = c("count_sport", "event_plot"))

2.2.3. Tương quan số lần môn thể thao xuất hiện và số nội dung thi đấu

count_year <- dat %>% 
  distinct(Year, Sport) %>% 
  group_by(Sport) %>% 
  summarise(count_year = n())

count_event <- dat %>% 
  filter(Year == 2016) %>%
  distinct(Sport, Event) %>% 
  group_by(Sport) %>% 
  summarise(count_event = n())

count_year %>% 
  left_join(count_event, by = "Sport") %>% 
  drop_na() %>% 
  ggplot() + 
    aes(x = (count_year), y = (count_event)) + 
    geom_point(alpha = 0.5) + 
    geom_smooth(method = "lm") + 
    theme_bw() + 
    labs(
      title = "Tương quan số lần môn thể thao xuất hiện và số nội dung thi đấu", 
      x = "Số lần môn thể thao xuất hiện",
      y = "Số nội dung thi đấu của môn thể thao"
    ) + 
    theme(
      axis.title.x = element_text(size = 10, margin = margin(t = 10)), 
      axis.title.y = element_text(size = 10, margin = margin(r = 10)),
      plot.title = element_text(margin = margin(b = 10), face = "bold", hjust = 0.5, size = 12)
    )

3. Thế vận hội mùa đông

3.1. Các quốc gia tham gia như thế nào tại thế vận hội mùa đông

# Những nước có nhiều vận động viên nhất
mean_athl <- dat %>% 
  select(NOC, Year, ID) %>% 
  group_by(NOC, Year) %>% 
  summarise(count = n()) %>% 
  group_by(NOC) %>% 
  summarise(mean_athl = sum(count)/n(), count_year = n()) %>% 
  filter(count_year > 10) %>% 
  left_join(region, by = "NOC")

mean_athl %>% 
  arrange(desc(mean_athl)) %>% 
  slice(1:9) %>% 
  ggplot() +
    aes(x = reorder(region, mean_athl), y = mean_athl) +
    geom_col(fill = "#124B92") +
    coord_flip() +
    theme_bw() +
    labs(
      title = "Những nước có nhiều vận động viên nhất?", y = "Số vận động viên trung bình mỗi năm",
      x = "Quốc gia"
    ) +
    theme(
      axis.title.x = element_text(size = 10, margin = margin(t = 10)),
      axis.title.y = element_text(size = 10, margin = margin(r = 10)),
      plot.title = element_text(margin = margin(b = 10), face = "bold", hjust = 0.5, size = 12)
    )

# Số vận động viên trung bình các năm theo các châu lục
continent %>% 
  select(alpha.3, region) %>% 
  rename(
    NOC = alpha.3,
    continent = region
  ) %>% 
  right_join(mean_athl, by = "NOC") %>% 
  arrange(desc(mean_athl)) %>% 
  slice(1:13) %>% 
  filter(is.na(continent) == F) %>% 
  ggplot() +
    aes(x = reorder(region, mean_athl), y = mean_athl,fill = continent) +
    facet_wrap(. ~ continent, scales = "free") +
    geom_col(show.legend = F) +
    coord_flip() +
    labs(
      title = "Số vận động viên trung bình mỗi năm theo các châu lục", y = "Số vận động viên trung bình mỗi năm",
      x = "Quốc gia"
    ) +
    theme_bw() +
    theme(
      axis.title.x = element_text(size = 10, margin = margin(t = 10)),
      axis.title.y = element_text(size = 10, margin = margin(r = 10)),
      plot.title = element_text(margin = margin(b = 10), face = "bold", hjust = 0.5, size = 12)
    )

3.2. Bình đẳng giới tại thế vận hội mùa hè

# của thế vận hội mùa hè
dat %>% 
  filter(Season == "Summer") %>% 
  group_by(Year, Sex) %>%
  summarise(count = n()) %>% 
  spread(key = Sex, value = count) %>% 
  mutate(sum = `F` + `M`) %>% 
  mutate(Year = as.factor(Year)) %>% 
  ggplot() +
    geom_col(aes(x = reorder(Year, Year), y = sum), col = "#ffffff") +  
    geom_col(aes(x = Year, y = sum/2), col = "#ffffff") +  
    geom_col(aes(x = Year, y = `F`), fill = "dark orange", width = 0.5) +
    geom_text(aes(x = Year, y = `F` + 500, label = paste(round(`F` / sum * 100, 0), "%")), size = 3, color = "dark orange") + 
    scale_x_discrete(limits=rev) +   
    coord_flip() + 
    theme_bw(base_size = 10) +
    theme(
      axis.title.x = element_text(size = 10, margin = margin(t = 10)),
      axis.title.y = element_text(size = 10, margin = margin(r = 10)),
      plot.title = element_text(margin = margin(b = 10), face = "bold", hjust = 0.5, size = 12)
    ) + 
    labs(
      x = "Năm",
      y= "Tỷ lệ vận động viên nữ trên tổng số vận động viên",
      title = "Tỷ lệ vận động viên nữ theo thời gian"
    )

3.3. Thành tích của các quốc gia tại thế vận hội mùa hè

Để phản ánh thành tích các quốc gia ở thế vận hội một cách chính xác và cập nhật hơn, ta sẽ xét Số huy chương mà các quốc gia giành được trong 10 kỳ thế vận hội gần nhất (tính từ năm 2000 khi Liên xô đã tan rã và Trung quốc tham gia thường xuyên ở Olympics). Ta cũng chỉ xét thế vận hội mùa đông.

dat %>% 
  filter(Season == "Summer", Year %in% 1950:1960) %>%
  select(NOC, Medal) %>% 
  drop_na() %>% 
  count(NOC, Medal) %>% 
  spread(key = Medal, value = n) %>% 
  mutate(sum_medal = Bronze + Gold + Silver) %>% 
  arrange(desc(sum_medal)) %>% 
  slice(1:12) %>% 
  select(-sum_medal) %>% 
  gather(Gold, Silver, Bronze, key = "Medal", value = "Count") %>% 
  left_join(region[, 1:2], by = "NOC") %>% 
  select(-NOC) %>% 
  acast(region ~ Medal, value.var = "Count") %>% 
  .[order(rowSums(.), decreasing = F),] %>% 
  chorddiag(type = "bipartite", groupColors = c("brown", "yellow", "gray"), showTicks = F)

dat %>% 
  filter(Season == "Summer", Year %in% 2000:2016) %>%
  select(NOC, Medal) %>% 
  drop_na() %>% 
  count(NOC, Medal) %>% 
  spread(key = Medal, value = n) %>% 
  mutate(sum_medal = Bronze + Gold + Silver) %>% 
  arrange(desc(sum_medal)) %>% 
  slice(1:12) %>% 
  select(-sum_medal) %>% 
  gather(Gold, Silver, Bronze, key = "Medal", value = "Count") %>% 
  left_join(region[, 1:2], by = "NOC") %>% 
  select(-NOC) %>% 
  acast(region ~ Medal, value.var = "Count") %>% 
  .[order(rowSums(.), decreasing = F),] %>% 
  chorddiag(type = "bipartite", groupColors = c("brown", "yellow", "gray"), showTicks = F)

dat %>% 
  filter(Year > 1990, Season == "Summer") %>% 
  select(NOC, Medal) %>% 
  drop_na() %>% 
  count(NOC) %>% 
  left_join(region[, 1:2], by = "NOC") %>% 
  drop_na() %>% 
  right_join(map_data("world"), by = c("region")) %>% 
  ggplot(aes(long, lat, group = group))+
    geom_polygon(aes(fill = n), color = "black", size = 0.2) +
    scale_fill_gradient(low = "white", high="yellow",   guide = "colourbar") + 
    theme_bw() + 
    labs(
      x = "",
      y = "",
      title = "Tổng số huy chương của các nước (từ 1990)"
    ) +
    theme(
      axis.text = element_blank(),
      axis.ticks = element_blank(),
      plot.title = element_text(margin = margin(b = 10), face = "bold", hjust = 0.5, size = 12)
    ) + 
  guides(fill = guide_colorbar(title = "Số lượng huy chương"))

# Đồ thị động (Bar chart race) tổng số huy chương đạt được qua các năm
p_ani <- dat %>% 
  drop_na(Medal) %>% 
  count(NOC, Year) %>% 
  arrange(Year, desc(n)) %>% 
  group_by(Year) %>% 
  mutate(NOC_2 = tolower(countrycode(NOC, "iso3c", "iso2c"))) %>% 
  drop_na() %>% 
  mutate(rank = rank(order(n, decreasing = T))) %>% 
  arrange(Year, rank) %>% 
  filter(Year %in% 1995:2016) %>% 
  ggplot(aes(x = rank, y= n)) + 
    geom_col(fill = "azure3", color = "black", width = 0.7) +
    geom_flag(aes(x = rank, y = - 35, country = NOC_2), size = 9, na.rm = T, show.legend = F) + 
    geom_text(aes(x = rank, y = -75, label = NOC), size = 3) +
    coord_flip(expand = F, clip = "off") +
    scale_x_reverse(limits = c(10.5, 0)) + 
    labs(
      x = "",
      title="{closest_state}",
      y = "Số lượng huy chương"
    ) +
    theme(
      panel.background = element_blank(),
      axis.text.y = element_blank(),
      axis.ticks.y = element_blank(),
      axis.title.x = element_text(margin = margin(t = 20), face = "bold")
    ) + 
    transition_states(Year, transition_length = 4, state_length = 1)

animate(p_ani, fps = 30, duration = 25, width = 700, height = 470, res = 120)

4. Phân cụm

dat.cluster <- dat %>% 
  filter(Season == "Summer", Year == 2016)

4.1. Kiểm tra outliers

height_boxplot <- dat.cluster %>% 
  ggplot() +
  geom_boxplot(aes(Height)) +
  coord_flip() +
  xlab("Height") +
  theme_bw(base_size = 9)
weight_boxplot <- dat.cluster %>% 
  ggplot() + 
  geom_boxplot(aes(Weight)) + 
  coord_flip() + 
  xlab("Weight") + 
  theme_bw(base_size = 9)
grid.arrange(height_boxplot, weight_boxplot, ncol = 2, top = textGrob("Boxlot chiều cao và cân nặng vận động viên năm 2016",gp=gpar(fontsize=12,font=2)))

## Warning: Removed 176 rows containing non-finite values (stat_boxplot).

## Warning: Removed 223 rows containing non-finite values (stat_boxplot).

rm(list = c("height_boxplot", "weight_boxplot"))

weight_q3 <- as.numeric(quantile(dat.cluster$Weight, 0.75, na.rm = T))
weight_q1 <- as.numeric(quantile(dat.cluster$Weight, 0.25, na.rm = T))
weight_iqr <- IQR(dat.cluster$Weight, na.rm = T)

height_q3 <- as.numeric(quantile(dat.cluster$Height, 0.75, na.rm = T))
height_q1 <- as.numeric(quantile(dat.cluster$Height, 0.25, na.rm = T))
height_iqr <- IQR(dat.cluster$Height, na.rm = T)

dat.cluster[
  !is.na(dat.cluster$Weight) & dat.cluster$Weight >  weight_q3 + 1.5*weight_iqr,
  "Weight"
] <- quantile(dat.cluster$Weight, 0.95, na.rm = T)

dat.cluster[
  !is.na(dat.cluster$Weight) & dat.cluster$Weight < weight_q1 - 1.5*weight_iqr, 
  "Weight"
] <- quantile(dat.cluster$Weight, 0.05, na.rm = T)

dat.cluster[
  !is.na(dat.cluster$Height) & dat.cluster$Height > height_q3 + 1.5*height_iqr, 
  "Height"
] <- quantile(dat.cluster$Height, 0.95, na.rm = T)

dat.cluster[
  !is.na(dat.cluster$Height) & dat.cluster$Height < height_q1 - 1.5*height_iqr, 
  "Height"
] <- quantile(dat.cluster$Height, 0.05, na.rm = T)

rm(list = c("weight_q3", "weight_q1", "weight_iqr", "height_q3", "height_q1", "height_iqr"))

height_boxplot <- dat.cluster %>% 
  ggplot() +
  geom_boxplot(aes(Height)) +
  coord_flip() +
  xlab("Height") +
  theme_bw(base_size = 9)
weight_boxplot <- dat.cluster %>% 
  ggplot() + 
  geom_boxplot(aes(Weight)) + 
  coord_flip() + 
  xlab("Weight") + 
  theme_bw(base_size = 9)
grid.arrange(height_boxplot, weight_boxplot, ncol = 2, top = textGrob("Boxlot chiều cao và cân nặng vận động viên năm 2016",gp=gpar(fontsize=12,font=2)))

## Warning: Removed 176 rows containing non-finite values (stat_boxplot).

## Warning: Removed 223 rows containing non-finite values (stat_boxplot).

rm(list = c("height_boxplot", "weight_boxplot"))

4.2. Xử lý missing

dat.cluster %>% 
  select(-Medal) %>% 
  gg_miss_var()

## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.

dat.cluster %>% 
  select(Weight, Height) %>% 
  drop_na() %>% 
  ggplot(aes(x = Height, y = Weight)) +
    geom_point(col = "#777777") +
    geom_smooth(method = "lm") +
    theme_bw() +
    labs(title = "Mối tương quan giữa chiều cao và cân nặng") +
    theme_bw() + 
    theme(
      axis.title.x = element_text(size = 10, margin = margin(t = 10)), 
      axis.title.y = element_text(size = 10, margin = margin(r = 10)),
      plot.title = element_text(margin = margin(b = 15), face = "bold", hjust = 0.5, size = 11)
    )

## `geom_smooth()` using formula 'y ~ x'

dat.cluster[, "Height"] <- dat.cluster %>% 
  select(-Medal, -Weight) %>% 
  mice(method = "cart", m = 2) %>% 
  complete() %>% 
  .$Height

## 
##  iter imp variable
##   1   1  Height
##   1   2  Height
##   2   1  Height
##   2   2  Height
##   3   1  Height
##   3   2  Height
##   4   1  Height
##   4   2  Height
##   5   1  Height
##   5   2  Height

## Warning: Number of logged events: 10

dat.cluster[, "Weight"] <- dat.cluster %>% 
  select(Weight, Height) %>% 
  mice(method = "norm.nob") %>% 
  complete() %>% 
  .$Weight

## 
##  iter imp variable
##   1   1  Weight
##   1   2  Weight
##   1   3  Weight
##   1   4  Weight
##   1   5  Weight
##   2   1  Weight
##   2   2  Weight
##   2   3  Weight
##   2   4  Weight
##   2   5  Weight
##   3   1  Weight
##   3   2  Weight
##   3   3  Weight
##   3   4  Weight
##   3   5  Weight
##   4   1  Weight
##   4   2  Weight
##   4   3  Weight
##   4   4  Weight
##   4   5  Weight
##   5   1  Weight
##   5   2  Weight
##   5   3  Weight
##   5   4  Weight
##   5   5  Weight

Kiểm tra lại mối tương quan giữa Weight và Height

dat.cluster %>% 
  select(Weight, Height) %>% 
  ggplot(aes(x = Height, y = Weight)) +
    geom_point(col = "#777777") +
    geom_smooth(method = "lm") +
    theme_bw() +
    labs(title = "Mối tương quan giữa chiều cao và cân nặng") +
    theme_bw() + 
    theme(
      axis.title.x = element_text(size = 10, margin = margin(t = 10)), 
      axis.title.y = element_text(size = 10, margin = margin(r = 10)),
      plot.title = element_text(margin = margin(b = 15), face = "bold", hjust = 0.5, size = 11)
    )

## `geom_smooth()` using formula 'y ~ x'

Vì dữ liệu có quá nhiều outliers nên sử dụng phương pháp K-mediods để khắc phục

4.2. Số lượng cụm tối ưu

4.2.1. Silhouette

dat.cluster %>%
  select(Height, Weight) %>% 
  fviz_nbclust(clara, method = "silhouette") +
  theme(
      axis.title.x = element_text(size = 10, margin = margin(t = 10)),
      axis.title.y = element_text(size = 10, margin = margin(r = 10)),
      # axis.text.y = element_blank(),
      plot.title = element_text(margin = margin(b = 10), face = "bold", hjust = 0.5, size = 12)
  )

4.2.2. Elbow - WSS

dat.cluster %>%
  select(Height, Weight) %>% 
  sample_frac(0.1) %>% 
  fviz_nbclust(clara, method = "wss") + 
  geom_vline(xintercept = 3, linetype = "dashed", col = "dark gray") +
  theme(
      axis.title.x = element_text(size = 10, margin = margin(t = 10)),
      axis.title.y = element_text(size = 10, margin = margin(r = 10)),
      axis.text.y = element_blank(),
      plot.title = element_text(margin = margin(b = 10), face = "bold", hjust = 0.5, size = 12)
  )

=> Chọn 3 cụm để phân tích dựa theo phương pháp Gap-statistics

4.3. CLARA

res.clus <- dat.cluster %>% 
  select(Height, Weight) %>% 
  clara(k = 3, stand = T, sampsize = 2000)

row.names(res.clus$medoids) <- c("Cluster 1", "Cluster 2", "Cluster 3")
res.clus$medoids[, 2] <- round(res.clus$medoids[, 2], 3)
res.clus$medoids %>% 
  kbl() %>% 
  kable_styling(bootstrap_options = "striped")

	Height	Weight
Cluster 1	189	88
Cluster 2	165	57
Cluster 3	178	71

dat.cluster <- dat.cluster %>% mutate(
  cluster = factor(res.clus$clustering, levels = c(1, 2, 3))
)

dat.cluster %>% 
  ggplot(aes(Height, Weight, col = cluster)) +
  geom_point(show.legend = F) +
  guides(col = guide_legend(title = "Cụm")) + 
  theme_bw() +
  labs(
    title = "Đồ thị phân cụm"
  ) + 
  theme(
      axis.title.x = element_text(size = 10, margin = margin(t = 10)),
      axis.title.y = element_text(size = 10, margin = margin(r = 10)),
      plot.title = element_text(margin = margin(b = 10), face = "bold", hjust = 0.5, size = 12)
  )

dat.cluster %>% 
  group_by(cluster) %>% 
  mutate(have_medal = case_when(
    is.na(Medal) ~ 0,
    TRUE ~ 1
  )) %>% 
  mutate(mean_medal = sum(have_medal)/n()) %>% 
  ggplot(aes(Height, Weight)) +
  geom_point(aes(col = mean_medal)) +
  scale_colour_continuous(
    breaks = c(0.175, 0.13, 0.153),
    labels = c("Cụm 3", "Cụm 1", "Cụm 2"),
    low = "dark gray", high="yellow",   guide = "colourbar"
    
  ) +
  theme_bw() +
  labs(
    title = "Đồ thị phân cụm (màu theo số huy chương)"
  ) +
  guides(col = guide_colorbar(title = "Lượng huy chương trung bình")) +
  theme(
      axis.title.x = element_text(size = 10, margin = margin(t = 10)),
      axis.title.y = element_text(size = 10, margin = margin(r = 10)),
      plot.title = element_text(margin = margin(b = 10), face = "bold", hjust = 0.5, size = 12)
  )

fviz_silhouette(res.clus) +
  guides(fill = F, col = F) +
  theme_classic() +
  theme(
      axis.title.x = element_text(size = 10, margin = margin(t = 10)),
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank(),
      axis.title.y = element_text(size = 10, margin = margin(r = 10)),
      plot.title = element_text(margin = margin(b = 10), face = "bold", hjust = 0.5, size = 12)
  )

##   cluster size ave.sil.width
## 1       1  507          0.40
## 2       2  703          0.48
## 3       3  790          0.43

5. Hồi quy dữ liệu bảng

5.1. Lấy dữ liệu

5.1.1. Lọc các nước có đủ năm

dat.regression <- dat %>%
  drop_na() %>% 
  group_by(NOC, Year) %>% 
  summarise(
    N_Medal = n(),
    H_Mean = mean(Height, na.rm = T),
    W_Mean = mean(Weight, na.rm = T),
    Age_Mean = mean(Age, na.rm = T)
  ) %>% 
  group_by(NOC) %>%
  filter(
    length(intersect(Year, seq(1992, 2016, 4))) == 7
  ) %>%
  filter(
    Year %in% seq(1992, 2016, 4)
  )

5.1.2. Lấy vả merge dữ liệu HDI

dat.regression <- read.csv("hdi.csv") %>% 
  rename(
    HDI = 4,
    Region = Entity,
    NOC = Code
  ) %>% 
  right_join(dat.regression, by = c("NOC", "Year")) %>% 
  select(-Region) %>% 
  mice(method = "cart") %>% 
  complete()

## 
##  iter imp variable
##   1   1  HDI
##   1   2  HDI
##   1   3  HDI
##   1   4  HDI
##   1   5  HDI
##   2   1  HDI
##   2   2  HDI
##   2   3  HDI
##   2   4  HDI
##   2   5  HDI
##   3   1  HDI
##   3   2  HDI
##   3   3  HDI
##   3   4  HDI
##   3   5  HDI
##   4   1  HDI
##   4   2  HDI
##   4   3  HDI
##   4   4  HDI
##   4   5  HDI
##   5   1  HDI
##   5   2  HDI
##   5   3  HDI
##   5   4  HDI
##   5   5  HDI

5.2. Xử lý outliers

hdi_box <- dat.regression %>% 
  ggplot() +
    geom_boxplot(aes(HDI)) + 
    theme_bw()

H_box <- dat.regression %>% 
  ggplot() +
    geom_boxplot(aes(H_Mean)) + 
    theme_bw()

W_box <- dat.regression %>% 
  ggplot() +
    geom_boxplot(aes(W_Mean)) + 
    theme_bw()

N_box <- dat.regression %>% 
  ggplot() +
    geom_boxplot(aes(N_Medal)) + 
    theme_bw()

Age_box <- dat.regression %>% 
  ggplot() +
    geom_boxplot(aes(Age_Mean)) + 
    theme_bw()

grid.arrange(hdi_box, H_box, W_box, Age_box, N_box)

rm(list = c("hdi_box", "gdp_box", "H_box", "W_box", "N_box", "Age_box"))

dat.regression_out <- dat.regression %>% 
  mutate(
    HDI = case_when(
      HDI > quantile(dat.regression$HDI, .75) + 1.5*IQR(dat.regression$HDI) ~ 
        quantile(dat.regression$HDI, .95)[[1]],
      HDI < quantile(dat.regression$HDI, .25) - 1.5*IQR(dat.regression$HDI) ~ 
        quantile(dat.regression$HDI, .05)[[1]],
      TRUE ~ HDI,
    ),
    H_Mean = case_when(
      H_Mean > quantile(dat.regression$H_Mean, .75) + 1.5*IQR(dat.regression$H_Mean) ~ 
        quantile(dat.regression$H_Mean, .95)[[1]],
      H_Mean < quantile(dat.regression$H_Mean, .25) - 1.5*IQR(dat.regression$H_Mean) ~ 
        quantile(dat.regression$H_Mean, .05)[[1]],
      TRUE ~ H_Mean,
    ),
    W_Mean = case_when(
      W_Mean > quantile(dat.regression$W_Mean, .75) + 1.5*IQR(dat.regression$W_Mean) ~ 
        quantile(dat.regression$W_Mean, .95)[[1]],
      W_Mean < quantile(dat.regression$W_Mean, .25) - 1.5*IQR(dat.regression$W_Mean) ~ 
        quantile(dat.regression$W_Mean, .05)[[1]],
      TRUE ~ W_Mean,
    ),
    Age_Mean = case_when(
      Age_Mean > quantile(dat.regression$Age_Mean, .75) + 1.5*IQR(dat.regression$Age_Mean) ~ 
        quantile(dat.regression$Age_Mean, .95)[[1]],
      Age_Mean < quantile(dat.regression$Age_Mean, .25) - 1.5*IQR(dat.regression$Age_Mean) ~ 
        quantile(dat.regression$Age_Mean, .05)[[1]],
      TRUE ~ Age_Mean,
    ),
    N_Medal = as.numeric(N_Medal)
  ) %>% 
  mutate(
    N_Medal = case_when(
      N_Medal > quantile(dat.regression$N_Medal, .75) + 1.5*IQR(dat.regression$N_Medal) ~ 
        quantile(dat.regression$W_Mean, .95)[[1]],
      N_Medal < quantile(dat.regression$N_Medal, .25) - 1.5*IQR(dat.regression$N_Medal) ~ 
        quantile(dat.regression$N_Medal, .05)[[1]],
      TRUE ~ N_Medal,
    )
  )

hdi_box <- dat.regression_out %>% 
  ggplot() +
    geom_boxplot(aes(HDI)) + 
    theme_bw()

H_box <- dat.regression_out %>% 
  ggplot() +
    geom_boxplot(aes(H_Mean)) + 
    theme_bw()

W_box <- dat.regression_out %>% 
  ggplot() +
    geom_boxplot(aes(W_Mean)) + 
    theme_bw()

N_box <- dat.regression_out %>% 
  ggplot() +
    geom_boxplot(aes(N_Medal)) + 
    theme_bw()

Age_box <- dat.regression_out %>% 
  ggplot() +
    geom_boxplot(aes(Age_Mean)) + 
    theme_bw()

grid.arrange(hdi_box, H_box, W_box, N_box, Age_box)

rm(list = c("hdi_box", "gdp_box", "H_box", "W_box", "N_box", "Age_box"))

5.3. Mô hình hồi quy

Biến phụ thuộc: N_Medal

Biến độc lập:

Mô hình 1: H_Mean, Age_Mean, HDI
Mô hình 2: W_Mean, Age_Mean, HDI

dat.regression_out %>% 
  select(H_Mean, W_Mean, Age_Mean, HDI) %>% 
  cor() %>% 
  as.data.frame() %>% 
  round(3) %>% 
  mutate_all(~ cell_spec(.x, color = ifelse(.x >= 0.6 & .x != 1, "red"," black"))) %>% 
  kable(escape = F, align = "r", caption = "Ma trận tương quan") %>% 
  kable_styling(bootstrap_options = "hover")

Ma trận tương quan
	H_Mean	W_Mean	Age_Mean	HDI
H_Mean	1	0.852	0.432	0.445
W_Mean	0.852	1	0.379	0.493
Age_Mean	0.432	0.379	1	0.509
HDI	0.445	0.493	0.509	1

ols_m_h <- lm(N_Medal ~ H_Mean + Age_Mean + HDI, data = dat.regression_out)
fe_m_h <- plm(N_Medal ~ H_Mean + Age_Mean + HDI, data = dat.regression_out, index = c("NOC", "Year"), model = "within")
re_m_h <- plm(N_Medal ~ H_Mean + Age_Mean + HDI, data = dat.regression_out, index = c("NOC", "Year"), model = "random")
fgls_m_h <- pggls(N_Medal ~ H_Mean + Age_Mean + HDI, data = dat.regression_out, index = c("NOC", "Year"), model = "pooling")

ols_m_h <- summary(ols_m_h)
fe_m_h <- summary(fe_m_h)
re_m_h <- summary(re_m_h)
fgls_m_h <- summary(fgls_m_h)

ols_m_w <- lm(N_Medal ~ W_Mean + Age_Mean + HDI, data = dat.regression_out)
fe_m_w <- plm(N_Medal ~ W_Mean + Age_Mean + HDI, data = dat.regression_out, index = c("NOC", "Year"), model = "within")
re_m_w <- plm(N_Medal ~ W_Mean + Age_Mean + HDI, data = dat.regression_out, index = c("NOC", "Year"), model = "random")
fgls_m_w <- pggls(N_Medal ~ W_Mean + Age_Mean + HDI, data = dat.regression_out, index = c("NOC", "Year"), model = "pooling")

ols_m_w <- summary(ols_m_w)
fe_m_w <- summary(fe_m_w)
re_m_w <- summary(re_m_w)
fgls_m_w <- summary(fgls_m_w)

Mô hình 1 - tác động của chiều cao tới số lượng huy chương
	OLS	FE	RE	FGLS
(Intercept)	-94.209 **	_	-31.016	-75.721 **
H_Mean	0.77 ***	0.282	0.339 *	0.566 ***
Age_Mean	-2.958 ***	-0.531	-0.634	-0.586
HDI	84.584 ***	16.906	23.07 **	26.978 ***
R^2	0.144	0.021	0.03	0.097

Mô hình 2 - tác động của cân nặng tới số lượng huy chương
	OLS	FE	RE	FGLS
(Intercept)	7.859	_	17.309	5.401
W_Mean	0.144	0.144	0.161	0.228 **
Age_Mean	-2.337 **	-0.56	-0.65	-0.484
HDI	93.59 ***	17.23	23.383 **	26.02 **
R^2	0.121	0.016	0.024	0.069

Nhóm 4

Khánh

2022-04-12