1. Thư viện ggplot2

1.1. Tập dữ liệu MosaicData

data(CPS85, package = "mosaicData")
head(CPS85)

##   wage educ race sex hispanic south married exper union age   sector
## 1  9.0   10    W   M       NH    NS Married    27   Not  43    const
## 2  5.5   12    W   M       NH    NS Married    20   Not  38    sales
## 3  3.8   12    W   F       NH    NS  Single     4   Not  22    sales
## 4 10.5   12    W   F       NH    NS Married    29   Not  47 clerical
## 5 15.0   12    W   M       NH    NS Married    40 Union  58    const
## 6  9.0   16    W   F       NH    NS Married    27   Not  49 clerical

1.2. Lớp ggplot

library(ggplot2)
# Ánh xạ các biến tới các trục tọa độ
ggplot(data = CPS85, mapping = aes(x = exper, y = wage))

1.3. Lớp geom

# Hàm geom_point : Điểm dữ liệu
ggplot(data = CPS85, mapping = aes(x = exper, y = wage)) + 
  geom_point()

# Loại bỏ dữ liệu ngoại lai (lọc dữ liệu)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

plotdata <- filter(CPS85, wage<40)
# Trực quan lại dữ liệu mới
ggplot(data = plotdata, mapping = aes(x = exper, y = wage)) +
  geom_point()

# Tuỳ chọn color, size, alpha tương ứng màu, kích cỡ, độ trong suốt (0-1) của điểm dữ liệu
ggplot(data = plotdata, mapping = aes(x = exper, y = wage)) +
  geom_point(color = "cornflowerblue", alpha = 0.7, size = 3)

# Hàm geom_smooth : Đường thẳng hồi quy
# method = "lm" - mô hình hồi quy tuyến tính
ggplot(data = plotdata, mapping = aes(x = exper, y = wage)) +
  geom_point(color = "cornflowerblue", alpha = 0.7, size = 2) +
  geom_smooth(method = "lm")

## `geom_smooth()` using formula = 'y ~ x'

1.4. Trực quan nhóm

# Trực quan 2 biến liên tục và 1 biến phân loại
ggplot(data = plotdata, mapping = aes(x = exper, y = wage, color = sex)) +
  geom_point(alpha = 0.7,size = 2) +
  geom_smooth(method = "lm", se = FALSE, size = 1.5)

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `geom_smooth()` using formula = 'y ~ x'

1.5. Co chiều dữ liệu

ggplot(data = plotdata, mapping = aes(x = exper, y = wage, color = sex)) +
  geom_point(alpha = 0.7,size = 2) +
  geom_smooth(method = "lm", se = FALSE, size = 1.5) +
  scale_x_continuous(breaks = seq(0,60,10)) +
  scale_y_continuous(breaks = seq(0,30,5),
                     label = scales::dollar) +
  scale_color_manual(values = c("indianred3", "cornflowerblue"))

## `geom_smooth()` using formula = 'y ~ x'

1.6. Trực quan bằng facet

ggplot(data = plotdata, mapping = aes(x = exper, y = wage, color = sex)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, size = 1.5) +
  scale_x_continuous(breaks = seq(0,60,10)) +
  scale_y_continuous(breaks = seq(0,30,5), label = scales::dollar) +
  scale_color_manual(values = c("indianred3", "cornflowerblue")) +
  facet_wrap(~ sector) +
  labs(title = "The correlation between wage and experience", 
       subtitle = "Survey of Population",
       caption = "Nguồn: http://mosaic-web.org/",
       x = "Experience",
       y = "Wage (USD)",
       color = "Gender") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

2. Ánh xạ dữ liệu

2.1. Ánh xạ trong hàm

ggplot(data = plotdata, mapping = aes(x = exper, y = wage, color = sex)) +
  geom_point(alpha = 0.7,size = 2) +
  geom_smooth(method = "lm", formula = y ~ poly(x,2), se = FALSE, size = 1.5)

2.2. Ánh xạ ngoài hàm

ggplot(data = plotdata, mapping = aes(x = exper, y = wage)) +
  geom_point(aes(color = sex)) +
  geom_smooth(method = "lm", formula = y ~ poly(x,2), se = FALSE, size = 1.5)

3. Biểu đồ thống kê

3.1. Tập dữ liệu Hoa Iris

library(dplyr)
str(iris)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

iris %>% head(10)

##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1           5.1         3.5          1.4         0.2  setosa
## 2           4.9         3.0          1.4         0.2  setosa
## 3           4.7         3.2          1.3         0.2  setosa
## 4           4.6         3.1          1.5         0.2  setosa
## 5           5.0         3.6          1.4         0.2  setosa
## 6           5.4         3.9          1.7         0.4  setosa
## 7           4.6         3.4          1.4         0.3  setosa
## 8           5.0         3.4          1.5         0.2  setosa
## 9           4.4         2.9          1.4         0.2  setosa
## 10          4.9         3.1          1.5         0.1  setosa

3.2. Biểu đồ Histogram bằng ggplot2

Biểu đồ phân bố tần số (còn được gọi là biểu đồ phân bố mật độ, biểu đồ cột)
Ba đặc trưng quan trọng của biểu đồ phân bố tần số là tâm điểm, độ rộng, độ dốc.

library(ggplot2)
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(grid)
# Sepal.Length
Sepal_Length_His = ggplot(data = iris, aes(x = Sepal.Length)) +
  # Biểu đồ Histogram
  geom_histogram(binwidth = 0.2, color = "black", aes(fill = Species)) +
  # Thêm nhãn x, y
  xlab("Sepal Length (cm)") +
  ylab("Frequency") +
  # Xoá chú thích
  theme(legend.position = "left") +
  # Thêm tiêu đề
  ggtitle("Histogram of Sepal Length") +
  # Thêm một đường thẳng đứng biểu thị giá trị trung bình của 'Sepal.Length'
  geom_vline(data = iris, aes(xintercept = mean(Sepal.Length)), linetype = "dotted")

# Sepal.Width
Sepal_Width_His = ggplot(data = iris, aes(x = Sepal.Width)) +
  geom_histogram(binwidth = 0.2, color = "black", aes(fill = Species)) +
  xlab("Sepal Width (cm)") +
  ylab("Frequency") +
  theme(legend.position = "right") +
  ggtitle("Histogram of Sepal Width") +
  geom_vline(data = iris, aes(xintercept = mean(Sepal.Width)), linetype = "dashed")

# Petal.Length
Petal_Length_His = ggplot(data = iris, aes(x = Petal.Length)) +
  geom_histogram(binwidth = 0.2, color = "black", aes(fill = Species)) +
  xlab("Petal Length (cm)") +
  ylab("Frequency") +
  theme(legend.position = "left") +
  ggtitle("Histogram of Petal Length") +
  geom_vline(data = iris, aes(xintercept = mean(Petal.Length)), linetype="dashed")

# Petal.Width
Petal_Width_His = ggplot(data = iris, aes(x = Petal.Width)) +
  geom_histogram(binwidth = 0.2, color = "black", aes(fill = Species)) +
  xlab("Petal Width (cm)") +
  ylab("Frequency") +
  theme(legend.position = "right") +
  ggtitle("Histogram of Petal Width") +
  geom_vline(data=iris, aes(xintercept = mean(Petal.Width)), linetype="dashed")

grid.arrange(Sepal_Length_His, Sepal_Width_His, Petal_Length_His, Petal_Width_His,
  # Chọn số hàng cho lưới
  nrow = 2,
  # Thêm tiêu đề, đặt kích thước phông chữ
  top = textGrob("Iris Frequency Histogram", gp=gpar(fontsize=15)))

3.3. Biểu đồ Density Plot

# Sepal.Length
Sepal_Length_His = ggplot(data = iris, aes(x = Sepal.Length)) +
  # Biểu đồ Histogram
  geom_density(binwidth = 0.2, color = "black", aes(fill = Species), alpha = 0.5) +
  # Thêm nhãn x, y
  xlab("Sepal Length (cm)") +
  ylab("Frequency") +
  # Xoá chú thích
  theme(legend.position = "left") +
  # Thêm tiêu đề
  ggtitle("Histogram of Sepal Length") +
  # Thêm một đường thẳng đứng biểu thị giá trị trung bình của 'Sepal.Length'
  geom_vline(data = iris, aes(xintercept = mean(Sepal.Length)), linetype = "dotted")

## Warning in geom_density(binwidth = 0.2, color = "black", aes(fill = Species), :
## Ignoring unknown parameters: `binwidth`

# Sepal.Width
Sepal_Width_His = ggplot(data = iris, aes(x = Sepal.Width)) +
  geom_density(binwidth = 0.2, color = "black", aes(fill = Species), alpha = 0.5) +
  xlab("Sepal Width (cm)") +
  ylab("Frequency") +
  theme(legend.position = "right") +
  ggtitle("Histogram of Sepal Width") +
  geom_vline(data = iris, aes(xintercept = mean(Sepal.Width)), linetype = "dashed")

## Warning in geom_density(binwidth = 0.2, color = "black", aes(fill = Species), :
## Ignoring unknown parameters: `binwidth`

# Petal.Length
Petal_Length_His = ggplot(data = iris, aes(x = Petal.Length)) +
  geom_density(binwidth = 0.2, color = "black", aes(fill = Species), alpha = 0.5) +
  xlab("Petal Length (cm)") +
  ylab("Frequency") +
  theme(legend.position = "left") +
  ggtitle("Histogram of Petal Length") +
  geom_vline(data = iris, aes(xintercept = mean(Petal.Length)), linetype="dashed")

## Warning in geom_density(binwidth = 0.2, color = "black", aes(fill = Species), :
## Ignoring unknown parameters: `binwidth`

# Petal.Width
Petal_Width_His = ggplot(data = iris, aes(x = Petal.Width)) +
  geom_density(binwidth = 0.2, color = "black", aes(fill = Species), alpha = 0.5) +
  xlab("Petal Width (cm)") +
  ylab("Frequency") +
  theme(legend.position = "right") +
  ggtitle("Histogram of Petal Width") +
  geom_vline(data=iris, aes(xintercept = mean(Petal.Width)), linetype="dashed")

## Warning in geom_density(binwidth = 0.2, color = "black", aes(fill = Species), :
## Ignoring unknown parameters: `binwidth`

grid.arrange(Sepal_Length_His, Sepal_Width_His, Petal_Length_His, Petal_Width_His,
  # Chọn số hàng cho lưới
  nrow = 2,
  # Thêm tiêu đề, đặt kích thước phông chữ
  top = textGrob("Iris Frequency Histogram", gp=gpar(fontsize=15)))

3.4. Biểu đồ Box Plot

Sepal_Length_His = ggplot(data = iris, aes(x = Sepal.Length)) +
  # Biểu đồ Histogram
  geom_boxplot(binwidth = 0.2, color = "black", aes(fill = Species)) +
  # Thêm nhãn x, y
  xlab("Sepal Length (cm)") +
  ylab("Frequency") +
  # Xoá chú thích
  theme(legend.position = "left") +
  # Thêm tiêu đề
  ggtitle("Histogram of Sepal Length") +
  # Thêm một đường thẳng đứng biểu thị giá trị trung bình của 'Sepal.Length'
  geom_vline(data = iris, aes(xintercept = mean(Sepal.Length)), linetype = "dotted")

## Warning in geom_boxplot(binwidth = 0.2, color = "black", aes(fill = Species)):
## Ignoring unknown parameters: `binwidth`

# Sepal.Width
Sepal_Width_His = ggplot(data = iris, aes(x = Sepal.Width)) +
  geom_boxplot(binwidth = 0.2, color = "black", aes(fill = Species)) +
  xlab("Sepal Width (cm)") +
  ylab("Frequency") +
  theme(legend.position = "right") +
  ggtitle("Histogram of Sepal Width") +
  geom_vline(data = iris, aes(xintercept = mean(Sepal.Width)), linetype = "dashed")

## Warning in geom_boxplot(binwidth = 0.2, color = "black", aes(fill = Species)):
## Ignoring unknown parameters: `binwidth`

# Petal.Length
Petal_Length_His = ggplot(data = iris, aes(x = Petal.Length)) +
  geom_boxplot(binwidth = 0.2, color = "black", aes(fill = Species)) +
  xlab("Petal Length (cm)") +
  ylab("Frequency") +
  theme(legend.position = "left") +
  ggtitle("Histogram of Petal Length") +
  geom_vline(data = iris, aes(xintercept = mean(Petal.Length)), linetype="dashed")

## Warning in geom_boxplot(binwidth = 0.2, color = "black", aes(fill = Species)):
## Ignoring unknown parameters: `binwidth`

# Petal.Width
Petal_Width_His = ggplot(data = iris, aes(x = Petal.Width)) +
  geom_boxplot(binwidth = 0.2, color = "black", aes(fill = Species)) +
  xlab("Petal Width (cm)") +
  ylab("Frequency") +
  theme(legend.position = "right") +
  ggtitle("Histogram of Petal Width") +
  geom_vline(data=iris, aes(xintercept = mean(Petal.Width)), linetype="dashed")

## Warning in geom_boxplot(binwidth = 0.2, color = "black", aes(fill = Species)):
## Ignoring unknown parameters: `binwidth`

grid.arrange(Sepal_Length_His, Sepal_Width_His, Petal_Length_His, Petal_Width_His,
  # Chọn số hàng cho lưới
  nrow = 2,
  # Thêm tiêu đề, đặt kích thước phông chữ
  top = textGrob("Iris Frequency Histogram", gp=gpar(fontsize=15)))

4. Khám phá dữ liệu EDA

Thao tác nâng cao với sáu hàm cơ bản trong dplyr (select, filter, group_by, mutate, summarize, arrange).
Khả năng sử dụng toán tử pipe forward nâng cao.

4.1. Thư viện dplyr

Lọc theo dòng
Lọc theo cột
Sắp xếp tăng/giảm theo dòng
Tạo ra cột (biến số mới)
Thống kê theo nhóm

4.2. Tập dữ liệu Gapminder

library(gapminder)
data("gapminder")
unique(gapminder$country) -> obj

4.3. Toán tử pipe

Toán tử Forward Pipe %>% : Chuyển toàn bộ kết quả của hàm đi trước (bên trái) thành dữ liệu đầu vào của hàm đi sau (bên phải)

library(dplyr)
rnorm(100,20,5) %>%
  matrix(ncol=2) %>%
  data.frame(x=.[,1],y=.[,2]) %>%
  plot(col="red")

Toán tử T pipe %T>% : Dữ liệu đầu vào của 1 hàm A đi trước sẽ được truyền cho 2 nhánh tương ứng với quy trình B1 và quy trình B2

rnorm(100,10,1) %T>%
  ts.plot(col="red") %>%
  density() %>%
  plot(col="blue","densityplot")

Toán tử Assigning pipe %$% : Trích xuất đích danh một đối tượng trong kết quả của hàm đi trước để sử dụng như dữ liệu đầu vào cho hàm đi sau
- Đi với thư viện magrittr

library(magrittr)
data.frame(x = rnorm(100,10,5), y = rnorm(100,20,5)) %$% 
  ts.plot(x,col="red")

data.frame(x = rnorm(100,10,5), y = rnorm(100,20,5)) %$% 
  ts.plot(y,col="blue")

Toán tử Backward pipe %<>% : Kết quả cuối cùng sau mọi quy trình bên phải trong chuỗi sẽ được truyền ngược về object đầu tiên nằm ngoài cùng bên trái của chuỗi

library(magrittr)
x = abs(rnorm(100))
y = x
cbind(x,y) %>% head()

##              x         y
## [1,] 0.4668927 0.4668927
## [2,] 1.2633983 1.2633983
## [3,] 0.5414329 0.5414329
## [4,] 1.3038096 1.3038096
## [5,] 1.2518578 1.2518578
## [6,] 0.1316260 0.1316260

x %<>% .^2 %>% sqrt()
x

##   [1] 0.46689266 1.26339827 0.54143294 1.30380959 1.25185778 0.13162604
##   [7] 0.48355718 1.12337536 0.12240449 0.62095073 0.27752536 0.07975628
##  [13] 0.73291150 0.65251004 0.39600529 0.31839095 0.42947084 0.83434967
##  [19] 0.51627074 1.59341715 0.29488339 0.15160037 1.27699742 0.84884192
##  [25] 0.43220156 1.43341616 0.64519088 0.57914965 0.86393628 1.36719563
##  [31] 0.01768203 0.65027176 0.81309093 1.01886766 1.16202835 2.35555185
##  [37] 1.21364360 0.54716084 1.59741168 2.20142356 0.53734122 1.38407281
##  [43] 0.94212747 1.45396711 1.13371906 0.50956938 2.30711250 0.33554023
##  [49] 2.08085571 0.85744888 1.00297536 1.19120585 0.30619839 0.08300870
##  [55] 1.28342650 0.96539297 0.03793506 0.66777800 0.50097693 1.94021303
##  [61] 0.31245508 0.62444058 0.62049258 1.53999178 0.03992903 1.13449154
##  [67] 0.14832951 0.18729978 1.62014298 0.82341916 1.69425977 0.79224704
##  [73] 1.33946286 0.91238584 0.41301031 0.99806509 0.60251561 0.98240187
##  [79] 0.75142604 0.49434351 1.58077659 0.91467860 0.08735286 0.78005382
##  [85] 1.44345342 2.77020867 0.61060486 0.26014558 0.69056500 1.04073433
##  [91] 0.31898618 1.81585405 0.31679163 2.21229687 0.39783909 0.73367516
##  [97] 0.64552491 0.52095590 0.91182124 1.09982919

4.4. Tập dữ liệu ariquality

library(dplyr)
data("airquality")
head(airquality)

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6

4.5. Các hàm chính trong dplyr

Hàm Select : Trả về một tập con gồm các cột

dplyr::select(airquality, Ozone, Temp) %>% head(4)

##   Ozone Temp
## 1    41   67
## 2    36   72
## 3    12   74
## 4    18   62

airquality %>% dplyr::select(-Ozone, -Temp) %>% head(4)

##   Solar.R Wind Month Day
## 1     190  7.4     5   1
## 2     118  8.0     5   2
## 3     149 12.6     5   3
## 4     313 11.5     5   4

Hàm filter : Lọc dữ liệu, kết quả của nó trả về các dữ liệu thỏa mãn một điều kiện nào đó, thông thường hàm filter sẽ lọc dữ liệu theo dòng

filter(airquality, Ozone > 106) %>% head(5)

##   Ozone Solar.R Wind Temp Month Day
## 1   115     223  5.7   79     5  30
## 2   135     269  4.1   84     7   1
## 3   108     223  8.0   85     7  25
## 4   122     255  4.0   89     8   7
## 5   110     207  8.0   90     8   9

filter(airquality, Ozone > 106 & Month == 8) %>% head(5)

##   Ozone Solar.R Wind Temp Month Day
## 1   122     255  4.0   89     8   7
## 2   110     207  8.0   90     8   9
## 3   168     238  3.4   81     8  25
## 4   118     225  2.3   94     8  29

filter(airquality, Ozone > 106, Month == 8, Wind > 7) %>% head(5)

##   Ozone Solar.R Wind Temp Month Day
## 1   110     207    8   90     8   9

Hàm mutate : Tạo một cột biến mới thỏa mãn một điều kiện nào đó hoặc đơn thuần là tạo ra cột biến mới dựa theo một công thức tính toán nào đó

mutate(airquality, temp.celsius = Temp - 31) %>% head(5)

##   Ozone Solar.R Wind Temp Month Day temp.celsius
## 1    41     190  7.4   67     5   1           36
## 2    36     118  8.0   72     5   2           41
## 3    12     149 12.6   74     5   3           43
## 4    18     313 11.5   62     5   4           31
## 5    NA      NA 14.3   56     5   5           25

Hàm summarize và group_by :
- Hàm summarize : Khái quát thống kê, tổng kết nhiều giá trị nào đó trở thành một giá trị.
- Kết hợp với hàm group_by : Sắp xếp theo nhóm

summarize(group_by(airquality, Month), round(mean(Temp, na.rm = T)))

## # A tibble: 5 × 2
##   Month `round(mean(Temp, na.rm = T))`
##   <int>                          <dbl>
## 1     5                             66
## 2     6                             79
## 3     7                             84
## 4     8                             84
## 5     9                             77

4.6. Phân tích dữ liệu gapminder

4.6.1. Lọc dữ liệu

# Việt Nam từ 2000 trở đi
gapminder %>% 
  filter(country == "Vietnam" & year > 2000)

## # A tibble: 2 × 6
##   country continent  year lifeExp      pop gdpPercap
##   <fct>   <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Vietnam Asia       2002    73.0 80908147     1764.
## 2 Vietnam Asia       2007    74.2 85262356     2442.

# Việt Nam và USA
gapminder %>%
  filter(country == "Vietnam" | country == "United States") %>%
  head(4)

## # A tibble: 4 × 6
##   country       continent  year lifeExp       pop gdpPercap
##   <fct>         <fct>     <int>   <dbl>     <int>     <dbl>
## 1 United States Americas   1952    68.4 157553000    13990.
## 2 United States Americas   1957    69.5 171984000    14847.
## 3 United States Americas   1962    70.2 186538000    16173.
## 4 United States Americas   1967    70.8 198712000    19530.

# Lọc ra các quốc gia có tuổi thọ trên 80, sắp xếp các quốc gia này theo chiều tăng dần GDP
gapminder %>% 
  filter(lifeExp >= 80) %>%
  arrange(gdpPercap) %>%
  head(n=10)

## # A tibble: 10 × 6
##    country          continent  year lifeExp       pop gdpPercap
##    <fct>            <fct>     <int>   <dbl>     <int>     <dbl>
##  1 New Zealand      Oceania    2007    80.2   4115771    25185.
##  2 Israel           Asia       2007    80.7   6426679    25523.
##  3 Italy            Europe     2002    80.2  57926999    27968.
##  4 Hong Kong, China Asia       1997    80     6495918    28378.
##  5 Italy            Europe     2007    80.5  58147733    28570.
##  6 Japan            Asia       2002    82   127065841    28605.
##  7 Japan            Asia       1997    80.7 125956499    28817.
##  8 Spain            Europe     2007    80.9  40448191    28821.
##  9 Sweden           Europe     2002    80.0   8954175    29342.
## 10 Hong Kong, China Asia       2002    81.5   6762476    30209.

4.6.2. Lựa chọn cột

gapminder %>%
  dplyr::select(country, lifeExp) %>%
  head()

## # A tibble: 6 × 2
##   country     lifeExp
##   <fct>         <dbl>
## 1 Afghanistan    28.8
## 2 Afghanistan    30.3
## 3 Afghanistan    32.0
## 4 Afghanistan    34.0
## 5 Afghanistan    36.1
## 6 Afghanistan    38.4

# Sắp xếp theo thứ tự dân số các nước, arrange()
gapminder %>%
  dplyr::select(country, gdpPercap, pop) %>%
  arrange(pop) %>%
  head(n=10)

## # A tibble: 10 × 3
##    country               gdpPercap   pop
##    <fct>                     <dbl> <int>
##  1 Sao Tome and Principe      880. 60011
##  2 Sao Tome and Principe      861. 61325
##  3 Djibouti                  2670. 63149
##  4 Sao Tome and Principe     1072. 65345
##  5 Sao Tome and Principe     1385. 70787
##  6 Djibouti                  2865. 71851
##  7 Sao Tome and Principe     1533. 76595
##  8 Sao Tome and Principe     1738. 86796
##  9 Djibouti                  3021. 89898
## 10 Sao Tome and Principe     1890. 98593

# Sắp xếp theo năm mới nhất và thứ tự giảm dần về GDP các nước, sử dụng arrange() cho 2 biến
gapminder %>%
  dplyr::select(country, gdpPercap, year) %>%
  arrange(desc(year), desc(gdpPercap)) %>%
  head(n=10)

## # A tibble: 10 × 3
##    country          gdpPercap  year
##    <fct>                <dbl> <int>
##  1 Norway              49357.  2007
##  2 Kuwait              47307.  2007
##  3 Singapore           47143.  2007
##  4 United States       42952.  2007
##  5 Ireland             40676.  2007
##  6 Hong Kong, China    39725.  2007
##  7 Switzerland         37506.  2007
##  8 Netherlands         36798.  2007
##  9 Canada              36319.  2007
## 10 Iceland             36181.  2007

4.6.3. Bổ sung cột

# Thêm một cột biến mới, có giá trị là tổng GDP các nước.
gapminder %>%
  mutate(total_gdp = gdpPercap * pop) %>%
  head()

## # A tibble: 6 × 7
##   country     continent  year lifeExp      pop gdpPercap    total_gdp
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>        <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.  6567086330.
## 2 Afghanistan Asia       1957    30.3  9240934      821.  7585448670.
## 3 Afghanistan Asia       1962    32.0 10267083      853.  8758855797.
## 4 Afghanistan Asia       1967    34.0 11537966      836.  9648014150.
## 5 Afghanistan Asia       1972    36.1 13079460      740.  9678553274.
## 6 Afghanistan Asia       1977    38.4 14880372      786. 11697659231.

# Sử dụng hoán chuyển dữ liệu, sau đó sắp xếp giảm dần theo gdp
gapminder %>%
  mutate(log_gdp = log10(gdpPercap), log_pop = log10(pop)) %>%
  arrange(desc(log_gdp)) %>%
  head()

## # A tibble: 6 × 8
##   country continent  year lifeExp     pop gdpPercap log_gdp log_pop
##   <fct>   <fct>     <int>   <dbl>   <int>     <dbl>   <dbl>   <dbl>
## 1 Kuwait  Asia       1957    58.0  212846   113523.    5.06    5.33
## 2 Kuwait  Asia       1972    67.7  841934   109348.    5.04    5.93
## 3 Kuwait  Asia       1952    55.6  160000   108382.    5.03    5.20
## 4 Kuwait  Asia       1962    60.5  358266    95458.    4.98    5.55
## 5 Kuwait  Asia       1967    64.6  575003    80895.    4.91    5.76
## 6 Kuwait  Asia       1977    69.3 1140357    59265.    4.77    6.06

4.6.4. Thống kê nhóm

#  Tính giá trị trung bình và giá trị trung vị cho các quốc gia theo các năm
gapminder %>%
  group_by(year) %>%
  summarise(mean_gdp = mean(gdpPercap), median_gdp = median(gdpPercap))

## # A tibble: 12 × 3
##     year mean_gdp median_gdp
##    <int>    <dbl>      <dbl>
##  1  1952    3725.      1969.
##  2  1957    4299.      2173.
##  3  1962    4726.      2335.
##  4  1967    5484.      2678.
##  5  1972    6770.      3339.
##  6  1977    7313.      3799.
##  7  1982    7519.      4216.
##  8  1987    7901.      4280.
##  9  1992    8159.      4386.
## 10  1997    9090.      4782.
## 11  2002    9918.      5320.
## 12  2007   11680.      6124.

4.6.5. Hàm đếm dữ liệu trống

count_missing <- function(x) {
  is.na(x) -> missing_value
  sum(missing_value) -> all_missing_value
  return(all_missing_value)
}

myvector <- c(NA, 3, 5, 4, 8, NA, 3, 88, NA, NA, NA)
count_missing(myvector)

## [1] 5

Tập dữ liệu nycflights13

library(nycflights13)
data("flights")
# Khởi tạo một véc tơ trống
vector_missing_flights <- c()
# Gán cho k là số cột của tập dữ liệu
k <- ncol(flights)
# Sử dụng vòng lặp duyệt qua từng cột
# Hàm pull() cho phép đẩy ra giá trị từng cột
for (i in 1:k ) {
  flights %>% pull(i) -> alternative_values
  count_missing(alternative_values) -> missing_values_flights
  vector_missing_flights <- c(vector_missing_flights,
  missing_values_flights)
}

names(flights) -> col_name
final_result <- data.frame(col_name, vector_missing_flights)
names(final_result)[2] <- c("n_missing")
final_result

##          col_name n_missing
## 1            year         0
## 2           month         0
## 3             day         0
## 4        dep_time      8255
## 5  sched_dep_time         0
## 6       dep_delay      8255
## 7        arr_time      8713
## 8  sched_arr_time         0
## 9       arr_delay      9430
## 10        carrier         0
## 11         flight         0
## 12        tailnum      2512
## 13         origin         0
## 14           dest         0
## 15       air_time      9430
## 16       distance         0
## 17           hour         0
## 18         minute         0
## 19      time_hour         0

4.6.6. Tần số xuất hiện

# Thống kê đếm số lượng các quốc gia được liệt kê trong biến country:
gapminder %>%
  group_by(country) %>%
  summarize(count_country = n()) %>%
  head()

## # A tibble: 6 × 2
##   country     count_country
##   <fct>               <int>
## 1 Afghanistan            12
## 2 Albania                12
## 3 Algeria                12
## 4 Angola                 12
## 5 Argentina              12
## 6 Australia              12

# Thống kê trong tập dữ liệu theo năm, quốc gia nào có tuổi thọ dân số cao nhất theo các năm, ở châu lục nào, tuổi thọ cao nhất năm đó là bao nhiêu?
gapminder %>%
  group_by( year) %>%
  summarize(longest_life = max(lifeExp),
  country = country[which.max(lifeExp)],
  coninent = continent[which.max(lifeExp)])

## # A tibble: 12 × 4
##     year longest_life country coninent
##    <int>        <dbl> <fct>   <fct>   
##  1  1952         72.7 Norway  Europe  
##  2  1957         73.5 Iceland Europe  
##  3  1962         73.7 Iceland Europe  
##  4  1967         74.2 Sweden  Europe  
##  5  1972         74.7 Sweden  Europe  
##  6  1977         76.1 Iceland Europe  
##  7  1982         77.1 Japan   Asia    
##  8  1987         78.7 Japan   Asia    
##  9  1992         79.4 Japan   Asia    
## 10  1997         80.7 Japan   Asia    
## 11  2002         82   Japan   Asia    
## 12  2007         82.6 Japan   Asia

4.7. Trực quan dữ liệu Gapminder

4.7.1. Biểu đồ dây

Phân tích về tuổi thọ trung bình của người Việt Nam qua các mốc thời gian

library(ggplot2)
vietnam_data <- gapminder %>%
  filter(country == "Vietnam")
  ggplot(data = vietnam_data, mapping = aes(x = year, y = lifeExp)) +
  geom_line(size = 1.5, color = "lightgrey") +
  geom_point(size = 3, color = "steelblue") +
  labs(y = "Life Expectancy (years)", x = "Year",
       title = "Life expectancy changes over time",
       subtitle = "Vietnam (1952-2007)",
       caption = "Source: http://www.gapminder.org/data/")

vietnam_data

## # A tibble: 12 × 6
##    country continent  year lifeExp      pop gdpPercap
##    <fct>   <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Vietnam Asia       1952    40.4 26246839      605.
##  2 Vietnam Asia       1957    42.9 28998543      676.
##  3 Vietnam Asia       1962    45.4 33796140      772.
##  4 Vietnam Asia       1967    47.8 39463910      637.
##  5 Vietnam Asia       1972    50.3 44655014      700.
##  6 Vietnam Asia       1977    55.8 50533506      714.
##  7 Vietnam Asia       1982    58.8 56142181      707.
##  8 Vietnam Asia       1987    62.8 62826491      821.
##  9 Vietnam Asia       1992    67.7 69940728      989.
## 10 Vietnam Asia       1997    70.7 76048996     1386.
## 11 Vietnam Asia       2002    73.0 80908147     1764.
## 12 Vietnam Asia       2007    74.2 85262356     2442.

Xem xu hướng của các quốc gia khác

ggplot(data = gapminder, mapping = aes(x = year, y = lifeExp, group = country)) +
  geom_line(size = 0.5, color = "red") +
  facet_grid(.~ continent) +
  coord_flip() +
  labs(y = "Life Expectancy (years)", x = "Year",
       title = "Life expectancy changes over time",
       subtitle = "Period (1952-2007)",
       caption = "Source: http://www.gapminder.org/data/")

4.7.2. Biểu đồ mật độ xác suất

sự phân bổ của GDP bình quân đầu người và tuổi thọ:

ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) +
  geom_point(size = 0.25) +
  geom_density_2d() +
  scale_x_log10() +
  labs(y = "Life Expectancy (years)", x = "Year",
       title = "Joint Distribution",
       subtitle = "Period (1952-2007)",
       caption = "Source: http://www.gapminder.org/data/")

## Warning: Computation failed in `stat_density2d()`
## Caused by error in `loadNamespace()`:
## ! there is no package called 'isoband'

Vẽ biểu đồ so sánh về xu hướng của GDP và tuổi thọ trung bình theo các năm cho các quốc gia sau: Việt Nam, United States, Ghana

gapminder %>%
  filter(country %in% c("Vietnam","United States", "Ghana")) -> plotdata
plotdata %>%
  head(n=10)

## # A tibble: 10 × 6
##    country continent  year lifeExp      pop gdpPercap
##    <fct>   <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Ghana   Africa     1952    43.1  5581001      911.
##  2 Ghana   Africa     1957    44.8  6391288     1044.
##  3 Ghana   Africa     1962    46.5  7355248     1190.
##  4 Ghana   Africa     1967    48.1  8490213     1126.
##  5 Ghana   Africa     1972    49.9  9354120     1178.
##  6 Ghana   Africa     1977    51.8 10538093      993.
##  7 Ghana   Africa     1982    53.7 11400338      876.
##  8 Ghana   Africa     1987    55.7 14168101      847.
##  9 Ghana   Africa     1992    57.5 16278738      925.
## 10 Ghana   Africa     1997    58.6 18418288     1005.

labels <- plotdata %>%
  filter(year == max(year))

ggplot(plotdata, aes(x = year, y = lifeExp)) +
  geom_point(size = 0.8, color = "blue") +
  geom_line(size = 0.5, color = "red") +
  geom_text(aes(x = year + 1, y = lifeExp, label = country),
            data = labels, hjust = "left", size = 1.5) +
  ggtitle("Life Expectancy Over Time") +
  xlab(NULL) +
  ylab("Life Expectancy (Years)") +
  facet_grid(continent ~.) +
  xlim(1950, 2015) +
  theme_bw()

5. Hệ số tương quan

5.1. Các loại hệ số tương quan

Hệ số tương quan Pearson (Pearson correlation coeffcient, ký hiệu r): cor(x,y)
Hệ số tương quan Spearman 𝜌 được sử dụng khi hai biến x và y không tuân theo luật phân phối chuẩn; cor.test(x, y, method = “spearman”)
Hệ số tương quan Kendall 𝜏 cũng là một phương pháp phân tích phi tham số được ước tính bằng cách tìm các cặp số (x,y) “song hành” (concordant) với nhau: cor.test(x, y, method=“kendall”)

5.2. Áp dụng

# Tập dữ liệu về nồng độ cholesterol (chol), gồm 3 biến số là age, bmi và chol cho 18 đối tượng được quan sát
library(dplyr)
library(corrplot)

## corrplot 0.92 loaded

library(ggplot2)

age <- c(46, 20, 52, 30, 57, 25, 55, 67,44, 33, 44, 78, 33, 56, 87, 46, 33, 11)
bmi <- c(25.4,20.6, 26.2,22.6,25.4, 23.1, 22.7, 24.9, 19.8, 25.3, 23.2, 21.8, 20.9, 26.7, 26.4, 21.2, 21.2, 22.8)
chol <- c(3.5, 1.9, 4.0, 2.6, 4.5, 3.0, 2.9, 3.8, 2.1, 3.8, 4.1, 3.0, 2.5, 4.6, 3.2, 4.2, 2.3, 4.0)

data <- data.frame(age, bmi, chol)
data %>% head()

##   age  bmi chol
## 1  46 25.4  3.5
## 2  20 20.6  1.9
## 3  52 26.2  4.0
## 4  30 22.6  2.6
## 5  57 25.4  4.5
## 6  25 23.1  3.0

M <- cor(data)
corrplot(M, method = "shade")

ggplot(data, aes(x= age, y= chol)) +
  geom_point(col = "red")

M <- cor(data)
corrplot(M, method = "number")

6.Hồi quy tuyến tính

6.1. Hiển thị dữ liệu

library(reticulate)

import numpy as np
import matplotlib.pyplot as plt

# height (cm)
X = np.array([[147, 150, 153, 158, 163, 165, 168, 170, 173, 175, 178, 180, 183]]).T
# weight (kg)
y = np.array([[ 49, 50, 51, 54, 58, 59, 60, 62, 63, 64, 66, 67, 68]]).T

# Trực quan hóa dữ liệu
plt.plot(X, y, 'ro')
plt.axis([140, 190, 45, 75])

## (140.0, 190.0, 45.0, 75.0)

plt.xlabel('Height (cm)')
plt.ylabel('Weight (kg)')
plt.show()

6.2. Nghiệm theo công thức

# Xây dựng Xbar
one = np.ones((X.shape[0], 1))
Xbar = np.concatenate((one, X), axis = 1)
# Tính trọng lượng
A = np.dot(Xbar.T, Xbar)
b = np.dot(Xbar.T, y)
w = np.dot(np.linalg.pinv(A), b)
print('w = ', w)

## w =  [[-33.73541021]
##  [  0.55920496]]

#Chuẩn bị

w_0 = w[0][0]
w_1 = w[1][0]
x0 = np.linspace(145, 185, 2)
y0 = w_0 + w_1*x0
# Vẽ đường phù hợp
plt.plot(X.T, y.T, 'ro') 
#dòng phù hợp
plt.plot(x0, y0)
plt.axis([140, 190, 45, 75])

## (140.0, 190.0, 45.0, 75.0)

plt.xlabel('Height (cm)')
plt.ylabel('Weight (kg)')
plt.show()

y1 = w_1*155 + w_0
y2 = w_1*160 + w_0
print( u'Dự đoán 152 cm: %.2f (kg),là:52 (kg)'%(y1) )

## Dự đoán 152 cm: 52.94 (kg),là:52 (kg)

print( u'Dự đoán 160 cm: %.2f (kg),là:52 (kg)'%(y2) )

## Dự đoán 160 cm: 55.74 (kg),là:52 (kg)

6.3. Nghiệm theo thư viện scikit-learn

from sklearn import datasets, linear_model
# Điều chỉnh mô hình bằng hồi quy tuyến tính
regr = linear_model.LinearRegression(fit_intercept=False) # fit_intercept = False
regr.fit(Xbar, y)

LinearRegression(fit_intercept=False)

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

# So sánh hai kết quả
print('Solution of scikit-learn:',regr.coef_ )

## Solution of scikit-learn: [[-33.73541021   0.55920496]]

print('Solution of by (5):',w.T)

## Solution of by (5): [[-33.73541021   0.55920496]]

7. Kiểm định giả thuyết thống kê

library(ggplot2)
id <- c(1:1000)
height <- rnorm(1000, mean = 170, sd = 6.3)
df <- data.frame(id, height)
ggplot(df, aes(x=height))+
  geom_histogram(aes(y=..density..),color="darkblue", fill="lightblue") -> p
p + 
  geom_vline(aes(xintercept = mean(height)), color = "red", linetype = "dashed", size = 1) + 
  geom_density(alpha=.2)

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Trực Quan Dữ Liệu _ Bài giảng

Đặng Lê Thanh Tú

2023-10-09