R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Print summary statistics for every column of the cars data frame
summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(gapminder)

# Load the gapminder dataset into the session
data("gapminder")

# Preview the first 10 rows
gapminder %>%
  head(n = 10)
## # A tibble: 10 × 6
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.
##  2 Afghanistan Asia       1957    30.3  9240934      821.
##  3 Afghanistan Asia       1962    32.0 10267083      853.
##  4 Afghanistan Asia       1967    34.0 11537966      836.
##  5 Afghanistan Asia       1972    36.1 13079460      740.
##  6 Afghanistan Asia       1977    38.4 14880372      786.
##  7 Afghanistan Asia       1982    39.9 12881816      978.
##  8 Afghanistan Asia       1987    40.8 13867957      852.
##  9 Afghanistan Asia       1992    41.7 16317921      649.
## 10 Afghanistan Asia       1997    41.8 22227415      635.
# Preview the last 10 rows
gapminder %>%
  tail(n = 10)
## # A tibble: 10 × 6
##    country  continent  year lifeExp      pop gdpPercap
##    <fct>    <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Zimbabwe Africa     1962    52.4  4277736      527.
##  2 Zimbabwe Africa     1967    54.0  4995432      570.
##  3 Zimbabwe Africa     1972    55.6  5861135      799.
##  4 Zimbabwe Africa     1977    57.7  6642107      686.
##  5 Zimbabwe Africa     1982    60.4  7636524      789.
##  6 Zimbabwe Africa     1987    62.4  9216418      706.
##  7 Zimbabwe Africa     1992    60.4 10704340      693.
##  8 Zimbabwe Africa     1997    46.8 11404948      792.
##  9 Zimbabwe Africa     2002    40.0 11926563      672.
## 10 Zimbabwe Africa     2007    43.5 12311143      470.
# Inspect the dimensions of the data (number of rows, number of columns)
gapminder %>%
  dim()
## [1] 1704    6
# Inspect the structure: the type and first values of each variable
gapminder %>%
  str()
## tibble [1,704 × 6] (S3: tbl_df/tbl/data.frame)
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ year     : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ lifeExp  : num [1:1704] 28.8 30.3 32 34 36.1 ...
##  $ pop      : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
##  $ gdpPercap: num [1:1704] 779 821 853 836 740 ...
# Tabulate the number of rows per continent with table().
# Each row is one country-year record, so these are not counts of countries.
table(gapminder[["continent"]])
## 
##   Africa Americas     Asia   Europe  Oceania 
##      624      300      396      360       24
# Rows for China (Asia) in 1997, 2002 and 2007
gapminder %>%
  filter(
    continent == "Asia" & country == "China" & year %in% c(1997, 2002, 2007)
  )
## # A tibble: 3 × 6
##   country continent  year lifeExp        pop gdpPercap
##   <fct>   <fct>     <int>   <dbl>      <int>     <dbl>
## 1 China   Asia       1997    70.4 1230075000     2289.
## 2 China   Asia       2002    72.0 1280400000     3119.
## 3 China   Asia       2007    73.0 1318683096     4959.
# Mean life expectancy for China in 2007.
# The summary column is left unnamed, so it prints as `mean(lifeExp)`.
gapminder %>%
  filter(year == 2007 & continent == "Asia" & country == "China") %>%
  summarise(mean(lifeExp))
## # A tibble: 1 × 1
##   `mean(lifeExp)`
##             <dbl>
## 1            73.0
# Total population per continent in 2007, largest first
gapminder %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarise(total_pop = sum(pop)) %>%
  arrange(-total_pop)
## # A tibble: 5 × 2
##   continent  total_pop
##   <fct>          <dbl>
## 1 Asia      3811953827
## 2 Africa     929539692
## 3 Americas   898871184
## 4 Europe     586098529
## 5 Oceania     24549947
# Add total GDP (population times GDP per capita) for the 2007 rows
# and show the first 10 of them
gapminder %>%
  filter(year == 2007) %>%
  mutate(totalGDP = pop * gdpPercap) %>%
  head(10)
## # A tibble: 10 × 7
##    country     continent  year lifeExp       pop gdpPercap      totalGDP
##    <fct>       <fct>     <int>   <dbl>     <int>     <dbl>         <dbl>
##  1 Afghanistan Asia       2007    43.8  31889923      975.  31079291949.
##  2 Albania     Europe     2007    76.4   3600523     5937.  21376411360.
##  3 Algeria     Africa     2007    72.3  33333216     6223. 207444851958.
##  4 Angola      Africa     2007    42.7  12420476     4797.  59583895818.
##  5 Argentina   Americas   2007    75.3  40301927    12779. 515033625357.
##  6 Australia   Oceania    2007    81.2  20434176    34435. 703658358894.
##  7 Austria     Europe     2007    79.8   8199783    36126. 296229400691.
##  8 Bahrain     Asia       2007    75.6    708573    29796.  21112675360.
##  9 Bangladesh  Asia       2007    64.1 150448339     1391. 209311822134.
## 10 Belgium     Europe     2007    79.4  10392226    33693. 350141166520.
# Keep only the 2007 observations and preview the first 10 rows
gapminder2007 <- filter(gapminder, year == 2007)
head(gapminder2007, n = 10)
## # A tibble: 10 × 6
##    country     continent  year lifeExp       pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>     <int>     <dbl>
##  1 Afghanistan Asia       2007    43.8  31889923      975.
##  2 Albania     Europe     2007    76.4   3600523     5937.
##  3 Algeria     Africa     2007    72.3  33333216     6223.
##  4 Angola      Africa     2007    42.7  12420476     4797.
##  5 Argentina   Americas   2007    75.3  40301927    12779.
##  6 Australia   Oceania    2007    81.2  20434176    34435.
##  7 Austria     Europe     2007    79.8   8199783    36126.
##  8 Bahrain     Asia       2007    75.6    708573    29796.
##  9 Bangladesh  Asia       2007    64.1 150448339     1391.
## 10 Belgium     Europe     2007    79.4  10392226    33693.
# Visualise life expectancy against GDP per capita (log10 x-axis),
# with points coloured by continent
ggplot(gapminder2007, aes(gdpPercap, lifeExp, colour = continent)) +
  geom_point() +
  scale_x_log10()

# Bubble chart: same axes and colours, with point size mapped to population
ggplot(
  gapminder2007,
  aes(gdpPercap, lifeExp, colour = continent, size = pop)
) +
  geom_point() +
  scale_x_log10()

# Add one smoothed trend line per continent with geom_smooth().
# method = "loess" fits a locally weighted polynomial regression (LOESS),
# i.e. a flexible smoother rather than a single global least-squares line
# (use method = "lm" for an ordinary least-squares fit).
# `size = pop` is mapped only in geom_point(): when geom_smooth() inherits it,
# ggplot2 warns that `size` is deprecated for lines and that the aesthetic was
# dropped during the statistical transformation.
ggplot(
  data = gapminder,
  mapping = aes(
    x = gdpPercap, y = lifeExp, color = continent, group = continent
  )
) +
  geom_point(aes(size = pop)) +
  geom_smooth(method = "loess") +
  scale_x_log10()
## `geom_smooth()` using formula = 'y ~ x'

# Points coloured by continent and sized by population, with a single LOESS
# trend line over all observations, custom axis labels and a bold, centred
# title.
# `color` and `size` are mapped only in geom_point() so that geom_smooth()
# does not inherit a per-observation `size`, which ggplot2 would drop with a
# warning.
ggplot(data = gapminder, mapping = aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(color = continent, size = pop)) +
  geom_smooth(method = "loess") +
  scale_x_log10() +
  labs(x = "Log GDP per Capita", y = "Life Expectancy") +
  ggtitle("Association between GDP Per Capita and Life Expectancy") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))
## `geom_smooth()` using formula = 'y ~ x'

# Keep only the 2007 observations.
# Uses left-assignment (`<-`) so the target name is visible at the start of
# the statement instead of at the end of the pipeline.
gapminder2007 <- gapminder %>%
  filter(year == 2007)
gapminder2007 %>%
  head(n = 10)
## # A tibble: 10 × 6
##    country     continent  year lifeExp       pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>     <int>     <dbl>
##  1 Afghanistan Asia       2007    43.8  31889923      975.
##  2 Albania     Europe     2007    76.4   3600523     5937.
##  3 Algeria     Africa     2007    72.3  33333216     6223.
##  4 Angola      Africa     2007    42.7  12420476     4797.
##  5 Argentina   Americas   2007    75.3  40301927    12779.
##  6 Australia   Oceania    2007    81.2  20434176    34435.
##  7 Austria     Europe     2007    79.8   8199783    36126.
##  8 Bahrain     Asia       2007    75.6    708573    29796.
##  9 Bangladesh  Asia       2007    64.1 150448339     1391.
## 10 Belgium     Europe     2007    79.4  10392226    33693.
# Histogram of GDP per capita in 2007, using 20 bins
ggplot(gapminder2007, aes(x = gdpPercap)) +
  geom_histogram(bins = 20, fill = "#00adef", colour = "red") +
  labs(title = "Distribution of GDP per Capita in 2007", y = "Frequency")

# Histogram whose y-axis is each bin's share of all observations, formatted
# as a percentage with scales::percent.
# after_stat() replaces the dot-dot notation (`..count..`), which was
# deprecated in ggplot2 3.4.0.
library(scales)
ggplot(
  data = gapminder2007,
  mapping = aes(x = gdpPercap, y = after_stat(count / sum(count)))
) +
  geom_histogram(fill = "#00adef", color = "red", bins = 20) +
  scale_y_continuous(labels = percent) +
  labs(title = "Distribution of GDP per Capita in 2007", y = "Frequency")

# Density plot of GDP per capita in 2007, one semi-transparent filled curve
# per continent
ggplot(gapminder2007, aes(x = gdpPercap, fill = continent)) +
  geom_density(alpha = 0.7)

# Keep the 2007 rows for Asian countries and preview the first 10
asia <- filter(gapminder, continent == "Asia", year == 2007)
head(asia, n = 10)
## # A tibble: 10 × 6
##    country          continent  year lifeExp        pop gdpPercap
##    <fct>            <fct>     <int>   <dbl>      <int>     <dbl>
##  1 Afghanistan      Asia       2007    43.8   31889923      975.
##  2 Bahrain          Asia       2007    75.6     708573    29796.
##  3 Bangladesh       Asia       2007    64.1  150448339     1391.
##  4 Cambodia         Asia       2007    59.7   14131858     1714.
##  5 China            Asia       2007    73.0 1318683096     4959.
##  6 Hong Kong, China Asia       2007    82.2    6980412    39725.
##  7 India            Asia       2007    64.7 1110396331     2452.
##  8 Indonesia        Asia       2007    70.6  223547000     3541.
##  9 Iran             Asia       2007    71.0   69453570    11606.
## 10 Iraq             Asia       2007    59.5   27499638     4471.
# Keep the 2007 rows for European countries and preview the first 10
europe <- filter(gapminder, continent == "Europe", year == 2007)
head(europe, n = 10)
## # A tibble: 10 × 6
##    country                continent  year lifeExp      pop gdpPercap
##    <fct>                  <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Albania                Europe     2007    76.4  3600523     5937.
##  2 Austria                Europe     2007    79.8  8199783    36126.
##  3 Belgium                Europe     2007    79.4 10392226    33693.
##  4 Bosnia and Herzegovina Europe     2007    74.9  4552198     7446.
##  5 Bulgaria               Europe     2007    73.0  7322858    10681.
##  6 Croatia                Europe     2007    75.7  4493312    14619.
##  7 Czech Republic         Europe     2007    76.5 10228744    22833.
##  8 Denmark                Europe     2007    78.3  5468120    35278.
##  9 Finland                Europe     2007    79.3  5238460    33207.
## 10 France                 Europe     2007    80.7 61083916    30470.
# Horizontal bar chart comparing life expectancy across the Asian countries
ggplot(asia, aes(x = country, y = lifeExp, fill = country)) +
  geom_col(width = 0.5) +
  coord_flip()

# Order the Asian countries by life expectancy, hide the legend, and store
# the plot in graph1.
# Left-assignment (`<-`) puts the target name at the start of the expression
# rather than after the last layer.
graph1 <- ggplot(
  data = asia,
  mapping = aes(x = reorder(country, lifeExp), y = lifeExp, fill = country)
) +
  geom_bar(stat = "identity", width = 0.6) +
  coord_flip() +
  theme(legend.position = "none") +
  labs(x = "", y = "Life Expectancy of Asia")
graph1

# Build the equivalent chart for Europe: countries ordered by life expectancy,
# legend hidden, plot stored in graph2.
# Left-assignment (`<-`) puts the target name at the start of the expression
# rather than after the last layer.
graph2 <- ggplot(
  data = europe,
  mapping = aes(x = reorder(country, lifeExp), y = lifeExp, fill = country)
) +
  geom_bar(stat = "identity", width = 0.9) +
  coord_flip() +
  theme(legend.position = "none") +
  labs(x = "", y = "Life Expectancy of Europe")
graph2