# Use the CRAN cloud mirror so package installation works non-interactively.
options(repos = c(CRAN = "https://cloud.r-project.org/"))
knitr::opts_chunk$set(echo = TRUE)

# Attach the packages before the first pipeline runs: dplyr supplies %>% and
# filter(), and gapminder supplies the `gapminder` data set. Without this the
# pipeline below fails when the script is run from the top in a fresh session.
if (!requireNamespace("gapminder", quietly = TRUE)) {
  install.packages("gapminder")
}
library(dplyr)
library(gapminder)

# Working with dplyr ----
# Keep only Vietnam's rows for 1997, 2002 and 2007.
gapminder %>%
  filter(
    continent == "Asia",
    country == "Vietnam",
    year %in% c(1997, 2002, 2007)
  )
## # A tibble: 3 × 6
##   country continent  year lifeExp      pop gdpPercap
##   <fct>   <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Vietnam Asia       1997    70.7 76048996     1386.
## 2 Vietnam Asia       2002    73.0 80908147     1764.
## 3 Vietnam Asia       2007    74.2 85262356     2442.
# Summary statistics with summarise() ----
# Install gapminder only when it is missing: an unconditional
# install.packages() call re-downloads on every run and warns when the
# package is already loaded.
if (!requireNamespace("gapminder", quietly = TRUE)) {
  install.packages("gapminder")
}
library(dplyr)
library(gapminder)

# Calculate mean life expectancy for Vietnam in 2007.
gapminder %>%
  filter(
    year == 2007,
    continent == "Asia",
    country == "Vietnam"
  ) %>%
  summarise(mean_life_exp = mean(lifeExp))
## # A tibble: 1 × 1
##   mean_life_exp
##           <dbl>
## 1          74.2
# Combining group_by() and summarise(): mean life expectancy per continent,
# using the 2007 rows only.
gapminder |>
  filter(year == 2007) |>
  group_by(continent) |>
  summarize(average_lifeExp = mean(lifeExp))
## # A tibble: 5 × 2
##   continent average_lifeExp
##   <fct>               <dbl>
## 1 Africa               54.8
## 2 Americas             73.6
## 3 Asia                 70.7
## 4 Europe               77.6
## 5 Oceania              80.7
# arrange(): total 2007 population per continent, largest total first.
gapminder |>
  filter(year == 2007) |>
  group_by(continent) |>
  summarize(total_pop = sum(pop)) |>
  arrange(desc(total_pop))
## # A tibble: 5 × 2
##   continent  total_pop
##   <fct>          <dbl>
## 1 Asia      3811953827
## 2 Africa     929539692
## 3 Americas   898871184
## 4 Europe     586098529
## 5 Oceania     24549947
# mutate(): add a totalGDP column (GDP per capita times population) to the
# 2007 rows and show the first ten.
gapminder |>
  filter(year == 2007) |>
  mutate(totalGDP = pop * gdpPercap) |>
  head(10)
## # A tibble: 10 × 7
##    country     continent  year lifeExp       pop gdpPercap      totalGDP
##    <fct>       <fct>     <int>   <dbl>     <int>     <dbl>         <dbl>
##  1 Afghanistan Asia       2007    43.8  31889923      975.  31079291949.
##  2 Albania     Europe     2007    76.4   3600523     5937.  21376411360.
##  3 Algeria     Africa     2007    72.3  33333216     6223. 207444851958.
##  4 Angola      Africa     2007    42.7  12420476     4797.  59583895818.
##  5 Argentina   Americas   2007    75.3  40301927    12779. 515033625357.
##  6 Australia   Oceania    2007    81.2  20434176    34435. 703658358894.
##  7 Austria     Europe     2007    79.8   8199783    36126. 296229400691.
##  8 Bahrain     Asia       2007    75.6    708573    29796.  21112675360.
##  9 Bangladesh  Asia       2007    64.1 150448339     1391. 209311822134.
## 10 Belgium     Europe     2007    79.4  10392226    33693. 350141166520.
# DATA VISUALIZATION ----
# Keep only the 2007 observations, then preview the first ten rows.
gapminder2007 <- filter(gapminder, year == 2007)
head(gapminder2007, n = 10)
## # A tibble: 10 × 6
##    country     continent  year lifeExp       pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>     <int>     <dbl>
##  1 Afghanistan Asia       2007    43.8  31889923      975.
##  2 Albania     Europe     2007    76.4   3600523     5937.
##  3 Algeria     Africa     2007    72.3  33333216     6223.
##  4 Angola      Africa     2007    42.7  12420476     4797.
##  5 Argentina   Americas   2007    75.3  40301927    12779.
##  6 Australia   Oceania    2007    81.2  20434176    34435.
##  7 Austria     Europe     2007    79.8   8199783    36126.
##  8 Bahrain     Asia       2007    75.6    708573    29796.
##  9 Bangladesh  Asia       2007    64.1 150448339     1391.
## 10 Belgium     Europe     2007    79.4  10392226    33693.
 # Visual exploration ----
# ggplot2 is attached here because this is the first section that calls
# ggplot(); without it the script fails when run from the top.
library(ggplot2)

# Scatter plot: GDP per capita (log10 x-axis) against life expectancy,
# colored by continent.
ggplot(
  data = gapminder2007,
  mapping = aes(x = gdpPercap, y = lifeExp, color = continent)
) +
  geom_point() +
  scale_x_log10()

# Bubble chart: the same plot with point size mapped to population.
ggplot(
  data = gapminder2007,
  mapping = aes(x = gdpPercap, y = lifeExp, color = continent, size = pop)
) +
  geom_point() +
  scale_x_log10()

# Add a smoothed trend line with geom_smooth().
# method = "loess" fits a local polynomial regression (LOESS) smoother; it is
# not a single global least-squares line.
# `size = pop` is mapped inside geom_point() only. When it sits in the
# ggplot() mapping, geom_smooth() inherits it as well, which ggplot2 reports
# as a deprecated `size` aesthetic for lines and as a dropped aesthetic.
ggplot(data = gapminder, mapping = aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(color = continent, size = pop)) +
  geom_smooth(method = "loess") +
  scale_x_log10()
## `geom_smooth()` using formula = 'y ~ x'

 # Label the chart: axis labels plus a bold, centered title.
library(ggplot2)

# `size = pop` is mapped inside geom_point() only, so geom_smooth() does not
# inherit an aesthetic it cannot use (ggplot2 warns when it has to drop it).
ggplot(data = gapminder, mapping = aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(color = continent, size = pop)) +
  geom_smooth(method = "loess") +
  scale_x_log10() +
  labs(x = "Log GDP per Capita", y = "Life Expectancy") +
  ggtitle("Association between GDP Per Capita and Life Expectancy") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))
## `geom_smooth()` using formula = 'y ~ x'

 # Keep only the 2007 observations, then preview the first ten rows.
# Left-assignment puts the target name first so it is visible at a glance.
gapminder2007 <- gapminder %>%
  filter(year == 2007)
gapminder2007 %>%
  head(n = 10)
## # A tibble: 10 × 6
##    country     continent  year lifeExp       pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>     <int>     <dbl>
##  1 Afghanistan Asia       2007    43.8  31889923      975.
##  2 Albania     Europe     2007    76.4   3600523     5937.
##  3 Algeria     Africa     2007    72.3  33333216     6223.
##  4 Angola      Africa     2007    42.7  12420476     4797.
##  5 Argentina   Americas   2007    75.3  40301927    12779.
##  6 Australia   Oceania    2007    81.2  20434176    34435.
##  7 Austria     Europe     2007    79.8   8199783    36126.
##  8 Bahrain     Asia       2007    75.6    708573    29796.
##  9 Bangladesh  Asia       2007    64.1 150448339     1391.
## 10 Belgium     Europe     2007    79.4  10392226    33693.
 # Histogram of 2007 GDP per capita, split into 20 bins.
library(ggplot2)
ggplot(gapminder2007, aes(x = gdpPercap)) +
  geom_histogram(bins = 20, color = "red", fill = "#00adef") +
  ggtitle("Distribution of GDP per Capita in 2007") +
  ylab("Frequency")

# Histogram with the y-axis showing each bin's share of all observations,
# formatted as a percentage by scales::percent.
library(scales)
ggplot(
  data = gapminder2007,
  # after_stat() replaces the `..count..` notation deprecated in ggplot2 3.4.0.
  mapping = aes(x = gdpPercap, y = after_stat(count / sum(count)))
) +
  geom_histogram(fill = "#00adef", color = "red", bins = 20) +
  scale_y_continuous(labels = percent) +
  labs(title = "DistributionofGDP perCapitain 2007", y = "Frequency")

 # Plain density plot: overlaid GDP-per-capita densities, one per continent.
ggplot(gapminder2007, aes(x = gdpPercap, fill = continent)) +
  geom_density(alpha = 0.7)

# Ridge plot of GDP per capita, one ridge per continent.
# Install ggridges only when it is missing, so the script does not
# re-download the package on every run.
if (!requireNamespace("ggridges", quietly = TRUE)) {
  install.packages("ggridges")
}
library(ggridges)
ggplot(data = gapminder2007, aes(x = gdpPercap, y = continent, fill = continent)) +
  geom_density_ridges(alpha = 0.7) +
  theme_ridges() +
  labs(title = "Ridge Plot for GDP Per Capita") +
  # The y-axis already names each continent, so the fill legend is hidden.
  theme(legend.position = "none")
## Picking joint bandwidth of 3510

# COMPARING LIFE EXPECTANCY BETWEEN COUNTRIES ----
# Asian countries, 2007 only; preview the first ten rows.
asia <- filter(gapminder, continent == "Asia", year == 2007)
head(asia, n = 10)
## # A tibble: 10 × 6
##    country          continent  year lifeExp        pop gdpPercap
##    <fct>            <fct>     <int>   <dbl>      <int>     <dbl>
##  1 Afghanistan      Asia       2007    43.8   31889923      975.
##  2 Bahrain          Asia       2007    75.6     708573    29796.
##  3 Bangladesh       Asia       2007    64.1  150448339     1391.
##  4 Cambodia         Asia       2007    59.7   14131858     1714.
##  5 China            Asia       2007    73.0 1318683096     4959.
##  6 Hong Kong, China Asia       2007    82.2    6980412    39725.
##  7 India            Asia       2007    64.7 1110396331     2452.
##  8 Indonesia        Asia       2007    70.6  223547000     3541.
##  9 Iran             Asia       2007    71.0   69453570    11606.
## 10 Iraq             Asia       2007    59.5   27499638     4471.
# European countries, 2007 only; preview the first ten rows.
europe <- filter(gapminder, continent == "Europe", year == 2007)
head(europe, n = 10)
## # A tibble: 10 × 6
##    country                continent  year lifeExp      pop gdpPercap
##    <fct>                  <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Albania                Europe     2007    76.4  3600523     5937.
##  2 Austria                Europe     2007    79.8  8199783    36126.
##  3 Belgium                Europe     2007    79.4 10392226    33693.
##  4 Bosnia and Herzegovina Europe     2007    74.9  4552198     7446.
##  5 Bulgaria               Europe     2007    73.0  7322858    10681.
##  6 Croatia                Europe     2007    75.7  4493312    14619.
##  7 Czech Republic         Europe     2007    76.5 10228744    22833.
##  8 Denmark                Europe     2007    78.3  5468120    35278.
##  9 Finland                Europe     2007    79.3  5238460    33207.
## 10 France                 Europe     2007    80.7 61083916    30470.
# Horizontal bar chart of life expectancy for each Asian country, for
# comparison. Bar heights come straight from the lifeExp column.
ggplot(asia, aes(x = country, y = lifeExp, fill = country)) +
  geom_col(width = 0.9) +
  coord_flip()

 # Order the Asian countries by life expectancy, hide the legend, and store
# the plot as graph1.
# Left-assignment puts the object name first instead of hiding it at the end
# of the plot chain.
graph1 <- ggplot(
  data = asia,
  mapping = aes(x = reorder(country, lifeExp), y = lifeExp, fill = country)
) +
  geom_col(width = 0.9) +
  coord_flip() +
  theme(legend.position = "none") +
  labs(x = "", y = "LifeExpectancyofAsia")
graph1

 # Same chart for Europe: countries ordered by life expectancy, legend
# hidden, stored as graph2.
# Left-assignment puts the object name first instead of hiding it at the end
# of the plot chain.
graph2 <- ggplot(
  data = europe,
  mapping = aes(x = reorder(country, lifeExp), y = lifeExp, fill = country)
) +
  geom_col(width = 0.9) +
  coord_flip() +
  theme(legend.position = "none") +
  labs(x = "", y = "LifeExpectancyofEurope")
graph2

# Show several plots at once.
# Install gridExtra only when it is missing, so the script does not
# re-download the package on every run.
if (!requireNamespace("gridExtra", quietly = TRUE)) {
  install.packages("gridExtra")
}
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Place the Asia and Europe charts side by side in two columns.
grid.arrange(graph1, graph2, ncol = 2)