# Script setup: point package installation at the CRAN cloud mirror and make
# knitr echo the code of every chunk when the script is rendered.
options(repos = c(CRAN = "https://cloud.r-project.org/"))
knitr::opts_chunk$set(echo = TRUE)

# Attach the packages the analysis relies on before their first use: dplyr
# provides %>% and the data verbs, gapminder provides the dataset and ggplot2
# provides the plotting functions. Without dplyr attached, `%>%` is not found
# and `filter` resolves to stats::filter.
library(dplyr)
library(gapminder)
library(ggplot2)

# Working with dplyr: keep the rows for Vietnam in 1997, 2002 and 2007.
gapminder %>%
  filter(
    continent == "Asia",
    country == "Vietnam",
    year %in% c(1997, 2002, 2007)
  )
## # A tibble: 3 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Vietnam Asia 1997 70.7 76048996 1386.
## 2 Vietnam Asia 2002 73.0 80908147 1764.
## 3 Vietnam Asia 2007 74.2 85262356 2442.
# Summary statistics with summarise()
# Install gapminder only when it is missing, so re-running the script does not
# try to reinstall a package that is already installed or in use.
if (!requireNamespace("gapminder", quietly = TRUE)) {
  install.packages("gapminder")
}
library(dplyr)
library(gapminder)

# Calculate mean life expectancy for Vietnam in 2007
gapminder %>%
  filter(
    year == 2007,
    continent == "Asia",
    country == "Vietnam"
  ) %>%
  summarise(mean_life_exp = mean(lifeExp))
## # A tibble: 1 × 1
## mean_life_exp
## <dbl>
## 1 74.2
# Combining group_by() and summarise(): average life expectancy per continent
# for the 2007 observations.
gapminder %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarize(average_lifeExp = mean(lifeExp))
## # A tibble: 5 × 2
## continent average_lifeExp
## <fct> <dbl>
## 1 Africa 54.8
## 2 Americas 73.6
## 3 Asia 70.7
## 4 Europe 77.6
## 5 Oceania 80.7
# arrange(): total 2007 population per continent, largest first.
gapminder %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarize(total_pop = sum(pop)) %>%
  arrange(desc(total_pop))
## # A tibble: 5 × 2
## continent total_pop
## <fct> <dbl>
## 1 Asia 3811953827
## 2 Africa 929539692
## 3 Americas 898871184
## 4 Europe 586098529
## 5 Oceania 24549947
# mutate(): add a totalGDP column (GDP per capita times population) to the
# 2007 rows and show the first ten of them.
gapminder %>%
  filter(year == 2007) %>%
  mutate(totalGDP = gdpPercap * pop) %>%
  head(10)
## # A tibble: 10 × 7
## country continent year lifeExp pop gdpPercap totalGDP
## <fct> <fct> <int> <dbl> <int> <dbl> <dbl>
## 1 Afghanistan Asia 2007 43.8 31889923 975. 31079291949.
## 2 Albania Europe 2007 76.4 3600523 5937. 21376411360.
## 3 Algeria Africa 2007 72.3 33333216 6223. 207444851958.
## 4 Angola Africa 2007 42.7 12420476 4797. 59583895818.
## 5 Argentina Americas 2007 75.3 40301927 12779. 515033625357.
## 6 Australia Oceania 2007 81.2 20434176 34435. 703658358894.
## 7 Austria Europe 2007 79.8 8199783 36126. 296229400691.
## 8 Bahrain Asia 2007 75.6 708573 29796. 21112675360.
## 9 Bangladesh Asia 2007 64.1 150448339 1391. 209311822134.
## 10 Belgium Europe 2007 79.4 10392226 33693. 350141166520.
# DATA VISUALIZATION
# Keep only the 2007 observations; the plots below read `gapminder2007`.
gapminder2007 <- filter(gapminder, year == 2007)
head(gapminder2007, n = 10)
## # A tibble: 10 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 2007 43.8 31889923 975.
## 2 Albania Europe 2007 76.4 3600523 5937.
## 3 Algeria Africa 2007 72.3 33333216 6223.
## 4 Angola Africa 2007 42.7 12420476 4797.
## 5 Argentina Americas 2007 75.3 40301927 12779.
## 6 Australia Oceania 2007 81.2 20434176 34435.
## 7 Austria Europe 2007 79.8 8199783 36126.
## 8 Bahrain Asia 2007 75.6 708573 29796.
## 9 Bangladesh Asia 2007 64.1 150448339 1391.
## 10 Belgium Europe 2007 79.4 10392226 33693.
# Scatter plot: life expectancy against GDP per capita (log10 x axis),
# coloured by continent.
ggplot(gapminder2007, aes(gdpPercap, lifeExp, color = continent)) +
  geom_point() +
  scale_x_log10()

# Bubble chart: same scatter plot, with point size mapped to population.
ggplot(
  gapminder2007,
  aes(gdpPercap, lifeExp, color = continent, size = pop)
) +
  geom_point() +
  scale_x_log10()

# Draw a fitted trend line with geom_smooth().
# method = "loess" fits a local polynomial regression (LOESS) smoother; it is
# not an ordinary least-squares line (that would be method = "lm").
# `size = pop` is mapped inside geom_point() only: when it sits in the global
# aes() it is inherited by geom_smooth(), which cannot use it and warns that
# the aesthetic was dropped.
ggplot(data = gapminder, mapping = aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(color = continent, size = pop)) +
  geom_smooth(method = "loess") +
  scale_x_log10()
## `geom_smooth()` using formula = 'y ~ x'

# Label the chart: axis titles plus a bold, centred plot title.
# `size = pop` is mapped inside geom_point() only, so geom_smooth() does not
# inherit an aesthetic it cannot use (which previously triggered a warning).
library(ggplot2)
ggplot(data = gapminder, mapping = aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(color = continent, size = pop)) +
  geom_smooth(method = "loess") +
  scale_x_log10() +
  labs(x = "Log GDP per Capita", y = "Life Expectancy") +
  ggtitle("Association between GDP Per Capita and Life Expectancy") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))
## `geom_smooth()` using formula = 'y ~ x'

# Filter the data down to 2007. Left assignment keeps the target name at the
# start of the statement instead of hiding it at the end of the pipeline.
gapminder2007 <- gapminder %>%
  filter(year == 2007)
gapminder2007 %>%
  head(n = 10)
## # A tibble: 10 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 2007 43.8 31889923 975.
## 2 Albania Europe 2007 76.4 3600523 5937.
## 3 Algeria Africa 2007 72.3 33333216 6223.
## 4 Angola Africa 2007 42.7 12420476 4797.
## 5 Argentina Americas 2007 75.3 40301927 12779.
## 6 Australia Oceania 2007 81.2 20434176 34435.
## 7 Austria Europe 2007 79.8 8199783 36126.
## 8 Bahrain Asia 2007 75.6 708573 29796.
## 9 Bangladesh Asia 2007 64.1 150448339 1391.
## 10 Belgium Europe 2007 79.4 10392226 33693.
# Histogram of 2007 GDP per capita (20 bins, raw counts on the y axis).
library(ggplot2)
ggplot(gapminder2007, aes(x = gdpPercap)) +
  geom_histogram(bins = 20, fill = "#00adef", color = "red") +
  labs(y = "Frequency", title = "Distribution of GDP per Capita in 2007")

# Histogram of 2007 GDP per capita with each bin shown as a share of all
# observations (y axis formatted as percentages by scales::percent).
# after_stat() replaces the `..count..` notation deprecated in ggplot2 3.4.0.
library(scales)
ggplot(
  data = gapminder2007,
  mapping = aes(x = gdpPercap, y = after_stat(count / sum(count)))
) +
  geom_histogram(fill = "#00adef", color = "red", bins = 20) +
  scale_y_continuous(labels = percent) +
  labs(title = "Distribution of GDP per Capita in 2007", y = "Frequency")

# Ordinary probability density plot of GDP per capita, one semi-transparent
# filled curve per continent.
ggplot(gapminder2007, aes(x = gdpPercap, fill = continent)) +
  geom_density(alpha = 0.7)

# Ridge plot of GDP per capita by continent.
# Install ggridges only when it is missing, so the script does not download
# and reinstall the package on every run.
if (!requireNamespace("ggridges", quietly = TRUE)) {
  install.packages("ggridges")
}
library(ggridges)
ggplot(data = gapminder2007, aes(x = gdpPercap, y = continent, fill = continent)) +
  geom_density_ridges(alpha = 0.7) +
  theme_ridges() +
  labs(title = "Ridge Plot for GDP Per Capita") +
  theme(legend.position = "none")
## Picking joint bandwidth of 3510

# COMPARING AVERAGE LIFE EXPECTANCY BETWEEN COUNTRIES
# Keep the 2007 rows for countries in Asia.
asia <- filter(gapminder, continent == "Asia", year == 2007)
head(asia, n = 10)
## # A tibble: 10 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 2007 43.8 31889923 975.
## 2 Bahrain Asia 2007 75.6 708573 29796.
## 3 Bangladesh Asia 2007 64.1 150448339 1391.
## 4 Cambodia Asia 2007 59.7 14131858 1714.
## 5 China Asia 2007 73.0 1318683096 4959.
## 6 Hong Kong, China Asia 2007 82.2 6980412 39725.
## 7 India Asia 2007 64.7 1110396331 2452.
## 8 Indonesia Asia 2007 70.6 223547000 3541.
## 9 Iran Asia 2007 71.0 69453570 11606.
## 10 Iraq Asia 2007 59.5 27499638 4471.
# Keep the 2007 rows for countries in Europe.
europe <- filter(gapminder, continent == "Europe", year == 2007)
head(europe, n = 10)
## # A tibble: 10 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Albania Europe 2007 76.4 3600523 5937.
## 2 Austria Europe 2007 79.8 8199783 36126.
## 3 Belgium Europe 2007 79.4 10392226 33693.
## 4 Bosnia and Herzegovina Europe 2007 74.9 4552198 7446.
## 5 Bulgaria Europe 2007 73.0 7322858 10681.
## 6 Croatia Europe 2007 75.7 4493312 14619.
## 7 Czech Republic Europe 2007 76.5 10228744 22833.
## 8 Denmark Europe 2007 78.3 5468120 35278.
## 9 Finland Europe 2007 79.3 5238460 33207.
## 10 France Europe 2007 80.7 61083916 30470.
# Visualise the Asia data for comparison: one horizontal bar per country,
# bar length = life expectancy.
ggplot(asia, aes(x = country, y = lifeExp, fill = country)) +
  geom_col(width = 0.9) +
  coord_flip()

# Reorder the Asian countries by life expectancy and store the plot as graph1.
# Left assignment keeps the object name visible at the start of the expression.
graph1 <- ggplot(
  data = asia,
  mapping = aes(x = reorder(country, lifeExp), y = lifeExp, fill = country)
) +
  geom_col(width = 0.9) +
  coord_flip() +
  theme(legend.position = "none") +
  labs(x = "", y = "Life Expectancy of Asia")
graph1

# Same for Europe: reorder the countries by life expectancy and store the plot
# as graph2. Left assignment keeps the object name visible at the start.
graph2 <- ggplot(
  data = europe,
  mapping = aes(x = reorder(country, lifeExp), y = lifeExp, fill = country)
) +
  geom_col(width = 0.9) +
  coord_flip() +
  theme(legend.position = "none") +
  labs(x = "", y = "Life Expectancy of Europe")
graph2

# Show several plots at once: graph1 and graph2 side by side.
# Install gridExtra only when it is missing, so the script does not download
# and reinstall the package on every run.
if (!requireNamespace("gridExtra", quietly = TRUE)) {
  install.packages("gridExtra")
}
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(graph1, graph2, ncol = 2)
