ds<- gapminder::gapminder
# colnames(ds)
# [1] "country" "continent" "year" "lifeExp" "pop" "gdpPercap"
select the first 3 columns
# you have many options
ds %>% select(1:3) %>% head
## # A tibble: 6 × 3
## country continent year
## <fct> <fct> <int>
## 1 Afghanistan Asia 1952
## 2 Afghanistan Asia 1957
## 3 Afghanistan Asia 1962
## 4 Afghanistan Asia 1967
## 5 Afghanistan Asia 1972
## 6 Afghanistan Asia 1977
# ds %>% select(1,2,3)
# ds %>% select(country:year)
# ds %>% select(-c(4:6))
# ds %>% select(-c(lifeExp:gdpPercap))
# ds %>% select(contains("c"))
filter data for Jordan and the year 2007
ds %>% filter(country=="Jordan") %>% filter(year=="2007")
## # A tibble: 1 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Jordan Asia 2007 72.5 6053193 4519.
ds %>% filter(country=="Jordan" & year=="2007")
## # A tibble: 1 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Jordan Asia 2007 72.5 6053193 4519.
ds %>% filter(country=="Jordan" , year=="2007")
## # A tibble: 1 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Jordan Asia 2007 72.5 6053193 4519.
What was the total population of the world in 2007
ds %>% filter(year=="2007") %>% summarise(Population=sum(pop))
## # A tibble: 1 × 1
## Population
## <dbl>
## 1 6251013179
What was the the population of each continent in 2007?
ds %>% filter(year=="2007") %>% group_by(continent) %>% summarise(Population_per_continent=sum(pop))
## # A tibble: 5 × 2
## continent Population_per_continent
## <fct> <dbl>
## 1 Africa 929539692
## 2 Americas 898871184
## 3 Asia 3811953827
## 4 Europe 586098529
## 5 Oceania 24549947
GDP of each contry is population X gdpPercap. Calculate sum GDP for the whole world in 2007
ds %>% filter(year=="2007") %>% mutate(GDP=pop*gdpPercap) %>% summarise(Total_GDP=sum(GDP))
## # A tibble: 1 × 1
## Total_GDP
## <dbl>
## 1 5.81e13
GDP in JOD
ds %>% filter(year=="2007") %>% mutate(GDP=pop*gdpPercap, GDP_JOD=GDP/0.71) %>% summarise(Total_GDP=sum(GDP))
## # A tibble: 1 × 1
## Total_GDP
## <dbl>
## 1 5.81e13
What is the percentage of each country population per its continent’s total in 2007?
ds %>% filter(year=="2007") %>% group_by(continent) %>% mutate(cont_pop=sum(pop)) %>% ungroup() %>% mutate(country_Percent=pop*100/cont_pop) %>% head
## # A tibble: 6 × 8
## country continent year lifeExp pop gdpPercap cont_pop country_Pe…¹
## <fct> <fct> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 Afghanistan Asia 2007 43.8 31889923 975. 3811953827 0.837
## 2 Albania Europe 2007 76.4 3600523 5937. 586098529 0.614
## 3 Algeria Africa 2007 72.3 33333216 6223. 929539692 3.59
## 4 Angola Africa 2007 42.7 12420476 4797. 929539692 1.34
## 5 Argentina Americas 2007 75.3 40301927 12779. 898871184 4.48
## 6 Australia Oceania 2007 81.2 20434176 34435. 24549947 83.2
## # … with abbreviated variable name ¹country_Percent
Arrange countries based on gpdOercap in 2007, maximum values on top.
ds %>% filter(year=="2007") %>% arrange(-gdpPercap) %>% head
## # A tibble: 6 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Norway Europe 2007 80.2 4627926 49357.
## 2 Kuwait Asia 2007 77.6 2505559 47307.
## 3 Singapore Asia 2007 80.0 4553009 47143.
## 4 United States Americas 2007 78.2 301139947 42952.
## 5 Ireland Europe 2007 78.9 4109086 40676.
## 6 Hong Kong, China Asia 2007 82.2 6980412 39725.
Show the year that had the highest gpd for each country
ds %>% arrange(-gdpPercap) %>% distinct(country)
## # A tibble: 142 × 1
## country
## <fct>
## 1 Kuwait
## 2 Norway
## 3 Singapore
## 4 United States
## 5 Ireland
## 6 Hong Kong, China
## 7 Switzerland
## 8 Netherlands
## 9 Canada
## 10 Iceland
## # … with 132 more rows
ds %>% arrange(-gdpPercap) %>% distinct(country, .keep_all = T) %>% head
## # A tibble: 6 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Kuwait Asia 1957 58.0 212846 113523.
## 2 Norway Europe 2007 80.2 4627926 49357.
## 3 Singapore Asia 2007 80.0 4553009 47143.
## 4 United States Americas 2007 78.2 301139947 42952.
## 5 Ireland Europe 2007 78.9 4109086 40676.
## 6 Hong Kong, China Asia 2007 82.2 6980412 39725.
make a table that has columns as names of continents, rows as years, and cells as total population per continent.
ds %>% select(continent, year, pop) %>% group_by(year, continent) %>% summarise(popPerCont=sum(pop)) %>% ungroup() %>% spread(continent, popPerCont)
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 6
## year Africa Americas Asia Europe Oceania
## <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1952 237640501 345152446 1395357351 418120846 10686006
## 2 1957 264837738 386953916 1562780599 437890351 11941976
## 3 1962 296516865 433270254 1696357182 460355155 13283518
## 4 1967 335289489 480746623 1905662900 481178958 14600414
## 5 1972 379879541 529384210 2150972248 500635059 16106100
## 6 1977 433061021 578067699 2384513556 517164531 17239000
## 7 1982 499348587 630290920 2610135582 531266901 18394850
## 8 1987 574834110 682753971 2871220762 543094160 19574415
## 9 1992 659081517 739274104 3133292191 558142797 20919651
## 10 1997 743832984 796900410 3383285500 568944148 22241430
## 11 2002 833723916 849772762 3601802203 578223869 23454829
## 12 2007 929539692 898871184 3811953827 586098529 24549947
What are the top 5 countries that have the best life expectency in 2007?
ds %>% filter(year=="2007") %>% slice_max(lifeExp, n=5)
## # A tibble: 5 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Japan Asia 2007 82.6 127467972 31656.
## 2 Hong Kong, China Asia 2007 82.2 6980412 39725.
## 3 Iceland Europe 2007 81.8 301931 36181.
## 4 Switzerland Europe 2007 81.7 7554661 37506.
## 5 Australia Oceania 2007 81.2 20434176 34435.
ds %>% filter(year=="2007") %>% slice_max(lifeExp, n=5) %>% pull(country)
## [1] Japan Hong Kong, China Iceland Switzerland
## [5] Australia
## 142 Levels: Afghanistan Albania Algeria Angola Argentina Australia ... Zimbabwe
Which country had the highest gpdPercap in each continent in 2007?
ds %>% filter(year=="2007") %>% group_by(continent) %>% slice_max(gdpPercap, n=1) %>% select(continent, country)
## # A tibble: 5 × 2
## # Groups: continent [5]
## continent country
## <fct> <fct>
## 1 Africa Gabon
## 2 Americas United States
## 3 Asia Kuwait
## 4 Europe Norway
## 5 Oceania Australia
how many contries had a population more than / lesss than 10 million in 2007
ds %>% filter(year=="2007") %>% mutate(pop10M=ifelse(pop>10e6, "More than 10 M", "Less than 10M")) %>% count(pop10M)
## # A tibble: 2 × 2
## pop10M n
## <chr> <int>
## 1 Less than 10M 66
## 2 More than 10 M 76
Change the name of Jordan to the Hashemite Kingdom of Jordan
ds %>% mutate(country=recode(country, Jordan="Hashemite Kingdom of Jordan")) %>% head
## # A tibble: 6 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
ds %>% mutate(across(is.numeric, sqrt ))%>% head
## Warning: Predicate functions must be wrapped in `where()`.
##
## # Bad
## data %>% select(is.numeric)
##
## # Good
## data %>% select(where(is.numeric))
##
## ℹ Please update your code.
## This message is displayed once per session.
## # A tibble: 6 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan Asia 44.2 5.37 2903. 27.9
## 2 Afghanistan Asia 44.2 5.51 3040. 28.7
## 3 Afghanistan Asia 44.3 5.66 3204. 29.2
## 4 Afghanistan Asia 44.4 5.83 3397. 28.9
## 5 Afghanistan Asia 44.4 6.01 3617. 27.2
## 6 Afghanistan Asia 44.5 6.20 3858. 28.0
https://dplyr.tidyverse.org/reference/group_map.html ## group_map for each continents , is there a significant association between the gpdPercap and lifeExp in 2007. show p value of linear regression models.
lmPval <- function(DATA) {
broom::tidy(lm(DATA[["lifeExp"]]~DATA[["gdpPercap"]]))$p.value[2]
}
ds %>% filter(year=="2007") %>% group_by(continent) %>% group_map(~lmPval(.x)) %>% unlist
## [1] 4.861750e-03 1.866495e-03 9.130069e-06 2.795338e-09 NaN
ds %>%
filter(year=="2007") %>%
group_by(continent) %>%
group_modify(~broom::tidy(lm(lifeExp~gdpPercap, data=.x))) %>%
filter(term=="gdpPercap")
## # A tibble: 5 × 6
## # Groups: continent [5]
## continent term estimate std.error statistic p.value
## <fct> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Africa gdpPercap 0.00102 0.000347 2.95 4.86e-3
## 2 Americas gdpPercap 0.000270 0.0000769 3.51 1.87e-3
## 3 Asia gdpPercap 0.000388 0.0000732 5.30 9.13e-6
## 4 Europe gdpPercap 0.000215 0.0000251 8.54 2.80e-9
## 5 Oceania gdpPercap 0.000111 NaN NaN NaN