# Load the local life-expectancy CSV into `data`.
data <- read_csv(file = "../00_data/myData.csv")
## Rows: 20755 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Entity, Code
## dbl (2): Year, LifeExpectancy
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Fetch the TidyTuesday (2023-12-05) life-expectancy-at-different-ages CSV.
data2 <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-12-05/life_expectancy_different_ages.csv"
)
## Rows: 20755 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Entity, Code
## dbl (7): Year, LifeExpectancy0, LifeExpectancy10, LifeExpectancy25, LifeExpe...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
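Describe the two datasets:
Data 1: Life Expectancy (sampled as data_small)
Columns: life expectancy, entity (year is in the full data but is not selected) Rows: 10
Data 2: Life Expectancy at 25 (sampled as data2_small) Columns: life expectancy (25),
entity (year is in the full data but is not selected)
Rows: 10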
# Fix the RNG state so both 10-row samples are reproducible.
set.seed(123495)

# Keep only the value column and the Entity key, then draw 10 random rows.
# `data` is sampled before `data2` so the random draws happen in the same order.
data_small <- sample_n(select(data, LifeExpectancy, Entity), 10)
data2_small <- sample_n(select(data2, LifeExpectancy25, Entity), 10)

# Show the first sample.
data_small
## # A tibble: 10 × 2
## LifeExpectancy Entity
## <dbl> <chr>
## 1 66.6 Eritrea
## 2 25 Guatemala
## 3 71.1 Belarus
## 4 36.2 East Timor
## 5 58.9 Least developed countries
## 6 67.8 Fiji
## 7 61.9 Marshall Islands
## 8 46.0 South Africa
## 9 48.2 Greece
## 10 56.7 Belgium
# Show the 10-row sample drawn from data2 (LifeExpectancy25 and Entity).
data2_small
## # A tibble: 10 × 2
## LifeExpectancy25 Entity
## <dbl> <chr>
## 1 72.9 Hungary
## 2 69.1 Saint Lucia
## 3 74.4 Antigua and Barbuda
## 4 72.5 Belgium
## 5 66.2 Saint Pierre and Miquelon
## 6 62.4 Mozambique
## 7 74.4 Bahrain
## 8 71.4 Armenia
## 9 65.3 Cayman Islands
## 10 77.2 China
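Describe the resulting data:
Columns: Entity, Life Expectancy, Life Expectancy at 25 Rows: 1
How is it different from the original two datasets? It has only 1 row (Belgium, the only entity that appears in both samples), compared to over 20,000 rows in each original dataset, and it contains only three columns: entity, life expectancy, and life expectancy at 25.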
# Inner join: keep only the entities that appear in both samples.
inner_join(data_small, data2_small, by = "Entity")
## # A tibble: 1 × 3
## LifeExpectancy Entity LifeExpectancy25
## <dbl> <chr> <dbl>
## 1 56.7 Belgium 72.5
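Describe the resulting data:
Columns: Life expectancy, Entity, Life expectancy at 25 Rows: 10
How is it different from the two original datasets? It has 10 rows compared to over 20,000, and it does not include age or year. Every row of the first sample is kept; life expectancy at 25 is NA wherever the entity has no match in the second sample.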
# Left join: keep every row of data_small; entities with no match in
# data2_small get NA for LifeExpectancy25.
left_join(data_small, data2_small, by = "Entity")
## # A tibble: 10 × 3
## LifeExpectancy Entity LifeExpectancy25
## <dbl> <chr> <dbl>
## 1 66.6 Eritrea NA
## 2 25 Guatemala NA
## 3 71.1 Belarus NA
## 4 36.2 East Timor NA
## 5 58.9 Least developed countries NA
## 6 67.8 Fiji NA
## 7 61.9 Marshall Islands NA
## 8 46.0 South Africa NA
## 9 48.2 Greece NA
## 10 56.7 Belgium 72.5
Describe the resulting data: Columns: Life expectancy, Entity, Life expectancy at 25 Rows: 10
How is it different from the original data set? It has only 10 rows. Life expectancy is NA for most rows because those entities do not appear in the first sample, but life expectancy at 25 is listed for every country in the result.
# Right join: keep every row of data2_small; entities with no match in
# data_small get NA for LifeExpectancy.
right_join(data_small, data2_small, by = "Entity")
## # A tibble: 10 × 3
## LifeExpectancy Entity LifeExpectancy25
## <dbl> <chr> <dbl>
## 1 56.7 Belgium 72.5
## 2 NA Hungary 72.9
## 3 NA Saint Lucia 69.1
## 4 NA Antigua and Barbuda 74.4
## 5 NA Saint Pierre and Miquelon 66.2
## 6 NA Mozambique 62.4
## 7 NA Bahrain 74.4
## 8 NA Armenia 71.4
## 9 NA Cayman Islands 65.3
## 10 NA China 77.2