Challenge 8

Author

Jingyi Yang

1. Start Up

knitr::opts_chunk$set(echo = TRUE)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(here)
here() starts at C:/8-601
library(readr)
library(readxl)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("countrycode")

2. Import and clean the data

I clean the data through the following:

  1. Delete the column that contains “Code,” which includes repetitive information.

  2. Delete the “Unit,” “Element,” and “Domain” columns, which only contain one variable (in the data set “Livestock”) or might not be used in data analysis.

  3. Filter the aggregated data.

  4. Delete the “Flag” and “Flag Description column, which is not very helpful for data analysis.

  5. Keep the information in the “Country_groups” column in the “Country_groups” data set only if they refer to the continent.

Cattle_dairy<- here("challenge_datasets","FAOSTAT_cattle_dairy.csv") %>%
  read_csv()%>%
  select(-c(contains("Code"), Element, Domain, Unit))%>%
  filter(Flag!="A") %>%
  select(-c(Flag,`Flag Description`))
Rows: 36449 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (8): Domain Code, Domain, Area, Element, Item, Unit, Flag, Flag Description
dbl (6): Area Code, Element Code, Item Code, Year Code, Year, Value

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Cattle_dairy
# A tibble: 23,335 × 4
   Area        Item                   Year  Value
   <chr>       <chr>                 <dbl>  <dbl>
 1 Afghanistan Milk, whole fresh cow  1961 700000
 2 Afghanistan Milk, whole fresh cow  1961   5000
 3 Afghanistan Milk, whole fresh cow  1961 350000
 4 Afghanistan Milk, whole fresh cow  1962 700000
 5 Afghanistan Milk, whole fresh cow  1962   5000
 6 Afghanistan Milk, whole fresh cow  1962 350000
 7 Afghanistan Milk, whole fresh cow  1963 780000
 8 Afghanistan Milk, whole fresh cow  1963   5128
 9 Afghanistan Milk, whole fresh cow  1963 400000
10 Afghanistan Milk, whole fresh cow  1964 780000
# ℹ 23,325 more rows
continent <- unique(codelist$continent)

Country_groups <- here("challenge_datasets","FAOSTAT_country_groups.csv") %>%
  read_csv() %>%
  select(- contains("Code"))%>%
  rename("Area"= "Country")%>%
  filter(`Country Group`%in% continent)
Rows: 1943 Columns: 7
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): Country Group, Country, M49 Code, ISO2 Code, ISO3 Code
dbl (2): Country Group Code, Country Code

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Country_groups
# A tibble: 276 × 2
   `Country Group` Area                    
   <chr>           <chr>                   
 1 Africa          Algeria                 
 2 Africa          Angola                  
 3 Africa          Benin                   
 4 Africa          Botswana                
 5 Africa          Burkina Faso            
 6 Africa          Burundi                 
 7 Africa          Cabo Verde              
 8 Africa          Cameroon                
 9 Africa          Central African Republic
10 Africa          Chad                    
# ℹ 266 more rows
Egg_chicken <- here("challenge_datasets","FAOSTAT_egg_chicken.csv") %>%
  read_csv()%>%
  select(-c(contains("Code"), Element, Domain, Unit))%>%
  filter(Flag!="A") %>%
  select(-c(Flag,`Flag Description`))
Rows: 38170 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (8): Domain Code, Domain, Area, Element, Item, Unit, Flag, Flag Description
dbl (6): Area Code, Element Code, Item Code, Year Code, Year, Value

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Egg_chicken
# A tibble: 27,436 × 4
   Area        Item                 Year Value
   <chr>       <chr>               <dbl> <dbl>
 1 Afghanistan Eggs, hen, in shell  1961  4000
 2 Afghanistan Eggs, hen, in shell  1961 25000
 3 Afghanistan Eggs, hen, in shell  1961 10000
 4 Afghanistan Eggs, hen, in shell  1962  4400
 5 Afghanistan Eggs, hen, in shell  1962 25000
 6 Afghanistan Eggs, hen, in shell  1962 11000
 7 Afghanistan Eggs, hen, in shell  1963  4600
 8 Afghanistan Eggs, hen, in shell  1963 25000
 9 Afghanistan Eggs, hen, in shell  1963 11500
10 Afghanistan Eggs, hen, in shell  1964  4800
# ℹ 27,426 more rows
Livestock <- here("challenge_datasets","FAOSTAT_livestock.csv") %>%
  read_csv()%>%
  select(-c(contains("Code"), Element, Domain, Unit))%>%
  filter(Flag!="A") %>%
  select(-c(Flag,`Flag Description`))
Rows: 82116 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (8): Domain Code, Domain, Area, Element, Item, Unit, Flag, Flag Description
dbl (6): Area Code, Element Code, Item Code, Year Code, Year, Value

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Livestock
# A tibble: 31,279 × 4
   Area        Item   Year   Value
   <chr>       <chr> <dbl>   <dbl>
 1 Afghanistan Asses  1964 1150000
 2 Afghanistan Asses  1973 1250000
 3 Afghanistan Asses  1974 1250000
 4 Afghanistan Asses  1975 1250000
 5 Afghanistan Asses  1976 1250000
 6 Afghanistan Asses  1978 1300000
 7 Afghanistan Asses  1979 1300000
 8 Afghanistan Asses  1980 1295000
 9 Afghanistan Asses  1981 1315000
10 Afghanistan Asses  1982 1315000
# ℹ 31,269 more rows

3. Join and Analysis

I use the left_join(), right_join(), and full_join to add information about the continent in the summary table and join several data sets together for analysis. I use inner_join(), semi_join(), andanti_join() functions to clean the information that is not available and filter the values in the “Area” column based on whether or not they represent the country.

 Cattle_summary <- Cattle_dairy %>%
  group_by(Area)%>%
  summarise(n= n(), total_value= sum(Value,na.rm = T), m_value=mean(Value,na.rm = T))%>%
  ungroup()

right_join(Country_groups,Cattle_summary, by="Area")
# A tibble: 232 × 5
   `Country Group` Area                         n total_value m_value
   <chr>           <chr>                    <int>       <dbl>   <dbl>
 1 Africa          Algeria                    123    34152880 277666.
 2 Africa          Angola                     173    28426778 164317.
 3 Africa          Benin                      131     5929620  45264.
 4 Africa          Botswana                   174    28585119 164282.
 5 Africa          Burkina Faso               106    17596097 166001.
 6 Africa          Burundi                    155     7216303  46557.
 7 Africa          Cabo Verde                 153      579526   3788.
 8 Africa          Cameroon                   172    17872197 103908.
 9 Africa          Central African Republic   170    12150128  71471.
10 Africa          Chad                       174    40511532 232825.
# ℹ 222 more rows
inner_join(Cattle_summary, Country_groups) ## clean the information that is not available
Joining with `by = join_by(Area)`
# A tibble: 204 × 5
   Area                    n total_value  m_value `Country Group`
   <chr>               <int>       <dbl>    <dbl> <chr>          
 1 Afghanistan           149   130181708  873703. Asia           
 2 Albania                96     8494126   88480. Europe         
 3 Algeria               123    34152880  277666. Africa         
 4 American Samoa        172      628193    3652. Oceania        
 5 Angola                173    28426778  164317. Africa         
 6 Antigua and Barbuda   171     1096874    6414. Americas       
 7 Argentina             111   137915720 1242484. Americas       
 8 Armenia                29     1383364   47702. Asia           
 9 Australia              58     2331255   40194. Oceania        
10 Austria                58     2547863   43929. Europe         
# ℹ 194 more rows
Egg_summary <- Egg_chicken%>%
  group_by(Area)%>%
  summarise(median=median(Value), sd=sd(Value), range(Value))%>%
  right_join(Country_groups, by="Area")
Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
dplyr 1.1.0.
ℹ Please use `reframe()` instead.
ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
  always returns an ungrouped data frame and adjust accordingly.
`summarise()` has grouped output by 'Area'. You can override using the
`.groups` argument.
Egg_summary
# A tibble: 492 × 5
# Groups:   Area [276]
   Area           median     sd `range(Value)` `Country Group`
   <chr>           <dbl>  <dbl>          <dbl> <chr>          
 1 Afghanistan     14200  7625.           4000 Asia           
 2 Afghanistan     14200  7625.          29048 Asia           
 3 Albania         12200 35029.            420 Europe         
 4 Albania         12200 35029.         110543 Europe         
 5 Algeria         27600 57487.           4600 Africa         
 6 Algeria         27600 57487.         390000 Africa         
 7 American Samoa     32 18858.              5 Oceania        
 8 American Samoa     32 18858.          51648 Oceania        
 9 Angola           3900 19755.            654 Africa         
10 Angola           3900 19755.          63067 Africa         
# ℹ 482 more rows
Total_Table <- Livestock %>% full_join(Cattle_dairy, by = join_by(Area, Item, Year, Value)) %>% full_join(Egg_chicken ,by = join_by(Area, Item, Year, Value))%>% full_join(Country_groups)%>%
  arrange (Area, Item)
Joining with `by = join_by(Area)`
Total_Table
# A tibble: 82,101 × 5
   Area        Item   Year   Value `Country Group`
   <chr>       <chr> <dbl>   <dbl> <chr>          
 1 Afghanistan Asses  1964 1150000 Asia           
 2 Afghanistan Asses  1973 1250000 Asia           
 3 Afghanistan Asses  1974 1250000 Asia           
 4 Afghanistan Asses  1975 1250000 Asia           
 5 Afghanistan Asses  1976 1250000 Asia           
 6 Afghanistan Asses  1978 1300000 Asia           
 7 Afghanistan Asses  1979 1300000 Asia           
 8 Afghanistan Asses  1980 1295000 Asia           
 9 Afghanistan Asses  1981 1315000 Asia           
10 Afghanistan Asses  1982 1315000 Asia           
# ℹ 82,091 more rows
summary_total <- Total_Table %>%
  group_by(Area, Item)%>%
  summarise(avg_stocks = mean(Value, na.rm=TRUE),
            med_stocks = median(Value, na.rm=TRUE),
            sd_stock = sd(Value,na.rm=TRUE),
            n_missing = sum(is.na(Value)))%>%
  left_join(Country_groups, by="Area")
`summarise()` has grouped output by 'Area'. You can override using the
`.groups` argument.
summary_total
# A tibble: 1,613 × 7
# Groups:   Area [305]
   Area        Item     avg_stocks med_stocks sd_stock n_missing `Country Group`
   <chr>       <chr>         <dbl>      <dbl>    <dbl>     <int> <chr>          
 1 Afghanistan Asses      1035797.    1250000  319734.         0 Asia           
 2 Afghanistan Camels      246944.     250000   43255.         0 Asia           
 3 Afghanistan Cattle     2672409.    2797500  928853.         0 Asia           
 4 Afghanistan Eggs, h…     15398.      14200    7625.         0 Asia           
 5 Afghanistan Goats      4018809.    3750000 1570651.         0 Asia           
 6 Afghanistan Horses      337873.     370000   96615.         0 Asia           
 7 Afghanistan Milk, w…    873703.     556000 1069804.         0 Asia           
 8 Afghanistan Mules        27473.      27500    3471.         0 Asia           
 9 Afghanistan Sheep     15680483.   15055000 2387024.         0 Asia           
10 Africa      Eggs, h…     46818.      47052    9912.         0 <NA>           
# ℹ 1,603 more rows
summary_total %>%
  distinct(Area)
# A tibble: 305 × 1
# Groups:   Area [305]
   Area               
   <chr>              
 1 Afghanistan        
 2 Africa             
 3 Albania            
 4 Algeria            
 5 American Samoa     
 6 Americas           
 7 Andorra            
 8 Angola             
 9 Anguilla           
10 Antigua and Barbuda
# ℹ 295 more rows
 summary_total_1 <- summary_total%>%
  semi_join(Country_groups, by="Area") # Show the values in the "Area" column that match continent information. In other words, show the country values in the "Area" column.

 summary_total_1
# A tibble: 1,556 × 7
# Groups:   Area [276]
   Area        Item     avg_stocks med_stocks sd_stock n_missing `Country Group`
   <chr>       <chr>         <dbl>      <dbl>    <dbl>     <int> <chr>          
 1 Afghanistan Asses      1035797.    1250000  319734.         0 Asia           
 2 Afghanistan Camels      246944.     250000   43255.         0 Asia           
 3 Afghanistan Cattle     2672409.    2797500  928853.         0 Asia           
 4 Afghanistan Eggs, h…     15398.      14200    7625.         0 Asia           
 5 Afghanistan Goats      4018809.    3750000 1570651.         0 Asia           
 6 Afghanistan Horses      337873.     370000   96615.         0 Asia           
 7 Afghanistan Milk, w…    873703.     556000 1069804.         0 Asia           
 8 Afghanistan Mules        27473.      27500    3471.         0 Asia           
 9 Afghanistan Sheep     15680483.   15055000 2387024.         0 Asia           
10 Albania     Asses        79896.      78000   20242.         0 Europe         
# ℹ 1,546 more rows
 summary_total_1 %>%
   distinct(Area)
# A tibble: 276 × 1
# Groups:   Area [276]
   Area               
   <chr>              
 1 Afghanistan        
 2 Albania            
 3 Algeria            
 4 American Samoa     
 5 Andorra            
 6 Angola             
 7 Anguilla           
 8 Antigua and Barbuda
 9 Argentina          
10 Armenia            
# ℹ 266 more rows
summary_total_2 <- summary_total%>%
  anti_join(Country_groups, by="Area")# Show the values in the "Area" column that do not match continent information. In other words, show the values in the "Area" column that do not represent the country.

summary_total_2 
# A tibble: 57 × 7
# Groups:   Area [29]
   Area           Item  avg_stocks med_stocks sd_stock n_missing `Country Group`
   <chr>          <chr>      <dbl>      <dbl>    <dbl>     <int> <chr>          
 1 Africa         Eggs…     46818.     47052     9912.         0 <NA>           
 2 Africa         Milk…      4869.      4600      439.         0 <NA>           
 3 Americas       Eggs…    115442.    112066.    7520.         0 <NA>           
 4 Americas       Milk…     27492.     25327     4475.         0 <NA>           
 5 Asia           Eggs…     77617.     86820.   17555.         0 <NA>           
 6 Asia           Milk…     11052.      9794     3955.         0 <NA>           
 7 Australia and… Eggs…    126346.    122012    16718.         0 <NA>           
 8 Australia and… Milk…     34615.     33060.    6974.         0 <NA>           
 9 Caribbean      Eggs…     94266.     95140.    9729.         0 <NA>           
10 Caribbean      Milk…     13933.     13808     2415.         0 <NA>           
# ℹ 47 more rows
summary_total_2 %>%
   distinct(Area)
# A tibble: 29 × 1
# Groups:   Area [29]
   Area                     
   <chr>                    
 1 Africa                   
 2 Americas                 
 3 Asia                     
 4 Australia and New Zealand
 5 Caribbean                
 6 Central America          
 7 Central Asia             
 8 Eastern Africa           
 9 Eastern Asia             
10 Eastern Europe           
# ℹ 19 more rows