Load required libraries

library(conflicted)
library(dplyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ ggplot2   3.4.4     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
library(ggplot2)
library(zoo)
library(scales)

Read data

cattle_dairy <- read.csv("challenge_datasets/FAOSTAT_cattle_dairy.csv")
head(cattle_dairy)
##   Domain.Code            Domain Area.Code        Area Element.Code      Element
## 1          QL Livestock Primary         2 Afghanistan         5318 Milk Animals
## 2          QL Livestock Primary         2 Afghanistan         5420        Yield
## 3          QL Livestock Primary         2 Afghanistan         5510   Production
## 4          QL Livestock Primary         2 Afghanistan         5318 Milk Animals
## 5          QL Livestock Primary         2 Afghanistan         5420        Yield
## 6          QL Livestock Primary         2 Afghanistan         5510   Production
##   Item.Code                  Item Year.Code Year   Unit  Value Flag
## 1       882 Milk, whole fresh cow      1961 1961   Head 700000    F
## 2       882 Milk, whole fresh cow      1961 1961  hg/An   5000   Fc
## 3       882 Milk, whole fresh cow      1961 1961 tonnes 350000    F
## 4       882 Milk, whole fresh cow      1962 1962   Head 700000    F
## 5       882 Milk, whole fresh cow      1962 1962  hg/An   5000   Fc
## 6       882 Milk, whole fresh cow      1962 1962 tonnes 350000    F
##   Flag.Description
## 1     FAO estimate
## 2  Calculated data
## 3     FAO estimate
## 4     FAO estimate
## 5  Calculated data
## 6     FAO estimate
egg_chicken <- read.csv("challenge_datasets/FAOSTAT_egg_chicken.csv")
head(egg_chicken)
##   Domain.Code            Domain Area.Code        Area Element.Code    Element
## 1          QL Livestock Primary         2 Afghanistan         5313     Laying
## 2          QL Livestock Primary         2 Afghanistan         5410      Yield
## 3          QL Livestock Primary         2 Afghanistan         5510 Production
## 4          QL Livestock Primary         2 Afghanistan         5313     Laying
## 5          QL Livestock Primary         2 Afghanistan         5410      Yield
## 6          QL Livestock Primary         2 Afghanistan         5510 Production
##   Item.Code                Item Year.Code Year      Unit Value Flag
## 1      1062 Eggs, hen, in shell      1961 1961 1000 Head  4000    F
## 2      1062 Eggs, hen, in shell      1961 1961  100mg/An 25000   Fc
## 3      1062 Eggs, hen, in shell      1961 1961    tonnes 10000    F
## 4      1062 Eggs, hen, in shell      1962 1962 1000 Head  4400    F
## 5      1062 Eggs, hen, in shell      1962 1962  100mg/An 25000   Fc
## 6      1062 Eggs, hen, in shell      1962 1962    tonnes 11000    F
##   Flag.Description
## 1     FAO estimate
## 2  Calculated data
## 3     FAO estimate
## 4     FAO estimate
## 5  Calculated data
## 6     FAO estimate

Sanity check

Let’s see if the Element.Code in the second table corresponds to the Element.Code in the first table. Seems like it does!

filtered_data <- egg_chicken %>%
    dplyr::filter(Element.Code == 5313)
head(filtered_data)
##   Domain.Code            Domain Area.Code        Area Element.Code Element
## 1          QL Livestock Primary         2 Afghanistan         5313  Laying
## 2          QL Livestock Primary         2 Afghanistan         5313  Laying
## 3          QL Livestock Primary         2 Afghanistan         5313  Laying
## 4          QL Livestock Primary         2 Afghanistan         5313  Laying
## 5          QL Livestock Primary         2 Afghanistan         5313  Laying
## 6          QL Livestock Primary         2 Afghanistan         5313  Laying
##   Item.Code                Item Year.Code Year      Unit Value Flag
## 1      1062 Eggs, hen, in shell      1961 1961 1000 Head  4000    F
## 2      1062 Eggs, hen, in shell      1962 1962 1000 Head  4400    F
## 3      1062 Eggs, hen, in shell      1963 1963 1000 Head  4600    F
## 4      1062 Eggs, hen, in shell      1964 1964 1000 Head  4800    F
## 5      1062 Eggs, hen, in shell      1965 1965 1000 Head  5200    F
## 6      1062 Eggs, hen, in shell      1966 1966 1000 Head  5500     
##   Flag.Description
## 1     FAO estimate
## 2     FAO estimate
## 3     FAO estimate
## 4     FAO estimate
## 5     FAO estimate
## 6    Official data

What each column contains??

unique(cattle_dairy$Domain)
## [1] "Livestock Primary"
unique(egg_chicken$Domain)
## [1] "Livestock Primary"
unique(cattle_dairy$Element)
## [1] "Milk Animals" "Yield"        "Production"
unique(egg_chicken$Element)
## [1] "Laying"     "Yield"      "Production"
unique(cattle_dairy$Item)
## [1] "Milk, whole fresh cow"
unique(egg_chicken$Item)
## [1] "Eggs, hen, in shell"
unique(cattle_dairy$Unit)
## [1] "Head"   "hg/An"  "tonnes"
unique(egg_chicken$Unit)
## [1] "1000 Head" "100mg/An"  "tonnes"

Describing the data

The first dataset, FAOSTAT_cattle_dairy.csv, contains data about cattle and dairy production. Here are some key details:

  • Number of Entries: 36,449

  • Columns: 14

  • Data Columns: Include ‘Domain Code’, ‘Domain’, ‘Area Code’, ‘Area’, ‘Element Code’, ‘Element’, ‘Item Code’, ‘Item’, ‘Year Code’, ‘Year’, ‘Unit’, ‘Value’, ‘Flag’, and ‘Flag Description’.

  • Data Types: Mixed types, including integers, floating-point numbers, and objects (typically strings).

  • Sample Data: The first few rows show data for Afghanistan in 1961 and 1962, with entries for ‘Milk Animals’, ‘Yield’, and ‘Production’ of ‘Milk, whole fresh cow’.

summary(cattle_dairy)
##  Domain.Code           Domain            Area.Code          Area          
##  Length:36449       Length:36449       Min.   :   1.0   Length:36449      
##  Class :character   Class :character   1st Qu.:  69.0   Class :character  
##  Mode  :character   Mode  :character   Median : 141.0   Mode  :character  
##                                        Mean   : 775.2                     
##                                        3rd Qu.: 215.0                     
##                                        Max.   :5504.0                     
##                                                                           
##   Element.Code    Element            Item.Code       Item          
##  Min.   :5318   Length:36449       Min.   :882   Length:36449      
##  1st Qu.:5318   Class :character   1st Qu.:882   Class :character  
##  Median :5420   Mode  :character   Median :882   Mode  :character  
##  Mean   :5416                      Mean   :882                     
##  3rd Qu.:5510                      3rd Qu.:882                     
##  Max.   :5510                      Max.   :882                     
##                                                                    
##    Year.Code         Year          Unit               Value          
##  Min.   :1961   Min.   :1961   Length:36449       Min.   :        7  
##  1st Qu.:1976   1st Qu.:1976   Class :character   1st Qu.:     7849  
##  Median :1991   Median :1991   Mode  :character   Median :    43266  
##  Mean   :1990   Mean   :1990                      Mean   :  4410235  
##  3rd Qu.:2005   3rd Qu.:2005                      3rd Qu.:   700000  
##  Max.   :2018   Max.   :2018                      Max.   :683217055  
##                                                   NA's   :74         
##      Flag           Flag.Description  
##  Length:36449       Length:36449      
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
## 

The first dataset, FAOSTAT_egg_chicken.csv, contains data about eggs and chicken production. Here are some key details:

  • Number of Entries: 38,170

  • Columns: 14

  • Data Columns: Include ‘Domain Code’, ‘Domain’, ‘Area Code’, ‘Area’, ‘Element Code’, ‘Element’, ‘Item Code’, ‘Item’, ‘Year Code’, ‘Year’, ‘Unit’, ‘Value’, ‘Flag’, and ‘Flag Description’.

  • Data Types: Mixed types, including integers, floating-point numbers, and objects (typically strings).

  • Sample Data: The first few rows show data for Afghanistan from 1961 and 1962, with entries for ‘Laying’, ‘Yield’, and ‘Production’ of ‘Eggs, hen, in shell’.

summary(egg_chicken)
##  Domain.Code           Domain            Area.Code          Area          
##  Length:38170       Length:38170       Min.   :   1.0   Length:38170      
##  Class :character   Class :character   1st Qu.:  70.0   Class :character  
##  Mode  :character   Mode  :character   Median : 143.0   Mode  :character  
##                                        Mean   : 771.1                     
##                                        3rd Qu.: 215.0                     
##                                        Max.   :5504.0                     
##                                                                           
##   Element.Code    Element            Item.Code        Item          
##  Min.   :5313   Length:38170       Min.   :1062   Length:38170      
##  1st Qu.:5313   Class :character   1st Qu.:1062   Class :character  
##  Median :5410   Mode  :character   Median :1062   Mode  :character  
##  Mean   :5411                      Mean   :1062                     
##  3rd Qu.:5510                      3rd Qu.:1062                     
##  Max.   :5510                      Max.   :1062                     
##                                                                     
##    Year.Code         Year          Unit               Value         
##  Min.   :1961   Min.   :1961   Length:38170       Min.   :       1  
##  1st Qu.:1976   1st Qu.:1976   Class :character   1st Qu.:    2600  
##  Median :1991   Median :1991   Mode  :character   Median :   31996  
##  Mean   :1990   Mean   :1990                      Mean   :  291341  
##  3rd Qu.:2005   3rd Qu.:2005                      3rd Qu.:   93836  
##  Max.   :2018   Max.   :2018                      Max.   :76769955  
##                                                   NA's   :40        
##      Flag           Flag.Description  
##  Length:38170       Length:38170      
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
## 

The egg chicken dataset appears to be structured similarly to the cattle dairy dataset, with columns representing various identifiers and attributes of the data, and a ‘Value’ column that contains the numerical data for production, yield, etc. There are a small number of missing values in the ‘Value’ column, and a larger number of missing entries in the ‘Flag’ column. The ‘Flag Description’ provides context for the data, such as whether it’s an FAO estimate or calculated data.

Tidy data

# Tidy the cattle_dairy dataset
cattle_dairy_tidy <- cattle_dairy %>%
  # Handle missing values, for example, removing rows with missing 'Value'
  dplyr::filter(!is.na(Value))

head(cattle_dairy_tidy)
##   Domain.Code            Domain Area.Code        Area Element.Code      Element
## 1          QL Livestock Primary         2 Afghanistan         5318 Milk Animals
## 2          QL Livestock Primary         2 Afghanistan         5420        Yield
## 3          QL Livestock Primary         2 Afghanistan         5510   Production
## 4          QL Livestock Primary         2 Afghanistan         5318 Milk Animals
## 5          QL Livestock Primary         2 Afghanistan         5420        Yield
## 6          QL Livestock Primary         2 Afghanistan         5510   Production
##   Item.Code                  Item Year.Code Year   Unit  Value Flag
## 1       882 Milk, whole fresh cow      1961 1961   Head 700000    F
## 2       882 Milk, whole fresh cow      1961 1961  hg/An   5000   Fc
## 3       882 Milk, whole fresh cow      1961 1961 tonnes 350000    F
## 4       882 Milk, whole fresh cow      1962 1962   Head 700000    F
## 5       882 Milk, whole fresh cow      1962 1962  hg/An   5000   Fc
## 6       882 Milk, whole fresh cow      1962 1962 tonnes 350000    F
##   Flag.Description
## 1     FAO estimate
## 2  Calculated data
## 3     FAO estimate
## 4     FAO estimate
## 5  Calculated data
## 6     FAO estimate
# Tidy the egg_chicken dataset
egg_chicken_tidy <- egg_chicken %>%
  # Handle missing values, for example, removing rows with missing 'Value'
  dplyr::filter(!is.na(Value))

head(egg_chicken_tidy)
##   Domain.Code            Domain Area.Code        Area Element.Code    Element
## 1          QL Livestock Primary         2 Afghanistan         5313     Laying
## 2          QL Livestock Primary         2 Afghanistan         5410      Yield
## 3          QL Livestock Primary         2 Afghanistan         5510 Production
## 4          QL Livestock Primary         2 Afghanistan         5313     Laying
## 5          QL Livestock Primary         2 Afghanistan         5410      Yield
## 6          QL Livestock Primary         2 Afghanistan         5510 Production
##   Item.Code                Item Year.Code Year      Unit Value Flag
## 1      1062 Eggs, hen, in shell      1961 1961 1000 Head  4000    F
## 2      1062 Eggs, hen, in shell      1961 1961  100mg/An 25000   Fc
## 3      1062 Eggs, hen, in shell      1961 1961    tonnes 10000    F
## 4      1062 Eggs, hen, in shell      1962 1962 1000 Head  4400    F
## 5      1062 Eggs, hen, in shell      1962 1962  100mg/An 25000   Fc
## 6      1062 Eggs, hen, in shell      1962 1962    tonnes 11000    F
##   Flag.Description
## 1     FAO estimate
## 2  Calculated data
## 3     FAO estimate
## 4     FAO estimate
## 5  Calculated data
## 6     FAO estimate

Mutate

The datasets are already clean and require no further tidying, although, we may want to merge Country Code and Country. Again, not required for this analysis.

cattle_dairy_mutated <- cattle_dairy_tidy %>%
  mutate(class = "cattle_dairy")

egg_chicken_mutated <- egg_chicken_tidy %>%
  mutate(class = "egg_chicken")

Joining Datasets

merged_data <- full_join(cattle_dairy_mutated, egg_chicken_mutated)
## Joining with `by = join_by(Domain.Code, Domain, Area.Code, Area, Element.Code,
## Element, Item.Code, Item, Year.Code, Year, Unit, Value, Flag, Flag.Description,
## class)`
head(merged_data)
##   Domain.Code            Domain Area.Code        Area Element.Code      Element
## 1          QL Livestock Primary         2 Afghanistan         5318 Milk Animals
## 2          QL Livestock Primary         2 Afghanistan         5420        Yield
## 3          QL Livestock Primary         2 Afghanistan         5510   Production
## 4          QL Livestock Primary         2 Afghanistan         5318 Milk Animals
## 5          QL Livestock Primary         2 Afghanistan         5420        Yield
## 6          QL Livestock Primary         2 Afghanistan         5510   Production
##   Item.Code                  Item Year.Code Year   Unit  Value Flag
## 1       882 Milk, whole fresh cow      1961 1961   Head 700000    F
## 2       882 Milk, whole fresh cow      1961 1961  hg/An   5000   Fc
## 3       882 Milk, whole fresh cow      1961 1961 tonnes 350000    F
## 4       882 Milk, whole fresh cow      1962 1962   Head 700000    F
## 5       882 Milk, whole fresh cow      1962 1962  hg/An   5000   Fc
## 6       882 Milk, whole fresh cow      1962 1962 tonnes 350000    F
##   Flag.Description        class
## 1     FAO estimate cattle_dairy
## 2  Calculated data cattle_dairy
## 3     FAO estimate cattle_dairy
## 4     FAO estimate cattle_dairy
## 5  Calculated data cattle_dairy
## 6     FAO estimate cattle_dairy
colnames(merged_data)
##  [1] "Domain.Code"      "Domain"           "Area.Code"        "Area"            
##  [5] "Element.Code"     "Element"          "Item.Code"        "Item"            
##  [9] "Year.Code"        "Year"             "Unit"             "Value"           
## [13] "Flag"             "Flag.Description" "class"
dim(merged_data)
## [1] 74505    15

Analysis

While there has been a sub-linear growth in the production of egg_chicken, we can see that cattle_diary has seen an much higher growth. It is also interesting to note that the production went down around 1990 before increasing yet again.

# Filter the dataset to include only relevant elements
filtered_data <- merged_data %>%
  dplyr::filter(Element %in% c('Milk Animals', 'Laying', 'Production'))

# Group by year and class, calculate the mean value
grouped_data <- filtered_data %>%
  group_by(Year, class) %>%
  summarise(mean = mean(Value, na.rm = TRUE)) %>%
  ungroup()
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
# Create a bar plot
ggplot(grouped_data, aes(Year, mean, fill = class)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(labels = scales::comma) +
  labs(x = "Year", y = "Average Production",
       title = "Element-wise analysis of Milk Animals, Chicken, and Production per Year") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))