library(conflicted)
library(dplyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.4 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
library(ggplot2)
library(zoo)
library(scales)
cattle_dairy <- read.csv("challenge_datasets/FAOSTAT_cattle_dairy.csv")
head(cattle_dairy)
## Domain.Code Domain Area.Code Area Element.Code Element
## 1 QL Livestock Primary 2 Afghanistan 5318 Milk Animals
## 2 QL Livestock Primary 2 Afghanistan 5420 Yield
## 3 QL Livestock Primary 2 Afghanistan 5510 Production
## 4 QL Livestock Primary 2 Afghanistan 5318 Milk Animals
## 5 QL Livestock Primary 2 Afghanistan 5420 Yield
## 6 QL Livestock Primary 2 Afghanistan 5510 Production
## Item.Code Item Year.Code Year Unit Value Flag
## 1 882 Milk, whole fresh cow 1961 1961 Head 700000 F
## 2 882 Milk, whole fresh cow 1961 1961 hg/An 5000 Fc
## 3 882 Milk, whole fresh cow 1961 1961 tonnes 350000 F
## 4 882 Milk, whole fresh cow 1962 1962 Head 700000 F
## 5 882 Milk, whole fresh cow 1962 1962 hg/An 5000 Fc
## 6 882 Milk, whole fresh cow 1962 1962 tonnes 350000 F
## Flag.Description
## 1 FAO estimate
## 2 Calculated data
## 3 FAO estimate
## 4 FAO estimate
## 5 Calculated data
## 6 FAO estimate
egg_chicken <- read.csv("challenge_datasets/FAOSTAT_egg_chicken.csv")
head(egg_chicken)
## Domain.Code Domain Area.Code Area Element.Code Element
## 1 QL Livestock Primary 2 Afghanistan 5313 Laying
## 2 QL Livestock Primary 2 Afghanistan 5410 Yield
## 3 QL Livestock Primary 2 Afghanistan 5510 Production
## 4 QL Livestock Primary 2 Afghanistan 5313 Laying
## 5 QL Livestock Primary 2 Afghanistan 5410 Yield
## 6 QL Livestock Primary 2 Afghanistan 5510 Production
## Item.Code Item Year.Code Year Unit Value Flag
## 1 1062 Eggs, hen, in shell 1961 1961 1000 Head 4000 F
## 2 1062 Eggs, hen, in shell 1961 1961 100mg/An 25000 Fc
## 3 1062 Eggs, hen, in shell 1961 1961 tonnes 10000 F
## 4 1062 Eggs, hen, in shell 1962 1962 1000 Head 4400 F
## 5 1062 Eggs, hen, in shell 1962 1962 100mg/An 25000 Fc
## 6 1062 Eggs, hen, in shell 1962 1962 tonnes 11000 F
## Flag.Description
## 1 FAO estimate
## 2 Calculated data
## 3 FAO estimate
## 4 FAO estimate
## 5 Calculated data
## 6 FAO estimate
Let’s see if the Element.Code in the second table
corresponds to the Element.Code in the first table. Seems
like it does!
filtered_data <- egg_chicken %>%
dplyr::filter(Element.Code == 5313)
head(filtered_data)
## Domain.Code Domain Area.Code Area Element.Code Element
## 1 QL Livestock Primary 2 Afghanistan 5313 Laying
## 2 QL Livestock Primary 2 Afghanistan 5313 Laying
## 3 QL Livestock Primary 2 Afghanistan 5313 Laying
## 4 QL Livestock Primary 2 Afghanistan 5313 Laying
## 5 QL Livestock Primary 2 Afghanistan 5313 Laying
## 6 QL Livestock Primary 2 Afghanistan 5313 Laying
## Item.Code Item Year.Code Year Unit Value Flag
## 1 1062 Eggs, hen, in shell 1961 1961 1000 Head 4000 F
## 2 1062 Eggs, hen, in shell 1962 1962 1000 Head 4400 F
## 3 1062 Eggs, hen, in shell 1963 1963 1000 Head 4600 F
## 4 1062 Eggs, hen, in shell 1964 1964 1000 Head 4800 F
## 5 1062 Eggs, hen, in shell 1965 1965 1000 Head 5200 F
## 6 1062 Eggs, hen, in shell 1966 1966 1000 Head 5500
## Flag.Description
## 1 FAO estimate
## 2 FAO estimate
## 3 FAO estimate
## 4 FAO estimate
## 5 FAO estimate
## 6 Official data
unique(cattle_dairy$Domain)
## [1] "Livestock Primary"
unique(egg_chicken$Domain)
## [1] "Livestock Primary"
unique(cattle_dairy$Element)
## [1] "Milk Animals" "Yield" "Production"
unique(egg_chicken$Element)
## [1] "Laying" "Yield" "Production"
unique(cattle_dairy$Item)
## [1] "Milk, whole fresh cow"
unique(egg_chicken$Item)
## [1] "Eggs, hen, in shell"
unique(cattle_dairy$Unit)
## [1] "Head" "hg/An" "tonnes"
unique(egg_chicken$Unit)
## [1] "1000 Head" "100mg/An" "tonnes"
The first dataset, FAOSTAT_cattle_dairy.csv, contains
data about cattle and dairy production. Here are some key details:
Number of Entries: 36,449
Columns: 14
Data Columns: Include ‘Domain Code’, ‘Domain’, ‘Area Code’, ‘Area’, ‘Element Code’, ‘Element’, ‘Item Code’, ‘Item’, ‘Year Code’, ‘Year’, ‘Unit’, ‘Value’, ‘Flag’, and ‘Flag Description’.
Data Types: Mixed types, including integers, floating-point numbers, and objects (typically strings).
Sample Data: The first few rows show data for Afghanistan in 1961 and 1962, with entries for ‘Milk Animals’, ‘Yield’, and ‘Production’ of ‘Milk, whole fresh cow’.
summary(cattle_dairy)
## Domain.Code Domain Area.Code Area
## Length:36449 Length:36449 Min. : 1.0 Length:36449
## Class :character Class :character 1st Qu.: 69.0 Class :character
## Mode :character Mode :character Median : 141.0 Mode :character
## Mean : 775.2
## 3rd Qu.: 215.0
## Max. :5504.0
##
## Element.Code Element Item.Code Item
## Min. :5318 Length:36449 Min. :882 Length:36449
## 1st Qu.:5318 Class :character 1st Qu.:882 Class :character
## Median :5420 Mode :character Median :882 Mode :character
## Mean :5416 Mean :882
## 3rd Qu.:5510 3rd Qu.:882
## Max. :5510 Max. :882
##
## Year.Code Year Unit Value
## Min. :1961 Min. :1961 Length:36449 Min. : 7
## 1st Qu.:1976 1st Qu.:1976 Class :character 1st Qu.: 7849
## Median :1991 Median :1991 Mode :character Median : 43266
## Mean :1990 Mean :1990 Mean : 4410235
## 3rd Qu.:2005 3rd Qu.:2005 3rd Qu.: 700000
## Max. :2018 Max. :2018 Max. :683217055
## NA's :74
## Flag Flag.Description
## Length:36449 Length:36449
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
The first dataset, FAOSTAT_egg_chicken.csv, contains
data about eggs and chicken production. Here are some key details:
Number of Entries: 38,170
Columns: 14
Data Columns: Include ‘Domain Code’, ‘Domain’, ‘Area Code’, ‘Area’, ‘Element Code’, ‘Element’, ‘Item Code’, ‘Item’, ‘Year Code’, ‘Year’, ‘Unit’, ‘Value’, ‘Flag’, and ‘Flag Description’.
Data Types: Mixed types, including integers, floating-point numbers, and objects (typically strings).
Sample Data: The first few rows show data for Afghanistan from 1961 and 1962, with entries for ‘Laying’, ‘Yield’, and ‘Production’ of ‘Eggs, hen, in shell’.
summary(egg_chicken)
## Domain.Code Domain Area.Code Area
## Length:38170 Length:38170 Min. : 1.0 Length:38170
## Class :character Class :character 1st Qu.: 70.0 Class :character
## Mode :character Mode :character Median : 143.0 Mode :character
## Mean : 771.1
## 3rd Qu.: 215.0
## Max. :5504.0
##
## Element.Code Element Item.Code Item
## Min. :5313 Length:38170 Min. :1062 Length:38170
## 1st Qu.:5313 Class :character 1st Qu.:1062 Class :character
## Median :5410 Mode :character Median :1062 Mode :character
## Mean :5411 Mean :1062
## 3rd Qu.:5510 3rd Qu.:1062
## Max. :5510 Max. :1062
##
## Year.Code Year Unit Value
## Min. :1961 Min. :1961 Length:38170 Min. : 1
## 1st Qu.:1976 1st Qu.:1976 Class :character 1st Qu.: 2600
## Median :1991 Median :1991 Mode :character Median : 31996
## Mean :1990 Mean :1990 Mean : 291341
## 3rd Qu.:2005 3rd Qu.:2005 3rd Qu.: 93836
## Max. :2018 Max. :2018 Max. :76769955
## NA's :40
## Flag Flag.Description
## Length:38170 Length:38170
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
The egg chicken dataset appears to be structured similarly to the cattle dairy dataset, with columns representing various identifiers and attributes of the data, and a ‘Value’ column that contains the numerical data for production, yield, etc. There are a small number of missing values in the ‘Value’ column, and a larger number of missing entries in the ‘Flag’ column. The ‘Flag Description’ provides context for the data, such as whether it’s an FAO estimate or calculated data.
# Tidy the cattle_dairy dataset
cattle_dairy_tidy <- cattle_dairy %>%
# Handle missing values, for example, removing rows with missing 'Value'
dplyr::filter(!is.na(Value))
head(cattle_dairy_tidy)
## Domain.Code Domain Area.Code Area Element.Code Element
## 1 QL Livestock Primary 2 Afghanistan 5318 Milk Animals
## 2 QL Livestock Primary 2 Afghanistan 5420 Yield
## 3 QL Livestock Primary 2 Afghanistan 5510 Production
## 4 QL Livestock Primary 2 Afghanistan 5318 Milk Animals
## 5 QL Livestock Primary 2 Afghanistan 5420 Yield
## 6 QL Livestock Primary 2 Afghanistan 5510 Production
## Item.Code Item Year.Code Year Unit Value Flag
## 1 882 Milk, whole fresh cow 1961 1961 Head 700000 F
## 2 882 Milk, whole fresh cow 1961 1961 hg/An 5000 Fc
## 3 882 Milk, whole fresh cow 1961 1961 tonnes 350000 F
## 4 882 Milk, whole fresh cow 1962 1962 Head 700000 F
## 5 882 Milk, whole fresh cow 1962 1962 hg/An 5000 Fc
## 6 882 Milk, whole fresh cow 1962 1962 tonnes 350000 F
## Flag.Description
## 1 FAO estimate
## 2 Calculated data
## 3 FAO estimate
## 4 FAO estimate
## 5 Calculated data
## 6 FAO estimate
# Tidy the egg_chicken dataset
egg_chicken_tidy <- egg_chicken %>%
# Handle missing values, for example, removing rows with missing 'Value'
dplyr::filter(!is.na(Value))
head(egg_chicken_tidy)
## Domain.Code Domain Area.Code Area Element.Code Element
## 1 QL Livestock Primary 2 Afghanistan 5313 Laying
## 2 QL Livestock Primary 2 Afghanistan 5410 Yield
## 3 QL Livestock Primary 2 Afghanistan 5510 Production
## 4 QL Livestock Primary 2 Afghanistan 5313 Laying
## 5 QL Livestock Primary 2 Afghanistan 5410 Yield
## 6 QL Livestock Primary 2 Afghanistan 5510 Production
## Item.Code Item Year.Code Year Unit Value Flag
## 1 1062 Eggs, hen, in shell 1961 1961 1000 Head 4000 F
## 2 1062 Eggs, hen, in shell 1961 1961 100mg/An 25000 Fc
## 3 1062 Eggs, hen, in shell 1961 1961 tonnes 10000 F
## 4 1062 Eggs, hen, in shell 1962 1962 1000 Head 4400 F
## 5 1062 Eggs, hen, in shell 1962 1962 100mg/An 25000 Fc
## 6 1062 Eggs, hen, in shell 1962 1962 tonnes 11000 F
## Flag.Description
## 1 FAO estimate
## 2 Calculated data
## 3 FAO estimate
## 4 FAO estimate
## 5 Calculated data
## 6 FAO estimate
The datasets are already clean and require no further tidying, although, we may want to merge Country Code and Country. Again, not required for this analysis.
cattle_dairy_mutated <- cattle_dairy_tidy %>%
mutate(class = "cattle_dairy")
egg_chicken_mutated <- egg_chicken_tidy %>%
mutate(class = "egg_chicken")
merged_data <- full_join(cattle_dairy_mutated, egg_chicken_mutated)
## Joining with `by = join_by(Domain.Code, Domain, Area.Code, Area, Element.Code,
## Element, Item.Code, Item, Year.Code, Year, Unit, Value, Flag, Flag.Description,
## class)`
head(merged_data)
## Domain.Code Domain Area.Code Area Element.Code Element
## 1 QL Livestock Primary 2 Afghanistan 5318 Milk Animals
## 2 QL Livestock Primary 2 Afghanistan 5420 Yield
## 3 QL Livestock Primary 2 Afghanistan 5510 Production
## 4 QL Livestock Primary 2 Afghanistan 5318 Milk Animals
## 5 QL Livestock Primary 2 Afghanistan 5420 Yield
## 6 QL Livestock Primary 2 Afghanistan 5510 Production
## Item.Code Item Year.Code Year Unit Value Flag
## 1 882 Milk, whole fresh cow 1961 1961 Head 700000 F
## 2 882 Milk, whole fresh cow 1961 1961 hg/An 5000 Fc
## 3 882 Milk, whole fresh cow 1961 1961 tonnes 350000 F
## 4 882 Milk, whole fresh cow 1962 1962 Head 700000 F
## 5 882 Milk, whole fresh cow 1962 1962 hg/An 5000 Fc
## 6 882 Milk, whole fresh cow 1962 1962 tonnes 350000 F
## Flag.Description class
## 1 FAO estimate cattle_dairy
## 2 Calculated data cattle_dairy
## 3 FAO estimate cattle_dairy
## 4 FAO estimate cattle_dairy
## 5 Calculated data cattle_dairy
## 6 FAO estimate cattle_dairy
colnames(merged_data)
## [1] "Domain.Code" "Domain" "Area.Code" "Area"
## [5] "Element.Code" "Element" "Item.Code" "Item"
## [9] "Year.Code" "Year" "Unit" "Value"
## [13] "Flag" "Flag.Description" "class"
dim(merged_data)
## [1] 74505 15
While there has been a sub-linear growth in the production of egg_chicken, we can see that cattle_diary has seen an much higher growth. It is also interesting to note that the production went down around 1990 before increasing yet again.
# Filter the dataset to include only relevant elements
filtered_data <- merged_data %>%
dplyr::filter(Element %in% c('Milk Animals', 'Laying', 'Production'))
# Group by year and class, calculate the mean value
grouped_data <- filtered_data %>%
group_by(Year, class) %>%
summarise(mean = mean(Value, na.rm = TRUE)) %>%
ungroup()
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
# Create a bar plot
ggplot(grouped_data, aes(Year, mean, fill = class)) +
geom_bar(stat = "identity") +
scale_y_continuous(labels = scales::comma) +
labs(x = "Year", y = "Average Production",
title = "Element-wise analysis of Milk Animals, Chicken, and Production per Year") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))