# Wine
# Description: Wines of fresh grapes of all qualities, including sparkling, fortified and dessert wines.
# Data source: https://ourworldindata.org/grapher/wine-production
# Import libraries
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Import the data file
wine_production <- read_csv("wine-production.csv")
## Rows: 5969 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Entity, Code
## dbl (2): Year, Wine | 00000564 || Production | 005510 || tonnes
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the raw data in the data viewer. View() needs an interactive session,
# so it is skipped when the script runs non-interactively (Rscript, knitting).
if (interactive()) {
  View(wine_production)
}
# Print number of rows (with a thousands separator) and columns
print(paste("Number of rows: ", format(nrow(wine_production), big.mark = ",")))
## [1] "Number of rows: 5,969"
print(paste("Number of columns: ", ncol(wine_production)))
## [1] "Number of columns: 4"
# Missing values ----
# Check for missing values by creating a data frame showing column names and
# their number of missing values
missing_values <- data.frame(names(wine_production), colSums(is.na(wine_production)))
# Print data frame showing column names and their number of missing values
print(missing_values)
## names.wine_production.
## Entity Entity
## Code Code
## Year Year
## Wine | 00000564 || Production | 005510 || tonnes Wine | 00000564 || Production | 005510 || tonnes
## colSums.is.na.wine_production..
## Entity 0
## Code 2204
## Year 0
## Wine | 00000564 || Production | 005510 || tonnes 0
# Remove every row that has a missing value in any column. Per the counts
# printed above only `Code` contains missing values, so this drops exactly the
# rows that have no code.
wine_production_clean <- wine_production %>%
  drop_na()
# Print number of rows omitted: the difference between the raw row count and
# the cleaned row count
print(paste("Number of rows omitted: ", formatC(nrow(wine_production) - nrow(wine_production_clean), big.mark = ",", format = "f", digits = 0)))
## [1] "Number of rows omitted: 2,204"
# Open the cleaned data in the data viewer. View() needs an interactive
# session, so it is skipped when the script runs non-interactively.
if (interactive()) {
  View(wine_production_clean)
}
# Create subsets ----
# First and last year present in the cleaned data
min_wine_production <- min(wine_production_clean$Year)
max_wine_production <- max(wine_production_clean$Year)
# Print min and max years of wine production
print(paste("Min. year of wine production: ", formatC(min_wine_production, format = "f", digits = 0)))
## [1] "Min. year of wine production: 1961"
print(paste("Max. year of wine production: ", formatC(max_wine_production, format = "f", digits = 0)))
## [1] "Max. year of wine production: 2021"
# Print the range of years covered in the analysis.
# NOTE: this is the span between the last and first year (max - min); the
# number of calendar years from first to last inclusive is one more than this.
print(paste("Periods (years) covered in the analysis: ", formatC(max_wine_production - min_wine_production, format = "f", digits = 0)))
## [1] "Periods (years) covered in the analysis: 60"
# Sub data set for the first year, excluding the consolidated 'World' entity
wine_production_clean_min <- wine_production_clean %>%
  filter(Year == min_wine_production, Entity != "World")
# View() needs an interactive session; skip it in non-interactive runs
if (interactive()) {
  View(wine_production_clean_min)
}
# Sub data set for the last year, excluding the consolidated 'World' entity
wine_production_clean_max <- wine_production_clean %>%
  filter(Year == max_wine_production, Entity != "World")
if (interactive()) {
  View(wine_production_clean_max)
}
# Summary statistics ----
# Per-column summary of the first-year subset
summary_min <- wine_production_clean_min %>%
  summary()
# Announce which year the summary below refers to
wine_production_clean_min$Year %>%
  min() %>%
  paste("Summary wine production of year: ", .) %>%
  print()
## [1] "Summary wine production of year: 1961"
summary_min
## Entity Code Year
## Length:52 Length:52 Min. :1961
## Class :character Class :character 1st Qu.:1961
## Mode :character Mode :character Median :1961
## Mean :1961
## 3rd Qu.:1961
## Max. :1961
## Wine | 00000564 || Production | 005510 || tonnes
## Min. : 0
## 1st Qu.: 955
## Median : 29201
## Mean : 413079
## 3rd Qu.: 352744
## Max. :5248200
# Per-column summary of the last-year subset
summary_max <- wine_production_clean_max %>%
  summary()
# Announce which year the summary below refers to
wine_production_clean_max$Year %>%
  max() %>%
  paste("Summary wine production of year: ", .) %>%
  print()
## [1] "Summary wine production of year: 2021"
summary_max
## Entity Code Year
## Length:72 Length:72 Min. :2021
## Class :character Class :character 1st Qu.:2021
## Mode :character Mode :character Median :2021
## Mean :2021
## 3rd Qu.:2021
## Max. :2021
## Wine | 00000564 || Production | 005510 || tonnes
## Min. : 0
## 1st Qu.: 3899
## Median : 26490
## Mean : 373201
## 3rd Qu.: 251100
## Max. :5088500
# Import libraries
library(dplyr)
library(ggplot2)
# Top 10 producers, first year ----
# Name of the production column exactly as it appears in the imported data
production_col <- "Wine | 00000564 || Production | 005510 || tonnes"
# Sort the first-year subset by production, largest first
wpc_min_sorted <- arrange(wine_production_clean_min, desc(.data[[production_col]]))
# Keep the 10 largest producers. slice_max() replaces the superseded top_n();
# like top_n(), it keeps ties at the cut-off by default.
top_10_producers <- slice_max(wpc_min_sorted, order_by = .data[[production_col]], n = 10)
# Plot data: one bar per country, ordered by production, with the value printed
# above each bar. Columns are referenced through the `.data` pronoun rather than
# `top_10_producers$...`, which ggplot2 warns about when used inside aes().
ggplot(
  top_10_producers,
  aes(
    x = reorder(Entity, .data[[production_col]]),
    y = .data[[production_col]]
  )
) +
  geom_col(fill = "#57068C") +
  geom_text(
    aes(label = formatC(.data[[production_col]], big.mark = ",", format = "f", digits = 0)),
    vjust = -0.5,
    size = 3
  ) +
  labs(x = "Country", y = "Production") +
  ggtitle(paste("Top 10 Wine Producers By Country (in tonnes), year: ", min(top_10_producers$Year))) +
  theme_classic() +
  # Show y-axis ticks as whole numbers with a thousands separator
  scale_y_continuous(labels = function(x) formatC(x, big.mark = ",", format = "f", digits = 0))
# Top 10 producers, last year ----
# Name of the production column exactly as it appears in the imported data
production_col <- "Wine | 00000564 || Production | 005510 || tonnes"
# Sort the last-year subset by production, largest first
wpc_max_sorted <- arrange(wine_production_clean_max, desc(.data[[production_col]]))
# Keep the 10 largest producers. slice_max() replaces the superseded top_n();
# like top_n(), it keeps ties at the cut-off by default.
top_10_producers <- slice_max(wpc_max_sorted, order_by = .data[[production_col]], n = 10)
# Plot data: one bar per country, ordered by production, with the value printed
# above each bar. Columns are referenced through the `.data` pronoun rather than
# `top_10_producers$...`, which ggplot2 warns about when used inside aes().
ggplot(
  top_10_producers,
  aes(
    x = reorder(Entity, .data[[production_col]]),
    y = .data[[production_col]]
  )
) +
  geom_col(fill = "#57068C") +
  geom_text(
    aes(label = formatC(.data[[production_col]], big.mark = ",", format = "f", digits = 0)),
    vjust = -0.5,
    size = 3
  ) +
  labs(x = "Country", y = "Production") +
  ggtitle(paste("Top 10 Wine Producers By Country (in tonnes), year: ", min(top_10_producers$Year))) +
  theme_classic() +
  # Show y-axis ticks as whole numbers with a thousands separator
  scale_y_continuous(labels = function(x) formatC(x, big.mark = ",", format = "f", digits = 0))