Wine Description: Wines of fresh grapes of all qualities, including sparkling, fortified and dessert wines. Data source: https://ourworldindata.org/grapher/wine-production

Importing and visualizing the dataset

# Import libraries
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the wine production data set; the CSV file is read from the current
# working directory
wine_production <- read_csv("wine-production.csv")
## Rows: 5969 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Entity, Code
## dbl (2): Year, Wine | 00000564 || Production | 005510 || tonnes
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the spreadsheet-style viewer only in interactive sessions: View() pops
# up a data viewer window, which serves no purpose (and may be unavailable)
# when the script is run non-interactively
if (interactive()) {
  View(wine_production)
}

# Report the dimensions of the raw data set
# Row count, formatted with a thousands separator
wine_production %>%
  nrow() %>%
  format(big.mark = ",") %>%
  paste("Number of rows: ", .) %>%
  print()
## [1] "Number of rows:  5,969"
# Column count
wine_production %>%
  ncol() %>%
  paste("Number of columns: ", .) %>%
  print()
## [1] "Number of columns:  4"

Cleaning dataset

# Check for missing values: build a data frame pairing each column name with
# its number of NA entries (is.na() flags the NAs, colSums() totals them per column).
# Note: the data.frame() arguments are unnamed, so the headers of the resulting
# columns are generated from the argument expressions themselves
missing_values <- data.frame(names(wine_production), colSums(is.na(wine_production)))

# Print the per-column missing-value counts
print(missing_values)
##                                                                            names.wine_production.
## Entity                                                                                     Entity
## Code                                                                                         Code
## Year                                                                                         Year
## Wine | 00000564 || Production | 005510 || tonnes Wine | 00000564 || Production | 005510 || tonnes
##                                                  colSums.is.na.wine_production..
## Entity                                                                         0
## Code                                                                        2204
## Year                                                                           0
## Wine | 00000564 || Production | 005510 || tonnes                               0
# Drop every row that has a missing value in any column
wine_production_clean <- wine_production %>%
  drop_na()

# Report how many rows were dropped: raw row count minus cleaned row count,
# formatted with a thousands separator and no decimals
print(paste(
  "Number of rows omitted: ",
  formatC(
    nrow(wine_production) - nrow(wine_production_clean),
    big.mark = ",", format = "f", digits = 0
  )
))
## [1] "Number of rows omitted:  2,204"
# Inspect the cleaned data in the viewer, but only in interactive sessions:
# View() pops up a data viewer window, which serves no purpose (and may be
# unavailable) when the script is run non-interactively
if (interactive()) {
  View(wine_production_clean)
}

Selecting periods

# Create subsets
# First and last year present in the cleaned data
min_wine_production <- wine_production_clean %>%
  pull(Year) %>%
  min()
max_wine_production <- wine_production_clean %>%
  pull(Year) %>%
  max()

# Report the first and last year (formatted without decimals)
min_wine_production %>%
  formatC(format = "f", digits = 0) %>%
  paste("Min. year of wine production: ", .) %>%
  print()
## [1] "Min. year of wine production:  1961"
max_wine_production %>%
  formatC(format = "f", digits = 0) %>%
  paste("Max. year of wine production: ", .) %>%
  print()
## [1] "Max. year of wine production:  2021"
# Report the span of years covered in the analysis (last year minus first year)
(max_wine_production - min_wine_production) %>%
  formatC(format = "f", digits = 0) %>%
  paste("Periods (years) covered in the analysis: ", .) %>%
  print()
## [1] "Periods (years) covered in the analysis:  60"
# Subset for the earliest year, excluding the consolidated 'World' entry.
# filter() evaluates the conditions inside the data frame, so columns are
# referenced by bare name instead of repeating `wine_production_clean$`
wine_production_clean_min <- wine_production_clean %>%
  filter(Year == min_wine_production, Entity != "World")
# Viewer only in interactive sessions (View() pops up a data viewer window)
if (interactive()) {
  View(wine_production_clean_min)
}

# Subset for the latest year, excluding the consolidated 'World' entry
wine_production_clean_max <- wine_production_clean %>%
  filter(Year == max_wine_production, Entity != "World")
# Viewer only in interactive sessions (View() pops up a data viewer window)
if (interactive()) {
  View(wine_production_clean_max)
}

Dataset statistics

# Descriptive statistics for the earliest-year subset
summary_min <- wine_production_clean_min %>%
  summary()
# Announce which year the statistics below refer to
wine_production_clean_min$Year %>%
  min() %>%
  paste("Summary wine production of year: ", .) %>%
  print()
## [1] "Summary wine production of year:  1961"
summary_min
##     Entity              Code                Year     
##  Length:52          Length:52          Min.   :1961  
##  Class :character   Class :character   1st Qu.:1961  
##  Mode  :character   Mode  :character   Median :1961  
##                                        Mean   :1961  
##                                        3rd Qu.:1961  
##                                        Max.   :1961  
##  Wine | 00000564 || Production | 005510 || tonnes
##  Min.   :      0                                 
##  1st Qu.:    955                                 
##  Median :  29201                                 
##  Mean   : 413079                                 
##  3rd Qu.: 352744                                 
##  Max.   :5248200
# Descriptive statistics for the latest-year subset
summary_max <- wine_production_clean_max %>%
  summary()
# Announce which year the statistics below refer to
wine_production_clean_max$Year %>%
  max() %>%
  paste("Summary wine production of year: ", .) %>%
  print()
## [1] "Summary wine production of year:  2021"
summary_max
##     Entity              Code                Year     
##  Length:72          Length:72          Min.   :2021  
##  Class :character   Class :character   1st Qu.:2021  
##  Mode  :character   Mode  :character   Median :2021  
##                                        Mean   :2021  
##                                        3rd Qu.:2021  
##                                        Max.   :2021  
##  Wine | 00000564 || Production | 005510 || tonnes
##  Min.   :      0                                 
##  1st Qu.:   3899                                 
##  Median :  26490                                 
##  Mean   : 373201                                 
##  3rd Qu.: 251100                                 
##  Max.   :5088500

Top 10 Wine Producers By Country

# Import libraries
library(dplyr)
library(ggplot2)

# Rank the earliest-year data from the largest to the smallest producer
wpc_min_sorted <- wine_production_clean_min %>%
  arrange(desc(`Wine | 00000564 || Production | 005510 || tonnes`))

# Keep the 10 largest producers. slice_max() replaces the superseded top_n()
# and, like it, keeps ties at the cut-off by default
top_10_producers <- wpc_min_sorted %>%
  slice_max(`Wine | 00000564 || Production | 005510 || tonnes`, n = 10)

# Bar chart of the top 10 producers, bars ordered by production.
# Columns are referenced by bare name inside aes(): writing
# `top_10_producers$...` there is discouraged by ggplot2 and raised a warning
# for every such reference. The value labels are mapped in geom_text() only,
# so no global `label` aesthetic is needed.
ggplot(
  top_10_producers,
  aes(
    x = reorder(Entity, `Wine | 00000564 || Production | 005510 || tonnes`),
    y = `Wine | 00000564 || Production | 005510 || tonnes`
  )
) +
  geom_col(fill = "#57068C") +
  # Print each bar's value just above it, with thousands separators
  geom_text(
    aes(label = formatC(`Wine | 00000564 || Production | 005510 || tonnes`, big.mark = ",", format = "f", digits = 0)),
    vjust = -0.5, size = 3
  ) +
  labs(x = "Country", y = "Production") +
  ggtitle(paste("Top 10 Wine Producers By Country (in tonnes), year: ", min(top_10_producers$Year))) +
  theme_classic() +
  # Show axis ticks as plain numbers with thousands separators
  scale_y_continuous(labels = function(x) formatC(x, big.mark = ",", format = "f", digits = 0))

# Rank the latest-year data from the largest to the smallest producer
wpc_max_sorted <- wine_production_clean_max %>%
  arrange(desc(`Wine | 00000564 || Production | 005510 || tonnes`))

# Keep the 10 largest producers. slice_max() replaces the superseded top_n()
# and, like it, keeps ties at the cut-off by default
top_10_producers <- wpc_max_sorted %>%
  slice_max(`Wine | 00000564 || Production | 005510 || tonnes`, n = 10)

# Bar chart of the top 10 producers, bars ordered by production.
# Columns are referenced by bare name inside aes(): writing
# `top_10_producers$...` there is discouraged by ggplot2 and raised a warning
# for every such reference. The value labels are mapped in geom_text() only,
# so no global `label` aesthetic is needed.
ggplot(
  top_10_producers,
  aes(
    x = reorder(Entity, `Wine | 00000564 || Production | 005510 || tonnes`),
    y = `Wine | 00000564 || Production | 005510 || tonnes`
  )
) +
  geom_col(fill = "#57068C") +
  # Print each bar's value just above it, with thousands separators
  geom_text(
    aes(label = formatC(`Wine | 00000564 || Production | 005510 || tonnes`, big.mark = ",", format = "f", digits = 0)),
    vjust = -0.5, size = 3
  ) +
  labs(x = "Country", y = "Production") +
  # The subset holds a single year, so min() simply returns that year
  ggtitle(paste("Top 10 Wine Producers By Country (in tonnes), year: ", min(top_10_producers$Year))) +
  theme_classic() +
  # Show axis ticks as plain numbers with thousands separators
  scale_y_continuous(labels = function(x) formatC(x, big.mark = ",", format = "f", digits = 0))