2.2 Data Cleaning
First, the dataset was filtered to retain only observations within the
1995–2025 period. Next, only production-related variables were selected
(wine production, grape production, and vineyard surface area), and
irrelevant product categories were excluded from the main analytical
dataset. Character variables were converted to numeric format where
necessary, and missing values were handled either through removal or
aggregation, depending on their structural relevance.
Country names were standardized to ensure consistency across
datasets, particularly for the later spatial visualization. In addition, the
data were aggregated at the country level to compute long-term
averages and growth indicators. These steps were taken to ensure
comparability between countries and to eliminate inconsistencies that
could distort the clustering results.
# Load the raw file ----
# Every column is read as character (`.default = "c"`); the file is decoded
# as latin1. `input_path` and `sep_symbol` are defined outside this section.
ds_raw <- read_delim(
  file = input_path,
  delim = sep_symbol,
  col_types = cols(.default = "c"),
  locale = locale(encoding = "latin1")
)
# Report the size of the raw table
cat("Original rows:", dim(ds_raw)[1], "columns:", dim(ds_raw)[2], "\n")
## Original rows: 59704 columns: 7
# Report the column names exactly as they appear in the file
cat("Column names:", toString(names(ds_raw)), "\n\n")
## Column names: Continent, Region/Country, Product, Variable, Year, Unit, Quantity
# Normalise column names and text fields ----
# Columns are renamed by position (1-7), so the input file must keep the
# column order listed here.
ds <- ds_raw %>%
  rename(
    Continent = 1, Region_Country = 2, Product = 3, Variable = 4,
    Year = 5, Unit = 6, Quantity = 7
  ) %>%
  mutate(
    # Trim and collapse repeated whitespace in the text columns
    across(c(Product, Variable, Region_Country, Unit), str_squish),
    # Year arrives as text: parse as numeric first, then truncate to integer
    Year = as.integer(as.numeric(Year))
  )
# Standardise product and variable labels ----
# The lower-cased product name is computed once in a temporary column and
# removed again at the end of the same mutate() call.
ds <- ds %>%
  mutate(
    .product_lc = str_to_lower(Product),
    # The first matching branch wins; "wine" must match as a whole word.
    # Products matching none of the patterns keep their original label.
    Product_std = case_when(
      str_detect(.product_lc, "\\bwine\\b") ~ "Wine",
      str_detect(.product_lc, "vineyard") ~ "Vineyard",
      str_detect(.product_lc, "grape") ~ "Grapes",
      TRUE ~ Product
    ),
    Variable_std = str_to_lower(Variable),
    .product_lc = NULL
  )
# Keep the three production indicators within the study period ----
# Indicators: wine production, grapes production, vineyard surface area.
# `min_year` and `max_year` are defined outside this section.
ds_prod <- ds %>%
  filter(
    !is.na(Year),
    Year >= min_year,
    Year <= max_year,
    (Product_std %in% c("Wine", "Grapes") & Variable_std == "production") |
      (Product_std == "Vineyard" & str_detect(Variable_std, "surface"))
  )
# Clean quantity strings ----
#' Convert raw quantity strings to numeric values
#'
#' Vectorised helper. A comma is read as the decimal mark, and every
#' character other than digits, dots and minus signs (whitespace, unit text,
#' ...) is discarded before conversion.
#'
#' NOTE(review): a comma used as a thousands separator ("1,234") is therefore
#' read as 1.234 -- confirm that the source file only uses decimal commas.
#'
#' @param q Character vector of raw quantities (any length, may contain `NA`).
#' @return A double vector the same length as `q`. Elements that are missing,
#'   empty, or do not form exactly one number after cleaning are `NA_real_`.
#'   A single warning is raised when strings that still contain digits could
#'   not be parsed (e.g. "1.234,56").
clean_quantity <- function(q) {
  # Decimal comma -> dot, then strip everything that cannot belong to a number
  q2 <- gsub(",", ".", as.character(q), fixed = TRUE)
  q2 <- gsub("[^0-9.-]", "", q2)

  # Convert only well-formed numbers, so placeholders such as "", "." or "-"
  # become NA without triggering a coercion warning, and the result is always
  # a double vector (never a logical NA vector).
  is_number <- !is.na(q2) & grepl("^-?([0-9]+\\.?[0-9]*|\\.[0-9]+)$", q2)
  out <- rep(NA_real_, length(q2))
  out[is_number] <- as.numeric(q2[is_number])

  # Digit-bearing strings that failed to parse are likely real values in an
  # unexpected format; report them instead of dropping them silently.
  unparsed <- !is_number & !is.na(q2) & grepl("[0-9]", q2)
  if (any(unparsed)) {
    warning(
      sum(unparsed), " quantity value(s) could not be parsed and were set to NA",
      call. = FALSE
    )
  }

  out
}
# Parse quantities and label missing units ----
ds_prod <- ds_prod %>%
  mutate(
    # clean_quantity() works on whole vectors, so the column is parsed in a
    # single call rather than once per row; as.numeric() guarantees a double
    # column regardless of the type the helper returns.
    Quantity_num = as.numeric(clean_quantity(Quantity)),
    # Missing or empty units get an explicit "unknown" label
    Unit = if_else(is.na(Unit) | Unit == "", "unknown", Unit)
  )