Loading the libraries
library(tidyverse)
library(ggplot2)
library(patchwork)
Reading the file
# Download the pumpkins CSV from the TidyTuesday repository into a tibble.
pumpkins <- read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-10-19/pumpkins.csv"
)
Identifying non-numeric (string) values in the numeric columns of interest
# Show the rows whose weight_lbs text contains "exhibition only".
pumpkins %>%
  filter(
    str_detect(string = weight_lbs, pattern = "exhibition only")
  )
## # A tibble: 54 x 14
## id place weight_lbs grower_name city state_prov country gpc_site
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2013-F "291 E~ "291 Entri~ "291 Entrie~ "291 ~ "291 Entri~ "291 En~ "291 Ent~
## 2 2013-L "203 E~ "203 Entri~ "203 Entrie~ "203 ~ "203 Entri~ "203 En~ "203 Ent~
## 3 2013-P "1681 ~ "1681 Entr~ "1681 Entri~ "1681~ "1681 Entr~ "1681 E~ "1681 En~
## 4 2013-S "151 E~ "151 Entri~ "151 Entrie~ "151 ~ "151 Entri~ "151 En~ "151 Ent~
## 5 2013-T "289 E~ "289 Entri~ "289 Entrie~ "289 ~ "289 Entri~ "289 En~ "289 Ent~
## 6 2013-W "273 E~ "273 Entri~ "273 Entrie~ "273 ~ "273 Entri~ "273 En~ "273 Ent~
## 7 2014-F "330 E~ "330 Entri~ "330 Entrie~ "330 ~ "330 Entri~ "330 En~ "330 Ent~
## 8 2014-L "185 E~ "185 Entri~ "185 Entrie~ "185 ~ "185 Entri~ "185 En~ "185 Ent~
## 9 2014-P "1900 ~ "1900 Entr~ "1900 Entri~ "1900~ "1900 Entr~ "1900 E~ "1900 En~
## 10 2014-S "192 E~ "192 Entri~ "192 Entrie~ "192 ~ "192 Entri~ "192 En~ "192 Ent~
## # ... with 44 more rows, and 6 more variables: seed_mother <chr>,
## # pollinator_father <chr>, ott <chr>, est_weight <chr>, pct_chart <chr>,
## # variety <lgl>
# Show the rows whose place contains one of the codes "DMG" or "EXH".
# A single alternation regex is used on purpose: a length-2 pattern vector is
# recycled element-wise against the rows (row 1 vs "DMG", row 2 vs "EXH", ...)
# or rejected by stringr versions that enforce strict size recycling, so it
# never tests every row against both codes.
# NOTE: the row count echoed below this chunk came from the recycled-pattern
# version and will differ after re-running.
pumpkins %>%
  filter(str_detect(place, "DMG|EXH"))
## # A tibble: 1,177 x 14
## id place weight_lbs grower_name city state_prov country gpc_site
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2013-F EXH 115.50 Snyder, jerry Bessemer Pennsylva~ United~ Ohio Va~
## 2 2013-F EXH 107.30 Daho, Mehdi <NA> Other France Le Pota~
## 3 2013-F EXH 106.00 LaRue, Jack tenino Washington United~ Termina~
## 4 2013-F EXH 104.50 Treece, Jef West Linn Oregon United~ Canyonv~
## 5 2013-F EXH 102.00 Steil, Scott Richmond Minnesota United~ Stillwa~
## 6 2013-F EXH 61.50 Scherer, Josh Piqua Ohio United~ Gary Gr~
## 7 2013-F EXH 56.00 Wright, Tom Frenchville Pennsylva~ United~ PGPGA G~
## 8 2013-F EXH 42.50 Klinker, Kelly Woodburn Indiana United~ Gary Gr~
## 9 2013-F DMG 120.00 Westcott, Steve Oswego New York United~ CNY Pum~
## 10 2013-F DMG 98.00 Leland, Neal Canby Oregon United~ Baumans~
## # ... with 1,167 more rows, and 6 more variables: seed_mother <chr>,
## # pollinator_father <chr>, ott <chr>, est_weight <chr>, pct_chart <chr>,
## # variety <lgl>
Data cleaning before visualization
- Separated the id column into the “year” and “type” of the pumpkins.
- Converted the weight_lbs column to numeric.
- Converted the year column to numeric.
- Converted weight_lbs to kilograms by dividing the pounds by 2.2046.
- Kept only the rows whose country is “United States”.
- Removed the rows whose place is a string code (“DMG” or “EXH”).
# Clean the raw data for plotting:
#   * split id (e.g. "2013-F") into year and type, keeping id itself;
#   * keep United States rows whose place is neither "DMG" nor "EXH";
#   * convert weight_lbs (commas removed) to kilograms;
#   * make year numeric and replace the type codes with readable labels;
#   * attach the mean weight of each year/type group to every row.
pumpkins <- pumpkins %>%
  separate(id, sep = "-", into = c("year", "type"), remove = FALSE) %>%
  # Filter first so that only the retained rows go through as.numeric().
  filter(
    country == "United States",
    # `place != c("DMG", "EXH")` recycles the two codes across the rows and
    # compares each row with only one of them; %in% checks both for every row.
    # The explicit is.na() test keeps the old behaviour of dropping rows
    # with a missing place.
    !is.na(place),
    !place %in% c("DMG", "EXH")
  ) %>%
  mutate(
    weight_kgs = as.numeric(gsub(",", "", weight_lbs)) / 2.2046,
    year = as.numeric(year),
    type = recode(
      type,
      "F" = "Field Pumpkin",
      "P" = "Giant Pumpkin",
      "S" = "Giant Squash",
      "W" = "Giant Watermelon",
      "L" = "Long Gourd",
      "T" = "Tomato"
    )
  ) %>%
  group_by(year, type) %>%
  mutate(avg_weight = mean(weight_kgs)) %>%
  # Do not leave the grouping attached: later distinct()/mutate()/select()
  # calls on this table are meant to operate on all rows at once.
  ungroup()
Setting the factor levels for the pumpkin types (based on the analysis below) so that the legend lists the types from the heaviest to the lightest in the plot.
# Turn type into a factor with an explicit level order; the legend follows it.
pumpkins[["type"]] <- factor(
  pumpkins[["type"]],
  levels = c(
    "Giant Pumpkin",
    "Giant Squash",
    "Giant Watermelon",
    "Long Gourd",
    "Field Pumpkin",
    "Tomato"
  )
)
Plotting the average weight of each type of pumpkin in the United States from 2013 to 2021.
# Line chart of the average weight (kg) of each type, one point per year.
p1 <- pumpkins %>%
  # avg_weight is identical on every row of a year/type group, so one row per
  # group draws the same chart without stacking duplicate points and segments.
  distinct(year, type, avg_weight) %>%
  # na.rm is not an aesthetic, so it does not belong in aes(); it is passed to
  # the geoms instead, where it silently drops missing values.
  ggplot(aes(x = as.factor(year), y = avg_weight, group = type, color = type)) +
  geom_line(size = 1.25, na.rm = TRUE) +
  geom_point(na.rm = TRUE) +
  theme_classic() +
  theme(
    plot.title = element_text(color = "black", size = 20, hjust = 0.5),
    plot.subtitle = element_text(color = "black", size = 14, hjust = 0.5),
    axis.title.x = element_text(size = 18, hjust = 0.5, color = "black"),
    axis.title.y = element_text(size = 18, hjust = 0.5, color = "black"),
    axis.text.y = element_text(size = 14, hjust = 1, color = "black"),
    axis.text.x = element_text(size = 14, hjust = 1, color = "black"),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15)
  ) +
  labs(
    x = "Year",
    y = "Weight in kgs",
    color = "Type",
    title = "Average weight of the pumpkins by type between the years of 2013 to 2021 in the US ",
    # Spelling fixed in the displayed text: "Pumkins" and "Gian Squash".
    subtitle = "Giant Pumpkins and Giant Squash are higher in weights than the Giant Watermelon, Long Gourd, Field Pumpkin and Tomato type of Pumpkins.\n Over the years of 2013-2021 there is more fluctuation in the weights of Giant Pumpkins and Giant Squash. \n All the other types have almost constant weights over the years of 2013-2021"
  ) +
  scale_y_continuous(breaks = seq(0, 500, by = 50))
p1

The highest average weight of the giant pumpkins in the US was around 410 kg. Next, I would like to find out which countries produce the largest pumpkins.
Data cleaning before visualization
- Grouped the data by city and sorted the counts in descending order.
- Created a region column by concatenating state_prov and country.
- Filtered the pumpkins heavier than 1000 kg and sorted them by weight in descending order.
# Number of entries per city, largest count first.
pumpkins %>%
  group_by(city) %>%
  count() %>%
  arrange(desc(n))
# Add a "state_prov, country" label (wrapped at 25 characters) to each row.
# NOTE(review): pumpkins was restricted to country == "United States" in the
# cleaning step, yet the chart text further down mentions Italy - confirm
# whether this section was meant to start again from the unfiltered data.
pumpkins <- pumpkins %>%
  # Drop any grouping still attached to the table so the steps that follow
  # (distinct(), fct_relevel() inside mutate(), select()) see all rows at once.
  ungroup() %>%
  unite("region", c(state_prov, country), sep = ", ", remove = FALSE) %>%
  mutate(
    # Same pounds-to-kilograms conversion as in the cleaning step.
    weight_kgs = as.numeric(gsub(",", "", weight_lbs)) / 2.2046,
    region = str_wrap(region, 25)
  )
# Regions with at least one entry above 1000 kg, ordered by their heaviest
# entry (descending); used as the factor level order for the chart.
region_code <- pumpkins %>%
  # distinct() always keeps grouping columns, so on a grouped table the same
  # region can come back several times; work on the ungrouped table instead.
  ungroup() %>%
  filter(weight_kgs > 1000) %>%
  arrange(desc(weight_kgs)) %>%
  distinct(region) %>%
  pull(region)
# Preview of the entries above 1000 kg, heaviest first.
pumpkins %>%
  # On a grouped table mutate() evaluates fct_relevel() once per group, so the
  # level order from region_code is not applied to the column as a whole, and
  # select() re-adds the grouping columns; ungroup to avoid both.
  ungroup() %>%
  filter(weight_kgs > 1000) %>%
  mutate(region = fct_relevel(region, region_code)) %>%
  select(region, weight_kgs, weight_lbs) %>%
  arrange(desc(weight_kgs))
Plotting the graph to understand which countries and regions produce the largest pumpkins, i.e. those weighing more than 1000 kg.
# Lollipop chart of the entries above 1000 kg, one row per region, with two
# hand-placed callouts.
# NOTE(review): the callout positions (x = 1 and x = 5) and the weights in the
# labels are hard-coded; confirm they still match the data being plotted.
p2 <- pumpkins %>%
  # mutate() runs per group on a grouped table, which would apply
  # fct_relevel() to each group separately; ungroup so the region_code order
  # is applied to the whole column.
  ungroup() %>%
  filter(weight_kgs > 1000) %>%
  mutate(region = fct_relevel(region, region_code)) %>%
  # na.rm is not a ggplot() argument that affects drawing, so it is omitted
  # here and kept only on the layer that accepts it.
  ggplot(aes(x = region, y = weight_kgs)) +
  # alpha is an opacity between 0 and 1; the previous value 3 is outside that
  # range, so fully opaque (1) is stated explicitly.
  geom_segment(
    aes(x = region, xend = region, y = 970, yend = weight_kgs),
    color = "skyblue", size = 1, alpha = 1, na.rm = TRUE
  ) +
  geom_point(aes(size = weight_kgs), color = "orange", alpha = 1) +
  theme(
    panel.background = element_rect(color = "white", fill = "white"),
    panel.grid.major.x = element_line(color = "grey"),
    panel.grid.major.y = element_line(color = "white"),
    axis.text.y = element_text(size = 14, hjust = 1, color = "black"),
    axis.text.x = element_text(size = 14, hjust = 1, color = "black"),
    plot.title = element_text(color = "black", size = 20, hjust = 0.5),
    plot.subtitle = element_text(color = "black", size = 14, hjust = 0.5),
    axis.title.x = element_text(size = 18, hjust = 0.5, color = "black"),
    axis.title.y = element_text(size = 18, hjust = 0.5, color = "black"),
    legend.text = element_text(size = 10),
    legend.title = element_text(size = 15)
  ) +
  scale_size_continuous(range = c(7, 14)) +
  coord_flip() +
  labs(
    x = "Region",
    y = "Weight in Kgs",
    title = "Pumpkins that weigh greater than 1000 kgs across countries ",
    subtitle = "The Largest Pumpkin as of 2021 is from Italy of 1226 Kg (2708.8 lbs) \n The largest pumpkin in the United States is from New Hampshire, weighing 1146 kg (2528 lbs) "
  ) +
  # annotate() draws each arrow once; a geom_curve() layer with constant aes()
  # values inherits the plot data and redraws the same curve for every row.
  annotate(
    "curve",
    x = 1, y = 1226, xend = 2.2, yend = 1230,
    arrow = arrow(length = unit(0.35, "cm")), color = "black", size = 0.9
  ) +
  annotate(
    "text",
    x = 3.25, y = 1226,
    label = "Largest Pumpkin \n is from Italy \n weighing 1226 kg",
    color = "black", size = 4
  ) +
  annotate(
    "curve",
    x = 5, y = 1147, xend = 6.2, yend = 1150,
    arrow = arrow(length = unit(0.35, "cm")), color = "black", size = 0.9
  ) +
  annotate(
    "text",
    x = 7, y = 1147,
    label = "Largest Pumpkin in the U.S \n weighing 1147 kg",
    color = "black", size = 4
  )
p2

Creating a dashboard with the help of Patchwork.
# Stack the two charts with patchwork's `/` operator: p1 on top, p2 below.
p1 / p2
