tg<- read_csv("CLEAN thanksgiving data.csv")
## Rows: 1058 Columns: 83
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (46): celebrate, main, cooked, stuffing, cranberry, gravy, brussel.sprou...
## dbl (37): id, celebrate01, gravy01, friendsgiving01, black.friday01, brussel...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(tg)
## # A tibble: 6 × 83
## id celebrate main cooked stuffing cranberry gravy brussel.sprouts carrots
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 4.34e9 Yes Turk… Baked Bread-b… None Yes <NA> Carrots
## 2 4.34e9 Yes Turk… Baked Bread-b… Other (p… Yes <NA> <NA>
## 3 4.34e9 Yes Turk… Roast… Rice-ba… Homemade Yes Brussel sprouts Carrots
## 4 4.34e9 Yes Turk… Baked Bread-b… Homemade Yes Brussel sprouts <NA>
## 5 4.34e9 Yes Tofu… Baked Bread-b… Canned Yes Brussel sprouts <NA>
## 6 4.34e9 Yes Turk… Roast… Rice-ba… Homemade Yes Brussel sprouts Carrots
## # … with 74 more variables: cauliflower <chr>, corn <chr>, cornbread <chr>,
## # fruit.salad <chr>, green.beans <chr>, mac.n.cheese <chr>,
## # mashed.potatoes <chr>, rolls <chr>, squash <chr>, salad <chr>,
## # yams.sweet.potato <chr>, apple.pie <chr>, buttermilk.pie <chr>,
## # cherry.pie <chr>, chocolate.pie <chr>, coconut.pie <chr>,
## # keylime.pie <chr>, peach.pie <chr>, pecan.pie <chr>, pumpkin.pie <chr>,
## # sweet.potato.pie <chr>, apple.cobbler <chr>, blondies <chr>, …
According to the variable breakdown (https://github.com/fivethirtyeight/data/tree/master/thanksgiving-2015), the side dish options are brussel sprouts, carrots, cauliflower, corn, cornbread, fruit salad, green beans/green bean casserole, macaroni and cheese, mashed potatoes, rolls/biscuits, vegetable salad, yams/sweet potato casserole, and a category for “other”. We include these variables along with squash (from the “other” category presumably) in our attempt to recreate FiveThirtyEight’s side dish graphic.
#Number of households in each region
#Number of households who serve the side menu items, by region
tg%>%
group_by(DivName)%>%
summarize(n=n())
## # A tibble: 10 × 2
## DivName n
## <chr> <int>
## 1 East North Central 150
## 2 East South Central 60
## 3 Middle Atlantic 159
## 4 Mountain 47
## 5 New England 58
## 6 Pacific 146
## 7 South Atlantic 214
## 8 West North Central 74
## 9 West South Central 91
## 10 <NA> 59
#Number of households who serve the side menu items, by region
tg%>%
group_by(DivName)%>%
summarize(sum(brussel.sprouts01), sum(carrots01), sum(cauliflower01),
sum(corn01), sum(cornbread01), sum(fruit.salad01),
sum(green.beans01), sum(mac.n.cheese01), sum(mashed.potatoes01),
sum(rolls01), sum(squash01), sum(salad01), sum(yams.sweet.potato01))
## # A tibble: 10 × 14
## DivName `sum(brussel.spro… `sum(carrots01)` `sum(cauliflowe… `sum(corn01)`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 East Nort… 22 28 13 76
## 2 East Sout… 7 15 5 31
## 3 Middle At… 41 45 25 77
## 4 Mountain 6 11 4 17
## 5 New Engla… 12 25 4 22
## 6 Pacific 29 30 18 55
## 7 South Atl… 27 48 11 96
## 8 West Nort… 3 12 3 36
## 9 West Sout… 4 21 3 46
## 10 <NA> 4 7 2 8
## # … with 9 more variables: sum(cornbread01) <dbl>, sum(fruit.salad01) <dbl>,
## # sum(green.beans01) <dbl>, sum(mac.n.cheese01) <dbl>,
## # sum(mashed.potatoes01) <dbl>, sum(rolls01) <dbl>, sum(squash01) <dbl>,
## # sum(salad01) <dbl>, sum(yams.sweet.potato01) <dbl>
tgsum<- tg%>%
group_by(DivName)%>%
summarize(n=n(), sumBrussel=sum(brussel.sprouts01), sumCarrots=sum(carrots01), sumCauliflower=sum(cauliflower01),
sumCorn=sum(corn01), sumCornbread=sum(cornbread01), sumFruitSalad=sum(fruit.salad01),
sumGreenBeans=sum(green.beans01), sumMac=sum(mac.n.cheese01), sumMashedPotatoes=sum(mashed.potatoes01),
sumRolls=sum(rolls01), sumSquash=sum(squash01), sumSalad=sum(salad01), sumYams=sum(yams.sweet.potato01))
tgsum%>%
mutate(propBrussel=sumBrussel/n, propCarrots=sumCarrots/n,
propCauliflower=sumCauliflower/n, propCorn=sumCorn/n,
propCornbread=sumCornbread/n, propFruitSalad=sumFruitSalad/n,
propGreenBeans=sumGreenBeans/n, propMac=sumMac/n,
propMashedPotatoes=sumMashedPotatoes/n, propRolls=sumRolls/n,
propSquash = sumSquash/n, propSalad=sumSalad/n,
propYams = sumYams/n)
## # A tibble: 10 × 28
## DivName n sumBrussel sumCarrots sumCauliflower sumCorn sumCornbread
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 East North C… 150 22 28 13 76 23
## 2 East South C… 60 7 15 5 31 16
## 3 Middle Atlan… 159 41 45 25 77 33
## 4 Mountain 47 6 11 4 17 10
## 5 New England 58 12 25 4 22 10
## 6 Pacific 146 29 30 18 55 37
## 7 South Atlant… 214 27 48 11 96 53
## 8 West North C… 74 3 12 3 36 12
## 9 West South C… 91 4 21 3 46 34
## 10 <NA> 59 4 7 2 8 7
## # … with 21 more variables: sumFruitSalad <dbl>, sumGreenBeans <dbl>,
## # sumMac <dbl>, sumMashedPotatoes <dbl>, sumRolls <dbl>, sumSquash <dbl>,
## # sumSalad <dbl>, sumYams <dbl>, propBrussel <dbl>, propCarrots <dbl>,
## # propCauliflower <dbl>, propCorn <dbl>, propCornbread <dbl>,
## # propFruitSalad <dbl>, propGreenBeans <dbl>, propMac <dbl>,
## # propMashedPotatoes <dbl>, propRolls <dbl>, propSquash <dbl>,
## # propSalad <dbl>, propYams <dbl>
tgsum<- tgsum%>%
mutate(propBrussel=sumBrussel/n, propCarrots=sumCarrots/n,
propCauliflower=sumCauliflower/n, propCorn=sumCorn/n,
propCornbread=sumCornbread/n, propFruitSalad=sumFruitSalad/n,
propGreenBeans=sumGreenBeans/n, propMac=sumMac/n,
propMashedPotatoes=sumMashedPotatoes/n, propRolls=sumRolls/n,
propSquash = sumSquash/n, propSalad=sumSalad/n,
propYams = sumYams/n)
#Loading data for national population values
popDiv<-data.frame(DivName=c("East North Central",
"East South Central",
"Middle Atlantic",
"Mountain",
"New England",
"Pacific",
"South Atlantic",
"West North Central",
"West South Central"),
pop=c(46798649,
18931477,
41601787,
23811346,
14757573,
52833604,
63991523,
21179519,
39500457))%>%
mutate(popProp=pop/323405935)
popDiv
## DivName pop popProp
## 1 East North Central 46798649 0.14470560
## 2 East South Central 18931477 0.05853782
## 3 Middle Atlantic 41601787 0.12863644
## 4 Mountain 23811346 0.07362681
## 5 New England 14757573 0.04563173
## 6 Pacific 52833604 0.16336622
## 7 South Atlantic 63991523 0.19786750
## 8 West North Central 21179519 0.06548896
## 9 West South Central 39500457 0.12213894
#Join national population values with summarized data set
tgpop<- left_join(tgsum, popDiv)
## Joining, by = "DivName"
tgpop
## # A tibble: 10 × 30
## DivName n sumBrussel sumCarrots sumCauliflower sumCorn sumCornbread
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 East North C… 150 22 28 13 76 23
## 2 East South C… 60 7 15 5 31 16
## 3 Middle Atlan… 159 41 45 25 77 33
## 4 Mountain 47 6 11 4 17 10
## 5 New England 58 12 25 4 22 10
## 6 Pacific 146 29 30 18 55 37
## 7 South Atlant… 214 27 48 11 96 53
## 8 West North C… 74 3 12 3 36 12
## 9 West South C… 91 4 21 3 46 34
## 10 <NA> 59 4 7 2 8 7
## # … with 23 more variables: sumFruitSalad <dbl>, sumGreenBeans <dbl>,
## # sumMac <dbl>, sumMashedPotatoes <dbl>, sumRolls <dbl>, sumSquash <dbl>,
## # sumSalad <dbl>, sumYams <dbl>, propBrussel <dbl>, propCarrots <dbl>,
## # propCauliflower <dbl>, propCorn <dbl>, propCornbread <dbl>,
## # propFruitSalad <dbl>, propGreenBeans <dbl>, propMac <dbl>,
## # propMashedPotatoes <dbl>, propRolls <dbl>, propSquash <dbl>,
## # propSalad <dbl>, propYams <dbl>, pop <dbl>, popProp <dbl>
#Weight the proportions
tgpop<- tgpop%>%
mutate(wPropBrussel=propBrussel*popProp, wPropCarrots=propCarrots*popProp,
wPropCauliflower=propCauliflower*popProp, wPropCorn=propCorn*popProp,
wPropCornbread=propCornbread*popProp, wPropFruitSalad=propFruitSalad*popProp,
wPropGreenBeans=propGreenBeans*popProp, wPropMac=propMac*popProp,
wPropMashedPotatoes=propMashedPotatoes*popProp, wPropRolls=propRolls*popProp,
wPropSquash=propSquash*popProp, wPropSalad=propSalad*popProp,
wPropYams=propYams*popProp)
#Compute national level values
natPropBrussel<- sum(tgpop$wPropBrussel, na.rm=TRUE)
natPropBrussel
## [1] 0.1455013
natPropCarrots<- sum(tgpop$wPropCarrots, na.rm=TRUE)
natPropCarrots
## [1] 0.231709
natPropCauliflower<- sum(tgpop$wPropCauliflower, na.rm=TRUE)
natPropCauliflower
## [1] 0.0840516
natPropCorn<- sum(tgpop$wPropCorn, na.rm=TRUE)
natPropCorn
## [1] 0.4537023
natPropCornbread<- sum(tgpop$wPropCornbread, na.rm=TRUE)
natPropCornbread
## [1] 0.234689
natPropFruitSalad<- sum(tgpop$wPropFruitSalad, na.rm=TRUE)
natPropFruitSalad
## [1] 0.2124256
natPropGreenBeans<- sum(tgpop$wPropGreenBeans, na.rm=TRUE)
natPropGreenBeans
## [1] 0.6704939
natPropMac<- sum(tgpop$wPropMac, na.rm=TRUE)
natPropMac
## [1] 0.1969969
natPropMashedPotatoes<- sum(tgpop$wPropMashedPotatoes, na.rm=TRUE)
natPropMashedPotatoes
## [1] 0.7939981
natPropRolls<- sum(tgpop$wPropRolls, na.rm=TRUE)
natPropRolls
## [1] 0.745888
natPropSquash<- sum(tgpop$wPropSquash, na.rm=TRUE)
natPropSquash
## [1] 0.1541902
natPropSalad<- sum(tgpop$wPropSalad, na.rm=TRUE)
natPropSalad
## [1] 0.2001446
natPropYams<- sum(tgpop$wPropYams, na.rm=TRUE)
natPropYams
## [1] 0.619342
Now that we have the national proportions we can see which regions are disproportionately into certain side dishes!!!
To find difference, for each region we want to subtract the national proportion from the regional proportion
tgdiff<-tgpop%>%
filter(!is.na(DivName))%>%
mutate(diffBrussel = propBrussel - natPropBrussel,
diffCarrots = propCarrots - natPropCarrots,
diffCauliflower = propCauliflower - natPropCauliflower,
diffCorn = propCorn - natPropCorn,
diffCornbread = propCornbread - natPropCornbread,
diffFruitSalad = propFruitSalad - natPropFruitSalad,
diffGreenBeans = propGreenBeans - natPropGreenBeans,
diffMac = propMac - natPropMac,
diffMashedPotatoes = propMashedPotatoes - natPropMashedPotatoes,
diffRolls = propRolls - natPropRolls,
diffSquash = propSquash - natPropSquash,
diffSalad = propSalad - natPropSalad,
diffYams = propYams - natPropYams)%>%
select(DivName, diffBrussel, diffCarrots, diffCauliflower, diffCorn,
diffCornbread, diffFruitSalad, diffGreenBeans, diffMac,
diffMashedPotatoes, diffRolls, diffSquash, diffSalad,
diffYams)
tgdiff
## # A tibble: 9 × 14
## DivName diffBrussel diffCarrots diffCauliflower diffCorn diffCornbread
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 East North Cen… 0.00117 -0.0450 0.00262 0.0530 -0.0814
## 2 East South Cen… -0.0288 0.0183 -0.000718 0.0630 0.0320
## 3 Middle Atlantic 0.112 0.0513 0.0732 0.0306 -0.0271
## 4 Mountain -0.0178 0.00233 0.00105 -0.0920 -0.0219
## 5 New England 0.0614 0.199 -0.0151 -0.0744 -0.0623
## 6 Pacific 0.0531 -0.0262 0.0392 -0.0770 0.0187
## 7 South Atlantic -0.0193 -0.00741 -0.0326 -0.00510 0.0130
## 8 West North Cen… -0.105 -0.0695 -0.0435 0.0328 -0.0725
## 9 West South Cen… -0.102 -0.000940 -0.0511 0.0518 0.139
## # … with 8 more variables: diffFruitSalad <dbl>, diffGreenBeans <dbl>,
## # diffMac <dbl>, diffMashedPotatoes <dbl>, diffRolls <dbl>, diffSquash <dbl>,
## # diffSalad <dbl>, diffYams <dbl>
I’ll try to return to this if I can do use R to find the greatest spreads but it’s giving me a huge headache (trying to use gather/spread functions to calculate the maximum across rows instead of doswn columns but it’s not giving me what I want) so I’m gonna do it the gold old-fashioned way and look at the table.
We observe the following side dishes are the most disproportionately popular for each region: East North Central: ROLLS! (At 0.074) East South Central: MAC N CHEESE! (At 0.153) Middle Atlantic: SQUASH! (At 0.1225) Mountain: SALAD! (At 0.055) New England: SQUASH! (At 0.380) Pacific: SALAD! (At 0.067) South Atlantic: MAC N CHEESE! (At 0.172) West North Central: GREEN BEANS! (At 0.140) West South Central: CORNBREAD! (At 0.139)
sides<- c("Rolls","Mac N Cheese","Squash","Salad","Squash","Salad","Mac N Cheese","Green Beans","Cornbread")
favorites<- tgdiff%>%
select(DivName)%>%
mutate(favSides=sides, DivName=paste(DivName, " Division", sep=""))
favorites
## # A tibble: 9 × 2
## DivName favSides
## <chr> <chr>
## 1 East North Central Division Rolls
## 2 East South Central Division Mac N Cheese
## 3 Middle Atlantic Division Squash
## 4 Mountain Division Salad
## 5 New England Division Squash
## 6 Pacific Division Salad
## 7 South Atlantic Division Mac N Cheese
## 8 West North Central Division Green Beans
## 9 West South Central Division Cornbread
#usmap library called earlier with other libraries
states <- usmap::us_map()
head(states)
## x y order hole piece group fips abbr full
## 1 1091779 -1380695 1 FALSE 1 01.1 01 AL Alabama
## 2 1091268 -1376372 2 FALSE 1 01.1 01 AL Alabama
## 3 1091140 -1362998 3 FALSE 1 01.1 01 AL Alabama
## 4 1090940 -1343517 4 FALSE 1 01.1 01 AL Alabama
## 5 1090913 -1341006 5 FALSE 1 01.1 01 AL Alabama
## 6 1090796 -1334480 6 FALSE 1 01.1 01 AL Alabama
fips<-read.csv("https://raw.githubusercontent.com/kitadasmalley/FA2020_DataViz/main/data/stateFIPS.csv",
header=TRUE)
geoPie<-fips%>%
left_join(favorites)
## Joining, by = "DivName"
head(geoPie)
## Name State Region RegionName Division DivName
## 1 Connecticut 9 1 Northeast Region 1 New England Division
## 2 Maine 23 1 Northeast Region 1 New England Division
## 3 Massachusetts 25 1 Northeast Region 1 New England Division
## 4 New Hampshire 33 1 Northeast Region 1 New England Division
## 5 Rhode Island 44 1 Northeast Region 1 New England Division
## 6 Vermont 50 1 Northeast Region 1 New England Division
## favSides
## 1 Squash
## 2 Squash
## 3 Squash
## 4 Squash
## 5 Squash
## 6 Squash
foodStates<-states %>%
mutate(Name=full)%>%
left_join(geoPie)
## Joining, by = "Name"
head(foodStates)
## x y order hole piece group fips abbr full Name State
## 1 1091779 -1380695 1 FALSE 1 01.1 01 AL Alabama Alabama 1
## 2 1091268 -1376372 2 FALSE 1 01.1 01 AL Alabama Alabama 1
## 3 1091140 -1362998 3 FALSE 1 01.1 01 AL Alabama Alabama 1
## 4 1090940 -1343517 4 FALSE 1 01.1 01 AL Alabama Alabama 1
## 5 1090913 -1341006 5 FALSE 1 01.1 01 AL Alabama Alabama 1
## 6 1090796 -1334480 6 FALSE 1 01.1 01 AL Alabama Alabama 1
## Region RegionName Division DivName favSides
## 1 3 South Region 6 East South Central Division Mac N Cheese
## 2 3 South Region 6 East South Central Division Mac N Cheese
## 3 3 South Region 6 East South Central Division Mac N Cheese
## 4 3 South Region 6 East South Central Division Mac N Cheese
## 5 3 South Region 6 East South Central Division Mac N Cheese
## 6 3 South Region 6 East South Central Division Mac N Cheese
Use geom_poly in ggplot
ggplot(foodStates, aes(x=x,y=y, group=group))+
geom_polygon()
Bonus points for creating my own palette!
Brainstorming colors: This graph features a qualitative palette varying primarily in hue (as opposed to luminance or chroma) since these side dishes are unordered, as far as this graph is concerned (while some are MORE disproportionately popular than others, that is not the point of this graph). This website https://color.adobe.com/create/image will even allow me to identify the colors of the original graph (or close enough at least).
ggplot(foodStates, aes(x=x,y=y, group=group))+
geom_polygon(aes(fill=favSides), color='grey')+
scale_fill_manual(values=c("#F29979","#F2CA50","#8FAFD9","#BCD97E","#5ABFB5","#E39FC5"))
#ggsave("firstgraphic.pdf",height=5,width=7)
What polishing needs to be done?
Add graph title and subtitle
Add caption with source
Add labels to fills (matching in color)
Generally, let’s try applying the FiveThirtyEight theme
remove legend and axis grid/text
ggplot(foodStates, aes(x=x,y=y, group=group))+
geom_polygon(aes(fill=favSides), color='grey')+
scale_fill_manual(values=c("#F29979","#F2CA50","#8FAFD9","#BCD97E","#5ABFB5","#E39FC5"))+
theme_fivethirtyeight()+
labs(title="Side Dishes of America's Regions",
subtitle = "Most disproportionately common Thanksgiving side dish by region",
caption = "SOURCE: SURVEYMONKEY AUDIENCE, VIA FiveThirtyEight")+
theme(panel.grid.major = element_blank(),
axis.text=element_blank(),
legend.position='none')+
coord_fixed()+
annotate("text", x=-2.1e+06, y=.5e+06, label="SALAD",color='#5ABFB5', size=3, fontface='bold')+
annotate("text", x=0,y=.7e+06, label="GREEN BEANS/ \n CASSEROLE", color='#F2CA50', size=3, fontface='bold')+
annotate("text", x=1.3e+06,y=.5e+06, label="ROLLS/ \n BISCUITS", color='#BCD97E', size=3, fontface='bold')+
annotate("text", x=2e+06,y=.8e+06, label="SQUASH", color='#E39FC5', size=3, fontface='bold')+
annotate("text", x=.8e+06,y=-1.9e+06, label="CORNBREAD", color='#F29979', size=3, fontface='bold')+
annotate("text", x=2.2e+06,y=-1.4e+06, label="MAC & \nCHEESE", color='#8FAFD9', size=3, fontface='bold')
For final polishing I want to make it as close as I am capable of! Going to try to add additional annotations and mess with text size, etc.
ggplot(foodStates, aes(x=x,y=y, group=group))+
geom_polygon(aes(fill=favSides), color="#636E72", size=.2)+
scale_fill_manual(values=c("#F29979","#F2CA50","#8FAFD9","#BCD97E","#5ABFB5","#E39FC5"))+
theme_fivethirtyeight()+
labs(title="Side Dishes of America's Regions",
subtitle = "Most disproportionately common Thanksgiving side dish by region",
caption = "SOURCE: SURVEYMONKEY AUDIENCE, VIA FiveThirtyEight")+
theme(panel.grid.major = element_blank(),
axis.text=element_blank(),
legend.position='none',
plot.caption=element_text(size=6, face='bold', color='grey'),
plot.title = element_text(size=15),
plot.subtitle = element_text(size=10.5))+
coord_fixed()+
annotate("text", x=-2.1e+06, y=.5e+06, label="SALAD",color='#5ABFB5', size=3, fontface='bold')+
annotate("text", x=0,y=.7e+06, label="GREEN BEANS/ \n CASSEROLE", color='#F2CA50', size=3, fontface='bold')+
annotate("text", x=1.3e+06,y=.5e+06, label="ROLLS/ \n BISCUITS", color='#BCD97E', size=3, fontface='bold')+
annotate("text", x=2e+06,y=.8e+06, label="SQUASH", color='#E39FC5', size=3, fontface='bold')+
annotate("text", x=.8e+06,y=-1.9e+06, label="CORNBREAD", color='#F29979', size=3, fontface='bold')+
annotate("text", x=2.2e+06,y=-1.4e+06, label="MAC & \nCHEESE", color='#8FAFD9', size=3, fontface='bold')+
geom_hline(yintercept=1.1e+06, color='grey')+
geom_hline(yintercept=1.4e+06, color='grey')+
annotate("text", x=-1.8e+06, y=1.3e+06, label="SURVERY DATES", size=2, fontface='bold', color="#636E72")+
annotate("text", x=-1.9e+06, y=1.2e+06, label="11/17/2015", size=2, color="#636E72")+
annotate("text", x=1e+06, y=1.3e+06, label="NO. OF RESPONDENTS", size=2, fontface='bold', color="#636E72")+
annotate("text", x=0.6e+06, y=1.2e+06, label="931", size=2, color="#636E72")
#ggsave("finalgraphic.pdf",height=5,width=5)
Note: submit pdfs of first graphic AND final graphic along with notes and code in this RMarkdown.