LAB 5

Author

Kate

# A large scipen penalty makes R prefer fixed notation over scientific
# notation when printing numbers.
options(scipen=999) 
library(tidyverse)
library(socviz)
library(datasetsICR)

READ THIS DISCLAIMER: THIS QUARTO FILE OFFERS A BASIC OUTLINE FOR THE REPORT AND EXAMPLES FOR SOME OF THE CODE.
THIS FILE IS NOT COMPLETE AND NOT INTENDED FOR YOU TO SIMPLY REPLICATE.
YOU WILL NEED TO WORK THROUGH ALL REQUIREMENTS ON YOUR OWN, INCLUDING INTERPRETATIONS, AESTHETICS, LABELS, COLORS, TITLES, SUBTITLES, ETC.
BE CREATIVE!


PART 1: PRACTICE USING PIPES (dplyr) TO SUMMARIZE DATA: TWO CATEGORICAL VARIABLES

# Load the FIFA data, then tabulate preferred foot within each of five
# nationalities: N players, their share of the nationality, and that share as
# a whole-number percent.
data(FIFA)
pip1 <- FIFA %>%
  # Subsetting first does not change the result: the shares below are computed
  # within each nationality, so other countries never enter a denominator.
  filter(
    Nationality %in% c("Spain", "Brazil", "France", "Italy", "Argentina")
  ) %>%
  count(Nationality, Preferred.Foot, name = "N") %>%
  group_by(Nationality) %>%
  mutate(
    freq = N / sum(N),
    pct = round(100 * freq)
  )
pip1
# A tibble: 15 × 5
# Groups:   Nationality [5]
   Nationality Preferred.Foot     N     freq   pct
   <fct>       <fct>          <int>    <dbl> <dbl>
 1 Argentina   ""                 1 0.00107      0
 2 Argentina   "Left"           214 0.228       23
 3 Argentina   "Right"          722 0.771       77
 4 Brazil      ""                 2 0.00242      0
 5 Brazil      "Left"           245 0.296       30
 6 Brazil      "Right"          580 0.701       70
 7 France      ""                 3 0.00328      0
 8 France      "Left"           236 0.258       26
 9 France      "Right"          675 0.739       74
10 Italy       ""                 3 0.00427      0
11 Italy       "Left"           189 0.269       27
12 Italy       "Right"          510 0.726       73
13 Spain       ""                 1 0.000933     0
14 Spain       "Left"           298 0.278       28
15 Spain       "Right"          773 0.721       72

PART 2: CREATE STACKED AND DODGED BAR CHARTS FROM 2 CATEGORICAL VARIABLES

# Title and caption shared by every chart in Part 2.
p_title <- "Preferred Foot by Nationality"
p_caption <- "FIFA dataset"


# AS STACKED BAR CHART
# Base plot reused by the Part 2 charts. Players with no recorded foot are
# stored as an empty string, not NA, so is.na() alone does not remove them.
# Dropping "" keeps a blank legend key and stray "0" labels off the charts.
p <- ggplot(
  data = filter(
    pip1,
    !is.na(Nationality),
    !is.na(Preferred.Foot),
    Preferred.Foot != ""
  ),
  mapping = aes(x = Nationality, y = pct, fill = Preferred.Foot)
)

# One bar per nationality, split by foot, with each segment's percent printed
# at its vertical midpoint.
p + geom_col(position = "stack") +
  geom_text(aes(label = pct), position = position_stack(vjust = .5)) +
  labs(
    x = "Nationality", y = "Percent", fill = "Preferred Foot",
    title = p_title, caption = p_caption,
    subtitle = "As a stacked bar chart"
  )

# AS DODGED BAR CHART
# Same data as the stacked chart, with the feet drawn side by side within each
# nationality and the percent printed at the top of each bar.
p +
  geom_col(position = "dodge2") +
  geom_text(
    mapping = aes(label = pct),
    position = position_dodge(width = .9)
  ) +
  labs(
    title = p_title,
    subtitle = "As a dodged bar chart",
    caption = p_caption,
    x = "Nationality",
    y = "Percent",
    fill = "Preferred Foot"
  )

# AS FACETED HORIZONTAL BAR CHART
# One panel per nationality, one horizontal bar per foot. The foot is remapped
# onto x for this chart only: each panel already shows a single nationality,
# and with the fill legend hidden the axis labels are what identify each bar.
p + aes(x = Preferred.Foot) +
  geom_col() +
  # Only one bar sits at each axis position, so stacking just centres the
  # label inside its own bar.
  geom_text(aes(label = pct), position = position_stack(vjust = .5)) +
  coord_flip() +
  facet_grid(~ Nationality) +
  guides(fill = "none") +
  labs(
    x = NULL, y = "Percent", fill = "Preferred Foot",
    title = p_title, caption = p_caption,
    subtitle = "As a faceted horizontal bar chart"
  )

PART 3: PRACTICE USING PIPES (dplyr) TO SUMMARIZE DATA: TWO CONTINUOUS & ONE CATEGORICAL VARIABLE

# Per-nationality player count, mean age and mean weight, plus each
# nationality's share of all players, kept for five selected nationalities.
pip2 <- FIFA %>%
  group_by(Nationality) %>%
  summarise(
    N = n(),
    age_mean = mean(Age, na.rm = TRUE),
    weight_mean = mean(Weight, na.rm = TRUE)
  ) %>%
  # The shares are computed before the subset below, so the denominator is
  # every player in the data rather than just the five nationalities kept.
  mutate(
    freq = N / sum(N),
    pct = round(100 * freq)
  ) %>%
  filter(
    Nationality %in% c("Spain", "Brazil", "France", "Italy", "Argentina")
  )

pip2
# A tibble: 5 × 6
  Nationality     N age_mean weight_mean   freq   pct
  <fct>       <int>    <dbl>       <dbl>  <dbl> <dbl>
1 Argentina     937     26.2        75.1 0.0515     5
2 Brazil        827     27.6        76.0 0.0454     5
3 France        914     24.6        75.6 0.0502     5
4 Italy         702     25.9        76.0 0.0386     4
5 Spain        1072     25.3        74.0 0.0589     6

PART 4: SCATTERPLOT WITH A THIRD CATEGORICAL VARIABLE

# Mean age against mean weight, one large point per nationality, coloured by
# nationality, with a free-text note anchored at (75, 25).
p <- ggplot(
  data = pip2,
  mapping = aes(x = weight_mean, y = age_mean, color = Nationality)
)
p +
  geom_point(size = 5) +
  annotate(
    geom = "text",
    x = 75,
    y = 25,
    hjust = 0, # left-align so the text starts at x rather than centring on it
    label = "France has the lowest average age."
  ) +
  labs(
    title = "Age and Weight by Nationality",
    subtitle = "Spain has the lowest average weight.",
    caption = "FIFA dataset{datasetsICR}",
    x = "Average Weight",
    y = "Average Age"
  )

PART 5: LEGEND AND GUIDES

# Same scatterplot as Part 4, now with a restyled legend title and the legend
# positioned by coordinates instead of a named side.
p <- ggplot(
  data = pip2,
  mapping = aes(x = weight_mean, y = age_mean, color = Nationality)
)
p +
  geom_point(size = 5) +
  annotate(
    geom = "text",
    x = 74,
    y = 25,
    hjust = 0, # left-align so the text starts at x rather than centring on it
    label = "Spain has the lowest average weight."
  ) +
  labs(
    title = "Age and Weight by Nationality",
    subtitle = "Brazil has the highest average weight and age.",
    caption = "FIFA dataset{datasetsICR}",
    x = "Average Weight",
    y = "Average Age",
    color = "Nationality"
  ) +
  theme(
    legend.title = element_text(color = "gray50", size = 14, face = "bold"),
    # NOTE(review): ggplot2 3.5.0 deprecates a numeric legend.position in
    # favour of legend.position.inside; confirm the installed version before
    # changing this.
    legend.position = c(x = .2, y = .7)
  )

PART 6: DATA LABELS VS LEGEND

# Same scatterplot again, with each point labelled by nationality and the
# legend removed because the labels make it redundant.
p <- ggplot(
  data = pip2,
  mapping = aes(x = weight_mean, y = age_mean, color = Nationality)
)
p +
  geom_point(size = 5) +
  # hjust > 1 pushes each label to the left of its point.
  geom_text(mapping = aes(label = Nationality), hjust = 1.5, size = 3) +
  annotate(
    geom = "text",
    x = 73,
    y = 25,
    hjust = 0, # left-align so the text starts at x rather than centring on it
    label = "Spain has the lowest average weight and the second lowest average age."
  ) +
  labs(
    title = "Age and Weight by Nationality",
    subtitle = "Spain has the lowest average weight.",
    caption = "FIFA dataset{datasetsICR}",
    x = "Average Weight",
    y = "Average Age",
    color = "Nationality"
  ) +
  theme(legend.position = "none")

PART 7: INTERPRETATION

The graphs show that, among the five nationalities compared, FIFA players from Brazil have the highest average age and the highest average weight. Spain has the lowest average weight and the second lowest average age. France has the lowest average age and the third highest average weight. Argentina has the second highest average age and the second lowest average weight. Italy has the second highest average weight and the third lowest average age. The differences are small: average weight varies by only about 2 kilograms (roughly 74 to 76) and average age by about 3 years (roughly 24.6 to 27.6).

END