Manipulate Data

Author

Jason Pemberton

1 dplyr

Code
starwars %>% 
  select(name, species, contains("color")) %>% 
  head(10)
# A tibble: 10 × 5
   name               species hair_color    skin_color  eye_color
   <chr>              <chr>   <chr>         <chr>       <chr>    
 1 Luke Skywalker     Human   blond         fair        blue     
 2 C-3PO              Droid   <NA>          gold        yellow   
 3 R2-D2              Droid   <NA>          white, blue red      
 4 Darth Vader        Human   none          white       yellow   
 5 Leia Organa        Human   brown         light       brown    
 6 Owen Lars          Human   brown, grey   light       blue     
 7 Beru Whitesun Lars Human   brown         light       blue     
 8 R5-D4              Droid   <NA>          white, red  red      
 9 Biggs Darklighter  Human   black         light       brown    
10 Obi-Wan Kenobi     Human   auburn, white fair        blue-gray
Code
starwars %>% 
  select(name, height, species, eye_color) %>% 
  filter(species == "Human" &
           height < 200 &
           eye_color %in% c("blue", "brown")) %>% 
  head(10)
# A tibble: 10 × 4
   name               height species eye_color
   <chr>               <int> <chr>   <chr>    
 1 Luke Skywalker        172 Human   blue     
 2 Leia Organa           150 Human   brown    
 3 Owen Lars             178 Human   blue     
 4 Beru Whitesun Lars    165 Human   blue     
 5 Biggs Darklighter     183 Human   brown    
 6 Anakin Skywalker      188 Human   blue     
 7 Wilhuff Tarkin        180 Human   blue     
 8 Han Solo              180 Human   brown    
 9 Boba Fett             183 Human   brown    
10 Lando Calrissian      177 Human   brown    
Code
starwars %>% 
  select(name, height, mass, species) %>%
  mutate(height = height/100) %>% 
  head(10)
# A tibble: 10 × 4
   name               height  mass species
   <chr>               <dbl> <dbl> <chr>  
 1 Luke Skywalker       1.72    77 Human  
 2 C-3PO                1.67    75 Droid  
 3 R2-D2                0.96    32 Droid  
 4 Darth Vader          2.02   136 Human  
 5 Leia Organa          1.5     49 Human  
 6 Owen Lars            1.78   120 Human  
 7 Beru Whitesun Lars   1.65    75 Human  
 8 R5-D4                0.97    32 Droid  
 9 Biggs Darklighter    1.83    84 Human  
10 Obi-Wan Kenobi       1.82    77 Human  
Code
msleep %>% 
  select(genus, order, sleep_total) %>% 
  arrange(sleep_total) %>% 
  head(10)
# A tibble: 10 × 3
   genus         order          sleep_total
   <chr>         <chr>                <dbl>
 1 Giraffa       Artiodactyla           1.9
 2 Globicephalus Cetacea                2.7
 3 Equus         Perissodactyla         2.9
 4 Capreolus     Artiodactyla           3  
 5 Equus         Perissodactyla         3.1
 6 Loxodonta     Proboscidea            3.3
 7 Phoca         Carnivora              3.5
 8 Ovis          Artiodactyla           3.8
 9 Elephas       Proboscidea            3.9
10 Bos           Artiodactyla           4  
Code
starwars %>% 
  select(name, contains("color"), species) %>% 
  mutate(species = recode(species, "Droid" = "Robot")) %>% 
  head(10)
# A tibble: 10 × 5
   name               hair_color    skin_color  eye_color species
   <chr>              <chr>         <chr>       <chr>     <chr>  
 1 Luke Skywalker     blond         fair        blue      Human  
 2 C-3PO              <NA>          gold        yellow    Robot  
 3 R2-D2              <NA>          white, blue red       Robot  
 4 Darth Vader        none          white       yellow    Human  
 5 Leia Organa        brown         light       brown     Human  
 6 Owen Lars          brown, grey   light       blue      Human  
 7 Beru Whitesun Lars brown         light       blue      Human  
 8 R5-D4              <NA>          white, red  red       Robot  
 9 Biggs Darklighter  black         light       brown     Human  
10 Obi-Wan Kenobi     auburn, white fair        blue-gray Human  
Code
starwars %>% 
  select(sex, height, mass) %>% 
  filter(sex == "male" | sex == "female") %>% 
  mutate(height = height/100) %>% 
  drop_na() %>% 
  group_by(sex) %>% 
  summarise(`Average Height` = (mean(height)),
            `Average Mass` = (mean(mass)))
# A tibble: 2 × 3
  sex    `Average Height` `Average Mass`
  <chr>             <dbl>          <dbl>
1 female             1.72           54.7
2 male               1.78           80.2

2 ggplot2

Code
penguins %>% 
  select(flipper_length_mm, body_mass_g, species) %>% 
  ggplot(aes(flipper_length_mm,
             body_mass_g,
             color = species)) +
  geom_point(size = 3,
             alpha = 0.5) +
  labs(
    title = "Flipper Lingth vs Body Mass by Species",
    x = "Flipper Length (mm)",
    y = "Body Mass (g)"
  ) +
  theme_minimal()

Code
penguins %>% 
  select(bill_length_mm, species) %>% 
  ggplot(aes(
    bill_length_mm,
    species,
    fill = species)) +
  geom_boxplot()+
  coord_flip()+
  labs(
    title = "Bill Length Distribution by Species",
    x = "Species",
    y = "Bill Length (mm)"
  ) +
  theme_minimal()

Code
penguins %>% 
  ggplot(aes(species,
             body_mass_g,
             fill = species)) +
  geom_bar(stat = "summary",
           fun = "mean",
           alpha = 0.5) +
  labs(
    title = "Average Body Mass of Penguin Species",
    x = "Species",
    y = "Average Body Mass (g)"
  ) +
  theme_minimal()

Code
# Calculate overall mean weight for all chickens
global_mean <- mean(chickwts$weight)

# Summarize data to get median weight per feed type
chickwts_summary <- chickwts %>%
  group_by(feed) %>%
  summarize(median_weight = median(weight)) %>%
  arrange(median_weight)  # Sort by median for better visualization

# Convert feed to factor for correct ordering in plot
chickwts$feed <- factor(chickwts$feed, levels = chickwts_summary$feed)

ggplot(chickwts, aes(x = weight, y = feed, color = feed)) +
  geom_vline(xintercept = global_mean, color = "black", linewidth = 1, linetype = "dashed") + # Vertical line at global mean
  geom_point(alpha = 0.6, size = 2, position = position_jitter(width = 0, height = 0.2)) + # Jitter points for clarity
  geom_segment(data = chickwts_summary, aes(x = global_mean, xend = median_weight, y = feed, yend = feed, color = feed), 
               linewidth = 1) + # Lollipop sticks start at same global mean
  geom_point(data = chickwts_summary, aes(x = median_weight, y = feed, color = feed), size = 4) + # Lollipop candy at median weight
  theme_minimal() +
  labs(title = "Chicken Weight by Feed Type (Global Mean to Median Lollipop Plot)",
       x = "Weight (grams)",
       y = "Feed Type")

Code
library(ggridges)
ggplot(lincoln_weather, aes(x = `Mean Temperature [F]`, y = Month, fill = stat(x))) +
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
  scale_fill_viridis_c(name = "Temp. [F]", option = "C") +
  labs(title = 'Temperatures in Lincoln NE in 2016')

3 forcats

Code
gss_cat %>% 
  count(marital)
# A tibble: 6 × 2
  marital           n
  <fct>         <int>
1 No answer        17
2 Never married  5416
3 Separated       743
4 Divorced       3383
5 Widowed        1807
6 Married       10117
Code
gss_cat %>% 
  mutate(marital = fct_relevel(marital,
                               "Never Married",
                               "Married",
                               "Separated",
                               "Divorced",
                               "No Answer")) %>% 
  count(marital)
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `marital = fct_relevel(...)`.
Caused by warning:
! 2 unknown levels in `f`: Never Married and No Answer
# A tibble: 6 × 2
  marital           n
  <fct>         <int>
1 Married       10117
2 Separated       743
3 Divorced       3383
4 No answer        17
5 Never married  5416
6 Widowed        1807
Code
gss_cat %>% 
  mutate(marital = fct_infreq(marital)) %>% 
  count(marital)
# A tibble: 6 × 2
  marital           n
  <fct>         <int>
1 Married       10117
2 Never married  5416
3 Divorced       3383
4 Widowed        1807
5 Separated       743
6 No answer        17
Code
gss_cat %>% 
  mutate(marital = fct_infreq(marital)) %>% 
  mutate(marital = fct_rev(marital)) %>%
  count(marital)
# A tibble: 6 × 2
  marital           n
  <fct>         <int>
1 No answer        17
2 Separated       743
3 Widowed        1807
4 Divorced       3383
5 Never married  5416
6 Married       10117
Code
gss_cat %>% 
  group_by(relig) %>% 
  summarise(meantv = mean(tvhours, na.rm = T)) %>% 
  mutate(relig = fct_reorder(relig, meantv)) %>% 
  ggplot(aes(meantv, relig)) +
  geom_point(size = 4,
             color = "steelblue") +
  theme_minimal() +
  labs(
    title = "Average TV Viewing Time by Religion",
    x = "Average TV Viewing Time",
    y = ""
  )

4 stringr

Code
iris %>% 
  select(Species, Sepal.Length) %>% 
  mutate(Description = 
           str_c(Species,
                 ": ",
                 Sepal.Length,
                 " cm")) %>% 
  head(10)
   Species Sepal.Length    Description
1   setosa          5.1 setosa: 5.1 cm
2   setosa          4.9 setosa: 4.9 cm
3   setosa          4.7 setosa: 4.7 cm
4   setosa          4.6 setosa: 4.6 cm
5   setosa          5.0   setosa: 5 cm
6   setosa          5.4 setosa: 5.4 cm
7   setosa          4.6 setosa: 4.6 cm
8   setosa          5.0   setosa: 5 cm
9   setosa          4.4 setosa: 4.4 cm
10  setosa          4.9 setosa: 4.9 cm
Code
mtcars %>% 
  rownames_to_column(var = "model") %>% 
  mutate(has_M = str_detect(model, "M")) %>% 
  filter(has_M == TRUE) %>% 
  select(model, mpg, cyl, disp) %>% 
  head(10)
           model  mpg cyl  disp
1      Mazda RX4 21.0   6 160.0
2  Mazda RX4 Wag 21.0   6 160.0
3      Merc 240D 24.4   4 146.7
4       Merc 230 22.8   4 140.8
5       Merc 280 19.2   6 167.6
6      Merc 280C 17.8   6 167.6
7     Merc 450SE 16.4   8 275.8
8     Merc 450SL 17.3   8 275.8
9    Merc 450SLC 15.2   8 275.8
10   AMC Javelin 15.2   8 304.0
Code
mtcars %>% 
  rownames_to_column(var = "model") %>% 
  mutate('short_name' = 
           str_sub(model, 1, 3)) %>%
  select(model, 'short_name', mpg, cyl, disp) %>% 
  head(10)
               model short_name  mpg cyl  disp
1          Mazda RX4        Maz 21.0   6 160.0
2      Mazda RX4 Wag        Maz 21.0   6 160.0
3         Datsun 710        Dat 22.8   4 108.0
4     Hornet 4 Drive        Hor 21.4   6 258.0
5  Hornet Sportabout        Hor 18.7   8 360.0
6            Valiant        Val 18.1   6 225.0
7         Duster 360        Dus 14.3   8 360.0
8          Merc 240D        Mer 24.4   4 146.7
9           Merc 230        Mer 22.8   4 140.8
10          Merc 280        Mer 19.2   6 167.6
Code
starwars %>% 
  select(name, species) %>% 
  mutate(species = str_to_upper(species)) %>%
  head(10)
# A tibble: 10 × 2
   name               species
   <chr>              <chr>  
 1 Luke Skywalker     HUMAN  
 2 C-3PO              DROID  
 3 R2-D2              DROID  
 4 Darth Vader        HUMAN  
 5 Leia Organa        HUMAN  
 6 Owen Lars          HUMAN  
 7 Beru Whitesun Lars HUMAN  
 8 R5-D4              DROID  
 9 Biggs Darklighter  HUMAN  
10 Obi-Wan Kenobi     HUMAN  

5 gtExtras

Code
mtcars %>% 
  group_by(cyl) %>% 
  summarize(Median = round(median(mpg), 1),
            Mean = round(mean(mpg), 1),
            Distribution = list(mpg)) %>% 
  gt() %>% 
  gt_plt_dist(Distribution) %>% 
  gt_theme_guardian() %>% 
  tab_header(title = "Miles per Gallon Statistics")
Miles per Gallon Statistics
cyl Median Mean Distribution
4 26.0 26.7
6 19.7 19.7
8 15.2 15.1
Code
table <- mpg %>%
  head(10) %>%
  gt() %>%
  tab_header(title = "Fuel Efficiency Data") %>%
  gt_theme_538()  # try gt_theme_bbc() or gt_theme_guardian()

table
Fuel Efficiency Data
manufacturer model displ year cyl trans drv cty hwy fl class
audi a4 1.8 1999 4 auto(l5) f 18 29 p compact
audi a4 1.8 1999 4 manual(m5) f 21 29 p compact
audi a4 2.0 2008 4 manual(m6) f 20 31 p compact
audi a4 2.0 2008 4 auto(av) f 21 30 p compact
audi a4 2.8 1999 6 auto(l5) f 16 26 p compact
audi a4 2.8 1999 6 manual(m5) f 18 26 p compact
audi a4 3.1 2008 6 auto(av) f 18 27 p compact
audi a4 quattro 1.8 1999 4 manual(m5) 4 18 26 p compact
audi a4 quattro 1.8 1999 4 auto(l5) 4 16 25 p compact
audi a4 quattro 2.0 2008 4 manual(m6) 4 20 28 p compact
Code
table2 <- starwars %>%
  select(name, height, mass) %>%
  filter(mass < 200) %>% 
  arrange(desc(mass)) %>% 
  head(10) %>%
  gt() %>%
  gt_plt_bar(column = mass)  # Works in `{gtExtras}`

table2
name height mass
Grievous 216
IG-88 200
Darth Vader 202
Tarfful 234
Owen Lars 178
Bossk 190
Chewbacca 228
Jek Tono Porkins 180
Dexter Jettster 198
Nute Gunray 191
Code
table3 <- msleep %>%
  select(name, sleep_total, vore) %>%
  arrange(desc(sleep_total)) %>%  # Sorting in descending order
  head(10) %>%
  gt() %>%
  gt_color_rows("sleep_total", palette = "Blues")

table3
name sleep_total vore
Little brown bat 19.9 insecti
Big brown bat 19.7 insecti
Thick-tailed opposum 19.4 carni
Giant armadillo 18.1 insecti
North American Opossum 18.0 omni
Long-nosed armadillo 17.4 carni
Owl monkey 17.0 omni
Arctic ground squirrel 16.6 herbi
Golden-mantled ground squirrel 15.9 herbi
Tiger 15.8 carni
Code
table4 <- starwars %>%
  select(name, species, homeworld, mass) %>%
  head(15) %>%
  gt() %>%
  tab_style(
    style = list(
      cell_fill(color = "lightcoral")  # Light red background
    ),
    locations = cells_body(
      rows = species == "Droid"  # Highlight rows where species is "Droid"
    )
  )

table4
name species homeworld mass
Luke Skywalker Human Tatooine 77
C-3PO Droid Tatooine 75
R2-D2 Droid Naboo 32
Darth Vader Human Tatooine 136
Leia Organa Human Alderaan 49
Owen Lars Human Tatooine 120
Beru Whitesun Lars Human Tatooine 75
R5-D4 Droid Tatooine 32
Biggs Darklighter Human Tatooine 84
Obi-Wan Kenobi Human Stewjon 77
Anakin Skywalker Human Tatooine 84
Wilhuff Tarkin Human Eriadu NA
Chewbacca Wookiee Kashyyyk 112
Han Solo Human Corellia 80
Greedo Rodian Rodia 74
Code
plot <- gapminder %>% 
  rename(Country = country) %>% 
  filter(continent == "Europe") %>% 
  group_by(Country) %>% 
  summarise(`GDP pc` = round(mean(gdpPercap)),
            `Pop size` = round(mean(pop)),
            `Life Expectancy` = list(lifeExp)) %>% 
  arrange(-`GDP pc`) %>% 
  head(10) %>% 
  gt() %>% 
  gt_plt_dist(`Life Expectancy`) %>% 
  tab_header(title = "GDP and Population Size - Europe") %>% 
  cols_align(align = "left")


table5 <- plot %>% 
  gt_theme_pff() %>% 
  gt_color_rows(column = `Pop size`,
                palette = "Pastel1") %>% 
  gt_plt_bar_pct(`GDP pc`,
                 fill = "steelblue",
                 height = 15,
                 width = 100)

table5
GDP and Population Size - Europe
Country GDP pc Pop size Life Expectancy
Switzerland
6384293
Norway
4031441
Netherlands
13786798
Denmark
4994187
Germany
77547043
Iceland
226978
Austria
7583298
Sweden
8220029
Belgium
9725119
United Kingdom
56087801