1 dplyr

Note

dplyr is a popular R package for data manipulation, known for its intuitive syntax and verbs that make it easy to transform datasets.

Useful functions

select(): Choose specific columns from a data frame.
filter(): Select rows that meet specific conditions.
mutate(): Create new columns or modify existing ones.
arrange(): Order rows based on column values.
summarise(): Reduce multiple rows to a single summary row.
group_by(): Group data by one or more columns for aggregation.

Code

library(tidyverse)

# Keep name, species and every column whose name contains "color",
# then show the first ten rows.
head(
  select(starwars, name, species, contains("color")),
  n = 10
)

# A tibble: 10 × 5
   name               species hair_color    skin_color  eye_color
   <chr>              <chr>   <chr>         <chr>       <chr>    
 1 Luke Skywalker     Human   blond         fair        blue     
 2 C-3PO              Droid   <NA>          gold        yellow   
 3 R2-D2              Droid   <NA>          white, blue red      
 4 Darth Vader        Human   none          white       yellow   
 5 Leia Organa        Human   brown         light       brown    
 6 Owen Lars          Human   brown, grey   light       blue     
 7 Beru Whitesun Lars Human   brown         light       blue     
 8 R5-D4              Droid   <NA>          white, red  red      
 9 Biggs Darklighter  Human   black         light       brown    
10 Obi-Wan Kenobi     Human   auburn, white fair        blue-gray

Code

# select() picks columns by name or by helper; contains("color") matches
# hair_color, skin_color and eye_color. Only the first ten rows are printed.
select(starwars, name, species, contains("color")) %>%
  head(n = 10)

# A tibble: 10 × 5
   name               species hair_color    skin_color  eye_color
   <chr>              <chr>   <chr>         <chr>       <chr>    
 1 Luke Skywalker     Human   blond         fair        blue     
 2 C-3PO              Droid   <NA>          gold        yellow   
 3 R2-D2              Droid   <NA>          white, blue red      
 4 Darth Vader        Human   none          white       yellow   
 5 Leia Organa        Human   brown         light       brown    
 6 Owen Lars          Human   brown, grey   light       blue     
 7 Beru Whitesun Lars Human   brown         light       blue     
 8 R5-D4              Droid   <NA>          white, red  red      
 9 Biggs Darklighter  Human   black         light       brown    
10 Obi-Wan Kenobi     Human   auburn, white fair        blue-gray

Code

# mutate() overwrites an existing column when the new name matches it:
# height is divided by 100 in place, then four columns are kept and the
# first ten rows printed.
starwars %>%
  mutate(height = height / 100) %>%
  select(name, height, mass, species) %>%
  head(n = 10)

# A tibble: 10 × 4
   name               height  mass species
   <chr>               <dbl> <dbl> <chr>  
 1 Luke Skywalker       1.72    77 Human  
 2 C-3PO                1.67    75 Droid  
 3 R2-D2                0.96    32 Droid  
 4 Darth Vader          2.02   136 Human  
 5 Leia Organa          1.5     49 Human  
 6 Owen Lars            1.78   120 Human  
 7 Beru Whitesun Lars   1.65    75 Human  
 8 R5-D4                0.97    32 Droid  
 9 Biggs Darklighter    1.83    84 Human  
10 Obi-Wan Kenobi       1.82    77 Human

Code

# Sort by total sleep (ascending), keep three columns and print the
# ten rows with the smallest sleep_total.
msleep %>%
  arrange(sleep_total) %>%
  select(genus, order, sleep_total) %>%
  head(n = 10)

# A tibble: 10 × 3
   genus         order          sleep_total
   <chr>         <chr>                <dbl>
 1 Giraffa       Artiodactyla           1.9
 2 Globicephalus Cetacea                2.7
 3 Equus         Perissodactyla         2.9
 4 Capreolus     Artiodactyla           3  
 5 Equus         Perissodactyla         3.1
 6 Loxodonta     Proboscidea            3.3
 7 Phoca         Carnivora              3.5
 8 Ovis          Artiodactyla           3.8
 9 Elephas       Proboscidea            3.9
10 Bos           Artiodactyla           4

Code

# Take the first ten characters, keep name, the colour columns and species,
# and relabel the species value "Droid" as "Robot" (other values unchanged).
starwars %>%
  head(n = 10) %>%
  select(name, contains("color"), species) %>%
  mutate(species = recode(species, "Droid" = "Robot"))

# A tibble: 10 × 5
   name               hair_color    skin_color  eye_color species
   <chr>              <chr>         <chr>       <chr>     <chr>  
 1 Luke Skywalker     blond         fair        blue      Human  
 2 C-3PO              <NA>          gold        yellow    Robot  
 3 R2-D2              <NA>          white, blue red       Robot  
 4 Darth Vader        none          white       yellow    Human  
 5 Leia Organa        brown         light       brown     Human  
 6 Owen Lars          brown, grey   light       blue      Human  
 7 Beru Whitesun Lars brown         light       blue      Human  
 8 R5-D4              <NA>          white, red  red       Robot  
 9 Biggs Darklighter  black         light       brown     Human  
10 Obi-Wan Kenobi     auburn, white fair        blue-gray Human

Code

# Average height and mass for male and female characters.
# Rows whose sex is anything else (including NA) are filtered out, rows with
# a missing height or mass are dropped, and height is divided by 100 before
# averaging.
starwars %>%
  select(sex, height, mass) %>%
  filter(sex %in% c("male", "female")) %>%
  drop_na() %>%
  mutate(height = height / 100) %>%
  group_by(sex) %>%
  summarise(
    `Average height` = mean(height),
    `Average mass` = mean(mass)
  )

# A tibble: 2 × 3
  sex    `Average height` `Average mass`
  <chr>             <dbl>          <dbl>
1 female             1.72           54.7
2 male               1.78           80.2

2 ggplot2

Note

ggplot2 is a widely-used package for data visualization, providing a powerful system to create complex plots.

Useful functions

ggplot(): Initialize a plot object.
geom_point(): Create scatter plots.
geom_line(): Draw lines to show trends.
geom_bar(): Create bar charts.
facet_wrap() / facet_grid(): Create subplots based on factors.
labs(): Customize labels and titles.

Code

# Install pacman if it is not available, then let pacman::p_load() install
# (when missing) and attach the packages used in the plots below.
# requireNamespace() is used for the availability check because it only
# tests whether the package can be loaded; require() would also attach
# pacman, which is unnecessary since p_load() is called with `pacman::`.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
pacman::p_load(tidyverse, plotly, ggridges)

library(palmerpenguins)

# Scatter plot of flipper length against body mass, one colour per species.
# Points are enlarged and half-transparent so overlapping points stay visible.
penguins %>%
  ggplot(aes(flipper_length_mm, body_mass_g, color = species)) +
  geom_point(size = 3, alpha = 0.5) +
  labs(
    title = "Flipper Length vs Body Mass by Species",
    x = "Flipper Length (mm)",
    y = "Body Mass (g)"
  ) +
  theme_minimal()

Code

# Box plot of bill length per species; each box is filled with its
# species colour at 50% opacity.
ggplot(
  data = penguins,
  mapping = aes(x = species, y = bill_length_mm, fill = species)
) +
  geom_boxplot(alpha = 0.5) +
  labs(
    title = "Bill Length Distribution by Species",
    x = "Species",
    y = "Bill Length (mm)"
  ) +
  theme_minimal()

Code

# Bar chart of mean body mass per species. The bar height is not taken from
# the data directly: stat = "summary" with fun = "mean" computes the mean of
# body_mass_g within each species.
penguins %>%
  ggplot(aes(x = species, y = body_mass_g, fill = species)) +
  geom_bar(stat = "summary", fun = "mean", alpha = 0.5) +
  labs(
    title = "Average Body Mass of Penguin Species",
    x = "Species",
    y = "Average Body Mass (g)"
  ) +
  theme_minimal()

Code

# Chicken weight by feed type: jittered raw observations, a large dot at each
# feed's mean, a grey reference line at the overall mean, and a segment
# joining the overall mean to each feed's mean.
chickwts %>%
  # Attach each feed's mean weight to every row so it can be used both to
  # order the feeds and as the end point of the segments.
  group_by(feed) %>%
  mutate(mean_by_feed = mean(weight)) %>%
  ungroup() %>%
  mutate(feed = fct_reorder(feed, mean_by_feed)) %>%
  ggplot(aes(x = feed,
             y = weight,
             colour = feed)) +
  coord_flip() +
  # Raw observations.
  geom_jitter(show.legend = FALSE,
              size = 4,
              alpha = 0.2,
              width = 0.05) +
  # Per-feed mean, computed by the summary stat.
  geom_point(stat = "summary",
             fun = "mean",
             size = 8,
             show.legend = FALSE) +
  # Overall mean weight (data are ungrouped here, so mean() spans all rows).
  # NOTE(review): on ggplot2 >= 3.4.0 line widths are set with `linewidth`;
  # `size` is kept here for compatibility with older versions.
  geom_hline(aes(yintercept = mean(weight)),
             colour = "gray70",
             size = 0.9) +
  geom_segment(aes(x = feed,
                   y = mean(weight),
                   xend = feed,
                   yend = mean_by_feed),
               size = 2,
               show.legend = FALSE) +
  labs(title = "Weight of chickens by feed group",
       x = "Feed",
       y = "Weight of chickens") +
  # theme_minimal() is a complete theme and replaces every theme setting
  # made before it, so the legend tweak must be added after it.
  theme_minimal() +
  theme(legend.position = "none")

Code

library(ggridges)
library(viridis)

# Ridgeline plot: one density of `Mean Temperature [F]` per `Month`, with the
# fill colour varying along the x axis.
ggplot(lincoln_weather,
       aes(x = `Mean Temperature [F]`,
           y = `Month`,
           # after_stat(x) refers to the x value computed by the density
           # stat; it replaces the deprecated `..x..` notation.
           fill = after_stat(x))) +
  # No alpha is set: gradient-filled ridgelines do not support transparency,
  # and the previous value (5) was outside the valid 0-1 range anyway.
  geom_density_ridges_gradient(scale = 3,
                               rel_min_height = 0.01) +
  scale_fill_viridis(name = "Temp. [F]",
                     option = "C") +
  labs(title = 'Temperatures in Lincoln NE in 2016') +
  theme_bw() +
  theme(
    legend.position = "none",
    panel.spacing = unit(0.1, "lines"),
    strip.text.x = element_text(size = 8)
  )

3 forcats

Note

forcats provides tools for working with categorical data (factors), making it easier to reorder, create, and modify factor levels.

Useful functions

fct_relevel(): Manually change the order of factor levels.
fct_reorder(): Reorder factor levels based on another variable.
fct_infreq(): Reorder levels by their frequency.
fct_rev(): Reverse the order of factor levels.

Code

# Number of rows per marital status, listed in the factor's existing
# level order.
count(gss_cat, marital)

# A tibble: 6 × 2
  marital           n
  <fct>         <int>
1 No answer        17
2 Never married  5416
3 Separated       743
4 Divorced       3383
5 Widowed        1807
6 Married       10117

Code

# fct_relevel() sets the level order by hand; count() then lists the
# categories in that order.
gss_cat %>%
  count(
    marital = fct_relevel(
      marital,
      "Never married", "Married", "Separated",
      "Divorced", "Widowed", "No answer"
    )
  )

# A tibble: 6 × 2
  marital           n
  <fct>         <int>
1 Never married  5416
2 Married       10117
3 Separated       743
4 Divorced       3383
5 Widowed        1807
6 No answer        17

Code

# fct_infreq() orders the levels from most to least frequent, so the
# counts come out in descending order.
gss_cat %>%
  count(marital = fct_infreq(marital))

# A tibble: 6 × 2
  marital           n
  <fct>         <int>
1 Married       10117
2 Never married  5416
3 Divorced       3383
4 Widowed        1807
5 Separated       743
6 No answer        17

Code

# Order levels by frequency, then reverse them, so the counts are listed
# from the least to the most frequent category.
gss_cat %>%
  mutate(marital = fct_rev(fct_infreq(marital))) %>%
  count(marital)

# A tibble: 6 × 2
  marital           n
  <fct>         <int>
1 No answer        17
2 Separated       743
3 Widowed        1807
4 Divorced       3383
5 Never married  5416
6 Married       10117

Code

# Dot plot of mean daily TV hours per religion, with religions ordered by
# that mean so the points form a monotone sequence.
gss_cat %>%
  group_by(relig) %>%
  # tvhours contains missing values; ignore them when averaging.
  # TRUE is spelled out because T is an ordinary, reassignable variable.
  summarise(meantv = mean(tvhours, na.rm = TRUE)) %>%
  mutate(relig = fct_reorder(relig, meantv)) %>%
  ggplot(aes(meantv, relig)) +
  geom_point(size = 4,
             color = "steelblue") +
  theme_minimal() +
  labs(title = "Average TV watchtime by religion",
       x = "Average TV watch time",
       y = "")

4 stringr

Note

stringr provides functions to work efficiently with strings, including pattern matching, replacement, and splitting.

Useful functions

str_c(): Concatenate strings together.
str_detect(): Detect the presence of a pattern.
str_sub(): Extract substrings from a string.
str_to_upper(): Convert text to uppercase.
str_replace(): Replace matching patterns in a string.
str_split(): Split strings into substrings.

Code

# Build a text label such as "setosa: 5.1 cm" for the first ten iris rows by
# concatenating the species, a separator, the sepal length and a unit.
iris %>%
  head(n = 10) %>%
  select(Species, Sepal.Length) %>%
  mutate(Description = str_c(Species, ": ", Sepal.Length, " cm"))

   Species Sepal.Length    Description
1   setosa          5.1 setosa: 5.1 cm
2   setosa          4.9 setosa: 4.9 cm
3   setosa          4.7 setosa: 4.7 cm
4   setosa          4.6 setosa: 4.6 cm
5   setosa          5.0   setosa: 5 cm
6   setosa          5.4 setosa: 5.4 cm
7   setosa          4.6 setosa: 4.6 cm
8   setosa          5.0   setosa: 5 cm
9   setosa          4.4 setosa: 4.4 cm
10  setosa          4.9 setosa: 4.9 cm

Code

# Keep only the cars whose model name contains a capital "M".
# The model names live in the row names, so they are first moved into a
# regular column; str_detect() is then used directly as the filter condition.
mtcars %>%
  rownames_to_column(var = "model") %>%
  filter(str_detect(model, "M")) %>%
  select(model, mpg, cyl, disp) %>%
  head(n = 10)

           model  mpg cyl  disp
1      Mazda RX4 21.0   6 160.0
2  Mazda RX4 Wag 21.0   6 160.0
3      Merc 240D 24.4   4 146.7
4       Merc 230 22.8   4 140.8
5       Merc 280 19.2   6 167.6
6      Merc 280C 17.8   6 167.6
7     Merc 450SE 16.4   8 275.8
8     Merc 450SL 17.3   8 275.8
9    Merc 450SLC 15.2   8 275.8
10   AMC Javelin 15.2   8 304.0

Code

# Derive a three-character abbreviation from each model name with str_sub()
# (characters 1 to 3) and show it next to a few of the original columns.
mtcars %>%
  rownames_to_column(var = "model") %>%
  mutate(`short name` = str_sub(model, start = 1, end = 3)) %>%
  select(model, `short name`, mpg, cyl, disp) %>%
  head(n = 10)

               model short name  mpg cyl  disp
1          Mazda RX4        Maz 21.0   6 160.0
2      Mazda RX4 Wag        Maz 21.0   6 160.0
3         Datsun 710        Dat 22.8   4 108.0
4     Hornet 4 Drive        Hor 21.4   6 258.0
5  Hornet Sportabout        Hor 18.7   8 360.0
6            Valiant        Val 18.1   6 225.0
7         Duster 360        Dus 14.3   8 360.0
8          Merc 240D        Mer 24.4   4 146.7
9           Merc 230        Mer 22.8   4 140.8
10          Merc 280        Mer 19.2   6 167.6

Code

# Upper-case the species labels, then show name and species for the first
# ten characters.
starwars %>%
  mutate(species = str_to_upper(species)) %>%
  select(name, species) %>%
  head(n = 10)

# A tibble: 10 × 2
   name               species
   <chr>              <chr>  
 1 Luke Skywalker     HUMAN  
 2 C-3PO              DROID  
 3 R2-D2              DROID  
 4 Darth Vader        HUMAN  
 5 Leia Organa        HUMAN  
 6 Owen Lars          HUMAN  
 7 Beru Whitesun Lars HUMAN  
 8 R5-D4              DROID  
 9 Biggs Darklighter  HUMAN  
10 Obi-Wan Kenobi     HUMAN

5 gtExtras

Note

gtExtras extends the gt package to add more flexibility in styling tables with additional formatting options and features.

Useful functions

gt_color_box(): Add color shading to cell values.
gt_highlight_rows(): Highlight specific rows in a table.
gt_plt_sparkline(): Add sparklines to table cells.
gt_fa_repeats(): Add font-awesome icons as repeat markers.

Code

library(gtExtras)
library(gapminder)
library(RColorBrewer)
library(svglite)

# Summary table of the ten European countries with the highest mean GDP per
# capita: a percentage bar for GDP, a colour scale for population, and an
# inline distribution plot of each country's life-expectancy values.
gapminder %>%
  rename(Country = country) %>%
  filter(continent == "Europe") %>%
  group_by(Country) %>%
  # Life expectancy is kept as a list column (all values per country) so
  # gt_plt_dist() can draw a distribution inside each cell.
  # NOTE(review): the label `Life expectance` is probably meant to read
  # `Life expectancy`; it is left unchanged so the table header stays the same.
  summarise(`GDP per capita` = round(mean(gdpPercap)),
            `Pop size` = round(mean(pop)),
            `Life expectance` = list(lifeExp)) %>%
  arrange(-`GDP per capita`) %>%
  head(10) %>%
  gt() %>%
  gt_theme_pff() %>%
  gt_plt_dist("Life expectance") %>%
  # `columns` is written out in full; the previous `column =` only worked
  # through R's partial argument matching.
  gt_color_rows(columns = "Pop size",
                palette = "Pastel1") %>%
  gt_plt_bar_pct("GDP per capita",
                 fill = "steelblue",
                 height = 15,
                 width = 120) %>%
  tab_header(title = "The GDP and Pop Size of Europe") %>%
  cols_align(align = "left")

The GDP and Pop Size of Europe
Country	GDP per capita	Pop size	Life expectance
Switzerland		6384293
Norway		4031441
Netherlands		13786798
Denmark		4994187
Germany		77547043
Iceland		226978
Austria		7583298
Sweden		8220029
Belgium		9725119
United Kingdom		56087801

Code

# Show the first ten gapminder rows as a gt table and highlight the row(s)
# for the year 1972.
gapminder %>%
  head(10) %>%
  gt() %>%
  # `rows` is written out in full; the previous `row =` only worked through
  # R's partial argument matching.
  gt_highlight_rows(rows = year == 1972,
                    fill = "steelblue") %>%
  tab_header(title = "Life Expectancy, Population and GDP in 1972") %>%
  gt_theme_espn()

Life Expectancy, Population and GDP in 1972
country	continent	year	lifeExp	pop	gdpPercap
Afghanistan	Asia	1952	28.801	8425333	779.4453
Afghanistan	Asia	1957	30.332	9240934	820.8530
Afghanistan	Asia	1962	31.997	10267083	853.1007
Afghanistan	Asia	1967	34.020	11537966	836.1971
Afghanistan	Asia	1972	36.088	13079460	739.9811
Afghanistan	Asia	1977	38.438	14880372	786.1134
Afghanistan	Asia	1982	39.854	12881816	978.0114
Afghanistan	Asia	1987	40.822	13867957	852.3959
Afghanistan	Asia	1992	41.674	16317921	649.3414
Afghanistan	Asia	1997	41.763	22227415	635.3414

6 plotly

Note

plotly is a package for creating interactive web-based plots, often used to enhance visualizations initially created with ggplot2.

Useful functions

plot_ly(): Create a new interactive plot.
ggplotly(): Convert ggplot2 plots to interactive plots.
layout(): Customize the layout of a plotly object.
add_trace(): Add a trace to an existing plot.

Code

library(plotly)

# Interactive scatter plot of height against mass, coloured by eye colour.
# Rows with a missing height, mass or eye colour are dropped; so are rows
# with a mass of 250 or more and eye colours other than the six listed.
p <- starwars %>%
  drop_na(height, mass, eye_color) %>%
  filter(
    mass < 250,
    eye_color %in% c("blue", "brown", "black", "pink", "red", "orange")
  ) %>%
  ggplot(aes(height, mass, color = eye_color)) +
  geom_jitter(size = 6, alpha = 0.5) +
  # Draw every eye colour in the colour it names.
  scale_color_manual(
    values = c(
      blue = "blue",
      brown = "brown",
      black = "black",
      pink = "pink",
      red = "red",
      orange = "orange"
    )
  ) +
  theme_minimal() +
  # Place the legend inside the panel, anchored by its top-left corner.
  theme(
    legend.position = c(0.05, 0.98),
    legend.justification = c("left", "top")
  ) +
  labs(
    title = "height, mass and eye color",
    x = "Height of characters",
    y = "Mass of characters",
    color = "Eye Color"
  )

# Turn the static ggplot object into an interactive plotly widget.
ggplotly(p)

Code

# 3D scatter plot of the trees data: Girth, Height and Volume on the x, y and
# z axes. The trace type and mode are stated explicitly so plotly does not
# have to infer them (which it reports with informational messages).
trees %>%
  plot_ly(x = ~ Girth,
          y = ~ Height,
          z = ~ Volume,
          type = "scatter3d",
          mode = "markers")

Code

# Render the volcano matrix as an interactive 3D surface; the matrix values
# are used as the surface heights (z).
plot_ly(
  type = "surface",
  z = volcano
)

Courses that contain short and easy-to-digest video content are available at premieranalytics.com.bd. Each lesson uses data that is built into R or comes with installed packages, so you can replicate the work at home. premieranalytics.com.bd also includes teaching on statistics and research methods.