dplyr

Library

library(dplyr)
library(gapminder)

Data

head(gapminder)

## # A tibble: 6 × 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.

head(starwars)

## # A tibble: 6 × 14
##   name         height  mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex   gender homew…⁵
##   <chr>         <int> <dbl> <chr>   <chr>   <chr>     <dbl> <chr> <chr>  <chr>  
## 1 Luke Skywal…    172    77 blond   fair    blue       19   male  mascu… Tatooi…
## 2 C-3PO           167    75 <NA>    gold    yellow    112   none  mascu… Tatooi…
## 3 R2-D2            96    32 <NA>    white,… red        33   none  mascu… Naboo  
## 4 Darth Vader     202   136 none    white   yellow     41.9 male  mascu… Tatooi…
## 5 Leia Organa     150    49 brown   light   brown      19   fema… femin… Aldera…
## 6 Owen Lars       178   120 brown,… light   blue       52   male  mascu… Tatooi…
## # … with 4 more variables: species <chr>, films <list>, vehicles <list>,
## #   starships <list>, and abbreviated variable names ¹hair_color, ²skin_color,
## #   ³eye_color, ⁴birth_year, ⁵homeworld

Column Selection in R dplyr

gapminder %>%
  select(country, year, pop)

## # A tibble: 1,704 × 3
##    country      year      pop
##    <fct>       <int>    <int>
##  1 Afghanistan  1952  8425333
##  2 Afghanistan  1957  9240934
##  3 Afghanistan  1962 10267083
##  4 Afghanistan  1967 11537966
##  5 Afghanistan  1972 13079460
##  6 Afghanistan  1977 14880372
##  7 Afghanistan  1982 12881816
##  8 Afghanistan  1987 13867957
##  9 Afghanistan  1992 16317921
## 10 Afghanistan  1997 22227415
## # … with 1,694 more rows

Data Filtering-1

gapminder %>%
  select(country, year, lifeExp) %>%
  filter(year == 2007)

## # A tibble: 142 × 3
##    country      year lifeExp
##    <fct>       <int>   <dbl>
##  1 Afghanistan  2007    43.8
##  2 Albania      2007    76.4
##  3 Algeria      2007    72.3
##  4 Angola       2007    42.7
##  5 Argentina    2007    75.3
##  6 Australia    2007    81.2
##  7 Austria      2007    79.8
##  8 Bahrain      2007    75.6
##  9 Bangladesh   2007    64.1
## 10 Belgium      2007    79.4
## # … with 132 more rows

Data Filtering-2

gapminder %>%
  select(country, year, lifeExp) %>%
  filter(year == 2007, country == "Poland")

## # A tibble: 1 × 3
##   country  year lifeExp
##   <fct>   <int>   <dbl>
## 1 Poland   2007    75.6

Data Filtering-3

gapminder %>%
  select(country, year, lifeExp) %>%
  filter(year == 2007, country %in% c("Poland", "Croatia"))

## # A tibble: 2 × 3
##   country  year lifeExp
##   <fct>   <int>   <dbl>
## 1 Croatia  2007    75.7
## 2 Poland   2007    75.6

Data Filtering-4

starwars %>% 
  select(name, ends_with("color"))

## # A tibble: 87 × 4
##    name               hair_color    skin_color  eye_color
##    <chr>              <chr>         <chr>       <chr>    
##  1 Luke Skywalker     blond         fair        blue     
##  2 C-3PO              <NA>          gold        yellow   
##  3 R2-D2              <NA>          white, blue red      
##  4 Darth Vader        none          white       yellow   
##  5 Leia Organa        brown         light       brown    
##  6 Owen Lars          brown, grey   light       blue     
##  7 Beru Whitesun lars brown         light       blue     
##  8 R5-D4              <NA>          white, red  red      
##  9 Biggs Darklighter  black         light       brown    
## 10 Obi-Wan Kenobi     auburn, white fair        blue-gray
## # … with 77 more rows

Select only numeric coloumn

starwars %>% 
select_if(is.numeric)

## # A tibble: 87 × 3
##    height  mass birth_year
##     <int> <dbl>      <dbl>
##  1    172    77       19  
##  2    167    75      112  
##  3     96    32       33  
##  4    202   136       41.9
##  5    150    49       19  
##  6    178   120       52  
##  7    165    75       47  
##  8     97    32       NA  
##  9    183    84       24  
## 10    182    77       57  
## # … with 77 more rows

Select only factor coloumn

mtcars %>% 
select_if(is.factor)

## data frame with 0 columns and 32 rows

Data Filtering (condition &)

gapminder %>%
  select(country, year, lifeExp) %>%
  filter(year >= 2007& country %in% c("Poland", "Croatia"))

## # A tibble: 2 × 3
##   country  year lifeExp
##   <fct>   <int>   <dbl>
## 1 Croatia  2007    75.7
## 2 Poland   2007    75.6

Data Filtering (condition or (|))

gapminder %>%
  select(country, year, lifeExp) %>%
  filter(year >= 2007 | country %in% c("Poland", "Croatia"))

## # A tibble: 164 × 3
##    country      year lifeExp
##    <fct>       <int>   <dbl>
##  1 Afghanistan  2007    43.8
##  2 Albania      2007    76.4
##  3 Algeria      2007    72.3
##  4 Angola       2007    42.7
##  5 Argentina    2007    75.3
##  6 Australia    2007    81.2
##  7 Austria      2007    79.8
##  8 Bahrain      2007    75.6
##  9 Bangladesh   2007    64.1
## 10 Belgium      2007    79.4
## # … with 154 more rows

Data Filtering (Not condition (!))

gapminder %>%
  select(country, year, lifeExp) %>%
  filter(year >= 2007 | !country %in% c("Poland", "Croatia"))

## # A tibble: 1,682 × 3
##    country      year lifeExp
##    <fct>       <int>   <dbl>
##  1 Afghanistan  1952    28.8
##  2 Afghanistan  1957    30.3
##  3 Afghanistan  1962    32.0
##  4 Afghanistan  1967    34.0
##  5 Afghanistan  1972    36.1
##  6 Afghanistan  1977    38.4
##  7 Afghanistan  1982    39.9
##  8 Afghanistan  1987    40.8
##  9 Afghanistan  1992    41.7
## 10 Afghanistan  1997    41.8
## # … with 1,672 more rows

Data Filtering (pattern matching)

gapminder %>%
  select(country, year, lifeExp) %>%
  filter(grepl("Po", country))

## # A tibble: 24 × 3
##    country  year lifeExp
##    <fct>   <int>   <dbl>
##  1 Poland   1952    61.3
##  2 Poland   1957    65.8
##  3 Poland   1962    67.6
##  4 Poland   1967    69.6
##  5 Poland   1972    70.8
##  6 Poland   1977    70.7
##  7 Poland   1982    71.3
##  8 Poland   1987    71.0
##  9 Poland   1992    71.0
## 10 Poland   1997    72.8
## # … with 14 more rows

Data Ordering-1

gapminder %>%
  select(country, year, lifeExp) %>%
  filter(year == 2007) %>%
  arrange(lifeExp)

## # A tibble: 142 × 3
##    country                   year lifeExp
##    <fct>                    <int>   <dbl>
##  1 Swaziland                 2007    39.6
##  2 Mozambique                2007    42.1
##  3 Zambia                    2007    42.4
##  4 Sierra Leone              2007    42.6
##  5 Lesotho                   2007    42.6
##  6 Angola                    2007    42.7
##  7 Zimbabwe                  2007    43.5
##  8 Afghanistan               2007    43.8
##  9 Central African Republic  2007    44.7
## 10 Liberia                   2007    45.7
## # … with 132 more rows

Data Ordering-2

gapminder %>%
  select(country, year, lifeExp) %>%
  filter(year == 2007) %>%
  arrange(desc(lifeExp))

## # A tibble: 142 × 3
##    country           year lifeExp
##    <fct>            <int>   <dbl>
##  1 Japan             2007    82.6
##  2 Hong Kong, China  2007    82.2
##  3 Iceland           2007    81.8
##  4 Switzerland       2007    81.7
##  5 Australia         2007    81.2
##  6 Spain             2007    80.9
##  7 Sweden            2007    80.9
##  8 Israel            2007    80.7
##  9 France            2007    80.7
## 10 Canada            2007    80.7
## # … with 132 more rows

Select top values

gapminder %>%
  select(country, year, lifeExp) %>%
  filter(year == 2007) %>%
  arrange(desc(lifeExp)) %>%
  top_n(5)

## Selecting by lifeExp

## # A tibble: 5 × 3
##   country           year lifeExp
##   <fct>            <int>   <dbl>
## 1 Japan             2007    82.6
## 2 Hong Kong, China  2007    82.2
## 3 Iceland           2007    81.8
## 4 Switzerland       2007    81.7
## 5 Australia         2007    81.2

Creating Columns-1

gapminder %>%
  select(country, year, pop, gdpPercap) %>%
  filter(year == 2007) %>%
  mutate(gdp = pop * gdpPercap) %>%
  arrange(desc(gdp)) %>%
  top_n(5)

## Selecting by gdp

## # A tibble: 5 × 5
##   country        year        pop gdpPercap     gdp
##   <fct>         <int>      <int>     <dbl>   <dbl>
## 1 United States  2007  301139947    42952. 1.29e13
## 2 China          2007 1318683096     4959. 6.54e12
## 3 Japan          2007  127467972    31656. 4.04e12
## 4 India          2007 1110396331     2452. 2.72e12
## 5 Germany        2007   82400996    32170. 2.65e12

Creating Columns-2

gapminder %>%
  select(country, year, pop, gdpPercap) %>%
  filter(year == 2007) %>%
  transmute(gdp = pop * gdpPercap) %>%
  arrange(desc(gdp)) %>%
  top_n(5)

## Selecting by gdp

## # A tibble: 5 × 1
##       gdp
##     <dbl>
## 1 1.29e13
## 2 6.54e12
## 3 4.04e12
## 4 2.72e12
## 5 2.65e12

Creating Columns-3

starwars %>% 
  mutate(name, bmi = mass / ((height / 100)  ^ 2)) %>%
  select(name:mass, bmi)

## # A tibble: 87 × 4
##    name               height  mass   bmi
##    <chr>               <int> <dbl> <dbl>
##  1 Luke Skywalker        172    77  26.0
##  2 C-3PO                 167    75  26.9
##  3 R2-D2                  96    32  34.7
##  4 Darth Vader           202   136  33.3
##  5 Leia Organa           150    49  21.8
##  6 Owen Lars             178   120  37.9
##  7 Beru Whitesun lars    165    75  27.5
##  8 R5-D4                  97    32  34.0
##  9 Biggs Darklighter     183    84  25.1
## 10 Obi-Wan Kenobi        182    77  23.2
## # … with 77 more rows

Creating Columns-4

starwars %>%
  group_by(species) %>%
  summarise(
    n = n(),
    mass = mean(mass, na.rm = TRUE)
  ) %>%
  filter(
    n > 1,
    mass > 50
  )

## # A tibble: 8 × 3
##   species      n  mass
##   <chr>    <int> <dbl>
## 1 Droid        6  69.8
## 2 Gungan       3  74  
## 3 Human       35  82.8
## 4 Kaminoan     2  88  
## 5 Mirialan     2  53.1
## 6 Twi'lek      2  55  
## 7 Wookiee      2 124  
## 8 Zabrak       2  80

Creating Columns & if else

df =data.frame(x = c(1,5,6,NA))
df %>% 
  mutate(newvar=if_else(x<5, x+1, x+2,0))

##    x newvar
## 1  1      2
## 2  5      7
## 3  6      8
## 4 NA      0

Creating Columns & Nested IF ELSE

mydf =data.frame(x = c(1:5,NA))
mydf %>% mutate(newvar= if_else(is.na(x),"I am missing",
                                if_else(x==1,"I am one",
                                        if_else(x==2,"I am two",
                                                if_else(x==3,
                                                        "I am three","Others")))))

##    x       newvar
## 1  1     I am one
## 2  2     I am two
## 3  3   I am three
## 4  4       Others
## 5  5       Others
## 6 NA I am missing

Creating Columns & WHEN Statement

mydf %>% mutate(flag = case_when(is.na(x) ~ "I am missing",
                                 x == 1 ~ "I am one",
                                 x == 2 ~ "I am two",
                                 x == 3 ~ "I am three",
                                 TRUE ~ "Others"))

##    x         flag
## 1  1     I am one
## 2  2     I am two
## 3  3   I am three
## 4  4       Others
## 5  5       Others
## 6 NA I am missing

Creating Columns & across()

mtcars %>% 
  group_by(carb) %>% 
  mutate(across(where(is.numeric), mean))

## # A tibble: 32 × 11
## # Groups:   carb [6]
##      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  15.8  7.2   309.  187   3.60  3.90  17.0   0.2 0.3    3.6      4
##  2  15.8  7.2   309.  187   3.60  3.90  17.0   0.2 0.3    3.6      4
##  3  25.3  4.57  134.   86   3.68  2.49  19.5   1   0.571  3.57     1
##  4  25.3  4.57  134.   86   3.68  2.49  19.5   1   0.571  3.57     1
##  5  22.4  5.6   208.  117.  3.70  2.86  18.2   0.5 0.4    3.8      2
##  6  25.3  4.57  134.   86   3.68  2.49  19.5   1   0.571  3.57     1
##  7  15.8  7.2   309.  187   3.60  3.90  17.0   0.2 0.3    3.6      4
##  8  22.4  5.6   208.  117.  3.70  2.86  18.2   0.5 0.4    3.8      2
##  9  22.4  5.6   208.  117.  3.70  2.86  18.2   0.5 0.4    3.8      2
## 10  15.8  7.2   309.  187   3.60  3.90  17.0   0.2 0.3    3.6      4
## # … with 22 more rows

new columns whose value depends on some conditions

gapminder %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarize(avgLifeExp = mean(lifeExp)) %>%
  mutate(over75 = if_else(avgLifeExp > 75, "Y", "N"))

## # A tibble: 5 × 3
##   continent avgLifeExp over75
##   <fct>          <dbl> <chr> 
## 1 Africa          54.8 N     
## 2 Americas        73.6 N     
## 3 Asia            70.7 N     
## 4 Europe          77.6 Y     
## 5 Oceania         80.7 Y

Calculating Summary Statistics-1

gapminder %>%
  summarize(avgLifeExp = mean(lifeExp))

## # A tibble: 1 × 1
##   avgLifeExp
##        <dbl>
## 1       59.5

Calculating Summary Statistics-2

gapminder %>%
  filter(year == 2007, continent == "Europe") %>%
  summarize(avgLifeExp = mean(lifeExp))

## # A tibble: 1 × 1
##   avgLifeExp
##        <dbl>
## 1       77.6

Grouping in R dplyr-1

gapminder %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarize(avgLifeExp = mean(lifeExp))

## # A tibble: 5 × 2
##   continent avgLifeExp
##   <fct>          <dbl>
## 1 Africa          54.8
## 2 Americas        73.6
## 3 Asia            70.7
## 4 Europe          77.6
## 5 Oceania         80.7

Grouping in R dplyr-2

gapminder %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarize(avgLifeExp = mean(lifeExp)) %>%
  arrange(desc(avgLifeExp))

## # A tibble: 5 × 2
##   continent avgLifeExp
##   <fct>          <dbl>
## 1 Oceania         80.7
## 2 Europe          77.6
## 3 Americas        73.6
## 4 Asia            70.7
## 5 Africa          54.8

Grouping in R dplyr & across() function

  mtcars %>% 
  group_by(carb) %>% 
  summarise(across(where(is.numeric), mean))

## # A tibble: 6 × 11
##    carb   mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     1  25.3  4.57  134.   86   3.68  2.49  19.5   1   0.571  3.57
## 2     2  22.4  5.6   208.  117.  3.70  2.86  18.2   0.5 0.4    3.8 
## 3     3  16.3  8     276.  180   3.07  3.86  17.7   0   0      3   
## 4     4  15.8  7.2   309.  187   3.60  3.90  17.0   0.2 0.3    3.6 
## 5     6  19.7  6     145   175   3.62  2.77  15.5   0   1      5   
## 6     8  15    8     301   335   3.54  3.57  14.6   0   1      5

Grouping in R dplyr & Multiple across() function

  mtcars %>% 
  group_by(carb) %>% 
  summarise(across(mpg:qsec, mean), across(vs:gear, n_distinct))

## # A tibble: 6 × 11
##    carb   mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int> <int>
## 1     1  25.3  4.57  134.   86   3.68  2.49  19.5     1     2     2
## 2     2  22.4  5.6   208.  117.  3.70  2.86  18.2     2     2     3
## 3     3  16.3  8     276.  180   3.07  3.86  17.7     1     1     1
## 4     4  15.8  7.2   309.  187   3.60  3.90  17.0     2     2     3
## 5     6  19.7  6     145   175   3.62  2.77  15.5     1     1     1
## 6     8  15    8     301   335   3.54  3.57  14.6     1     1     1

Data Slicing-1

gapminder %>%
  slice(1)

## # A tibble: 1 × 6
##   country     continent  year lifeExp     pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>   <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8 8425333      779.

Data Slicing-2

gapminder %>%
  slice_head(n = 3)

## # A tibble: 3 × 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.

Data Slicing-3

gapminder %>%
  slice_tail(n = 3)

## # A tibble: 3 × 6
##   country  continent  year lifeExp      pop gdpPercap
##   <fct>    <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Zimbabwe Africa     1997    46.8 11404948      792.
## 2 Zimbabwe Africa     2002    40.0 11926563      672.
## 3 Zimbabwe Africa     2007    43.5 12311143      470.

Data Slicing-4

gapminder %>%
  slice_sample(n = 5)

## # A tibble: 5 × 6
##   country          continent  year lifeExp      pop gdpPercap
##   <fct>            <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Haiti            Americas   1962    43.6  3880130     1797.
## 2 Congo, Dem. Rep. Africa     1982    47.8 30646495      674.
## 3 Austria          Europe     1987    74.9  7578903    23688.
## 4 Turkey           Europe     1982    61.0 47328791     4241.
## 5 Algeria          Africa     2007    72.3 33333216     6223.

Data Slicing-5

gapminder %>%
  filter(year == 2007) %>%
  slice_min(lifeExp, n = 5)

## # A tibble: 5 × 6
##   country      continent  year lifeExp      pop gdpPercap
##   <fct>        <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Swaziland    Africa     2007    39.6  1133066     4513.
## 2 Mozambique   Africa     2007    42.1 19951656      824.
## 3 Zambia       Africa     2007    42.4 11746035     1271.
## 4 Sierra Leone Africa     2007    42.6  6144562      863.
## 5 Lesotho      Africa     2007    42.6  2012649     1569.

Data Slicing-6

gapminder %>%
  filter(year == 2007) %>%
  slice_max(lifeExp, n = 5)

## # A tibble: 5 × 6
##   country          continent  year lifeExp       pop gdpPercap
##   <fct>            <fct>     <int>   <dbl>     <int>     <dbl>
## 1 Japan            Asia       2007    82.6 127467972    31656.
## 2 Hong Kong, China Asia       2007    82.2   6980412    39725.
## 3 Iceland          Europe     2007    81.8    301931    36181.
## 4 Switzerland      Europe     2007    81.7   7554661    37506.
## 5 Australia        Oceania    2007    81.2  20434176    34435.

sample 3 values

sample_n(starwars,3)

## # A tibble: 3 × 14
##   name        height  mass hair_c…¹ skin_…² eye_c…³ birth…⁴ sex   gender homew…⁵
##   <chr>        <int> <dbl> <chr>    <chr>   <chr>     <dbl> <chr> <chr>  <chr>  
## 1 Poe Dameron     NA    NA brown    light   brown        NA male  mascu… <NA>   
## 2 Cliegg Lars    183    NA brown    fair    blue         82 male  mascu… Tatooi…
## 3 Nute Gunray    191    90 none     mottle… red          NA male  mascu… Cato N…
## # … with 4 more variables: species <chr>, films <list>, vehicles <list>,
## #   starships <list>, and abbreviated variable names ¹hair_color, ²skin_color,
## #   ³eye_color, ⁴birth_year, ⁵homeworld

sample 10% of rows

sample_frac(starwars,0.1)

## # A tibble: 9 × 14
##   name         height  mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex   gender homew…⁵
##   <chr>         <int> <dbl> <chr>   <chr>   <chr>     <dbl> <chr> <chr>  <chr>  
## 1 IG-88           200   140 none    metal   red          15 none  mascu… <NA>   
## 2 Ki-Adi-Mundi    198    82 white   pale    yellow       92 male  mascu… Cerea  
## 3 Yoda             66    17 white   green   brown       896 male  mascu… <NA>   
## 4 Owen Lars       178   120 brown,… light   blue         52 male  mascu… Tatooi…
## 5 Shmi Skywal…    163    NA black   fair    brown        72 fema… femin… Tatooi…
## 6 Sebulba         112    40 none    grey, … orange       NA male  mascu… Malast…
## 7 Gregar Typho    185    85 black   dark    brown        NA male  mascu… Naboo  
## 8 Darth Maul      175    80 none    red     yellow       54 male  mascu… Dathom…
## 9 Jek Tono Po…    180   110 brown   fair    blue         NA male  mascu… Bestin…
## # … with 4 more variables: species <chr>, films <list>, vehicles <list>,
## #   starships <list>, and abbreviated variable names ¹hair_color, ²skin_color,
## #   ³eye_color, ⁴birth_year, ⁵homeworld

Remove Duplicate Rows based on all the variables

distinct(starwars)

## # A tibble: 87 × 14
##    name        height  mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex   gender homew…⁵
##    <chr>        <int> <dbl> <chr>   <chr>   <chr>     <dbl> <chr> <chr>  <chr>  
##  1 Luke Skywa…    172    77 blond   fair    blue       19   male  mascu… Tatooi…
##  2 C-3PO          167    75 <NA>    gold    yellow    112   none  mascu… Tatooi…
##  3 R2-D2           96    32 <NA>    white,… red        33   none  mascu… Naboo  
##  4 Darth Vader    202   136 none    white   yellow     41.9 male  mascu… Tatooi…
##  5 Leia Organa    150    49 brown   light   brown      19   fema… femin… Aldera…
##  6 Owen Lars      178   120 brown,… light   blue       52   male  mascu… Tatooi…
##  7 Beru White…    165    75 brown   light   blue       47   fema… femin… Tatooi…
##  8 R5-D4           97    32 <NA>    white,… red        NA   none  mascu… Tatooi…
##  9 Biggs Dark…    183    84 black   light   brown      24   male  mascu… Tatooi…
## 10 Obi-Wan Ke…    182    77 auburn… fair    blue-g…    57   male  mascu… Stewjon
## # … with 77 more rows, 4 more variables: species <chr>, films <list>,
## #   vehicles <list>, starships <list>, and abbreviated variable names
## #   ¹hair_color, ²skin_color, ³eye_color, ⁴birth_year, ⁵homeworld

Remove Duplicate Rows based on a variable

distinct(starwars, height, .keep_all= TRUE)

## # A tibble: 46 × 14
##    name        height  mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex   gender homew…⁵
##    <chr>        <int> <dbl> <chr>   <chr>   <chr>     <dbl> <chr> <chr>  <chr>  
##  1 Luke Skywa…    172    77 blond   fair    blue       19   male  mascu… Tatooi…
##  2 C-3PO          167    75 <NA>    gold    yellow    112   none  mascu… Tatooi…
##  3 R2-D2           96    32 <NA>    white,… red        33   none  mascu… Naboo  
##  4 Darth Vader    202   136 none    white   yellow     41.9 male  mascu… Tatooi…
##  5 Leia Organa    150    49 brown   light   brown      19   fema… femin… Aldera…
##  6 Owen Lars      178   120 brown,… light   blue       52   male  mascu… Tatooi…
##  7 Beru White…    165    75 brown   light   blue       47   fema… femin… Tatooi…
##  8 R5-D4           97    32 <NA>    white,… red        NA   none  mascu… Tatooi…
##  9 Biggs Dark…    183    84 black   light   brown      24   male  mascu… Tatooi…
## 10 Obi-Wan Ke…    182    77 auburn… fair    blue-g…    57   male  mascu… Stewjon
## # … with 36 more rows, 4 more variables: species <chr>, films <list>,
## #   vehicles <list>, starships <list>, and abbreviated variable names
## #   ¹hair_color, ²skin_color, ³eye_color, ⁴birth_year, ⁵homeworld

Remove Duplicates Rows based on multiple variables

distinct(starwars, height, mass, .keep_all= TRUE)

## # A tibble: 77 × 14
##    name        height  mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex   gender homew…⁵
##    <chr>        <int> <dbl> <chr>   <chr>   <chr>     <dbl> <chr> <chr>  <chr>  
##  1 Luke Skywa…    172    77 blond   fair    blue       19   male  mascu… Tatooi…
##  2 C-3PO          167    75 <NA>    gold    yellow    112   none  mascu… Tatooi…
##  3 R2-D2           96    32 <NA>    white,… red        33   none  mascu… Naboo  
##  4 Darth Vader    202   136 none    white   yellow     41.9 male  mascu… Tatooi…
##  5 Leia Organa    150    49 brown   light   brown      19   fema… femin… Aldera…
##  6 Owen Lars      178   120 brown,… light   blue       52   male  mascu… Tatooi…
##  7 Beru White…    165    75 brown   light   blue       47   fema… femin… Tatooi…
##  8 R5-D4           97    32 <NA>    white,… red        NA   none  mascu… Tatooi…
##  9 Biggs Dark…    183    84 black   light   brown      24   male  mascu… Tatooi…
## 10 Obi-Wan Ke…    182    77 auburn… fair    blue-g…    57   male  mascu… Stewjon
## # … with 67 more rows, 4 more variables: species <chr>, films <list>,
## #   vehicles <list>, starships <list>, and abbreviated variable names
## #   ¹hair_color, ²skin_color, ³eye_color, ⁴birth_year, ⁵homeworld

Dropping Variables -1

select(starwars, -height, -mass)

## # A tibble: 87 × 12
##    name       hair_…¹ skin_…² eye_c…³ birth…⁴ sex   gender homew…⁵ species films
##    <chr>      <chr>   <chr>   <chr>     <dbl> <chr> <chr>  <chr>   <chr>   <lis>
##  1 Luke Skyw… blond   fair    blue       19   male  mascu… Tatooi… Human   <chr>
##  2 C-3PO      <NA>    gold    yellow    112   none  mascu… Tatooi… Droid   <chr>
##  3 R2-D2      <NA>    white,… red        33   none  mascu… Naboo   Droid   <chr>
##  4 Darth Vad… none    white   yellow     41.9 male  mascu… Tatooi… Human   <chr>
##  5 Leia Orga… brown   light   brown      19   fema… femin… Aldera… Human   <chr>
##  6 Owen Lars  brown,… light   blue       52   male  mascu… Tatooi… Human   <chr>
##  7 Beru Whit… brown   light   blue       47   fema… femin… Tatooi… Human   <chr>
##  8 R5-D4      <NA>    white,… red        NA   none  mascu… Tatooi… Droid   <chr>
##  9 Biggs Dar… black   light   brown      24   male  mascu… Tatooi… Human   <chr>
## 10 Obi-Wan K… auburn… fair    blue-g…    57   male  mascu… Stewjon Human   <chr>
## # … with 77 more rows, 2 more variables: vehicles <list>, starships <list>, and
## #   abbreviated variable names ¹hair_color, ²skin_color, ³eye_color,
## #   ⁴birth_year, ⁵homeworld

Dropping Variables -2

select(starwars, -c(height, mass))

## # A tibble: 87 × 12
##    name       hair_…¹ skin_…² eye_c…³ birth…⁴ sex   gender homew…⁵ species films
##    <chr>      <chr>   <chr>   <chr>     <dbl> <chr> <chr>  <chr>   <chr>   <lis>
##  1 Luke Skyw… blond   fair    blue       19   male  mascu… Tatooi… Human   <chr>
##  2 C-3PO      <NA>    gold    yellow    112   none  mascu… Tatooi… Droid   <chr>
##  3 R2-D2      <NA>    white,… red        33   none  mascu… Naboo   Droid   <chr>
##  4 Darth Vad… none    white   yellow     41.9 male  mascu… Tatooi… Human   <chr>
##  5 Leia Orga… brown   light   brown      19   fema… femin… Aldera… Human   <chr>
##  6 Owen Lars  brown,… light   blue       52   male  mascu… Tatooi… Human   <chr>
##  7 Beru Whit… brown   light   blue       47   fema… femin… Tatooi… Human   <chr>
##  8 R5-D4      <NA>    white,… red        NA   none  mascu… Tatooi… Droid   <chr>
##  9 Biggs Dar… black   light   brown      24   male  mascu… Tatooi… Human   <chr>
## 10 Obi-Wan K… auburn… fair    blue-g…    57   male  mascu… Stewjon Human   <chr>
## # … with 77 more rows, 2 more variables: vehicles <list>, starships <list>, and
## #   abbreviated variable names ¹hair_color, ²skin_color, ³eye_color,
## #   ⁴birth_year, ⁵homeworld

rename

rename(starwars, height1=height)

## # A tibble: 87 × 14
##    name       height1  mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex   gender homew…⁵
##    <chr>        <int> <dbl> <chr>   <chr>   <chr>     <dbl> <chr> <chr>  <chr>  
##  1 Luke Skyw…     172    77 blond   fair    blue       19   male  mascu… Tatooi…
##  2 C-3PO          167    75 <NA>    gold    yellow    112   none  mascu… Tatooi…
##  3 R2-D2           96    32 <NA>    white,… red        33   none  mascu… Naboo  
##  4 Darth Vad…     202   136 none    white   yellow     41.9 male  mascu… Tatooi…
##  5 Leia Orga…     150    49 brown   light   brown      19   fema… femin… Aldera…
##  6 Owen Lars      178   120 brown,… light   blue       52   male  mascu… Tatooi…
##  7 Beru Whit…     165    75 brown   light   blue       47   fema… femin… Tatooi…
##  8 R5-D4           97    32 <NA>    white,… red        NA   none  mascu… Tatooi…
##  9 Biggs Dar…     183    84 black   light   brown      24   male  mascu… Tatooi…
## 10 Obi-Wan K…     182    77 auburn… fair    blue-g…    57   male  mascu… Stewjon
## # … with 77 more rows, 4 more variables: species <chr>, films <list>,
## #   vehicles <list>, starships <list>, and abbreviated variable names
## #   ¹hair_color, ²skin_color, ³eye_color, ⁴birth_year, ⁵homeworld

Inner Join

df1 = data.frame(ID = c(1, 2, 3, 4, 5),
w = c('a', 'b', 'c', 'd', 'e'),
x = c(1, 1, 0, 0, 1),
y=rnorm(5),
z=letters[1:5])
df2 = data.frame(ID = c(1, 7, 3, 6, 8),
a = c('z', 'b', 'k', 'd', 'l'),
b = c(1, 2, 3, 0, 4),
c =rnorm(5),
d =letters[2:6])
inner_join(df1, df2, by = "ID")

##   ID w x          y z a b         c d
## 1  1 a 1 -0.5690531 a z 1 -1.110319 b
## 2  3 c 0  0.1303399 c k 3  1.706544 d

primary key does not have same name in both the tables

df1<-rename(df1, ID1=ID)
inner_join(df1, df2, by = c("ID1"="ID"))

##   ID1 w x          y z a b         c d
## 1   1 a 1 -0.5690531 a z 1 -1.110319 b
## 2   3 c 0  0.1303399 c k 3  1.706544 d

Left Join

left_join(df1, df2, by = c("ID1"="ID"))

##   ID1 w x          y z    a  b         c    d
## 1   1 a 1 -0.5690531 a    z  1 -1.110319    b
## 2   2 b 1 -0.9262669 b <NA> NA        NA <NA>
## 3   3 c 0  0.1303399 c    k  3  1.706544    d
## 4   4 d 0  1.3308165 d <NA> NA        NA <NA>
## 5   5 e 1 -1.1404569 e <NA> NA        NA <NA>

rbind & cbind

df1=data.frame(ID = 1:6,  x=letters[1:6])
df2=data.frame(ID = 7:12, x=letters[7:12])
rbind(df1,df2)

##    ID x
## 1   1 a
## 2   2 b
## 3   3 c
## 4   4 d
## 5   5 e
## 6   6 f
## 7   7 g
## 8   8 h
## 9   9 i
## 10 10 j
## 11 11 k
## 12 12 l

cbind(df1,df2)

##   ID x ID x
## 1  1 a  7 g
## 2  2 b  8 h
## 3  3 c  9 i
## 4  4 d 10 j
## 5  5 e 11 k
## 6  6 f 12 l

Prepare Sample Data for Demonstration

mtcars$model <- rownames(mtcars)
str(mtcars)

## 'data.frame':    32 obs. of  12 variables:
##  $ mpg  : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl  : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp : num  160 160 108 258 360 ...
##  $ hp   : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat : num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt   : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec : num  16.5 17 18.6 19.4 17 ...
##  $ vs   : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am   : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear : num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb : num  4 4 1 1 2 1 4 2 2 4 ...
##  $ model: chr  "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...

first <- mtcars[1:20, ]
second <- mtcars[10:32, ]

dplyr

Naimul Islam

2022-10-09

Library

Data

Column Selection in R dplyr

Data Filtering-1

Data Filtering-2

Data Filtering-3

Data Filtering-4

Select only numeric coloumn

Select only factor coloumn

Data Filtering (condition &)

Data Filtering (condition or (|))

Data Filtering (Not condition (!))

Data Filtering (pattern matching)

Data Ordering-1

Data Ordering-2

Select top values

Creating Columns-1

Creating Columns-2

Creating Columns-3

Creating Columns-4

Creating Columns & if else

Creating Columns & Nested IF ELSE

Creating Columns & WHEN Statement

Creating Columns & across()

new columns whose value depends on some conditions

Calculating Summary Statistics-1

Calculating Summary Statistics-2

Grouping in R dplyr-1

Grouping in R dplyr-2

Grouping in R dplyr & across() function

Grouping in R dplyr & Multiple across() function

Data Slicing-1

Data Slicing-2

Data Slicing-3

Data Slicing-4

Data Slicing-5

Data Slicing-6

sample 3 values

sample 10% of rows

Remove Duplicate Rows based on all the variables

Remove Duplicate Rows based on a variable

Remove Duplicates Rows based on multiple variables

Dropping Variables -1

Dropping Variables -2

rename

Inner Join

primary key does not have same name in both the tables

Left Join

rbind & cbind

Prepare Sample Data for Demonstration