Paquete dplyr -> manipulacion de datos Argumentos: set de datos, ¿que hacer con el set?. Resultado: set de datos modificado

mutate() adds new variables that are functions of existing variables.

select() picks variables based on their names.

filter() picks cases based on their values.

summarise() reduces multiple values down to a single summary.

arrange() changes the ordering of the rows.

Filter

Picks cases based on their values.(set data, criteria)

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
data("mtcars")

mt4.carbs <- filter(mtcars, carb >= 4) #cars whit 4 or more carburetors
mt4.carbs
##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8

Multiple filter

mt4.carbs.aut.trans <- filter(mtcars, carb >= 4, am == 0) 
#cars whit 4 or more carburetors and automatic transmission 
mt4.carbs.aut.trans
##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4

OR logical filter

disp.l250.or.mpg.m20 <- filter(mtcars, disp < 250|mpg > 20)
#cars whit diplacement less than 250 or mpg more than 20
disp.l250.or.mpg.m20
##                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710     22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Valiant        18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Merc 240D      24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230       22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280       19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C      17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Fiat 128       32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic    30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Toyota Corona  21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Fiat X1-9      27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Porsche 914-2  26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Lotus Europa   30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## Ferrari Dino   19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## Volvo 142E     21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2

Filter by min, max, another numerical criteria

max.mpg <- filter(mtcars, mpg == max(mpg))#car whit mpg max
max.mpg
##                 mpg cyl disp hp drat    wt qsec vs am gear carb
## Toyota Corolla 33.9   4 71.1 65 4.22 1.835 19.9  1  1    4    1

Filter by NAs for a specific row

data("airquality")
no.miss.solarrad <- filter(airquality, !is.na(Solar.R))
head(no.miss.solarrad)
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    23     299  8.6   65     5   7
## 6    19      99 13.8   59     5   8

Filter by NAs anywhere

airqual.no.na <- filter(airquality[1:10,],complete.cases(airquality[1:10,]))
airqual.no.na
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    23     299  8.6   65     5   7
## 6    19      99 13.8   59     5   8
## 7     8      19 20.1   61     5   9

Filter whit %in% (including/excluding specified values)

data("iris")
table(iris$Species) #counts of species
## 
##     setosa versicolor  virginica 
##         50         50         50
iris.two.species <- filter(iris, Species %in% c("setosa", "virginica"))
head(iris.two.species) 
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
table(iris.two.species$Species)
## 
##     setosa versicolor  virginica 
##         50          0         50
nrow(iris)
## [1] 150
nrow(iris.two.species)
## [1] 100
airqual.3c <- filter(airquality, Ozone < 20) [,1:3]
head((airqual.3c))
##   Ozone Solar.R Wind
## 1    12     149 12.6
## 2    18     313 11.5
## 3    19      99 13.8
## 4     8      19 20.1
## 5     7      NA  6.9
## 6    16     256  9.7

Filter by total frequency of a value across all rows

#counting of rows based on number of gears (group_by argument)
table(mtcars$gear) #Table: tablas de frecuencia
## 
##  3  4  5 
## 15 12  5
more.frequent.no.gears <- mtcars %>%
  group_by(gear)%>%
  filter(n()>10)
table(more.frequent.no.gears$gear)
## 
##  3  4 
## 15 12
#example group_by
mtcars%>%
  group_by(gear)%>%
  summarise(mean=mean(mpg), sum=sum(mpg),n=n())
## # A tibble: 3 x 4
##    gear  mean   sum     n
##   <dbl> <dbl> <dbl> <int>
## 1     3  16.1  242.    15
## 2     4  24.5  294.    12
## 3     5  21.4  107.     5
more.frequent.no.gears.horsepower <- mtcars %>%
  group_by(gear)%>%
  filter(n()>10, hp < 105)
table(more.frequent.no.gears.horsepower$gear)
## 
## 3 4 
## 1 7

Filter by col name (starts_whit)

names(iris)
## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"
iris.startswhitS <- iris %>%
  ## dplyr::select(starts_with("S"))
  select(starts_with("S"))
head(iris.startswhitS)
##   Sepal.Length Sepal.Width Species
## 1          5.1         3.5  setosa
## 2          4.9         3.0  setosa
## 3          4.7         3.2  setosa
## 4          4.6         3.1  setosa
## 5          5.0         3.6  setosa
## 6          5.4         3.9  setosa

Filter rows: col meet criteria (filter_at)

new.mtcars <- mtcars %>%
  filter_at(vars(cyl, hp),
            all_vars(. == max(.)))
new.mtcars
##               mpg cyl disp  hp drat   wt qsec vs am gear carb
## Maserati Bora  15   8  301 335 3.54 3.57 14.6  0  1    5    8
#vars -> equivalent semantics to "select" (vector of column names) PREGUNTAR
nmtcars <- mtcars %>%
  summarise(across(c(cyl,hp), max))
nmtcars 
##   cyl  hp
## 1   8 335

Arrange sort

Ascending

head(msleep)
## # A tibble: 6 x 11
##   name    genus vore  order conservation sleep_total sleep_rem sleep_cycle awake
##   <chr>   <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl> <dbl>
## 1 Cheetah Acin~ carni Carn~ lc                  12.1      NA        NA      11.9
## 2 Owl mo~ Aotus omni  Prim~ <NA>                17         1.8      NA       7  
## 3 Mounta~ Aplo~ herbi Rode~ nt                  14.4       2.4      NA       9.6
## 4 Greate~ Blar~ omni  Sori~ lc                  14.9       2.3       0.133   9.1
## 5 Cow     Bos   herbi Arti~ domesticated         4         0.7       0.667  20  
## 6 Three-~ Brad~ herbi Pilo~ <NA>                14.4       2.2       0.767   9.6
## # ... with 2 more variables: brainwt <dbl>, bodywt <dbl>
head(arrange(msleep, name))
## # A tibble: 6 x 11
##   name    genus vore  order conservation sleep_total sleep_rem sleep_cycle awake
##   <chr>   <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl> <dbl>
## 1 Africa~ Loxo~ herbi Prob~ vu                   3.3        NA          NA  20.7
## 2 Africa~ Cric~ omni  Rode~ <NA>                 8.3         2          NA  15.7
## 3 Africa~ Rhab~ omni  Rode~ <NA>                 8.7        NA          NA  15.3
## 4 Arctic~ Vulp~ carni Carn~ <NA>                12.5        NA          NA  11.5
## 5 Arctic~ Sper~ herbi Rode~ lc                  16.6        NA          NA   7.4
## 6 Asian ~ Elep~ herbi Prob~ en                   3.9        NA          NA  20.1
## # ... with 2 more variables: brainwt <dbl>, bodywt <dbl>
animal.name.sequence <- arrange(msleep, vore, order)
animal.name.sequence [1:10,1:4]
## # A tibble: 10 x 4
##    name              genus        vore  order    
##    <chr>             <chr>        <chr> <chr>    
##  1 Cheetah           Acinonyx     carni Carnivora
##  2 Northern fur seal Callorhinus  carni Carnivora
##  3 Dog               Canis        carni Carnivora
##  4 Domestic cat      Felis        carni Carnivora
##  5 Gray seal         Haliochoerus carni Carnivora
##  6 Tiger             Panthera     carni Carnivora
##  7 Jaguar            Panthera     carni Carnivora
##  8 Lion              Panthera     carni Carnivora
##  9 Caspian seal      Phoca        carni Carnivora
## 10 Genet             Genetta      carni Carnivora

Descending

animal.name.sequence.desc <- arrange(msleep, vore, desc(order))
animal.name.sequence.desc[1:10,1:4]
## # A tibble: 10 x 4
##    name                       genus         vore  order          
##    <chr>                      <chr>         <chr> <chr>          
##  1 Northern grasshopper mouse Onychomys     carni Rodentia       
##  2 Slow loris                 Nyctibeus     carni Primates       
##  3 Thick-tailed opposum       Lutreolina    carni Didelphimorphia
##  4 Long-nosed armadillo       Dasypus       carni Cingulata      
##  5 Pilot whale                Globicephalus carni Cetacea        
##  6 Common porpoise            Phocoena      carni Cetacea        
##  7 Bottle-nosed dolphin       Tursiops      carni Cetacea        
##  8 Cheetah                    Acinonyx      carni Carnivora      
##  9 Northern fur seal          Callorhinus   carni Carnivora      
## 10 Dog                        Canis         carni Carnivora

Rename

names(msleep)
##  [1] "name"         "genus"        "vore"         "order"        "conservation"
##  [6] "sleep_total"  "sleep_rem"    "sleep_cycle"  "awake"        "brainwt"     
## [11] "bodywt"
renamed.msleep <- rename(msleep, habito_alimento = vore, conservation_status = conservation)
names(renamed.msleep)
##  [1] "name"                "genus"               "habito_alimento"    
##  [4] "order"               "conservation_status" "sleep_total"        
##  [7] "sleep_rem"           "sleep_cycle"         "awake"              
## [10] "brainwt"             "bodywt"

Mutate

data("ChickWeight")
head(ChickWeight)
##   weight Time Chick Diet
## 1     42    0     1    1
## 2     51    2     1    1
## 3     59    4     1    1
## 4     64    6     1    1
## 5     76    8     1    1
## 6     93   10     1    1
ChickWeight.with.log <- mutate(ChickWeight, log.of.weight= log10(weight))
head(ChickWeight.with.log)
##   weight Time Chick Diet log.of.weight
## 1     42    0     1    1      1.623249
## 2     51    2     1    1      1.707570
## 3     59    4     1    1      1.770852
## 4     64    6     1    1      1.806180
## 5     76    8     1    1      1.880814
## 6     93   10     1    1      1.968483

Mutate_all to add new fields all at once

names(msleep)
##  [1] "name"         "genus"        "vore"         "order"        "conservation"
##  [6] "sleep_total"  "sleep_rem"    "sleep_cycle"  "awake"        "brainwt"     
## [11] "bodywt"
msleep.raiz.sqrt <- mutate_all(msleep[,6:11],list(sqrt_root= sqrt))
#columns names changed "_sqrt_root"
#mutate_all(msleep[,6:11],funs("square.root"=sqrt(.)))
#funs () was deprecated, use: list (name = function)
names(msleep.raiz.sqrt)
##  [1] "sleep_total"           "sleep_rem"             "sleep_cycle"          
##  [4] "awake"                 "brainwt"               "bodywt"               
##  [7] "sleep_total_sqrt_root" "sleep_rem_sqrt_root"   "sleep_cycle_sqrt_root"
## [10] "awake_sqrt_root"       "brainwt_sqrt_root"     "bodywt_sqrt_root"

Mutate_at to add fields

data("Titanic")
Titanic <- as.data.frame(Titanic) #converto to data frame
head(Titanic)
##   Class    Sex   Age Survived Freq
## 1   1st   Male Child       No    0
## 2   2nd   Male Child       No    0
## 3   3rd   Male Child       No   35
## 4  Crew   Male Child       No    0
## 5   1st Female Child       No    0
## 6   2nd Female Child       No    0
titanic.ranks<- mutate_at(Titanic, vars(Class,Age,Survived), list(Rank=min_rank))
titanic.with.ranks <- mutate_at(Titanic, vars(Class,Age,Survived),
funs(Rank = min_rank(desc(.))))
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
head(titanic.ranks,10)
##    Class    Sex   Age Survived Freq Class_Rank Age_Rank Survived_Rank
## 1    1st   Male Child       No    0          1        1             1
## 2    2nd   Male Child       No    0          9        1             1
## 3    3rd   Male Child       No   35         17        1             1
## 4   Crew   Male Child       No    0         25        1             1
## 5    1st Female Child       No    0          1        1             1
## 6    2nd Female Child       No    0          9        1             1
## 7    3rd Female Child       No   17         17        1             1
## 8   Crew Female Child       No    0         25        1             1
## 9    1st   Male Adult       No  118          1       17             1
## 10   2nd   Male Adult       No  154          9       17             1
head(titanic.with.ranks,10)
##    Class    Sex   Age Survived Freq Class_Rank Age_Rank Survived_Rank
## 1    1st   Male Child       No    0         25       17            17
## 2    2nd   Male Child       No    0         17       17            17
## 3    3rd   Male Child       No   35          9       17            17
## 4   Crew   Male Child       No    0          1       17            17
## 5    1st Female Child       No    0         25       17            17
## 6    2nd Female Child       No    0         17       17            17
## 7    3rd Female Child       No   17          9       17            17
## 8   Crew Female Child       No    0          1       17            17
## 9    1st   Male Adult       No  118         25        1            17
## 10   2nd   Male Adult       No  154         17        1            17

Mutate_if

#Example 1
divide.by.10 <- function(a.number)(a.number/10)
head(CO2)
##   Plant   Type  Treatment conc uptake
## 1   Qn1 Quebec nonchilled   95   16.0
## 2   Qn1 Quebec nonchilled  175   30.4
## 3   Qn1 Quebec nonchilled  250   34.8
## 4   Qn1 Quebec nonchilled  350   37.2
## 5   Qn1 Quebec nonchilled  500   35.3
## 6   Qn1 Quebec nonchilled  675   39.2
new.df <- CO2 %>%
  mutate_if(is.numeric, divide.by.10)
head(new.df)
##   Plant   Type  Treatment conc uptake
## 1   Qn1 Quebec nonchilled  9.5   1.60
## 2   Qn1 Quebec nonchilled 17.5   3.04
## 3   Qn1 Quebec nonchilled 25.0   3.48
## 4   Qn1 Quebec nonchilled 35.0   3.72
## 5   Qn1 Quebec nonchilled 50.0   3.53
## 6   Qn1 Quebec nonchilled 67.5   3.92
#Example 2
#Reemplazar NAs por 0
df <- data.frame(
alpha = c(22, 1, NA),
almond = c(0, 5, 10),
grape = c(0, 2, 2),
apple = c(NA, 5, 10))
df
##   alpha almond grape apple
## 1    22      0     0    NA
## 2     1      5     2     5
## 3    NA     10     2    10
df.fix.alpha <- df %>% mutate_if(is.numeric, coalesce, ... = 0)
df.fix.alpha
##   alpha almond grape apple
## 1    22      0     0     0
## 2     1      5     2     5
## 3     0     10     2    10

String detect and true/false duplicate indicator

table(msleep$vore)
## 
##   carni   herbi insecti    omni 
##      19      32       5      20
msleep.sin.c.o.a <- filter(msleep, !str_detect(vore, paste(c("c","h"), collapse = "|")))
head(msleep.sin.c.o.a)
## # A tibble: 6 x 11
##   name    genus vore  order conservation sleep_total sleep_rem sleep_cycle awake
##   <chr>   <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl> <dbl>
## 1 Owl mo~ Aotus omni  Prim~ <NA>                17         1.8      NA       7  
## 2 Greate~ Blar~ omni  Sori~ lc                  14.9       2.3       0.133   9.1
## 3 Grivet  Cerc~ omni  Prim~ lc                  10         0.7      NA      14  
## 4 Star-n~ Cond~ omni  Sori~ lc                  10.3       2.2      NA      13.7
## 5 Africa~ Cric~ omni  Rode~ <NA>                 8.3       2        NA      15.7
## 6 Lesser~ Cryp~ omni  Sori~ lc                   9.1       1.4       0.15   14.9
## # ... with 2 more variables: brainwt <dbl>, bodywt <dbl>
table(msleep.sin.c.o.a$vore)
## 
## omni 
##   20
msleep.with.dup.indicator <- mutate(msleep, duplicate.indicator= duplicated(conservation))
head(msleep.with.dup.indicator, 20)
## # A tibble: 20 x 12
##    name   genus vore  order conservation sleep_total sleep_rem sleep_cycle awake
##    <chr>  <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl> <dbl>
##  1 Cheet~ Acin~ carni Carn~ lc                  12.1      NA        NA      11.9
##  2 Owl m~ Aotus omni  Prim~ <NA>                17         1.8      NA       7  
##  3 Mount~ Aplo~ herbi Rode~ nt                  14.4       2.4      NA       9.6
##  4 Great~ Blar~ omni  Sori~ lc                  14.9       2.3       0.133   9.1
##  5 Cow    Bos   herbi Arti~ domesticated         4         0.7       0.667  20  
##  6 Three~ Brad~ herbi Pilo~ <NA>                14.4       2.2       0.767   9.6
##  7 North~ Call~ carni Carn~ vu                   8.7       1.4       0.383  15.3
##  8 Vespe~ Calo~ <NA>  Rode~ <NA>                 7        NA        NA      17  
##  9 Dog    Canis carni Carn~ domesticated        10.1       2.9       0.333  13.9
## 10 Roe d~ Capr~ herbi Arti~ lc                   3        NA        NA      21  
## 11 Goat   Capri herbi Arti~ lc                   5.3       0.6      NA      18.7
## 12 Guine~ Cavis herbi Rode~ domesticated         9.4       0.8       0.217  14.6
## 13 Grivet Cerc~ omni  Prim~ lc                  10         0.7      NA      14  
## 14 Chinc~ Chin~ herbi Rode~ domesticated        12.5       1.5       0.117  11.5
## 15 Star-~ Cond~ omni  Sori~ lc                  10.3       2.2      NA      13.7
## 16 Afric~ Cric~ omni  Rode~ <NA>                 8.3       2        NA      15.7
## 17 Lesse~ Cryp~ omni  Sori~ lc                   9.1       1.4       0.15   14.9
## 18 Long-~ Dasy~ carni Cing~ lc                  17.4       3.1       0.383   6.6
## 19 Tree ~ Dend~ herbi Hyra~ lc                   5.3       0.5      NA      18.7
## 20 North~ Dide~ omni  Dide~ lc                  18         4.9       0.333   6  
## # ... with 3 more variables: brainwt <dbl>, bodywt <dbl>,
## #   duplicate.indicator <lgl>
#duplicated.indicator = FLASE/TRUE 
msleep.with.dup.indicator[1:6,c(1,5,12)]
## # A tibble: 6 x 3
##   name                       conservation duplicate.indicator
##   <chr>                      <chr>        <lgl>              
## 1 Cheetah                    lc           FALSE              
## 2 Owl monkey                 <NA>         FALSE              
## 3 Mountain beaver            nt           FALSE              
## 4 Greater short-tailed shrew lc           TRUE               
## 5 Cow                        domesticated FALSE              
## 6 Three-toed sloth           <NA>         TRUE
#conservation (major key), genus (minor,key)
msleep.with.dup.indicator2 <- mutate(msleep,duplicate.indicator = duplicated(conservation)) %>%
  arrange(conservation, genus)
msleep.with.dup.indicator2 [1:20, c(1,2,5,12)]
## # A tibble: 20 x 4
##    name                       genus         conservation duplicate.indicator
##    <chr>                      <chr>         <chr>        <lgl>              
##  1 Giraffe                    Giraffa       cd           FALSE              
##  2 Pilot whale                Globicephalus cd           TRUE               
##  3 Cow                        Bos           domesticated FALSE              
##  4 Dog                        Canis         domesticated TRUE               
##  5 Guinea pig                 Cavis         domesticated TRUE               
##  6 Chinchilla                 Chinchilla    domesticated TRUE               
##  7 Horse                      Equus         domesticated TRUE               
##  8 Donkey                     Equus         domesticated TRUE               
##  9 Domestic cat               Felis         domesticated TRUE               
## 10 Rabbit                     Oryctolagus   domesticated TRUE               
## 11 Sheep                      Ovis          domesticated TRUE               
## 12 Pig                        Sus           domesticated TRUE               
## 13 Asian elephant             Elephas       en           FALSE              
## 14 Golden hamster             Mesocricetus  en           TRUE               
## 15 Tiger                      Panthera      en           TRUE               
## 16 Giant armadillo            Priodontes    en           TRUE               
## 17 Cheetah                    Acinonyx      lc           FALSE              
## 18 Greater short-tailed shrew Blarina       lc           TRUE               
## 19 Roe deer                   Capreolus     lc           TRUE               
## 20 Goat                       Capri         lc           TRUE
msleepdup <- msleep %>%
  mutate(dup.indicator.gen = duplicated(genus), du.indicator.cons = duplicated(conservation))%>%
  arrange(conservation, genus)
msleepdup [1:10,c(1,2,5,12,13)]
## # A tibble: 10 x 5
##    name         genus         conservation dup.indicator.gen du.indicator.cons
##    <chr>        <chr>         <chr>        <lgl>             <lgl>            
##  1 Giraffe      Giraffa       cd           FALSE             FALSE            
##  2 Pilot whale  Globicephalus cd           FALSE             TRUE             
##  3 Cow          Bos           domesticated FALSE             FALSE            
##  4 Dog          Canis         domesticated FALSE             TRUE             
##  5 Guinea pig   Cavis         domesticated FALSE             TRUE             
##  6 Chinchilla   Chinchilla    domesticated FALSE             TRUE             
##  7 Horse        Equus         domesticated FALSE             TRUE             
##  8 Donkey       Equus         domesticated TRUE              TRUE             
##  9 Domestic cat Felis         domesticated FALSE             TRUE             
## 10 Rabbit       Oryctolagus   domesticated FALSE             TRUE
###PREGUNTAR
msleepdup2 <- msleep%>%
  mutate(across(c(genus,conservation), duplicated))%>%
  arrange(conservation, genus)
msleepdup2
## # A tibble: 83 x 11
##    name   genus vore  order conservation sleep_total sleep_rem sleep_cycle awake
##    <chr>  <lgl> <chr> <chr> <lgl>              <dbl>     <dbl>       <dbl> <dbl>
##  1 Cheet~ FALSE carni Carn~ FALSE               12.1      NA        NA      11.9
##  2 Owl m~ FALSE omni  Prim~ FALSE               17         1.8      NA       7  
##  3 Mount~ FALSE herbi Rode~ FALSE               14.4       2.4      NA       9.6
##  4 Cow    FALSE herbi Arti~ FALSE                4         0.7       0.667  20  
##  5 North~ FALSE carni Carn~ FALSE                8.7       1.4       0.383  15.3
##  6 Asian~ FALSE herbi Prob~ FALSE                3.9      NA        NA      20.1
##  7 Giraf~ FALSE herbi Arti~ FALSE                1.9       0.4      NA      22.1
##  8 Great~ FALSE omni  Sori~ TRUE                14.9       2.3       0.133   9.1
##  9 Three~ FALSE herbi Pilo~ TRUE                14.4       2.2       0.767   9.6
## 10 Vespe~ FALSE <NA>  Rode~ TRUE                 7        NA        NA      17  
## # ... with 73 more rows, and 2 more variables: brainwt <dbl>, bodywt <dbl>
fruit <- c("apple", "pear", "orange", "grape", "orange", "orange")
x <- c(1,2,4,9,4,6)
y <- c(22,3,4,55,15,9)
z <- c(3,1,4,10,12,8)
w <- c(2,2,2,4,5,6)
df <- data.frame(fruit,x,y,z,w)
df
##    fruit x  y  z w
## 1  apple 1 22  3 2
## 2   pear 2  3  1 2
## 3 orange 4  4  4 2
## 4  grape 9 55 10 4
## 5 orange 4 15 12 5
## 6 orange 6  9  8 6
df.show.dup <- df %>%
  mutate(duplicate.indicator = duplicated(fruit))
df.show.dup
##    fruit x  y  z w duplicate.indicator
## 1  apple 1 22  3 2               FALSE
## 2   pear 2  3  1 2               FALSE
## 3 orange 4  4  4 2               FALSE
## 4  grape 9 55 10 4               FALSE
## 5 orange 4 15 12 5                TRUE
## 6 orange 6  9  8 6                TRUE

Drop variable using NULL

df.sin.z <- df%>%
  mutate(z=NULL)
df.sin.z
##    fruit x  y w
## 1  apple 1 22 2
## 2   pear 2  3 2
## 3 orange 4  4 2
## 4  grape 9 55 4
## 5 orange 4 15 5
## 6 orange 6  9 6

Prefered coding sequence

if (!require("nycflights13")) install.packages("nycflights13")
## Loading required package: nycflights13
names(flights)
##  [1] "year"           "month"          "day"            "dep_time"      
##  [5] "sched_dep_time" "dep_delay"      "arr_time"       "sched_arr_time"
##  [9] "arr_delay"      "carrier"        "flight"         "tailnum"       
## [13] "origin"         "dest"           "air_time"       "distance"      
## [17] "hour"           "minute"         "time_hour"
## Not recommended but works
flights1 <- mutate(flights,
 gain = arr_delay - dep_delay,
 hours = air_time / 60,
 gain_per_hour = gain / hours,
 gain_per_minute = 60 * gain_per_hour)
names(flights1)
##  [1] "year"            "month"           "day"             "dep_time"       
##  [5] "sched_dep_time"  "dep_delay"       "arr_time"        "sched_arr_time" 
##  [9] "arr_delay"       "carrier"         "flight"          "tailnum"        
## [13] "origin"          "dest"            "air_time"        "distance"       
## [17] "hour"            "minute"          "time_hour"       "gain"           
## [21] "hours"           "gain_per_hour"   "gain_per_minute"
## Recommended
flights2 <- flights%>%
  mutate(gain = arr_delay - dep_delay,
 hours = air_time / 60) %>%
 mutate(gain_per_hour = gain / hours) %>%
 mutate(gain_per_minute = 60 * gain_per_hour)
flights2[1:6,c(1:2,20:23)]
## # A tibble: 6 x 6
##    year month  gain hours gain_per_hour gain_per_minute
##   <int> <int> <dbl> <dbl>         <dbl>           <dbl>
## 1  2013     1     9  3.78          2.38            143.
## 2  2013     1    16  3.78          4.23            254.
## 3  2013     1    31  2.67         11.6             698.
## 4  2013     1   -17  3.05         -5.57           -334.
## 5  2013     1   -19  1.93         -9.83           -590.
## 6  2013     1    16  2.5           6.4             384

Transmute: Keep Only variables created

fruit <- c("apple","pear","orange","grape", "orange","orange")
x <- c(1,2,4,9,4,6)
y <- c(22,3,4,55,15,9)
z <- c(3,1,4,10,12,8)
df <- data.frame(fruit,x,y,z)
df
##    fruit x  y  z
## 1  apple 1 22  3
## 2   pear 2  3  1
## 3 orange 4  4  4
## 4  grape 9 55 10
## 5 orange 4 15 12
## 6 orange 6  9  8
df <- df %>%
  mutate(new.variable = x+y+z)
df
##    fruit x  y  z new.variable
## 1  apple 1 22  3           26
## 2   pear 2  3  1            6
## 3 orange 4  4  4           12
## 4  grape 9 55 10           74
## 5 orange 4 15 12           31
## 6 orange 6  9  8           23
df1 <- transmute(df, new.variable = x+y+z)
df1
##   new.variable
## 1           26
## 2            6
## 3           12
## 4           74
## 5           31
## 6           23

Use across to apply a function over multiple columns

double.it <- function(x) x*2
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
iris %>%
  mutate(across(where(is.numeric), double.it))%>%
  head()
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1         10.2         7.0          2.8         0.4  setosa
## 2          9.8         6.0          2.8         0.4  setosa
## 3          9.4         6.4          2.6         0.4  setosa
## 4          9.2         6.2          3.0         0.4  setosa
## 5         10.0         7.2          2.8         0.4  setosa
## 6         10.8         7.8          3.4         0.8  setosa

Conditional mutating using case_when

row1 <- c("a","b","c","d","e","f","column.to.be.changed")
row2 <- c(1,1,1,6,6,1,2)
row3 <- c(3,4,4,6,4,4,4)
row4 <- c(4,6,25,5,5,2,9)
row5 <- c(5,3,6,3,3,6,2)
df <- as.data.frame(rbind(row2,row3,row4,row5))
names(df) <- row1
df
##      a b  c d e f column.to.be.changed
## row2 1 1  1 6 6 1                    2
## row3 3 4  4 6 4 4                    4
## row4 4 6 25 5 5 2                    9
## row5 5 3  6 3 3 6                    2
new.df <- df %>%
  mutate(column.to.be.changed = case_when(a==2 | a==5 | a==7 | (a==1 & b==4)~2, a==0 | a==1 | a==4 |
 a==3 | c==4~3, TRUE ~ NA_real_ ))
new.df
##      a b  c d e f column.to.be.changed
## row2 1 1  1 6 6 1                    3
## row3 3 4  4 6 4 4                    3
## row4 4 6 25 5 5 2                    3
## row5 5 3  6 3 3 6                    2

Select to choose variables/columns

Delete a column

fruit <- c("apple","pear","orange","grape", "orange","orange")
x <- c(1,2,4,9,4,6)
y <- c(22,3,4,55,15,9)
z <- c(3,1,4,10,12,8)
df <- data.frame(fruit,x,y,z) #before select
df
##    fruit x  y  z
## 1  apple 1 22  3
## 2   pear 2  3  1
## 3 orange 4  4  4
## 4  grape 9 55 10
## 5 orange 4 15 12
## 6 orange 6  9  8
df.no.fruit <- dplyr::select(df, -fruit)
df.no.fruit
##   x  y  z
## 1 1 22  3
## 2 2  3  1
## 3 4  4  4
## 4 9 55 10
## 5 4 15 12
## 6 6  9  8

Delete columns by name using starts_with or ends_with

names(mtcars)
##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"
mtcar.sin.startd.endt <- mtcars%>%
  select(-starts_with("d"), -ends_with("t")) 
names(mtcar.sin.startd.endt)
## [1] "mpg"  "cyl"  "hp"   "qsec" "vs"   "am"   "gear" "carb"

select_all to apply a function to all columns

state <- c("Maryland", "Alaska", "New Jersey")
income <- c(76067,74444,73702)
median.us <- c(61372,61372,61372)
life.expectancy <- c(78.8,78.3,80.3)
top.3.states <- data.frame(state, income, median.us,
 life.expectancy)
top.3.states
##        state income median.us life.expectancy
## 1   Maryland  76067     61372            78.8
## 2     Alaska  74444     61372            78.3
## 3 New Jersey  73702     61372            80.3
#Capitalize column names con toupper()
new.top.states <- top.3.states %>%
  select_all(toupper)
new.top.states
##        STATE INCOME MEDIAN.US LIFE.EXPECTANCY
## 1   Maryland  76067     61372            78.8
## 2     Alaska  74444     61372            78.3
## 3 New Jersey  73702     61372            80.3

Select columns using the pull function

#pull function: acts like dataframe$variable syntax
pull.first.column <- pull(top.3.states,1)
pull.first.column
## [1] "Maryland"   "Alaska"     "New Jersey"
top.3.states$state
## [1] "Maryland"   "Alaska"     "New Jersey"
pull.last.colum <- pull(top.3.states, -1)
pull.last.colum
## [1] 78.8 78.3 80.3

Select rows: any variable meets some condition

nrow(mtcars)
## [1] 32
mtcars.more.than.200 <- filter_all(mtcars, any_vars(. > 200))
nrow(mtcars.more.than.200)
## [1] 16

Select using wildcard matching

names(mtcars)
##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"
subset.mtcars <- mtcars %>%
 select(matches("pg|gea"))
names(subset.mtcars)
## [1] "mpg"  "gear"

Joins: manipulations of data from two sources

  • mutating joins: add new variables to one data from matching observations in another
  • filtering joins: filter observations from one dataframe based on match an observation in the other table
  • set operations: treat observations as if they were set elements Connections between tables: primary key (tabla A) –> foreign key (tabla B)

Left join

#cbind: take data-frame arguments and combine by columns
us.state.areas <- as.data.frame(cbind(state.abb, state.area))
us.state.areas[1:3,]
##   state.abb state.area
## 1        AL      51609
## 2        AK     589757
## 3        AZ     113909
us.state.abb.name <- as.data.frame(cbind(state.abb,state.name))
us.state.abb.name[1:3,]
##   state.abb state.name
## 1        AL    Alabama
## 2        AK     Alaska
## 3        AZ    Arizona
state.info.abb.area.name <- us.state.areas %>%
  left_join(us.state.abb.name, by = "state.abb")
head(state.info.abb.area.name)
##   state.abb state.area state.name
## 1        AL      51609    Alabama
## 2        AK     589757     Alaska
## 3        AZ     113909    Arizona
## 4        AR      53104   Arkansas
## 5        CA     158693 California
## 6        CO     104247   Colorado

Inner join

#rows that do not match are dropped
names <- c("Sally","Tom","Frieda","Alfonzo")
team.scores <- c(3,5,2,7)
team.league <- c("alpha","beta","gamma", "omicron")
team.info <- data.frame(names, team.scores, team.league)
team.info
##     names team.scores team.league
## 1   Sally           3       alpha
## 2     Tom           5        beta
## 3  Frieda           2       gamma
## 4 Alfonzo           7     omicron
names = c("Sally","Tom", "Bill", "Alfonzo")
school.grades <- c("A","B","C","B")
school.info <- data.frame(names, school.grades)
school.info
##     names school.grades
## 1   Sally             A
## 2     Tom             B
## 3    Bill             C
## 4 Alfonzo             B
school.and.team <- inner_join(team.info, school.info, by = "names")
school.and.team
##     names team.scores team.league school.grades
## 1   Sally           3       alpha             A
## 2     Tom           5        beta             B
## 3 Alfonzo           7     omicron             B

Anti-join

#Keeps values that do not match
names <- c("Sally","Tom","Frieda","Alfonzo")
team.scores <- c(3,5,2,7)
team.league <- c("alpha","beta","gamma", "omicron")
team.info <- data.frame(names, team.scores, team.league)
team.info
##     names team.scores team.league
## 1   Sally           3       alpha
## 2     Tom           5        beta
## 3  Frieda           2       gamma
## 4 Alfonzo           7     omicron
names <- c("Sally","Tom", "Bill", "Alfonzo")
school.grades <- c("A","B","C","B")
school.info <- data.frame(names, school.grades)
school.info
##     names school.grades
## 1   Sally             A
## 2     Tom             B
## 3    Bill             C
## 4 Alfonzo             B
team.info.but.no.grades <- anti_join(team.info, school.info, by = "names")
team.info.but.no.grades
##    names team.scores team.league
## 1 Frieda           2       gamma

Full join

#Keeps all values from both dataframes
team.info.and.or.grades <- full_join(team.info, school.info, by = "names")
team.info.and.or.grades
##     names team.scores team.league school.grades
## 1   Sally           3       alpha             A
## 2     Tom           5        beta             B
## 3  Frieda           2       gamma          <NA>
## 4 Alfonzo           7     omicron             B
## 5    Bill          NA        <NA>             C

Semi-join

#keeps all observations in dataset1 which match dataset2
team.info.with.grades <- semi_join(team.info, school.info)
## Joining, by = "names"
team.info.with.grades
##     names team.scores team.league
## 1   Sally           3       alpha
## 2     Tom           5        beta
## 3 Alfonzo           7     omicron

Right join

#returns all rows of y and columns of x and y
#right_join(x, y, by= common_key)
us.state.areas<- as.data.frame(cbind(state.abb,state.area))
us.state.areas[1:3,]
##   state.abb state.area
## 1        AL      51609
## 2        AK     589757
## 3        AZ     113909
us.state.abb.name <- as.data.frame(cbind(state.abb,state.name))
us.state.abb.name[1:3,]
##   state.abb state.name
## 1        AL    Alabama
## 2        AK     Alaska
## 3        AZ    Arizona
us.state.abb.name[1,1] <- "Intentional Mismatch"
us.state.with.abbreviation.and.name.and.area <- right_join(us.state.areas,us.state.abb.name, by = "state.abb")
us.state.with.abbreviation.and.name.and.area [1:3,]
##   state.abb state.area state.name
## 1        AK     589757     Alaska
## 2        AZ     113909    Arizona
## 3        AR      53104   Arkansas

Slice

nrow(msleep)
## [1] 83
msleep.only.first.5 <- slice(msleep, -6:-n())
nrow(msleep.only.first.5)
## [1] 5
msleep.20.rows <- msleep %>%
 slice(20:39)
nrow(msleep.20.rows)
## [1] 20
nrow(msleep) - nrow(msleep.20.rows)
## [1] 63

Summarise

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
data(gehan)
gehan2 <- gehan
library(tidyverse)
gehan2 %>% summarise(kount = n())
##   kount
## 1    42
gehan2 %>%
 group_by(treat)%>%
  summarise(kount = n())
## # A tibble: 2 x 2
##   treat   kount
##   <fct>   <int>
## 1 6-MP       21
## 2 control    21
gehan2 %>%
 group_by(treat) %>%
 summarise(average.remiss.time = mean(time),
 median.remiss.time = median(time),
 std.dev.remiss.time = sd(time),
 median.abs.deviation = mad(time),
 IQR.remiss.time = IQR(time))
## # A tibble: 2 x 6
##   treat   average.remiss.time median.remiss.t~ std.dev.remiss.~ median.abs.devi~
##   <fct>                 <dbl>            <int>            <dbl>            <dbl>
## 1 6-MP                  17.1                16            10.0             10.4 
## 2 control                8.67                8             6.47             5.93
## # ... with 1 more variable: IQR.remiss.time <dbl>
gehan2 %>%
 group_by(treat) %>%
 summarise(minimum.remission = min(time),
 max.remission = max(time))
## # A tibble: 2 x 3
##   treat   minimum.remission max.remission
##   <fct>               <int>         <int>
## 1 6-MP                    6            35
## 2 control                 1            23

Summarise Across

subset.survey <- survey [1:10,]
library (dplyr)
names(subset.survey)
##  [1] "Sex"    "Wr.Hnd" "NW.Hnd" "W.Hnd"  "Fold"   "Pulse"  "Clap"   "Exer"  
##  [9] "Smoke"  "Height" "M.I"    "Age"
subset.survey %>%
  na.omit() %>%
  group_by(Sex) %>%
  summarise(across(where(is.numeric), mean, .names = "mean_{col}")) %>%
  head()
## # A tibble: 2 x 6
##   Sex    mean_Wr.Hnd mean_NW.Hnd mean_Pulse mean_Height mean_Age
##   <fct>        <dbl>       <dbl>      <dbl>       <dbl>    <dbl>
## 1 Female        17.8        17.7       76.7        168.     25.0
## 2 Male          19.1        19.2       76.8        174.     20.3
View(msleep)
new.sleep <- msleep %>%
 group_by(vore, order) 
s <- summarise(new.sleep, n())
## `summarise()` has grouped output by 'vore'. You can override using the `.groups` argument.
s
## # A tibble: 32 x 3
## # Groups:   vore [5]
##    vore  order           `n()`
##    <chr> <chr>           <int>
##  1 carni Carnivora          12
##  2 carni Cetacea             3
##  3 carni Cingulata           1
##  4 carni Didelphimorphia     1
##  5 carni Primates            1
##  6 carni Rodentia            1
##  7 herbi Artiodactyla        5
##  8 herbi Diprotodontia       1
##  9 herbi Hyracoidea          2
## 10 herbi Lagomorpha          1
## # ... with 22 more rows
new.sleep.totals <- msleep %>%
 group_by(vore, order) %>%
 summarise(n())
## `summarise()` has grouped output by 'vore'. You can override using the `.groups` argument.
new.sleep.totals
## # A tibble: 32 x 3
## # Groups:   vore [5]
##    vore  order           `n()`
##    <chr> <chr>           <int>
##  1 carni Carnivora          12
##  2 carni Cetacea             3
##  3 carni Cingulata           1
##  4 carni Didelphimorphia     1
##  5 carni Primates            1
##  6 carni Rodentia            1
##  7 herbi Artiodactyla        5
##  8 herbi Diprotodontia       1
##  9 herbi Hyracoidea          2
## 10 herbi Lagomorpha          1
## # ... with 22 more rows

Gathering: Convert multiple columns into one

state <- c("Maryland", "Alaska", "New Jersey")
income <- c(76067,74444,73702)
median.us <- c(61372,61372,61372)
life.expectancy <- c(78.8,78.3,80.3)
teen.birth.rate.2015 <- c(17,29.3,12.1)
teen.birth.rate.2007 <- c(34.3,42.9,24.9)
teen.birth.rate.1991 <- c(54.1, 66, 41.3)
top.3.states <- data.frame(state, income, median.us,
 life.expectancy,
 teen.birth.rate.2015, teen.birth.rate.2007,
 teen.birth.rate.1991)
names(top.3.states)  <- c("state", "income", "median.us",
 "life.expectancy","2015","2007","1991")
top.3.states
##        state income median.us life.expectancy 2015 2007 1991
## 1   Maryland  76067     61372            78.8 17.0 34.3 54.1
## 2     Alaska  74444     61372            78.3 29.3 42.9 66.0
## 3 New Jersey  73702     61372            80.3 12.1 24.9 41.3
#gather to put all three years in one column
new.top.3.states <- top.3.states %>%
 gather("2015", "2007", "1991", key = "year", value = "cases")
new.top.3.states
##        state income median.us life.expectancy year cases
## 1   Maryland  76067     61372            78.8 2015  17.0
## 2     Alaska  74444     61372            78.3 2015  29.3
## 3 New Jersey  73702     61372            80.3 2015  12.1
## 4   Maryland  76067     61372            78.8 2007  34.3
## 5     Alaska  74444     61372            78.3 2007  42.9
## 6 New Jersey  73702     61372            80.3 2007  24.9
## 7   Maryland  76067     61372            78.8 1991  54.1
## 8     Alaska  74444     61372            78.3 1991  66.0
## 9 New Jersey  73702     61372            80.3 1991  41.3
#key and value -> new columns

Spreading: Consolidation of multiple rows into one

df_1 <- data_frame(Type = c("TypeA", "TypeA", "TypeB", "TypeB"),
 Answer = c("Yes", "No", NA, "No"), n = 1:4)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
df_1
## # A tibble: 4 x 3
##   Type  Answer     n
##   <chr> <chr>  <int>
## 1 TypeA Yes        1
## 2 TypeA No         2
## 3 TypeB <NA>       3
## 4 TypeB No         4
df_2 <- df_1 %>%
filter(!is.na(Answer)) %>%
 spread(key=Answer, value=n)
df_2
## # A tibble: 2 x 3
##   Type     No   Yes
##   <chr> <int> <int>
## 1 TypeA     2     1
## 2 TypeB     4    NA

Separate: Divide a single column into multiple columns

state <- c("Maryland", "Alaska", "New Jersey")
income <- c(76067,74444,73702)
median.us <- c(61372,61372,61372)
life.expectancy <- c(78.8,78.3,80.3)
teen.birth <- c("17//34.3//54.1", "29.0//42.9//66.0", "12.1//24.9//41.3")
top.3.states <- data.frame(state, income, median.us,
 life.expectancy,teen.birth)
top.3.states
##        state income median.us life.expectancy       teen.birth
## 1   Maryland  76067     61372            78.8   17//34.3//54.1
## 2     Alaska  74444     61372            78.3 29.0//42.9//66.0
## 3 New Jersey  73702     61372            80.3 12.1//24.9//41.3
top.3.states.separated.years <- top.3.states %>%
 separate(teen.birth,
 into = c("2015", "2007","1991"), sep = "//")
top.3.states.separated.years
##        state income median.us life.expectancy 2015 2007 1991
## 1   Maryland  76067     61372            78.8   17 34.3 54.1
## 2     Alaska  74444     61372            78.3 29.0 42.9 66.0
## 3 New Jersey  73702     61372            80.3 12.1 24.9 41.3

Recap of Handy DPLYR functions

Basic counts

new.sleep
## # A tibble: 83 x 11
## # Groups:   vore, order [32]
##    name   genus vore  order conservation sleep_total sleep_rem sleep_cycle awake
##    <chr>  <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl> <dbl>
##  1 Cheet~ Acin~ carni Carn~ lc                  12.1      NA        NA      11.9
##  2 Owl m~ Aotus omni  Prim~ <NA>                17         1.8      NA       7  
##  3 Mount~ Aplo~ herbi Rode~ nt                  14.4       2.4      NA       9.6
##  4 Great~ Blar~ omni  Sori~ lc                  14.9       2.3       0.133   9.1
##  5 Cow    Bos   herbi Arti~ domesticated         4         0.7       0.667  20  
##  6 Three~ Brad~ herbi Pilo~ <NA>                14.4       2.2       0.767   9.6
##  7 North~ Call~ carni Carn~ vu                   8.7       1.4       0.383  15.3
##  8 Vespe~ Calo~ <NA>  Rode~ <NA>                 7        NA        NA      17  
##  9 Dog    Canis carni Carn~ domesticated        10.1       2.9       0.333  13.9
## 10 Roe d~ Capr~ herbi Arti~ lc                   3        NA        NA      21  
## # ... with 73 more rows, and 2 more variables: brainwt <dbl>, bodywt <dbl>
m <- mutate(new.sleep, kount = n()) 
m[1:6,c(1:4,10:12)]
## # A tibble: 6 x 7
## # Groups:   vore, order [6]
##   name                       genus      vore  order        brainwt  bodywt kount
##   <chr>                      <chr>      <chr> <chr>          <dbl>   <dbl> <int>
## 1 Cheetah                    Acinonyx   carni Carnivora   NA        50        12
## 2 Owl monkey                 Aotus      omni  Primates     0.0155    0.48     10
## 3 Mountain beaver            Aplodontia herbi Rodentia    NA         1.35     16
## 4 Greater short-tailed shrew Blarina    omni  Soricomorp~  0.00029   0.019     3
## 5 Cow                        Bos        herbi Artiodacty~  0.423   600         5
## 6 Three-toed sloth           Bradypus   herbi Pilosa      NA         3.85      1
f <- filter(new.sleep, n() > 14)
f[1:5,c(1:4,10:11)]
## # A tibble: 5 x 6
## # Groups:   vore, order [1]
##   name                      genus      vore  order    brainwt bodywt
##   <chr>                     <chr>      <chr> <chr>      <dbl>  <dbl>
## 1 Mountain beaver           Aplodontia herbi Rodentia NA       1.35 
## 2 Guinea pig                Cavis      herbi Rodentia  0.0055  0.728
## 3 Chinchilla                Chinchilla herbi Rodentia  0.0064  0.42 
## 4 Western american chipmunk Eutamias   herbi Rodentia NA       0.071
## 5 Mongolian gerbil          Meriones   herbi Rodentia NA       0.053

Nth functions

salary.description <- c("Golden parachute type","Well to do",
"Average","Below average", "bring date seeds instead of flowers")
first(salary.description)
## [1] "Golden parachute type"
last(salary.description)
## [1] "bring date seeds instead of flowers"
nth(salary.description, -3)
## [1] "Average"
nth(salary.description,2)
## [1] "Well to do"

Count distinct values

a.vector <- c(22,33,44,1,2,3,3,3,4)
original.length <- length(a.vector)
original.length
## [1] 9
distinct.a.vector <- n_distinct(a.vector)
distinct.a.vector
## [1] 7
test1 <- if_else(original.length == distinct.a.vector, "all values
unique","some duplicate values in vector")
test1
## [1] "some duplicate values in vector"
b.vector <- c(1,2,3,4,5,6)
length(b.vector)
## [1] 6
distinct.b.vector <- n_distinct(b.vector)
distinct.b.vector
## [1] 6
test2 <- if_else(length(b.vector) == distinct.b.vector, "all values
unique", "duplicates")
test2
## [1] "all values\nunique"

na_if

test <- c(100, 0, 999)
x <- 5000/test
x
## [1] 50.000000       Inf  5.005005
x <- 5000/na_if(test,0) 
x
## [1] 50.000000        NA  5.005005

Coalesce to Replace Missing Values

#Replace missing values with zero
x <- c(33,4,11,NA,9)
x
## [1] 33  4 11 NA  9
x <- coalesce(x,0)
x
## [1] 33  4 11  0  9

Ranking functions

#Ranking via index
y <- c(100,4,12,6,8,3)
rank1 <-row_number(y)
rank1
## [1] 6 2 5 3 4 1
y[rank1[1]] 
## [1] 3
y[rank1[6]] 
## [1] 100
#Minimum rank
rank2 <- min_rank(y)
rank2
## [1] 6 2 5 3 4 1
#Dense rank
rank3 <- dense_rank(y)
rank3
## [1] 6 2 5 3 4 1
#Percent rank
rank4 <- percent_rank(y)
rank4
## [1] 1.0 0.2 0.8 0.4 0.6 0.0
#Cumulative distribution function
y <- c(100,4,12,6,8,3)
rank5 <- cume_dist(y)
rank5
## [1] 1.0000000 0.3333333 0.8333333 0.5000000 0.6666667 0.1666667
rank6 = ntile(y, 3)
rank6
## [1] 3 1 3 2 2 1
test.vector <- c(2,22,33,44,77,89,99)
quantile(test.vector, prob = seq(0,1,length = 11),type = 5)
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##  2.0  6.0 20.0 28.6 36.3 44.0 67.1 81.8 90.0 97.0 99.0

Sampling

#Randomly sample 5 rows
data("ChickWeight")
my.sample <- sample_n(ChickWeight, 5)
my.sample
##   weight Time Chick Diet
## 1     92   14     9    1
## 2    124   10    25    2
## 3    116   10    36    3
## 4    205   18    26    2
## 5    136   12    26    2
set.seed(833)
my.sample <- sample_n(ChickWeight, 10, replace = TRUE)
my.sample
##    weight Time Chick Diet
## 1      98    8    45    4
## 2      42    0    17    1
## 3      98    8    36    3
## 4      51    2    11    1
## 5     198   20     3    1
## 6     237   21    49    4
## 7     205   16    50    4
## 8     170   16    39    3
## 9     332   18    35    3
## 10    144   14    33    3
my.sample <- sample_n(mtcars, 12, weight = cyl)
my.sample[,1:5]
##                     mpg cyl  disp  hp drat
## AMC Javelin        15.2   8 304.0 150 3.15
## Porsche 914-2      26.0   4 120.3  91 4.43
## Merc 280           19.2   6 167.6 123 3.92
## Cadillac Fleetwood 10.4   8 472.0 205 2.93
## Merc 240D          24.4   4 146.7  62 3.69
## Datsun 710         22.8   4 108.0  93 3.85
## Merc 280C          17.8   6 167.6 123 3.92
## Mazda RX4 Wag      21.0   6 160.0 110 3.90
## Merc 450SLC        15.2   8 275.8 180 3.07
## Chrysler Imperial  14.7   8 440.0 230 3.23
## Maserati Bora      15.0   8 301.0 335 3.54
## Valiant            18.1   6 225.0 105 2.76
#to obtain a sample equal to a specific percentage 
test1 <- sample_frac(ChickWeight, 0.02)
test1
##    weight Time Chick Diet
## 1      48    2    13    1
## 2      62    6    12    1
## 3     197   20    45    4
## 4     234   18    42    4
## 5      58    4    28    2
## 6     163   16     3    1
## 7     103    8    41    4
## 8     103    8    42    4
## 9     120   18    19    1
## 10     48    2    36    3
## 11     80    6    48    4
## 12    137   12    33    3
View(starwars)
by_hair_color <- starwars %>%
  group_by(hair_color) %>%
  sample_frac(.07,replace=TRUE)
by_hair_color
## # A tibble: 5 x 14
## # Groups:   hair_color [3]
##   name       height  mass hair_color skin_color eye_color birth_year sex   gender
##   <chr>       <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
## 1 Eeth Koth     171    NA black      brown      brown             NA male  mascu~
## 2 Dormé         165    NA brown      light      brown             NA fema~ femin~
## 3 Sebulba       112    40 none       grey, red  orange            NA male  mascu~
## 4 Shaak Ti      178    57 none       red, blue~ black             NA fema~ femin~
## 5 Tion Medon    206    80 none       grey       black             NA male  mascu~
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
row.kount.only <- ChickWeight %>% tally()
row.kount.only
##     n
## 1 578
diet.kount <- ChickWeight %>% count(Diet)
diet.kount
##   Diet   n
## 1    1 220
## 2    2 120
## 3    3 120
## 4    4 118

Miscellaneous DPLYR functions

add_count for groupwise filtering

single.species.kount <- starwars %>%
 add_count(species) %>%
 filter(n == 1)
single.species.kount[,1:6]
## # A tibble: 29 x 6
##    name                  height  mass hair_color skin_color       eye_color
##    <chr>                  <int> <dbl> <chr>      <chr>            <chr>    
##  1 Greedo                   173    74 <NA>       green            black    
##  2 Jabba Desilijic Tiure    175  1358 <NA>       green-tan, brown orange   
##  3 Yoda                      66    17 white      green            brown    
##  4 Bossk                    190   113 none       green            red      
##  5 Ackbar                   180    83 none       brown mottle     orange   
##  6 Wicket Systri Warrick     88    20 brown      brown            brown    
##  7 Nien Nunb                160    68 none       grey             black    
##  8 Nute Gunray              191    90 none       mottled green    red      
##  9 Watto                    137    NA black      blue, grey       yellow   
## 10 Sebulba                  112    40 none       grey, red        orange   
## # ... with 19 more rows

Rename

mtcars <- rename(mtcars, spam_mpg = mpg)
names(mtcars)
##  [1] "spam_mpg" "cyl"      "disp"     "hp"       "drat"     "wt"      
##  [7] "qsec"     "vs"       "am"       "gear"     "carb"

case_when

new.starwars <- starwars %>%
 dplyr::select(name, mass, gender, species, height) %>%
 mutate(type = case_when(height > 200 | mass > 200 ~ "large",
 species == "Droid" ~ "robot", TRUE ~ "other"))
new.starwars
## # A tibble: 87 x 6
##    name                mass gender    species height type 
##    <chr>              <dbl> <chr>     <chr>    <int> <chr>
##  1 Luke Skywalker        77 masculine Human      172 other
##  2 C-3PO                 75 masculine Droid      167 robot
##  3 R2-D2                 32 masculine Droid       96 robot
##  4 Darth Vader          136 masculine Human      202 large
##  5 Leia Organa           49 feminine  Human      150 other
##  6 Owen Lars            120 masculine Human      178 other
##  7 Beru Whitesun lars    75 feminine  Human      165 other
##  8 R5-D4                 32 masculine Droid       97 robot
##  9 Biggs Darklighter     84 masculine Human      183 other
## 10 Obi-Wan Kenobi        77 masculine Human      182 other
## # ... with 77 more rows