Cap 1 dplyr DSCG

R Markdown

# 1.1.1 Single-condition filter

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Load mtcars and keep only the cars with six cylinders.
data("mtcars")
six.cyl.only <- mtcars %>%
  filter(cyl == 6)
six.cyl.only

##                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Valiant        18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Merc 280       19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C      17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Ferrari Dino   19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6

# Select the cars with more than 150 hp.
mas.cientocincuenta <- mtcars %>%
  filter(hp > 150)
mas.cientocincuenta

##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8

# 1.1.2 Multiple-condition filter
# Both conditions must hold: six cylinders and exactly 110 hp.
six.cylinders.and.110.horse.power <- mtcars %>%
  filter(cyl == 6 & hp == 110)
six.cylinders.and.110.horse.power

##                 mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4      21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag  21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Hornet 4 Drive 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1

# 1.1.3 OR logic for filtering
# Keep cars that have more than six cylinders or exactly four gears.
gear.eq.4.or.more.than.8 <- mtcars %>%
  filter(cyl > 6 | gear == 4)
gear.eq.4.or.more.than.8

##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2

# 1.1.4 Filter by minimums, maximums, and other numeric criteria
# Keep the row(s) whose displacement equals the smallest value in the column.
smallest.engine.displacement <- mtcars %>%
  filter(disp == min(disp))
smallest.engine.displacement

##                 mpg cyl disp hp drat    wt qsec vs am gear carb
## Toyota Corolla 33.9   4 71.1 65 4.22 1.835 19.9  1  1    4    1

# Load the built-in ChickWeight data. Dataset names are case-sensitive, so
# the name must be spelled "ChickWeight"; "Chickweight" only raises a
# "data set not found" warning and loads nothing.
data("ChickWeight")

# Keep measurements taken before Time 3 where the weight is above 53.
chick.subset <- filter(ChickWeight, Time < 3, weight > 53)
chick.subset

##   weight Time Chick Diet
## 1     55    2    22    2
## 2     55    2    40    3
## 3     55    2    43    4
## 4     54    2    50    4

# 1.1.5 Filter out missing values (NAs) for a specific column
data("airquality")
head(airquality, n = 10) # before the filter

##    Ozone Solar.R Wind Temp Month Day
## 1     41     190  7.4   67     5   1
## 2     36     118  8.0   72     5   2
## 3     12     149 12.6   74     5   3
## 4     18     313 11.5   62     5   4
## 5     NA      NA 14.3   56     5   5
## 6     28      NA 14.9   66     5   6
## 7     23     299  8.6   65     5   7
## 8     19      99 13.8   59     5   8
## 9      8      19 20.1   61     5   9
## 10    NA     194  8.6   69     5  10

# Drop every row whose Ozone reading is missing.
no.missing.ozone <- airquality %>%
  filter(!is.na(Ozone))
head(no.missing.ozone, n = 8) # after the filter

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    28      NA 14.9   66     5   6
## 6    23     299  8.6   65     5   7
## 7    19      99 13.8   59     5   8
## 8     8      19 20.1   61     5   9

# 1.1.6 Filter rows with NAs anywhere in the dataset
# Take the first ten rows and keep only those with no missing value in any
# column.
airqual.no.NA.anywhere <- airquality[1:10, ] %>%
  filter(complete.cases(.))
airqual.no.NA.anywhere

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    23     299  8.6   65     5   7
## 6    19      99 13.8   59     5   8
## 7     8      19 20.1   61     5   9

# 1.1.7 Filter by %in%
data("iris")
table(iris[["Species"]]) # counts of each species in the dataset

## 
##     setosa versicolor  virginica 
##         50         50         50

# Keep only the rows whose species is one of the two listed values.
iris.two.species <- iris %>%
  filter(Species %in% c("setosa", "virginica"))
table(iris.two.species[["Species"]])

## 
##     setosa versicolor  virginica 
##         50          0         50

# Row counts before and after the species filter.
nrow(iris)
nrow(iris.two.species)

## [1] 150

## [1] 100

# Select the cars with more than 30 mpg and a carb value of 1.
mas.30mpg.and.carb1 <- mtcars %>%
  filter(mpg > 30 & carb == 1)
mas.30mpg.and.carb1

##                 mpg cyl disp hp drat    wt  qsec vs am gear carb
## Fiat 128       32.4   4 78.7 66 4.08 2.200 19.47  1  1    4    1
## Toyota Corolla 33.9   4 71.1 65 4.22 1.835 19.90  1  1    4    1

# 1.1.8 Filter for Ozone > 29 and include only three columns
data("airquality")
airqual.3.coloumns <- airquality %>%
  filter(Ozone > 29) %>%
  dplyr::select(1:3) # first three columns
head(airqual.3.coloumns)

##   Ozone Solar.R Wind
## 1    41     190  7.4
## 2    36     118  8.0
## 3    34     307 12.0
## 4    30     322 11.5
## 5    32      92 12.0
## 6    45     252 14.9

# Rows with Wind > 7, keeping columns three to five.
data("airquality")
wind.3.columns <- airquality %>%
  filter(Wind > 7) %>%
  dplyr::select(3:5)
head(wind.3.columns)

##   Wind Temp Month
## 1  7.4   67     5
## 2  8.0   72     5
## 3 12.6   74     5
## 4 11.5   62     5
## 5 14.3   56     5
## 6 14.9   66     5

# Filter by total frequency of a value across all rows
# Start by counting how often each gear value occurs.
table(mtcars[["gear"]])

## 
##  3  4  5 
## 15 12  5

# Keep only the rows whose gear value occurs more than 10 times.
# n() is the size of the current gear group. The grouping is removed
# afterwards so that later verbs on this result are not silently applied
# per group.
more.frequent.no.of.gear <- mtcars %>%
  group_by(gear) %>%
  filter(n() > 10) %>%
  ungroup()
table(more.frequent.no.of.gear$gear)

## 
##  3  4 
## 15 12

# Additional criteria can be added to the filter, here by also requiring
# that the horsepower be less than 105.
# The grouping is removed afterwards so later verbs act on the whole table.
more.frequent.no.of.gears.and.low.horsepower <- mtcars %>%
  group_by(gear) %>%
  filter(n() > 10, hp < 105) %>%
  ungroup()
table(more.frequent.no.of.gears.and.low.horsepower$gear)

## 
## 3 4 
## 1 7

# 1.1.10 Filter by column name using "starts with"
colnames(iris) # show the column names

## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"

# Keep the columns selected by starts_with("s").
iris.display <- dplyr::select(iris, starts_with("s"))
head(iris.display) # use head to reduce the number of rows output

##   Sepal.Length Sepal.Width Species
## 1          5.1         3.5  setosa
## 2          4.9         3.0  setosa
## 3          4.7         3.2  setosa
## 4          4.6         3.1  setosa
## 5          5.0         3.6  setosa
## 6          5.4         3.9  setosa

# Filter rows: columns meet criteria (filter_at)
# Keep the rows where both cyl and hp equal their own column maximum.
new.mtcars <- filter_at(
  mtcars,
  vars(cyl, hp),
  all_vars(. == max(.))
)
new.mtcars

##               mpg cyl disp  hp drat   wt qsec vs am gear carb
## Maserati Bora  15   8  301 335 3.54 3.57 14.6  0  1    5    8

# Take a local copy of the msleep tibble shipped with ggplot2 and print it.
msleep <- ggplot2::msleep
msleep

## # A tibble: 83 x 11
##    name   genus vore  order conservation sleep_total sleep_rem sleep_cycle awake
##    <chr>  <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl> <dbl>
##  1 Cheet~ Acin~ carni Carn~ lc                  12.1      NA        NA      11.9
##  2 Owl m~ Aotus omni  Prim~ <NA>                17         1.8      NA       7  
##  3 Mount~ Aplo~ herbi Rode~ nt                  14.4       2.4      NA       9.6
##  4 Great~ Blar~ omni  Sori~ lc                  14.9       2.3       0.133   9.1
##  5 Cow    Bos   herbi Arti~ domesticated         4         0.7       0.667  20  
##  6 Three~ Brad~ herbi Pilo~ <NA>                14.4       2.2       0.767   9.6
##  7 North~ Call~ carni Carn~ vu                   8.7       1.4       0.383  15.3
##  8 Vespe~ Calo~ <NA>  Rode~ <NA>                 7        NA        NA      17  
##  9 Dog    Canis carni Carn~ domesticated        10.1       2.9       0.333  13.9
## 10 Roe d~ Capr~ herbi Arti~ lc                   3        NA        NA      21  
## # ... with 73 more rows, and 2 more variables: brainwt <dbl>, bodywt <dbl>

# Select the name, sleep and weight columns, then keep the rows where every
# column whose name contains "sleep" is greater than 5.
msleep.over.5 <- filter_at(
  select(msleep, name, sleep_total:sleep_rem, brainwt:bodywt),
  vars(contains("sleep")),
  all_vars(. > 5)
)
msleep.over.5

## # A tibble: 2 x 5
##   name                 sleep_total sleep_rem brainwt bodywt
##   <chr>                      <dbl>     <dbl>   <dbl>  <dbl>
## 1 Thick-tailed opposum        19.4       6.6  NA       0.37
## 2 Giant armadillo             18.1       6.1   0.081  60

# 1.2 Arrange (sort)

# Show the first four columns of msleep before sorting.
msleep <- ggplot2::msleep
msleep[, 1:4]

## # A tibble: 83 x 4
##    name                       genus       vore  order       
##    <chr>                      <chr>       <chr> <chr>       
##  1 Cheetah                    Acinonyx    carni Carnivora   
##  2 Owl monkey                 Aotus       omni  Primates    
##  3 Mountain beaver            Aplodontia  herbi Rodentia    
##  4 Greater short-tailed shrew Blarina     omni  Soricomorpha
##  5 Cow                        Bos         herbi Artiodactyla
##  6 Three-toed sloth           Bradypus    herbi Pilosa      
##  7 Northern fur seal          Callorhinus carni Carnivora   
##  8 Vesper mouse               Calomys     <NA>  Rodentia    
##  9 Dog                        Canis       carni Carnivora   
## 10 Roe deer                   Capreolus   herbi Artiodactyla
## # ... with 73 more rows

# 1.2.1 Ascending
# Sort by vore, then by order within each vore.
animal.name.sequence <- msleep %>%
  arrange(vore, order)
animal.name.sequence[, 1:4]

## # A tibble: 83 x 4
##    name              genus        vore  order    
##    <chr>             <chr>        <chr> <chr>    
##  1 Cheetah           Acinonyx     carni Carnivora
##  2 Northern fur seal Callorhinus  carni Carnivora
##  3 Dog               Canis        carni Carnivora
##  4 Domestic cat      Felis        carni Carnivora
##  5 Gray seal         Haliochoerus carni Carnivora
##  6 Tiger             Panthera     carni Carnivora
##  7 Jaguar            Panthera     carni Carnivora
##  8 Lion              Panthera     carni Carnivora
##  9 Caspian seal      Phoca        carni Carnivora
## 10 Genet             Genetta      carni Carnivora
## # ... with 73 more rows

# 1.2.2 Descending
# Sort by vore ascending, then by order descending within each vore.
animal.name.sequence.desc <- msleep %>%
  arrange(vore, desc(order))
head(animal.name.sequence.desc[, 1:4])

## # A tibble: 6 x 4
##   name                       genus         vore  order          
##   <chr>                      <chr>         <chr> <chr>          
## 1 Northern grasshopper mouse Onychomys     carni Rodentia       
## 2 Slow loris                 Nyctibeus     carni Primates       
## 3 Thick-tailed opposum       Lutreolina    carni Didelphimorphia
## 4 Long-nosed armadillo       Dasypus       carni Cingulata      
## 5 Pilot whale                Globicephalus carni Cetacea        
## 6 Common porpoise            Phocoena      carni Cetacea

# 1.3 Rename
# Rename one or more columns in a dataset; first list the current names.
colnames(iris)

## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"

# rename() takes new_name = old_name pairs.
renamed.iris <- iris %>%
  rename(
    widh.of.petals = Petal.Width,
    various.plants.and.animals = Species
  )
names(renamed.iris)

## [1] "Sepal.Length"               "Sepal.Width"               
## [3] "Petal.Length"               "widh.of.petals"            
## [5] "various.plants.and.animals"

# 1.4 Mutate

data("ChickWeight")
ChickWeight[1:2, ] # first two rows

##   weight Time Chick Diet
## 1     42    0     1    1
## 2     51    2     1    1

# Add a column holding the base-10 logarithm of the weight.
ChickWeight.with.log <- ChickWeight %>%
  mutate(log.of.weight = log10(weight))
ChickWeight.with.log[1:2, ]

##   weight Time Chick Diet log.of.weight
## 1     42    0     1    1      1.623249
## 2     51    2     1    1      1.707570

# 1.4.1 mutate_all
msleep <- ggplot2::msleep
colnames(msleep)

##  [1] "name"         "genus"        "vore"         "order"        "conservation"
##  [6] "sleep_total"  "sleep_rem"    "sleep_cycle"  "awake"        "brainwt"     
## [11] "bodywt"

# Append the square root of each of columns 6 to 11 of msleep.
# A named list replaces the deprecated funs(); the list name ("square root")
# becomes the suffix of every new column, so the output names are unchanged.
msleep.with.square.roots <- mutate_all(
  msleep[, 6:11],
  list("square root" = sqrt)
)

names(msleep.with.square.roots)

##  [1] "sleep_total"             "sleep_rem"              
##  [3] "sleep_cycle"             "awake"                  
##  [5] "brainwt"                 "bodywt"                 
##  [7] "sleep_total_square root" "sleep_rem_square root"  
##  [9] "sleep_cycle_square root" "awake_square root"      
## [11] "brainwt_square root"     "bodywt_square root"

msleep.with.square.roots

## # A tibble: 83 x 12
##    sleep_total sleep_rem sleep_cycle awake  brainwt  bodywt `sleep_total_square~
##          <dbl>     <dbl>       <dbl> <dbl>    <dbl>   <dbl>                <dbl>
##  1        12.1      NA        NA      11.9 NA        50                     3.48
##  2        17         1.8      NA       7    0.0155    0.48                  4.12
##  3        14.4       2.4      NA       9.6 NA         1.35                  3.79
##  4        14.9       2.3       0.133   9.1  0.00029   0.019                 3.86
##  5         4         0.7       0.667  20    0.423   600                     2   
##  6        14.4       2.2       0.767   9.6 NA         3.85                  3.79
##  7         8.7       1.4       0.383  15.3 NA        20.5                   2.95
##  8         7        NA        NA      17   NA         0.045                 2.65
##  9        10.1       2.9       0.333  13.9  0.07     14                     3.18
## 10         3        NA        NA      21    0.0982   14.8                   1.73
## # ... with 73 more rows, and 5 more variables: sleep_rem_square root <dbl>,
## #   sleep_cycle_square root <dbl>, awake_square root <dbl>,
## #   brainwt_square root <dbl>, bodywt_square root <dbl>

# 1.4.2 mutate_at to add fields
# Convert the built-in Titanic table into a data frame (the new object is
# stored under the same name, Titanic).
data("Titanic")
Titanic <- as.data.frame(Titanic)
head(Titanic)

##   Class    Sex   Age Survived Freq
## 1   1st   Male Child       No    0
## 2   2nd   Male Child       No    0
## 3   3rd   Male Child       No   35
## 4  Crew   Male Child       No    0
## 5   1st Female Child       No    0
## 6   2nd Female Child       No    0

# Add a descending min_rank for Class, Age and Survived.
# The named list replaces the deprecated funs(); the list name "Rank" is
# appended to each source column name (Class_Rank, Age_Rank, Survived_Rank).
titanic.with.ranks <- mutate_at(
  Titanic,
  vars(Class, Age, Survived),
  list(Rank = ~ min_rank(desc(.)))
)
head(titanic.with.ranks)

##   Class    Sex   Age Survived Freq Class_Rank Age_Rank Survived_Rank
## 1   1st   Male Child       No    0         25       17            17
## 2   2nd   Male Child       No    0         17       17            17
## 3   3rd   Male Child       No   35          9       17            17
## 4  Crew   Male Child       No    0          1       17            17
## 5   1st Female Child       No    0         25       17            17
## 6   2nd Female Child       No    0         17       17            17

# 1.4.3 mutate_if
# Helper that divides its argument by 10.
divide.by.10 <- function(a.number) {
  a.number / 10
}

head(CO2)

##   Plant   Type  Treatment conc uptake
## 1   Qn1 Quebec nonchilled   95   16.0
## 2   Qn1 Quebec nonchilled  175   30.4
## 3   Qn1 Quebec nonchilled  250   34.8
## 4   Qn1 Quebec nonchilled  350   37.2
## 5   Qn1 Quebec nonchilled  500   35.3
## 6   Qn1 Quebec nonchilled  675   39.2

# Apply divide.by.10 to every numeric column of CO2.
new.df <- mutate_if(CO2, is.numeric, divide.by.10)
head(new.df)

##   Plant   Type  Treatment conc uptake
## 1   Qn1 Quebec nonchilled  9.5   1.60
## 2   Qn1 Quebec nonchilled 17.5   3.04
## 3   Qn1 Quebec nonchilled 25.0   3.48
## 4   Qn1 Quebec nonchilled 35.0   3.72
## 5   Qn1 Quebec nonchilled 50.0   3.53
## 6   Qn1 Quebec nonchilled 67.5   3.92

# Small example data frame with missing values in alpha and apple.
df <- data.frame(
  alpha = c(22, 1, NA),
  almond = c(0, 5, 10),
  grape = c(0, 2, 2),
  apple = c(NA, 5, 10)
)
df

##   alpha almond grape apple
## 1    22      0     0    NA
## 2     1      5     2     5
## 3    NA     10     2    10

# Replace the NAs in every numeric column with 0.
# An explicit lambda passes the replacement value to coalesce() directly,
# instead of forwarding it through an argument literally named `...`.
df.fix.alpha <- df %>%
  mutate_if(is.numeric, ~ coalesce(.x, 0))
df.fix.alpha

##   alpha almond grape apple
## 1    22      0     0     0
## 2     1      5     2     5
## 3     0     10     2    10

# 1.4.4 String detect and TRUE/FALSE duplicate indicator
msleep <- ggplot2::msleep
table(msleep[["vore"]])

## 
##   carni   herbi insecti    omni 
##      19      32       5      20

# Drop the rows whose vore matches the pattern built from "c" and "a"
# joined with "|".
msleep.no.c.or.a <- msleep %>%
  filter(!str_detect(vore, paste(c("c", "a"), collapse = "|")))
table(msleep.no.c.or.a[["vore"]])

## 
## herbi  omni 
##    32    20

# Flag each row whose conservation value already appeared in an earlier row.
msleep.with.dup.indicador <- msleep %>%
  mutate(duplicate.indicator = duplicated(conservation))
msleep.with.dup.indicador[1:6, ]

## # A tibble: 6 x 12
##   name    genus vore  order conservation sleep_total sleep_rem sleep_cycle awake
##   <chr>   <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl> <dbl>
## 1 Cheetah Acin~ carni Carn~ lc                  12.1      NA        NA      11.9
## 2 Owl mo~ Aotus omni  Prim~ <NA>                17         1.8      NA       7  
## 3 Mounta~ Aplo~ herbi Rode~ nt                  14.4       2.4      NA       9.6
## 4 Greate~ Blar~ omni  Sori~ lc                  14.9       2.3       0.133   9.1
## 5 Cow     Bos   herbi Arti~ domesticated         4         0.7       0.667  20  
## 6 Three-~ Brad~ herbi Pilo~ <NA>                14.4       2.2       0.767   9.6
## # ... with 3 more variables: brainwt <dbl>, bodywt <dbl>,
## #   duplicate.indicator <lgl>

# Same indicator as before, printing only columns 1 to 3 and 12.
msleep.with.dup.indicador <- msleep %>%
  mutate(duplicate.indicator = duplicated(conservation))
msleep.with.dup.indicador[1:6, c(1:3, 12)]

## # A tibble: 6 x 4
##   name                       genus      vore  duplicate.indicator
##   <chr>                      <chr>      <chr> <lgl>              
## 1 Cheetah                    Acinonyx   carni FALSE              
## 2 Owl monkey                 Aotus      omni  FALSE              
## 3 Mountain beaver            Aplodontia herbi FALSE              
## 4 Greater short-tailed shrew Blarina    omni  TRUE               
## 5 Cow                        Bos        herbi FALSE              
## 6 Three-toed sloth           Bradypus   herbi TRUE

# Flag rows whose (conservation, genus) pair already appeared in an earlier
# row, then sort by those two columns.
# duplicated(conservation, genus) would pass genus as the `incomparables`
# argument rather than as a second key, so the two columns are combined into
# a data frame and duplicated() compares whole rows.
msleep.with.dup.indicador2 <- msleep %>%
  mutate(
    duplicate.indicator = duplicated(data.frame(conservation, genus))
  ) %>%
  arrange(conservation, genus)
msleep.with.dup.indicador2

## # A tibble: 83 x 12
##    name   genus vore  order conservation sleep_total sleep_rem sleep_cycle awake
##    <chr>  <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl> <dbl>
##  1 Giraf~ Gira~ herbi Arti~ cd                   1.9       0.4      NA      22.1
##  2 Pilot~ Glob~ carni Ceta~ cd                   2.7       0.1      NA      21.4
##  3 Cow    Bos   herbi Arti~ domesticated         4         0.7       0.667  20  
##  4 Dog    Canis carni Carn~ domesticated        10.1       2.9       0.333  13.9
##  5 Guine~ Cavis herbi Rode~ domesticated         9.4       0.8       0.217  14.6
##  6 Chinc~ Chin~ herbi Rode~ domesticated        12.5       1.5       0.117  11.5
##  7 Horse  Equus herbi Peri~ domesticated         2.9       0.6       1      21.1
##  8 Donkey Equus herbi Peri~ domesticated         3.1       0.4      NA      20.9
##  9 Domes~ Felis carni Carn~ domesticated        12.5       3.2       0.417  11.5
## 10 Rabbit Oryc~ herbi Lago~ domesticated         8.4       0.9       0.417  15.6
## # ... with 73 more rows, and 3 more variables: brainwt <dbl>, bodywt <dbl>,
## #   duplicate.indicator <lgl>

# Example data: one fruit label per row plus four numeric columns.
fruit <- c("apple", "pear", "orange", "grape", "orange", "orange")
x <- c(1, 2, 4, 9, 4, 6)
y <- c(22, 3, 4, 55, 15, 9)
z <- c(3, 1, 4, 10, 12, 8)
w <- c(2, 2, 2, 4, 5, 6)
df <- data.frame(fruit, x, y, z, w)
df

##    fruit x  y  z w
## 1  apple 1 22  3 2
## 2   pear 2  3  1 2
## 3 orange 4  4  4 2
## 4  grape 9 55 10 4
## 5 orange 4 15 12 5
## 6 orange 6  9  8 6

# Mark every repeated occurrence of a fruit after its first appearance.
df.show.single.dup <- df %>%
  mutate(duplicate.indicator = duplicated(fruit))
df.show.single.dup

##    fruit x  y  z w duplicate.indicator
## 1  apple 1 22  3 2               FALSE
## 2   pear 2  3  1 2               FALSE
## 3 orange 4  4  4 2               FALSE
## 4  grape 9 55 10 4               FALSE
## 5 orange 4 15 12 5                TRUE
## 6 orange 6  9  8 6                TRUE

# 1.4.5 Drop variables using NULL
fruit <- c("apple", "pear", "orange", "grape", "orange", "orange")
x <- c(1, 2, 4, 9, 4, 6)
y <- c(22, 3, 4, 55, 15, 9)
z <- c(3, 1, 4, 10, 12, 8)
df <- data.frame(fruit, x, y, z)
# Assigning NULL to a column inside mutate() removes that column.
df <- df %>%
  mutate(z = NULL)
df

##    fruit x  y
## 1  apple 1 22
## 2   pear 2  3
## 3 orange 4  4
## 4  grape 9 55
## 5 orange 4 15
## 6 orange 6  9

# 1.4.6 Preferred coding sequence
# Install nycflights13 only when it is missing, then attach it explicitly.
# require() returns FALSE instead of failing, and after a fresh install the
# package would not be attached, leaving `flights` undefined.
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  install.packages("nycflights13")
}
library(nycflights13)

# All new columns in one mutate(); later expressions can use earlier ones.
mutate(flights,
    gain= arr_delay-dep_delay,
    hours=air_time/60,
    gain_per_hour=gain/hours,
    gain_per_minute=60*gain_per_hour)

## # A tibble: 336,776 x 23
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 336,766 more rows, and 15 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## #   gain <dbl>, hours <dbl>, gain_per_hour <dbl>, gain_per_minute <dbl>

# Install nycflights13 only when it is missing, then attach it explicitly
# (require() alone would leave it unattached after a fresh install).
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  install.packages("nycflights13")
}
library(nycflights13)

# Same derived columns, built step by step in a pipeline.
newfield.flights <- flights %>%
  mutate(
    gain = arr_delay - dep_delay,
    hours = air_time / 60
  ) %>%
  mutate(gain_per_hour = gain / hours) %>%
  mutate(gain_per_minute = 60 * gain_per_hour)
newfield.flights[1:6, c(1:2, 20:23)]

## # A tibble: 6 x 6
##    year month  gain hours gain_per_hour gain_per_minute
##   <int> <int> <dbl> <dbl>         <dbl>           <dbl>
## 1  2013     1     9  3.78          2.38            143.
## 2  2013     1    16  3.78          4.23            254.
## 3  2013     1    31  2.67         11.6             698.
## 4  2013     1   -17  3.05         -5.57           -334.
## 5  2013     1   -19  1.93         -9.83           -590.
## 6  2013     1    16  2.5           6.4             384

# 1.4.7 Transmute: keep only the variables created
fruit <- c("apple", "pear", "orange", "grape", "orange", "orange")
x <- c(1, 2, 4, 9, 4, 6)
y <- c(22, 3, 4, 55, 15, 9)
z <- c(3, 1, 4, 10, 12, 8)
df <- data.frame(fruit, x, y, z)
df

##    fruit x  y  z
## 1  apple 1 22  3
## 2   pear 2  3  1
## 3 orange 4  4  4
## 4  grape 9 55 10
## 5 orange 4 15 12
## 6 orange 6  9  8

# transmute() returns only the newly computed column.
dl <- df %>%
  transmute(new.variable = x + y + z)
dl

##   new.variable
## 1           26
## 2            6
## 3           12
## 4           74
## 5           31
## 6           23

# Put the fruit labels in their own data frame and attach the columns of dl.
mh <- data.frame(fruit)
fruit.mut <- mh %>%
  mutate(dl)
fruit.mut

##    fruit new.variable
## 1  apple           26
## 2   pear            6
## 3 orange           12
## 4  grape           74
## 5 orange           31
## 6 orange           23

# Column names of the combined data frame.
colnames(fruit.mut)

## [1] "fruit"        "new.variable"

# Print a copy with new.variable renamed to totfruit (result is not stored).
fruit.mut %>%
  rename(totfruit = new.variable)

##    fruit totfruit
## 1  apple       26
## 2   pear        6
## 3 orange       12
## 4  grape       74
## 5 orange       31
## 6 orange       23

# Use across() to apply a function over multiple columns.
# Helper that doubles its argument.
double.it <- function(x) {
  x * 2
}

head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

# Double every numeric column of iris and show the first rows.
head(
  mutate(iris, across(where(is.numeric), double.it))
)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1         10.2         7.0          2.8         0.4  setosa
## 2          9.8         6.0          2.8         0.4  setosa
## 3          9.4         6.4          2.6         0.4  setosa
## 4          9.2         6.2          3.0         0.4  setosa
## 5         10.0         7.2          2.8         0.4  setosa
## 6         10.8         7.8          3.4         0.8  setosa

# 1.4.9 Conditional mutating using case_when
# row1 holds the column names; row2 to row5 hold the data rows.
row1 <- c("a", "b", "c", "d", "e", "f", "column.to.be.changed")
row2 <- c(1, 1, 1, 6, 6, 1, 2)
row3 <- c(3, 4, 4, 6, 4, 4, 4)
row4 <- c(4, 6, 25, 5, 5, 2, 9)
row5 <- c(5, 3, 6, 3, 3, 6, 2)
df <- as.data.frame(rbind(row2, row3, row4, row5))
names(df) <- row1

df

##      a b  c d e f column.to.be.changed
## row2 1 1  1 6 6 1                    2
## row3 3 4  4 6 4 4                    4
## row4 4 6 25 5 5 2                    9
## row5 5 3  6 3 3 6                    2

# Recode column.to.be.changed: the first matching condition wins, and rows
# matching neither condition become NA.
new.df <- df %>%
  mutate(
    column.to.be.changed = case_when(
      a %in% c(2, 5, 7) | (a == 1 & b == 4) ~ 2,
      a %in% c(0, 1, 4, 3) | c == 4 ~ 3,
      TRUE ~ NA_real_
    )
  )
new.df

##      a b  c d e f column.to.be.changed
## row2 1 1  1 6 6 1                    3
## row3 3 4  4 6 4 4                    3
## row4 4 6 25 5 5 2                    3
## row5 5 3  6 3 3 6                    2

# 1.5 Select to choose variables/columns

# 1.5.1 Delete a column
fruit <- c("apple", "pear", "orange", "grape", "orange", "orange")
x <- c(1, 2, 4, 9, 4, 6)
y <- c(22, 3, 4, 55, 15, 9)
z <- c(3, 1, 4, 10, 12, 8)
df <- data.frame(fruit, x, y, z)
df

##    fruit x  y  z
## 1  apple 1 22  3
## 2   pear 2  3  1
## 3 orange 4  4  4
## 4  grape 9 55 10
## 5 orange 4 15 12
## 6 orange 6  9  8

# A minus sign in select() drops the named column.
new.df.no.fruit <- df %>%
  dplyr::select(-fruit)
new.df.no.fruit

##   x  y  z
## 1 1 22  3
## 2 2  3  1
## 3 4  4  4
## 4 9 55 10
## 5 4 15 12
## 6 6  9  8

# Delete columns by name using starts_with or ends_with
data("mtcars")
colnames(mtcars)

##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"

# Drop every column whose name starts with "d".
mtcars.no.col.names.start.with.d <- mtcars %>%
  select(-starts_with("d"))
names(mtcars.no.col.names.start.with.d)

## [1] "mpg"  "cyl"  "hp"   "wt"   "qsec" "vs"   "am"   "gear" "carb"

# Drop every column whose name ends with "t".
mtcars.no.col.names.ends.with.t <- mtcars %>%
  select(-ends_with("t"))
names(mtcars.no.col.names.ends.with.t)

## [1] "mpg"  "cyl"  "disp" "hp"   "qsec" "vs"   "am"   "gear" "carb"

# 1.5.3 Delete a column
# Rebuild the example data frame.
fruit <- c("apple", "pear", "orange", "grape", "orange", "orange")
x <- c(1, 2, 4, 9, 4, 6)
y <- c(22, 3, 4, 55, 15, 9)
z <- c(3, 1, 4, 10, 12, 8)
df <- data.frame(fruit, x, y, z)
df

##    fruit x  y  z
## 1  apple 1 22  3
## 2   pear 2  3  1
## 3 orange 4  4  4
## 4  grape 9 55 10
## 5 orange 4 15 12
## 6 orange 6  9  8

# 1.5.4 select_all to apply a function to all columns
# Create a new data frame.
state <- c("Maryland", "Alaska", "New Jersey")
income <- c(76067, 74444, 73702)
median.us <- c(61372, 61372, 61372)
life.expectancy <- c(78.8, 78.3, 80.3)
top.3.states <- data.frame(state, income, median.us, life.expectancy)
top.3.states

##        state income median.us life.expectancy
## 1   Maryland  76067     61372            78.8
## 2     Alaska  74444     61372            78.3
## 3 New Jersey  73702     61372            80.3

# Apply toupper to every column name.
new.top.3.states <- top.3.states %>%
  select_all(toupper)
new.top.3.states

##        STATE INCOME MEDIAN.US LIFE.EXPECTANCY
## 1   Maryland  76067     61372            78.8
## 2     Alaska  74444     61372            78.3
## 3 New Jersey  73702     61372            80.3

# 1.5.5 Select columns using the pull function
new.top.3.states <- top.3.states %>%
  select_all(toupper)
new.top.3.states

##        STATE INCOME MEDIAN.US LIFE.EXPECTANCY
## 1   Maryland  76067     61372            78.8
## 2     Alaska  74444     61372            78.3
## 3 New Jersey  73702     61372            80.3

# pull() with a positive index extracts that column, counted from the left.
pull.first.column <- top.3.states %>%
  pull(1)
pull.first.column

## [1] "Maryland"   "Alaska"     "New Jersey"

# A negative index counts from the right, so -1 is the last column.
pull.last.column <- top.3.states %>%
  pull(-1)
pull.last.column

## [1] 78.8 78.3 80.3

# 1.5.6 Select rows: any variable meets some condition
# Row count before filtering.
nrow(mtcars)

## [1] 32

# Keep the rows in which at least one column holds a value above 200.
mtcars.more.than.200 <- mtcars %>%
  filter_all(any_vars(. > 200))
nrow(mtcars.more.than.200)

## [1] 16

# 1.5.7 Select columns: omit if column name contains specific characters
colnames(mtcars)

##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"

# Drop every column whose name contains "p".
cars.with.no.p <- dplyr::select(mtcars, -contains("p"))
names(cars.with.no.p)

## [1] "cyl"  "drat" "wt"   "qsec" "vs"   "am"   "gear" "carb"

# 1.5.8 Select using wildcard matching
colnames(mtcars)

##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"

# Keep the columns whose name matches the regular expression "pg|gea".
subset.mtcars <- mtcars %>%
  select(matches("pg|gea"))
names(subset.mtcars)

## [1] "mpg"  "gear"

# 1.6 Joins: manipulation of data from two sources

# 1.6.1 Left join (most common)
# Build the table with data.frame() directly. Going through cbind() first
# creates a character matrix, which turns the numeric state.area column
# into text.
us.state.areas <- data.frame(state.abb, state.area)
us.state.areas[1:3, ]

##   state.abb state.area
## 1        AL      51609
## 2        AK     589757
## 3        AZ     113909

us.state.abbreviation.and.name<-as.data.frame(cbind(state.abb,state.name))
us.state.abbreviation.and.name[1:3,]

##   state.abb state.name
## 1        AL    Alabama
## 2        AK     Alaska
## 3        AZ    Arizona

state.info.abb.area.name<-us.state.areas %>%
  left_join(us.state.abbreviation.and.name,by="state.abb")
head(state.info.abb.area.name)

##   state.abb state.area state.name
## 1        AL      51609    Alabama
## 2        AK     589757     Alaska
## 3        AZ     113909    Arizona
## 4        AR      53104   Arkansas
## 5        CA     158693 California
## 6        CO     104247   Colorado

#1.6.2 Inner Join
#DATAFRAME 1
names <- c("Sally", "Tom", "Frieda", "Alfonzo")
team.scores <- c(3, 5, 2, 7)
team.league <- c("alpha", "beta", "gamma", "omicron")
team.info <- data.frame(names, team.scores, team.league)
#DATAFRAME 2
names <- c("Sally", "Tom", "Bill", "Alfonzo")
school.grades <- c("A", "B", "C", "B")
# tibble() is the supported replacement for the deprecated data_frame(),
# so no deprecation warning is emitted here.
school.info <- tibble(names, school.grades)

# inner_join() keeps only the names that appear in both tables.
school.and.team <- inner_join(team.info, school.info, by = "names")
school.and.team

##     names team.scores team.league school.grades
## 1   Sally           3       alpha             A
## 2     Tom           5        beta             B
## 3 Alfonzo           7     omicron             B

#1,6,3 Anti-join
#DATAFRAME 1
names <- c("Sally", "Tom", "Frieda", "Alfonzo")
team.scores <- c(3, 5, 2, 7)
team.league <- c("alpha", "beta", "gamma", "omicron")
team.info <- data.frame(names, team.scores, team.league)
team.info

##     names team.scores team.league
## 1   Sally           3       alpha
## 2     Tom           5        beta
## 3  Frieda           2       gamma
## 4 Alfonzo           7     omicron

#DATAFRAME 2
names <- c("Sally", "Tom", "Bill", "Alfonzo")
school.grades <- c("A", "B", "C", "B")
# tibble() is the supported replacement for the deprecated data_frame().
school.info <- tibble(names, school.grades)
school.info

## # A tibble: 4 x 2
##   names   school.grades
##   <chr>   <chr>        
## 1 Sally   A            
## 2 Tom     B            
## 3 Bill    C            
## 4 Alfonzo B

# anti_join() keeps the rows of the first table that have no match in the
# second: team members without a grade record.
team.info.but.no.grades <- anti_join(team.info, school.info, by = "names")
team.info.but.no.grades

##    names team.scores team.league
## 1 Frieda           2       gamma

# Swapping the arguments gives the grade records without a team entry.
x <- anti_join(school.info, team.info, by = "names")
x

## # A tibble: 1 x 2
##   names school.grades
##   <chr> <chr>        
## 1 Bill  C

#1,6,4 Full join
#DATAFRAME 1
names <- c("Sally", "Tom", "Frieda", "Alfonzo")
team.scores <- c(3, 5, 2, 7)
team.league <- c("alpha", "beta", "gamma", "omicron")
team.info <- data.frame(names, team.scores, team.league)
#DATAFRAME 2
names <- c("Sally", "Tom", "Bill", "Alfonzo")
school.grades <- c("A", "B", "C", "B")
# tibble() is the supported replacement for the deprecated data_frame().
school.info <- tibble(names, school.grades)
# full_join() keeps the rows of both tables and fills the columns that have
# no counterpart with NA.
team.info.and.or.grades <- full_join(team.info, school.info, by = "names")
team.info.and.or.grades

##     names team.scores team.league school.grades
## 1   Sally           3       alpha             A
## 2     Tom           5        beta             B
## 3  Frieda           2       gamma          <NA>
## 4 Alfonzo           7     omicron             B
## 5    Bill          NA        <NA>             C

#1,6,5 Semi-join
# semi_join() keeps the rows of team.info that have a match in school.info
# without adding any column from school.info. The key is named explicitly,
# like in the other joins, so no "Joining, by" message is printed.
team.info.with.grades <- semi_join(team.info, school.info, by = "names")

team.info.with.grades

##     names team.scores team.league
## 1   Sally           3       alpha
## 2     Tom           5        beta
## 3 Alfonzo           7     omicron

#1,6,6 Right Join
# data.frame() keeps state.area numeric; as.data.frame(cbind(...)) would
# first build a character matrix and so store the areas as text.
us.state.areas <- data.frame(state.abb, state.area)
us.state.areas[1:3, ]

##   state.abb state.area
## 1        AL      51609
## 2        AK     589757
## 3        AZ     113909

us.state.abbreviation.and.name <- data.frame(state.abb, state.name)
us.state.abbreviation.and.name[1:3, ]

##   state.abb state.name
## 1        AL    Alabama
## 2        AK     Alaska
## 3        AZ    Arizona

# Break the key of the first row on purpose so that it has no partner in
# us.state.areas.
us.state.abbreviation.and.name[1, 1] <- "Intentional Mismatch"
# right_join() keeps every row of the second table.
us.state.with.abbreviation.and.name.and.area <- right_join(
  us.state.areas,
  us.state.abbreviation.and.name,
  by = "state.abb"
)
us.state.with.abbreviation.and.name.and.area[1:3, ]

##   state.abb state.area state.name
## 1        AK     589757     Alaska
## 2        AZ     113909    Arizona
## 3        AR      53104   Arkansas

#1,7 Slice

msleep <- ggplot2::msleep
nrow(msleep)

## [1] 83

# Negative positions are removed: dropping rows 6 through n() leaves the
# first five rows.
msleep.only.first.5 <- msleep %>%
  slice(-(6:n()))
nrow(msleep.only.first.5)

## [1] 5

# Positive positions are kept: rows 20 to 39 inclusive.
msleep.20.rows <- slice(msleep, 20:39)
nrow(msleep.20.rows)

## [1] 20

nrow(msleep) - nrow(msleep.20.rows)

## [1] 63

#1,8 Summarise

# Attaching MASS masks dplyr::select (see the message below).
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

data(gehan)
gehan2 <- gehan
library(tidyverse)

# Without grouping, n() counts every row.
summarise(gehan2, kount = n())

##   kount
## 1    42

# With grouping, n() is evaluated once per treat level.
summarise(group_by(gehan2, treat), kount = n())

## # A tibble: 2 x 2
##   treat   kount
##   <fct>   <int>
## 1 6-MP       21
## 2 control    21

# Several location and spread statistics of `time` for each treat level.
summarise(
  group_by(gehan2, treat),
  avarage.remiss.time = mean(time),
  median.remiss.time = median(time),
  std.dev.remiss.time = sd(time),
  median.abs.deviation = mad(time),
  IQR.remiss.time = IQR(time)
)

## # A tibble: 2 x 6
##   treat   avarage.remiss.time median.remiss.t~ std.dev.remiss.~ median.abs.devi~
##   <fct>                 <dbl>            <int>            <dbl>            <dbl>
## 1 6-MP                  17.1                16            10.0             10.4 
## 2 control                8.67                8             6.47             5.93
## # ... with 1 more variable: IQR.remiss.time <dbl>

# Small hand-made data set used to repeat the summarise() examples.
motorcycles <- c("HondaCb 190 R", "Suzuki Gcxs 150", "yamahaa R15")
Cyl <- c("188", "150", "150")
freno <- c("ABS", "DISCO", "DISCO")
duracion.gas.gal <- c(155, 140, 140)
dat <- data.frame(motorcycles, Cyl, freno, duracion.gas.gal)
# Inspect the data frame that was just built (the previous code printed
# `x`, a leftover from the anti-join example).
head(dat)

##       motorcycles Cyl freno duracion.gas.gal
## 1   HondaCb 190 R 188   ABS              155
## 2 Suzuki Gcxs 150 150 DISCO              140
## 3     yamahaa R15 150 DISCO              140

# Number of rows for each Cyl value.
dat %>%
  group_by(Cyl) %>%
  summarise(kount = n())

## # A tibble: 2 x 2
##   Cyl   kount
##   <chr> <int>
## 1 150       2
## 2 188       1

# Mean, median, standard deviation and median absolute deviation of
# duracion.gas.gal over all rows.
dat %>%
  summarise(
    prome = mean(duracion.gas.gal),
    medi = median(duracion.gas.gal),
    desv = sd(duracion.gas.gal),
    mediana = mad(duracion.gas.gal)
  )

##   prome medi     desv mediana
## 1   145  140 8.660254       0

# Smallest and largest `time` within each treat level.
summarise(
  group_by(gehan2, treat),
  minimum.remission = min(time),
  max.remission = max(time)
)

## # A tibble: 2 x 3
##   treat   minimum.remission max.remission
##   <fct>               <int>         <int>
## 1 6-MP                    6            35
## 2 control                 1            23

#1,8,1 Summarise Across
library(MASS)
# Work on the first ten rows of the survey data set only.
subset.survey <- head(survey, 10)
library(dplyr)
head(subset.survey)

##      Sex Wr.Hnd NW.Hnd W.Hnd    Fold Pulse    Clap Exer Smoke Height      M.I
## 1 Female   18.5   18.0 Right  R on L    92    Left Some Never 173.00   Metric
## 2   Male   19.5   20.5  Left  R on L   104    Left None Regul 177.80 Imperial
## 3   Male   18.0   13.3 Right  L on R    87 Neither None Occas     NA     <NA>
## 4   Male   18.8   18.9 Right  R on L    NA Neither None Never 160.00   Metric
## 5   Male   20.0   20.0 Right Neither    35   Right Some Never 165.00   Metric
## 6 Female   18.0   17.7 Right  L on R    64   Right Some Never 172.72 Imperial
##      Age
## 1 18.250
## 2 17.583
## 3 16.917
## 4 20.333
## 5 23.667
## 6 21.000

# Drop the incomplete rows, then average every numeric column within each
# Sex; the result columns are named "men_<column>".
head(
  summarise(
    group_by(na.omit(subset.survey), Sex),
    across(where(is.numeric), mean, .names = "men_{col}")
  )
)

## # A tibble: 2 x 6
##   Sex    men_Wr.Hnd men_NW.Hnd men_Pulse men_Height men_Age
##   <fct>       <dbl>      <dbl>     <dbl>      <dbl>   <dbl>
## 1 Female       17.8       17.7      76.7       168.    25.0
## 2 Male         19.1       19.2      76.8       174.    20.3

# Group msleep by vore and order; new.sleep stays grouped, so later calls
# on it are evaluated per group.
new.sleep <- group_by(msleep, vore, order)
s <- new.sleep %>% summarise(n())

## `summarise()` has grouped output by 'vore'. You can override using the `.groups` argument.

## # A tibble: 32 x 3
## # Groups:   vore [5]
##    vore  order           `n()`
##    <chr> <chr>           <int>
##  1 carni Carnivora          12
##  2 carni Cetacea             3
##  3 carni Cingulata           1
##  4 carni Didelphimorphia     1
##  5 carni Primates            1
##  6 carni Rodentia            1
##  7 herbi Artiodactyla        5
##  8 herbi Diprotodontia       1
##  9 herbi Hyracoidea          2
## 10 herbi Lagomorpha          1
## # ... with 22 more rows

# Same count as above, written as one expression starting from msleep.
new.sleep.totals <- summarise(group_by(msleep, vore, order), n())

## `summarise()` has grouped output by 'vore'. You can override using the `.groups` argument.

new.sleep.totals

## # A tibble: 32 x 3
## # Groups:   vore [5]
##    vore  order           `n()`
##    <chr> <chr>           <int>
##  1 carni Carnivora          12
##  2 carni Cetacea             3
##  3 carni Cingulata           1
##  4 carni Didelphimorphia     1
##  5 carni Primates            1
##  6 carni Rodentia            1
##  7 herbi Artiodactyla        5
##  8 herbi Diprotodontia       1
##  9 herbi Hyracoidea          2
## 10 herbi Lagomorpha          1
## # ... with 22 more rows

#1,9 Gathering: Convert Multiple Columns into One

state <- c("Maryland", "Alaska", "New Jersey")
income <- c(76067, 74444, 73702)
median.us <- rep(61372, times = 3)
life.expectancy <- c(78.8, 78.3, 80.3)
teen.birth.rate.2015 <- c(18, 29.3, 12.1)
teen.birth.rate.2007 <- c(34.3, 42.9, 24.9)
teen.birth.rate.1991 <- c(54.1, 66, 41.3)
top.3.states <- data.frame(
  state, income, median.us, life.expectancy,
  teen.birth.rate.2015, teen.birth.rate.2007, teen.birth.rate.1991
)
# Relabel the columns; the three rate columns are named after their year.
colnames(top.3.states) <- c(
  "state", "income", "median.us", "life.spectancy",
  "2015", "2007", "1991"
)
top.3.states

##        state income median.us life.spectancy 2015 2007 1991
## 1   Maryland  76067     61372           78.8 18.0 34.3 54.1
## 2     Alaska  74444     61372           78.3 29.3 42.9 66.0
## 3 New Jersey  73702     61372           80.3 12.1 24.9 41.3

# Stack the three year columns: their names go to `year`, their values to
# `cases`, and the remaining columns are repeated for each year.
new.top.3.states <- gather(
  top.3.states,
  "2015", "2007", "1991",
  key = "year", value = "cases"
)
new.top.3.states

##        state income median.us life.spectancy year cases
## 1   Maryland  76067     61372           78.8 2015  18.0
## 2     Alaska  74444     61372           78.3 2015  29.3
## 3 New Jersey  73702     61372           80.3 2015  12.1
## 4   Maryland  76067     61372           78.8 2007  34.3
## 5     Alaska  74444     61372           78.3 2007  42.9
## 6 New Jersey  73702     61372           80.3 2007  24.9
## 7   Maryland  76067     61372           78.8 1991  54.1
## 8     Alaska  74444     61372           78.3 1991  66.0
## 9 New Jersey  73702     61372           80.3 1991  41.3

#1,10 Spreading:Consolidation of Multiple Rows into One

# tibble() is the supported replacement for the deprecated data_frame().
df_1 <- tibble(
  Type = c("TypeA", "TypeA", "TypeB", "TypeB"),
  Answer = c("Yes", "No", NA, "No"),
  n = 1:4
)
df_1

## # A tibble: 4 x 3
##   Type  Answer     n
##   <chr> <chr>  <int>
## 1 TypeA Yes        1
## 2 TypeA No         2
## 3 TypeB <NA>       3
## 4 TypeB No         4

# Drop the row with a missing Answer, then turn each distinct Answer into
# its own column filled with the matching n.
df_2 <- df_1 %>%
  filter(!is.na(Answer)) %>%
  spread(key = Answer, value = n)
df_2

## # A tibble: 2 x 3
##   Type     No   Yes
##   <chr> <int> <int>
## 1 TypeA     2     1
## 2 TypeB     4    NA

#1,11 Separate:Divide a Single Column into Multiple Columns

state <- c("Maryland", "Alaska", "New Jersey")
income <- c(76067, 74444, 73702)
median.us <- rep(61372, times = 3)
life.expectancy <- c(78.8, 78.3, 80.3)
# Three values packed into one string per state, delimited by "//".
teen.birth <- c("17//34.3//54.1", "29.0//42.9//66.0", "12.1//24.9//41.3")

top.3.states <- data.frame(
  state, income, median.us, life.expectancy, teen.birth
)
top.3.states

##        state income median.us life.expectancy       teen.birth
## 1   Maryland  76067     61372            78.8   17//34.3//54.1
## 2     Alaska  74444     61372            78.3 29.0//42.9//66.0
## 3 New Jersey  73702     61372            80.3 12.1//24.9//41.3

# Split teen.birth at every "//" into three new columns.
top.3.states.separated.years <- separate(
  top.3.states,
  teen.birth,
  into = c("2015", "2007", "1991"),
  sep = "//"
)
top.3.states.separated.years

##        state income median.us life.expectancy 2015 2007 1991
## 1   Maryland  76067     61372            78.8   17 34.3 54.1
## 2     Alaska  74444     61372            78.3 29.0 42.9 66.0
## 3 New Jersey  73702     61372            80.3 12.1 24.9 41.3

#1,12 Recap of Handy DPLYR Functions

#1,12,1 Number of Observations (n) Used Across Multiple DPLYR Functions
#1,12,2 Basic Counts
# new.sleep is grouped by vore and order, so n() gives each row the size of
# its own group.
m <- new.sleep %>% mutate(koun = n())
m[1:5, c(1:4, 10:12)]

## # A tibble: 5 x 7
## # Groups:   vore, order [5]
##   name                       genus      vore  order        brainwt  bodywt  koun
##   <chr>                      <chr>      <chr> <chr>          <dbl>   <dbl> <int>
## 1 Cheetah                    Acinonyx   carni Carnivora   NA        50        12
## 2 Owl monkey                 Aotus      omni  Primates     0.0155    0.48     10
## 3 Mountain beaver            Aplodontia herbi Rodentia    NA         1.35     16
## 4 Greater short-tailed shrew Blarina    omni  Soricomorp~  0.00029   0.019     3
## 5 Cow                        Bos        herbi Artiodacty~  0.423   600         5

# Keep only the rows that belong to a vore/order group of more than 14 rows.
f <- new.sleep %>% filter(n() > 14)
f[1:5, c(1:4, 10:11)]

## # A tibble: 5 x 6
## # Groups:   vore, order [1]
##   name                      genus      vore  order    brainwt bodywt
##   <chr>                     <chr>      <chr> <chr>      <dbl>  <dbl>
## 1 Mountain beaver           Aplodontia herbi Rodentia NA       1.35 
## 2 Guinea pig                Cavis      herbi Rodentia  0.0055  0.728
## 3 Chinchilla                Chinchilla herbi Rodentia  0.0064  0.42 
## 4 Western american chipmunk Eutamias   herbi Rodentia NA       0.071
## 5 Mongolian gerbil          Meriones   herbi Rodentia NA       0.053

#1,12,3 Nth functions
salary.description <- c(
  "Golden parachute",
  "Well to do",
  "Average",
  "Below average",
  "bring date seeds instead of flowers"
)
# first(), last() and nth() pick one element by position.
salary.description %>% first()

## [1] "Golden parachute"

salary.description %>% last()

## [1] "bring date seeds instead of flowers"

salary.description %>% nth(3)

## [1] "Average"

salary.description %>% nth(2)

## [1] "Well to do"

#1,12,4 Count Distinct Values
a.vector <- c(22, 33, 44, 1, 2, 3, 3, 3, 4)
original.length <- length(a.vector)
original.length

## [1] 9

# Number of distinct elements only (7):

distinct.a.vector <- a.vector %>% n_distinct()
distinct.a.vector

## [1] 7

# Equal length and distinct count means no value is repeated.
test1 <- if_else(
  original.length == distinct.a.vector,
  "all values unique",
  "some duplicate values in vector"
)
test1

## [1] "some duplicate values in vector"

b.vector <- c(1, 2, 3, 4, 5, 6)
length(b.vector)

## [1] 6

distinct.b.vector <- b.vector %>% n_distinct()
distinct.b.vector

## [1] 6

test2 <- if_else(
  length(b.vector) == distinct.b.vector,
  "all values unique",
  "duplicates"
)
test2

## [1] "all values unique"

#1,12,5 na_if
# Dividing by the zero in `test` produces Inf in the result.
test <- c(100, 0, 999)
x <- 5000 / test
x

## [1] 50.000000       Inf  5.005005

class(x)

## [1] "numeric"

# na_if() replaces every element equal to its second argument with NA, so
# the infinite quotient is turned into a missing value.
na_if(x, Inf)

## [1] 50.000000        NA  5.005005

#1,12,6 Coalesce to Replace Missing Values
x <- c(33, 4, 11, NA, 9)
x

## [1] 33  4 11 NA  9

# coalesce() fills each NA with the replacement value, here 0.
x <- x %>% coalesce(0)
x

## [1] 33  4 11  0  9

#1,13 Ranking Functions

#1,13,1
y <- c(100, 4, 12, 6, 8, 3)
rank1 <- y %>% row_number()
rank1

## [1] 6 2 5 3 4 1

# rank1[1] is 6, so this reads y[6].
y[rank1[1]]

## [1] 3

# rank1[6] is 1, so this reads y[1].
y[rank1[6]]

## [1] 100

#1,13,2 Minimum Rank
rank2 <- y %>% min_rank()
rank2

## [1] 6 2 5 3 4 1

#1,13,3 Dense Rank
rank3 <- y %>% dense_rank()
rank3

## [1] 6 2 5 3 4 1

#1,13,4 Percent Rank
rank4 <- y %>% percent_rank()
rank4

## [1] 1.0 0.2 0.8 0.4 0.6 0.0

#1,13,5 Cumulative Distribution Function
y <- c(100, 4, 12, 6, 8, 3)
rank5 <- y %>% cume_dist()
rank5

## [1] 1.0000000 0.3333333 0.8333333 0.5000000 0.6666667 0.1666667

# ntile() assigns each value to one of 3 rank-based buckets.
rank6 <- y %>% ntile(3)
rank6

## [1] 3 1 3 2 2 1

# Deciles of test.vector using quantile algorithm type 5. The argument
# names are spelled out in full (probs, length.out) instead of relying on
# partial argument matching.
test.vector <- c(2, 22, 33, 44, 77, 89, 99)
quantile(test.vector, probs = seq(0, 1, length.out = 11), type = 5)

##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##  2.0  6.0 20.0 28.6 36.3 44.0 67.1 81.8 90.0 97.0 99.0

#1,14

# Fix the random seed so the samples drawn below are reproducible.
set.seed(833)
data("ChickWeight")
# Five rows drawn at random, without replacement.
my.sample <- ChickWeight %>% sample_n(5)
my.sample

##   weight Time Chick Diet
## 1     98    8    45    4
## 2     42    0    17    1
## 3     98    8    36    3
## 4     51    2    11    1
## 5    198   20     3    1

# Ten rows drawn with replacement, so the same row may be picked twice.
my.sample <- ChickWeight %>% sample_n(10, replace = TRUE)
my.sample

##    weight Time Chick Diet
## 1     237   21    49    4
## 2     205   16    50    4
## 3     170   16    39    3
## 4     332   18    35    3
## 5     144   14    33    3
## 6     231   18    25    2
## 7      41    0    40    3
## 8      51   10    16    1
## 9      42    0    41    4
## 10     70   12    24    2

# Twelve rows drawn with sampling weights taken from the cyl column.
my.sample <- mtcars %>% sample_n(12, weight = cyl)
my.sample[, 1:5]

##                    mpg cyl  disp  hp drat
## Hornet Sportabout 18.7   8 360.0 175 3.15
## Chrysler Imperial 14.7   8 440.0 230 3.23
## Ford Pantera L    15.8   8 351.0 264 4.22
## Valiant           18.1   6 225.0 105 2.76
## Lotus Europa      30.4   4  95.1 113 3.77
## Fiat X1-9         27.3   4  79.0  66 4.08
## Porsche 914-2     26.0   4 120.3  91 4.43
## Toyota Corolla    33.9   4  71.1  65 4.22
## Merc 240D         24.4   4 146.7  62 3.69
## Maserati Bora     15.0   8 301.0 335 3.54
## Fiat 128          32.4   4  78.7  66 4.08
## Pontiac Firebird  19.2   8 400.0 175 3.08

# sample_frac() takes a fraction of the rows instead of a count: here 2%.
test1 <- ChickWeight %>% sample_frac(0.02)
test1

##    weight Time Chick Diet
## 1     163   16     3    1
## 2     103    8    41    4
## 3     103    8    42    4
## 4     120   18    19    1
## 5      48    2    36    3
## 6      80    6    48    4
## 7     137   12    33    3
## 8     154   12    40    3
## 9      40    0     2    1
## 10    138   14    44    4
## 11    240   14    21    2
## 12    130   12    39    3

# On grouped data the fraction is applied within each hair_color group.
by_hair_color <- group_by(starwars, hair_color)
my.sample <- by_hair_color %>% sample_frac(.07, replace = TRUE)
my.sample[, 1:5]

## # A tibble: 5 x 5
## # Groups:   hair_color [3]
##   name             height  mass hair_color skin_color
##   <chr>             <int> <dbl> <chr>      <chr>     
## 1 Lando Calrissian    177    79 black      dark      
## 2 Chewbacca           228   112 brown      unknown   
## 3 San Hill            191    NA none       grey      
## 4 Mace Windu          188    84 none       dark      
## 5 Dexter Jettster     198   102 none       brown

# tally() counts all rows of the data frame.
row.kount.only <- tally(ChickWeight)
row.kount.only

##     n
## 1 578

# count() counts the rows for each value of Diet.
diet.kount <- count(ChickWeight, Diet)
diet.kount

##   Diet   n
## 1    1 220
## 2    2 120
## 3    3 120
## 4    4 118

#1,15 Miscellaneous DPLYR Functions

#1,15,1 add_count for Groupwise Filtering
# add_count() appends a column n with the number of rows sharing the same
# species; keeping n == 1 leaves the rows whose species occurs only once.
single.species.kount <- filter(add_count(starwars, species), n == 1)
single.species.kount[, 1:6]

## # A tibble: 29 x 6
##    name                  height  mass hair_color skin_color       eye_color
##    <chr>                  <int> <dbl> <chr>      <chr>            <chr>    
##  1 Greedo                   173    74 <NA>       green            black    
##  2 Jabba Desilijic Tiure    175  1358 <NA>       green-tan, brown orange   
##  3 Yoda                      66    17 white      green            brown    
##  4 Bossk                    190   113 none       green            red      
##  5 Ackbar                   180    83 none       brown mottle     orange   
##  6 Wicket Systri Warrick     88    20 brown      brown            brown    
##  7 Nien Nunb                160    68 none       grey             black    
##  8 Nute Gunray              191    90 none       mottled green    red      
##  9 Watto                    137    NA black      blue, grey       yellow   
## 10 Sebulba                  112    40 none       grey, red        orange   
## # ... with 19 more rows

#1,15,2 Rename
# Reload the dataset first so the rename below always starts from the
# original column names. (A rename() placed before this reload would be
# discarded by it, and would fail on a re-run once mpg no longer exists.)
data(mtcars)
names(mtcars)

##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"

# rename() takes new_name = old_name.
mtcars <- rename(mtcars, spam_mpg = mpg)
names(mtcars)

##  [1] "spam_mpg" "cyl"      "disp"     "hp"       "drat"     "wt"      
##  [7] "qsec"     "vs"       "am"       "gear"     "carb"

#1,15,3 case_when
data(starwars)
# case_when() checks its conditions in order and uses the first one that is
# TRUE; the final TRUE ~ "other" catches every remaining row.
new.starwars <- mutate(
  dplyr::select(starwars, name, mass, gender, species, height),
  type = case_when(
    height > 200 | mass > 200 ~ "large",
    species == "Droid" ~ "robot",
    TRUE ~ "other"
  )
)
new.starwars

## # A tibble: 87 x 6
##    name                mass gender    species height type 
##    <chr>              <dbl> <chr>     <chr>    <int> <chr>
##  1 Luke Skywalker        77 masculine Human      172 other
##  2 C-3PO                 75 masculine Droid      167 robot
##  3 R2-D2                 32 masculine Droid       96 robot
##  4 Darth Vader          136 masculine Human      202 large
##  5 Leia Organa           49 feminine  Human      150 other
##  6 Owen Lars            120 masculine Human      178 other
##  7 Beru Whitesun lars    75 feminine  Human      165 other
##  8 R5-D4                 32 masculine Droid       97 robot
##  9 Biggs Darklighter     84 masculine Human      183 other
## 10 Obi-Wan Kenobi        77 masculine Human      182 other
## # ... with 77 more rows

#Finalizacion del Capitulo 1:Dplyr

Cap 1 dplyr DSCG

Santiago Colorado

13/10/2021

R Markdown