#comandos de filtro
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Load the built-in mtcars data set.
data("mtcars")
# Keep only the six-cylinder cars, then print the result.
six.cyl.only <- mtcars %>%
  filter(cyl == 6)
six.cyl.only
##                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Valiant        18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Merc 280       19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C      17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Ferrari Dino   19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
# Multi-condition filter: a row is kept only when both conditions hold.
Six.cylinder.and.110.horses.power <- mtcars %>%
  filter(cyl == 6 & hp == 110)
Six.cylinder.and.110.horses.power
##                 mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4      21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag  21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Hornet 4 Drive 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
# Logical OR filter: keep cars with four gears or with more than six cylinders.
gear.eq.4.or.more.than.8 <- mtcars %>%
  filter(gear == 4 | cyl > 6)
gear.eq.4.or.more.than.8
##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
# Filtering on minima, maxima and similar criteria: keep the row(s) whose
# displacement equals the smallest displacement in the table.
library(tidyverse)
smallest.engine.displacement <- mtcars %>%
  filter(disp == min(disp))
smallest.engine.displacement
##                 mpg cyl disp hp drat    wt qsec vs am gear carb
## Toyota Corolla 33.9   4 71.1 65 4.22 1.835 19.9  1  1    4    1
# Filter with comma-separated conditions (every condition must hold).
data("ChickWeight")
chick.subset <- ChickWeight %>%
  filter(Time < 3, weight > 53)
chick.subset
##   weight Time Chick Diet
## 1     55    2    22    2
## 2     55    2    40    3
## 3     55    2    43    4
## 4     54    2    50    4
# Filtering missing values in a single column.
data("airquality")
head(airquality, n = 10) # first ten rows, before filtering
##    Ozone Solar.R Wind Temp Month Day
## 1     41     190  7.4   67     5   1
## 2     36     118  8.0   72     5   2
## 3     12     149 12.6   74     5   3
## 4     18     313 11.5   62     5   4
## 5     NA      NA 14.3   56     5   5
## 6     28      NA 14.9   66     5   6
## 7     23     299  8.6   65     5   7
## 8     19      99 13.8   59     5   8
## 9      8      19 20.1   61     5   9
## 10    NA     194  8.6   69     5  10
# Drop every row whose Ozone value is missing.
no.missing.ozone <- airquality %>%
  filter(!is.na(Ozone))
head(no.missing.ozone, n = 8) # after filtering
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    28      NA 14.9   66     5   6
## 6    23     299  8.6   65     5   7
## 7    19      99 13.8   59     5   8
## 8     8      19 20.1   61     5   9
# Drop rows that have an NA in any column (first ten rows of airquality only).
airqual.no.NA.anywhere <- airquality[1:10, ] %>%
  filter(complete.cases(.))
airqual.no.NA.anywhere
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    23     299  8.6   65     5   7
## 6    19      99 13.8   59     5   8
## 7     8      19 20.1   61     5   9
# Filtering with %in%.
data("iris")
table(iris[["Species"]]) # number of rows per species in the data set
## 
##     setosa versicolor  virginica 
##         50         50         50
# Keep only the rows whose species is setosa or virginica.
iris.two.species <- iris %>%
  filter(Species %in% c("setosa", "virginica"))
# Count the rows per factor level of Species after filtering.
table(iris.two.species$Species)
## 
##     setosa versicolor  virginica 
##         50          0         50
# Keep rows with Ozone above 29 and only the first three columns.
data("airquality")
airqual.3.colums <- airquality %>%
  filter(Ozone > 29) %>%
  dplyr::select(1:3)
head(airqual.3.colums)
##   Ozone Solar.R Wind
## 1    41     190  7.4
## 2    36     118  8.0
## 3    34     307 12.0
## 4    30     322 11.5
## 5    32      92 12.0
## 6    45     252 14.9
# Filter by how often a value occurs across all rows: keep only the gear
# values that appear in more than ten rows of mtcars.
table(mtcars$gear)
## 
##  3  4  5 
## 15 12  5
more.frequent.number.of.gears <- mtcars %>%
  group_by(gear) %>%
  filter(n() > 10) %>%
  # Drop the grouping so later verbs on this result are not applied per gear.
  ungroup()
table(more.frequent.number.of.gears$gear)
## 
##  3  4 
## 15 12
# Same group-size filter, combined with a per-row condition on horsepower.
more.frequent.no.of.gears.and.low.horsepower <- mtcars %>%
  group_by(gear) %>%
  filter(n() > 10, hp < 105) %>%
  # Drop the grouping so later verbs on this result are not applied per gear.
  ungroup()
table(more.frequent.no.of.gears.and.low.horsepower$gear)
## 
## 3 4 
## 1 7
# Select columns by name prefix using starts_with().
names(iris) # show the column names
## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"
iris.display <- dplyr::select(iris, starts_with("S"))
head(iris.display) # head() keeps the printed output short
##   Sepal.Length Sepal.Width Species
## 1          5.1         3.5  setosa
## 2          4.9         3.0  setosa
## 3          4.7         3.2  setosa
## 4          4.6         3.1  setosa
## 5          5.0         3.6  setosa
## 6          5.4         3.9  setosa
# Row filter on column criteria: keep the rows where both cyl and hp are at
# their column maximum.
# if_all() replaces the superseded filter_at()/all_vars() pair.
new.mtcars <- mtcars %>%
  filter(if_all(c(cyl, hp), ~ .x == max(.x)))
new.mtcars
##               mpg cyl disp  hp drat   wt qsec vs am gear carb
## Maserati Bora  15   8  301 335 3.54 3.57 14.6  0  1    5    8
# Work on a local copy of the msleep data set exported by ggplot2.
msleep <- ggplot2::msleep
print(msleep)
## # A tibble: 83 x 11
##    name   genus vore  order conservation sleep_total sleep_rem sleep_cycle awake
##    <chr>  <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl> <dbl>
##  1 Cheet~ Acin~ carni Carn~ lc                  12.1      NA        NA      11.9
##  2 Owl m~ Aotus omni  Prim~ <NA>                17         1.8      NA       7  
##  3 Mount~ Aplo~ herbi Rode~ nt                  14.4       2.4      NA       9.6
##  4 Great~ Blar~ omni  Sori~ lc                  14.9       2.3       0.133   9.1
##  5 Cow    Bos   herbi Arti~ domesticated         4         0.7       0.667  20  
##  6 Three~ Brad~ herbi Pilo~ <NA>                14.4       2.2       0.767   9.6
##  7 North~ Call~ carni Carn~ vu                   8.7       1.4       0.383  15.3
##  8 Vespe~ Calo~ <NA>  Rode~ <NA>                 7        NA        NA      17  
##  9 Dog    Canis carni Carn~ domesticated        10.1       2.9       0.333  13.9
## 10 Roe d~ Capr~ herbi Arti~ lc                   3        NA        NA      21  
## # ... with 73 more rows, and 2 more variables: brainwt <dbl>, bodywt <dbl>
# Keep name plus the sleep_total:sleep_rem and brainwt:bodywt column ranges,
# then keep only rows where every column whose name contains "sleep" is
# greater than 5.
# if_all() replaces the superseded filter_at()/all_vars() pair.
msleep.over.5 <- msleep %>%
  select(name, sleep_total:sleep_rem, brainwt:bodywt) %>%
  filter(if_all(contains("sleep"), ~ .x > 5))
msleep.over.5
## # A tibble: 2 x 5
##   name                 sleep_total sleep_rem brainwt bodywt
##   <chr>                      <dbl>     <dbl>   <dbl>  <dbl>
## 1 Thick-tailed opposum        19.4       6.6  NA       0.37
## 2 Giant armadillo             18.1       6.1   0.081  60
# Sorting rows with arrange().
msleep <- ggplot2::msleep
dplyr::select(msleep, 1:4) # first four columns, original row order
## # A tibble: 83 x 4
##    name                       genus       vore  order       
##    <chr>                      <chr>       <chr> <chr>       
##  1 Cheetah                    Acinonyx    carni Carnivora   
##  2 Owl monkey                 Aotus       omni  Primates    
##  3 Mountain beaver            Aplodontia  herbi Rodentia    
##  4 Greater short-tailed shrew Blarina     omni  Soricomorpha
##  5 Cow                        Bos         herbi Artiodactyla
##  6 Three-toed sloth           Bradypus    herbi Pilosa      
##  7 Northern fur seal          Callorhinus carni Carnivora   
##  8 Vesper mouse               Calomys     <NA>  Rodentia    
##  9 Dog                        Canis       carni Carnivora   
## 10 Roe deer                   Capreolus   herbi Artiodactyla
## # ... with 73 more rows
# Ascending sort: by vore first, then by order within each vore.
animal.name.sequence <- msleep %>%
  arrange(vore, order)
animal.name.sequence[, 1:4]
## # A tibble: 83 x 4
##    name              genus        vore  order    
##    <chr>             <chr>        <chr> <chr>    
##  1 Cheetah           Acinonyx     carni Carnivora
##  2 Northern fur seal Callorhinus  carni Carnivora
##  3 Dog               Canis        carni Carnivora
##  4 Domestic cat      Felis        carni Carnivora
##  5 Gray seal         Haliochoerus carni Carnivora
##  6 Tiger             Panthera     carni Carnivora
##  7 Jaguar            Panthera     carni Carnivora
##  8 Lion              Panthera     carni Carnivora
##  9 Caspian seal      Phoca        carni Carnivora
## 10 Genet             Genetta      carni Carnivora
## # ... with 73 more rows
# Descending sort on the second key: vore ascending, order descending.
animal.name.sequence.desc <- msleep %>%
  arrange(vore, desc(order))
head(animal.name.sequence.desc[, 1:4])
## # A tibble: 6 x 4
##   name                       genus         vore  order          
##   <chr>                      <chr>         <chr> <chr>          
## 1 Northern grasshopper mouse Onychomys     carni Rodentia       
## 2 Slow loris                 Nyctibeus     carni Primates       
## 3 Thick-tailed opposum       Lutreolina    carni Didelphimorphia
## 4 Long-nosed armadillo       Dasypus       carni Cingulata      
## 5 Pilot whale                Globicephalus carni Cetacea        
## 6 Common porpoise            Phocoena      carni Cetacea
# Renaming columns.
names(iris) # column names before renaming
## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"
# rename() takes new_name = old_name pairs; other columns are left untouched.
renamed.iris <- iris %>%
  rename(
    width.of.petals = Petal.Width,
    various.plants.and.animals = Species
  )
names(renamed.iris)
## [1] "Sepal.Length"               "Sepal.Width"               
## [3] "Petal.Length"               "width.of.petals"           
## [5] "various.plants.and.animals"
# Adding columns with mutate().
data("ChickWeight")
ChickWeight[1:2, ] # first two rows
##   weight Time Chick Diet
## 1     42    0     1    1
## 2     51    2     1    1
# The same first two rows after adding a log10(weight) column.
Chickweight.with.log <- ChickWeight %>%
  mutate(log.of.weight = log10(weight))
Chickweight.with.log[1:2, ]
##   weight Time Chick Diet log.of.weight
## 1     42    0     1    1      1.623249
## 2     51    2     1    1      1.707570
# Adding several derived columns in one call: start from a fresh msleep copy.
msleep <- ggplot2::msleep
colnames(msleep)
##  [1] "name"         "genus"        "vore"         "order"        "conservation"
##  [6] "sleep_total"  "sleep_rem"    "sleep_cycle"  "awake"        "brainwt"     
## [11] "bodywt"
# Add a square-root column for each of the variables in columns 6-11.
# A named list inside across() replaces the deprecated funs() helper (which
# raised a lifecycle warning) and yields the same "<column>_square root" names.
msleep.with.square.roots <- msleep[, 6:11] %>%
  mutate(across(everything(), list("square root" = sqrt)))
names(msleep.with.square.roots)
##  [1] "sleep_total"             "sleep_rem"              
##  [3] "sleep_cycle"             "awake"                  
##  [5] "brainwt"                 "bodywt"                 
##  [7] "sleep_total_square root" "sleep_rem_square root"  
##  [9] "sleep_cycle_square root" "awake_square root"      
## [11] "brainwt_square root"     "bodywt_square root"
msleep.with.square.roots
## # A tibble: 83 x 12
##    sleep_total sleep_rem sleep_cycle awake  brainwt  bodywt `sleep_total_square~
##          <dbl>     <dbl>       <dbl> <dbl>    <dbl>   <dbl>                <dbl>
##  1        12.1      NA        NA      11.9 NA        50                     3.48
##  2        17         1.8      NA       7    0.0155    0.48                  4.12
##  3        14.4       2.4      NA       9.6 NA         1.35                  3.79
##  4        14.9       2.3       0.133   9.1  0.00029   0.019                 3.86
##  5         4         0.7       0.667  20    0.423   600                     2   
##  6        14.4       2.2       0.767   9.6 NA         3.85                  3.79
##  7         8.7       1.4       0.383  15.3 NA        20.5                   2.95
##  8         7        NA        NA      17   NA         0.045                 2.65
##  9        10.1       2.9       0.333  13.9  0.07     14                     3.18
## 10         3        NA        NA      21    0.0982   14.8                   1.73
## # ... with 73 more rows, and 5 more variables: sleep_rem_square root <dbl>,
## #   sleep_cycle_square root <dbl>, awake_square root <dbl>,
## #   brainwt_square root <dbl>, bodywt_square root <dbl>
# Ranking several columns: start from the Titanic table as a data frame.
data("Titanic")
# Convert the loaded Titanic table into a data frame with a Freq column.
Titanic <- as.data.frame(Titanic)
head(Titanic, n = 6)
##   Class    Sex   Age Survived Freq
## 1   1st   Male Child       No    0
## 2   2nd   Male Child       No    0
## 3   3rd   Male Child       No   35
## 4  Crew   Male Child       No    0
## 5   1st Female Child       No    0
## 6   2nd Female Child       No    0
# Add a descending min_rank() column for Class, Age and Survived.
# across() with a named list replaces the deprecated mutate_at()/funs() pair
# and produces the same Class_Rank, Age_Rank and Survived_Rank column names.
titanic.with.ranks <- Titanic %>%
  mutate(across(c(Class, Age, Survived), list(Rank = ~ min_rank(desc(.x)))))
head(titanic.with.ranks)
##   Class    Sex   Age Survived Freq Class_Rank Age_Rank Survived_Rank
## 1   1st   Male Child       No    0         25       17            17
## 2   2nd   Male Child       No    0         17       17            17
## 3   3rd   Male Child       No   35          9       17            17
## 4  Crew   Male Child       No    0          1       17            17
## 5   1st Female Child       No    0         25       17            17
## 6   2nd Female Child       No    0         17       17            17
# Apply a function to every numeric column.
# divide.by.10: return its (numeric) argument divided by ten.
divide.by.10 <- function(a.number) {
  a.number / 10
}
# across(where(is.numeric), ...) replaces the superseded mutate_if() and
# matches the across() idiom used later in this script.
new.df <- CO2 %>%
  mutate(across(where(is.numeric), divide.by.10))
head(new.df)
##   Plant   Type  Treatment conc uptake
## 1   Qn1 Quebec nonchilled  9.5   1.60
## 2   Qn1 Quebec nonchilled 17.5   3.04
## 3   Qn1 Quebec nonchilled 25.0   3.48
## 4   Qn1 Quebec nonchilled 35.0   3.72
## 5   Qn1 Quebec nonchilled 50.0   3.53
## 6   Qn1 Quebec nonchilled 67.5   3.92
# Small example data frame with missing values in the alpha and apple columns.
df <- data.frame(
  alpha  = c(22, 1, NA),
  almond = c(0, 5, 10),
  grape  = c(0, 2, 2),
  apple  = c(NA, 5, 10)
)
df
##   alpha almond grape apple
## 1    22      0     0    NA
## 2     1      5     2     5
## 3    NA     10     2    10
# String detection and a TRUE/FALSE duplicate indicator.
msleep <- ggplot2::msleep
table(msleep[["vore"]])
## 
##   carni   herbi insecti    omni 
##      19      32       5      20
# Drop the rows whose vore contains "c" or "a"; the two letters are joined
# into the regular expression "c|a".
msleep.no.c.or.a <- msleep %>%
  filter(!str_detect(vore, paste(c("c", "a"), collapse = "|")))
table(msleep.no.c.or.a$vore)
## 
## herbi  omni 
##    32    20
# Flag rows whose conservation value already appeared in an earlier row.
msleep.with.dup.indicator <- msleep %>%
  mutate(duplicate.indicator = duplicated(conservation))
msleep.with.dup.indicator[1:6, ]
## # A tibble: 6 x 12
##   name    genus vore  order conservation sleep_total sleep_rem sleep_cycle awake
##   <chr>   <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl> <dbl>
## 1 Cheetah Acin~ carni Carn~ lc                  12.1      NA        NA      11.9
## 2 Owl mo~ Aotus omni  Prim~ <NA>                17         1.8      NA       7  
## 3 Mounta~ Aplo~ herbi Rode~ nt                  14.4       2.4      NA       9.6
## 4 Greate~ Blar~ omni  Sori~ lc                  14.9       2.3       0.133   9.1
## 5 Cow     Bos   herbi Arti~ domesticated         4         0.7       0.667  20  
## 6 Three-~ Brad~ herbi Pilo~ <NA>                14.4       2.2       0.767   9.6
## # ... with 3 more variables: brainwt <dbl>, bodywt <dbl>,
## #   duplicate.indicator <lgl>
# Same indicator again, printing only name, genus, vore and the new flag
# (columns 1, 2, 3 and 12).
msleep.with.dup.indicator <- msleep %>%
  mutate(duplicate.indicator = duplicated(conservation))
msleep.with.dup.indicator[1:6, c(1:3, 12)]
## # A tibble: 6 x 4
##   name                       genus      vore  duplicate.indicator
##   <chr>                      <chr>      <chr> <lgl>              
## 1 Cheetah                    Acinonyx   carni FALSE              
## 2 Owl monkey                 Aotus      omni  FALSE              
## 3 Mountain beaver            Aplodontia herbi FALSE              
## 4 Greater short-tailed shrew Blarina    omni  TRUE               
## 5 Cow                        Bos        herbi FALSE              
## 6 Three-toed sloth           Bradypus   herbi TRUE
# Flag rows whose (conservation, genus) pair already appeared in an earlier
# row, then sort by those two columns.
# duplicated(conservation, genus) passed genus as the `incomparables`
# argument, so only conservation was compared. Building a two-column data
# frame makes duplicated() compare the pair row by row.
msleep.with.dup.indicator2 <- msleep %>%
  mutate(
    duplicate.indicator = duplicated(data.frame(conservation, genus))
  ) %>%
  arrange(conservation, genus)
msleep.with.dup.indicator2
## # A tibble: 83 x 12
##    name   genus vore  order conservation sleep_total sleep_rem sleep_cycle awake
##    <chr>  <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl> <dbl>
##  1 Giraf~ Gira~ herbi Arti~ cd                   1.9       0.4      NA      22.1
##  2 Pilot~ Glob~ carni Ceta~ cd                   2.7       0.1      NA      21.4
##  3 Cow    Bos   herbi Arti~ domesticated         4         0.7       0.667  20  
##  4 Dog    Canis carni Carn~ domesticated        10.1       2.9       0.333  13.9
##  5 Guine~ Cavis herbi Rode~ domesticated         9.4       0.8       0.217  14.6
##  6 Chinc~ Chin~ herbi Rode~ domesticated        12.5       1.5       0.117  11.5
##  7 Horse  Equus herbi Peri~ domesticated         2.9       0.6       1      21.1
##  8 Donkey Equus herbi Peri~ domesticated         3.1       0.4      NA      20.9
##  9 Domes~ Felis carni Carn~ domesticated        12.5       3.2       0.417  11.5
## 10 Rabbit Oryc~ herbi Lago~ domesticated         8.4       0.9       0.417  15.6
## # ... with 73 more rows, and 3 more variables: brainwt <dbl>, bodywt <dbl>,
## #   duplicate.indicator <lgl>
# Build a second example data frame.
fruit <- c("apple", "pear", "orange", "grape", rep("orange", 2))
x <- c(1, 2, 4, 9, 4, 6)
y <- c(22, 3, 4, 55, 15, 9)
z <- c(3, 1, 4, 10, 12, 8)
w <- c(2, 2, 2, 4, 5, 6)
df <- data.frame(fruit = fruit, x = x, y = y, z = z, w = w)
df
##    fruit x  y  z w
## 1  apple 1 22  3 2
## 2   pear 2  3  1 2
## 3 orange 4  4  4 2
## 4  grape 9 55 10 4
## 5 orange 4 15 12 5
## 6 orange 6  9  8 6
# Mark every repeat of a fruit that already appeared higher up in the table.
df.show.single.dup <- df %>%
  mutate(duplicate.indicator = duplicated(fruit))
df.show.single.dup
##    fruit x  y  z w duplicate.indicator
## 1  apple 1 22  3 2               FALSE
## 2   pear 2  3  1 2               FALSE
## 3 orange 4  4  4 2               FALSE
## 4  grape 9 55 10 4               FALSE
## 5 orange 4 15 12 5                TRUE
## 6 orange 6  9  8 6                TRUE
# Rebuild the example data frame, then drop column z by assigning NULL to it
# inside mutate().
fruit <- c("apple", "pear", "orange", "grape", "orange", "orange")
x <- c(1, 2, 4, 9, 4, 6)
y <- c(22, 3, 4, 55, 15, 9)
z <- c(3, 1, 4, 10, 12, 8)
df <- data.frame(fruit, x, y, z) %>%
  mutate(z = NULL)
df
##    fruit x  y
## 1  apple 1 22
## 2   pear 2  3
## 3 orange 4  4
## 4  grape 9 55
## 5 orange 4 15
## 6 orange 6  9
# Required set-up code.
# Install nycflights13 only when it is missing. The original evaluated
# !require(...) as a bare expression and then called install.packages()
# unconditionally, which tried to install the package on every run.
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  install.packages("nycflights13")
}
library(nycflights13)
library(tidyverse)
# Derive several columns in a single mutate(); later columns refer to columns
# created earlier in the same call.
# gain_per_minute is the gain per minute of air time, i.e. gain_per_hour / 60.
# The original multiplied by 60, which inflated the value 3600-fold.
flights %>%
  mutate(
    gain = arr_delay - dep_delay,
    hours = air_time / 60,
    gain_per_hour = gain / hours,
    gain_per_minute = gain_per_hour / 60
  )
## # A tibble: 336,776 x 23
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 336,766 more rows, and 15 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## #   gain <dbl>, hours <dbl>, gain_per_hour <dbl>, gain_per_minute <dbl>
# The same derived columns built step by step through a pipeline.
# gain_per_minute is gain_per_hour / 60 (the original multiplied by 60).
# The gain_per_minute values in the recorded output below were recalculated by
# hand to match, not re-run.
newfield.flights <- flights %>%
  mutate(
    gain = arr_delay - dep_delay,
    hours = air_time / 60
  ) %>%
  mutate(gain_per_hour = gain / hours) %>%
  mutate(gain_per_minute = gain_per_hour / 60)
newfield.flights[1:6, c(1:2, 20:23)]
## # A tibble: 6 x 6
##    year month  gain hours gain_per_hour gain_per_minute
##   <int> <int> <dbl> <dbl>         <dbl>           <dbl>
## 1  2013     1     9  3.78          2.38          0.0396
## 2  2013     1    16  3.78          4.23          0.0705
## 3  2013     1    31  2.67         11.6           0.194
## 4  2013     1   -17  3.05         -5.57         -0.0929
## 5  2013     1   -19  1.93         -9.83         -0.164
## 6  2013     1    16  2.5           6.4           0.107
# transmute(): keep only the newly created variables.
fruit <- c("apple", "pear", rep("orange", 1), "grape", rep("orange", 2))
x <- c(1, 2, 4, 9, 4, 6)
y <- c(22, 3, 4, 55, 15, 9)
z <- c(3, 1, 4, 10, 12, 8)
df <- data.frame(fruit = fruit, x = x, y = y, z = z)
df # before transmute
##    fruit x  y  z
## 1  apple 1 22  3
## 2   pear 2  3  1
## 3 orange 4  4  4
## 4  grape 9 55 10
## 5 orange 4 15 12
## 6 orange 6  9  8
# Replace df with a single column holding the row-wise sum x + y + z.
df <- df %>%
  transmute(new.variable = x + y + z)
df # after transmute
##   new.variable
## 1           26
## 2            6
## 3           12
## 4           74
## 5           31
## 6           23
# Using across() to apply one function to several columns.
# double.it: return twice its argument.
double.it <- function(x) {
  x * 2
}
head(iris, n = 6)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# Show the first rows of iris with every numeric column doubled. The result is
# only printed, not assigned, so iris itself is unchanged.
head(mutate(iris, across(where(is.numeric), double.it)))
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1         10.2         7.0          2.8         0.4  setosa
## 2          9.8         6.0          2.8         0.4  setosa
## 3          9.4         6.4          2.6         0.4  setosa
## 4          9.2         6.2          3.0         0.4  setosa
## 5         10.0         7.2          2.8         0.4  setosa
## 6         10.8         7.8          3.4         0.8  setosa
# Conditional mutation with case_when(): first build a small numeric table.
# row1 holds the column names; row2-row5 hold the data rows.
row1 <- c("a", "b", "c", "d", "e", "f", "column.to.be.changed")
row2 <- c(1, 1, 1, 6, 6, 1, 2)
row3 <- c(3, 4, 4, 6, 4, 4, 4)
row4 <- c(4, 6, 25, 5, 5, 2, 9)
row5 <- c(5, 3, 6, 3, 3, 6, 2)
df <- setNames(as.data.frame(rbind(row2, row3, row4, row5)), row1)
df
##      a b  c d e f column.to.be.changed
## row2 1 1  1 6 6 1                    2
## row3 3 4  4 6 4 4                    4
## row4 4 6 25 5 5 2                    9
## row5 5 3  6 3 3 6                    2
# Recode column.to.be.changed; the first matching condition wins:
#   2  when a is 2, 5 or 7, or when a is 1 and b is 4
#   3  when a is 0, 1, 3 or 4, or when c is 4
#   NA otherwise
new.df <- df %>%
  mutate(
    column.to.be.changed = case_when(
      a %in% c(2, 5, 7) | (a == 1 & b == 4) ~ 2,
      a %in% c(0, 1, 3, 4) | c == 4 ~ 3,
      TRUE ~ NA_real_
    )
  )
new.df
##      a b  c d e f column.to.be.changed
## row2 1 1  1 6 6 1                    3
## row3 3 4  4 6 4 4                    3
## row4 4 6 25 5 5 2                    3
## row5 5 3  6 3 3 6                    2
# Dropping columns.
fruit <- c("apple", "pear", "orange", "grape", rep("orange", times = 2))
x <- c(1, 2, 4, 9, 4, 6)
y <- c(22, 3, 4, 55, 15, 9)
z <- c(3, 1, 4, 10, 12, 8)
df <- data.frame(fruit = fruit, x = x, y = y, z = z) # before select
df
##    fruit x  y  z
## 1  apple 1 22  3
## 2   pear 2  3  1
## 3 orange 4  4  4
## 4  grape 9 55 10
## 5 orange 4 15 12
## 6 orange 6  9  8
# Put a minus sign in front of a column name to leave that column out of the
# result; here the fruit column is removed.
new.df.no.fruit <- df %>%
  dplyr::select(-fruit)
new.df.no.fruit # after select
##   x  y  z
## 1 1 22  3
## 2 2  3  1
## 3 4  4  4
## 4 9 55 10
## 5 4 15 12
## 6 6  9  8
# Dropping columns by name with starts_with() or ends_with().
data("mtcars")
names(mtcars)
##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"
# Drop every column whose name starts with "d".
mtcars.no.col.names.start.with.d <- mtcars %>%
  select(-starts_with("d"))
names(mtcars.no.col.names.start.with.d)
## [1] "mpg"  "cyl"  "hp"   "wt"   "qsec" "vs"   "am"   "gear" "carb"
# Drop every column whose name ends with "t".
mtcars.no.col.names.ends.with <- mtcars %>%
  select(-ends_with("t"))
names(mtcars.no.col.names.ends.with)
## [1] "mpg"  "cyl"  "disp" "hp"   "qsec" "vs"   "am"   "gear" "carb"
# Reordering columns: rebuild the example data frame first.
fruit <- c("apple", "pear", "orange", "grape", "orange", "orange")
x <- c(1, 2, 4, 9, 4, 6)
y <- c(22, 3, 4, 55, 15, 9)
z <- c(3, 1, 4, 10, 12, 8)
df <- data.frame(fruit = fruit, x = x, y = y, z = z)
df
##    fruit x  y  z
## 1  apple 1 22  3
## 2   pear 2  3  1
## 3 orange 4  4  4
## 4  grape 9 55 10
## 5 orange 4 15 12
## 6 orange 6  9  8
# select_all(): apply a function to all column names.
# Build a new data frame to work on.
state <- c("Maryland", "Alaska", "New Jersey")
income <- c(76067, 74444, 73702)
median.us <- rep(61372, times = 3)
life.expectancy <- c(78.8, 78.3, 80.3)
top.3.states <- data.frame(
  state = state,
  income = income,
  median.us = median.us,
  life.expectancy = life.expectancy
)
top.3.states # before: the column names are not capitalised
##        state income median.us life.expectancy
## 1   Maryland  76067     61372            78.8
## 2     Alaska  74444     61372            78.3
## 3 New Jersey  73702     61372            80.3
# Upper-case every column name with toupper().
# rename_with() replaces the superseded select_all(.tbl, .funs) form; all
# columns are kept and only their names change.
new.top.3.states <- top.3.states %>%
  rename_with(toupper)
new.top.3.states # after applying toupper()
##        STATE INCOME MEDIAN.US LIFE.EXPECTANCY
## 1   Maryland  76067     61372            78.8
## 2     Alaska  74444     61372            78.3
## 3 New Jersey  73702     61372            80.3
# Extracting a single column as a vector with pull().
top.3.states <- data.frame(
  state = state,
  income = income,
  median.us = median.us,
  life.expectancy = life.expectancy
)
top.3.states # show the contents of the data frame
##        state income median.us life.expectancy
## 1   Maryland  76067     61372            78.8
## 2     Alaska  74444     61372            78.3
## 3 New Jersey  73702     61372            80.3
# Positive positions count from the left: 1 is the first column.
pull.first.column <- top.3.states %>% pull(1)
pull.first.column
## [1] "Maryland"   "Alaska"     "New Jersey"
# Negative positions count from the right: -1 is the last column.
pull.last.column <- top.3.states %>% pull(-1)
pull.last.column
## [1] 78.8 78.3 80.3
# Row selection: keep rows where any variable meets a condition.
nrow(mtcars)
## [1] 32
# Keep the rows in which at least one column holds a value above 200.
# if_any() replaces the superseded filter_all()/any_vars() pair.
mtcars.more.than.200 <- mtcars %>%
  filter(if_any(everything(), ~ .x > 200))
nrow(mtcars.more.than.200)
## [1] 16
# Column selection: omit columns whose names contain specific characters.
names(mtcars)
##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"
# Column names of the data frame after the selection.
cars.with.no.p <- dplyr::select(mtcars, -contains("p"))
names(cars.with.no.p) # no column whose name contains "p" is left
## [1] "cyl"  "drat" "wt"   "qsec" "vs"   "am"   "gear" "carb"
# Selecting columns with a regular-expression match.
names(mtcars)
##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"
# Keep the columns whose name matches "pg" or "gea".
subset.mtcars <- mtcars %>%
  select(matches("pg|gea"))
names(subset.mtcars)
## [1] "mpg"  "gear"
# Left join (the most common join): first table, state abbreviation and area.
# data.frame() keeps state.area numeric. The original went through cbind(),
# which builds a character matrix and so turned the areas into strings.
us.state.areas <- data.frame(state.abb, state.area)
us.state.areas[1:3, ]
##   state.abb state.area
## 1        AL      51609
## 2        AK     589757
## 3        AZ     113909
# Second table: state abbreviation and full state name.
us.state.abbreviation.and.name <- data.frame(state.abb, state.name)
us.state.abbreviation.and.name[1:3, ]
##   state.abb state.name
## 1        AL    Alabama
## 2        AK     Alaska
## 3        AZ    Arizona
# Left join: keep every row of us.state.areas and add the matching state name.
state.info.abb.area.name <- left_join(
  us.state.areas,
  us.state.abbreviation.and.name,
  by = "state.abb"
)
head(state.info.abb.area.name)
##   state.abb state.area state.name
## 1        AL      51609    Alabama
## 2        AK     589757     Alaska
## 3        AZ     113909    Arizona
## 4        AR      53104   Arkansas
## 5        CA     158693 California
## 6        CO     104247   Colorado
# Inner join. Build the first data frame.
names <- c("Sally", "Tom", "Frieda", "Alfonzo")
team.scores <- c(3, 5, 2, 7)
team.league <- c("alpha", "beta", "gamma", "omicron")
team.info <- data.frame(names, team.scores, team.league)
# Build the second data frame.
names <- c("Sally", "Tom", "Bill", "Alfonzo")
school.grades <- c("A", "B", "C", "B")
school.info <- data.frame(names, school.grades)
# Keep only the people that appear in both tables.
school.and.team <- team.info %>%
  inner_join(school.info, by = "names")
school.and.team
##     names team.scores team.league school.grades
## 1   Sally           3       alpha             A
## 2     Tom           5        beta             B
## 3 Alfonzo           7     omicron             B
# Anti join: rows of the first table that have no match in the second.
names <- c("Sally", "Tom", "Frieda", "Alfonzo")
team.scores <- c(3, 5, 2, 7)
team.league <- c("alpha", "beta", "gamma", "omicron")
team.info <- data.frame(names, team.scores, team.league)
team.info
##     names team.scores team.league
## 1   Sally           3       alpha
## 2     Tom           5        beta
## 3  Frieda           2       gamma
## 4 Alfonzo           7     omicron
names <- c("Sally", "Tom", "Bill", "Alfonzo")
school.grades <- c("A", "B", "C", "B")
school.info <- data.frame(names, school.grades)
school.info
##     names school.grades
## 1   Sally             A
## 2     Tom             B
## 3    Bill             C
## 4 Alfonzo             B
# Team members that have no row in school.info.
team.info.but.no.grades <- team.info %>%
  anti_join(school.info, by = "names")
team.info.but.no.grades
##    names team.scores team.league
## 1 Frieda           2       gamma
# Full join: keep every row from both tables, filling the gaps with NA.
names <- c("Sally", "Tom", "Frieda", "Alfonzo")
team.scores <- c(3, 5, 2, 7)
team.league <- c("alpha", "beta", "gamma", "omicron")
team.info <- data.frame(names, team.scores, team.league)
names <- c("Sally", "Tom", "Bill", "Alfonzo")
school.grades <- c("A", "B", "C", "B")
school.info <- data.frame(names, school.grades)
team.info.and.or.grades <- team.info %>%
  full_join(school.info, by = "names")
team.info.and.or.grades
##     names team.scores team.league school.grades
## 1   Sally           3       alpha             A
## 2     Tom           5        beta             B
## 3  Frieda           2       gamma          <NA>
## 4 Alfonzo           7     omicron             B
## 5    Bill          NA        <NA>             C
# Semi join: keep the team.info rows that have a match in school.info,
# without adding any school.info columns.
# The key is spelled out so the join does not depend on whichever column
# names the two tables happen to share, and no "Joining, by" message is
# emitted.
team.info.with.grades <- semi_join(team.info, school.info, by = "names")
team.info.with.grades
##     names team.scores team.league
## 1   Sally           3       alpha
## 2     Tom           5        beta
## 3 Alfonzo           7     omicron
# Right join: rebuild the two lookup tables.
# data.frame() keeps state.area numeric. as.data.frame(cbind(...)) goes
# through a character matrix and turns the areas into strings.
us.state.areas <- data.frame(state.abb, state.area,
                             stringsAsFactors = FALSE)
us.state.areas[1:3, ]
##   state.abb state.area
## 1        AL      51609
## 2        AK     589757
## 3        AZ     113909
us.state.abbreviation.and.name <- data.frame(state.abb, state.name,
                                             stringsAsFactors = FALSE)
us.state.abbreviation.and.name[1:3, ]
##   state.abb state.name
## 1        AL    Alabama
## 2        AK     Alaska
## 3        AZ    Arizona
# Overwrite the key of the first row on purpose so that it no longer
# matches any abbreviation in us.state.areas.
us.state.abbreviation.and.name[1, 1] <- "Intentional Mismatch"
# Right join on the abbreviation: us.state.abbreviation.and.name is the
# right-hand table of the join.
us.state.with.abbreviation.and.name.and.area <- us.state.areas %>%
  right_join(us.state.abbreviation.and.name, by = "state.abb")
us.state.with.abbreviation.and.name.and.area[1:3, ]
##   state.abb state.area state.name
## 1        AK     589757     Alaska
## 2        AZ     113909    Arizona
## 3        AR      53104   Arkansas
# slice(): pick rows by position.
msleep <- ggplot2::msleep
nrow(msleep) # the data set has 83 rows
## [1] 83
# Dropping rows 6 through the last one leaves only the first five.
msleep.only.first.5 <- msleep %>%
  slice(-6:-n())
nrow(msleep.only.first.5)
## [1] 5
# Rows 20 to 39 inclusive, i.e. twenty rows.
msleep.20.rows <- slice(msleep, 20:39)
nrow(msleep.20.rows)
## [1] 20
nrow(msleep) - nrow(msleep.20.rows)
## [1] 63
# summarise(): collapse rows into summary values.
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
data(gehan)
gehan2 <- gehan
library(tidyverse)
# How many patients were in the medical trial?
summarise(gehan2, kount = n())
##   kount
## 1    42
# Patient count for each value of treat.
summarise(group_by(gehan2, treat), kount = n())
## # A tibble: 2 x 2
##   treat   kount
##   <fct>   <int>
## 1 6-MP       21
## 2 control    21
# Centre and spread of the time column within each treat group.
summarise(
  group_by(gehan2, treat),
  average.remiss.time = mean(time),
  median.remiss.time = median(time),
  std.dev.remiss.time = sd(time),
  median.abs.deviation = mad(time),
  IQR.remiss.time = IQR(time)
)
## # A tibble: 2 x 6
##   treat   average.remiss.time median.remiss.t~ std.dev.remiss.~ median.abs.devi~
##   <fct>                 <dbl>            <int>            <dbl>            <dbl>
## 1 6-MP                  17.1                16            10.0             10.4 
## 2 control                8.67                8             6.47             5.93
## # ... with 1 more variable: IQR.remiss.time <dbl>
# Smallest and largest value of time within each treat group.
summarise(
  group_by(gehan2, treat),
  minimum.remission = min(time),
  max.remission = max(time)
)
## # A tibble: 2 x 3
##   treat   minimum.remission max.remission
##   <fct>               <int>         <int>
## 1 6-MP                    6            35
## 2 control                 1            23
# summarise(across()): apply one summary to several columns at once.
library(MASS)
# Work with the first ten rows of the survey data only.
subset.survey <- head(survey, 10)
library(dplyr)
head(subset.survey)
##      Sex Wr.Hnd NW.Hnd W.Hnd    Fold Pulse    Clap Exer Smoke Height      M.I
## 1 Female   18.5   18.0 Right  R on L    92    Left Some Never 173.00   Metric
## 2   Male   19.5   20.5  Left  R on L   104    Left None Regul 177.80 Imperial
## 3   Male   18.0   13.3 Right  L on R    87 Neither None Occas     NA     <NA>
## 4   Male   18.8   18.9 Right  R on L    NA Neither None Never 160.00   Metric
## 5   Male   20.0   20.0 Right Neither    35   Right Some Never 165.00   Metric
## 6 Female   18.0   17.7 Right  L on R    64   Right Some Never 172.72 Imperial
##      Age
## 1 18.250
## 2 17.583
## 3 16.917
## 4 20.333
## 5 23.667
## 6 21.000
# Mean of every numeric column per Sex, after dropping rows that contain
# any NA. Output columns are named mean_<original column>.
head(
  summarise(
    group_by(na.omit(subset.survey), Sex),
    across(where(is.numeric), mean, .names = "mean_{col}")
  )
)
## # A tibble: 2 x 6
##   Sex    mean_Wr.Hnd mean_NW.Hnd mean_Pulse mean_Height mean_Age
##   <fct>        <dbl>       <dbl>      <dbl>       <dbl>    <dbl>
## 1 Female        17.8        17.7       76.7        168.     25.0
## 2 Male          19.1        19.2       76.8        174.     20.3
# Group msleep by the vore and order columns, then count rows per group.
# The count column is left unnamed, so it is called `n()` in the result.
new.sleep <- group_by(msleep, vore, order)
s <- new.sleep %>%
  summarise(n())
## `summarise()` has grouped output by 'vore'. You can override using the `.groups` argument.
s
## # A tibble: 32 x 3
## # Groups:   vore [5]
##    vore  order           `n()`
##    <chr> <chr>           <int>
##  1 carni Carnivora          12
##  2 carni Cetacea             3
##  3 carni Cingulata           1
##  4 carni Didelphimorphia     1
##  5 carni Primates            1
##  6 carni Rodentia            1
##  7 herbi Artiodactyla        5
##  8 herbi Diprotodontia       1
##  9 herbi Hyracoidea          2
## 10 herbi Lagomorpha          1
## # ... with 22 more rows
# Row counts per vore/order group, written as one nested expression.
new.sleep.totals <- summarise(group_by(msleep, vore, order), n())
## `summarise()` has grouped output by 'vore'. You can override using the `.groups` argument.
new.sleep.totals
## # A tibble: 32 x 3
## # Groups:   vore [5]
##    vore  order           `n()`
##    <chr> <chr>           <int>
##  1 carni Carnivora          12
##  2 carni Cetacea             3
##  3 carni Cingulata           1
##  4 carni Didelphimorphia     1
##  5 carni Primates            1
##  6 carni Rodentia            1
##  7 herbi Artiodactyla        5
##  8 herbi Diprotodontia       1
##  9 herbi Hyracoidea          2
## 10 herbi Lagomorpha          1
## # ... with 22 more rows
# gather(): collapse several columns into a single key/value pair.
# Build the wide table first.
state <- c("Maryland", "Alaska", "New Jersey")
income <- c(76067, 74444, 73702)
median.us <- rep(61372, 3)
life.expectancy <- c(78.8, 78.3, 80.3)
teen.birth.rate.2015 <- c(17, 29.3, 12.1)
teen.birth.rate.2007 <- c(34.3, 42.9, 24.9)
teen.birth.rate.1991 <- c(54.1, 66, 41.3)
top.3.states <- data.frame(
  state, income, median.us, life.expectancy,
  teen.birth.rate.2015, teen.birth.rate.2007, teen.birth.rate.1991
)
# Rename the three teen-birth-rate columns to their bare years.
names(top.3.states)[5:7] <- c("2015", "2007", "1991")
top.3.states
##        state income median.us life.expectancy 2015 2007 1991
## 1   Maryland  76067     61372            78.8 17.0 34.3 54.1
## 2     Alaska  74444     61372            78.3 29.3 42.9 66.0
## 3 New Jersey  73702     61372            80.3 12.1 24.9 41.3
# Stack the three year columns into `year` (former column name) and
# `cases` (former cell value).
new.top.3.states <- gather(
  top.3.states,
  key = "year", value = "cases",
  "2015", "2007", "1991"
)
new.top.3.states
##        state income median.us life.expectancy year cases
## 1   Maryland  76067     61372            78.8 2015  17.0
## 2     Alaska  74444     61372            78.3 2015  29.3
## 3 New Jersey  73702     61372            80.3 2015  12.1
## 4   Maryland  76067     61372            78.8 2007  34.3
## 5     Alaska  74444     61372            78.3 2007  42.9
## 6 New Jersey  73702     61372            80.3 2007  24.9
## 7   Maryland  76067     61372            78.8 1991  54.1
## 8     Alaska  74444     61372            78.3 1991  66.0
## 9 New Jersey  73702     61372            80.3 1991  41.3
library(tidyverse)
library(nycflights13)
library(dplyr)
# spread(): turn the values of one column into several columns
# (long to wide).
# tibble() is used to build the input; data_frame() is deprecated since
# tibble 1.1.0 and raises a deprecation warning.
df_1 <- tibble(
  Type = c("TypeA", "TypeA", "TypeB", "TypeB"),
  Answer = c("Yes", "No", NA, "No"),
  n = 1:4
)
df_1 # before
## # A tibble: 4 x 3
##   Type  Answer     n
##   <chr> <chr>  <int>
## 1 TypeA Yes        1
## 2 TypeA No         2
## 3 TypeB <NA>       3
## 4 TypeB No         4
# Rows with a missing Answer are dropped before spreading.
df_2 <- df_1 %>%
  filter(!is.na(Answer)) %>%
  spread(key = Answer, value = n)
df_2 # after
## # A tibble: 2 x 3
##   Type     No   Yes
##   <chr> <int> <int>
## 1 TypeA     2     1
## 2 TypeB     4    NA
# separate(): split a single column into several columns.
# teen.birth packs three values into one string, delimited by "//".
state <- c("Maryland", "Alaska", "New Jersey")
income <- c(76067, 74444, 73702)
median.us <- rep(61372, 3)
life.expectancy <- c(78.8, 78.3, 80.3)
teen.birth <- c("17//34.3//54.1", "29.0//42.9//66.0", "12.1//24.9//41.3")
top.3.states <- data.frame(
  state, income, median.us, life.expectancy, teen.birth
)
top.3.states
##        state income median.us life.expectancy       teen.birth
## 1   Maryland  76067     61372            78.8   17//34.3//54.1
## 2     Alaska  74444     61372            78.3 29.0//42.9//66.0
## 3 New Jersey  73702     61372            80.3 12.1//24.9//41.3
# Split teen.birth at each "//" into one column per year.
top.3.states.separated.years <- separate(
  top.3.states,
  teen.birth,
  into = c("2015", "2007", "1991"),
  sep = "//"
)
top.3.states.separated.years
##        state income median.us life.expectancy 2015 2007 1991
## 1   Maryland  76067     61372            78.8   17 34.3 54.1
## 2     Alaska  74444     61372            78.3 29.0 42.9 66.0
## 3 New Jersey  73702     61372            80.3 12.1 24.9 41.3
# Basic counts
# Add the size of each row's group as a new rightmost column, kount.
m <- new.sleep %>%
  mutate(kount = n())
m[1:5, c(1:4, 10:12)] # column subset chosen so the output fits on a page
## # A tibble: 5 x 7
## # Groups:   vore, order [5]
##   name                       genus      vore  order        brainwt  bodywt kount
##   <chr>                      <chr>      <chr> <chr>          <dbl>   <dbl> <int>
## 1 Cheetah                    Acinonyx   carni Carnivora   NA        50        12
## 2 Owl monkey                 Aotus      omni  Primates     0.0155    0.48     10
## 3 Mountain beaver            Aplodontia herbi Rodentia    NA         1.35     16
## 4 Greater short-tailed shrew Blarina    omni  Soricomorp~  0.00029   0.019     3
## 5 Cow                        Bos        herbi Artiodacty~  0.423   600         5
# Keep only the rows whose group (as defined by new.sleep's grouping)
# has more than 14 members.
f <- new.sleep %>%
  filter(n() > 14)
f[1:5, c(1:4, 10:11)]
## # A tibble: 5 x 6
## # Groups:   vore, order [1]
##   name                      genus      vore  order    brainwt bodywt
##   <chr>                     <chr>      <chr> <chr>      <dbl>  <dbl>
## 1 Mountain beaver           Aplodontia herbi Rodentia NA       1.35 
## 2 Guinea pig                Cavis      herbi Rodentia  0.0055  0.728
## 3 Chinchilla                Chinchilla herbi Rodentia  0.0064  0.42 
## 4 Western american chipmunk Eutamias   herbi Rodentia NA       0.071
## 5 Mongolian gerbil          Meriones   herbi Rodentia NA       0.053
# Positional helpers: first(), last(), nth().
salary.description <- c(
  "Golden parachute type", "Well to do", "Average", "Below average",
  "bring date seeds instead of flowers"
)
salary.description %>% first()
## [1] "Golden parachute type"
# Last entry
salary.description %>% last()
## [1] "bring date seeds instead of flowers"
# Third entry counting from the end
salary.description %>% nth(-3)
## [1] "Average"
# Second element of the vector
salary.description %>% nth(2)
## [1] "Well to do"
# Count distinct values
library(tidyverse)
a.vector <- c(22, 33, 44, 1, 2, 3, 3, 3, 4)
original.length <- length(a.vector)
original.length
## [1] 9
# Number of different values in the vector
distinct.a.vector <- n_distinct(a.vector)
distinct.a.vector
## [1] 7
# The condition is a single TRUE/FALSE, so a plain if/else is sufficient.
# The label is written on one line: wrapping the string literal across two
# source lines embeds a newline in the text ("all values\nunique").
test1 <- if (original.length == distinct.a.vector) {
  "all values unique"
} else {
  "some duplicate values in vector"
}
test1
## [1] "some duplicate values in vector"
# A vector in which every value is unique
b.vector <- c(1, 2, 3, 4, 5, 6)
length(b.vector)
## [1] 6
distinct.b.vector <- n_distinct(b.vector)
distinct.b.vector # count of distinct numbers
## [1] 6
test2 <- if (length(b.vector) == distinct.b.vector) {
  "all values unique"
} else {
  "duplicates"
}
test2
## [1] "all values unique"
# na_if(): turn a chosen value into NA
test <- c(100, 0, 999)
x <- 5000 / test
x
## [1] 50.000000       Inf  5.005005
# Replace zeros in the divisor with NA so the division yields NA, not Inf.
x <- 5000 / (test %>% na_if(0))
x
## [1] 50.000000        NA  5.005005
class(x) # class() shows the type of the variable
## [1] "numeric"
# coalesce(): fill in missing values
x <- c(33, 4, 11, NA, 9)
x
## [1] 33  4 11 NA  9
# Every NA is replaced by 0.
x <- x %>% coalesce(0)
x
## [1] 33  4 11  0  9
# Ranking
y <- c(100, 4, 12, 6, 8, 3)
rank1 <- row_number(y)
rank1
## [1] 6 2 5 3 4 1
# rank1[i] is the rank of y[i], not a position in y, so it must not be used
# as an index into y (y[rank1[3]] is y[5] = 8, yet the third-smallest value
# is 6). Select by comparing ranks instead.
y[rank1 == 1] # value with the lowest rank: 3
## [1] 3
y[rank1 == length(y)] # value with the highest rank: 100
## [1] 100
# Minimum rank
rank2 <- y %>% min_rank() # for this particular y, same result as row_number()
rank2
## [1] 6 2 5 3 4 1
# Dense rank
rank3 <- y %>% dense_rank()
rank3
## [1] 6 2 5 3 4 1
# Percent rank
rank4 <- y %>% percent_rank()
rank4
## [1] 1.0 0.2 0.8 0.4 0.6 0.0
# Cumulative distribution function
y <- c(100, 4, 12, 6, 8, 3)
rank5 <- y %>% cume_dist()
rank5
## [1] 1.0000000 0.3333333 0.8333333 0.5000000 0.6666667 0.1666667
rank6 <- y %>% ntile(3) # split into 3 buckets
rank6
## [1] 3 1 3 2 2 1
# Deciles (0%, 10%, ..., 100%) of a small vector, quantile algorithm type 5.
# Argument names are written out in full (`probs`, `length.out`) rather than
# relying on partial argument matching of `prob` and `length`.
test.vector <- c(2, 22, 33, 44, 77, 89, 99)
quantile(test.vector, probs = seq(0, 1, length.out = 11), type = 5)
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##  2.0  6.0 20.0 28.6 36.3 44.0 67.1 81.8 90.0 97.0 99.0
# Sampling
data("ChickWeight")
# Five rows drawn before any seed is set in this section.
my.sample <- ChickWeight %>% sample_n(5)
my.sample
##   weight Time Chick Diet
## 1    153   14    41    4
## 2     58    4     2    1
## 3    141   12     6    1
## 4    108   14     4    1
## 5    162   16     2    1
# Fix the seed, then draw ten rows with replacement.
set.seed(833)
my.sample <- ChickWeight %>% sample_n(10, replace = TRUE)
my.sample
##    weight Time Chick Diet
## 1      98    8    45    4
## 2      42    0    17    1
## 3      98    8    36    3
## 4      51    2    11    1
## 5     198   20     3    1
## 6     237   21    49    4
## 7     205   16    50    4
## 8     170   16    39    3
## 9     332   18    35    3
## 10    144   14    33    3
# Twelve cars drawn with sampling weights taken from the cyl column.
my.sample <- mtcars %>% sample_n(12, weight = cyl)
my.sample[, 1:5]
##                     mpg cyl  disp  hp drat
## AMC Javelin        15.2   8 304.0 150 3.15
## Porsche 914-2      26.0   4 120.3  91 4.43
## Merc 280           19.2   6 167.6 123 3.92
## Cadillac Fleetwood 10.4   8 472.0 205 2.93
## Merc 240D          24.4   4 146.7  62 3.69
## Datsun 710         22.8   4 108.0  93 3.85
## Merc 280C          17.8   6 167.6 123 3.92
## Mazda RX4 Wag      21.0   6 160.0 110 3.90
## Merc 450SLC        15.2   8 275.8 180 3.07
## Chrysler Imperial  14.7   8 440.0 230 3.23
## Maserati Bora      15.0   8 301.0 335 3.54
## Valiant            18.1   6 225.0 105 2.76
# A 2% sample of the ChickWeight rows.
test1 <- ChickWeight %>% sample_frac(0.02)
test1
##    weight Time Chick Diet
## 1      48    2    13    1
## 2      62    6    12    1
## 3     197   20    45    4
## 4     234   18    42    4
## 5      58    4    28    2
## 6     163   16     3    1
## 7     103    8    41    4
## 8     103    8    42    4
## 9     120   18    19    1
## 10     48    2    36    3
## 11     80    6    48    4
## 12    137   12    33    3
# Sample 7% with replacement from the starwars data grouped by hair_color.
by_hair_color <- group_by(starwars, hair_color)
my.sample <- by_hair_color %>% sample_frac(0.07, replace = TRUE)
my.sample[, 1:5]
## # A tibble: 5 x 5
## # Groups:   hair_color [3]
##   name       height  mass hair_color skin_color      
##   <chr>       <int> <dbl> <chr>      <chr>           
## 1 Eeth Koth     171    NA black      brown           
## 2 Dormé         165    NA brown      light           
## 3 Sebulba       112    40 none       grey, red       
## 4 Shaak Ti      178    57 none       red, blue, white
## 5 Tion Medon    206    80 none       grey
# tally(): total number of rows.
row.kount.only <- tally(ChickWeight)
row.kount.only
##     n
## 1 578
# count(): number of rows per value of Diet.
diet.kount <- count(ChickWeight, Diet)
diet.kount
##   Diet   n
## 1    1 220
## 2    2 120
## 3    3 120
## 4    4 118
# add_count(): append the per-species row count as column n, then keep only
# the rows whose species occurs exactly once.
single.species.kount <- filter(add_count(starwars, species), n == 1)
single.species.kount[, 1:6]
## # A tibble: 29 x 6
##    name                  height  mass hair_color skin_color       eye_color
##    <chr>                  <int> <dbl> <chr>      <chr>            <chr>    
##  1 Greedo                   173    74 <NA>       green            black    
##  2 Jabba Desilijic Tiure    175  1358 <NA>       green-tan, brown orange   
##  3 Yoda                      66    17 white      green            brown    
##  4 Bossk                    190   113 none       green            red      
##  5 Ackbar                   180    83 none       brown mottle     orange   
##  6 Wicket Systri Warrick     88    20 brown      brown            brown    
##  7 Nien Nunb                160    68 none       grey             black    
##  8 Nute Gunray              191    90 none       mottled green    red      
##  9 Watto                    137    NA black      blue, grey       yellow   
## 10 Sebulba                  112    40 none       grey, red        orange   
## # ... with 19 more rows
# rename()
# Reload the pristine dataset first so this section can be re-run:
# rename() fails once `mpg` has already been renamed. A rename() call placed
# before data(mtcars) was removed because the reload discarded its result.
data(mtcars)
names(mtcars)
##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"
# New name on the left, existing name on the right.
mtcars <- rename(mtcars, spam_mpg = mpg)
names(mtcars)
##  [1] "spam_mpg" "cyl"      "disp"     "hp"       "drat"     "wt"      
##  [7] "qsec"     "vs"       "am"       "gear"     "carb"
# case_when(): label each row using the first condition that is TRUE.
data(starwars)
new.starwars <- mutate(
  dplyr::select(starwars, name, mass, gender, species, height),
  type = case_when(
    height > 200 | mass > 200 ~ "large",
    species == "Droid" ~ "robot",
    TRUE ~ "other"
  )
)
new.starwars
## # A tibble: 87 x 6
##    name                mass gender    species height type 
##    <chr>              <dbl> <chr>     <chr>    <int> <chr>
##  1 Luke Skywalker        77 masculine Human      172 other
##  2 C-3PO                 75 masculine Droid      167 robot
##  3 R2-D2                 32 masculine Droid       96 robot
##  4 Darth Vader          136 masculine Human      202 large
##  5 Leia Organa           49 feminine  Human      150 other
##  6 Owen Lars            120 masculine Human      178 other
##  7 Beru Whitesun lars    75 feminine  Human      165 other
##  8 R5-D4                 32 masculine Droid       97 robot
##  9 Biggs Darklighter     84 masculine Human      183 other
## 10 Obi-Wan Kenobi        77 masculine Human      182 other
## # ... with 77 more rows