R Markdown

1.1 FILTER COMMANDS

#seleccion de lista unicamente 6 cilindros
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Load mtcars and keep only the rows whose cyl column equals 6.
data("mtcars")
solo.6.cilindros <- mtcars %>% filter(cyl == 6)
solo.6.cilindros

##                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Valiant        18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Merc 280       19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C      17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Ferrari Dino   19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6

# Double filter: a row is kept only when both conditions hold
# (cyl equal to 6 and hp equal to 110).
cilindros.6.y.110.horsepower <- mtcars %>% filter(cyl == 6 & hp == 110)
cilindros.6.y.110.horsepower

##                 mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4      21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag  21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Hornet 4 Drive 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1

# Keep rows that satisfy one condition, the other, or both:
# gear equal to 4, or cyl greater than 6.
engranes.4.o.cilindros.mas.que.6 <- mtcars %>% filter(gear == 4 | cyl > 6)
engranes.4.o.cilindros.mas.que.6

##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2

# Keep the row(s) whose disp equals the maximum of the disp column.
disp.maximo <- mtcars %>% filter(disp == max(disp))
disp.maximo

##                     mpg cyl disp  hp drat   wt  qsec vs am gear carb
## Cadillac Fleetwood 10.4   8  472 205 2.93 5.25 17.98  0  0    3    4

# Keep the row(s) whose disp equals the minimum of the disp column.
minimo.disp <- mtcars %>% filter(disp == min(disp))
minimo.disp

##                 mpg cyl disp hp drat    wt qsec vs am gear carb
## Toyota Corolla 33.9   4 71.1 65 4.22 1.835 19.9  1  1    4    1

# Double filter with < and >: Time below 3 and weight above 53.
data("ChickWeight")
Pollo.tiempo.y.peso <- ChickWeight %>% filter(Time < 3 & weight > 53)
Pollo.tiempo.y.peso

##   weight Time Chick Diet
## 1     55    2    22    2
## 2     55    2    40    3
## 3     55    2    43    4
## 4     54    2    50    4

# Show the first 10 rows of the airquality table.
data("airquality")
head(airquality, n = 10L)

##    Ozone Solar.R Wind Temp Month Day
## 1     41     190  7.4   67     5   1
## 2     36     118  8.0   72     5   2
## 3     12     149 12.6   74     5   3
## 4     18     313 11.5   62     5   4
## 5     NA      NA 14.3   56     5   5
## 6     28      NA 14.9   66     5   6
## 7     23     299  8.6   65     5   7
## 8     19      99 13.8   59     5   8
## 9      8      19 20.1   61     5   9
## 10    NA     194  8.6   69     5  10

# Drop the rows whose Ozone value is NA, then show the first 8 rows.
Aire.sin.na.en.ozone <- airquality %>% filter(!is.na(Ozone))
head(Aire.sin.na.en.ozone, n = 8L)

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    28      NA 14.9   66     5   6
## 6    23     299  8.6   65     5   7
## 7    19      99 13.8   59     5   8
## 8     8      19 20.1   61     5   9

# From the first 10 rows of airquality, keep only the rows with no NA in
# any column. The dot refers to the piped-in 10-row subset.
Aire.sin.ningun.na <- airquality[1:10, ] %>% filter(complete.cases(.))
Aire.sin.ningun.na

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    23     299  8.6   65     5   7
## 6    19      99 13.8   59     5   8
## 7     8      19 20.1   61     5   9

# Count how many observations fall in each level of iris$Species.
data("iris")
table(iris[["Species"]])

## 
##     setosa versicolor  virginica 
##         50         50         50

# Exclude one category (not a column): keep only the setosa and virginica
# rows. The versicolor level remains in the factor, so table() reports it
# with a count of 0.
iris.con.setosa.y.virginica <- iris %>%
  filter(Species %in% c("setosa", "virginica"))
table(iris.con.setosa.y.virginica[["Species"]])

## 
##     setosa versicolor  virginica 
##         50          0         50

# Number of rows before and after the filter.
nrow(iris)
nrow(iris.con.setosa.y.virginica)

## [1] 150

## [1] 100

# Keep the rows with Ozone above 29 and only the first three columns.
data("airquality")
Aire.solo.3.columnas.y.ozone.mas.que.29 <- airquality %>%
  filter(Ozone > 29) %>%
  dplyr::select(1:3)
head(Aire.solo.3.columnas.y.ozone.mas.que.29)

##   Ozone Solar.R Wind
## 1    41     190  7.4
## 2    36     118  8.0
## 3    34     307 12.0
## 4    30     322 11.5
## 5    32      92 12.0
## 6    45     252 14.9

# Count how many rows share each value of the gear column.
table(mtcars[["gear"]])

## 
##  3  4  5 
## 15 12  5

# Keep only the gear groups that contain more than 10 rows.
# The result is ungrouped at the end so the grouping does not silently
# carry over into later operations on this object.
mayor.frecuencia.de.engranes <- mtcars %>%
  group_by(gear) %>%
  filter(n() > 10) %>%
  ungroup()
table(mayor.frecuencia.de.engranes$gear)

## 
##  3  4 
## 15 12

# Within the gear groups that contain more than 10 rows, keep the rows
# with hp below 105. The result is ungrouped at the end so the grouping
# does not silently carry over into later operations on this object.
mayor.frecuencia.de.gears.y.menos.de.105.horsepower <- mtcars %>%
  group_by(gear) %>%
  filter(n() > 10, hp < 105) %>%
  ungroup()
table(mayor.frecuencia.de.gears.y.menos.de.105.horsepower$gear)

## 
## 3 4 
## 1 7

# List the column names of iris before selecting those that start with "S".
colnames(iris)

## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"

# Keep only the columns whose name starts with "S".
monitor.iris <- dplyr::select(iris, starts_with("S"))
head(monitor.iris)

##   Sepal.Length Sepal.Width Species
## 1          5.1         3.5  setosa
## 2          4.9         3.0  setosa
## 3          4.7         3.2  setosa
## 4          4.6         3.1  setosa
## 5          5.0         3.6  setosa
## 6          5.4         3.9  setosa

# Keep the rows where cyl and hp are both at their column maximum.
# if_all() replaces the superseded filter_at()/all_vars() pair.
carros.nuevos <- mtcars %>%
  filter(if_all(c(cyl, hp), ~ .x == max(.x)))
carros.nuevos

##               mpg cyl disp  hp drat   wt qsec vs am gear carb
## Maserati Bora  15   8  301 335 3.54 3.57 14.6  0  1    5    8

# Copy the msleep dataset shipped with ggplot2 and print it.
datos.msleep <- ggplot2::msleep
datos.msleep

## # A tibble: 83 x 11
##    name   genus vore  order conservation sleep_total sleep_rem sleep_cycle awake
##    <chr>  <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl> <dbl>
##  1 Cheet~ Acin~ carni Carn~ lc                  12.1      NA        NA      11.9
##  2 Owl m~ Aotus omni  Prim~ <NA>                17         1.8      NA       7  
##  3 Mount~ Aplo~ herbi Rode~ nt                  14.4       2.4      NA       9.6
##  4 Great~ Blar~ omni  Sori~ lc                  14.9       2.3       0.133   9.1
##  5 Cow    Bos   herbi Arti~ domesticated         4         0.7       0.667  20  
##  6 Three~ Brad~ herbi Pilo~ <NA>                14.4       2.2       0.767   9.6
##  7 North~ Call~ carni Carn~ vu                   8.7       1.4       0.383  15.3
##  8 Vespe~ Calo~ <NA>  Rode~ <NA>                 7        NA        NA      17  
##  9 Dog    Canis carni Carn~ domesticated        10.1       2.9       0.333  13.9
## 10 Roe d~ Capr~ herbi Arti~ lc                   3        NA        NA      21  
## # ... with 73 more rows, and 2 more variables: brainwt <dbl>, bodywt <dbl>

# Select a subset of columns, then keep the rows where every column whose
# name contains "sleep" is greater than 5.
# if_all() replaces the superseded filter_at()/all_vars() pair.
tipos.de.sleep.mayores.que.5 <- msleep %>%
  select(name, sleep_total:sleep_rem, brainwt:bodywt) %>%
  filter(if_all(contains("sleep"), ~ .x > 5))
tipos.de.sleep.mayores.que.5

## # A tibble: 2 x 5
##   name                 sleep_total sleep_rem brainwt bodywt
##   <chr>                      <dbl>     <dbl>   <dbl>  <dbl>
## 1 Thick-tailed opposum        19.4       6.6  NA       0.37
## 2 Giant armadillo             18.1       6.1   0.081  60

1.2. ARRANGE

# Sort msleep alphabetically by the genus column and show the first four
# columns. The sort is explicit so the result does not depend on the
# order in which the dataset happens to be stored.
Organizar.genus <- arrange(ggplot2::msleep, genus)
Organizar.genus[, 1:4]

## # A tibble: 83 x 4
##    name                       genus       vore  order       
##    <chr>                      <chr>       <chr> <chr>       
##  1 Cheetah                    Acinonyx    carni Carnivora   
##  2 Owl monkey                 Aotus       omni  Primates    
##  3 Mountain beaver            Aplodontia  herbi Rodentia    
##  4 Greater short-tailed shrew Blarina     omni  Soricomorpha
##  5 Cow                        Bos         herbi Artiodactyla
##  6 Three-toed sloth           Bradypus    herbi Pilosa      
##  7 Northern fur seal          Callorhinus carni Carnivora   
##  8 Vesper mouse               Calomys     <NA>  Rodentia    
##  9 Dog                        Canis       carni Carnivora   
## 10 Roe deer                   Capreolus   herbi Artiodactyla
## # ... with 73 more rows

# Sort alphabetically by vore first, then by order within each vore value.
organizar.animal.vore.y.order.alfabeticamente <- msleep %>%
  arrange(vore, order)
organizar.animal.vore.y.order.alfabeticamente[, 1:4]

## # A tibble: 83 x 4
##    name              genus        vore  order    
##    <chr>             <chr>        <chr> <chr>    
##  1 Cheetah           Acinonyx     carni Carnivora
##  2 Northern fur seal Callorhinus  carni Carnivora
##  3 Dog               Canis        carni Carnivora
##  4 Domestic cat      Felis        carni Carnivora
##  5 Gray seal         Haliochoerus carni Carnivora
##  6 Tiger             Panthera     carni Carnivora
##  7 Jaguar            Panthera     carni Carnivora
##  8 Lion              Panthera     carni Carnivora
##  9 Caspian seal      Phoca        carni Carnivora
## 10 Genet             Genetta      carni Carnivora
## # ... with 73 more rows

# Sort by vore alphabetically and, within each vore value, by order in
# reverse alphabetical order.
organizar.animal.vore.alfabeticamente.y.order.alfabeto.inverso <- msleep %>%
  arrange(vore, desc(order))
head(organizar.animal.vore.alfabeticamente.y.order.alfabeto.inverso[, 1:4])

## # A tibble: 6 x 4
##   name                       genus         vore  order          
##   <chr>                      <chr>         <chr> <chr>          
## 1 Northern grasshopper mouse Onychomys     carni Rodentia       
## 2 Slow loris                 Nyctibeus     carni Primates       
## 3 Thick-tailed opposum       Lutreolina    carni Didelphimorphia
## 4 Long-nosed armadillo       Dasypus       carni Cingulata      
## 5 Pilot whale                Globicephalus carni Cetacea        
## 6 Common porpoise            Phocoena      carni Cetacea

1.3. RENAME

# Rename two columns (new name = old name) and list the resulting names.
nuevos.nombres.iris <- iris %>%
  rename(
    width.of.petals = Petal.Width,
    various.plants.and.animals = Species
  )
names(nuevos.nombres.iris)

## [1] "Sepal.Length"               "Sepal.Width"               
## [3] "Petal.Length"               "width.of.petals"           
## [5] "various.plants.and.animals"

1.4. MUTATE

# Show the first 2 rows of ChickWeight.
data("ChickWeight")
head(ChickWeight, n = 2L)

##   weight Time Chick Diet
## 1     42    0     1    1
## 2     51    2     1    1

# Add a log.weight column holding the base-10 logarithm of weight.
weight.con.funcion.log <- ChickWeight %>% mutate(log.weight = log10(weight))
head(weight.con.funcion.log, n = 2L)

##   weight Time Chick Diet log.weight
## 1     42    0     1    1   1.623249
## 2     51    2     1    1   1.707570

# Add a square-root version of every column in msleep[, 6:11].
# across() with a named list replaces the deprecated funs(), so the
# deprecation warning is no longer emitted. The new columns are named
# "<column>_square root".
sleep.con.sqrt <- msleep[, 6:11] %>%
  mutate(across(everything(), list("square root" = sqrt)))

names(sleep.con.sqrt)

##  [1] "sleep_total"             "sleep_rem"              
##  [3] "sleep_cycle"             "awake"                  
##  [5] "brainwt"                 "bodywt"                 
##  [7] "sleep_total_square root" "sleep_rem_square root"  
##  [9] "sleep_cycle_square root" "awake_square root"      
## [11] "brainwt_square root"     "bodywt_square root"

# Print the table together with the newly added columns.
sleep.con.sqrt

## # A tibble: 83 x 12
##    sleep_total sleep_rem sleep_cycle awake  brainwt  bodywt `sleep_total_square~
##          <dbl>     <dbl>       <dbl> <dbl>    <dbl>   <dbl>                <dbl>
##  1        12.1      NA        NA      11.9 NA        50                     3.48
##  2        17         1.8      NA       7    0.0155    0.48                  4.12
##  3        14.4       2.4      NA       9.6 NA         1.35                  3.79
##  4        14.9       2.3       0.133   9.1  0.00029   0.019                 3.86
##  5         4         0.7       0.667  20    0.423   600                     2   
##  6        14.4       2.2       0.767   9.6 NA         3.85                  3.79
##  7         8.7       1.4       0.383  15.3 NA        20.5                   2.95
##  8         7        NA        NA      17   NA         0.045                 2.65
##  9        10.1       2.9       0.333  13.9  0.07     14                     3.18
## 10         3        NA        NA      21    0.0982   14.8                   1.73
## # ... with 73 more rows, and 5 more variables: sleep_rem_square root <dbl>,
## #   sleep_cycle_square root <dbl>, awake_square root <dbl>,
## #   brainwt_square root <dbl>, bodywt_square root <dbl>

# Load the Titanic dataset and convert it to a data frame.
# Note: this rebinds the name Titanic to the data-frame version.
data("Titanic")
Titanic <- as.data.frame(Titanic)
head(Titanic, n = 6L)

##   Class    Sex   Age Survived Freq
## 1   1st   Male Child       No    0
## 2   2nd   Male Child       No    0
## 3   3rd   Male Child       No   35
## 4  Crew   Male Child       No    0
## 5   1st Female Child       No    0
## 6   2nd Female Child       No    0

# Add three columns (Class_Rank, Age_Rank, Survived_Rank) holding the
# descending min_rank() of Class, Age and Survived.
# across() with a named list replaces the deprecated mutate_at()/funs().
titanic.con.ranks <- Titanic %>%
  mutate(across(c(Class, Age, Survived), list(Rank = ~ min_rank(desc(.x)))))
head(titanic.con.ranks)

##   Class    Sex   Age Survived Freq Class_Rank Age_Rank Survived_Rank
## 1   1st   Male Child       No    0         25       17            17
## 2   2nd   Male Child       No    0         17       17            17
## 3   3rd   Male Child       No   35          9       17            17
## 4  Crew   Male Child       No    0          1       17            17
## 5   1st Female Child       No    0         25       17            17
## 6   2nd Female Child       No    0         17       17            17

# Show the first rows of the CO2 table.
head(CO2, n = 6L)

##   Plant   Type  Treatment conc uptake
## 1   Qn1 Quebec nonchilled   95   16.0
## 2   Qn1 Quebec nonchilled  175   30.4
## 3   Qn1 Quebec nonchilled  250   34.8
## 4   Qn1 Quebec nonchilled  350   37.2
## 5   Qn1 Quebec nonchilled  500   35.3
## 6   Qn1 Quebec nonchilled  675   39.2

# Divide every numeric column of CO2 by 10: the helper is defined first
# and then applied inside mutate().
# across(where(is.numeric), ...) replaces the superseded mutate_if().

# Return a.number divided by 10 (works element-wise on vectors).
dividir.cada.numero.en.10 <- function(a.number) {
  a.number / 10
}
tabla.con.nuevas.unidades <- CO2 %>%
  mutate(across(where(is.numeric), dividir.cada.numero.en.10))
head(tabla.con.nuevas.unidades)

##   Plant   Type  Treatment conc uptake
## 1   Qn1 Quebec nonchilled  9.5   1.60
## 2   Qn1 Quebec nonchilled 17.5   3.04
## 3   Qn1 Quebec nonchilled 25.0   3.48
## 4   Qn1 Quebec nonchilled 35.0   3.72
## 5   Qn1 Quebec nonchilled 50.0   3.53
## 6   Qn1 Quebec nonchilled 67.5   3.92

# Build a small table that contains NA values; they are replaced by 0 in
# the following step.
Tabla.nueva <- data.frame(
  alpha = c(22, 1, NA),
  almond = c(0, 5, 10),
  grape = c(0, 2, 2),
  apple = c(NA, 5, 10)
)
Tabla.nueva

##   alpha almond grape apple
## 1    22      0     0    NA
## 2     1      5     2     5
## 3    NA     10     2    10

# Replace every NA in the numeric columns with 0.
# An explicit lambda replaces the superseded mutate_if() call, which passed
# the replacement value through an argument literally named `...`.
tabla.nueva.arreglar.alpha <- Tabla.nueva %>%
  mutate(across(where(is.numeric), ~ coalesce(.x, 0)))
tabla.nueva.arreglar.alpha

##   alpha almond grape apple
## 1    22      0     0     0
## 2     1      5     2     5
## 3     0     10     2    10

# Count how many rows there are for each value of vore; the next step
# drops the values that contain certain letters.
msleep <- ggplot2::msleep
table(msleep[["vore"]])

## 
##   carni   herbi insecti    omni 
##      19      32       5      20

# Keep the rows whose vore value contains neither "c" nor "a".
datos.msleep.que.no.tienen.c.ni.a <- msleep %>%
  filter(str_detect(vore, "c|a", negate = TRUE))
table(datos.msleep.que.no.tienen.c.ni.a[["vore"]])

## 
## herbi  omni 
##    32    20

# Flag with TRUE/FALSE whether each conservation value already appeared in
# an earlier row.
msleep.marcando.duplicados.en.columna.conservation <- msleep %>%
  mutate(duplicate.indicator = duplicated(conservation))
head(msleep.marcando.duplicados.en.columna.conservation, n = 6L)

## # A tibble: 6 x 12
##   name    genus vore  order conservation sleep_total sleep_rem sleep_cycle awake
##   <chr>   <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl> <dbl>
## 1 Cheetah Acin~ carni Carn~ lc                  12.1      NA        NA      11.9
## 2 Owl mo~ Aotus omni  Prim~ <NA>                17         1.8      NA       7  
## 3 Mounta~ Aplo~ herbi Rode~ nt                  14.4       2.4      NA       9.6
## 4 Greate~ Blar~ omni  Sori~ lc                  14.9       2.3       0.133   9.1
## 5 Cow     Bos   herbi Arti~ domesticated         4         0.7       0.667  20  
## 6 Three-~ Brad~ herbi Pilo~ <NA>                14.4       2.2       0.767   9.6
## # ... with 3 more variables: brainwt <dbl>, bodywt <dbl>,
## #   duplicate.indicator <lgl>

# Same duplicate flag on conservation, but display only a few columns; the
# conservation column itself is not among the displayed ones.
msleep.marcando.duplicados.en.columna.conservation <- msleep %>%
  mutate(duplicate.indicator = duplicated(conservation))
msleep.marcando.duplicados.en.columna.conservation[1:6, c(1:3, 12)]

## # A tibble: 6 x 4
##   name                       genus      vore  duplicate.indicator
##   <chr>                      <chr>      <chr> <lgl>              
## 1 Cheetah                    Acinonyx   carni FALSE              
## 2 Owl monkey                 Aotus      omni  FALSE              
## 3 Mountain beaver            Aplodontia herbi FALSE              
## 4 Greater short-tailed shrew Blarina    omni  TRUE               
## 5 Cow                        Bos        herbi FALSE              
## 6 Three-toed sloth           Bradypus   herbi TRUE

# Flag a row when its value is a duplicate in conservation, in genus, or
# in both.
# The second positional argument of duplicated() is `incomparables`, so
# duplicated(conservation, genus) never looked for duplicates in genus;
# each column is therefore checked on its own and the results combined.
msleep.con.2.columnas.para.tener.en.cuenta.el.duplicado <- msleep %>%
  mutate(duplicate.indicator = duplicated(conservation) | duplicated(genus)) %>%
  arrange(conservation, genus)
msleep.con.2.columnas.para.tener.en.cuenta.el.duplicado

## # A tibble: 83 x 12
##    name   genus vore  order conservation sleep_total sleep_rem sleep_cycle awake
##    <chr>  <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl> <dbl>
##  1 Giraf~ Gira~ herbi Arti~ cd                   1.9       0.4      NA      22.1
##  2 Pilot~ Glob~ carni Ceta~ cd                   2.7       0.1      NA      21.4
##  3 Cow    Bos   herbi Arti~ domesticated         4         0.7       0.667  20  
##  4 Dog    Canis carni Carn~ domesticated        10.1       2.9       0.333  13.9
##  5 Guine~ Cavis herbi Rode~ domesticated         9.4       0.8       0.217  14.6
##  6 Chinc~ Chin~ herbi Rode~ domesticated        12.5       1.5       0.117  11.5
##  7 Horse  Equus herbi Peri~ domesticated         2.9       0.6       1      21.1
##  8 Donkey Equus herbi Peri~ domesticated         3.1       0.4      NA      20.9
##  9 Domes~ Felis carni Carn~ domesticated        12.5       3.2       0.417  11.5
## 10 Rabbit Oryc~ herbi Lago~ domesticated         8.4       0.9       0.417  15.6
## # ... with 73 more rows, and 3 more variables: brainwt <dbl>, bodywt <dbl>,
## #   duplicate.indicator <lgl>

# Build a small fruit table from five vectors.
fruit <- c("apple", "pear", "orange", "grape", "orange", "orange")
x <- c(1, 2, 4, 9, 4, 6)
y <- c(22, 3, 4, 55, 15, 9)
z <- c(3, 1, 4, 10, 12, 8)
w <- c(2, 2, 2, 4, 5, 6)
tabla.frutas <- data.frame(fruit = fruit, x = x, y = y, z = z, w = w)
tabla.frutas

##    fruit x  y  z w
## 1  apple 1 22  3 2
## 2   pear 2  3  1 2
## 3 orange 4  4  4 2
## 4  grape 9 55 10 4
## 5 orange 4 15 12 5
## 6 orange 6  9  8 6

# Flag the rows whose fruit value already appeared in an earlier row.
tabla.fruta.duplicado.simple <- tabla.frutas %>%
  mutate(duplicate.indicator = duplicated(fruit))
tabla.fruta.duplicado.simple

##    fruit x  y  z w duplicate.indicator
## 1  apple 1 22  3 2               FALSE
## 2   pear 2  3  1 2               FALSE
## 3 orange 4  4  4 2               FALSE
## 4  grape 9 55 10 4               FALSE
## 5 orange 4 15 12 5                TRUE
## 6 orange 6  9  8 6                TRUE

# Remove column z by assigning NULL to it inside mutate().
fruit <- c("apple", "pear", "orange", "grape", "orange", "orange")
x <- c(1, 2, 4, 9, 4, 6)
y <- c(22, 3, 4, 55, 15, 9)
z <- c(3, 1, 4, 10, 12, 8)
otra.tabla.fruta <- data.frame(fruit = fruit, x = x, y = y, z = z)
df.tabla.fruta <- otra.tabla.fruta %>% mutate(z = NULL)
df.tabla.fruta

##    fruit x  y
## 1  apple 1 22
## 2   pear 2  3
## 3 orange 4  4
## 4  grape 9 55
## 5 orange 4 15
## 6 orange 6  9

# Attach a new package (nycflights13), load its flights dataset and
# display it.
library(nycflights13)
data("flights")
show(flights)

## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

# Add derived columns to flights:
#   gain            - arrival delay minus departure delay
#   hours           - air_time divided by 60
#   gain_per_hour   - gain per hour of air time
#   gain_per_minute - gain per unit of air_time
# gain_per_minute was previously 60 * gain_per_hour, which multiplies the
# hourly rate instead of dividing it; gain / air_time equals
# gain_per_hour / 60.
mutate(
  flights,
  gain = arr_delay - dep_delay,
  hours = air_time / 60,
  gain_per_hour = gain / hours,
  gain_per_minute = gain / air_time
)

## # A tibble: 336,776 x 23
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 336,766 more rows, and 15 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## #   gain <dbl>, hours <dbl>, gain_per_hour <dbl>, gain_per_minute <dbl>

# Same derived columns built step by step, then show only year, month and
# the four new columns.
# gain_per_minute was previously 60 * gain_per_hour, which multiplies the
# hourly rate instead of dividing it; it is now gain_per_hour / 60.
otro.modo.de.hacer.flights <- flights %>%
  mutate(gain = arr_delay - dep_delay, hours = air_time / 60) %>%
  mutate(gain_per_hour = gain / hours) %>%
  mutate(gain_per_minute = gain_per_hour / 60)
otro.modo.de.hacer.flights[1:6, c(1:2, 20:23)]

## # A tibble: 6 x 6
##    year month  gain hours gain_per_hour gain_per_minute
##   <int> <int> <dbl> <dbl>         <dbl>           <dbl>
## 1  2013     1     9  3.78          2.38          0.0396
## 2  2013     1    16  3.78          4.23          0.0705
## 3  2013     1    31  2.67         11.6           0.194
## 4  2013     1   -17  3.05         -5.57         -0.0929
## 5  2013     1   -19  1.93         -9.83         -0.164
## 6  2013     1    16  2.5           6.4           0.107

# Rebuild the fruit table; a new value is derived from its columns in the
# following step.
fruit <- c("apple", "pear", "orange", "grape", "orange", "orange")
x <- c(1, 2, 4, 9, 4, 6)
y <- c(22, 3, 4, 55, 15, 9)
z <- c(3, 1, 4, 10, 12, 8)
otra.tabla.fruta <- data.frame(fruit = fruit, x = x, y = y, z = z)
otra.tabla.fruta

##    fruit x  y  z
## 1  apple 1 22  3
## 2   pear 2  3  1
## 3 orange 4  4  4
## 4  grape 9 55 10
## 5 orange 4 15 12
## 6 orange 6  9  8

# transmute() keeps only the newly computed column: the sum of x, y and z.
uso.de.transmute.en.tabla.de.fruta <- otra.tabla.fruta %>%
  transmute(nueva.variable.sum.x.y.z = x + y + z)
uso.de.transmute.en.tabla.de.fruta

##   nueva.variable.sum.x.y.z
## 1                       26
## 2                        6
## 3                       12
## 4                       74
## 5                       31
## 6                       23

# Define a helper that doubles its input and apply it to every numeric
# column of iris.

# Return x multiplied by 2 (works element-wise on vectors).
doble.de.eso <- function(x) {
  x * 2
}
head(mutate(iris, across(where(is.numeric), doble.de.eso)))

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1         10.2         7.0          2.8         0.4  setosa
## 2          9.8         6.0          2.8         0.4  setosa
## 3          9.4         6.4          2.6         0.4  setosa
## 4          9.2         6.2          3.0         0.4  setosa
## 5         10.0         7.2          2.8         0.4  setosa
## 6         10.8         7.8          3.4         0.8  setosa

# Build another working table. row1 holds the column names; row2..row5
# become the rows. In the following step, whichever case matches sets
# columna.para.cambiar to the number given after the ~ symbol.
row1 <- c("a", "b", "c", "d", "e", "f", "columna.para.cambiar")
row2 <- c(1, 1, 1, 6, 6, 1, 2)
row3 <- c(3, 4, 4, 6, 4, 4, 4)
row4 <- c(4, 6, 25, 5, 5, 2, 9)
row5 <- c(5, 3, 6, 3, 3, 6, 2)
nueva.tabla.1 <- setNames(
  as.data.frame(rbind(row2, row3, row4, row5)),
  row1
)
nueva.tabla.1

##      a b  c d e f columna.para.cambiar
## row2 1 1  1 6 6 1                    2
## row3 3 4  4 6 4 4                    4
## row4 4 6 25 5 5 2                    9
## row5 5 3  6 3 3 6                    2

# Overwrite columna.para.cambiar case by case; the first matching
# condition wins, and rows matching none get NA.
nuevo.mutate.1 <- mutate(
  nueva.tabla.1,
  columna.para.cambiar = case_when(
    a == 2 | a == 5 | a == 7 | (a == 1 & b == 4) ~ 2,
    a == 0 | a == 1 | a == 4 | a == 3 | c == 4 ~ 3,
    TRUE ~ NA_real_
  )
)
nuevo.mutate.1

##      a b  c d e f columna.para.cambiar
## row2 1 1  1 6 6 1                    3
## row3 3 4  4 6 4 4                    3
## row4 4 6 25 5 5 2                    3
## row5 5 3  6 3 3 6                    2

1.5. Select to choose variables/columns

# Rebuild the fruit table; one column is removed from it in the following
# step.
fruit <- c("apple", "pear", "orange", "grape", "orange", "orange")
x <- c(1, 2, 4, 9, 4, 6)
y <- c(22, 3, 4, 55, 15, 9)
z <- c(3, 1, 4, 10, 12, 8)
otra.tabla.fruta <- data.frame(fruit = fruit, x = x, y = y, z = z)
otra.tabla.fruta

##    fruit x  y  z
## 1  apple 1 22  3
## 2   pear 2  3  1
## 3 orange 4  4  4
## 4  grape 9 55 10
## 5 orange 4 15 12
## 6 orange 6  9  8

# Drop the fruit column and keep everything else.
nueva.tabla.sin.fruit <- otra.tabla.fruta %>% dplyr::select(-fruit)
nueva.tabla.sin.fruit

##   x  y  z
## 1 1 22  3
## 2 2  3  1
## 3 4  4  4
## 4 9 55 10
## 5 4 15 12
## 6 6  9  8

# Drop the columns whose name starts with "d".
mtcars.quitar.columnas.que.empiezen.por.d <- mtcars %>%
  select(-starts_with("d"))
head(mtcars.quitar.columnas.que.empiezen.por.d)

##                    mpg cyl  hp    wt  qsec vs am gear carb
## Mazda RX4         21.0   6 110 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6 110 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  93 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6 110 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8 175 3.440 17.02  0  0    3    2
## Valiant           18.1   6 105 3.460 20.22  1  0    3    1

# Drop the columns whose name ends with "t".
mtcars.quitar.columnas.que.terminen.por.t <- mtcars %>%
  select(-ends_with("t"))
head(mtcars.quitar.columnas.que.terminen.por.t)

##                    mpg cyl disp  hp  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 17.02  0  0    3    2
## Valiant           18.1   6  225 105 20.22  1  0    3    1

# Change the column order by building the data frame as fruit, z, y, x.
fruit <- c("apple", "pear", "orange", "grape", "orange", "orange")
x <- c(1, 2, 4, 9, 4, 6)
y <- c(22, 3, 4, 55, 15, 9)
z <- c(3, 1, 4, 10, 12, 8)
otra.tabla.fruta <- data.frame(fruit = fruit, z = z, y = y, x = x)
otra.tabla.fruta

##    fruit  z  y x
## 1  apple  3 22 1
## 2   pear  1  3 2
## 3 orange  4  4 4
## 4  grape 10 55 9
## 5 orange 12 15 4
## 6 orange  8  9 6

# Build another table and apply toupper() to every column name.
# rename_with() replaces the superseded select_all(); only the names are
# upper-cased, the cell values are left as they are.
Estado <- c("Maryland", "Alaska", "Newersey")
ingreso <- c(76067, 74444, 73702)
Poblacion.media <- c(61372, 61372, 61372)
expectativa.de.vida <- c(78.8, 78.3, 80.3)
tres.estados <- data.frame(
  Estado, ingreso, Poblacion.media, expectativa.de.vida
)
tres.estados.us <- rename_with(tres.estados, toupper)
tres.estados.us

##     ESTADO INGRESO POBLACION.MEDIA EXPECTATIVA.DE.VIDA
## 1 Maryland   76067           61372                78.8
## 2   Alaska   74444           61372                78.3
## 3 Newersey   73702           61372                80.3

# Extract the values of the first column as a vector.
colocar.primera.columna <- tres.estados.us %>% pull(1)
colocar.primera.columna

## [1] "Maryland" "Alaska"   "Newersey"

# Extract the values of the last column as a vector (-1 counts from the
# right).
colocar.ultima.columna <- tres.estados.us %>% pull(-1)
colocar.ultima.columna

## [1] 78.8 78.3 80.3

# Number of rows in mtcars (nrow() counts rows, not columns).
nrow(mtcars)

## [1] 32

# Count the rows that have a value above 200 in at least one column.
# if_any() inside filter() replaces the superseded filter_all()/any_vars().
mtcars.mas.que.200.en.cualquier.columna <- filter(
  mtcars,
  if_any(everything(), ~ .x > 200)
)
nrow(mtcars.mas.que.200.en.cualquier.columna)

## [1] 16

#seleccionar datos que no contengan p
carros.sin.p <- mtcars %>% dplyr::select(-contains("p"))
names(carros.sin.p)

## [1] "cyl"  "drat" "wt"   "qsec" "vs"   "am"   "gear" "carb"

#crear un subconjunto que contenga uno o otro condicional
subconjunto.de.carros <- select(mtcars, matches("pg|gea"))
names(subconjunto.de.carros)

## [1] "mpg"  "gear"

1.6. Joins: Manipulations of data from two sources

# State abbreviations paired with state areas.
# data.frame() keeps state.area numeric; cbind() would first build a
# character matrix and turn the areas into strings.
us.areas.de.estados <- data.frame(state.abb, state.area)
us.areas.de.estados[1:3, ]

##   state.abb state.area
## 1        AL      51609
## 2        AK     589757
## 3        AZ     113909

# Lookup table from each state abbreviation to its full name.
us.estado.abreviado.y.completo <- data.frame(state.abb, state.name)
us.estado.abreviado.y.completo[seq_len(3), ]

##   state.abb state.name
## 1        AL    Alabama
## 2        AK     Alaska
## 3        AZ    Arizona

#tabla completa con el nuevo dato "desabreviado" (nombre completo del estado)
us.dos.datos.que.teniamos.mas.nombre.completo <- us.areas.de.estados %>% left_join(us.estado.abreviado.y.completo, by = "state.abb")
head(us.dos.datos.que.teniamos.mas.nombre.completo)

##   state.abb state.area state.name
## 1        AL      51609    Alabama
## 2        AK     589757     Alaska
## 3        AZ     113909    Arizona
## 4        AR      53104   Arkansas
## 5        CA     158693 California
## 6        CO     104247   Colorado

#nueva tabla teams
Persona <- c("Sally","Tom","Frieda","Alfonzo")
goles.en.el.equipo <- c(3,5,2,7)
liga.del.equipo <- c("alpha","beta","gamma", "omicron")
informacion.del.equipo <- data.frame(Persona, goles.en.el.equipo, liga.del.equipo)
informacion.del.equipo

##   Persona goles.en.el.equipo liga.del.equipo
## 1   Sally                  3           alpha
## 2     Tom                  5            beta
## 3  Frieda                  2           gamma
## 4 Alfonzo                  7         omicron

# School table: one grade per person, keyed by Persona.
# Uses `<-` for assignment, consistent with the rest of the script.
Persona <- c("Sally", "Tom", "Bill", "Alfonzo")
calificaciones.colegio <- c("A", "B", "C", "B")
informacion.escolar <- data.frame(Persona, calificaciones.colegio)
informacion.escolar

##   Persona calificaciones.colegio
## 1   Sally                      A
## 2     Tom                      B
## 3    Bill                      C
## 4 Alfonzo                      B

#busca los nombres en comun y coloca los datos relacionados a esas personas en comun de ambas tablas creadas
equipo.y.escuela.personas.comunes <- inner_join(informacion.del.equipo, informacion.escolar, by = "Persona")
equipo.y.escuela.personas.comunes

##   Persona goles.en.el.equipo liga.del.equipo calificaciones.colegio
## 1   Sally                  3           alpha                      A
## 2     Tom                  5            beta                      B
## 3 Alfonzo                  7         omicron                      B

#informacion de la tabla teams que no coincide con la tabla school
informacion.del.equipo.pero.sin.calificacion <- anti_join(informacion.del.equipo, informacion.escolar,by = "Persona")
informacion.del.equipo.pero.sin.calificacion

##   Persona goles.en.el.equipo liga.del.equipo
## 1  Frieda                  2           gamma

#unir las dos tablas asi tengan espacios vacios
total.equipo.y.escuela <- full_join(informacion.del.equipo, informacion.escolar, by = "Persona")
total.equipo.y.escuela

##   Persona goles.en.el.equipo liga.del.equipo calificaciones.colegio
## 1   Sally                  3           alpha                      A
## 2     Tom                  5            beta                      B
## 3  Frieda                  2           gamma                   <NA>
## 4 Alfonzo                  7         omicron                      B
## 5    Bill                 NA            <NA>                      C

# Keep the team rows whose Persona also appears in the school table;
# only the team table's columns are returned.
# The join key is named explicitly, like the other joins in this section,
# so dplyr does not have to guess it and prints no "Joining, by" message.
personas.en.comun.con.solo.los.datos.de.equipo <- semi_join(
  informacion.del.equipo,
  informacion.escolar,
  by = "Persona"
)

personas.en.comun.con.solo.los.datos.de.equipo

##   Persona goles.en.el.equipo liga.del.equipo
## 1   Sally                  3           alpha
## 2     Tom                  5            beta
## 3 Alfonzo                  7         omicron

us.tres.datos <- right_join(us.areas.de.estados,us.estado.abreviado.y.completo, by = "state.abb")
us.tres.datos[1:3,]

##   state.abb state.area state.name
## 1        AL      51609    Alabama
## 2        AK     589757     Alaska
## 3        AZ     113909    Arizona

1.7. Slice

#recuento de filas de msleep
msleep <- ggplot2::msleep
nrow(msleep)

## [1] 83

# Keep only the first 5 rows of msleep.
# slice_head() states the intent directly; the previous negative range
# -6:-n() dropped rows 6..n and only behaved correctly when the table had
# at least 6 rows.
msleep.solo.5.filas <- slice_head(msleep, n = 5)
nrow(msleep.solo.5.filas)

## [1] 5

#tambien se puede seleccionar las filas que se quieren poner
msleep.20.filas <- msleep %>% slice(20:39)
nrow(msleep.20.filas)

## [1] 20

#podemos restar o quitar las que ya existen en la otra tabla
nrow(msleep)-nrow(msleep.20.filas)

## [1] 63

1.8. summarise

#nuevos datos sobre pacientes y la cantidad de ellos
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

data(gehan)
pacientes <- gehan
library(tidyverse)
pacientes %>% summarise( kount = n())

##   kount
## 1    42

#pacientes con tratamiento o sin tratamiento
pacientes %>% group_by(treat) %>% summarise(kount=n())

## # A tibble: 2 x 2
##   treat   kount
##   <fct>   <int>
## 1 6-MP       21
## 2 control    21

#nos muestran algunas funciones aplicables a los datos.
pacientes %>% group_by(treat) %>% summarise(tiempo.promedio.remiso = mean(time), tiempo.medio.remiso = median(time), desviacion.est.tiempo.remiso = sd(time), desviacion.abso.media.tiempo = mad(time), rangos.por.cuartiles.tiempo = IQR(time))

## # A tibble: 2 x 6
##   treat   tiempo.promedio.~ tiempo.medio.re~ desviacion.est.t~ desviacion.abso.~
##   <fct>               <dbl>            <int>             <dbl>             <dbl>
## 1 6-MP                17.1                16             10.0              10.4 
## 2 control              8.67                8              6.47              5.93
## # ... with 1 more variable: rangos.por.cuartiles.tiempo <dbl>

#tiempos minimos y tiempos maximos
pacientes %>% group_by(treat) %>% summarise(remision.minima = min(time), remision.maxima = max(time))

## # A tibble: 2 x 3
##   treat   remision.minima remision.maxima
##   <fct>             <int>           <int>
## 1 6-MP                  6              35
## 2 control               1              23

#nuevos datos de encuesta a pacientes
library(MASS)
encuesta.pacientes <- survey [1:10,]
library(dplyr)
head(encuesta.pacientes)

##      Sex Wr.Hnd NW.Hnd W.Hnd    Fold Pulse    Clap Exer Smoke Height      M.I
## 1 Female   18.5   18.0 Right  R on L    92    Left Some Never 173.00   Metric
## 2   Male   19.5   20.5  Left  R on L   104    Left None Regul 177.80 Imperial
## 3   Male   18.0   13.3 Right  L on R    87 Neither None Occas     NA     <NA>
## 4   Male   18.8   18.9 Right  R on L    NA Neither None Never 160.00   Metric
## 5   Male   20.0   20.0 Right Neither    35   Right Some Never 165.00   Metric
## 6 Female   18.0   17.7 Right  L on R    64   Right Some Never 172.72 Imperial
##      Age
## 1 18.250
## 2 17.583
## 3 16.917
## 4 20.333
## 5 23.667
## 6 21.000

#na.omit()  excluye todos los na
#selecciona solo los datos numericos y le calcula la media
encuesta.pacientes %>% na.omit() %>% group_by(Sex) %>% summarise(across(where(is.numeric), mean, .names = "mean_{col}")) %>% head()

## # A tibble: 2 x 6
##   Sex    mean_Wr.Hnd mean_NW.Hnd mean_Pulse mean_Height mean_Age
##   <fct>        <dbl>       <dbl>      <dbl>       <dbl>    <dbl>
## 1 Female        17.8        17.7       76.7        168.     25.0
## 2 Male          19.1        19.2       76.8        174.     20.3

#organizar vore y order y contar cuantos hay de cada uno de ellos
msleep.con.conteo.de.vore.y.order.y.organizado <- msleep %>% group_by(vore, order) 
sleep.vor.y.order <- summarise(msleep.con.conteo.de.vore.y.order.y.organizado, n())

## `summarise()` has grouped output by 'vore'. You can override using the `.groups` argument.

sleep.vor.y.order

## # A tibble: 32 x 3
## # Groups:   vore [5]
##    vore  order           `n()`
##    <chr> <chr>           <int>
##  1 carni Carnivora          12
##  2 carni Cetacea             3
##  3 carni Cingulata           1
##  4 carni Didelphimorphia     1
##  5 carni Primates            1
##  6 carni Rodentia            1
##  7 herbi Artiodactyla        5
##  8 herbi Diprotodontia       1
##  9 herbi Hyracoidea          2
## 10 herbi Lagomorpha          1
## # ... with 22 more rows

sleep.total <- msleep %>% group_by(vore, order) %>% summarise(n())

## `summarise()` has grouped output by 'vore'. You can override using the `.groups` argument.

sleep.total

## # A tibble: 32 x 3
## # Groups:   vore [5]
##    vore  order           `n()`
##    <chr> <chr>           <int>
##  1 carni Carnivora          12
##  2 carni Cetacea             3
##  3 carni Cingulata           1
##  4 carni Didelphimorphia     1
##  5 carni Primates            1
##  6 carni Rodentia            1
##  7 herbi Artiodactyla        5
##  8 herbi Diprotodontia       1
##  9 herbi Hyracoidea          2
## 10 herbi Lagomorpha          1
## # ... with 22 more rows

1.9. Gathering: convert multiple columns into one

#creamos una tabla con años separados
Estado <- c("Maryland", "Alaska", "New Jersey")
ingreso <- c(76067,74444,73702)
poblacion.media <- c(61372,61372,61372)
expectativa.de.vida <- c(78.8,78.3,80.3)
tasa.natalidad.en.adolescentes.2015 <- c(17,29.3,12.1)
tasa.natalidad.en.adolescentes.2007 <- c(34.3,42.9,24.9)
tasa.natalidad.en.adolescentes.1991 <- c(54.1, 66, 41.3)
tres.estados.importantes <- data.frame(Estado, ingreso, poblacion.media, expectativa.de.vida, tasa.natalidad.en.adolescentes.2015, tasa.natalidad.en.adolescentes.2007, tasa.natalidad.en.adolescentes.1991)
names(tres.estados.importantes) <- c("estado", "ingreso", "poblacion.media", "expectativa.de.vida","2015","2007","1991")
tres.estados.importantes

##       estado ingreso poblacion.media expectativa.de.vida 2015 2007 1991
## 1   Maryland   76067           61372                78.8 17.0 34.3 54.1
## 2     Alaska   74444           61372                78.3 29.3 42.9 66.0
## 3 New Jersey   73702           61372                80.3 12.1 24.9 41.3

#reorganizamos la tabla anterior con los años en una sola columna
nuevo.orden.tres.estados.importantes <- tres.estados.importantes %>% gather("2015", "2007", "1991", key = "year", value = "cases")
nuevo.orden.tres.estados.importantes

##       estado ingreso poblacion.media expectativa.de.vida year cases
## 1   Maryland   76067           61372                78.8 2015  17.0
## 2     Alaska   74444           61372                78.3 2015  29.3
## 3 New Jersey   73702           61372                80.3 2015  12.1
## 4   Maryland   76067           61372                78.8 2007  34.3
## 5     Alaska   74444           61372                78.3 2007  42.9
## 6 New Jersey   73702           61372                80.3 2007  24.9
## 7   Maryland   76067           61372                78.8 1991  54.1
## 8     Alaska   74444           61372                78.3 1991  66.0
## 9 New Jersey   73702           61372                80.3 1991  41.3

1.10. spreading: consolidation of multiple rows into one

# Create a new table with an id, a type and an answer (one answer is NA).
# tibble() replaces data_frame(), which is deprecated and emitted a
# lifecycle warning.
nuevos.datos.1 <- tibble(
  n = 1:4,
  Type = c("TypeA", "TypeA", "TypeB", "TypeB"),
  Answer = c("Si", "No", NA, "No")
)

nuevos.datos.1

## # A tibble: 4 x 3
##       n Type  Answer
##   <int> <chr> <chr> 
## 1     1 TypeA Si    
## 2     2 TypeA No    
## 3     3 TypeB <NA>  
## 4     4 TypeB No

# Drop the row with a missing answer, then turn each Answer value into its
# own column holding the matching n (one row per Type).
# pivot_wider() replaces the superseded spread(); names_sort = TRUE keeps
# the new columns in sorted order (No, Si), as spread() produced them.
reorganizar.nuevos.datos.1 <- nuevos.datos.1 %>%
  filter(!is.na(Answer)) %>%
  pivot_wider(names_from = Answer, values_from = n, names_sort = TRUE)
reorganizar.nuevos.datos.1

## # A tibble: 2 x 3
##   Type     No    Si
##   <chr> <int> <int>
## 1 TypeA     2     1
## 2 TypeB     4    NA

1.11. Separate: divide a single column into multiple columns

#tenemos en una sola casilla tres valores distintos
Estado <- c("Maryland", "Alaska", "New Jersey")
ingreso <- c(76067,74444,73702)
poblacion.media <- c(61372,61372,61372)
expectativa.de.vida <- c(78.8,78.3,80.3)
nacimiento.jovenes <- c("17//34.3//54.1", "29.0//42.9//66.0", "12.1//24.9//41.3")
tres.estados <- data.frame(Estado, ingreso, poblacion.media, expectativa.de.vida, nacimiento.jovenes)
tres.estados

##       Estado ingreso poblacion.media expectativa.de.vida nacimiento.jovenes
## 1   Maryland   76067           61372                78.8     17//34.3//54.1
## 2     Alaska   74444           61372                78.3   29.0//42.9//66.0
## 3 New Jersey   73702           61372                80.3   12.1//24.9//41.3

#separamos los valores que tenian "//" en el grupo de los tres años respectivos
tres.estados.años.separados <- tres.estados %>% separate(nacimiento.jovenes, into = c("2015", "2007","1991"), sep = "//")
tres.estados.años.separados

##       Estado ingreso poblacion.media expectativa.de.vida 2015 2007 1991
## 1   Maryland   76067           61372                78.8   17 34.3 54.1
## 2     Alaska   74444           61372                78.3 29.0 42.9 66.0
## 3 New Jersey   73702           61372                80.3 12.1 24.9 41.3

1.12. Recap of handy DPLYR functions

#recordando tablas previas
tabla.sleep.1 <- mutate(msleep.marcando.duplicados.en.columna.conservation, kount = n())
tabla.sleep.1[1:5, c(1:4, 10:12)]

## # A tibble: 5 x 7
##   name                       genus      vore  order  brainwt  bodywt duplicate.indic~
##   <chr>                      <chr>      <chr> <chr>    <dbl>   <dbl> <lgl>           
## 1 Cheetah                    Acinonyx   carni Carn~ NA        50     FALSE           
## 2 Owl monkey                 Aotus      omni  Prim~  0.0155    0.48  FALSE           
## 3 Mountain beaver            Aplodontia herbi Rode~ NA         1.35  FALSE           
## 4 Greater short-tailed shrew Blarina    omni  Sori~  0.00029   0.019 TRUE            
## 5 Cow                        Bos        herbi Arti~  0.423   600     FALSE

#reducimos otra fila de la tabla
Tabla.sleep.2 <- filter(msleep.marcando.duplicados.en.columna.conservation, n() > 14)
Tabla.sleep.2[1:5,c(1:4,10:11)]

## # A tibble: 5 x 6
##   name                       genus      vore  order         brainwt  bodywt
##   <chr>                      <chr>      <chr> <chr>           <dbl>   <dbl>
## 1 Cheetah                    Acinonyx   carni Carnivora    NA        50    
## 2 Owl monkey                 Aotus      omni  Primates      0.0155    0.48 
## 3 Mountain beaver            Aplodontia herbi Rodentia     NA         1.35 
## 4 Greater short-tailed shrew Blarina    omni  Soricomorpha  0.00029   0.019
## 5 Cow                        Bos        herbi Artiodactyla  0.423   600

#mostrar la primera entrada
descripcion.del.salario <- c("Golden parachute type","Well to do", "Average","Below average", "bring date seeds instead of flowers")
first(descripcion.del.salario)

## [1] "Golden parachute type"

#ultima entrada
last(descripcion.del.salario)

## [1] "bring date seeds instead of flowers"

#entrada 3 de ultima a primera
nth(descripcion.del.salario, -3)

## [1] "Average"

#segunda entrada de primera a ultima
nth(descripcion.del.salario, 2)

## [1] "Well to do"

#contar los elementos del vector (longitud)
vector.1 <- c(22,33,44,1,2,3,3,3,4)
longitud.vector.1 <- length(vector.1)
longitud.vector.1

## [1] 9

#longitud sin elementos duplicados
longitud.sin.elementos.duplicados.vector.1 <- n_distinct(vector.1)
longitud.sin.elementos.duplicados.vector.1

## [1] 7

#condicion para saber si existe diferencias entre un vector y otro y que nos arroje una respuesta
prueba.vector.1 <- if_else(longitud.vector.1 == longitud.sin.elementos.duplicados.vector.1, "Valores unicos","valores duplicados")
prueba.vector.1

## [1] "valores duplicados"

#nuevo vector
vector.2 <- c(1,2,3,4,5,6)
length(vector.2)

## [1] 6

#descarte de elementos duplicados
longitud.sin.elementos.duplicados.vector.2 <- n_distinct(vector.2)
longitud.sin.elementos.duplicados.vector.2

## [1] 6

#resultado de la prueba de duplicados
prueba.vector.2 <- if_else(length(vector.2) == longitud.sin.elementos.duplicados.vector.2, "valores unicos", "valores duplicados" )
prueba.vector.2

## [1] "valores unicos"

#dividir 5000 en cada valor del vector
Otra.prueba.1 <- c(100, 0, 999)
tabla.x1 <- 5000/Otra.prueba.1
tabla.x1

## [1] 50.000000       Inf  5.005005

#si hay un cero en el divisor, se puede reemplazar por NA, ya que la division por cero es indefinida
tabla.x2 <- 5000/na_if(Otra.prueba.1,0)
tabla.x2

## [1] 50.000000        NA  5.005005

#arrojar el tipo de datos obtenidos
class(tabla.x1)

## [1] "numeric"

#tabla con valor na
Tabla.x3 <- c(33,4,11,NA,9)
Tabla.x3

## [1] 33  4 11 NA  9

#cambiar na por 0
remover.na.tabla.x3 <- coalesce(Tabla.x3,0)
remover.na.tabla.x3

## [1] 33  4 11  0  9

1.13. Ranking functions

#calcular el rango (posicion en orden creciente) de cada valor del vector
vector.3 <- c(100,4,12,6,8,3)
orden.valores <-row_number(vector.3)
orden.valores

## [1] 6 2 5 3 4 1

# Show the smallest value: the element whose rank is 1.
# orden.valores holds the rank of each element, not a sorting permutation,
# so it is compared against the wanted rank instead of being used as an
# index (indexing by rank only gives the right answer by coincidence).
vector.3[orden.valores == 1]

## [1] 3

# Show the largest value: the element whose rank is 6 (the vector has 6
# elements). The rank vector is compared, not used as a positional index,
# because a rank is not the position of the element in the original vector.
vector.3[orden.valores == 6]

## [1] 100

#rango minimo
rango.min <- min_rank(vector.3)
rango.min

## [1] 6 2 5 3 4 1

#rango denso
rango.denso <- dense_rank(vector.3)
rango.denso

## [1] 6 2 5 3 4 1

#rango porcentual 
rango.porcentual <- percent_rank(vector.3)
rango.porcentual

## [1] 1.0 0.2 0.8 0.4 0.6 0.0

#funcion de distribucion acumulativa
fun.distribucion.acum <- cume_dist(vector.3)
fun.distribucion.acum

## [1] 1.0000000 0.3333333 0.8333333 0.5000000 0.6666667 0.1666667

#agrupar en 3 grupos los 6 datos
agrup.datos.en.3.paquetes. <- ntile(vector.3, 3)
agrup.datos.en.3.paquetes.

## [1] 3 1 3 2 2 1

# Deciles (0%, 10%, ..., 100%) of a vector using quantile algorithm type 5.
# Argument names are written out in full (probs, length.out) instead of
# relying on R's partial argument matching.
vector.4 <- c(2, 22, 33, 44, 77, 89, 99)
quantile(vector.4, probs = seq(0, 1, length.out = 11), type = 5)

##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##  2.0  6.0 20.0 28.6 36.3 44.0 67.1 81.8 90.0 97.0 99.0

1.14. Sampling

#mostrar aleatoriamente filas de un grupo de datos
data("ChickWeight")
muestra.aleat.pollo <- sample_n(ChickWeight, 10)
muestra.aleat.pollo

##    weight Time Chick Diet
## 1      87   10     4    1
## 2     146   12    25    2
## 3      49    2    42    4
## 4      81   18    13    1
## 5      72    8    12    1
## 6      42    0     4    1
## 7      41    0    38    3
## 8     123   12    31    3
## 9     198   16    36    3
## 10    125   14     1    1

#Recomendacion: fijar una semilla para que las muestras aleatorias siguientes sean reproducibles
set.seed(833)

#se pueden muestrear aleatoriamente filas con repeticion (replace = TRUE) o sin repeticion (replace = FALSE)
muestra.aleat.pollo.repetidos <- sample_n(ChickWeight, 10, replace = TRUE)
muestra.aleat.pollo.repetidos

##    weight Time Chick Diet
## 1      98    8    45    4
## 2      42    0    17    1
## 3      98    8    36    3
## 4      51    2    11    1
## 5     198   20     3    1
## 6     237   21    49    4
## 7     205   16    50    4
## 8     170   16    39    3
## 9     332   18    35    3
## 10    144   14    33    3

#le colocamos mayor peso a los carros con mayores cyl
muestra.aleat.carros <- sample_n(mtcars, 12, weight = cyl)
muestra.aleat.carros[,1:5]

##                     mpg cyl  disp  hp drat
## AMC Javelin        15.2   8 304.0 150 3.15
## Porsche 914-2      26.0   4 120.3  91 4.43
## Merc 280           19.2   6 167.6 123 3.92
## Cadillac Fleetwood 10.4   8 472.0 205 2.93
## Merc 240D          24.4   4 146.7  62 3.69
## Datsun 710         22.8   4 108.0  93 3.85
## Merc 280C          17.8   6 167.6 123 3.92
## Mazda RX4 Wag      21.0   6 160.0 110 3.90
## Merc 450SLC        15.2   8 275.8 180 3.07
## Chrysler Imperial  14.7   8 440.0 230 3.23
## Maserati Bora      15.0   8 301.0 335 3.54
## Valiant            18.1   6 225.0 105 2.76

#colocamos la fraccion de los datos que queremos que aparezcan (0.04 = 4 %)
datos.aleat.por.porcentaje <- sample_frac(ChickWeight, 0.04)
datos.aleat.por.porcentaje

##    weight Time Chick Diet
## 1      48    2    13    1
## 2      62    6    12    1
## 3     197   20    45    4
## 4     234   18    42    4
## 5      58    4    28    2
## 6     163   16     3    1
## 7     103    8    41    4
## 8     103    8    42    4
## 9     120   18    19    1
## 10     48    2    36    3
## 11     80    6    48    4
## 12    137   12    33    3
## 13    154   12    40    3
## 14     40    0     2    1
## 15    138   14    44    4
## 16    240   14    21    2
## 17    130   12    39    3
## 18     66    4    40    3
## 19    166   14    49    4
## 20    152   12    49    4
## 21    109   10    38    3
## 22    256   21    31    3
## 23    103   10     2    1

#seleccionar el porcentaje especificado y separar por color de cabello
Color.de.cabello.star.wars <- starwars %>% group_by(hair_color)
datos.aleat.cabello.star.wars <- sample_frac(Color.de.cabello.star.wars, .07, replace = TRUE)
datos.aleat.cabello.star.wars[,1:5]

## # A tibble: 5 x 5
## # Groups:   hair_color [3]
##   name          height  mass hair_color skin_color      
##   <chr>          <int> <dbl> <chr>      <chr>           
## 1 Barriss Offee    166    50 black      yellow          
## 2 Han Solo         180    80 brown      fair            
## 3 Shaak Ti         178    57 none       red, blue, white
## 4 Darth Vader      202   136 none       white           
## 5 Plo Koon         188    80 none       orange

#contar las filas unicamente de un grupo de datos
contar.filas.unicamente <- ChickWeight %>% tally()
contar.filas.unicamente

##     n
## 1 578

#contar valores en una columna separados por cada variacion
conteo.en.dieta <- ChickWeight %>% count(Diet)
conteo.en.dieta

##   Diet   n
## 1    1 220
## 2    2 120
## 3    3 120
## 4    4 118

1.15. Miscellaneous dplyr functions

#contar especies que solo tienen un miembro
conteo.especies.un.solo.miembro <- starwars %>% add_count(species) %>% filter(n == 1)
conteo.especies.un.solo.miembro[,1:6]

## # A tibble: 29 x 6
##    name                  height  mass hair_color skin_color       eye_color
##    <chr>                  <int> <dbl> <chr>      <chr>            <chr>    
##  1 Greedo                   173    74 <NA>       green            black    
##  2 Jabba Desilijic Tiure    175  1358 <NA>       green-tan, brown orange   
##  3 Yoda                      66    17 white      green            brown    
##  4 Bossk                    190   113 none       green            red      
##  5 Ackbar                   180    83 none       brown mottle     orange   
##  6 Wicket Systri Warrick     88    20 brown      brown            brown    
##  7 Nien Nunb                160    68 none       grey             black    
##  8 Nute Gunray              191    90 none       mottled green    red      
##  9 Watto                    137    NA black      blue, grey       yellow   
## 10 Sebulba                  112    40 none       grey, red        orange   
## # ... with 19 more rows

#recuerdo datos mtcars
data(mtcars)
names(mtcars)

##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"

#cambiar un nombre mtcars
mtcars <- rename(mtcars, mgp.cambiado = mpg)
names(mtcars)

##  [1] "mgp.cambiado" "cyl"          "disp"         "hp"           "drat"        
##  [6] "wt"           "qsec"         "vs"           "am"           "gear"        
## [11] "carb"

#nueva variable base a partir de otras variables
data(starwars)
nuevo.star.wars <- starwars %>% dplyr::select(name, mass, gender, species, height) %>% mutate(type = case_when(height > 200 | mass > 200 ~ "large", species == "Droid" ~ "robot", TRUE ~ "other"))
nuevo.star.wars

## # A tibble: 87 x 6
##    name                mass gender    species height type 
##    <chr>              <dbl> <chr>     <chr>    <int> <chr>
##  1 Luke Skywalker        77 masculine Human      172 other
##  2 C-3PO                 75 masculine Droid      167 robot
##  3 R2-D2                 32 masculine Droid       96 robot
##  4 Darth Vader          136 masculine Human      202 large
##  5 Leia Organa           49 feminine  Human      150 other
##  6 Owen Lars            120 masculine Human      178 other
##  7 Beru Whitesun lars    75 feminine  Human      165 other
##  8 R5-D4                 32 masculine Droid       97 robot
##  9 Biggs Darklighter     84 masculine Human      183 other
## 10 Obi-Wan Kenobi        77 masculine Human      182 other
## # ... with 77 more rows

Capitulo 1 DPLYR

R Markdown

1.1 FILTER COMMANDS

1.2. ARRANGE

1.3. RENAME

1.4. MUTATE

1.5. Select to choose variables/columns

1.6. Joins: Manipulations of data from two sources

1.7. Slice

1.8. summarise

1.9. Gathering: convert multiple columns into one

1.10. spreading: consolidation of multiple rows into one

1.11. Separate: divide a single column into multiple columns

1.12. Recap of handy DPLYR functions

1.13. Ranking functions

1.14. Sampling

1.15. Miscellaneous dplyr functions