capitulo1

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

if (!require("tidyverse")) install.packages("tidyverse")

## Loading required package: tidyverse

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(tidyverse)
data("mtcars")

############# 1.1 Comandos de filtro

##### 1.1.1 filtrar por una condicion unica 
# Keep only the mtcars rows for cars with exactly 6 cylinders.
# Inside filter(), equality is written with `==`.
six.cyl.only <- mtcars %>%
  filter(cyl == 6)
six.cyl.only

##                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Valiant        18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Merc 280       19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C      17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Ferrari Dino   19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6

#####  1.1.2 Filtrar por multiples condiciones 
# Keep only the mtcars rows with 6 cylinders AND 110 horsepower.
six.cylinders.and.110.horse.power <- mtcars %>%
  filter(cyl == 6 & hp == 110)
six.cylinders.and.110.horse.power

##                 mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4      21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag  21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Hornet 4 Drive 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1

##### 1.1.3 "OR" para filtrar de manera logica 
# Keep the mtcars rows that have 4 gears OR more than 6 cylinders.
# NOTE(review): the variable name says "more.than.8", but the condition
# actually used is cyl > 6.
gear.eq.4.or.more.than.8 <- mtcars %>%
  filter(gear == 4 | cyl > 6)
gear.eq.4.or.more.than.8

##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2

##### 1.1.4 filtrar por minimos, maximos y otros criterios numericos
# Keep the car(s) whose engine displacement equals the minimum in the data.
smallest.engine.displacement <- mtcars %>%
  filter(disp == min(disp))
smallest.engine.displacement

##                 mpg cyl disp hp drat    wt qsec vs am gear carb
## Toyota Corolla 33.9   4 71.1 65 4.22 1.835 19.9  1  1    4    1

# Filter on more than one condition: Time below 3 and weight above 53.
data("ChickWeight")
chick.subset <- ChickWeight %>%
  filter(Time < 3 & weight > 53)
chick.subset

##   weight Time Chick Diet
## 1     55    2    22    2
## 2     55    2    40    3
## 3     55    2    43    4
## 4     54    2    50    4

##### 1.1.5 Filtrar valores perdidos para una columna especifica
data("airquality")
head(airquality,10) #Muestra los 10 primeros datos

##    Ozone Solar.R Wind Temp Month Day
## 1     41     190  7.4   67     5   1
## 2     36     118  8.0   72     5   2
## 3     12     149 12.6   74     5   3
## 4     18     313 11.5   62     5   4
## 5     NA      NA 14.3   56     5   5
## 6     28      NA 14.9   66     5   6
## 7     23     299  8.6   65     5   7
## 8     19      99 13.8   59     5   8
## 9      8      19 20.1   61     5   9
## 10    NA     194  8.6   69     5  10

# Drop every row whose Ozone value is missing, then show the first 8 rows.
no.missing.ozone <- filter(airquality, !is.na(Ozone))
head(no.missing.ozone, 8)

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    28      NA 14.9   66     5   6
## 6    23     299  8.6   65     5   7
## 7    19      99 13.8   59     5   8
## 8     8      19 20.1   61     5   9

##### 1.1.6 Filtrar filas con valores faltantes en cualquier lugar del conjunto de datos 
# complete.cases() flags the rows with no missing value in any column; using
# it inside filter() drops every row of the first 15 that contains an NA.
airqual.no.NA.anywhere <- airquality[1:15, ] %>%
  filter(complete.cases(.))
airqual.no.NA.anywhere

##    Ozone Solar.R Wind Temp Month Day
## 1     41     190  7.4   67     5   1
## 2     36     118  8.0   72     5   2
## 3     12     149 12.6   74     5   3
## 4     18     313 11.5   62     5   4
## 5     23     299  8.6   65     5   7
## 6     19      99 13.8   59     5   8
## 7      8      19 20.1   61     5   9
## 8     16     256  9.7   69     5  12
## 9     11     290  9.2   66     5  13
## 10    14     274 10.9   68     5  14
## 11    18      65 13.2   58     5  15

##### 1.1.7 Filtrar por "%in%"
data("iris")
table(iris$Species) #muestra las especies que hay en "iris"

## 
##     setosa versicolor  virginica 
##         50         50         50

# Keep only the species "setosa" and "virginica".
iris.two.species <- iris %>%
  filter(Species %in% c("setosa", "virginica"))
table(iris.two.species$Species)

## 
##     setosa versicolor  virginica 
##         50          0         50

# Show the number of rows before and after applying the filter.
nrow(iris)
nrow(iris.two.species)

## [1] 150

## [1] 100

##### 1.1.8 Filtrar por una caracteristica e incluir solo 3 columnas
# Filter by Ozone above 29 and keep only the first three columns.
data("airquality")
airqual.3.columns <- airquality %>%
  filter(Ozone > 29) %>%
  select(1:3)
head(airqual.3.columns)

##   Ozone Solar.R Wind
## 1    41     190  7.4
## 2    36     118  8.0
## 3    34     307 12.0
## 4    30     322 11.5
## 5    32      92 12.0
## 6    45     252 14.9

##### 1.1.9 Filtrar por frecuencia total de un valor en todas las filas
# muestra aquellos valores de "gear" que superan las 10 filas (valor que se repite mas de 10 veces)
table(mtcars$gear)

## 
##  3  4  5 
## 15 12  5

# Keep only the rows whose `gear` value occurs in more than 10 rows, then drop
# the grouping so later verbs are not silently applied per gear.
more.frequent.no.of.gears <- mtcars %>%
  group_by(gear) %>%
  filter(n() > 10) %>%
  ungroup()
table(more.frequent.no.of.gears$gear)

## 
##  3  4 
## 15 12

# Keep the rows whose `gear` value occurs in more than 10 rows AND whose
# horsepower is below 105; ungroup afterwards so the result carries no
# leftover grouping.
more.frequent.no.of.gears.and.low.horsepower <- mtcars %>%
  group_by(gear) %>%
  filter(n() > 10, hp < 105) %>%
  ungroup()
table(more.frequent.no.of.gears.and.low.horsepower$gear)

## 
## 3 4 
## 1 7

##### 1.1.10 Filtrar por columna usando "starts_with"
# seleccionar columnas que empiezan con S
names(iris)

## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"

# Keep only the iris columns whose names start with "S".
iris.display <- dplyr::select(iris, starts_with("S"))
head(iris.display)

##   Sepal.Length Sepal.Width Species
## 1          5.1         3.5  setosa
## 2          4.9         3.0  setosa
## 3          4.7         3.2  setosa
## 4          4.6         3.1  setosa
## 5          5.0         3.6  setosa
## 6          5.4         3.9  setosa

##### 1.1.11 Filtrar filas: las columnas cumplen los criterios (filter_at)
# Keep the car(s) that hold the maximum of BOTH cyl and hp at the same time.
# if_all() replaces the superseded filter_at()/all_vars() pair: the condition
# must be TRUE for every selected column.
new.mtcars <- mtcars %>%
  filter(if_all(c(cyl, hp), ~ .x == max(.x)))
new.mtcars

##               mpg cyl disp  hp drat   wt qsec vs am gear carb
## Maserati Bora  15   8  301 335 3.54 3.57 14.6  0  1    5    8

#El filtro siguiente examina solo las variables cuyo nombre contiene la palabra "sleep" (en este caso, dos de ellas) y conserva las filas en las que todas esas variables son mayores que 5. El "." representa cada una de las variables con "sleep" en el nombre
msleep <- ggplot2::msleep
msleep

## # A tibble: 83 x 11
##    name   genus vore  order conservation sleep_total sleep_rem sleep_cycle awake
##    <chr>  <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl> <dbl>
##  1 Cheet~ Acin~ carni Carn~ lc                  12.1      NA        NA      11.9
##  2 Owl m~ Aotus omni  Prim~ <NA>                17         1.8      NA       7  
##  3 Mount~ Aplo~ herbi Rode~ nt                  14.4       2.4      NA       9.6
##  4 Great~ Blar~ omni  Sori~ lc                  14.9       2.3       0.133   9.1
##  5 Cow    Bos   herbi Arti~ domesticated         4         0.7       0.667  20  
##  6 Three~ Brad~ herbi Pilo~ <NA>                14.4       2.2       0.767   9.6
##  7 North~ Call~ carni Carn~ vu                   8.7       1.4       0.383  15.3
##  8 Vespe~ Calo~ <NA>  Rode~ <NA>                 7        NA        NA      17  
##  9 Dog    Canis carni Carn~ domesticated        10.1       2.9       0.333  13.9
## 10 Roe d~ Capr~ herbi Arti~ lc                   3        NA        NA      21  
## # ... with 73 more rows, and 2 more variables: brainwt <dbl>, bodywt <dbl>

# Keep name, the sleep_total:sleep_rem range and the brainwt:bodywt range,
# then retain only the rows where every column whose name contains "sleep"
# is greater than 5. if_all() replaces the superseded filter_at()/all_vars().
msleep.over.5 <- msleep %>%
  select(name, sleep_total:sleep_rem, brainwt:bodywt) %>%
  filter(if_all(contains("sleep"), ~ . > 5))
msleep.over.5

## # A tibble: 2 x 5
##   name                 sleep_total sleep_rem brainwt bodywt
##   <chr>                      <dbl>     <dbl>   <dbl>  <dbl>
## 1 Thick-tailed opposum        19.4       6.6  NA       0.37
## 2 Giant armadillo             18.1       6.1   0.081  60

############### 1.2 Arreglar (sort)

msleep <- ggplot2::msleep
msleep[,1:4]

## # A tibble: 83 x 4
##    name                       genus       vore  order       
##    <chr>                      <chr>       <chr> <chr>       
##  1 Cheetah                    Acinonyx    carni Carnivora   
##  2 Owl monkey                 Aotus       omni  Primates    
##  3 Mountain beaver            Aplodontia  herbi Rodentia    
##  4 Greater short-tailed shrew Blarina     omni  Soricomorpha
##  5 Cow                        Bos         herbi Artiodactyla
##  6 Three-toed sloth           Bradypus    herbi Pilosa      
##  7 Northern fur seal          Callorhinus carni Carnivora   
##  8 Vesper mouse               Calomys     <NA>  Rodentia    
##  9 Dog                        Canis       carni Carnivora   
## 10 Roe deer                   Capreolus   herbi Artiodactyla
## # ... with 73 more rows

###### 1.2.1 Ordenar de forma ascendente 
# Sort ascending by vore, breaking ties by order; show the first 4 columns.
animal.name.sequence <- msleep %>%
  arrange(vore, order)
animal.name.sequence[, 1:4]

## # A tibble: 83 x 4
##    name              genus        vore  order    
##    <chr>             <chr>        <chr> <chr>    
##  1 Cheetah           Acinonyx     carni Carnivora
##  2 Northern fur seal Callorhinus  carni Carnivora
##  3 Dog               Canis        carni Carnivora
##  4 Domestic cat      Felis        carni Carnivora
##  5 Gray seal         Haliochoerus carni Carnivora
##  6 Tiger             Panthera     carni Carnivora
##  7 Jaguar            Panthera     carni Carnivora
##  8 Lion              Panthera     carni Carnivora
##  9 Caspian seal      Phoca        carni Carnivora
## 10 Genet             Genetta      carni Carnivora
## # ... with 73 more rows

###### 1.2.2 Ordenar de forma descendente 
# Sort ascending by vore and, within each vore, descending by order.
animal.name.sequence.desc <- msleep %>%
  arrange(vore, desc(order))
head(animal.name.sequence.desc[, 1:4])

## # A tibble: 6 x 4
##   name                       genus         vore  order          
##   <chr>                      <chr>         <chr> <chr>          
## 1 Northern grasshopper mouse Onychomys     carni Rodentia       
## 2 Slow loris                 Nyctibeus     carni Primates       
## 3 Thick-tailed opposum       Lutreolina    carni Didelphimorphia
## 4 Long-nosed armadillo       Dasypus       carni Cingulata      
## 5 Pilot whale                Globicephalus carni Cetacea        
## 6 Common porpoise            Phocoena      carni Cetacea

############### 1.3 renombrar
# Renombrar una o mas columnas
names(iris)

## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"

# rename() takes new_name = old_name pairs; the other columns are untouched.
renamed.iris <- iris %>%
  rename(
    width.of.petals = Petal.Width,
    various.plants.and.animals = Species
  )
names(renamed.iris)

## [1] "Sepal.Length"               "Sepal.Width"               
## [3] "Petal.Length"               "width.of.petals"           
## [5] "various.plants.and.animals"

############### 1.4 Mutate

# Mutate sirve para agregar nuevas variables en un dataframe 
data("ChickWeight")
ChickWeight[1:2,] #first two rows

##   weight Time Chick Diet
## 1     42    0     1    1
## 2     51    2     1    1

# Add a new variable, log.of.weight, holding the base-10 logarithm of weight.
Chickweight.with.log <- ChickWeight %>%
  mutate(log.of.weight = log10(weight))
Chickweight.with.log[1:2, ]

##   weight Time Chick Diet log.of.weight
## 1     42    0     1    1      1.623249
## 2     51    2     1    1      1.707570

##### 1.4.1 mutate_all para agregar nuevos campos de una vez
msleep <- ggplot2::msleep
names(msleep)

##  [1] "name"         "genus"        "vore"         "order"        "conservation"
##  [6] "sleep_total"  "sleep_rem"    "sleep_cycle"  "awake"        "brainwt"     
## [11] "bodywt"

msleep

## # A tibble: 83 x 11
##    name   genus vore  order conservation sleep_total sleep_rem sleep_cycle awake
##    <chr>  <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl> <dbl>
##  1 Cheet~ Acin~ carni Carn~ lc                  12.1      NA        NA      11.9
##  2 Owl m~ Aotus omni  Prim~ <NA>                17         1.8      NA       7  
##  3 Mount~ Aplo~ herbi Rode~ nt                  14.4       2.4      NA       9.6
##  4 Great~ Blar~ omni  Sori~ lc                  14.9       2.3       0.133   9.1
##  5 Cow    Bos   herbi Arti~ domesticated         4         0.7       0.667  20  
##  6 Three~ Brad~ herbi Pilo~ <NA>                14.4       2.2       0.767   9.6
##  7 North~ Call~ carni Carn~ vu                   8.7       1.4       0.383  15.3
##  8 Vespe~ Calo~ <NA>  Rode~ <NA>                 7        NA        NA      17  
##  9 Dog    Canis carni Carn~ domesticated        10.1       2.9       0.333  13.9
## 10 Roe d~ Capr~ herbi Arti~ lc                   3        NA        NA      21  
## # ... with 73 more rows, and 2 more variables: brainwt <dbl>, bodywt <dbl>

# For columns 6 to 11, add a companion column "<name>_square root" holding the
# square root of the original. across() with `.names` replaces the deprecated
# funs() helper, so no deprecation warning is emitted any more.
msleep.with.square.roots <- mutate(
  msleep[, 6:11],
  across(everything(), sqrt, .names = "{.col}_square root")
)

names(msleep.with.square.roots)

##  [1] "sleep_total"             "sleep_rem"              
##  [3] "sleep_cycle"             "awake"                  
##  [5] "brainwt"                 "bodywt"                 
##  [7] "sleep_total_square root" "sleep_rem_square root"  
##  [9] "sleep_cycle_square root" "awake_square root"      
## [11] "brainwt_square root"     "bodywt_square root"

##### 1.4.2 mutate_at para agregar campos
data("Titanic")
Titanic <- as.data.frame(Titanic)#convertir la tabla a un data frame 
head(Titanic)

##   Class    Sex   Age Survived Freq
## 1   1st   Male Child       No    0
## 2   2nd   Male Child       No    0
## 3   3rd   Male Child       No   35
## 4  Crew   Male Child       No    0
## 5   1st Female Child       No    0
## 6   2nd Female Child       No    0

# Add three rank columns (Class_Rank, Age_Rank, Survived_Rank). Each holds the
# descending minimum rank of its source column, so tied values share a rank.
# across() with `.names` replaces mutate_at() and the deprecated funs() helper.
titanic.with.ranks <- mutate(
  Titanic,
  across(c(Class, Age, Survived), ~ min_rank(desc(.x)), .names = "{.col}_Rank")
)
head(titanic.with.ranks)

##   Class    Sex   Age Survived Freq Class_Rank Age_Rank Survived_Rank
## 1   1st   Male Child       No    0         25       17            17
## 2   2nd   Male Child       No    0         17       17            17
## 3   3rd   Male Child       No   35          9       17            17
## 4  Crew   Male Child       No    0          1       17            17
## 5   1st Female Child       No    0         25       17            17
## 6   2nd Female Child       No    0         17       17            17

##### 1.4.3 mutate_if
#aplica una funcion unicamente a las columnas que cumplen una condicion (predicado)

# Divide a numeric value (or every element of a numeric vector) by 10.
divide.by.10 <- function(a.number) {
  a.number / 10
}
head(CO2)

##   Plant   Type  Treatment conc uptake
## 1   Qn1 Quebec nonchilled   95   16.0
## 2   Qn1 Quebec nonchilled  175   30.4
## 3   Qn1 Quebec nonchilled  250   34.8
## 4   Qn1 Quebec nonchilled  350   37.2
## 5   Qn1 Quebec nonchilled  500   35.3
## 6   Qn1 Quebec nonchilled  675   39.2

# Divide every numeric column of CO2 by 10; non-numeric columns are untouched.
# across(where(...)) is the current replacement for the superseded mutate_if().
new.df <- CO2 %>%
  mutate(across(where(is.numeric), divide.by.10))
head(new.df)

##   Plant   Type  Treatment conc uptake
## 1   Qn1 Quebec nonchilled  9.5   1.60
## 2   Qn1 Quebec nonchilled 17.5   3.04
## 3   Qn1 Quebec nonchilled 25.0   3.48
## 4   Qn1 Quebec nonchilled 35.0   3.72
## 5   Qn1 Quebec nonchilled 50.0   3.53
## 6   Qn1 Quebec nonchilled 67.5   3.92

# cambia los campos vacios (NA) por cero
df <- data.frame(alpha = c(22, 1, NA),almond = c(0, 5, 10),grape = c(0, 2, 2),apple = c(NA, 5, 10))
df

##   alpha almond grape apple
## 1    22      0     0    NA
## 2     1      5     2     5
## 3    NA     10     2    10

# Replace every NA in the numeric columns with 0. An explicit lambda makes the
# replacement value visible instead of passing it through a `... = 0` argument.
df.fix.alpha <- df %>%
  mutate(across(where(is.numeric), ~ coalesce(.x, 0)))
df.fix.alpha

##   alpha almond grape apple
## 1    22      0     0     0
## 2     1      5     2     5
## 3     0     10     2    10

##### 1.4.4 Detección de cadena e indicador de duplicado verdadero / falso
msleep <- ggplot2::msleep
table(msleep$vore)

## 
##   carni   herbi insecti    omni 
##      19      32       5      20

# Keep the rows whose vore contains neither "c" nor "a"; the two letters are
# joined into the single regular expression "c|a".
msleep.no.c.or.a <- msleep %>%
  filter(!str_detect(vore, paste(c("c", "a"), collapse = "|")))
table(msleep.no.c.or.a$vore)

## 
## herbi  omni 
##    32    20

# Add a logical column flagging rows whose conservation value already appeared
# in an earlier row.
msleep.with.dup.indicator <- msleep %>%
  mutate(duplicate.indicator = duplicated(conservation))
msleep.with.dup.indicator[1:6, ]

## # A tibble: 6 x 12
##   name    genus vore  order conservation sleep_total sleep_rem sleep_cycle awake
##   <chr>   <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl> <dbl>
## 1 Cheetah Acin~ carni Carn~ lc                  12.1      NA        NA      11.9
## 2 Owl mo~ Aotus omni  Prim~ <NA>                17         1.8      NA       7  
## 3 Mounta~ Aplo~ herbi Rode~ nt                  14.4       2.4      NA       9.6
## 4 Greate~ Blar~ omni  Sori~ lc                  14.9       2.3       0.133   9.1
## 5 Cow     Bos   herbi Arti~ domesticated         4         0.7       0.667  20  
## 6 Three-~ Brad~ herbi Pilo~ <NA>                14.4       2.2       0.767   9.6
## # ... with 3 more variables: brainwt <dbl>, bodywt <dbl>,
## #   duplicate.indicator <lgl>

# Same indicator as before; this time only columns 1, 2, 3 and 12 are shown so
# the new duplicate.indicator column is visible in the printout.
msleep.with.dup.indicator <- msleep %>%
  mutate(duplicate.indicator = duplicated(conservation))
msleep.with.dup.indicator[1:6, c(1, 2, 3, 12)]

## # A tibble: 6 x 4
##   name                       genus      vore  duplicate.indicator
##   <chr>                      <chr>      <chr> <lgl>              
## 1 Cheetah                    Acinonyx   carni FALSE              
## 2 Owl monkey                 Aotus      omni  FALSE              
## 3 Mountain beaver            Aplodontia herbi FALSE              
## 4 Greater short-tailed shrew Blarina    omni  TRUE               
## 5 Cow                        Bos        herbi FALSE              
## 6 Three-toed sloth           Bradypus   herbi TRUE

# Flag a row as duplicated only when the (conservation, genus) PAIR has already
# appeared, then sort by conservation (major key) and genus (minor key).
# duplicated()'s second positional argument is `incomparables`, not a second
# key, so both columns are combined into one data frame before the check.
msleep.with.dup.indicator2 <- msleep %>%
  mutate(
    duplicate.indicator = duplicated(data.frame(conservation, genus))
  ) %>%
  arrange(conservation, genus)
msleep.with.dup.indicator2

## # A tibble: 83 x 12
##    name   genus vore  order conservation sleep_total sleep_rem sleep_cycle awake
##    <chr>  <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl> <dbl>
##  1 Giraf~ Gira~ herbi Arti~ cd                   1.9       0.4      NA      22.1
##  2 Pilot~ Glob~ carni Ceta~ cd                   2.7       0.1      NA      21.4
##  3 Cow    Bos   herbi Arti~ domesticated         4         0.7       0.667  20  
##  4 Dog    Canis carni Carn~ domesticated        10.1       2.9       0.333  13.9
##  5 Guine~ Cavis herbi Rode~ domesticated         9.4       0.8       0.217  14.6
##  6 Chinc~ Chin~ herbi Rode~ domesticated        12.5       1.5       0.117  11.5
##  7 Horse  Equus herbi Peri~ domesticated         2.9       0.6       1      21.1
##  8 Donkey Equus herbi Peri~ domesticated         3.1       0.4      NA      20.9
##  9 Domes~ Felis carni Carn~ domesticated        12.5       3.2       0.417  11.5
## 10 Rabbit Oryc~ herbi Lago~ domesticated         8.4       0.9       0.417  15.6
## # ... with 73 more rows, and 3 more variables: brainwt <dbl>, bodywt <dbl>,
## #   duplicate.indicator <lgl>

#otro ejemplo 
fruit <- c("apple","pear","orange","grape", "orange","orange")
x <- c(1,2,4,9,4,6)
y <- c(22,3,4,55,15,9)
z <- c(3,1,4,10,12,8)
w <- c(2,2,2,4,5,6)

df <- data.frame(fruit,x,y,z,w)
df

##    fruit x  y  z w
## 1  apple 1 22  3 2
## 2   pear 2  3  1 2
## 3 orange 4  4  4 2
## 4  grape 9 55 10 4
## 5 orange 4 15 12 5
## 6 orange 6  9  8 6

# Flag the rows whose fruit value repeats one seen in an earlier row.
df.show.single.dup <- df %>%
  mutate(duplicate.indicator = duplicated(fruit))
df.show.single.dup

##    fruit x  y  z w duplicate.indicator
## 1  apple 1 22  3 2               FALSE
## 2   pear 2  3  1 2               FALSE
## 3 orange 4  4  4 2               FALSE
## 4  grape 9 55 10 4               FALSE
## 5 orange 4 15 12 5                TRUE
## 6 orange 6  9  8 6                TRUE

#Nota: cuando duplicated() encuentra un valor por primera vez lo marca como FALSE, ya que aun no sabe si mas abajo en la columna hay otro igual. Cuando encuentra un segundo valor igual lo marca como TRUE. 


##### 1.4.5 Eliminar variables usando NULL
# Rebuild the example data frame from four vectors.
fruit <- c("apple", "pear", "orange", "grape", "orange", "orange")
x <- c(1, 2, 4, 9, 4, 6)
y <- c(22, 3, 4, 55, 15, 9)
z <- c(3, 1, 4, 10, 12, 8)
df <- data.frame(fruit, x, y, z)

# Assigning NULL to a column inside mutate() removes that column (here, z).
df <- df %>%
  mutate(z = NULL)
df

##    fruit x  y
## 1  apple 1 22
## 2   pear 2  3
## 3 orange 4  4
## 4  grape 9 55
## 5 orange 4 15
## 6 orange 6  9

##### 1.4.6 Secuencia de codificación preferida

#metodo no recomendado para agregar variables
# Install nycflights13 only when it is missing, then attach it explicitly so
# that `flights` is available even right after a fresh install.
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  install.packages("nycflights13")
}
library(nycflights13)

# All four derived columns are created inside one mutate() call; the later
# ones reuse columns defined earlier in the same call.
flights %>%
  mutate(
    gain = arr_delay - dep_delay,
    hours = air_time / 60,
    gain_per_hour = gain / hours,
    gain_per_minute = 60 * gain_per_hour
  )

## # A tibble: 336,776 x 23
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 336,766 more rows, and 15 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## #   gain <dbl>, hours <dbl>, gain_per_hour <dbl>, gain_per_minute <dbl>

# Recommended approach: add the derived columns stage by stage, one mutate()
# per stage, so each stage only uses columns created by an earlier one.
newfield.flights <- flights %>%
  mutate(
    gain = arr_delay - dep_delay,
    hours = air_time / 60
  ) %>%
  mutate(gain_per_hour = gain / hours) %>%
  mutate(gain_per_minute = 60 * gain_per_hour)

# Show the first six rows of year, month and the four new columns.
newfield.flights[1:6, c(1:2, 20:23)]

## # A tibble: 6 x 6
##    year month  gain hours gain_per_hour gain_per_minute
##   <int> <int> <dbl> <dbl>         <dbl>           <dbl>
## 1  2013     1     9  3.78          2.38            143.
## 2  2013     1    16  3.78          4.23            254.
## 3  2013     1    31  2.67         11.6             698.
## 4  2013     1   -17  3.05         -5.57           -334.
## 5  2013     1   -19  1.93         -9.83           -590.
## 6  2013     1    16  2.5           6.4             384

##### 1.4.7 Transmutar: mantener solo las variables creadas

#Crear datos nuevos basados en calculos realizados en variables existentes
fruit <- c("apple","pear","orange","grape", "orange","orange")
x <- c(1,2,4,9,4,6)
y <- c(22,3,4,55,15,9)
z <- c(3,1,4,10,12,8)
df <- data.frame(fruit,x,y,z)
df

##    fruit x  y  z
## 1  apple 1 22  3
## 2   pear 2  3  1
## 3 orange 4  4  4
## 4  grape 9 55 10
## 5 orange 4 15 12
## 6 orange 6  9  8

# transmute() keeps only the newly created column: the row-wise sum x + y + z.
df <- df %>%
  transmute(new.variable = x + y + z)
df

##   new.variable
## 1           26
## 2            6
## 3           12
## 4           74
## 5           31
## 6           23

#####  1.4.8 Use Across para aplicar una función en varias columnas

# Helper that doubles its input; used below to double every numeric column.
double.it <- function(x) {
  x * 2
}
head(iris) #iris original

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

# Apply double.it to every numeric column of iris and show the first rows.
head(
  mutate(iris, across(where(is.numeric), double.it))
)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1         10.2         7.0          2.8         0.4  setosa
## 2          9.8         6.0          2.8         0.4  setosa
## 3          9.4         6.4          2.6         0.4  setosa
## 4          9.2         6.2          3.0         0.4  setosa
## 5         10.0         7.2          2.8         0.4  setosa
## 6         10.8         7.8          3.4         0.8  setosa

##### 1.4.9 Mutación condicional usando case_when
row1 <- c("a","b","c","d","e","f","column.to.be.changed")
row2 <- c(1,1,1,6,6,1,2)
row3 <- c(3,4,4,6,4,4,4)
row4 <- c(4,6,25,5,5,2,9)
row5 <- c(5,3,6,3,3,6,2)
df <- as.data.frame(rbind(row2,row3,row4,row5))
names(df) <- row1
df

##      a b  c d e f column.to.be.changed
## row2 1 1  1 6 6 1                    2
## row3 3 4  4 6 4 4                    4
## row4 4 6 25 5 5 2                    9
## row5 5 3  6 3 3 6                    2

# Overwrite the last column conditionally: rows matching the first set of OR
# conditions get 2, rows matching the second set get 3, anything else gets NA.
# case_when() uses the first condition that is TRUE for each row.
new.df <- mutate(
  df,
  column.to.be.changed = case_when(
    a == 2 | a == 5 | a == 7 | (a == 1 & b == 4) ~ 2,
    a == 0 | a == 1 | a == 4 | a == 3 | c == 4 ~ 3,
    TRUE ~ NA_real_
  )
)
new.df

##      a b  c d e f column.to.be.changed
## row2 1 1  1 6 6 1                    3
## row3 3 4  4 6 4 4                    3
## row4 4 6 25 5 5 2                    3
## row5 5 3  6 3 3 6                    2

############### 1.5 select to para elegir variables o columnas 

##### 1.5.1 borrar una columna 
fruit <- c("apple","pear","orange","grape", "orange","orange")
x <- c(1,2,4,9,4,6)
y <- c(22,3,4,55,15,9)
z <- c(3,1,4,10,12,8)
df <- data.frame(fruit,x,y,z) 
df

##    fruit x  y  z
## 1  apple 1 22  3
## 2   pear 2  3  1
## 3 orange 4  4  4
## 4  grape 9 55 10
## 5 orange 4 15 12
## 6 orange 6  9  8

# A minus sign in front of a column name drops that column from the result.
new.df.no.fruit <- df %>%
  dplyr::select(-fruit)
new.df.no.fruit

##   x  y  z
## 1 1 22  3
## 2 2  3  1
## 3 4  4  4
## 4 9 55 10
## 5 4 15 12
## 6 6  9  8

##### 1.5.2 borrar columnas por el nombre usando  starts_with o ends_with
data("mtcars")
names(mtcars)

##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"

# Drop the columns whose names start with "d".
mtcars.no.col.names.start.with.d <- mtcars %>%
  select(-starts_with("d"))
names(mtcars.no.col.names.start.with.d)

## [1] "mpg"  "cyl"  "hp"   "wt"   "qsec" "vs"   "am"   "gear" "carb"

# Drop the columns whose names end with "t".
mtcars.no.col.names.ends.with <- mtcars %>%
  select(-ends_with("t"))
names(mtcars.no.col.names.ends.with)

## [1] "mpg"  "cyl"  "disp" "hp"   "qsec" "vs"   "am"   "gear" "carb"

##### 1.5.3 ordenar las columnas

fruit <- c("apple","pear","orange","grape", "orange","orange")
x <- c(1,2,4,9,4,6)
y <- c(22,3,4,55,15,9)
z <- c(3,1,4,10,12,8)
df <- data.frame(fruit,x,y,z)
df

##    fruit x  y  z
## 1  apple 1 22  3
## 2   pear 2  3  1
## 3 orange 4  4  4
## 4  grape 9 55 10
## 5 orange 4 15 12
## 6 orange 6  9  8

##### 1.5.4 select_all para aplicar una función a todas las columnas
state <- c("Maryland", "Alaska", "New Jersey")
income <- c(76067,74444,73702)
median.us <- c(61372,61372,61372)
life.expectancy <- c(78.8,78.3,80.3)
top.3.states <- data.frame(state, income, median.us,
 life.expectancy)
top.3.states

##        state income median.us life.expectancy
## 1   Maryland  76067     61372            78.8
## 2     Alaska  74444     61372            78.3
## 3 New Jersey  73702     61372            80.3

# To write the column names in upper case, apply the "toupper" function to
# every column name with select_all().
new.top.3.states <- select_all(top.3.states, toupper)
new.top.3.states

##        STATE INCOME MEDIAN.US LIFE.EXPECTANCY
## 1   Maryland  76067     61372            78.8
## 2     Alaska  74444     61372            78.3
## 3 New Jersey  73702     61372            80.3

##### 1.5.5 Seleccionar columnas mediante la función Pull
top.3.states <- data.frame(state, income, median.us, life.expectancy)
top.3.states

##        state income median.us life.expectancy
## 1   Maryland  76067     61372            78.8
## 2     Alaska  74444     61372            78.3
## 3 New Jersey  73702     61372            80.3

#obtener la primera columna
pull.first.column <- pull(top.3.states,1)
pull.first.column

## [1] "Maryland"   "Alaska"     "New Jersey"

#usando -1 se obtiene la columna de la derecha 
pull.last.column <- pull(top.3.states,-1)
pull.last.column

## [1] 78.8 78.3 80.3

##### 1.5.6 Seleccionar filas: cualquier variable cumple alguna condición

nrow(mtcars)

## [1] 32

#cualquier casilla que exceda los 200
# Keep the rows in which at least one column holds a value above 200.
# filter() + if_any() replaces the superseded filter_all() + any_vars().
mtcars.more.than.200 <- filter(mtcars, if_any(everything(), ~ .x > 200))
nrow(mtcars.more.than.200) # number of rows with some value above 200

## [1] 16

##### 1.5.7 Seleccionar columnas: Omitir si el nombre de la columna contiene caracteres específicos

#seleccionar unicamente las columnas cuyo nombre no contenga la letra "p"
names(mtcars)

##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"

# Drop every column whose name contains the letter "p".
cars.with.no.p <- dplyr::select(mtcars, -contains("p"))
names(cars.with.no.p) # no column with a "p" in its name is kept

## [1] "cyl"  "drat" "wt"   "qsec" "vs"   "am"   "gear" "carb"

##### 1.5.8 Seleccionar Usando Wildcard Matching
names(mtcars)

##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"

subset.mtcars <- select(mtcars, matches("pg|gea")) #La función "coincide" es más general que "contiene" porque es una expresión regular y, por lo tanto, más flexible
names(subset.mtcars)

## [1] "mpg"  "gear"

subset.mtcars

##                      mpg gear
## Mazda RX4           21.0    4
## Mazda RX4 Wag       21.0    4
## Datsun 710          22.8    4
## Hornet 4 Drive      21.4    3
## Hornet Sportabout   18.7    3
## Valiant             18.1    3
## Duster 360          14.3    3
## Merc 240D           24.4    4
## Merc 230            22.8    4
## Merc 280            19.2    4
## Merc 280C           17.8    4
## Merc 450SE          16.4    3
## Merc 450SL          17.3    3
## Merc 450SLC         15.2    3
## Cadillac Fleetwood  10.4    3
## Lincoln Continental 10.4    3
## Chrysler Imperial   14.7    3
## Fiat 128            32.4    4
## Honda Civic         30.4    4
## Toyota Corolla      33.9    4
## Toyota Corona       21.5    3
## Dodge Challenger    15.5    3
## AMC Javelin         15.2    3
## Camaro Z28          13.3    3
## Pontiac Firebird    19.2    3
## Fiat X1-9           27.3    4
## Porsche 914-2       26.0    5
## Lotus Europa        30.4    5
## Ford Pantera L      15.8    5
## Ferrari Dino        19.7    5
## Maserati Bora       15.0    5
## Volvo 142E          21.4    4

############## 1.6 Uniones: manipulaciones de datos de dos fuentes

##### 1.6.1 Unión a la izquierda
# Build the abbreviation/area table with data.frame() rather than
# as.data.frame(cbind(...)): cbind() creates a character matrix, which would
# turn the numeric state.area values into strings.
us.state.areas <- data.frame(state.abb, state.area)
us.state.areas[1:3, ]

##   state.abb state.area
## 1        AL      51609
## 2        AK     589757
## 3        AZ     113909

us.state.abbreviation.and.name <- as.data.frame(cbind(state.abb, state.name))
us.state.abbreviation.and.name[1:3,]

##   state.abb state.name
## 1        AL    Alabama
## 2        AK     Alaska
## 3        AZ    Arizona

#combina las dos tablas
# Left join: keep every row of the area table and attach the state name that
# shares the same abbreviation.
state.info.abb.area.name <- left_join(
  us.state.areas,
  us.state.abbreviation.and.name,
  by = "state.abb"
)
head(state.info.abb.area.name)

##   state.abb state.area state.name
## 1        AL      51609    Alabama
## 2        AK     589757     Alaska
## 3        AZ     113909    Arizona
## 4        AR      53104   Arkansas
## 5        CA     158693 California
## 6        CO     104247   Colorado

##### 1.6.2 Inner Join

# First data frame: team score and league per person.
names <- c("Sally", "Tom", "Frieda", "Alfonzo")
team.scores <- c(3, 5, 2, 7)
team.league <- c("alpha", "beta", "gamma", "omicron")
team.info <- data.frame(names, team.scores, team.league)

# Second data frame: school grade per person. Assignment uses `<-`, matching
# the rest of the script, instead of `=`.
names <- c("Sally", "Tom", "Bill", "Alfonzo")
school.grades <- c("A", "B", "C", "B")
school.info <- data.frame(names, school.grades)

#se seleccionan los nombres repetidos en los dos data frame 
school.and.team <- inner_join(team.info, school.info, by = "names")
school.and.team

##     names team.scores team.league school.grades
## 1   Sally           3       alpha             A
## 2     Tom           5        beta             B
## 3 Alfonzo           7     omicron             B

##### 1.6.3 Anti-unión

# primer data frame
names <- c("Sally","Tom","Frieda","Alfonzo")
team.scores <- c(3,5,2,7)
team.league <- c("alpha","beta","gamma", "omicron")
team.info <- data.frame(names, team.scores, team.league)
team.info

##     names team.scores team.league
## 1   Sally           3       alpha
## 2     Tom           5        beta
## 3  Frieda           2       gamma
## 4 Alfonzo           7     omicron

# segundo data frame 
names <- c("Sally","Tom", "Bill", "Alfonzo")
school.grades <- c("A","B","C","B")
school.info <- data.frame(names, school.grades)
school.info

##     names school.grades
## 1   Sally             A
## 2     Tom             B
## 3    Bill             C
## 4 Alfonzo             B

# selecciona los datos del primer data frame que no coinciden con los del segundo 
team.info.but.no.grades <- anti_join(team.info, school.info, by = "names")
team.info.but.no.grades

##    names team.scores team.league
## 1 Frieda           2       gamma

##### 1.6.4 union completa

# First data frame: team score and league per person.
names <- c("Sally", "Tom", "Frieda", "Alfonzo")
team.scores <- c(3, 5, 2, 7)
team.league <- c("alpha", "beta", "gamma", "omicron")
team.info <- data.frame(names, team.scores, team.league)

# Second data frame: school grade per person.
names <- c("Sally", "Tom", "Bill", "Alfonzo")
school.grades <- c("A", "B", "C", "B")
school.info <- data.frame(names, school.grades)

# une las dos tablas incluso los datos que no coinciden 
team.info.and.or.grades <- full_join(team.info, school.info, by = "names")
team.info.and.or.grades

##     names team.scores team.league school.grades
## 1   Sally           3       alpha             A
## 2     Tom           5        beta             B
## 3  Frieda           2       gamma          <NA>
## 4 Alfonzo           7     omicron             B
## 5    Bill          NA        <NA>             C

##### 1.6.5 semi union 
#mantiene todas las observaciones del dataset 1 que coinciden con el dataset2
team.info.with.grades <- semi_join(team.info, school.info)

## Joining, by = "names"

team.info.with.grades

##     names team.scores team.league
## 1   Sally           3       alpha
## 2     Tom           5        beta
## 3 Alfonzo           7     omicron

##### 1.6.6 union a la derecha 
# data.frame() keeps state.area numeric; as.data.frame(cbind(...)) would go
# through a character matrix and store the areas as strings.
us.state.areas <- data.frame(state.abb, state.area)
us.state.areas[1:3, ]

##   state.abb state.area
## 1        AL      51609
## 2        AK     589757
## 3        AZ     113909

us.state.abbreviation.and.name <- as.data.frame(cbind(state.abb, state.name))
us.state.abbreviation.and.name[1:3,]

##   state.abb state.name
## 1        AL    Alabama
## 2        AK     Alaska
## 3        AZ    Arizona

us.state.abbreviation.and.name[1,1] <- "Intentional Mismatch" #cambiar el nombre 

# union a la derecha: donde no haya dato del primer data set, se rellena con NA
us.state.with.abbreviation.and.name.and.area <- right_join(us.state.areas,
 us.state.abbreviation.and.name, by = "state.abb")
us.state.with.abbreviation.and.name.and.area[1:3,]

##   state.abb state.area state.name
## 1        AK     589757     Alaska
## 2        AZ     113909    Arizona
## 3        AR      53104   Arkansas

############ 1.7 Slice
msleep <- ggplot2::msleep
nrow(msleep)

## [1] 83

# Negative indices drop rows: remove row 6 through the last row, n().
msleep.only.first.5 <- msleep %>% slice(-(6:n()))
nrow(msleep.only.first.5)

## [1] 5

# Keep rows 20 through 39 only.
msleep.20.rows <- slice(msleep, 20:39)
nrow(msleep.20.rows)

## [1] 20

nrow(msleep) - nrow(msleep.20.rows) # diferencia de filas en los dos data frame

## [1] 63

############ 1.8 resumir 
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

# Load the gehan data set (MASS is attached just above) and bind a second
# name, gehan2, to it for the summaries that follow.
data(gehan)
gehan2 <- gehan
# NOTE(review): tidyverse was already attached earlier in the file, so this
# call presumably does not undo MASS masking dplyr::select -- confirm, and
# call dplyr::select() explicitly from here on.
library(tidyverse)

gehan2 %>% summarise( kount = n()) #numero de filas

##   kount
## 1    42

gehan2 %>% group_by(treat) %>% summarise(kount = n()) #recuento por variable "treat"

## # A tibble: 2 x 2
##   treat   kount
##   <fct>   <int>
## 1 6-MP       21
## 2 control    21

#Datos estadisticos dependiendo del "treat"
# Descriptive statistics of the remission time for each treatment group.
gehan2 %>%
  group_by(treat) %>%
  summarise(
    average.remiss.time = mean(time),
    median.remiss.time = median(time),
    std.dev.remiss.time = sd(time),
    median.abs.deviation = mad(time),
    IQR.remiss.time = IQR(time)
  )

## # A tibble: 2 x 6
##   treat   average.remiss.time median.remiss.t~ std.dev.remiss.~ median.abs.devi~
##   <fct>                 <dbl>            <int>            <dbl>            <dbl>
## 1 6-MP                  17.1                16            10.0             10.4 
## 2 control                8.67                8             6.47             5.93
## # ... with 1 more variable: IQR.remiss.time <dbl>

#mostrar maximos y minimos de "treat"
gehan2 %>%
 group_by(treat) %>%
 summarise(minimum.remission = min(time),
 max.remission = max(time))

## # A tibble: 2 x 3
##   treat   minimum.remission max.remission
##   <fct>               <int>         <int>
## 1 6-MP                    6            35
## 2 control                 1            23

##### 1.8.1 Summarise across

library(MASS)
subset.survey <- survey[1:10,]
library(dplyr)
head(subset.survey)

##      Sex Wr.Hnd NW.Hnd W.Hnd    Fold Pulse    Clap Exer Smoke Height      M.I
## 1 Female   18.5   18.0 Right  R on L    92    Left Some Never 173.00   Metric
## 2   Male   19.5   20.5  Left  R on L   104    Left None Regul 177.80 Imperial
## 3   Male   18.0   13.3 Right  L on R    87 Neither None Occas     NA     <NA>
## 4   Male   18.8   18.9 Right  R on L    NA Neither None Never 160.00   Metric
## 5   Male   20.0   20.0 Right Neither    35   Right Some Never 165.00   Metric
## 6 Female   18.0   17.7 Right  L on R    64   Right Some Never 172.72 Imperial
##      Age
## 1 18.250
## 2 17.583
## 3 16.917
## 4 20.333
## 5 23.667
## 6 21.000

subset.survey %>%
 na.omit() %>% #Remueve cualquier NAs
 group_by(Sex) %>%
 summarise(across(where(is.numeric), mean,
 .names = "mean_{col}")) %>%
 head()

## # A tibble: 2 x 6
##   Sex    mean_Wr.Hnd mean_NW.Hnd mean_Pulse mean_Height mean_Age
##   <fct>        <dbl>       <dbl>      <dbl>       <dbl>    <dbl>
## 1 Female        17.8        17.7       76.7        168.     25.0
## 2 Male          19.1        19.2       76.8        174.     20.3

#contar las combinaciones entre variables
# Group by diet type and taxonomic order, then count the rows of every
# (vore, order) combination.
new.sleep <- msleep %>% group_by(vore, order)
s <- summarise(new.sleep, n())

## `summarise()` has grouped output by 'vore'. You can override using the `.groups` argument.

# Print the counts: an assignment alone displays nothing.
s

## # A tibble: 32 x 3
## # Groups:   vore [5]
##    vore  order           `n()`
##    <chr> <chr>           <int>
##  1 carni Carnivora          12
##  2 carni Cetacea             3
##  3 carni Cingulata           1
##  4 carni Didelphimorphia     1
##  5 carni Primates            1
##  6 carni Rodentia            1
##  7 herbi Artiodactyla        5
##  8 herbi Diprotodontia       1
##  9 herbi Hyracoidea          2
## 10 herbi Lagomorpha          1
## # ... with 22 more rows

#totales
new.sleep.totals <- msleep %>%
 group_by(vore, order) %>%
 summarise(n())

## `summarise()` has grouped output by 'vore'. You can override using the `.groups` argument.

new.sleep.totals

## # A tibble: 32 x 3
## # Groups:   vore [5]
##    vore  order           `n()`
##    <chr> <chr>           <int>
##  1 carni Carnivora          12
##  2 carni Cetacea             3
##  3 carni Cingulata           1
##  4 carni Didelphimorphia     1
##  5 carni Primates            1
##  6 carni Rodentia            1
##  7 herbi Artiodactyla        5
##  8 herbi Diprotodontia       1
##  9 herbi Hyracoidea          2
## 10 herbi Lagomorpha          1
## # ... with 22 more rows

############# 1.9 Gathering: convertir varias columnas en una

state <- c("Maryland", "Alaska", "New Jersey")
income <- c(76067,74444,73702)
median.us <- c(61372,61372,61372)
life.expectancy <- c(78.8,78.3,80.3)
teen.birth.rate.2015 <- c(17,29.3,12.1)
teen.birth.rate.2007 <- c(34.3,42.9,24.9)
teen.birth.rate.1991 <- c(54.1, 66, 41.3)
top.3.states <- data.frame(state, income, median.us,
 life.expectancy,
 teen.birth.rate.2015, teen.birth.rate.2007,
 teen.birth.rate.1991)
names(top.3.states) <- c("state", "income", "median.us",
 "life.expectancy","2015","2007","1991")
top.3.states

##        state income median.us life.expectancy 2015 2007 1991
## 1   Maryland  76067     61372            78.8 17.0 34.3 54.1
## 2     Alaska  74444     61372            78.3 29.3 42.9 66.0
## 3 New Jersey  73702     61372            80.3 12.1 24.9 41.3

# combinar las columnas de años
new.top.3.states <- top.3.states %>%
 gather("2015", "2007", "1991", key = "year", value = "cases")
new.top.3.states

##        state income median.us life.expectancy year cases
## 1   Maryland  76067     61372            78.8 2015  17.0
## 2     Alaska  74444     61372            78.3 2015  29.3
## 3 New Jersey  73702     61372            80.3 2015  12.1
## 4   Maryland  76067     61372            78.8 2007  34.3
## 5     Alaska  74444     61372            78.3 2007  42.9
## 6 New Jersey  73702     61372            80.3 2007  24.9
## 7   Maryland  76067     61372            78.8 1991  54.1
## 8     Alaska  74444     61372            78.3 1991  66.0
## 9 New Jersey  73702     61372            80.3 1991  41.3

############# 1.10 Spreading: Consolidacion de varias filas en una 

# Build the example with tibble(); data_frame() is deprecated in the tibble
# package and only produced a deprecation warning.
df_1 <- tibble(
  Type = c("TypeA", "TypeA", "TypeB", "TypeB"),
  Answer = c("Yes", "No", NA, "No"),
  n = 1:4
)

df_1 # before

## # A tibble: 4 x 3
##   Type  Answer     n
##   <chr> <chr>  <int>
## 1 TypeA Yes        1
## 2 TypeA No         2
## 3 TypeB <NA>       3
## 4 TypeB No         4

#separar columnas
# Drop the rows without an answer, then spread the answers into one column
# per distinct value, filled with n.
df_2 <- spread(filter(df_1, !is.na(Answer)), key = Answer, value = n)
df_2

## # A tibble: 2 x 3
##   Type     No   Yes
##   <chr> <int> <int>
## 1 TypeA     2     1
## 2 TypeB     4    NA

######### 1.11 Separate: Divide una columna en varias
state <- c("Maryland", "Alaska", "New Jersey")
income <- c(76067,74444,73702)
median.us <- c(61372,61372,61372)
life.expectancy <- c(78.8,78.3,80.3)
teen.birth <- c("17//34.3//54.1", "29.0//42.9//66.0", "12.1//24.9//41.3")

top.3.states <- data.frame(state, income, median.us,
 life.expectancy,teen.birth)
top.3.states

##        state income median.us life.expectancy       teen.birth
## 1   Maryland  76067     61372            78.8   17//34.3//54.1
## 2     Alaska  74444     61372            78.3 29.0//42.9//66.0
## 3 New Jersey  73702     61372            80.3 12.1//24.9//41.3

#Separa una columna en varias
top.3.states.separated.years <- top.3.states %>%
 separate(teen.birth,
 into = c("2015", "2007","1991"), sep = "//")
top.3.states.separated.years

##        state income median.us life.expectancy 2015 2007 1991
## 1   Maryland  76067     61372            78.8   17 34.3 54.1
## 2     Alaska  74444     61372            78.3 29.0 42.9 66.0
## 3 New Jersey  73702     61372            80.3 12.1 24.9 41.3

######### 1.12 Recap of Handy DPLYR Functions

##### 1.12.2 Recuentos basicos 
m <- mutate(new.sleep, kount = n()) 
m[1:5,c(1:4,10:12)]

## # A tibble: 5 x 7
## # Groups:   vore, order [5]
##   name                       genus      vore  order        brainwt  bodywt kount
##   <chr>                      <chr>      <chr> <chr>          <dbl>   <dbl> <int>
## 1 Cheetah                    Acinonyx   carni Carnivora   NA        50        12
## 2 Owl monkey                 Aotus      omni  Primates     0.0155    0.48     10
## 3 Mountain beaver            Aplodontia herbi Rodentia    NA         1.35     16
## 4 Greater short-tailed shrew Blarina    omni  Soricomorp~  0.00029   0.019     3
## 5 Cow                        Bos        herbi Artiodacty~  0.423   600         5

f <- filter(new.sleep, n() > 14)
f[1:5,c(1:4,10:11)]

## # A tibble: 5 x 6
## # Groups:   vore, order [1]
##   name                      genus      vore  order    brainwt bodywt
##   <chr>                     <chr>      <chr> <chr>      <dbl>  <dbl>
## 1 Mountain beaver           Aplodontia herbi Rodentia NA       1.35 
## 2 Guinea pig                Cavis      herbi Rodentia  0.0055  0.728
## 3 Chinchilla                Chinchilla herbi Rodentia  0.0064  0.42 
## 4 Western american chipmunk Eutamias   herbi Rodentia NA       0.071
## 5 Mongolian gerbil          Meriones   herbi Rodentia NA       0.053

##### 1.12.3 Funciones Nth 

#primera entrada
salary.description <- c("Golden parachute type","Well to do",
"Average","Below average", "bring date seeds instead of flowers")
first(salary.description)

## [1] "Golden parachute type"

#ultima entrada
last(salary.description)

## [1] "bring date seeds instead of flowers"

#tercera entrada contando desde el final (indice negativo)
nth(salary.description, -3)

## [1] "Average"

#segundo elemento del vector
nth(salary.description,2)

## [1] "Well to do"

##### 1.12.4 Contar valores distintos
#valores unicos en un vector
a.vector <- c(22,33,44,1,2,3,3,3,4)
original.length <- length(a.vector)
original.length

## [1] 9

#numeros de elementos distintos
distinct.a.vector <- n_distinct(a.vector)
distinct.a.vector

## [1] 7

# Compare the original length with the number of distinct values. Each
# message is kept on a single line so no newline ends up inside the string.
test1 <- if_else(original.length == distinct.a.vector,
                 "all values unique", "some duplicate values in vector")
test1

## [1] "some duplicate values in vector"

b.vector <- c(1,2,3,4,5,6)
length(b.vector)

## [1] 6

distinct.b.vector <- n_distinct(b.vector)
distinct.b.vector #show count (length) of distinct numbers

## [1] 6

# Same check for b.vector. The message is written on one line; a line break
# inside the literal would embed "\n" in the returned string.
test2 <- if_else(length(b.vector) == distinct.b.vector,
                 "all values unique", "duplicates")
test2

## [1] "all values unique"

##### 1.12.5 na_if
test <- c(100, 0, 999)
x <- 5000/test
x

## [1] 50.000000       Inf  5.005005

x <- 5000/na_if(test,0) # si hay algun cero, no se hace 
x

## [1] 50.000000        NA  5.005005

class(x) #tipo de variable

## [1] "numeric"

##### 1.12.6 Coalesce para remplazar valores faltantes 

x <- c(33,4,11,NA,9)
x

## [1] 33  4 11 NA  9

x <- coalesce(x,0)
x

## [1] 33  4 11  0  9

############ 1.13 Funciones de clasificación
y <- c(100,4,12,6,8,3)
rank1 <-row_number(y) # muestra en que posicion se encuentran los elementos de menor a mayor
rank1

## [1] 6 2 5 3 4 1

# rank1 holds the rank of each element, not a position, so select the
# element whose rank is 1 instead of using a rank as an index.
y[rank1 == 1] # smallest value

## [1] 3

y[rank1 == length(y)] # largest value (it holds the highest rank)

## [1] 100

##### 1.13.2 Rango mínimo

# min_rank: rango de cada elemento de menor a mayor; los empates reciben el rango minimo
rank2 <- min_rank(y)
rank2

## [1] 6 2 5 3 4 1

##### 1.13.3 Rango denso
rank3 <- dense_rank(y) # muestra en que posicion se encuentran los elementos de menor a mayor
rank3

## [1] 6 2 5 3 4 1

##### 1.13.4 Rango porcentual
rank4 <- percent_rank(y) #percentil en el que se ubican los datos
rank4

## [1] 1.0 0.2 0.8 0.4 0.6 0.0

##### 1.13.5 Función de distribución acumulativa
  y <- c(100,4,12,6,8,3)
rank5 <- cume_dist(y)  #proporción de todos los valores menores o iguales al rango actual:
rank5

## [1] 1.0000000 0.3333333 0.8333333 0.5000000 0.6666667 0.1666667

# Split the input vector into 3 buckets; `<-` is used for assignment, like
# the rest of the script.
rank6 <- ntile(y, 3)
rank6

## [1] 3 1 3 2 2 1

# Deciles of the test vector. Argument names are spelled out in full
# (probs, length.out) instead of relying on partial argument matching.
test.vector <- c(2, 22, 33, 44, 77, 89, 99)
quantile(test.vector, probs = seq(0, 1, length.out = 11), type = 5)

##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##  2.0  6.0 20.0 28.6 36.3 44.0 67.1 81.8 90.0 97.0 99.0

########### 1.14 muestreo

data("ChickWeight")
my.sample <- sample_n(ChickWeight, 5) #muestra 5 filas aleatorias de la base de datos
my.sample

##   weight Time Chick Diet
## 1    167   21    22    2
## 2    164   20    22    2
## 3     89   14    20    1
## 4    331   21    21    2
## 5    103    8    44    4

set.seed(833)

my.sample <- sample_n(ChickWeight, 10, replace = TRUE) #replace =true; se puede obtener la misma fila mas de una vez
my.sample

##    weight Time Chick Diet
## 1      98    8    45    4
## 2      42    0    17    1
## 3      98    8    36    3
## 4      51    2    11    1
## 5     198   20     3    1
## 6     237   21    49    4
## 7     205   16    50    4
## 8     170   16    39    3
## 9     332   18    35    3
## 10    144   14    33    3

my.sample <- sample_n(mtcars, 12, weight = cyl)# es mas probable que se selccionen carros con mas cilindros
my.sample[,1:5]

##                     mpg cyl  disp  hp drat
## AMC Javelin        15.2   8 304.0 150 3.15
## Porsche 914-2      26.0   4 120.3  91 4.43
## Merc 280           19.2   6 167.6 123 3.92
## Cadillac Fleetwood 10.4   8 472.0 205 2.93
## Merc 240D          24.4   4 146.7  62 3.69
## Datsun 710         22.8   4 108.0  93 3.85
## Merc 280C          17.8   6 167.6 123 3.92
## Mazda RX4 Wag      21.0   6 160.0 110 3.90
## Merc 450SLC        15.2   8 275.8 180 3.07
## Chrysler Imperial  14.7   8 440.0 230 3.23
## Maserati Bora      15.0   8 301.0 335 3.54
## Valiant            18.1   6 225.0 105 2.76

#sample_frac para obtener una muestra igual a un porcentaje específico de las filas del marco de datos
test1 <- sample_frac(ChickWeight, 0.02)
test1

##    weight Time Chick Diet
## 1      48    2    13    1
## 2      62    6    12    1
## 3     197   20    45    4
## 4     234   18    42    4
## 5      58    4    28    2
## 6     163   16     3    1
## 7     103    8    41    4
## 8     103    8    42    4
## 9     120   18    19    1
## 10     48    2    36    3
## 11     80    6    48    4
## 12    137   12    33    3

#En este ejemplo, group_by identifica a los personajes de Starwars por grupo de cabello, y luego se selecciona el 7% de los registros en cada grupo. Esto es útil cuando desea un porcentaje establecido de grupos cuyos tamaños varían
by_hair_color <- starwars %>% group_by(hair_color)
my.sample <- sample_frac(by_hair_color, .07, replace = TRUE)
my.sample[,1:5]

## # A tibble: 5 x 5
## # Groups:   hair_color [3]
##   name       height  mass hair_color skin_color      
##   <chr>       <int> <dbl> <chr>      <chr>           
## 1 Eeth Koth     171    NA black      brown           
## 2 Dormé         165    NA brown      light           
## 3 Sebulba       112    40 none       grey, red       
## 4 Shaak Ti      178    57 none       red, blue, white
## 5 Tion Medon    206    80 none       grey

#recuentos basicos y recuentos por grupos
row.kount.only <- ChickWeight %>% tally()
row.kount.only

##     n
## 1 578

diet.kount <- ChickWeight %>% count(Diet)
diet.kount

##   Diet   n
## 1    1 220
## 2    2 120
## 3    3 120
## 4    4 118

############# 1.15 Miscellaneous DPLYR Functions

##### 1.15.1 add_count para filtrado grupal
# add_count() appends a column n with the size of each species group;
# keeping n == 1 leaves the characters that are alone in their species.
single.species.kount <- filter(add_count(starwars, species), n == 1)
single.species.kount[, 1:6]

## # A tibble: 29 x 6
##    name                  height  mass hair_color skin_color       eye_color
##    <chr>                  <int> <dbl> <chr>      <chr>            <chr>    
##  1 Greedo                   173    74 <NA>       green            black    
##  2 Jabba Desilijic Tiure    175  1358 <NA>       green-tan, brown orange   
##  3 Yoda                      66    17 white      green            brown    
##  4 Bossk                    190   113 none       green            red      
##  5 Ackbar                   180    83 none       brown mottle     orange   
##  6 Wicket Systri Warrick     88    20 brown      brown            brown    
##  7 Nien Nunb                160    68 none       grey             black    
##  8 Nute Gunray              191    90 none       mottled green    red      
##  9 Watto                    137    NA black      blue, grey       yellow   
## 10 Sebulba                  112    40 none       grey, red        orange   
## # ... with 19 more rows

##### 1.15.2 renombrar
# rename() returns a modified copy. Store it under its own name so the
# renamed column can be inspected while the built-in mtcars data stays
# intact (overwriting mtcars and reloading it would hide the rename).
mtcars.renamed <- rename(mtcars, spam_mpg = mpg)
names(mtcars.renamed)

##  [1] "spam_mpg" "cyl"      "disp"     "hp"       "drat"     "wt"
##  [7] "qsec"     "vs"       "am"       "gear"     "carb"

##### 1.15.3 case_when
#Puede crear una nueva variable que se base en una combinación compleja de variables existentes
data(starwars)
# Keep five columns and derive `type` from them: the first condition that
# holds wins, and TRUE acts as the catch-all branch.
new.starwars <- mutate(
  dplyr::select(starwars, name, mass, gender, species, height),
  type = case_when(
    height > 200 | mass > 200 ~ "large",
    species == "Droid" ~ "robot",
    TRUE ~ "other"
  )
)
new.starwars

## # A tibble: 87 x 6
##    name                mass gender    species height type 
##    <chr>              <dbl> <chr>     <chr>    <int> <chr>
##  1 Luke Skywalker        77 masculine Human      172 other
##  2 C-3PO                 75 masculine Droid      167 robot
##  3 R2-D2                 32 masculine Droid       96 robot
##  4 Darth Vader          136 masculine Human      202 large
##  5 Leia Organa           49 feminine  Human      150 other
##  6 Owen Lars            120 masculine Human      178 other
##  7 Beru Whitesun lars    75 feminine  Human      165 other
##  8 R5-D4                 32 masculine Droid       97 robot
##  9 Biggs Darklighter     84 masculine Human      183 other
## 10 Obi-Wan Kenobi        77 masculine Human      182 other
## # ... with 77 more rows

capitulo1

David Felipe García Ayala

21/10/2021

R Markdown