This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
if (!require("tidyverse")) install.packages("tidyverse")
## Loading required package: tidyverse
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.5 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidyverse)
data("mtcars")
############# 1.1 Filter commands
##### 1.1.1 Filter on a single condition
# Keep only the "mtcars" rows whose cylinder count is exactly 6.
# Inside filter(), equality is written with `==`.
six.cyl.only <- mtcars %>% filter(cyl == 6)
six.cyl.only
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
##### 1.1.2 Filter on several conditions
# Keep only the "mtcars" rows with 6 cylinders and 110 horsepower.
six.cylinders.and.110.horse.power <- mtcars %>%
  filter(cyl == 6 & hp == 110)
six.cylinders.and.110.horse.power
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
##### 1.1.3 Logical "OR" inside a filter
# Keep the "mtcars" rows that have 4 gears or more than 6 cylinders.
# NOTE(review): the object name says "more.than.8", but the condition tested
# is cyl > 6.
gear.eq.4.or.more.than.8 <- mtcars %>% filter(gear == 4 | cyl > 6)
gear.eq.4.or.more.than.8
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
##### 1.1.4 Filter on minima, maxima and other numeric criteria
# Keep the car(s) whose engine displacement equals the smallest value.
smallest.engine.displacement <- mtcars %>% filter(disp == min(disp))
smallest.engine.displacement
## mpg cyl disp hp drat wt qsec vs am gear carb
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.9 1 1 4 1
# Filter on more than one condition.
data("ChickWeight")
chick.subset <- ChickWeight %>% filter(Time < 3 & weight > 53)
chick.subset
## weight Time Chick Diet
## 1 55 2 22 2
## 2 55 2 40 3
## 3 55 2 43 4
## 4 54 2 50 4
##### 1.1.5 Filter out missing values in one specific column
data("airquality")
head(airquality, n = 10) # show the first 10 rows
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
## 7 23 299 8.6 65 5 7
## 8 19 99 13.8 59 5 8
## 9 8 19 20.1 61 5 9
## 10 NA 194 8.6 69 5 10
# Drop every row whose Ozone value is missing.
no.missing.ozone <- airquality %>% filter(!is.na(Ozone))
head(no.missing.ozone, n = 8)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 28 NA 14.9 66 5 6
## 6 23 299 8.6 65 5 7
## 7 19 99 13.8 59 5 8
## 8 8 19 20.1 61 5 9
##### 1.1.6 Filter out rows with a missing value anywhere in the data set
# complete.cases() is TRUE only for rows with no missing value in any column;
# here it is applied to the first 15 rows of airquality.
airqual.no.NA.anywhere <- airquality[1:15, ] %>% filter(complete.cases(.))
airqual.no.NA.anywhere
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 23 299 8.6 65 5 7
## 6 19 99 13.8 59 5 8
## 7 8 19 20.1 61 5 9
## 8 16 256 9.7 69 5 12
## 9 11 290 9.2 66 5 13
## 10 14 274 10.9 68 5 14
## 11 18 65 13.2 58 5 15
##### 1.1.7 Filter with "%in%"
data("iris")
table(iris[["Species"]]) # species present in "iris"
##
## setosa versicolor virginica
## 50 50 50
# Keep only the "setosa" and "virginica" species.
iris.two.species <- iris %>% filter(Species %in% c("setosa", "virginica"))
table(iris.two.species[["Species"]])
##
## setosa versicolor virginica
## 50 0 50
# Row counts before and after the filter.
nrow(iris)
nrow(iris.two.species)
## [1] 150
## [1] 100
##### 1.1.8 Filter on one condition and keep only 3 columns
data("airquality")
# Rows with Ozone above 29, restricted to the first three columns.
airqual.3.columns <- airquality %>%
  filter(Ozone > 29) %>%
  dplyr::select(1:3)
head(airqual.3.columns)
## Ozone Solar.R Wind
## 1 41 190 7.4
## 2 36 118 8.0
## 3 34 307 12.0
## 4 30 322 11.5
## 5 32 92 12.0
## 6 45 252 14.9
##### 1.1.9 Filter on how often a value occurs across all rows
# Goal: keep the "gear" values that occur in more than 10 rows. Start by
# tabulating how often each value occurs.
table(mtcars[["gear"]])
##
## 3 4 5
## 15 12 5
# Keep only the rows whose "gear" value occurs in more than 10 rows.
# ungroup() drops the grouping once the per-group filter is done, so later
# verbs applied to the result are not silently evaluated per gear.
more.frequent.no.of.gears <- mtcars %>%
  group_by(gear) %>%
  filter(n() > 10) %>%
  ungroup()
table(more.frequent.no.of.gears$gear)
##
## 3 4
## 15 12
# Keep the rows whose "gear" value occurs in more than 10 rows and whose
# horsepower is below 105.
# ungroup() drops the grouping once the per-group filter is done.
more.frequent.no.of.gears.and.low.horsepower <- mtcars %>%
  group_by(gear) %>%
  filter(n() > 10, hp < 105) %>%
  ungroup()
table(more.frequent.no.of.gears.and.low.horsepower$gear)
##
## 3 4
## 1 7
##### 1.1.10 Select columns with starts_with()
# Select the columns whose names start with "S"; first list all column names.
names(iris)
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
# Columns of iris whose names start with "S".
iris.display <- dplyr::select(iris, starts_with("S"))
head(iris.display)
## Sepal.Length Sepal.Width Species
## 1 5.1 3.5 setosa
## 2 4.9 3.0 setosa
## 3 4.7 3.2 setosa
## 4 4.6 3.1 setosa
## 5 5.0 3.6 setosa
## 6 5.4 3.9 setosa
##### 1.1.11 Filter rows on conditions over chosen columns (filter_at)
# Keep the rows where both cyl and hp equal their own column maximum.
new.mtcars <- filter_at(mtcars, vars(cyl, hp), all_vars(. == max(.)))
new.mtcars
## mpg cyl disp hp drat wt qsec vs am gear carb
## Maserati Bora 15 8 301 335 3.54 3.57 14.6 0 1 5 8
# filter_at() restricts the test to the variables whose names contain "sleep"
# (two of them in the selection used further down). all_vars(. > 5) keeps the
# rows in which every one of those variables is greater than 5; "." stands for
# each selected variable in turn.
msleep <- ggplot2::msleep
msleep
## # A tibble: 83 x 11
## name genus vore order conservation sleep_total sleep_rem sleep_cycle awake
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Cheet~ Acin~ carni Carn~ lc 12.1 NA NA 11.9
## 2 Owl m~ Aotus omni Prim~ <NA> 17 1.8 NA 7
## 3 Mount~ Aplo~ herbi Rode~ nt 14.4 2.4 NA 9.6
## 4 Great~ Blar~ omni Sori~ lc 14.9 2.3 0.133 9.1
## 5 Cow Bos herbi Arti~ domesticated 4 0.7 0.667 20
## 6 Three~ Brad~ herbi Pilo~ <NA> 14.4 2.2 0.767 9.6
## 7 North~ Call~ carni Carn~ vu 8.7 1.4 0.383 15.3
## 8 Vespe~ Calo~ <NA> Rode~ <NA> 7 NA NA 17
## 9 Dog Canis carni Carn~ domesticated 10.1 2.9 0.333 13.9
## 10 Roe d~ Capr~ herbi Arti~ lc 3 NA NA 21
## # ... with 73 more rows, and 2 more variables: brainwt <dbl>, bodywt <dbl>
# Keep name, the sleep_total:sleep_rem and brainwt:bodywt columns, then keep
# the rows in which every column containing "sleep" is greater than 5.
# select() is namespace-qualified, as elsewhere in this file, because MASS
# (attached later in the script) masks dplyr's select().
msleep.over.5 <- msleep %>%
  dplyr::select(name, sleep_total:sleep_rem, brainwt:bodywt) %>%
  filter_at(vars(contains("sleep")), all_vars(. > 5))
msleep.over.5
## # A tibble: 2 x 5
## name sleep_total sleep_rem brainwt bodywt
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Thick-tailed opposum 19.4 6.6 NA 0.37
## 2 Giant armadillo 18.1 6.1 0.081 60
############### 1.2 Arrange (sort)
# Reload msleep and show its first four columns.
msleep <- ggplot2::msleep
msleep[,1:4]
## # A tibble: 83 x 4
## name genus vore order
## <chr> <chr> <chr> <chr>
## 1 Cheetah Acinonyx carni Carnivora
## 2 Owl monkey Aotus omni Primates
## 3 Mountain beaver Aplodontia herbi Rodentia
## 4 Greater short-tailed shrew Blarina omni Soricomorpha
## 5 Cow Bos herbi Artiodactyla
## 6 Three-toed sloth Bradypus herbi Pilosa
## 7 Northern fur seal Callorhinus carni Carnivora
## 8 Vesper mouse Calomys <NA> Rodentia
## 9 Dog Canis carni Carnivora
## 10 Roe deer Capreolus herbi Artiodactyla
## # ... with 73 more rows
###### 1.2.1 Sort in ascending order
# Sort by vore, breaking ties by order.
animal.name.sequence <- msleep %>% arrange(vore, order)
animal.name.sequence[, 1:4]
## # A tibble: 83 x 4
## name genus vore order
## <chr> <chr> <chr> <chr>
## 1 Cheetah Acinonyx carni Carnivora
## 2 Northern fur seal Callorhinus carni Carnivora
## 3 Dog Canis carni Carnivora
## 4 Domestic cat Felis carni Carnivora
## 5 Gray seal Haliochoerus carni Carnivora
## 6 Tiger Panthera carni Carnivora
## 7 Jaguar Panthera carni Carnivora
## 8 Lion Panthera carni Carnivora
## 9 Caspian seal Phoca carni Carnivora
## 10 Genet Genetta carni Carnivora
## # ... with 73 more rows
###### 1.2.2 Sort in descending order
# Sort by vore ascending, then by order descending within each vore.
animal.name.sequence.desc <- msleep %>% arrange(vore, desc(order))
head(animal.name.sequence.desc[, 1:4])
## # A tibble: 6 x 4
## name genus vore order
## <chr> <chr> <chr> <chr>
## 1 Northern grasshopper mouse Onychomys carni Rodentia
## 2 Slow loris Nyctibeus carni Primates
## 3 Thick-tailed opposum Lutreolina carni Didelphimorphia
## 4 Long-nosed armadillo Dasypus carni Cingulata
## 5 Pilot whale Globicephalus carni Cetacea
## 6 Common porpoise Phocoena carni Cetacea
############### 1.3 Rename
# Rename one or more columns; first list the current column names.
names(iris)
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
# rename() takes new.name = old.name pairs; the other columns are untouched.
renamed.iris <- iris %>%
  rename(
    width.of.petals = Petal.Width,
    various.plants.and.animals = Species
  )
names(renamed.iris)
## [1] "Sepal.Length" "Sepal.Width"
## [3] "Petal.Length" "width.of.petals"
## [5] "various.plants.and.animals"
############### 1.4 Mutate
# mutate() adds new variables to a data frame.
data("ChickWeight")
ChickWeight[1:2,] # first two rows
## weight Time Chick Diet
## 1 42 0 1 1
## 2 51 2 1 1
# Add a new variable, log.of.weight, holding the base-10 logarithm of weight.
Chickweight.with.log <- ChickWeight %>% mutate(log.of.weight = log10(weight))
Chickweight.with.log[1:2, ]
## weight Time Chick Diet log.of.weight
## 1 42 0 1 1 1.623249
## 2 51 2 1 1 1.707570
##### 1.4.1 mutate_all to add several new fields at once
# Reload msleep, list its column names and print it.
msleep <- ggplot2::msleep
names(msleep)
## [1] "name" "genus" "vore" "order" "conservation"
## [6] "sleep_total" "sleep_rem" "sleep_cycle" "awake" "brainwt"
## [11] "bodywt"
msleep
## # A tibble: 83 x 11
## name genus vore order conservation sleep_total sleep_rem sleep_cycle awake
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Cheet~ Acin~ carni Carn~ lc 12.1 NA NA 11.9
## 2 Owl m~ Aotus omni Prim~ <NA> 17 1.8 NA 7
## 3 Mount~ Aplo~ herbi Rode~ nt 14.4 2.4 NA 9.6
## 4 Great~ Blar~ omni Sori~ lc 14.9 2.3 0.133 9.1
## 5 Cow Bos herbi Arti~ domesticated 4 0.7 0.667 20
## 6 Three~ Brad~ herbi Pilo~ <NA> 14.4 2.2 0.767 9.6
## 7 North~ Call~ carni Carn~ vu 8.7 1.4 0.383 15.3
## 8 Vespe~ Calo~ <NA> Rode~ <NA> 7 NA NA 17
## 9 Dog Canis carni Carn~ domesticated 10.1 2.9 0.333 13.9
## 10 Roe d~ Capr~ herbi Arti~ lc 3 NA NA 21
## # ... with 73 more rows, and 2 more variables: brainwt <dbl>, bodywt <dbl>
# Add new columns holding the square root of each of columns 6 to 11.
# A named list of functions replaces the deprecated funs(); the list name is
# appended to each source column name (e.g. "sleep_total_square root").
msleep.with.square.roots <- mutate_all(
  msleep[, 6:11],
  list("square root" = sqrt)
)
names(msleep.with.square.roots)
## [1] "sleep_total" "sleep_rem"
## [3] "sleep_cycle" "awake"
## [5] "brainwt" "bodywt"
## [7] "sleep_total_square root" "sleep_rem_square root"
## [9] "sleep_cycle_square root" "awake_square root"
## [11] "brainwt_square root" "bodywt_square root"
##### 1.4.2 mutate_at to add fields
data("Titanic")
Titanic <- as.data.frame(Titanic)# convert the table to a data frame
head(Titanic)
## Class Sex Age Survived Freq
## 1 1st Male Child No 0
## 2 2nd Male Child No 0
## 3 3rd Male Child No 35
## 4 Crew Male Child No 0
## 5 1st Female Child No 0
## 6 2nd Female Child No 0
# Add three new columns (Class_Rank, Age_Rank, Survived_Rank) holding, for
# each of Class, Age and Survived, the minimum rank of the value in descending
# order; tied values share the same rank.
# A named list with a lambda replaces the deprecated funs().
titanic.with.ranks <- mutate_at(
  Titanic,
  vars(Class, Age, Survived),
  list(Rank = ~ min_rank(desc(.)))
)
head(titanic.with.ranks)
## Class Sex Age Survived Freq Class_Rank Age_Rank Survived_Rank
## 1 1st Male Child No 0 25 17 17
## 2 2nd Male Child No 0 17 17 17
## 3 3rd Male Child No 35 9 17 17
## 4 Crew Male Child No 0 1 17 17
## 5 1st Female Child No 0 25 17 17
## 6 2nd Female Child No 0 17 17 17
##### 1.4.3 mutate_if
# mutate_if() applies a function only to the columns that pass a predicate.
# Helper: divide a number (or numeric vector) by 10.
divide.by.10 <- function(a.number) {
  a.number / 10
}
head(CO2)
## Plant Type Treatment conc uptake
## 1 Qn1 Quebec nonchilled 95 16.0
## 2 Qn1 Quebec nonchilled 175 30.4
## 3 Qn1 Quebec nonchilled 250 34.8
## 4 Qn1 Quebec nonchilled 350 37.2
## 5 Qn1 Quebec nonchilled 500 35.3
## 6 Qn1 Quebec nonchilled 675 39.2
# Divide every numeric column by 10.
new.df <- mutate_if(CO2, is.numeric, divide.by.10)
head(new.df)
## Plant Type Treatment conc uptake
## 1 Qn1 Quebec nonchilled 9.5 1.60
## 2 Qn1 Quebec nonchilled 17.5 3.04
## 3 Qn1 Quebec nonchilled 25.0 3.48
## 4 Qn1 Quebec nonchilled 35.0 3.72
## 5 Qn1 Quebec nonchilled 50.0 3.53
## 6 Qn1 Quebec nonchilled 67.5 3.92
# Small data frame with missing values, which are replaced by zero afterwards.
df <- data.frame(
  alpha = c(22, 1, NA),
  almond = c(0, 5, 10),
  grape = c(0, 2, 2),
  apple = c(NA, 5, 10)
)
df
## alpha almond grape apple
## 1 22 0 0 NA
## 2 1 5 2 5
## 3 NA 10 2 10
# Replace the missing values in every numeric column with 0.
# The replacement value is supplied through an explicit lambda rather than as
# an argument literally named `...`.
df.fix.alpha <- df %>% mutate_if(is.numeric, ~ coalesce(.x, 0))
df.fix.alpha
## alpha almond grape apple
## 1 22 0 0 0
## 2 1 5 2 5
## 3 0 10 2 10
##### 1.4.4 String detection and TRUE/FALSE duplicate indicator
# Reload msleep and tabulate the values of vore.
msleep <- ggplot2::msleep
table(msleep$vore)
##
## carni herbi insecti omni
## 19 32 5 20
# Drop the rows whose vore contains "c" or "a"; the pattern built by paste()
# is "c|a".
msleep.no.c.or.a <- msleep %>%
  filter(!str_detect(vore, paste(c("c", "a"), collapse = "|")))
table(msleep.no.c.or.a$vore)
##
## herbi omni
## 32 20
# Add a column flagging whether the row's conservation value already appeared
# in an earlier row.
msleep.with.dup.indicator <- msleep %>%
  mutate(duplicate.indicator = duplicated(conservation))
msleep.with.dup.indicator[1:6, ]
## # A tibble: 6 x 12
## name genus vore order conservation sleep_total sleep_rem sleep_cycle awake
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Cheetah Acin~ carni Carn~ lc 12.1 NA NA 11.9
## 2 Owl mo~ Aotus omni Prim~ <NA> 17 1.8 NA 7
## 3 Mounta~ Aplo~ herbi Rode~ nt 14.4 2.4 NA 9.6
## 4 Greate~ Blar~ omni Sori~ lc 14.9 2.3 0.133 9.1
## 5 Cow Bos herbi Arti~ domesticated 4 0.7 0.667 20
## 6 Three-~ Brad~ herbi Pilo~ <NA> 14.4 2.2 0.767 9.6
## # ... with 3 more variables: brainwt <dbl>, bodywt <dbl>,
## # duplicate.indicator <lgl>
# Same indicator again, this time displaying only columns 1 to 3 and 12.
msleep.with.dup.indicator <- msleep %>%
  mutate(duplicate.indicator = duplicated(conservation))
msleep.with.dup.indicator[1:6, c(1:3, 12)]
## # A tibble: 6 x 4
## name genus vore duplicate.indicator
## <chr> <chr> <chr> <lgl>
## 1 Cheetah Acinonyx carni FALSE
## 2 Owl monkey Aotus omni FALSE
## 3 Mountain beaver Aplodontia herbi FALSE
## 4 Greater short-tailed shrew Blarina omni TRUE
## 5 Cow Bos herbi FALSE
## 6 Three-toed sloth Bradypus herbi TRUE
# Flag a row as duplicated only when its (conservation, genus) pair already
# appeared in an earlier row, then sort by conservation (major key) and genus
# (minor key).
# duplicated() treats its second argument as `incomparables`, not as a second
# key, so both columns are combined into one data frame and compared row-wise.
msleep.with.dup.indicator2 <- msleep %>%
  mutate(duplicate.indicator = duplicated(data.frame(conservation, genus))) %>%
  arrange(conservation, genus)
msleep.with.dup.indicator2
## # A tibble: 83 x 12
## name genus vore order conservation sleep_total sleep_rem sleep_cycle awake
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Giraf~ Gira~ herbi Arti~ cd 1.9 0.4 NA 22.1
## 2 Pilot~ Glob~ carni Ceta~ cd 2.7 0.1 NA 21.4
## 3 Cow Bos herbi Arti~ domesticated 4 0.7 0.667 20
## 4 Dog Canis carni Carn~ domesticated 10.1 2.9 0.333 13.9
## 5 Guine~ Cavis herbi Rode~ domesticated 9.4 0.8 0.217 14.6
## 6 Chinc~ Chin~ herbi Rode~ domesticated 12.5 1.5 0.117 11.5
## 7 Horse Equus herbi Peri~ domesticated 2.9 0.6 1 21.1
## 8 Donkey Equus herbi Peri~ domesticated 3.1 0.4 NA 20.9
## 9 Domes~ Felis carni Carn~ domesticated 12.5 3.2 0.417 11.5
## 10 Rabbit Oryc~ herbi Lago~ domesticated 8.4 0.9 0.417 15.6
## # ... with 73 more rows, and 3 more variables: brainwt <dbl>, bodywt <dbl>,
## # duplicate.indicator <lgl>
# Another example: a small data frame with a repeated fruit value.
fruit <- c("apple", "pear", "orange", "grape", "orange", "orange")
x <- c(1, 2, 4, 9, 4, 6)
y <- c(22, 3, 4, 55, 15, 9)
z <- c(3, 1, 4, 10, 12, 8)
w <- c(2, 2, 2, 4, 5, 6)
df <- data.frame(fruit = fruit, x = x, y = y, z = z, w = w)
df
## fruit x y z w
## 1 apple 1 22 3 2
## 2 pear 2 3 1 2
## 3 orange 4 4 4 2
## 4 grape 9 55 10 4
## 5 orange 4 15 12 5
## 6 orange 6 9 8 6
# Flag the repeated values of the fruit column.
df.show.single.dup <- df %>% mutate(duplicate.indicator = duplicated(fruit))
df.show.single.dup
## fruit x y z w duplicate.indicator
## 1 apple 1 22 3 2 FALSE
## 2 pear 2 3 1 2 FALSE
## 3 orange 4 4 4 2 FALSE
## 4 grape 9 55 10 4 FALSE
## 5 orange 4 15 12 5 TRUE
## 6 orange 6 9 8 6 TRUE
# Note: duplicated() marks the first occurrence of a value as FALSE, because at that point no earlier row holds the same value. Every later occurrence of that value is marked TRUE.
##### 1.4.5 Remove variables with NULL
fruit <- c("apple", "pear", "orange", "grape", "orange", "orange")
x <- c(1, 2, 4, 9, 4, 6)
y <- c(22, 3, 4, 55, 15, 9)
z <- c(3, 1, 4, 10, 12, 8)
df <- data.frame(fruit = fruit, x = x, y = y, z = z)
df <- df %>% mutate(z = NULL) # drops the variable z
df
## fruit x y
## 1 apple 1 22
## 2 pear 2 3
## 3 orange 4 4
## 4 grape 9 55
## 5 orange 4 15
## 6 orange 6 9
##### 1.4.6 Preferred coding sequence
# Install nycflights13 if it is missing and attach it either way, so that
# `flights` is available even on the run that performs the installation.
if (!require("nycflights13")) {
  install.packages("nycflights13")
  library(nycflights13)
}
## Loading required package: nycflights13
# Not recommended: every new variable is created inside one mutate() call.
# NOTE(review): gain_per_minute is computed as 60 * gain_per_hour; confirm
# that this is the intended formula for a per-minute value.
mutate(flights,
  gain = arr_delay - dep_delay,
  hours = air_time / 60,
  gain_per_hour = gain / hours,
  gain_per_minute = 60 * gain_per_hour
)
## # A tibble: 336,776 x 23
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ... with 336,766 more rows, and 15 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## # gain <dbl>, hours <dbl>, gain_per_hour <dbl>, gain_per_minute <dbl>
# Recommended: build the variables step by step, one mutate() per stage.
newfield.flights <- flights %>%
  mutate(gain = arr_delay - dep_delay, hours = air_time / 60) %>%
  mutate(gain_per_hour = gain / hours) %>%
  mutate(gain_per_minute = 60 * gain_per_hour)
newfield.flights[1:6, c(1:2, 20:23)]
## # A tibble: 6 x 6
## year month gain hours gain_per_hour gain_per_minute
## <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 9 3.78 2.38 143.
## 2 2013 1 16 3.78 4.23 254.
## 3 2013 1 31 2.67 11.6 698.
## 4 2013 1 -17 3.05 -5.57 -334.
## 5 2013 1 -19 1.93 -9.83 -590.
## 6 2013 1 16 2.5 6.4 384
##### 1.4.7 Transmute: keep only the newly created variables
# Create new data from calculations on existing variables.
fruit <- c("apple", "pear", "orange", "grape", "orange", "orange")
x <- c(1, 2, 4, 9, 4, 6)
y <- c(22, 3, 4, 55, 15, 9)
z <- c(3, 1, 4, 10, 12, 8)
df <- data.frame(fruit = fruit, x = x, y = y, z = z)
df
## fruit x y z
## 1 apple 1 22 3
## 2 pear 2 3 1
## 3 orange 4 4 4
## 4 grape 9 55 10
## 5 orange 4 15 12
## 6 orange 6 9 8
# transmute() keeps only the variable it creates: the sum of x, y and z.
df <- df %>% transmute(new.variable = x + y + z)
df
## new.variable
## 1 26
## 2 6
## 3 12
## 4 74
## 5 31
## 6 23
##### 1.4.8 Use across() to apply a function to several columns
# Helper: multiply its input by 2; it is applied to the numeric columns.
double.it <- function(x) {
  x * 2
}
head(iris) # original iris
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Double every numeric column of iris and show the first rows.
head(mutate(iris, across(where(is.numeric), double.it)))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 10.2 7.0 2.8 0.4 setosa
## 2 9.8 6.0 2.8 0.4 setosa
## 3 9.4 6.4 2.6 0.4 setosa
## 4 9.2 6.2 3.0 0.4 setosa
## 5 10.0 7.2 2.8 0.4 setosa
## 6 10.8 7.8 3.4 0.8 setosa
##### 1.4.9 Conditional mutation with case_when
# row1 holds the column names; row2 to row5 hold the data rows.
row1 <- c("a", "b", "c", "d", "e", "f", "column.to.be.changed")
row2 <- c(1, 1, 1, 6, 6, 1, 2)
row3 <- c(3, 4, 4, 6, 4, 4, 4)
row4 <- c(4, 6, 25, 5, 5, 2, 9)
row5 <- c(5, 3, 6, 3, 3, 6, 2)
df <- setNames(as.data.frame(rbind(row2, row3, row4, row5)), row1)
df
## a b c d e f column.to.be.changed
## row2 1 1 1 6 6 1 2
## row3 3 4 4 6 4 4 4
## row4 4 6 25 5 5 2 9
## row5 5 3 6 3 3 6 2
# The last column becomes 2 when the first set of "OR" conditions holds, 3 when
# the second set holds, and NA when neither does.
new.df <- df %>%
  mutate(
    column.to.be.changed = case_when(
      a == 2 | a == 5 | a == 7 | (a == 1 & b == 4) ~ 2,
      a == 0 | a == 1 | a == 4 | a == 3 | c == 4 ~ 3,
      TRUE ~ NA_real_
    )
  )
new.df
## a b c d e f column.to.be.changed
## row2 1 1 1 6 6 1 3
## row3 3 4 4 6 4 4 3
## row4 4 6 25 5 5 2 3
## row5 5 3 6 3 3 6 2
############### 1.5 select() to choose variables or columns
##### 1.5.1 Drop a column
fruit <- c("apple", "pear", "orange", "grape", "orange", "orange")
x <- c(1, 2, 4, 9, 4, 6)
y <- c(22, 3, 4, 55, 15, 9)
z <- c(3, 1, 4, 10, 12, 8)
df <- data.frame(fruit = fruit, x = x, y = y, z = z)
df
## fruit x y z
## 1 apple 1 22 3
## 2 pear 2 3 1
## 3 orange 4 4 4
## 4 grape 9 55 10
## 5 orange 4 15 12
## 6 orange 6 9 8
# A minus sign in front of a variable drops it from the selection.
new.df.no.fruit <- df %>% dplyr::select(-fruit)
new.df.no.fruit
## x y z
## 1 1 22 3
## 2 2 3 1
## 3 4 4 4
## 4 9 55 10
## 5 4 15 12
## 6 6 9 8
##### 1.5.2 Drop columns by name with starts_with or ends_with
# Reload mtcars and list its column names.
data("mtcars")
names(mtcars)
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
# Drop the columns whose names start with "d".
# select() is namespace-qualified, as elsewhere in this file, because MASS
# (attached later in the script) masks dplyr's select().
mtcars.no.col.names.start.with.d <- dplyr::select(mtcars, -starts_with("d"))
names(mtcars.no.col.names.start.with.d)
## [1] "mpg" "cyl" "hp" "wt" "qsec" "vs" "am" "gear" "carb"
# Drop the columns whose names end with "t".
# select() is namespace-qualified, as elsewhere in this file, because MASS
# (attached later in the script) masks dplyr's select().
mtcars.no.col.names.ends.with <- dplyr::select(mtcars, -ends_with("t"))
names(mtcars.no.col.names.ends.with)
## [1] "mpg" "cyl" "disp" "hp" "qsec" "vs" "am" "gear" "carb"
##### 1.5.3 Order the columns
# NOTE(review): this section only builds and prints the data frame; no column
# reordering is performed here.
fruit <- c("apple", "pear", "orange", "grape", "orange", "orange")
x <- c(1, 2, 4, 9, 4, 6)
y <- c(22, 3, 4, 55, 15, 9)
z <- c(3, 1, 4, 10, 12, 8)
df <- data.frame(fruit = fruit, x = x, y = y, z = z)
df
## fruit x y z
## 1 apple 1 22 3
## 2 pear 2 3 1
## 3 orange 4 4 4
## 4 grape 9 55 10
## 5 orange 4 15 12
## 6 orange 6 9 8
##### 1.5.4 select_all to apply a function to every column name
state <- c("Maryland", "Alaska", "New Jersey")
income <- c(76067, 74444, 73702)
median.us <- c(61372, 61372, 61372)
life.expectancy <- c(78.8, 78.3, 80.3)
top.3.states <- data.frame(
  state = state,
  income = income,
  median.us = median.us,
  life.expectancy = life.expectancy
)
top.3.states
## state income median.us life.expectancy
## 1 Maryland 76067 61372 78.8
## 2 Alaska 74444 61372 78.3
## 3 New Jersey 73702 61372 80.3
# Upper-case the column names by passing toupper to select_all().
new.top.3.states <- select_all(top.3.states, toupper)
new.top.3.states
## STATE INCOME MEDIAN.US LIFE.EXPECTANCY
## 1 Maryland 76067 61372 78.8
## 2 Alaska 74444 61372 78.3
## 3 New Jersey 73702 61372 80.3
##### 1.5.5 Select a column with pull()
# Rebuild the data frame from the state, income, median.us and
# life.expectancy vectors.
top.3.states <- data.frame(
  state = state,
  income = income,
  median.us = median.us,
  life.expectancy = life.expectancy
)
top.3.states
## state income median.us life.expectancy
## 1 Maryland 76067 61372 78.8
## 2 Alaska 74444 61372 78.3
## 3 New Jersey 73702 61372 80.3
# Extract the first column as a vector.
pull.first.column <- top.3.states %>% pull(1)
pull.first.column
## [1] "Maryland" "Alaska" "New Jersey"
# A position of -1 counts from the right, so this extracts the last column.
pull.last.column <- top.3.states %>% pull(-1)
pull.last.column
## [1] 78.8 78.3 80.3
##### 1.5.6 Select rows: any variable meets a condition
nrow(mtcars)
## [1] 32
# Keep the rows in which at least one cell exceeds 200.
mtcars.more.than.200 <- mtcars %>% filter_all(any_vars(. > 200))
# Number of rows holding a value greater than 200.
nrow(mtcars.more.than.200)
## [1] 16
##### 1.5.7 Select columns: skip a column when its name contains given characters
# Keep only the columns whose names do not contain "p"; first list all names.
names(mtcars)
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
# Drop every column whose name contains "p".
cars.with.no.p <- dplyr::select(mtcars, -contains("p"))
names(cars.with.no.p) # no column containing "p" is selected
## [1] "cyl" "drat" "wt" "qsec" "vs" "am" "gear" "carb"
##### 1.5.8 Select using wildcard matching
# List the column names before matching against them.
names(mtcars)
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
# matches() is more general than contains() because it takes a regular
# expression and is therefore more flexible.
# select() is namespace-qualified, as elsewhere in this file, because MASS
# (attached later in the script) masks dplyr's select().
subset.mtcars <- dplyr::select(mtcars, matches("pg|gea"))
names(subset.mtcars)
## [1] "mpg" "gear"
subset.mtcars
## mpg gear
## Mazda RX4 21.0 4
## Mazda RX4 Wag 21.0 4
## Datsun 710 22.8 4
## Hornet 4 Drive 21.4 3
## Hornet Sportabout 18.7 3
## Valiant 18.1 3
## Duster 360 14.3 3
## Merc 240D 24.4 4
## Merc 230 22.8 4
## Merc 280 19.2 4
## Merc 280C 17.8 4
## Merc 450SE 16.4 3
## Merc 450SL 17.3 3
## Merc 450SLC 15.2 3
## Cadillac Fleetwood 10.4 3
## Lincoln Continental 10.4 3
## Chrysler Imperial 14.7 3
## Fiat 128 32.4 4
## Honda Civic 30.4 4
## Toyota Corolla 33.9 4
## Toyota Corona 21.5 3
## Dodge Challenger 15.5 3
## AMC Javelin 15.2 3
## Camaro Z28 13.3 3
## Pontiac Firebird 19.2 3
## Fiat X1-9 27.3 4
## Porsche 914-2 26.0 5
## Lotus Europa 30.4 5
## Ford Pantera L 15.8 5
## Ferrari Dino 19.7 5
## Maserati Bora 15.0 5
## Volvo 142E 21.4 4
############## 1.6 Joins: manipulating data from two sources
##### 1.6.1 Left join
# data.frame() keeps state.area numeric; going through cbind() would first
# build a character matrix and turn the areas into strings.
us.state.areas <- data.frame(state.abb, state.area)
us.state.areas[1:3, ]
## state.abb state.area
## 1 AL 51609
## 2 AK 589757
## 3 AZ 113909
# Lookup table pairing each state abbreviation with the state name.
us.state.abbreviation.and.name <- data.frame(state.abb, state.name)
us.state.abbreviation.and.name[1:3, ]
## state.abb state.name
## 1 AL Alabama
## 2 AK Alaska
## 3 AZ Arizona
# Combine the two tables, matching rows on the state abbreviation.
state.info.abb.area.name <- left_join(
  us.state.areas,
  us.state.abbreviation.and.name,
  by = "state.abb"
)
head(state.info.abb.area.name)
## state.abb state.area state.name
## 1 AL 51609 Alabama
## 2 AK 589757 Alaska
## 3 AZ 113909 Arizona
## 4 AR 53104 Arkansas
## 5 CA 158693 California
## 6 CO 104247 Colorado
##### 1.6.2 Inner join
# First data frame.
names <- c("Sally", "Tom", "Frieda", "Alfonzo")
team.scores <- c(3, 5, 2, 7)
team.league <- c("alpha", "beta", "gamma", "omicron")
team.info <- data.frame(
  names = names,
  team.scores = team.scores,
  team.league = team.league
)
# Second data frame.
names <- c("Sally", "Tom", "Bill", "Alfonzo")
school.grades <- c("A", "B", "C", "B")
school.info <- data.frame(names = names, school.grades = school.grades)
# Keep only the names present in both data frames.
school.and.team <- team.info %>% inner_join(school.info, by = "names")
school.and.team
## names team.scores team.league school.grades
## 1 Sally 3 alpha A
## 2 Tom 5 beta B
## 3 Alfonzo 7 omicron B
##### 1.6.3 Anti-join
# First data frame.
names <- c("Sally", "Tom", "Frieda", "Alfonzo")
team.scores <- c(3, 5, 2, 7)
team.league <- c("alpha", "beta", "gamma", "omicron")
team.info <- data.frame(
  names = names,
  team.scores = team.scores,
  team.league = team.league
)
team.info
## names team.scores team.league
## 1 Sally 3 alpha
## 2 Tom 5 beta
## 3 Frieda 2 gamma
## 4 Alfonzo 7 omicron
# Second data frame.
names <- c("Sally", "Tom", "Bill", "Alfonzo")
school.grades <- c("A", "B", "C", "B")
school.info <- data.frame(names = names, school.grades = school.grades)
school.info
## names school.grades
## 1 Sally A
## 2 Tom B
## 3 Bill C
## 4 Alfonzo B
# Keep the rows of the first data frame that have no match in the second.
team.info.but.no.grades <- team.info %>% anti_join(school.info, by = "names")
team.info.but.no.grades
## names team.scores team.league
## 1 Frieda 2 gamma
##### 1.6.4 Full join
# First data frame.
names <- c("Sally", "Tom", "Frieda", "Alfonzo")
team.scores <- c(3, 5, 2, 7)
team.league <- c("alpha", "beta", "gamma", "omicron")
team.info <- data.frame(names, team.scores, team.league)
# Second data frame.
names <- c("Sally", "Tom", "Bill", "Alfonzo")
school.grades <- c("A", "B", "C", "B")
school.info <- data.frame(names, school.grades)
# Join the two tables, keeping the rows that have no match as well.
team.info.and.or.grades <- team.info %>% full_join(school.info, by = "names")
team.info.and.or.grades
## names team.scores team.league school.grades
## 1 Sally 3 alpha A
## 2 Tom 5 beta B
## 3 Frieda 2 gamma <NA>
## 4 Alfonzo 7 omicron B
## 5 Bill NA <NA> C
##### 1.6.5 Semi-join
# Keep every row of the first data set that has a match in the second one.
# The key is named explicitly, like in the other joins of this file, instead
# of relying on the implicit join over all shared column names.
team.info.with.grades <- semi_join(team.info, school.info, by = "names")
team.info.with.grades
## names team.scores team.league
## 1 Sally 3 alpha
## 2 Tom 5 beta
## 3 Alfonzo 7 omicron
##### 1.6.6 Right join
# data.frame() keeps state.area numeric; going through cbind() would first
# build a character matrix and turn the areas into strings.
us.state.areas <- data.frame(state.abb, state.area)
us.state.areas[1:3, ]
## state.abb state.area
## 1 AL 51609
## 2 AK 589757
## 3 AZ 113909
# Lookup table pairing each state abbreviation with the state name.
us.state.abbreviation.and.name <- data.frame(state.abb, state.name)
us.state.abbreviation.and.name[1:3, ]
## state.abb state.name
## 1 AL Alabama
## 2 AK Alaska
## 3 AZ Arizona
# Overwrite the first abbreviation so that it no longer matches any row of
# us.state.areas.
us.state.abbreviation.and.name[1, 1] <- "Intentional Mismatch"
# Right join: rows of the second table with no match in the first one get NA
# in the columns coming from the first table.
us.state.with.abbreviation.and.name.and.area <- us.state.areas %>%
  right_join(us.state.abbreviation.and.name, by = "state.abb")
us.state.with.abbreviation.and.name.and.area[1:3, ]
## state.abb state.area state.name
## 1 AK 589757 Alaska
## 2 AZ 113909 Arizona
## 3 AR 53104 Arkansas
############ 1.7 Slice
# Reload msleep and show how many rows it has.
msleep <- ggplot2::msleep
nrow(msleep)
## [1] 83
# Negative positions drop rows: remove row 6 through the last row, n().
msleep.only.first.5 <- msleep %>% slice(-(6:n()))
nrow(msleep.only.first.5)
## [1] 5
# Keep rows 20 to 39.
msleep.20.rows <- slice(msleep, 20:39)
nrow(msleep.20.rows)
## [1] 20
# Difference in row count between the two data frames.
nrow(msleep) - nrow(msleep.20.rows)
## [1] 63
############ 1.8 Summarise
# Attaching MASS after dplyr masks dplyr's select(), as the attach message
# reports.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Work on a copy of the gehan data set.
data(gehan)
gehan2 <- gehan
library(tidyverse)
summarise(gehan2, kount = n()) # number of rows
## kount
## 1 42
# Row count for each value of "treat".
summarise(group_by(gehan2, treat), kount = n())
## # A tibble: 2 x 2
## treat kount
## <fct> <int>
## 1 6-MP 21
## 2 control 21
# Location and spread of the remission time within each treatment group.
summarise(
  group_by(gehan2, treat),
  average.remiss.time = mean(time),
  median.remiss.time = median(time),
  std.dev.remiss.time = sd(time),
  median.abs.deviation = mad(time),
  IQR.remiss.time = IQR(time)
)
## # A tibble: 2 x 6
## treat average.remiss.time median.remiss.t~ std.dev.remiss.~ median.abs.devi~
## <fct> <dbl> <int> <dbl> <dbl>
## 1 6-MP 17.1 16 10.0 10.4
## 2 control 8.67 8 6.47 5.93
## # ... with 1 more variable: IQR.remiss.time <dbl>
# Shortest and longest remission time within each treatment group.
summarise(
  group_by(gehan2, treat),
  minimum.remission = min(time),
  max.remission = max(time)
)
## # A tibble: 2 x 3
## treat minimum.remission max.remission
## <fct> <int> <int>
## 1 6-MP 6 35
## 2 control 1 23
##### 1.8.1 Summarise across
library(MASS)
# Work with the first ten rows of the survey data to keep the example small.
subset.survey <- head(survey, 10)
library(dplyr)
# Preview the first six rows.
head(subset.survey, 6)
## Sex Wr.Hnd NW.Hnd W.Hnd Fold Pulse Clap Exer Smoke Height M.I
## 1 Female 18.5 18.0 Right R on L 92 Left Some Never 173.00 Metric
## 2 Male 19.5 20.5 Left R on L 104 Left None Regul 177.80 Imperial
## 3 Male 18.0 13.3 Right L on R 87 Neither None Occas NA <NA>
## 4 Male 18.8 18.9 Right R on L NA Neither None Never 160.00 Metric
## 5 Male 20.0 20.0 Right Neither 35 Right Some Never 165.00 Metric
## 6 Female 18.0 17.7 Right L on R 64 Right Some Never 172.72 Imperial
## Age
## 1 18.250
## 2 17.583
## 3 16.917
## 4 20.333
## 5 23.667
## 6 21.000
# Per-sex mean of every numeric column, computed only on rows without any NA.
# Result columns are named "mean_<original column>".
head(
  summarise(
    group_by(na.omit(subset.survey), Sex),
    across(where(is.numeric), mean, .names = "mean_{col}")
  )
)
## # A tibble: 2 x 6
## Sex mean_Wr.Hnd mean_NW.Hnd mean_Pulse mean_Height mean_Age
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Female 17.8 17.7 76.7 168. 25.0
## 2 Male 19.1 19.2 76.8 174. 20.3
#contar las combinaciones entre variables
# Group msleep by diet type and taxonomic order, then count the rows in
# every (vore, order) combination.
new.sleep <- group_by(msleep, vore, order)
s <- new.sleep %>% summarise(n())
## `summarise()` has grouped output by 'vore'. You can override using the `.groups` argument.
s
## # A tibble: 32 x 3
## # Groups: vore [5]
## vore order `n()`
## <chr> <chr> <int>
## 1 carni Carnivora 12
## 2 carni Cetacea 3
## 3 carni Cingulata 1
## 4 carni Didelphimorphia 1
## 5 carni Primates 1
## 6 carni Rodentia 1
## 7 herbi Artiodactyla 5
## 8 herbi Diprotodontia 1
## 9 herbi Hyracoidea 2
## 10 herbi Lagomorpha 1
## # ... with 22 more rows
#totales
# Same per-combination totals, built in a single expression.
new.sleep.totals <- summarise(group_by(msleep, vore, order), n())
## `summarise()` has grouped output by 'vore'. You can override using the `.groups` argument.
new.sleep.totals
## # A tibble: 32 x 3
## # Groups: vore [5]
## vore order `n()`
## <chr> <chr> <int>
## 1 carni Carnivora 12
## 2 carni Cetacea 3
## 3 carni Cingulata 1
## 4 carni Didelphimorphia 1
## 5 carni Primates 1
## 6 carni Rodentia 1
## 7 herbi Artiodactyla 5
## 8 herbi Diprotodontia 1
## 9 herbi Hyracoidea 2
## 10 herbi Lagomorpha 1
## # ... with 22 more rows
############# 1.9 Gathering: convertir varias columnas en una
# Source vectors for a small wide-format table: one row per state and one
# teen-birth-rate column per year.
state <- c("Maryland", "Alaska", "New Jersey")
income <- c(76067, 74444, 73702)
median.us <- c(61372, 61372, 61372)
life.expectancy <- c(78.8, 78.3, 80.3)
teen.birth.rate.2015 <- c(17, 29.3, 12.1)
teen.birth.rate.2007 <- c(34.3, 42.9, 24.9)
teen.birth.rate.1991 <- c(54.1, 66, 41.3)
# check.names = FALSE keeps the year columns named exactly "2015", "2007"
# and "1991" instead of letting data.frame() rewrite them.
top.3.states <- data.frame(
  state = state,
  income = income,
  median.us = median.us,
  life.expectancy = life.expectancy,
  "2015" = teen.birth.rate.2015,
  "2007" = teen.birth.rate.2007,
  "1991" = teen.birth.rate.1991,
  check.names = FALSE
)
top.3.states
## state income median.us life.expectancy 2015 2007 1991
## 1 Maryland 76067 61372 78.8 17.0 34.3 54.1
## 2 Alaska 74444 61372 78.3 29.3 42.9 66.0
## 3 New Jersey 73702 61372 80.3 12.1 24.9 41.3
# combinar las columnas de años
# Stack the three year columns into "year"/"cases" pairs (wide to long);
# all other columns are repeated for every year.
new.top.3.states <- gather(top.3.states, key = "year", value = "cases",
                           "2015", "2007", "1991")
new.top.3.states
## state income median.us life.expectancy year cases
## 1 Maryland 76067 61372 78.8 2015 17.0
## 2 Alaska 74444 61372 78.3 2015 29.3
## 3 New Jersey 73702 61372 80.3 2015 12.1
## 4 Maryland 76067 61372 78.8 2007 34.3
## 5 Alaska 74444 61372 78.3 2007 42.9
## 6 New Jersey 73702 61372 80.3 2007 24.9
## 7 Maryland 76067 61372 78.8 1991 54.1
## 8 Alaska 74444 61372 78.3 1991 66.0
## 9 New Jersey 73702 61372 80.3 1991 41.3
############# 1.10 Spreading: Consolidacion de varias filas en una
# Small example table with one missing Answer.
# tibble() replaces data_frame(), which is deprecated and emitted a
# lifecycle warning here.
df_1 <- tibble(Type = c("TypeA", "TypeA", "TypeB", "TypeB"),
               Answer = c("Yes", "No", NA, "No"), n = 1:4)
df_1 #before
## # A tibble: 4 x 3
## Type Answer n
## <chr> <chr> <int>
## 1 TypeA Yes 1
## 2 TypeA No 2
## 3 TypeB <NA> 3
## 4 TypeB No 4
# Drop the rows without an Answer, then give each Answer value its own
# column holding the matching n (long to wide).
df_2 <- spread(filter(df_1, !is.na(Answer)), key = Answer, value = n)
df_2
## # A tibble: 2 x 3
## Type No Yes
## <chr> <int> <int>
## 1 TypeA 2 1
## 2 TypeB 4 NA
######### 1.11 Separate: Divide una columna en varias
# Rebuild the example table, this time with the three teen birth rates
# packed into one "//"-delimited string per state.
state <- c("Maryland", "Alaska", "New Jersey")
income <- c(76067, 74444, 73702)
median.us <- c(61372, 61372, 61372)
life.expectancy <- c(78.8, 78.3, 80.3)
# NOTE(review): Alaska's first value is 29.0 here but 29.3 in the gathering
# example earlier in this file -- confirm which figure is intended.
teen.birth <- c("17//34.3//54.1", "29.0//42.9//66.0", "12.1//24.9//41.3")
top.3.states <- data.frame(
  state = state,
  income = income,
  median.us = median.us,
  life.expectancy = life.expectancy,
  teen.birth = teen.birth
)
top.3.states
## state income median.us life.expectancy teen.birth
## 1 Maryland 76067 61372 78.8 17//34.3//54.1
## 2 Alaska 74444 61372 78.3 29.0//42.9//66.0
## 3 New Jersey 73702 61372 80.3 12.1//24.9//41.3
# Split teen.birth on "//" into one column per year; the pieces stay
# character strings.
top.3.states.separated.years <- separate(top.3.states, teen.birth,
                                         into = c("2015", "2007", "1991"),
                                         sep = "//")
top.3.states.separated.years
## state income median.us life.expectancy 2015 2007 1991
## 1 Maryland 76067 61372 78.8 17 34.3 54.1
## 2 Alaska 74444 61372 78.3 29.0 42.9 66.0
## 3 New Jersey 73702 61372 80.3 12.1 24.9 41.3
######### 1.12 Recap of Handy DPLYR Functions
##### 1.12.2 Recuentos basicos
# Because new.sleep is grouped, n() is the size of each row's group; store it
# in kount and show a few identifying columns next to it.
m <- new.sleep %>% mutate(kount = n())
m[1:5, c(1:4, 10:12)]
## # A tibble: 5 x 7
## # Groups: vore, order [5]
## name genus vore order brainwt bodywt kount
## <chr> <chr> <chr> <chr> <dbl> <dbl> <int>
## 1 Cheetah Acinonyx carni Carnivora NA 50 12
## 2 Owl monkey Aotus omni Primates 0.0155 0.48 10
## 3 Mountain beaver Aplodontia herbi Rodentia NA 1.35 16
## 4 Greater short-tailed shrew Blarina omni Soricomorp~ 0.00029 0.019 3
## 5 Cow Bos herbi Artiodacty~ 0.423 600 5
# Keep only the rows that belong to groups with more than 14 members.
f <- new.sleep %>% filter(n() > 14)
f[1:5, c(1:4, 10:11)]
## # A tibble: 5 x 6
## # Groups: vore, order [1]
## name genus vore order brainwt bodywt
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 Mountain beaver Aplodontia herbi Rodentia NA 1.35
## 2 Guinea pig Cavis herbi Rodentia 0.0055 0.728
## 3 Chinchilla Chinchilla herbi Rodentia 0.0064 0.42
## 4 Western american chipmunk Eutamias herbi Rodentia NA 0.071
## 5 Mongolian gerbil Meriones herbi Rodentia NA 0.053
##### 1.12.3 Funciones Nth
# Five labels used to demonstrate dplyr's positional helpers.
# first(): the first element of the vector.
salary.description <- c("Golden parachute type","Well to do",
"Average","Below average", "bring date seeds instead of flowers")
first(salary.description)
## [1] "Golden parachute type"
# last(): the final element of the vector.
last(salary.description)
## [1] "bring date seeds instead of flowers"
# nth() with a negative position counts from the end: -3 is the third element
# from the last. In this 5-element vector that is also the third from the start.
nth(salary.description, -3)
## [1] "Average"
# nth() with a positive position: the second element of the vector.
nth(salary.description,2)
## [1] "Well to do"
##### 1.12.4 Contar valores distintos
#valores unicos en un vector
a.vector <- c(22,33,44,1,2,3,3,3,4)
original.length <- length(a.vector)
original.length
## [1] 9
#numeros de elementos distintos
distinct.a.vector <- n_distinct(a.vector)
distinct.a.vector
## [1] 7
# Compare the total length with the number of distinct values to report
# whether the vector contains duplicates.
# Both labels are single-line literals: a literal broken across source lines
# would embed a newline in the resulting text.
test1 <- if_else(original.length == distinct.a.vector,
                 "all values unique", "some duplicate values in vector")
test1
## [1] "some duplicate values in vector"
b.vector <- c(1,2,3,4,5,6)
length(b.vector)
## [1] 6
distinct.b.vector <- n_distinct(b.vector)
distinct.b.vector #show count (length) of distinct numbers
## [1] 6
# Same duplicate check for b.vector. The label is kept on one line so the
# result no longer carries an embedded "\n".
test2 <- if_else(length(b.vector) == distinct.b.vector,
                 "all values unique", "duplicates")
test2
## [1] "all values unique"
##### 1.12.5 na_if
test <- c(100, 0, 999)
x <- 5000/test
x
## [1] 50.000000 Inf 5.005005
# na_if() first turns every 0 in test into NA, so the division yields NA
# for that element instead of Inf.
x <- 5000/na_if(test,0)
x
## [1] 50.000000 NA 5.005005
class(x) #tipo de variable
## [1] "numeric"
##### 1.12.6 Coalesce para remplazar valores faltantes
# Vector with one missing value.
x <- c(33,4,11,NA,9)
x
## [1] 33 4 11 NA 9
# coalesce() replaces the NA with the fallback value 0 and leaves the
# other elements unchanged.
x <- coalesce(x,0)
x
## [1] 33 4 11 0 9
############ 1.13 Funciones de clasificación
y <- c(100,4,12,6,8,3)
# row_number() returns the rank of each element (1 = smallest), in the
# original element order.
rank1 <- row_number(y)
rank1
## [1] 6 2 5 3 4 1
# A rank is not a position, so look values up by matching on the rank
# instead of using the rank as an index (that only worked by coincidence
# for this particular vector).
y[rank1 == 1] # smallest value: the element whose rank is 1
## [1] 3
y[rank1 == length(y)] # largest value: the element with the highest rank
## [1] 100
##### 1.13.2 Rango mínimo
# min_rank(): rank of each element from smallest to largest; tied values
# share the lowest rank of their group and the following ranks are skipped.
# y has no ties, so the result equals row_number(y).
rank2 <- min_rank(y)
rank2
## [1] 6 2 5 3 4 1
##### 1.13.3 Rango denso
# dense_rank(): like min_rank() but without gaps after tied values.
# y has no ties, so the result is again the same as row_number(y).
rank3 <- dense_rank(y)
rank3
## [1] 6 2 5 3 4 1
##### 1.13.4 Rango porcentual
# percent_rank(): min_rank rescaled to the interval [0, 1], so the smallest
# value gets 0 and the largest gets 1.
rank4 <- percent_rank(y)
rank4
## [1] 1.0 0.2 0.8 0.4 0.6 0.0
##### 1.13.5 Función de distribución acumulativa
y <- c(100,4,12,6,8,3)
# cume_dist(): for each element, the proportion of all values that are less
# than or equal to it.
rank5 <- cume_dist(y)
rank5
## [1] 1.0000000 0.3333333 0.8333333 0.5000000 0.6666667 0.1666667
# ntile(): split the input vector into 3 buckets by rank and return the
# bucket number of each element. Assignment uses `<-` like the rest of the file.
rank6 <- ntile(y, 3)
rank6
## [1] 3 1 3 2 2 1
# Deciles (0%, 10%, ..., 100%) of a small vector using quantile type 5.
# Argument names are written out in full (probs, length.out) instead of
# relying on partial argument matching.
test.vector <- c(2,22,33,44,77,89,99)
quantile(test.vector, probs = seq(0, 1, length.out = 11), type = 5)
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 2.0 6.0 20.0 28.6 36.3 44.0 67.1 81.8 90.0 97.0 99.0
########### 1.14 muestreo
data("ChickWeight")
# Draw 5 random rows from ChickWeight.
# NOTE(review): no seed is set before this call (set.seed() only appears
# further down), so this particular sample differs on every run.
my.sample <- sample_n(ChickWeight, 5)
my.sample
## weight Time Chick Diet
## 1 167 21 22 2
## 2 164 20 22 2
## 3 89 14 20 1
## 4 331 21 21 2
## 5 103 8 44 4
# Fix the random number generator state so the draw below is repeatable.
set.seed(833)
# Sample 10 rows with replacement: the same row may be drawn more than once.
my.sample <- ChickWeight %>% sample_n(10, replace = TRUE)
my.sample
## weight Time Chick Diet
## 1 98 8 45 4
## 2 42 0 17 1
## 3 98 8 36 3
## 4 51 2 11 1
## 5 198 20 3 1
## 6 237 21 49 4
## 7 205 16 50 4
## 8 170 16 39 3
## 9 332 18 35 3
## 10 144 14 33 3
# Weighted sample of 12 cars: rows are weighted by cyl, so cars with more
# cylinders are more likely to be selected.
my.sample <- mtcars %>% sample_n(12, weight = cyl)
my.sample[,1:5]
## mpg cyl disp hp drat
## AMC Javelin 15.2 8 304.0 150 3.15
## Porsche 914-2 26.0 4 120.3 91 4.43
## Merc 280 19.2 6 167.6 123 3.92
## Cadillac Fleetwood 10.4 8 472.0 205 2.93
## Merc 240D 24.4 4 146.7 62 3.69
## Datsun 710 22.8 4 108.0 93 3.85
## Merc 280C 17.8 6 167.6 123 3.92
## Mazda RX4 Wag 21.0 6 160.0 110 3.90
## Merc 450SLC 15.2 8 275.8 180 3.07
## Chrysler Imperial 14.7 8 440.0 230 3.23
## Maserati Bora 15.0 8 301.0 335 3.54
## Valiant 18.1 6 225.0 105 2.76
# sample_frac() draws a fraction of the rows rather than a fixed count:
# here 2% of ChickWeight.
test1 <- ChickWeight %>% sample_frac(0.02)
test1
## weight Time Chick Diet
## 1 48 2 13 1
## 2 62 6 12 1
## 3 197 20 45 4
## 4 234 18 42 4
## 5 58 4 28 2
## 6 163 16 3 1
## 7 103 8 41 4
## 8 103 8 42 4
## 9 120 18 19 1
## 10 48 2 36 3
## 11 80 6 48 4
## 12 137 12 33 3
# Group the Star Wars characters by hair colour, then sample 7% of the rows
# inside each group (with replacement). Sampling per group is useful when a
# fixed percentage is wanted from groups of very different sizes.
by_hair_color <- group_by(starwars, hair_color)
my.sample <- by_hair_color %>% sample_frac(.07, replace = TRUE)
my.sample[, 1:5]
## # A tibble: 5 x 5
## # Groups: hair_color [3]
## name height mass hair_color skin_color
## <chr> <int> <dbl> <chr> <chr>
## 1 Eeth Koth 171 NA black brown
## 2 Dormé 165 NA brown light
## 3 Sebulba 112 40 none grey, red
## 4 Shaak Ti 178 57 none red, blue, white
## 5 Tion Medon 206 80 none grey
# Basic row count for the whole data set.
row.kount.only <- tally(ChickWeight)
row.kount.only
## n
## 1 578
# Row count for each level of Diet.
diet.kount <- count(ChickWeight, Diet)
diet.kount
## Diet n
## 1 1 220
## 2 2 120
## 3 3 120
## 4 4 118
############# 1.15 Miscellaneous DPLYR Functions
##### 1.15.1 add_count para filtrado grupal
# add_count() appends a column n with the number of rows sharing each
# species; keeping n == 1 leaves the characters whose species occurs once.
single.species.kount <- filter(add_count(starwars, species), n == 1)
single.species.kount[, 1:6]
## # A tibble: 29 x 6
## name height mass hair_color skin_color eye_color
## <chr> <int> <dbl> <chr> <chr> <chr>
## 1 Greedo 173 74 <NA> green black
## 2 Jabba Desilijic Tiure 175 1358 <NA> green-tan, brown orange
## 3 Yoda 66 17 white green brown
## 4 Bossk 190 113 none green red
## 5 Ackbar 180 83 none brown mottle orange
## 6 Wicket Systri Warrick 88 20 brown brown brown
## 7 Nien Nunb 160 68 none grey black
## 8 Nute Gunray 191 90 none mottled green red
## 9 Watto 137 NA black blue, grey yellow
## 10 Sebulba 112 40 none grey, red orange
## # ... with 19 more rows
##### 1.15.2 renombrar
# Rename mpg to spam_mpg and check the new name straight away; previously the
# data set was reloaded before the renamed column was ever shown.
mtcars <- rename(mtcars, spam_mpg = mpg)
names(mtcars)[1]
## [1] "spam_mpg"
# Reload the built-in data set so the rest of the script sees the original
# column names again.
data(mtcars)
names(mtcars)
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
##### 1.15.3 case_when
# case_when() builds a new variable from several conditions on existing
# columns; conditions are tried in order and the first match wins.
data(starwars)
# select() is namespace-qualified because attaching MASS masks dplyr's select.
new.starwars <- mutate(
  dplyr::select(starwars, name, mass, gender, species, height),
  type = case_when(
    height > 200 | mass > 200 ~ "large",
    species == "Droid" ~ "robot",
    TRUE ~ "other"
  )
)
new.starwars
## # A tibble: 87 x 6
## name mass gender species height type
## <chr> <dbl> <chr> <chr> <int> <chr>
## 1 Luke Skywalker 77 masculine Human 172 other
## 2 C-3PO 75 masculine Droid 167 robot
## 3 R2-D2 32 masculine Droid 96 robot
## 4 Darth Vader 136 masculine Human 202 large
## 5 Leia Organa 49 feminine Human 150 other
## 6 Owen Lars 120 masculine Human 178 other
## 7 Beru Whitesun lars 75 feminine Human 165 other
## 8 R5-D4 32 masculine Droid 97 robot
## 9 Biggs Darklighter 84 masculine Human 183 other
## 10 Obi-Wan Kenobi 77 masculine Human 182 other
## # ... with 77 more rows