#1,1,1 single condition filter
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.5 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
data("mtcars")
# Keep only the cars that have six cylinders
six.cyl.only <- mtcars %>%
  filter(cyl == 6)
six.cyl.only
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
# Select the cars with more than 150 hp
mas.cientocincuenta <- mtcars %>%
  filter(hp > 150)
mas.cientocincuenta
## mpg cyl disp hp drat wt qsec vs am gear carb
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
# 1,1,2 Multiple-condition filter: a row is kept only when both tests hold
six.cylinders.and.110.horse.power <- mtcars %>%
  filter(cyl == 6 & hp == 110)
six.cylinders.and.110.horse.power
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
# 1,1,3 OR logic for filtering: a row is kept when either test holds
# NOTE(review): the object name says "more than 8" but the test is cyl > 6 -
# confirm which one is intended
gear.eq.4.or.more.than.8 <- mtcars %>%
  filter(cyl > 6 | gear == 4)
gear.eq.4.or.more.than.8
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
# 1,1,4 Filter by minimums, maximums, and other numeric criteria
# Keep the row(s) whose disp equals the smallest disp in the data
smallest.engine.displacement <- mtcars %>%
  filter(disp == min(disp))
smallest.engine.displacement
## mpg cyl disp hp drat wt qsec vs am gear carb
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.9 1 1 4 1
# Load the built-in ChickWeight data; dataset names are case-sensitive, so the
# name must be spelled exactly "ChickWeight"
data("ChickWeight")
# Keep the measurements with Time below 3 and weight above 53
chick.subset <- filter(ChickWeight, Time < 3, weight > 53)
chick.subset
## weight Time Chick Diet
## 1 55 2 22 2
## 2 55 2 40 3
## 3 55 2 43 4
## 4 54 2 50 4
# 1,1,5 Filter out missing values (NAs) for a specific column
data("airquality")
head(airquality, n = 10) # before filter
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
## 7 23 299 8.6 65 5 7
## 8 19 99 13.8 59 5 8
## 9 8 19 20.1 61 5 9
## 10 NA 194 8.6 69 5 10
# Drop the rows whose Ozone reading is missing (assigned with `<-`, matching
# the rest of the file)
no.missing.ozone <- filter(airquality, !is.na(Ozone))
head(no.missing.ozone, 8) # after filter
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 28 NA 14.9 66 5 6
## 6 23 299 8.6 65 5 7
## 7 19 99 13.8 59 5 8
## 8 8 19 20.1 61 5 9
# 1,1,6 Filter out rows that have an NA in any column
# Works on the first ten rows of airquality only
airqual.no.NA.anywhere <- airquality[1:10, ] %>%
  filter(complete.cases(.))
airqual.no.NA.anywhere
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 23 299 8.6 65 5 7
## 6 19 99 13.8 59 5 8
## 7 8 19 20.1 61 5 9
# 1,1,7 Filter by %in%
data("iris")
table(iris[["Species"]]) # number of rows per species
##
## setosa versicolor virginica
## 50 50 50
# Keep only the rows whose Species is one of the two listed values
iris.two.species <- iris %>%
  filter(Species %in% c("setosa", "virginica"))
table(iris.two.species$Species)
##
## setosa versicolor virginica
## 50 0 50
nrow(iris); nrow(iris.two.species)
## [1] 150
## [1] 100
# Select the cars with more than 30 mpg and carb equal to 1
mas.30mpg.and.carb1 <- mtcars %>%
  filter(mpg > 30 & carb == 1)
mas.30mpg.and.carb1
## mpg cyl disp hp drat wt qsec vs am gear carb
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
# 1,1,8 Filter for Ozone > 29 and keep only the first three columns
data("airquality")
airqual.3.coloumns <- (airquality %>% filter(Ozone > 29))[, 1:3]
head(airqual.3.coloumns)
## Ozone Solar.R Wind
## 1 41 190 7.4
## 2 36 118 8.0
## 3 34 307 12.0
## 4 30 322 11.5
## 5 32 92 12.0
## 6 45 252 14.9
# Rows with Wind > 7, keeping columns 3 to 5
data("airquality")
wind.3.columns <- (airquality %>% filter(Wind > 7))[, 3:5]
head(wind.3.columns)
## Wind Temp Month
## 1 7.4 67 5
## 2 8.0 72 5
## 3 12.6 74 5
## 4 11.5 62 5
## 5 14.3 56 5
## 6 14.9 66 5
#Filter by total frequency of a value across All rows
table(mtcars$gear)
##
## 3 4 5
## 15 12 5
# Keep only the gear values that occur in more than 10 rows, then drop the
# grouping so the stored result does not stay grouped by gear
more.frequent.no.of.gear <- mtcars %>%
  group_by(gear) %>%
  filter(n() > 10) %>%
  ungroup()
table(more.frequent.no.of.gear$gear)
##
## 3 4
## 15 12
# Additional criteria can be added to the filter, here by also requiring that
# the horsepower be less than 105; ungroup() then removes the gear grouping
more.frequent.no.of.gears.and.low.horsepower <- mtcars %>%
  group_by(gear) %>%
  filter(n() > 10, hp < 105) %>%
  ungroup()
table(more.frequent.no.of.gears.and.low.horsepower$gear)
##
## 3 4
## 1 7
#1,1,10 Filter by Column Name Using "starts with"
names (iris) #show the column names
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
# Keep the columns whose names start with "s"
iris.display <- dplyr::select(iris, starts_with("s"))
head(iris.display) # head() limits the number of rows printed
## Sepal.Length Sepal.Width Species
## 1 5.1 3.5 setosa
## 2 4.9 3.0 setosa
## 3 4.7 3.2 setosa
## 4 4.6 3.1 setosa
## 5 5.0 3.6 setosa
## 6 5.4 3.9 setosa
# Filter rows: the listed columns must all meet the criterion (filter_at)
# Here both cyl and hp have to equal their own column maximum
new.mtcars <- filter_at(mtcars, vars(cyl, hp), all_vars(. == max(.)))
new.mtcars
## mpg cyl disp hp drat wt qsec vs am gear carb
## Maserati Bora 15 8 301 335 3.54 3.57 14.6 0 1 5 8
msleep<-ggplot2::msleep
msleep
## # A tibble: 83 x 11
## name genus vore order conservation sleep_total sleep_rem sleep_cycle awake
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Cheet~ Acin~ carni Carn~ lc 12.1 NA NA 11.9
## 2 Owl m~ Aotus omni Prim~ <NA> 17 1.8 NA 7
## 3 Mount~ Aplo~ herbi Rode~ nt 14.4 2.4 NA 9.6
## 4 Great~ Blar~ omni Sori~ lc 14.9 2.3 0.133 9.1
## 5 Cow Bos herbi Arti~ domesticated 4 0.7 0.667 20
## 6 Three~ Brad~ herbi Pilo~ <NA> 14.4 2.2 0.767 9.6
## 7 North~ Call~ carni Carn~ vu 8.7 1.4 0.383 15.3
## 8 Vespe~ Calo~ <NA> Rode~ <NA> 7 NA NA 17
## 9 Dog Canis carni Carn~ domesticated 10.1 2.9 0.333 13.9
## 10 Roe d~ Capr~ herbi Arti~ lc 3 NA NA 21
## # ... with 73 more rows, and 2 more variables: brainwt <dbl>, bodywt <dbl>
# Keep name, sleep_total..sleep_rem and brainwt..bodywt, then keep the rows in
# which every column whose name contains "sleep" is above 5.
# select() is namespace-qualified because this script later attaches MASS,
# whose select() masks the dplyr one.
msleep.over.5 <- msleep %>%
  dplyr::select(name, sleep_total:sleep_rem, brainwt:bodywt) %>%
  filter_at(vars(contains("sleep")), all_vars(. > 5))
msleep.over.5
## # A tibble: 2 x 5
## name sleep_total sleep_rem brainwt bodywt
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Thick-tailed opposum 19.4 6.6 NA 0.37
## 2 Giant armadillo 18.1 6.1 0.081 60
#1,2 Arrange (Sort)
msleep<-ggplot2::msleep
msleep[,1:4]
## # A tibble: 83 x 4
## name genus vore order
## <chr> <chr> <chr> <chr>
## 1 Cheetah Acinonyx carni Carnivora
## 2 Owl monkey Aotus omni Primates
## 3 Mountain beaver Aplodontia herbi Rodentia
## 4 Greater short-tailed shrew Blarina omni Soricomorpha
## 5 Cow Bos herbi Artiodactyla
## 6 Three-toed sloth Bradypus herbi Pilosa
## 7 Northern fur seal Callorhinus carni Carnivora
## 8 Vesper mouse Calomys <NA> Rodentia
## 9 Dog Canis carni Carnivora
## 10 Roe deer Capreolus herbi Artiodactyla
## # ... with 73 more rows
# 1,2,1 Ascending: sort by vore, then by order within each vore
animal.name.sequence <- msleep %>%
  arrange(vore, order)
animal.name.sequence[, 1:4]
## # A tibble: 83 x 4
## name genus vore order
## <chr> <chr> <chr> <chr>
## 1 Cheetah Acinonyx carni Carnivora
## 2 Northern fur seal Callorhinus carni Carnivora
## 3 Dog Canis carni Carnivora
## 4 Domestic cat Felis carni Carnivora
## 5 Gray seal Haliochoerus carni Carnivora
## 6 Tiger Panthera carni Carnivora
## 7 Jaguar Panthera carni Carnivora
## 8 Lion Panthera carni Carnivora
## 9 Caspian seal Phoca carni Carnivora
## 10 Genet Genetta carni Carnivora
## # ... with 73 more rows
# 1,2,2 Descending: vore ascending, order in reverse within each vore
animal.name.sequence.desc <- msleep %>%
  arrange(vore, desc(order))
head(animal.name.sequence.desc[, 1:4])
## # A tibble: 6 x 4
## name genus vore order
## <chr> <chr> <chr> <chr>
## 1 Northern grasshopper mouse Onychomys carni Rodentia
## 2 Slow loris Nyctibeus carni Primates
## 3 Thick-tailed opposum Lutreolina carni Didelphimorphia
## 4 Long-nosed armadillo Dasypus carni Cingulata
## 5 Pilot whale Globicephalus carni Cetacea
## 6 Common porpoise Phocoena carni Cetacea
#1,3 Rename
#Rename one or more Columns in a dataset:
names(iris)
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
# rename() takes new_name = old_name pairs; the other columns are untouched
renamed.iris <- iris %>%
  rename(
    widh.of.petals = Petal.Width,
    various.plants.and.animals = Species
  )
names(renamed.iris)
## [1] "Sepal.Length" "Sepal.Width"
## [3] "Petal.Length" "widh.of.petals"
## [5] "various.plants.and.animals"
#1,4 Mutate
data("ChickWeight")
ChickWeight[1:2,]#first two rows
## weight Time Chick Diet
## 1 42 0 1 1
## 2 51 2 1 1
# Add a column holding the base-10 logarithm of weight
ChickWeight.with.log <- ChickWeight %>%
  mutate(log.of.weight = log10(weight))
ChickWeight.with.log[1:2, ]
## weight Time Chick Diet log.of.weight
## 1 42 0 1 1 1.623249
## 2 51 2 1 1 1.707570
#1,4,1 mutate_all
msleep<-ggplot2::msleep
names(msleep)
## [1] "name" "genus" "vore" "order" "conservation"
## [6] "sleep_total" "sleep_rem" "sleep_cycle" "awake" "brainwt"
## [11] "bodywt"
# Append a square-root version of every column in msleep[, 6:11].
# A named list of functions replaces the deprecated funs(); the list name
# ("square root") becomes the suffix of each new column.
msleep.with.square.roots <- mutate_all(
  msleep[, 6:11],
  list("square root" = sqrt)
)
names(msleep.with.square.roots)
## [1] "sleep_total" "sleep_rem"
## [3] "sleep_cycle" "awake"
## [5] "brainwt" "bodywt"
## [7] "sleep_total_square root" "sleep_rem_square root"
## [9] "sleep_cycle_square root" "awake_square root"
## [11] "brainwt_square root" "bodywt_square root"
msleep.with.square.roots
## # A tibble: 83 x 12
## sleep_total sleep_rem sleep_cycle awake brainwt bodywt `sleep_total_square~
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 12.1 NA NA 11.9 NA 50 3.48
## 2 17 1.8 NA 7 0.0155 0.48 4.12
## 3 14.4 2.4 NA 9.6 NA 1.35 3.79
## 4 14.9 2.3 0.133 9.1 0.00029 0.019 3.86
## 5 4 0.7 0.667 20 0.423 600 2
## 6 14.4 2.2 0.767 9.6 NA 3.85 3.79
## 7 8.7 1.4 0.383 15.3 NA 20.5 2.95
## 8 7 NA NA 17 NA 0.045 2.65
## 9 10.1 2.9 0.333 13.9 0.07 14 3.18
## 10 3 NA NA 21 0.0982 14.8 1.73
## # ... with 73 more rows, and 5 more variables: sleep_rem_square root <dbl>,
## # sleep_cycle_square root <dbl>, awake_square root <dbl>,
## # brainwt_square root <dbl>, bodywt_square root <dbl>
#1,4,2 mutate_at to Add Fields
data("Titanic")
Titanic<-as.data.frame(Titanic)
head(Titanic)
## Class Sex Age Survived Freq
## 1 1st Male Child No 0
## 2 2nd Male Child No 0
## 3 3rd Male Child No 35
## 4 Crew Male Child No 0
## 5 1st Female Child No 0
## 6 2nd Female Child No 0
# Add a descending min_rank() for Class, Age and Survived; each new column is
# named <column>_Rank. A named list holding a lambda replaces the deprecated
# funs().
titanic.with.ranks <- mutate_at(
  Titanic,
  vars(Class, Age, Survived),
  list(Rank = ~ min_rank(desc(.)))
)
head(titanic.with.ranks)
## Class Sex Age Survived Freq Class_Rank Age_Rank Survived_Rank
## 1 1st Male Child No 0 25 17 17
## 2 2nd Male Child No 0 17 17 17
## 3 3rd Male Child No 35 9 17 17
## 4 Crew Male Child No 0 1 17 17
## 5 1st Female Child No 0 25 17 17
## 6 2nd Female Child No 0 17 17 17
#1,4,3 mutate_if
# Return the input divided by ten (vectorised over a.number)
divide.by.10 <- function(a.number) {
  a.number / 10
}
head(CO2)
## Plant Type Treatment conc uptake
## 1 Qn1 Quebec nonchilled 95 16.0
## 2 Qn1 Quebec nonchilled 175 30.4
## 3 Qn1 Quebec nonchilled 250 34.8
## 4 Qn1 Quebec nonchilled 350 37.2
## 5 Qn1 Quebec nonchilled 500 35.3
## 6 Qn1 Quebec nonchilled 675 39.2
# Apply divide.by.10 to every numeric column of CO2
new.df <- mutate_if(CO2, is.numeric, divide.by.10)
head(new.df)
## Plant Type Treatment conc uptake
## 1 Qn1 Quebec nonchilled 9.5 1.60
## 2 Qn1 Quebec nonchilled 17.5 3.04
## 3 Qn1 Quebec nonchilled 25.0 3.48
## 4 Qn1 Quebec nonchilled 35.0 3.72
## 5 Qn1 Quebec nonchilled 50.0 3.53
## 6 Qn1 Quebec nonchilled 67.5 3.92
# Small example table; alpha and apple each contain one NA
df <- data.frame(
  alpha  = c(22, 1, NA),
  almond = c(0, 5, 10),
  grape  = c(0, 2, 2),
  apple  = c(NA, 5, 10)
)
df
## alpha almond grape apple
## 1 22 0 0 NA
## 2 1 5 2 5
## 3 NA 10 2 10
# Replace every NA in the numeric columns with 0. An explicit lambda states
# the replacement value directly instead of passing an argument tagged `...`.
df.fix.alpha <- df %>%
  mutate_if(is.numeric, ~ coalesce(., 0))
df.fix.alpha
## alpha almond grape apple
## 1 22 0 0 0
## 2 1 5 2 5
## 3 0 10 2 10
#1,4,4 string detect and true/false Duplicate indicator
msleep<-ggplot2::msleep
table(msleep$vore)
##
## carni herbi insecti omni
## 19 32 5 20
# Drop the rows whose vore matches the regular expression "c|a", i.e. contains
# a "c" or an "a"
msleep.no.c.or.a <- msleep %>%
  filter(!str_detect(vore, paste(c("c", "a"), collapse = "|")))
table(msleep.no.c.or.a$vore)
##
## herbi omni
## 32 20
# TRUE marks a row whose conservation value already appeared in an earlier row
msleep.with.dup.indicador <- msleep %>%
  mutate(duplicate.indicator = duplicated(conservation))
msleep.with.dup.indicador[1:6, ]
## # A tibble: 6 x 12
## name genus vore order conservation sleep_total sleep_rem sleep_cycle awake
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Cheetah Acin~ carni Carn~ lc 12.1 NA NA 11.9
## 2 Owl mo~ Aotus omni Prim~ <NA> 17 1.8 NA 7
## 3 Mounta~ Aplo~ herbi Rode~ nt 14.4 2.4 NA 9.6
## 4 Greate~ Blar~ omni Sori~ lc 14.9 2.3 0.133 9.1
## 5 Cow Bos herbi Arti~ domesticated 4 0.7 0.667 20
## 6 Three-~ Brad~ herbi Pilo~ <NA> 14.4 2.2 0.767 9.6
## # ... with 3 more variables: brainwt <dbl>, bodywt <dbl>,
## # duplicate.indicator <lgl>
# Same indicator again, this time printing only columns 1, 2, 3 and 12
msleep.with.dup.indicador <- msleep %>%
  mutate(duplicate.indicator = duplicated(conservation))
msleep.with.dup.indicador[1:6, c(1, 2, 3, 12)]
## # A tibble: 6 x 4
## name genus vore duplicate.indicator
## <chr> <chr> <chr> <lgl>
## 1 Cheetah Acinonyx carni FALSE
## 2 Owl monkey Aotus omni FALSE
## 3 Mountain beaver Aplodontia herbi FALSE
## 4 Greater short-tailed shrew Blarina omni TRUE
## 5 Cow Bos herbi FALSE
## 6 Three-toed sloth Bradypus herbi TRUE
# Flag rows that repeat an earlier (conservation, genus) combination.
# The second positional argument of duplicated() is `incomparables`, not a
# second key, so the two columns are combined into one data frame and
# compared row by row.
msleep.with.dup.indicador2 <- msleep %>%
  mutate(duplicate.indicator = duplicated(data.frame(conservation, genus))) %>%
  arrange(conservation, genus)
msleep.with.dup.indicador2
## # A tibble: 83 x 12
## name genus vore order conservation sleep_total sleep_rem sleep_cycle awake
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Giraf~ Gira~ herbi Arti~ cd 1.9 0.4 NA 22.1
## 2 Pilot~ Glob~ carni Ceta~ cd 2.7 0.1 NA 21.4
## 3 Cow Bos herbi Arti~ domesticated 4 0.7 0.667 20
## 4 Dog Canis carni Carn~ domesticated 10.1 2.9 0.333 13.9
## 5 Guine~ Cavis herbi Rode~ domesticated 9.4 0.8 0.217 14.6
## 6 Chinc~ Chin~ herbi Rode~ domesticated 12.5 1.5 0.117 11.5
## 7 Horse Equus herbi Peri~ domesticated 2.9 0.6 1 21.1
## 8 Donkey Equus herbi Peri~ domesticated 3.1 0.4 NA 20.9
## 9 Domes~ Felis carni Carn~ domesticated 12.5 3.2 0.417 11.5
## 10 Rabbit Oryc~ herbi Lago~ domesticated 8.4 0.9 0.417 15.6
## # ... with 73 more rows, and 3 more variables: brainwt <dbl>, bodywt <dbl>,
## # duplicate.indicator <lgl>
# Example data: six fruit labels with four numeric columns
fruit <- c("apple", "pear", "orange", "grape", "orange", "orange")
x <- c(1, 2, 4, 9, 4, 6)
y <- c(22, 3, 4, 55, 15, 9)
z <- c(3, 1, 4, 10, 12, 8)
w <- c(2, 2, 2, 4, 5, 6)
df <- data.frame(fruit = fruit, x = x, y = y, z = z, w = w)
df
## fruit x y z w
## 1 apple 1 22 3 2
## 2 pear 2 3 1 2
## 3 orange 4 4 4 2
## 4 grape 9 55 10 4
## 5 orange 4 15 12 5
## 6 orange 6 9 8 6
# TRUE marks a fruit value that already appeared in an earlier row
df.show.single.dup <- df %>%
  mutate(duplicate.indicator = duplicated(fruit))
df.show.single.dup
## fruit x y z w duplicate.indicator
## 1 apple 1 22 3 2 FALSE
## 2 pear 2 3 1 2 FALSE
## 3 orange 4 4 4 2 FALSE
## 4 grape 9 55 10 4 FALSE
## 5 orange 4 15 12 5 TRUE
## 6 orange 6 9 8 6 TRUE
#1,4,5 Drop Variables Using NULL
# Rebuild the example table, then remove column z by assigning NULL to it
fruit <- c("apple", "pear", "orange", "grape", "orange", "orange")
x <- c(1, 2, 4, 9, 4, 6)
y <- c(22, 3, 4, 55, 15, 9)
z <- c(3, 1, 4, 10, 12, 8)
df <- data.frame(fruit = fruit, x = x, y = y, z = z)
df <- df %>%
  mutate(z = NULL)
df
## fruit x y
## 1 apple 1 22
## 2 pear 2 3
## 3 orange 4 4
## 4 grape 9 55
## 5 orange 4 15
## 6 orange 6 9
#1,4,6 Preferred coding sequence
# Install nycflights13 only when it is missing, then attach it so that
# `flights` is available even straight after a fresh install
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  install.packages("nycflights13")
}
library(nycflights13)
# All four columns are created in one mutate(); later expressions may use
# columns defined earlier in the same call.
# NOTE(review): gain_per_minute multiplies the hourly rate by 60 - confirm
# that this is the intended formula
flights %>%
  mutate(
    gain = arr_delay - dep_delay,
    hours = air_time / 60,
    gain_per_hour = gain / hours,
    gain_per_minute = 60 * gain_per_hour
  )
## # A tibble: 336,776 x 23
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ... with 336,766 more rows, and 15 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## # gain <dbl>, hours <dbl>, gain_per_hour <dbl>, gain_per_minute <dbl>
# Install nycflights13 only when it is missing, then attach it so that
# `flights` is available even straight after a fresh install
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  install.packages("nycflights13")
}
library(nycflights13)
# Same derived columns as a chain of mutate() steps.
# NOTE(review): gain_per_minute multiplies the hourly rate by 60 - confirm
# that this is the intended formula
newfield.flights <- flights %>%
  mutate(
    gain = arr_delay - dep_delay,
    hours = air_time / 60
  ) %>%
  mutate(gain_per_hour = gain / hours) %>%
  mutate(gain_per_minute = 60 * gain_per_hour)
# Show year, month and the four new columns for the first six rows
newfield.flights[1:6, c(1:2, 20:23)]
## # A tibble: 6 x 6
## year month gain hours gain_per_hour gain_per_minute
## <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 9 3.78 2.38 143.
## 2 2013 1 16 3.78 4.23 254.
## 3 2013 1 31 2.67 11.6 698.
## 4 2013 1 -17 3.05 -5.57 -334.
## 5 2013 1 -19 1.93 -9.83 -590.
## 6 2013 1 16 2.5 6.4 384
#1,4,7 Transmute:keep Only variables created
fruit<-c("apple","pear","orange","grape","orange","orange")
x<-c(1,2,4,9,4,6)
y<-c(22,3,4,55,15,9)
z<-c(3,1,4,10,12,8)
df<-data.frame(fruit,x,y,z)
df
## fruit x y z
## 1 apple 1 22 3
## 2 pear 2 3 1
## 3 orange 4 4 4
## 4 grape 9 55 10
## 5 orange 4 15 12
## 6 orange 6 9 8
# transmute() keeps only the column it creates
dl <- df %>%
  transmute(new.variable = x + y + z)
dl
## new.variable
## 1 26
## 2 6
## 3 12
## 4 74
## 5 31
## 6 23
# Put the fruit labels in their own data frame and attach the columns of dl
mh <- data.frame(fruit = fruit)
fruit.mut <- mh %>%
  mutate(dl)
fruit.mut
## fruit new.variable
## 1 apple 26
## 2 pear 6
## 3 orange 12
## 4 grape 74
## 5 orange 31
## 6 orange 23
names(fruit.mut)
## [1] "fruit" "new.variable"
rename(fruit.mut,totfruit=new.variable)
## fruit totfruit
## 1 apple 26
## 2 pear 6
## 3 orange 12
## 4 grape 74
## 5 orange 31
## 6 orange 23
#use across to apply a function over multiple columns
# Return the input multiplied by two (vectorised over x)
double.it <- function(x) {
  x * 2
}
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Double every numeric column of iris and show the first rows
head(mutate(iris, across(where(is.numeric), double.it)))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 10.2 7.0 2.8 0.4 setosa
## 2 9.8 6.0 2.8 0.4 setosa
## 3 9.4 6.4 2.6 0.4 setosa
## 4 9.2 6.2 3.0 0.4 setosa
## 5 10.0 7.2 2.8 0.4 setosa
## 6 10.8 7.8 3.4 0.8 setosa
#1,4,9 Conditional Mutating using case_when
# row1 holds the column labels; row2..row5 hold the data rows
row1 <- c("a", "b", "c", "d", "e", "f", "column.to.be.changed")
row2 <- c(1, 1, 1, 6, 6, 1, 2)
row3 <- c(3, 4, 4, 6, 4, 4, 4)
row4 <- c(4, 6, 25, 5, 5, 2, 9)
row5 <- c(5, 3, 6, 3, 3, 6, 2)
# Stack the rows into a data frame and label its columns with row1
df <- setNames(as.data.frame(rbind(row2, row3, row4, row5)), row1)
df
## a b c d e f column.to.be.changed
## row2 1 1 1 6 6 1 2
## row3 3 4 4 6 4 4 4
## row4 4 6 25 5 5 2 9
## row5 5 3 6 3 3 6 2
# Overwrite column.to.be.changed: the first matching condition decides the
# value (2 or 3); rows matching neither get NA
new.df <- df %>%
  mutate(
    column.to.be.changed = case_when(
      a == 2 | a == 5 | a == 7 | (a == 1 & b == 4) ~ 2,
      a == 0 | a == 1 | a == 4 | a == 3 | c == 4 ~ 3,
      TRUE ~ NA_real_
    )
  )
new.df
## a b c d e f column.to.be.changed
## row2 1 1 1 6 6 1 3
## row3 3 4 4 6 4 4 3
## row4 4 6 25 5 5 2 3
## row5 5 3 6 3 3 6 2
#1,5 Select to choose variables/columns
#1,5,1 Delete a Column
fruit<-c("apple","pear","orange","grape","orange","orange")
x<-c(1,2,4,9,4,6)
y<-c(22,3,4,55,15,9)
z<-c(3,1,4,10,12,8)
df<-data.frame(fruit,x,y,z)
df
## fruit x y z
## 1 apple 1 22 3
## 2 pear 2 3 1
## 3 orange 4 4 4
## 4 grape 9 55 10
## 5 orange 4 15 12
## 6 orange 6 9 8
# A minus sign in select() removes the named column
new.df.no.fruit <- df %>%
  dplyr::select(-fruit)
new.df.no.fruit
## x y z
## 1 1 22 3
## 2 2 3 1
## 3 4 4 4
## 4 9 55 10
## 5 4 15 12
## 6 6 9 8
#Delete columns by name using starts_with or ends_with
data ("mtcars")
names(mtcars)
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
# Drop the columns whose names start with "d". select() is namespace-qualified
# because this script later attaches MASS, whose select() masks the dplyr one.
mtcars.no.col.names.start.with.d <- dplyr::select(mtcars, -starts_with("d"))
names(mtcars.no.col.names.start.with.d)
## [1] "mpg" "cyl" "hp" "wt" "qsec" "vs" "am" "gear" "carb"
# Drop the columns whose names end with "t"
mtcars.no.col.names.ends.with.t <- dplyr::select(mtcars, -ends_with("t"))
names(mtcars.no.col.names.ends.with.t)
## [1] "mpg" "cyl" "disp" "hp" "qsec" "vs" "am" "gear" "carb"
#1,5,3 Delete a Column
fruit<-c("apple","pear","orange","grape","orange","orange")
x<-c(1,2,4,9,4,6)
y<-c(22,3,4,55,15,9)
z<-c(3,1,4,10,12,8)
df<-data.frame(fruit,x,y,z)
df
## fruit x y z
## 1 apple 1 22 3
## 2 pear 2 3 1
## 3 orange 4 4 4
## 4 grape 9 55 10
## 5 orange 4 15 12
## 6 orange 6 9 8
#1,5,4 select_all to apply a function to all columns
#create new dataframe
# Three states with income, a median.us column and life expectancy
state <- c("Maryland", "Alaska", "New Jersey")
income <- c(76067, 74444, 73702)
median.us <- c(61372, 61372, 61372)
life.expectancy <- c(78.8, 78.3, 80.3)
top.3.states <- data.frame(
  state = state,
  income = income,
  median.us = median.us,
  life.expectancy = life.expectancy
)
top.3.states
## state income median.us life.expectancy
## 1 Maryland 76067 61372 78.8
## 2 Alaska 74444 61372 78.3
## 3 New Jersey 73702 61372 80.3
# Apply toupper to every column name; the values are unchanged
new.top.3.states <- top.3.states %>%
  select_all(toupper)
new.top.3.states
## STATE INCOME MEDIAN.US LIFE.EXPECTANCY
## 1 Maryland 76067 61372 78.8
## 2 Alaska 74444 61372 78.3
## 3 New Jersey 73702 61372 80.3
#1,5,5 select columns using the pull function
new.top.3.states<-select_all(top.3.states,toupper)
new.top.3.states
## STATE INCOME MEDIAN.US LIFE.EXPECTANCY
## 1 Maryland 76067 61372 78.8
## 2 Alaska 74444 61372 78.3
## 3 New Jersey 73702 61372 80.3
# pull() returns one column as a vector; 1 is the first column
pull.first.column <- top.3.states %>%
  pull(1)
pull.first.column
## [1] "Maryland" "Alaska" "New Jersey"
# A negative position counts from the right, so -1 is the last column
pull.last.column <- top.3.states %>%
  pull(-1)
pull.last.column
## [1] 78.8 78.3 80.3
#1,5,6 select rows:any variable meets some condition
nrow(mtcars)
## [1] 32
# Keep the rows in which at least one column is above 200
mtcars.more.than.200 <- mtcars %>%
  filter_all(any_vars(. > 200))
nrow(mtcars.more.than.200)
## [1] 16
#1,5,7 select columns:omit if column name contains specific characters
names(mtcars)
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
# Drop every column whose name contains "p"
cars.with.no.p <- dplyr::select(mtcars, -contains("p"))
names(cars.with.no.p)
## [1] "cyl" "drat" "wt" "qsec" "vs" "am" "gear" "carb"
#1,5,8 Select using wildcard matching
names(mtcars)
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
# Keep the columns whose names match the regular expression "pg|gea".
# select() is namespace-qualified because this script later attaches MASS,
# whose select() masks the dplyr one.
subset.mtcars <- dplyr::select(mtcars, matches("pg|gea"))
names(subset.mtcars)
## [1] "mpg" "gear"
#1,6 Joins: Manipulations of data from two sources
#1,6,1 left join (Most Common)
# Build the table with data.frame() directly: cbind() would first create a
# character matrix and turn the numeric state.area values into strings
us.state.areas <- data.frame(state.abb, state.area, stringsAsFactors = FALSE)
us.state.areas[1:3, ]
## state.abb state.area
## 1 AL 51609
## 2 AK 589757
## 3 AZ 113909
# Lookup table pairing each state abbreviation with the state name
us.state.abbreviation.and.name <- data.frame(state.abb, state.name)
us.state.abbreviation.and.name[1:3, ]
## state.abb state.name
## 1 AL Alabama
## 2 AK Alaska
## 3 AZ Arizona
# Keep every row of us.state.areas and add the matching state name
state.info.abb.area.name <- left_join(
  us.state.areas,
  us.state.abbreviation.and.name,
  by = "state.abb"
)
head(state.info.abb.area.name)
## state.abb state.area state.name
## 1 AL 51609 Alabama
## 2 AK 589757 Alaska
## 3 AZ 113909 Arizona
## 4 AR 53104 Arkansas
## 5 CA 158693 California
## 6 CO 104247 Colorado
#1.6.2 Inner Join
# DATAFRAME 1: team score and league per person
names <- c("Sally", "Tom", "Frieda", "Alfonzo")
team.scores <- c(3, 5, 2, 7)
team.league <- c("alpha", "beta", "gamma", "omicron")
team.info <- data.frame(names, team.scores, team.league)
# DATAFRAME 2: school grade per person (Bill appears here, Frieda does not)
names <- c("Sally", "Tom", "Bill", "Alfonzo")
school.grades <- c("A", "B", "C", "B")
# tibble() replaces the deprecated data_frame()
school.info <- tibble(names, school.grades)
# Inner join: keep only the people present in both tables
school.and.team <- inner_join(team.info, school.info, by = "names")
school.and.team
## names team.scores team.league school.grades
## 1 Sally 3 alpha A
## 2 Tom 5 beta B
## 3 Alfonzo 7 omicron B
#1,6,3 Anti-join
#DATAFRAME 1
names<-c("Sally", "Tom","Frieda","Alfonzo")
team.scores<-c(3,5,2,7)
team.league<-c("alpha","beta","gamma","omicron")
team.info<-data.frame(names,team.scores,team.league)
team.info
## names team.scores team.league
## 1 Sally 3 alpha
## 2 Tom 5 beta
## 3 Frieda 2 gamma
## 4 Alfonzo 7 omicron
# DATAFRAME 2: school grade per person
names <- c("Sally", "Tom", "Bill", "Alfonzo")
school.grades <- c("A", "B", "C", "B")
# tibble() replaces the deprecated data_frame()
school.info <- tibble(names, school.grades)
school.info
## # A tibble: 4 x 2
## names school.grades
## <chr> <chr>
## 1 Sally A
## 2 Tom B
## 3 Bill C
## 4 Alfonzo B
# Rows of team.info that have no match in school.info
team.info.but.no.grades <- team.info %>%
  anti_join(school.info, by = "names")
team.info.but.no.grades
## names team.scores team.league
## 1 Frieda 2 gamma
# The reverse direction: rows of school.info that have no match in team.info
x <- school.info %>%
  anti_join(team.info, by = "names")
x
## # A tibble: 1 x 2
## names school.grades
## <chr> <chr>
## 1 Bill C
#1,6,4 Full join
# DATAFRAME 1: team score and league per person
names <- c("Sally", "Tom", "Frieda", "Alfonzo")
team.scores <- c(3, 5, 2, 7)
team.league <- c("alpha", "beta", "gamma", "omicron")
team.info <- data.frame(names, team.scores, team.league)
# DATAFRAME 2: school grade per person
names <- c("Sally", "Tom", "Bill", "Alfonzo")
school.grades <- c("A", "B", "C", "B")
# tibble() replaces the deprecated data_frame()
school.info <- tibble(names, school.grades)
# Full join: keep every person from either table; missing values become NA
team.info.and.or.grades <- full_join(team.info, school.info, by = "names")
team.info.and.or.grades
## names team.scores team.league school.grades
## 1 Sally 3 alpha A
## 2 Tom 5 beta B
## 3 Frieda 2 gamma <NA>
## 4 Alfonzo 7 omicron B
## 5 Bill NA <NA> C
#1,6,5 Semi-join
# Keep the team.info rows that have a match in school.info. The key is named
# explicitly, like the other joins in this file, so no join column is guessed.
team.info.with.grades <- semi_join(team.info, school.info, by = "names")
team.info.with.grades
## names team.scores team.league
## 1 Sally 3 alpha
## 2 Tom 5 beta
## 3 Alfonzo 7 omicron
#1,6,6 Right Join
# Build the table with data.frame() directly: cbind() would first create a
# character matrix and turn the numeric state.area values into strings
us.state.areas <- data.frame(state.abb, state.area, stringsAsFactors = FALSE)
us.state.areas[1:3, ]
## state.abb state.area
## 1 AL 51609
## 2 AK 589757
## 3 AZ 113909
# Lookup table pairing each state abbreviation with the state name
us.state.abbreviation.and.name <- data.frame(state.abb, state.name)
us.state.abbreviation.and.name[1:3, ]
## state.abb state.name
## 1 AL Alabama
## 2 AK Alaska
## 3 AZ Arizona
# Overwrite the first abbreviation on purpose so that row has no partner
us.state.abbreviation.and.name[1, "state.abb"] <- "Intentional Mismatch"
# Right join: keep every row of the name table, matched or not
us.state.with.abbreviation.and.name.and.area <- us.state.areas %>%
  right_join(us.state.abbreviation.and.name, by = "state.abb")
us.state.with.abbreviation.and.name.and.area[1:3, ]
## state.abb state.area state.name
## 1 AK 589757 Alaska
## 2 AZ 113909 Arizona
## 3 AR 53104 Arkansas
#1,7 Slice
msleep<-ggplot2::msleep
nrow(msleep)
## [1] 83
# Negative positions drop rows: removing rows 6 through the last keeps 1 to 5
msleep.only.first.5 <- msleep %>%
  slice(-(6:n()))
nrow(msleep.only.first.5)
## [1] 5
# Positive positions keep rows: rows 20 through 39
msleep.20.rows <- slice(msleep, 20:39)
nrow(msleep.20.rows)
## [1] 20
nrow(msleep)-nrow(msleep.20.rows)
## [1] 63
#1,8 Summarise
# Load the gehan data straight from MASS without attaching the package:
# attaching MASS masks dplyr::select() for the rest of the session
data(gehan, package = "MASS")
gehan2 <- gehan
library(tidyverse)
gehan2 %>% summarise(kount=n())
## kount
## 1 42
# Number of rows per treat value
summarise(group_by(gehan2, treat), kount = n())
## # A tibble: 2 x 2
## treat kount
## <fct> <int>
## 1 6-MP 21
## 2 control 21
# Mean, median, standard deviation, MAD and IQR of time for each treat value
summarise(
  group_by(gehan2, treat),
  avarage.remiss.time = mean(time),
  median.remiss.time = median(time),
  std.dev.remiss.time = sd(time),
  median.abs.deviation = mad(time),
  IQR.remiss.time = IQR(time)
)
## # A tibble: 2 x 6
## treat avarage.remiss.time median.remiss.t~ std.dev.remiss.~ median.abs.devi~
## <fct> <dbl> <int> <dbl> <dbl>
## 1 6-MP 17.1 16 10.0 10.4
## 2 control 8.67 8 6.47 5.93
## # ... with 1 more variable: IQR.remiss.time <dbl>
# Small hand-built table of three motorcycles
motorcycles <- c("HondaCb 190 R", "Suzuki Gcxs 150", "yamahaa R15")
Cyl <- c("188", "150", "150")
freno <- c("ABS", "DISCO", "DISCO")
duracion.gas.gal <- c(155, 140, 140)
dat <- data.frame(motorcycles, Cyl, freno, duracion.gas.gal)
# Preview the table that was just built
head(dat)
## motorcycles Cyl freno duracion.gas.gal
## 1 HondaCb 190 R 188 ABS 155
## 2 Suzuki Gcxs 150 150 DISCO 140
## 3 yamahaa R15 150 DISCO 140
# Number of motorcycles per Cyl value
summarise(group_by(dat, Cyl), kount = n())
## # A tibble: 2 x 2
## Cyl kount
## <chr> <int>
## 1 150 2
## 2 188 1
# Mean, median, standard deviation and MAD of duracion.gas.gal
# NOTE(review): the column named "mediana" holds mad(), not a median
summarise(
  dat,
  prome = mean(duracion.gas.gal),
  medi = median(duracion.gas.gal),
  desv = sd(duracion.gas.gal),
  mediana = mad(duracion.gas.gal)
)
## prome medi desv mediana
## 1 145 140 8.660254 0
# Smallest and largest time for each treat value
summarise(
  group_by(gehan2, treat),
  minimum.remission = min(time),
  max.remission = max(time)
)
## # A tibble: 2 x 3
## treat minimum.remission max.remission
## <fct> <int> <int>
## 1 6-MP 6 35
## 2 control 1 23
#1,8,1 Summarise Across
# Take the first ten rows of the survey data through the MASS namespace
# instead of attaching MASS, which would mask dplyr::select()
subset.survey <- MASS::survey[1:10, ]
library(dplyr)
head(subset.survey)
## Sex Wr.Hnd NW.Hnd W.Hnd Fold Pulse Clap Exer Smoke Height M.I
## 1 Female 18.5 18.0 Right R on L 92 Left Some Never 173.00 Metric
## 2 Male 19.5 20.5 Left R on L 104 Left None Regul 177.80 Imperial
## 3 Male 18.0 13.3 Right L on R 87 Neither None Occas NA <NA>
## 4 Male 18.8 18.9 Right R on L NA Neither None Never 160.00 Metric
## 5 Male 20.0 20.0 Right Neither 35 Right Some Never 165.00 Metric
## 6 Female 18.0 17.7 Right L on R 64 Right Some Never 172.72 Imperial
## Age
## 1 18.250
## 2 17.583
## 3 16.917
## 4 20.333
## 5 23.667
## 6 21.000
# Mean of every numeric column per Sex, after dropping rows that contain NA.
# Output columns are named mean_<column>; {.col} is the current placeholder
# for the input column name in across(.names = ).
subset.survey %>%
  na.omit() %>%
  group_by(Sex) %>%
  summarise(across(where(is.numeric), mean,
    .names = "mean_{.col}"
  )) %>%
  head()
## # A tibble: 2 x 6
## Sex mean_Wr.Hnd mean_NW.Hnd mean_Pulse mean_Height mean_Age
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Female 17.8 17.7 76.7 168. 25.0
## 2 Male 19.1 19.2 76.8 174. 20.3
# Group msleep by vore and order, then count the rows in each group
new.sleep <- group_by(msleep, vore, order)
s <- new.sleep %>%
  summarise(n())
## `summarise()` has grouped output by 'vore'. You can override using the `.groups` argument.
s
## # A tibble: 32 x 3
## # Groups: vore [5]
## vore order `n()`
## <chr> <chr> <int>
## 1 carni Carnivora 12
## 2 carni Cetacea 3
## 3 carni Cingulata 1
## 4 carni Didelphimorphia 1
## 5 carni Primates 1
## 6 carni Rodentia 1
## 7 herbi Artiodactyla 5
## 8 herbi Diprotodontia 1
## 9 herbi Hyracoidea 2
## 10 herbi Lagomorpha 1
## # ... with 22 more rows
# The same count written as a single nested call
new.sleep.totals <- summarise(group_by(msleep, vore, order), n())
## `summarise()` has grouped output by 'vore'. You can override using the `.groups` argument.
new.sleep.totals
## # A tibble: 32 x 3
## # Groups: vore [5]
## vore order `n()`
## <chr> <chr> <int>
## 1 carni Carnivora 12
## 2 carni Cetacea 3
## 3 carni Cingulata 1
## 4 carni Didelphimorphia 1
## 5 carni Primates 1
## 6 carni Rodentia 1
## 7 herbi Artiodactyla 5
## 8 herbi Diprotodontia 1
## 9 herbi Hyracoidea 2
## 10 herbi Lagomorpha 1
## # ... with 22 more rows
#1,9 Gathering: Convert Multiple Columns into One
# Build a small wide table: one row per state, one teen-birth-rate column
# per year.
state <- c("Maryland", "Alaska", "New Jersey")
income <- c(76067, 74444, 73702)
median.us <- c(61372, 61372, 61372)
life.expectancy <- c(78.8, 78.3, 80.3)
teen.birth.rate.2015 <- c(18, 29.3, 12.1)
teen.birth.rate.2007 <- c(34.3, 42.9, 24.9)
teen.birth.rate.1991 <- c(54.1, 66, 41.3)
top.3.states <- data.frame(state, income, median.us, life.expectancy,
  teen.birth.rate.2015, teen.birth.rate.2007, teen.birth.rate.1991)
# Rename only the three rate columns to their year. The first four columns
# keep the names data.frame() derived from the variables, so nothing has to
# be retyped (retyping is how "life.expectancy" had been misspelled).
names(top.3.states)[5:7] <- c("2015", "2007", "1991")
top.3.states
## state income median.us life.expectancy 2015 2007 1991
## 1 Maryland 76067 61372 78.8 18.0 34.3 54.1
## 2 Alaska 74444 61372 78.3 29.3 42.9 66.0
## 3 New Jersey 73702 61372 80.3 12.1 24.9 41.3
# Stack the three year columns into a key column (year) and a value column
# (cases); the remaining columns are repeated for every year.
new.top.3.states <- top.3.states %>%
  gather("2015", "2007", "1991", key = "year", value = "cases")
new.top.3.states
## state income median.us life.expectancy year cases
## 1 Maryland 76067 61372 78.8 2015 18.0
## 2 Alaska 74444 61372 78.3 2015 29.3
## 3 New Jersey 73702 61372 80.3 2015 12.1
## 4 Maryland 76067 61372 78.8 2007 34.3
## 5 Alaska 74444 61372 78.3 2007 42.9
## 6 New Jersey 73702 61372 80.3 2007 24.9
## 7 Maryland 76067 61372 78.8 1991 54.1
## 8 Alaska 74444 61372 78.3 1991 66.0
## 9 New Jersey 73702 61372 80.3 1991 41.3
#1,10 Spreading: Consolidation of Multiple Rows into One
# Long table with one row per (Type, Answer) pair. Built with tibble(),
# which replaces the deprecated data_frame() constructor.
df_1 <- tibble(
  Type = c("TypeA", "TypeA", "TypeB", "TypeB"),
  Answer = c("Yes", "No", NA, "No"),
  n = 1:4
)
df_1
## # A tibble: 4 x 3
## Type Answer n
## <chr> <chr> <int>
## 1 TypeA Yes 1
## 2 TypeA No 2
## 3 TypeB <NA> 3
## 4 TypeB No 4
# Rows with a missing Answer are dropped before spreading; each remaining
# Answer value then becomes its own column filled from n.
df_2 <- df_1 %>%
  filter(!is.na(Answer)) %>%
  spread(key = Answer, value = n)
df_2
## # A tibble: 2 x 3
## Type No Yes
## <chr> <int> <int>
## 1 TypeA 2 1
## 2 TypeB 4 NA
#1,11 Separate: Divide a Single Column into Multiple Columns
# Example table for separate(): teen.birth packs three rates per state into
# a single "//"-delimited string.
state <- c("Maryland", "Alaska", "New Jersey")
income <- c(76067, 74444, 73702)
median.us <- rep(61372, 3)
life.expectancy <- c(78.8, 78.3, 80.3)
teen.birth <- c(
  "17//34.3//54.1",
  "29.0//42.9//66.0",
  "12.1//24.9//41.3"
)
top.3.states <- data.frame(
  state, income, median.us, life.expectancy, teen.birth
)
top.3.states
## state income median.us life.expectancy teen.birth
## 1 Maryland 76067 61372 78.8 17//34.3//54.1
## 2 Alaska 74444 61372 78.3 29.0//42.9//66.0
## 3 New Jersey 73702 61372 80.3 12.1//24.9//41.3
# Split teen.birth on the "//" delimiter into one column per year.
top.3.states.separated.years <- separate(
  top.3.states,
  col = teen.birth,
  into = c("2015", "2007", "1991"),
  sep = "//"
)
top.3.states.separated.years
## state income median.us life.expectancy 2015 2007 1991
## 1 Maryland 76067 61372 78.8 17 34.3 54.1
## 2 Alaska 74444 61372 78.3 29.0 42.9 66.0
## 3 New Jersey 73702 61372 80.3 12.1 24.9 41.3
#1,12 Recap of Handy DPLYR Functions
#1,12,1 Number of Observations (n) Used Across Multiple DPLYR Functions
#1,12,2 Basic Counts
# new.sleep is grouped, so n() evaluates to the size of each row's group;
# here it is stored in a new column, koun.
m <- new.sleep %>%
  mutate(koun = n())
m[1:5, c(1:4, 10:12)]
## # A tibble: 5 x 7
## # Groups: vore, order [5]
## name genus vore order brainwt bodywt koun
## <chr> <chr> <chr> <chr> <dbl> <dbl> <int>
## 1 Cheetah Acinonyx carni Carnivora NA 50 12
## 2 Owl monkey Aotus omni Primates 0.0155 0.48 10
## 3 Mountain beaver Aplodontia herbi Rodentia NA 1.35 16
## 4 Greater short-tailed shrew Blarina omni Soricomorp~ 0.00029 0.019 3
## 5 Cow Bos herbi Artiodacty~ 0.423 600 5
# The same n() used as a filter: keep rows whose group has more than 14 rows.
f <- new.sleep %>%
  filter(n() > 14)
f[1:5, c(1:4, 10:11)]
## # A tibble: 5 x 6
## # Groups: vore, order [1]
## name genus vore order brainwt bodywt
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 Mountain beaver Aplodontia herbi Rodentia NA 1.35
## 2 Guinea pig Cavis herbi Rodentia 0.0055 0.728
## 3 Chinchilla Chinchilla herbi Rodentia 0.0064 0.42
## 4 Western american chipmunk Eutamias herbi Rodentia NA 0.071
## 5 Mongolian gerbil Meriones herbi Rodentia NA 0.053
#1,12,3 Nth functions
# first(), last() and nth() each pick a single element by position.
salary.description <- c(
  "Golden parachute",
  "Well to do",
  "Average",
  "Below average",
  "bring date seeds instead of flowers"
)
salary.description %>% first()
## [1] "Golden parachute"
salary.description %>% last()
## [1] "bring date seeds instead of flowers"
salary.description %>% nth(3)
## [1] "Average"
salary.description %>% nth(2)
## [1] "Well to do"
#1,12,4 Count Distinct Values
# Comparing length() with n_distinct() tells whether a vector has repeats.
a.vector <- c(22, 33, 44, 1, 2, 3, 3, 3, 4)
original.length <- length(a.vector)
original.length
## [1] 9
# Number of distinct elements only (7):
distinct.a.vector <- a.vector %>% n_distinct()
distinct.a.vector
## [1] 7
test1 <- if_else(
  original.length == distinct.a.vector,
  "all values unique",
  "some duplicate values in vector"
)
test1
## [1] "some duplicate values in vector"
# Same check on a vector without repeated values.
b.vector <- c(1, 2, 3, 4, 5, 6)
length(b.vector)
## [1] 6
distinct.b.vector <- b.vector %>% n_distinct()
distinct.b.vector
## [1] 6
test2 <- if_else(
  length(b.vector) == distinct.b.vector,
  "all values unique",
  "duplicates"
)
test2
## [1] "all values unique"
#1,12,5 na_if
# Dividing by a vector that contains 0 produces Inf for that element.
test <- c(100, 0, 999)
x <- 5000 / test
x
## [1] 50.000000 Inf 5.005005
class(x)
## [1] "numeric"
# na_if() replaces the 0 with NA before the division, so the element comes
# out as NA instead of Inf. The result is only printed; x is left unchanged.
5000 / na_if(test, 0)
## [1] 50.000000 NA 5.005005
#1,12,6 Coalesce to Replace Missing Values
x <- c(33, 4, 11, NA, 9)
x
## [1] 33 4 11 NA 9
# Substitute 0 for the missing element.
x <- x %>% coalesce(0)
x
## [1] 33 4 11 0 9
#1,13 Ranking Functions
#1,13,1 Row Number
y <- c(100, 4, 12, 6, 8, 3)
rank1 <- y %>% row_number()
rank1
## [1] 6 2 5 3 4 1
# rank1 holds ranks, not positions: these lookups index y by the rank of
# its first element (6) and of its sixth element (1).
y[rank1[[1]]]
## [1] 3
y[rank1[[6]]]
## [1] 100
#1,13,2 Minimum Rank
rank2 <- y %>% min_rank()
rank2
## [1] 6 2 5 3 4 1
#1,13,3 Dense Rank
rank3 <- y %>% dense_rank()
rank3
## [1] 6 2 5 3 4 1
#1,13,4 Percent Rank
rank4 <- y %>% percent_rank()
rank4
## [1] 1.0 0.2 0.8 0.4 0.6 0.0
#1,13,5 Cumulative Distribution Function
y <- c(100, 4, 12, 6, 8, 3)
rank5 <- y %>% cume_dist()
rank5
## [1] 1.0000000 0.3333333 0.8333333 0.5000000 0.6666667 0.1666667
# Assign each element of y to one of three rank-based buckets.
rank6 <- y %>% ntile(3)
rank6
## [1] 3 1 3 2 2 1
# Deciles (0%, 10%, ..., 100%) of test.vector using quantile type 5.
# Argument names are written in full (probs, length.out) rather than relying
# on partial argument matching.
test.vector <- c(2, 22, 33, 44, 77, 89, 99)
quantile(test.vector, probs = seq(0, 1, length.out = 11), type = 5)
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 2.0 6.0 20.0 28.6 36.3 44.0 67.1 81.8 90.0 97.0 99.0
#1,14 Sampling
# The seed is fixed once; every draw below consumes the random stream in
# turn, so the order of the sampling calls must not change.
set.seed(833)
data("ChickWeight")
# Five rows of ChickWeight.
my.sample <- ChickWeight %>% sample_n(5)
my.sample
## weight Time Chick Diet
## 1 98 8 45 4
## 2 42 0 17 1
## 3 98 8 36 3
## 4 51 2 11 1
## 5 198 20 3 1
# Ten rows, sampled with replacement.
my.sample <- ChickWeight %>% sample_n(10, replace = TRUE)
my.sample
## weight Time Chick Diet
## 1 237 21 49 4
## 2 205 16 50 4
## 3 170 16 39 3
## 4 332 18 35 3
## 5 144 14 33 3
## 6 231 18 25 2
## 7 41 0 40 3
## 8 51 10 16 1
## 9 42 0 41 4
## 10 70 12 24 2
# Twelve rows of mtcars with cyl supplied as the sampling weight.
my.sample <- mtcars %>% sample_n(12, weight = cyl)
my.sample[, 1:5]
## mpg cyl disp hp drat
## Hornet Sportabout 18.7 8 360.0 175 3.15
## Chrysler Imperial 14.7 8 440.0 230 3.23
## Ford Pantera L 15.8 8 351.0 264 4.22
## Valiant 18.1 6 225.0 105 2.76
## Lotus Europa 30.4 4 95.1 113 3.77
## Fiat X1-9 27.3 4 79.0 66 4.08
## Porsche 914-2 26.0 4 120.3 91 4.43
## Toyota Corolla 33.9 4 71.1 65 4.22
## Merc 240D 24.4 4 146.7 62 3.69
## Maserati Bora 15.0 8 301.0 335 3.54
## Fiat 128 32.4 4 78.7 66 4.08
## Pontiac Firebird 19.2 8 400.0 175 3.08
# A 2% fraction of the rows of ChickWeight.
test1 <- ChickWeight %>% sample_frac(0.02)
test1
## weight Time Chick Diet
## 1 163 16 3 1
## 2 103 8 41 4
## 3 103 8 42 4
## 4 120 18 19 1
## 5 48 2 36 3
## 6 80 6 48 4
## 7 137 12 33 3
## 8 154 12 40 3
## 9 40 0 2 1
## 10 138 14 44 4
## 11 240 14 21 2
## 12 130 12 39 3
# A 7% fraction drawn with replacement from starwars grouped by hair_color.
by_hair_color <- group_by(starwars, hair_color)
my.sample <- by_hair_color %>% sample_frac(0.07, replace = TRUE)
my.sample[, 1:5]
## # A tibble: 5 x 5
## # Groups: hair_color [3]
## name height mass hair_color skin_color
## <chr> <int> <dbl> <chr> <chr>
## 1 Lando Calrissian 177 79 black dark
## 2 Chewbacca 228 112 brown unknown
## 3 San Hill 191 NA none grey
## 4 Mace Windu 188 84 none dark
## 5 Dexter Jettster 198 102 none brown
# Total number of rows in ChickWeight.
row.kount.only <- tally(ChickWeight)
row.kount.only
## n
## 1 578
# Number of rows for each value of Diet.
diet.kount <- count(ChickWeight, Diet)
diet.kount
## Diet n
## 1 1 220
## 2 2 120
## 3 3 120
## 4 4 118
#1,15 Miscellaneous DPLYR Functions
#1,15,1 add_count for Groupwise Filtering
# add_count() appends the per-species row count as column n; keeping only
# n == 1 leaves the rows whose species value occurs exactly once.
single.species.kount <- filter(add_count(starwars, species), n == 1)
single.species.kount[, 1:6]
## # A tibble: 29 x 6
## name height mass hair_color skin_color eye_color
## <chr> <int> <dbl> <chr> <chr> <chr>
## 1 Greedo 173 74 <NA> green black
## 2 Jabba Desilijic Tiure 175 1358 <NA> green-tan, brown orange
## 3 Yoda 66 17 white green brown
## 4 Bossk 190 113 none green red
## 5 Ackbar 180 83 none brown mottle orange
## 6 Wicket Systri Warrick 88 20 brown brown brown
## 7 Nien Nunb 160 68 none grey black
## 8 Nute Gunray 191 90 none mottled green red
## 9 Watto 137 NA black blue, grey yellow
## 10 Sebulba 112 40 none grey, red orange
## # ... with 19 more rows
#1,15,2 Rename
# Reload the built-in dataset first so the rename always starts from the
# original column names. A rename issued before this reload would be thrown
# away by data(), and it fails outright when the script is re-run in a
# session where mpg has already been renamed.
data(mtcars)
names(mtcars)
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
# rename() takes new_name = old_name.
mtcars <- rename(mtcars, spam_mpg = mpg)
names(mtcars)
## [1] "spam_mpg" "cyl" "disp" "hp" "drat" "wt"
## [7] "qsec" "vs" "am" "gear" "carb"
#1,15,3 case_when
data(starwars)
# Label every character with a type. case_when() tries the conditions in
# order: "large" when height or mass exceeds 200, then "robot" for droids,
# and "other" for everything left over. select() is called with its dplyr::
# prefix, as in the rest of this file.
new.starwars <- mutate(
  dplyr::select(starwars, name, mass, gender, species, height),
  type = case_when(
    height > 200 | mass > 200 ~ "large",
    species == "Droid" ~ "robot",
    TRUE ~ "other"
  )
)
new.starwars
## # A tibble: 87 x 6
## name mass gender species height type
## <chr> <dbl> <chr> <chr> <int> <chr>
## 1 Luke Skywalker 77 masculine Human 172 other
## 2 C-3PO 75 masculine Droid 167 robot
## 3 R2-D2 32 masculine Droid 96 robot
## 4 Darth Vader 136 masculine Human 202 large
## 5 Leia Organa 49 feminine Human 150 other
## 6 Owen Lars 120 masculine Human 178 other
## 7 Beru Whitesun lars 75 feminine Human 165 other
## 8 R5-D4 32 masculine Droid 97 robot
## 9 Biggs Darklighter 84 masculine Human 183 other
## 10 Obi-Wan Kenobi 77 masculine Human 182 other
## # ... with 77 more rows
#End of Chapter 1: dplyr