“Ciencia de datos: QUIZ 1”

“Andrea Arredondo Sánchez”

“9 de febrero de 2019”

0. Ejercicios de <https://www.r-exercises.com/2017/10/19/dplyr-basic-functions-exercises/>

library(tidyverse)
# `iris` is a built-in dataset and every call below passes it explicitly to a
# dplyr verb, which resolves column names inside the data frame. attach() is
# therefore unnecessary and would only add the columns to the search path.

# 1. Keep only the first three columns of iris, referring to them by name,
#    and show the first rows.
iris %>%
  select(Sepal.Length, Sepal.Width, Petal.Length) %>%
  head()
##   Sepal.Length Sepal.Width Petal.Length
## 1          5.1         3.5          1.4
## 2          4.9         3.0          1.4
## 3          4.7         3.2          1.3
## 4          4.6         3.1          1.5
## 5          5.0         3.6          1.4
## 6          5.4         3.9          1.7
# 2. Keep every column of iris except Petal.Width and show the first rows.
iris %>%
  select(-Petal.Width) %>%
  head()
##   Sepal.Length Sepal.Width Petal.Length Species
## 1          5.1         3.5          1.4  setosa
## 2          4.9         3.0          1.4  setosa
## 3          4.7         3.2          1.3  setosa
## 4          4.6         3.1          1.5  setosa
## 5          5.0         3.6          1.4  setosa
## 6          5.4         3.9          1.7  setosa
# 3. Keep the iris columns whose names begin with "P" and show the first rows.
iris %>%
  select(starts_with("P")) %>%
  head()
##   Petal.Length Petal.Width
## 1          1.4         0.2
## 2          1.4         0.2
## 3          1.3         0.2
## 4          1.5         0.2
## 5          1.4         0.2
## 6          1.7         0.4
# 4. Keep the rows where Sepal.Length >= 4.6 and Petal.Width >= 0.5
#    (both conditions must hold) and show the first rows.
head(filter(iris, Sepal.Length >= 4.6 & Petal.Width >= 0.5))
##   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1          5.1         3.3          1.7         0.5     setosa
## 2          5.0         3.5          1.6         0.6     setosa
## 3          7.0         3.2          4.7         1.4 versicolor
## 4          6.4         3.2          4.5         1.5 versicolor
## 5          6.9         3.1          4.9         1.5 versicolor
## 6          5.5         2.3          4.0         1.3 versicolor
# 5. Pick two columns, Sepal.Width first and Sepal.Length second, and show
#    the first rows.
head(select(iris, Sepal.Width, Sepal.Length))
##   Sepal.Width Sepal.Length
## 1         3.5          5.1
## 2         3.0          4.9
## 3         3.2          4.7
## 4         3.1          4.6
## 5         3.6          5.0
## 6         3.9          5.4
# 6. Sort the rows by ascending Sepal.Width and show the first rows.
head(arrange(iris, Sepal.Width))
##   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1          5.0         2.0          3.5         1.0 versicolor
## 2          6.0         2.2          4.0         1.0 versicolor
## 3          6.2         2.2          4.5         1.5 versicolor
## 4          6.0         2.2          5.0         1.5  virginica
## 5          4.5         2.3          1.3         0.3     setosa
## 6          5.5         2.3          4.0         1.3 versicolor
# 7. Pick three columns, then sort by Sepal.Length with ties broken by
#    Sepal.Width, and show the first rows.
iris %>%
  select(Sepal.Length, Sepal.Width, Species) %>%
  arrange(Sepal.Length, Sepal.Width) %>%
  head()
##   Sepal.Length Sepal.Width Species
## 1          4.3         3.0  setosa
## 2          4.4         2.9  setosa
## 3          4.4         3.0  setosa
## 4          4.4         3.2  setosa
## 5          4.5         2.3  setosa
## 6          4.6         3.1  setosa
# 8. Add a `proportion` column holding Sepal.Length divided by Sepal.Width
#    and show the first rows.
iris %>%
  mutate(proportion = Sepal.Length / Sepal.Width) %>%
  head()
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species proportion
## 1          5.1         3.5          1.4         0.2  setosa   1.457143
## 2          4.9         3.0          1.4         0.2  setosa   1.633333
## 3          4.7         3.2          1.3         0.2  setosa   1.468750
## 4          4.6         3.1          1.5         0.2  setosa   1.483871
## 5          5.0         3.6          1.4         0.2  setosa   1.388889
## 6          5.4         3.9          1.7         0.4  setosa   1.384615
# 9. Collapse iris to a single row holding the mean of Sepal.Length, stored
#    under the name `avg_slength`.
summarise(iris, avg_slength = mean(Sepal.Length))
##   avg_slength
## 1    5.843333
# 10. Group iris by Sepal.Length and compute the same mean per group.
#     The grouping column is also the column being averaged, so each group's
#     mean is simply its own key.
iris %>%
  group_by(Sepal.Length) %>%
  summarise(avg_slength = mean(Sepal.Length))
## # A tibble: 35 x 2
##    Sepal.Length avg_slength
##           <dbl>       <dbl>
##  1          4.3         4.3
##  2          4.4         4.4
##  3          4.5         4.5
##  4          4.6         4.6
##  5          4.7         4.7
##  6          4.8         4.8
##  7          4.9         4.9
##  8          5           5  
##  9          5.1         5.1
## 10          5.2         5.2
## # ... with 25 more rows

Ejercicio 1

library(gapminder)
# No attach(): every call below names `gapminder` explicitly and the dplyr
# verbs look columns up inside it. Attaching would only put common names such
# as `year`, `pop` and `country` on the search path.

# Keep year, lifeExp and country (in that order) and print the first rows as
# a plain data.frame.
gapminder %>%
  select(year, lifeExp, country) %>%
  head() %>%
  data.frame()
##   year lifeExp     country
## 1 1952  28.801 Afghanistan
## 2 1957  30.332 Afghanistan
## 3 1962  31.997 Afghanistan
## 4 1967  34.020 Afghanistan
## 5 1972  36.088 Afghanistan
## 6 1977  38.438 Afghanistan

Ejercicio 2

# Keep the contiguous range of columns from country through lifeExp and print
# the first rows as a plain data.frame.
gapminder %>%
  select(country:lifeExp) %>%
  head() %>%
  data.frame()
##       country continent year lifeExp
## 1 Afghanistan      Asia 1952  28.801
## 2 Afghanistan      Asia 1957  30.332
## 3 Afghanistan      Asia 1962  31.997
## 4 Afghanistan      Asia 1967  34.020
## 5 Afghanistan      Asia 1972  36.088
## 6 Afghanistan      Asia 1977  38.438

Ejercicio 3

# Drop lifeExp, keep every other column, and print the first rows as a plain
# data.frame.
gapminder %>%
  select(-lifeExp) %>%
  head() %>%
  data.frame()
##       country continent year      pop gdpPercap
## 1 Afghanistan      Asia 1952  8425333  779.4453
## 2 Afghanistan      Asia 1957  9240934  820.8530
## 3 Afghanistan      Asia 1962 10267083  853.1007
## 4 Afghanistan      Asia 1967 11537966  836.1971
## 5 Afghanistan      Asia 1972 13079460  739.9811
## 6 Afghanistan      Asia 1977 14880372  786.1134

Ejercicio 4

# Exercise 4 helper: returns the first six rows of gapminder as a plain
# data.frame, with `continent` moved to the front and renamed to `cont`,
# followed by country and the columns year through gdpPercap.
#
# Takes no arguments and reads the `gapminder` dataset from the enclosing
# environment.
#
# NOTE(review): a global function called `everything` shadows the tidyselect
# helper everything() (re-exported by dplyr) for code that runs after this
# definition. The name is kept so existing calls keep working; consider
# renaming it and using dplyr::everything() inside select() instead.
everything <- function() {
  gapminder %>%
    # Rename while selecting; the column is referenced by its exact name
    # rather than through a regular-expression match.
    select(cont = continent, country, year:gdpPercap) %>%
    head() %>%
    data.frame()
}
everything()
##   cont     country year lifeExp      pop gdpPercap
## 1 Asia Afghanistan 1952  28.801  8425333  779.4453
## 2 Asia Afghanistan 1957  30.332  9240934  820.8530
## 3 Asia Afghanistan 1962  31.997 10267083  853.1007
## 4 Asia Afghanistan 1967  34.020 11537966  836.1971
## 5 Asia Afghanistan 1972  36.088 13079460  739.9811
## 6 Asia Afghanistan 1977  38.438 14880372  786.1134

Ejercicio 5

# Sort gapminder by ascending year and print the first rows as a plain
# data.frame.
gapminder %>%
  arrange(year) %>%
  head() %>%
  data.frame()
##       country continent year lifeExp      pop  gdpPercap
## 1 Afghanistan      Asia 1952  28.801  8425333   779.4453
## 2     Albania    Europe 1952  55.230  1282697  1601.0561
## 3     Algeria    Africa 1952  43.077  9279525  2449.0082
## 4      Angola    Africa 1952  30.015  4232095  3520.6103
## 5   Argentina  Americas 1952  62.485 17876956  5911.3151
## 6   Australia   Oceania 1952  69.120  8691212 10039.5956

Ejercicio 6

# Sort gapminder by descending year (most recent first) and print the first
# rows as a plain data.frame.
gapminder %>%
  arrange(desc(year)) %>%
  head() %>%
  data.frame()
##       country continent year lifeExp      pop  gdpPercap
## 1 Afghanistan      Asia 2007  43.828 31889923   974.5803
## 2     Albania    Europe 2007  76.423  3600523  5937.0295
## 3     Algeria    Africa 2007  72.301 33333216  6223.3675
## 4      Angola    Africa 2007  42.731 12420476  4797.2313
## 5   Argentina  Americas 2007  75.320 40301927 12779.3796
## 6   Australia   Oceania 2007  81.235 20434176 34435.3674

Ejercicio 7

# Sort gapminder by year, breaking ties by ascending lifeExp, and print the
# first rows as a plain data.frame.
gapminder %>%
  arrange(year, lifeExp) %>%
  head() %>%
  data.frame()
##        country continent year lifeExp     pop gdpPercap
## 1  Afghanistan      Asia 1952  28.801 8425333  779.4453
## 2       Gambia    Africa 1952  30.000  284320  485.2307
## 3       Angola    Africa 1952  30.015 4232095 3520.6103
## 4 Sierra Leone    Africa 1952  30.331 2143249  879.7877
## 5   Mozambique    Africa 1952  31.286 6446316  468.5260
## 6 Burkina Faso    Africa 1952  31.975 4469979  543.2552

Ejercicio 8

# Keep the rows whose population exceeds 100 million, convert to a plain
# data.frame, and print the first rows.
gapminder %>%
  filter(pop > 100000000) %>%
  data.frame() %>%
  head()
##      country continent year lifeExp       pop gdpPercap
## 1 Bangladesh      Asia 1987  52.819 103764241  751.9794
## 2 Bangladesh      Asia 1992  56.018 113704579  837.8102
## 3 Bangladesh      Asia 1997  59.412 123315288  972.7700
## 4 Bangladesh      Asia 2002  62.013 135656790 1136.3904
## 5 Bangladesh      Asia 2007  64.062 150448339 1391.2538
## 6     Brazil  Americas 1972  59.504 100840058 4985.7115

Ejercicio 9

# Keep only the rows for Asia and print the first rows as a plain data.frame.
gapminder %>%
  filter(continent == "Asia") %>%
  head() %>%
  data.frame()
##       country continent year lifeExp      pop gdpPercap
## 1 Afghanistan      Asia 1952  28.801  8425333  779.4453
## 2 Afghanistan      Asia 1957  30.332  9240934  820.8530
## 3 Afghanistan      Asia 1962  31.997 10267083  853.1007
## 4 Afghanistan      Asia 1967  34.020 11537966  836.1971
## 5 Afghanistan      Asia 1972  36.088 13079460  739.9811
## 6 Afghanistan      Asia 1977  38.438 14880372  786.1134

Ejercicio 10

# Store the rows with gdpPercap above 10000 as a plain data.frame in `i`,
# then print its first rows without the gdpPercap column.
i <- gapminder %>%
  filter(gdpPercap > 10000) %>%
  data.frame()
i %>%
  select(-gdpPercap) %>%
  head()
##     country continent year lifeExp      pop
## 1 Argentina  Americas 1977  68.481 26983828
## 2 Argentina  Americas 1997  73.275 36203463
## 3 Argentina  Americas 2007  75.320 40301927
## 4 Australia   Oceania 1952  69.120  8691212
## 5 Australia   Oceania 1957  70.330  9712569
## 6 Australia   Oceania 1962  70.930 10794968

Ejercicio 11

# Keep the iris columns whose names begin with "Petal" and show the first
# rows.
iris %>%
  select(starts_with("Petal")) %>%
  head()
##   Petal.Length Petal.Width
## 1          1.4         0.2
## 2          1.4         0.2
## 3          1.3         0.2
## 4          1.5         0.2
## 5          1.4         0.2
## 6          1.7         0.4