Question (a)
# Work with the built-in "iris" data set: a data frame of 150 rows (cases) and
# 5 columns (Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, Species).
# Keep it under the name `data` for the steps that follow.
data <- iris
# Overview: row and column counts, the type of each column, and its first values.
data %>% glimpse()
## Rows: 150
## Columns: 5
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9, 5.4, 4.~
## $ Sepal.Width <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.~
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5, 1.~
## $ Petal.Width <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2, 0.~
## $ Species <fct> setosa, setosa, setosa, setosa, setosa, setosa, setosa, s~
# Per-variable data-quality report: count and share of zeros, missing values
# (NA) and infinite values, plus each column's type and number of unique values.
data %>% status()
## variable q_zeros p_zeros q_na p_na q_inf p_inf type unique
## Sepal.Length Sepal.Length 0 0 0 0 0 0 numeric 35
## Sepal.Width Sepal.Width 0 0 0 0 0 0 numeric 23
## Petal.Length Petal.Length 0 0 0 0 0 0 numeric 43
## Petal.Width Petal.Width 0 0 0 0 0 0 numeric 22
## Species Species 0 0 0 0 0 0 factor 3
# Frequency table (count, percentage, cumulative percentage) for the factor
# variable "Species".
freq(data[["Species"]])

## var frequency percentage cumulative_perc
## 1 setosa 50 33.33 33.33
## 2 versicolor 50 33.33 66.66
## 3 virginica 50 33.33 100.00
# According to the figure above, the frequency of each species is the same.
# Plot the distribution of the four numerical variables:
# "Sepal.Length", "Sepal.Width", "Petal.Length", and "Petal.Width".
data %>% plot_num()

# Build a codebook: one row per variable, holding its name, a human-readable
# description, and basic summary statistics (n, min, max, mean).
variable_id <- colnames(data)
item_text <- c(
  "Length of Sepal in centimeters",
  "Width of Sepal in centimeters",
  "Length of Petal in centimeters",
  "Width of Petal in centimeters",
  "Species of iris"
)
# Named `desc_stats` rather than `desc` so that the object does not mask
# dplyr::desc(), which is called inside arrange() further down.
desc_stats <- data %>%
  describe() %>%
  as_tibble() %>%
  select("n", "min", "max", "mean")
# NOTE: Species is a factor, so its min/max/mean in the table are computed on
# the integer level codes (1-3); they are not measurements.
cbind(variable_id, item_text, desc_stats)
## variable_id item_text n min max mean
## 1 Sepal.Length Length of Sepal in centimeters 150 4.3 7.9 5.843333
## 2 Sepal.Width Width of Sepal in centimeters 150 2.0 4.4 3.057333
## 3 Petal.Length Length of Petal in centimeters 150 1.0 6.9 3.758000
## 4 Petal.Width Width of Petal in centimeters 150 0.1 2.5 1.199333
## 5 Species Species of iris 150 1.0 3.0 2.000000
Question (b)
# Start again from the built-in "iris" data set.
data <- iris
# Function 1: filter()
# dplyr's filter() returns the subset of a data frame's rows that satisfy the
# given conditions.
# Example: keep only the samples whose "Sepal.Length" is greater than 7.
filter(data, Sepal.Length > 7)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 7.1 3.0 5.9 2.1 virginica
## 2 7.6 3.0 6.6 2.1 virginica
## 3 7.3 2.9 6.3 1.8 virginica
## 4 7.2 3.6 6.1 2.5 virginica
## 5 7.7 3.8 6.7 2.2 virginica
## 6 7.7 2.6 6.9 2.3 virginica
## 7 7.7 2.8 6.7 2.0 virginica
## 8 7.2 3.2 6.0 1.8 virginica
## 9 7.2 3.0 5.8 1.6 virginica
## 10 7.4 2.8 6.1 1.9 virginica
## 11 7.9 3.8 6.4 2.0 virginica
## 12 7.7 3.0 6.1 2.3 virginica
# According to the output, 12 rows were selected from data set iris
# Function 2: arrange()
# arrange() orders the rows of a data frame by the values of the given columns.
# Example: take the samples with "Sepal.Length" above 7 and sort them by
# "Sepal.Length" in descending order.
data %>%
  filter(Sepal.Length > 7) %>%
  arrange(desc(Sepal.Length))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 7.9 3.8 6.4 2.0 virginica
## 2 7.7 3.8 6.7 2.2 virginica
## 3 7.7 2.6 6.9 2.3 virginica
## 4 7.7 2.8 6.7 2.0 virginica
## 5 7.7 3.0 6.1 2.3 virginica
## 6 7.6 3.0 6.6 2.1 virginica
## 7 7.4 2.8 6.1 1.9 virginica
## 8 7.3 2.9 6.3 1.8 virginica
## 9 7.2 3.6 6.1 2.5 virginica
## 10 7.2 3.2 6.0 1.8 virginica
## 11 7.2 3.0 5.8 1.6 virginica
## 12 7.1 3.0 5.9 2.1 virginica
# As the output shows, the filtered rows are sorted by column "Sepal.Length" in descending order.
# Function 3: mutate()
# mutate() adds a new column to a data frame or overwrites an existing one.
# Example: for the samples with "Sepal.Length" above 7, add a column "Total"
# holding the sum of "Sepal.Length", "Sepal.Width", "Petal.Length", and
# "Petal.Width".
data %>%
  filter(Sepal.Length > 7) %>%
  mutate(Total = Sepal.Length + Sepal.Width + Petal.Length + Petal.Width)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species Total
## 1 7.1 3.0 5.9 2.1 virginica 18.1
## 2 7.6 3.0 6.6 2.1 virginica 19.3
## 3 7.3 2.9 6.3 1.8 virginica 18.3
## 4 7.2 3.6 6.1 2.5 virginica 19.4
## 5 7.7 3.8 6.7 2.2 virginica 20.4
## 6 7.7 2.6 6.9 2.3 virginica 19.5
## 7 7.7 2.8 6.7 2.0 virginica 19.2
## 8 7.2 3.2 6.0 1.8 virginica 18.2
## 9 7.2 3.0 5.8 1.6 virginica 17.6
## 10 7.4 2.8 6.1 1.9 virginica 18.2
## 11 7.9 3.8 6.4 2.0 virginica 20.1
## 12 7.7 3.0 6.1 2.3 virginica 19.1
# Function 4: select()
# select() keeps only the named columns of a data frame.
# Example: for the samples with "Sepal.Length" above 7, keep just the columns
# "Sepal.Length" and "Species".
data %>%
  filter(Sepal.Length > 7) %>%
  select(Sepal.Length, Species)
## Sepal.Length Species
## 1 7.1 virginica
## 2 7.6 virginica
## 3 7.3 virginica
## 4 7.2 virginica
## 5 7.7 virginica
## 6 7.7 virginica
## 7 7.7 virginica
## 8 7.2 virginica
## 9 7.2 virginica
## 10 7.4 virginica
## 11 7.9 virginica
## 12 7.7 virginica
# Function 5: summarise()
# summarise() collapses a data frame into a new one with one row per
# combination of the grouping variables.
# Example: group by "Species" and compute the mean Sepal.Length of each species.
data %>%
  group_by(Species) %>%
  summarise(Avg_Sepal.Length = mean(Sepal.Length))
## # A tibble: 3 x 2
## Species Avg_Sepal.Length
## <fct> <dbl>
## 1 setosa 5.01
## 2 versicolor 5.94
## 3 virginica 6.59