Question (a)

# Data set "iris" is selected.
# iris is a data frame with 150 cases (rows) and 5 variables (columns) named Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, and Species.

# Bind the built-in iris data frame to `data`; all later steps read `data`.
# NOTE(review): no library() calls are visible in this section. glimpse(),
# status(), freq(), plot_num(), describe() and %>% presumably come from
# dplyr, funModeling and psych -- confirm they are attached before this point.
data <- iris

# Print a compact overview of the data: the number of rows and columns, the
# type of each column, and the first few values of each column.
glimpse(data)

## Rows: 150
## Columns: 5
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9, 5.4, 4.~
## $ Sepal.Width  <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.~
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5, 1.~
## $ Petal.Width  <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2, 0.~
## $ Species      <fct> setosa, setosa, setosa, setosa, setosa, setosa, setosa, s~

# Data-quality overview, one row per variable: the count (q_*) and share (p_*)
# of zeros, missing values (NA) and infinite values, plus the column type and
# the number of unique values.
status(data)

##                  variable q_zeros p_zeros q_na p_na q_inf p_inf    type unique
## Sepal.Length Sepal.Length       0       0    0    0     0     0 numeric     35
## Sepal.Width   Sepal.Width       0       0    0    0     0     0 numeric     23
## Petal.Length Petal.Length       0       0    0    0     0     0 numeric     43
## Petal.Width   Petal.Width       0       0    0    0     0     0 numeric     22
## Species           Species       0       0    0    0     0     0  factor      3

# Frequency table of the factor variable "Species": the count, percentage and
# cumulative percentage of each level.
freq(data$Species)

##          var frequency percentage cumulative_perc
## 1     setosa        50      33.33           33.33
## 2 versicolor        50      33.33           66.66
## 3  virginica        50      33.33          100.00

# According to the figure above, the frequency of each species is the same.

# Plot the distribution of the four numeric variables: "Sepal.Length",
# "Sepal.Width", "Petal.Length" and "Petal.Width".
plot_num(data)

# Build a codebook: one row per variable holding its name, a human-readable
# description, and basic summary statistics (n, min, max, mean).
variable_id <- colnames(data)
item_text <- c(
  "Length of Sepal in centimeters",
  "Width of Sepal in centimeters",
  "Length of Petal in centimeters",
  "Width of Petal in centimeters",
  "Species of iris"
)

# Named `summary_stats` rather than `desc` so the object does not shadow
# dplyr::desc(), which is used together with arrange() later in this file.
summary_stats <- data %>%
  describe() %>%
  as_tibble() %>%
  select("n", "min", "max", "mean")

codebook <- cbind(variable_id, item_text, summary_stats)

# describe() summarises a factor through its integer level codes, so the
# min/max/mean reported for "Species" (1, 3, 2) carry no meaning. Blank them
# out for every non-numeric variable; the observation count `n` stays valid.
is_numeric <- vapply(data, is.numeric, logical(1))
codebook[!is_numeric, c("min", "max", "mean")] <- NA
codebook

##    variable_id                      item_text   n min max     mean
## 1 Sepal.Length Length of Sepal in centimeters 150 4.3 7.9 5.843333
## 2  Sepal.Width  Width of Sepal in centimeters 150 2.0 4.4 3.057333
## 3 Petal.Length Length of Petal in centimeters 150 1.0 6.9 3.758000
## 4  Petal.Width  Width of Petal in centimeters 150 0.1 2.5 1.199333
## 5      Species                Species of iris 150  NA  NA       NA

Question (b)

# Use the built-in iris data set for the examples below.
data <- iris

# Function 1: filter()
# filter() from dplyr returns the subset of a data frame whose rows satisfy
# all of the given conditions.
# Example: keep only the samples whose "Sepal.Length" is greater than 7.
data %>%
  filter(Sepal.Length > 7)

##    Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 1           7.1         3.0          5.9         2.1 virginica
## 2           7.6         3.0          6.6         2.1 virginica
## 3           7.3         2.9          6.3         1.8 virginica
## 4           7.2         3.6          6.1         2.5 virginica
## 5           7.7         3.8          6.7         2.2 virginica
## 6           7.7         2.6          6.9         2.3 virginica
## 7           7.7         2.8          6.7         2.0 virginica
## 8           7.2         3.2          6.0         1.8 virginica
## 9           7.2         3.0          5.8         1.6 virginica
## 10          7.4         2.8          6.1         1.9 virginica
## 11          7.9         3.8          6.4         2.0 virginica
## 12          7.7         3.0          6.1         2.3 virginica

# According to the output, 12 rows were selected from data set iris


# Function 2: arrange()
# arrange() sorts the rows of a data frame by the values of the given columns;
# wrapping a column in desc() sorts it from largest to smallest.
# Example: sort the long-sepal samples by "Sepal.Length" in descending order.
data %>%
  filter(Sepal.Length > 7) %>%
  arrange(desc(Sepal.Length))

##    Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 1           7.9         3.8          6.4         2.0 virginica
## 2           7.7         3.8          6.7         2.2 virginica
## 3           7.7         2.6          6.9         2.3 virginica
## 4           7.7         2.8          6.7         2.0 virginica
## 5           7.7         3.0          6.1         2.3 virginica
## 6           7.6         3.0          6.6         2.1 virginica
## 7           7.4         2.8          6.1         1.9 virginica
## 8           7.3         2.9          6.3         1.8 virginica
## 9           7.2         3.6          6.1         2.5 virginica
## 10          7.2         3.2          6.0         1.8 virginica
## 11          7.2         3.0          5.8         1.6 virginica
## 12          7.1         3.0          5.9         2.1 virginica

# As the output shows, the filtered rows are now sorted by column "Sepal.Length" in descending order.

# Function 3: mutate()
# mutate() adds new columns to a data frame or overwrites existing ones.
# Example: add a column "Total" holding the sum of the four measurements
# "Sepal.Length", "Sepal.Width", "Petal.Length" and "Petal.Width".
data %>%
  filter(Sepal.Length > 7) %>%
  mutate(
    Total = Sepal.Length + Sepal.Width + Petal.Length + Petal.Width
  )

##    Sepal.Length Sepal.Width Petal.Length Petal.Width   Species Total
## 1           7.1         3.0          5.9         2.1 virginica  18.1
## 2           7.6         3.0          6.6         2.1 virginica  19.3
## 3           7.3         2.9          6.3         1.8 virginica  18.3
## 4           7.2         3.6          6.1         2.5 virginica  19.4
## 5           7.7         3.8          6.7         2.2 virginica  20.4
## 6           7.7         2.6          6.9         2.3 virginica  19.5
## 7           7.7         2.8          6.7         2.0 virginica  19.2
## 8           7.2         3.2          6.0         1.8 virginica  18.2
## 9           7.2         3.0          5.8         1.6 virginica  17.6
## 10          7.4         2.8          6.1         1.9 virginica  18.2
## 11          7.9         3.8          6.4         2.0 virginica  20.1
## 12          7.7         3.0          6.1         2.3 virginica  19.1

# Function 4: select()
# select() keeps only the named columns of a data frame.
# Example: keep the columns "Sepal.Length" and "Species".
data %>%
  filter(Sepal.Length > 7) %>%
  select(Sepal.Length, Species)

##    Sepal.Length   Species
## 1           7.1 virginica
## 2           7.6 virginica
## 3           7.3 virginica
## 4           7.2 virginica
## 5           7.7 virginica
## 6           7.7 virginica
## 7           7.7 virginica
## 8           7.2 virginica
## 9           7.2 virginica
## 10          7.4 virginica
## 11          7.9 virginica
## 12          7.7 virginica

# Function 5: summarise()
# summarise() collapses a data frame to one (or more) rows per combination of
# the grouping variables, computing the requested summaries for each group.
# Example: group by "Species" and compute the mean "Sepal.Length" per species.
data %>%
  group_by(Species) %>%
  summarise(Avg_Sepal.Length = mean(Sepal.Length))

## # A tibble: 3 x 2
##   Species    Avg_Sepal.Length
##   <fct>                 <dbl>
## 1 setosa                 5.01
## 2 versicolor             5.94
## 3 virginica              6.59

main

Question (a)

Question (b)