1 In-class 1

Task: Comment on what each code chunk in the following classroom markdown file is trying to achive and on its output.

1.1 Data

# load {WWGbook} packages
pacman::p_load(WWGbook)

# Loads "classroom" data sets in {WWGbook} packages
data(classroom, package="WWGbook")
# information related to "classroom" data sets
?classroom
  • The Study of Instructional Improvement (SII; Hill, Rowan, and Ball, 2004) was carried out by researchers at the University of Michigan to study the math achievement scores of first- and third-grade students in randomly selected classrooms from a national U.S. sample of elementary schools.
str(classroom)
## 'data.frame':    1190 obs. of  12 variables:
##  $ sex     : int  1 0 1 0 0 1 0 0 1 0 ...
##  $ minority: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ mathkind: int  448 460 511 449 425 450 452 443 422 480 ...
##  $ mathgain: int  32 109 56 83 53 65 51 66 88 -7 ...
##  $ ses     : num  0.46 -0.27 -0.03 -0.38 -0.03 0.76 -0.03 0.2 0.64 0.13 ...
##  $ yearstea: num  1 1 1 2 2 2 2 2 2 2 ...
##  $ mathknow: num  NA NA NA -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 ...
##  $ housepov: num  0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 ...
##  $ mathprep: num  2 2 2 3.25 3.25 3.25 3.25 3.25 3.25 3.25 ...
##  $ classid : int  160 160 160 217 217 217 217 217 217 217 ...
##  $ schoolid: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ childid : int  1 2 3 4 5 6 7 8 9 10 ...

A data frame with 1190 observations on the following 12 variables.

  • sex: Indicator variable (0 = boys, 1 = girls)
  • minority: Indicator variable (0 = non-minority students, 1 = minority students)
  • mathkind: Student math score in the spring of their kindergarten year
  • mathgain: Student gain in math achievement score from the spring of kindergarten to the spring of first grade (the dependent variable)
  • ses: Student socioeconomic status
  • yearstea: First grade teacher years of teaching experience
  • mathknow: First grade teacher mathematics content knowledge: based on a scale based composed of 30 items (higher values indicate higher content knowledge)
  • housepov: Percentage of households in the neighborhood of the school below the poverty level
  • mathprep: First grade teacher mathematics preparation: number of mathematics content and methods courses
  • classid: Classroom ID number
  • schoolid: School ID number
  • childid: Student ID number

1.2 Split data

# 取school id 跟 housepov 因為每所學校只有一個housepov值,藉由duplicated function把重複的值刪掉,生成dta_schl dataframe,間接得知可能有107所學校

dta_schl <- classroom[duplicated(classroom$schoolid)==FALSE, 
                     c("schoolid", "housepov")]
# 因為每個班級在yearstea, mathknow, mathprep 只會有一個值,藉由duplicated function把重複的值刪掉,並以 schoolid, classid, yearstea, mathknow, mathprep 作為欄位排序,生成dta_cls dataframe

dta_cls <- classroom[duplicated(classroom$classid)==FALSE, 
                     c(11, 10, 6, 7, 9)]
# 取classroom裡 childid, classid, schoolid, sex, minority, mathkind, mathgain, ses 的欄位,生成dta_chld dataframe
dta_chld <- classroom[, c(12, 10, 11, 1:5)]
# 列出 dta_schl, dta_cls, dta_chld 這三個data frame的dimension (rowx column)
sapply(list(dta_schl, dta_cls, dta_chld), dim)
##      [,1] [,2] [,3]
## [1,]  107  312 1190
## [2,]    2    5    8
# sapply is a user-friendly version and wrapper of lapply by default returning a vector, matrix

1.3 Combine data

# 將dta_chld 與 dta_cls 兩個data frame 依照 classid 及 schoolid兩欄相符狀況合併
dta_12 <- merge(x=dta_chld, y=dta_cls, by=c("classid", "schoolid"))
dta_12 |> head()
##   classid schoolid childid sex minority mathkind mathgain   ses yearstea
## 1       1       61     653   0        1      442       35 -0.09        2
## 2       1       61     654   0        1      422      109 -1.32        2
## 3       1       61     655   1        1      482       66  0.43        2
## 4       1       61     656   1        1      489       19 -0.06        2
## 5       1       61     657   0        1      460       10 -1.28        2
## 6      10       17     216   1        1      381      143 -0.49        5
##   mathknow mathprep
## 1    -0.72     2.50
## 2    -0.72     2.50
## 3    -0.72     2.50
## 4    -0.72     2.50
## 5    -0.72     2.50
## 6    -1.26     3.25
# 同前一步驟??? 
dta_13 <- merge(x=dta_chld, y=dta_cls, by=c("classid", "schoolid"))
dta_13 |> head()
##   classid schoolid childid sex minority mathkind mathgain   ses yearstea
## 1       1       61     653   0        1      442       35 -0.09        2
## 2       1       61     654   0        1      422      109 -1.32        2
## 3       1       61     655   1        1      482       66  0.43        2
## 4       1       61     656   1        1      489       19 -0.06        2
## 5       1       61     657   0        1      460       10 -1.28        2
## 6      10       17     216   1        1      381      143 -0.49        5
##   mathknow mathprep
## 1    -0.72     2.50
## 2    -0.72     2.50
## 3    -0.72     2.50
## 4    -0.72     2.50
## 5    -0.72     2.50
## 6    -1.26     3.25
# 將dta_chld 與 dta_cls 兩個data frame 依照schoolid 相符狀況合併
dta_23 <- merge(x=dta_cls, y=dta_schl, by="schoolid")
dta_23 |> head()
##   schoolid classid yearstea mathknow mathprep housepov
## 1        1     160     1.00       NA     2.00    0.082
## 2        1     217     2.00    -0.11     3.25    0.082
## 3        2     197     1.00    -1.25     2.50    0.082
## 4        2     211     2.00    -0.72     2.33    0.082
## 5        2     307    12.54       NA     2.30    0.082
## 6        3      11    20.00     0.45     3.83    0.086
# dta_12 與 dta_schl 兩個data frame 依照schoolid 相符狀況合併
# 基本上已跟classroom一樣
dta_123 <- merge(x=dta_12, y=dta_schl, by=c("schoolid"))
dta_123 |> head()
##   schoolid classid childid sex minority mathkind mathgain   ses yearstea
## 1        1     217       6   1        1      450       65  0.76        2
## 2        1     217       9   1        1      422       88  0.64        2
## 3        1     160       1   1        1      448       32  0.46        1
## 4        1     217       8   0        1      443       66  0.20        2
## 5        1     217      11   0        1      502       60  0.83        2
## 6        1     160       2   0        1      460      109 -0.27        1
##   mathknow mathprep housepov
## 1    -0.11     3.25    0.082
## 2    -0.11     3.25    0.082
## 3       NA     2.00    0.082
## 4    -0.11     3.25    0.082
## 5    -0.11     3.25    0.082
## 6       NA     2.00    0.082
# 列出 dta_12, dta_13, dta_23, dta_123 這四個data frame的dimension (rowx column)
sapply(list(dta_12, dta_13, dta_23, dta_123), dim)
##      [,1] [,2] [,3] [,4]
## [1,] 1190 1190  312 1190
## [2,]   11   11    6   12

2 Inclass 2

Question: Merge the two data sets: state.x77{datasets} and USArrests{datasets} and compute all pair-wise correlations for numerical variables. Is there anything interesting to report?

state.x77{datasets} matrix with 50 rows and 8 columns giving the following statistics in the respective columns.

  • Population: population estimate as of July 1, 1975
  • Income: per capita income (1974)
  • Illiteracy: illiteracy (1970, percent of population)
  • Life Exp: life expectancy in years (1969–71)
  • Murder: murder and non-negligent manslaughter rate per 100,000 population (1976)
  • HS Grad: percent high-school graduates (1970)
  • Frost: mean number of days with minimum temperature below freezing (1931–1960) in capital or large city
  • Area: land area in square miles

USArrests{datasets}

This data set contains statistics, in arrests per 100,000 residents for assault, murder, and rape in each of the 50 US states in 1973. Also given is the percent of the population living in urban areas.

Murder: numeric Murder arrests (per 100,000) Assault: numeric Assault arrests (per 100,000) UrbanPop: numeric Percent urban population Rape: numeric Rape arrests (per 100,000)

2.1 Input data

pacman::p_load(dplyr, magrittr, corrplot)

# merge by row name (states)
dat2 <- merge(state.x77, USArrests, by="row.names", all=TRUE)

# column name
dat2 %<>%
 dplyr::rename(State = Row.names, 
               Murder1976 = Murder.x, 
               Murder1973 = Murder.y)
# change type and show head
dat2 <- dat2 |>
  mutate(State= as.factor(State))

head(dat2)
##        State Population Income Illiteracy Life Exp Murder1976 HS Grad Frost
## 1    Alabama       3615   3624        2.1    69.05       15.1    41.3    20
## 2     Alaska        365   6315        1.5    69.31       11.3    66.7   152
## 3    Arizona       2212   4530        1.8    70.55        7.8    58.1    15
## 4   Arkansas       2110   3378        1.9    70.66       10.1    39.9    65
## 5 California      21198   5114        1.1    71.71       10.3    62.6    20
## 6   Colorado       2541   4884        0.7    72.06        6.8    63.9   166
##     Area Murder1973 Assault UrbanPop Rape
## 1  50708       13.2     236       58 21.2
## 2 566432       10.0     263       48 44.5
## 3 113417        8.1     294       80 31.0
## 4  51945        8.8     190       50 19.5
## 5 156361        9.0     276       91 40.6
## 6 103766        7.9     204       78 38.7

2.2 Correlation table

cor(dat2[,-1])|> knitr::kable(digits = 2)
Population Income Illiteracy Life Exp Murder1976 HS Grad Frost Area Murder1973 Assault UrbanPop Rape
Population 1.00 0.21 0.11 -0.07 0.34 -0.10 -0.33 0.02 0.32 0.32 0.51 0.31
Income 0.21 1.00 -0.44 0.34 -0.23 0.62 0.23 0.36 -0.22 0.04 0.48 0.36
Illiteracy 0.11 -0.44 1.00 -0.59 0.70 -0.66 -0.67 0.08 0.71 0.51 -0.06 0.15
Life Exp -0.07 0.34 -0.59 1.00 -0.78 0.58 0.26 -0.11 -0.78 -0.63 0.27 -0.27
Murder1976 0.34 -0.23 0.70 -0.78 1.00 -0.49 -0.54 0.23 0.93 0.74 0.02 0.58
HS Grad -0.10 0.62 -0.66 0.58 -0.49 1.00 0.37 0.33 -0.52 -0.23 0.36 0.27
Frost -0.33 0.23 -0.67 0.26 -0.54 0.37 1.00 0.06 -0.54 -0.47 -0.25 -0.28
Area 0.02 0.36 0.08 -0.11 0.23 0.33 0.06 1.00 0.15 0.23 -0.06 0.52
Murder1973 0.32 -0.22 0.71 -0.78 0.93 -0.52 -0.54 0.15 1.00 0.80 0.07 0.56
Assault 0.32 0.04 0.51 -0.63 0.74 -0.23 -0.47 0.23 0.80 1.00 0.26 0.67
UrbanPop 0.51 0.48 -0.06 0.27 0.02 0.36 -0.25 -0.06 0.07 0.26 1.00 0.41
Rape 0.31 0.36 0.15 -0.27 0.58 0.27 -0.28 0.52 0.56 0.67 0.41 1.00

2.3 Correlation plot

corrplot.mixed(cor(dat2[,-1]), number.cex = .7, tl.cex = .5, tl.col = "black")

# corrplot(cor(dat2[,-1]))

For corr. > 0.5
Positive correlation
- “Population estimate as of July 1, 1975” and “Percent urban population in 1973”.
- “Per capita income in 1974” and “Percent high-school graduates in 1970”.
- “illiteracy in 1970” with “Murder arrests in 1973”, “Assault arrests in 1973”, and “Murder and non-negligent manslaughter rate in 1976”.
- “Life expectancy” with “Percent high-school graduates in 1970”.
- “Murder and non-negligent manslaughter rate in 1976” with “Murder arrests in 1973”, “Assault arrests in 1973”, and “Rape arrests in 1973”.
- “Murder arrests in 1973” and “Assault arrests in 1973”, “Rape arrests in 1973”.

Negative correlation
- “illiteracy in 1970” with “Life expectancy”, “Percent high-school graduates in 1970”, and “Minimum temperature below freezing in past 30 years”.
- “Murder and non-negligent manslaughter rate in 1976” and “Minimum temperature below freezing in past 30 years”.
- “Percent high-school graduates in 1970” and “Murder arrests in 1973”.
- “Minimum temperature below freezing in past 30 years” and “Murder arrests in 1973”.

謀殺犯罪率高的可能與較高的平均氣溫、低高中畢業率、低識字率、低預期壽命有關;但與地區人口數及平均所得較無關。

3 Inclass 3

Summarize the backpain{HSAUR3} into the following format:

driver suburban case control total
no no ? ? ?
no yes ? ? ?
yes no ? ? ?
yes yes ? ? ?
  • You should provide comments for each code chunk.

3.1 Input data

# load package
pacman::p_load(HSAUR3)

# Input data
data("backpain", package="HSAUR3")
dat3 <- backpain

3.2 Data management and table

dat3 %>% 
 dplyr::rename(Group = status, 
               Driver = driver, 
               Suburban = suburban) %>% # column rename
  group_by(Driver, Suburban) %>% # Group by Driver and Suburban
  tidyr::spread(key="Group", value ="Group") %>% # key(Column names) to value
  summarize(Case = sum(is.na(case)), # sum of case number
            Control = sum(is.na(control)), # sum of control number
            Total = n()) %>% # sum of total
  as.data.frame () %>% #tibble to dataframe
  knitr::kable ()
## `summarise()` has grouped output by 'Driver'. You can override using the `.groups` argument.
Driver Suburban Case Control Total
no no 38 17 64
no yes 5 4 11
yes no 43 44 107
yes yes 37 58 158

4 Inclass 4

The data set Vocab{carData} gives observations on gender, education and vocabulary, from respondents to U.S. General Social Surveys, 1972-2004.

Task: Summarize the relationship between education and vocabulary over the years by gender.

  • year: Year of the survey.
  • sex: Sex of the respondent, Female or Male.
  • education: Education, in years.
  • vocabulary: Vocabulary test score: number correct on a 10-word test.

4.1 Input data

# load package
pacman::p_load(carData)

# Input data
data("Vocab", package="carData")
dat4 <- Vocab
#columan rename
dat4 <- dat4 %>% 
 dplyr::rename(Year = year, 
               Sex = sex, 
               Education = education,
               Vocabulary = vocabulary)

4.2 Plot

dat4 %>% 
  mutate(Year = as.factor(Year)) %>% # for show the year number
  lattice::xyplot(Vocabulary ~ Education | Year, 
                  groups = Sex,
                  type = c("p", "g", "r"), data = ., 
                  cex= .5,
                  pch = 19,
                  auto.key = list(columns = 2),
                  xlab = "Education", ylab = "Vocabulary",
                  layout = c(4,6),
                  par.settings = list(superpose.symbol = list(pch = 19, cex = 1.5,
                                                   col = c("#D95F02", "#7570B3")),
                                      superpose.line = list(col = c("#D95F02", "#7570B3"),
                                                 lwd = 1.5)))

  • 不分男女隨著受教育的時間越長,單字量隨之增加。(感覺早期女性較男性佳,後期男性則較女性佳)

4.3 Reference

  • National Opinion Research Center General Social Survey. GSS Cumulative Datafile 1972-2016, downloaded from http://gss.norc.org/.

5 Inclass 5

An example with dplyr

Task: Supply comments to each code chunk in the following survey rmarkdown file and preview it as an R notebook or knit to html.

The data set concerns species and weight of animals caught in plots in a study area in Arizona over time.

Each row holds information for a single animal, and the columns represent:

  • record_id: Unique id for the observation
  • month: month of observation
  • day: day of observation
  • year: year of observation
  • plot_id: ID of a particular plot
  • species_id: 2-letter code
  • sex: sex of animal (“M”, “F”)
  • hindfoot_length: length of the hindfoot in mm
  • weight: weight of the animal in grams
  • genus: genus of animal
  • species: species of animal
  • taxa: e.g. Rodent, Reptile, Bird, Rabbit
  • plot_type: type of plot

5.1 Input data

# load package.
pacman::p_load(tidyverse)
# read csv data from the link.
dat5 <- read_csv("http://kbroman.org/datacarp/portal_data_joined.csv")
## Rows: 34786 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): species_id, sex, genus, species, taxa, plot_type
## dbl (7): record_id, month, day, year, plot_id, hindfoot_length, weight
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

5.2 Check Data structure

# show every column in dat5 data frame.
glimpse(dat5)
## Rows: 34,786
## Columns: 13
## $ record_id       <dbl> 1, 72, 224, 266, 349, 363, 435, 506, 588, 661, 748, 84~
## $ month           <dbl> 7, 8, 9, 10, 11, 11, 12, 1, 2, 3, 4, 5, 6, 8, 9, 10, 1~
## $ day             <dbl> 16, 19, 13, 16, 12, 12, 10, 8, 18, 11, 8, 6, 9, 5, 4, ~
## $ year            <dbl> 1977, 1977, 1977, 1977, 1977, 1977, 1977, 1978, 1978, ~
## $ plot_id         <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
## $ species_id      <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", ~
## $ sex             <chr> "M", "M", NA, NA, NA, NA, NA, NA, "M", NA, NA, "M", "M~
## $ hindfoot_length <dbl> 32, 31, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, NA, 34~
## $ weight          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 218, NA, NA, 204, 200,~
## $ genus           <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Neotoma",~
## $ species         <chr> "albigula", "albigula", "albigula", "albigula", "albig~
## $ taxa            <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent", "Rod~
## $ plot_type       <chr> "Control", "Control", "Control", "Control", "Control",~
# dimension of the data (Rows: 34,786 x Columns: 13).
dim(dat5)
## [1] 34786    13

5.3 Subset / Group

# choose plot_id, species_id, weight column from the data and check the head 6 row.
dplyr::select(dat5, plot_id, species_id, weight) %>% 
  head()
## # A tibble: 6 x 3
##   plot_id species_id weight
##     <dbl> <chr>       <dbl>
## 1       2 NL             NA
## 2       2 NL             NA
## 3       2 NL             NA
## 4       2 NL             NA
## 5       2 NL             NA
## 6       2 NL             NA
# Except for record_id, species_id, other column will be choosed from the data and check the head 6 row.

dplyr::select(dat5, -record_id, -species_id) %>% 
  head()
## # A tibble: 6 x 11
##   month   day  year plot_id sex   hindfoot_length weight genus   species  taxa  
##   <dbl> <dbl> <dbl>   <dbl> <chr>           <dbl>  <dbl> <chr>   <chr>    <chr> 
## 1     7    16  1977       2 M                  32     NA Neotoma albigula Rodent
## 2     8    19  1977       2 M                  31     NA Neotoma albigula Rodent
## 3     9    13  1977       2 <NA>               NA     NA Neotoma albigula Rodent
## 4    10    16  1977       2 <NA>               NA     NA Neotoma albigula Rodent
## 5    11    12  1977       2 <NA>               NA     NA Neotoma albigula Rodent
## 6    11    12  1977       2 <NA>               NA     NA Neotoma albigula Rodent
## # ... with 1 more variable: plot_type <chr>
# select/subset the data that the year column equal to 1995 from the data and check the head 6 row.
dplyr::filter(dat5, year == 1995) %>% head()
## # A tibble: 6 x 13
##   record_id month   day  year plot_id species_id sex   hindfoot_length weight
##       <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
## 1     22314     6     7  1995       2 NL         M                  34     NA
## 2     22728     9    23  1995       2 NL         F                  32    165
## 3     22899    10    28  1995       2 NL         F                  32    171
## 4     23032    12     2  1995       2 NL         F                  33     NA
## 5     22003     1    11  1995       2 DM         M                  37     41
## 6     22042     2     4  1995       2 DM         F                  36     45
## # ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
## #   plot_type <chr>
# choose species_id, sex, weight column from the data and subset the data that the weight column equal and small to 5 and show the head 6 row of the data.
head(dplyr::select(dplyr::filter(dat5, weight <= 5), species_id, sex, weight))
## # A tibble: 6 x 3
##   species_id sex   weight
##   <chr>      <chr>  <dbl>
## 1 PF         M          5
## 2 PF         F          5
## 3 PF         F          5
## 4 PF         F          4
## 5 PF         F          5
## 6 PF         F          4
# same purpose as previous chunk by use %>% function.
dat5 %>% 
  dplyr::filter(weight <= 5) %>% 
  dplyr::select(species_id, sex, weight) %>% 
  head()
## # A tibble: 6 x 3
##   species_id sex   weight
##   <chr>      <chr>  <dbl>
## 1 PF         M          5
## 2 PF         F          5
## 3 PF         F          5
## 4 PF         F          4
## 5 PF         F          5
## 6 PF         F          4
# create two column "weight_kg" and "weight_lb" by unit conversion calculated. 
dat5 %>% 
  mutate(weight_kg = weight / 1000,
         weight_lb = weight_kg * 2.2) %>% 
  head()
## # A tibble: 6 x 15
##   record_id month   day  year plot_id species_id sex   hindfoot_length weight
##       <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
## 1         1     7    16  1977       2 NL         M                  32     NA
## 2        72     8    19  1977       2 NL         M                  31     NA
## 3       224     9    13  1977       2 NL         <NA>               NA     NA
## 4       266    10    16  1977       2 NL         <NA>               NA     NA
## 5       349    11    12  1977       2 NL         <NA>               NA     NA
## 6       363    11    12  1977       2 NL         <NA>               NA     NA
## # ... with 6 more variables: genus <chr>, species <chr>, taxa <chr>,
## #   plot_type <chr>, weight_kg <dbl>, weight_lb <dbl>
dat5 %>% 
  filter(!is.na(weight)) %>% # 保留非NA的數值
  group_by(sex, species_id) %>% # 依照sex及species_id群組
  summarize(mean_weight = mean(weight)) %>% # 新增 mean_weight (weight的平均值)欄位
  arrange(desc(mean_weight)) %>% # mean_weight 從高到低排序
  head() # show the head 6 row of the data.
## `summarise()` has grouped output by 'sex'. You can override using the `.groups` argument.
## # A tibble: 6 x 3
## # Groups:   sex [3]
##   sex   species_id mean_weight
##   <chr> <chr>            <dbl>
## 1 <NA>  NL                168.
## 2 M     NL                166.
## 3 F     NL                154.
## 4 M     SS                130 
## 5 <NA>  SH                130 
## 6 M     DS                122.
# tally() is a lower-level function that assumes you've done the grouping
dat5 %>%
  group_by(sex) %>%
  tally() # similar to dplyr::count(), but need group first
## # A tibble: 3 x 2
##   sex       n
##   <chr> <int>
## 1 F     15690
## 2 M     17348
## 3 <NA>   1748
# the purpose same as previous chunk, count() no need group first.
dat5 %>%
  count(sex)
## # A tibble: 3 x 2
##   sex       n
##   <chr> <int>
## 1 F     15690
## 2 M     17348
## 3 <NA>   1748
# the purpose same as previous chunk
# create a count column = current group size.
dat5 %>%
  group_by(sex) %>%
  summarize(count = n())
## # A tibble: 3 x 2
##   sex   count
##   <chr> <int>
## 1 F     15690
## 2 M     17348
## 3 <NA>   1748
# the purpose same as previous chunk.
# create a count column = sum of the count of observation year without miss value.
# if change the "year" to "day" or "month", the result will be the same.

dat5 %>%
  group_by(sex) %>%
  summarize(count = sum(!is.na(year)))
## # A tibble: 3 x 2
##   sex   count
##   <chr> <int>
## 1 F     15690
## 2 M     17348
## 3 <NA>   1748
# 各個動物屬在不同plot_id的平均體重值
dat5_gw <- dat5 %>% 
  filter(!is.na(weight)) %>% # remove the NA row in the weight column. 
  group_by(genus, plot_id) %>% # group by genus and plot_id
  summarize(mean_weight = mean(weight)) # create the mean_weight column
## `summarise()` has grouped output by 'genus'. You can override using the `.groups` argument.
# view the data
glimpse(dat5_gw)
## Rows: 196
## Columns: 3
## Groups: genus [10]
## $ genus       <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
## $ plot_id     <dbl> 1, 2, 3, 5, 18, 19, 20, 21, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,~
## $ mean_weight <dbl> 7.000000, 6.000000, 8.611111, 7.750000, 9.500000, 9.533333~

5.4 Reshape

# long to wide
# new column is by genus, the row value is by mean_weight
dat5_w <- dat5_gw %>%
  spread(key = genus, value = mean_weight)
# view the data
glimpse(dat5_w)
## Rows: 24
## Columns: 11
## $ plot_id         <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,~
## $ Baiomys         <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA~
## $ Chaetodipus     <dbl> 22.19939, 25.11014, 24.63636, 23.02381, 17.98276, 24.8~
## $ Dipodomys       <dbl> 60.23214, 55.68259, 52.04688, 57.52454, 51.11356, 58.6~
## $ Neotoma         <dbl> 156.2222, 169.1436, 158.2414, 164.1667, 190.0370, 179.~
## $ Onychomys       <dbl> 27.67550, 26.87302, 26.03241, 28.09375, 27.01695, 25.8~
## $ Perognathus     <dbl> 9.625000, 6.947368, 7.507812, 7.824427, 8.658537, 7.80~
## $ Peromyscus      <dbl> 22.22222, 22.26966, 21.37037, 22.60000, 21.23171, 21.8~
## $ Reithrodontomys <dbl> 11.375000, 10.680556, 10.516588, 10.263158, 11.154545,~
## $ Sigmodon        <dbl> NA, 70.85714, 65.61404, 82.00000, 82.66667, 68.77778, ~
## $ Spermophilus    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 13~
# same purpose of dat5_w
dat5_gw %>%
  spread(genus, mean_weight, fill = 0) %>%  # fill= If set, missing values will be replaced with this value.
  head()
## # A tibble: 6 x 11
##   plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
##     <dbl>   <dbl>       <dbl>     <dbl>   <dbl>     <dbl>       <dbl>      <dbl>
## 1       1    7           22.2      60.2    156.      27.7        9.62       22.2
## 2       2    6           25.1      55.7    169.      26.9        6.95       22.3
## 3       3    8.61        24.6      52.0    158.      26.0        7.51       21.4
## 4       4    0           23.0      57.5    164.      28.1        7.82       22.6
## 5       5    7.75        18.0      51.1    190.      27.0        8.66       21.2
## 6       6    0           24.9      58.6    180.      25.9        7.81       21.8
## # ... with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
## #   Spermophilus <dbl>
# wide to long exclude plot_id.
dat5_l <- dat5_w %>%
  gather(key = genus, value = mean_weight, -plot_id)
# view the data
glimpse(dat5_l)
## Rows: 240
## Columns: 3
## $ plot_id     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,~
## $ genus       <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
## $ mean_weight <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA, NA~
# same as dat5_l
dat5_w %>%
  gather(key = genus, value = mean_weight, Baiomys:Spermophilus) %>%
  head()
## # A tibble: 6 x 3
##   plot_id genus   mean_weight
##     <dbl> <chr>         <dbl>
## 1       1 Baiomys        7   
## 2       2 Baiomys        6   
## 3       3 Baiomys        8.61
## 4       4 Baiomys       NA   
## 5       5 Baiomys        7.75
## 6       6 Baiomys       NA
# remove missing value of weight, hindfoot_length, and sex in the data set
# 34789 -> 30676
dat5_complete <- dat5 %>%
  filter(!is.na(weight),           
         !is.na(hindfoot_length),  
         !is.na(sex))                
# keep the number of species higher than 49. 
# 14 x 2
species_counts <- dat5_complete %>%
    count(species_id) %>% 
    filter(n >= 50)
# keep the species_id with number of species higher than 49 from species_counts. 
# 30676 -> 30463
dat5_complete <- dat5_complete %>%
  filter(species_id %in% species_counts$species_id)