Task: Comment on what each code chunk in the following classroom markdown file is trying to achive and on its output.
# load {WWGbook} packages
::p_load(WWGbook)
pacman
# Loads "classroom" data sets in {WWGbook} packages
data(classroom, package="WWGbook")
# information related to "classroom" data sets
?classroom
str(classroom)
## 'data.frame': 1190 obs. of 12 variables:
## $ sex : int 1 0 1 0 0 1 0 0 1 0 ...
## $ minority: int 1 1 1 1 1 1 1 1 1 1 ...
## $ mathkind: int 448 460 511 449 425 450 452 443 422 480 ...
## $ mathgain: int 32 109 56 83 53 65 51 66 88 -7 ...
## $ ses : num 0.46 -0.27 -0.03 -0.38 -0.03 0.76 -0.03 0.2 0.64 0.13 ...
## $ yearstea: num 1 1 1 2 2 2 2 2 2 2 ...
## $ mathknow: num NA NA NA -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 ...
## $ housepov: num 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 ...
## $ mathprep: num 2 2 2 3.25 3.25 3.25 3.25 3.25 3.25 3.25 ...
## $ classid : int 160 160 160 217 217 217 217 217 217 217 ...
## $ schoolid: int 1 1 1 1 1 1 1 1 1 1 ...
## $ childid : int 1 2 3 4 5 6 7 8 9 10 ...
A data frame with 1190 observations on the following 12 variables.
# 取school id 跟 housepov 因為每所學校只有一個housepov值,藉由duplicated function把重複的值刪掉,生成dta_schl dataframe,間接得知可能有107所學校
<- classroom[duplicated(classroom$schoolid)==FALSE,
dta_schl c("schoolid", "housepov")]
# 因為每個班級在yearstea, mathknow, mathprep 只會有一個值,藉由duplicated function把重複的值刪掉,並以 schoolid, classid, yearstea, mathknow, mathprep 作為欄位排序,生成dta_cls dataframe
<- classroom[duplicated(classroom$classid)==FALSE,
dta_cls c(11, 10, 6, 7, 9)]
# 取classroom裡 childid, classid, schoolid, sex, minority, mathkind, mathgain, ses 的欄位,生成dta_chld dataframe
<- classroom[, c(12, 10, 11, 1:5)] dta_chld
# 列出 dta_schl, dta_cls, dta_chld 這三個data frame的dimension (rowx column)
sapply(list(dta_schl, dta_cls, dta_chld), dim)
## [,1] [,2] [,3]
## [1,] 107 312 1190
## [2,] 2 5 8
# sapply is a user-friendly version and wrapper of lapply by default returning a vector, matrix
# 將dta_chld 與 dta_cls 兩個data frame 依照 classid 及 schoolid兩欄相符狀況合併
<- merge(x=dta_chld, y=dta_cls, by=c("classid", "schoolid"))
dta_12 |> head() dta_12
## classid schoolid childid sex minority mathkind mathgain ses yearstea
## 1 1 61 653 0 1 442 35 -0.09 2
## 2 1 61 654 0 1 422 109 -1.32 2
## 3 1 61 655 1 1 482 66 0.43 2
## 4 1 61 656 1 1 489 19 -0.06 2
## 5 1 61 657 0 1 460 10 -1.28 2
## 6 10 17 216 1 1 381 143 -0.49 5
## mathknow mathprep
## 1 -0.72 2.50
## 2 -0.72 2.50
## 3 -0.72 2.50
## 4 -0.72 2.50
## 5 -0.72 2.50
## 6 -1.26 3.25
# 同前一步驟???
<- merge(x=dta_chld, y=dta_cls, by=c("classid", "schoolid"))
dta_13 |> head() dta_13
## classid schoolid childid sex minority mathkind mathgain ses yearstea
## 1 1 61 653 0 1 442 35 -0.09 2
## 2 1 61 654 0 1 422 109 -1.32 2
## 3 1 61 655 1 1 482 66 0.43 2
## 4 1 61 656 1 1 489 19 -0.06 2
## 5 1 61 657 0 1 460 10 -1.28 2
## 6 10 17 216 1 1 381 143 -0.49 5
## mathknow mathprep
## 1 -0.72 2.50
## 2 -0.72 2.50
## 3 -0.72 2.50
## 4 -0.72 2.50
## 5 -0.72 2.50
## 6 -1.26 3.25
# 將dta_chld 與 dta_cls 兩個data frame 依照schoolid 相符狀況合併
<- merge(x=dta_cls, y=dta_schl, by="schoolid")
dta_23 |> head() dta_23
## schoolid classid yearstea mathknow mathprep housepov
## 1 1 160 1.00 NA 2.00 0.082
## 2 1 217 2.00 -0.11 3.25 0.082
## 3 2 197 1.00 -1.25 2.50 0.082
## 4 2 211 2.00 -0.72 2.33 0.082
## 5 2 307 12.54 NA 2.30 0.082
## 6 3 11 20.00 0.45 3.83 0.086
# dta_12 與 dta_schl 兩個data frame 依照schoolid 相符狀況合併
# 基本上已跟classroom一樣
<- merge(x=dta_12, y=dta_schl, by=c("schoolid"))
dta_123 |> head() dta_123
## schoolid classid childid sex minority mathkind mathgain ses yearstea
## 1 1 217 6 1 1 450 65 0.76 2
## 2 1 217 9 1 1 422 88 0.64 2
## 3 1 160 1 1 1 448 32 0.46 1
## 4 1 217 8 0 1 443 66 0.20 2
## 5 1 217 11 0 1 502 60 0.83 2
## 6 1 160 2 0 1 460 109 -0.27 1
## mathknow mathprep housepov
## 1 -0.11 3.25 0.082
## 2 -0.11 3.25 0.082
## 3 NA 2.00 0.082
## 4 -0.11 3.25 0.082
## 5 -0.11 3.25 0.082
## 6 NA 2.00 0.082
# 列出 dta_12, dta_13, dta_23, dta_123 這四個data frame的dimension (rowx column)
sapply(list(dta_12, dta_13, dta_23, dta_123), dim)
## [,1] [,2] [,3] [,4]
## [1,] 1190 1190 312 1190
## [2,] 11 11 6 12
Question: Merge the two data sets: state.x77{datasets} and USArrests{datasets} and compute all pair-wise correlations for numerical variables. Is there anything interesting to report?
state.x77{datasets} matrix with 50 rows and 8 columns giving the following statistics in the respective columns.
USArrests{datasets}
This data set contains statistics, in arrests per 100,000 residents for assault, murder, and rape in each of the 50 US states in 1973. Also given is the percent of the population living in urban areas.
Murder: numeric Murder arrests (per 100,000) Assault: numeric Assault arrests (per 100,000) UrbanPop: numeric Percent urban population Rape: numeric Rape arrests (per 100,000)
::p_load(dplyr, magrittr, corrplot)
pacman
# merge by row name (states)
<- merge(state.x77, USArrests, by="row.names", all=TRUE)
dat2
# column name
%<>%
dat2 ::rename(State = Row.names,
dplyrMurder1976 = Murder.x,
Murder1973 = Murder.y)
# change type and show head
<- dat2 |>
dat2 mutate(State= as.factor(State))
head(dat2)
## State Population Income Illiteracy Life Exp Murder1976 HS Grad Frost
## 1 Alabama 3615 3624 2.1 69.05 15.1 41.3 20
## 2 Alaska 365 6315 1.5 69.31 11.3 66.7 152
## 3 Arizona 2212 4530 1.8 70.55 7.8 58.1 15
## 4 Arkansas 2110 3378 1.9 70.66 10.1 39.9 65
## 5 California 21198 5114 1.1 71.71 10.3 62.6 20
## 6 Colorado 2541 4884 0.7 72.06 6.8 63.9 166
## Area Murder1973 Assault UrbanPop Rape
## 1 50708 13.2 236 58 21.2
## 2 566432 10.0 263 48 44.5
## 3 113417 8.1 294 80 31.0
## 4 51945 8.8 190 50 19.5
## 5 156361 9.0 276 91 40.6
## 6 103766 7.9 204 78 38.7
cor(dat2[,-1])|> knitr::kable(digits = 2)
Population | Income | Illiteracy | Life Exp | Murder1976 | HS Grad | Frost | Area | Murder1973 | Assault | UrbanPop | Rape | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Population | 1.00 | 0.21 | 0.11 | -0.07 | 0.34 | -0.10 | -0.33 | 0.02 | 0.32 | 0.32 | 0.51 | 0.31 |
Income | 0.21 | 1.00 | -0.44 | 0.34 | -0.23 | 0.62 | 0.23 | 0.36 | -0.22 | 0.04 | 0.48 | 0.36 |
Illiteracy | 0.11 | -0.44 | 1.00 | -0.59 | 0.70 | -0.66 | -0.67 | 0.08 | 0.71 | 0.51 | -0.06 | 0.15 |
Life Exp | -0.07 | 0.34 | -0.59 | 1.00 | -0.78 | 0.58 | 0.26 | -0.11 | -0.78 | -0.63 | 0.27 | -0.27 |
Murder1976 | 0.34 | -0.23 | 0.70 | -0.78 | 1.00 | -0.49 | -0.54 | 0.23 | 0.93 | 0.74 | 0.02 | 0.58 |
HS Grad | -0.10 | 0.62 | -0.66 | 0.58 | -0.49 | 1.00 | 0.37 | 0.33 | -0.52 | -0.23 | 0.36 | 0.27 |
Frost | -0.33 | 0.23 | -0.67 | 0.26 | -0.54 | 0.37 | 1.00 | 0.06 | -0.54 | -0.47 | -0.25 | -0.28 |
Area | 0.02 | 0.36 | 0.08 | -0.11 | 0.23 | 0.33 | 0.06 | 1.00 | 0.15 | 0.23 | -0.06 | 0.52 |
Murder1973 | 0.32 | -0.22 | 0.71 | -0.78 | 0.93 | -0.52 | -0.54 | 0.15 | 1.00 | 0.80 | 0.07 | 0.56 |
Assault | 0.32 | 0.04 | 0.51 | -0.63 | 0.74 | -0.23 | -0.47 | 0.23 | 0.80 | 1.00 | 0.26 | 0.67 |
UrbanPop | 0.51 | 0.48 | -0.06 | 0.27 | 0.02 | 0.36 | -0.25 | -0.06 | 0.07 | 0.26 | 1.00 | 0.41 |
Rape | 0.31 | 0.36 | 0.15 | -0.27 | 0.58 | 0.27 | -0.28 | 0.52 | 0.56 | 0.67 | 0.41 | 1.00 |
corrplot.mixed(cor(dat2[,-1]), number.cex = .7, tl.cex = .5, tl.col = "black")
# corrplot(cor(dat2[,-1]))
For corr. > 0.5
Positive correlation
- “Population estimate as of July 1, 1975” and “Percent urban population in 1973”.
- “Per capita income in 1974” and “Percent high-school graduates in 1970”.
- “illiteracy in 1970” with “Murder arrests in 1973”, “Assault arrests in 1973”, and “Murder and non-negligent manslaughter rate in 1976”.
- “Life expectancy” with “Percent high-school graduates in 1970”.
- “Murder and non-negligent manslaughter rate in 1976” with “Murder arrests in 1973”, “Assault arrests in 1973”, and “Rape arrests in 1973”.
- “Murder arrests in 1973” and “Assault arrests in 1973”, “Rape arrests in 1973”.
Negative correlation
- “illiteracy in 1970” with “Life expectancy”, “Percent high-school graduates in 1970”, and “Minimum temperature below freezing in past 30 years”.
- “Murder and non-negligent manslaughter rate in 1976” and “Minimum temperature below freezing in past 30 years”.
- “Percent high-school graduates in 1970” and “Murder arrests in 1973”.
- “Minimum temperature below freezing in past 30 years” and “Murder arrests in 1973”.
謀殺犯罪率高的可能與較高的平均氣溫、低高中畢業率、低識字率、低預期壽命有關;但與地區人口數及平均所得較無關。
Summarize the backpain{HSAUR3} into the following format:
driver | suburban | case | control | total |
---|---|---|---|---|
no | no | ? | ? ? | |
no | yes | ? | ? ? | |
yes | no | ? | ? ? | |
yes | yes | ? | ? ? |
# load package
::p_load(HSAUR3)
pacman
# Input data
data("backpain", package="HSAUR3")
<- backpain dat3
%>%
dat3 ::rename(Group = status,
dplyrDriver = driver,
Suburban = suburban) %>% # column rename
group_by(Driver, Suburban) %>% # Group by Driver and Suburban
::spread(key="Group", value ="Group") %>% # key(Column names) to value
tidyrsummarize(Case = sum(is.na(case)), # sum of case number
Control = sum(is.na(control)), # sum of control number
Total = n()) %>% # sum of total
as.data.frame () %>% #tibble to dataframe
::kable () knitr
## `summarise()` has grouped output by 'Driver'. You can override using the `.groups` argument.
Driver | Suburban | Case | Control | Total |
---|---|---|---|---|
no | no | 38 | 17 | 64 |
no | yes | 5 | 4 | 11 |
yes | no | 43 | 44 | 107 |
yes | yes | 37 | 58 | 158 |
The data set Vocab{carData} gives observations on gender, education and vocabulary, from respondents to U.S. General Social Surveys, 1972-2004.
Task: Summarize the relationship between education and vocabulary over the years by gender.
# load package
::p_load(carData)
pacman
# Input data
data("Vocab", package="carData")
<- Vocab dat4
#columan rename
<- dat4 %>%
dat4 ::rename(Year = year,
dplyrSex = sex,
Education = education,
Vocabulary = vocabulary)
%>%
dat4 mutate(Year = as.factor(Year)) %>% # for show the year number
::xyplot(Vocabulary ~ Education | Year,
latticegroups = Sex,
type = c("p", "g", "r"), data = .,
cex= .5,
pch = 19,
auto.key = list(columns = 2),
xlab = "Education", ylab = "Vocabulary",
layout = c(4,6),
par.settings = list(superpose.symbol = list(pch = 19, cex = 1.5,
col = c("#D95F02", "#7570B3")),
superpose.line = list(col = c("#D95F02", "#7570B3"),
lwd = 1.5)))
An example with dplyr
Task: Supply comments to each code chunk in the following survey rmarkdown file and preview it as an R notebook or knit to html.
The data set concerns species and weight of animals caught in plots in a study area in Arizona over time.
Each row holds information for a single animal, and the columns represent:
# load package.
::p_load(tidyverse) pacman
# read csv data from the link.
<- read_csv("http://kbroman.org/datacarp/portal_data_joined.csv") dat5
## Rows: 34786 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): species_id, sex, genus, species, taxa, plot_type
## dbl (7): record_id, month, day, year, plot_id, hindfoot_length, weight
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# show every column in dat5 data frame.
glimpse(dat5)
## Rows: 34,786
## Columns: 13
## $ record_id <dbl> 1, 72, 224, 266, 349, 363, 435, 506, 588, 661, 748, 84~
## $ month <dbl> 7, 8, 9, 10, 11, 11, 12, 1, 2, 3, 4, 5, 6, 8, 9, 10, 1~
## $ day <dbl> 16, 19, 13, 16, 12, 12, 10, 8, 18, 11, 8, 6, 9, 5, 4, ~
## $ year <dbl> 1977, 1977, 1977, 1977, 1977, 1977, 1977, 1978, 1978, ~
## $ plot_id <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
## $ species_id <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", ~
## $ sex <chr> "M", "M", NA, NA, NA, NA, NA, NA, "M", NA, NA, "M", "M~
## $ hindfoot_length <dbl> 32, 31, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, NA, 34~
## $ weight <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 218, NA, NA, 204, 200,~
## $ genus <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Neotoma",~
## $ species <chr> "albigula", "albigula", "albigula", "albigula", "albig~
## $ taxa <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent", "Rod~
## $ plot_type <chr> "Control", "Control", "Control", "Control", "Control",~
# dimension of the data (Rows: 34,786 x Columns: 13).
dim(dat5)
## [1] 34786 13
# choose plot_id, species_id, weight column from the data and check the head 6 row.
::select(dat5, plot_id, species_id, weight) %>%
dplyrhead()
## # A tibble: 6 x 3
## plot_id species_id weight
## <dbl> <chr> <dbl>
## 1 2 NL NA
## 2 2 NL NA
## 3 2 NL NA
## 4 2 NL NA
## 5 2 NL NA
## 6 2 NL NA
# Except for record_id, species_id, other column will be choosed from the data and check the head 6 row.
::select(dat5, -record_id, -species_id) %>%
dplyrhead()
## # A tibble: 6 x 11
## month day year plot_id sex hindfoot_length weight genus species taxa
## <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 7 16 1977 2 M 32 NA Neotoma albigula Rodent
## 2 8 19 1977 2 M 31 NA Neotoma albigula Rodent
## 3 9 13 1977 2 <NA> NA NA Neotoma albigula Rodent
## 4 10 16 1977 2 <NA> NA NA Neotoma albigula Rodent
## 5 11 12 1977 2 <NA> NA NA Neotoma albigula Rodent
## 6 11 12 1977 2 <NA> NA NA Neotoma albigula Rodent
## # ... with 1 more variable: plot_type <chr>
# select/subset the data that the year column equal to 1995 from the data and check the head 6 row.
::filter(dat5, year == 1995) %>% head() dplyr
## # A tibble: 6 x 13
## record_id month day year plot_id species_id sex hindfoot_length weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 22314 6 7 1995 2 NL M 34 NA
## 2 22728 9 23 1995 2 NL F 32 165
## 3 22899 10 28 1995 2 NL F 32 171
## 4 23032 12 2 1995 2 NL F 33 NA
## 5 22003 1 11 1995 2 DM M 37 41
## 6 22042 2 4 1995 2 DM F 36 45
## # ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
## # plot_type <chr>
# choose species_id, sex, weight column from the data and subset the data that the weight column equal and small to 5 and show the head 6 row of the data.
head(dplyr::select(dplyr::filter(dat5, weight <= 5), species_id, sex, weight))
## # A tibble: 6 x 3
## species_id sex weight
## <chr> <chr> <dbl>
## 1 PF M 5
## 2 PF F 5
## 3 PF F 5
## 4 PF F 4
## 5 PF F 5
## 6 PF F 4
# same purpose as previous chunk by use %>% function.
%>%
dat5 ::filter(weight <= 5) %>%
dplyr::select(species_id, sex, weight) %>%
dplyrhead()
## # A tibble: 6 x 3
## species_id sex weight
## <chr> <chr> <dbl>
## 1 PF M 5
## 2 PF F 5
## 3 PF F 5
## 4 PF F 4
## 5 PF F 5
## 6 PF F 4
# create two column "weight_kg" and "weight_lb" by unit conversion calculated.
%>%
dat5 mutate(weight_kg = weight / 1000,
weight_lb = weight_kg * 2.2) %>%
head()
## # A tibble: 6 x 15
## record_id month day year plot_id species_id sex hindfoot_length weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 1 7 16 1977 2 NL M 32 NA
## 2 72 8 19 1977 2 NL M 31 NA
## 3 224 9 13 1977 2 NL <NA> NA NA
## 4 266 10 16 1977 2 NL <NA> NA NA
## 5 349 11 12 1977 2 NL <NA> NA NA
## 6 363 11 12 1977 2 NL <NA> NA NA
## # ... with 6 more variables: genus <chr>, species <chr>, taxa <chr>,
## # plot_type <chr>, weight_kg <dbl>, weight_lb <dbl>
%>%
dat5 filter(!is.na(weight)) %>% # 保留非NA的數值
group_by(sex, species_id) %>% # 依照sex及species_id群組
summarize(mean_weight = mean(weight)) %>% # 新增 mean_weight (weight的平均值)欄位
arrange(desc(mean_weight)) %>% # mean_weight 從高到低排序
head() # show the head 6 row of the data.
## `summarise()` has grouped output by 'sex'. You can override using the `.groups` argument.
## # A tibble: 6 x 3
## # Groups: sex [3]
## sex species_id mean_weight
## <chr> <chr> <dbl>
## 1 <NA> NL 168.
## 2 M NL 166.
## 3 F NL 154.
## 4 M SS 130
## 5 <NA> SH 130
## 6 M DS 122.
# tally() is a lower-level function that assumes you've done the grouping
%>%
dat5 group_by(sex) %>%
tally() # similar to dplyr::count(), but need group first
## # A tibble: 3 x 2
## sex n
## <chr> <int>
## 1 F 15690
## 2 M 17348
## 3 <NA> 1748
# the purpose same as previous chunk, count() no need group first.
%>%
dat5 count(sex)
## # A tibble: 3 x 2
## sex n
## <chr> <int>
## 1 F 15690
## 2 M 17348
## 3 <NA> 1748
# the purpose same as previous chunk
# create a count column = current group size.
%>%
dat5 group_by(sex) %>%
summarize(count = n())
## # A tibble: 3 x 2
## sex count
## <chr> <int>
## 1 F 15690
## 2 M 17348
## 3 <NA> 1748
# the purpose same as previous chunk.
# create a count column = sum of the count of observation year without miss value.
# if change the "year" to "day" or "month", the result will be the same.
%>%
dat5 group_by(sex) %>%
summarize(count = sum(!is.na(year)))
## # A tibble: 3 x 2
## sex count
## <chr> <int>
## 1 F 15690
## 2 M 17348
## 3 <NA> 1748
# 各個動物屬在不同plot_id的平均體重值
<- dat5 %>%
dat5_gw filter(!is.na(weight)) %>% # remove the NA row in the weight column.
group_by(genus, plot_id) %>% # group by genus and plot_id
summarize(mean_weight = mean(weight)) # create the mean_weight column
## `summarise()` has grouped output by 'genus'. You can override using the `.groups` argument.
# view the data
glimpse(dat5_gw)
## Rows: 196
## Columns: 3
## Groups: genus [10]
## $ genus <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
## $ plot_id <dbl> 1, 2, 3, 5, 18, 19, 20, 21, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,~
## $ mean_weight <dbl> 7.000000, 6.000000, 8.611111, 7.750000, 9.500000, 9.533333~
# long to wide
# new column is by genus, the row value is by mean_weight
<- dat5_gw %>%
dat5_w spread(key = genus, value = mean_weight)
# view the data
glimpse(dat5_w)
## Rows: 24
## Columns: 11
## $ plot_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,~
## $ Baiomys <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA~
## $ Chaetodipus <dbl> 22.19939, 25.11014, 24.63636, 23.02381, 17.98276, 24.8~
## $ Dipodomys <dbl> 60.23214, 55.68259, 52.04688, 57.52454, 51.11356, 58.6~
## $ Neotoma <dbl> 156.2222, 169.1436, 158.2414, 164.1667, 190.0370, 179.~
## $ Onychomys <dbl> 27.67550, 26.87302, 26.03241, 28.09375, 27.01695, 25.8~
## $ Perognathus <dbl> 9.625000, 6.947368, 7.507812, 7.824427, 8.658537, 7.80~
## $ Peromyscus <dbl> 22.22222, 22.26966, 21.37037, 22.60000, 21.23171, 21.8~
## $ Reithrodontomys <dbl> 11.375000, 10.680556, 10.516588, 10.263158, 11.154545,~
## $ Sigmodon <dbl> NA, 70.85714, 65.61404, 82.00000, 82.66667, 68.77778, ~
## $ Spermophilus <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 13~
# same purpose of dat5_w
%>%
dat5_gw spread(genus, mean_weight, fill = 0) %>% # fill= If set, missing values will be replaced with this value.
head()
## # A tibble: 6 x 11
## plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 7 22.2 60.2 156. 27.7 9.62 22.2
## 2 2 6 25.1 55.7 169. 26.9 6.95 22.3
## 3 3 8.61 24.6 52.0 158. 26.0 7.51 21.4
## 4 4 0 23.0 57.5 164. 28.1 7.82 22.6
## 5 5 7.75 18.0 51.1 190. 27.0 8.66 21.2
## 6 6 0 24.9 58.6 180. 25.9 7.81 21.8
## # ... with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
## # Spermophilus <dbl>
# wide to long exclude plot_id.
<- dat5_w %>%
dat5_l gather(key = genus, value = mean_weight, -plot_id)
# view the data
glimpse(dat5_l)
## Rows: 240
## Columns: 3
## $ plot_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,~
## $ genus <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
## $ mean_weight <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA, NA~
# same as dat5_l
%>%
dat5_w gather(key = genus, value = mean_weight, Baiomys:Spermophilus) %>%
head()
## # A tibble: 6 x 3
## plot_id genus mean_weight
## <dbl> <chr> <dbl>
## 1 1 Baiomys 7
## 2 2 Baiomys 6
## 3 3 Baiomys 8.61
## 4 4 Baiomys NA
## 5 5 Baiomys 7.75
## 6 6 Baiomys NA
# remove missing value of weight, hindfoot_length, and sex in the data set
# 34789 -> 30676
<- dat5 %>%
dat5_complete filter(!is.na(weight),
!is.na(hindfoot_length),
!is.na(sex))
# keep the number of species higher than 49.
# 14 x 2
<- dat5_complete %>%
species_counts count(species_id) %>%
filter(n >= 50)
# keep the species_id with number of species higher than 49 from species_counts.
# 30676 -> 30463
<- dat5_complete %>%
dat5_complete filter(species_id %in% species_counts$species_id)