Comment on what each code chunk in the following classroom markdown file is trying to achieve and on its output.
載入WWGbook package中的classroom dataset
# Load the WWGbook package via pacman, then attach its classroom data set
pacman::p_load(WWGbook)
data(classroom, package = "WWGbook")
# Open the help page describing the classroom data
?classroom
The Study of Instructional Improvement (SII; Hill, Rowan, and Ball, 2004) was carried out by researchers at the University of Michigan to study the math achievement scores of first- and third-grade students in randomly selected classrooms from a national U.S. sample of elementary schools.
# Show the structure of the classroom data frame
str(classroom)
'data.frame': 1190 obs. of 12 variables:
$ sex : int 1 0 1 0 0 1 0 0 1 0 ...
$ minority: int 1 1 1 1 1 1 1 1 1 1 ...
$ mathkind: int 448 460 511 449 425 450 452 443 422 480 ...
$ mathgain: int 32 109 56 83 53 65 51 66 88 -7 ...
$ ses : num 0.46 -0.27 -0.03 -0.38 -0.03 0.76 -0.03 0.2 0.64 0.13 ...
$ yearstea: num 1 1 1 2 2 2 2 2 2 2 ...
$ mathknow: num NA NA NA -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 ...
$ housepov: num 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 ...
$ mathprep: num 2 2 2 3.25 3.25 3.25 3.25 3.25 3.25 3.25 ...
$ classid : int 160 160 160 217 217 217 217 217 217 217 ...
$ schoolid: int 1 1 1 1 1 1 1 1 1 1 ...
$ childid : int 1 2 3 4 5 6 7 8 9 10 ...
A data frame with 1190 observations on the following 12 variables.
split data to unique school ID and only keep “schoolid”, “housepov” two variables
# School-level table: keep the first row seen for each schoolid and only the
# schoolid and housepov columns
dta_schl <- classroom[!duplicated(classroom$schoolid),
                      c("schoolid", "housepov")]
duplicated 可以刪掉重複值。藉由duplicated為FALSE,保留唯一筆schoolid
# Class-level table: keep the first row seen for each classid and columns
# 11, 10, 6, 7, 9 (schoolid, classid, yearstea, mathknow, mathprep)
dta_cls <- classroom[!duplicated(classroom$classid),
                     c(11, 10, 6, 7, 9)]
刪掉重複classid,並保留schoolid、classid、yearstea、mathknow、mathprep
# Child-level table: all rows, columns reordered to childid, classid, schoolid,
# followed by columns 1-5 (sex, minority, mathkind, mathgain, ses)
dta_chld <- classroom[, c(12, 10, 11, 1:5)]
排序欄位:childid、classid、schoolid、sex、minority、mathkind、mathgain、ses
# Dimensions (rows, columns) of the three tables, one result column per table
sapply(list(dta_schl, dta_cls, dta_chld), dim)
[,1] [,2] [,3]
[1,] 107 312 1190
[2,] 2 5 8
將3個dataset放入list,利用sapply看每個dataset的dimension
merge dataset by column name
# Children joined to their class-level variables
dta_12 <- merge(x = dta_chld, y = dta_cls, by = c("classid", "schoolid"))
# NOTE(review): this call is identical to the one that builds dta_12; given the
# name, a child-school merge (dta_chld with dta_schl) may have been intended --
# confirm before relying on dta_13.
dta_13 <- merge(x = dta_chld, y = dta_cls, by = c("classid", "schoolid"))
# Classes joined to their school-level variable
dta_23 <- merge(x = dta_cls, y = dta_schl, by = "schoolid")
# Children + class variables joined to the school-level variable
dta_123 <- merge(x = dta_12, y = dta_schl, by = c("schoolid"))
# Dimensions (rows, columns) of the four merged tables
sapply(list(dta_12, dta_13, dta_23, dta_123), dim)
[,1] [,2] [,3] [,4]
[1,] 1190 1190 312 1190
[2,] 11 11 6 12
show 每一個merge完的dataset的dimension
library(dplyr)
# Children in school 1 (11 rows)
dta_chldv <- dta_chld |> filter(schoolid == 1)
# Classes in school 1 (2 rows)
dta_clsv <- dta_cls |> filter(schoolid == 1)
# Merging on schoolid alone pairs every child row with every class row (22 rows)
dta_12s <- merge(x = dta_chldv, y = dta_clsv, by = "schoolid")
Merge the two data sets: state.x77{datasets} and USArrests{datasets} and compute all pair-wise correlations for numerical variables. Is there anything interesting to report?
# Inspect the attributes of state.x77 (dim and dimnames)
attributes(state.x77)
$dim
[1] 50 8
$dimnames
$dimnames[[1]]
[1] "Alabama" "Alaska" "Arizona" "Arkansas"
[5] "California" "Colorado" "Connecticut" "Delaware"
[9] "Florida" "Georgia" "Hawaii" "Idaho"
[13] "Illinois" "Indiana" "Iowa" "Kansas"
[17] "Kentucky" "Louisiana" "Maine" "Maryland"
[21] "Massachusetts" "Michigan" "Minnesota" "Mississippi"
[25] "Missouri" "Montana" "Nebraska" "Nevada"
[29] "New Hampshire" "New Jersey" "New Mexico" "New York"
[33] "North Carolina" "North Dakota" "Ohio" "Oklahoma"
[37] "Oregon" "Pennsylvania" "Rhode Island" "South Carolina"
[41] "South Dakota" "Tennessee" "Texas" "Utah"
[45] "Vermont" "Virginia" "Washington" "West Virginia"
[49] "Wisconsin" "Wyoming"
$dimnames[[2]]
[1] "Population" "Income" "Illiteracy" "Life Exp" "Murder"
[6] "HS Grad" "Frost" "Area"
# Inspect the attributes of USArrests (names, class and row.names)
attributes(USArrests)
$names
[1] "Murder" "Assault" "UrbanPop" "Rape"
$class
[1] "data.frame"
$row.names
[1] "Alabama" "Alaska" "Arizona" "Arkansas"
[5] "California" "Colorado" "Connecticut" "Delaware"
[9] "Florida" "Georgia" "Hawaii" "Idaho"
[13] "Illinois" "Indiana" "Iowa" "Kansas"
[17] "Kentucky" "Louisiana" "Maine" "Maryland"
[21] "Massachusetts" "Michigan" "Minnesota" "Mississippi"
[25] "Missouri" "Montana" "Nebraska" "Nevada"
[29] "New Hampshire" "New Jersey" "New Mexico" "New York"
[33] "North Carolina" "North Dakota" "Ohio" "Oklahoma"
[37] "Oregon" "Pennsylvania" "Rhode Island" "South Carolina"
[41] "South Dakota" "Tennessee" "Texas" "Utah"
[45] "Vermont" "Virginia" "Washington" "West Virginia"
[49] "Wisconsin" "Wyoming"
# Merge the two data sets on their row names (the state names)
dta2 <- merge(state.x77, USArrests, by = "row.names")
# Column names of the merged data frame
names(dta2)
[1] "Row.names" "Population" "Income" "Illiteracy" "Life Exp"
[6] "Murder.x" "HS Grad" "Frost" "Area" "Murder.y"
[11] "Assault" "UrbanPop" "Rape"
merge state.x77 and USArrests by row.names.
pacman::p_load(dplyr, magrittr)
# Rename the key column and disambiguate the two Murder columns
dta2 %<>%
  dplyr::rename(State = Row.names, Murder1976 = Murder.x, Murder1973 = Murder.y)
# Store State as a factor, then check the resulting structure
dta2$State <- as.factor(dta2$State)
str(dta2)
'data.frame': 50 obs. of 13 variables:
$ State : Factor w/ 50 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
$ Population: num 3615 365 2212 2110 21198 ...
$ Income : num 3624 6315 4530 3378 5114 ...
$ Illiteracy: num 2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
$ Life Exp : num 69 69.3 70.5 70.7 71.7 ...
$ Murder1976: num 15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
$ HS Grad : num 41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
$ Frost : num 20 152 15 65 20 166 139 103 11 60 ...
$ Area : num 50708 566432 113417 51945 156361 ...
$ Murder1973: num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
$ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
$ UrbanPop : int 58 48 80 50 91 78 77 72 80 60 ...
$ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
pacman::p_load(dplyr, magrittr, corrplot)

# merge by row name (states)
dat2 <- merge(state.x77, USArrests, by = "row.names", all = TRUE)

# column name
dat2 %<>%
  dplyr::rename(State = Row.names,
                Murder1976 = Murder.x,
                Murder1973 = Murder.y)
# change State to a factor
dat2 <- dat2 |>
  mutate(State = as.factor(State))
# Pairwise correlations of all columns except State, rounded to two decimals.
# Uses dat2, built in this chunk, so the chunk does not depend on dta2 from an
# earlier chunk.
res <- round(cor(dat2[, -1]), 2)
# kable() needs its parentheses to run
knitr::kable(res)
Population | Income | Illiteracy | Life Exp | Murder1976 | HS Grad | Frost | Area | Murder1973 | Assault | UrbanPop | Rape | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Population | 1.00 | 0.21 | 0.11 | -0.07 | 0.34 | -0.10 | -0.33 | 0.02 | 0.32 | 0.32 | 0.51 | 0.31 |
Income | 0.21 | 1.00 | -0.44 | 0.34 | -0.23 | 0.62 | 0.23 | 0.36 | -0.22 | 0.04 | 0.48 | 0.36 |
Illiteracy | 0.11 | -0.44 | 1.00 | -0.59 | 0.70 | -0.66 | -0.67 | 0.08 | 0.71 | 0.51 | -0.06 | 0.15 |
Life Exp | -0.07 | 0.34 | -0.59 | 1.00 | -0.78 | 0.58 | 0.26 | -0.11 | -0.78 | -0.63 | 0.27 | -0.27 |
Murder1976 | 0.34 | -0.23 | 0.70 | -0.78 | 1.00 | -0.49 | -0.54 | 0.23 | 0.93 | 0.74 | 0.02 | 0.58 |
HS Grad | -0.10 | 0.62 | -0.66 | 0.58 | -0.49 | 1.00 | 0.37 | 0.33 | -0.52 | -0.23 | 0.36 | 0.27 |
Frost | -0.33 | 0.23 | -0.67 | 0.26 | -0.54 | 0.37 | 1.00 | 0.06 | -0.54 | -0.47 | -0.25 | -0.28 |
Area | 0.02 | 0.36 | 0.08 | -0.11 | 0.23 | 0.33 | 0.06 | 1.00 | 0.15 | 0.23 | -0.06 | 0.52 |
Murder1973 | 0.32 | -0.22 | 0.71 | -0.78 | 0.93 | -0.52 | -0.54 | 0.15 | 1.00 | 0.80 | 0.07 | 0.56 |
Assault | 0.32 | 0.04 | 0.51 | -0.63 | 0.74 | -0.23 | -0.47 | 0.23 | 0.80 | 1.00 | 0.26 | 0.67 |
UrbanPop | 0.51 | 0.48 | -0.06 | 0.27 | 0.02 | 0.36 | -0.25 | -0.06 | 0.07 | 0.26 | 1.00 | 0.41 |
Rape | 0.31 | 0.36 | 0.15 | -0.27 | 0.58 | 0.27 | -0.28 | 0.52 | 0.56 | 0.67 | 0.41 | 1.00 |
# Check the class of the correlation result
class(res)
[1] "matrix" "array"
library(corrplot)
# Draw the upper triangle of the correlation matrix, ordering variables with
# "hclust" and using black text labels rotated by 45
corrplot(res, type = "upper", order = "hclust",
tl.col = "black", tl.srt = 45)
The correlation matrix is reordered according to the correlation coefficient using “hclust” method.
tl.col (for text label color) and tl.srt (for text label string rotation) are used to change text colors and rotations.
Possible values for the argument type are : “upper”, “lower”, “full”
結論 Illiteracy is positively correlated with Murder (about 0.70) and Assault (0.51), and only weakly with Rape (0.15).
Summarize the backpain{HSAUR3} into the following format:
You should provide comments for each code chunk.
# load package
pacman::p_load(HSAUR3)

# Input data
data("backpain", package = "HSAUR3")
dta3 <- backpain
# First six rows
head(dta3)
ID status driver suburban
1 1 case yes yes
2 1 control yes no
3 2 case yes yes
4 2 control yes yes
5 3 case yes no
6 3 control yes yes
# Show the structure of the backpain data
str(dta3)
'data.frame': 434 obs. of 4 variables:
$ ID : Factor w/ 217 levels "1","2","3","4",..: 1 1 2 2 3 3 4 4 5 5 ...
$ status : Factor w/ 2 levels "case","control": 1 2 1 2 1 2 1 2 1 2 ...
$ driver : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 1 1 2 2 ...
$ suburban: Factor w/ 2 levels "no","yes": 2 1 2 2 1 2 1 1 1 2 ...
# Three-way contingency table: driver x suburban x status
A <- with(dta3, table(driver, suburban, status))
# Flat (two-dimensional) layout of the same counts
tbl <- with(dta3, ftable(driver, suburban, status))
# Add "Sum" margins over dimension 1 (driver) and dimension 3 (status),
# then flatten the result
tbl1 <- with(dta3, ftable(addmargins(A, c(1, 3),
                                     FUN = list(list(Sum = sum), Sum = sum))))
Margins computed over dimensions
in the following order:
1: driver
2: status
# Print the flat table without margins
tbl
status case control
driver suburban
no no 26 47
yes 6 7
yes no 64 63
yes 121 100
# Print the flat table with the added Sum margins
tbl1
status case control Sum
driver suburban
no no 26 47 73
yes 6 7 13
yes no 64 63 127
yes 121 100 221
Sum no 90 110 200
yes 127 107 234
addmargins: Puts Arbitrary Margins on Multidimensional Tables or Arrays
pacman::p_load(dplyr, magrittr, tidyr)
# Spread status into separate case / control columns, then count the non-missing
# entries of each within every driver x suburban combination
dta3 |>
  pivot_wider(names_from = status, values_from = status) |>
  group_by(driver, suburban) |>
  summarize(Case = sum(!is.na(case)),        # number of cases
            Control = sum(!is.na(control)),  # number of controls
            Total = as.numeric(Case) + as.numeric(Control)) |>  # cases + controls
  as.data.frame()
driver suburban Case Control Total
1 no no 26 47 73
2 no yes 6 7 13
3 yes no 64 63 127
4 yes yes 121 100 221
The data set Vocab{carData} gives observations on gender, education and vocabulary, from respondents to U.S. General Social Surveys, 1972-2004. Summarize the relationship between education and vocabulary over the years by gender.
# load package
pacman::p_load(carData)

# Input data
data("Vocab", package = "carData")
dat4 <- Vocab

# column rename
dat4 <- dat4 %>%
  dplyr::rename(Year = year,
                Sex = sex,
                Education = education,
                Vocabulary = vocabulary)
# plot: Vocabulary against Education, one panel per Year, grouped by Sex,
# drawing points ("p"), a grid ("g") and a regression line ("r")
dat4 %>%
  mutate(Year = as.factor(Year)) %>% # convert Year to a factor for display
  lattice::xyplot(Vocabulary ~ Education | Year,
                  groups = Sex,
                  type = c("p", "g", "r"), data = .,
                  cex = .5,
                  pch = 19,
                  auto.key = list(columns = 2),
                  xlab = "Education", ylab = "Vocabulary",
                  layout = c(4, 6),
                  par.settings = list(superpose.symbol = list(pch = 19, cex = 1.5,
                                                              col = c("#D95F02", "#7570B3")),
                                      superpose.line = list(col = c("#D95F02", "#7570B3"),
                                                            lwd = 1.5)))
Supply comments to each code chunk in the following survey rmarkdown file and preview it as an R notebook or knit to html.
# Load the tidyverse package
pacman::p_load(tidyverse)
# Download the csv from the web
dta <- read_csv("http://kbroman.org/datacarp/portal_data_joined.csv")
# Take a look at the columns and their types
glimpse(dta)
Rows: 34,786
Columns: 13
$ record_id <dbl> 1, 72, 224, 266, 349, 363, 435, 506, 588, 661, 748, 84~
$ month <dbl> 7, 8, 9, 10, 11, 11, 12, 1, 2, 3, 4, 5, 6, 8, 9, 10, 1~
$ day <dbl> 16, 19, 13, 16, 12, 12, 10, 8, 18, 11, 8, 6, 9, 5, 4, ~
$ year <dbl> 1977, 1977, 1977, 1977, 1977, 1977, 1977, 1978, 1978, ~
$ plot_id <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
$ species_id <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", ~
$ sex <chr> "M", "M", NA, NA, NA, NA, NA, NA, "M", NA, NA, "M", "M~
$ hindfoot_length <dbl> 32, 31, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, NA, 34~
$ weight <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 218, NA, NA, 204, 200,~
$ genus <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Neotoma",~
$ species <chr> "albigula", "albigula", "albigula", "albigula", "albig~
$ taxa <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent", "Rod~
$ plot_type <chr> "Control", "Control", "Control", "Control", "Control",~
# Dimensions of the data
dim(dta)
[1] 34786 13
# Use dplyr::select to pick the plot_id, species_id and weight columns of dta
dplyr::select(dta, plot_id, species_id, weight) %>% head()
# A tibble: 6 x 3
plot_id species_id weight
<dbl> <chr> <dbl>
1 2 NL NA
2 2 NL NA
3 2 NL NA
4 2 NL NA
5 2 NL NA
6 2 NL NA
# select can also name only the columns to drop (here record_id and species_id)
dplyr::select(dta, -record_id, -species_id) %>% head()
# A tibble: 6 x 11
month day year plot_id sex hindfoot_length weight genus species taxa
<dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr> <chr>
1 7 16 1977 2 M 32 NA Neotoma albigula Rodent
2 8 19 1977 2 M 31 NA Neotoma albigula Rodent
3 9 13 1977 2 <NA> NA NA Neotoma albigula Rodent
4 10 16 1977 2 <NA> NA NA Neotoma albigula Rodent
5 11 12 1977 2 <NA> NA NA Neotoma albigula Rodent
6 11 12 1977 2 <NA> NA NA Neotoma albigula Rodent
# ... with 1 more variable: plot_type <chr>
# Use filter to keep only the rows from the year 1995
dplyr::filter(dta, year == 1995) %>% head()
# A tibble: 6 x 13
record_id month day year plot_id species_id sex hindfoot_length weight
<dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
1 22314 6 7 1995 2 NL M 34 NA
2 22728 9 23 1995 2 NL F 32 165
3 22899 10 28 1995 2 NL F 32 171
4 23032 12 2 1995 2 NL F 33 NA
5 22003 1 11 1995 2 DM M 37 41
6 22042 2 4 1995 2 DM F 36 45
# ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
# plot_type <chr>
# First filter to weight <= 5, then select the species_id, sex and weight columns
head(dplyr::select(dplyr::filter(dta, weight <= 5), species_id, sex, weight))
# A tibble: 6 x 3
species_id sex weight
<chr> <chr> <dbl>
1 PF M 5
2 PF F 5
3 PF F 5
4 PF F 4
5 PF F 5
6 PF F 4
# Same filter-then-select as a nested call would do; %>% makes the steps easier
# to read in order
dta %>%
  dplyr::filter(weight <= 5) %>%
  dplyr::select(species_id, sex, weight) %>%
  head()
# A tibble: 6 x 3
species_id sex weight
<chr> <chr> <dbl>
1 PF M 5
2 PF F 5
3 PF F 5
4 PF F 4
5 PF F 5
6 PF F 4
# Use mutate to create weight_kg and weight_lb, each with its own formula
dta %>%
  mutate(weight_kg = weight / 1000,
         weight_lb = weight_kg * 2.2) %>%
  head()
# A tibble: 6 x 15
record_id month day year plot_id species_id sex hindfoot_length weight
<dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
1 1 7 16 1977 2 NL M 32 NA
2 72 8 19 1977 2 NL M 31 NA
3 224 9 13 1977 2 NL <NA> NA NA
4 266 10 16 1977 2 NL <NA> NA NA
5 349 11 12 1977 2 NL <NA> NA NA
6 363 11 12 1977 2 NL <NA> NA NA
# ... with 6 more variables: genus <chr>, species <chr>, taxa <chr>,
# plot_type <chr>, weight_kg <dbl>, weight_lb <dbl>
# Keep rows whose weight is not NA, group by sex and species_id, compute the
# mean weight, and sort in descending order
dta %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarize(mean_weight = mean(weight)) %>%
  arrange(desc(mean_weight)) %>%
  head()
# A tibble: 6 x 3
# Groups: sex [3]
sex species_id mean_weight
<chr> <chr> <dbl>
1 <NA> NL 168.
2 M NL 166.
3 F NL 154.
4 M SS 130
5 <NA> SH 130
6 M DS 122.
# tally() is similar to count(), but group_by() has to be called first
dta %>%
  group_by(sex) %>%
  tally()
# A tibble: 3 x 2
sex n
<chr> <int>
1 F 15690
2 M 17348
3 <NA> 1748
# count() counts the rows and also does the grouping that group_by() would do
dta %>%
  count(sex)
# A tibble: 3 x 2
sex n
<chr> <int>
1 F 15690
2 M 17348
3 <NA> 1748
# summarize() creates a variable called count; after group_by(sex), n() gives
# the number of rows in each group
dta %>%
  group_by(sex) %>%
  summarize(count = n())
# A tibble: 3 x 2
sex count
<chr> <int>
1 F 15690
2 M 17348
3 <NA> 1748
# Create a count column by summing the rows whose year is not missing
dta %>%
  group_by(sex) %>%
  summarize(count = sum(!is.na(year)))
# A tibble: 3 x 2
sex count
<chr> <int>
1 F 15690
2 M 17348
3 <NA> 1748
dta_gw <- dta %>%
  filter(!is.na(weight)) %>%             # keep rows whose weight is not NA
  group_by(genus, plot_id) %>%           # group by genus and plot_id
  summarize(mean_weight = mean(weight))  # mean weight stored in mean_weight
# Look at the columns and their types
glimpse(dta_gw)
Rows: 196
Columns: 3
Groups: genus [10]
$ genus <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
$ plot_id <dbl> 1, 2, 3, 5, 18, 19, 20, 21, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,~
$ mean_weight <dbl> 7.000000, 6.000000, 8.611111, 7.750000, 9.500000, 9.533333~
# spread() can be used to "spread" a key-value pair across multiple columns.
# Question from the original notes: does it do the same job as pivot_wider()?
dta_w <- dta_gw %>%
  spread(key = genus, value = mean_weight)
key: Column whose values will become variable names
value: Column where values will fill under new variables created from key
# Look at the wide table: plot_id plus one column per genus
glimpse(dta_w)
Rows: 24
Columns: 11
$ plot_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,~
$ Baiomys <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA~
$ Chaetodipus <dbl> 22.19939, 25.11014, 24.63636, 23.02381, 17.98276, 24.8~
$ Dipodomys <dbl> 60.23214, 55.68259, 52.04688, 57.52454, 51.11356, 58.6~
$ Neotoma <dbl> 156.2222, 169.1436, 158.2414, 164.1667, 190.0370, 179.~
$ Onychomys <dbl> 27.67550, 26.87302, 26.03241, 28.09375, 27.01695, 25.8~
$ Perognathus <dbl> 9.625000, 6.947368, 7.507812, 7.824427, 8.658537, 7.80~
$ Peromyscus <dbl> 22.22222, 22.26966, 21.37037, 22.60000, 21.23171, 21.8~
$ Reithrodontomys <dbl> 11.375000, 10.680556, 10.516588, 10.263158, 11.154545,~
$ Sigmodon <dbl> NA, 70.85714, 65.61404, 82.00000, 82.66667, 68.77778, ~
$ Spermophilus <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 13~
dta_gw %>%
  spread(genus, mean_weight, fill = 0) %>% # fill = 0 replaces NA with 0
  head()
# A tibble: 6 x 11
plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 7 22.2 60.2 156. 27.7 9.62 22.2
2 2 6 25.1 55.7 169. 26.9 6.95 22.3
3 3 8.61 24.6 52.0 158. 26.0 7.51 21.4
4 4 0 23.0 57.5 164. 28.1 7.82 22.6
5 5 7.75 18.0 51.1 190. 27.0 8.66 21.2
6 6 0 24.9 58.6 180. 25.9 7.81 21.8
# ... with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
# Spermophilus <dbl>
# Gather columns into key-value pairs,
# i.e. reshape to long form
dta_l <- dta_w %>%
  gather(key = genus,
         value = mean_weight,
         -plot_id) # exclude plot_id, otherwise it would be gathered as if it were a genus
glimpse(dta_l) # plot_id remains its own column because it was left out of the gather
Rows: 240
Columns: 3
$ plot_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,~
$ genus <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
$ mean_weight <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA, NA~
dta_w %>%
  gather(key = genus,
         value = mean_weight,
         Baiomys:Spermophilus) %>% # name the genus columns directly as the range Baiomys:Spermophilus
  head()
# A tibble: 6 x 3
plot_id genus mean_weight
<dbl> <chr> <dbl>
1 1 Baiomys 7
2 2 Baiomys 6
3 3 Baiomys 8.61
4 4 Baiomys NA
5 5 Baiomys 7.75
6 6 Baiomys NA
# Keep rows where weight, hindfoot_length and sex are all non-missing
dta_complete <- dta %>%
  filter(!is.na(weight),
         !is.na(hindfoot_length),
         !is.na(sex))
# Count rows per species_id and keep the species with n >= 50
species_counts <- dta_complete %>%
  count(species_id) %>%
  filter(n >= 50)
# Keep the rows of dta_complete whose species_id appears in
# species_counts$species_id, i.e. species with at least 50 complete records
dta_complete <- dta_complete %>%
  filter(species_id %in% species_counts$species_id)