Supply comments to each code chunk in the following survey rmarkdown file and preview it as an R notebook or knit to html.
The data set concerns species and weight of animals caught in plots in a study area in Arizona over time.
Each row holds information for a single animal, and the columns represent:
dta <- read_csv("http://kbroman.org/datacarp/portal_data_joined.csv")
#> Parsed with column specification:
#> cols(
#> record_id = col_double(),
#> month = col_double(),
#> day = col_double(),
#> year = col_double(),
#> plot_id = col_double(),
#> species_id = col_character(),
#> sex = col_character(),
#> hindfoot_length = col_double(),
#> weight = col_double(),
#> genus = col_character(),
#> species = col_character(),
#> taxa = col_character(),
#> plot_type = col_character()
#> )glimpse(dta)
#> Observations: 34,786
#> Variables: 13
#> $ record_id <dbl> 1, 72, 224, 266, 349, 363, 435, 506, 588, 661, 748,...
#> $ month <dbl> 7, 8, 9, 10, 11, 11, 12, 1, 2, 3, 4, 5, 6, 8, 9, 10...
#> $ day <dbl> 16, 19, 13, 16, 12, 12, 10, 8, 18, 11, 8, 6, 9, 5, ...
#> $ year <dbl> 1977, 1977, 1977, 1977, 1977, 1977, 1977, 1978, 197...
#> $ plot_id <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
#> $ species_id <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL...
#> $ sex <chr> "M", "M", NA, NA, NA, NA, NA, NA, "M", NA, NA, "M",...
#> $ hindfoot_length <dbl> 32, 31, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, NA,...
#> $ weight <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 218, NA, NA, 204, 2...
#> $ genus <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Neotom...
#> $ species <chr> "albigula", "albigula", "albigula", "albigula", "al...
#> $ taxa <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent", "...
#> $ plot_type <chr> "Control", "Control", "Control", "Control", "Contro...dplyr::select(dta, plot_id, species_id, weight) %>% head()
#> # A tibble: 6 x 3
#> plot_id species_id weight
#> <dbl> <chr> <dbl>
#> 1 2 NL NA
#> 2 2 NL NA
#> 3 2 NL NA
#> 4 2 NL NA
#> 5 2 NL NA
#> 6 2 NL NAdplyr::select(dta, -record_id, -species_id) %>% head()
#> # A tibble: 6 x 11
#> month day year plot_id sex hindfoot_length weight genus species taxa
#> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr> <chr>
#> 1 7 16 1977 2 M 32 NA Neot~ albigu~ Rode~
#> 2 8 19 1977 2 M 31 NA Neot~ albigu~ Rode~
#> 3 9 13 1977 2 <NA> NA NA Neot~ albigu~ Rode~
#> 4 10 16 1977 2 <NA> NA NA Neot~ albigu~ Rode~
#> 5 11 12 1977 2 <NA> NA NA Neot~ albigu~ Rode~
#> 6 11 12 1977 2 <NA> NA NA Neot~ albigu~ Rode~
#> # ... with 1 more variable: plot_type <chr>dplyr 是 Tidyverse package 裡用來做資料整理的。
選取資料裡的 plot_id、species_id、weight,瀏覽前六筆觀察值,再將資料裡的 record_id 及 species_id 欄位去除,一樣瀏覽前六筆資料。
dplyr::filter(dta, year == 1995) %>% head()
#> # A tibble: 6 x 13
#> record_id month day year plot_id species_id sex hindfoot_length weight
#> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
#> 1 22314 6 7 1995 2 NL M 34 NA
#> 2 22728 9 23 1995 2 NL F 32 165
#> 3 22899 10 28 1995 2 NL F 32 171
#> 4 23032 12 2 1995 2 NL F 33 NA
#> 5 22003 1 11 1995 2 DM M 37 41
#> 6 22042 2 4 1995 2 DM F 36 45
#> # ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
#> # plot_type <chr>這裡篩選 1995 年的 資料 ( year == 1995 ),再瀏覽前六筆觀察值。
head(dplyr::select(dplyr::filter(dta, weight <= 5), species_id, sex, weight))
#> # A tibble: 6 x 3
#> species_id sex weight
#> <chr> <chr> <dbl>
#> 1 PF M 5
#> 2 PF F 5
#> 3 PF F 5
#> 4 PF F 4
#> 5 PF F 5
#> 6 PF F 4語法先寫出「瀏覽前六筆資料」( head ),再詳列所要的觀察值條件:重量小於等於 5 weight <= 5 的觀察紀錄,列出其物種編號 ( speices_id )、性別 ( sex )、體重 ( weight )。
dta %>%
dplyr::filter(weight <= 5) %>%
dplyr::select(species_id, sex, weight) %>%
head
#> # A tibble: 6 x 3
#> species_id sex weight
#> <chr> <chr> <dbl>
#> 1 PF M 5
#> 2 PF F 5
#> 3 PF F 5
#> 4 PF F 4
#> 5 PF F 5
#> 6 PF F 4先指定所要討論的資料 ( dta ),再寫出篩選條件 ( dplyr::filter(weight <= 5) ),接著指定所要的觀察值欄位 ( dplyr::select(species_id, sex, weight) ),最後要採取的動作是取出前六筆資料 ( head )。
mutate:以既有變量為根據新增新的變量dta %>%
mutate(weight_kg = weight / 1000,
weight_lb = weight_kg * 2.2) %>%
head()
#> # A tibble: 6 x 15
#> record_id month day year plot_id species_id sex hindfoot_length weight
#> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
#> 1 1 7 16 1977 2 NL M 32 NA
#> 2 72 8 19 1977 2 NL M 31 NA
#> 3 224 9 13 1977 2 NL <NA> NA NA
#> 4 266 10 16 1977 2 NL <NA> NA NA
#> 5 349 11 12 1977 2 NL <NA> NA NA
#> 6 363 11 12 1977 2 NL <NA> NA NA
#> # ... with 6 more variables: genus <chr>, species <chr>, taxa <chr>,
#> # plot_type <chr>, weight_kg <dbl>, weight_lb <dbl>在 dta 資料裡新增 weight_kg 和 weight_lb 兩個變量,分別是將原有的 weight 換算成以公斤為單位 ( 除以 1000 ),後者是換算成磅 ( 乘上 2.2 )。
dta %>%
filter(!is.na(weight)) %>%
group_by(sex, species_id) %>%
summarize(mean_weight = mean(weight)) %>%
arrange(desc(mean_weight)) %>%
head()
#> # A tibble: 6 x 3
#> # Groups: sex [3]
#> sex species_id mean_weight
#> <chr> <chr> <dbl>
#> 1 <NA> NL 168.
#> 2 M NL 166.
#> 3 F NL 154.
#> 4 M SS 130
#> 5 <NA> SH 130
#> 6 M DS 122.這裡的 ! 表示否定 ( Not ),所以對資料 dta 處理的第一步 filter(!is.na(weight)) 即是篩選出 weight 裡不是 NA 值的觀察值。
接下來,group_by(sex, species_id) 表示以性別和物種編號來分組,然後以此分組方式計算各組的基本統計 ( summarize() ),特別指定計算 weight 的平均,並新增 mean_weight 來呈現。
最後,以降序 ( desc() )的方式依 mean_weight 排列所篩選的資料 ( arrange(desc(mean_weight)) )。
dta %>%
group_by(sex) %>%
tally
#> # A tibble: 3 x 2
#> sex n
#> <chr> <int>
#> 1 F 15690
#> 2 M 17348
#> 3 <NA> 1748使用 tally 來計算資料個數,相當於 basic R 的 length() ,但更進階,可以同時加入 group_by 分組的功能,也可以選擇統計列的個數 ( 加入 wt )。
tally(x, wt, sort = FALSE)
count_(x, vars, wt = NULL, sort = FALSE)
## `vars` : Variables to group by
dta %>%
count(sex)
#> # A tibble: 3 x 2
#> sex n
#> <chr> <int>
#> 1 F 15690
#> 2 M 17348
#> 3 <NA> 1748這裡使用 count() 一樣能計算指定變數的個數。
dta %>%
group_by(sex) %>%
summarize(count = n())
#> # A tibble: 3 x 2
#> sex count
#> <chr> <int>
#> 1 F 15690
#> 2 M 17348
#> 3 <NA> 1748n() 一樣是計算個數的 function。
dta %>%
group_by(sex) %>%
summarize(count = sum(!is.na(year)))
#> # A tibble: 3 x 2
#> sex count
#> <chr> <int>
#> 1 F 15690
#> 2 M 17348
#> 3 <NA> 1748這裡使用 sum() 計算排除遺失年份觀察值的性別變數裡二個類別的個數。
dta_gw <- dta %>%
filter(!is.na(weight)) %>%
group_by(genus, plot_id) %>%
summarize(mean_weight = mean(weight))產生新的資料 dta_gw ,這是將 dta 資料先排除 weight 有遺漏值的觀察值,再取出 genus 、 plot_id 和 mean_weight。
glimpse(dta_gw)
#> Observations: 196
#> Variables: 3
#> Groups: genus [10]
#> $ genus <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", ...
#> $ plot_id <dbl> 1, 2, 3, 5, 18, 19, 20, 21, 1, 2, 3, 4, 5, 6, 7, 8, 9, ...
#> $ mean_weight <dbl> 7.000000, 6.000000, 8.611111, 7.750000, 9.500000, 9.533...接下來,將 dta_gw 用 spread() 擴展。
spread(data, key, value, fill = NA, convert = FALSE, drop =TRUE, sep = NULL)
key 要擴展的原變量
value 擴展後各變量要填入的值,也是原資料裡的某變量
fill 若有遺漏值,要填入什麼
將屬名變量 ( genus ) 的類別拆成單一變量,以 mean_weight 為內容,類似於 basic R 的 unstack()。
glimpse(dta_w)
#> Observations: 24
#> Variables: 11
#> $ plot_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
#> $ Baiomys <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA,...
#> $ Chaetodipus <dbl> 22.19939, 25.11014, 24.63636, 23.02381, 17.98276, 2...
#> $ Dipodomys <dbl> 60.23214, 55.68259, 52.04688, 57.52454, 51.11356, 5...
#> $ Neotoma <dbl> 156.2222, 169.1436, 158.2414, 164.1667, 190.0370, 1...
#> $ Onychomys <dbl> 27.67550, 26.87302, 26.03241, 28.09375, 27.01695, 2...
#> $ Perognathus <dbl> 9.625000, 6.947368, 7.507812, 7.824427, 8.658537, 7...
#> $ Peromyscus <dbl> 22.22222, 22.26966, 21.37037, 22.60000, 21.23171, 2...
#> $ Reithrodontomys <dbl> 11.375000, 10.680556, 10.516588, 10.263158, 11.1545...
#> $ Sigmodon <dbl> NA, 70.85714, 65.61404, 82.00000, 82.66667, 68.7777...
#> $ Spermophilus <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...dta_gw %>%
spread(genus, mean_weight, fill = 0) %>%
head()
#> # A tibble: 6 x 11
#> plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 7 22.2 60.2 156. 27.7 9.62 22.2
#> 2 2 6 25.1 55.7 169. 26.9 6.95 22.3
#> 3 3 8.61 24.6 52.0 158. 26.0 7.51 21.4
#> 4 4 0 23.0 57.5 164. 28.1 7.82 22.6
#> 5 5 7.75 18.0 51.1 190. 27.0 8.66 21.2
#> 6 6 0 24.9 58.6 180. 25.9 7.81 21.8
#> # ... with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
#> # Spermophilus <dbl>將 0 填入到有遺漏值的變量裡。
使用 gather() function 將被拆開的各個屬名再合併回去成 genus 變量。
gather(data, key, value, ..., na.rm = FALSE,
convert = FALSE, factor_key = FALSE)
# `key` and `value` are the names of new key and value columns, as strings or symbols
gather() 的廣義是轉置,因此在這裡的語法可以解釋為;
將原資料 dta_w
→ 增加 genus 欄位 ( key=genius )
( 這裡也等於去掉所有屬名變量!因為沒有保留! )
→ 增加 mean_weight 欄位 ( value = mean_weight )
→ 去除 plot_id ,也就是不要讓 plot_id 轉置 ( -plot_id )
然後,gather() 會將新增的欄位與原本的資料對起來,也就是 dta_w 裡的 plot_id 保留,key 和 value 自動對應成 (Baiomys, 7.000000)、(Chaetopidus, 22.19939)、(Dipodomys, 60.23214)……etc.。
glimpse(dta_l)
#> Observations: 240
#> Variables: 3
#> $ plot_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ...
#> $ genus <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", ...
#> $ mean_weight <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA,...也可以更明確地將所要合併的變量列出來,就會對應到 key 所新增的變量。
使用 fliter() 可以依指定條件篩選資料,is.na() 是判斷輸入值是否含有遺漏值, ! 為否定。
這裡的語法為:
將 dta 資料裡 weight 、 hindfoot_length 、 sex 三個變量裡不是遺漏值的觀察值挑出來另外製成 dta_complete 這個資料。
glimpse(dta_complete)
#> Observations: 30,676
#> Variables: 13
#> $ record_id <dbl> 845, 1164, 1261, 1756, 1818, 1882, 2133, 2184, 2406...
#> $ month <dbl> 5, 8, 9, 4, 5, 7, 10, 11, 1, 5, 5, 7, 10, 11, 1, 2,...
#> $ day <dbl> 6, 5, 4, 29, 30, 4, 25, 17, 16, 18, 18, 8, 1, 23, 2...
#> $ year <dbl> 1978, 1978, 1978, 1979, 1979, 1979, 1979, 1979, 198...
#> $ plot_id <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
#> $ species_id <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL...
#> $ sex <chr> "M", "M", "M", "M", "M", "M", "F", "F", "F", "F", "...
#> $ hindfoot_length <dbl> 32, 34, 32, 33, 32, 32, 33, 30, 33, 31, 33, 30, 34,...
#> $ weight <dbl> 204, 199, 197, 166, 184, 206, 274, 186, 184, 87, 17...
#> $ genus <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Neotom...
#> $ species <chr> "albigula", "albigula", "albigula", "albigula", "al...
#> $ taxa <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent", "...
#> $ plot_type <chr> "Control", "Control", "Control", "Control", "Contro...
head(dta_complete)
#> # A tibble: 6 x 13
#> record_id month day year plot_id species_id sex hindfoot_length weight
#> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
#> 1 845 5 6 1978 2 NL M 32 204
#> 2 1164 8 5 1978 2 NL M 34 199
#> 3 1261 9 4 1978 2 NL M 32 197
#> 4 1756 4 29 1979 2 NL M 33 166
#> 5 1818 5 30 1979 2 NL M 32 184
#> 6 1882 7 4 1979 2 NL M 32 206
#> # ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
#> # plot_type <chr>使用 count() 來計算個數。
這裡的語法為:
將 dta_complete 資料裡 species_id 類別中各變量總數大於等於 50 的資料挑出,將此個數統計另製成 species_count 資料。
glimpse(species_counts)
#> Observations: 14
#> Variables: 2
#> $ species_id <chr> "DM", "DO", "DS", "NL", "OL", "OT", "PB", "PE", "PF", "P...
#> $ n <int> 9727, 2790, 2023, 1045, 905, 2081, 2803, 1198, 1469, 835...
head(species_counts)
#> # A tibble: 6 x 2
#> species_id n
#> <chr> <int>
#> 1 DM 9727
#> 2 DO 2790
#> 3 DS 2023
#> 4 NL 1045
#> 5 OL 905
#> 6 OT 2081從 dta_complete 裡挑出其 species_id 符合 species_counts 裡 species_id 的資料,並取代成新的 dta_complete 資料。
glimpse(dta_complete)
#> Observations: 30,463
#> Variables: 13
#> $ record_id <dbl> 845, 1164, 1261, 1756, 1818, 1882, 2133, 2184, 2406...
#> $ month <dbl> 5, 8, 9, 4, 5, 7, 10, 11, 1, 5, 5, 7, 10, 11, 1, 2,...
#> $ day <dbl> 6, 5, 4, 29, 30, 4, 25, 17, 16, 18, 18, 8, 1, 23, 2...
#> $ year <dbl> 1978, 1978, 1978, 1979, 1979, 1979, 1979, 1979, 198...
#> $ plot_id <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
#> $ species_id <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL...
#> $ sex <chr> "M", "M", "M", "M", "M", "M", "F", "F", "F", "F", "...
#> $ hindfoot_length <dbl> 32, 34, 32, 33, 32, 32, 33, 30, 33, 31, 33, 30, 34,...
#> $ weight <dbl> 204, 199, 197, 166, 184, 206, 274, 186, 184, 87, 17...
#> $ genus <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Neotom...
#> $ species <chr> "albigula", "albigula", "albigula", "albigula", "al...
#> $ taxa <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent", "...
#> $ plot_type <chr> "Control", "Control", "Control", "Control", "Contro...