In class 5

The data set records the species and weight of animals caught over time in plots within a study area in Arizona.

Each row holds information for a single animal, and the columns represent:

record_id: Unique id for the observation
month: month of observation
day: day of observation
year: year of observation
plot_id: ID of a particular plot
species_id: 2-letter species code
sex: sex of animal ("M", "F")
hindfoot_length: length of the hindfoot in mm
weight: weight of the animal in grams
genus: genus of animal
species: species of animal
taxa: e.g. Rodent, Reptile, Bird, Rabbit
plot_type: type of plot

# Attach the tidyverse packages; pacman installs any that are missing first.
pacman::p_load(tidyverse)

# Read the joined Portal survey table straight from its URL into a tibble.
dta <- read_csv(file = "http://kbroman.org/datacarp/portal_data_joined.csv")

## Rows: 34786 Columns: 13

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): species_id, sex, genus, species, taxa, plot_type
## dbl (7): record_id, month, day, year, plot_id, hindfoot_length, weight

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Show every column with its type and its first few values.
glimpse(dta)

## Rows: 34,786
## Columns: 13
## $ record_id       <dbl> 1, 72, 224, 266, 349, 363, 435, 506, 588, 661, 748, 84~
## $ month           <dbl> 7, 8, 9, 10, 11, 11, 12, 1, 2, 3, 4, 5, 6, 8, 9, 10, 1~
## $ day             <dbl> 16, 19, 13, 16, 12, 12, 10, 8, 18, 11, 8, 6, 9, 5, 4, ~
## $ year            <dbl> 1977, 1977, 1977, 1977, 1977, 1977, 1977, 1978, 1978, ~
## $ plot_id         <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
## $ species_id      <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", ~
## $ sex             <chr> "M", "M", NA, NA, NA, NA, NA, NA, "M", NA, NA, "M", "M~
## $ hindfoot_length <dbl> 32, 31, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, NA, 34~
## $ weight          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 218, NA, NA, 204, 200,~
## $ genus           <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Neotoma",~
## $ species         <chr> "albigula", "albigula", "albigula", "albigula", "albig~
## $ taxa            <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent", "Rod~
## $ plot_type       <chr> "Control", "Control", "Control", "Control", "Control",~

# Report the table's dimensions (number of rows, number of columns).
dim(dta)

## [1] 34786    13

# Keep only the plot_id, species_id and weight columns; preview the result.
dta %>%
  dplyr::select(plot_id, species_id, weight) %>%
  head()

# Drop the record_id and species_id columns, keeping all the others.
dta %>%
  dplyr::select(-record_id, -species_id) %>%
  head()

# Keep only the observations recorded in 1995 and preview the first six rows.
dta %>%
  dplyr::filter(year == 1995) %>%
  head()

# Species, sex and weight of the animals weighing at most 5 g, written as
# nested calls that have to be read from the inside out.
head(
  dplyr::select(
    dplyr::filter(dta, weight <= 5),
    species_id, sex, weight
  )
)

# The same query written as a %>% pipeline, which reads top to bottom.
dta %>%
  dplyr::filter(weight <= 5) %>%
  dplyr::select(species_id, sex, weight) %>%
  head()

# Add weight in kilograms, then weight in pounds computed from the new
# kilogram column (factor 2.2), and preview the first six rows.
dta %>%
  mutate(weight_kg = weight / 1000) %>%
  mutate(weight_lb = weight_kg * 2.2) %>%
  head()

# Mean weight for each sex/species combination, heaviest groups first;
# only the first six rows are shown.
dta %>%
  # Rows with a missing weight are removed before averaging.
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarise(mean_weight = mean(weight)) %>%
  # Sort by descending mean weight.
  arrange(desc(mean_weight)) %>%
  head()

## `summarise()` has grouped output by 'sex'. You can override using the `.groups` argument.

# Count the captured animals of each sex (NA = sex not recorded), two ways.
# The dplyr verbs are namespace-qualified because plyr also exports a
# `count()` with a different interface; when plyr is attached after dplyr it
# masks dplyr's version and the unqualified `count(sex)` call fails.

# 1. Group by sex, then tally the rows in each group.
dta %>%
  dplyr::group_by(sex) %>%
  dplyr::tally()

# 2. `count()` performs the grouping and the tally in a single step.
dta %>%
  dplyr::count(sex)

# Two more ways to count animals per sex, using an explicit summary.
# `summarise()` is namespace-qualified because plyr exports functions named
# `summarise()`/`summarize()` that do not honour dplyr groups; if plyr is
# attached after dplyr, the unqualified call would dispatch to plyr instead.

# n() is the number of rows in the current group.
dta %>%
  dplyr::group_by(sex) %>%
  dplyr::summarise(count = dplyr::n())

# Same idea, but counting only the rows whose year is not missing.
dta %>%
  dplyr::group_by(sex) %>%
  dplyr::summarise(count = sum(!is.na(year)))

# dta_gw: mean weight of every genus within every plot, in long format
# (one row per genus/plot pair). Animals without a recorded weight are
# excluded. The result stays grouped by genus, because summarise() removes
# only the last grouping level.
dta_gw <- dta %>%
  filter(!is.na(weight)) %>%
  group_by(genus, plot_id) %>%
  summarise(mean_weight = mean(weight))

## `summarise()` has grouped output by 'genus'. You can override using the `.groups` argument.

# Inspect the structure of dta_gw.
glimpse(dta_gw)

## Rows: 196
## Columns: 3
## Groups: genus [10]
## $ genus       <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
## $ plot_id     <dbl> 1, 2, 3, 5, 18, 19, 20, 21, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,~
## $ mean_weight <dbl> 7.000000, 6.000000, 8.611111, 7.750000, 9.500000, 9.533333~

# dta_w: the long genus/plot table reshaped to wide format, with one row per
# plot and one column of mean weights per genus (NA where a genus has no
# weighed animal in a plot). pivot_wider() replaces the superseded spread():
#   pivot_wider(data,
#               names_from  = <column supplying the new column names>,
#               values_from = <column supplying the cell values>)
dta_w <- dta_gw %>%
  # dta_gw is still grouped by genus; drop the grouping before reshaping.
  ungroup() %>%
  pivot_wider(names_from = genus, values_from = mean_weight) %>%
  # spread() returned the rows sorted by plot_id; keep that ordering.
  arrange(plot_id)

# Check the structure of the wide table.
glimpse(dta_w)

## Rows: 24
## Columns: 11
## $ plot_id         <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,~
## $ Baiomys         <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA~
## $ Chaetodipus     <dbl> 22.19939, 25.11014, 24.63636, 23.02381, 17.98276, 24.8~
## $ Dipodomys       <dbl> 60.23214, 55.68259, 52.04688, 57.52454, 51.11356, 58.6~
## $ Neotoma         <dbl> 156.2222, 169.1436, 158.2414, 164.1667, 190.0370, 179.~
## $ Onychomys       <dbl> 27.67550, 26.87302, 26.03241, 28.09375, 27.01695, 25.8~
## $ Perognathus     <dbl> 9.625000, 6.947368, 7.507812, 7.824427, 8.658537, 7.80~
## $ Peromyscus      <dbl> 22.22222, 22.26966, 21.37037, 22.60000, 21.23171, 21.8~
## $ Reithrodontomys <dbl> 11.375000, 10.680556, 10.516588, 10.263158, 11.154545,~
## $ Sigmodon        <dbl> NA, 70.85714, 65.61404, 82.00000, 82.66667, 68.77778, ~
## $ Spermophilus    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 13~

# Same long-to-wide reshape, but genus/plot combinations with no data are
# filled with 0 instead of NA. pivot_wider() is used in place of the
# superseded spread(); values_fill plays the role of spread()'s `fill`.
dta_gw %>%
  ungroup() %>%
  pivot_wider(
    names_from = genus,
    values_from = mean_weight,
    values_fill = 0
  ) %>%
  # Keep the plot_id ordering that spread() produced.
  arrange(plot_id) %>%
  head()

# dta_l: the wide table converted back to long format. Every column except
# plot_id is stacked into a genus / mean_weight pair. pivot_longer() replaces
# the superseded gather().
dta_l <- dta_w %>%
  pivot_longer(
    cols = -plot_id,
    names_to = "genus",
    values_to = "mean_weight"
  ) %>%
  # gather() stacked the data one genus column at a time; reproduce that order.
  arrange(genus, plot_id)

# Check the structure of the long table.
glimpse(dta_l)

## Rows: 240
## Columns: 3
## $ plot_id     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,~
## $ genus       <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
## $ mean_weight <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA, NA~

# Same wide-to-long conversion, but selecting the genus columns by range
# (Baiomys through Spermophilus) rather than by excluding plot_id; the first
# six rows are shown. pivot_longer() replaces the superseded gather().
dta_w %>%
  pivot_longer(
    cols = Baiomys:Spermophilus,
    names_to = "genus",
    values_to = "mean_weight"
  ) %>%
  # Match gather()'s column-by-column row order.
  arrange(genus, plot_id) %>%
  head()

# dta_complete: only the animals for which weight, hindfoot length and sex
# are all recorded (a row is dropped if any of the three is missing).
dta_complete <- dta %>%
  filter(!(is.na(weight) | is.na(hindfoot_length) | is.na(sex)))

# species_counts: number of complete records (column n) per species_id,
# restricted to species with at least 50 records.
species_counts <- dta_complete %>%
  group_by(species_id) %>%
  tally() %>%
  filter(n >= 50)

# Keep only the records whose species_id is listed in species_counts.
# `x %in% y` is TRUE for each element of x that also occurs in y.
dta_complete <- filter(
  dta_complete,
  species_id %in% species_counts$species_id
)