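The data set records the species and weight of animals caught in plots in a study area in Arizona over time.
Each row holds information for a single animal, and the columns are: record_id, month, day, year, plot_id, species_id, sex, hindfoot_length, weight, genus, species, taxa, and plot_type.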
# Load the tidyverse packages through pacman.
pacman::p_load(tidyverse)
# Read the joined survey CSV from the web and store it as `dta`.
dta <- read_csv(file = "http://kbroman.org/datacarp/portal_data_joined.csv")
## Parsed with column specification:
## cols(
## record_id = col_double(),
## month = col_double(),
## day = col_double(),
## year = col_double(),
## plot_id = col_double(),
## species_id = col_character(),
## sex = col_character(),
## hindfoot_length = col_double(),
## weight = col_double(),
## genus = col_character(),
## species = col_character(),
## taxa = col_character(),
## plot_type = col_character()
## )
# Compact overview of `dta`: one line per column with its type and first values.
dta %>%
  glimpse()
## Observations: 34,786
## Variables: 13
## $ record_id <dbl> 1, 72, 224, 266, 349, 363, 435, 506, 588, 661, 748,...
## $ month <dbl> 7, 8, 9, 10, 11, 11, 12, 1, 2, 3, 4, 5, 6, 8, 9, 10...
## $ day <dbl> 16, 19, 13, 16, 12, 12, 10, 8, 18, 11, 8, 6, 9, 5, ...
## $ year <dbl> 1977, 1977, 1977, 1977, 1977, 1977, 1977, 1978, 197...
## $ plot_id <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
## $ species_id <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL...
## $ sex <chr> "M", "M", NA, NA, NA, NA, NA, NA, "M", NA, NA, "M",...
## $ hindfoot_length <dbl> 32, 31, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, NA,...
## $ weight <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 218, NA, NA, 204, 2...
## $ genus <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Neotom...
## $ species <chr> "albigula", "albigula", "albigula", "albigula", "al...
## $ taxa <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent", "...
## $ plot_type <chr> "Control", "Control", "Control", "Control", "Contro...
# Dimensions of `dta` (rows, then columns).
dta %>%
  dim()
## [1] 34786 13
# Keep only plot_id, species_id and weight, then show the first rows.
dta %>%
  dplyr::select(plot_id, species_id, weight) %>%
  head()
## # A tibble: 6 x 3
## plot_id species_id weight
## <dbl> <chr> <dbl>
## 1 2 NL NA
## 2 2 NL NA
## 3 2 NL NA
## 4 2 NL NA
## 5 2 NL NA
## 6 2 NL NA
# Drop record_id and species_id, keep every other column, and show the
# first rows.
dta %>%
  dplyr::select(-record_id, -species_id) %>%
  head()
## # A tibble: 6 x 11
## month day year plot_id sex hindfoot_length weight genus species taxa
## <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 7 16 1977 2 M 32 NA Neot~ albigu~ Rode~
## 2 8 19 1977 2 M 31 NA Neot~ albigu~ Rode~
## 3 9 13 1977 2 <NA> NA NA Neot~ albigu~ Rode~
## 4 10 16 1977 2 <NA> NA NA Neot~ albigu~ Rode~
## 5 11 12 1977 2 <NA> NA NA Neot~ albigu~ Rode~
## 6 11 12 1977 2 <NA> NA NA Neot~ albigu~ Rode~
## # ... with 1 more variable: plot_type <chr>
# Keep only the records from 1995 and show the first rows.
dta %>%
  dplyr::filter(year == 1995) %>%
  head()
## # A tibble: 6 x 13
## record_id month day year plot_id species_id sex hindfoot_length weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 22314 6 7 1995 2 NL M 34 NA
## 2 22728 9 23 1995 2 NL F 32 165
## 3 22899 10 28 1995 2 NL F 32 171
## 4 23032 12 2 1995 2 NL F 33 NA
## 5 22003 1 11 1995 2 DM M 37 41
## 6 22042 2 4 1995 2 DM F 36 45
## # ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
## # plot_type <chr>
# Nested-call form: keep animals weighing at most 5, reduce to species_id,
# sex and weight, then show the first rows. Read from the inside out.
head(
  dplyr::select(
    dplyr::filter(dta, weight <= 5),
    species_id, sex, weight
  )
)
## # A tibble: 6 x 3
## species_id sex weight
## <chr> <chr> <dbl>
## 1 PF M 5
## 2 PF F 5
## 3 PF F 5
## 4 PF F 4
## 5 PF F 5
## 6 PF F 4
# Pipe form of the same query: keep animals weighing at most 5, reduce to
# species_id, sex and weight, then show the first rows.
dta %>%
  dplyr::filter(weight <= 5) %>%
  dplyr::select(species_id, sex, weight) %>%
  head()
## # A tibble: 6 x 3
## species_id sex weight
## <chr> <chr> <dbl>
## 1 PF M 5
## 2 PF F 5
## 3 PF F 5
## 4 PF F 4
## 5 PF F 5
## 6 PF F 4
# Add two derived columns and show the first rows:
#   weight_kg = weight / 1000
#   weight_lb = weight_kg * 2.2
dta %>%
  mutate(weight_kg = weight / 1000) %>%
  mutate(weight_lb = weight_kg * 2.2) %>%
  head()
## # A tibble: 6 x 15
## record_id month day year plot_id species_id sex hindfoot_length weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 1 7 16 1977 2 NL M 32 NA
## 2 72 8 19 1977 2 NL M 31 NA
## 3 224 9 13 1977 2 NL <NA> NA NA
## 4 266 10 16 1977 2 NL <NA> NA NA
## 5 349 11 12 1977 2 NL <NA> NA NA
## 6 363 11 12 1977 2 NL <NA> NA NA
## # ... with 6 more variables: genus <chr>, species <chr>, taxa <chr>,
## # plot_type <chr>, weight_kg <dbl>, weight_lb <dbl>
# Mean weight per (sex, species_id) combination, heaviest first:
#   1. drop rows whose weight is missing,
#   2. group by sex and species_id,
#   3. average the weight within each group,
#   4. sort by mean_weight in descending order and show the first rows.
dta %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarise(mean_weight = mean(weight)) %>%
  arrange(desc(mean_weight)) %>%
  head()
## # A tibble: 6 x 3
## # Groups: sex [3]
## sex species_id mean_weight
## <chr> <chr> <dbl>
## 1 <NA> NL 168.
## 2 M NL 166.
## 3 F NL 154.
## 4 M SS 130
## 5 <NA> SH 130
## 6 M DS 122.
# Number of rows per sex: group first, then tally each group.
dta %>%
  group_by(sex) %>%
  tally()
## # A tibble: 3 x 2
## sex n
## <chr> <int>
## 1 F 15690
## 2 M 17348
## 3 <NA> 1748
# Number of rows per sex, using count() directly.
count(dta, sex)
## # A tibble: 3 x 2
## sex n
## <chr> <int>
## 1 F 15690
## 2 M 17348
## 3 <NA> 1748
# Number of rows per sex, computed with n() inside summarise() and stored
# in a column named `count`.
dta %>%
  group_by(sex) %>%
  summarise(count = n())
## # A tibble: 3 x 2
## sex count
## <chr> <int>
## 1 F 15690
## 2 M 17348
## 3 <NA> 1748
# Per sex, count the rows whose `year` is not missing: !is.na(year) is TRUE
# for each non-missing value, and sum() counts the TRUEs.
dta %>%
  group_by(sex) %>%
  summarise(count = sum(!is.na(year)))
## # A tibble: 3 x 2
## sex count
## <chr> <int>
## 1 F 15690
## 2 M 17348
## 3 <NA> 1748
# Build `dta_gw`: the mean weight for each (genus, plot_id) pair, computed
# only from rows whose weight is not missing.
dta_gw <- dta %>%
  filter(!is.na(weight)) %>%
  group_by(genus, plot_id) %>%
  summarise(mean_weight = mean(weight))
# Inspect the structure of `dta_gw`.
dta_gw %>%
  glimpse()
## Observations: 196
## Variables: 3
## Groups: genus [10]
## $ genus <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", ...
## $ plot_id <dbl> 1, 2, 3, 5, 18, 19, 20, 21, 1, 2, 3, 4, 5, 6, 7, 8, 9, ...
## $ mean_weight <dbl> 7.000000, 6.000000, 8.611111, 7.750000, 9.500000, 9.533...
# Reshape `dta_gw` to wide format as `dta_w`: each genus becomes its own
# column, and the cells hold the corresponding mean_weight.
dta_w <- spread(dta_gw, key = genus, value = mean_weight)
# Inspect the structure of `dta_w`.
dta_w %>%
  glimpse()
## Observations: 24
## Variables: 11
## $ plot_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
## $ Baiomys <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA,...
## $ Chaetodipus <dbl> 22.19939, 25.11014, 24.63636, 23.02381, 17.98276, 2...
## $ Dipodomys <dbl> 60.23214, 55.68259, 52.04688, 57.52454, 51.11356, 5...
## $ Neotoma <dbl> 156.2222, 169.1436, 158.2414, 164.1667, 190.0370, 1...
## $ Onychomys <dbl> 27.67550, 26.87302, 26.03241, 28.09375, 27.01695, 2...
## $ Perognathus <dbl> 9.625000, 6.947368, 7.507812, 7.824427, 8.658537, 7...
## $ Peromyscus <dbl> 22.22222, 22.26966, 21.37037, 22.60000, 21.23171, 2...
## $ Reithrodontomys <dbl> 11.375000, 10.680556, 10.516588, 10.263158, 11.1545...
## $ Sigmodon <dbl> NA, 70.85714, 65.61404, 82.00000, 82.66667, 68.7777...
## $ Spermophilus <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
# Same wide reshape (one column per genus, cells hold mean_weight), but
# genus/plot combinations with no value are filled with 0 instead of NA.
# Show the first rows.
dta_gw %>%
  spread(key = genus, value = mean_weight, fill = 0) %>%
  head()
## # A tibble: 6 x 11
## plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 7 22.2 60.2 156. 27.7 9.62 22.2
## 2 2 6 25.1 55.7 169. 26.9 6.95 22.3
## 3 3 8.61 24.6 52.0 158. 26.0 7.51 21.4
## 4 4 0 23.0 57.5 164. 28.1 7.82 22.6
## 5 5 7.75 18.0 51.1 190. 27.0 8.66 21.2
## 6 6 0 24.9 58.6 180. 25.9 7.81 21.8
## # ... with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
## # Spermophilus <dbl>
# Reshape `dta_w` back to long format as `dta_l`: every column except
# plot_id is gathered, with the former column name stored in `genus` and the
# cell value stored in `mean_weight`.
dta_l <- gather(dta_w, key = "genus", value = "mean_weight", -plot_id)
# Inspect the structure of `dta_l`.
dta_l %>%
  glimpse()
## Observations: 240
## Variables: 3
## $ plot_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ...
## $ genus <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", ...
## $ mean_weight <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA,...
# Same long reshape, but the columns to gather are given as the range
# Baiomys:Spermophilus instead of by excluding plot_id. Show the first rows.
dta_w %>%
  gather(key = "genus", value = "mean_weight", Baiomys:Spermophilus) %>%
  head()
## # A tibble: 6 x 3
## plot_id genus mean_weight
## <dbl> <chr> <dbl>
## 1 1 Baiomys 7
## 2 2 Baiomys 6
## 3 3 Baiomys 8.61
## 4 4 Baiomys NA
## 5 5 Baiomys 7.75
## 6 6 Baiomys NA
# Build `dta_complete`: keep only the rows where weight, hindfoot_length and
# sex are all present (none of the three is NA).
dta_complete <- dta %>%
  filter(complete.cases(weight, hindfoot_length, sex))
# Count the rows per species_id in `dta_complete` and keep, as
# `species_counts`, only the species with at least 50 records.
species_counts <- dta_complete %>%
  count(species_id) %>%
  filter(n >= 50)
# Restrict `dta_complete` to the species listed in `species_counts`.
dta_complete <- dta_complete %>%
  filter(species_id %in% species_counts[["species_id"]])
# data source: