Inclass1

Comment on what each code chunk in the following classroom markdown file is trying to achive and on its output.

Data

載入WWGbook package中的classroom dataset

pacman::p_load(WWGbook)
data(classroom, package="WWGbook")

#了解這個資料中的訊息
?classroom

The Study of Instructional Improvement (SII; Hill, Rowan, and Ball, 2004) was carried out by researchers at the University of Michigan to study the math achievement scores of first- and third-grade students in randomly selected classrooms from a national U.S. sample of elementary schools.

#顯示classroom的資料結構
str(classroom)

'data.frame':   1190 obs. of  12 variables:
 $ sex     : int  1 0 1 0 0 1 0 0 1 0 ...
 $ minority: int  1 1 1 1 1 1 1 1 1 1 ...
 $ mathkind: int  448 460 511 449 425 450 452 443 422 480 ...
 $ mathgain: int  32 109 56 83 53 65 51 66 88 -7 ...
 $ ses     : num  0.46 -0.27 -0.03 -0.38 -0.03 0.76 -0.03 0.2 0.64 0.13 ...
 $ yearstea: num  1 1 1 2 2 2 2 2 2 2 ...
 $ mathknow: num  NA NA NA -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 ...
 $ housepov: num  0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 ...
 $ mathprep: num  2 2 2 3.25 3.25 3.25 3.25 3.25 3.25 3.25 ...
 $ classid : int  160 160 160 217 217 217 217 217 217 217 ...
 $ schoolid: int  1 1 1 1 1 1 1 1 1 1 ...
 $ childid : int  1 2 3 4 5 6 7 8 9 10 ...

A data frame with 1190 observations on the following 12 variables.

Split data

split data to unique school ID and only keep “schoolid”, “housepov” two variables

dta_schl <- classroom[duplicated(classroom$schoolid)==FALSE, 
                     c("schoolid", "housepov")]

duplicated 可以刪掉重複值。藉由duplicated為FALSE，保留唯一筆schoolid

dta_cls <- classroom[duplicated(classroom$classid)==FALSE, 
                     c(11, 10, 6,7,9)]

刪掉重複classid，並保留schoolid、classid、yearstea、mathknow、mathprep

dta_chld <- classroom[, c(12, 10, 11, 1:5)]

排序欄位：childid、classid、schoolid、sex、minority、mathkind、mathgain、ses

sapply(list(dta_schl, dta_cls, dta_chld), dim)

     [,1] [,2] [,3]
[1,]  107  312 1190
[2,]    2    5    8

將3個dataset放入list，利用sapply看每個dataset的dimention

Combine data

merge dataset by column name

dta_12 <- merge(x=dta_chld, y=dta_cls, by=c("classid", "schoolid"))

dta_13 <- merge(x=dta_chld, y=dta_cls, by=c("classid", "schoolid"))

dta_23 <- merge(x=dta_cls, y=dta_schl, by="schoolid")

dta_123 <- merge(x=dta_12, y=dta_schl, by=c("schoolid"))

sapply(list(dta_12, dta_13, dta_23, dta_123), dim)

     [,1] [,2] [,3] [,4]
[1,] 1190 1190  312 1190
[2,]   11   11    6   12

show 每一個merge完的dataset的dimention

when merge is not specifications of the columns

library(dplyr)
dta_chldv<-dta_chld|> filter(schoolid== 1) #11筆
dta_clsv<-dta_cls|> filter(schoolid== 1) #2筆
dta_12s<- merge(x=dta_chldv, y=dta_clsv, by="schoolid") #22筆

Inclass2

Merge the two data sets: state.x77{datasets} and USArrests{datasets} and compute all pair-wise correlations for numerical variables. Is there anything interesting to report?

# attributes of dataset
attributes(state.x77)

$dim
[1] 50  8

$dimnames
$dimnames[[1]]
 [1] "Alabama"        "Alaska"         "Arizona"        "Arkansas"      
 [5] "California"     "Colorado"       "Connecticut"    "Delaware"      
 [9] "Florida"        "Georgia"        "Hawaii"         "Idaho"         
[13] "Illinois"       "Indiana"        "Iowa"           "Kansas"        
[17] "Kentucky"       "Louisiana"      "Maine"          "Maryland"      
[21] "Massachusetts"  "Michigan"       "Minnesota"      "Mississippi"   
[25] "Missouri"       "Montana"        "Nebraska"       "Nevada"        
[29] "New Hampshire"  "New Jersey"     "New Mexico"     "New York"      
[33] "North Carolina" "North Dakota"   "Ohio"           "Oklahoma"      
[37] "Oregon"         "Pennsylvania"   "Rhode Island"   "South Carolina"
[41] "South Dakota"   "Tennessee"      "Texas"          "Utah"          
[45] "Vermont"        "Virginia"       "Washington"     "West Virginia" 
[49] "Wisconsin"      "Wyoming"       

$dimnames[[2]]
[1] "Population" "Income"     "Illiteracy" "Life Exp"   "Murder"    
[6] "HS Grad"    "Frost"      "Area"

attributes(USArrests)

$names
[1] "Murder"   "Assault"  "UrbanPop" "Rape"    

$class
[1] "data.frame"

$row.names
 [1] "Alabama"        "Alaska"         "Arizona"        "Arkansas"      
 [5] "California"     "Colorado"       "Connecticut"    "Delaware"      
 [9] "Florida"        "Georgia"        "Hawaii"         "Idaho"         
[13] "Illinois"       "Indiana"        "Iowa"           "Kansas"        
[17] "Kentucky"       "Louisiana"      "Maine"          "Maryland"      
[21] "Massachusetts"  "Michigan"       "Minnesota"      "Mississippi"   
[25] "Missouri"       "Montana"        "Nebraska"       "Nevada"        
[29] "New Hampshire"  "New Jersey"     "New Mexico"     "New York"      
[33] "North Carolina" "North Dakota"   "Ohio"           "Oklahoma"      
[37] "Oregon"         "Pennsylvania"   "Rhode Island"   "South Carolina"
[41] "South Dakota"   "Tennessee"      "Texas"          "Utah"          
[45] "Vermont"        "Virginia"       "Washington"     "West Virginia" 
[49] "Wisconsin"      "Wyoming"

# merge
dta2 <- merge(state.x77,USArrests, by="row.names")
#column name
names(dta2)

 [1] "Row.names"  "Population" "Income"     "Illiteracy" "Life Exp"  
 [6] "Murder.x"   "HS Grad"    "Frost"      "Area"       "Murder.y"  
[11] "Assault"    "UrbanPop"   "Rape"

merge state.x77 and USArrests by row.names.

pacman::p_load(dplyr, magrittr)
# rename
dta2 %<>% 
  dplyr::rename(State=Row.names, Murder1976=Murder.x, Murder1973=Murder.y)
#
dta2$State <- as.factor(dta2$State)
str(dta2)

'data.frame':   50 obs. of  13 variables:
 $ State     : Factor w/ 50 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ Population: num  3615 365 2212 2110 21198 ...
 $ Income    : num  3624 6315 4530 3378 5114 ...
 $ Illiteracy: num  2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
 $ Life Exp  : num  69 69.3 70.5 70.7 71.7 ...
 $ Murder1976: num  15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
 $ HS Grad   : num  41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
 $ Frost     : num  20 152 15 65 20 166 139 103 11 60 ...
 $ Area      : num  50708 566432 113417 51945 156361 ...
 $ Murder1973: num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
 $ Assault   : int  236 263 294 190 276 204 110 238 335 211 ...
 $ UrbanPop  : int  58 48 80 50 91 78 77 72 80 60 ...
 $ Rape      : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...

pacman::p_load(dplyr, magrittr, corrplot)

# merge by row name (states)
dat2 <- merge(state.x77, USArrests, by="row.names", all=TRUE)

# column name
dat2 %<>%
 dplyr::rename(State = Row.names, 
               Murder1976 = Murder.x, 
               Murder1973 = Murder.y)
# change type and show head
dat2<-dat2 |>
  mutate(State= as.factor(State))

res <-round(cor(dta2[,-1]), 2)
knitr::kable(res)#要有括號才跑得出來

	Population	Income	Illiteracy	Life Exp	Murder1976	HS Grad	Frost	Area	Murder1973	Assault	UrbanPop	Rape
Population	1.00	0.21	0.11	-0.07	0.34	-0.10	-0.33	0.02	0.32	0.32	0.51	0.31
Income	0.21	1.00	-0.44	0.34	-0.23	0.62	0.23	0.36	-0.22	0.04	0.48	0.36
Illiteracy	0.11	-0.44	1.00	-0.59	0.70	-0.66	-0.67	0.08	0.71	0.51	-0.06	0.15
Life Exp	-0.07	0.34	-0.59	1.00	-0.78	0.58	0.26	-0.11	-0.78	-0.63	0.27	-0.27
Murder1976	0.34	-0.23	0.70	-0.78	1.00	-0.49	-0.54	0.23	0.93	0.74	0.02	0.58
HS Grad	-0.10	0.62	-0.66	0.58	-0.49	1.00	0.37	0.33	-0.52	-0.23	0.36	0.27
Frost	-0.33	0.23	-0.67	0.26	-0.54	0.37	1.00	0.06	-0.54	-0.47	-0.25	-0.28
Area	0.02	0.36	0.08	-0.11	0.23	0.33	0.06	1.00	0.15	0.23	-0.06	0.52
Murder1973	0.32	-0.22	0.71	-0.78	0.93	-0.52	-0.54	0.15	1.00	0.80	0.07	0.56
Assault	0.32	0.04	0.51	-0.63	0.74	-0.23	-0.47	0.23	0.80	1.00	0.26	0.67
UrbanPop	0.51	0.48	-0.06	0.27	0.02	0.36	-0.25	-0.06	0.07	0.26	1.00	0.41
Rape	0.31	0.36	0.15	-0.27	0.58	0.27	-0.28	0.52	0.56	0.67	0.41	1.00

class(res)

[1] "matrix" "array"

library(corrplot)
corrplot(res, type = "upper", order = "hclust", 
         tl.col = "black", tl.srt = 45)

The correlation matrix is reordered according to the correlation coefficient using “hclust” method.

tl.col (for text label color) and tl.srt (for text label string rotation) are used to change text colors and rotations.

Possible values for the argument type are : “upper”, “lower”, “full”

結論 Illiteracy has positive correlation on Rape, Assault and Murder.

Inclass3

Summarize the backpain{HSAUR3} into the following format:

You should provide comments for each code chunk.

# load package
pacman::p_load(HSAUR3)

# Input data
data("backpain", package="HSAUR3")
dta3 <- backpain
head(dta3)

  ID  status driver suburban
1  1    case    yes      yes
2  1 control    yes       no
3  2    case    yes      yes
4  2 control    yes      yes
5  3    case    yes       no
6  3 control    yes      yes

str(dta3)

'data.frame':   434 obs. of  4 variables:
 $ ID      : Factor w/ 217 levels "1","2","3","4",..: 1 1 2 2 3 3 4 4 5 5 ...
 $ status  : Factor w/ 2 levels "case","control": 1 2 1 2 1 2 1 2 1 2 ...
 $ driver  : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 1 1 2 2 ...
 $ suburban: Factor w/ 2 levels "no","yes": 2 1 2 2 1 2 1 1 1 2 ...

base R approach

A <- with(dta3,table(driver, suburban, status))
tbl <-with(dta3,ftable(driver, suburban, status))
tbl1<-with(dta3,ftable(addmargins(A, c(1, 3),
        FUN = list(list(Sum = sum), Sum = sum))))

Margins computed over dimensions
in the following order:
1: driver
2: status

tbl

                status case control
driver suburban                    
no     no                26      47
       yes                6       7
yes    no                64      63
       yes              121     100

tbl1

                status case control Sum
driver suburban                        
no     no                26      47  73
       yes                6       7  13
yes    no                64      63 127
       yes              121     100 221
Sum    no                90     110 200
       yes              127     107 234

addmargins: Puts Arbitrary Margins on Multidimensional Tables or Arrays

tidyverse approach

pacman::p_load(dplyr, magrittr ,tidyr)
dta3 |>
  pivot_wider(names_from = status, values_from = status)|>
  group_by(driver, suburban) |> 
  summarize(Case = sum(!is.na(case)), # sum of case number
            Control = sum(!is.na(control)), # sum of control number
            Total = as.numeric(Case)+as.numeric(Control)) %>% # sum of total
  as.data.frame

  driver suburban Case Control Total
1     no       no   26      47    73
2     no      yes    6       7    13
3    yes       no   64      63   127
4    yes      yes  121     100   221

Inclass4

The data set Vocab{carData} gives observations on gender, education and vocabulary, from respondents to U.S. General Social Surveys, 1972-2004. Summarize the relationship between education and vocabulary over the years by gender.

# load package
pacman::p_load(carData)

# Input data
data("Vocab", package="carData")
dat4 <- Vocab

# columan rename
dat4 <- dat4 %>% 
 dplyr::rename(Year = year, 
               Sex = sex, 
               Education = education,
               Vocabulary = vocabulary)
# plot
dat4 %>% 
  mutate(Year = as.factor(Year)) %>% # for show the year number
  lattice::xyplot(Vocabulary ~ Education | Year, 
                  groups = Sex,
                  type = c("p", "g", "r"), data = ., 
                  cex= .5,
                  pch = 19,
                  auto.key = list(columns = 2),
                  xlab = "Education", ylab = "Vocabulary",
                  layout = c(4,6),
                  par.settings = list(superpose.symbol = list(pch = 19, cex = 1.5,
                                                   col = c("#D95F02", "#7570B3")),
                                      superpose.line = list(col = c("#D95F02", "#7570B3"),
                                                 lwd = 1.5)))

Inclass5

Supply comments to each code chunk in the following survey rmarkdown file and preview it as an R notebook or knit to html.

# 載入tidyverse package
pacman::p_load(tidyverse)

# 從網路上載下csv
dta <- read_csv("http://kbroman.org/datacarp/portal_data_joined.csv")

# 看一下資料數據型態
glimpse(dta)

Rows: 34,786
Columns: 13
$ record_id       <dbl> 1, 72, 224, 266, 349, 363, 435, 506, 588, 661, 748, 84~
$ month           <dbl> 7, 8, 9, 10, 11, 11, 12, 1, 2, 3, 4, 5, 6, 8, 9, 10, 1~
$ day             <dbl> 16, 19, 13, 16, 12, 12, 10, 8, 18, 11, 8, 6, 9, 5, 4, ~
$ year            <dbl> 1977, 1977, 1977, 1977, 1977, 1977, 1977, 1978, 1978, ~
$ plot_id         <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
$ species_id      <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", ~
$ sex             <chr> "M", "M", NA, NA, NA, NA, NA, NA, "M", NA, NA, "M", "M~
$ hindfoot_length <dbl> 32, 31, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, NA, 34~
$ weight          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 218, NA, NA, 204, 200,~
$ genus           <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Neotoma",~
$ species         <chr> "albigula", "albigula", "albigula", "albigula", "albig~
$ taxa            <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent", "Rod~
$ plot_type       <chr> "Control", "Control", "Control", "Control", "Control",~

# 資料維度
dim(dta)

[1] 34786    13

select

# 用dplyr::select選擇dta中的plot_id, species_id, weight三個變項
dplyr::select(dta, plot_id, species_id, weight) %>% head()

# A tibble: 6 x 3
  plot_id species_id weight
    <dbl> <chr>       <dbl>
1       2 NL             NA
2       2 NL             NA
3       2 NL             NA
4       2 NL             NA
5       2 NL             NA
6       2 NL             NA

# select 也可以指定要刪掉哪幾個變項就好(刪掉record_id，species_id)
dplyr::select(dta, -record_id, -species_id) %>% head()

# A tibble: 6 x 11
  month   day  year plot_id sex   hindfoot_length weight genus   species  taxa  
  <dbl> <dbl> <dbl>   <dbl> <chr>           <dbl>  <dbl> <chr>   <chr>    <chr> 
1     7    16  1977       2 M                  32     NA Neotoma albigula Rodent
2     8    19  1977       2 M                  31     NA Neotoma albigula Rodent
3     9    13  1977       2 <NA>               NA     NA Neotoma albigula Rodent
4    10    16  1977       2 <NA>               NA     NA Neotoma albigula Rodent
5    11    12  1977       2 <NA>               NA     NA Neotoma albigula Rodent
6    11    12  1977       2 <NA>               NA     NA Neotoma albigula Rodent
# ... with 1 more variable: plot_type <chr>

filter

# 利用filter篩選年代為1995年的資料
dplyr::filter(dta, year == 1995) %>% head()

# A tibble: 6 x 13
  record_id month   day  year plot_id species_id sex   hindfoot_length weight
      <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
1     22314     6     7  1995       2 NL         M                  34     NA
2     22728     9    23  1995       2 NL         F                  32    165
3     22899    10    28  1995       2 NL         F                  32    171
4     23032    12     2  1995       2 NL         F                  33     NA
5     22003     1    11  1995       2 DM         M                  37     41
6     22042     2     4  1995       2 DM         F                  36     45
# ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
#   plot_type <chr>

# 先filter體重<= 5，用select選擇species_id, sex, weight三個變項
head(dplyr::select(dplyr::filter(dta, weight <= 5), species_id, sex, weight))

# A tibble: 6 x 3
  species_id sex   weight
  <chr>      <chr>  <dbl>
1 PF         M          5
2 PF         F          5
3 PF         F          5
4 PF         F          4
5 PF         F          5
6 PF         F          4

pipe

# 這個與放面的語法一樣，透過%>%可以讓code更直觀
dta %>% 
  dplyr::filter(weight <= 5) %>% 
  dplyr::select(species_id, sex, weight) %>% 
  head

# A tibble: 6 x 3
  species_id sex   weight
  <chr>      <chr>  <dbl>
1 PF         M          5
2 PF         F          5
3 PF         F          5
4 PF         F          4
5 PF         F          5
6 PF         F          4

mutate

# 用mutate 建立weight_kg與weight_lb兩個變項，並分別給定計算方式
dta %>% 
  mutate(weight_kg = weight / 1000,
         weight_lb = weight_kg * 2.2) %>% 
  head()

# A tibble: 6 x 15
  record_id month   day  year plot_id species_id sex   hindfoot_length weight
      <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
1         1     7    16  1977       2 NL         M                  32     NA
2        72     8    19  1977       2 NL         M                  31     NA
3       224     9    13  1977       2 NL         <NA>               NA     NA
4       266    10    16  1977       2 NL         <NA>               NA     NA
5       349    11    12  1977       2 NL         <NA>               NA     NA
6       363    11    12  1977       2 NL         <NA>               NA     NA
# ... with 6 more variables: genus <chr>, species <chr>, taxa <chr>,
#   plot_type <chr>, weight_kg <dbl>, weight_lb <dbl>

group_by, summarize and arrange

# filter篩選體重沒有na的資料，group_by依照 sex與species_id，計算mean weight，並遞減排序
dta %>% 
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarize(mean_weight = mean(weight)) %>%
  arrange(desc(mean_weight)) %>% 
  head()

# A tibble: 6 x 3
# Groups:   sex [3]
  sex   species_id mean_weight
  <chr> <chr>            <dbl>
1 <NA>  NL                168.
2 M     NL                166.
3 F     NL                154.
4 M     SS                130 
5 <NA>  SH                130 
6 M     DS                122.

compare tally, count and summarize

dta %>%
  group_by(sex) %>%
  tally #與count()相似，但必須先寫group_by()才行

# A tibble: 3 x 2
  sex       n
  <chr> <int>
1 F     15690
2 M     17348
3 <NA>   1748

dta %>%
  count(sex) #count()除了計算數量還包含執行group_by的功能

# A tibble: 3 x 2
  sex       n
  <chr> <int>
1 F     15690
2 M     17348
3 <NA>   1748

#用summarize可以建立一個叫count的變項，並透過group_by sex後n()計算row的總數
dta %>%
  group_by(sex) %>%
  summarize(count = n())

# A tibble: 3 x 2
  sex   count
  <chr> <int>
1 F     15690
2 M     17348
3 <NA>   1748

# 建立count欄位，並用sum加總沒有Missing的year數量
dta %>%
  group_by(sex) %>%
  summarize(count = sum(!is.na(year)))

# A tibble: 3 x 2
  sex   count
  <chr> <int>
1 F     15690
2 M     17348
3 <NA>   1748

dta_gw <- dta %>% 
  filter(!is.na(weight)) %>% #篩選出沒有na的weight列
  group_by(genus, plot_id) %>% # group_by genus與plot_id
  summarize(mean_weight = mean(weight)) #計算體重平均並放入mean_weight變項中

# 看資料的數據屬性
glimpse(dta_gw)

Rows: 196
Columns: 3
Groups: genus [10]
$ genus       <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
$ plot_id     <dbl> 1, 2, 3, 5, 18, 19, 20, 21, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,~
$ mean_weight <dbl> 7.000000, 6.000000, 8.611111, 7.750000, 9.500000, 9.533333~

spread

# can be used to “spread” a key-value pair across multiple columns.
# 與pivot_wider一樣功能?

dta_w <- dta_gw %>%
  spread(key = genus, value = mean_weight)

key: Column whose values will become variable names

value: Column where values will fill under new variables created from key

glimpse(dta_w)

Rows: 24
Columns: 11
$ plot_id         <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,~
$ Baiomys         <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA~
$ Chaetodipus     <dbl> 22.19939, 25.11014, 24.63636, 23.02381, 17.98276, 24.8~
$ Dipodomys       <dbl> 60.23214, 55.68259, 52.04688, 57.52454, 51.11356, 58.6~
$ Neotoma         <dbl> 156.2222, 169.1436, 158.2414, 164.1667, 190.0370, 179.~
$ Onychomys       <dbl> 27.67550, 26.87302, 26.03241, 28.09375, 27.01695, 25.8~
$ Perognathus     <dbl> 9.625000, 6.947368, 7.507812, 7.824427, 8.658537, 7.80~
$ Peromyscus      <dbl> 22.22222, 22.26966, 21.37037, 22.60000, 21.23171, 21.8~
$ Reithrodontomys <dbl> 11.375000, 10.680556, 10.516588, 10.263158, 11.154545,~
$ Sigmodon        <dbl> NA, 70.85714, 65.61404, 82.00000, 82.66667, 68.77778, ~
$ Spermophilus    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 13~

dta_gw %>%
  spread(genus, mean_weight, fill = 0) %>% # fill把NA設定為0
  head()

# A tibble: 6 x 11
  plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
    <dbl>   <dbl>       <dbl>     <dbl>   <dbl>     <dbl>       <dbl>      <dbl>
1       1    7           22.2      60.2    156.      27.7        9.62       22.2
2       2    6           25.1      55.7    169.      26.9        6.95       22.3
3       3    8.61        24.6      52.0    158.      26.0        7.51       21.4
4       4    0           23.0      57.5    164.      28.1        7.82       22.6
5       5    7.75        18.0      51.1    190.      27.0        8.66       21.2
6       6    0           24.9      58.6    180.      25.9        7.81       21.8
# ... with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
#   Spermophilus <dbl>

gather

# Gather columns into key-value pairs
# 轉long form的意思
dta_l <- dta_w %>%
  gather(key = genus, 
         value = mean_weight,
           -plot_id)#因為針對整個dta_w進行轉置，會錯誤納入plot_id為genus中，所以要排除他

glimpse(dta_l)#因為沒有一起進入轉置，所以才能出現plot_id

Rows: 240
Columns: 3
$ plot_id     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,~
$ genus       <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
$ mean_weight <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA, NA~

dta_w %>%
  gather(key = genus, 
         value = mean_weight, 
         Baiomys:Spermophilus) %>% #這邊直接指定屬genus範圍是Baiomys:Spermophilus
  head()

# A tibble: 6 x 3
  plot_id genus   mean_weight
    <dbl> <chr>         <dbl>
1       1 Baiomys        7   
2       2 Baiomys        6   
3       3 Baiomys        8.61
4       4 Baiomys       NA   
5       5 Baiomys        7.75
6       6 Baiomys       NA

different using filter approach

# filter by not NA on weight, hindfoot_length, sex sequently
dta_complete <- dta %>%
  filter(!is.na(weight),           
         !is.na(hindfoot_length),  
         !is.na(sex))

# 先依照species_id計算數量，並篩選出>= 50
species_counts <- dta_complete %>%
    count(species_id) %>% 
    filter(n >= 50)

# 篩選有在dta_complete的species_id也有在species_counts$species_id
# 意思與保留species_id計算數量>= 50相同
dta_complete <- dta_complete %>%
  filter(species_id %in% species_counts$species_id)

1101Inclass

Jenny Hsu

2021/11/3