In-class exercises_2

Merge the two data sets: state.x77{datasets} and USArrests{datasets} and compute all pair-wise correlations for numerical variables. Is there anything interesting to report?

library(datasets)
data(USArrests, package="datasets")
data(state, package="datasets")
USArrests|> head() |> knitr::kable()
Murder Assault UrbanPop Rape
Alabama 13.2 236 58 21.2
Alaska 10.0 263 48 44.5
Arizona 8.1 294 80 31.0
Arkansas 8.8 190 50 19.5
California 9.0 276 91 40.6
Colorado 7.9 204 78 38.7
state.x77|> head() |> knitr::kable()
Population Income Illiteracy Life Exp Murder HS Grad Frost Area
Alabama 3615 3624 2.1 69.05 15.1 41.3 20 50708
Alaska 365 6315 1.5 69.31 11.3 66.7 152 566432
Arizona 2212 4530 1.8 70.55 7.8 58.1 15 113417
Arkansas 2110 3378 1.9 70.66 10.1 39.9 65 51945
California 21198 5114 1.1 71.71 10.3 62.6 20 156361
Colorado 2541 4884 0.7 72.06 6.8 63.9 166 103766

先了解兩個資料的樣態,再決定怎麼合併

str(USArrests) #'data.frame':   50 obs. of  4 variables
'data.frame':   50 obs. of  4 variables:
 $ Murder  : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
 $ Assault : int  236 263 294 190 276 204 110 238 335 211 ...
 $ UrbanPop: int  58 48 80 50 91 78 77 72 80 60 ...
 $ Rape    : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
str(state.x77) #num [1:50, 1:8]
 num [1:50, 1:8] 3615 365 2212 2110 21198 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
  ..$ : chr [1:8] "Population" "Income" "Illiteracy" "Life Exp" ...
class(USArrests) # class(USArrests)是data.frame
[1] "data.frame"
class(state.x77) # class(state.x77)是"matrix" "array" 
[1] "matrix" "array" 
state.x77f<-state.x77|>as.data.frame()   #換成data.frame
str(state.x77f) #'data.frame':  50 obs. of  8 variables
'data.frame':   50 obs. of  8 variables:
 $ Population: num  3615 365 2212 2110 21198 ...
 $ Income    : num  3624 6315 4530 3378 5114 ...
 $ Illiteracy: num  2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
 $ Life Exp  : num  69 69.3 70.5 70.7 71.7 ...
 $ Murder    : num  15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
 $ HS Grad   : num  41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
 $ Frost     : num  20 152 15 65 20 166 139 103 11 60 ...
 $ Area      : num  50708 566432 113417 51945 156361 ...
names(USArrests)
[1] "Murder"   "Assault"  "UrbanPop" "Rape"    
names(state.x77f) #發現資料column重複的是"Murder"
[1] "Population" "Income"     "Illiteracy" "Life Exp"   "Murder"    
[6] "HS Grad"    "Frost"      "Area"      
row.names(USArrests) #Row的名稱
 [1] "Alabama"        "Alaska"         "Arizona"        "Arkansas"      
 [5] "California"     "Colorado"       "Connecticut"    "Delaware"      
 [9] "Florida"        "Georgia"        "Hawaii"         "Idaho"         
[13] "Illinois"       "Indiana"        "Iowa"           "Kansas"        
[17] "Kentucky"       "Louisiana"      "Maine"          "Maryland"      
[21] "Massachusetts"  "Michigan"       "Minnesota"      "Mississippi"   
[25] "Missouri"       "Montana"        "Nebraska"       "Nevada"        
[29] "New Hampshire"  "New Jersey"     "New Mexico"     "New York"      
[33] "North Carolina" "North Dakota"   "Ohio"           "Oklahoma"      
[37] "Oregon"         "Pennsylvania"   "Rhode Island"   "South Carolina"
[41] "South Dakota"   "Tennessee"      "Texas"          "Utah"          
[45] "Vermont"        "Virginia"       "Washington"     "West Virginia" 
[49] "Wisconsin"      "Wyoming"       
row.names(state.x77f)#兩個資料的row name是一樣的
 [1] "Alabama"        "Alaska"         "Arizona"        "Arkansas"      
 [5] "California"     "Colorado"       "Connecticut"    "Delaware"      
 [9] "Florida"        "Georgia"        "Hawaii"         "Idaho"         
[13] "Illinois"       "Indiana"        "Iowa"           "Kansas"        
[17] "Kentucky"       "Louisiana"      "Maine"          "Maryland"      
[21] "Massachusetts"  "Michigan"       "Minnesota"      "Mississippi"   
[25] "Missouri"       "Montana"        "Nebraska"       "Nevada"        
[29] "New Hampshire"  "New Jersey"     "New Mexico"     "New York"      
[33] "North Carolina" "North Dakota"   "Ohio"           "Oklahoma"      
[37] "Oregon"         "Pennsylvania"   "Rhode Island"   "South Carolina"
[41] "South Dakota"   "Tennessee"      "Texas"          "Utah"          
[45] "Vermont"        "Virginia"       "Washington"     "West Virginia" 
[49] "Wisconsin"      "Wyoming"       
summary(USArrests$Murder)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.800   4.075   7.250   7.788  11.250  17.400 
summary(state.x77f$Murder)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.400   4.350   6.850   7.378  10.675  15.100 
dta_77USA_1 <- merge(x=state.x77f, y=USArrests, by=c("Murder"))
str(dta_77USA_1)
'data.frame':   15 obs. of  11 variables:
 $ Murder    : num  2.7 3.3 3.3 4.3 5.3 6.8 7.4 8.5 9.7 11.1 ...
 $ Population: num  1058 5814 812 3559 813 ...
 $ Income    : num  3694 4755 4281 4864 4119 ...
 $ Illiteracy: num  0.7 1.1 0.7 0.6 0.6 0.7 0.8 0.9 2.2 0.9 ...
 $ Life Exp  : num  70.4 71.8 71.2 71.7 71.9 ...
 $ HS Grad   : num  54.7 58.5 57.6 63.5 59.5 63.9 53.2 52.3 55.2 52.8 ...
 $ Frost     : num  161 103 174 32 126 166 124 101 120 125 ...
 $ Area      : num  30920 7826 9027 66570 82677 ...
 $ Assault   : int  72 110 110 102 46 161 159 156 109 254 ...
 $ UrbanPop  : int  66 77 77 62 83 60 89 63 52 86 ...
 $ Rape      : num  14.9 11.1 11.1 16.5 20.2 15.6 18.8 20.7 16.3 26.1 ...
dta_77USA_1|> head() |> knitr::kable() 
Murder Population Income Illiteracy Life Exp HS Grad Frost Area Assault UrbanPop Rape
2.7 1058 3694 0.7 70.39 54.7 161 30920 72 66 14.9
3.3 5814 4755 1.1 71.83 58.5 103 7826 110 77 11.1
3.3 812 4281 0.7 71.23 57.6 174 9027 110 77 11.1
4.3 3559 4864 0.6 71.72 63.5 32 66570 102 62 16.5
5.3 813 4119 0.6 71.87 59.5 126 82677 46 83 20.2
6.8 2541 4884 0.7 72.06 63.9 166 103766 161 60 15.6
#只保留有Murder column的資料
dta_77USA_2 <- merge(x=state.x77f, y=USArrests, by="row.names")
str(dta_77USA_2)
'data.frame':   50 obs. of  13 variables:
 $ Row.names : 'AsIs' chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
 $ Population: num  3615 365 2212 2110 21198 ...
 $ Income    : num  3624 6315 4530 3378 5114 ...
 $ Illiteracy: num  2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
 $ Life Exp  : num  69 69.3 70.5 70.7 71.7 ...
 $ Murder.x  : num  15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
 $ HS Grad   : num  41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
 $ Frost     : num  20 152 15 65 20 166 139 103 11 60 ...
 $ Area      : num  50708 566432 113417 51945 156361 ...
 $ Murder.y  : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
 $ Assault   : int  236 263 294 190 276 204 110 238 335 211 ...
 $ UrbanPop  : int  58 48 80 50 91 78 77 72 80 60 ...
 $ Rape      : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
dta_77USA_2|> head() |> knitr::kable() 
Row.names Population Income Illiteracy Life Exp Murder.x HS Grad Frost Area Murder.y Assault UrbanPop Rape
Alabama 3615 3624 2.1 69.05 15.1 41.3 20 50708 13.2 236 58 21.2
Alaska 365 6315 1.5 69.31 11.3 66.7 152 566432 10.0 263 48 44.5
Arizona 2212 4530 1.8 70.55 7.8 58.1 15 113417 8.1 294 80 31.0
Arkansas 2110 3378 1.9 70.66 10.1 39.9 65 51945 8.8 190 50 19.5
California 21198 5114 1.1 71.71 10.3 62.6 20 156361 9.0 276 91 40.6
Colorado 2541 4884 0.7 72.06 6.8 63.9 166 103766 7.9 204 78 38.7
#依據row name(50州)進行合併,因Murder在兩個data framne中重複,因此會產出Murder.x(state.x77f)和Murder.y(USArrests)
sum(is.na(dta_77USA_2))
[1] 0
dta_77USA_3 <- merge(x=state.x77f, y=USArrests, by=c("Murder"), all = T)
# all=T 是用來詢問是否顯示所有資料,如果有資料不全者,沒下 all = T,應該不會出現在合併資料中。
str(dta_77USA_3)
'data.frame':   88 obs. of  11 variables:
 $ Murder    : num  0.8 1.4 1.7 2.1 2.1 2.2 2.2 2.3 2.3 2.4 ...
 $ Population: num  NA 637 681 NA NA ...
 $ Income    : num  NA 5087 4167 NA NA ...
 $ Illiteracy: num  NA 0.8 0.5 NA NA NA NA 0.5 0.6 1.3 ...
 $ Life Exp  : num  NA 72.8 72.1 NA NA ...
 $ HS Grad   : num  NA 50.3 53.3 NA NA NA NA 59 57.6 46.4 ...
 $ Frost     : num  NA 186 172 NA NA NA NA 140 160 127 ...
 $ Area      : num  NA 69273 75955 NA NA ...
 $ Assault   : int  45 NA NA 83 57 56 48 NA NA NA ...
 $ UrbanPop  : int  44 NA NA 51 56 57 32 NA NA NA ...
 $ Rape      : num  7.3 NA NA 7.8 9.5 11.3 11.2 NA NA NA ...
dta_77USA_3|> head() |> knitr::kable() 
Murder Population Income Illiteracy Life Exp HS Grad Frost Area Assault UrbanPop Rape
0.8 NA NA NA NA NA NA NA 45 44 7.3
1.4 637 5087 0.8 72.78 50.3 186 69273 NA NA NA
1.7 681 4167 0.5 72.08 53.3 172 75955 NA NA NA
2.1 NA NA NA NA NA NA NA 83 51 7.8
2.1 NA NA NA NA NA NA NA 57 56 9.5
2.2 NA NA NA NA NA NA NA 56 57 11.3
#以Murder進行合併,保留所有資料包括NA

sum(is.na(USArrests))  #在USArrests資料中並無NA
[1] 0
sum(is.na(state.x77f))  #在state.x77f資料中並無NA
[1] 0
sum(is.na(dta_77USA_3)) #為何合併後會出現367個NA資料?
[1] 367
dta_77USA_3 <- merge(x=state.x77f, y=USArrests, by="row.names", all = T)
str(dta_77USA_3)
'data.frame':   50 obs. of  13 variables:
 $ Row.names : 'AsIs' chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
 $ Population: num  3615 365 2212 2110 21198 ...
 $ Income    : num  3624 6315 4530 3378 5114 ...
 $ Illiteracy: num  2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
 $ Life Exp  : num  69 69.3 70.5 70.7 71.7 ...
 $ Murder.x  : num  15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
 $ HS Grad   : num  41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
 $ Frost     : num  20 152 15 65 20 166 139 103 11 60 ...
 $ Area      : num  50708 566432 113417 51945 156361 ...
 $ Murder.y  : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
 $ Assault   : int  236 263 294 190 276 204 110 238 335 211 ...
 $ UrbanPop  : int  58 48 80 50 91 78 77 72 80 60 ...
 $ Rape      : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
dta_77USA_3|> head() |> knitr::kable()
Row.names Population Income Illiteracy Life Exp Murder.x HS Grad Frost Area Murder.y Assault UrbanPop Rape
Alabama 3615 3624 2.1 69.05 15.1 41.3 20 50708 13.2 236 58 21.2
Alaska 365 6315 1.5 69.31 11.3 66.7 152 566432 10.0 263 48 44.5
Arizona 2212 4530 1.8 70.55 7.8 58.1 15 113417 8.1 294 80 31.0
Arkansas 2110 3378 1.9 70.66 10.1 39.9 65 51945 8.8 190 50 19.5
California 21198 5114 1.1 71.71 10.3 62.6 20 156361 9.0 276 91 40.6
Colorado 2541 4884 0.7 72.06 6.8 63.9 166 103766 7.9 204 78 38.7
#依據row names進行合併,row.names也變成一個column(chr)
dta_77USA_4 <- data.frame(x=state.x77f, y=USArrests, by="row.names", all = T)
str(dta_77USA_4)
'data.frame':   50 obs. of  14 variables:
 $ x.Population: num  3615 365 2212 2110 21198 ...
 $ x.Income    : num  3624 6315 4530 3378 5114 ...
 $ x.Illiteracy: num  2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
 $ x.Life.Exp  : num  69 69.3 70.5 70.7 71.7 ...
 $ x.Murder    : num  15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
 $ x.HS.Grad   : num  41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
 $ x.Frost     : num  20 152 15 65 20 166 139 103 11 60 ...
 $ x.Area      : num  50708 566432 113417 51945 156361 ...
 $ y.Murder    : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
 $ y.Assault   : int  236 263 294 190 276 204 110 238 335 211 ...
 $ y.UrbanPop  : int  58 48 80 50 91 78 77 72 80 60 ...
 $ y.Rape      : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
 $ by          : chr  "row.names" "row.names" "row.names" "row.names" ...
 $ all         : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
dta_77USA_4|> head() |> knitr::kable()
x.Population x.Income x.Illiteracy x.Life.Exp x.Murder x.HS.Grad x.Frost x.Area y.Murder y.Assault y.UrbanPop y.Rape by all
Alabama 3615 3624 2.1 69.05 15.1 41.3 20 50708 13.2 236 58 21.2 row.names TRUE
Alaska 365 6315 1.5 69.31 11.3 66.7 152 566432 10.0 263 48 44.5 row.names TRUE
Arizona 2212 4530 1.8 70.55 7.8 58.1 15 113417 8.1 294 80 31.0 row.names TRUE
Arkansas 2110 3378 1.9 70.66 10.1 39.9 65 51945 8.8 190 50 19.5 row.names TRUE
California 21198 5114 1.1 71.71 10.3 62.6 20 156361 9.0 276 91 40.6 row.names TRUE
Colorado 2541 4884 0.7 72.06 6.8 63.9 166 103766 7.9 204 78 38.7 row.names TRUE
#用data.frame進行合併,row.name"不會變成變項",其他變項會依據來源,編著來自x或來自y
cor(dta_77USA_1)|> knitr::kable(digits = 2)
Murder Population Income Illiteracy Life Exp HS Grad Frost Area Assault UrbanPop Rape
Murder 1.00 0.33 -0.10 0.80 -0.74 -0.59 -0.53 0.37 0.81 -0.11 0.72
Population 0.33 1.00 -0.02 0.12 0.05 -0.40 -0.36 -0.09 0.41 0.57 0.56
Income -0.10 -0.02 1.00 -0.46 0.22 0.66 0.36 0.57 0.28 0.09 0.02
Illiteracy 0.80 0.12 -0.46 1.00 -0.75 -0.68 -0.67 0.18 0.51 -0.35 0.50
Life Exp -0.74 0.05 0.22 -0.75 1.00 0.66 0.41 -0.21 -0.60 0.25 -0.36
HS Grad -0.59 -0.40 0.66 -0.68 0.66 1.00 0.60 0.36 -0.37 -0.13 -0.41
Frost -0.53 -0.36 0.36 -0.67 0.41 0.60 1.00 0.12 -0.28 0.19 -0.46
Area 0.37 -0.09 0.57 0.18 -0.21 0.36 0.12 1.00 0.54 -0.07 0.50
Assault 0.81 0.41 0.28 0.51 -0.60 -0.37 -0.28 0.54 1.00 0.14 0.69
UrbanPop -0.11 0.57 0.09 -0.35 0.25 -0.13 0.19 -0.07 0.14 1.00 0.24
Rape 0.72 0.56 0.02 0.50 -0.36 -0.41 -0.46 0.50 0.69 0.24 1.00
#cor()做資料的correlation表格, digits為小數點後2位
#cor(dta_77USA_2)|> knitr::kable(digits = 2)
#Error in cor(dta_77USA_2) : 'x' 必須是數值
#資料有非數值型 $Row.names : 'AsIs' chr也不能做相關分析

#cor(dta_77USA_3)|> knitr::kable(digits = 2)
#Error in cor(dta_77USA_3) : 'x' 必須是數值  
#有NA資料無法做相關分析



#cor(dta_77USA_4)|> knitr::kable(digits = 2)
#Error in cor(dta_77USA_4) : 'x' 必須是數值
#資料有非數值型(column有chr $by和$ all)也不能做相關分析

數值呈現相關性不容易閱讀,可以做Corrplot

library(corrplot)
corr <- cor(dta_77USA_1[,1:11])

#參數全部默認情況下的相關係數圖

corrplot(corr = corr)

corr <- cor(dta_77USA_1[,1:11])
corrplot(corr = corr,order="AOE",type="upper",tl.pos="d", tl.cex=0.35, tl.col="black")
corrplot(corr = corr,add=TRUE, type="lower", method="number",order="AOE",diag=FALSE,tl.pos="n", cl.pos="n",number.cex=0.6)  

圖形顏色搭配數值,比單純數值更為直觀。

以有興趣的Murder來說:與 UrbanPop、Income、HS Grad、Frost、lifeExp、呈現負相關;與population、Rape、Assault、Area、Illiteracy呈現正相關。

In-class exercises_3

Summarize the backpain{HSAUR3} into the following format: driver suburban case control total no no ? ? ? no yes ? ? ? yes no ? ? ? yes yes ? ? ? You should provide comments for each code chunk.

pacman::p_load(HSAUR3)
data(backpain,package = "HSAUR3")
dta_bp <- backpain
str(dta_bp)  #434 obs. of  4 variables
'data.frame':   434 obs. of  4 variables:
 $ ID      : Factor w/ 217 levels "1","2","3","4",..: 1 1 2 2 3 3 4 4 5 5 ...
 $ status  : Factor w/ 2 levels "case","control": 1 2 1 2 1 2 1 2 1 2 ...
 $ driver  : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 1 1 2 2 ...
 $ suburban: Factor w/ 2 levels "no","yes": 2 1 2 2 1 2 1 1 1 2 ...
head(dta_bp)
  ID  status driver suburban
1  1    case    yes      yes
2  1 control    yes       no
3  2    case    yes      yes
4  2 control    yes      yes
5  3    case    yes       no
6  3 control    yes      yes
dta_bp|> head() |> knitr::kable()
ID status driver suburban
1 case yes yes
1 control yes no
2 case yes yes
2 control yes yes
3 case yes no
3 control yes yes
typeof(dta_bp) #list
[1] "list"
summary(dta_bp) 
       ID          status    driver    suburban 
 1      :  2   case   :217   no : 86   no :200  
 2      :  2   control:217   yes:348   yes:234  
 3      :  2                                    
 4      :  2                                    
 5      :  2                                    
 6      :  2                                    
 (Other):422                                    
names(dta_bp)
[1] "ID"       "status"   "driver"   "suburban"

#long data to wide data library(tidyverse)
library(dplyr)

dta_bp %>% dplyr::rename(Group = status, Driver = driver, Suburban = suburban) %>% # column rename

#long data to wide data
library(tidyverse)  #這邊是參考昶濬的作法,完成inclass-5,這部分有比較了解
backpain %>% 
 dplyr::rename(Group = status,   #先修改名字
               Driver = driver, 
               Suburban = suburban)  %>%
  group_by(Driver, Suburban) %>% # 依照Driver和Suburban分組
  tidyr::spread(key="Group", value ="Group") %>% # key(Column names) to value
  summarize(Case = sum(is.na(case)), # sum of case number
            Control = sum(is.na(control)), # sum of control number
            Total = n()) %>% # sum of total
  as.data.frame () %>% #tibble to dataframe
  knitr::kable ()
Driver Suburban Case Control Total
no no 38 17 64
no yes 5 4 11
yes no 43 44 107
yes yes 37 58 158

In-class exercises_4

The data set Vocab{carData} gives observations on gender, education and vocabulary, from respondents to U.S. General Social Surveys, 1972-2004. Summarize the relationship between education and vocabulary over the years by gender.

library(carData)
data(Vocab,package = "carData")
str(Vocab)
'data.frame':   30351 obs. of  4 variables:
 $ year      : num  1974 1974 1974 1974 1974 ...
 $ sex       : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 2 2 2 1 1 ...
 $ education : num  14 16 10 10 12 16 17 10 12 11 ...
 $ vocabulary: num  9 9 9 5 8 8 9 5 3 5 ...
 - attr(*, "na.action")= 'omit' Named int [1:32115] 1 2 3 4 5 6 7 8 9 10 ...
  ..- attr(*, "names")= chr [1:32115] "19720001" "19720002" "19720003" "19720004" ...
class(Vocab)
[1] "data.frame"
typeof(Vocab)
[1] "list"
summary(Vocab)
      year          sex          education       vocabulary    
 Min.   :1974   Female:17148   Min.   : 0.00   Min.   : 0.000  
 1st Qu.:1987   Male  :13203   1st Qu.:12.00   1st Qu.: 5.000  
 Median :1994                  Median :12.00   Median : 6.000  
 Mean   :1995                  Mean   :13.03   Mean   : 6.004  
 3rd Qu.:2006                  3rd Qu.:15.00   3rd Qu.: 7.000  
 Max.   :2016                  Max.   :20.00   Max.   :10.000  
names(Vocab)
[1] "year"       "sex"        "education"  "vocabulary"
library(lattice)
library(tidyr)
library(dplyr)

Vocab %>% 
mutate(year = as.factor(year)) %>%  #把year變成factor ,若沒有設表格上都會以year呈現
lattice::xyplot(vocabulary ~ education | year, 
         groups=sex, 
         type=c("p","g","r"), 
         data=., 
         xlab="education", 
         ylab="vocabulary score", 
         layout = c(6, 4),  #設定xyplot的圖形排列為6x4
         cex=0.3, #縮小男生女生在表格中的點
         auto.key=list(columns=2))

In-class exercises_5

Supply comments to each code chunk in the following survey rmarkdown file and preview it as an R notebook or knit to html.

The data set concerns species and weight of animals caught in plots in a study area in Arizona over time.

Each row holds information for a single animal, and the columns represent:

  • record_id: Unique id for the observation
  • month: month of observation
  • day: day of observation
  • year: year of observation
  • plot_id: ID of a particular plot
  • species_id: 2-letter code
  • sex: sex of animal (“M”, “F”)
  • hindfoot_length: length of the hindfoot in mm
  • weight: weight of the animal in grams
  • genus: genus of animal
  • species: species of animal
  • taxa: e.g. Rodent, Reptile, Bird, Rabbit
  • plot_type: type of plot
pacman::p_load(tidyverse)
dta <- read_csv("http://kbroman.org/datacarp/portal_data_joined.csv")
#讀csv檔,存成dta
glimpse(dta)
Rows: 34,786
Columns: 13
$ record_id       <dbl> 1, 72, 224, 266, 349, 363, 435, 506, 588, 661, 748, 84~
$ month           <dbl> 7, 8, 9, 10, 11, 11, 12, 1, 2, 3, 4, 5, 6, 8, 9, 10, 1~
$ day             <dbl> 16, 19, 13, 16, 12, 12, 10, 8, 18, 11, 8, 6, 9, 5, 4, ~
$ year            <dbl> 1977, 1977, 1977, 1977, 1977, 1977, 1977, 1978, 1978, ~
$ plot_id         <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
$ species_id      <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", ~
$ sex             <chr> "M", "M", NA, NA, NA, NA, NA, NA, "M", NA, NA, "M", "M~
$ hindfoot_length <dbl> 32, 31, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, NA, 34~
$ weight          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 218, NA, NA, 204, 200,~
$ genus           <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Neotoma",~
$ species         <chr> "albigula", "albigula", "albigula", "albigula", "albig~
$ taxa            <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent", "Rod~
$ plot_type       <chr> "Control", "Control", "Control", "Control", "Control",~
#總覽資料
typeof(dta)
[1] "list"
##資料型態
class(dta)
[1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame" 
#資料型態
dim(dta)
[1] 34786    13
#資料的rowx column=34,786 x 13
dplyr::select(dta, plot_id, species_id, weight) %>% head()
# A tibble: 6 x 3
  plot_id species_id weight
    <dbl> <chr>       <dbl>
1       2 NL             NA
2       2 NL             NA
3       2 NL             NA
4       2 NL             NA
5       2 NL             NA
6       2 NL             NA
#選資料的plot_id, species_id, weight做表格,列出前6項
dplyr::select(dta, -record_id, -species_id) %>% head()
# A tibble: 6 x 11
  month   day  year plot_id sex   hindfoot_length weight genus   species  taxa  
  <dbl> <dbl> <dbl>   <dbl> <chr>           <dbl>  <dbl> <chr>   <chr>    <chr> 
1     7    16  1977       2 M                  32     NA Neotoma albigula Rodent
2     8    19  1977       2 M                  31     NA Neotoma albigula Rodent
3     9    13  1977       2 <NA>               NA     NA Neotoma albigula Rodent
4    10    16  1977       2 <NA>               NA     NA Neotoma albigula Rodent
5    11    12  1977       2 <NA>               NA     NA Neotoma albigula Rodent
6    11    12  1977       2 <NA>               NA     NA Neotoma albigula Rodent
# ... with 1 more variable: plot_type <chr>
#選資料的"除了" record_id, species_idt以外的資料做表格,列出前6項
year1995<-dplyr::filter(dta, year == 1995) %>% head()
#以year當過濾,篩選出year==1995的資料
str(year1995)   #column數沒有變,只剩下6筆資料
tibble [6 x 13] (S3: tbl_df/tbl/data.frame)
 $ record_id      : num [1:6] 22314 22728 22899 23032 22003 ...
 $ month          : num [1:6] 6 9 10 12 1 2
 $ day            : num [1:6] 7 23 28 2 11 4
 $ year           : num [1:6] 1995 1995 1995 1995 1995 ...
 $ plot_id        : num [1:6] 2 2 2 2 2 2
 $ species_id     : chr [1:6] "NL" "NL" "NL" "NL" ...
 $ sex            : chr [1:6] "M" "F" "F" "F" ...
 $ hindfoot_length: num [1:6] 34 32 32 33 37 36
 $ weight         : num [1:6] NA 165 171 NA 41 45
 $ genus          : chr [1:6] "Neotoma" "Neotoma" "Neotoma" "Neotoma" ...
 $ species        : chr [1:6] "albigula" "albigula" "albigula" "albigula" ...
 $ taxa           : chr [1:6] "Rodent" "Rodent" "Rodent" "Rodent" ...
 $ plot_type      : chr [1:6] "Control" "Control" "Control" "Control" ...
summary(year1995$year)  #確定一下year1995$year的樣貌
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1995    1995    1995    1995    1995    1995 
year1995 #列出year1995所有資料,確實只有6筆
# A tibble: 6 x 13
  record_id month   day  year plot_id species_id sex   hindfoot_length weight
      <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
1     22314     6     7  1995       2 NL         M                  34     NA
2     22728     9    23  1995       2 NL         F                  32    165
3     22899    10    28  1995       2 NL         F                  32    171
4     23032    12     2  1995       2 NL         F                  33     NA
5     22003     1    11  1995       2 DM         M                  37     41
6     22042     2     4  1995       2 DM         F                  36     45
# ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
#   plot_type <chr>
weight5<-dplyr::select(dplyr::filter(dta, weight <= 5), species_id, sex, weight)
#選擇weight<=5的資料,只列出species_id, sex, weight
str(weight5)
tibble [75 x 3] (S3: tbl_df/tbl/data.frame)
 $ species_id: chr [1:75] "PF" "PF" "PF" "PF" ...
 $ sex       : chr [1:75] "M" "F" "F" "F" ...
 $ weight    : num [1:75] 5 5 5 4 5 4 5 5 5 5 ...
head(dplyr::select(dplyr::filter(dta, weight <= 5), species_id, sex, weight))
# A tibble: 6 x 3
  species_id sex   weight
  <chr>      <chr>  <dbl>
1 PF         M          5
2 PF         F          5
3 PF         F          5
4 PF         F          4
5 PF         F          5
6 PF         F          4
#head在前面跟在後面沒有差別,都是列出前6個row
dplyr::select(dplyr::filter(dta, weight <= 5), species_id, sex, weight)%>% head()
# A tibble: 6 x 3
  species_id sex   weight
  <chr>      <chr>  <dbl>
1 PF         M          5
2 PF         F          5
3 PF         F          5
4 PF         F          4
5 PF         F          5
6 PF         F          4
dta %>% 
  dplyr::filter(weight <= 5) %>% 
  dplyr::select(species_id, sex, weight) %>% 
  head
# A tibble: 6 x 3
  species_id sex   weight
  <chr>      <chr>  <dbl>
1 PF         M          5
2 PF         F          5
3 PF         F          5
4 PF         F          4
5 PF         F          5
6 PF         F          4
#跟前面一樣,只是語法不同
dta %>% 
  mutate(weight_kg = weight / 1000,   
#mutate新增衍生變數weight_kg= weight / 1000
#mutate新增衍生變數weight_lb = weight_kg * 2.2
         weight_lb = weight_kg * 2.2) %>% 
  head()
# A tibble: 6 x 15
  record_id month   day  year plot_id species_id sex   hindfoot_length weight
      <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
1         1     7    16  1977       2 NL         M                  32     NA
2        72     8    19  1977       2 NL         M                  31     NA
3       224     9    13  1977       2 NL         <NA>               NA     NA
4       266    10    16  1977       2 NL         <NA>               NA     NA
5       349    11    12  1977       2 NL         <NA>               NA     NA
6       363    11    12  1977       2 NL         <NA>               NA     NA
# ... with 6 more variables: genus <chr>, species <chr>, taxa <chr>,
#   plot_type <chr>, weight_kg <dbl>, weight_lb <dbl>
#原來有13個,再加上新增的weight_kg、weight_lb,共有15個
dta %>% 
  filter(!is.na(weight)) %>%
#filter()篩選符合條件的觀測值   
#filter(!is.na)篩選出weight非NA的值(!is.na)
  group_by(sex, species_id) %>%
#group_by()依照類別變數分組,常搭配 summarize()
#依照類別sex, species_id分組
  summarize(mean_weight = mean(weight)) %>%
#summarise()聚合變數  
#summarize(mean_weight = mean(weight))  
#依照group分組後,創一個mean_weight,帶入mean(weight)
  arrange(desc(mean_weight)) %>% 
#arrange()依照變數排序觀測值   
#arrange(desc(mean_weight)) mean_weight以descending的方式
  head()
# A tibble: 6 x 3
# Groups:   sex [3]
  sex   species_id mean_weight
  <chr> <chr>            <dbl>
1 <NA>  NL                168.
2 M     NL                166.
3 F     NL                154.
4 M     SS                130 
5 <NA>  SH                130 
6 M     DS                122.
#列出前6筆資料,可以觀察到sex和species_id都是不同的組合,其中NA值也會納入
dta %>%
  group_by(sex) %>%
#依性別分組
  tally
# A tibble: 3 x 2
  sex       n
  <chr> <int>
1 F     15690
2 M     17348
3 <NA>   1748
#依分組類別資料計算個數,包括NA也會統計個數
table(dta$sex)

    F     M 
15690 17348 
#table不會計算NA個數
dta_count<-dta %>%
  count(sex)
#跟tally功能一樣
str(dta_count)
spec_tbl_df [3 x 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ sex: chr [1:3] "F" "M" NA
 $ n  : int [1:3] 15690 17348 1748
 - attr(*, "spec")=
  .. cols(
  ..   record_id = col_double(),
  ..   month = col_double(),
  ..   day = col_double(),
  ..   year = col_double(),
  ..   plot_id = col_double(),
  ..   species_id = col_character(),
  ..   sex = col_character(),
  ..   hindfoot_length = col_double(),
  ..   weight = col_double(),
  ..   genus = col_character(),
  ..   species = col_character(),
  ..   taxa = col_character(),
  ..   plot_type = col_character()
  .. )
 - attr(*, "problems")=<externalptr> 
dta_count<-dta %>%
  group_by(sex) %>%
  summarize(count = n())
#按照sex分組  
#summarize(計數=n)  
#依照group分組後,創一個count,帶入分組的n
str(dta_count)
tibble [3 x 2] (S3: tbl_df/tbl/data.frame)
 $ sex  : chr [1:3] "F" "M" NA
 $ count: int [1:3] 15690 17348 1748
dta_year<-dta %>%
  group_by(sex) %>%
  summarize(count = sum(!is.na(year)))
#按照sex分組  
#summarize(計數=n但排除year是na的)  
#依照group分組後,創一個count,帶入分組的n
str(dta_year)
tibble [3 x 2] (S3: tbl_df/tbl/data.frame)
 $ sex  : chr [1:3] "F" "M" NA
 $ count: int [1:3] 15690 17348 1748
dta_gw <- dta %>% 
  filter(!is.na(weight)) %>%
#篩選排除weight為NA的值
  group_by(genus, plot_id) %>%
#依genus, plot_id分組
  summarize(mean_weight = mean(weight))
#依照group分組後,創一個mean_weight,帶入mean(weight)
head(dta_gw)
# A tibble: 6 x 3
# Groups:   genus [1]
  genus   plot_id mean_weight
  <chr>     <dbl>       <dbl>
1 Baiomys       1        7   
2 Baiomys       2        6   
3 Baiomys       3        8.61
4 Baiomys       5        7.75
5 Baiomys      18        9.5 
6 Baiomys      19        9.53
#列出前6筆資料看一下
glimpse(dta_gw)
Rows: 196
Columns: 3
Groups: genus [10]
$ genus       <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
$ plot_id     <dbl> 1, 2, 3, 5, 18, 19, 20, 21, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,~
$ mean_weight <dbl> 7.000000, 6.000000, 8.611111, 7.750000, 9.500000, 9.533333~
#總覽資料,剩下196 row x 3 column
#column:上述篩選的genus, plot_id和新創建的mean_weight
table(dta_gw$genus)

        Baiomys     Chaetodipus       Dipodomys         Neotoma       Onychomys 
              8              24              24              24              24 
    Perognathus      Peromyscus Reithrodontomys        Sigmodon    Spermophilus 
             23              24              24              19               2 
dta_w <- dta_gw %>%
  spread(key = genus, value = mean_weight)
#spread用來擴展表,把某一列的值(鍵值對)分開拆成多列
#通常是long data要變成wide data
#key=是原來要拆的column,value=是拆出來的那些列的值應該填什麼
#把genus的變成wide data,並且填入mean_weight
head(dta_w)
# A tibble: 6 x 11
  plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
    <dbl>   <dbl>       <dbl>     <dbl>   <dbl>     <dbl>       <dbl>      <dbl>
1       1    7           22.2      60.2    156.      27.7        9.62       22.2
2       2    6           25.1      55.7    169.      26.9        6.95       22.3
3       3    8.61        24.6      52.0    158.      26.0        7.51       21.4
4       4   NA           23.0      57.5    164.      28.1        7.82       22.6
5       5    7.75        18.0      51.1    190.      27.0        8.66       21.2
6       6   NA           24.9      58.6    180.      25.9        7.81       21.8
# ... with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
#   Spermophilus <dbl>
head(dta_w)
# A tibble: 6 x 11
  plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
    <dbl>   <dbl>       <dbl>     <dbl>   <dbl>     <dbl>       <dbl>      <dbl>
1       1    7           22.2      60.2    156.      27.7        9.62       22.2
2       2    6           25.1      55.7    169.      26.9        6.95       22.3
3       3    8.61        24.6      52.0    158.      26.0        7.51       21.4
4       4   NA           23.0      57.5    164.      28.1        7.82       22.6
5       5    7.75        18.0      51.1    190.      27.0        8.66       21.2
6       6   NA           24.9      58.6    180.      25.9        7.81       21.8
# ... with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
#   Spermophilus <dbl>
glimpse(dta_w)
Rows: 24
Columns: 11
$ plot_id         <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,~
$ Baiomys         <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA~
$ Chaetodipus     <dbl> 22.19939, 25.11014, 24.63636, 23.02381, 17.98276, 24.8~
$ Dipodomys       <dbl> 60.23214, 55.68259, 52.04688, 57.52454, 51.11356, 58.6~
$ Neotoma         <dbl> 156.2222, 169.1436, 158.2414, 164.1667, 190.0370, 179.~
$ Onychomys       <dbl> 27.67550, 26.87302, 26.03241, 28.09375, 27.01695, 25.8~
$ Perognathus     <dbl> 9.625000, 6.947368, 7.507812, 7.824427, 8.658537, 7.80~
$ Peromyscus      <dbl> 22.22222, 22.26966, 21.37037, 22.60000, 21.23171, 21.8~
$ Reithrodontomys <dbl> 11.375000, 10.680556, 10.516588, 10.263158, 11.154545,~
$ Sigmodon        <dbl> NA, 70.85714, 65.61404, 82.00000, 82.66667, 68.77778, ~
$ Spermophilus    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 13~
#從table(dta_gw$genus)可看的出來,genus共有10種,最多的資料是24筆
#多一個column是原來的plot_id
dta_gw %>%
  spread(genus, mean_weight, fill = 0) %>%
  head()
# A tibble: 6 x 11
  plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
    <dbl>   <dbl>       <dbl>     <dbl>   <dbl>     <dbl>       <dbl>      <dbl>
1       1    7           22.2      60.2    156.      27.7        9.62       22.2
2       2    6           25.1      55.7    169.      26.9        6.95       22.3
3       3    8.61        24.6      52.0    158.      26.0        7.51       21.4
4       4    0           23.0      57.5    164.      28.1        7.82       22.6
5       5    7.75        18.0      51.1    190.      27.0        8.66       21.2
6       6    0           24.9      58.6    180.      25.9        7.81       21.8
# ... with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
#   Spermophilus <dbl>
#spread同上面
#這次用0取代NA所有資料
dta_l <- dta_w %>%
  gather(key = genus, value = mean_weight, -plot_id)
head(dta_l)
# A tibble: 6 x 3
  plot_id genus   mean_weight
    <dbl> <chr>         <dbl>
1       1 Baiomys        7   
2       2 Baiomys        6   
3       3 Baiomys        8.61
4       4 Baiomys       NA   
5       5 Baiomys        7.75
6       6 Baiomys       NA   
#gather:wide data再變回long data
#依照genus集合
#刪去"plot_id"這個column
glimpse(dta_l)
Rows: 240
Columns: 3
$ plot_id     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,~
$ genus       <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
$ mean_weight <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA, NA~
#為什麼前面-plot_id,這邊glimpse後,plot_id依然在?
dta_w_Bs <- dta_w %>%
  gather(key = genus, value = mean_weight, Baiomys:Spermophilus) 
#只gather的genus變數中的Baiomys到Spermophilus類別
head(dta_w_Bs)
# A tibble: 6 x 3
  plot_id genus   mean_weight
    <dbl> <chr>         <dbl>
1       1 Baiomys        7   
2       2 Baiomys        6   
3       3 Baiomys        8.61
4       4 Baiomys       NA   
5       5 Baiomys        7.75
6       6 Baiomys       NA   
tail(dta_w_Bs)
# A tibble: 6 x 3
  plot_id genus        mean_weight
    <dbl> <chr>              <dbl>
1      19 Spermophilus          NA
2      20 Spermophilus          57
3      21 Spermophilus          NA
4      22 Spermophilus          NA
5      23 Spermophilus          NA
6      24 Spermophilus          NA
glimpse(dta_w_Bs)
Rows: 240
Columns: 3
$ plot_id     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,~
$ genus       <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
$ mean_weight <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA, NA~
table(dta_w_Bs$genus)

        Baiomys     Chaetodipus       Dipodomys         Neotoma       Onychomys 
             24              24              24              24              24 
    Perognathus      Peromyscus Reithrodontomys        Sigmodon    Spermophilus 
             24              24              24              24              24 
#確認genus變數中是真的從Baiomys到Spermophilus類別
dta_complete <- dta %>%
  filter(!is.na(weight),           
         !is.na(hindfoot_length),  
         !is.na(sex)) 
#篩選資料,排除weight有NA的、hindfoot_length有NA的、sex
glimpse(dta_complete)
Rows: 30,676
Columns: 13
$ record_id       <dbl> 845, 1164, 1261, 1756, 1818, 1882, 2133, 2184, 2406, 3~
$ month           <dbl> 5, 8, 9, 4, 5, 7, 10, 11, 1, 5, 5, 7, 10, 11, 1, 2, 3,~
$ day             <dbl> 6, 5, 4, 29, 30, 4, 25, 17, 16, 18, 18, 8, 1, 23, 25, ~
$ year            <dbl> 1978, 1978, 1978, 1979, 1979, 1979, 1979, 1979, 1980, ~
$ plot_id         <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
$ species_id      <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", ~
$ sex             <chr> "M", "M", "M", "M", "M", "M", "F", "F", "F", "F", "F",~
$ hindfoot_length <dbl> 32, 34, 32, 33, 32, 32, 33, 30, 33, 31, 33, 30, 34, 34~
$ weight          <dbl> 204, 199, 197, 166, 184, 206, 274, 186, 184, 87, 174, ~
$ genus           <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Neotoma",~
$ species         <chr> "albigula", "albigula", "albigula", "albigula", "albig~
$ taxa            <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent", "Rod~
$ plot_type       <chr> "Control", "Control", "Control", "Control", "Control",~
species_counts <- dta_complete %>%
    count(species_id) %>% 
#用排除NA的資料dta_complete,計數species_id
    filter(n >= 50)
#只列出n>=50的資料
glimpse(species_counts)
Rows: 14
Columns: 2
$ species_id <chr> "DM", "DO", "DS", "NL", "OL", "OT", "PB", "PE", "PF", "PM",~
$ n          <int> 9727, 2790, 2023, 1045, 905, 2081, 2803, 1198, 1469, 835, 2~
dta_complete %>%
#刪除filter(n >= 50)看看發生甚麼變化
    count(species_id) %>% 
#用排除NA的資料dta_complete,計數species_id
glimpse()
Rows: 24
Columns: 2
$ species_id <chr> "BA", "DM", "DO", "DS", "NL", "OL", "OT", "OX", "PB", "PE",~
$ n          <int> 45, 9727, 2790, 2023, 1045, 905, 2081, 5, 2803, 1198, 1469,~
#資料果然筆前一筆更多
summary(species_counts$species_id)
   Length     Class      Mode 
       14 character character 
dta_complete <- dta_complete %>%
  filter(species_id %in% species_counts$species_id)
#判斷某個值(或向量),是否存在於另一個向量之中,會使用 %in% 的符號
#species_id是不是有在species_counts$species_id裡面?
glimpse(dta_complete)
Rows: 30,463
Columns: 13
$ record_id       <dbl> 845, 1164, 1261, 1756, 1818, 1882, 2133, 2184, 2406, 3~
$ month           <dbl> 5, 8, 9, 4, 5, 7, 10, 11, 1, 5, 5, 7, 10, 11, 1, 2, 3,~
$ day             <dbl> 6, 5, 4, 29, 30, 4, 25, 17, 16, 18, 18, 8, 1, 23, 25, ~
$ year            <dbl> 1978, 1978, 1978, 1979, 1979, 1979, 1979, 1979, 1980, ~
$ plot_id         <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
$ species_id      <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", ~
$ sex             <chr> "M", "M", "M", "M", "M", "M", "F", "F", "F", "F", "F",~
$ hindfoot_length <dbl> 32, 34, 32, 33, 32, 32, 33, 30, 33, 31, 33, 30, 34, 34~
$ weight          <dbl> 204, 199, 197, 166, 184, 206, 274, 186, 184, 87, 174, ~
$ genus           <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Neotoma",~
$ species         <chr> "albigula", "albigula", "albigula", "albigula", "albig~
$ taxa            <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent", "Rod~
$ plot_type       <chr> "Control", "Control", "Control", "Control", "Control",~
head(dta_complete)
# A tibble: 6 x 13
  record_id month   day  year plot_id species_id sex   hindfoot_length weight
      <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
1       845     5     6  1978       2 NL         M                  32    204
2      1164     8     5  1978       2 NL         M                  34    199
3      1261     9     4  1978       2 NL         M                  32    197
4      1756     4    29  1979       2 NL         M                  33    166
5      1818     5    30  1979       2 NL         M                  32    184
6      1882     7     4  1979       2 NL         M                  32    206
# ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
#   plot_type <chr>