Merge the two data sets: state.x77{datasets} and USArrests{datasets} and compute all pair-wise correlations for numerical variables. Is there anything interesting to report?
library(datasets)
data(USArrests, package="datasets")
data(state, package="datasets")
USArrests|> head() |> knitr::kable()| Murder | Assault | UrbanPop | Rape | |
|---|---|---|---|---|
| Alabama | 13.2 | 236 | 58 | 21.2 |
| Alaska | 10.0 | 263 | 48 | 44.5 |
| Arizona | 8.1 | 294 | 80 | 31.0 |
| Arkansas | 8.8 | 190 | 50 | 19.5 |
| California | 9.0 | 276 | 91 | 40.6 |
| Colorado | 7.9 | 204 | 78 | 38.7 |
state.x77|> head() |> knitr::kable()| Population | Income | Illiteracy | Life Exp | Murder | HS Grad | Frost | Area | |
|---|---|---|---|---|---|---|---|---|
| Alabama | 3615 | 3624 | 2.1 | 69.05 | 15.1 | 41.3 | 20 | 50708 |
| Alaska | 365 | 6315 | 1.5 | 69.31 | 11.3 | 66.7 | 152 | 566432 |
| Arizona | 2212 | 4530 | 1.8 | 70.55 | 7.8 | 58.1 | 15 | 113417 |
| Arkansas | 2110 | 3378 | 1.9 | 70.66 | 10.1 | 39.9 | 65 | 51945 |
| California | 21198 | 5114 | 1.1 | 71.71 | 10.3 | 62.6 | 20 | 156361 |
| Colorado | 2541 | 4884 | 0.7 | 72.06 | 6.8 | 63.9 | 166 | 103766 |
先了解兩個資料的樣態,再決定怎麼合併
str(USArrests) #'data.frame': 50 obs. of 4 variables'data.frame': 50 obs. of 4 variables:
$ Murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
$ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
$ UrbanPop: int 58 48 80 50 91 78 77 72 80 60 ...
$ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
str(state.x77) #num [1:50, 1:8] num [1:50, 1:8] 3615 365 2212 2110 21198 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
..$ : chr [1:8] "Population" "Income" "Illiteracy" "Life Exp" ...
class(USArrests) # class(USArrests)是data.frame[1] "data.frame"
class(state.x77) # class(state.x77)是"matrix" "array" [1] "matrix" "array"
state.x77f<-state.x77|>as.data.frame() #換成data.frame
str(state.x77f) #'data.frame': 50 obs. of 8 variables'data.frame': 50 obs. of 8 variables:
$ Population: num 3615 365 2212 2110 21198 ...
$ Income : num 3624 6315 4530 3378 5114 ...
$ Illiteracy: num 2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
$ Life Exp : num 69 69.3 70.5 70.7 71.7 ...
$ Murder : num 15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
$ HS Grad : num 41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
$ Frost : num 20 152 15 65 20 166 139 103 11 60 ...
$ Area : num 50708 566432 113417 51945 156361 ...
names(USArrests)[1] "Murder" "Assault" "UrbanPop" "Rape"
names(state.x77f) #發現資料column重複的是"Murder"[1] "Population" "Income" "Illiteracy" "Life Exp" "Murder"
[6] "HS Grad" "Frost" "Area"
row.names(USArrests) #Row的名稱 [1] "Alabama" "Alaska" "Arizona" "Arkansas"
[5] "California" "Colorado" "Connecticut" "Delaware"
[9] "Florida" "Georgia" "Hawaii" "Idaho"
[13] "Illinois" "Indiana" "Iowa" "Kansas"
[17] "Kentucky" "Louisiana" "Maine" "Maryland"
[21] "Massachusetts" "Michigan" "Minnesota" "Mississippi"
[25] "Missouri" "Montana" "Nebraska" "Nevada"
[29] "New Hampshire" "New Jersey" "New Mexico" "New York"
[33] "North Carolina" "North Dakota" "Ohio" "Oklahoma"
[37] "Oregon" "Pennsylvania" "Rhode Island" "South Carolina"
[41] "South Dakota" "Tennessee" "Texas" "Utah"
[45] "Vermont" "Virginia" "Washington" "West Virginia"
[49] "Wisconsin" "Wyoming"
row.names(state.x77f)#兩個資料的row name是一樣的 [1] "Alabama" "Alaska" "Arizona" "Arkansas"
[5] "California" "Colorado" "Connecticut" "Delaware"
[9] "Florida" "Georgia" "Hawaii" "Idaho"
[13] "Illinois" "Indiana" "Iowa" "Kansas"
[17] "Kentucky" "Louisiana" "Maine" "Maryland"
[21] "Massachusetts" "Michigan" "Minnesota" "Mississippi"
[25] "Missouri" "Montana" "Nebraska" "Nevada"
[29] "New Hampshire" "New Jersey" "New Mexico" "New York"
[33] "North Carolina" "North Dakota" "Ohio" "Oklahoma"
[37] "Oregon" "Pennsylvania" "Rhode Island" "South Carolina"
[41] "South Dakota" "Tennessee" "Texas" "Utah"
[45] "Vermont" "Virginia" "Washington" "West Virginia"
[49] "Wisconsin" "Wyoming"
summary(USArrests$Murder) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.800 4.075 7.250 7.788 11.250 17.400
summary(state.x77f$Murder) Min. 1st Qu. Median Mean 3rd Qu. Max.
1.400 4.350 6.850 7.378 10.675 15.100
dta_77USA_1 <- merge(x=state.x77f, y=USArrests, by=c("Murder"))
str(dta_77USA_1)'data.frame': 15 obs. of 11 variables:
$ Murder : num 2.7 3.3 3.3 4.3 5.3 6.8 7.4 8.5 9.7 11.1 ...
$ Population: num 1058 5814 812 3559 813 ...
$ Income : num 3694 4755 4281 4864 4119 ...
$ Illiteracy: num 0.7 1.1 0.7 0.6 0.6 0.7 0.8 0.9 2.2 0.9 ...
$ Life Exp : num 70.4 71.8 71.2 71.7 71.9 ...
$ HS Grad : num 54.7 58.5 57.6 63.5 59.5 63.9 53.2 52.3 55.2 52.8 ...
$ Frost : num 161 103 174 32 126 166 124 101 120 125 ...
$ Area : num 30920 7826 9027 66570 82677 ...
$ Assault : int 72 110 110 102 46 161 159 156 109 254 ...
$ UrbanPop : int 66 77 77 62 83 60 89 63 52 86 ...
$ Rape : num 14.9 11.1 11.1 16.5 20.2 15.6 18.8 20.7 16.3 26.1 ...
dta_77USA_1|> head() |> knitr::kable() | Murder | Population | Income | Illiteracy | Life Exp | HS Grad | Frost | Area | Assault | UrbanPop | Rape |
|---|---|---|---|---|---|---|---|---|---|---|
| 2.7 | 1058 | 3694 | 0.7 | 70.39 | 54.7 | 161 | 30920 | 72 | 66 | 14.9 |
| 3.3 | 5814 | 4755 | 1.1 | 71.83 | 58.5 | 103 | 7826 | 110 | 77 | 11.1 |
| 3.3 | 812 | 4281 | 0.7 | 71.23 | 57.6 | 174 | 9027 | 110 | 77 | 11.1 |
| 4.3 | 3559 | 4864 | 0.6 | 71.72 | 63.5 | 32 | 66570 | 102 | 62 | 16.5 |
| 5.3 | 813 | 4119 | 0.6 | 71.87 | 59.5 | 126 | 82677 | 46 | 83 | 20.2 |
| 6.8 | 2541 | 4884 | 0.7 | 72.06 | 63.9 | 166 | 103766 | 161 | 60 | 15.6 |
#只保留有Murder column的資料dta_77USA_2 <- merge(x=state.x77f, y=USArrests, by="row.names")
str(dta_77USA_2)'data.frame': 50 obs. of 13 variables:
$ Row.names : 'AsIs' chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
$ Population: num 3615 365 2212 2110 21198 ...
$ Income : num 3624 6315 4530 3378 5114 ...
$ Illiteracy: num 2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
$ Life Exp : num 69 69.3 70.5 70.7 71.7 ...
$ Murder.x : num 15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
$ HS Grad : num 41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
$ Frost : num 20 152 15 65 20 166 139 103 11 60 ...
$ Area : num 50708 566432 113417 51945 156361 ...
$ Murder.y : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
$ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
$ UrbanPop : int 58 48 80 50 91 78 77 72 80 60 ...
$ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
dta_77USA_2|> head() |> knitr::kable() | Row.names | Population | Income | Illiteracy | Life Exp | Murder.x | HS Grad | Frost | Area | Murder.y | Assault | UrbanPop | Rape |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Alabama | 3615 | 3624 | 2.1 | 69.05 | 15.1 | 41.3 | 20 | 50708 | 13.2 | 236 | 58 | 21.2 |
| Alaska | 365 | 6315 | 1.5 | 69.31 | 11.3 | 66.7 | 152 | 566432 | 10.0 | 263 | 48 | 44.5 |
| Arizona | 2212 | 4530 | 1.8 | 70.55 | 7.8 | 58.1 | 15 | 113417 | 8.1 | 294 | 80 | 31.0 |
| Arkansas | 2110 | 3378 | 1.9 | 70.66 | 10.1 | 39.9 | 65 | 51945 | 8.8 | 190 | 50 | 19.5 |
| California | 21198 | 5114 | 1.1 | 71.71 | 10.3 | 62.6 | 20 | 156361 | 9.0 | 276 | 91 | 40.6 |
| Colorado | 2541 | 4884 | 0.7 | 72.06 | 6.8 | 63.9 | 166 | 103766 | 7.9 | 204 | 78 | 38.7 |
#依據row name(50州)進行合併,因Murder在兩個data framne中重複,因此會產出Murder.x(state.x77f)和Murder.y(USArrests)
sum(is.na(dta_77USA_2))[1] 0
dta_77USA_3 <- merge(x=state.x77f, y=USArrests, by=c("Murder"), all = T)
# all=T 是用來詢問是否顯示所有資料,如果有資料不全者,沒下 all = T,應該不會出現在合併資料中。
str(dta_77USA_3)'data.frame': 88 obs. of 11 variables:
$ Murder : num 0.8 1.4 1.7 2.1 2.1 2.2 2.2 2.3 2.3 2.4 ...
$ Population: num NA 637 681 NA NA ...
$ Income : num NA 5087 4167 NA NA ...
$ Illiteracy: num NA 0.8 0.5 NA NA NA NA 0.5 0.6 1.3 ...
$ Life Exp : num NA 72.8 72.1 NA NA ...
$ HS Grad : num NA 50.3 53.3 NA NA NA NA 59 57.6 46.4 ...
$ Frost : num NA 186 172 NA NA NA NA 140 160 127 ...
$ Area : num NA 69273 75955 NA NA ...
$ Assault : int 45 NA NA 83 57 56 48 NA NA NA ...
$ UrbanPop : int 44 NA NA 51 56 57 32 NA NA NA ...
$ Rape : num 7.3 NA NA 7.8 9.5 11.3 11.2 NA NA NA ...
dta_77USA_3|> head() |> knitr::kable() | Murder | Population | Income | Illiteracy | Life Exp | HS Grad | Frost | Area | Assault | UrbanPop | Rape |
|---|---|---|---|---|---|---|---|---|---|---|
| 0.8 | NA | NA | NA | NA | NA | NA | NA | 45 | 44 | 7.3 |
| 1.4 | 637 | 5087 | 0.8 | 72.78 | 50.3 | 186 | 69273 | NA | NA | NA |
| 1.7 | 681 | 4167 | 0.5 | 72.08 | 53.3 | 172 | 75955 | NA | NA | NA |
| 2.1 | NA | NA | NA | NA | NA | NA | NA | 83 | 51 | 7.8 |
| 2.1 | NA | NA | NA | NA | NA | NA | NA | 57 | 56 | 9.5 |
| 2.2 | NA | NA | NA | NA | NA | NA | NA | 56 | 57 | 11.3 |
#以Murder進行合併,保留所有資料包括NA
sum(is.na(USArrests)) #在USArrests資料中並無NA[1] 0
sum(is.na(state.x77f)) #在state.x77f資料中並無NA[1] 0
sum(is.na(dta_77USA_3)) #為何合併後會出現367個NA資料?[1] 367
dta_77USA_3 <- merge(x=state.x77f, y=USArrests, by="row.names", all = T)
str(dta_77USA_3)'data.frame': 50 obs. of 13 variables:
$ Row.names : 'AsIs' chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
$ Population: num 3615 365 2212 2110 21198 ...
$ Income : num 3624 6315 4530 3378 5114 ...
$ Illiteracy: num 2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
$ Life Exp : num 69 69.3 70.5 70.7 71.7 ...
$ Murder.x : num 15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
$ HS Grad : num 41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
$ Frost : num 20 152 15 65 20 166 139 103 11 60 ...
$ Area : num 50708 566432 113417 51945 156361 ...
$ Murder.y : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
$ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
$ UrbanPop : int 58 48 80 50 91 78 77 72 80 60 ...
$ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
dta_77USA_3|> head() |> knitr::kable()| Row.names | Population | Income | Illiteracy | Life Exp | Murder.x | HS Grad | Frost | Area | Murder.y | Assault | UrbanPop | Rape |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Alabama | 3615 | 3624 | 2.1 | 69.05 | 15.1 | 41.3 | 20 | 50708 | 13.2 | 236 | 58 | 21.2 |
| Alaska | 365 | 6315 | 1.5 | 69.31 | 11.3 | 66.7 | 152 | 566432 | 10.0 | 263 | 48 | 44.5 |
| Arizona | 2212 | 4530 | 1.8 | 70.55 | 7.8 | 58.1 | 15 | 113417 | 8.1 | 294 | 80 | 31.0 |
| Arkansas | 2110 | 3378 | 1.9 | 70.66 | 10.1 | 39.9 | 65 | 51945 | 8.8 | 190 | 50 | 19.5 |
| California | 21198 | 5114 | 1.1 | 71.71 | 10.3 | 62.6 | 20 | 156361 | 9.0 | 276 | 91 | 40.6 |
| Colorado | 2541 | 4884 | 0.7 | 72.06 | 6.8 | 63.9 | 166 | 103766 | 7.9 | 204 | 78 | 38.7 |
#依據row names進行合併,row.names也變成一個column(chr)dta_77USA_4 <- data.frame(x=state.x77f, y=USArrests, by="row.names", all = T)
str(dta_77USA_4)'data.frame': 50 obs. of 14 variables:
$ x.Population: num 3615 365 2212 2110 21198 ...
$ x.Income : num 3624 6315 4530 3378 5114 ...
$ x.Illiteracy: num 2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
$ x.Life.Exp : num 69 69.3 70.5 70.7 71.7 ...
$ x.Murder : num 15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
$ x.HS.Grad : num 41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
$ x.Frost : num 20 152 15 65 20 166 139 103 11 60 ...
$ x.Area : num 50708 566432 113417 51945 156361 ...
$ y.Murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
$ y.Assault : int 236 263 294 190 276 204 110 238 335 211 ...
$ y.UrbanPop : int 58 48 80 50 91 78 77 72 80 60 ...
$ y.Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
$ by : chr "row.names" "row.names" "row.names" "row.names" ...
$ all : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
dta_77USA_4|> head() |> knitr::kable()| x.Population | x.Income | x.Illiteracy | x.Life.Exp | x.Murder | x.HS.Grad | x.Frost | x.Area | y.Murder | y.Assault | y.UrbanPop | y.Rape | by | all | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Alabama | 3615 | 3624 | 2.1 | 69.05 | 15.1 | 41.3 | 20 | 50708 | 13.2 | 236 | 58 | 21.2 | row.names | TRUE |
| Alaska | 365 | 6315 | 1.5 | 69.31 | 11.3 | 66.7 | 152 | 566432 | 10.0 | 263 | 48 | 44.5 | row.names | TRUE |
| Arizona | 2212 | 4530 | 1.8 | 70.55 | 7.8 | 58.1 | 15 | 113417 | 8.1 | 294 | 80 | 31.0 | row.names | TRUE |
| Arkansas | 2110 | 3378 | 1.9 | 70.66 | 10.1 | 39.9 | 65 | 51945 | 8.8 | 190 | 50 | 19.5 | row.names | TRUE |
| California | 21198 | 5114 | 1.1 | 71.71 | 10.3 | 62.6 | 20 | 156361 | 9.0 | 276 | 91 | 40.6 | row.names | TRUE |
| Colorado | 2541 | 4884 | 0.7 | 72.06 | 6.8 | 63.9 | 166 | 103766 | 7.9 | 204 | 78 | 38.7 | row.names | TRUE |
#用data.frame進行合併,row.name"不會變成變項",其他變項會依據來源,編著來自x或來自ycor(dta_77USA_1)|> knitr::kable(digits = 2)| Murder | Population | Income | Illiteracy | Life Exp | HS Grad | Frost | Area | Assault | UrbanPop | Rape | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Murder | 1.00 | 0.33 | -0.10 | 0.80 | -0.74 | -0.59 | -0.53 | 0.37 | 0.81 | -0.11 | 0.72 |
| Population | 0.33 | 1.00 | -0.02 | 0.12 | 0.05 | -0.40 | -0.36 | -0.09 | 0.41 | 0.57 | 0.56 |
| Income | -0.10 | -0.02 | 1.00 | -0.46 | 0.22 | 0.66 | 0.36 | 0.57 | 0.28 | 0.09 | 0.02 |
| Illiteracy | 0.80 | 0.12 | -0.46 | 1.00 | -0.75 | -0.68 | -0.67 | 0.18 | 0.51 | -0.35 | 0.50 |
| Life Exp | -0.74 | 0.05 | 0.22 | -0.75 | 1.00 | 0.66 | 0.41 | -0.21 | -0.60 | 0.25 | -0.36 |
| HS Grad | -0.59 | -0.40 | 0.66 | -0.68 | 0.66 | 1.00 | 0.60 | 0.36 | -0.37 | -0.13 | -0.41 |
| Frost | -0.53 | -0.36 | 0.36 | -0.67 | 0.41 | 0.60 | 1.00 | 0.12 | -0.28 | 0.19 | -0.46 |
| Area | 0.37 | -0.09 | 0.57 | 0.18 | -0.21 | 0.36 | 0.12 | 1.00 | 0.54 | -0.07 | 0.50 |
| Assault | 0.81 | 0.41 | 0.28 | 0.51 | -0.60 | -0.37 | -0.28 | 0.54 | 1.00 | 0.14 | 0.69 |
| UrbanPop | -0.11 | 0.57 | 0.09 | -0.35 | 0.25 | -0.13 | 0.19 | -0.07 | 0.14 | 1.00 | 0.24 |
| Rape | 0.72 | 0.56 | 0.02 | 0.50 | -0.36 | -0.41 | -0.46 | 0.50 | 0.69 | 0.24 | 1.00 |
#cor()做資料的correlation表格, digits為小數點後2位#cor(dta_77USA_2)|> knitr::kable(digits = 2)
#Error in cor(dta_77USA_2) : 'x' 必須是數值
#資料有非數值型 $Row.names : 'AsIs' chr也不能做相關分析
#cor(dta_77USA_3)|> knitr::kable(digits = 2)
#Error in cor(dta_77USA_3) : 'x' 必須是數值
#有NA資料無法做相關分析
#cor(dta_77USA_4)|> knitr::kable(digits = 2)
#Error in cor(dta_77USA_4) : 'x' 必須是數值
#資料有非數值型(column有chr $by和$ all)也不能做相關分析數值呈現相關性不容易閱讀,可以做Corrplot
library(corrplot)
corr <- cor(dta_77USA_1[,1:11])
#參數全部默認情況下的相關係數圖
corrplot(corr = corr)corr <- cor(dta_77USA_1[,1:11])
corrplot(corr = corr,order="AOE",type="upper",tl.pos="d", tl.cex=0.35, tl.col="black")
corrplot(corr = corr,add=TRUE, type="lower", method="number",order="AOE",diag=FALSE,tl.pos="n", cl.pos="n",number.cex=0.6) 圖形顏色搭配數值,比單純數值更為直觀。
以有興趣的Murder來說:與 UrbanPop、Income、HS Grad、Frost、lifeExp、呈現負相關;與population、Rape、Assault、Area、Illiteracy呈現正相關。
Summarize the backpain{HSAUR3} into the following format: driver suburban case control total no no ? ? ? no yes ? ? ? yes no ? ? ? yes yes ? ? ? You should provide comments for each code chunk.
pacman::p_load(HSAUR3)
data(backpain,package = "HSAUR3")
dta_bp <- backpain
str(dta_bp) #434 obs. of 4 variables'data.frame': 434 obs. of 4 variables:
$ ID : Factor w/ 217 levels "1","2","3","4",..: 1 1 2 2 3 3 4 4 5 5 ...
$ status : Factor w/ 2 levels "case","control": 1 2 1 2 1 2 1 2 1 2 ...
$ driver : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 1 1 2 2 ...
$ suburban: Factor w/ 2 levels "no","yes": 2 1 2 2 1 2 1 1 1 2 ...
head(dta_bp) ID status driver suburban
1 1 case yes yes
2 1 control yes no
3 2 case yes yes
4 2 control yes yes
5 3 case yes no
6 3 control yes yes
dta_bp|> head() |> knitr::kable()| ID | status | driver | suburban |
|---|---|---|---|
| 1 | case | yes | yes |
| 1 | control | yes | no |
| 2 | case | yes | yes |
| 2 | control | yes | yes |
| 3 | case | yes | no |
| 3 | control | yes | yes |
typeof(dta_bp) #list[1] "list"
summary(dta_bp) ID status driver suburban
1 : 2 case :217 no : 86 no :200
2 : 2 control:217 yes:348 yes:234
3 : 2
4 : 2
5 : 2
6 : 2
(Other):422
names(dta_bp)[1] "ID" "status" "driver" "suburban"
#long data to wide data library(tidyverse)
library(dplyr)
dta_bp %>% dplyr::rename(Group = status, Driver = driver, Suburban = suburban) %>% # column rename
#long data to wide data
library(tidyverse) #這邊是參考昶濬的作法,完成inclass-5,這部分有比較了解
backpain %>%
dplyr::rename(Group = status, #先修改名字
Driver = driver,
Suburban = suburban) %>%
group_by(Driver, Suburban) %>% # 依照Driver和Suburban分組
tidyr::spread(key="Group", value ="Group") %>% # key(Column names) to value
summarize(Case = sum(is.na(case)), # sum of case number
Control = sum(is.na(control)), # sum of control number
Total = n()) %>% # sum of total
as.data.frame () %>% #tibble to dataframe
knitr::kable ()| Driver | Suburban | Case | Control | Total |
|---|---|---|---|---|
| no | no | 38 | 17 | 64 |
| no | yes | 5 | 4 | 11 |
| yes | no | 43 | 44 | 107 |
| yes | yes | 37 | 58 | 158 |
The data set Vocab{carData} gives observations on gender, education and vocabulary, from respondents to U.S. General Social Surveys, 1972-2004. Summarize the relationship between education and vocabulary over the years by gender.
library(carData)
data(Vocab,package = "carData")
str(Vocab)'data.frame': 30351 obs. of 4 variables:
$ year : num 1974 1974 1974 1974 1974 ...
$ sex : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 2 2 2 1 1 ...
$ education : num 14 16 10 10 12 16 17 10 12 11 ...
$ vocabulary: num 9 9 9 5 8 8 9 5 3 5 ...
- attr(*, "na.action")= 'omit' Named int [1:32115] 1 2 3 4 5 6 7 8 9 10 ...
..- attr(*, "names")= chr [1:32115] "19720001" "19720002" "19720003" "19720004" ...
class(Vocab)[1] "data.frame"
typeof(Vocab)[1] "list"
summary(Vocab) year sex education vocabulary
Min. :1974 Female:17148 Min. : 0.00 Min. : 0.000
1st Qu.:1987 Male :13203 1st Qu.:12.00 1st Qu.: 5.000
Median :1994 Median :12.00 Median : 6.000
Mean :1995 Mean :13.03 Mean : 6.004
3rd Qu.:2006 3rd Qu.:15.00 3rd Qu.: 7.000
Max. :2016 Max. :20.00 Max. :10.000
names(Vocab)[1] "year" "sex" "education" "vocabulary"
library(lattice)
library(tidyr)
library(dplyr)
Vocab %>%
mutate(year = as.factor(year)) %>% #把year變成factor ,若沒有設表格上都會以year呈現
lattice::xyplot(vocabulary ~ education | year,
groups=sex,
type=c("p","g","r"),
data=.,
xlab="education",
ylab="vocabulary score",
layout = c(6, 4), #設定xyplot的圖形排列為6x4
cex=0.3, #縮小男生女生在表格中的點
auto.key=list(columns=2))Supply comments to each code chunk in the following survey rmarkdown file and preview it as an R notebook or knit to html.
The data set concerns species and weight of animals caught in plots in a study area in Arizona over time.
Each row holds information for a single animal, and the columns represent:
pacman::p_load(tidyverse)dta <- read_csv("http://kbroman.org/datacarp/portal_data_joined.csv")
#讀csv檔,存成dtaglimpse(dta)Rows: 34,786
Columns: 13
$ record_id <dbl> 1, 72, 224, 266, 349, 363, 435, 506, 588, 661, 748, 84~
$ month <dbl> 7, 8, 9, 10, 11, 11, 12, 1, 2, 3, 4, 5, 6, 8, 9, 10, 1~
$ day <dbl> 16, 19, 13, 16, 12, 12, 10, 8, 18, 11, 8, 6, 9, 5, 4, ~
$ year <dbl> 1977, 1977, 1977, 1977, 1977, 1977, 1977, 1978, 1978, ~
$ plot_id <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
$ species_id <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", ~
$ sex <chr> "M", "M", NA, NA, NA, NA, NA, NA, "M", NA, NA, "M", "M~
$ hindfoot_length <dbl> 32, 31, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, NA, 34~
$ weight <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 218, NA, NA, 204, 200,~
$ genus <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Neotoma",~
$ species <chr> "albigula", "albigula", "albigula", "albigula", "albig~
$ taxa <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent", "Rod~
$ plot_type <chr> "Control", "Control", "Control", "Control", "Control",~
#總覽資料
typeof(dta)[1] "list"
##資料型態
class(dta)[1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
#資料型態dim(dta)[1] 34786 13
#資料的rowx column=34,786 x 13dplyr::select(dta, plot_id, species_id, weight) %>% head()# A tibble: 6 x 3
plot_id species_id weight
<dbl> <chr> <dbl>
1 2 NL NA
2 2 NL NA
3 2 NL NA
4 2 NL NA
5 2 NL NA
6 2 NL NA
#選資料的plot_id, species_id, weight做表格,列出前6項dplyr::select(dta, -record_id, -species_id) %>% head()# A tibble: 6 x 11
month day year plot_id sex hindfoot_length weight genus species taxa
<dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr> <chr>
1 7 16 1977 2 M 32 NA Neotoma albigula Rodent
2 8 19 1977 2 M 31 NA Neotoma albigula Rodent
3 9 13 1977 2 <NA> NA NA Neotoma albigula Rodent
4 10 16 1977 2 <NA> NA NA Neotoma albigula Rodent
5 11 12 1977 2 <NA> NA NA Neotoma albigula Rodent
6 11 12 1977 2 <NA> NA NA Neotoma albigula Rodent
# ... with 1 more variable: plot_type <chr>
#選資料的"除了" record_id, species_idt以外的資料做表格,列出前6項year1995<-dplyr::filter(dta, year == 1995) %>% head()
#以year當過濾,篩選出year==1995的資料
str(year1995) #column數沒有變,只剩下6筆資料tibble [6 x 13] (S3: tbl_df/tbl/data.frame)
$ record_id : num [1:6] 22314 22728 22899 23032 22003 ...
$ month : num [1:6] 6 9 10 12 1 2
$ day : num [1:6] 7 23 28 2 11 4
$ year : num [1:6] 1995 1995 1995 1995 1995 ...
$ plot_id : num [1:6] 2 2 2 2 2 2
$ species_id : chr [1:6] "NL" "NL" "NL" "NL" ...
$ sex : chr [1:6] "M" "F" "F" "F" ...
$ hindfoot_length: num [1:6] 34 32 32 33 37 36
$ weight : num [1:6] NA 165 171 NA 41 45
$ genus : chr [1:6] "Neotoma" "Neotoma" "Neotoma" "Neotoma" ...
$ species : chr [1:6] "albigula" "albigula" "albigula" "albigula" ...
$ taxa : chr [1:6] "Rodent" "Rodent" "Rodent" "Rodent" ...
$ plot_type : chr [1:6] "Control" "Control" "Control" "Control" ...
summary(year1995$year) #確定一下year1995$year的樣貌 Min. 1st Qu. Median Mean 3rd Qu. Max.
1995 1995 1995 1995 1995 1995
year1995 #列出year1995所有資料,確實只有6筆# A tibble: 6 x 13
record_id month day year plot_id species_id sex hindfoot_length weight
<dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
1 22314 6 7 1995 2 NL M 34 NA
2 22728 9 23 1995 2 NL F 32 165
3 22899 10 28 1995 2 NL F 32 171
4 23032 12 2 1995 2 NL F 33 NA
5 22003 1 11 1995 2 DM M 37 41
6 22042 2 4 1995 2 DM F 36 45
# ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
# plot_type <chr>
weight5<-dplyr::select(dplyr::filter(dta, weight <= 5), species_id, sex, weight)
#選擇weight<=5的資料,只列出species_id, sex, weight
str(weight5)tibble [75 x 3] (S3: tbl_df/tbl/data.frame)
$ species_id: chr [1:75] "PF" "PF" "PF" "PF" ...
$ sex : chr [1:75] "M" "F" "F" "F" ...
$ weight : num [1:75] 5 5 5 4 5 4 5 5 5 5 ...
head(dplyr::select(dplyr::filter(dta, weight <= 5), species_id, sex, weight))# A tibble: 6 x 3
species_id sex weight
<chr> <chr> <dbl>
1 PF M 5
2 PF F 5
3 PF F 5
4 PF F 4
5 PF F 5
6 PF F 4
#head在前面跟在後面沒有差別,都是列出前6個row
dplyr::select(dplyr::filter(dta, weight <= 5), species_id, sex, weight)%>% head()# A tibble: 6 x 3
species_id sex weight
<chr> <chr> <dbl>
1 PF M 5
2 PF F 5
3 PF F 5
4 PF F 4
5 PF F 5
6 PF F 4
dta %>%
dplyr::filter(weight <= 5) %>%
dplyr::select(species_id, sex, weight) %>%
head# A tibble: 6 x 3
species_id sex weight
<chr> <chr> <dbl>
1 PF M 5
2 PF F 5
3 PF F 5
4 PF F 4
5 PF F 5
6 PF F 4
#跟前面一樣,只是語法不同dta %>%
mutate(weight_kg = weight / 1000,
#mutate新增衍生變數weight_kg= weight / 1000
#mutate新增衍生變數weight_lb = weight_kg * 2.2
weight_lb = weight_kg * 2.2) %>%
head()# A tibble: 6 x 15
record_id month day year plot_id species_id sex hindfoot_length weight
<dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
1 1 7 16 1977 2 NL M 32 NA
2 72 8 19 1977 2 NL M 31 NA
3 224 9 13 1977 2 NL <NA> NA NA
4 266 10 16 1977 2 NL <NA> NA NA
5 349 11 12 1977 2 NL <NA> NA NA
6 363 11 12 1977 2 NL <NA> NA NA
# ... with 6 more variables: genus <chr>, species <chr>, taxa <chr>,
# plot_type <chr>, weight_kg <dbl>, weight_lb <dbl>
#原來有13個,再加上新增的weight_kg、weight_lb,共有15個dta %>%
filter(!is.na(weight)) %>%
#filter()篩選符合條件的觀測值
#filter(!is.na)篩選出weight非NA的值(!is.na)
group_by(sex, species_id) %>%
#group_by()依照類別變數分組,常搭配 summarize()
#依照類別sex, species_id分組
summarize(mean_weight = mean(weight)) %>%
#summarise()聚合變數
#summarize(mean_weight = mean(weight))
#依照group分組後,創一個mean_weight,帶入mean(weight)
arrange(desc(mean_weight)) %>%
#arrange()依照變數排序觀測值
#arrange(desc(mean_weight)) mean_weight以descending的方式
head()# A tibble: 6 x 3
# Groups: sex [3]
sex species_id mean_weight
<chr> <chr> <dbl>
1 <NA> NL 168.
2 M NL 166.
3 F NL 154.
4 M SS 130
5 <NA> SH 130
6 M DS 122.
#列出前6筆資料,可以觀察到sex和species_id都是不同的組合,其中NA值也會納入dta %>%
group_by(sex) %>%
#依性別分組
tally# A tibble: 3 x 2
sex n
<chr> <int>
1 F 15690
2 M 17348
3 <NA> 1748
#依分組類別資料計算個數,包括NA也會統計個數
table(dta$sex)
F M
15690 17348
#table不會計算NA個數dta_count<-dta %>%
count(sex)
#跟tally功能一樣
str(dta_count)spec_tbl_df [3 x 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ sex: chr [1:3] "F" "M" NA
$ n : int [1:3] 15690 17348 1748
- attr(*, "spec")=
.. cols(
.. record_id = col_double(),
.. month = col_double(),
.. day = col_double(),
.. year = col_double(),
.. plot_id = col_double(),
.. species_id = col_character(),
.. sex = col_character(),
.. hindfoot_length = col_double(),
.. weight = col_double(),
.. genus = col_character(),
.. species = col_character(),
.. taxa = col_character(),
.. plot_type = col_character()
.. )
- attr(*, "problems")=<externalptr>
dta_count<-dta %>%
group_by(sex) %>%
summarize(count = n())
#按照sex分組
#summarize(計數=n)
#依照group分組後,創一個count,帶入分組的n
str(dta_count)tibble [3 x 2] (S3: tbl_df/tbl/data.frame)
$ sex : chr [1:3] "F" "M" NA
$ count: int [1:3] 15690 17348 1748
dta_year<-dta %>%
group_by(sex) %>%
summarize(count = sum(!is.na(year)))
#按照sex分組
#summarize(計數=n但排除year是na的)
#依照group分組後,創一個count,帶入分組的n
str(dta_year)tibble [3 x 2] (S3: tbl_df/tbl/data.frame)
$ sex : chr [1:3] "F" "M" NA
$ count: int [1:3] 15690 17348 1748
dta_gw <- dta %>%
filter(!is.na(weight)) %>%
#篩選排除weight為NA的值
group_by(genus, plot_id) %>%
#依genus, plot_id分組
summarize(mean_weight = mean(weight))
#依照group分組後,創一個mean_weight,帶入mean(weight)
head(dta_gw)# A tibble: 6 x 3
# Groups: genus [1]
genus plot_id mean_weight
<chr> <dbl> <dbl>
1 Baiomys 1 7
2 Baiomys 2 6
3 Baiomys 3 8.61
4 Baiomys 5 7.75
5 Baiomys 18 9.5
6 Baiomys 19 9.53
#列出前6筆資料看一下glimpse(dta_gw)Rows: 196
Columns: 3
Groups: genus [10]
$ genus <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
$ plot_id <dbl> 1, 2, 3, 5, 18, 19, 20, 21, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,~
$ mean_weight <dbl> 7.000000, 6.000000, 8.611111, 7.750000, 9.500000, 9.533333~
#總覽資料,剩下196 row x 3 column
#column:上述篩選的genus, plot_id和新創建的mean_weighttable(dta_gw$genus)
Baiomys Chaetodipus Dipodomys Neotoma Onychomys
8 24 24 24 24
Perognathus Peromyscus Reithrodontomys Sigmodon Spermophilus
23 24 24 19 2
dta_w <- dta_gw %>%
spread(key = genus, value = mean_weight)
#spread用來擴展表,把某一列的值(鍵值對)分開拆成多列
#通常是long data要變成wide data
#key=是原來要拆的column,value=是拆出來的那些列的值應該填什麼
#把genus的變成wide data,並且填入mean_weight
head(dta_w)# A tibble: 6 x 11
plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 7 22.2 60.2 156. 27.7 9.62 22.2
2 2 6 25.1 55.7 169. 26.9 6.95 22.3
3 3 8.61 24.6 52.0 158. 26.0 7.51 21.4
4 4 NA 23.0 57.5 164. 28.1 7.82 22.6
5 5 7.75 18.0 51.1 190. 27.0 8.66 21.2
6 6 NA 24.9 58.6 180. 25.9 7.81 21.8
# ... with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
# Spermophilus <dbl>
head(dta_w)# A tibble: 6 x 11
plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 7 22.2 60.2 156. 27.7 9.62 22.2
2 2 6 25.1 55.7 169. 26.9 6.95 22.3
3 3 8.61 24.6 52.0 158. 26.0 7.51 21.4
4 4 NA 23.0 57.5 164. 28.1 7.82 22.6
5 5 7.75 18.0 51.1 190. 27.0 8.66 21.2
6 6 NA 24.9 58.6 180. 25.9 7.81 21.8
# ... with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
# Spermophilus <dbl>
glimpse(dta_w)Rows: 24
Columns: 11
$ plot_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,~
$ Baiomys <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA~
$ Chaetodipus <dbl> 22.19939, 25.11014, 24.63636, 23.02381, 17.98276, 24.8~
$ Dipodomys <dbl> 60.23214, 55.68259, 52.04688, 57.52454, 51.11356, 58.6~
$ Neotoma <dbl> 156.2222, 169.1436, 158.2414, 164.1667, 190.0370, 179.~
$ Onychomys <dbl> 27.67550, 26.87302, 26.03241, 28.09375, 27.01695, 25.8~
$ Perognathus <dbl> 9.625000, 6.947368, 7.507812, 7.824427, 8.658537, 7.80~
$ Peromyscus <dbl> 22.22222, 22.26966, 21.37037, 22.60000, 21.23171, 21.8~
$ Reithrodontomys <dbl> 11.375000, 10.680556, 10.516588, 10.263158, 11.154545,~
$ Sigmodon <dbl> NA, 70.85714, 65.61404, 82.00000, 82.66667, 68.77778, ~
$ Spermophilus <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 13~
#從table(dta_gw$genus)可看的出來,genus共有10種,最多的資料是24筆
#多一個column是原來的plot_iddta_gw %>%
spread(genus, mean_weight, fill = 0) %>%
head()# A tibble: 6 x 11
plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 7 22.2 60.2 156. 27.7 9.62 22.2
2 2 6 25.1 55.7 169. 26.9 6.95 22.3
3 3 8.61 24.6 52.0 158. 26.0 7.51 21.4
4 4 0 23.0 57.5 164. 28.1 7.82 22.6
5 5 7.75 18.0 51.1 190. 27.0 8.66 21.2
6 6 0 24.9 58.6 180. 25.9 7.81 21.8
# ... with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
# Spermophilus <dbl>
#spread同上面
#這次用0取代NA所有資料dta_l <- dta_w %>%
gather(key = genus, value = mean_weight, -plot_id)
head(dta_l)# A tibble: 6 x 3
plot_id genus mean_weight
<dbl> <chr> <dbl>
1 1 Baiomys 7
2 2 Baiomys 6
3 3 Baiomys 8.61
4 4 Baiomys NA
5 5 Baiomys 7.75
6 6 Baiomys NA
#gather:wide data再變回long data
#依照genus集合
#刪去"plot_id"這個columnglimpse(dta_l)Rows: 240
Columns: 3
$ plot_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,~
$ genus <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
$ mean_weight <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA, NA~
#為什麼前面-plot_id,這邊glimpse後,plot_id依然在?dta_w_Bs <- dta_w %>%
gather(key = genus, value = mean_weight, Baiomys:Spermophilus)
#只gather的genus變數中的Baiomys到Spermophilus類別
head(dta_w_Bs)# A tibble: 6 x 3
plot_id genus mean_weight
<dbl> <chr> <dbl>
1 1 Baiomys 7
2 2 Baiomys 6
3 3 Baiomys 8.61
4 4 Baiomys NA
5 5 Baiomys 7.75
6 6 Baiomys NA
tail(dta_w_Bs)# A tibble: 6 x 3
plot_id genus mean_weight
<dbl> <chr> <dbl>
1 19 Spermophilus NA
2 20 Spermophilus 57
3 21 Spermophilus NA
4 22 Spermophilus NA
5 23 Spermophilus NA
6 24 Spermophilus NA
glimpse(dta_w_Bs)Rows: 240
Columns: 3
$ plot_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,~
$ genus <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
$ mean_weight <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA, NA~
table(dta_w_Bs$genus)
Baiomys Chaetodipus Dipodomys Neotoma Onychomys
24 24 24 24 24
Perognathus Peromyscus Reithrodontomys Sigmodon Spermophilus
24 24 24 24 24
#確認genus變數中是真的從Baiomys到Spermophilus類別dta_complete <- dta %>%
filter(!is.na(weight),
!is.na(hindfoot_length),
!is.na(sex))
#篩選資料,排除weight有NA的、hindfoot_length有NA的、sex
glimpse(dta_complete)Rows: 30,676
Columns: 13
$ record_id <dbl> 845, 1164, 1261, 1756, 1818, 1882, 2133, 2184, 2406, 3~
$ month <dbl> 5, 8, 9, 4, 5, 7, 10, 11, 1, 5, 5, 7, 10, 11, 1, 2, 3,~
$ day <dbl> 6, 5, 4, 29, 30, 4, 25, 17, 16, 18, 18, 8, 1, 23, 25, ~
$ year <dbl> 1978, 1978, 1978, 1979, 1979, 1979, 1979, 1979, 1980, ~
$ plot_id <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
$ species_id <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", ~
$ sex <chr> "M", "M", "M", "M", "M", "M", "F", "F", "F", "F", "F",~
$ hindfoot_length <dbl> 32, 34, 32, 33, 32, 32, 33, 30, 33, 31, 33, 30, 34, 34~
$ weight <dbl> 204, 199, 197, 166, 184, 206, 274, 186, 184, 87, 174, ~
$ genus <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Neotoma",~
$ species <chr> "albigula", "albigula", "albigula", "albigula", "albig~
$ taxa <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent", "Rod~
$ plot_type <chr> "Control", "Control", "Control", "Control", "Control",~
species_counts <- dta_complete %>%
count(species_id) %>%
#用排除NA的資料dta_complete,計數species_id
filter(n >= 50)
#只列出n>=50的資料
glimpse(species_counts)Rows: 14
Columns: 2
$ species_id <chr> "DM", "DO", "DS", "NL", "OL", "OT", "PB", "PE", "PF", "PM",~
$ n <int> 9727, 2790, 2023, 1045, 905, 2081, 2803, 1198, 1469, 835, 2~
dta_complete %>%
#刪除filter(n >= 50)看看發生甚麼變化
count(species_id) %>%
#用排除NA的資料dta_complete,計數species_id
glimpse()Rows: 24
Columns: 2
$ species_id <chr> "BA", "DM", "DO", "DS", "NL", "OL", "OT", "OX", "PB", "PE",~
$ n <int> 45, 9727, 2790, 2023, 1045, 905, 2081, 5, 2803, 1198, 1469,~
#資料果然筆前一筆更多summary(species_counts$species_id) Length Class Mode
14 character character
dta_complete <- dta_complete %>%
filter(species_id %in% species_counts$species_id)
#判斷某個值(或向量),是否存在於另一個向量之中,會使用 %in% 的符號
#species_id是不是有在species_counts$species_id裡面?
glimpse(dta_complete)Rows: 30,463
Columns: 13
$ record_id <dbl> 845, 1164, 1261, 1756, 1818, 1882, 2133, 2184, 2406, 3~
$ month <dbl> 5, 8, 9, 4, 5, 7, 10, 11, 1, 5, 5, 7, 10, 11, 1, 2, 3,~
$ day <dbl> 6, 5, 4, 29, 30, 4, 25, 17, 16, 18, 18, 8, 1, 23, 25, ~
$ year <dbl> 1978, 1978, 1978, 1979, 1979, 1979, 1979, 1979, 1980, ~
$ plot_id <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
$ species_id <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", ~
$ sex <chr> "M", "M", "M", "M", "M", "M", "F", "F", "F", "F", "F",~
$ hindfoot_length <dbl> 32, 34, 32, 33, 32, 32, 33, 30, 33, 31, 33, 30, 34, 34~
$ weight <dbl> 204, 199, 197, 166, 184, 206, 274, 186, 184, 87, 174, ~
$ genus <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Neotoma",~
$ species <chr> "albigula", "albigula", "albigula", "albigula", "albig~
$ taxa <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent", "Rod~
$ plot_type <chr> "Control", "Control", "Control", "Control", "Control",~
head(dta_complete)# A tibble: 6 x 13
record_id month day year plot_id species_id sex hindfoot_length weight
<dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
1 845 5 6 1978 2 NL M 32 204
2 1164 8 5 1978 2 NL M 34 199
3 1261 9 4 1978 2 NL M 32 197
4 1756 4 29 1979 2 NL M 33 166
5 1818 5 30 1979 2 NL M 32 184
6 1882 7 4 1979 2 NL M 32 206
# ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
# plot_type <chr>