Comment on what each code chunk in the following classroom markdown fileis trying to achive and on its output.
::p_load(WWGbook)
pacmandata(classroom, package="WWGbook")
#用?查詢classroom資料集
?classroom
R右下角視窗會跳出這個data set的內容說明。
The Study of Instructional Improvement (SII; Hill, Rowan, and Ball, 2004) was carried out by researchers at the University of Michigan to study the math achievement scores of first- and third-grade students in randomly selected classrooms from a national U.S. sample of elementary schools.
#exam data structure
str(classroom)
'data.frame': 1190 obs. of 12 variables:
$ sex : int 1 0 1 0 0 1 0 0 1 0 ...
$ minority: int 1 1 1 1 1 1 1 1 1 1 ...
$ mathkind: int 448 460 511 449 425 450 452 443 422 480 ...
$ mathgain: int 32 109 56 83 53 65 51 66 88 -7 ...
$ ses : num 0.46 -0.27 -0.03 -0.38 -0.03 0.76 -0.03 0.2 0.64 0.13 ...
$ yearstea: num 1 1 1 2 2 2 2 2 2 2 ...
$ mathknow: num NA NA NA -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 ...
$ housepov: num 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 ...
$ mathprep: num 2 2 2 3.25 3.25 3.25 3.25 3.25 3.25 3.25 ...
$ classid : int 160 160 160 217 217 217 217 217 217 217 ...
$ schoolid: int 1 1 1 1 1 1 1 1 1 1 ...
$ childid : int 1 2 3 4 5 6 7 8 9 10 ...
<- classroom[classroom$schoolid,c("schoolid", "housepov")] #vector中的元素是重複的
dta_schl head(dta_schl)
schoolid housepov
1 1 0.082
1.1 1 0.082
1.2 1 0.082
1.3 1 0.082
1.4 1 0.082
1.5 1 0.082
#duplicated=FALSE設定邏輯向量,回傳c("schoolid", "housepov")不重複的元素
<- classroom[duplicated(classroom$schoolid)==FALSE, c("schoolid", "housepov")]
dta_schl head(dta_schl,10)
schoolid housepov
1 1 0.082
12 2 0.082
22 3 0.086
36 4 0.365
42 5 0.511
48 6 0.044
60 7 0.148
74 8 0.085
90 9 0.537
96 10 0.346
#依序回傳c(11, 10, 6,7,9)這五個變數不重複的元素
<- classroom[duplicated(classroom$classid)==FALSE, c(11, 10, 6,7,9)]
dta_cls head(dta_cls)
schoolid classid yearstea mathknow mathprep
1 1 160 1.00 NA 2.00
4 1 217 2.00 -0.11 3.25
12 2 197 1.00 -1.25 2.50
14 2 211 2.00 -0.72 2.33
18 2 307 12.54 NA 2.30
22 3 11 20.00 0.45 3.83
#只取c(12, 10, 11, 1:5)這8個變數,且指定順序
<- classroom[, c(12, 10, 11, 1:5)]
dta_chld head(dta_chld)
childid classid schoolid sex minority mathkind mathgain ses
1 1 160 1 1 1 448 32 0.46
2 2 160 1 0 1 460 109 -0.27
3 3 160 1 1 1 511 56 -0.03
4 4 217 1 0 1 449 83 -0.38
5 5 217 1 0 1 425 53 -0.03
6 6 217 1 1 1 450 65 0.76
#list(dta_schl, dta_cls, dta_chld)將不同類型的三個向量放到一個object中
#sapply()功能與lapply()相同
#檢索object的dim維度
sapply(list(dta_schl, dta_cls, dta_chld), dim)
[,1] [,2] [,3]
[1,] 107 312 1190
[2,] 2 5 8
dta_schl這個data.frame中2個維度(變數),107個觀察值。
dta_cls這個data.frame中5個變數,312個觀察值。
dta_chld這個data.frame中8個變數,1190個觀察值。
#依據classid、schoolid來combine dta_chld和dta_cls這兩個data.frame
<- merge(x=dta_chld, y=dta_cls, by=c("classid", "schoolid"))
dta_12 head(dta_12)
classid schoolid childid sex minority mathkind mathgain ses yearstea
1 1 61 653 0 1 442 35 -0.09 2
2 1 61 654 0 1 422 109 -1.32 2
3 1 61 655 1 1 482 66 0.43 2
4 1 61 656 1 1 489 19 -0.06 2
5 1 61 657 0 1 460 10 -1.28 2
6 10 17 216 1 1 381 143 -0.49 5
mathknow mathprep
1 -0.72 2.50
2 -0.72 2.50
3 -0.72 2.50
4 -0.72 2.50
5 -0.72 2.50
6 -1.26 3.25
#依據classid、schoolid來combine dta_chld和dta_cls這兩個data.frame
<- merge(x=dta_chld, y=dta_cls, by=c("classid", "schoolid")) dta_13
#依據schoolid來combine dta_chld和dta_cls
<- merge(x=dta_cls, y=dta_schl, by="schoolid") dta_23
#依據schoolid來combine dta_12和dta_schl
<- merge(x=dta_12, y=dta_schl, by=c("schoolid")) dta_123
#合併後的data frame分別有多少觀察值(obs)與維度(變數)
sapply(list(dta_12, dta_13, dta_23, dta_123), dim)
[,1] [,2] [,3] [,4]
[1,] 1190 1190 312 1190
[2,] 11 11 6 12
Merge the two data sets: state.x77{datasets} and USArrests{datasets} and compute all pair-wise correlations for numerical variables.
Is there anything interesting to report?
#show dataset
|> head() |> knitr::kable() state.x77
Population | Income | Illiteracy | Life Exp | Murder | HS Grad | Frost | Area | |
---|---|---|---|---|---|---|---|---|
Alabama | 3615 | 3624 | 2.1 | 69.05 | 15.1 | 41.3 | 20 | 50708 |
Alaska | 365 | 6315 | 1.5 | 69.31 | 11.3 | 66.7 | 152 | 566432 |
Arizona | 2212 | 4530 | 1.8 | 70.55 | 7.8 | 58.1 | 15 | 113417 |
Arkansas | 2110 | 3378 | 1.9 | 70.66 | 10.1 | 39.9 | 65 | 51945 |
California | 21198 | 5114 | 1.1 | 71.71 | 10.3 | 62.6 | 20 | 156361 |
Colorado | 2541 | 4884 | 0.7 | 72.06 | 6.8 | 63.9 | 166 | 103766 |
#了解資料
str(state.x77)
num [1:50, 1:8] 3615 365 2212 2110 21198 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
..$ : chr [1:8] "Population" "Income" "Illiteracy" "Life Exp" ...
class(state.x77)
[1] "matrix" "array"
typeof(state.x77)
[1] "double"
comment(state.x77)
NULL
#show dataset
|> head() |> knitr::kable() USArrests
Murder | Assault | UrbanPop | Rape | |
---|---|---|---|---|
Alabama | 13.2 | 236 | 58 | 21.2 |
Alaska | 10.0 | 263 | 48 | 44.5 |
Arizona | 8.1 | 294 | 80 | 31.0 |
Arkansas | 8.8 | 190 | 50 | 19.5 |
California | 9.0 | 276 | 91 | 40.6 |
Colorado | 7.9 | 204 | 78 | 38.7 |
#了解資料
str(USArrests)
'data.frame': 50 obs. of 4 variables:
$ Murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
$ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
$ UrbanPop: int 58 48 80 50 91 78 77 72 80 60 ...
$ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
class(USArrests)
[1] "data.frame"
typeof(USArrests)
[1] "list"
comment(USArrests)
NULL
#Merge data files - default
<-merge(state.x77, USArrests)
q2dta1|> head() |> knitr::kable() q2dta1
Murder | Population | Income | Illiteracy | Life Exp | HS Grad | Frost | Area | Assault | UrbanPop | Rape |
---|---|---|---|---|---|---|---|---|---|---|
2.7 | 1058 | 3694 | 0.7 | 70.39 | 54.7 | 161 | 30920 | 72 | 66 | 14.9 |
3.3 | 5814 | 4755 | 1.1 | 71.83 | 58.5 | 103 | 7826 | 110 | 77 | 11.1 |
3.3 | 812 | 4281 | 0.7 | 71.23 | 57.6 | 174 | 9027 | 110 | 77 | 11.1 |
4.3 | 3559 | 4864 | 0.6 | 71.72 | 63.5 | 32 | 66570 | 102 | 62 | 16.5 |
5.3 | 813 | 4119 | 0.6 | 71.87 | 59.5 | 126 | 82677 | 46 | 83 | 20.2 |
6.8 | 2541 | 4884 | 0.7 | 72.06 | 63.9 | 166 | 103766 | 161 | 60 | 15.6 |
依據相同變數Murder合併資料,且刪除missing data。
#Merge data files - all
<-merge(state.x77, USArrests, all = TRUE)
q2dta2|> head() |> knitr::kable() q2dta2
Murder | Population | Income | Illiteracy | Life Exp | HS Grad | Frost | Area | Assault | UrbanPop | Rape |
---|---|---|---|---|---|---|---|---|---|---|
0.8 | NA | NA | NA | NA | NA | NA | NA | 45 | 44 | 7.3 |
1.4 | 637 | 5087 | 0.8 | 72.78 | 50.3 | 186 | 69273 | NA | NA | NA |
1.7 | 681 | 4167 | 0.5 | 72.08 | 53.3 | 172 | 75955 | NA | NA | NA |
2.1 | NA | NA | NA | NA | NA | NA | NA | 83 | 51 | 7.8 |
2.1 | NA | NA | NA | NA | NA | NA | NA | 57 | 56 | 9.5 |
2.2 | NA | NA | NA | NA | NA | NA | NA | 56 | 57 | 11.3 |
方法一會依據相同變數Murder合併資料,但保留了missing data。
# merge two files together by matching row.names
<- merge(state.x77, USArrests, by="row.names", all=T)
q2dta3|> head() |> knitr::kable() q2dta3
Row.names | Population | Income | Illiteracy | Life Exp | Murder.x | HS Grad | Frost | Area | Murder.y | Assault | UrbanPop | Rape |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Alabama | 3615 | 3624 | 2.1 | 69.05 | 15.1 | 41.3 | 20 | 50708 | 13.2 | 236 | 58 | 21.2 |
Alaska | 365 | 6315 | 1.5 | 69.31 | 11.3 | 66.7 | 152 | 566432 | 10.0 | 263 | 48 | 44.5 |
Arizona | 2212 | 4530 | 1.8 | 70.55 | 7.8 | 58.1 | 15 | 113417 | 8.1 | 294 | 80 | 31.0 |
Arkansas | 2110 | 3378 | 1.9 | 70.66 | 10.1 | 39.9 | 65 | 51945 | 8.8 | 190 | 50 | 19.5 |
California | 21198 | 5114 | 1.1 | 71.71 | 10.3 | 62.6 | 20 | 156361 | 9.0 | 276 | 91 | 40.6 |
Colorado | 2541 | 4884 | 0.7 | 72.06 | 6.8 | 63.9 | 166 | 103766 | 7.9 | 204 | 78 | 38.7 |
方法二依據50州的州名(row.name)合併資料。因有兩個相同變數,R自動命名為Murder.x和Murder.y。
#data.frame將兩類資料放在同一個框架
<-data.frame(state.x77, USArrests)
q2dta4|> head() |> knitr::kable() q2dta4
Population | Income | Illiteracy | Life.Exp | Murder | HS.Grad | Frost | Area | Murder.1 | Assault | UrbanPop | Rape | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Alabama | 3615 | 3624 | 2.1 | 69.05 | 15.1 | 41.3 | 20 | 50708 | 13.2 | 236 | 58 | 21.2 |
Alaska | 365 | 6315 | 1.5 | 69.31 | 11.3 | 66.7 | 152 | 566432 | 10.0 | 263 | 48 | 44.5 |
Arizona | 2212 | 4530 | 1.8 | 70.55 | 7.8 | 58.1 | 15 | 113417 | 8.1 | 294 | 80 | 31.0 |
Arkansas | 2110 | 3378 | 1.9 | 70.66 | 10.1 | 39.9 | 65 | 51945 | 8.8 | 190 | 50 | 19.5 |
California | 21198 | 5114 | 1.1 | 71.71 | 10.3 | 62.6 | 20 | 156361 | 9.0 | 276 | 91 | 40.6 |
Colorado | 2541 | 4884 | 0.7 | 72.06 | 6.8 | 63.9 | 166 | 103766 | 7.9 | 204 | 78 | 38.7 |
使用data.frame()合併資料,合併資料的方式也是依據50州的州名,但合併後州名row.name不是變數。
cor(q2dta1)|> knitr::kable(digits = 2)
Murder | Population | Income | Illiteracy | Life Exp | HS Grad | Frost | Area | Assault | UrbanPop | Rape | |
---|---|---|---|---|---|---|---|---|---|---|---|
Murder | 1.00 | 0.33 | -0.10 | 0.80 | -0.74 | -0.59 | -0.53 | 0.37 | 0.81 | -0.11 | 0.72 |
Population | 0.33 | 1.00 | -0.02 | 0.12 | 0.05 | -0.40 | -0.36 | -0.09 | 0.41 | 0.57 | 0.56 |
Income | -0.10 | -0.02 | 1.00 | -0.46 | 0.22 | 0.66 | 0.36 | 0.57 | 0.28 | 0.09 | 0.02 |
Illiteracy | 0.80 | 0.12 | -0.46 | 1.00 | -0.75 | -0.68 | -0.67 | 0.18 | 0.51 | -0.35 | 0.50 |
Life Exp | -0.74 | 0.05 | 0.22 | -0.75 | 1.00 | 0.66 | 0.41 | -0.21 | -0.60 | 0.25 | -0.36 |
HS Grad | -0.59 | -0.40 | 0.66 | -0.68 | 0.66 | 1.00 | 0.60 | 0.36 | -0.37 | -0.13 | -0.41 |
Frost | -0.53 | -0.36 | 0.36 | -0.67 | 0.41 | 0.60 | 1.00 | 0.12 | -0.28 | 0.19 | -0.46 |
Area | 0.37 | -0.09 | 0.57 | 0.18 | -0.21 | 0.36 | 0.12 | 1.00 | 0.54 | -0.07 | 0.50 |
Assault | 0.81 | 0.41 | 0.28 | 0.51 | -0.60 | -0.37 | -0.28 | 0.54 | 1.00 | 0.14 | 0.69 |
UrbanPop | -0.11 | 0.57 | 0.09 | -0.35 | 0.25 | -0.13 | 0.19 | -0.07 | 0.14 | 1.00 | 0.24 |
Rape | 0.72 | 0.56 | 0.02 | 0.50 | -0.36 | -0.41 | -0.46 | 0.50 | 0.69 | 0.24 | 1.00 |
cor(q2dta2)|> knitr::kable(digits = 2)
Murder | Population | Income | Illiteracy | Life Exp | HS Grad | Frost | Area | Assault | UrbanPop | Rape | |
---|---|---|---|---|---|---|---|---|---|---|---|
Murder | 1 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
Population | NA | 1 | NA | NA | NA | NA | NA | NA | NA | NA | NA |
Income | NA | NA | 1 | NA | NA | NA | NA | NA | NA | NA | NA |
Illiteracy | NA | NA | NA | 1 | NA | NA | NA | NA | NA | NA | NA |
Life Exp | NA | NA | NA | NA | 1 | NA | NA | NA | NA | NA | NA |
HS Grad | NA | NA | NA | NA | NA | 1 | NA | NA | NA | NA | NA |
Frost | NA | NA | NA | NA | NA | NA | 1 | NA | NA | NA | NA |
Area | NA | NA | NA | NA | NA | NA | NA | 1 | NA | NA | NA |
Assault | NA | NA | NA | NA | NA | NA | NA | NA | 1 | NA | NA |
UrbanPop | NA | NA | NA | NA | NA | NA | NA | NA | NA | 1 | NA |
Rape | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 1 |
有missing value,無法算correlation
cor(q2dta3[,-1])|> knitr::kable(digits = 2) #[,-1]
Population | Income | Illiteracy | Life Exp | Murder.x | HS Grad | Frost | Area | Murder.y | Assault | UrbanPop | Rape | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Population | 1.00 | 0.21 | 0.11 | -0.07 | 0.34 | -0.10 | -0.33 | 0.02 | 0.32 | 0.32 | 0.51 | 0.31 |
Income | 0.21 | 1.00 | -0.44 | 0.34 | -0.23 | 0.62 | 0.23 | 0.36 | -0.22 | 0.04 | 0.48 | 0.36 |
Illiteracy | 0.11 | -0.44 | 1.00 | -0.59 | 0.70 | -0.66 | -0.67 | 0.08 | 0.71 | 0.51 | -0.06 | 0.15 |
Life Exp | -0.07 | 0.34 | -0.59 | 1.00 | -0.78 | 0.58 | 0.26 | -0.11 | -0.78 | -0.63 | 0.27 | -0.27 |
Murder.x | 0.34 | -0.23 | 0.70 | -0.78 | 1.00 | -0.49 | -0.54 | 0.23 | 0.93 | 0.74 | 0.02 | 0.58 |
HS Grad | -0.10 | 0.62 | -0.66 | 0.58 | -0.49 | 1.00 | 0.37 | 0.33 | -0.52 | -0.23 | 0.36 | 0.27 |
Frost | -0.33 | 0.23 | -0.67 | 0.26 | -0.54 | 0.37 | 1.00 | 0.06 | -0.54 | -0.47 | -0.25 | -0.28 |
Area | 0.02 | 0.36 | 0.08 | -0.11 | 0.23 | 0.33 | 0.06 | 1.00 | 0.15 | 0.23 | -0.06 | 0.52 |
Murder.y | 0.32 | -0.22 | 0.71 | -0.78 | 0.93 | -0.52 | -0.54 | 0.15 | 1.00 | 0.80 | 0.07 | 0.56 |
Assault | 0.32 | 0.04 | 0.51 | -0.63 | 0.74 | -0.23 | -0.47 | 0.23 | 0.80 | 1.00 | 0.26 | 0.67 |
UrbanPop | 0.51 | 0.48 | -0.06 | 0.27 | 0.02 | 0.36 | -0.25 | -0.06 | 0.07 | 0.26 | 1.00 | 0.41 |
Rape | 0.31 | 0.36 | 0.15 | -0.27 | 0.58 | 0.27 | -0.28 | 0.52 | 0.56 | 0.67 | 0.41 | 1.00 |
Error in cor(q2dta3) : ‘x’ 必須是數值,因此使用[,-1]排除第一行州名
cor(q2dta4)|> knitr::kable(digits = 2)
Population | Income | Illiteracy | Life.Exp | Murder | HS.Grad | Frost | Area | Murder.1 | Assault | UrbanPop | Rape | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Population | 1.00 | 0.21 | 0.11 | -0.07 | 0.34 | -0.10 | -0.33 | 0.02 | 0.32 | 0.32 | 0.51 | 0.31 |
Income | 0.21 | 1.00 | -0.44 | 0.34 | -0.23 | 0.62 | 0.23 | 0.36 | -0.22 | 0.04 | 0.48 | 0.36 |
Illiteracy | 0.11 | -0.44 | 1.00 | -0.59 | 0.70 | -0.66 | -0.67 | 0.08 | 0.71 | 0.51 | -0.06 | 0.15 |
Life.Exp | -0.07 | 0.34 | -0.59 | 1.00 | -0.78 | 0.58 | 0.26 | -0.11 | -0.78 | -0.63 | 0.27 | -0.27 |
Murder | 0.34 | -0.23 | 0.70 | -0.78 | 1.00 | -0.49 | -0.54 | 0.23 | 0.93 | 0.74 | 0.02 | 0.58 |
HS.Grad | -0.10 | 0.62 | -0.66 | 0.58 | -0.49 | 1.00 | 0.37 | 0.33 | -0.52 | -0.23 | 0.36 | 0.27 |
Frost | -0.33 | 0.23 | -0.67 | 0.26 | -0.54 | 0.37 | 1.00 | 0.06 | -0.54 | -0.47 | -0.25 | -0.28 |
Area | 0.02 | 0.36 | 0.08 | -0.11 | 0.23 | 0.33 | 0.06 | 1.00 | 0.15 | 0.23 | -0.06 | 0.52 |
Murder.1 | 0.32 | -0.22 | 0.71 | -0.78 | 0.93 | -0.52 | -0.54 | 0.15 | 1.00 | 0.80 | 0.07 | 0.56 |
Assault | 0.32 | 0.04 | 0.51 | -0.63 | 0.74 | -0.23 | -0.47 | 0.23 | 0.80 | 1.00 | 0.26 | 0.67 |
UrbanPop | 0.51 | 0.48 | -0.06 | 0.27 | 0.02 | 0.36 | -0.25 | -0.06 | 0.07 | 0.26 | 1.00 | 0.41 |
Rape | 0.31 | 0.36 | 0.15 | -0.27 | 0.58 | 0.27 | -0.28 | 0.52 | 0.56 | 0.67 | 0.41 | 1.00 |
與文盲高度正相關的變數為謀殺犯罪率。 cor(Illiteracy,Murder)=0.70; cor(Illiteracy,Murder1)=0.71。
與文盲高度負相關的變數為HS.Grad、Frost。 cor(Illiteracy,HS.Grad)=-0.66; cor(Illiteracy,Frost)=-0.67。
與謀殺犯罪率高度負相關的變數為Life.Exp(生活期待) cor(Murder,Life.Exp)=-0.78 cor(Murder1,Life.Exp)=-0.78
與暴力高度負相關的變數為Life.Exp(生活期待) cor(Assault,Life.Exp)=-0.63
與暴力高度正相關的變數為謀殺犯罪率、強暴。 cor(Assault,Murder)=0.74 cor(Assault,Murder1)=0.80 cor(Assault,Rape)=0.67
暴力與區域低度相關。 cor(Area,Rape)=0.56
::p_load(corrplot,RColorBrewer)
pacman
corrplot(cor(q2dta4), type="upper", order="hclust",
col=brewer.pal(n=8, name="RdYlBu"))
Summarize the backpain{HSAUR3} into the following format: You should provide comments for each code chunk.
# install package
::p_load(HSAUR3)
pacman# load data
data(backpain,package = "HSAUR3")
#copy data
<-backpain
q3dta# show data
|> head() |> knitr::kable() q3dta
ID | status | driver | suburban |
---|---|---|---|
1 | case | yes | yes |
1 | control | yes | no |
2 | case | yes | yes |
2 | control | yes | yes |
3 | case | yes | no |
3 | control | yes | yes |
# exam data str
str(q3dta)
'data.frame': 434 obs. of 4 variables:
$ ID : Factor w/ 217 levels "1","2","3","4",..: 1 1 2 2 3 3 4 4 5 5 ...
$ status : Factor w/ 2 levels "case","control": 1 2 1 2 1 2 1 2 1 2 ...
$ driver : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 1 1 2 2 ...
$ suburban: Factor w/ 2 levels "no","yes": 2 1 2 2 1 2 1 1 1 2 ...
class(q3dta)
[1] "data.frame"
typeof(q3dta)
[1] "list"
summary(q3dta)
ID status driver suburban
1 : 2 case :217 no : 86 no :200
2 : 2 control:217 yes:348 yes:234
3 : 2
4 : 2
5 : 2
6 : 2
(Other):422
# cross-tabulate status by driver
<-with(q3dta, table(driver,status))
q3tab1 q3tab1
status
driver case control
no 32 54
yes 185 163
# cross-tabulate status by suburban
<-with(q3dta, table(suburban,status))
q3tab2 q3tab2
status
suburban case control
no 90 110
yes 127 107
# three-way table()
<-with(q3dta, table(driver, suburban, status))
q3tab3 q3tab3
, , status = case
suburban
driver no yes
no 26 6
yes 64 121
, , status = control
suburban
driver no yes
no 47 7
yes 63 100
# cross-tabulate status by suburban given driver
<-with(q3dta, ftable(driver, suburban, status))
q3tab4 q3tab4
status case control
driver suburban
no no 26 47
yes 6 7
yes no 64 63
yes 121 100
q3tab4很接近題目要求的format,接下來要依據driver、suburban來分群計算Total。
str(q3tab4)
'ftable' int [1:4, 1:2] 26 6 64 121 47 7 63 100
- attr(*, "row.vars")=List of 2
..$ driver : chr [1:2] "no" "yes"
..$ suburban: chr [1:2] "no" "yes"
- attr(*, "col.vars")=List of 1
..$ status: chr [1:2] "case" "control"
#轉換資料型態
<-as.data.frame(q3tab4)
q3tab q3tab
driver suburban status Freq
1 no no case 26
2 yes no case 64
3 no yes case 6
4 yes yes case 121
5 no no control 47
6 yes no control 63
7 no yes control 7
8 yes yes control 100
#要用aggregate?還是reshape?
#aggregate(Freq~driver+suburban+status, by=list(status,suburban), FUN=sum)
#aggregate(cbind(driver,suburban) ~ status, data=q3tab, FUN=sum)
# long to wide format
reshape(q3tab,
idvar="suburban",
timevar="status",
v.names="driver",
direction="wide")
suburban Freq driver.case driver.control
1 no 26 no no
3 yes 6 no no
這結果不是我要的….
#Summarize a transformed variable
::p_load(dplyr, magrittr ,tidyr)
pacman
#把case從status撈出來
<-q3tab |>
q3tab_casefilter(status=='case')|>
::rename(Case=Freq)
dplyr
#把control從status撈出來
<-q3tab |>
q3tab_controlfilter(status=='control')|>
::rename(Control=Freq)
dplyr
#merge後再篩選變數
<-merge(x =q3tab_case, y = q3tab_control, by = c("driver","suburban"))|>
q3dta_mergeselect(-c(3,5))
#新增變數total,然後水平加總Case+Control
<-mutate(q3dta_merge,total=Case+Control)
q3answer q3answer
driver suburban Case Control total
1 no no 26 47 73
2 no yes 6 7 13
3 yes no 64 63 127
4 yes yes 121 100 221
終於用笨方法得到答案了。
The data set Vocab{carData} gives observations on gender, education and vocabulary, from respondents to U.S. General Social Surveys, 1972-2004.
Summarize the relationship between education and vocabulary over the years by gender.
# Save a copy of data as tibble
<- as_tibble(carData::Vocab)
q4dta
# show data
|> head() |> knitr::kable() q4dta
year | sex | education | vocabulary |
---|---|---|---|
1974 | Male | 14 | 9 |
1974 | Male | 16 | 9 |
1974 | Female | 10 | 9 |
1974 | Female | 10 | 5 |
1974 | Female | 12 | 8 |
1974 | Male | 16 | 8 |
#rename
%<>%
q4dta ::rename(Year=year, Gender=sex,
dplyrEdu=education, Voc=vocabulary)
#依據講義tidy_01 p.23
%>%
q4dta ::xyplot(Edu ~ Voc | Year,
latticegroups=Gender,
type=c("p","g","r"),
data=.,
xlab="Vocabulary",
ylab="Education",
auto.key=list(columns=2))
#q4dta$Year %>%
#as.factor()%>%
#lattice::xyplot(Edu ~ Voc | Year,
# groups=Gender,
# type=c("p","g","r"),
# data=.,
# xlab="Vocabulary",
# ylab="Education",
# auto.key=list(columns=2))
將Year轉成factor,會出現error message。 Error in eval(substitute(groups), data, environment(x)) : numeric ‘envir’ arg not of length one
Supply comments to each code chunk in the following survey rmarkdown file and preview it as an R notebook or knit to html.
The data set concerns species and weight of animals caught in plots in a study area in Arizona over time.
Each row holds information for a single animal, and the columns represent:
# 下載並安裝tidyverse package
::p_load(tidyverse) pacman
# 從網路讀取csv檔
<- read_csv("http://kbroman.org/datacarp/portal_data_joined.csv") q5dta
# 資料檢視
glimpse(q5dta)
Rows: 34,786
Columns: 13
$ record_id <dbl> 1, 72, 224, 266, 349, 363, 435, 506, 588, 661, 748, 84~
$ month <dbl> 7, 8, 9, 10, 11, 11, 12, 1, 2, 3, 4, 5, 6, 8, 9, 10, 1~
$ day <dbl> 16, 19, 13, 16, 12, 12, 10, 8, 18, 11, 8, 6, 9, 5, 4, ~
$ year <dbl> 1977, 1977, 1977, 1977, 1977, 1977, 1977, 1978, 1978, ~
$ plot_id <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
$ species_id <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", ~
$ sex <chr> "M", "M", NA, NA, NA, NA, NA, NA, "M", NA, NA, "M", "M~
$ hindfoot_length <dbl> 32, 31, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, NA, 34~
$ weight <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 218, NA, NA, 204, 200,~
$ genus <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Neotoma",~
$ species <chr> "albigula", "albigula", "albigula", "albigula", "albig~
$ taxa <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent", "Rod~
$ plot_type <chr> "Control", "Control", "Control", "Control", "Control",~
#資料維度
dim(q5dta)
[1] 34786 13
#select()篩選變數
::select(q5dta, plot_id, species_id, weight) %>% head() dplyr
# A tibble: 6 x 3
plot_id species_id weight
<dbl> <chr> <dbl>
1 2 NL NA
2 2 NL NA
3 2 NL NA
4 2 NL NA
5 2 NL NA
6 2 NL NA
#select()也可以刪除不要的變數,在變數前加-
::select(q5dta, -record_id, -species_id) %>% head() dplyr
# A tibble: 6 x 11
month day year plot_id sex hindfoot_length weight genus species taxa
<dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr> <chr>
1 7 16 1977 2 M 32 NA Neotoma albigula Rodent
2 8 19 1977 2 M 31 NA Neotoma albigula Rodent
3 9 13 1977 2 <NA> NA NA Neotoma albigula Rodent
4 10 16 1977 2 <NA> NA NA Neotoma albigula Rodent
5 11 12 1977 2 <NA> NA NA Neotoma albigula Rodent
6 11 12 1977 2 <NA> NA NA Neotoma albigula Rodent
# ... with 1 more variable: plot_type <chr>
#filter()選出要分析的觀察值:year == 1995
::filter(q5dta, year == 1995) %>% head() #然後只看前六筆 dplyr
# A tibble: 6 x 13
record_id month day year plot_id species_id sex hindfoot_length weight
<dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
1 22314 6 7 1995 2 NL M 34 NA
2 22728 9 23 1995 2 NL F 32 165
3 22899 10 28 1995 2 NL F 32 171
4 23032 12 2 1995 2 NL F 33 NA
5 22003 1 11 1995 2 DM M 37 41
6 22042 2 4 1995 2 DM F 36 45
# ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
# plot_type <chr>
#選擇四個變數:體重(要<=5公斤)、species_id、sex、weight
#然後只看前六筆
head(dplyr::select(dplyr::filter(q5dta, weight <= 5), species_id, sex, weight))
# A tibble: 6 x 3
species_id sex weight
<chr> <chr> <dbl>
1 PF M 5
2 PF F 5
3 PF F 5
4 PF F 4
5 PF F 5
6 PF F 4
%>%
q5dta ::filter(weight <= 5) %>% #篩選觀察值的體重小於等於5
dplyr::select(species_id, sex, weight) %>% #select變數
dplyr head
# A tibble: 6 x 3
species_id sex weight
<chr> <chr> <dbl>
1 PF M 5
2 PF F 5
3 PF F 5
4 PF F 4
5 PF F 5
6 PF F 4
%>%
q5dta mutate(weight_kg = weight / 1000, #新增變數weight_kg、weight_lb
weight_lb = weight_kg * 2.2) %>%
head()
# A tibble: 6 x 15
record_id month day year plot_id species_id sex hindfoot_length weight
<dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
1 1 7 16 1977 2 NL M 32 NA
2 72 8 19 1977 2 NL M 31 NA
3 224 9 13 1977 2 NL <NA> NA NA
4 266 10 16 1977 2 NL <NA> NA NA
5 349 11 12 1977 2 NL <NA> NA NA
6 363 11 12 1977 2 NL <NA> NA NA
# ... with 6 more variables: genus <chr>, species <chr>, taxa <chr>,
# plot_type <chr>, weight_kg <dbl>, weight_lb <dbl>
%>%
q5dta filter(!is.na(weight)) %>% #篩選出在is.na()檢查回傳FALSE的元素
group_by(sex, species_id) %>% #分群條件
summarize(mean_weight = mean(weight)) %>% #計算mean_weight
arrange(desc(mean_weight)) %>% #由高到低排列
head() #檢視前六筆
# A tibble: 6 x 3
# Groups: sex [3]
sex species_id mean_weight
<chr> <chr> <dbl>
1 <NA> NL 168.
2 M NL 166.
3 F NL 154.
4 M SS 130
5 <NA> SH 130
6 M DS 122.
%>%
q5dta group_by(sex) %>% #依照性別分群
#計算總數量 tally
# A tibble: 3 x 2
sex n
<chr> <int>
1 F 15690
2 M 17348
3 <NA> 1748
#方法二
%>%
q5dta group_by(sex) %>% #依照性別分群
summarise(total=n()) #計算總數量
# A tibble: 3 x 2
sex total
<chr> <int>
1 F 15690
2 M 17348
3 <NA> 1748
%>%
q5dta count(sex) #依據性別計算總數量
# A tibble: 3 x 2
sex n
<chr> <int>
1 F 15690
2 M 17348
3 <NA> 1748
%>%
q5dta group_by(sex) %>% #先依照性別分群
summarize(count = n()) #再計算總數
# A tibble: 3 x 2
sex count
<chr> <int>
1 F 15690
2 M 17348
3 <NA> 1748
%>%
q5dta group_by(sex) %>% #先依照性別分群
summarize(count = sum(!is.na(year))) #再計算沒有遺漏值的year數量總數
# A tibble: 3 x 2
sex count
<chr> <int>
1 F 15690
2 M 17348
3 <NA> 1748
答案都一樣!?
<- q5dta %>%
dta_gw filter(!is.na(weight)) %>% #篩選出沒有遺漏值的weight列
group_by(genus, plot_id) %>% #依據genus與plot_id分群
summarize(mean_weight = mean(weight)) #計算體重平均mean(weight)
glimpse(dta_gw) #檢視資料
Rows: 196
Columns: 3
Groups: genus [10]
$ genus <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
$ plot_id <dbl> 1, 2, 3, 5, 18, 19, 20, 21, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,~
$ mean_weight <dbl> 7.000000, 6.000000, 8.611111, 7.750000, 9.500000, 9.533333~
#long format to wide format
#spread(資料框/長表,key="要展開的欄位名稱",value="數值欄位名稱")
<- dta_gw %>%
dta_w spread(key = genus, value = mean_weight) #
head(dta_w)
# A tibble: 6 x 11
plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 7 22.2 60.2 156. 27.7 9.62 22.2
2 2 6 25.1 55.7 169. 26.9 6.95 22.3
3 3 8.61 24.6 52.0 158. 26.0 7.51 21.4
4 4 NA 23.0 57.5 164. 28.1 7.82 22.6
5 5 7.75 18.0 51.1 190. 27.0 8.66 21.2
6 6 NA 24.9 58.6 180. 25.9 7.81 21.8
# ... with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
# Spermophilus <dbl>
glimpse(dta_w) #資料檢視Rows:24 > Columns:11
Rows: 24
Columns: 11
$ plot_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,~
$ Baiomys <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA~
$ Chaetodipus <dbl> 22.19939, 25.11014, 24.63636, 23.02381, 17.98276, 24.8~
$ Dipodomys <dbl> 60.23214, 55.68259, 52.04688, 57.52454, 51.11356, 58.6~
$ Neotoma <dbl> 156.2222, 169.1436, 158.2414, 164.1667, 190.0370, 179.~
$ Onychomys <dbl> 27.67550, 26.87302, 26.03241, 28.09375, 27.01695, 25.8~
$ Perognathus <dbl> 9.625000, 6.947368, 7.507812, 7.824427, 8.658537, 7.80~
$ Peromyscus <dbl> 22.22222, 22.26966, 21.37037, 22.60000, 21.23171, 21.8~
$ Reithrodontomys <dbl> 11.375000, 10.680556, 10.516588, 10.263158, 11.154545,~
$ Sigmodon <dbl> NA, 70.85714, 65.61404, 82.00000, 82.66667, 68.77778, ~
$ Spermophilus <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 13~
%>%
dta_gw spread(genus, mean_weight, fill = 0) %>% #fill = 0:If set, missing values will be replaced with this value.
head()
# A tibble: 6 x 11
plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 7 22.2 60.2 156. 27.7 9.62 22.2
2 2 6 25.1 55.7 169. 26.9 6.95 22.3
3 3 8.61 24.6 52.0 158. 26.0 7.51 21.4
4 4 0 23.0 57.5 164. 28.1 7.82 22.6
5 5 7.75 18.0 51.1 190. 27.0 8.66 21.2
6 6 0 24.9 58.6 180. 25.9 7.81 21.8
# ... with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
# Spermophilus <dbl>
#寬表轉長表
#gather(資料框/寬表,key="主鍵欄位名稱",value="數值欄位名稱",要轉換的資料1,要轉換的資料2,...)
<- dta_w %>%
dta_l gather(key = genus, value = mean_weight, -plot_id) #-plot_id表示除去plot_id,就只轉置剩下兩列
head(dta_l)
# A tibble: 6 x 3
plot_id genus mean_weight
<dbl> <chr> <dbl>
1 1 Baiomys 7
2 2 Baiomys 6
3 3 Baiomys 8.61
4 4 Baiomys NA
5 5 Baiomys 7.75
6 6 Baiomys NA
glimpse(dta_l) #看一下Rows數是不是有大於Columns數
Rows: 240
Columns: 3
$ plot_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,~
$ genus <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
$ mean_weight <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA, NA~
#指定gather的變數類別Baiomys:Spermophilus
%>%
dta_w gather(key = genus, value = mean_weight, Baiomys:Spermophilus) %>%
head()
# A tibble: 6 x 3
plot_id genus mean_weight
<dbl> <chr> <dbl>
1 1 Baiomys 7
2 2 Baiomys 6
3 3 Baiomys 8.61
4 4 Baiomys NA
5 5 Baiomys 7.75
6 6 Baiomys NA
#丟掉weight、hindfoot_length、sex的遺漏值
<- q5dta %>%
dta_complete filter(!is.na(weight),
!is.na(hindfoot_length),
!is.na(sex))
head(dta_complete)
# A tibble: 6 x 13
record_id month day year plot_id species_id sex hindfoot_length weight
<dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
1 845 5 6 1978 2 NL M 32 204
2 1164 8 5 1978 2 NL M 34 199
3 1261 9 4 1978 2 NL M 32 197
4 1756 4 29 1979 2 NL M 33 166
5 1818 5 30 1979 2 NL M 32 184
6 1882 7 4 1979 2 NL M 32 206
# ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
# plot_type <chr>
<- dta_complete %>%
species_counts count(species_id) %>% #依據species_id計算總數
filter(n >= 50) #篩選出n >= 50
head(species_counts)
# A tibble: 6 x 2
species_id n
<chr> <int>
1 DM 9727
2 DO 2790
3 DS 2023
4 NL 1045
5 OL 905
6 OT 2081
#The %in% operator in R can be used to identify if an element (e.g., a number) belongs to a vector or dataframe.
<- dta_complete %>%
dta_complete filter(species_id %in% species_counts$species_id)
head(dta_complete)
# A tibble: 6 x 13
record_id month day year plot_id species_id sex hindfoot_length weight
<dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
1 845 5 6 1978 2 NL M 32 204
2 1164 8 5 1978 2 NL M 34 199
3 1261 9 4 1978 2 NL M 32 197
4 1756 4 29 1979 2 NL M 33 166
5 1818 5 30 1979 2 NL M 32 184
6 1882 7 4 1979 2 NL M 32 206
# ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
# plot_type <chr>