Comment on what each code chunk in the following classroom markdown fileis trying to achive and on its output.
pacman::p_load(WWGbook)
data(classroom, package="WWGbook")#用?查詢classroom資料集
?classroom R右下角視窗會跳出這個data set的內容說明。
The Study of Instructional Improvement (SII; Hill, Rowan, and Ball, 2004) was carried out by researchers at the University of Michigan to study the math achievement scores of first- and third-grade students in randomly selected classrooms from a national U.S. sample of elementary schools.
#exam data structure
str(classroom)'data.frame':   1190 obs. of  12 variables:
 $ sex     : int  1 0 1 0 0 1 0 0 1 0 ...
 $ minority: int  1 1 1 1 1 1 1 1 1 1 ...
 $ mathkind: int  448 460 511 449 425 450 452 443 422 480 ...
 $ mathgain: int  32 109 56 83 53 65 51 66 88 -7 ...
 $ ses     : num  0.46 -0.27 -0.03 -0.38 -0.03 0.76 -0.03 0.2 0.64 0.13 ...
 $ yearstea: num  1 1 1 2 2 2 2 2 2 2 ...
 $ mathknow: num  NA NA NA -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 ...
 $ housepov: num  0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 ...
 $ mathprep: num  2 2 2 3.25 3.25 3.25 3.25 3.25 3.25 3.25 ...
 $ classid : int  160 160 160 217 217 217 217 217 217 217 ...
 $ schoolid: int  1 1 1 1 1 1 1 1 1 1 ...
 $ childid : int  1 2 3 4 5 6 7 8 9 10 ...dta_schl <- classroom[classroom$schoolid,c("schoolid", "housepov")]  #vector中的元素是重複的
head(dta_schl)    schoolid housepov
1          1    0.082
1.1        1    0.082
1.2        1    0.082
1.3        1    0.082
1.4        1    0.082
1.5        1    0.082#duplicated=FALSE設定邏輯向量,回傳c("schoolid", "housepov")不重複的元素 
dta_schl <- classroom[duplicated(classroom$schoolid)==FALSE, c("schoolid", "housepov")]
head(dta_schl,10)   schoolid housepov
1         1    0.082
12        2    0.082
22        3    0.086
36        4    0.365
42        5    0.511
48        6    0.044
60        7    0.148
74        8    0.085
90        9    0.537
96       10    0.346#依序回傳c(11, 10, 6,7,9)這五個變數不重複的元素
dta_cls <- classroom[duplicated(classroom$classid)==FALSE,  c(11, 10, 6,7,9)] 
head(dta_cls)   schoolid classid yearstea mathknow mathprep
1         1     160     1.00       NA     2.00
4         1     217     2.00    -0.11     3.25
12        2     197     1.00    -1.25     2.50
14        2     211     2.00    -0.72     2.33
18        2     307    12.54       NA     2.30
22        3      11    20.00     0.45     3.83#只取c(12, 10, 11, 1:5)這8個變數,且指定順序
dta_chld <- classroom[, c(12, 10, 11, 1:5)]
head(dta_chld)  childid classid schoolid sex minority mathkind mathgain   ses
1       1     160        1   1        1      448       32  0.46
2       2     160        1   0        1      460      109 -0.27
3       3     160        1   1        1      511       56 -0.03
4       4     217        1   0        1      449       83 -0.38
5       5     217        1   0        1      425       53 -0.03
6       6     217        1   1        1      450       65  0.76#list(dta_schl, dta_cls, dta_chld)將不同類型的三個向量放到一個object中
#sapply()功能與lapply()相同
#檢索object的dim維度
sapply(list(dta_schl, dta_cls, dta_chld), dim)     [,1] [,2] [,3]
[1,]  107  312 1190
[2,]    2    5    8dta_schl這個data.frame中2個維度(變數),107個觀察值。
dta_cls這個data.frame中5個變數,312個觀察值。
dta_chld這個data.frame中8個變數,1190個觀察值。
#依據classid、schoolid來combine dta_chld和dta_cls這兩個data.frame
dta_12 <- merge(x=dta_chld, y=dta_cls, by=c("classid", "schoolid"))
head(dta_12)  classid schoolid childid sex minority mathkind mathgain   ses yearstea
1       1       61     653   0        1      442       35 -0.09        2
2       1       61     654   0        1      422      109 -1.32        2
3       1       61     655   1        1      482       66  0.43        2
4       1       61     656   1        1      489       19 -0.06        2
5       1       61     657   0        1      460       10 -1.28        2
6      10       17     216   1        1      381      143 -0.49        5
  mathknow mathprep
1    -0.72     2.50
2    -0.72     2.50
3    -0.72     2.50
4    -0.72     2.50
5    -0.72     2.50
6    -1.26     3.25#依據classid、schoolid來combine dta_chld和dta_cls這兩個data.frame
dta_13 <- merge(x=dta_chld, y=dta_cls, by=c("classid", "schoolid"))#依據schoolid來combine dta_chld和dta_cls
dta_23 <- merge(x=dta_cls, y=dta_schl, by="schoolid")#依據schoolid來combine dta_12和dta_schl
dta_123 <- merge(x=dta_12, y=dta_schl, by=c("schoolid"))#合併後的data frame分別有多少觀察值(obs)與維度(變數)
sapply(list(dta_12, dta_13, dta_23, dta_123), dim)     [,1] [,2] [,3] [,4]
[1,] 1190 1190  312 1190
[2,]   11   11    6   12Merge the two data sets: state.x77{datasets} and USArrests{datasets} and compute all pair-wise correlations for numerical variables.
Is there anything interesting to report?
#show dataset
state.x77 |> head() |> knitr::kable()| Population | Income | Illiteracy | Life Exp | Murder | HS Grad | Frost | Area | |
|---|---|---|---|---|---|---|---|---|
| Alabama | 3615 | 3624 | 2.1 | 69.05 | 15.1 | 41.3 | 20 | 50708 | 
| Alaska | 365 | 6315 | 1.5 | 69.31 | 11.3 | 66.7 | 152 | 566432 | 
| Arizona | 2212 | 4530 | 1.8 | 70.55 | 7.8 | 58.1 | 15 | 113417 | 
| Arkansas | 2110 | 3378 | 1.9 | 70.66 | 10.1 | 39.9 | 65 | 51945 | 
| California | 21198 | 5114 | 1.1 | 71.71 | 10.3 | 62.6 | 20 | 156361 | 
| Colorado | 2541 | 4884 | 0.7 | 72.06 | 6.8 | 63.9 | 166 | 103766 | 
#了解資料
str(state.x77) num [1:50, 1:8] 3615 365 2212 2110 21198 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
  ..$ : chr [1:8] "Population" "Income" "Illiteracy" "Life Exp" ...class(state.x77)[1] "matrix" "array" typeof(state.x77)[1] "double"comment(state.x77)NULL#show dataset
USArrests |> head() |> knitr::kable()| Murder | Assault | UrbanPop | Rape | |
|---|---|---|---|---|
| Alabama | 13.2 | 236 | 58 | 21.2 | 
| Alaska | 10.0 | 263 | 48 | 44.5 | 
| Arizona | 8.1 | 294 | 80 | 31.0 | 
| Arkansas | 8.8 | 190 | 50 | 19.5 | 
| California | 9.0 | 276 | 91 | 40.6 | 
| Colorado | 7.9 | 204 | 78 | 38.7 | 
#了解資料
str(USArrests)'data.frame':   50 obs. of  4 variables:
 $ Murder  : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
 $ Assault : int  236 263 294 190 276 204 110 238 335 211 ...
 $ UrbanPop: int  58 48 80 50 91 78 77 72 80 60 ...
 $ Rape    : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...class(USArrests)[1] "data.frame"typeof(USArrests)[1] "list"comment(USArrests)NULL#Merge data files - default
q2dta1<-merge(state.x77, USArrests) 
q2dta1|> head() |> knitr::kable()| Murder | Population | Income | Illiteracy | Life Exp | HS Grad | Frost | Area | Assault | UrbanPop | Rape | 
|---|---|---|---|---|---|---|---|---|---|---|
| 2.7 | 1058 | 3694 | 0.7 | 70.39 | 54.7 | 161 | 30920 | 72 | 66 | 14.9 | 
| 3.3 | 5814 | 4755 | 1.1 | 71.83 | 58.5 | 103 | 7826 | 110 | 77 | 11.1 | 
| 3.3 | 812 | 4281 | 0.7 | 71.23 | 57.6 | 174 | 9027 | 110 | 77 | 11.1 | 
| 4.3 | 3559 | 4864 | 0.6 | 71.72 | 63.5 | 32 | 66570 | 102 | 62 | 16.5 | 
| 5.3 | 813 | 4119 | 0.6 | 71.87 | 59.5 | 126 | 82677 | 46 | 83 | 20.2 | 
| 6.8 | 2541 | 4884 | 0.7 | 72.06 | 63.9 | 166 | 103766 | 161 | 60 | 15.6 | 
依據相同變數Murder合併資料,且刪除missing data。
#Merge data files - all
q2dta2<-merge(state.x77, USArrests, all = TRUE) 
q2dta2|> head() |> knitr::kable()| Murder | Population | Income | Illiteracy | Life Exp | HS Grad | Frost | Area | Assault | UrbanPop | Rape | 
|---|---|---|---|---|---|---|---|---|---|---|
| 0.8 | NA | NA | NA | NA | NA | NA | NA | 45 | 44 | 7.3 | 
| 1.4 | 637 | 5087 | 0.8 | 72.78 | 50.3 | 186 | 69273 | NA | NA | NA | 
| 1.7 | 681 | 4167 | 0.5 | 72.08 | 53.3 | 172 | 75955 | NA | NA | NA | 
| 2.1 | NA | NA | NA | NA | NA | NA | NA | 83 | 51 | 7.8 | 
| 2.1 | NA | NA | NA | NA | NA | NA | NA | 57 | 56 | 9.5 | 
| 2.2 | NA | NA | NA | NA | NA | NA | NA | 56 | 57 | 11.3 | 
方法一會依據相同變數Murder合併資料,但保留了missing data。
# merge two files together by matching row.names
q2dta3<- merge(state.x77, USArrests, by="row.names", all=T)
q2dta3|> head() |> knitr::kable()| Row.names | Population | Income | Illiteracy | Life Exp | Murder.x | HS Grad | Frost | Area | Murder.y | Assault | UrbanPop | Rape | 
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Alabama | 3615 | 3624 | 2.1 | 69.05 | 15.1 | 41.3 | 20 | 50708 | 13.2 | 236 | 58 | 21.2 | 
| Alaska | 365 | 6315 | 1.5 | 69.31 | 11.3 | 66.7 | 152 | 566432 | 10.0 | 263 | 48 | 44.5 | 
| Arizona | 2212 | 4530 | 1.8 | 70.55 | 7.8 | 58.1 | 15 | 113417 | 8.1 | 294 | 80 | 31.0 | 
| Arkansas | 2110 | 3378 | 1.9 | 70.66 | 10.1 | 39.9 | 65 | 51945 | 8.8 | 190 | 50 | 19.5 | 
| California | 21198 | 5114 | 1.1 | 71.71 | 10.3 | 62.6 | 20 | 156361 | 9.0 | 276 | 91 | 40.6 | 
| Colorado | 2541 | 4884 | 0.7 | 72.06 | 6.8 | 63.9 | 166 | 103766 | 7.9 | 204 | 78 | 38.7 | 
方法二依據50州的州名(row.name)合併資料。因有兩個相同變數,R自動命名為Murder.x和Murder.y。
#data.frame將兩類資料放在同一個框架
q2dta4<-data.frame(state.x77, USArrests)
q2dta4|> head() |> knitr::kable()| Population | Income | Illiteracy | Life.Exp | Murder | HS.Grad | Frost | Area | Murder.1 | Assault | UrbanPop | Rape | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Alabama | 3615 | 3624 | 2.1 | 69.05 | 15.1 | 41.3 | 20 | 50708 | 13.2 | 236 | 58 | 21.2 | 
| Alaska | 365 | 6315 | 1.5 | 69.31 | 11.3 | 66.7 | 152 | 566432 | 10.0 | 263 | 48 | 44.5 | 
| Arizona | 2212 | 4530 | 1.8 | 70.55 | 7.8 | 58.1 | 15 | 113417 | 8.1 | 294 | 80 | 31.0 | 
| Arkansas | 2110 | 3378 | 1.9 | 70.66 | 10.1 | 39.9 | 65 | 51945 | 8.8 | 190 | 50 | 19.5 | 
| California | 21198 | 5114 | 1.1 | 71.71 | 10.3 | 62.6 | 20 | 156361 | 9.0 | 276 | 91 | 40.6 | 
| Colorado | 2541 | 4884 | 0.7 | 72.06 | 6.8 | 63.9 | 166 | 103766 | 7.9 | 204 | 78 | 38.7 | 
使用data.frame()合併資料,合併資料的方式也是依據50州的州名,但合併後州名row.name不是變數。
cor(q2dta1)|> knitr::kable(digits = 2)| Murder | Population | Income | Illiteracy | Life Exp | HS Grad | Frost | Area | Assault | UrbanPop | Rape | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Murder | 1.00 | 0.33 | -0.10 | 0.80 | -0.74 | -0.59 | -0.53 | 0.37 | 0.81 | -0.11 | 0.72 | 
| Population | 0.33 | 1.00 | -0.02 | 0.12 | 0.05 | -0.40 | -0.36 | -0.09 | 0.41 | 0.57 | 0.56 | 
| Income | -0.10 | -0.02 | 1.00 | -0.46 | 0.22 | 0.66 | 0.36 | 0.57 | 0.28 | 0.09 | 0.02 | 
| Illiteracy | 0.80 | 0.12 | -0.46 | 1.00 | -0.75 | -0.68 | -0.67 | 0.18 | 0.51 | -0.35 | 0.50 | 
| Life Exp | -0.74 | 0.05 | 0.22 | -0.75 | 1.00 | 0.66 | 0.41 | -0.21 | -0.60 | 0.25 | -0.36 | 
| HS Grad | -0.59 | -0.40 | 0.66 | -0.68 | 0.66 | 1.00 | 0.60 | 0.36 | -0.37 | -0.13 | -0.41 | 
| Frost | -0.53 | -0.36 | 0.36 | -0.67 | 0.41 | 0.60 | 1.00 | 0.12 | -0.28 | 0.19 | -0.46 | 
| Area | 0.37 | -0.09 | 0.57 | 0.18 | -0.21 | 0.36 | 0.12 | 1.00 | 0.54 | -0.07 | 0.50 | 
| Assault | 0.81 | 0.41 | 0.28 | 0.51 | -0.60 | -0.37 | -0.28 | 0.54 | 1.00 | 0.14 | 0.69 | 
| UrbanPop | -0.11 | 0.57 | 0.09 | -0.35 | 0.25 | -0.13 | 0.19 | -0.07 | 0.14 | 1.00 | 0.24 | 
| Rape | 0.72 | 0.56 | 0.02 | 0.50 | -0.36 | -0.41 | -0.46 | 0.50 | 0.69 | 0.24 | 1.00 | 
cor(q2dta2)|> knitr::kable(digits = 2)| Murder | Population | Income | Illiteracy | Life Exp | HS Grad | Frost | Area | Assault | UrbanPop | Rape | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Murder | 1 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 
| Population | NA | 1 | NA | NA | NA | NA | NA | NA | NA | NA | NA | 
| Income | NA | NA | 1 | NA | NA | NA | NA | NA | NA | NA | NA | 
| Illiteracy | NA | NA | NA | 1 | NA | NA | NA | NA | NA | NA | NA | 
| Life Exp | NA | NA | NA | NA | 1 | NA | NA | NA | NA | NA | NA | 
| HS Grad | NA | NA | NA | NA | NA | 1 | NA | NA | NA | NA | NA | 
| Frost | NA | NA | NA | NA | NA | NA | 1 | NA | NA | NA | NA | 
| Area | NA | NA | NA | NA | NA | NA | NA | 1 | NA | NA | NA | 
| Assault | NA | NA | NA | NA | NA | NA | NA | NA | 1 | NA | NA | 
| UrbanPop | NA | NA | NA | NA | NA | NA | NA | NA | NA | 1 | NA | 
| Rape | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 1 | 
有missing value,無法算correlation
cor(q2dta3[,-1])|> knitr::kable(digits = 2) #[,-1]| Population | Income | Illiteracy | Life Exp | Murder.x | HS Grad | Frost | Area | Murder.y | Assault | UrbanPop | Rape | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Population | 1.00 | 0.21 | 0.11 | -0.07 | 0.34 | -0.10 | -0.33 | 0.02 | 0.32 | 0.32 | 0.51 | 0.31 | 
| Income | 0.21 | 1.00 | -0.44 | 0.34 | -0.23 | 0.62 | 0.23 | 0.36 | -0.22 | 0.04 | 0.48 | 0.36 | 
| Illiteracy | 0.11 | -0.44 | 1.00 | -0.59 | 0.70 | -0.66 | -0.67 | 0.08 | 0.71 | 0.51 | -0.06 | 0.15 | 
| Life Exp | -0.07 | 0.34 | -0.59 | 1.00 | -0.78 | 0.58 | 0.26 | -0.11 | -0.78 | -0.63 | 0.27 | -0.27 | 
| Murder.x | 0.34 | -0.23 | 0.70 | -0.78 | 1.00 | -0.49 | -0.54 | 0.23 | 0.93 | 0.74 | 0.02 | 0.58 | 
| HS Grad | -0.10 | 0.62 | -0.66 | 0.58 | -0.49 | 1.00 | 0.37 | 0.33 | -0.52 | -0.23 | 0.36 | 0.27 | 
| Frost | -0.33 | 0.23 | -0.67 | 0.26 | -0.54 | 0.37 | 1.00 | 0.06 | -0.54 | -0.47 | -0.25 | -0.28 | 
| Area | 0.02 | 0.36 | 0.08 | -0.11 | 0.23 | 0.33 | 0.06 | 1.00 | 0.15 | 0.23 | -0.06 | 0.52 | 
| Murder.y | 0.32 | -0.22 | 0.71 | -0.78 | 0.93 | -0.52 | -0.54 | 0.15 | 1.00 | 0.80 | 0.07 | 0.56 | 
| Assault | 0.32 | 0.04 | 0.51 | -0.63 | 0.74 | -0.23 | -0.47 | 0.23 | 0.80 | 1.00 | 0.26 | 0.67 | 
| UrbanPop | 0.51 | 0.48 | -0.06 | 0.27 | 0.02 | 0.36 | -0.25 | -0.06 | 0.07 | 0.26 | 1.00 | 0.41 | 
| Rape | 0.31 | 0.36 | 0.15 | -0.27 | 0.58 | 0.27 | -0.28 | 0.52 | 0.56 | 0.67 | 0.41 | 1.00 | 
Error in cor(q2dta3) : ‘x’ 必須是數值,因此使用[,-1]排除第一行州名
cor(q2dta4)|> knitr::kable(digits = 2)| Population | Income | Illiteracy | Life.Exp | Murder | HS.Grad | Frost | Area | Murder.1 | Assault | UrbanPop | Rape | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Population | 1.00 | 0.21 | 0.11 | -0.07 | 0.34 | -0.10 | -0.33 | 0.02 | 0.32 | 0.32 | 0.51 | 0.31 | 
| Income | 0.21 | 1.00 | -0.44 | 0.34 | -0.23 | 0.62 | 0.23 | 0.36 | -0.22 | 0.04 | 0.48 | 0.36 | 
| Illiteracy | 0.11 | -0.44 | 1.00 | -0.59 | 0.70 | -0.66 | -0.67 | 0.08 | 0.71 | 0.51 | -0.06 | 0.15 | 
| Life.Exp | -0.07 | 0.34 | -0.59 | 1.00 | -0.78 | 0.58 | 0.26 | -0.11 | -0.78 | -0.63 | 0.27 | -0.27 | 
| Murder | 0.34 | -0.23 | 0.70 | -0.78 | 1.00 | -0.49 | -0.54 | 0.23 | 0.93 | 0.74 | 0.02 | 0.58 | 
| HS.Grad | -0.10 | 0.62 | -0.66 | 0.58 | -0.49 | 1.00 | 0.37 | 0.33 | -0.52 | -0.23 | 0.36 | 0.27 | 
| Frost | -0.33 | 0.23 | -0.67 | 0.26 | -0.54 | 0.37 | 1.00 | 0.06 | -0.54 | -0.47 | -0.25 | -0.28 | 
| Area | 0.02 | 0.36 | 0.08 | -0.11 | 0.23 | 0.33 | 0.06 | 1.00 | 0.15 | 0.23 | -0.06 | 0.52 | 
| Murder.1 | 0.32 | -0.22 | 0.71 | -0.78 | 0.93 | -0.52 | -0.54 | 0.15 | 1.00 | 0.80 | 0.07 | 0.56 | 
| Assault | 0.32 | 0.04 | 0.51 | -0.63 | 0.74 | -0.23 | -0.47 | 0.23 | 0.80 | 1.00 | 0.26 | 0.67 | 
| UrbanPop | 0.51 | 0.48 | -0.06 | 0.27 | 0.02 | 0.36 | -0.25 | -0.06 | 0.07 | 0.26 | 1.00 | 0.41 | 
| Rape | 0.31 | 0.36 | 0.15 | -0.27 | 0.58 | 0.27 | -0.28 | 0.52 | 0.56 | 0.67 | 0.41 | 1.00 | 
與文盲高度正相關的變數為謀殺犯罪率。 cor(Illiteracy,Murder)=0.70; cor(Illiteracy,Murder1)=0.71。
與文盲高度負相關的變數為HS.Grad、Frost。 cor(Illiteracy,HS.Grad)=-0.66; cor(Illiteracy,Frost)=-0.67。
與謀殺犯罪率高度負相關的變數為Life.Exp(生活期待) cor(Murder,Life.Exp)=-0.78 cor(Murder1,Life.Exp)=-0.78
與暴力高度負相關的變數為Life.Exp(生活期待) cor(Assault,Life.Exp)=-0.63
與暴力高度正相關的變數為謀殺犯罪率、強暴。 cor(Assault,Murder)=0.74 cor(Assault,Murder1)=0.80 cor(Assault,Rape)=0.67
暴力與區域低度相關。 cor(Area,Rape)=0.56
pacman::p_load(corrplot,RColorBrewer)
corrplot(cor(q2dta4), type="upper", order="hclust",
         col=brewer.pal(n=8, name="RdYlBu"))Summarize the backpain{HSAUR3} into the following format:  You should provide comments for each code chunk.
# install package
pacman::p_load(HSAUR3)
# load data
data(backpain,package = "HSAUR3")
#copy data
q3dta<-backpain
# show data
q3dta |> head() |> knitr::kable()| ID | status | driver | suburban | 
|---|---|---|---|
| 1 | case | yes | yes | 
| 1 | control | yes | no | 
| 2 | case | yes | yes | 
| 2 | control | yes | yes | 
| 3 | case | yes | no | 
| 3 | control | yes | yes | 
# exam data str
str(q3dta)'data.frame':   434 obs. of  4 variables:
 $ ID      : Factor w/ 217 levels "1","2","3","4",..: 1 1 2 2 3 3 4 4 5 5 ...
 $ status  : Factor w/ 2 levels "case","control": 1 2 1 2 1 2 1 2 1 2 ...
 $ driver  : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 1 1 2 2 ...
 $ suburban: Factor w/ 2 levels "no","yes": 2 1 2 2 1 2 1 1 1 2 ...class(q3dta)[1] "data.frame"typeof(q3dta)[1] "list"summary(q3dta)       ID          status    driver    suburban 
 1      :  2   case   :217   no : 86   no :200  
 2      :  2   control:217   yes:348   yes:234  
 3      :  2                                    
 4      :  2                                    
 5      :  2                                    
 6      :  2                                    
 (Other):422                                    # cross-tabulate status by driver
q3tab1<-with(q3dta, table(driver,status))
q3tab1      status
driver case control
   no    32      54
   yes  185     163# cross-tabulate status by suburban
q3tab2<-with(q3dta, table(suburban,status))
q3tab2        status
suburban case control
     no    90     110
     yes  127     107# three-way table()
q3tab3<-with(q3dta, table(driver, suburban, status))
q3tab3, , status = case
      suburban
driver  no yes
   no   26   6
   yes  64 121
, , status = control
      suburban
driver  no yes
   no   47   7
   yes  63 100# cross-tabulate status by suburban given driver
q3tab4<-with(q3dta, ftable(driver, suburban, status))
q3tab4                status case control
driver suburban                    
no     no                26      47
       yes                6       7
yes    no                64      63
       yes              121     100q3tab4很接近題目要求的format,接下來要依據driver、suburban來分群計算Total。
str(q3tab4) 'ftable' int [1:4, 1:2] 26 6 64 121 47 7 63 100
 - attr(*, "row.vars")=List of 2
  ..$ driver  : chr [1:2] "no" "yes"
  ..$ suburban: chr [1:2] "no" "yes"
 - attr(*, "col.vars")=List of 1
  ..$ status: chr [1:2] "case" "control"#轉換資料型態
q3tab<-as.data.frame(q3tab4)
q3tab  driver suburban  status Freq
1     no       no    case   26
2    yes       no    case   64
3     no      yes    case    6
4    yes      yes    case  121
5     no       no control   47
6    yes       no control   63
7     no      yes control    7
8    yes      yes control  100#要用aggregate?還是reshape?
#aggregate(Freq~driver+suburban+status, by=list(status,suburban), FUN=sum)
#aggregate(cbind(driver,suburban) ~ status, data=q3tab, FUN=sum)
# long to wide format
reshape(q3tab,
        idvar="suburban",
        timevar="status",
        v.names="driver",
        direction="wide")  suburban Freq driver.case driver.control
1       no   26          no             no
3      yes    6          no             no這結果不是我要的….
#Summarize a transformed variable
pacman::p_load(dplyr, magrittr ,tidyr)
#把case從status撈出來
q3tab_case<-q3tab |>
 filter(status=='case')|>
 dplyr::rename(Case=Freq)
    
#把control從status撈出來
q3tab_control<-q3tab |>
 filter(status=='control')|>
 dplyr::rename(Control=Freq)
#merge後再篩選變數
q3dta_merge<-merge(x =q3tab_case, y = q3tab_control, by = c("driver","suburban"))|>
   select(-c(3,5))
#新增變數total,然後水平加總Case+Control
q3answer<-mutate(q3dta_merge,total=Case+Control)
q3answer  driver suburban Case Control total
1     no       no   26      47    73
2     no      yes    6       7    13
3    yes       no   64      63   127
4    yes      yes  121     100   221終於用笨方法得到答案了。
The data set Vocab{carData} gives observations on gender, education and vocabulary, from respondents to U.S. General Social Surveys, 1972-2004.
Summarize the relationship between education and vocabulary over the years by gender.
# Save a copy of data as tibble
q4dta <- as_tibble(carData::Vocab)
# show data
q4dta |> head() |> knitr::kable()| year | sex | education | vocabulary | 
|---|---|---|---|
| 1974 | Male | 14 | 9 | 
| 1974 | Male | 16 | 9 | 
| 1974 | Female | 10 | 9 | 
| 1974 | Female | 10 | 5 | 
| 1974 | Female | 12 | 8 | 
| 1974 | Male | 16 | 8 | 
#rename
q4dta %<>%
 dplyr::rename(Year=year, Gender=sex, 
               Edu=education, Voc=vocabulary)#依據講義tidy_01 p.23
q4dta %>%
 lattice::xyplot(Edu ~ Voc | Year, 
        groups=Gender, 
        type=c("p","g","r"), 
        data=., 
        xlab="Vocabulary", 
        ylab="Education", 
        auto.key=list(columns=2))#q4dta$Year %>% 
    #as.factor()%>% 
#lattice::xyplot(Edu ~ Voc | Year, 
       # groups=Gender, 
       # type=c("p","g","r"), 
       # data=., 
       # xlab="Vocabulary", 
       # ylab="Education", 
       # auto.key=list(columns=2))將Year轉成factor,會出現error message。 Error in eval(substitute(groups), data, environment(x)) : numeric ‘envir’ arg not of length one
Supply comments to each code chunk in the following survey rmarkdown file and preview it as an R notebook or knit to html.
The data set concerns species and weight of animals caught in plots in a study area in Arizona over time.
Each row holds information for a single animal, and the columns represent:
# 下載並安裝tidyverse package
pacman::p_load(tidyverse)# 從網路讀取csv檔
q5dta <- read_csv("http://kbroman.org/datacarp/portal_data_joined.csv")# 資料檢視
glimpse(q5dta)Rows: 34,786
Columns: 13
$ record_id       <dbl> 1, 72, 224, 266, 349, 363, 435, 506, 588, 661, 748, 84~
$ month           <dbl> 7, 8, 9, 10, 11, 11, 12, 1, 2, 3, 4, 5, 6, 8, 9, 10, 1~
$ day             <dbl> 16, 19, 13, 16, 12, 12, 10, 8, 18, 11, 8, 6, 9, 5, 4, ~
$ year            <dbl> 1977, 1977, 1977, 1977, 1977, 1977, 1977, 1978, 1978, ~
$ plot_id         <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
$ species_id      <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", ~
$ sex             <chr> "M", "M", NA, NA, NA, NA, NA, NA, "M", NA, NA, "M", "M~
$ hindfoot_length <dbl> 32, 31, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, NA, 34~
$ weight          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 218, NA, NA, 204, 200,~
$ genus           <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Neotoma",~
$ species         <chr> "albigula", "albigula", "albigula", "albigula", "albig~
$ taxa            <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent", "Rod~
$ plot_type       <chr> "Control", "Control", "Control", "Control", "Control",~#資料維度
dim(q5dta)[1] 34786    13#select()篩選變數
dplyr::select(q5dta, plot_id, species_id, weight) %>% head()# A tibble: 6 x 3
  plot_id species_id weight
    <dbl> <chr>       <dbl>
1       2 NL             NA
2       2 NL             NA
3       2 NL             NA
4       2 NL             NA
5       2 NL             NA
6       2 NL             NA#select()也可以刪除不要的變數,在變數前加-
dplyr::select(q5dta, -record_id, -species_id) %>% head()# A tibble: 6 x 11
  month   day  year plot_id sex   hindfoot_length weight genus   species  taxa  
  <dbl> <dbl> <dbl>   <dbl> <chr>           <dbl>  <dbl> <chr>   <chr>    <chr> 
1     7    16  1977       2 M                  32     NA Neotoma albigula Rodent
2     8    19  1977       2 M                  31     NA Neotoma albigula Rodent
3     9    13  1977       2 <NA>               NA     NA Neotoma albigula Rodent
4    10    16  1977       2 <NA>               NA     NA Neotoma albigula Rodent
5    11    12  1977       2 <NA>               NA     NA Neotoma albigula Rodent
6    11    12  1977       2 <NA>               NA     NA Neotoma albigula Rodent
# ... with 1 more variable: plot_type <chr>#filter()選出要分析的觀察值:year == 1995
dplyr::filter(q5dta, year == 1995) %>% head() #然後只看前六筆# A tibble: 6 x 13
  record_id month   day  year plot_id species_id sex   hindfoot_length weight
      <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
1     22314     6     7  1995       2 NL         M                  34     NA
2     22728     9    23  1995       2 NL         F                  32    165
3     22899    10    28  1995       2 NL         F                  32    171
4     23032    12     2  1995       2 NL         F                  33     NA
5     22003     1    11  1995       2 DM         M                  37     41
6     22042     2     4  1995       2 DM         F                  36     45
# ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
#   plot_type <chr>#選擇四個變數:體重(要<=5公斤)、species_id、sex、weight
#然後只看前六筆
head(dplyr::select(dplyr::filter(q5dta, weight <= 5), species_id, sex, weight))# A tibble: 6 x 3
  species_id sex   weight
  <chr>      <chr>  <dbl>
1 PF         M          5
2 PF         F          5
3 PF         F          5
4 PF         F          4
5 PF         F          5
6 PF         F          4q5dta %>% 
  dplyr::filter(weight <= 5) %>% #篩選觀察值的體重小於等於5
  dplyr::select(species_id, sex, weight) %>% #select變數
  head# A tibble: 6 x 3
  species_id sex   weight
  <chr>      <chr>  <dbl>
1 PF         M          5
2 PF         F          5
3 PF         F          5
4 PF         F          4
5 PF         F          5
6 PF         F          4q5dta %>% 
  mutate(weight_kg = weight / 1000, #新增變數weight_kg、weight_lb
         weight_lb = weight_kg * 2.2) %>% 
  head()# A tibble: 6 x 15
  record_id month   day  year plot_id species_id sex   hindfoot_length weight
      <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
1         1     7    16  1977       2 NL         M                  32     NA
2        72     8    19  1977       2 NL         M                  31     NA
3       224     9    13  1977       2 NL         <NA>               NA     NA
4       266    10    16  1977       2 NL         <NA>               NA     NA
5       349    11    12  1977       2 NL         <NA>               NA     NA
6       363    11    12  1977       2 NL         <NA>               NA     NA
# ... with 6 more variables: genus <chr>, species <chr>, taxa <chr>,
#   plot_type <chr>, weight_kg <dbl>, weight_lb <dbl>q5dta %>% 
  filter(!is.na(weight)) %>% #篩選出在is.na()檢查回傳FALSE的元素
  group_by(sex, species_id) %>% #分群條件
  summarize(mean_weight = mean(weight)) %>% #計算mean_weight
  arrange(desc(mean_weight)) %>% #由高到低排列
  head() #檢視前六筆# A tibble: 6 x 3
# Groups:   sex [3]
  sex   species_id mean_weight
  <chr> <chr>            <dbl>
1 <NA>  NL                168.
2 M     NL                166.
3 F     NL                154.
4 M     SS                130 
5 <NA>  SH                130 
6 M     DS                122.q5dta %>%
  group_by(sex) %>% #依照性別分群
  tally #計算總數量# A tibble: 3 x 2
  sex       n
  <chr> <int>
1 F     15690
2 M     17348
3 <NA>   1748#方法二
q5dta %>%
  group_by(sex) %>% #依照性別分群
  summarise(total=n()) #計算總數量# A tibble: 3 x 2
  sex   total
  <chr> <int>
1 F     15690
2 M     17348
3 <NA>   1748q5dta %>%
  count(sex) #依據性別計算總數量# A tibble: 3 x 2
  sex       n
  <chr> <int>
1 F     15690
2 M     17348
3 <NA>   1748q5dta %>%
  group_by(sex) %>% #先依照性別分群
  summarize(count = n()) #再計算總數# A tibble: 3 x 2
  sex   count
  <chr> <int>
1 F     15690
2 M     17348
3 <NA>   1748q5dta %>%
  group_by(sex) %>% #先依照性別分群
  summarize(count = sum(!is.na(year))) #再計算沒有遺漏值的year數量總數# A tibble: 3 x 2
  sex   count
  <chr> <int>
1 F     15690
2 M     17348
3 <NA>   1748答案都一樣!?
dta_gw <- q5dta %>% 
  filter(!is.na(weight)) %>% #篩選出沒有遺漏值的weight列
  group_by(genus, plot_id) %>% #依據genus與plot_id分群
  summarize(mean_weight = mean(weight)) #計算體重平均mean(weight)glimpse(dta_gw) #檢視資料Rows: 196
Columns: 3
Groups: genus [10]
$ genus       <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
$ plot_id     <dbl> 1, 2, 3, 5, 18, 19, 20, 21, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,~
$ mean_weight <dbl> 7.000000, 6.000000, 8.611111, 7.750000, 9.500000, 9.533333~#long format to wide format
#spread(資料框/長表,key="要展開的欄位名稱",value="數值欄位名稱")
dta_w <- dta_gw %>%
  spread(key = genus, value = mean_weight) #
head(dta_w)# A tibble: 6 x 11
  plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
    <dbl>   <dbl>       <dbl>     <dbl>   <dbl>     <dbl>       <dbl>      <dbl>
1       1    7           22.2      60.2    156.      27.7        9.62       22.2
2       2    6           25.1      55.7    169.      26.9        6.95       22.3
3       3    8.61        24.6      52.0    158.      26.0        7.51       21.4
4       4   NA           23.0      57.5    164.      28.1        7.82       22.6
5       5    7.75        18.0      51.1    190.      27.0        8.66       21.2
6       6   NA           24.9      58.6    180.      25.9        7.81       21.8
# ... with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
#   Spermophilus <dbl>glimpse(dta_w) #資料檢視Rows:24 > Columns:11Rows: 24
Columns: 11
$ plot_id         <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,~
$ Baiomys         <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA~
$ Chaetodipus     <dbl> 22.19939, 25.11014, 24.63636, 23.02381, 17.98276, 24.8~
$ Dipodomys       <dbl> 60.23214, 55.68259, 52.04688, 57.52454, 51.11356, 58.6~
$ Neotoma         <dbl> 156.2222, 169.1436, 158.2414, 164.1667, 190.0370, 179.~
$ Onychomys       <dbl> 27.67550, 26.87302, 26.03241, 28.09375, 27.01695, 25.8~
$ Perognathus     <dbl> 9.625000, 6.947368, 7.507812, 7.824427, 8.658537, 7.80~
$ Peromyscus      <dbl> 22.22222, 22.26966, 21.37037, 22.60000, 21.23171, 21.8~
$ Reithrodontomys <dbl> 11.375000, 10.680556, 10.516588, 10.263158, 11.154545,~
$ Sigmodon        <dbl> NA, 70.85714, 65.61404, 82.00000, 82.66667, 68.77778, ~
$ Spermophilus    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 13~dta_gw %>%
  spread(genus, mean_weight, fill = 0) %>% #fill = 0:If set, missing values will be replaced with this value.
  head()# A tibble: 6 x 11
  plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
    <dbl>   <dbl>       <dbl>     <dbl>   <dbl>     <dbl>       <dbl>      <dbl>
1       1    7           22.2      60.2    156.      27.7        9.62       22.2
2       2    6           25.1      55.7    169.      26.9        6.95       22.3
3       3    8.61        24.6      52.0    158.      26.0        7.51       21.4
4       4    0           23.0      57.5    164.      28.1        7.82       22.6
5       5    7.75        18.0      51.1    190.      27.0        8.66       21.2
6       6    0           24.9      58.6    180.      25.9        7.81       21.8
# ... with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
#   Spermophilus <dbl>#寬表轉長表 
#gather(資料框/寬表,key="主鍵欄位名稱",value="數值欄位名稱",要轉換的資料1,要轉換的資料2,...)
dta_l <- dta_w %>%
  gather(key = genus, value = mean_weight, -plot_id) #-plot_id表示除去plot_id,就只轉置剩下兩列
head(dta_l)# A tibble: 6 x 3
  plot_id genus   mean_weight
    <dbl> <chr>         <dbl>
1       1 Baiomys        7   
2       2 Baiomys        6   
3       3 Baiomys        8.61
4       4 Baiomys       NA   
5       5 Baiomys        7.75
6       6 Baiomys       NA   glimpse(dta_l) #看一下Rows數是不是有大於Columns數Rows: 240
Columns: 3
$ plot_id     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,~
$ genus       <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
$ mean_weight <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA, NA~#指定gather的變數類別Baiomys:Spermophilus
dta_w %>%
  gather(key = genus, value = mean_weight, Baiomys:Spermophilus) %>% 
  head()# A tibble: 6 x 3
  plot_id genus   mean_weight
    <dbl> <chr>         <dbl>
1       1 Baiomys        7   
2       2 Baiomys        6   
3       3 Baiomys        8.61
4       4 Baiomys       NA   
5       5 Baiomys        7.75
6       6 Baiomys       NA   #丟掉weight、hindfoot_length、sex的遺漏值
dta_complete <- q5dta %>%
  filter(!is.na(weight),           
         !is.na(hindfoot_length),  
         !is.na(sex))
head(dta_complete)# A tibble: 6 x 13
  record_id month   day  year plot_id species_id sex   hindfoot_length weight
      <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
1       845     5     6  1978       2 NL         M                  32    204
2      1164     8     5  1978       2 NL         M                  34    199
3      1261     9     4  1978       2 NL         M                  32    197
4      1756     4    29  1979       2 NL         M                  33    166
5      1818     5    30  1979       2 NL         M                  32    184
6      1882     7     4  1979       2 NL         M                  32    206
# ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
#   plot_type <chr>species_counts <- dta_complete %>%
    count(species_id) %>% #依據species_id計算總數
    filter(n >= 50) #篩選出n >= 50
head(species_counts)# A tibble: 6 x 2
  species_id     n
  <chr>      <int>
1 DM          9727
2 DO          2790
3 DS          2023
4 NL          1045
5 OL           905
6 OT          2081#The %in% operator in R can be used to identify if an element (e.g., a number) belongs to a vector or dataframe. 
dta_complete <- dta_complete %>%
  filter(species_id %in% species_counts$species_id) 
head(dta_complete)# A tibble: 6 x 13
  record_id month   day  year plot_id species_id sex   hindfoot_length weight
      <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
1       845     5     6  1978       2 NL         M                  32    204
2      1164     8     5  1978       2 NL         M                  34    199
3      1261     9     4  1978       2 NL         M                  32    197
4      1756     4    29  1979       2 NL         M                  33    166
5      1818     5    30  1979       2 NL         M                  32    184
6      1882     7     4  1979       2 NL         M                  32    206
# ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
#   plot_type <chr>