Q1

Comment on what each code chunk in the following classroom markdown fileis trying to achive and on its output.

Data

pacman::p_load(WWGbook)
data(classroom, package="WWGbook")
#用?查詢classroom資料集
?classroom 

R右下角視窗會跳出這個data set的內容說明。

The Study of Instructional Improvement (SII; Hill, Rowan, and Ball, 2004) was carried out by researchers at the University of Michigan to study the math achievement scores of first- and third-grade students in randomly selected classrooms from a national U.S. sample of elementary schools.

#exam data structure
str(classroom)
'data.frame':   1190 obs. of  12 variables:
 $ sex     : int  1 0 1 0 0 1 0 0 1 0 ...
 $ minority: int  1 1 1 1 1 1 1 1 1 1 ...
 $ mathkind: int  448 460 511 449 425 450 452 443 422 480 ...
 $ mathgain: int  32 109 56 83 53 65 51 66 88 -7 ...
 $ ses     : num  0.46 -0.27 -0.03 -0.38 -0.03 0.76 -0.03 0.2 0.64 0.13 ...
 $ yearstea: num  1 1 1 2 2 2 2 2 2 2 ...
 $ mathknow: num  NA NA NA -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 ...
 $ housepov: num  0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 ...
 $ mathprep: num  2 2 2 3.25 3.25 3.25 3.25 3.25 3.25 3.25 ...
 $ classid : int  160 160 160 217 217 217 217 217 217 217 ...
 $ schoolid: int  1 1 1 1 1 1 1 1 1 1 ...
 $ childid : int  1 2 3 4 5 6 7 8 9 10 ...

Split data

dta_schl <- classroom[classroom$schoolid,c("schoolid", "housepov")]  #vector中的元素是重複的
head(dta_schl)
    schoolid housepov
1          1    0.082
1.1        1    0.082
1.2        1    0.082
1.3        1    0.082
1.4        1    0.082
1.5        1    0.082
#duplicated=FALSE設定邏輯向量,回傳c("schoolid", "housepov")不重複的元素 
dta_schl <- classroom[duplicated(classroom$schoolid)==FALSE, c("schoolid", "housepov")]
head(dta_schl,10)
   schoolid housepov
1         1    0.082
12        2    0.082
22        3    0.086
36        4    0.365
42        5    0.511
48        6    0.044
60        7    0.148
74        8    0.085
90        9    0.537
96       10    0.346
#依序回傳c(11, 10, 6,7,9)這五個變數不重複的元素
dta_cls <- classroom[duplicated(classroom$classid)==FALSE,  c(11, 10, 6,7,9)] 
head(dta_cls)
   schoolid classid yearstea mathknow mathprep
1         1     160     1.00       NA     2.00
4         1     217     2.00    -0.11     3.25
12        2     197     1.00    -1.25     2.50
14        2     211     2.00    -0.72     2.33
18        2     307    12.54       NA     2.30
22        3      11    20.00     0.45     3.83
#只取c(12, 10, 11, 1:5)這8個變數,且指定順序
dta_chld <- classroom[, c(12, 10, 11, 1:5)]
head(dta_chld)
  childid classid schoolid sex minority mathkind mathgain   ses
1       1     160        1   1        1      448       32  0.46
2       2     160        1   0        1      460      109 -0.27
3       3     160        1   1        1      511       56 -0.03
4       4     217        1   0        1      449       83 -0.38
5       5     217        1   0        1      425       53 -0.03
6       6     217        1   1        1      450       65  0.76
#list(dta_schl, dta_cls, dta_chld)將不同類型的三個向量放到一個object中
#sapply()功能與lapply()相同
#檢索object的dim維度

sapply(list(dta_schl, dta_cls, dta_chld), dim)
     [,1] [,2] [,3]
[1,]  107  312 1190
[2,]    2    5    8

dta_schl這個data.frame中2個維度(變數),107個觀察值。

dta_cls這個data.frame中5個變數,312個觀察值。

dta_chld這個data.frame中8個變數,1190個觀察值。

Combine data

#依據classid、schoolid來combine dta_chld和dta_cls這兩個data.frame
dta_12 <- merge(x=dta_chld, y=dta_cls, by=c("classid", "schoolid"))
head(dta_12)
  classid schoolid childid sex minority mathkind mathgain   ses yearstea
1       1       61     653   0        1      442       35 -0.09        2
2       1       61     654   0        1      422      109 -1.32        2
3       1       61     655   1        1      482       66  0.43        2
4       1       61     656   1        1      489       19 -0.06        2
5       1       61     657   0        1      460       10 -1.28        2
6      10       17     216   1        1      381      143 -0.49        5
  mathknow mathprep
1    -0.72     2.50
2    -0.72     2.50
3    -0.72     2.50
4    -0.72     2.50
5    -0.72     2.50
6    -1.26     3.25
#依據classid、schoolid來combine dta_chld和dta_cls這兩個data.frame
dta_13 <- merge(x=dta_chld, y=dta_cls, by=c("classid", "schoolid"))
#依據schoolid來combine dta_chld和dta_cls
dta_23 <- merge(x=dta_cls, y=dta_schl, by="schoolid")
#依據schoolid來combine dta_12和dta_schl
dta_123 <- merge(x=dta_12, y=dta_schl, by=c("schoolid"))
#合併後的data frame分別有多少觀察值(obs)與維度(變數)
sapply(list(dta_12, dta_13, dta_23, dta_123), dim)
     [,1] [,2] [,3] [,4]
[1,] 1190 1190  312 1190
[2,]   11   11    6   12

Q2

Merge the two data sets: state.x77{datasets} and USArrests{datasets} and compute all pair-wise correlations for numerical variables.

Is there anything interesting to report?

load dataset1:state.x77

#show dataset
state.x77 |> head() |> knitr::kable()
Population Income Illiteracy Life Exp Murder HS Grad Frost Area
Alabama 3615 3624 2.1 69.05 15.1 41.3 20 50708
Alaska 365 6315 1.5 69.31 11.3 66.7 152 566432
Arizona 2212 4530 1.8 70.55 7.8 58.1 15 113417
Arkansas 2110 3378 1.9 70.66 10.1 39.9 65 51945
California 21198 5114 1.1 71.71 10.3 62.6 20 156361
Colorado 2541 4884 0.7 72.06 6.8 63.9 166 103766
#了解資料
str(state.x77)
 num [1:50, 1:8] 3615 365 2212 2110 21198 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
  ..$ : chr [1:8] "Population" "Income" "Illiteracy" "Life Exp" ...
class(state.x77)
[1] "matrix" "array" 
typeof(state.x77)
[1] "double"
comment(state.x77)
NULL

load dataset2:USArrests

#show dataset
USArrests |> head() |> knitr::kable()
Murder Assault UrbanPop Rape
Alabama 13.2 236 58 21.2
Alaska 10.0 263 48 44.5
Arizona 8.1 294 80 31.0
Arkansas 8.8 190 50 19.5
California 9.0 276 91 40.6
Colorado 7.9 204 78 38.7
#了解資料
str(USArrests)
'data.frame':   50 obs. of  4 variables:
 $ Murder  : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
 $ Assault : int  236 263 294 190 276 204 110 238 335 211 ...
 $ UrbanPop: int  58 48 80 50 91 78 77 72 80 60 ...
 $ Rape    : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
class(USArrests)
[1] "data.frame"
typeof(USArrests)
[1] "list"
comment(USArrests)
NULL

Merge data files

#Merge data files - default
q2dta1<-merge(state.x77, USArrests) 
q2dta1|> head() |> knitr::kable()
Murder Population Income Illiteracy Life Exp HS Grad Frost Area Assault UrbanPop Rape
2.7 1058 3694 0.7 70.39 54.7 161 30920 72 66 14.9
3.3 5814 4755 1.1 71.83 58.5 103 7826 110 77 11.1
3.3 812 4281 0.7 71.23 57.6 174 9027 110 77 11.1
4.3 3559 4864 0.6 71.72 63.5 32 66570 102 62 16.5
5.3 813 4119 0.6 71.87 59.5 126 82677 46 83 20.2
6.8 2541 4884 0.7 72.06 63.9 166 103766 161 60 15.6

依據相同變數Murder合併資料,且刪除missing data。

#Merge data files - all
q2dta2<-merge(state.x77, USArrests, all = TRUE) 
q2dta2|> head() |> knitr::kable()
Murder Population Income Illiteracy Life Exp HS Grad Frost Area Assault UrbanPop Rape
0.8 NA NA NA NA NA NA NA 45 44 7.3
1.4 637 5087 0.8 72.78 50.3 186 69273 NA NA NA
1.7 681 4167 0.5 72.08 53.3 172 75955 NA NA NA
2.1 NA NA NA NA NA NA NA 83 51 7.8
2.1 NA NA NA NA NA NA NA 57 56 9.5
2.2 NA NA NA NA NA NA NA 56 57 11.3

方法一會依據相同變數Murder合併資料,但保留了missing data。

# merge two files together by matching row.names
q2dta3<- merge(state.x77, USArrests, by="row.names", all=T)
q2dta3|> head() |> knitr::kable()
Row.names Population Income Illiteracy Life Exp Murder.x HS Grad Frost Area Murder.y Assault UrbanPop Rape
Alabama 3615 3624 2.1 69.05 15.1 41.3 20 50708 13.2 236 58 21.2
Alaska 365 6315 1.5 69.31 11.3 66.7 152 566432 10.0 263 48 44.5
Arizona 2212 4530 1.8 70.55 7.8 58.1 15 113417 8.1 294 80 31.0
Arkansas 2110 3378 1.9 70.66 10.1 39.9 65 51945 8.8 190 50 19.5
California 21198 5114 1.1 71.71 10.3 62.6 20 156361 9.0 276 91 40.6
Colorado 2541 4884 0.7 72.06 6.8 63.9 166 103766 7.9 204 78 38.7

方法二依據50州的州名(row.name)合併資料。因有兩個相同變數,R自動命名為Murder.x和Murder.y。

#data.frame將兩類資料放在同一個框架
q2dta4<-data.frame(state.x77, USArrests)
q2dta4|> head() |> knitr::kable()
Population Income Illiteracy Life.Exp Murder HS.Grad Frost Area Murder.1 Assault UrbanPop Rape
Alabama 3615 3624 2.1 69.05 15.1 41.3 20 50708 13.2 236 58 21.2
Alaska 365 6315 1.5 69.31 11.3 66.7 152 566432 10.0 263 48 44.5
Arizona 2212 4530 1.8 70.55 7.8 58.1 15 113417 8.1 294 80 31.0
Arkansas 2110 3378 1.9 70.66 10.1 39.9 65 51945 8.8 190 50 19.5
California 21198 5114 1.1 71.71 10.3 62.6 20 156361 9.0 276 91 40.6
Colorado 2541 4884 0.7 72.06 6.8 63.9 166 103766 7.9 204 78 38.7

使用data.frame()合併資料,合併資料的方式也是依據50州的州名,但合併後州名row.name不是變數。

Correlation

cor(q2dta1)|> knitr::kable(digits = 2)
Murder Population Income Illiteracy Life Exp HS Grad Frost Area Assault UrbanPop Rape
Murder 1.00 0.33 -0.10 0.80 -0.74 -0.59 -0.53 0.37 0.81 -0.11 0.72
Population 0.33 1.00 -0.02 0.12 0.05 -0.40 -0.36 -0.09 0.41 0.57 0.56
Income -0.10 -0.02 1.00 -0.46 0.22 0.66 0.36 0.57 0.28 0.09 0.02
Illiteracy 0.80 0.12 -0.46 1.00 -0.75 -0.68 -0.67 0.18 0.51 -0.35 0.50
Life Exp -0.74 0.05 0.22 -0.75 1.00 0.66 0.41 -0.21 -0.60 0.25 -0.36
HS Grad -0.59 -0.40 0.66 -0.68 0.66 1.00 0.60 0.36 -0.37 -0.13 -0.41
Frost -0.53 -0.36 0.36 -0.67 0.41 0.60 1.00 0.12 -0.28 0.19 -0.46
Area 0.37 -0.09 0.57 0.18 -0.21 0.36 0.12 1.00 0.54 -0.07 0.50
Assault 0.81 0.41 0.28 0.51 -0.60 -0.37 -0.28 0.54 1.00 0.14 0.69
UrbanPop -0.11 0.57 0.09 -0.35 0.25 -0.13 0.19 -0.07 0.14 1.00 0.24
Rape 0.72 0.56 0.02 0.50 -0.36 -0.41 -0.46 0.50 0.69 0.24 1.00
cor(q2dta2)|> knitr::kable(digits = 2)
Murder Population Income Illiteracy Life Exp HS Grad Frost Area Assault UrbanPop Rape
Murder 1 NA NA NA NA NA NA NA NA NA NA
Population NA 1 NA NA NA NA NA NA NA NA NA
Income NA NA 1 NA NA NA NA NA NA NA NA
Illiteracy NA NA NA 1 NA NA NA NA NA NA NA
Life Exp NA NA NA NA 1 NA NA NA NA NA NA
HS Grad NA NA NA NA NA 1 NA NA NA NA NA
Frost NA NA NA NA NA NA 1 NA NA NA NA
Area NA NA NA NA NA NA NA 1 NA NA NA
Assault NA NA NA NA NA NA NA NA 1 NA NA
UrbanPop NA NA NA NA NA NA NA NA NA 1 NA
Rape NA NA NA NA NA NA NA NA NA NA 1

有missing value,無法算correlation

cor(q2dta3[,-1])|> knitr::kable(digits = 2) #[,-1]
Population Income Illiteracy Life Exp Murder.x HS Grad Frost Area Murder.y Assault UrbanPop Rape
Population 1.00 0.21 0.11 -0.07 0.34 -0.10 -0.33 0.02 0.32 0.32 0.51 0.31
Income 0.21 1.00 -0.44 0.34 -0.23 0.62 0.23 0.36 -0.22 0.04 0.48 0.36
Illiteracy 0.11 -0.44 1.00 -0.59 0.70 -0.66 -0.67 0.08 0.71 0.51 -0.06 0.15
Life Exp -0.07 0.34 -0.59 1.00 -0.78 0.58 0.26 -0.11 -0.78 -0.63 0.27 -0.27
Murder.x 0.34 -0.23 0.70 -0.78 1.00 -0.49 -0.54 0.23 0.93 0.74 0.02 0.58
HS Grad -0.10 0.62 -0.66 0.58 -0.49 1.00 0.37 0.33 -0.52 -0.23 0.36 0.27
Frost -0.33 0.23 -0.67 0.26 -0.54 0.37 1.00 0.06 -0.54 -0.47 -0.25 -0.28
Area 0.02 0.36 0.08 -0.11 0.23 0.33 0.06 1.00 0.15 0.23 -0.06 0.52
Murder.y 0.32 -0.22 0.71 -0.78 0.93 -0.52 -0.54 0.15 1.00 0.80 0.07 0.56
Assault 0.32 0.04 0.51 -0.63 0.74 -0.23 -0.47 0.23 0.80 1.00 0.26 0.67
UrbanPop 0.51 0.48 -0.06 0.27 0.02 0.36 -0.25 -0.06 0.07 0.26 1.00 0.41
Rape 0.31 0.36 0.15 -0.27 0.58 0.27 -0.28 0.52 0.56 0.67 0.41 1.00

Error in cor(q2dta3) : ‘x’ 必須是數值,因此使用[,-1]排除第一行州名

cor(q2dta4)|> knitr::kable(digits = 2)
Population Income Illiteracy Life.Exp Murder HS.Grad Frost Area Murder.1 Assault UrbanPop Rape
Population 1.00 0.21 0.11 -0.07 0.34 -0.10 -0.33 0.02 0.32 0.32 0.51 0.31
Income 0.21 1.00 -0.44 0.34 -0.23 0.62 0.23 0.36 -0.22 0.04 0.48 0.36
Illiteracy 0.11 -0.44 1.00 -0.59 0.70 -0.66 -0.67 0.08 0.71 0.51 -0.06 0.15
Life.Exp -0.07 0.34 -0.59 1.00 -0.78 0.58 0.26 -0.11 -0.78 -0.63 0.27 -0.27
Murder 0.34 -0.23 0.70 -0.78 1.00 -0.49 -0.54 0.23 0.93 0.74 0.02 0.58
HS.Grad -0.10 0.62 -0.66 0.58 -0.49 1.00 0.37 0.33 -0.52 -0.23 0.36 0.27
Frost -0.33 0.23 -0.67 0.26 -0.54 0.37 1.00 0.06 -0.54 -0.47 -0.25 -0.28
Area 0.02 0.36 0.08 -0.11 0.23 0.33 0.06 1.00 0.15 0.23 -0.06 0.52
Murder.1 0.32 -0.22 0.71 -0.78 0.93 -0.52 -0.54 0.15 1.00 0.80 0.07 0.56
Assault 0.32 0.04 0.51 -0.63 0.74 -0.23 -0.47 0.23 0.80 1.00 0.26 0.67
UrbanPop 0.51 0.48 -0.06 0.27 0.02 0.36 -0.25 -0.06 0.07 0.26 1.00 0.41
Rape 0.31 0.36 0.15 -0.27 0.58 0.27 -0.28 0.52 0.56 0.67 0.41 1.00

與文盲高度正相關的變數為謀殺犯罪率。 cor(Illiteracy,Murder)=0.70; cor(Illiteracy,Murder1)=0.71。

與文盲高度負相關的變數為HS.Grad、Frost。 cor(Illiteracy,HS.Grad)=-0.66; cor(Illiteracy,Frost)=-0.67。

與謀殺犯罪率高度負相關的變數為Life.Exp(生活期待) cor(Murder,Life.Exp)=-0.78 cor(Murder1,Life.Exp)=-0.78

與暴力高度負相關的變數為Life.Exp(生活期待) cor(Assault,Life.Exp)=-0.63

與暴力高度正相關的變數為謀殺犯罪率、強暴。 cor(Assault,Murder)=0.74 cor(Assault,Murder1)=0.80 cor(Assault,Rape)=0.67

暴力與區域低度相關。 cor(Area,Rape)=0.56

plot data

pacman::p_load(corrplot,RColorBrewer)

corrplot(cor(q2dta4), type="upper", order="hclust",
         col=brewer.pal(n=8, name="RdYlBu"))

Q3

Summarize the backpain{HSAUR3} into the following format: You should provide comments for each code chunk.

# install package
pacman::p_load(HSAUR3)
# load data
data(backpain,package = "HSAUR3")
#copy data
q3dta<-backpain
# show data
q3dta |> head() |> knitr::kable()
ID status driver suburban
1 case yes yes
1 control yes no
2 case yes yes
2 control yes yes
3 case yes no
3 control yes yes
# exam data str
str(q3dta)
'data.frame':   434 obs. of  4 variables:
 $ ID      : Factor w/ 217 levels "1","2","3","4",..: 1 1 2 2 3 3 4 4 5 5 ...
 $ status  : Factor w/ 2 levels "case","control": 1 2 1 2 1 2 1 2 1 2 ...
 $ driver  : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 1 1 2 2 ...
 $ suburban: Factor w/ 2 levels "no","yes": 2 1 2 2 1 2 1 1 1 2 ...
class(q3dta)
[1] "data.frame"
typeof(q3dta)
[1] "list"
summary(q3dta)
       ID          status    driver    suburban 
 1      :  2   case   :217   no : 86   no :200  
 2      :  2   control:217   yes:348   yes:234  
 3      :  2                                    
 4      :  2                                    
 5      :  2                                    
 6      :  2                                    
 (Other):422                                    
# cross-tabulate status by driver
q3tab1<-with(q3dta, table(driver,status))
q3tab1
      status
driver case control
   no    32      54
   yes  185     163
# cross-tabulate status by suburban
q3tab2<-with(q3dta, table(suburban,status))
q3tab2
        status
suburban case control
     no    90     110
     yes  127     107
# three-way table()
q3tab3<-with(q3dta, table(driver, suburban, status))
q3tab3
, , status = case

      suburban
driver  no yes
   no   26   6
   yes  64 121

, , status = control

      suburban
driver  no yes
   no   47   7
   yes  63 100
# cross-tabulate status by suburban given driver
q3tab4<-with(q3dta, ftable(driver, suburban, status))
q3tab4
                status case control
driver suburban                    
no     no                26      47
       yes                6       7
yes    no                64      63
       yes              121     100

q3tab4很接近題目要求的format,接下來要依據driver、suburban來分群計算Total。

str(q3tab4)
 'ftable' int [1:4, 1:2] 26 6 64 121 47 7 63 100
 - attr(*, "row.vars")=List of 2
  ..$ driver  : chr [1:2] "no" "yes"
  ..$ suburban: chr [1:2] "no" "yes"
 - attr(*, "col.vars")=List of 1
  ..$ status: chr [1:2] "case" "control"
#轉換資料型態
q3tab<-as.data.frame(q3tab4)
q3tab
  driver suburban  status Freq
1     no       no    case   26
2    yes       no    case   64
3     no      yes    case    6
4    yes      yes    case  121
5     no       no control   47
6    yes       no control   63
7     no      yes control    7
8    yes      yes control  100
#要用aggregate?還是reshape?

#aggregate(Freq~driver+suburban+status, by=list(status,suburban), FUN=sum)
#aggregate(cbind(driver,suburban) ~ status, data=q3tab, FUN=sum)


# long to wide format
reshape(q3tab,
        idvar="suburban",
        timevar="status",
        v.names="driver",
        direction="wide")
  suburban Freq driver.case driver.control
1       no   26          no             no
3      yes    6          no             no

這結果不是我要的….

#Summarize a transformed variable
pacman::p_load(dplyr, magrittr ,tidyr)

#把case從status撈出來
q3tab_case<-q3tab |>
 filter(status=='case')|>
 dplyr::rename(Case=Freq)
    

#把control從status撈出來
q3tab_control<-q3tab |>
 filter(status=='control')|>
 dplyr::rename(Control=Freq)

#merge後再篩選變數
q3dta_merge<-merge(x =q3tab_case, y = q3tab_control, by = c("driver","suburban"))|>
   select(-c(3,5))

#新增變數total,然後水平加總Case+Control
q3answer<-mutate(q3dta_merge,total=Case+Control)
q3answer
  driver suburban Case Control total
1     no       no   26      47    73
2     no      yes    6       7    13
3    yes       no   64      63   127
4    yes      yes  121     100   221

終於用笨方法得到答案了。

Q4

The data set Vocab{carData} gives observations on gender, education and vocabulary, from respondents to U.S. General Social Surveys, 1972-2004.

Summarize the relationship between education and vocabulary over the years by gender.

# Save a copy of data as tibble
q4dta <- as_tibble(carData::Vocab)

# show data
q4dta |> head() |> knitr::kable()
year sex education vocabulary
1974 Male 14 9
1974 Male 16 9
1974 Female 10 9
1974 Female 10 5
1974 Female 12 8
1974 Male 16 8
#rename
q4dta %<>%
 dplyr::rename(Year=year, Gender=sex, 
               Edu=education, Voc=vocabulary)
#依據講義tidy_01 p.23
q4dta %>%
 lattice::xyplot(Edu ~ Voc | Year, 
        groups=Gender, 
        type=c("p","g","r"), 
        data=., 
        xlab="Vocabulary", 
        ylab="Education", 
        auto.key=list(columns=2))

#q4dta$Year %>% 
    #as.factor()%>% 
#lattice::xyplot(Edu ~ Voc | Year, 
       # groups=Gender, 
       # type=c("p","g","r"), 
       # data=., 
       # xlab="Vocabulary", 
       # ylab="Education", 
       # auto.key=list(columns=2))

將Year轉成factor,會出現error message。 Error in eval(substitute(groups), data, environment(x)) : numeric ‘envir’ arg not of length one

Q5

Supply comments to each code chunk in the following survey rmarkdown file and preview it as an R notebook or knit to html.

The data set concerns species and weight of animals caught in plots in a study area in Arizona over time.

Each row holds information for a single animal, and the columns represent:

  • record_id: Unique id for the observation
  • month: month of observation
  • day: day of observation
  • year: year of observation
  • plot_id: ID of a particular plot
  • species_id: 2-letter code
  • sex: sex of animal (�𢜛��, �𤧶��)
  • hindfoot_length: length of the hindfoot in mm
  • weight: weight of the animal in grams
  • genus: genus of animal
  • species: species of animal
  • taxa: e.g. Rodent, Reptile, Bird, Rabbit
  • plot_type: type of plot
# 下載並安裝tidyverse package
pacman::p_load(tidyverse)
# 從網路讀取csv檔
q5dta <- read_csv("http://kbroman.org/datacarp/portal_data_joined.csv")
# 資料檢視
glimpse(q5dta)
Rows: 34,786
Columns: 13
$ record_id       <dbl> 1, 72, 224, 266, 349, 363, 435, 506, 588, 661, 748, 84~
$ month           <dbl> 7, 8, 9, 10, 11, 11, 12, 1, 2, 3, 4, 5, 6, 8, 9, 10, 1~
$ day             <dbl> 16, 19, 13, 16, 12, 12, 10, 8, 18, 11, 8, 6, 9, 5, 4, ~
$ year            <dbl> 1977, 1977, 1977, 1977, 1977, 1977, 1977, 1978, 1978, ~
$ plot_id         <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
$ species_id      <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", ~
$ sex             <chr> "M", "M", NA, NA, NA, NA, NA, NA, "M", NA, NA, "M", "M~
$ hindfoot_length <dbl> 32, 31, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, NA, 34~
$ weight          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 218, NA, NA, 204, 200,~
$ genus           <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Neotoma",~
$ species         <chr> "albigula", "albigula", "albigula", "albigula", "albig~
$ taxa            <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent", "Rod~
$ plot_type       <chr> "Control", "Control", "Control", "Control", "Control",~
#資料維度
dim(q5dta)
[1] 34786    13
#select()篩選變數
dplyr::select(q5dta, plot_id, species_id, weight) %>% head()
# A tibble: 6 x 3
  plot_id species_id weight
    <dbl> <chr>       <dbl>
1       2 NL             NA
2       2 NL             NA
3       2 NL             NA
4       2 NL             NA
5       2 NL             NA
6       2 NL             NA
#select()也可以刪除不要的變數,在變數前加-
dplyr::select(q5dta, -record_id, -species_id) %>% head()
# A tibble: 6 x 11
  month   day  year plot_id sex   hindfoot_length weight genus   species  taxa  
  <dbl> <dbl> <dbl>   <dbl> <chr>           <dbl>  <dbl> <chr>   <chr>    <chr> 
1     7    16  1977       2 M                  32     NA Neotoma albigula Rodent
2     8    19  1977       2 M                  31     NA Neotoma albigula Rodent
3     9    13  1977       2 <NA>               NA     NA Neotoma albigula Rodent
4    10    16  1977       2 <NA>               NA     NA Neotoma albigula Rodent
5    11    12  1977       2 <NA>               NA     NA Neotoma albigula Rodent
6    11    12  1977       2 <NA>               NA     NA Neotoma albigula Rodent
# ... with 1 more variable: plot_type <chr>
#filter()選出要分析的觀察值:year == 1995
dplyr::filter(q5dta, year == 1995) %>% head() #然後只看前六筆
# A tibble: 6 x 13
  record_id month   day  year plot_id species_id sex   hindfoot_length weight
      <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
1     22314     6     7  1995       2 NL         M                  34     NA
2     22728     9    23  1995       2 NL         F                  32    165
3     22899    10    28  1995       2 NL         F                  32    171
4     23032    12     2  1995       2 NL         F                  33     NA
5     22003     1    11  1995       2 DM         M                  37     41
6     22042     2     4  1995       2 DM         F                  36     45
# ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
#   plot_type <chr>
#選擇四個變數:體重(要<=5公斤)、species_id、sex、weight
#然後只看前六筆
head(dplyr::select(dplyr::filter(q5dta, weight <= 5), species_id, sex, weight))
# A tibble: 6 x 3
  species_id sex   weight
  <chr>      <chr>  <dbl>
1 PF         M          5
2 PF         F          5
3 PF         F          5
4 PF         F          4
5 PF         F          5
6 PF         F          4
q5dta %>% 
  dplyr::filter(weight <= 5) %>% #篩選觀察值的體重小於等於5
  dplyr::select(species_id, sex, weight) %>% #select變數
  head
# A tibble: 6 x 3
  species_id sex   weight
  <chr>      <chr>  <dbl>
1 PF         M          5
2 PF         F          5
3 PF         F          5
4 PF         F          4
5 PF         F          5
6 PF         F          4
q5dta %>% 
  mutate(weight_kg = weight / 1000, #新增變數weight_kg、weight_lb
         weight_lb = weight_kg * 2.2) %>% 
  head()
# A tibble: 6 x 15
  record_id month   day  year plot_id species_id sex   hindfoot_length weight
      <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
1         1     7    16  1977       2 NL         M                  32     NA
2        72     8    19  1977       2 NL         M                  31     NA
3       224     9    13  1977       2 NL         <NA>               NA     NA
4       266    10    16  1977       2 NL         <NA>               NA     NA
5       349    11    12  1977       2 NL         <NA>               NA     NA
6       363    11    12  1977       2 NL         <NA>               NA     NA
# ... with 6 more variables: genus <chr>, species <chr>, taxa <chr>,
#   plot_type <chr>, weight_kg <dbl>, weight_lb <dbl>
q5dta %>% 
  filter(!is.na(weight)) %>% #篩選出在is.na()檢查回傳FALSE的元素
  group_by(sex, species_id) %>% #分群條件
  summarize(mean_weight = mean(weight)) %>% #計算mean_weight
  arrange(desc(mean_weight)) %>% #由高到低排列
  head() #檢視前六筆
# A tibble: 6 x 3
# Groups:   sex [3]
  sex   species_id mean_weight
  <chr> <chr>            <dbl>
1 <NA>  NL                168.
2 M     NL                166.
3 F     NL                154.
4 M     SS                130 
5 <NA>  SH                130 
6 M     DS                122.
q5dta %>%
  group_by(sex) %>% #依照性別分群
  tally #計算總數量
# A tibble: 3 x 2
  sex       n
  <chr> <int>
1 F     15690
2 M     17348
3 <NA>   1748
#方法二
q5dta %>%
  group_by(sex) %>% #依照性別分群
  summarise(total=n()) #計算總數量
# A tibble: 3 x 2
  sex   total
  <chr> <int>
1 F     15690
2 M     17348
3 <NA>   1748
q5dta %>%
  count(sex) #依據性別計算總數量
# A tibble: 3 x 2
  sex       n
  <chr> <int>
1 F     15690
2 M     17348
3 <NA>   1748
q5dta %>%
  group_by(sex) %>% #先依照性別分群
  summarize(count = n()) #再計算總數
# A tibble: 3 x 2
  sex   count
  <chr> <int>
1 F     15690
2 M     17348
3 <NA>   1748
q5dta %>%
  group_by(sex) %>% #先依照性別分群
  summarize(count = sum(!is.na(year))) #再計算沒有遺漏值的year數量總數
# A tibble: 3 x 2
  sex   count
  <chr> <int>
1 F     15690
2 M     17348
3 <NA>   1748

答案都一樣!?

dta_gw <- q5dta %>% 
  filter(!is.na(weight)) %>% #篩選出沒有遺漏值的weight列
  group_by(genus, plot_id) %>% #依據genus與plot_id分群
  summarize(mean_weight = mean(weight)) #計算體重平均mean(weight)
glimpse(dta_gw) #檢視資料
Rows: 196
Columns: 3
Groups: genus [10]
$ genus       <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
$ plot_id     <dbl> 1, 2, 3, 5, 18, 19, 20, 21, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,~
$ mean_weight <dbl> 7.000000, 6.000000, 8.611111, 7.750000, 9.500000, 9.533333~
#long format to wide format
#spread(資料框/長表,key="要展開的欄位名稱",value="數值欄位名稱")
dta_w <- dta_gw %>%
  spread(key = genus, value = mean_weight) #
head(dta_w)
# A tibble: 6 x 11
  plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
    <dbl>   <dbl>       <dbl>     <dbl>   <dbl>     <dbl>       <dbl>      <dbl>
1       1    7           22.2      60.2    156.      27.7        9.62       22.2
2       2    6           25.1      55.7    169.      26.9        6.95       22.3
3       3    8.61        24.6      52.0    158.      26.0        7.51       21.4
4       4   NA           23.0      57.5    164.      28.1        7.82       22.6
5       5    7.75        18.0      51.1    190.      27.0        8.66       21.2
6       6   NA           24.9      58.6    180.      25.9        7.81       21.8
# ... with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
#   Spermophilus <dbl>
glimpse(dta_w) #資料檢視Rows:24 > Columns:11
Rows: 24
Columns: 11
$ plot_id         <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,~
$ Baiomys         <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA~
$ Chaetodipus     <dbl> 22.19939, 25.11014, 24.63636, 23.02381, 17.98276, 24.8~
$ Dipodomys       <dbl> 60.23214, 55.68259, 52.04688, 57.52454, 51.11356, 58.6~
$ Neotoma         <dbl> 156.2222, 169.1436, 158.2414, 164.1667, 190.0370, 179.~
$ Onychomys       <dbl> 27.67550, 26.87302, 26.03241, 28.09375, 27.01695, 25.8~
$ Perognathus     <dbl> 9.625000, 6.947368, 7.507812, 7.824427, 8.658537, 7.80~
$ Peromyscus      <dbl> 22.22222, 22.26966, 21.37037, 22.60000, 21.23171, 21.8~
$ Reithrodontomys <dbl> 11.375000, 10.680556, 10.516588, 10.263158, 11.154545,~
$ Sigmodon        <dbl> NA, 70.85714, 65.61404, 82.00000, 82.66667, 68.77778, ~
$ Spermophilus    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 13~
dta_gw %>%
  spread(genus, mean_weight, fill = 0) %>% #fill = 0:If set, missing values will be replaced with this value.
  head()
# A tibble: 6 x 11
  plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
    <dbl>   <dbl>       <dbl>     <dbl>   <dbl>     <dbl>       <dbl>      <dbl>
1       1    7           22.2      60.2    156.      27.7        9.62       22.2
2       2    6           25.1      55.7    169.      26.9        6.95       22.3
3       3    8.61        24.6      52.0    158.      26.0        7.51       21.4
4       4    0           23.0      57.5    164.      28.1        7.82       22.6
5       5    7.75        18.0      51.1    190.      27.0        8.66       21.2
6       6    0           24.9      58.6    180.      25.9        7.81       21.8
# ... with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
#   Spermophilus <dbl>
#寬表轉長表 
#gather(資料框/寬表,key="主鍵欄位名稱",value="數值欄位名稱",要轉換的資料1,要轉換的資料2,...)
dta_l <- dta_w %>%
  gather(key = genus, value = mean_weight, -plot_id) #-plot_id表示除去plot_id,就只轉置剩下兩列
head(dta_l)
# A tibble: 6 x 3
  plot_id genus   mean_weight
    <dbl> <chr>         <dbl>
1       1 Baiomys        7   
2       2 Baiomys        6   
3       3 Baiomys        8.61
4       4 Baiomys       NA   
5       5 Baiomys        7.75
6       6 Baiomys       NA   
glimpse(dta_l) #看一下Rows數是不是有大於Columns數
Rows: 240
Columns: 3
$ plot_id     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,~
$ genus       <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
$ mean_weight <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA, NA~
#指定gather的變數類別Baiomys:Spermophilus
dta_w %>%
  gather(key = genus, value = mean_weight, Baiomys:Spermophilus) %>% 
  head()
# A tibble: 6 x 3
  plot_id genus   mean_weight
    <dbl> <chr>         <dbl>
1       1 Baiomys        7   
2       2 Baiomys        6   
3       3 Baiomys        8.61
4       4 Baiomys       NA   
5       5 Baiomys        7.75
6       6 Baiomys       NA   
#丟掉weight、hindfoot_length、sex的遺漏值
dta_complete <- q5dta %>%
  filter(!is.na(weight),           
         !is.na(hindfoot_length),  
         !is.na(sex))
head(dta_complete)
# A tibble: 6 x 13
  record_id month   day  year plot_id species_id sex   hindfoot_length weight
      <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
1       845     5     6  1978       2 NL         M                  32    204
2      1164     8     5  1978       2 NL         M                  34    199
3      1261     9     4  1978       2 NL         M                  32    197
4      1756     4    29  1979       2 NL         M                  33    166
5      1818     5    30  1979       2 NL         M                  32    184
6      1882     7     4  1979       2 NL         M                  32    206
# ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
#   plot_type <chr>
species_counts <- dta_complete %>%
    count(species_id) %>% #依據species_id計算總數
    filter(n >= 50) #篩選出n >= 50

head(species_counts)
# A tibble: 6 x 2
  species_id     n
  <chr>      <int>
1 DM          9727
2 DO          2790
3 DS          2023
4 NL          1045
5 OL           905
6 OT          2081
#The %in% operator in R can be used to identify if an element (e.g., a number) belongs to a vector or dataframe. 

dta_complete <- dta_complete %>%
  filter(species_id %in% species_counts$species_id) 
head(dta_complete)
# A tibble: 6 x 13
  record_id month   day  year plot_id species_id sex   hindfoot_length weight
      <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
1       845     5     6  1978       2 NL         M                  32    204
2      1164     8     5  1978       2 NL         M                  34    199
3      1261     9     4  1978       2 NL         M                  32    197
4      1756     4    29  1979       2 NL         M                  33    166
5      1818     5    30  1979       2 NL         M                  32    184
6      1882     7     4  1979       2 NL         M                  32    206
# ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
#   plot_type <chr>