Q1

Comment on what each code chunk in the following classroom markdown fileis trying to achive and on its output.

Data

pacman::p_load(WWGbook)
data(classroom, package="WWGbook")

#用?查詢classroom資料集
?classroom

R右下角視窗會跳出這個data set的內容說明。

The Study of Instructional Improvement (SII; Hill, Rowan, and Ball, 2004) was carried out by researchers at the University of Michigan to study the math achievement scores of first- and third-grade students in randomly selected classrooms from a national U.S. sample of elementary schools.

#exam data structure
str(classroom)

'data.frame':   1190 obs. of  12 variables:
 $ sex     : int  1 0 1 0 0 1 0 0 1 0 ...
 $ minority: int  1 1 1 1 1 1 1 1 1 1 ...
 $ mathkind: int  448 460 511 449 425 450 452 443 422 480 ...
 $ mathgain: int  32 109 56 83 53 65 51 66 88 -7 ...
 $ ses     : num  0.46 -0.27 -0.03 -0.38 -0.03 0.76 -0.03 0.2 0.64 0.13 ...
 $ yearstea: num  1 1 1 2 2 2 2 2 2 2 ...
 $ mathknow: num  NA NA NA -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 ...
 $ housepov: num  0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 ...
 $ mathprep: num  2 2 2 3.25 3.25 3.25 3.25 3.25 3.25 3.25 ...
 $ classid : int  160 160 160 217 217 217 217 217 217 217 ...
 $ schoolid: int  1 1 1 1 1 1 1 1 1 1 ...
 $ childid : int  1 2 3 4 5 6 7 8 9 10 ...

Split data

dta_schl <- classroom[classroom$schoolid,c("schoolid", "housepov")]  #vector中的元素是重複的
head(dta_schl)

    schoolid housepov
1          1    0.082
1.1        1    0.082
1.2        1    0.082
1.3        1    0.082
1.4        1    0.082
1.5        1    0.082

#duplicated=FALSE設定邏輯向量，回傳c("schoolid", "housepov")不重複的元素 
dta_schl <- classroom[duplicated(classroom$schoolid)==FALSE, c("schoolid", "housepov")]
head(dta_schl,10)

   schoolid housepov
1         1    0.082
12        2    0.082
22        3    0.086
36        4    0.365
42        5    0.511
48        6    0.044
60        7    0.148
74        8    0.085
90        9    0.537
96       10    0.346

#依序回傳c(11, 10, 6,7,9)這五個變數不重複的元素
dta_cls <- classroom[duplicated(classroom$classid)==FALSE,  c(11, 10, 6,7,9)] 
head(dta_cls)

   schoolid classid yearstea mathknow mathprep
1         1     160     1.00       NA     2.00
4         1     217     2.00    -0.11     3.25
12        2     197     1.00    -1.25     2.50
14        2     211     2.00    -0.72     2.33
18        2     307    12.54       NA     2.30
22        3      11    20.00     0.45     3.83

#只取c(12, 10, 11, 1:5)這8個變數，且指定順序
dta_chld <- classroom[, c(12, 10, 11, 1:5)]
head(dta_chld)

  childid classid schoolid sex minority mathkind mathgain   ses
1       1     160        1   1        1      448       32  0.46
2       2     160        1   0        1      460      109 -0.27
3       3     160        1   1        1      511       56 -0.03
4       4     217        1   0        1      449       83 -0.38
5       5     217        1   0        1      425       53 -0.03
6       6     217        1   1        1      450       65  0.76

#list(dta_schl, dta_cls, dta_chld)將不同類型的三個向量放到一個object中
#sapply()功能與lapply()相同
#檢索object的dim維度

sapply(list(dta_schl, dta_cls, dta_chld), dim)

     [,1] [,2] [,3]
[1,]  107  312 1190
[2,]    2    5    8

dta_schl這個data.frame中2個維度(變數)，107個觀察值。

dta_cls這個data.frame中5個變數，312個觀察值。

dta_chld這個data.frame中8個變數，1190個觀察值。

Combine data

#依據classid、schoolid來combine dta_chld和dta_cls這兩個data.frame
dta_12 <- merge(x=dta_chld, y=dta_cls, by=c("classid", "schoolid"))
head(dta_12)

  classid schoolid childid sex minority mathkind mathgain   ses yearstea
1       1       61     653   0        1      442       35 -0.09        2
2       1       61     654   0        1      422      109 -1.32        2
3       1       61     655   1        1      482       66  0.43        2
4       1       61     656   1        1      489       19 -0.06        2
5       1       61     657   0        1      460       10 -1.28        2
6      10       17     216   1        1      381      143 -0.49        5
  mathknow mathprep
1    -0.72     2.50
2    -0.72     2.50
3    -0.72     2.50
4    -0.72     2.50
5    -0.72     2.50
6    -1.26     3.25

#依據classid、schoolid來combine dta_chld和dta_cls這兩個data.frame
dta_13 <- merge(x=dta_chld, y=dta_cls, by=c("classid", "schoolid"))

#依據schoolid來combine dta_chld和dta_cls
dta_23 <- merge(x=dta_cls, y=dta_schl, by="schoolid")

#依據schoolid來combine dta_12和dta_schl
dta_123 <- merge(x=dta_12, y=dta_schl, by=c("schoolid"))

#合併後的data frame分別有多少觀察值(obs)與維度(變數)
sapply(list(dta_12, dta_13, dta_23, dta_123), dim)

     [,1] [,2] [,3] [,4]
[1,] 1190 1190  312 1190
[2,]   11   11    6   12

Q2

Merge the two data sets: state.x77{datasets} and USArrests{datasets} and compute all pair-wise correlations for numerical variables.

Is there anything interesting to report?

load dataset1：state.x77

#show dataset
state.x77 |> head() |> knitr::kable()

	Population	Income	Illiteracy	Life Exp	Murder	HS Grad	Frost	Area
Alabama	3615	3624	2.1	69.05	15.1	41.3	20	50708
Alaska	365	6315	1.5	69.31	11.3	66.7	152	566432
Arizona	2212	4530	1.8	70.55	7.8	58.1	15	113417
Arkansas	2110	3378	1.9	70.66	10.1	39.9	65	51945
California	21198	5114	1.1	71.71	10.3	62.6	20	156361
Colorado	2541	4884	0.7	72.06	6.8	63.9	166	103766

#了解資料
str(state.x77)

 num [1:50, 1:8] 3615 365 2212 2110 21198 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
  ..$ : chr [1:8] "Population" "Income" "Illiteracy" "Life Exp" ...

class(state.x77)

[1] "matrix" "array"

typeof(state.x77)

[1] "double"

comment(state.x77)

NULL

load dataset2：USArrests

#show dataset
USArrests |> head() |> knitr::kable()

	Murder	Assault	UrbanPop	Rape
Alabama	13.2	236	58	21.2
Alaska	10.0	263	48	44.5
Arizona	8.1	294	80	31.0
Arkansas	8.8	190	50	19.5
California	9.0	276	91	40.6
Colorado	7.9	204	78	38.7

#了解資料
str(USArrests)

'data.frame':   50 obs. of  4 variables:
 $ Murder  : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
 $ Assault : int  236 263 294 190 276 204 110 238 335 211 ...
 $ UrbanPop: int  58 48 80 50 91 78 77 72 80 60 ...
 $ Rape    : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...

class(USArrests)

[1] "data.frame"

typeof(USArrests)

[1] "list"

comment(USArrests)

NULL

Merge data files

#Merge data files - default
q2dta1<-merge(state.x77, USArrests) 
q2dta1|> head() |> knitr::kable()

Murder	Population	Income	Illiteracy	Life Exp	HS Grad	Frost	Area	Assault	UrbanPop	Rape
2.7	1058	3694	0.7	70.39	54.7	161	30920	72	66	14.9
3.3	5814	4755	1.1	71.83	58.5	103	7826	110	77	11.1
3.3	812	4281	0.7	71.23	57.6	174	9027	110	77	11.1
4.3	3559	4864	0.6	71.72	63.5	32	66570	102	62	16.5
5.3	813	4119	0.6	71.87	59.5	126	82677	46	83	20.2
6.8	2541	4884	0.7	72.06	63.9	166	103766	161	60	15.6

依據相同變數Murder合併資料，且刪除missing data。

#Merge data files - all
q2dta2<-merge(state.x77, USArrests, all = TRUE) 
q2dta2|> head() |> knitr::kable()

Murder	Population	Income	Illiteracy	Life Exp	HS Grad	Frost	Area	Assault	UrbanPop	Rape
0.8	NA	NA	NA	NA	NA	NA	NA	45	44	7.3
1.4	637	5087	0.8	72.78	50.3	186	69273	NA	NA	NA
1.7	681	4167	0.5	72.08	53.3	172	75955	NA	NA	NA
2.1	NA	NA	NA	NA	NA	NA	NA	83	51	7.8
2.1	NA	NA	NA	NA	NA	NA	NA	57	56	9.5
2.2	NA	NA	NA	NA	NA	NA	NA	56	57	11.3

方法一會依據相同變數Murder合併資料，但保留了missing data。

# merge two files together by matching row.names
q2dta3<- merge(state.x77, USArrests, by="row.names", all=T)
q2dta3|> head() |> knitr::kable()

Row.names	Population	Income	Illiteracy	Life Exp	Murder.x	HS Grad	Frost	Area	Murder.y	Assault	UrbanPop	Rape
Alabama	3615	3624	2.1	69.05	15.1	41.3	20	50708	13.2	236	58	21.2
Alaska	365	6315	1.5	69.31	11.3	66.7	152	566432	10.0	263	48	44.5
Arizona	2212	4530	1.8	70.55	7.8	58.1	15	113417	8.1	294	80	31.0
Arkansas	2110	3378	1.9	70.66	10.1	39.9	65	51945	8.8	190	50	19.5
California	21198	5114	1.1	71.71	10.3	62.6	20	156361	9.0	276	91	40.6
Colorado	2541	4884	0.7	72.06	6.8	63.9	166	103766	7.9	204	78	38.7

方法二依據50州的州名(row.name)合併資料。因有兩個相同變數，R自動命名為Murder.x和Murder.y。

#data.frame將兩類資料放在同一個框架
q2dta4<-data.frame(state.x77, USArrests)
q2dta4|> head() |> knitr::kable()

	Population	Income	Illiteracy	Life.Exp	Murder	HS.Grad	Frost	Area	Murder.1	Assault	UrbanPop	Rape
Alabama	3615	3624	2.1	69.05	15.1	41.3	20	50708	13.2	236	58	21.2
Alaska	365	6315	1.5	69.31	11.3	66.7	152	566432	10.0	263	48	44.5
Arizona	2212	4530	1.8	70.55	7.8	58.1	15	113417	8.1	294	80	31.0
Arkansas	2110	3378	1.9	70.66	10.1	39.9	65	51945	8.8	190	50	19.5
California	21198	5114	1.1	71.71	10.3	62.6	20	156361	9.0	276	91	40.6
Colorado	2541	4884	0.7	72.06	6.8	63.9	166	103766	7.9	204	78	38.7

使用data.frame()合併資料，合併資料的方式也是依據50州的州名，但合併後州名row.name不是變數。

Correlation

cor(q2dta1)|> knitr::kable(digits = 2)

	Murder	Population	Income	Illiteracy	Life Exp	HS Grad	Frost	Area	Assault	UrbanPop	Rape
Murder	1.00	0.33	-0.10	0.80	-0.74	-0.59	-0.53	0.37	0.81	-0.11	0.72
Population	0.33	1.00	-0.02	0.12	0.05	-0.40	-0.36	-0.09	0.41	0.57	0.56
Income	-0.10	-0.02	1.00	-0.46	0.22	0.66	0.36	0.57	0.28	0.09	0.02
Illiteracy	0.80	0.12	-0.46	1.00	-0.75	-0.68	-0.67	0.18	0.51	-0.35	0.50
Life Exp	-0.74	0.05	0.22	-0.75	1.00	0.66	0.41	-0.21	-0.60	0.25	-0.36
HS Grad	-0.59	-0.40	0.66	-0.68	0.66	1.00	0.60	0.36	-0.37	-0.13	-0.41
Frost	-0.53	-0.36	0.36	-0.67	0.41	0.60	1.00	0.12	-0.28	0.19	-0.46
Area	0.37	-0.09	0.57	0.18	-0.21	0.36	0.12	1.00	0.54	-0.07	0.50
Assault	0.81	0.41	0.28	0.51	-0.60	-0.37	-0.28	0.54	1.00	0.14	0.69
UrbanPop	-0.11	0.57	0.09	-0.35	0.25	-0.13	0.19	-0.07	0.14	1.00	0.24
Rape	0.72	0.56	0.02	0.50	-0.36	-0.41	-0.46	0.50	0.69	0.24	1.00

cor(q2dta2)|> knitr::kable(digits = 2)

	Murder	Population	Income	Illiteracy	Life Exp	HS Grad	Frost	Area	Assault	UrbanPop	Rape
Murder	1	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
Population	NA	1	NA	NA	NA	NA	NA	NA	NA	NA	NA
Income	NA	NA	1	NA	NA	NA	NA	NA	NA	NA	NA
Illiteracy	NA	NA	NA	1	NA	NA	NA	NA	NA	NA	NA
Life Exp	NA	NA	NA	NA	1	NA	NA	NA	NA	NA	NA
HS Grad	NA	NA	NA	NA	NA	1	NA	NA	NA	NA	NA
Frost	NA	NA	NA	NA	NA	NA	1	NA	NA	NA	NA
Area	NA	NA	NA	NA	NA	NA	NA	1	NA	NA	NA
Assault	NA	NA	NA	NA	NA	NA	NA	NA	1	NA	NA
UrbanPop	NA	NA	NA	NA	NA	NA	NA	NA	NA	1	NA
Rape	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	1

有missing value，無法算correlation

cor(q2dta3[,-1])|> knitr::kable(digits = 2) #[,-1]

	Population	Income	Illiteracy	Life Exp	Murder.x	HS Grad	Frost	Area	Murder.y	Assault	UrbanPop	Rape
Population	1.00	0.21	0.11	-0.07	0.34	-0.10	-0.33	0.02	0.32	0.32	0.51	0.31
Income	0.21	1.00	-0.44	0.34	-0.23	0.62	0.23	0.36	-0.22	0.04	0.48	0.36
Illiteracy	0.11	-0.44	1.00	-0.59	0.70	-0.66	-0.67	0.08	0.71	0.51	-0.06	0.15
Life Exp	-0.07	0.34	-0.59	1.00	-0.78	0.58	0.26	-0.11	-0.78	-0.63	0.27	-0.27
Murder.x	0.34	-0.23	0.70	-0.78	1.00	-0.49	-0.54	0.23	0.93	0.74	0.02	0.58
HS Grad	-0.10	0.62	-0.66	0.58	-0.49	1.00	0.37	0.33	-0.52	-0.23	0.36	0.27
Frost	-0.33	0.23	-0.67	0.26	-0.54	0.37	1.00	0.06	-0.54	-0.47	-0.25	-0.28
Area	0.02	0.36	0.08	-0.11	0.23	0.33	0.06	1.00	0.15	0.23	-0.06	0.52
Murder.y	0.32	-0.22	0.71	-0.78	0.93	-0.52	-0.54	0.15	1.00	0.80	0.07	0.56
Assault	0.32	0.04	0.51	-0.63	0.74	-0.23	-0.47	0.23	0.80	1.00	0.26	0.67
UrbanPop	0.51	0.48	-0.06	0.27	0.02	0.36	-0.25	-0.06	0.07	0.26	1.00	0.41
Rape	0.31	0.36	0.15	-0.27	0.58	0.27	-0.28	0.52	0.56	0.67	0.41	1.00

Error in cor(q2dta3) : ‘x’ 必須是數值，因此使用[,-1]排除第一行州名

cor(q2dta4)|> knitr::kable(digits = 2)

	Population	Income	Illiteracy	Life.Exp	Murder	HS.Grad	Frost	Area	Murder.1	Assault	UrbanPop	Rape
Population	1.00	0.21	0.11	-0.07	0.34	-0.10	-0.33	0.02	0.32	0.32	0.51	0.31
Income	0.21	1.00	-0.44	0.34	-0.23	0.62	0.23	0.36	-0.22	0.04	0.48	0.36
Illiteracy	0.11	-0.44	1.00	-0.59	0.70	-0.66	-0.67	0.08	0.71	0.51	-0.06	0.15
Life.Exp	-0.07	0.34	-0.59	1.00	-0.78	0.58	0.26	-0.11	-0.78	-0.63	0.27	-0.27
Murder	0.34	-0.23	0.70	-0.78	1.00	-0.49	-0.54	0.23	0.93	0.74	0.02	0.58
HS.Grad	-0.10	0.62	-0.66	0.58	-0.49	1.00	0.37	0.33	-0.52	-0.23	0.36	0.27
Frost	-0.33	0.23	-0.67	0.26	-0.54	0.37	1.00	0.06	-0.54	-0.47	-0.25	-0.28
Area	0.02	0.36	0.08	-0.11	0.23	0.33	0.06	1.00	0.15	0.23	-0.06	0.52
Murder.1	0.32	-0.22	0.71	-0.78	0.93	-0.52	-0.54	0.15	1.00	0.80	0.07	0.56
Assault	0.32	0.04	0.51	-0.63	0.74	-0.23	-0.47	0.23	0.80	1.00	0.26	0.67
UrbanPop	0.51	0.48	-0.06	0.27	0.02	0.36	-0.25	-0.06	0.07	0.26	1.00	0.41
Rape	0.31	0.36	0.15	-0.27	0.58	0.27	-0.28	0.52	0.56	0.67	0.41	1.00

與文盲高度正相關的變數為謀殺犯罪率。 cor(Illiteracy,Murder)=0.70； cor(Illiteracy,Murder1)=0.71。

與文盲高度負相關的變數為HS.Grad、Frost。 cor(Illiteracy,HS.Grad)=-0.66； cor(Illiteracy,Frost)=-0.67。

與謀殺犯罪率高度負相關的變數為Life.Exp(生活期待) cor(Murder,Life.Exp)=-0.78 cor(Murder1,Life.Exp)=-0.78

與暴力高度負相關的變數為Life.Exp(生活期待) cor(Assault,Life.Exp)=-0.63

與暴力高度正相關的變數為謀殺犯罪率、強暴。 cor(Assault,Murder)=0.74 cor(Assault,Murder1)=0.80 cor(Assault,Rape)=0.67

暴力與區域低度相關。 cor(Area,Rape)=0.56

plot data

pacman::p_load(corrplot,RColorBrewer)

corrplot(cor(q2dta4), type="upper", order="hclust",
         col=brewer.pal(n=8, name="RdYlBu"))

Q3

Summarize the backpain{HSAUR3} into the following format: You should provide comments for each code chunk.

# install package
pacman::p_load(HSAUR3)
# load data
data(backpain,package = "HSAUR3")
#copy data
q3dta<-backpain
# show data
q3dta |> head() |> knitr::kable()

ID	status	driver	suburban
1	case	yes	yes
1	control	yes	no
2	case	yes	yes
2	control	yes	yes
3	case	yes	no
3	control	yes	yes

# exam data str
str(q3dta)

'data.frame':   434 obs. of  4 variables:
 $ ID      : Factor w/ 217 levels "1","2","3","4",..: 1 1 2 2 3 3 4 4 5 5 ...
 $ status  : Factor w/ 2 levels "case","control": 1 2 1 2 1 2 1 2 1 2 ...
 $ driver  : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 1 1 2 2 ...
 $ suburban: Factor w/ 2 levels "no","yes": 2 1 2 2 1 2 1 1 1 2 ...

class(q3dta)

[1] "data.frame"

typeof(q3dta)

[1] "list"

summary(q3dta)

       ID          status    driver    suburban 
 1      :  2   case   :217   no : 86   no :200  
 2      :  2   control:217   yes:348   yes:234  
 3      :  2                                    
 4      :  2                                    
 5      :  2                                    
 6      :  2                                    
 (Other):422

# cross-tabulate status by driver
q3tab1<-with(q3dta, table(driver,status))
q3tab1

      status
driver case control
   no    32      54
   yes  185     163

# cross-tabulate status by suburban
q3tab2<-with(q3dta, table(suburban,status))
q3tab2

        status
suburban case control
     no    90     110
     yes  127     107

# three-way table()
q3tab3<-with(q3dta, table(driver, suburban, status))
q3tab3

, , status = case

      suburban
driver  no yes
   no   26   6
   yes  64 121

, , status = control

      suburban
driver  no yes
   no   47   7
   yes  63 100

# cross-tabulate status by suburban given driver
q3tab4<-with(q3dta, ftable(driver, suburban, status))
q3tab4

                status case control
driver suburban                    
no     no                26      47
       yes                6       7
yes    no                64      63
       yes              121     100

q3tab4很接近題目要求的format，接下來要依據driver、suburban來分群計算Total。

str(q3tab4)

 'ftable' int [1:4, 1:2] 26 6 64 121 47 7 63 100
 - attr(*, "row.vars")=List of 2
  ..$ driver  : chr [1:2] "no" "yes"
  ..$ suburban: chr [1:2] "no" "yes"
 - attr(*, "col.vars")=List of 1
  ..$ status: chr [1:2] "case" "control"

#轉換資料型態
q3tab<-as.data.frame(q3tab4)
q3tab

  driver suburban  status Freq
1     no       no    case   26
2    yes       no    case   64
3     no      yes    case    6
4    yes      yes    case  121
5     no       no control   47
6    yes       no control   63
7     no      yes control    7
8    yes      yes control  100

#要用aggregate?還是reshape?

#aggregate(Freq~driver+suburban+status, by=list(status,suburban), FUN=sum)
#aggregate(cbind(driver,suburban) ~ status, data=q3tab, FUN=sum)


# long to wide format
reshape(q3tab,
        idvar="suburban",
        timevar="status",
        v.names="driver",
        direction="wide")

  suburban Freq driver.case driver.control
1       no   26          no             no
3      yes    6          no             no

這結果不是我要的….

#Summarize a transformed variable
pacman::p_load(dplyr, magrittr ,tidyr)

#把case從status撈出來
q3tab_case<-q3tab |>
 filter(status=='case')|>
 dplyr::rename(Case=Freq)
    

#把control從status撈出來
q3tab_control<-q3tab |>
 filter(status=='control')|>
 dplyr::rename(Control=Freq)

#merge後再篩選變數
q3dta_merge<-merge(x =q3tab_case, y = q3tab_control, by = c("driver","suburban"))|>
   select(-c(3,5))

#新增變數total，然後水平加總Case+Control
q3answer<-mutate(q3dta_merge,total=Case+Control)
q3answer

  driver suburban Case Control total
1     no       no   26      47    73
2     no      yes    6       7    13
3    yes       no   64      63   127
4    yes      yes  121     100   221

終於用笨方法得到答案了。

Q4

The data set Vocab{carData} gives observations on gender, education and vocabulary, from respondents to U.S. General Social Surveys, 1972-2004.

Summarize the relationship between education and vocabulary over the years by gender.

# Save a copy of data as tibble
q4dta <- as_tibble(carData::Vocab)

# show data
q4dta |> head() |> knitr::kable()

year	sex	education	vocabulary
1974	Male	14	9
1974	Male	16	9
1974	Female	10	9
1974	Female	10	5
1974	Female	12	8
1974	Male	16	8

#rename
q4dta %<>%
 dplyr::rename(Year=year, Gender=sex, 
               Edu=education, Voc=vocabulary)

#依據講義tidy_01 p.23
q4dta %>%
 lattice::xyplot(Edu ~ Voc | Year, 
        groups=Gender, 
        type=c("p","g","r"), 
        data=., 
        xlab="Vocabulary", 
        ylab="Education", 
        auto.key=list(columns=2))

#q4dta$Year %>% 
    #as.factor()%>% 
#lattice::xyplot(Edu ~ Voc | Year, 
       # groups=Gender, 
       # type=c("p","g","r"), 
       # data=., 
       # xlab="Vocabulary", 
       # ylab="Education", 
       # auto.key=list(columns=2))

將Year轉成factor，會出現error message。 Error in eval(substitute(groups), data, environment(x)) : numeric ‘envir’ arg not of length one

Q5

Supply comments to each code chunk in the following survey rmarkdown file and preview it as an R notebook or knit to html.

The data set concerns species and weight of animals caught in plots in a study area in Arizona over time.

Each row holds information for a single animal, and the columns represent:

record_id: Unique id for the observation
month: month of observation
day: day of observation
year: year of observation
plot_id: ID of a particular plot
species_id: 2-letter code
sex: sex of animal (�𢜛��, �𤧶��)
hindfoot_length: length of the hindfoot in mm
weight: weight of the animal in grams
genus: genus of animal
species: species of animal
taxa: e.g. Rodent, Reptile, Bird, Rabbit
plot_type: type of plot

# 下載並安裝tidyverse package
pacman::p_load(tidyverse)

# 從網路讀取csv檔
q5dta <- read_csv("http://kbroman.org/datacarp/portal_data_joined.csv")

# 資料檢視
glimpse(q5dta)

Rows: 34,786
Columns: 13
$ record_id       <dbl> 1, 72, 224, 266, 349, 363, 435, 506, 588, 661, 748, 84~
$ month           <dbl> 7, 8, 9, 10, 11, 11, 12, 1, 2, 3, 4, 5, 6, 8, 9, 10, 1~
$ day             <dbl> 16, 19, 13, 16, 12, 12, 10, 8, 18, 11, 8, 6, 9, 5, 4, ~
$ year            <dbl> 1977, 1977, 1977, 1977, 1977, 1977, 1977, 1978, 1978, ~
$ plot_id         <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
$ species_id      <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", ~
$ sex             <chr> "M", "M", NA, NA, NA, NA, NA, NA, "M", NA, NA, "M", "M~
$ hindfoot_length <dbl> 32, 31, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, NA, 34~
$ weight          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 218, NA, NA, 204, 200,~
$ genus           <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Neotoma",~
$ species         <chr> "albigula", "albigula", "albigula", "albigula", "albig~
$ taxa            <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent", "Rod~
$ plot_type       <chr> "Control", "Control", "Control", "Control", "Control",~

#資料維度
dim(q5dta)

[1] 34786    13

#select()篩選變數
dplyr::select(q5dta, plot_id, species_id, weight) %>% head()

# A tibble: 6 x 3
  plot_id species_id weight
    <dbl> <chr>       <dbl>
1       2 NL             NA
2       2 NL             NA
3       2 NL             NA
4       2 NL             NA
5       2 NL             NA
6       2 NL             NA

#select()也可以刪除不要的變數，在變數前加-
dplyr::select(q5dta, -record_id, -species_id) %>% head()

# A tibble: 6 x 11
  month   day  year plot_id sex   hindfoot_length weight genus   species  taxa  
  <dbl> <dbl> <dbl>   <dbl> <chr>           <dbl>  <dbl> <chr>   <chr>    <chr> 
1     7    16  1977       2 M                  32     NA Neotoma albigula Rodent
2     8    19  1977       2 M                  31     NA Neotoma albigula Rodent
3     9    13  1977       2 <NA>               NA     NA Neotoma albigula Rodent
4    10    16  1977       2 <NA>               NA     NA Neotoma albigula Rodent
5    11    12  1977       2 <NA>               NA     NA Neotoma albigula Rodent
6    11    12  1977       2 <NA>               NA     NA Neotoma albigula Rodent
# ... with 1 more variable: plot_type <chr>

#filter()選出要分析的觀察值：year == 1995
dplyr::filter(q5dta, year == 1995) %>% head() #然後只看前六筆

# A tibble: 6 x 13
  record_id month   day  year plot_id species_id sex   hindfoot_length weight
      <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
1     22314     6     7  1995       2 NL         M                  34     NA
2     22728     9    23  1995       2 NL         F                  32    165
3     22899    10    28  1995       2 NL         F                  32    171
4     23032    12     2  1995       2 NL         F                  33     NA
5     22003     1    11  1995       2 DM         M                  37     41
6     22042     2     4  1995       2 DM         F                  36     45
# ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
#   plot_type <chr>

#選擇四個變數：體重(要<=5公斤)、species_id、sex、weight
#然後只看前六筆
head(dplyr::select(dplyr::filter(q5dta, weight <= 5), species_id, sex, weight))

# A tibble: 6 x 3
  species_id sex   weight
  <chr>      <chr>  <dbl>
1 PF         M          5
2 PF         F          5
3 PF         F          5
4 PF         F          4
5 PF         F          5
6 PF         F          4

q5dta %>% 
  dplyr::filter(weight <= 5) %>% #篩選觀察值的體重小於等於5
  dplyr::select(species_id, sex, weight) %>% #select變數
  head

# A tibble: 6 x 3
  species_id sex   weight
  <chr>      <chr>  <dbl>
1 PF         M          5
2 PF         F          5
3 PF         F          5
4 PF         F          4
5 PF         F          5
6 PF         F          4

q5dta %>% 
  mutate(weight_kg = weight / 1000, #新增變數weight_kg、weight_lb
         weight_lb = weight_kg * 2.2) %>% 
  head()

# A tibble: 6 x 15
  record_id month   day  year plot_id species_id sex   hindfoot_length weight
      <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
1         1     7    16  1977       2 NL         M                  32     NA
2        72     8    19  1977       2 NL         M                  31     NA
3       224     9    13  1977       2 NL         <NA>               NA     NA
4       266    10    16  1977       2 NL         <NA>               NA     NA
5       349    11    12  1977       2 NL         <NA>               NA     NA
6       363    11    12  1977       2 NL         <NA>               NA     NA
# ... with 6 more variables: genus <chr>, species <chr>, taxa <chr>,
#   plot_type <chr>, weight_kg <dbl>, weight_lb <dbl>

q5dta %>% 
  filter(!is.na(weight)) %>% #篩選出在is.na()檢查回傳FALSE的元素
  group_by(sex, species_id) %>% #分群條件
  summarize(mean_weight = mean(weight)) %>% #計算mean_weight
  arrange(desc(mean_weight)) %>% #由高到低排列
  head() #檢視前六筆

# A tibble: 6 x 3
# Groups:   sex [3]
  sex   species_id mean_weight
  <chr> <chr>            <dbl>
1 <NA>  NL                168.
2 M     NL                166.
3 F     NL                154.
4 M     SS                130 
5 <NA>  SH                130 
6 M     DS                122.

q5dta %>%
  group_by(sex) %>% #依照性別分群
  tally #計算總數量

# A tibble: 3 x 2
  sex       n
  <chr> <int>
1 F     15690
2 M     17348
3 <NA>   1748

#方法二
q5dta %>%
  group_by(sex) %>% #依照性別分群
  summarise(total=n()) #計算總數量

# A tibble: 3 x 2
  sex   total
  <chr> <int>
1 F     15690
2 M     17348
3 <NA>   1748

q5dta %>%
  count(sex) #依據性別計算總數量

# A tibble: 3 x 2
  sex       n
  <chr> <int>
1 F     15690
2 M     17348
3 <NA>   1748

q5dta %>%
  group_by(sex) %>% #先依照性別分群
  summarize(count = n()) #再計算總數

# A tibble: 3 x 2
  sex   count
  <chr> <int>
1 F     15690
2 M     17348
3 <NA>   1748

q5dta %>%
  group_by(sex) %>% #先依照性別分群
  summarize(count = sum(!is.na(year))) #再計算沒有遺漏值的year數量總數

# A tibble: 3 x 2
  sex   count
  <chr> <int>
1 F     15690
2 M     17348
3 <NA>   1748

答案都一樣!?

dta_gw <- q5dta %>% 
  filter(!is.na(weight)) %>% #篩選出沒有遺漏值的weight列
  group_by(genus, plot_id) %>% #依據genus與plot_id分群
  summarize(mean_weight = mean(weight)) #計算體重平均mean(weight)

glimpse(dta_gw) #檢視資料

Rows: 196
Columns: 3
Groups: genus [10]
$ genus       <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
$ plot_id     <dbl> 1, 2, 3, 5, 18, 19, 20, 21, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,~
$ mean_weight <dbl> 7.000000, 6.000000, 8.611111, 7.750000, 9.500000, 9.533333~

#long format to wide format
#spread(資料框/長表,key="要展開的欄位名稱",value="數值欄位名稱")
dta_w <- dta_gw %>%
  spread(key = genus, value = mean_weight) #
head(dta_w)

# A tibble: 6 x 11
  plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
    <dbl>   <dbl>       <dbl>     <dbl>   <dbl>     <dbl>       <dbl>      <dbl>
1       1    7           22.2      60.2    156.      27.7        9.62       22.2
2       2    6           25.1      55.7    169.      26.9        6.95       22.3
3       3    8.61        24.6      52.0    158.      26.0        7.51       21.4
4       4   NA           23.0      57.5    164.      28.1        7.82       22.6
5       5    7.75        18.0      51.1    190.      27.0        8.66       21.2
6       6   NA           24.9      58.6    180.      25.9        7.81       21.8
# ... with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
#   Spermophilus <dbl>

glimpse(dta_w) #資料檢視Rows:24 > Columns:11

Rows: 24
Columns: 11
$ plot_id         <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,~
$ Baiomys         <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA~
$ Chaetodipus     <dbl> 22.19939, 25.11014, 24.63636, 23.02381, 17.98276, 24.8~
$ Dipodomys       <dbl> 60.23214, 55.68259, 52.04688, 57.52454, 51.11356, 58.6~
$ Neotoma         <dbl> 156.2222, 169.1436, 158.2414, 164.1667, 190.0370, 179.~
$ Onychomys       <dbl> 27.67550, 26.87302, 26.03241, 28.09375, 27.01695, 25.8~
$ Perognathus     <dbl> 9.625000, 6.947368, 7.507812, 7.824427, 8.658537, 7.80~
$ Peromyscus      <dbl> 22.22222, 22.26966, 21.37037, 22.60000, 21.23171, 21.8~
$ Reithrodontomys <dbl> 11.375000, 10.680556, 10.516588, 10.263158, 11.154545,~
$ Sigmodon        <dbl> NA, 70.85714, 65.61404, 82.00000, 82.66667, 68.77778, ~
$ Spermophilus    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 13~

dta_gw %>%
  spread(genus, mean_weight, fill = 0) %>% #fill = 0：If set, missing values will be replaced with this value.
  head()

# A tibble: 6 x 11
  plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
    <dbl>   <dbl>       <dbl>     <dbl>   <dbl>     <dbl>       <dbl>      <dbl>
1       1    7           22.2      60.2    156.      27.7        9.62       22.2
2       2    6           25.1      55.7    169.      26.9        6.95       22.3
3       3    8.61        24.6      52.0    158.      26.0        7.51       21.4
4       4    0           23.0      57.5    164.      28.1        7.82       22.6
5       5    7.75        18.0      51.1    190.      27.0        8.66       21.2
6       6    0           24.9      58.6    180.      25.9        7.81       21.8
# ... with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
#   Spermophilus <dbl>

#寬表轉長表 
#gather(資料框/寬表,key="主鍵欄位名稱",value="數值欄位名稱",要轉換的資料1,要轉換的資料2,...)
dta_l <- dta_w %>%
  gather(key = genus, value = mean_weight, -plot_id) #-plot_id表示除去plot_id，就只轉置剩下兩列
head(dta_l)

# A tibble: 6 x 3
  plot_id genus   mean_weight
    <dbl> <chr>         <dbl>
1       1 Baiomys        7   
2       2 Baiomys        6   
3       3 Baiomys        8.61
4       4 Baiomys       NA   
5       5 Baiomys        7.75
6       6 Baiomys       NA

glimpse(dta_l) #看一下Rows數是不是有大於Columns數

Rows: 240
Columns: 3
$ plot_id     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,~
$ genus       <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
$ mean_weight <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA, NA~

#指定gather的變數類別Baiomys:Spermophilus
dta_w %>%
  gather(key = genus, value = mean_weight, Baiomys:Spermophilus) %>% 
  head()

# A tibble: 6 x 3
  plot_id genus   mean_weight
    <dbl> <chr>         <dbl>
1       1 Baiomys        7   
2       2 Baiomys        6   
3       3 Baiomys        8.61
4       4 Baiomys       NA   
5       5 Baiomys        7.75
6       6 Baiomys       NA

#丟掉weight、hindfoot_length、sex的遺漏值
dta_complete <- q5dta %>%
  filter(!is.na(weight),           
         !is.na(hindfoot_length),  
         !is.na(sex))
head(dta_complete)

# A tibble: 6 x 13
  record_id month   day  year plot_id species_id sex   hindfoot_length weight
      <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
1       845     5     6  1978       2 NL         M                  32    204
2      1164     8     5  1978       2 NL         M                  34    199
3      1261     9     4  1978       2 NL         M                  32    197
4      1756     4    29  1979       2 NL         M                  33    166
5      1818     5    30  1979       2 NL         M                  32    184
6      1882     7     4  1979       2 NL         M                  32    206
# ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
#   plot_type <chr>

species_counts <- dta_complete %>%
    count(species_id) %>% #依據species_id計算總數
    filter(n >= 50) #篩選出n >= 50

head(species_counts)

# A tibble: 6 x 2
  species_id     n
  <chr>      <int>
1 DM          9727
2 DO          2790
3 DS          2023
4 NL          1045
5 OL           905
6 OT          2081

#The %in% operator in R can be used to identify if an element (e.g., a number) belongs to a vector or dataframe. 

dta_complete <- dta_complete %>%
  filter(species_id %in% species_counts$species_id) 
head(dta_complete)

# A tibble: 6 x 13
  record_id month   day  year plot_id species_id sex   hindfoot_length weight
      <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
1       845     5     6  1978       2 NL         M                  32    204
2      1164     8     5  1978       2 NL         M                  34    199
3      1261     9     4  1978       2 NL         M                  32    197
4      1756     4    29  1979       2 NL         M                  33    166
5      1818     5    30  1979       2 NL         M                  32    184
6      1882     7     4  1979       2 NL         M                  32    206
# ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
#   plot_type <chr>

DataM L5 inclass exercises

Ching-Fang Wu

Sun Nov 07 09:27:15 2021

Q1