Inclass exercise

In-class exercises_2

Merge the two data sets: state.x77{datasets} and USArrests{datasets} and compute all pair-wise correlations for numerical variables. Is there anything interesting to report?

library(datasets)
data(USArrests, package="datasets")
data(state, package="datasets")
USArrests|> head() |> knitr::kable()

	Murder	Assault	UrbanPop	Rape
Alabama	13.2	236	58	21.2
Alaska	10.0	263	48	44.5
Arizona	8.1	294	80	31.0
Arkansas	8.8	190	50	19.5
California	9.0	276	91	40.6
Colorado	7.9	204	78	38.7

state.x77|> head() |> knitr::kable()

	Population	Income	Illiteracy	Life Exp	Murder	HS Grad	Frost	Area
Alabama	3615	3624	2.1	69.05	15.1	41.3	20	50708
Alaska	365	6315	1.5	69.31	11.3	66.7	152	566432
Arizona	2212	4530	1.8	70.55	7.8	58.1	15	113417
Arkansas	2110	3378	1.9	70.66	10.1	39.9	65	51945
California	21198	5114	1.1	71.71	10.3	62.6	20	156361
Colorado	2541	4884	0.7	72.06	6.8	63.9	166	103766

先了解兩個資料的樣態，再決定怎麼合併

str(USArrests) #'data.frame':   50 obs. of  4 variables

'data.frame':   50 obs. of  4 variables:
 $ Murder  : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
 $ Assault : int  236 263 294 190 276 204 110 238 335 211 ...
 $ UrbanPop: int  58 48 80 50 91 78 77 72 80 60 ...
 $ Rape    : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...

str(state.x77) #num [1:50, 1:8]

 num [1:50, 1:8] 3615 365 2212 2110 21198 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
  ..$ : chr [1:8] "Population" "Income" "Illiteracy" "Life Exp" ...

class(USArrests) # class(USArrests)是data.frame

[1] "data.frame"

class(state.x77) # class(state.x77)是"matrix" "array"

[1] "matrix" "array"

state.x77f<-state.x77|>as.data.frame()   #換成data.frame
str(state.x77f) #'data.frame':  50 obs. of  8 variables

'data.frame':   50 obs. of  8 variables:
 $ Population: num  3615 365 2212 2110 21198 ...
 $ Income    : num  3624 6315 4530 3378 5114 ...
 $ Illiteracy: num  2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
 $ Life Exp  : num  69 69.3 70.5 70.7 71.7 ...
 $ Murder    : num  15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
 $ HS Grad   : num  41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
 $ Frost     : num  20 152 15 65 20 166 139 103 11 60 ...
 $ Area      : num  50708 566432 113417 51945 156361 ...

names(USArrests)

[1] "Murder"   "Assault"  "UrbanPop" "Rape"

names(state.x77f) #發現資料column重複的是"Murder"

[1] "Population" "Income"     "Illiteracy" "Life Exp"   "Murder"    
[6] "HS Grad"    "Frost"      "Area"

row.names(USArrests) #Row的名稱

 [1] "Alabama"        "Alaska"         "Arizona"        "Arkansas"      
 [5] "California"     "Colorado"       "Connecticut"    "Delaware"      
 [9] "Florida"        "Georgia"        "Hawaii"         "Idaho"         
[13] "Illinois"       "Indiana"        "Iowa"           "Kansas"        
[17] "Kentucky"       "Louisiana"      "Maine"          "Maryland"      
[21] "Massachusetts"  "Michigan"       "Minnesota"      "Mississippi"   
[25] "Missouri"       "Montana"        "Nebraska"       "Nevada"        
[29] "New Hampshire"  "New Jersey"     "New Mexico"     "New York"      
[33] "North Carolina" "North Dakota"   "Ohio"           "Oklahoma"      
[37] "Oregon"         "Pennsylvania"   "Rhode Island"   "South Carolina"
[41] "South Dakota"   "Tennessee"      "Texas"          "Utah"          
[45] "Vermont"        "Virginia"       "Washington"     "West Virginia" 
[49] "Wisconsin"      "Wyoming"

row.names(state.x77f)#兩個資料的row name是一樣的

 [1] "Alabama"        "Alaska"         "Arizona"        "Arkansas"      
 [5] "California"     "Colorado"       "Connecticut"    "Delaware"      
 [9] "Florida"        "Georgia"        "Hawaii"         "Idaho"         
[13] "Illinois"       "Indiana"        "Iowa"           "Kansas"        
[17] "Kentucky"       "Louisiana"      "Maine"          "Maryland"      
[21] "Massachusetts"  "Michigan"       "Minnesota"      "Mississippi"   
[25] "Missouri"       "Montana"        "Nebraska"       "Nevada"        
[29] "New Hampshire"  "New Jersey"     "New Mexico"     "New York"      
[33] "North Carolina" "North Dakota"   "Ohio"           "Oklahoma"      
[37] "Oregon"         "Pennsylvania"   "Rhode Island"   "South Carolina"
[41] "South Dakota"   "Tennessee"      "Texas"          "Utah"          
[45] "Vermont"        "Virginia"       "Washington"     "West Virginia" 
[49] "Wisconsin"      "Wyoming"

summary(USArrests$Murder)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.800   4.075   7.250   7.788  11.250  17.400

summary(state.x77f$Murder)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.400   4.350   6.850   7.378  10.675  15.100

dta_77USA_1 <- merge(x=state.x77f, y=USArrests, by=c("Murder"))
str(dta_77USA_1)

'data.frame':   15 obs. of  11 variables:
 $ Murder    : num  2.7 3.3 3.3 4.3 5.3 6.8 7.4 8.5 9.7 11.1 ...
 $ Population: num  1058 5814 812 3559 813 ...
 $ Income    : num  3694 4755 4281 4864 4119 ...
 $ Illiteracy: num  0.7 1.1 0.7 0.6 0.6 0.7 0.8 0.9 2.2 0.9 ...
 $ Life Exp  : num  70.4 71.8 71.2 71.7 71.9 ...
 $ HS Grad   : num  54.7 58.5 57.6 63.5 59.5 63.9 53.2 52.3 55.2 52.8 ...
 $ Frost     : num  161 103 174 32 126 166 124 101 120 125 ...
 $ Area      : num  30920 7826 9027 66570 82677 ...
 $ Assault   : int  72 110 110 102 46 161 159 156 109 254 ...
 $ UrbanPop  : int  66 77 77 62 83 60 89 63 52 86 ...
 $ Rape      : num  14.9 11.1 11.1 16.5 20.2 15.6 18.8 20.7 16.3 26.1 ...

dta_77USA_1|> head() |> knitr::kable()

Murder	Population	Income	Illiteracy	Life Exp	HS Grad	Frost	Area	Assault	UrbanPop	Rape
2.7	1058	3694	0.7	70.39	54.7	161	30920	72	66	14.9
3.3	5814	4755	1.1	71.83	58.5	103	7826	110	77	11.1
3.3	812	4281	0.7	71.23	57.6	174	9027	110	77	11.1
4.3	3559	4864	0.6	71.72	63.5	32	66570	102	62	16.5
5.3	813	4119	0.6	71.87	59.5	126	82677	46	83	20.2
6.8	2541	4884	0.7	72.06	63.9	166	103766	161	60	15.6

#只保留有Murder column的資料

dta_77USA_2 <- merge(x=state.x77f, y=USArrests, by="row.names")
str(dta_77USA_2)

'data.frame':   50 obs. of  13 variables:
 $ Row.names : 'AsIs' chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
 $ Population: num  3615 365 2212 2110 21198 ...
 $ Income    : num  3624 6315 4530 3378 5114 ...
 $ Illiteracy: num  2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
 $ Life Exp  : num  69 69.3 70.5 70.7 71.7 ...
 $ Murder.x  : num  15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
 $ HS Grad   : num  41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
 $ Frost     : num  20 152 15 65 20 166 139 103 11 60 ...
 $ Area      : num  50708 566432 113417 51945 156361 ...
 $ Murder.y  : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
 $ Assault   : int  236 263 294 190 276 204 110 238 335 211 ...
 $ UrbanPop  : int  58 48 80 50 91 78 77 72 80 60 ...
 $ Rape      : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...

dta_77USA_2|> head() |> knitr::kable()

Row.names	Population	Income	Illiteracy	Life Exp	Murder.x	HS Grad	Frost	Area	Murder.y	Assault	UrbanPop	Rape
Alabama	3615	3624	2.1	69.05	15.1	41.3	20	50708	13.2	236	58	21.2
Alaska	365	6315	1.5	69.31	11.3	66.7	152	566432	10.0	263	48	44.5
Arizona	2212	4530	1.8	70.55	7.8	58.1	15	113417	8.1	294	80	31.0
Arkansas	2110	3378	1.9	70.66	10.1	39.9	65	51945	8.8	190	50	19.5
California	21198	5114	1.1	71.71	10.3	62.6	20	156361	9.0	276	91	40.6
Colorado	2541	4884	0.7	72.06	6.8	63.9	166	103766	7.9	204	78	38.7

#依據row name(50州)進行合併，因Murder在兩個data framne中重複，因此會產出Murder.x(state.x77f)和Murder.y(USArrests)
sum(is.na(dta_77USA_2))

[1] 0

dta_77USA_3 <- merge(x=state.x77f, y=USArrests, by=c("Murder"), all = T)
# all=T 是用來詢問是否顯示所有資料，如果有資料不全者，沒下 all = T，應該不會出現在合併資料中。
str(dta_77USA_3)

'data.frame':   88 obs. of  11 variables:
 $ Murder    : num  0.8 1.4 1.7 2.1 2.1 2.2 2.2 2.3 2.3 2.4 ...
 $ Population: num  NA 637 681 NA NA ...
 $ Income    : num  NA 5087 4167 NA NA ...
 $ Illiteracy: num  NA 0.8 0.5 NA NA NA NA 0.5 0.6 1.3 ...
 $ Life Exp  : num  NA 72.8 72.1 NA NA ...
 $ HS Grad   : num  NA 50.3 53.3 NA NA NA NA 59 57.6 46.4 ...
 $ Frost     : num  NA 186 172 NA NA NA NA 140 160 127 ...
 $ Area      : num  NA 69273 75955 NA NA ...
 $ Assault   : int  45 NA NA 83 57 56 48 NA NA NA ...
 $ UrbanPop  : int  44 NA NA 51 56 57 32 NA NA NA ...
 $ Rape      : num  7.3 NA NA 7.8 9.5 11.3 11.2 NA NA NA ...

dta_77USA_3|> head() |> knitr::kable()

Murder	Population	Income	Illiteracy	Life Exp	HS Grad	Frost	Area	Assault	UrbanPop	Rape
0.8	NA	NA	NA	NA	NA	NA	NA	45	44	7.3
1.4	637	5087	0.8	72.78	50.3	186	69273	NA	NA	NA
1.7	681	4167	0.5	72.08	53.3	172	75955	NA	NA	NA
2.1	NA	NA	NA	NA	NA	NA	NA	83	51	7.8
2.1	NA	NA	NA	NA	NA	NA	NA	57	56	9.5
2.2	NA	NA	NA	NA	NA	NA	NA	56	57	11.3

#以Murder進行合併，保留所有資料包括NA

sum(is.na(USArrests))  #在USArrests資料中並無NA

[1] 0

sum(is.na(state.x77f))  #在state.x77f資料中並無NA

[1] 0

sum(is.na(dta_77USA_3)) #為何合併後會出現367個NA資料?

[1] 367

dta_77USA_3 <- merge(x=state.x77f, y=USArrests, by="row.names", all = T)
str(dta_77USA_3)

'data.frame':   50 obs. of  13 variables:
 $ Row.names : 'AsIs' chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
 $ Population: num  3615 365 2212 2110 21198 ...
 $ Income    : num  3624 6315 4530 3378 5114 ...
 $ Illiteracy: num  2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
 $ Life Exp  : num  69 69.3 70.5 70.7 71.7 ...
 $ Murder.x  : num  15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
 $ HS Grad   : num  41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
 $ Frost     : num  20 152 15 65 20 166 139 103 11 60 ...
 $ Area      : num  50708 566432 113417 51945 156361 ...
 $ Murder.y  : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
 $ Assault   : int  236 263 294 190 276 204 110 238 335 211 ...
 $ UrbanPop  : int  58 48 80 50 91 78 77 72 80 60 ...
 $ Rape      : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...

dta_77USA_3|> head() |> knitr::kable()

Row.names	Population	Income	Illiteracy	Life Exp	Murder.x	HS Grad	Frost	Area	Murder.y	Assault	UrbanPop	Rape
Alabama	3615	3624	2.1	69.05	15.1	41.3	20	50708	13.2	236	58	21.2
Alaska	365	6315	1.5	69.31	11.3	66.7	152	566432	10.0	263	48	44.5
Arizona	2212	4530	1.8	70.55	7.8	58.1	15	113417	8.1	294	80	31.0
Arkansas	2110	3378	1.9	70.66	10.1	39.9	65	51945	8.8	190	50	19.5
California	21198	5114	1.1	71.71	10.3	62.6	20	156361	9.0	276	91	40.6
Colorado	2541	4884	0.7	72.06	6.8	63.9	166	103766	7.9	204	78	38.7

#依據row names進行合併，row.names也變成一個column(chr)

dta_77USA_4 <- data.frame(x=state.x77f, y=USArrests, by="row.names", all = T)
str(dta_77USA_4)

'data.frame':   50 obs. of  14 variables:
 $ x.Population: num  3615 365 2212 2110 21198 ...
 $ x.Income    : num  3624 6315 4530 3378 5114 ...
 $ x.Illiteracy: num  2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
 $ x.Life.Exp  : num  69 69.3 70.5 70.7 71.7 ...
 $ x.Murder    : num  15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
 $ x.HS.Grad   : num  41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
 $ x.Frost     : num  20 152 15 65 20 166 139 103 11 60 ...
 $ x.Area      : num  50708 566432 113417 51945 156361 ...
 $ y.Murder    : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
 $ y.Assault   : int  236 263 294 190 276 204 110 238 335 211 ...
 $ y.UrbanPop  : int  58 48 80 50 91 78 77 72 80 60 ...
 $ y.Rape      : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
 $ by          : chr  "row.names" "row.names" "row.names" "row.names" ...
 $ all         : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...

dta_77USA_4|> head() |> knitr::kable()

	x.Population	x.Income	x.Illiteracy	x.Life.Exp	x.Murder	x.HS.Grad	x.Frost	x.Area	y.Murder	y.Assault	y.UrbanPop	y.Rape	by	all
Alabama	3615	3624	2.1	69.05	15.1	41.3	20	50708	13.2	236	58	21.2	row.names	TRUE
Alaska	365	6315	1.5	69.31	11.3	66.7	152	566432	10.0	263	48	44.5	row.names	TRUE
Arizona	2212	4530	1.8	70.55	7.8	58.1	15	113417	8.1	294	80	31.0	row.names	TRUE
Arkansas	2110	3378	1.9	70.66	10.1	39.9	65	51945	8.8	190	50	19.5	row.names	TRUE
California	21198	5114	1.1	71.71	10.3	62.6	20	156361	9.0	276	91	40.6	row.names	TRUE
Colorado	2541	4884	0.7	72.06	6.8	63.9	166	103766	7.9	204	78	38.7	row.names	TRUE

#用data.frame進行合併，row.name"不會變成變項"，其他變項會依據來源，編著來自x或來自y

cor(dta_77USA_1)|> knitr::kable(digits = 2)

	Murder	Population	Income	Illiteracy	Life Exp	HS Grad	Frost	Area	Assault	UrbanPop	Rape
Murder	1.00	0.33	-0.10	0.80	-0.74	-0.59	-0.53	0.37	0.81	-0.11	0.72
Population	0.33	1.00	-0.02	0.12	0.05	-0.40	-0.36	-0.09	0.41	0.57	0.56
Income	-0.10	-0.02	1.00	-0.46	0.22	0.66	0.36	0.57	0.28	0.09	0.02
Illiteracy	0.80	0.12	-0.46	1.00	-0.75	-0.68	-0.67	0.18	0.51	-0.35	0.50
Life Exp	-0.74	0.05	0.22	-0.75	1.00	0.66	0.41	-0.21	-0.60	0.25	-0.36
HS Grad	-0.59	-0.40	0.66	-0.68	0.66	1.00	0.60	0.36	-0.37	-0.13	-0.41
Frost	-0.53	-0.36	0.36	-0.67	0.41	0.60	1.00	0.12	-0.28	0.19	-0.46
Area	0.37	-0.09	0.57	0.18	-0.21	0.36	0.12	1.00	0.54	-0.07	0.50
Assault	0.81	0.41	0.28	0.51	-0.60	-0.37	-0.28	0.54	1.00	0.14	0.69
UrbanPop	-0.11	0.57	0.09	-0.35	0.25	-0.13	0.19	-0.07	0.14	1.00	0.24
Rape	0.72	0.56	0.02	0.50	-0.36	-0.41	-0.46	0.50	0.69	0.24	1.00

#cor()做資料的correlation表格， digits為小數點後2位

#cor(dta_77USA_2)|> knitr::kable(digits = 2)
#Error in cor(dta_77USA_2) : 'x' 必須是數值
#資料有非數值型 $Row.names : 'AsIs' chr也不能做相關分析

#cor(dta_77USA_3)|> knitr::kable(digits = 2)
#Error in cor(dta_77USA_3) : 'x' 必須是數值  
#有NA資料無法做相關分析



#cor(dta_77USA_4)|> knitr::kable(digits = 2)
#Error in cor(dta_77USA_4) : 'x' 必須是數值
#資料有非數值型(column有chr $by和$ all)也不能做相關分析

數值呈現相關性不容易閱讀，可以做Corrplot

library(corrplot)
corr <- cor(dta_77USA_1[,1:11])

#參數全部默認情況下的相關係數圖

corrplot(corr = corr)

corr <- cor(dta_77USA_1[,1:11])
corrplot(corr = corr,order="AOE",type="upper",tl.pos="d", tl.cex=0.35, tl.col="black")
corrplot(corr = corr,add=TRUE, type="lower", method="number",order="AOE",diag=FALSE,tl.pos="n", cl.pos="n",number.cex=0.6)

圖形顏色搭配數值，比單純數值更為直觀。

以有興趣的Murder來說：與 UrbanPop、Income、HS Grad、Frost、lifeExp、呈現負相關；與population、Rape、Assault、Area、Illiteracy呈現正相關。

In-class exercises_3

Summarize the backpain{HSAUR3} into the following format: driver suburban case control total no no ? ? ? no yes ? ? ? yes no ? ? ? yes yes ? ? ? You should provide comments for each code chunk.

pacman::p_load(HSAUR3)
data(backpain,package = "HSAUR3")
dta_bp <- backpain
str(dta_bp)  #434 obs. of  4 variables

'data.frame':   434 obs. of  4 variables:
 $ ID      : Factor w/ 217 levels "1","2","3","4",..: 1 1 2 2 3 3 4 4 5 5 ...
 $ status  : Factor w/ 2 levels "case","control": 1 2 1 2 1 2 1 2 1 2 ...
 $ driver  : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 1 1 2 2 ...
 $ suburban: Factor w/ 2 levels "no","yes": 2 1 2 2 1 2 1 1 1 2 ...

head(dta_bp)

  ID  status driver suburban
1  1    case    yes      yes
2  1 control    yes       no
3  2    case    yes      yes
4  2 control    yes      yes
5  3    case    yes       no
6  3 control    yes      yes

dta_bp|> head() |> knitr::kable()

ID	status	driver	suburban
1	case	yes	yes
1	control	yes	no
2	case	yes	yes
2	control	yes	yes
3	case	yes	no
3	control	yes	yes

typeof(dta_bp) #list

[1] "list"

summary(dta_bp)

       ID          status    driver    suburban 
 1      :  2   case   :217   no : 86   no :200  
 2      :  2   control:217   yes:348   yes:234  
 3      :  2                                    
 4      :  2                                    
 5      :  2                                    
 6      :  2                                    
 (Other):422

names(dta_bp)

[1] "ID"       "status"   "driver"   "suburban"

#long data to wide data library(tidyverse)
library(dplyr)

dta_bp %>% dplyr::rename(Group = status, Driver = driver, Suburban = suburban) %>% # column rename

#long data to wide data
library(tidyverse)  #這邊是參考昶濬的作法，完成inclass-5，這部分有比較了解
backpain %>% 
 dplyr::rename(Group = status,   #先修改名字
               Driver = driver, 
               Suburban = suburban)  %>%
  group_by(Driver, Suburban) %>% # 依照Driver和Suburban分組
  tidyr::spread(key="Group", value ="Group") %>% # key(Column names) to value
  summarize(Case = sum(is.na(case)), # sum of case number
            Control = sum(is.na(control)), # sum of control number
            Total = n()) %>% # sum of total
  as.data.frame () %>% #tibble to dataframe
  knitr::kable ()

Driver	Suburban	Case	Control	Total
no	no	38	17	64
no	yes	5	4	11
yes	no	43	44	107
yes	yes	37	58	158

In-class exercises_4

The data set Vocab{carData} gives observations on gender, education and vocabulary, from respondents to U.S. General Social Surveys, 1972-2004. Summarize the relationship between education and vocabulary over the years by gender.

library(carData)
data(Vocab,package = "carData")
str(Vocab)

'data.frame':   30351 obs. of  4 variables:
 $ year      : num  1974 1974 1974 1974 1974 ...
 $ sex       : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 2 2 2 1 1 ...
 $ education : num  14 16 10 10 12 16 17 10 12 11 ...
 $ vocabulary: num  9 9 9 5 8 8 9 5 3 5 ...
 - attr(*, "na.action")= 'omit' Named int [1:32115] 1 2 3 4 5 6 7 8 9 10 ...
  ..- attr(*, "names")= chr [1:32115] "19720001" "19720002" "19720003" "19720004" ...

class(Vocab)

[1] "data.frame"

typeof(Vocab)

[1] "list"

summary(Vocab)

      year          sex          education       vocabulary    
 Min.   :1974   Female:17148   Min.   : 0.00   Min.   : 0.000  
 1st Qu.:1987   Male  :13203   1st Qu.:12.00   1st Qu.: 5.000  
 Median :1994                  Median :12.00   Median : 6.000  
 Mean   :1995                  Mean   :13.03   Mean   : 6.004  
 3rd Qu.:2006                  3rd Qu.:15.00   3rd Qu.: 7.000  
 Max.   :2016                  Max.   :20.00   Max.   :10.000

names(Vocab)

[1] "year"       "sex"        "education"  "vocabulary"

library(lattice)
library(tidyr)
library(dplyr)

Vocab %>% 
mutate(year = as.factor(year)) %>%  #把year變成factor ，若沒有設表格上都會以year呈現
lattice::xyplot(vocabulary ~ education | year, 
         groups=sex, 
         type=c("p","g","r"), 
         data=., 
         xlab="education", 
         ylab="vocabulary score", 
         layout = c(6, 4),  #設定xyplot的圖形排列為6x4
         cex=0.3, #縮小男生女生在表格中的點
         auto.key=list(columns=2))

In-class exercises_5

Supply comments to each code chunk in the following survey rmarkdown file and preview it as an R notebook or knit to html.

The data set concerns species and weight of animals caught in plots in a study area in Arizona over time.

Each row holds information for a single animal, and the columns represent:

record_id: Unique id for the observation
month: month of observation
day: day of observation
year: year of observation
plot_id: ID of a particular plot
species_id: 2-letter code
sex: sex of animal (“M”, “F”)
hindfoot_length: length of the hindfoot in mm
weight: weight of the animal in grams
genus: genus of animal
species: species of animal
taxa: e.g. Rodent, Reptile, Bird, Rabbit
plot_type: type of plot

pacman::p_load(tidyverse)

dta <- read_csv("http://kbroman.org/datacarp/portal_data_joined.csv")
#讀csv檔，存成dta

glimpse(dta)

Rows: 34,786
Columns: 13
$ record_id       <dbl> 1, 72, 224, 266, 349, 363, 435, 506, 588, 661, 748, 84~
$ month           <dbl> 7, 8, 9, 10, 11, 11, 12, 1, 2, 3, 4, 5, 6, 8, 9, 10, 1~
$ day             <dbl> 16, 19, 13, 16, 12, 12, 10, 8, 18, 11, 8, 6, 9, 5, 4, ~
$ year            <dbl> 1977, 1977, 1977, 1977, 1977, 1977, 1977, 1978, 1978, ~
$ plot_id         <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
$ species_id      <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", ~
$ sex             <chr> "M", "M", NA, NA, NA, NA, NA, NA, "M", NA, NA, "M", "M~
$ hindfoot_length <dbl> 32, 31, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, NA, 34~
$ weight          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 218, NA, NA, 204, 200,~
$ genus           <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Neotoma",~
$ species         <chr> "albigula", "albigula", "albigula", "albigula", "albig~
$ taxa            <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent", "Rod~
$ plot_type       <chr> "Control", "Control", "Control", "Control", "Control",~

#總覽資料
typeof(dta)

[1] "list"

##資料型態
class(dta)

[1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame"

#資料型態

dim(dta)

[1] 34786    13

#資料的rowx column=34,786 x 13

dplyr::select(dta, plot_id, species_id, weight) %>% head()

# A tibble: 6 x 3
  plot_id species_id weight
    <dbl> <chr>       <dbl>
1       2 NL             NA
2       2 NL             NA
3       2 NL             NA
4       2 NL             NA
5       2 NL             NA
6       2 NL             NA

#選資料的plot_id, species_id, weight做表格，列出前6項

dplyr::select(dta, -record_id, -species_id) %>% head()

# A tibble: 6 x 11
  month   day  year plot_id sex   hindfoot_length weight genus   species  taxa  
  <dbl> <dbl> <dbl>   <dbl> <chr>           <dbl>  <dbl> <chr>   <chr>    <chr> 
1     7    16  1977       2 M                  32     NA Neotoma albigula Rodent
2     8    19  1977       2 M                  31     NA Neotoma albigula Rodent
3     9    13  1977       2 <NA>               NA     NA Neotoma albigula Rodent
4    10    16  1977       2 <NA>               NA     NA Neotoma albigula Rodent
5    11    12  1977       2 <NA>               NA     NA Neotoma albigula Rodent
6    11    12  1977       2 <NA>               NA     NA Neotoma albigula Rodent
# ... with 1 more variable: plot_type <chr>

#選資料的"除了" record_id, species_idt以外的資料做表格，列出前6項

year1995<-dplyr::filter(dta, year == 1995) %>% head()
#以year當過濾，篩選出year==1995的資料
str(year1995)   #column數沒有變，只剩下6筆資料

tibble [6 x 13] (S3: tbl_df/tbl/data.frame)
 $ record_id      : num [1:6] 22314 22728 22899 23032 22003 ...
 $ month          : num [1:6] 6 9 10 12 1 2
 $ day            : num [1:6] 7 23 28 2 11 4
 $ year           : num [1:6] 1995 1995 1995 1995 1995 ...
 $ plot_id        : num [1:6] 2 2 2 2 2 2
 $ species_id     : chr [1:6] "NL" "NL" "NL" "NL" ...
 $ sex            : chr [1:6] "M" "F" "F" "F" ...
 $ hindfoot_length: num [1:6] 34 32 32 33 37 36
 $ weight         : num [1:6] NA 165 171 NA 41 45
 $ genus          : chr [1:6] "Neotoma" "Neotoma" "Neotoma" "Neotoma" ...
 $ species        : chr [1:6] "albigula" "albigula" "albigula" "albigula" ...
 $ taxa           : chr [1:6] "Rodent" "Rodent" "Rodent" "Rodent" ...
 $ plot_type      : chr [1:6] "Control" "Control" "Control" "Control" ...

summary(year1995$year)  #確定一下year1995$year的樣貌

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1995    1995    1995    1995    1995    1995

year1995 #列出year1995所有資料，確實只有6筆

# A tibble: 6 x 13
  record_id month   day  year plot_id species_id sex   hindfoot_length weight
      <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
1     22314     6     7  1995       2 NL         M                  34     NA
2     22728     9    23  1995       2 NL         F                  32    165
3     22899    10    28  1995       2 NL         F                  32    171
4     23032    12     2  1995       2 NL         F                  33     NA
5     22003     1    11  1995       2 DM         M                  37     41
6     22042     2     4  1995       2 DM         F                  36     45
# ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
#   plot_type <chr>

weight5<-dplyr::select(dplyr::filter(dta, weight <= 5), species_id, sex, weight)
#選擇weight<=5的資料，只列出species_id, sex, weight
str(weight5)

tibble [75 x 3] (S3: tbl_df/tbl/data.frame)
 $ species_id: chr [1:75] "PF" "PF" "PF" "PF" ...
 $ sex       : chr [1:75] "M" "F" "F" "F" ...
 $ weight    : num [1:75] 5 5 5 4 5 4 5 5 5 5 ...

head(dplyr::select(dplyr::filter(dta, weight <= 5), species_id, sex, weight))

# A tibble: 6 x 3
  species_id sex   weight
  <chr>      <chr>  <dbl>
1 PF         M          5
2 PF         F          5
3 PF         F          5
4 PF         F          4
5 PF         F          5
6 PF         F          4

#head在前面跟在後面沒有差別，都是列出前6個row
dplyr::select(dplyr::filter(dta, weight <= 5), species_id, sex, weight)%>% head()

# A tibble: 6 x 3
  species_id sex   weight
  <chr>      <chr>  <dbl>
1 PF         M          5
2 PF         F          5
3 PF         F          5
4 PF         F          4
5 PF         F          5
6 PF         F          4

dta %>% 
  dplyr::filter(weight <= 5) %>% 
  dplyr::select(species_id, sex, weight) %>% 
  head

# A tibble: 6 x 3
  species_id sex   weight
  <chr>      <chr>  <dbl>
1 PF         M          5
2 PF         F          5
3 PF         F          5
4 PF         F          4
5 PF         F          5
6 PF         F          4

#跟前面一樣，只是語法不同

dta %>% 
  mutate(weight_kg = weight / 1000,   
#mutate新增衍生變數weight_kg= weight / 1000
#mutate新增衍生變數weight_lb = weight_kg * 2.2
         weight_lb = weight_kg * 2.2) %>% 
  head()

# A tibble: 6 x 15
  record_id month   day  year plot_id species_id sex   hindfoot_length weight
      <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
1         1     7    16  1977       2 NL         M                  32     NA
2        72     8    19  1977       2 NL         M                  31     NA
3       224     9    13  1977       2 NL         <NA>               NA     NA
4       266    10    16  1977       2 NL         <NA>               NA     NA
5       349    11    12  1977       2 NL         <NA>               NA     NA
6       363    11    12  1977       2 NL         <NA>               NA     NA
# ... with 6 more variables: genus <chr>, species <chr>, taxa <chr>,
#   plot_type <chr>, weight_kg <dbl>, weight_lb <dbl>

#原來有13個，再加上新增的weight_kg、weight_lb，共有15個

dta %>% 
  filter(!is.na(weight)) %>%
#filter()篩選符合條件的觀測值   
#filter(!is.na)篩選出weight非NA的值(!is.na)
  group_by(sex, species_id) %>%
#group_by()依照類別變數分組，常搭配 summarize()
#依照類別sex, species_id分組
  summarize(mean_weight = mean(weight)) %>%
#summarise()聚合變數  
#summarize(mean_weight = mean(weight))  
#依照group分組後，創一個mean_weight,帶入mean(weight)
  arrange(desc(mean_weight)) %>% 
#arrange()依照變數排序觀測值   
#arrange(desc(mean_weight)) mean_weight以descending的方式
  head()

# A tibble: 6 x 3
# Groups:   sex [3]
  sex   species_id mean_weight
  <chr> <chr>            <dbl>
1 <NA>  NL                168.
2 M     NL                166.
3 F     NL                154.
4 M     SS                130 
5 <NA>  SH                130 
6 M     DS                122.

#列出前6筆資料，可以觀察到sex和species_id都是不同的組合，其中NA值也會納入

dta %>%
  group_by(sex) %>%
#依性別分組
  tally

# A tibble: 3 x 2
  sex       n
  <chr> <int>
1 F     15690
2 M     17348
3 <NA>   1748

#依分組類別資料計算個數，包括NA也會統計個數
table(dta$sex)


    F     M 
15690 17348

#table不會計算NA個數

dta_count<-dta %>%
  count(sex)
#跟tally功能一樣
str(dta_count)

spec_tbl_df [3 x 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ sex: chr [1:3] "F" "M" NA
 $ n  : int [1:3] 15690 17348 1748
 - attr(*, "spec")=
  .. cols(
  ..   record_id = col_double(),
  ..   month = col_double(),
  ..   day = col_double(),
  ..   year = col_double(),
  ..   plot_id = col_double(),
  ..   species_id = col_character(),
  ..   sex = col_character(),
  ..   hindfoot_length = col_double(),
  ..   weight = col_double(),
  ..   genus = col_character(),
  ..   species = col_character(),
  ..   taxa = col_character(),
  ..   plot_type = col_character()
  .. )
 - attr(*, "problems")=<externalptr>

dta_count<-dta %>%
  group_by(sex) %>%
  summarize(count = n())
#按照sex分組  
#summarize(計數=n)  
#依照group分組後，創一個count,帶入分組的n
str(dta_count)

tibble [3 x 2] (S3: tbl_df/tbl/data.frame)
 $ sex  : chr [1:3] "F" "M" NA
 $ count: int [1:3] 15690 17348 1748

dta_year<-dta %>%
  group_by(sex) %>%
  summarize(count = sum(!is.na(year)))
#按照sex分組  
#summarize(計數=n但排除year是na的)  
#依照group分組後，創一個count,帶入分組的n
str(dta_year)

tibble [3 x 2] (S3: tbl_df/tbl/data.frame)
 $ sex  : chr [1:3] "F" "M" NA
 $ count: int [1:3] 15690 17348 1748

dta_gw <- dta %>% 
  filter(!is.na(weight)) %>%
#篩選排除weight為NA的值
  group_by(genus, plot_id) %>%
#依genus, plot_id分組
  summarize(mean_weight = mean(weight))
#依照group分組後，創一個mean_weight,帶入mean(weight)
head(dta_gw)

# A tibble: 6 x 3
# Groups:   genus [1]
  genus   plot_id mean_weight
  <chr>     <dbl>       <dbl>
1 Baiomys       1        7   
2 Baiomys       2        6   
3 Baiomys       3        8.61
4 Baiomys       5        7.75
5 Baiomys      18        9.5 
6 Baiomys      19        9.53

#列出前6筆資料看一下

glimpse(dta_gw)

Rows: 196
Columns: 3
Groups: genus [10]
$ genus       <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
$ plot_id     <dbl> 1, 2, 3, 5, 18, 19, 20, 21, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,~
$ mean_weight <dbl> 7.000000, 6.000000, 8.611111, 7.750000, 9.500000, 9.533333~

#總覽資料，剩下196 row x 3 column
#column:上述篩選的genus, plot_id和新創建的mean_weight

table(dta_gw$genus)


        Baiomys     Chaetodipus       Dipodomys         Neotoma       Onychomys 
              8              24              24              24              24 
    Perognathus      Peromyscus Reithrodontomys        Sigmodon    Spermophilus 
             23              24              24              19               2

dta_w <- dta_gw %>%
  spread(key = genus, value = mean_weight)
#spread用來擴展表，把某一列的值（鍵值對）分開拆成多列
#通常是long data要變成wide data
#key=是原來要拆的column，value=是拆出來的那些列的值應該填什麼
#把genus的變成wide data，並且填入mean_weight
head(dta_w)

# A tibble: 6 x 11
  plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
    <dbl>   <dbl>       <dbl>     <dbl>   <dbl>     <dbl>       <dbl>      <dbl>
1       1    7           22.2      60.2    156.      27.7        9.62       22.2
2       2    6           25.1      55.7    169.      26.9        6.95       22.3
3       3    8.61        24.6      52.0    158.      26.0        7.51       21.4
4       4   NA           23.0      57.5    164.      28.1        7.82       22.6
5       5    7.75        18.0      51.1    190.      27.0        8.66       21.2
6       6   NA           24.9      58.6    180.      25.9        7.81       21.8
# ... with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
#   Spermophilus <dbl>

head(dta_w)

# A tibble: 6 x 11
  plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
    <dbl>   <dbl>       <dbl>     <dbl>   <dbl>     <dbl>       <dbl>      <dbl>
1       1    7           22.2      60.2    156.      27.7        9.62       22.2
2       2    6           25.1      55.7    169.      26.9        6.95       22.3
3       3    8.61        24.6      52.0    158.      26.0        7.51       21.4
4       4   NA           23.0      57.5    164.      28.1        7.82       22.6
5       5    7.75        18.0      51.1    190.      27.0        8.66       21.2
6       6   NA           24.9      58.6    180.      25.9        7.81       21.8
# ... with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
#   Spermophilus <dbl>

glimpse(dta_w)

Rows: 24
Columns: 11
$ plot_id         <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,~
$ Baiomys         <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA~
$ Chaetodipus     <dbl> 22.19939, 25.11014, 24.63636, 23.02381, 17.98276, 24.8~
$ Dipodomys       <dbl> 60.23214, 55.68259, 52.04688, 57.52454, 51.11356, 58.6~
$ Neotoma         <dbl> 156.2222, 169.1436, 158.2414, 164.1667, 190.0370, 179.~
$ Onychomys       <dbl> 27.67550, 26.87302, 26.03241, 28.09375, 27.01695, 25.8~
$ Perognathus     <dbl> 9.625000, 6.947368, 7.507812, 7.824427, 8.658537, 7.80~
$ Peromyscus      <dbl> 22.22222, 22.26966, 21.37037, 22.60000, 21.23171, 21.8~
$ Reithrodontomys <dbl> 11.375000, 10.680556, 10.516588, 10.263158, 11.154545,~
$ Sigmodon        <dbl> NA, 70.85714, 65.61404, 82.00000, 82.66667, 68.77778, ~
$ Spermophilus    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 13~

#從table(dta_gw$genus)可看的出來，genus共有10種，最多的資料是24筆
#多一個column是原來的plot_id

dta_gw %>%
  spread(genus, mean_weight, fill = 0) %>%
  head()

# A tibble: 6 x 11
  plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
    <dbl>   <dbl>       <dbl>     <dbl>   <dbl>     <dbl>       <dbl>      <dbl>
1       1    7           22.2      60.2    156.      27.7        9.62       22.2
2       2    6           25.1      55.7    169.      26.9        6.95       22.3
3       3    8.61        24.6      52.0    158.      26.0        7.51       21.4
4       4    0           23.0      57.5    164.      28.1        7.82       22.6
5       5    7.75        18.0      51.1    190.      27.0        8.66       21.2
6       6    0           24.9      58.6    180.      25.9        7.81       21.8
# ... with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
#   Spermophilus <dbl>

#spread同上面
#這次用0取代NA所有資料

dta_l <- dta_w %>%
  gather(key = genus, value = mean_weight, -plot_id)
head(dta_l)

# A tibble: 6 x 3
  plot_id genus   mean_weight
    <dbl> <chr>         <dbl>
1       1 Baiomys        7   
2       2 Baiomys        6   
3       3 Baiomys        8.61
4       4 Baiomys       NA   
5       5 Baiomys        7.75
6       6 Baiomys       NA

#gather:wide data再變回long data
#依照genus集合
#刪去"plot_id"這個column

glimpse(dta_l)

Rows: 240
Columns: 3
$ plot_id     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,~
$ genus       <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
$ mean_weight <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA, NA~

#為什麼前面-plot_id，這邊glimpse後，plot_id依然在?

dta_w_Bs <- dta_w %>%
  gather(key = genus, value = mean_weight, Baiomys:Spermophilus) 
#只gather的genus變數中的Baiomys到Spermophilus類別
head(dta_w_Bs)

# A tibble: 6 x 3
  plot_id genus   mean_weight
    <dbl> <chr>         <dbl>
1       1 Baiomys        7   
2       2 Baiomys        6   
3       3 Baiomys        8.61
4       4 Baiomys       NA   
5       5 Baiomys        7.75
6       6 Baiomys       NA

tail(dta_w_Bs)

# A tibble: 6 x 3
  plot_id genus        mean_weight
    <dbl> <chr>              <dbl>
1      19 Spermophilus          NA
2      20 Spermophilus          57
3      21 Spermophilus          NA
4      22 Spermophilus          NA
5      23 Spermophilus          NA
6      24 Spermophilus          NA

glimpse(dta_w_Bs)

Rows: 240
Columns: 3
$ plot_id     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,~
$ genus       <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
$ mean_weight <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA, NA~

table(dta_w_Bs$genus)


        Baiomys     Chaetodipus       Dipodomys         Neotoma       Onychomys 
             24              24              24              24              24 
    Perognathus      Peromyscus Reithrodontomys        Sigmodon    Spermophilus 
             24              24              24              24              24

#確認genus變數中是真的從Baiomys到Spermophilus類別

dta_complete <- dta %>%
  filter(!is.na(weight),           
         !is.na(hindfoot_length),  
         !is.na(sex)) 
#篩選資料，排除weight有NA的、hindfoot_length有NA的、sex
glimpse(dta_complete)

Rows: 30,676
Columns: 13
$ record_id       <dbl> 845, 1164, 1261, 1756, 1818, 1882, 2133, 2184, 2406, 3~
$ month           <dbl> 5, 8, 9, 4, 5, 7, 10, 11, 1, 5, 5, 7, 10, 11, 1, 2, 3,~
$ day             <dbl> 6, 5, 4, 29, 30, 4, 25, 17, 16, 18, 18, 8, 1, 23, 25, ~
$ year            <dbl> 1978, 1978, 1978, 1979, 1979, 1979, 1979, 1979, 1980, ~
$ plot_id         <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
$ species_id      <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", ~
$ sex             <chr> "M", "M", "M", "M", "M", "M", "F", "F", "F", "F", "F",~
$ hindfoot_length <dbl> 32, 34, 32, 33, 32, 32, 33, 30, 33, 31, 33, 30, 34, 34~
$ weight          <dbl> 204, 199, 197, 166, 184, 206, 274, 186, 184, 87, 174, ~
$ genus           <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Neotoma",~
$ species         <chr> "albigula", "albigula", "albigula", "albigula", "albig~
$ taxa            <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent", "Rod~
$ plot_type       <chr> "Control", "Control", "Control", "Control", "Control",~

species_counts <- dta_complete %>%
    count(species_id) %>% 
#用排除NA的資料dta_complete，計數species_id
    filter(n >= 50)
#只列出n>=50的資料
glimpse(species_counts)

Rows: 14
Columns: 2
$ species_id <chr> "DM", "DO", "DS", "NL", "OL", "OT", "PB", "PE", "PF", "PM",~
$ n          <int> 9727, 2790, 2023, 1045, 905, 2081, 2803, 1198, 1469, 835, 2~

dta_complete %>%
#刪除filter(n >= 50)看看發生甚麼變化
    count(species_id) %>% 
#用排除NA的資料dta_complete，計數species_id
glimpse()

Rows: 24
Columns: 2
$ species_id <chr> "BA", "DM", "DO", "DS", "NL", "OL", "OT", "OX", "PB", "PE",~
$ n          <int> 45, 9727, 2790, 2023, 1045, 905, 2081, 5, 2803, 1198, 1469,~

#資料果然筆前一筆更多

summary(species_counts$species_id)

   Length     Class      Mode 
       14 character character

dta_complete <- dta_complete %>%
  filter(species_id %in% species_counts$species_id)
#判斷某個值(或向量)，是否存在於另一個向量之中，會使用 %in% 的符號
#species_id是不是有在species_counts$species_id裡面?
glimpse(dta_complete)

Rows: 30,463
Columns: 13
$ record_id       <dbl> 845, 1164, 1261, 1756, 1818, 1882, 2133, 2184, 2406, 3~
$ month           <dbl> 5, 8, 9, 4, 5, 7, 10, 11, 1, 5, 5, 7, 10, 11, 1, 2, 3,~
$ day             <dbl> 6, 5, 4, 29, 30, 4, 25, 17, 16, 18, 18, 8, 1, 23, 25, ~
$ year            <dbl> 1978, 1978, 1978, 1979, 1979, 1979, 1979, 1979, 1980, ~
$ plot_id         <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
$ species_id      <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", ~
$ sex             <chr> "M", "M", "M", "M", "M", "M", "F", "F", "F", "F", "F",~
$ hindfoot_length <dbl> 32, 34, 32, 33, 32, 32, 33, 30, 33, 31, 33, 30, 34, 34~
$ weight          <dbl> 204, 199, 197, 166, 184, 206, 274, 186, 184, 87, 174, ~
$ genus           <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Neotoma",~
$ species         <chr> "albigula", "albigula", "albigula", "albigula", "albig~
$ taxa            <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent", "Rod~
$ plot_type       <chr> "Control", "Control", "Control", "Control", "Control",~

head(dta_complete)

# A tibble: 6 x 13
  record_id month   day  year plot_id species_id sex   hindfoot_length weight
      <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
1       845     5     6  1978       2 NL         M                  32    204
2      1164     8     5  1978       2 NL         M                  34    199
3      1261     9     4  1978       2 NL         M                  32    197
4      1756     4    29  1979       2 NL         M                  33    166
5      1818     5    30  1979       2 NL         M                  32    184
6      1882     7     4  1979       2 NL         M                  32    206
# ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
#   plot_type <chr>

Inclass exercise

Ruei-Ning Su

2021-11-07

In-class exercises_2

In-class exercises_3

In-class exercises_4

In-class exercises_5