In class 1

“Children, Classrooms, Schools”

#目錄設定說明

- html_document：輸出（output）格式為 HTML
- toc: true：顯示目錄，標題可點選
- toc_depth: 2：目錄顯示的標題層級，這裡設定為 2 層
- toc_float：在左側顯示浮動的標題點選視窗；這裡沒有設定，表示不用浮動視窗
- collapsed: false：是否要展開層級
- smooth_scroll: false：點選標題時是否以平滑捲動的方式顯示
- theme: readable：選擇需要的 theme
- highlight: pygments：設定語法的顯示模式（有 default, tango, pygments, kate, monochrome, espresso, zenburn, haddock, textmate 可選）

#Global options

- echo = TRUE：顯示程式碼
- cache = FALSE：把快取（緩衝儲存）關掉
- prompt = FALSE：網路上說是把程式碼最前面的 R 提示符號（大於符號 >）關掉；但實際上不管設為 TRUE 或 FALSE，在 Rmarkdown 中都還是會出現 >
- tidy = FALSE：不自動排列程式碼；若自動排列，程式碼會被重新斷行呈現
- comment = NA：設定輸出結果每一行前面的前綴字元，NA 表示不加前綴；也可以寫成 comment = ''（在 prompt = TRUE 時可搭配使用）
- message = FALSE：訊息不要在文件的程式欄位顯示，只在 console 出現
- warning = FALSE：程式碼的警告訊息不要在文件的程式欄位顯示，只在 console 出現
- fig.width = 6：設定圖形寬度
- fig.height = 4：設定圖形高度

#Data

> #在Rmarkdown下載WWGbook
> pacman::p_load(WWGbook)
> data(classroom, package="WWGbook")

> #查看classroom是什麼，這裡是data
> ?classroom

> #顯示資料類型和結構
> str(classroom)

'data.frame':   1190 obs. of  12 variables:
 $ sex     : int  1 0 1 0 0 1 0 0 1 0 ...
 $ minority: int  1 1 1 1 1 1 1 1 1 1 ...
 $ mathkind: int  448 460 511 449 425 450 452 443 422 480 ...
 $ mathgain: int  32 109 56 83 53 65 51 66 88 -7 ...
 $ ses     : num  0.46 -0.27 -0.03 -0.38 -0.03 0.76 -0.03 0.2 0.64 0.13 ...
 $ yearstea: num  1 1 1 2 2 2 2 2 2 2 ...
 $ mathknow: num  NA NA NA -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 ...
 $ housepov: num  0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 ...
 $ mathprep: num  2 2 2 3.25 3.25 3.25 3.25 3.25 3.25 3.25 ...
 $ classid : int  160 160 160 217 217 217 217 217 217 217 ...
 $ schoolid: int  1 1 1 1 1 1 1 1 1 1 ...
 $ childid : int  1 2 3 4 5 6 7 8 9 10 ...

#Split data

> # School-level table: keep the first row seen for each schoolid and only
> # the columns schoolid and housepov.
> dta_schl <- classroom[!duplicated(classroom$schoolid),
+                       c("schoolid", "housepov")]

> # Classroom-level table: keep the first row seen for each classid and the
> # columns schoolid, classid, yearstea, mathknow, mathprep (positions
> # 11, 10, 6, 7, 9 of classroom), in that order.
> dta_cls <- classroom[!duplicated(classroom$classid),
+                      c("schoolid", "classid", "yearstea", "mathknow", "mathprep")]

> # Child-level table: all rows, with childid, classid, schoolid first
> # (positions 12, 10, 11) followed by columns 1 to 5.
> dta_chld <- classroom[, c("childid", "classid", "schoolid",
+                           "sex", "minority", "mathkind", "mathgain", "ses")]

#呈現ID數量與欄位數

> # Apply dim() to each of the three tables. sapply() works like lapply() but
> # simplifies the result: here the (rows, columns) pairs are bound into a
> # matrix with one column per table. dim is the function being applied, so
> # it cannot be left out.
> sapply(X = list(dta_schl, dta_cls, dta_chld),
+        FUN = dim)

     [,1] [,2] [,3]
[1,]  107  312 1190
[2,]    2    5    8

#Combine data

> # Child + classroom: join on the two keys the tables share.
> dta_12 <- merge(dta_chld, dta_cls, by = c("classid", "schoolid"))

> # Exactly the same call as dta_12, so it gives the same table.
> dta_13 <- merge(dta_chld, dta_cls, by = c("classid", "schoolid"))

> # Classroom + school: join on schoolid.
> dta_23 <- merge(dta_cls, dta_schl, by = "schoolid")

> # A single key may also be wrapped in c().
> dta_24 <- merge(dta_cls, dta_schl, by = c("schoolid"))

> # Child + classroom + school: add the school columns to dta_12.
> dta_123 <- merge(dta_12, dta_schl, by = c("schoolid"))

#確認過dta_23, dta_24結果是一樣的，當合併一個變項時，c可用可不用

> sapply(list(dta_12, dta_13, dta_23, dta_24,dta_123), dim)

     [,1] [,2] [,3] [,4] [,5]
[1,] 1190 1190  312  312 1190
[2,]   11   11    6    6   12

In class 2

> str(state.x77)

 num [1:50, 1:8] 3615 365 2212 2110 21198 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
  ..$ : chr [1:8] "Population" "Income" "Illiteracy" "Life Exp" ...

> head(state.x77)

           Population Income Illiteracy Life Exp Murder HS Grad Frost   Area
Alabama          3615   3624        2.1    69.05   15.1    41.3    20  50708
Alaska            365   6315        1.5    69.31   11.3    66.7   152 566432
Arizona          2212   4530        1.8    70.55    7.8    58.1    15 113417
Arkansas         2110   3378        1.9    70.66   10.1    39.9    65  51945
California      21198   5114        1.1    71.71   10.3    62.6    20 156361
Colorado         2541   4884        0.7    72.06    6.8    63.9   166 103766

> str(USArrests)

'data.frame':   50 obs. of  4 variables:
 $ Murder  : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
 $ Assault : int  236 263 294 190 276 204 110 238 335 211 ...
 $ UrbanPop: int  58 48 80 50 91 78 77 72 80 60 ...
 $ Rape    : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...

> head(USArrests)

           Murder Assault UrbanPop Rape
Alabama      13.2     236       58 21.2
Alaska       10.0     263       48 44.5
Arizona       8.1     294       80 31.0
Arkansas      8.8     190       50 19.5
California    9.0     276       91 40.6
Colorado      7.9     204       78 38.7

> dta <- merge(x=state.x77, y=USArrests, by="row.names", all=TRUE)

> head(dta)

   Row.names Population Income Illiteracy Life Exp Murder.x HS Grad Frost
1    Alabama       3615   3624        2.1    69.05     15.1    41.3    20
2     Alaska        365   6315        1.5    69.31     11.3    66.7   152
3    Arizona       2212   4530        1.8    70.55      7.8    58.1    15
4   Arkansas       2110   3378        1.9    70.66     10.1    39.9    65
5 California      21198   5114        1.1    71.71     10.3    62.6    20
6   Colorado       2541   4884        0.7    72.06      6.8    63.9   166
    Area Murder.y Assault UrbanPop Rape
1  50708     13.2     236       58 21.2
2 566432     10.0     263       48 44.5
3 113417      8.1     294       80 31.0
4  51945      8.8     190       50 19.5
5 156361      9.0     276       91 40.6
6 103766      7.9     204       78 38.7

> View(dta)

> library(tidyverse)

#此時非矩陣，用kable看資料

> cor(dta[,-1])|> knitr::kable(digits = 2)

	Population	Income	Illiteracy	Life Exp	Murder.x	HS Grad	Frost	Area	Murder.y	Assault	UrbanPop	Rape
Population	1.00	0.21	0.11	-0.07	0.34	-0.10	-0.33	0.02	0.32	0.32	0.51	0.31
Income	0.21	1.00	-0.44	0.34	-0.23	0.62	0.23	0.36	-0.22	0.04	0.48	0.36
Illiteracy	0.11	-0.44	1.00	-0.59	0.70	-0.66	-0.67	0.08	0.71	0.51	-0.06	0.15
Life Exp	-0.07	0.34	-0.59	1.00	-0.78	0.58	0.26	-0.11	-0.78	-0.63	0.27	-0.27
Murder.x	0.34	-0.23	0.70	-0.78	1.00	-0.49	-0.54	0.23	0.93	0.74	0.02	0.58
HS Grad	-0.10	0.62	-0.66	0.58	-0.49	1.00	0.37	0.33	-0.52	-0.23	0.36	0.27
Frost	-0.33	0.23	-0.67	0.26	-0.54	0.37	1.00	0.06	-0.54	-0.47	-0.25	-0.28
Area	0.02	0.36	0.08	-0.11	0.23	0.33	0.06	1.00	0.15	0.23	-0.06	0.52
Murder.y	0.32	-0.22	0.71	-0.78	0.93	-0.52	-0.54	0.15	1.00	0.80	0.07	0.56
Assault	0.32	0.04	0.51	-0.63	0.74	-0.23	-0.47	0.23	0.80	1.00	0.26	0.67
UrbanPop	0.51	0.48	-0.06	0.27	0.02	0.36	-0.25	-0.06	0.07	0.26	1.00	0.41
Rape	0.31	0.36	0.15	-0.27	0.58	0.27	-0.28	0.52	0.56	0.67	0.41	1.00

> library(corrplot)
> library(RColorBrewer)

#Correlation Matrix：for calculating and visualizing easily a correlation matrix。使用 rquery.cormat() 可以：計算相關矩陣、取得相關矩陣的上三角或完整相關矩陣、更改相關圖的顏色、繪製熱圖

> # Define rquery.cormat() by sourcing its script from sthda.com.
> # NOTE(review): this runs remote code fetched over plain HTTP; consider
> # keeping a reviewed local copy of the script instead.
> source("http://www.sthda.com/upload/rquery_cormat.r")
> # corrplot was already attached by library(corrplot) above.
> require("corrplot")
> # NOTE(review): the argument is a correlation matrix, not the data itself.
> # The $r values printed below differ from the kable() correlations above
> # (e.g. Income vs HS Grad: 0.89 here, 0.62 there), which suggests
> # rquery.cormat() correlates its input again. Passing dta[,-1] is
> # presumably what was intended; confirm against the helper's documentation.
> rquery.cormat(cor(dta[,-1]))

$r
           Income HS Grad Life Exp Frost Population UrbanPop  Area Rape
Income          1                                                      
HS Grad      0.89       1                                              
Life Exp     0.73    0.88        1                                     
Frost         0.6    0.77     0.74     1                               
Population  -0.11    -0.4    -0.36 -0.66          1                    
UrbanPop     0.53    0.35     0.34 -0.17       0.57        1           
Area         0.24    0.18    -0.18  0.04      -0.18    -0.25     1     
Rape       -0.061   -0.28    -0.62 -0.63       0.48     0.26  0.52    1
Illiteracy  -0.81   -0.93    -0.92  -0.9       0.42    -0.22 0.038 0.52
Assault     -0.52   -0.74    -0.91 -0.86       0.55    0.024   0.2 0.82
Murder.x    -0.69   -0.86    -0.97 -0.87       0.51    -0.15  0.17 0.72
Murder.y    -0.69   -0.87    -0.97 -0.88       0.52    -0.13  0.13 0.71
           Illiteracy Assault Murder.x Murder.y
Income                                         
HS Grad                                        
Life Exp                                       
Frost                                          
Population                                     
UrbanPop                                       
Area                                           
Rape                                           
Illiteracy          1                          
Assault          0.87       1                  
Murder.x         0.94    0.96        1         
Murder.y         0.95    0.97        1        1

$p
            Income HS Grad Life Exp   Frost Population UrbanPop  Area    Rape
Income           0                                                           
HS Grad    8.4e-05       0                                                   
Life Exp    0.0067 0.00015        0                                          
Frost         0.04  0.0035   0.0055       0                                  
Population    0.73     0.2     0.25   0.021          0                       
UrbanPop     0.078    0.27     0.29     0.6      0.054        0              
Area          0.45    0.57     0.57     0.9       0.57     0.43     0        
Rape          0.85    0.38    0.032    0.03       0.12     0.42 0.081       0
Illiteracy  0.0014   1e-05  2.8e-05 5.4e-05       0.17     0.49  0.91   0.082
Assault      0.081  0.0057  3.5e-05 0.00036      0.064     0.94  0.53 0.00097
Murder.x     0.014 0.00032  2.5e-07 0.00024      0.089     0.63  0.59  0.0083
Murder.y     0.014 0.00025  3.4e-07 0.00018      0.084      0.7  0.69  0.0092
           Illiteracy Assault Murder.x Murder.y
Income                                         
HS Grad                                        
Life Exp                                       
Frost                                          
Population                                     
UrbanPop                                       
Area                                           
Rape                                           
Illiteracy          0                          
Assault       0.00024       0                  
Murder.x      3.8e-06 8.1e-07        0         
Murder.y      3.4e-06 2.7e-07  1.5e-12        0

$sym
           Income HS Grad Life Exp Frost Population UrbanPop Area Rape
Income     1                                                          
HS Grad    +      1                                                   
Life Exp   ,      +       1                                           
Frost      .      ,       ,        1                                  
Population        .       .        ,     1                            
UrbanPop   .      .       .              .          1                 
Area                                                         1        
Rape                      ,        ,     .                   .    1   
Illiteracy +      *       *        +     .                        .   
Assault    .      ,       *        +     .                        +   
Murder.x   ,      +       B        +     .                        ,   
Murder.y   ,      +       B        +     .                        ,   
           Illiteracy Assault Murder.x Murder.y
Income                                         
HS Grad                                        
Life Exp                                       
Frost                                          
Population                                     
UrbanPop                                       
Area                                           
Rape                                           
Illiteracy 1                                   
Assault    +          1                        
Murder.x   *          B       1                
Murder.y   *          B       1        1       
attr(,"legend")
[1] 0 ' ' 0.3 '.' 0.6 ',' 0.8 '+' 0.9 '*' 0.95 'B' 1

In class 3

> library(HSAUR3)

> data(backpain, package="HSAUR3")

> # Print the structure of backpain. str() writes its report to the console
> # and returns NULL, so piping it into head() only printed a stray NULL.
> str(backpain)

'data.frame':   434 obs. of  4 variables:
 $ ID      : Factor w/ 217 levels "1","2","3","4",..: 1 1 2 2 3 3 4 4 5 5 ...
 $ status  : Factor w/ 2 levels "case","control": 1 2 1 2 1 2 1 2 1 2 ...
 $ driver  : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 1 1 2 2 ...
 $ suburban: Factor w/ 2 levels "no","yes": 2 1 2 2 1 2 1 1 1 2 ...

> backpain |> head()|>knitr::kable()

ID	status	driver	suburban
1	case	yes	yes
1	control	yes	no
2	case	yes	yes
2	control	yes	yes
3	case	yes	no
3	control	yes	yes

> backpain1<- as_tibble(backpain)

#說明註解

- spread()：用來擴展表格，把某一欄的值（鍵值對）分開拆成多欄。用法：spread(data, key, value, fill = NA, convert = FALSE, drop = TRUE, sep = NULL)；key 是原來要拆的那一欄的名字（變項名），value 是拆出來的那些欄位應該填什麼值（填原表的哪一欄）
- Case、Control、Total：取各組的次數；Total = n() 為該組整合起來的總列數
- na.rm = TRUE：計算時略過遺漏值，不然結果會有 NA 出現
- as.data.frame()：把結果轉為 data frame

> # Count cases and controls for every Driver x Suburban combination.
> # After spread(), the `case` / `control` columns are non-missing only on
> # rows where that status was observed, so the counts use !is.na();
> # counting is.na() reported the rows *without* that status instead.
> # Total = n() is the number of rows left after spread(); Case + Control can
> # exceed it when an ID's case and control rows share the same
> # Driver/Suburban values and therefore collapse into one row.
> backpain1 %>%
+   dplyr::rename(Group = status,
+                 Driver = driver,
+                 Suburban = suburban) %>%
+   group_by(Driver, Suburban) %>%
+   tidyr::spread(key = "Group", value = "Group") %>%
+   summarize(Case = sum(!is.na(case), na.rm = TRUE),
+             Control = sum(!is.na(control), na.rm = TRUE),
+             Total = n()) %>%
+   as.data.frame() %>%
+   knitr::kable()

Driver	Suburban	Case	Control	Total
no	no	26	47	64
no	yes	6	7	11
yes	no	64	63	107
yes	yes	121	100	158

In class 4

> library(carData)

> data(Vocab, package="carData")

> # Print the structure of Vocab. str() writes its report to the console and
> # returns NULL, so piping it into head() only printed a stray NULL.
> str(Vocab)

'data.frame':   30351 obs. of  4 variables:
 $ year      : num  1974 1974 1974 1974 1974 ...
 $ sex       : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 2 2 2 1 1 ...
 $ education : num  14 16 10 10 12 16 17 10 12 11 ...
 $ vocabulary: num  9 9 9 5 8 8 9 5 3 5 ...
 - attr(*, "na.action")= 'omit' Named int [1:32115] 1 2 3 4 5 6 7 8 9 10 ...
  ..- attr(*, "names")= chr [1:32115] "19720001" "19720002" "19720003" "19720004" ...

> head(Vocab) |> knitr::kable ()

	year	sex	education	vocabulary
19740001	1974	Male	14	9
19740002	1974	Male	16	9
19740003	1974	Female	10	9
19740004	1974	Female	10	5
19740005	1974	Female	12	8
19740006	1974	Male	16	8

> library(lattice)

> # Plot vocabulary against education, one panel per year, grouped by sex.
> # A lattice formula reads y ~ x | conditioning variable, so vocabulary
> # goes on the left-hand side to agree with xlab = "Education" and
> # ylab = "Vocabulary" (the axes were previously swapped relative to the
> # labels).
> Vocab %>%
+   mutate(year = as.factor(year)) %>%
+   lattice::xyplot(vocabulary ~ education | year,
+                   data = .,
+                   groups = sex,
+                   type = c("p", "g", "r"),  # points, grid, regression line
+                   cex = .3,
+                   xlab = "Education",
+                   ylab = "Vocabulary",
+                   layout = c(5, 6),         # 5 columns by 6 rows of panels
+                   auto.key = list(columns = 2),
+                   par.settings = list(
+                     superpose.symbol = list(pch = "*", cex = 1),
+                     superpose.line = list(lwd = 1.5)))

In class 5

#Supply comments to each code chunk

The data set concerns species and weight of animals caught in plots in a study area in Arizona over time.

Each row holds information for a single animal, and the columns represent:

record_id: Unique id for the observation
month: month of observation
day: day of observation
year: year of observation
plot_id: ID of a particular plot
species_id: 2-letter code
sex: sex of animal (“M”, “F”)
hindfoot_length: length of the hindfoot in mm
weight: weight of the animal in grams
genus: genus of animal
species: species of animal
taxa: e.g. Rodent, Reptile, Bird, Rabbit
plot_type: type of plot

#用 pacman 套件的 p_load() 來載入套件（尚未安裝時會先自動安裝），這裡載入 tidyverse

> pacman::p_load(tidyverse)

#讀網路上的資料

> dta <- read_csv("http://kbroman.org/datacarp/portal_data_joined.csv")

#glimpse看一眼資料型態檢視資料

> glimpse(dta)

Rows: 34,786
Columns: 13
$ record_id       <dbl> 1, 72, 224, 266, 349, 363, 435, 506, 588, 661, 748, 84~
$ month           <dbl> 7, 8, 9, 10, 11, 11, 12, 1, 2, 3, 4, 5, 6, 8, 9, 10, 1~
$ day             <dbl> 16, 19, 13, 16, 12, 12, 10, 8, 18, 11, 8, 6, 9, 5, 4, ~
$ year            <dbl> 1977, 1977, 1977, 1977, 1977, 1977, 1977, 1978, 1978, ~
$ plot_id         <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
$ species_id      <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", ~
$ sex             <chr> "M", "M", NA, NA, NA, NA, NA, NA, "M", NA, NA, "M", "M~
$ hindfoot_length <dbl> 32, 31, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, NA, 34~
$ weight          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 218, NA, NA, 204, 200,~
$ genus           <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Neotoma",~
$ species         <chr> "albigula", "albigula", "albigula", "albigula", "albig~
$ taxa            <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent", "Rod~
$ plot_type       <chr> "Control", "Control", "Control", "Control", "Control",~

#dim查看各維度長度顯示資料框列與行的長度

> dim(dta)

[1] 34786    13

#dplyr語法使用select函數選dta中的plot_id, species_id, weight三個欄位， %>%然後去 head()看前六筆

> dplyr::select(dta, plot_id, species_id, weight) %>% head()

# A tibble: 6 x 3
  plot_id species_id weight
    <dbl> <chr>       <dbl>
1       2 NL             NA
2       2 NL             NA
3       2 NL             NA
4       2 NL             NA
5       2 NL             NA
6       2 NL             NA

#dplyr語法使用select函數，選dta中不要 -record_id, -species_id這2個變項， %>%然後去 head()看前六筆

> dplyr::select(dta, -record_id, -species_id) %>% head()

# A tibble: 6 x 11
  month   day  year plot_id sex   hindfoot_length weight genus   species  taxa  
  <dbl> <dbl> <dbl>   <dbl> <chr>           <dbl>  <dbl> <chr>   <chr>    <chr> 
1     7    16  1977       2 M                  32     NA Neotoma albigula Rodent
2     8    19  1977       2 M                  31     NA Neotoma albigula Rodent
3     9    13  1977       2 <NA>               NA     NA Neotoma albigula Rodent
4    10    16  1977       2 <NA>               NA     NA Neotoma albigula Rodent
5    11    12  1977       2 <NA>               NA     NA Neotoma albigula Rodent
6    11    12  1977       2 <NA>               NA     NA Neotoma albigula Rodent
# ... with 1 more variable: plot_type <chr>

#dplyr語法使用filter函數做篩選，選取year = 1995資料

> dplyr::filter(dta, year == 1995) %>% head()

# A tibble: 6 x 13
  record_id month   day  year plot_id species_id sex   hindfoot_length weight
      <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
1     22314     6     7  1995       2 NL         M                  34     NA
2     22728     9    23  1995       2 NL         F                  32    165
3     22899    10    28  1995       2 NL         F                  32    171
4     23032    12     2  1995       2 NL         F                  33     NA
5     22003     1    11  1995       2 DM         M                  37     41
6     22042     2     4  1995       2 DM         F                  36     45
# ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
#   plot_type <chr>

#使用select 選擇函數及filter篩選指定，看前6筆資料，這裡要3個變項，weight 要小於等於 5， species_id, sex, weight

> head(dplyr::select(dplyr::filter(dta, weight <= 5), species_id, sex, weight))

# A tibble: 6 x 3
  species_id sex   weight
  <chr>      <chr>  <dbl>
1 PF         M          5
2 PF         F          5
3 PF         F          5
4 PF         F          4
5 PF         F          5
6 PF         F          4

#程式簡化使用%>% 代表然後及接下來的意思

> dta %>% 
+   dplyr::filter(weight <= 5) %>% 
+   dplyr::select(species_id, sex, weight) %>% 
+   head

# A tibble: 6 x 3
  species_id sex   weight
  <chr>      <chr>  <dbl>
1 PF         M          5
2 PF         F          5
3 PF         F          5
4 PF         F          4
5 PF         F          5
6 PF         F          4

#mutate建立新變項生成weight_kg計算weight / 1000;生成weight_lb計算weight_kg * 2.2，只看前六筆

> dta %>% 
+   mutate(weight_kg = weight / 1000,
+          weight_lb = weight_kg * 2.2) %>% 
+   head()

# A tibble: 6 x 15
  record_id month   day  year plot_id species_id sex   hindfoot_length weight
      <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
1         1     7    16  1977       2 NL         M                  32     NA
2        72     8    19  1977       2 NL         M                  31     NA
3       224     9    13  1977       2 NL         <NA>               NA     NA
4       266    10    16  1977       2 NL         <NA>               NA     NA
5       349    11    12  1977       2 NL         <NA>               NA     NA
6       363    11    12  1977       2 NL         <NA>               NA     NA
# ... with 6 more variables: genus <chr>, species <chr>, taxa <chr>,
#   plot_type <chr>, weight_kg <dbl>, weight_lb <dbl>

#常用指令：filter(!is.na(weight)) 只保留 weight 有值的資料；group_by 分組；summarize 計算 mean(weight) 並放入新變項 mean_weight 中；arrange(desc(mean_weight)) 依 mean_weight 由大到小排序

> dta %>% 
+   filter(!is.na(weight)) %>%
+   group_by(sex, species_id) %>%
+   summarize(mean_weight = mean(weight)) %>%
+   arrange(desc(mean_weight)) %>% 
+   head()

# A tibble: 6 x 3
# Groups:   sex [3]
  sex   species_id mean_weight
  <chr> <chr>            <dbl>
1 <NA>  NL                168.
2 M     NL                166.
3 F     NL                154.
4 M     SS                130 
5 <NA>  SH                130 
6 M     DS                122.

#看性別分組後的總計

> dta %>%
+   group_by(sex) %>%
+   tally

# A tibble: 3 x 2
  sex       n
  <chr> <int>
1 F     15690
2 M     17348
3 <NA>   1748

#同上，另一種寫法，計算sex總計

> dta %>%
+   count(sex)

# A tibble: 3 x 2
  sex       n
  <chr> <int>
1 F     15690
2 M     17348
3 <NA>   1748

#同上，總計計算分組樣本數

> dta %>%
+   group_by(sex) %>%
+   summarize(count = n())

# A tibble: 3 x 2
  sex   count
  <chr> <int>
1 F     15690
2 M     17348
3 <NA>   1748

#分組計算year有真值的總計樣本數

> dta %>%
+   group_by(sex) %>%
+   summarize(count = sum(!is.na(year)))

# A tibble: 3 x 2
  sex   count
  <chr> <int>
1 F     15690
2 M     17348
3 <NA>   1748

#製作新資料檔 dta_gw：先用 filter 篩選 !is.na(weight)（體重有值的資料），再用 group_by(genus, plot_id) 依 genus、plot_id 兩個變項分組，計算 mean(weight)，放入新變項 mean_weight 中

> dta_gw <- dta %>% 
+   filter(!is.na(weight)) %>%
+   group_by(genus, plot_id) %>%
+   summarize(mean_weight = mean(weight))

#看一眼新資料dta_gw

> glimpse(dta_gw)

Rows: 196
Columns: 3
Groups: genus [10]
$ genus       <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
$ plot_id     <dbl> 1, 2, 3, 5, 18, 19, 20, 21, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,~
$ mean_weight <dbl> 7.000000, 6.000000, 8.611111, 7.750000, 9.500000, 9.533333~

#spread 把直的轉橫的，將genus的資料提取出來，內容為mean_weight的值

> dta_w <- dta_gw %>%
+   spread(key = genus, value = mean_weight)

#看一眼新轉向後的資料

> glimpse(dta_w)

Rows: 24
Columns: 11
$ plot_id         <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,~
$ Baiomys         <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA~
$ Chaetodipus     <dbl> 22.19939, 25.11014, 24.63636, 23.02381, 17.98276, 24.8~
$ Dipodomys       <dbl> 60.23214, 55.68259, 52.04688, 57.52454, 51.11356, 58.6~
$ Neotoma         <dbl> 156.2222, 169.1436, 158.2414, 164.1667, 190.0370, 179.~
$ Onychomys       <dbl> 27.67550, 26.87302, 26.03241, 28.09375, 27.01695, 25.8~
$ Perognathus     <dbl> 9.625000, 6.947368, 7.507812, 7.824427, 8.658537, 7.80~
$ Peromyscus      <dbl> 22.22222, 22.26966, 21.37037, 22.60000, 21.23171, 21.8~
$ Reithrodontomys <dbl> 11.375000, 10.680556, 10.516588, 10.263158, 11.154545,~
$ Sigmodon        <dbl> NA, 70.85714, 65.61404, 82.00000, 82.66667, 68.77778, ~
$ Spermophilus    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 13~

#同樣把未轉向的資料 dta_gw 用 spread 由直轉橫，但加上 fill = 0：轉換後沒有對應 mean_weight 的 genus × plot_id 組合會填入 0，而不是 NA

> dta_gw %>%
+   spread(genus, mean_weight, fill = 0) %>%
+   head()

# A tibble: 6 x 11
  plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
    <dbl>   <dbl>       <dbl>     <dbl>   <dbl>     <dbl>       <dbl>      <dbl>
1       1    7           22.2      60.2    156.      27.7        9.62       22.2
2       2    6           25.1      55.7    169.      26.9        6.95       22.3
3       3    8.61        24.6      52.0    158.      26.0        7.51       21.4
4       4    0           23.0      57.5    164.      28.1        7.82       22.6
5       5    7.75        18.0      51.1    190.      27.0        8.66       21.2
6       6    0           24.9      58.6    180.      25.9        7.81       21.8
# ... with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
#   Spermophilus <dbl>

#gather 把橫的轉成直的（long form 資料），跟 spread 相反：把各個 genus 欄位的名稱收進 genus 變項，數值填入 mean_weight；-plot_id 表示 plot_id 不參與轉換，保留下來作為識別欄位

> dta_l <- dta_w %>%
+   gather(key = genus, value = mean_weight, -plot_id)

#看一眼轉成直的資料

> glimpse(dta_l)

Rows: 240
Columns: 3
$ plot_id     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,~
$ genus       <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Ba~
$ mean_weight <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA, NA~

#結果跟前面一樣，只是改用 Baiomys:Spermophilus 指定要轉換的欄位範圍（Baiomys 到 Spermophilus 之間的所有欄位）

> dta_w %>%
+   gather(key = genus, value = mean_weight, Baiomys:Spermophilus) %>%
+   head()

# A tibble: 6 x 3
  plot_id genus   mean_weight
    <dbl> <chr>         <dbl>
1       1 Baiomys        7   
2       2 Baiomys        6   
3       3 Baiomys        8.61
4       4 Baiomys       NA   
5       5 Baiomys        7.75
6       6 Baiomys       NA

#filter 這裡選定weight hindfoot_length sex 3個欄位有真值的資料

> dta_complete <- dta %>%
+   filter(!is.na(weight),           
+          !is.na(hindfoot_length),  
+          !is.na(sex))

#用 count 計算 dta_complete（剛剛依 3 個欄位篩選後的資料）中每個 species_id 的筆數，只留下 n >= 50 的 species_id

> species_counts <- dta_complete %>%
+     count(species_id) %>% 
+     filter(n >= 50)

#使用 %in% 語法（用法：a %in% table）篩選 dta_complete，只保留 species_id 有出現在 species_counts$species_id 中的資料（也就是筆數 >= 50 的 species_id）

> dta_complete <- dta_complete %>%
+   filter(species_id %in% species_counts$species_id)

In class-Data wrangling

Shang Chi Lee

2021-11-08

In class 1

In class 2

In class 3

In class 4

In class 5