inclass1

Explain what does this statement do: lapply(lapply(search(), ls), length)

# 用於列出工作目錄中存在的所有object的名稱
ls()
character(0)
# 用於獲取R現有連結的package
search()
[1] ".GlobalEnv"        "package:stats"     "package:graphics" 
[4] "package:grDevices" "package:utils"     "package:datasets" 
[7] "package:methods"   "Autoloads"         "package:base"     
# 計算每一package中 function list 的長度
lapply(lapply(search(), ls), length)
[[1]]
[1] 0

[[2]]
[1] 449

[[3]]
[1] 87

[[4]]
[1] 113

[[5]]
[1] 247

[[6]]
[1] 104

[[7]]
[1] 203

[[8]]
[1] 0

[[9]]
[1] 1255

inclass 2

Convert the R script in the NZ schools example into a rmarkdown file and provide comments to each code chunk indicated by ‘##’. Give alternative code to perform the same calculation where appropriate.

## keep the school names with white spaces
dta <- read.csv("nzSchools.csv", as.is=2) #as.is??

## data structure
str(dta)
'data.frame':   2571 obs. of  6 variables:
 $ ID  : int  1015 1052 1062 1092 1130 1018 1029 1030 1588 1154 ...
 $ Name: chr  "Hora Hora School" "Morningside School" "Onerahi School" "Raurimu Avenue School" ...
 $ City: Factor w/ 541 levels "Ahaura","Ahipara",..: 533 533 533 533 533 533 533 533 533 533 ...
 $ Auth: Factor w/ 4 levels "Other","Private",..: 3 3 3 3 3 3 3 3 4 3 ...
 $ Dec : int  2 3 4 2 4 8 5 5 6 1 ...
 $ Roll: int  318 200 455 86 577 329 637 395 438 201 ...
## dimension of the data
dim(dta)
[1] 2571    6

binning

## 新建Size欄位,大於Roll中位數的為"Large"小於等於為"Small"
dta$Size <- ifelse(dta$Roll > median(dta$Roll), "Large", "Small")

## 刪除size欄位
dta$Size <- NULL

## 顯示資料前六筆
head(dta)
    ID                  Name      City  Auth Dec Roll
1 1015      Hora Hora School Whangarei State   2  318
2 1052    Morningside School Whangarei State   3  200
3 1062        Onerahi School Whangarei State   4  455
4 1092 Raurimu Avenue School Whangarei State   2   86
5 1130      Whangarei School Whangarei State   4  577
6 1018       Hurupaki School Whangarei State   8  329
## 將Roll平均切三等份,分別為small, mediam, large
dta$Size <- cut(dta$Roll, 3, labels=c("Small", "Mediam", "Large"))

## 顯示size的count
table(dta$Size)

 Small Mediam  Large 
  2555     15      1 

sorting–order

## order函數返回的值表示位置,並以降序排列
dta$RollOrd <- order(dta$Roll, decreasing=T)

## 顯示RollOrd 前六筆位置資料
## 第一筆RollOrd為1726,意思是要找第1726筆資料,才是第一列
head(dta[dta$RollOrd, ])
      ID                  Name         City  Auth Dec Roll   Size RollOrd
1726 498 Correspondence School   Wellington State  NA 5546  Large     753
301   28     Rangitoto College     Auckland State  10 3022 Mediam     353
376   78      Avondale College     Auckland State   4 2613 Mediam     712
2307 319  Burnside High School Christchurch State   8 2588 Mediam     709
615   41      Macleans College     Auckland State  10 2476 Mediam    1915
199   43    Massey High School     Auckland State   5 2452 Mediam    1683
# 顯示RollOrd 後六筆位置資料
tail(dta[dta$RollOrd, ])
       ID                    Name                  City    Auth Dec Roll  Size
2401 1641  Amana Christian School               Dunedin Private   9    7 Small
1590 2461       Tangimoana School              Manawatu   State   4    6 Small
1996 3598         Woodbank School              Kaikoura   State   4    6 Small
2112 3386     Jacobs River School          Jacobs River   State   5    6 Small
1514 2407     Ngamatapouri School Sth Taranaki District   State   9    5 Small
1575 2420 Papanui Junction School               Taihape   State   5    5 Small
     RollOrd
2401    2562
1590     266
1996    2478
2112    1501
1514    2377
1575    1542
## 根據city和Roll由大到小(z to A)排序,並顯示前六筆
head(dta[order(dta$City, dta$Roll, decreasing=T), ])
       ID                      Name      City  Auth Dec Roll  Size RollOrd
2548  401           Menzies College   Wyndham State   4  356 Small     859
2549 4054            Wyndham School   Wyndham State   5   94 Small    1163
1611 2742          Woodville School Woodville State   3  147 Small     726
1630 2640           Papatawa School Woodville State   7   27 Small    2273
2041 3600            Woodend School   Woodend State   9  375 Small    1401
1601  399 Central Southland College    Winton State   7  549 Small     450
## 根據city和Roll由大到小排序,並顯示後六筆
tail(dta[order(dta$City, dta$Roll, decreasing=T), ])
       ID                         Name    City  Auth Dec Roll  Size RollOrd
2169 3273                Albury School  Albury State   8   30 Small    1010
2018  350           Akaroa Area School  Akaroa State   8  125 Small    1051
2023 3332           Duvauchelle School  Akaroa State   9   41 Small     749
335  1200                Ahuroa School  Ahuroa State   7   22 Small     193
99   1000               Ahipara School Ahipara State   3  241 Small    1963
2117 2105 Awahono School - Grey Valley  Ahaura State   4  119 Small     364

sorting–sort 只呈現數列

## 只呈現Roll由大到小的數值 integer
head(sort(dta$Roll, decreasing=T))
[1] 5546 3022 2613 2588 2476 2452
tail(sort(dta$Roll, decreasing=T))
[1] 7 6 6 6 5 5
## 只顯示經排序後前六個city name
head(sort(dta$City, dta$Roll, decreasing=T))
[1] Wyndham   Wyndham   Woodville Woodville Woodend   Winton   
541 Levels: Ahaura Ahipara Ahuroa Akaroa Albury Alexandra Alfriston ... Wyndham

sorting–arrange 與order效果相同

## one varible
pacman::p_load(dplyr)
head(arrange(dta,desc(Roll)))
   ID                  Name         City  Auth Dec Roll   Size RollOrd
1 498 Correspondence School   Wellington State  NA 5546  Large     753
2  28     Rangitoto College     Auckland State  10 3022 Mediam     353
3  78      Avondale College     Auckland State   4 2613 Mediam     712
4 319  Burnside High School Christchurch State   8 2588 Mediam     709
5  41      Macleans College     Auckland State  10 2476 Mediam    1915
6  43    Massey High School     Auckland State   5 2452 Mediam    1683
tail(arrange(dta,desc(Roll)))
       ID                    Name                  City    Auth Dec Roll  Size
2566 1641  Amana Christian School               Dunedin Private   9    7 Small
2567 2461       Tangimoana School              Manawatu   State   4    6 Small
2568 3598         Woodbank School              Kaikoura   State   4    6 Small
2569 3386     Jacobs River School          Jacobs River   State   5    6 Small
2570 2407     Ngamatapouri School Sth Taranaki District   State   9    5 Small
2571 2420 Papanui Junction School               Taihape   State   5    5 Small
     RollOrd
2566    2562
2567     266
2568    2478
2569    1501
2570    2377
2571    1542
## groupby
dta |> group_by(City) |> arrange(desc(Roll)) |> head()
# A tibble: 6 x 8
# Groups:   City [3]
     ID Name                  City         Auth    Dec  Roll Size   RollOrd
  <int> <chr>                 <fct>        <fct> <int> <int> <fct>    <int>
1   498 Correspondence School Wellington   State    NA  5546 Large      753
2    28 Rangitoto College     Auckland     State    10  3022 Mediam     353
3    78 Avondale College      Auckland     State     4  2613 Mediam     712
4   319 Burnside High School  Christchurch State     8  2588 Mediam     709
5    41 Macleans College      Auckland     State    10  2476 Mediam    1915
6    43 Massey High School    Auckland     State     5  2452 Mediam    1683

tables

## 顯示Auth的列聯表
table(dta$Auth)

           Other          Private            State State Integrated 
               1               99             2144              327 
## 將列聯表存成authtbl,並顯示authtbl
authtbl <- table(dta$Auth); authtbl

           Other          Private            State State Integrated 
               1               99             2144              327 
## 顯示authtbl的類型
class(authtbl)
[1] "table"
## 顯示Auth為other的資料
dta[dta$Auth == "Other", ]
      ID            Name         City  Auth Dec Roll  Size RollOrd
2315 518 Kingslea School Christchurch Other   1   51 Small    1579
## xtabs 顯示Auth與Dec 2x2列聯表
xtabs(~ Auth + Dec, data=dta)
                  Dec
Auth                 1   2   3   4   5   6   7   8   9  10
  Other              1   0   0   0   0   0   0   0   0   0
  Private            0   0   2   6   2   2   6  11  12  38
  State            259 230 208 219 214 215 188 200 205 205
  State Integrated  12  22  35  28  38  34  45  45  37  31
## ftable 與xtable結果相同
with(dta,ftable(Auth,Dec))
                 Dec   1   2   3   4   5   6   7   8   9  10
Auth                                                        
Other                  1   0   0   0   0   0   0   0   0   0
Private                0   0   2   6   2   2   6  11  12  38
State                259 230 208 219 214 215 188 200 205 205
State Integrated      12  22  35  28  38  34  45  45  37  31

aggregating

## 計算Roll的平均值
mean(dta$Roll)
[1] 295.4737
## 計算Auth=Private之樣本Roll的平均值
mean(dta$Roll[dta$Auth == "Private"])
[1] 308.798
## 計算不同種類Auth之Roll的平均值
aggregate(dta["Roll"], by=list(dta$Auth), FUN=mean)
           Group.1     Roll
1            Other  51.0000
2          Private 308.7980
3            State 300.6301
4 State Integrated 258.3792
## group_by and summarise 
dta |> group_by(Auth) |> summarise(mean = mean(Roll))
# A tibble: 4 x 2
  Auth              mean
  <fct>            <dbl>
1 Other              51 
2 Private           309.
3 State             301.
4 State Integrated  258.
## Dec > 5 則Rich為True,反之為False
dta$Rich <- dta$Dec > 5; dta$Rich
## 計算Auth 與Rich 分層後的Roll的平均值 
## aggregate似乎略過了Rich=NA?
aggregate(dta["Roll"], by=list(dta$Auth, dta$Rich), FUN=mean)
           Group.1 Group.2     Roll
1            Other   FALSE  51.0000
2          Private   FALSE 151.4000
3            State   FALSE 261.7487
4 State Integrated   FALSE 183.2370
5          Private    TRUE 402.5362
6            State    TRUE 338.8243
7 State Integrated    TRUE 311.2135
## group_by and summarise 
dta |> group_by(Auth, Rich) |> summarise(mean = mean(Roll))
# A tibble: 9 x 3
# Groups:   Auth [4]
  Auth             Rich    mean
  <fct>            <lgl>  <dbl>
1 Other            FALSE   51  
2 Private          FALSE  151. 
3 Private          TRUE   403. 
4 Private          NA      64.1
5 State            FALSE  262. 
6 State            TRUE   339. 
7 State            NA    5546  
8 State Integrated FALSE  183. 
9 State Integrated TRUE   311. 
## 顯示Auth各種類之Roll的range
by(dta["Roll"], INDICES=list(dta$Auth), FUN=range)
: Other
[1] 51 51
------------------------------------------------------------ 
: Private
[1]    7 1663
------------------------------------------------------------ 
: State
[1]    5 5546
------------------------------------------------------------ 
: State Integrated
[1]   18 1475

inclass3

Split the ChickWeight{datasets} data by individual chicks to extract separate slope estimates of regressing weight onto Time for each chick.

dta3 <- ChickWeight
head(dta3)
  weight Time Chick Diet
1     42    0     1    1
2     51    2     1    1
3     59    4     1    1
4     64    6     1    1
5     76    8     1    1
6     93   10     1    1

Produce multilple coef-base R approach

# Chick level重新排序
dta3$Chick <- factor(as.numeric(as.character(dta3$Chick))) 
levels(dta3$Chick)
 [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15"
[16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30"
[31] "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44" "45"
[46] "46" "47" "48" "49" "50"
# split and lapply
split(dta3, dta3$Chick)|> #將資料依Chick進行dataset切割
lapply(function(x) lm(weight ~ Time, data = x)) |> sapply(coef) #將Chick的subdata逐一放入進行lm,並使用sapply產生coef
                    1         2        3        4        5         6         7
(Intercept) 24.465436 24.724853 23.17955 32.86568 16.89563 44.123431  5.842535
Time         7.987899  8.719861  8.48737  6.08864 10.05536  6.378006 13.205264
                    8         9        10        11        12        13
(Intercept) 43.727273 52.094086 38.695054 47.921948 21.939797 43.384359
Time         4.827273  2.663137  4.066102  7.510967  8.440629  2.239601
                  14       15        16        17 18       19        20
(Intercept) 20.52488 46.83333 43.392857 43.030706 39 31.21222 37.667826
Time        11.98245  1.89881  1.053571  4.531538 -2  5.08743  3.732718
                  21        22        23        24       25       26        27
(Intercept) 15.56330 40.082590 38.428074 53.067766 19.65119 20.70715 29.858569
Time        15.47512  5.877931  6.685978  1.207533 11.30676 10.10316  7.379368
                   28        29        30       31       32        33        34
(Intercept) 23.984874  5.882771 39.109666 19.13099 13.69173 45.830283  5.081682
Time         9.703676 12.453487  5.898351 10.02617 13.18091  5.855241 15.000151
                   35       36        37       38       39       40        41
(Intercept)  4.757979 25.85403 29.608834 10.67282 17.03661 10.83830 39.337922
Time        17.258811  9.99047  6.677053 12.06051 10.73710 13.44229  8.159885
                  42        43        44        45        46        47
(Intercept) 19.86507 52.185751 44.909091 35.673121 27.771744 36.489790
Time        11.83679  8.318863  6.354545  7.686432  9.738466  8.374981
                   48        49       50
(Intercept)  7.947663 31.662986 23.78218
Time        13.714718  9.717894 11.33293

Produce multilple coef -tidy approach

dta3 |> group_by(dta3$Chick) |>
  do ({res<-lm(weight ~ Time, data=.)
    broom::tidy(coef(res))
    }
    )
# A tibble: 100 x 3
# Groups:   dta3$Chick [50]
   `dta3$Chick` names           x
   <fct>        <chr>       <dbl>
 1 1            (Intercept) 24.5 
 2 1            Time         7.99
 3 2            (Intercept) 24.7 
 4 2            Time         8.72
 5 3            (Intercept) 23.2 
 6 3            Time         8.49
 7 4            (Intercept) 32.9 
 8 4            Time         6.09
 9 5            (Intercept) 16.9 
10 5            Time        10.1 
# ... with 90 more rows

multiple plot- base R

library(dplyr)
par(mfrow=c(3,3))
lapply(split(dta3, dta3$Chick), function(x){
  plot(x$weight ~ x$Time,
       xlab="Time (day)",
       ylab="Weight (gm)")
  legend('bottomright',
         paste("Chick", x$Chick[1], sep="="),
         bty='n')
  abline(lm(x$weight ~ x$Time), 
         lwd=1,
         col="firebrick")
})

multiple plot- ggplot

library(ggplot2)
datshade <- dta3[, -c(3,4)] #定住總體回歸線用
ggplot(dta3, aes(x=Time, y=weight))+
  geom_point()+
  geom_smooth(method= lm, formula= y~x, se=F)+
  geom_line(datshade, mapping=aes(x=Time, y=weight), 
            stat="smooth",method = "lm", formula = y ~ x,
            se = FALSE, color="firebrick", alpha=0.5)+# 畫總體回歸線
  facet_wrap(.~Chick, ncol=10, nrow=5)+
  theme_minimal()

inclass4

# a case study - II

## 
dta4 <- read.table("NCEA2007.txt", sep=":", quote="", h=T, as.is=T) #as.is=T, Name為chr;#as.is=F, Name為factor;

## row x column
dim(dta4)
[1] 88  4
## 資料結構
str(dta4)
'data.frame':   88 obs. of  4 variables:
 $ Name  : chr  "Al-Madinah School" "Alfriston College" "Ambury Park Centre for Riding Therapy" "Aorere College" ...
 $ Level1: num  61.5 53.9 33.3 39.5 71.2 22.1 50.8 57.3 89.3 59.8 ...
 $ Level2: num  75 44.1 20 50.2 78.9 30.8 34.8 49.8 89.7 65.7 ...
 $ Level3: num  0 0 0 30.6 55.5 26.3 48.9 44.6 88.6 50.4 ...
## 顯示前六筆
head(dta4)
                                   Name Level1 Level2 Level3
1                     Al-Madinah School   61.5   75.0    0.0
2                     Alfriston College   53.9   44.1    0.0
3 Ambury Park Centre for Riding Therapy   33.3   20.0    0.0
4                        Aorere College   39.5   50.2   30.6
5        Auckland Girls' Grammar School   71.2   78.9   55.5
6                      Auckland Grammar   22.1   30.8   26.3

caculate mean by apply系列

## 計算扣除colum1後,每個column的mean
## output 為 numeric
a<-apply(dta4[, -1], MARGIN=2, FUN=mean) #MARGIN=2, columns;MARGIN=1, rows;
a
  Level1   Level2   Level3 
62.26705 61.06818 47.97614 
class(a)
[1] "numeric"
## list apply
## output 為 list
b<-lapply(dta4[, -1], FUN=mean)
b
$Level1
[1] 62.26705

$Level2
[1] 61.06818

$Level3
[1] 47.97614
class(b)
[1] "list"
## simplify the list apply
## output 為 numeric
c<-sapply(dta4[, -1], FUN=mean)
c
  Level1   Level2   Level3 
62.26705 61.06818 47.97614 
class(c)
[1] "numeric"

caculate range by apply系列

## 計算扣除colum1後,每個column的range
## output 為 "matrix" "array" 
d<-apply(dta4[, -1], MARGIN=2, FUN=range)
d
     Level1 Level2 Level3
[1,]    2.8    0.0    0.0
[2,]   97.4   95.7   95.7
class(d)
[1] "matrix" "array" 
## output 為 list 
e<-lapply(dta4[, -1], FUN=range)
e
$Level1
[1]  2.8 97.4

$Level2
[1]  0.0 95.7

$Level3
[1]  0.0 95.7
class(e)
[1] "list"
## output 為 "matrix" "array
f<-sapply(dta4[, -1], FUN=range)
f
     Level1 Level2 Level3
[1,]    2.8    0.0    0.0
[2,]   97.4   95.7   95.7
class(f)
[1] "matrix" "array" 

splitting by apply系列

## subset data by level of Roll and keep value of Auth
rollsByAuth <- split(dta$Roll, dta$Auth)

## 資料結構
str(rollsByAuth)
List of 4
 $ Other           : int 51
 $ Private         : int [1:99] 255 39 154 73 83 25 95 85 94 729 ...
 $ State           : int [1:2144] 318 200 455 86 577 329 637 395 201 267 ...
 $ State Integrated: int [1:327] 438 26 191 560 151 114 126 171 211 57 ...
## 資料型態為list
class(rollsByAuth)
[1] "list"
## output 為list
g<-lapply(split(dta$Roll, dta$Auth), mean)
g
$Other
[1] 51

$Private
[1] 308.798

$State
[1] 300.6301

$`State Integrated`
[1] 258.3792
class(g)
[1] "list"
## output 為 numeric
h<-sapply(split(dta$Roll, dta$Auth), mean)
h
           Other          Private            State State Integrated 
         51.0000         308.7980         300.6301         258.3792 
class(h)
[1] "numeric"