Excise1

library(MASS)
head(anorexia)
##   Treat Prewt Postwt
## 1  Cont  80.7   80.2
## 2  Cont  89.4   80.1
## 3  Cont  91.8   86.4
## 4  Cont  74.0   86.3
## 5  Cont  78.1   76.1
## 6  Cont  88.3   78.1
str(anorexia)
## 'data.frame':    72 obs. of  3 variables:
##  $ Treat : Factor w/ 3 levels "CBT","Cont","FT": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Prewt : num  80.7 89.4 91.8 74 78.1 88.3 87.3 75.1 80.6 78.4 ...
##  $ Postwt: num  80.2 80.1 86.4 86.3 76.1 78.1 75.1 86.7 73.5 84.6 ...

#第一列的內容列出

anorexia[1,] 
##   Treat Prewt Postwt
## 1  Cont  80.7   80.2

#第二欄位的內容列出

head(anorexia[,2])
## [1] 80.7 89.4 91.8 74.0 78.1 88.3

#指定列出Prewt這變項的資料

head(anorexia["Prewt"] )
##   Prewt
## 1  80.7
## 2  89.4
## 3  91.8
## 4  74.0
## 5  78.1
## 6  88.3
 lm(Postwt ~ Treat, data=anorexia)
## 
## Call:
## lm(formula = Postwt ~ Treat, data = anorexia)
## 
## Coefficients:
## (Intercept)    TreatCont      TreatFT  
##      85.697       -4.589        4.798
lm(Postwt ~ as.numeric(Treat), data=anorexia)
## 
## Call:
## lm(formula = Postwt ~ as.numeric(Treat), data = anorexia)
## 
## Coefficients:
##       (Intercept)  as.numeric(Treat)  
##            82.036              1.711

#總結 lm為 linear models, 但Treat為factor變項,不適用於lm。 因as.numeric(Treat)強制將Treat變成數字型,技術上雖可跑lm,但以統計觀點Treat代表的是不同治療方案,不適用lm,跑ANOVA較合適,比較不同治療的差異。

Excise2

die_1<-sample(1:6, replace=TRUE) 
die_1
## [1] 4 4 6 4 6 5

#3個die加總

set.seed(1234)
die_a<-sample(1:6,  size=3, replace=TRUE) 
die_a
## [1] 4 2 6
sum(die_a)
## [1] 12

出現的組合有20種 有1~6 六個號碼,任取3個相加,C6取3=(654)/(321)=20

set.seed(1234)
three.dice <- function(){
  dice <- sample(1:6, size = 3, replace = TRUE)
  return(sum(dice))
}

#Approximating Probabilities 做100次相加 expr返回數,有無都可

set.seed(1234)
replicate(n = 100, expr = three.dice())
##   [1] 12 10 15 10 16 18 13 11 12 11 11 10  9 13 10 12  4  6 10 10 13  7  9 14  9
##  [26] 14  9 11 13 11 14 15  9 10 12  9 11 13  7 17  7 14 15 14 13  7 10 12 12  7
##  [51]  9 11  4 10 14  5  6 10 14 14  8 13  9  8  7  9 13  7 17 10 10  9  8 13 10
##  [76] 14 11 13  7 12 17 13  9  6 17  7  7 11  9 13 11  9 16  6 12 13 13 11  8 10
sims<-replicate(n = 100, expr = three.dice())

看數字出現的次數

table(sims)
## sims
##  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 
##  1  3  1  7 10 10 11 10 14 13 10  6  2  1  1

算數字出現的機率

table(sims)/length(sims)
## sims
##    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18 
## 0.01 0.03 0.01 0.07 0.10 0.10 0.11 0.10 0.14 0.13 0.10 0.06 0.02 0.01 0.01
plot(table(sims), xlab = 'Sum', ylab = 'Frequency', main = '20 Rolls of 3 Fair Dice', type="h")

#密度直方圖

hist(sims, probability = T)

#總結 為probability histogram

Excise3

#An R script for IQ_Beh data set

dta <- read.table("IQ_Beh.txt", header = T, row.names = 1)
str(dta)
## 'data.frame':    94 obs. of  3 variables:
##  $ Dep: chr  "N" "N" "N" "N" ...
##  $ IQ : int  103 124 124 104 96 92 124 99 92 116 ...
##  $ BP : int  4 12 9 3 3 3 6 4 3 9 ...
head(dta)
##   Dep  IQ BP
## 1   N 103  4
## 2   N 124 12
## 3   N 124  9
## 4   N 104  3
## 5   D  96  3
## 6   N  92  3

#資料型態 class()

class(dta)
## [1] "data.frame"

#各維度長度 dim()先顯示列,再顯示行 94筆資料3個欄位

dim(dta)
## [1] 94  3

#各維度名稱 dimnames()

dimnames(dta)
## [[1]]
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30"
## [31] "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44" "45"
## [46] "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56" "57" "58" "59" "60"
## [61] "61" "62" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72" "73" "74" "75"
## [76] "76" "77" "78" "79" "80" "81" "82" "83" "84" "85" "86" "87" "88" "89" "90"
## [91] "91" "92" "93" "94"
## 
## [[2]]
## [1] "Dep" "IQ"  "BP"
names(dta)
## [1] "Dep" "IQ"  "BP"

#vector檢查基本數據結構,is.vector()除名稱外沒有其他屬性的向量時才返回真

is.vector(dta$BP)
## [1] TRUE

#秀出第一列row資料

dta[1, ]
##   Dep  IQ BP
## 1   N 103  4

#秀出第一列row資料,IQ欄位的3筆資料

dta[1:3, "IQ"]
## [1] 103 124 124

#若沒有指定IQ欄位,秀出三個欄位的資料

head(dta[1:3])
##   Dep  IQ BP
## 1   N 103  4
## 2   N 124 12
## 3   N 124  9
## 4   N 104  3
## 5   D  96  3
## 6   N  92  3

#把BP大到小排列後,秀出最後6筆 tail(): 顯示data後6筆資料 order(),把數列由大排到小,從小排到大,decreasing = FALSE order(a, decreasing=TRUE)]

tail(dta[order(dta$BP), ])
##    Dep  IQ BP
## 16   N  89 11
## 58   N 117 11
## 66   N 126 11
## 2    N 124 12
## 73   D  99 13
## 12   D  22 17

BP由小到大排序可用decreasing=TRUE

tail(dta[order(dta$BP,decreasing=TRUE), ])
##    Dep  IQ BP
## 94   N 114  2
## 72   N 115  1
## 77   N 124  1
## 80   N 121  1
## 24   N 106  0
## 75   N 122  0

BP由小到大排序,直接用(-dta$BP)加負號更快

tail(dta[order(-dta$BP), ])
##    Dep  IQ BP
## 94   N 114  2
## 72   N 115  1
## 77   N 124  1
## 80   N 121  1
## 24   N 106  0
## 75   N 122  0

#秀出最後4筆由小到大的排序

tail(dta[order(-dta$BP), ], 4)
##    Dep  IQ BP
## 77   N 124  1
## 80   N 121  1
## 24   N 106  0
## 75   N 122  0

#with(data,table,hist,….)

with(dta, hist(IQ, xlab = "IQ", main = ""))

boxplot(BP ~ Dep, data = dta, 
        xlab = "Depression", 
        ylab = "Behavior problem score")

#用text可以加上文字 abline()函數給當前繪圖添加一條或多條直線 lty是線的type

plot(BP ~ IQ, data = dta, type = "n",
     ylab = "Behavior problem score", xlab = "IQ")
text(dta$IQ, dta$BP, labels = dta$Dep, cex = 0.5)
abline(lm(BP ~ IQ, data = dta, subset = Dep == "D"),col='blue')
abline(lm(BP ~ IQ, data = dta, subset = Dep == "N"), lty = 2)

##end

Excise4

library(dplyr)
## 
## 載入套件:'dplyr'
## 下列物件被遮斷自 'package:MASS':
## 
##     select
## 下列物件被遮斷自 'package:stats':
## 
##     filter, lag
## 下列物件被遮斷自 'package:base':
## 
##     intersect, setdiff, setequal, union
usBirths <- read.table("usBirths2015.txt", header = T)
head(usBirths)
##    birth    month
## 1 325955  January
## 2 298058 February
## 3 328923    March
## 4 320832    April
## 5 327917      May
## 6 330541     June
str(usBirths) 
## 'data.frame':    12 obs. of  2 variables:
##  $ birth: int  325955 298058 328923 320832 327917 330541 353415 351791 347516 339007 ...
##  $ month: chr  "January" "February" "March" "April" ...
usBirths
##     birth     month
## 1  325955   January
## 2  298058  February
## 3  328923     March
## 4  320832     April
## 5  327917       May
## 6  330541      June
## 7  353415      July
## 8  351791    August
## 9  347516 September
## 10 339007   October
## 11 318820  November
## 12 335722  December
usBirths$season<- c("Winter","Winter","Winter","Spring","Spring","Spring","Summer","Summer","Summer","Fall","Fall","Fall")
head(usBirths)
##    birth    month season
## 1 325955  January Winter
## 2 298058 February Winter
## 3 328923    March Winter
## 4 320832    April Spring
## 5 327917      May Spring
## 6 330541     June Spring
usBirths$season <- as.factor(usBirths$season)

#嘗試合併的作法

season1<- c("Winter","Winter","Winter","Spring","Spring","Spring","Summer","Summer","Summer","Fall","Fall","Fall")
 new<- cbind(usBirths, season1)
new
##     birth     month season season1
## 1  325955   January Winter  Winter
## 2  298058  February Winter  Winter
## 3  328923     March Winter  Winter
## 4  320832     April Spring  Spring
## 5  327917       May Spring  Spring
## 6  330541      June Spring  Spring
## 7  353415      July Summer  Summer
## 8  351791    August Summer  Summer
## 9  347516 September Summer  Summer
## 10 339007   October   Fall    Fall
## 11 318820  November   Fall    Fall
## 12 335722  December   Fall    Fall
aggregate(birth ~ season,data=usBirths, FUN=sum)
##   season   birth
## 1   Fall  993549
## 2 Spring  979290
## 3 Summer 1052722
## 4 Winter  952936

Excise5

Excise5-1

read <- read.table("readingtimes.txt", header = T)
str(read) 
## 'data.frame':    7 obs. of  14 variables:
##  $ Snt : int  1 2 3 4 5 6 7
##  $ Sp  : int  1 2 3 4 5 6 7
##  $ Wrds: int  13 16 9 9 10 18 6
##  $ New : int  1 3 2 2 3 4 1
##  $ S01 : num  3.43 6.48 1.71 3.68 4 ...
##  $ S02 : num  2.79 5.41 2.34 3.71 2.9 ...
##  $ S03 : num  4.16 4.49 3.02 2.87 2.99 ...
##  $ S04 : num  3.07 5.06 2.46 2.73 2.67 ...
##  $ S05 : num  3.62 9.29 6.04 4.21 3.88 ...
##  $ S06 : num  3.16 5.64 2.46 6.24 3.22 ...
##  $ S07 : num  3.23 8.36 4.92 3.72 3.14 ...
##  $ S08 : num  7.16 4.31 3.37 6.33 6.14 ...
##  $ S09 : num  1.54 2.95 1.38 1.15 2.76 ...
##  $ S10 : num  4.06 6.65 2.18 3.66 3.33 ...
head(read)
##   Snt Sp Wrds New   S01   S02   S03   S04   S05    S06    S07   S08   S09   S10
## 1   1  1   13   1 3.429 2.795 4.161 3.071 3.625  3.161  3.232 7.161 1.536 4.063
## 2   2  2   16   3 6.482 5.411 4.491 5.063 9.295  5.643  8.357 4.313 2.946 6.652
## 3   3  3    9   2 1.714 2.339 3.018 2.464 6.045  2.455  4.920 3.366 1.375 2.179
## 4   4  4    9   2 3.679 3.714 2.866 2.732 4.205  6.241  3.723 6.330 1.152 3.661
## 5   5  5   10   3 4.000 2.902 2.991 2.670 3.884  3.223  3.143 6.143 2.759 3.330
## 6   6  6   18   4 6.973 8.018 6.625 7.571 8.795 13.188 11.170 6.071 7.964 7.866
colMeans(read[,5:14])
##      S01      S02      S03      S04      S05      S06      S07      S08 
## 4.130143 3.847000 3.774286 3.779286 5.620000 5.371286 5.228429 5.011429 
##      S09      S10 
## 2.741000 4.493714

sort()從小到大的排序 rank()每個數值對應的順序

sort(colMeans(read[,5:14]))
##      S09      S03      S04      S02      S01      S10      S08      S07 
## 2.741000 3.774286 3.779286 3.847000 4.130143 4.493714 5.011429 5.228429 
##      S06      S05 
## 5.371286 5.620000

#S09閱讀速度只需2.741,為最快

Excise5-2

library(dplyr)
read <- read.table("readingtimes.txt", header = T)
head(read )
##   Snt Sp Wrds New   S01   S02   S03   S04   S05    S06    S07   S08   S09   S10
## 1   1  1   13   1 3.429 2.795 4.161 3.071 3.625  3.161  3.232 7.161 1.536 4.063
## 2   2  2   16   3 6.482 5.411 4.491 5.063 9.295  5.643  8.357 4.313 2.946 6.652
## 3   3  3    9   2 1.714 2.339 3.018 2.464 6.045  2.455  4.920 3.366 1.375 2.179
## 4   4  4    9   2 3.679 3.714 2.866 2.732 4.205  6.241  3.723 6.330 1.152 3.661
## 5   5  5   10   3 4.000 2.902 2.991 2.670 3.884  3.223  3.143 6.143 2.759 3.330
## 6   6  6   18   4 6.973 8.018 6.625 7.571 8.795 13.188 11.170 6.071 7.964 7.866

#使用reshape2將資料轉成long

library(reshape2)

id.vars:不須轉的變項列出 variable.name 將要轉的變項取新的名子 value.name 轉的變項填入的數值取新的名子 這裡將S01~S10共10人轉呈長形資料

read_long<-melt(read,id.vars=c("Snt","Sp","Wrds","New"), variable.name = "S_id",value.name = "speed")
head(read_long)
##   Snt Sp Wrds New S_id speed
## 1   1  1   13   1  S01 3.429
## 2   2  2   16   3  S01 6.482
## 3   3  3    9   2  S01 1.714
## 4   4  4    9   2  S01 3.679
## 5   5  5   10   3  S01 4.000
## 6   6  6   18   4  S01 6.973
#計算每個人的閱讀速度,取新的名子為rate
read_long$rate = with(read_long, speed/Wrds)
head(read_long)
##   Snt Sp Wrds New S_id speed      rate
## 1   1  1   13   1  S01 3.429 0.2637692
## 2   2  2   16   3  S01 6.482 0.4051250
## 3   3  3    9   2  S01 1.714 0.1904444
## 4   4  4    9   2  S01 3.679 0.4087778
## 5   5  5   10   3  S01 4.000 0.4000000
## 6   6  6   18   4  S01 6.973 0.3873889
#計算每個人平均的閱讀速度
aggregate(rate ~ S_id, data=read_long, FUN=mean)
##    S_id      rate
## 1   S01 0.3563579
## 2   S02 0.3218649
## 3   S03 0.3285283
## 4   S04 0.3283256
## 5   S05 0.4939310
## 6   S06 0.4616710
## 7   S07 0.4297786
## 8   S08 0.4474266
## 9   S09 0.2205573
## 10  S10 0.3949539
#全部的人閱讀速度
colMeans(read_long[7])
##      rate 
## 0.3783395

#It takes time an average of 0.3783395