library(MASS)
head(anorexia)
## Treat Prewt Postwt
## 1 Cont 80.7 80.2
## 2 Cont 89.4 80.1
## 3 Cont 91.8 86.4
## 4 Cont 74.0 86.3
## 5 Cont 78.1 76.1
## 6 Cont 88.3 78.1
str(anorexia)
## 'data.frame': 72 obs. of 3 variables:
## $ Treat : Factor w/ 3 levels "CBT","Cont","FT": 2 2 2 2 2 2 2 2 2 2 ...
## $ Prewt : num 80.7 89.4 91.8 74 78.1 88.3 87.3 75.1 80.6 78.4 ...
## $ Postwt: num 80.2 80.1 86.4 86.3 76.1 78.1 75.1 86.7 73.5 84.6 ...
#第一列的內容列出
anorexia[1,]
## Treat Prewt Postwt
## 1 Cont 80.7 80.2
#第二欄位的內容列出
head(anorexia[,2])
## [1] 80.7 89.4 91.8 74.0 78.1 88.3
#指定列出Prewt這變項的資料
head(anorexia["Prewt"] )
## Prewt
## 1 80.7
## 2 89.4
## 3 91.8
## 4 74.0
## 5 78.1
## 6 88.3
lm(Postwt ~ Treat, data=anorexia)
##
## Call:
## lm(formula = Postwt ~ Treat, data = anorexia)
##
## Coefficients:
## (Intercept) TreatCont TreatFT
## 85.697 -4.589 4.798
lm(Postwt ~ as.numeric(Treat), data=anorexia)
##
## Call:
## lm(formula = Postwt ~ as.numeric(Treat), data = anorexia)
##
## Coefficients:
## (Intercept) as.numeric(Treat)
## 82.036 1.711
#總結 lm為 linear models, 但Treat為factor變項,不適用於lm。 因as.numeric(Treat)強制將Treat變成數字型,技術上雖可跑lm,但以統計觀點Treat代表的是不同治療方案,不適用lm,跑ANOVA較合適,比較不同治療的差異。
die_1<-sample(1:6, replace=TRUE)
die_1
## [1] 4 4 6 4 6 5
#3個die加總
set.seed(1234)
die_a<-sample(1:6, size=3, replace=TRUE)
die_a
## [1] 4 2 6
sum(die_a)
## [1] 12
出現的組合有20種 有1~6 六個號碼,任取3個相加,C6取3=(654)/(321)=20
set.seed(1234)
three.dice <- function(){
dice <- sample(1:6, size = 3, replace = TRUE)
return(sum(dice))
}
#Approximating Probabilities 做100次相加 expr返回數,有無都可
set.seed(1234)
replicate(n = 100, expr = three.dice())
## [1] 12 10 15 10 16 18 13 11 12 11 11 10 9 13 10 12 4 6 10 10 13 7 9 14 9
## [26] 14 9 11 13 11 14 15 9 10 12 9 11 13 7 17 7 14 15 14 13 7 10 12 12 7
## [51] 9 11 4 10 14 5 6 10 14 14 8 13 9 8 7 9 13 7 17 10 10 9 8 13 10
## [76] 14 11 13 7 12 17 13 9 6 17 7 7 11 9 13 11 9 16 6 12 13 13 11 8 10
sims<-replicate(n = 100, expr = three.dice())
看數字出現的次數
table(sims)
## sims
## 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 1 3 1 7 10 10 11 10 14 13 10 6 2 1 1
算數字出現的機率
table(sims)/length(sims)
## sims
## 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 0.01 0.03 0.01 0.07 0.10 0.10 0.11 0.10 0.14 0.13 0.10 0.06 0.02 0.01 0.01
plot(table(sims), xlab = 'Sum', ylab = 'Frequency', main = '20 Rolls of 3 Fair Dice', type="h")
#密度直方圖
hist(sims, probability = T)
#總結 為probability histogram
#An R script for IQ_Beh data set
dta <- read.table("IQ_Beh.txt", header = T, row.names = 1)
str(dta)
## 'data.frame': 94 obs. of 3 variables:
## $ Dep: chr "N" "N" "N" "N" ...
## $ IQ : int 103 124 124 104 96 92 124 99 92 116 ...
## $ BP : int 4 12 9 3 3 3 6 4 3 9 ...
head(dta)
## Dep IQ BP
## 1 N 103 4
## 2 N 124 12
## 3 N 124 9
## 4 N 104 3
## 5 D 96 3
## 6 N 92 3
#資料型態 class()
class(dta)
## [1] "data.frame"
#各維度長度 dim()先顯示列,再顯示行 94筆資料3個欄位
dim(dta)
## [1] 94 3
#各維度名稱 dimnames()
dimnames(dta)
## [[1]]
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30"
## [31] "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44" "45"
## [46] "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56" "57" "58" "59" "60"
## [61] "61" "62" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72" "73" "74" "75"
## [76] "76" "77" "78" "79" "80" "81" "82" "83" "84" "85" "86" "87" "88" "89" "90"
## [91] "91" "92" "93" "94"
##
## [[2]]
## [1] "Dep" "IQ" "BP"
names(dta)
## [1] "Dep" "IQ" "BP"
#vector檢查基本數據結構,is.vector()除名稱外沒有其他屬性的向量時才返回真
is.vector(dta$BP)
## [1] TRUE
#秀出第一列row資料
dta[1, ]
## Dep IQ BP
## 1 N 103 4
#秀出第一列row資料,IQ欄位的3筆資料
dta[1:3, "IQ"]
## [1] 103 124 124
#若沒有指定IQ欄位,秀出三個欄位的資料
head(dta[1:3])
## Dep IQ BP
## 1 N 103 4
## 2 N 124 12
## 3 N 124 9
## 4 N 104 3
## 5 D 96 3
## 6 N 92 3
#把BP大到小排列後,秀出最後6筆 tail(): 顯示data後6筆資料 order(),把數列由大排到小,從小排到大,decreasing = FALSE order(a, decreasing=TRUE)]
tail(dta[order(dta$BP), ])
## Dep IQ BP
## 16 N 89 11
## 58 N 117 11
## 66 N 126 11
## 2 N 124 12
## 73 D 99 13
## 12 D 22 17
BP由小到大排序可用decreasing=TRUE
tail(dta[order(dta$BP,decreasing=TRUE), ])
## Dep IQ BP
## 94 N 114 2
## 72 N 115 1
## 77 N 124 1
## 80 N 121 1
## 24 N 106 0
## 75 N 122 0
BP由小到大排序,直接用(-dta$BP)加負號更快
tail(dta[order(-dta$BP), ])
## Dep IQ BP
## 94 N 114 2
## 72 N 115 1
## 77 N 124 1
## 80 N 121 1
## 24 N 106 0
## 75 N 122 0
#秀出最後4筆由小到大的排序
tail(dta[order(-dta$BP), ], 4)
## Dep IQ BP
## 77 N 124 1
## 80 N 121 1
## 24 N 106 0
## 75 N 122 0
#with(data,table,hist,….)
with(dta, hist(IQ, xlab = "IQ", main = ""))
boxplot(BP ~ Dep, data = dta,
xlab = "Depression",
ylab = "Behavior problem score")
#用text可以加上文字 abline()函數給當前繪圖添加一條或多條直線 lty是線的type
plot(BP ~ IQ, data = dta, type = "n",
ylab = "Behavior problem score", xlab = "IQ")
text(dta$IQ, dta$BP, labels = dta$Dep, cex = 0.5)
abline(lm(BP ~ IQ, data = dta, subset = Dep == "D"),col='blue')
abline(lm(BP ~ IQ, data = dta, subset = Dep == "N"), lty = 2)
##end
library(dplyr)
##
## 載入套件:'dplyr'
## 下列物件被遮斷自 'package:MASS':
##
## select
## 下列物件被遮斷自 'package:stats':
##
## filter, lag
## 下列物件被遮斷自 'package:base':
##
## intersect, setdiff, setequal, union
usBirths <- read.table("usBirths2015.txt", header = T)
head(usBirths)
## birth month
## 1 325955 January
## 2 298058 February
## 3 328923 March
## 4 320832 April
## 5 327917 May
## 6 330541 June
str(usBirths)
## 'data.frame': 12 obs. of 2 variables:
## $ birth: int 325955 298058 328923 320832 327917 330541 353415 351791 347516 339007 ...
## $ month: chr "January" "February" "March" "April" ...
usBirths
## birth month
## 1 325955 January
## 2 298058 February
## 3 328923 March
## 4 320832 April
## 5 327917 May
## 6 330541 June
## 7 353415 July
## 8 351791 August
## 9 347516 September
## 10 339007 October
## 11 318820 November
## 12 335722 December
usBirths$season<- c("Winter","Winter","Winter","Spring","Spring","Spring","Summer","Summer","Summer","Fall","Fall","Fall")
head(usBirths)
## birth month season
## 1 325955 January Winter
## 2 298058 February Winter
## 3 328923 March Winter
## 4 320832 April Spring
## 5 327917 May Spring
## 6 330541 June Spring
usBirths$season <- as.factor(usBirths$season)
#嘗試合併的作法
season1<- c("Winter","Winter","Winter","Spring","Spring","Spring","Summer","Summer","Summer","Fall","Fall","Fall")
new<- cbind(usBirths, season1)
new
## birth month season season1
## 1 325955 January Winter Winter
## 2 298058 February Winter Winter
## 3 328923 March Winter Winter
## 4 320832 April Spring Spring
## 5 327917 May Spring Spring
## 6 330541 June Spring Spring
## 7 353415 July Summer Summer
## 8 351791 August Summer Summer
## 9 347516 September Summer Summer
## 10 339007 October Fall Fall
## 11 318820 November Fall Fall
## 12 335722 December Fall Fall
aggregate(birth ~ season,data=usBirths, FUN=sum)
## season birth
## 1 Fall 993549
## 2 Spring 979290
## 3 Summer 1052722
## 4 Winter 952936
read <- read.table("readingtimes.txt", header = T)
str(read)
## 'data.frame': 7 obs. of 14 variables:
## $ Snt : int 1 2 3 4 5 6 7
## $ Sp : int 1 2 3 4 5 6 7
## $ Wrds: int 13 16 9 9 10 18 6
## $ New : int 1 3 2 2 3 4 1
## $ S01 : num 3.43 6.48 1.71 3.68 4 ...
## $ S02 : num 2.79 5.41 2.34 3.71 2.9 ...
## $ S03 : num 4.16 4.49 3.02 2.87 2.99 ...
## $ S04 : num 3.07 5.06 2.46 2.73 2.67 ...
## $ S05 : num 3.62 9.29 6.04 4.21 3.88 ...
## $ S06 : num 3.16 5.64 2.46 6.24 3.22 ...
## $ S07 : num 3.23 8.36 4.92 3.72 3.14 ...
## $ S08 : num 7.16 4.31 3.37 6.33 6.14 ...
## $ S09 : num 1.54 2.95 1.38 1.15 2.76 ...
## $ S10 : num 4.06 6.65 2.18 3.66 3.33 ...
head(read)
## Snt Sp Wrds New S01 S02 S03 S04 S05 S06 S07 S08 S09 S10
## 1 1 1 13 1 3.429 2.795 4.161 3.071 3.625 3.161 3.232 7.161 1.536 4.063
## 2 2 2 16 3 6.482 5.411 4.491 5.063 9.295 5.643 8.357 4.313 2.946 6.652
## 3 3 3 9 2 1.714 2.339 3.018 2.464 6.045 2.455 4.920 3.366 1.375 2.179
## 4 4 4 9 2 3.679 3.714 2.866 2.732 4.205 6.241 3.723 6.330 1.152 3.661
## 5 5 5 10 3 4.000 2.902 2.991 2.670 3.884 3.223 3.143 6.143 2.759 3.330
## 6 6 6 18 4 6.973 8.018 6.625 7.571 8.795 13.188 11.170 6.071 7.964 7.866
colMeans(read[,5:14])
## S01 S02 S03 S04 S05 S06 S07 S08
## 4.130143 3.847000 3.774286 3.779286 5.620000 5.371286 5.228429 5.011429
## S09 S10
## 2.741000 4.493714
sort()從小到大的排序 rank()每個數值對應的順序
sort(colMeans(read[,5:14]))
## S09 S03 S04 S02 S01 S10 S08 S07
## 2.741000 3.774286 3.779286 3.847000 4.130143 4.493714 5.011429 5.228429
## S06 S05
## 5.371286 5.620000
#S09閱讀速度只需2.741,為最快
library(dplyr)
read <- read.table("readingtimes.txt", header = T)
head(read )
## Snt Sp Wrds New S01 S02 S03 S04 S05 S06 S07 S08 S09 S10
## 1 1 1 13 1 3.429 2.795 4.161 3.071 3.625 3.161 3.232 7.161 1.536 4.063
## 2 2 2 16 3 6.482 5.411 4.491 5.063 9.295 5.643 8.357 4.313 2.946 6.652
## 3 3 3 9 2 1.714 2.339 3.018 2.464 6.045 2.455 4.920 3.366 1.375 2.179
## 4 4 4 9 2 3.679 3.714 2.866 2.732 4.205 6.241 3.723 6.330 1.152 3.661
## 5 5 5 10 3 4.000 2.902 2.991 2.670 3.884 3.223 3.143 6.143 2.759 3.330
## 6 6 6 18 4 6.973 8.018 6.625 7.571 8.795 13.188 11.170 6.071 7.964 7.866
#使用reshape2將資料轉成long
library(reshape2)
id.vars:不須轉的變項列出 variable.name 將要轉的變項取新的名子 value.name 轉的變項填入的數值取新的名子 這裡將S01~S10共10人轉呈長形資料
read_long<-melt(read,id.vars=c("Snt","Sp","Wrds","New"), variable.name = "S_id",value.name = "speed")
head(read_long)
## Snt Sp Wrds New S_id speed
## 1 1 1 13 1 S01 3.429
## 2 2 2 16 3 S01 6.482
## 3 3 3 9 2 S01 1.714
## 4 4 4 9 2 S01 3.679
## 5 5 5 10 3 S01 4.000
## 6 6 6 18 4 S01 6.973
#計算每個人的閱讀速度,取新的名子為rate
read_long$rate = with(read_long, speed/Wrds)
head(read_long)
## Snt Sp Wrds New S_id speed rate
## 1 1 1 13 1 S01 3.429 0.2637692
## 2 2 2 16 3 S01 6.482 0.4051250
## 3 3 3 9 2 S01 1.714 0.1904444
## 4 4 4 9 2 S01 3.679 0.4087778
## 5 5 5 10 3 S01 4.000 0.4000000
## 6 6 6 18 4 S01 6.973 0.3873889
#計算每個人平均的閱讀速度
aggregate(rate ~ S_id, data=read_long, FUN=mean)
## S_id rate
## 1 S01 0.3563579
## 2 S02 0.3218649
## 3 S03 0.3285283
## 4 S04 0.3283256
## 5 S05 0.4939310
## 6 S06 0.4616710
## 7 S07 0.4297786
## 8 S08 0.4474266
## 9 S09 0.2205573
## 10 S10 0.3949539
#全部的人閱讀速度
colMeans(read_long[7])
## rate
## 0.3783395
#It takes time an average of 0.3783395