Use the read and math variables from the high schools data example for this problem. First firgure out what this R script does and then eliminate the for loop in the code segment.
dta <- read.table("C:/Users/TheorEco Lab/Desktop/108-2/DataManagement/0511/hs0.txt", header=TRUE)
str(dta)
## 'data.frame': 200 obs. of 11 variables:
## $ id : int 70 121 86 141 172 113 50 11 84 48 ...
## $ female : Factor w/ 2 levels "female","male": 2 1 2 2 2 2 2 2 2 2 ...
## $ race : Factor w/ 4 levels "african-amer",..: 4 4 4 4 4 4 1 3 4 1 ...
## $ ses : Factor w/ 3 levels "high","low","middle": 2 3 1 1 3 3 3 3 3 3 ...
## $ schtyp : Factor w/ 2 levels "private","public": 2 2 2 2 2 2 2 2 2 2 ...
## $ prog : Factor w/ 3 levels "academic","general",..: 2 3 2 3 1 1 2 1 2 1 ...
## $ read : int 57 68 44 63 47 44 50 34 63 57 ...
## $ write : int 52 59 33 44 52 52 59 46 57 55 ...
## $ math : int 41 53 54 47 57 51 42 45 54 52 ...
## $ science: int 47 63 58 53 53 63 53 39 58 NA ...
## $ socst : int 57 61 31 56 61 61 61 36 51 51 ...
dta.asian <- subset(dta, race=="asian")
r0 <- cor(dta.asian$math, dta.asian$socst)
cnt <- 0
nIter <- 1001
for (i in 1:nIter) {
new <- sample(dta.asian$read)
r <- cor(new, dta.asian$math)
if ( r0 <= r ) cnt <- cnt+1
}
cnt/nIter
## [1] 0.03896104
先讀取資料中種族為 asian 的資料做為子集,再將亞洲人子集的 math 和 socst 取相關係數設成 r0。
cnt 為項目次數,nIter 為接下來 for loop 重複 1001 次。設定 new 來取樣亞洲人子集的 read ,並計算與 math 的相關係數。當 math 與 socst 的相關係數小於等於其與 read 的,則計算為 1 ,並使用 for loop 重複。
最後計算 read-math的相關係數大於等於 math-socst的比例。
先重複取樣亞洲人子集的 read資料,重複 nIter 次,每一次的取樣為一欄資料,定為 newread,再將 newread 另設為一 dataframe。
將 newread 的資料每一欄取樣的資料與 math 分別取相關係數。
newread <- replicate(nIter, sample(dta.asian$read))
newread <- data.frame(unlist(newread))
with(newread, lapply(names(newread), function(x)
cor(dta.asian$math, eval(substitute(tmp, list(tmp=as.name(x)))))))
取種族為亞洲人資料的 math 和 socst 資料的相關。
cor.test(dta[dta$race=="asian", "math"], dta[dta$race=="asian", "socst"])
##
## Pearson's product-moment correlation
##
## data: dta[dta$race == "asian", "math"] and dta[dta$race == "asian", "socst"]
## t = 1.9887, df = 9, p-value = 0.07796
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.07083501 0.86552255
## sample estimates:
## cor
## 0.5525177
An example of obtaining (nonparametric) bootstrap estimates of coefficients for a multiple linear regression model for the anorexia{MASS} data set is presented in the following script . Figure out how it works and improve the code.
data(anorexia, package="MASS")
dtadta <- anorexia
dta 資料的基本統計結果summary(dta)
## Treat Prewt Postwt
## CBT :29 Min. :70.00 Min. : 71.30
## Cont:26 1st Qu.:79.60 1st Qu.: 79.33
## FT :17 Median :82.30 Median : 84.05
## Mean :82.41 Mean : 85.17
## 3rd Qu.:86.00 3rd Qu.: 91.55
## Max. :94.90 Max. :103.60
dta 資料為72*3 維度的資料dim(dta)
## [1] 72 3
dta 資料的 row 名稱,結果為數字順序row.names(dta)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30"
## [31] "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44" "45"
## [46] "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56" "57" "58" "59" "60"
## [61] "61" "62" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72"
dta 資料裡的 Prewt 和 Treat 與 Postwt 做迴歸分析anrx_lm <- lm(Postwt ~ Prewt + Treat, data=dta)
Prewt 和 Treat 皆對 Postwt 有不同程度顯著影響summary(anrx_lm)
##
## Call:
## lm(formula = Postwt ~ Prewt + Treat, data = dta)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.1083 -4.2773 -0.5484 5.4838 15.2922
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.7711 13.3910 3.717 0.00041 ***
## Prewt 0.4345 0.1612 2.695 0.00885 **
## TreatCont -4.0971 1.8935 -2.164 0.03400 *
## TreatFT 4.5631 2.1333 2.139 0.03604 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.978 on 68 degrees of freedom
## Multiple R-squared: 0.2777, Adjusted R-squared: 0.2458
## F-statistic: 8.713 on 3 and 68 DF, p-value: 5.719e-05
coef(anrx_lm)
## (Intercept) Prewt TreatCont TreatFT
## 49.7711090 0.4344612 -4.0970655 4.5630627
bootstrap_function <- function(string_formula, nc, model_data, ndraws) {
coeff_mtx <- matrix(0, nrow = ndraws, ncol = nc)
for (i in 1:ndraws) {
bootstrap_ids <- sample(seq(nrow(model_data)), nrow(model_data), replace = TRUE)
bootstrap_data <- model_data[bootstrap_ids,]
bootstrap_model <- lm(as.formula(string_formula), bootstrap_data)
coeff_mtx[i,] <- bootstrap_model$coefficients
if (i == 1) {
print(bootstrap_model$coefficients)
}
}
return(coeff_mtx)
}
coef() function 的結果和呈現格式是一樣的bootstrap_estimates <- bootstrap_function("Postwt ~ Prewt + Treat", nc=4, dta, 500)
## (Intercept) Prewt TreatCont TreatFT
## 38.2166336 0.5627407 -0.6300966 3.0210706
bootstrap_CIs <- matrix(0, nrow = 4, ncol = 2)
for (i in 1:4) {
bootstrap_CIs[i,1] <- quantile(bootstrap_estimates[,i], 0.025)
bootstrap_CIs[i,2] <- quantile(bootstrap_estimates[,i], 0.975)
}
print(bootstrap_CIs)
## [,1] [,2]
## [1,] 17.1511832 77.2752688
## [2,] 0.1103611 0.8253463
## [3,] -7.2044018 -0.4780539
## [4,] 0.6610773 9.1096771
confint(anrx_lm)
## 2.5 % 97.5 %
## (Intercept) 23.0498681 76.4923499
## Prewt 0.1128268 0.7560955
## TreatCont -7.8754712 -0.3186599
## TreatFT 0.3060571 8.8200682
par(mfrow=c(2,2))
hist(bootstrap_estimates[,1])
hist(bootstrap_estimates[,2])
hist(bootstrap_estimates[,3])
hist(bootstrap_estimates[,4])
What does the R script do? Replace the “while” loop in the code with a different iteration control structure.
Brownian <- function(n = 11, pause = 0.05, nIter = 6, ...) {
x = rnorm(n)
y = rnorm(n)
i = 1
while (i <= nIter) {
plot(x, y, ...)
text(x, y, cex = 0.5)
x = x + rnorm(n)
y = y + rnorm(n)
Sys.sleep(pause)
i = i + 1
}
}
*重複次數改為 6。
## test it
Brownian(xlim = c(-20, 20), ylim = c(-20, 20),
pch = 21, cex = 2, col = "cyan", bg = "lavender")
Brownian <- function(n = 11, pause = 0.05, nIter = 6, ...) {
x = rnorm(n)
y = rnorm(n)
i = 1
repeat {
plot(x, y, ...)
text(x, y, cex = 0.5)
x = x + rnorm(n)
y = y + rnorm(n)
Sys.sleep(pause)
i = i + 1
if (i > nIter) break
}
}
## test it
Brownian(xlim = c(-20, 20), ylim = c(-20, 20),
pch = 21, cex = 2, col = "cyan", bg = "lavender")
Here is an example of simulation using R. Figure out how it works and improve the code.
This example shows how to sample, at random, from a population of 50.5% men and 49.5% women, in which height is normally distributed with a mean of 170 cm and a standard deviation of 7 cm for men, with corresponding values 160 and 5 for women. The output is a data frame, with the first column showing gender (M for male, F for female) and the second column showing height.
simSexHeight <- function(n) {
m <- matrix(c(runif(n), rnorm(n)), ncol=2)
draw <- function(mr) { # mr = one row of m
if (mr[1] < 0.505) {
sex <- 1
cm <- 170 + 7*mr[2]
} else {
sex <- 0
cm <- 160 + 5*mr[2]
}
return(c(sex, cm))
}
person <- t(apply(m, 1, draw))
person <- as.data.frame(person)
person[, 1] <- ifelse(person[, 1]==1, "M", "F")
names(person) <- c("Gender", "Height")
return(person)
}
simSexHeight(5)
## Gender Height
## 1 F 164.0699
## 2 F 161.3493
## 3 M 169.9022
## 4 F 152.8734
## 5 M 176.7505
simSexHeight 這個 function 呈現出來的會是一個兩欄的 matrix,row 為隨機分佈且標準化。
而這個 matrix 裡的每行的質,當第一欄的質小於 0.505 ,則定義性別為 1 ,身高為 170 + 7*mr[2],這表示 matrix 第二欄的隨機標準化數值為所加上得標準差倍率。此為男性。
至於其他的,則定義性別為 0,身高為 160 + 5*mr[2],為女性。
將此 matrix 和所填入的資料 draw 合併成 person,並轉成 dataframe。在 person 的第一欄的質若為 1 ,改呈現為 M,不然呈 F。
最後將 person 的欄位名稱修改,而simSexHeight 最終以 person 為呈現。