Functions exercise 1-5
HW 1
library(dplyr)
library(MASS)
library (reshape)
dta <- read.table("C:/Users/USER/Desktop/R_data management/0427/hs0.txt", sep="\t", quote="", h=T, as.is=T)
str(dta)## 'data.frame': 200 obs. of 11 variables:
## $ id : int 70 121 86 141 172 113 50 11 84 48 ...
## $ female : chr "male" "female" "male" "male" ...
## $ race : chr "white" "white" "white" "white" ...
## $ ses : chr "low" "middle" "high" "high" ...
## $ schtyp : chr "public" "public" "public" "public" ...
## $ prog : chr "general" "vocation" "general" "vocation" ...
## $ read : int 57 68 44 63 47 44 50 34 63 57 ...
## $ write : int 52 59 33 44 52 52 59 46 57 55 ...
## $ math : int 41 53 54 47 57 51 42 45 54 52 ...
## $ science: int 47 63 58 53 53 63 53 39 58 NA ...
## $ socst : int 57 61 31 56 61 61 61 36 51 51 ...
## [1] 200 11
## id female race ses schtyp prog read write math science socst
## 1 70 male white low public general 57 52 41 47 57
## 2 121 female white middle public vocation 68 59 53 63 61
## 3 86 male white high public general 44 33 54 58 31
## 4 141 male white high public vocation 63 44 47 53 56
## 5 172 male white middle public academic 47 52 57 53 61
## 6 113 male white middle public academic 44 52 51 63 61
(a) test if any pairs of the five variables: read, write, math, science, and socst, are different in means.
res <- dta%>%
select_if(is.numeric)%>%melt(., id='id')
pairwise.t.test(res$value, res$variable, p.adjust = "none")##
## Pairwise comparisons using t tests with pooled SD
##
## data: res$value and res$variable
##
## read write math science
## write 0.58 - - -
## math 0.68 0.90 - -
## science 0.76 0.39 0.47 -
## socst 0.86 0.71 0.81 0.63
##
## P value adjustment method: none
no significant difference between all paris of five variable ## (b) test if the 4 different ethnic groups have the same mean scores for each of the 5 variables (individually): read, write, math, science, and socst.
dta2<-dta[ ,c(3, 7, 8 ,9 , 10, 11)]
res2 <- dta2%>%melt(., id='race')
lapply(split(res2, res2$variable), function(x) anova(lm(x$value~factor(x$race))))## $read
## Analysis of Variance Table
##
## Response: x$value
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(x$race) 3 1749.8 583.27 5.9637 0.0006539 ***
## Residuals 196 19169.6 97.80
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## $write
## Analysis of Variance Table
##
## Response: x$value
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(x$race) 3 1914.2 638.05 7.8334 5.785e-05 ***
## Residuals 196 15964.7 81.45
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## $math
## Analysis of Variance Table
##
## Response: x$value
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(x$race) 3 1842.1 614.05 7.7033 6.84e-05 ***
## Residuals 196 15623.7 79.71
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## $science
## Analysis of Variance Table
##
## Response: x$value
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(x$race) 3 3169.5 1056.51 13.063 8.505e-08 ***
## Residuals 191 15447.2 80.88
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## $socst
## Analysis of Variance Table
##
## Response: x$value
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(x$race) 3 943.9 314.63 2.804 0.04098 *
## Residuals 196 21992.3 112.21
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mean of scores are significantly different between groups
(c) Perform all pairwise simple regressions for these variables: read, write, math, science, and socst.
dta3<-dta[ ,c(7:11)]
lapply(dta3, function(x){
coefficients( lm(cbind(dta3$read, dta3$write, dta3$math, dta3$science, dta3$socst)~x, data=dta3)) } )## $read
## [,1] [,2] [,3] [,4] [,5]
## (Intercept) -8.141284e-15 24.4095462 21.2425264 20.6602830 18.5561875
## x 1.000000e+00 0.5425249 0.6009356 0.5998076 0.6476622
##
## $write
## [,1] [,2] [,3] [,4] [,5]
## (Intercept) 18.7309118 -8.955412e-14 20.6572869 21.1309429 16.5279046
## x 0.6336486 1.000000e+00 0.6055514 0.5843927 0.6791647
##
## $math
## [,1] [,2] [,3] [,4] [,5]
## (Intercept) 14.5396491 20.2651022 0 17.49497 19.9529454
## x 0.7148764 0.6167729 1 0.65494 0.6155894
##
## $science
## [,1] [,2] [,3] [,4] [,5]
## (Intercept) 18.1571108 24.3565993 21.3916641 -8.141284e-15 26.1686196
## x 0.6540264 0.5455811 0.6003186 1.000000e+00 0.5034689
##
## $socst
## [,1] [,2] [,3] [,4] [,5]
## (Intercept) 21.5368195 25.229771 28.1291585 30.1197077 1.628257e-14
## x 0.5845412 0.524823 0.4670406 0.4167311 1.000000e+00
HW 2
The formula P = L (r/(1-(1+r)^(-M)) describes the payment you have to make per month for M number of months if you take out a loan of L amount today at a monthly interest rate of r. Compute how much you will have to pay per month for 10, 15, 20, 25, or 30 years if you borrow NT$5,000,000, 10,000,000, or 15,000,000 from a bank that charges you 2%, 5%, or 7% for the monthly interest rate.
int<-function(L, r, M){
P<-L*(r/(1-(1+r)^(-M)))
return(P)
}
L=list(5000000, 10000000, 15000000)
r=list(0.02, 0.05, 0.07)
M=list(c(10, 15, 20, 25, 30)*12)
mapply(int, L, r[[1]], M)## [,1] [,2] [,3]
## [1,] 110240.5 220481.0 330721.5
## [2,] 102913.7 205827.4 308741.0
## [3,] 100870.4 201740.8 302611.2
## [4,] 100263.7 200527.4 300791.1
## [5,] 100080.2 200160.4 300240.7
## [,1] [,2] [,3]
## [1,] 250718.6 501437.1 752155.7
## [2,] 250038.4 500076.7 750115.1
## [3,] 250002.1 500004.1 750006.2
## [4,] 250000.1 500000.2 750000.3
## [5,] 250000.0 500000.0 750000.0
## [,1] [,2] [,3]
## [1,] 350104.3 700208.5 1050313
## [2,] 350001.8 700003.6 1050005
## [3,] 350000.0 700000.1 1050000
## [4,] 350000.0 700000.0 1050000
## [5,] 350000.0 700000.0 1050000
## [[1]]
## [,1] [,2] [,3]
## [1,] 110240.5 220481.0 330721.5
## [2,] 102913.7 205827.4 308741.0
## [3,] 100870.4 201740.8 302611.2
## [4,] 100263.7 200527.4 300791.1
## [5,] 100080.2 200160.4 300240.7
##
## [[2]]
## [,1] [,2] [,3]
## [1,] 250718.6 501437.1 752155.7
## [2,] 250038.4 500076.7 750115.1
## [3,] 250002.1 500004.1 750006.2
## [4,] 250000.1 500000.2 750000.3
## [5,] 250000.0 500000.0 750000.0
##
## [[3]]
## [,1] [,2] [,3]
## [1,] 350104.3 700208.5 1050313
## [2,] 350001.8 700003.6 1050005
## [3,] 350000.0 700000.1 1050000
## [4,] 350000.0 700000.0 1050000
## [5,] 350000.0 700000.0 1050000
don’t know why list r did not work
HW 3
The following R script is an attempt to demonstrate the correspondence between parameter estimations by the least square method and the maximum likelihood method for the case of simple linear regression with a constant normal error term. 1. Construct a function from the script so that any deviance value for pairs of parameter estimates can be found. 2. Generalize the function further so that it will work with any data sets that can be modeled by a simple linear regression with a constant normal error term.
##
## Call:
## lm(formula = weight ~ height, data = women)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7333 -1.1333 -0.3833 0.7417 3.1167
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -87.51667 5.93694 -14.74 1.71e-09 ***
## height 3.45000 0.09114 37.85 1.09e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.525 on 13 degrees of freedom
## Multiple R-squared: 0.991, Adjusted R-squared: 0.9903
## F-statistic: 1433 on 1 and 13 DF, p-value: 1.091e-14
param <- c(coef(m0)[1], coef(m0)[2])
a <- param[1]
b <- param[2]
yhat <- a + b*women$height
e <- summary(m0)$sigma
lkhd <- dnorm(women$weight, mean=yhat, sd=e)
dvnc <- -2 * sum(log(lkhd))
ci_a <- coef(m0)[1] + unlist(summary(m0))$coefficients3*c(-2,2)
ci_b <- coef(m0)[2] + unlist(summary(m0))$coefficients4*c(-2,2)
bb <- expand.grid(a=seq(ci_a[1], ci_a[2], len=50),
b=seq(ci_b[1], ci_b[2], len=50))dvnc.f<-function(a,x,y){
m0<-lm(y~x, data=a)
param <- c(coef(m0)[1], coef(m0)[2])
i<-param[1]
b<-param[2]
yhat <- i + b*x
e <- summary(m0)$sigma
lkhd <- dnorm(y, mean=yhat, sd=e)
dvnc <- -2 * sum(log(lkhd))
print(param)
print(e)
print(lkhd)
print(dvnc)
}
dvnc.f(women, women$height, women$weight)## (Intercept) x
## -87.51667 3.45000
## [1] 1.525005
## [1] 0.07452923 0.21398769 0.24700963 0.26135075 0.25346523 0.22531900
## [7] 0.18359549 0.13712321 0.19359472 0.14741642 0.20326265 0.24608202
## [13] 0.26158497 0.15433520 0.03240932
## [1] 53.22809
lm.ci<-function(m0){
print(summary(m0))
param <- c(coef(m0)[1], coef(m0)[2])
a <- param[1]
b <- param[2]
ci_a <- coef(m0)[1] + unlist(summary(m0))$coefficients3*c(-1.96,1.96)
ci_b <- coef(m0)[2] + unlist(summary(m0))$coefficients4*c(-1.96,1.96)
print(paste0('95% CI of the intercept: [', min(ci_a), ', ', max(ci_a), ']'))
print(paste0('95% CI of the slope: [', min(ci_b), ', ', max(ci_b), ']'))
}HW 4
Modify this R script to create a function to compute the c-statistic illustrated with the data set in the article: Tryon, W.W. (1984). A simplified time-series analysis for evaluating treatment interventions. Journal of Applied Behavioral Analysis, 34(4), 230-233.
# # c-stat example # # read in data
dta <-read.table("C:/Users/USER/Desktop/R_data management/0427/cstat.txt", header=T)
str(dta) ## 'data.frame': 42 obs. of 1 variable:
## $ nc: int 28 46 39 45 24 20 35 37 36 40 ...
## nc
## 1 28
## 2 46
## 3 39
## 4 45
## 5 24
## 6 20
# # plot figure 1
#
{plot.new()
plot(1:42, dta[,1], xlab="Observations", ylab="Number of Children")
lines(1:42, dta[,1])
abline(v=10, lty=2)
abline(v=32, lty=2)
segments(1, mean(dta[1:10,1]),10, mean(dta[1:10,1]),col="red")
segments(11, mean(dta[11:32,1]),32, mean(dta[11:32,1]),col="red")
segments(33, mean(dta[33:42,1]),42, mean(dta[33:42,1]),col="red")
}#calculate c-stat for first baseline phase
cden<-1-(sum(diff(dta[1:10,1])^2)/(2*(10-1)*var(dta[1:10,1])))
sc<-sqrt((10-2)/((10-1)*(10+1)))
pval<-(1-pnorm(cden/sc))
pval ## [1] 0.2866238
## function
cstat.fun<-function(x){
n<-length(x)
cden <- 1-(sum(diff(x)^2)/(2*(n-1)*var(x)))
sc<-sqrt((n-2)/((n-1)*(n+1)))
pval <- 1-pnorm(cden/sc)
return(list(cden, sc,pval))
}
cstat.fun(dta[1:10, 1])## [[1]]
## [1] 0.1601208
##
## [[2]]
## [1] 0.2842676
##
## [[3]]
## [1] 0.2866238
HW 5
Plot the likelihood functoion to estimate the probability of graduate admission by gender, respectively, for Dept A in UCBAdmissions{datasets}. Construct approximate 95%-CI for each gender. Do they overlap?
library(datasets)
set.seed(1234)
theta <- seq(0.01, 0.99, by = .01)
n <- 100
n_m <- sum(UCBAdmissions[,1,1])
n_f <- sum(UCBAdmissions[,2,1])
p_m <- UCBAdmissions[1,1,1] / n_m
p_f <- UCBAdmissions[1,2,1] / n_f
y_m <- rbinom(n, 1, p_m)
y_f <- rbinom(n, 1, p_f)
llkhd_m <- sum(y_m) * log(theta) + (n - sum(y_m)) * log(1 - theta)
llkhd_f <- sum(y_f) * log(theta) + (n - sum(y_f)) * log(1 - theta)
plot(theta, llkhd_f, xlab = 'Probability', ylab = 'Likelihood', main = 'Grid search', type='n')
grid()
lines(theta, llkhd_m, col = 'hotpink1')
lines(theta, llkhd_f, col = 'dodgerblue1')
phat <- mean(y_m)
abline(v = phat, lty = 3, col = 'hotpink1')
arrows(phat - 2*sqrt(phat*(1-phat))/sqrt(n_m), -60,
phat + 2*sqrt(phat*(1-phat))/sqrt(n_m), -60,
code = 3, length = 0.1, angle = 90, col = 'hotpink1', lwd = 2)
phat <- mean(y_f)
abline(v = phat, lty = 3, col = 'dodgerblue1')
arrows(phat - 2*sqrt(phat*(1-phat))/sqrt(n_f), -80,
phat + 2*sqrt(phat*(1-phat))/sqrt(n_f), -80,
code = 3, length = 0.1, angle = 90, col = 'dodgerblue1', lwd = 2)
legend(x = .2, y = -2200, legend = c('Male', 'Female'), cex = 1.2, lty = 1,
col = c('hotpink1', 'dodgerblue1'), bty = 'n')