HW 1

library(dplyr)
library(MASS)
library (reshape)
dta <- read.table("C:/Users/USER/Desktop/R_data management/0427/hs0.txt", sep="\t", quote="", h=T, as.is=T)

str(dta)

## 'data.frame':    200 obs. of  11 variables:
##  $ id     : int  70 121 86 141 172 113 50 11 84 48 ...
##  $ female : chr  "male" "female" "male" "male" ...
##  $ race   : chr  "white" "white" "white" "white" ...
##  $ ses    : chr  "low" "middle" "high" "high" ...
##  $ schtyp : chr  "public" "public" "public" "public" ...
##  $ prog   : chr  "general" "vocation" "general" "vocation" ...
##  $ read   : int  57 68 44 63 47 44 50 34 63 57 ...
##  $ write  : int  52 59 33 44 52 52 59 46 57 55 ...
##  $ math   : int  41 53 54 47 57 51 42 45 54 52 ...
##  $ science: int  47 63 58 53 53 63 53 39 58 NA ...
##  $ socst  : int  57 61 31 56 61 61 61 36 51 51 ...

dim(dta)

## [1] 200  11

head(dta)

##    id female  race    ses schtyp     prog read write math science socst
## 1  70   male white    low public  general   57    52   41      47    57
## 2 121 female white middle public vocation   68    59   53      63    61
## 3  86   male white   high public  general   44    33   54      58    31
## 4 141   male white   high public vocation   63    44   47      53    56
## 5 172   male white middle public academic   47    52   57      53    61
## 6 113   male white middle public academic   44    52   51      63    61

(a) test if any pairs of the five variables: read, write, math, science, and socst, are different in means.

res <- dta%>% 
  select_if(is.numeric)%>%melt(., id='id')
pairwise.t.test(res$value, res$variable, p.adjust = "none")

## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  res$value and res$variable 
## 
##         read write math science
## write   0.58 -     -    -      
## math    0.68 0.90  -    -      
## science 0.76 0.39  0.47 -      
## socst   0.86 0.71  0.81 0.63   
## 
## P value adjustment method: none

no significant difference between all paris of five variable ## (b) test if the 4 different ethnic groups have the same mean scores for each of the 5 variables (individually): read, write, math, science, and socst.

dta2<-dta[ ,c(3, 7, 8 ,9 , 10, 11)]
res2 <- dta2%>%melt(., id='race')

lapply(split(res2, res2$variable), function(x) anova(lm(x$value~factor(x$race))))

## $read
## Analysis of Variance Table
## 
## Response: x$value
##                 Df  Sum Sq Mean Sq F value    Pr(>F)    
## factor(x$race)   3  1749.8  583.27  5.9637 0.0006539 ***
## Residuals      196 19169.6   97.80                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $write
## Analysis of Variance Table
## 
## Response: x$value
##                 Df  Sum Sq Mean Sq F value    Pr(>F)    
## factor(x$race)   3  1914.2  638.05  7.8334 5.785e-05 ***
## Residuals      196 15964.7   81.45                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $math
## Analysis of Variance Table
## 
## Response: x$value
##                 Df  Sum Sq Mean Sq F value   Pr(>F)    
## factor(x$race)   3  1842.1  614.05  7.7033 6.84e-05 ***
## Residuals      196 15623.7   79.71                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $science
## Analysis of Variance Table
## 
## Response: x$value
##                 Df  Sum Sq Mean Sq F value    Pr(>F)    
## factor(x$race)   3  3169.5 1056.51  13.063 8.505e-08 ***
## Residuals      191 15447.2   80.88                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $socst
## Analysis of Variance Table
## 
## Response: x$value
##                 Df  Sum Sq Mean Sq F value  Pr(>F)  
## factor(x$race)   3   943.9  314.63   2.804 0.04098 *
## Residuals      196 21992.3  112.21                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

mean of scores are significantly different between groups

(c) Perform all pairwise simple regressions for these variables: read, write, math, science, and socst.

dta3<-dta[ ,c(7:11)]
lapply(dta3, function(x){ 
  coefficients( lm(cbind(dta3$read, dta3$write, dta3$math, dta3$science, dta3$socst)~x, data=dta3)) } )

## $read
##                      [,1]       [,2]       [,3]       [,4]       [,5]
## (Intercept) -8.141284e-15 24.4095462 21.2425264 20.6602830 18.5561875
## x            1.000000e+00  0.5425249  0.6009356  0.5998076  0.6476622
## 
## $write
##                   [,1]          [,2]       [,3]       [,4]       [,5]
## (Intercept) 18.7309118 -8.955412e-14 20.6572869 21.1309429 16.5279046
## x            0.6336486  1.000000e+00  0.6055514  0.5843927  0.6791647
## 
## $math
##                   [,1]       [,2] [,3]     [,4]       [,5]
## (Intercept) 14.5396491 20.2651022    0 17.49497 19.9529454
## x            0.7148764  0.6167729    1  0.65494  0.6155894
## 
## $science
##                   [,1]       [,2]       [,3]          [,4]       [,5]
## (Intercept) 18.1571108 24.3565993 21.3916641 -8.141284e-15 26.1686196
## x            0.6540264  0.5455811  0.6003186  1.000000e+00  0.5034689
## 
## $socst
##                   [,1]      [,2]       [,3]       [,4]         [,5]
## (Intercept) 21.5368195 25.229771 28.1291585 30.1197077 1.628257e-14
## x            0.5845412  0.524823  0.4670406  0.4167311 1.000000e+00

HW 2

The formula P = L (r/(1-(1+r)^(-M)) describes the payment you have to make per month for M number of months if you take out a loan of L amount today at a monthly interest rate of r. Compute how much you will have to pay per month for 10, 15, 20, 25, or 30 years if you borrow NT$5,000,000, 10,000,000, or 15,000,000 from a bank that charges you 2%, 5%, or 7% for the monthly interest rate.

int<-function(L, r, M){   
  P<-L*(r/(1-(1+r)^(-M)))
  return(P)  
  }
L=list(5000000, 10000000, 15000000)
r=list(0.02, 0.05, 0.07)
M=list(c(10, 15, 20, 25, 30)*12)
 mapply(int, L, r[[1]], M)

##          [,1]     [,2]     [,3]
## [1,] 110240.5 220481.0 330721.5
## [2,] 102913.7 205827.4 308741.0
## [3,] 100870.4 201740.8 302611.2
## [4,] 100263.7 200527.4 300791.1
## [5,] 100080.2 200160.4 300240.7

 mapply(int, L, r[[2]], M)

##          [,1]     [,2]     [,3]
## [1,] 250718.6 501437.1 752155.7
## [2,] 250038.4 500076.7 750115.1
## [3,] 250002.1 500004.1 750006.2
## [4,] 250000.1 500000.2 750000.3
## [5,] 250000.0 500000.0 750000.0

 mapply(int, L, r[[3]], M)

##          [,1]     [,2]    [,3]
## [1,] 350104.3 700208.5 1050313
## [2,] 350001.8 700003.6 1050005
## [3,] 350000.0 700000.1 1050000
## [4,] 350000.0 700000.0 1050000
## [5,] 350000.0 700000.0 1050000

 ## alternative
 lapply(r, function(x) mapply(int, L, x, M))

## [[1]]
##          [,1]     [,2]     [,3]
## [1,] 110240.5 220481.0 330721.5
## [2,] 102913.7 205827.4 308741.0
## [3,] 100870.4 201740.8 302611.2
## [4,] 100263.7 200527.4 300791.1
## [5,] 100080.2 200160.4 300240.7
## 
## [[2]]
##          [,1]     [,2]     [,3]
## [1,] 250718.6 501437.1 752155.7
## [2,] 250038.4 500076.7 750115.1
## [3,] 250002.1 500004.1 750006.2
## [4,] 250000.1 500000.2 750000.3
## [5,] 250000.0 500000.0 750000.0
## 
## [[3]]
##          [,1]     [,2]    [,3]
## [1,] 350104.3 700208.5 1050313
## [2,] 350001.8 700003.6 1050005
## [3,] 350000.0 700000.1 1050000
## [4,] 350000.0 700000.0 1050000
## [5,] 350000.0 700000.0 1050000

don’t know why list r did not work

HW 3

The following R script is an attempt to demonstrate the correspondence between parameter estimations by the least square method and the maximum likelihood method for the case of simple linear regression with a constant normal error term. 1. Construct a function from the script so that any deviance value for pairs of parameter estimates can be found. 2. Generalize the function further so that it will work with any data sets that can be modeled by a simple linear regression with a constant normal error term.

m0 <- lm(weight ~ height, data=women)
# 
summary(m0)

## 
## Call:
## lm(formula = weight ~ height, data = women)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7333 -1.1333 -0.3833  0.7417  3.1167 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -87.51667    5.93694  -14.74 1.71e-09 ***
## height        3.45000    0.09114   37.85 1.09e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.525 on 13 degrees of freedom
## Multiple R-squared:  0.991,  Adjusted R-squared:  0.9903 
## F-statistic:  1433 on 1 and 13 DF,  p-value: 1.091e-14

param <- c(coef(m0)[1], coef(m0)[2])
a <- param[1]
b <- param[2]
yhat <- a + b*women$height
e <- summary(m0)$sigma
lkhd <- dnorm(women$weight, mean=yhat, sd=e)
dvnc <- -2 * sum(log(lkhd))
ci_a <- coef(m0)[1] + unlist(summary(m0))$coefficients3*c(-2,2)
ci_b <- coef(m0)[2] + unlist(summary(m0))$coefficients4*c(-2,2)
bb <- expand.grid(a=seq(ci_a[1], ci_a[2], len=50),
                  b=seq(ci_b[1], ci_b[2], len=50))

dvnc.f<-function(a,x,y){ 
  
  m0<-lm(y~x, data=a)
  param <- c(coef(m0)[1], coef(m0)[2])
  i<-param[1]
  b<-param[2]
  yhat <- i + b*x
  e <- summary(m0)$sigma
  lkhd <- dnorm(y, mean=yhat, sd=e)
  dvnc <- -2 * sum(log(lkhd))
  print(param)
  print(e)
  print(lkhd)
  print(dvnc)  
  
  
  }

dvnc.f(women, women$height, women$weight)

## (Intercept)           x 
##   -87.51667     3.45000 
## [1] 1.525005
##  [1] 0.07452923 0.21398769 0.24700963 0.26135075 0.25346523 0.22531900
##  [7] 0.18359549 0.13712321 0.19359472 0.14741642 0.20326265 0.24608202
## [13] 0.26158497 0.15433520 0.03240932
## [1] 53.22809

lm.ci<-function(m0){
  print(summary(m0))
  param <- c(coef(m0)[1], coef(m0)[2])
  a <- param[1]
  b <- param[2]
  ci_a <- coef(m0)[1] + unlist(summary(m0))$coefficients3*c(-1.96,1.96)
  ci_b <- coef(m0)[2] + unlist(summary(m0))$coefficients4*c(-1.96,1.96)
  print(paste0('95% CI of the intercept: [',  min(ci_a), ', ', max(ci_a), ']'))
  print(paste0('95% CI of the slope: [',  min(ci_b), ', ', max(ci_b), ']'))
  
  
}

HW 4

Modify this R script to create a function to compute the c-statistic illustrated with the data set in the article: Tryon, W.W. (1984). A simplified time-series analysis for evaluating treatment interventions. Journal of Applied Behavioral Analysis, 34(4), 230-233.

# # c-stat example # # read in data 

dta <-read.table("C:/Users/USER/Desktop/R_data management/0427/cstat.txt", header=T) 
str(dta)

## 'data.frame':    42 obs. of  1 variable:
##  $ nc: int  28 46 39 45 24 20 35 37 36 40 ...

head(dta)

##   nc
## 1 28
## 2 46
## 3 39
## 4 45
## 5 24
## 6 20

# # plot figure 1 
# 
{plot.new()
plot(1:42, dta[,1], xlab="Observations", ylab="Number of Children") 
lines(1:42, dta[,1]) 
abline(v=10, lty=2) 
abline(v=32, lty=2) 
segments(1, mean(dta[1:10,1]),10, mean(dta[1:10,1]),col="red") 
segments(11, mean(dta[11:32,1]),32, mean(dta[11:32,1]),col="red") 
segments(33, mean(dta[33:42,1]),42, mean(dta[33:42,1]),col="red")
}

#calculate c-stat for first baseline phase  
cden<-1-(sum(diff(dta[1:10,1])^2)/(2*(10-1)*var(dta[1:10,1]))) 
sc<-sqrt((10-2)/((10-1)*(10+1))) 
pval<-(1-pnorm(cden/sc)) 
pval

## [1] 0.2866238

## function  
cstat.fun<-function(x){ 
    n<-length(x)
    cden <- 1-(sum(diff(x)^2)/(2*(n-1)*var(x)))
    sc<-sqrt((n-2)/((n-1)*(n+1)))
    pval <- 1-pnorm(cden/sc) 
    return(list(cden, sc,pval))
  }
  
  cstat.fun(dta[1:10, 1])

## [[1]]
## [1] 0.1601208
## 
## [[2]]
## [1] 0.2842676
## 
## [[3]]
## [1] 0.2866238

HW 5

Plot the likelihood functoion to estimate the probability of graduate admission by gender, respectively, for Dept A in UCBAdmissions{datasets}. Construct approximate 95%-CI for each gender. Do they overlap?

library(datasets)
set.seed(1234)
theta <- seq(0.01, 0.99, by = .01)
n <- 100
n_m <- sum(UCBAdmissions[,1,1])
n_f <- sum(UCBAdmissions[,2,1])
p_m <- UCBAdmissions[1,1,1] / n_m
p_f <- UCBAdmissions[1,2,1] / n_f
y_m <- rbinom(n, 1, p_m)
y_f <- rbinom(n, 1, p_f)
llkhd_m <- sum(y_m) * log(theta) + (n - sum(y_m)) * log(1 - theta)
llkhd_f <- sum(y_f) * log(theta) + (n - sum(y_f)) * log(1 - theta)

plot(theta, llkhd_f, xlab = 'Probability', ylab = 'Likelihood', main = 'Grid search', type='n')
grid()
lines(theta, llkhd_m, col = 'hotpink1')
lines(theta, llkhd_f, col = 'dodgerblue1')

phat <- mean(y_m)
abline(v = phat, lty = 3, col = 'hotpink1')
arrows(phat - 2*sqrt(phat*(1-phat))/sqrt(n_m), -60,
       phat + 2*sqrt(phat*(1-phat))/sqrt(n_m), -60,
       code = 3, length = 0.1, angle = 90, col = 'hotpink1', lwd = 2)

phat <- mean(y_f)
abline(v = phat, lty = 3, col = 'dodgerblue1')
arrows(phat - 2*sqrt(phat*(1-phat))/sqrt(n_f), -80,
       phat + 2*sqrt(phat*(1-phat))/sqrt(n_f), -80,
       code = 3, length = 0.1, angle = 90, col = 'dodgerblue1', lwd = 2)

legend(x = .2, y = -2200, legend = c('Male', 'Female'), cex = 1.2, lty = 1,
       col = c('hotpink1', 'dodgerblue1'), bty = 'n')

Functions exercise 1-5