Homework 1:
Load data file
## id female race ses schtyp prog read write math science socst
## 1 70 male white low public general 57 52 41 47 57
## 2 121 female white middle public vocation 68 59 53 63 61
## 3 86 male white high public general 44 33 54 58 31
## 4 141 male white high public vocation 63 44 47 53 56
## 5 172 male white middle public academic 47 52 57 53 61
## 6 113 male white middle public academic 44 52 51 63 61
Found some NA values in Science scores, so adopted another method
## read write math science socst
## 52.230 52.775 52.645 NA 52.405
Translate the data from wide to long format
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## ─ Attaching packages ────────────────────────── tidyverse 1.3.0 ─
## ✓ ggplot2 3.2.1 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ stringr 1.4.0
## ✓ tidyr 1.0.2 ✓ forcats 0.4.0
## ✓ readr 1.3.1
## ─ Conflicts ─────────────────────────── tidyverse_conflicts() ─
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Show the descriptive statistic of score
dta_L1 <- dta_L %>%
group_by(Academic) %>%
summarise(a_mean = mean(Score, na.rm = TRUE),
a_se = sd(Score, na.rm = TRUE) / sqrt(sum(!is.na(Score))))
dta_L1## # A tibble: 5 x 3
## Academic a_mean a_se
## <chr> <dbl> <dbl>
## 1 math 52.6 0.672
## 2 read 52.1 0.733
## 3 science 51.9 0.702
## 4 socst 52.3 0.771
## 5 write 52.7 0.678
Plot the data
ggplot(dta_L1, aes(x = Academic, y = a_mean)) +
geom_point(aes(color = Academic), size = 4) +
geom_errorbar(aes(ymax = a_mean + a_se,
ymin = a_mean - a_se)) +
xlab('Academic') + ylab('Mean score') +
theme_minimal() +
theme(legend.position = 'none')Conducted One-way ANOVA
##
## Error: id
## Df Sum Sq Mean Sq F value Pr(>F)
## Residuals 1 4023 4023
##
## Error: id:Academic
## Df Sum Sq Mean Sq
## Academic 4 60.5 15.12
##
## Error: Within
## Df Sum Sq Mean Sq F value Pr(>F)
## Academic 4 321 80.33 0.847 0.495
## Residuals 965 91524 94.84
The results revealed that no difference between academic scores.
Show the descriptive statistic of score
dta_L1 <- dta_L %>%
group_by(race, Academic) %>%
summarise(a_mean = mean(Score, na.rm = TRUE),
a_se = sd(Score, na.rm = TRUE) / sqrt(sum(!is.na(Score))))
dta_L1## # A tibble: 20 x 4
## # Groups: race [4]
## race Academic a_mean a_se
## <fct> <chr> <dbl> <dbl>
## 1 african-amer math 46.5 1.50
## 2 african-amer read 46.3 1.58
## 3 african-amer science 42.4 2.19
## 4 african-amer socst 49.4 2.56
## 5 african-amer write 47.8 2.16
## 6 asian math 57.3 3.05
## 7 asian read 51.9 2.31
## 8 asian science 51.5 2.86
## 9 asian socst 51 2.94
## 10 asian write 58 2.38
## 11 hispanic math 47.6 1.48
## 12 hispanic read 47 2.16
## 13 hispanic science 46.2 1.52
## 14 hispanic socst 48.0 1.95
## 15 hispanic write 46.8 1.73
## 16 white math 53.8 0.787
## 17 white read 53.7 0.862
## 18 white science 54.1 0.766
## 19 white socst 53.5 0.910
## 20 white write 53.9 0.770
Plot the data
ggplot(dta_L1, aes(x = Academic, y = a_mean)) +
geom_point(aes(color = race), size = 4) +
geom_errorbar(aes(ymax = a_mean + a_se,
ymin = a_mean - a_se)) +
facet_wrap(. ~ race, nrow = 1) +
xlab('Academic') + ylab('Mean score') +
theme_minimal() +
theme(legend.position = 'none')Test if the 4 different ethnic groups have the same mean scores for each of the 5 variables (individually): read, write, math, science, and socst.
test_by_race <- paste0(names(dta[7:11]), " ~ race")
formulae <- lapply(test_by_race, FUN=formula)
res_aov <- lapply(formulae, function(f) lm(f, data=dta))
lapply(res_aov[1:5], anova)## [[1]]
## Analysis of Variance Table
##
## Response: read
## Df Sum Sq Mean Sq F value Pr(>F)
## race 3 1627.6 542.52 5.5496 0.001132 **
## Residuals 191 18672.0 97.76
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## [[2]]
## Analysis of Variance Table
##
## Response: write
## Df Sum Sq Mean Sq F value Pr(>F)
## race 3 1758.1 586.04 7.1651 0.0001388 ***
## Residuals 191 15622.2 81.79
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## [[3]]
## Analysis of Variance Table
##
## Response: math
## Df Sum Sq Mean Sq F value Pr(>F)
## race 3 1746.3 582.09 7.2581 0.0001231 ***
## Residuals 191 15317.8 80.20
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## [[4]]
## Analysis of Variance Table
##
## Response: science
## Df Sum Sq Mean Sq F value Pr(>F)
## race 3 3169.5 1056.51 13.063 8.505e-08 ***
## Residuals 191 15447.2 80.88
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## [[5]]
## Analysis of Variance Table
##
## Response: socst
## Df Sum Sq Mean Sq F value Pr(>F)
## race 3 800.7 266.89 2.3501 0.07379 .
## Residuals 191 21690.9 113.56
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
A: Yes
Perform all pairwise simple regressions for these variables: read, write, math, science, and socst.
# y <- dta[, "read"]
# res <- sapply(dta[, 8:11], function(x) lm(y ~ x))
# lapply(res[1:4], broom::tidy)
#
# y <- dta[, "write"]
# res <- sapply(dta[, c(7,9:11)], function(x) lm(y ~ x))
# lapply(res[1:4], broom::tidy)
#
# y <- dta[, "math"]
# res <- sapply(dta[, c(7,8,10,11)], function(x) lm(y ~ x))
# lapply(res[1:4], broom::tidy)
#
# y <- dta[, "science"]
# res <- sapply(dta[, c(7,8,9,11)], function(x) lm(y ~ x))
# lapply(res[1:4], broom::tidy)
#
# y <- dta[, "socst"]
# res <- sapply(dta[, c(7,8,9,10)], function(x) lm(y ~ x))
# lapply(res[1:4], broom::tidy)I don’t know why it did not work, then I still tried another method to solve it
Homework 2: Loan and Payment
Create a function
Generate the number
data.frame(L = rep(c(5000000, 10000000, 15000000), each = 15),
r = rep(c(0.02, 0.05, 0.07), each = 5),
M = rep(c(10, 15, 20, 25, 30) * 12, 9)) %>%
mutate(P = round(mapply(payment, L, r, M)))## L r M P
## 1 5.0e+06 0.02 120 110240
## 2 5.0e+06 0.02 180 102914
## 3 5.0e+06 0.02 240 100870
## 4 5.0e+06 0.02 300 100264
## 5 5.0e+06 0.02 360 100080
## 6 5.0e+06 0.05 120 250719
## 7 5.0e+06 0.05 180 250038
## 8 5.0e+06 0.05 240 250002
## 9 5.0e+06 0.05 300 250000
## 10 5.0e+06 0.05 360 250000
## 11 5.0e+06 0.07 120 350104
## 12 5.0e+06 0.07 180 350002
## 13 5.0e+06 0.07 240 350000
## 14 5.0e+06 0.07 300 350000
## 15 5.0e+06 0.07 360 350000
## 16 1.0e+07 0.02 120 220481
## 17 1.0e+07 0.02 180 205827
## 18 1.0e+07 0.02 240 201741
## 19 1.0e+07 0.02 300 200527
## 20 1.0e+07 0.02 360 200160
## 21 1.0e+07 0.05 120 501437
## 22 1.0e+07 0.05 180 500077
## 23 1.0e+07 0.05 240 500004
## 24 1.0e+07 0.05 300 500000
## 25 1.0e+07 0.05 360 500000
## 26 1.0e+07 0.07 120 700209
## 27 1.0e+07 0.07 180 700004
## 28 1.0e+07 0.07 240 700000
## 29 1.0e+07 0.07 300 700000
## 30 1.0e+07 0.07 360 700000
## 31 1.5e+07 0.02 120 330721
## 32 1.5e+07 0.02 180 308741
## 33 1.5e+07 0.02 240 302611
## 34 1.5e+07 0.02 300 300791
## 35 1.5e+07 0.02 360 300241
## 36 1.5e+07 0.05 120 752156
## 37 1.5e+07 0.05 180 750115
## 38 1.5e+07 0.05 240 750006
## 39 1.5e+07 0.05 300 750000
## 40 1.5e+07 0.05 360 750000
## 41 1.5e+07 0.07 120 1050313
## 42 1.5e+07 0.07 180 1050005
## 43 1.5e+07 0.07 240 1050000
## 44 1.5e+07 0.07 300 1050000
## 45 1.5e+07 0.07 360 1050000
Homework 3: Least Square Method and Maximum Likelihood Method for Simple Linear Regression
##
## Call:
## lm(formula = weight ~ height, data = women)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7333 -1.1333 -0.3833 0.7417 3.1167
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -87.51667 5.93694 -14.74 1.71e-09 ***
## height 3.45000 0.09114 37.85 1.09e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.525 on 13 degrees of freedom
## Multiple R-squared: 0.991, Adjusted R-squared: 0.9903
## F-statistic: 1433 on 1 and 13 DF, p-value: 1.091e-14
##
param <- c(coef(m0)[1], coef(m0)[2])
##
a <- param[1]
##
b <- param[2]
##
yhat <- a + b*women$height
##
e <- summary(m0)$sigma
##
lkhd <- dnorm(women$weight, mean=yhat, sd=e)
##
dvnc <- -2 * sum(log(lkhd))
##
ci_a <- coef(m0)[1] + unlist(summary(m0))$coefficients3*c(-2,2)
##
ci_b <- coef(m0)[2] + unlist(summary(m0))$coefficients4*c(-2,2)
##
bb <- expand.grid(a=seq(ci_a[1], ci_a[2], len=50),
b=seq(ci_b[1], ci_b[2], len=50))Construct a function from the script so that any deviance value for pairs of parameter estimates can be found.
The code chunk was refered from Jay-Liao
dvnc_fun <- function(x) {
m0 <- lm(weight ~ height, data = women)
return(c(x[1] - coef(m0)[1], x[2] - coef(m0)[2]))
}
lm_with_CI <- function(m0) {
param <- c(coef(m0)[1], coef(m0)[2])
a <- param[1]
b <- param[2]
ci_a <- coef(m0)[1] + unlist(summary(m0))$coefficients3*c(-1.96, 1.96) #Show 95% CI
ci_b <- coef(m0)[2] + unlist(summary(m0))$coefficients4*c(-1.96, 1.96) #Show 95% CI
print(summary(m0))
print(paste0('95% CI of the intercept: [', min(ci_a), ', ', max(ci_a), ']'))
print(paste0('95% CI of the slope: [', min(ci_b), ', ', max(ci_b), ']'))
}
lm_with_CI(lm(weight ~ height, data = women))##
## Call:
## lm(formula = weight ~ height, data = women)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7333 -1.1333 -0.3833 0.7417 3.1167
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -87.51667 5.93694 -14.74 1.71e-09 ***
## height 3.45000 0.09114 37.85 1.09e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.525 on 13 degrees of freedom
## Multiple R-squared: 0.991, Adjusted R-squared: 0.9903
## F-statistic: 1433 on 1 and 13 DF, p-value: 1.091e-14
##
## [1] "95% CI of the intercept: [-99.1530770025164, -75.8802563308165]"
## [1] "95% CI of the slope: [3.27137246888624, 3.62862753111375]"
Homework 3: c-stat example
read in data
## 'data.frame': 42 obs. of 1 variable:
## $ nc: int 28 46 39 45 24 20 35 37 36 40 ...
## nc
## 1 28
## 2 46
## 3 39
## 4 45
## 5 24
## 6 20
plot figure 1
plot(1:42, dta[,1], xlab="Observations", ylab="Number of Children")
lines(1:42, dta[,1])
abline(v=10, lty=2)
abline(v=32, lty=2)
segments(1, mean(dta[1:10,1]),10, mean(dta[1:10,1]),col="red")
segments(11, mean(dta[11:32,1]),32, mean(dta[11:32,1]),col="red")
segments(33, mean(dta[33:42,1]),42, mean(dta[33:42,1]),col="red")calculate c-stat for first baseline phase
cden <- 1-(sum(diff(dta[1:10,1])^2)/(2*(10-1)*var(dta[1:10,1])))
sc <- sqrt((10-2)/((10-1)*(10+1)))
pval <- 1-pnorm(cden/sc)
pval## [1] 0.2866238
calculate c-stat for first baseline plus group tokens
n <- 32
cden <- 1-(sum(diff(dta[1:n,1])^2)/(2*(n-1)*var(dta[1:n,1])))
sc <- sqrt((n-2)/((n-1)*(n+1)))
pval <- 1-pnorm(cden/sc)
list(z=cden/sc,pvalue=pval)## $z
## [1] 3.879054
##
## $pvalue
## [1] 5.243167e-05
f_cden <- function(x) {
n <- (length(x))
cden <- 1-(sum(diff(x)^2)/(2*(n-1)*var(x)))
return(cden)
}
f_sc <- function(x) {
n <- (length(x))
sc <- sqrt((n-2)/((n-1)*(n+1)))
return(sc)
}
f_pval <- function(x) {
cden <- f_cden(x)
sc <- f_sc(x)
return(1-pnorm(cden/sc))
}
f_cden(dta[1:10, 1])## [1] 0.1601208
## [1] 0.2842676
## [1] 0.2866238
f_stat <- function(x) {
n <- length(x)
cden <- 1-(sum(diff(x)^2)/(2*(n-1)*var(x)))
sc <- sqrt((n-2)/((n-1)*(n+1)))
pval <- 1-pnorm(cden/sc)
return(list(cden = cden, sc = sc, z=cden/sc, pvalue=pval))
}
f_stat(dta[1:32, 1])## $cden
## [1] 0.6642762
##
## $sc
## [1] 0.1712469
##
## $z
## [1] 3.879054
##
## $pvalue
## [1] 5.243167e-05
Homework 5: Plot the likelihood functoion to estimate the probability of graduate admission by gender
Estimate the probability and likelihood function of graduate admission by gender
Plot the data, the blue line as Male and red line as Female, respectively.
plot(theta, llkhdF, xlab='Probability',cylab='Likelihood', main='Grid search', type='n')
grid()
lines(theta, llkhdM, col='blue')
lines(theta, llkhdF, col='red')
phatM <- mean(yM)
phatF <- mean(yF)
abline(v=phatM, lty=3, col='blue')
arrows(phatM - 2*sqrt(phatM*(1-phatM))/sqrt(n), -100,
phatM + 2*sqrt(phatM*(1-phatM))/sqrt(n),-100,
code=3, length=0.1, angle=90, col='blue')
abline(v=phatF, lty=3, col='red')
arrows(phatF - 2*sqrt(phatF*(1-phatF))/sqrt(n), -100,
phatF + 2*sqrt(phatF*(1-phatF))/sqrt(n),-100,
code=3, length=0.1, angle=90, col='red')
legend(x = .2, y = -250, legend = c('Male', 'Female'), cex = 1.5, lty = 1,
col = c('blue', 'red'), bty = 'n')Q: Do they overlap? A: No.