# Libraries
library("stats")
library("stats4")
library("foreign")
library("MASS")
library("mlogit")
library("tidyverse")
library("lmtest")
optain the difference of probability between some one who spends 5 hour sper wek and some one who spends 10 hours per week. SAT fix at 3.0, SAT 1,200.
\[\hat{P}(grad = 1|hsGPA, SAT, study) = \Lambda(−1.17+0.24hsGPA+0.00058SAT+0.073sTudy)\]
study10 = (0.24*3.0) + (0.00058*1200) + (0.073*10) - 1.17
study5 = -1.17 + 0.24*3.0 + 0.00058*1200 + 0.073*5
study10-study5
## [1] 0.365
The marginal of effect of studying one more hour per week … studying one more hour per week would have a 36.5% chance of incrasing
# NOT SURE IF I GOT IT
# hsGPA 3.1
# SAT 1250
# 10 hours per week
study10mar = (0.24*3.1) + (0.00058*1250) + (0.073*10) - 1.17
study10mar
## [1] 1.029
Likelihood function to find the maximum likelyhood. input is the outcome output is the probability.
This function tells you what is the probability of y_i = 1
p=pr(y=1) \[f(y_i;p) = p^{y_i}(1-p)^{1-y_t}\]
\(y_t\) is the probability
output is the probability input is the outcome which are known.
# This is how to create a function. Different types of function can be created.
likelihood = function(N,y,theta)
{return(theta^y*(1-theta)^(N-y))}
N = 1000
y = rbinom(N,1,10)
# theta =
theta = seq(from = -2, to = 4, by = 0.01)
plot(theta, likelihood(N, 4, theta))
plot
### b
plot
### c
plot
### d
What can you conclude from the above three plots?
library(readxl)
pntsprd <- read_excel("pntsprd.xls")
head(pntsprd)
## # A tibble: 6 x 12
## favscr undscr spread favhome neutral fav25 und25 fregion uregion scrdiff
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 72 61 7 0 0 1 0 3 4 11
## 2 82 74 7 1 0 0 0 3 1 8
## 3 87 57 17 1 0 0 0 3 3 30
## 4 69 70 9 1 0 0 0 3 3 -1
## 5 77 79 2.5 0 0 0 0 2 3 -2
## 6 91 65 9 0 1 1 0 3 4 26
## # ... with 2 more variables: sprdcvr <dbl>, favwin <dbl>
\[P(favwin = 1|spread) = \beta_0 + \beta_1spread\]
# Linear Probability Model
lpm <- lm(pntsprd$favwin~pntsprd$spread)
summary(lpm)
##
## Call:
## lm(formula = pntsprd$favwin ~ pntsprd$spread)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9836 -0.1192 0.1519 0.3069 0.4037
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.576949 0.028235 20.434 < 2e-16 ***
## pntsprd$spread 0.019366 0.002339 8.281 9.32e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4017 on 551 degrees of freedom
## Multiple R-squared: 0.1107, Adjusted R-squared: 0.1091
## F-statistic: 68.57 on 1 and 551 DF, p-value: 9.324e-16
Use the rubost standard error
Is spread statistically significant with 5% significance level? What is the estimated probability that the favored team wins when spread = 10?
estimate a probit
# Probit Model
Probitm <-glm(inlf~nwifeinc+educ+exper+expersq+age+kidslt6+kidsge6,family = binomial(link="probit"))
summary(Probitm)
Add the variables f avhome, f av25, and und25 to the probit model and test joint significance of these variables using the likelihood ratio test with 5% significance level.
library(readxl)
loanapp <- read_excel("loanapp.xls")
head(loanapp)
## # A tibble: 6 x 59
## occ loanamt action msa suffolk appinc typur unit married dep emp
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <dbl>
## 1 1 89 1 1120 0 72 0 1 0 0 0
## 2 1 128 3 1120 0 74 0 1 1 1 0
## 3 1 128 1 1120 0 84 3 1 0 0 1
## 4 1 66 1 1120 0 36 0 1 1 0 0
## 5 1 120 1 1120 0 59 8 1 1 0 0
## 6 1 111 1 1120 0 63 9 1 0 0 0
## # ... with 48 more variables: yjob <dbl>, self <dbl>, atotinc <dbl>,
## # cototinc <dbl>, hexp <dbl>, price <dbl>, other <dbl>, liq <dbl>, rep <chr>,
## # gdlin <dbl>, lines <dbl>, mortg <dbl>, cons <dbl>, pubrec <dbl>,
## # hrat <dbl>, obrat <dbl>, fixadj <dbl>, term <dbl>, apr <dbl>, prop <dbl>,
## # inss <dbl>, inson <dbl>, gift <dbl>, cosign <dbl>, unver <dbl>,
## # review <dbl>, netw <dbl>, unem <dbl>, min30 <chr>, bd <dbl>, mi <dbl>,
## # old <dbl>, vr <dbl>, sch <dbl>, black <dbl>, hispan <dbl>, male <chr>,
## # reject <dbl>, approve <dbl>, mortno <dbl>, mortperf <dbl>, mortlat1 <dbl>,
## # mortlat2 <dbl>, chist <dbl>, multi <chr>, loanprc <dbl>, thick <chr>,
## # white <dbl>
estimate a probit modlel
# Probit Model
proapp = glm(loanapp$approve ~ loanapp$white,
family = binomial(link = "probit"))
summary(proapp)
##
## Call:
## glm(formula = loanapp$approve ~ loanapp$white, family = binomial(link = "probit"))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1864 0.4384 0.4384 0.4384 0.8314
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.54695 0.07544 7.251 4.15e-13 ***
## loanapp$white 0.78395 0.08671 9.041 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1480.7 on 1988 degrees of freedom
## Residual deviance: 1401.8 on 1987 degrees of freedom
## AIC: 1405.8
##
## Number of Fisher Scoring iterations: 4
# Probit Model
proapp2 = glm(loanapp$approve ~ loanapp$white + loanapp$hrat + loanapp$obrat + loanapp$loanprc + loanapp$unem + loanapp$male + loanapp$married + loanapp$dep + loanapp$sch + loanapp$cosign + loanapp$chist + loanapp$pubrec + loanapp$mortlat1 + loanapp$mortlat2 + loanapp$vr,
family = binomial(link = "probit"))
summary(proapp2)
##
## Call:
## glm(formula = loanapp$approve ~ loanapp$white + loanapp$hrat +
## loanapp$obrat + loanapp$loanprc + loanapp$unem + loanapp$male +
## loanapp$married + loanapp$dep + loanapp$sch + loanapp$cosign +
## loanapp$chist + loanapp$pubrec + loanapp$mortlat1 + loanapp$mortlat2 +
## loanapp$vr, family = binomial(link = "probit"))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0950 0.2331 0.3389 0.4850 1.9979
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 11.395347 204.586080 0.056 0.9556
## loanapp$white 0.532479 0.097452 5.464 4.66e-08 ***
## loanapp$hrat 0.008364 0.007061 1.185 0.2362
## loanapp$obrat -0.028319 0.006192 -4.574 4.79e-06 ***
## loanapp$loanprc -1.004248 0.239688 -4.190 2.79e-05 ***
## loanapp$unem -0.035933 0.017828 -2.016 0.0438 *
## loanapp$male0 -3.971989 94.258449 -0.042 0.9664
## loanapp$male1 -4.010678 94.258422 -0.043 0.9661
## loanapp$married0 -2.096414 418.227224 -0.005 0.9960
## loanapp$married1 -1.829983 418.227210 -0.004 0.9965
## loanapp$dep0 -3.276150 376.753578 -0.009 0.9931
## loanapp$dep1 -3.291261 376.753585 -0.009 0.9930
## loanapp$dep2 -3.401128 376.753582 -0.009 0.9928
## loanapp$dep3 -3.388711 376.753604 -0.009 0.9928
## loanapp$dep4 -3.816284 376.753667 -0.010 0.9919
## loanapp$dep5 1.146623 406.202496 0.003 0.9977
## loanapp$dep6 0.915706 430.929803 0.002 0.9983
## loanapp$dep7 -0.096264 532.810001 0.000 0.9999
## loanapp$dep8 NA NA NA NA
## loanapp$sch 0.011054 0.095678 0.116 0.9080
## loanapp$cosign 0.056668 0.242111 0.234 0.8149
## loanapp$chist 0.584222 0.095920 6.091 1.12e-09 ***
## loanapp$pubrec -0.773119 0.127788 -6.050 1.45e-09 ***
## loanapp$mortlat1 -0.165458 0.258328 -0.640 0.5219
## loanapp$mortlat2 -0.492990 0.338468 -1.457 0.1452
## loanapp$vr -0.190553 0.081869 -2.328 0.0199 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1480.7 on 1988 degrees of freedom
## Residual deviance: 1194.9 on 1964 degrees of freedom
## AIC: 1244.9
##
## Number of Fisher Scoring iterations: 14
library(readxl)
apple <- read_excel("apple.xls")
head(apple)
## # A tibble: 6 x 17
## id educ date state regprc ecoprc inseason hhsize male faminc age
## <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 10002 16 "\"1~ "\"S~ 1.19 1.19 1 4 0 45 43
## 2 10004 16 "\"1~ "\"K~ 0.59 0.79 0 1 0 65 37
## 3 10034 18 "\"1~ "\"M~ 0.59 0.99 1 3 0 65 44
## 4 10035 12 "\"1~ "\"T~ 0.89 1.09 1 2 1 55 55
## 5 10039 15 "\"1~ "\"N~ 0.89 1.09 0 1 1 25 22
## 6 10041 12 "\"1~ "\"W~ 0.59 0.79 1 4 0 15 34
## # ... with 6 more variables: reglbs <dbl>, ecolbs <dbl>, numlt5 <dbl>,
## # num5_17 <dbl>, num18_64 <dbl>, num_gt64 <dbl>
sum(apple$ecolbs == 0.00000)
## [1] 248
Tobit model
library(readxl)
smoke <- read_excel("smoke.xls")
head(smoke)
## # A tibble: 6 x 10
## educ cigpric white age income cigs restaurn lincome agesq lcigpric
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 16 60.5 1 46 20000 0 0 9.90 2116 4.10
## 2 16 57.9 1 40 30000 0 0 10.3 1600 4.06
## 3 12 57.7 1 58 30000 3 0 10.3 3364 4.05
## 4 13.5 57.9 1 30 20000 0 0 9.90 900 4.06
## 5 10 58.3 1 17 20000 0 0 9.90 289 4.07
## 6 6 59.3 1 86 6500 0 0 8.78 7396 4.08
Poisson Regression