# setwd("c:/Users/Michael/DROPBOX/priv/CUNY/MSDS/201902-Spring/DATA606-Jason/Homework")
# look at the exact data set referenced
data("babies")
summary(babies)
##       case             bwt          gestation         parity            age
##  Min.   :   1.00   Min.   : 55.00   Min.   :148.00   Min.   :0.00000   Min.   :15.000
##  1st Qu.: 309.75   1st Qu.:108.75   1st Qu.:272.00   1st Qu.:0.00000   1st Qu.:23.000
##  Median : 618.50   Median :120.00   Median :280.00   Median :0.00000   Median :26.000
##  Mean   : 618.50   Mean   :119.58   Mean   :279.34   Mean   :0.25485   Mean   :27.255
##  3rd Qu.: 927.25   3rd Qu.:131.00   3rd Qu.:288.00   3rd Qu.:1.00000   3rd Qu.:31.000
##  Max.   :1236.00   Max.   :176.00   Max.   :353.00   Max.   :1.00000   Max.   :45.000
##                                     NA's   :13                         NA's   :2
##      height          weight           smoke
##  Min.   :53.000   Min.   : 87.00   Min.   :0.00000
##  1st Qu.:62.000   1st Qu.:114.75   1st Qu.:0.00000
##  Median :64.000   Median :125.00   Median :0.00000
##  Mean   :64.048   Mean   :128.63   Mean   :0.39478
##  3rd Qu.:66.000   3rd Qu.:139.00   3rd Qu.:1.00000
##  Max.   :72.000   Max.   :250.00   Max.   :1.00000
##  NA's   :22       NA's   :36       NA's   :10
p802model <- lm(bwt ~ parity, data=babies)
p802model
##
## Call:
## lm(formula = bwt ~ parity, data = babies)
##
## Coefficients:
## (Intercept) parity
## 120.0684 -1.9287
summary(p802model)
##
## Call:
## lm(formula = bwt ~ parity, data = babies)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65.068 -11.068 0.396 10.932 57.860
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 120.06840 0.60052 199.9422 <0.0000000000000002 ***
## parity -1.92872 1.18954 -1.6214 0.1052
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.224 on 1234 degrees of freedom
## Multiple R-squared: 0.0021259, Adjusted R-squared: 0.0013173
## F-statistic: 2.629 on 1 and 1234 DF, p-value: 0.10519
\[ \widehat{BirthWeight} = 120.06840391 - 1.92872137 \times parity \]
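Since parity is a 0/1 indicator, the fitted slope is simply the difference between the two group means of birth weight. A minimal sanity check (my addition, using base R's tapply):

group_means <- tapply(babies$bwt, babies$parity, mean)
group_means        # mean bwt for parity = 0 and parity = 1
diff(group_means)  # should reproduce the parity coefficient, about -1.93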
babies$par="FirstBorn"
babies$par[babies$parity==1]="NotFirstBorn"
babies$par2=as.factor(babies$par)
boxplot(babies$bwt~babies$par2,
main="Birth Weight of First Born vs. Subsequent babies",
xlab="Birth Order",
ylab="Birth Weight (ounces)",
col=c("lightblue","green"),
show.names=T)
# look at the exact data set referenced
#install.packages("MASS")
#data(package="MASS")
data(quine, package="MASS")
summary(quine)
##  Eth    Sex     Age     Lrn          Days
##  A:69   F:80   F0:27   AL:83   Min.   : 0.000
##  N:77   M:66   F1:46   SL:63   1st Qu.: 5.000
##                F2:40           Median :11.000
##                F3:33           Mean   :16.459
##                                3rd Qu.:22.750
##                                Max.   :81.000
head(quine,n=2)
##   Eth Sex Age Lrn Days
## 1   A   M  F0  SL    2
## 2   A   M  F0  SL   11
tail(quine,n=1)
##     Eth Sex Age Lrn Days
## 146   N   F  F3  AL   37
p804model <- lm(Days ~ Eth + Sex + Lrn, data=quine)
p804model
##
## Call:
## lm(formula = Days ~ Eth + Sex + Lrn, data = quine)
##
## Coefficients:
## (Intercept) EthN SexM LrnSL
## 18.9318 -9.1122 3.1043 2.1542
summary(p804model)
##
## Call:
## lm(formula = Days ~ Eth + Sex + Lrn, data = quine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.1903 -10.0780 -4.9279 5.7680 59.9140
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.9318 2.5704 7.3652 0.00000000001319 ***
## EthN -9.1122 2.5989 -3.5063 0.0006087 ***
## SexM 3.1043 2.6371 1.1771 0.2411076
## LrnSL 2.1542 2.6505 0.8127 0.4177317
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.673 on 142 degrees of freedom
## Multiple R-squared: 0.089331, Adjusted R-squared: 0.070092
## F-statistic: 4.6431 on 3 and 142 DF, p-value: 0.003967
p804coefs <- p804model$coefficients
p804coefs
## (Intercept)        EthN        SexM       LrnSL
##  18.9318482  -9.1122415   3.1042554   2.1541571
\[ \widehat{Days} = 18.93184821 + (-9.11224147)\,I_{Eth} + (3.10425539)\,I_{Sex} + (2.15415712)\,I_{Lrn} \]
where \(I_{Eth}\) is 1 for Not Aboriginal and 0 for Aboriginal, \(I_{Sex}\) is 1 for male and 0 for female, and \(I_{Lrn}\) is 1 for Slow Learner and 0 for Average Learner.
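The indicator coding can be inspected directly from the design matrix (a minimal sketch; model.matrix is base R):

head(model.matrix(p804model), 2)
# rows 1-2 should each show (1, 0, 1, 1): intercept, Aboriginal, Male, Slow Learner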
Q... = mean(quine$Days)
print("Eth:")
Q0.. = mean((subset(quine, Eth=="A" ))$Days)
Q1.. = mean((subset(quine, Eth=="N" ))$Days)
c(Q0.. , Q1.., -Q0.. + Q1..)
print("Sex:")
Q.0. = mean((subset(quine, Sex=="F" ))$Days)
Q.1. = mean((subset(quine, Sex=="M" ))$Days)
c(Q.0. , Q.1., -Q.0. + Q.1.)
print("Lrn:")
Q..0 = mean((subset(quine, Lrn=="AL" ))$Days)
Q..1 = mean((subset(quine, Lrn=="SL" ))$Days)
c(Q..0 , Q..1, -Q..0 + Q..1)
Q000 = mean((subset(quine, Eth=="A" & Sex=="F" & Lrn=="AL"))$Days)
Q001 = mean((subset(quine, Eth=="A" & Sex=="F" & Lrn=="SL"))$Days)
Q010 = mean((subset(quine, Eth=="A" & Sex=="M" & Lrn=="AL"))$Days)
Q011 = mean((subset(quine, Eth=="A" & Sex=="M" & Lrn=="SL"))$Days)
Q100 = mean((subset(quine, Eth=="N" & Sex=="F" & Lrn=="AL"))$Days)
Q101 = mean((subset(quine, Eth=="N" & Sex=="F" & Lrn=="SL"))$Days)
Q110 = mean((subset(quine, Eth=="N" & Sex=="M" & Lrn=="AL"))$Days)
Q111 = mean((subset(quine, Eth=="N" & Sex=="M" & Lrn=="SL"))$Days)
p804coefs
## (Intercept)        EthN        SexM       LrnSL
##  18.9318482  -9.1122415   3.1042554   2.1541571
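The eight cell means above are computed but never displayed; base R's aggregate() collects them in a single call (a minimal sketch):

aggregate(Days ~ Eth + Sex + Lrn, data = quine, FUN = mean)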
FirstStudent <- quine[1,]
FirstStudent
##   Eth Sex Age Lrn Days
## 1   A   M  F0  SL    2
# identify which coefficients are to be used, i.e.,
# (1 for intercept, 1 if NotAboriginal, 1 if Male, 1 if Slow Learner)
# This observation:
# (1 for intercept, 0 for Aboriginal, 1 for Male, 1 for Slow Learner)
q = c(1,0,1,1)
q
## [1] 1 0 1 1
# Zero out the coefficients that don't apply (Aboriginal is the base case, so EthN drops out)
p804coefs * q
## (Intercept)        EthN        SexM       LrnSL
##  18.9318482   0.0000000   3.1042554   2.1541571
# compute the dot product
FirstStudentEstimatedDays <- as.numeric(p804coefs %*% q)
print(paste("FirstStudentEstimatedDays = ", FirstStudentEstimatedDays))## [1] "FirstStudentEstimatedDays = 24.1902607210836"
# display the actual number of days absent
FirstStudentActualDays <- FirstStudent$Days
print(paste("FirstStudentActualDays = ", FirstStudentActualDays))## [1] "FirstStudentActualDays = 2"
# compute the residual: Actual minus estimated
ManualFirstStudentResidual <- FirstStudentActualDays - FirstStudentEstimatedDays
ManualFirstStudentResidual
## [1] -22.190261
# compare vs. the value returned by the model
ModelFirstStudentResidual <- p804model$residuals[1]
ModelFirstStudentResidual
##          1
## -22.190261
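The same estimate is available directly from predict() without forming the dot product by hand (a minimal check):

predict(p804model, newdata = FirstStudent)  # should match FirstStudentEstimatedDays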
ResidVariance <- var(p804model$residuals)
ResidVariance
## [1] 240.56881
AbsentDaysVariance <- var(quine$Days)
AbsentDaysVariance
## [1] 264.16726
n = dim(quine)[1]
n
## [1] 146
# calculate the R2 manually
manual_R2 <- 1 - ResidVariance/AbsentDaysVariance
manual_R2
## [1] 0.089331491
# check vs. the value returned by the regression
model_R2 <- summary(p804model)$r.squared
model_R2
## [1] 0.089331491
# calculate the adjR2 manually
manual_adjR2 <- 1 - ResidVariance/AbsentDaysVariance * (n-1)/(n-3-1)
manual_adjR2
## [1] 0.070092016
# check vs. the value returned by the regression
model_adjR2 <- summary(p804model)$adj.r.squared
model_adjR2
## [1] 0.070092016
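The adjusted \(R^2\) values tabulated below were entered by hand; they can be reproduced by refitting the three reduced models (a minimal sketch):

adjR2 <- function(f) summary(lm(f, data = quine))$adj.r.squared
round(c(full   = adjR2(Days ~ Eth + Sex + Lrn),
        no_Eth = adjR2(Days ~ Sex + Lrn),
        no_Sex = adjR2(Days ~ Eth + Lrn),
        no_Lrn = adjR2(Days ~ Eth + Sex)), 4)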
tabl=data.frame(matrix(
  c(
    c("Full model", 0.0701),
    c("No ethnicity", -0.0033),
    c("No sex", 0.0676),
    c("No learner status", 0.0723)
  ),
  nrow = 4, ncol = 2, byrow = T,
  dimnames=list(
    c(1,2,3,4),
    c("Model","Adjusted R2")
  )),stringsAsFactors = F)
tabl
##               Model Adjusted.R2
## 1        Full model      0.0701
## 2      No ethnicity     -0.0033
## 3            No sex      0.0676
## 4 No learner status      0.0723
The learner status variable (Lrn) should be dropped first, because doing so increases the adjusted \(R^2\) the most.
# look at the exact data set referenced
require(olsrr)
## Loading required package: olsrr
## Warning: package 'olsrr' was built under R version 3.5.3
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
##     rivers
ols_step_backward_p(p804model,prem = .3,details = T)
## Backward Elimination Method
## ---------------------------
##
## Candidate Terms:
##
## 1 . Eth
## 2 . Sex
## 3 . Lrn
##
## We are eliminating variables based on p value...
##
## - Lrn
##
## Backward Elimination: Step 1
##
## Variable Lrn Removed
##
## Model Summary
## ---------------------------------------------------------------
## R 0.292 RMSE 15.655
## R-Squared 0.085 Coef. Var 95.114
## Adj. R-Squared 0.072 MSE 245.068
## Pred R-Squared 0.046 MAE 11.743
## ---------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## ---------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## ---------------------------------------------------------------------
## Regression 3259.515 2 1629.757 6.65 0.0017
## Residual 35044.739 143 245.068
## Total 38304.253 145
## ---------------------------------------------------------------------
##
## Parameter Estimates
## -----------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -----------------------------------------------------------------------------------------
## (Intercept) 19.984 2.218 9.010 0.000 15.600 24.368
## EthN -9.065 2.595 -0.279 -3.493 0.001 -14.194 -3.935
## SexM 2.778 2.603 0.085 1.067 0.288 -2.368 7.923
## -----------------------------------------------------------------------------------------
##
##
##
## No more variables satisfy the condition of p value = 0.3
##
##
## Variables Removed:
##
## - Lrn
##
##
## Final Model Output
## ------------------
##
## Model Summary
## ---------------------------------------------------------------
## R 0.292 RMSE 15.655
## R-Squared 0.085 Coef. Var 95.114
## Adj. R-Squared 0.072 MSE 245.068
## Pred R-Squared 0.046 MAE 11.743
## ---------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## ---------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## ---------------------------------------------------------------------
## Regression 3259.515 2 1629.757 6.65 0.0017
## Residual 35044.739 143 245.068
## Total 38304.253 145
## ---------------------------------------------------------------------
##
## Parameter Estimates
## -----------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -----------------------------------------------------------------------------------------
## (Intercept) 19.984 2.218 9.010 0.000 15.600 24.368
## EthN -9.065 2.595 -0.279 -3.493 0.001 -14.194 -3.935
## SexM 2.778 2.603 0.085 1.067 0.288 -2.368 7.923
## -----------------------------------------------------------------------------------------
##
##
## Elimination Summary
## --------------------------------------------------------------------------
## Variable Adj.
## Step Removed R-Square R-Square C(p) AIC RMSE
## --------------------------------------------------------------------------
## 1 Lrn 0.0851 0.0723 2.6605 1222.5231 15.6547
## --------------------------------------------------------------------------
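Base R's step() performs a comparable backward search using AIC rather than p values (a minimal sketch; note the AIC criterion may settle on a different final model than the p = 0.3 rule used above):

step(p804model, direction = "backward")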
This confirms that Lrn is removed first.
# look at the exact data set referenced
data("orings")
summary(orings)
##       temp           damage
##  Min.   :53.000   Min.   :0.00000
##  1st Qu.:67.000   1st Qu.:0.00000
##  Median :70.000   Median :0.00000
##  Mean   :69.565   Mean   :0.47826
##  3rd Qu.:75.000   3rd Qu.:1.00000
##  Max.   :81.000   Max.   :5.00000
orings$undamaged = 6 - orings$damage
orings
##    temp damage undamaged
## 1    53      5         1
## 2    57      1         5
## 3    58      1         5
## 4    63      1         5
## 5    66      0         6
## 6    67      0         6
## 7    67      0         6
## 8    67      0         6
## 9    68      0         6
## 10   69      0         6
## 11   70      1         5
## 12   70      0         6
## 13   70      1         5
## 14   70      0         6
## 15   72      0         6
## 16   73      0         6
## 17   75      0         6
## 18   75      1         5
## 19   76      0         6
## 20   76      0         6
## 21   78      0         6
## 22   79      0         6
## 23   81      0         6
plot(orings$damage~jitter(orings$temp),
xlab="Temperature at launch",
ylab="Number of failed O-Rings",
main="Failed O-Rings by temperature at launch")## a messy way to create a 138x2 matrix of (temperature, failure or success)
## necessary for the regression
failarray138=array(0,dim=138)
temperaturearray=array(rep(orings[,"temp"],6),dim=c(1,138)) # 138 entries
failarray23=orings[,2] # 5 1 1 1 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0
fail1=array(unlist(lapply(X = failarray23,FUN = min,1)),dim=23) # 1 1 1 1 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0
failarray23 = failarray23 - fail1 # 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
fail2=array(unlist(lapply(X = failarray23,FUN = min,1)),dim=23)
failarray23 = failarray23 - fail2 # 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
fail3=array(unlist(lapply(X = failarray23,FUN = min,1)),dim=23)
failarray23 = failarray23 - fail3 # 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
fail4=array(unlist(lapply(X = failarray23,FUN = min,1)),dim=23)
failarray23 = failarray23 - fail4 # 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
fail5=array(unlist(lapply(X = failarray23,FUN = min,1)),dim=23)
failarray23 = failarray23 - fail5 # 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
fail6=array(unlist(lapply(X = failarray23,FUN = min,1)),dim=23)
failarray138 = array(c(fail1,fail2,fail3,fail4,fail5,fail6),dim=c(1,138))
failsarray=t(rbind(temperaturearray,failarray138))
colnames(failsarray)=c("temp","failure")
fails=data.frame(failsarray)
failsbytemp=fails[order(fails$temp),]
failsarraybytemp=cbind(failsbytemp$temp,failsbytemp$failure)
colnames(failsarraybytemp)=c("temp","failure")
failsbytemp=data.frame(failsarraybytemp)
fail_model=glm(failure~temp, data=failsbytemp,family = binomial)
summary(fail_model)
##
## Call:
## glm(formula = failure ~ temp, family = binomial, data = failsbytemp)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.26457 -0.33952 -0.24715 -0.12991 3.02159
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 11.662990 3.296157 3.5384 0.0004026 ***
## temp -0.216234 0.053175 -4.0665 0.00004773 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 76.7448 on 137 degrees of freedom
## Residual deviance: 54.7594 on 136 degrees of freedom
## AIC: 58.7594
##
## Number of Fisher Scoring iterations: 6
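The 138-row expansion is not strictly necessary: glm() also accepts a two-column matrix of (successes, failures) counts per launch and produces the same fit (a minimal sketch using the damage and undamaged columns built above):

fail_model2 <- glm(cbind(damage, undamaged) ~ temp, data = orings, family = binomial)
coef(fail_model2)  # should match fail_model's coefficients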
logit_predictions=t(t(predict(fail_model)))
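# note: logistic() is not base R -- it is assumed here to come from a loaded
# package or an earlier definition; base R's plogis() is the equivalent inverse-logit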
prob_predictions=logistic(logit_predictions)
predicts = cbind(fails,prob_predictions)
\[ \hat{p}_i = \frac{e^{\mathrm{logit}(\hat{p}_i)}}{1+e^{\mathrm{logit}(\hat{p}_i)}} \]
The intercept estimate, 11.6630, is the value of the logit if the temperature were zero. The slope estimate, -0.2162, is the estimated reduction in the logit for each one-degree Fahrenheit increase in temperature.
\[ z = \frac{\text{estimate}}{\text{std. error}} \]
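The z values in the summary are just this ratio; a quick check (minimal sketch):

cs <- coef(summary(fail_model))
cs[, "Estimate"] / cs[, "Std. Error"]  # should reproduce the reported z values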
\[ \mathrm{logit}(\hat{p}_i) = \ln\left( \frac{\hat{p}_i}{1-\hat{p}_i} \right) = 11.6630 - 0.2162 \times temperature_i \]
Exercise 8.16 introduced us to O-rings that were identified as a plausible explanation for the breakup of the Challenger space shuttle 73 seconds into takeoff in 1986.
The investigation found that the ambient temperature at the time of the shuttle launch was closely related to the damage of O-rings, which are a critical component of the shuttle.
See this earlier exercise if you would like to browse the original data.
orings$failure=orings$damage/6.0
plot(orings$failure~jitter(orings$temp), col="blue",
xlab="Temperature (Fahrenheit)",
ylab="Probability of damage",
main="O-Rings-Probability of Damage by Temperature at launch"
)
\[ \log\left( \frac{\hat{p}}{1-\hat{p}} \right) = 11.6630 - 0.2162 \times Temperature \]
where \(\hat{p}\) is the model-estimated probability that an O-ring will become damaged.
Use the model to calculate the probability that an O-ring will become damaged at each of the following ambient temperatures: 51, 53, and 55 degrees Fahrenheit.
# make a temperature list of the three temperatures requested
temperature_list = seq(51,55,2)
temperature_list
## [1] 51 53 55
# predict the logit for each of the three temperatures
logit_preds = predict.glm(fail_model,newdata=data.frame(temp=temperature_list))
names(logit_preds) = temperature_list
logit_preds
##          51          53          55
##  0.63507282  0.20260550 -0.22986183
# The desired probabilities for the three temperatures
prob_preds = logistic(logit_preds)
print("The desired probabilities are:")## [1] "The desired probabilities are:"
print(t(t(prob_preds)))## [,1]
## 51 0.65363882
## 53 0.55047882
## 55 0.44278623
# compute the probabilities from the equation rather than the model
# (small differences vs. prob_preds come from rounding the coefficients to four decimals)
manual_probs = logistic(11.6630 - (0.2162) * temperature_list)
manual_probs
## [1] 0.65402974 0.55092283 0.44324565
The model-estimated probabilities for several additional ambient temperatures are provided below, where subscripts indicate the temperature:
\[\hat{p}_{57} = 0.341\] \[\hat{p}_{59} = 0.251\] \[\hat{p}_{61} = 0.179\] \[\hat{p}_{63} = 0.124\] \[\hat{p}_{65} = 0.084\] \[\hat{p}_{67} = 0.056\] \[\hat{p}_{69} = 0.037\] \[\hat{p}_{71} = 0.024\]
# make a temperature list covering the observed range in two-degree steps
temperature_list = seq(51,81,2)
temperature_list
## [1] 51 53 55 57 59 61 63 65 67 69 71 73 75 77 79 81
# predict the logit for each temperature
logit_preds = predict.glm(fail_model,newdata=data.frame(temp=temperature_list))
names(logit_preds) = temperature_list
logit_preds
## 51 53 55 57 59 61 63 65
## 0.63507282 0.20260550 -0.22986183 -0.66232916 -1.09479649 -1.52726381 -1.95973114 -2.39219847
## 67 69 71 73 75 77 79 81
## -2.82466579 -3.25713312 -3.68960045 -4.12206778 -4.55453510 -4.98700243 -5.41946976 -5.85193709
# The desired probabilities for each temperature
prob_preds = logistic(logit_preds)
print("The desired probabilities are:")
## [1] "The desired probabilities are:"
print(t(t(prob_preds)))
## [,1]
## 51 0.6536388178
## 53 0.5504788165
## 55 0.4427862349
## 57 0.3402165925
## 59 0.2507161453
## 61 0.1783943753
## 63 0.1234961473
## 65 0.0837695402
## 67 0.0560057456
## 69 0.0370714129
## 71 0.0243730934
## 73 0.0159523561
## 75 0.0104098839
## 77 0.0067798159
## 79 0.0044099608
## 81 0.0028660879
logit_predictions=t(t(predict(fail_model)))
prob_predictions=logistic(logit_predictions)
predicts = cbind(failsbytemp,prob_predictions)
plot(predicts$temp,predicts$prob_predictions,col="red")
lines(predicts$temp,predicts$prob_predictions,type="l",col="red")
#plot.new()
plot(orings$failure~jitter(orings$temp), col="blue",
xlab="Temperature (Fahrenheit)",
ylab="Probability of damage",ylim=c(0,1),
main="O-Rings-Probability of Damage by Temperature at launch",
pch=19
)
points(predicts$temp,predicts$prob_predictions,col="red",pch=18)
lines(predicts$temp,predicts$prob_predictions,col="red",lty=1)
legend(x=65, y=0.7, legend=c("Actual", "Prediction"),
cex=0.8,
pch = c(19, 18), lty = c(NA, 1),
col = c("blue", "red"), text.col = c("blue", "red"))