Student Details:

Xiyue Shu (s3705474)

Load Packages

library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
library(MASS)
library(leaps)
library(car)
## Loading required package: carData

Question 1 (a)

q1 <- read.csv("asphalt.csv")
# rename the response (column 1) to y and the six predictors (columns 2-7) to x1-x6
names(q1)[1:7] <- c("y", "x1", "x2", "x3", "x4", "x5", "x6")
# full model with all predictors; x6 has two levels, so it enters as a factor
full <- lm(y ~ x1 + x2 + x3 + x4 + x5 + factor(x6), data = q1)
anova(full)
## Analysis of Variance Table
## 
## Response: y
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## x1          1 424.93  424.93 27.3767 2.310e-05 ***
## x2          1  16.13   16.13  1.0393    0.3182    
## x3          1  28.40   28.40  1.8295    0.1888    
## x4          1  19.73   19.73  1.2711    0.2707    
## x5          1  41.05   41.05  2.6450    0.1169    
## factor(x6)  1 463.77  463.77 29.8792 1.283e-05 ***
## Residuals  24 372.51   15.52                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(full)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5 + factor(x6), data = q1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.6781 -1.8309  0.1751  1.4858 11.1262 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -57.584153  35.896498  -1.604   0.1218    
## x1            0.003071   0.008161   0.376   0.7100    
## x2            7.498028   3.967155   1.890   0.0709 .  
## x3            6.225817   4.812723   1.294   0.2081    
## x4            0.522211   1.174673   0.445   0.6606    
## x5           -0.241275   1.684963  -0.143   0.8873    
## factor(x6)1 -10.772593   1.970769  -5.466 1.28e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.94 on 24 degrees of freedom
## Multiple R-squared:  0.7274, Adjusted R-squared:  0.6592 
## F-statistic: 10.67 on 6 and 24 DF,  p-value: 8.588e-06

Question 1 (b)

Question 1 (c)

# exhaustive best-subset search over the six predictors
r <- regsubsets(y ~ x1 + x2 + x3 + x4 + x5 + factor(x6), data = q1)
summary(r)
## Subset selection object
## Call: regsubsets.formula(y ~ x1 + x2 + x3 + x4 + x5 + factor(x6), data = q1)
## 6 Variables  (and intercept)
##             Forced in Forced out
## x1              FALSE      FALSE
## x2              FALSE      FALSE
## x3              FALSE      FALSE
## x4              FALSE      FALSE
## x5              FALSE      FALSE
## factor(x6)1     FALSE      FALSE
## 1 subsets of each size up to 6
## Selection Algorithm: exhaustive
##          x1  x2  x3  x4  x5  factor(x6)1
## 1  ( 1 ) " " " " " " " " " " "*"        
## 2  ( 1 ) " " "*" " " " " " " "*"        
## 3  ( 1 ) " " "*" "*" " " " " "*"        
## 4  ( 1 ) " " "*" "*" "*" " " "*"        
## 5  ( 1 ) "*" "*" "*" "*" " " "*"        
## 6  ( 1 ) "*" "*" "*" "*" "*" "*"
plot(r, scale="adjr2")
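
The adjusted R-squared values behind this plot can also be read off numerically (a quick cross-check, not part of the original output):

s <- summary(r)
round(s$adjr2, 3)    # adjusted R-squared of the best model of each size
which.max(s$adjr2)   # size of the subset with the highest adjusted R-squared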

model_sub <- lm(y ~ x2 + x3 + factor(x6), data = q1)  # refit the chosen subset: x2, x3 and factor(x6)
anova(model_sub)
## Analysis of Variance Table
## 
## Response: y
##            Df Sum Sq Mean Sq F value   Pr(>F)    
## x2          1  44.55   44.55  3.1883  0.08540 .  
## x3          1  97.69   97.69  6.9917  0.01347 *  
## factor(x6)  1 847.01  847.01 60.6182 2.26e-08 ***
## Residuals  27 377.27   13.97                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model_sub)
## 
## Call:
## lm(formula = y ~ x2 + x3 + factor(x6), data = q1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9414 -1.7181 -0.1026  1.4959 11.0532 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -57.332     28.107  -2.040   0.0513 .  
## x2             7.427      3.251   2.285   0.0304 *  
## x3             6.828      4.039   1.691   0.1024    
## factor(x6)1  -10.537      1.353  -7.786 2.26e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.738 on 27 degrees of freedom
## Multiple R-squared:  0.7239, Adjusted R-squared:  0.6932 
## F-statistic:  23.6 on 3 and 27 DF,  p-value: 1.044e-07
par(mfrow = c(2, 2))  # 2 x 2 grid for the standard lm diagnostic plots
plot(model_sub)

par(mfrow = c(1, 1))
acf(model_sub$residuals)  # check the residuals for autocorrelation

* Check for randomness: the residuals vs. fitted plot shows obvious curvature, and the spread of the residuals is not constant across the range of fitted values. This is a violation; the residuals are not randomly scattered.
* Check for normality: the normal Q-Q plot shows departures from the reference line. This is a violation; the residuals are not normally distributed.
* Check for homoscedasticity: in the scale-location plot the red line is curved and the spread of the square-rooted standardized residuals is not constant across the fitted values. This is a violation; the residuals are heteroscedastic.
* Check for influential cases: the residuals vs. leverage plot shows one point falling beyond the red Cook's distance line in the top right-hand corner (a numerical check follows below).
* Check for autocorrelation: the ACF plot shows one lag exceeding the confidence bounds, suggesting the residuals may be autocorrelated.
* These observations from the residual plots are checked next with formal statistical tests.
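
To identify the point flagged in the residuals vs. leverage plot, the Cook's distances can be inspected directly (a quick check, not part of the original output):

cd <- cooks.distance(model_sub)
which.max(cd)   # row number of the most influential observation
max(cd)         # its Cook's distance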

ncvTest(model_sub)  # score test for non-constant error variance
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 13.25413, Df = 1, p = 0.00027198
durbinWatsonTest(model_sub)  # test for lag-1 autocorrelation in the residuals
##  lag Autocorrelation D-W Statistic p-value
##    1      -0.1063792      2.159149   0.782
##  Alternative hypothesis: rho != 0
shapiro.test(model_sub$residuals)  # test for normality of the residuals
## 
##  Shapiro-Wilk normality test
## 
## data:  model_sub$residuals
## W = 0.93002, p-value = 0.04391
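
These tests largely confirm the plots: the non-constant variance test (p ≈ 0.0003) and the Shapiro-Wilk test (p ≈ 0.044) reject constant variance and normality at the 5% level, while the Durbin-Watson test (p ≈ 0.78) gives no evidence of lag-1 autocorrelation. A common follow-up, sketched here only as a possible next step rather than part of the original analysis, is a Box-Cox profile (MASS is already loaded) to look for a variance-stabilising power transformation of y, assuming y is strictly positive:

boxcox(model_sub, lambda = seq(-2, 2, by = 0.1))  # profile log-likelihood for a power transform of the response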

Question 2 (a)

q2 <- read.csv("byssinosis.csv")
# rename the five categorical predictors (columns 4-8) to x1-x5
names(q2)[4:8] <- c("x1", "x2", "x3", "x4", "x5")
# logistic regression for the probability of byssinosis, with all five predictors as factors
model2 <- glm(cbind(BysYes, BysNo) ~ factor(x1) + factor(x2) + factor(x3) + factor(x4) + factor(x5), family = binomial, data = q2)
summary.glm(model2)
## 
## Call:
## glm(formula = cbind(BysYes, BysNo) ~ factor(x1) + factor(x2) + 
##     factor(x3) + factor(x4) + factor(x5), family = binomial, 
##     data = q2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5240  -0.8105  -0.1952   0.2071   1.5643  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.9452     0.2334  -8.334  < 2e-16 ***
## factor(x1)2  -2.5799     0.2921  -8.834  < 2e-16 ***
## factor(x1)3  -2.7306     0.2153 -12.681  < 2e-16 ***
## factor(x2)2   0.1163     0.2072   0.562 0.574426    
## factor(x3)2   0.1239     0.2288   0.542 0.587983    
## factor(x4)2  -0.6413     0.1944  -3.299 0.000971 ***
## factor(x5)2   0.5641     0.2617   2.156 0.031091 *  
## factor(x5)3   0.7531     0.2161   3.484 0.000493 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 322.527  on 64  degrees of freedom
## Residual deviance:  43.271  on 57  degrees of freedom
## AIC: 165.95
## 
## Number of Fisher Scoring iterations: 5
pchisq(model2$deviance, df=model2$df.residual, lower.tail=FALSE)  # goodness-of-fit p-value from the residual deviance
## [1] 0.9103186
# reduced model: drop x2 and x3, whose coefficients are not significant in model2
model3 <- glm(cbind(BysYes, BysNo) ~ factor(x1) + factor(x4) + factor(x5), family = binomial, data = q2)

Question 2 (b)

summary.glm(model3)
## 
## Call:
## glm(formula = cbind(BysYes, BysNo) ~ factor(x1) + factor(x4) + 
##     factor(x5), family = binomial, data = q2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6239  -0.7904  -0.1946   0.2679   1.6605  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.8336     0.1525 -12.026  < 2e-16 ***
## factor(x1)2  -2.5493     0.2614  -9.753  < 2e-16 ***
## factor(x1)3  -2.7175     0.1898 -14.314  < 2e-16 ***
## factor(x4)2  -0.6210     0.1908  -3.255 0.001133 ** 
## factor(x5)2   0.5060     0.2490   2.032 0.042119 *  
## factor(x5)3   0.6728     0.1813   3.710 0.000207 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 322.527  on 64  degrees of freedom
## Residual deviance:  43.882  on 59  degrees of freedom
## AIC: 162.56
## 
## Number of Fisher Scoring iterations: 5
pchisq(model3$deviance, df=model3$df.residual, lower.tail=FALSE)  # goodness-of-fit p-value for the reduced model
## [1] 0.9291498
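
Both models fit adequately according to the residual deviance (p ≈ 0.91 and p ≈ 0.93). As a direct check that dropping x2 and x3 is justified (not part of the original output), the two models can be compared with a likelihood-ratio test; the deviance increases by 43.882 - 43.271 ≈ 0.61 on 2 degrees of freedom, which is far from significant (p ≈ 0.74):

anova(model3, model2, test = "Chisq")  # LR test of the dropped terms factor(x2) and factor(x3)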

Question 2 (c)

Predicted probabilities are obtained from model3 via the inverse logit, \(\hat{p} = \exp(\hat{\eta})/(1 + \exp(\hat{\eta}))\), where \(\hat{\eta}\) is the linear predictor. Since x2 and x3 do not appear in model3, their values do not affect the predictions.

\(x1 = 2, x2 = 2, x3 = 1, x4 = 2, x5 = 3\) :

exp(-1.8336-2.5493-0.6210+0.6728)/(1+exp(-1.8336-2.5493-0.6210+0.6728))
## [1] 0.01298231

\(x1 = 1, x2 = 2, x3 = 2, x4 = 1, x5 = 3\) :

exp(-1.8336+0.6728)/(1+exp(-1.8336+0.6728))
## [1] 0.238522

\(x1 = 2, x2 = 1, x3 = 1, x4 = 2, x5 = 2\) :

exp(-1.8336-2.5493-0.6210+0.5060)/(1+exp(-1.8336-2.5493-0.6210+0.5060))
## [1] 0.01100979

\(x1 = 3, x2 = 1, x3 = 2, x4 = 2, x5 = 1\) :

exp(-1.8336-2.7175-0.6210)/(1+exp(-1.8336-2.7175-0.6210))
## [1] 0.005640646
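
The same probabilities (up to rounding of the hand-typed coefficients) can also be obtained directly from the fitted model. This is a sketch that assumes the predictor columns keep the renamed labels x1, x4 and x5 and the level coding used in q2:

new_cases <- data.frame(x1 = c(2, 1, 2, 3),
                        x4 = c(2, 1, 2, 2),
                        x5 = c(3, 3, 2, 1))
predict(model3, newdata = new_cases, type = "response")  # fitted probabilities for the four cases above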