# Data_Analysis_and_Decision_Making

#Question 1

#1. Exercise
#For the prostate data in faraway, fit a model with lpsa as the response and the other variables as predictors.
#Compute 90% and 95% CIs for the parameter associated with age

# library() stops immediately if faraway is not installed; require() would
# only return FALSE and the script would fail later on the missing data set.
library(faraway)

# Full model: lpsa regressed on every other column of prostate.
g <- lm(lpsa ~ ., data = prostate)
summary(g)

## 
## Call:
## lm(formula = lpsa ~ ., data = prostate)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7331 -0.3713 -0.0170  0.4141  1.6381 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.669337   1.296387   0.516  0.60693    
## lcavol       0.587022   0.087920   6.677 2.11e-09 ***
## lweight      0.454467   0.170012   2.673  0.00896 ** 
## age         -0.019637   0.011173  -1.758  0.08229 .  
## lbph         0.107054   0.058449   1.832  0.07040 .  
## svi          0.766157   0.244309   3.136  0.00233 ** 
## lcp         -0.105474   0.091013  -1.159  0.24964    
## gleason      0.045142   0.157465   0.287  0.77503    
## pgg45        0.004525   0.004421   1.024  0.30886    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7084 on 88 degrees of freedom
## Multiple R-squared:  0.6548, Adjusted R-squared:  0.6234 
## F-statistic: 20.86 on 8 and 88 DF,  p-value: < 2.2e-16

# 90% confidence intervals for all coefficients (see the age row).
confint(g, level = 0.90)

##                      5 %         95 %
## (Intercept) -1.485718237  2.824391633
## lcavol       0.440867156  0.733176497
## lweight      0.171846568  0.737088281
## age         -0.038210200 -0.001064151
## lbph         0.009890745  0.204217317
## svi          0.360029029  1.172285623
## lcp         -0.256770899  0.045822373
## gleason     -0.216620186  0.306903382
## pgg45       -0.002824333  0.011874796

# 95% confidence intervals for all coefficients (see the age row).
confint(g, level = 0.95)

##                    2.5 %      97.5 %
## (Intercept) -1.906960983 3.245634379
## lcavol       0.412298699 0.761744954
## lweight      0.116603435 0.792331414
## age         -0.041840618 0.002566267
## lbph        -0.009101499 0.223209561
## svi          0.280644232 1.251670420
## lcp         -0.286344443 0.075395916
## gleason     -0.267786053 0.358069248
## pgg45       -0.004260932 0.013311395

# age: the 90% CI (-0.0382, -0.0011) excludes zero while the 95% CI
# (-0.0418, 0.0026) contains it, in line with the p-value of 0.082 above:
# age is significant at the 10% level but not at the 5% level.

#Question 2

#2. Exercise

#Compute and display a 95% joint confidence region for 
#the parameters associated with age and lbph. 
#Plot the origin and report the outcome of the appropriate hypotheses test. Affirm 
#this conclusion with an appropriate partial F-test.


# library() stops immediately if ellipse is not installed; require() would
# only return FALSE and the plot call below would fail instead.
library(ellipse)

# 95% joint confidence region for (beta_age, beta_lbph) from the full model g.
plot(ellipse(g, c("age", "lbph")),
     type = "l",
     main = "Joint Confidence Region")
# Open circle: the origin, i.e. the null value beta_age = beta_lbph = 0.
points(0, 0)
# Filled diamond: the least-squares estimates.
points(coef(g)["age"], coef(g)["lbph"], pch = 18)
# Dashed lines: the individual 95% confidence intervals of the two parameters.
abline(v = confint(g)["age", ], lty = 2)
abline(h = confint(g)["lbph", ], lty = 2)

# The origin lies inside the ellipse, so H0: beta_age = beta_lbph = 0 is not
# rejected at the 5% level.
# Checking whether the origin is inside the 95% region is equivalent to the
# partial F-test of the full model against the model without age and lbph.

# Model under H0: every predictor except age and lbph. (Fitting
# lpsa ~ age + lbph instead would test the other six predictors.)
g2 <- lm(lpsa ~ . - age - lbph, data = prostate)
anova(g2, g)

## (Output not shown: the table previously recorded here compared
## lpsa ~ age + lbph with the full model, which is a different hypothesis.
## Re-run to regenerate the table; it has 2 numerator and 88 denominator
## degrees of freedom.)

# Because the origin lies inside the 95% joint confidence region, the p-value
# of this partial F-test is larger than 0.05. We therefore do not reject
# H0: beta_age = beta_lbph = 0; age and lbph are not jointly significant at
# the 5% level once the other predictors are in the model.

#Question 3
#Predict lpsa (95%) for a new patient with lcavol = 1.22692, lweight = 3.62301, age = 65, lbph = -0.3001, svi = 0.0, 
#lcp = -0.79851, gleason = 7.0, pgg45 = 15.0.
#Do this again for the mean response. Using the exp() function, obtain the new prediction and mean response for psa.


# Predictor values of the new patient (age = 65).
x0 <- data.frame(lcavol = 1.22692,
                 lweight = 3.62301,
                 age = 65,
                 lbph = -0.3001,
                 svi = 0.0,
                 lcp = -0.79851,
                 gleason = 7.0,
                 pgg45 = 15.0)

# 95% prediction interval for the lpsa of a single new patient.
predict(g,
        newdata = x0,
        interval = "prediction",
        level = 0.95)

##        fit       lwr     upr
## 1 2.195654 0.7708983 3.62041

# Same interval back-transformed to the psa scale.
exp(predict(g, newdata = x0, interval = "prediction",
            level = 0.95))

##        fit      lwr      upr
## 1 8.985877 2.161707 37.35288

# 95% confidence interval for the mean response at x0, as the exercise also
# asks for. The fitted value is the same as above; the interval is narrower
# because it omits the variance of a new observation.
predict(g,
        newdata = x0,
        interval = "confidence",
        level = 0.95)

# Mean-response interval on the psa scale. exp() of an interval for mean lpsa
# is an interval for the median (geometric mean) psa, not the arithmetic mean.
exp(predict(g, newdata = x0, interval = "confidence",
            level = 0.95))

## (Output of the two mean-response calls is not recorded in this transcript;
## re-run to display it.)

#Question 4
#Repeat the above exercise with new patient age = 20

# Same patient as in exercise 3, except that age is 20.
# NOTE(review): age = 20 is presumably well below the ages observed in
# prostate (check range(prostate$age)), so this is an extrapolation.
x0 <- data.frame(lcavol = 1.22692,
                 lweight = 3.62301,
                 age = 20,
                 lbph = -0.3001,
                 svi = 0.0,
                 lcp = -0.79851,
                 gleason = 7.0,
                 pgg45 = 15.0)

# 95% prediction interval for the lpsa of a single new patient.
predict(g,
        newdata = x0,
        interval = "prediction",
        level = 0.95)

##        fit      lwr      upr
## 1 3.079327 1.357826 4.800828

# Same interval back-transformed to the psa scale.
exp(predict(g, newdata = x0, interval = "prediction",
            level = 0.95))

##        fit      lwr      upr
## 1 21.74376 3.887732 121.6111

# The interval is wider than for the 65-year-old patient: about 3.44 versus
# 2.85 on the log scale, and an upper limit of 121.6 versus 37.4 on the psa
# scale.

# 95% confidence interval for the mean response at x0, as in exercise 3.
predict(g,
        newdata = x0,
        interval = "confidence",
        level = 0.95)

# Mean-response interval on the psa scale. exp() of an interval for mean lpsa
# is an interval for the median (geometric mean) psa, not the arithmetic mean.
exp(predict(g, newdata = x0, interval = "confidence",
            level = 0.95))

## (Output of the two mean-response calls is not recorded in this transcript;
## re-run to display it.)

#question 5

#For the model in exercise 1, remove all the predictors that are not significant at the 5% level.
#Recompute the predictions for exercises 3 and 4. Compare CIs. On the psa scale, which CIs do you prefer?

# Reduced model: keep only the predictors that are significant at the 5%
# level in the full fit of exercise 1, i.e. lcavol, lweight and svi.
# age (p = 0.082) and lbph (p = 0.070) are significant only at the 10% level,
# so they are removed together with lcp, gleason and pgg45.
fm.1 <- lm(lpsa ~ lcavol + lweight + svi, data = prostate)
summary(fm.1)

## (Output not shown: the summary previously recorded here belonged to a fit
## that still contained age. Re-run to regenerate.)

# Patient of exercise 3 (age = 65). lweight is 3.62301, as given in the
# exercise. age is kept only to identify the patient; predict() uses just the
# columns that appear in the model formula.
newP.data0 <- data.frame(lcavol = 1.22692, lweight = 3.62301, age = 65,
                         svi = 0)
predict(fm.1, newdata = newP.data0, interval = "prediction", level = 0.95)

# Same interval back-transformed to the psa scale.
exp(predict(fm.1, newdata = newP.data0, interval = "prediction",
            level = 0.95))

# Patient of exercise 4 (age = 20).
newP.data1 <- data.frame(lcavol = 1.22692, lweight = 3.62301, age = 20,
                         svi = 0)
predict(fm.1, newdata = newP.data1, interval = "prediction", level = 0.95)

# Same interval back-transformed to the psa scale.
exp(predict(fm.1, newdata = newP.data1, interval = "prediction",
            level = 0.95))

## (Output of the four predict() calls is not shown: the numbers previously
## recorded here came from the model that still contained age and from a
## mistyped lweight of 3.62031. Re-run to regenerate.)

# age is no longer in the model, so the two patients have identical predictor
# values and receive the same prediction and the same prediction interval.
# In the full model the age = 20 patient had a much wider interval than the
# age = 65 patient (3.89 to 121.6 versus 2.16 to 37.4 on the psa scale).
# Compare the interval printed above with those two full-model intervals: the
# reduced model is expected to be clearly narrower for the age = 20 patient,
# because the insignificant predictors have been removed and the prediction no
# longer depends on an extreme age value.
# The narrower prediction intervals are preferred.

#Exercise 6

#Test the "small" model in exercise 5 against the "big" model in
#exercise 1 at probability of type I error alpha = 0.05. Which model is preferred?
# Big model: all eight predictors (same fit as in exercise 1).
fit1 <- lm(lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + pgg45,
           data = prostate)
# Small model: only the predictors significant at the 5% level in the big
# model (lcavol, lweight, svi).
fit2 <- lm(lpsa ~ lcavol + lweight + svi, data = prostate)

# Partial F-test of H0: the coefficients of age, lbph, lcp, gleason and pgg45
# are all zero. The smaller model is passed first so that Df and Sum of Sq are
# reported as positive numbers; F and the p-value do not depend on the order.
anova(fit2, fit1)

## Analysis of Variance Table
## 
## Model 1: lpsa ~ lcavol + lweight + svi
## Model 2: lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + 
##     pgg45
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     93 47.785                           
## 2     88 44.163  5    3.6218 1.4434 0.2167

# The p-value of the F-statistic (0.2167) is larger than the significance
# level 0.05, so we fail to reject H0: the five extra predictors do not
# significantly improve the fit.
# Thus, the smaller model is preferred because it is simpler and fits the data
# adequately.

# Data_Analysis_and_Decision_Making_5.R

# aksha

# Mon Oct 09 02:42:26 2017