Problem Set # 4

Frank Annie

October 29th 2015

Due Date: October 29, 2015

Total Points: 30

1 The data set trees (datasets package, part of base install) contains the girth (inches), height (feet) and volume of timber from 31 felled Black Cherry trees. Suppose you want to be able to predict tree volume based on girth.

Create a scatter plot of the data and label the axes.

require(ggplot2)

## Loading required package: ggplot2

require(datasets)
head(trees)

##   Girth Height Volume
## 1   8.3     70   10.3
## 2   8.6     65   10.3
## 3   8.8     63   10.2
## 4  10.5     72   16.4
## 5  10.7     81   18.8
## 6  10.8     83   19.7

p = ggplot(trees, aes(x = Girth, y = Volume)) + geom_point(col = "#336600", 
    pch = 8) + xlab("Girth (In)") + ylab("Volume") + theme(panel.grid.minor = element_line(color = "#999999", 
    linetype = "dotted"))
p

Add a linear regression line to the plot.

p = p + geom_smooth(method = lm, se = FALSE, col = "blue")
p

c) Determine the sum of squared residuals.

p = p + geom_segment(aes(y = predict(lm(Volume ~ Girth)), yend = Volume, x = Girth, 
    xend = Girth), col = "red")
p

sum(residuals(lm(trees$Volume ~ trees$Girth))^2)

## [1] 524.3025

Repeat a, b, and c but use the square of the girth instead of girth as the explanatory variable. Which model do you prefer and why?

g = (trees$Girth^2)
g

##  [1]  68.89  73.96  77.44 110.25 114.49 116.64 121.00 121.00 123.21 125.44
## [11] 127.69 129.96 129.96 136.89 144.00 166.41 166.41 176.89 187.69 190.44
## [21] 196.00 201.64 210.25 256.00 265.69 299.29 306.25 320.41 324.00 324.00
## [31] 424.36

p2 = ggplot(trees, aes(x = g, y = Volume)) + geom_point(col = "purple", pch = 8) + 
    xlab("Girth Squared (In)") + ylab("Volume") + theme(panel.grid.minor = element_line(color = "#999999", 
    linetype = "dotted"))
p2

p2 = p2 + geom_smooth(method = lm, se = FALSE, col = "blue")
p2

p2 = p2 + geom_segment(aes(y = predict(lm(Volume ~ g)), yend = Volume, x = g, 
    xend = g), col = "#FF3366")
p2

sum(residuals(lm(trees$Volume ~ g))^2)

## [1] 329.3191

The second model appears to be a better model because of the residual based values are smaller than other models. It seems that that this model seems to fit better over all.

2 The data set USmelanoma (HSAUR2 package) contains male mortality counts per one million inhabitants by state along with the latitude and longitude centroid of the state.

Create a scatter plot of mortality versus latitude using latitude as the explanatory variable.

require(HSAUR2)

## Loading required package: HSAUR2
## Loading required package: tools

mel = USmelanoma
head(mel)

##             mortality latitude longitude ocean
## Alabama           219     33.0      87.0   yes
## Arizona           160     34.5     112.0    no
## Arkansas          170     35.0      92.5    no
## California        182     37.5     119.5   yes
## Colorado          149     39.0     105.5    no
## Connecticut       159     41.8      72.8   yes

pm = ggplot(mel, aes(x = latitude, y = mortality)) + geom_point(col = "red", 
    pch = 15) + xlab("Latitude (Degrees)") + ylab("Mortality") + theme_bw()
pm

Add the linear regression line to your scatter plot.

pm = pm + geom_smooth(method = lm, se = FALSE, col = "blue")
pm

Regress mortality on latitude and interpret the value of the slope coefficient.

model = lm(mortality ~ latitude, data = mel)
summary(model)

## 
## Call:
## lm(formula = mortality ~ latitude, data = mel)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.972 -13.185   0.972  12.006  43.938 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 389.1894    23.8123   16.34  < 2e-16 ***
## latitude     -5.9776     0.5984   -9.99 3.31e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.12 on 47 degrees of freedom
## Multiple R-squared:  0.6798, Adjusted R-squared:  0.673 
## F-statistic:  99.8 on 1 and 47 DF,  p-value: 3.309e-13

It seems that melanoma cases appears to decrease according to the slope coefficient

Determine the sum of squared errors.

resid(model)

##              Alabama              Arizona             Arkansas 
##           27.0726285          -22.9609178           -9.9721000 
##           California             Colorado          Connecticut 
##           16.9719894           -7.0615570           19.6758231 
##             Delaware District of Columbia              Florida 
##           43.9384430           20.9384430          -24.8155502 
##              Georgia                Idaho             Illinois 
##           22.0726285           -7.1845604          -26.0839213 
##              Indiana                 Iowa               Kansas 
##          -20.8883941           -8.9331226            6.9496251 
##             Kentucky            Louisiana                Maine 
##          -16.2347199          -12.6871158           -2.0002154 
##             Maryland        Massachusetts             Michigan 
##            5.9384430            6.0668774          -12.1621961 
##            Minnesota          Mississippi             Missouri 
##            1.7818932           13.8771014          -28.0503749 
##              Montana             Nebraska               Nevada 
##            0.7595290          -19.1174676           34.9384430 
##        New Hampshire           New Jersey           New Mexico 
##            1.6310946           10.1116059          -38.9721000 
##             New York       North Carolina         North Dakota 
##           19.8489860           22.0167179            9.7483468 
##                 Ohio             Oklahoma               Oregon 
##          -17.8883941            5.0167179            9.8266217 
##         Pennsylvania         Rhode Island       South Carolina 
##          -13.3018127           -2.3241769           -9.1452629 
##         South Dakota            Tennessee                Texas 
##          -35.3912697           12.0055358           28.1061749 
##                 Utah              Vermont             Virginia 
##          -11.0727391           26.8266217            0.9719894 
##           Washington        West Virginia            Wisconsin 
##           11.7483468          -21.2570841          -13.1845604 
##              Wyoming 
##            1.8489860

SSE = sum(resid(model)^2)
SSE

## [1] 17173.07

Examine the model assumptions. What do you conclude about them?

ggplot(USmelanoma, aes(x = cut(latitude, 4), y = mortality)) + geom_boxplot() + 
    xlab("Latitude") + ylab("Mortality")

require(sm)

## Loading required package: sm
## Package 'sm', version 2.2-5.4: type help(sm) for summary information

r = residuals(lm(mortality ~ latitude, data = USmelanoma))
sm.density(r, xlab = "Model Residuals", model = "Normal")

qqnorm(r)
qqline(r)

The data is normally distributed. and appears to

3 Consider the endometrialcancer.txt file from https://uploads.strikinglycdn.com/files/302190/ec394e5c-bc2b-4e03-be04-b58eb790f728/endometrialcancer.txt. The file contains information on a study of endometrial cance.

Regress the endometrial cancer cases (column d) onto the duration of oestrogen use (column duration, units months) while controlling for hypertension, gall bladder disease, obesity, and age (column age, units years).

library(msm)


ec <- read.table("https://uploads.strikinglycdn.com/files/302190/ec394e5c-bc2b-4e03-be04-b58eb790f728/endometrialcancer.txt", header = TRUE)
summary(ec)

##        d           gall      hyp        ob         duration    
##  Min.   :0.0000   No :214   No :152   No : 92   Min.   : 0.00  
##  1st Qu.:0.0000   Yes: 34   Yes: 96   Yes:156   1st Qu.: 0.00  
##  Median :0.0000                                 Median : 4.50  
##  Mean   :0.2016                                 Mean   :26.18  
##  3rd Qu.:0.0000                                 3rd Qu.:48.00  
##  Max.   :1.0000                                 Max.   :96.00  
##       age       
##  Min.   :57.00  
##  1st Qu.:66.00  
##  Median :71.00  
##  Mean   :70.92  
##  3rd Qu.:75.00  
##  Max.   :83.00

mod = lm(d ~ duration + hyp + gall + ob + age, data = ec)
summary(mod)

## 
## Call:
## lm(formula = d ~ duration + hyp + gall + ob + age, data = ec)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.63507 -0.19197 -0.12196 -0.03019  0.97709 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.1953300  0.2860506  -0.683  0.49535    
## duration     0.0028831  0.0006985   4.127 5.05e-05 ***
## hypYes       0.0167825  0.0513352   0.327  0.74401    
## gallYes      0.1892853  0.0713971   2.651  0.00855 ** 
## obYes        0.0962666  0.0504864   1.907  0.05773 .  
## age          0.0032217  0.0040092   0.804  0.42243    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3802 on 242 degrees of freedom
## Multiple R-squared:  0.1237, Adjusted R-squared:  0.1056 
## F-statistic:  6.83 on 5 and 242 DF,  p-value: 5.566e-06

Interpret the results of the statistical model. What can you conclude about the relationships between the independent and dependent variables?

mod2 <- glm(d ~ duration + hyp + gall + ob + age, data = ec, family = "binomial")
summary(mod2)

## 
## Call:
## glm(formula = d ~ duration + hyp + gall + ob + age, family = "binomial", 
##     data = ec)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6067  -0.6128  -0.5079  -0.3665   2.3636  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.085605   2.136456  -1.912 0.055834 .  
## duration     0.016686   0.004403   3.790 0.000151 ***
## hypYes       0.126535   0.355881   0.356 0.722174    
## gallYes      1.013917   0.424989   2.386 0.017044 *  
## obYes        0.731593   0.383289   1.909 0.056297 .  
## age          0.020267   0.029735   0.682 0.495497    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 249.30  on 247  degrees of freedom
## Residual deviance: 220.57  on 242  degrees of freedom
## AIC: 232.57
## 
## Number of Fisher Scoring iterations: 4

step(mod2)

## Start:  AIC=232.57
## d ~ duration + hyp + gall + ob + age
## 
##            Df Deviance    AIC
## - hyp       1   220.70 230.70
## - age       1   221.04 231.04
## <none>          220.57 232.57
## - ob        1   224.46 234.46
## - gall      1   226.00 236.00
## - duration  1   234.98 244.98
## 
## Step:  AIC=230.7
## d ~ duration + gall + ob + age
## 
##            Df Deviance    AIC
## - age       1   221.32 229.32
## <none>          220.70 230.70
## - ob        1   224.77 232.77
## - gall      1   226.06 234.06
## - duration  1   235.44 243.44
## 
## Step:  AIC=229.32
## d ~ duration + gall + ob
## 
##            Df Deviance    AIC
## <none>          221.32 229.32
## - ob        1   225.67 231.67
## - gall      1   226.65 232.65
## - duration  1   235.49 241.49

## 
## Call:  glm(formula = d ~ duration + gall + ob, family = "binomial", 
##     data = ec)
## 
## Coefficients:
## (Intercept)     duration      gallYes        obYes  
##    -2.60365      0.01623      0.99717      0.76614  
## 
## Degrees of Freedom: 247 Total (i.e. Null);  244 Residual
## Null Deviance:       249.3 
## Residual Deviance: 221.3     AIC: 229.3

Estrogen appears to incresae the likely hood of devloping cancer

What is the area under the receiving operating characteristic curve value for this model? Intrepret this result.

mod2 <- glm(d ~ duration + hyp + gall + ob + age, data = ec, family = "binomial")

summary(mod2)

## 
## Call:
## glm(formula = d ~ duration + hyp + gall + ob + age, family = "binomial", 
##     data = ec)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6067  -0.6128  -0.5079  -0.3665   2.3636  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.085605   2.136456  -1.912 0.055834 .  
## duration     0.016686   0.004403   3.790 0.000151 ***
## hypYes       0.126535   0.355881   0.356 0.722174    
## gallYes      1.013917   0.424989   2.386 0.017044 *  
## obYes        0.731593   0.383289   1.909 0.056297 .  
## age          0.020267   0.029735   0.682 0.495497    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 249.30  on 247  degrees of freedom
## Residual deviance: 220.57  on 242  degrees of freedom
## AIC: 232.57
## 
## Number of Fisher Scoring iterations: 4

predpro <- predict(mod2, type = c("response"))

library(pROC)

## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

roc_curve <- roc(ec$d~predpro)
plot(roc_curve)

## 
## Call:
## roc.formula(formula = ec$d ~ predpro)
## 
## Data: predpro in 198 controls (ec$d 0) < 50 cases (ec$d 1).
## Area under the curve: 0.7532

The area under the curve is 0.7532 The model shows that there is a chance that a woman with cancer has a chance of getting chancer 50 percent of the time.

Examine the model assumptions of 1) linearity in log-odds, and 2) residual deviance. What do you conclude about your model adequecy?

lfitduration <- loess(ec$d ~ ec$duration)
lfitage <- loess(ec$d ~ ec$age)
ec$lfitduration <- log(predict(lfitduration)/(1-predict(lfitduration)))
ec$lfitage <- log(predict(lfitage)/(1-predict(lfitage)))

library(ggplot2)

ggplot(ec, aes(x = duration, y = lfitduration)) + geom_point() + geom_smooth(method = "loess")

ggplot(ec, aes(x = age, y = lfitage)) + geom_point() + geom_smooth(method = "loess")

pchisq(220, 57, 242, lower.tail = FALSE)

## [1] 0.9948817

The linear realtionship of the model is in question I would argue the accuracy of the less relible that other tests. Other test fail to the reject the null hepothesis based on the P Value.