October 29th 2015
Due Date: October 29, 2015
Total Points: 30
1 The data set trees (datasets package, part of base install) contains the girth (inches), height (feet) and volume of timber from 31 felled Black Cherry trees. Suppose you want to be able to predict tree volume based on girth.
require(ggplot2)
## Loading required package: ggplot2
require(datasets)
head(trees)
## Girth Height Volume
## 1 8.3 70 10.3
## 2 8.6 65 10.3
## 3 8.8 63 10.2
## 4 10.5 72 16.4
## 5 10.7 81 18.8
## 6 10.8 83 19.7
p = ggplot(trees, aes(x = Girth, y = Volume)) + geom_point(col = "#336600",
pch = 8) + xlab("Girth (In)") + ylab("Volume") + theme(panel.grid.minor = element_line(color = "#999999",
linetype = "dotted"))
p
p = p + geom_smooth(method = lm, se = FALSE, col = "blue")
p
c) Determine the sum of squared residuals.
p = p + geom_segment(aes(y = predict(lm(Volume ~ Girth)), yend = Volume, x = Girth,
xend = Girth), col = "red")
p
sum(residuals(lm(trees$Volume ~ trees$Girth))^2)
## [1] 524.3025
g = (trees$Girth^2)
g
## [1] 68.89 73.96 77.44 110.25 114.49 116.64 121.00 121.00 123.21 125.44
## [11] 127.69 129.96 129.96 136.89 144.00 166.41 166.41 176.89 187.69 190.44
## [21] 196.00 201.64 210.25 256.00 265.69 299.29 306.25 320.41 324.00 324.00
## [31] 424.36
p2 = ggplot(trees, aes(x = g, y = Volume)) + geom_point(col = "purple", pch = 8) +
xlab("Girth Squared (In)") + ylab("Volume") + theme(panel.grid.minor = element_line(color = "#999999",
linetype = "dotted"))
p2
p2 = p2 + geom_smooth(method = lm, se = FALSE, col = "blue")
p2
p2 = p2 + geom_segment(aes(y = predict(lm(Volume ~ g)), yend = Volume, x = g,
xend = g), col = "#FF3366")
p2
sum(residuals(lm(trees$Volume ~ g))^2)
## [1] 329.3191
The second model appears to be a better model because of the residual based values are smaller than other models. It seems that that this model seems to fit better over all.
2 The data set USmelanoma (HSAUR2 package) contains male mortality counts per one million inhabitants by state along with the latitude and longitude centroid of the state.
require(HSAUR2)
## Loading required package: HSAUR2
## Loading required package: tools
mel = USmelanoma
head(mel)
## mortality latitude longitude ocean
## Alabama 219 33.0 87.0 yes
## Arizona 160 34.5 112.0 no
## Arkansas 170 35.0 92.5 no
## California 182 37.5 119.5 yes
## Colorado 149 39.0 105.5 no
## Connecticut 159 41.8 72.8 yes
pm = ggplot(mel, aes(x = latitude, y = mortality)) + geom_point(col = "red",
pch = 15) + xlab("Latitude (Degrees)") + ylab("Mortality") + theme_bw()
pm
pm = pm + geom_smooth(method = lm, se = FALSE, col = "blue")
pm
model = lm(mortality ~ latitude, data = mel)
summary(model)
##
## Call:
## lm(formula = mortality ~ latitude, data = mel)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.972 -13.185 0.972 12.006 43.938
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 389.1894 23.8123 16.34 < 2e-16 ***
## latitude -5.9776 0.5984 -9.99 3.31e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.12 on 47 degrees of freedom
## Multiple R-squared: 0.6798, Adjusted R-squared: 0.673
## F-statistic: 99.8 on 1 and 47 DF, p-value: 3.309e-13
It seems that melanoma cases appears to decrease according to the slope coefficient
resid(model)
## Alabama Arizona Arkansas
## 27.0726285 -22.9609178 -9.9721000
## California Colorado Connecticut
## 16.9719894 -7.0615570 19.6758231
## Delaware District of Columbia Florida
## 43.9384430 20.9384430 -24.8155502
## Georgia Idaho Illinois
## 22.0726285 -7.1845604 -26.0839213
## Indiana Iowa Kansas
## -20.8883941 -8.9331226 6.9496251
## Kentucky Louisiana Maine
## -16.2347199 -12.6871158 -2.0002154
## Maryland Massachusetts Michigan
## 5.9384430 6.0668774 -12.1621961
## Minnesota Mississippi Missouri
## 1.7818932 13.8771014 -28.0503749
## Montana Nebraska Nevada
## 0.7595290 -19.1174676 34.9384430
## New Hampshire New Jersey New Mexico
## 1.6310946 10.1116059 -38.9721000
## New York North Carolina North Dakota
## 19.8489860 22.0167179 9.7483468
## Ohio Oklahoma Oregon
## -17.8883941 5.0167179 9.8266217
## Pennsylvania Rhode Island South Carolina
## -13.3018127 -2.3241769 -9.1452629
## South Dakota Tennessee Texas
## -35.3912697 12.0055358 28.1061749
## Utah Vermont Virginia
## -11.0727391 26.8266217 0.9719894
## Washington West Virginia Wisconsin
## 11.7483468 -21.2570841 -13.1845604
## Wyoming
## 1.8489860
SSE = sum(resid(model)^2)
SSE
## [1] 17173.07
ggplot(USmelanoma, aes(x = cut(latitude, 4), y = mortality)) + geom_boxplot() +
xlab("Latitude") + ylab("Mortality")
require(sm)
## Loading required package: sm
## Package 'sm', version 2.2-5.4: type help(sm) for summary information
r = residuals(lm(mortality ~ latitude, data = USmelanoma))
sm.density(r, xlab = "Model Residuals", model = "Normal")
qqnorm(r)
qqline(r)
The data is normally distributed. and appears to
3 Consider the endometrialcancer.txt file from https://uploads.strikinglycdn.com/files/302190/ec394e5c-bc2b-4e03-be04-b58eb790f728/endometrialcancer.txt. The file contains information on a study of endometrial cance.
library(msm)
ec <- read.table("https://uploads.strikinglycdn.com/files/302190/ec394e5c-bc2b-4e03-be04-b58eb790f728/endometrialcancer.txt", header = TRUE)
summary(ec)
## d gall hyp ob duration
## Min. :0.0000 No :214 No :152 No : 92 Min. : 0.00
## 1st Qu.:0.0000 Yes: 34 Yes: 96 Yes:156 1st Qu.: 0.00
## Median :0.0000 Median : 4.50
## Mean :0.2016 Mean :26.18
## 3rd Qu.:0.0000 3rd Qu.:48.00
## Max. :1.0000 Max. :96.00
## age
## Min. :57.00
## 1st Qu.:66.00
## Median :71.00
## Mean :70.92
## 3rd Qu.:75.00
## Max. :83.00
mod = lm(d ~ duration + hyp + gall + ob + age, data = ec)
summary(mod)
##
## Call:
## lm(formula = d ~ duration + hyp + gall + ob + age, data = ec)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.63507 -0.19197 -0.12196 -0.03019 0.97709
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.1953300 0.2860506 -0.683 0.49535
## duration 0.0028831 0.0006985 4.127 5.05e-05 ***
## hypYes 0.0167825 0.0513352 0.327 0.74401
## gallYes 0.1892853 0.0713971 2.651 0.00855 **
## obYes 0.0962666 0.0504864 1.907 0.05773 .
## age 0.0032217 0.0040092 0.804 0.42243
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3802 on 242 degrees of freedom
## Multiple R-squared: 0.1237, Adjusted R-squared: 0.1056
## F-statistic: 6.83 on 5 and 242 DF, p-value: 5.566e-06
mod2 <- glm(d ~ duration + hyp + gall + ob + age, data = ec, family = "binomial")
summary(mod2)
##
## Call:
## glm(formula = d ~ duration + hyp + gall + ob + age, family = "binomial",
## data = ec)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6067 -0.6128 -0.5079 -0.3665 2.3636
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.085605 2.136456 -1.912 0.055834 .
## duration 0.016686 0.004403 3.790 0.000151 ***
## hypYes 0.126535 0.355881 0.356 0.722174
## gallYes 1.013917 0.424989 2.386 0.017044 *
## obYes 0.731593 0.383289 1.909 0.056297 .
## age 0.020267 0.029735 0.682 0.495497
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 249.30 on 247 degrees of freedom
## Residual deviance: 220.57 on 242 degrees of freedom
## AIC: 232.57
##
## Number of Fisher Scoring iterations: 4
step(mod2)
## Start: AIC=232.57
## d ~ duration + hyp + gall + ob + age
##
## Df Deviance AIC
## - hyp 1 220.70 230.70
## - age 1 221.04 231.04
## <none> 220.57 232.57
## - ob 1 224.46 234.46
## - gall 1 226.00 236.00
## - duration 1 234.98 244.98
##
## Step: AIC=230.7
## d ~ duration + gall + ob + age
##
## Df Deviance AIC
## - age 1 221.32 229.32
## <none> 220.70 230.70
## - ob 1 224.77 232.77
## - gall 1 226.06 234.06
## - duration 1 235.44 243.44
##
## Step: AIC=229.32
## d ~ duration + gall + ob
##
## Df Deviance AIC
## <none> 221.32 229.32
## - ob 1 225.67 231.67
## - gall 1 226.65 232.65
## - duration 1 235.49 241.49
##
## Call: glm(formula = d ~ duration + gall + ob, family = "binomial",
## data = ec)
##
## Coefficients:
## (Intercept) duration gallYes obYes
## -2.60365 0.01623 0.99717 0.76614
##
## Degrees of Freedom: 247 Total (i.e. Null); 244 Residual
## Null Deviance: 249.3
## Residual Deviance: 221.3 AIC: 229.3
Estrogen appears to incresae the likely hood of devloping cancer
mod2 <- glm(d ~ duration + hyp + gall + ob + age, data = ec, family = "binomial")
summary(mod2)
##
## Call:
## glm(formula = d ~ duration + hyp + gall + ob + age, family = "binomial",
## data = ec)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6067 -0.6128 -0.5079 -0.3665 2.3636
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.085605 2.136456 -1.912 0.055834 .
## duration 0.016686 0.004403 3.790 0.000151 ***
## hypYes 0.126535 0.355881 0.356 0.722174
## gallYes 1.013917 0.424989 2.386 0.017044 *
## obYes 0.731593 0.383289 1.909 0.056297 .
## age 0.020267 0.029735 0.682 0.495497
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 249.30 on 247 degrees of freedom
## Residual deviance: 220.57 on 242 degrees of freedom
## AIC: 232.57
##
## Number of Fisher Scoring iterations: 4
predpro <- predict(mod2, type = c("response"))
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roc_curve <- roc(ec$d~predpro)
plot(roc_curve)
##
## Call:
## roc.formula(formula = ec$d ~ predpro)
##
## Data: predpro in 198 controls (ec$d 0) < 50 cases (ec$d 1).
## Area under the curve: 0.7532
The area under the curve is 0.7532 The model shows that there is a chance that a woman with cancer has a chance of getting chancer 50 percent of the time.
lfitduration <- loess(ec$d ~ ec$duration)
lfitage <- loess(ec$d ~ ec$age)
ec$lfitduration <- log(predict(lfitduration)/(1-predict(lfitduration)))
ec$lfitage <- log(predict(lfitage)/(1-predict(lfitage)))
library(ggplot2)
ggplot(ec, aes(x = duration, y = lfitduration)) + geom_point() + geom_smooth(method = "loess")
ggplot(ec, aes(x = age, y = lfitage)) + geom_point() + geom_smooth(method = "loess")
pchisq(220, 57, 242, lower.tail = FALSE)
## [1] 0.9948817
The linear realtionship of the model is in question I would argue the accuracy of the less relible that other tests. Other test fail to the reject the null hepothesis based on the P Value.