rko <- read.fwf("http://www.stat.ufl.edu/~winner/data/rko_film_19301941.dat",
header=F, widths=c(25,rep(8,7),7,9,7,7),
col.names = c("film","re.release","prodCost","domRev","forRev","totRev",
"profit","distCost","distCost_Rev","distCost_prodCost",
"year","proft_totCost"))
head(rko); tail(rko)
## film re.release prodCost domRev forRev totRev profit
## 1 STREET GIRL 0 211 806 198 1004 500
## 2 VAGABOND LOVER 0 204 671 85 756 335
## 3 SAINT IN NEW YO 0 128 350 310 460 195
## 4 BACHELOR MOTHER 0 509 1170 805 1975 827
## 5 TOP HAT 0 609 1782 1420 3202 1325
## 6 LITTLE WOMEN[*] 1 424 1337 663 2000 800
## distCost distCost_Rev distCost_prodCost year proft_totCost
## 1 293 0.29 2 1.389 193 0 99
## 2 217 0.28 7 1.064 193 0 79
## 3 137 0.29 8 1.070 193 8 73
## 4 639 0.32 4 1.255 193 9 72
## 5 1268 0.39 6 2.082 193 6 70
## 6 776 0.38 8 1.830 193 4 66
## film re.release prodCost domRev forRev totRev profit
## 150 GAY DIPLOMAT 0 184 96 35 131 -115
## 151 HITTING A NEW H 0 727 305 183 488 -431
## 152 TWO ALONE 0 236 125 39 164 -158
## 153 WOMAN COMANDS 0 415 186 56 242 -265
## 154 ABE LINCOLN IN 0 1004 535 131 666 -740
## 155 ENCHANTED APRIL 0 346 127 38 165 -260
## distCost distCost_Rev distCost_prodCost year proft_totCost
## 150 62 0.47 3 0.337 193 1 -46
## 151 192 0.39 3 0.264 193 8 -46
## 152 86 0.52 4 0.364 193 4 -49
## 153 92 0.38 0 0.222 193 2 -52
## 154 402 0.60 4 0.400 194 0 -52
## 155 79 0.47 9 0.228 193 5 -61
# Create data frame that removes re-releases
rko1 <- rko[rko$re.release == 0,]
attach(rko1)
# Create Total Costs variable
totCost <- prodCost + distCost
# Plot Total Revenue vs Total Cost (using pch=16, cex=0.7 makes easier viewing than defaults)
plot(totRev ~ totCost, pch=16, cex=0.7, col="purple")
# (b) The graph produced does appear to be approximately linear.
# Fit simple linear regression
part1.mod1 <- lm (totRev ~ totCost)
summary(part1.mod1)
##
## Call:
## lm(formula = totRev ~ totCost)
##
## Residuals:
## Min 1Q Median 3Q Max
## -873.15 -148.97 17.42 97.99 1134.11
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -39.18815 43.39388 -0.903 0.368
## totCost 1.12258 0.04716 23.806 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 282.5 on 140 degrees of freedom
## Multiple R-squared: 0.8019, Adjusted R-squared: 0.8005
## F-statistic: 566.7 on 1 and 140 DF, p-value: < 2.2e-16
# Plot data and regression line
plot(totRev ~ totCost, pch=16, cex=0.7, col="purple")
abline(part1.mod1, col="green2", lwd=2)
# Give residual plots from the regression fit (2x2 grid)
par(mfrow=c(2,2))
plot(part1.mod1)
par(mfrow=c(1,1))
# For the scale-location graph, it does appear to widen as the fitted values increase. In the Residuals vs. Leverage plot, we also see one particular value far away from the rest.
# Re-fit the model, with log(Y) as response and log(X) as predictor
part1.mod2 <- lm(log(totRev) ~ log(totCost))
summary(part1.mod2)
##
## Call:
## lm(formula = log(totRev) ~ log(totCost))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.35896 -0.21836 0.06384 0.22463 0.67495
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.02034 0.25667 0.079 0.937
## log(totCost) 0.99902 0.03984 25.075 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3705 on 140 degrees of freedom
## Multiple R-squared: 0.8179, Adjusted R-squared: 0.8166
## F-statistic: 628.8 on 1 and 140 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(part1.mod2)
par(mfrow=c(1,1))
# The model does not appear to be a high leverage case.
detach(rko1)
rm(list=ls(all=TRUE))
ssf <- read.csv("http://www.stat.ufl.edu/~winner/data/ship_speed_fuel.csv")
head(ssf); tail(ssf)
## ship_leg speed fuel
## 1 1 15.0 37
## 2 1 15.4 38
## 3 1 15.5 38
## 4 1 15.7 39
## 5 1 15.8 39
## 6 1 15.9 40
## ship_leg speed fuel
## 95 5 18.0 93
## 96 5 18.4 97
## 97 5 18.6 102
## 98 5 20.1 128
## 99 5 20.5 135
## 100 5 21.0 146
# Use only data from first ship leg
ssf1 <- ssf[ssf$ship_leg == 1,]
attach(ssf1)
# Plot fuel vs speed
plot(fuel ~ speed, pch=16, col="red")
# Appears to be positive and linear.
# Fit linear, quadratic, and cubic models
part2.mod1 <- lm(fuel ~ speed)
summary(part2.mod1)
##
## Call:
## lm(formula = fuel ~ speed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8991 -0.9786 -0.2604 1.0315 3.5556
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -89.9661 7.2224 -12.46 2.76e-10 ***
## speed 8.2274 0.4366 18.84 2.69e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.613 on 18 degrees of freedom
## Multiple R-squared: 0.9517, Adjusted R-squared: 0.9491
## F-statistic: 355 on 1 and 18 DF, p-value: 2.69e-13
part2.mod2 <- lm(fuel ~ speed + I(speed^2))
summary(part2.mod2)
##
## Call:
## lm(formula = fuel ~ speed + I(speed^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9670 -0.3237 0.0663 0.5322 1.0957
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 447.7778 73.2671 6.112 1.15e-05 ***
## speed -56.9518 8.8724 -6.419 6.35e-06 ***
## I(speed^2) 1.9701 0.2681 7.349 1.14e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8123 on 17 degrees of freedom
## Multiple R-squared: 0.9884, Adjusted R-squared: 0.9871
## F-statistic: 727.2 on 2 and 17 DF, p-value: < 2.2e-16
part2.mod3 <- lm(fuel ~ speed + I(speed^2) + I(speed^3))
summary(part2.mod3)
##
## Call:
## lm(formula = fuel ~ speed + I(speed^2) + I(speed^3))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.01750 -0.29470 0.08005 0.50618 1.13746
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 817.8328 1443.3227 0.567 0.579
## speed -124.4879 263.2095 -0.473 0.643
## I(speed^2) 6.0718 15.9782 0.380 0.709
## I(speed^3) -0.0829 0.3229 -0.257 0.801
##
## Residual standard error: 0.8356 on 16 degrees of freedom
## Multiple R-squared: 0.9885, Adjusted R-squared: 0.9863
## F-statistic: 458.2 on 3 and 16 DF, p-value: 1.02e-15
# Plot fits on single plot
speed.grid <- seq(floor(min(speed)-1), ceiling(max(speed)+1), 0.01)
plot(fuel ~ speed, pch=16, col="orange")
lines(speed.grid, predict(part2.mod1, list(speed=speed.grid)), col="blue", lwd=2)
lines(speed.grid, predict(part2.mod2, list(speed=speed.grid)), col="red", lwd=2)
lines(speed.grid, predict(part2.mod3, list(speed=speed.grid)), col="green", lwd=2)
legend("topleft", c("Data", "Linear", "Quadratic", "Cubic"), pch=c(16,NA,NA,NA),
col=c("orange", "blue", "red", "green"), lwd=c(NA,2,2,2))
# The linear model has an R^2 of 95.17%, the Quadratic has one of 98.84% and the cubic model has one of 98.5%.
summary(part2.mod1)$r.squared
## [1] 0.951746
detach(ssf1)
rm(list=ls(all=TRUE))
stc <- read.csv("http://www.stat.ufl.edu/~winner/data/sewingthread.csv")
head(stc); tail(stc)
## serNum thick stitchDen elong cotton sewThrdCns
## 1 1 1.44 3 3.8 1 12.50
## 2 2 1.44 4 3.8 1 13.15
## 3 3 1.44 5 3.8 1 13.70
## 4 4 2.88 3 3.8 1 15.10
## 5 5 2.88 4 3.8 1 16.30
## 6 6 2.88 5 3.8 1 17.48
## serNum thick stitchDen elong cotton sewThrdCns
## 43 19 4.92 3 17.7 0 17.90
## 44 20 4.92 4 17.7 0 18.88
## 45 21 4.92 5 17.7 0 20.28
## 46 22 6.56 3 17.7 0 21.86
## 47 23 6.56 4 17.7 0 22.94
## 48 24 6.56 5 17.7 0 24.32
attach(stc)
# Plots of sewThrdCns vs thick, stitchDen, as.factor(cotton)
par(mfrow=c(1,3))
plot(sewThrdCns ~ thick, pch=16, col="purple")
plot(sewThrdCns ~ stitchDen, pch=16, col="blue")
plot(sewThrdCns ~ as.factor(cotton), col="orange")
par(mfrow=c(1,1))
# Fit additive model
part3.mod1 <- lm(sewThrdCns ~ thick + stitchDen + cotton)
summary(part3.mod1)
##
## Call:
## lm(formula = sewThrdCns ~ thick + stitchDen + cotton)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.81378 -0.33065 -0.09975 0.27436 1.29700
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.95618 0.41220 12.024 1.69e-15 ***
## thick 2.10088 0.04255 49.376 < 2e-16 ***
## stitchDen 1.10031 0.09085 12.111 1.32e-15 ***
## cotton 0.94417 0.14836 6.364 9.83e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5139 on 44 degrees of freedom
## Multiple R-squared: 0.9835, Adjusted R-squared: 0.9824
## F-statistic: 875.1 on 3 and 44 DF, p-value: < 2.2e-16
# Fit model with all 2-Way interactions
part3.mod2 <- lm(sewThrdCns ~ thick + stitchDen + cotton +
I(thick*stitchDen) + I(thick*cotton) + I(stitchDen*cotton))
summary(part3.mod2)
##
## Call:
## lm(formula = sewThrdCns ~ thick + stitchDen + cotton + I(thick *
## stitchDen) + I(thick * cotton) + I(stitchDen * cotton))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.84505 -0.27873 0.01979 0.30960 0.72236
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.74071 0.79196 9.774 2.86e-12 ***
## thick 1.43639 0.17420 8.245 3.06e-10 ***
## stitchDen 0.53909 0.19130 2.818 0.007403 **
## cotton -0.58760 0.65113 -0.902 0.372101
## I(thick * stitchDen) 0.13108 0.04184 3.133 0.003193 **
## I(thick * cotton) 0.28033 0.06833 4.103 0.000189 ***
## I(stitchDen * cotton) 0.11312 0.14590 0.775 0.442569
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4127 on 41 degrees of freedom
## Multiple R-squared: 0.9901, Adjusted R-squared: 0.9886
## F-statistic: 683.2 on 6 and 41 DF, p-value: < 2.2e-16
# Test whether interaction effects are 0
anova(part3.mod1 , part3.mod2)
## Analysis of Variance Table
##
## Model 1: sewThrdCns ~ thick + stitchDen + cotton
## Model 2: sewThrdCns ~ thick + stitchDen + cotton + I(thick * stitchDen) +
## I(thick * cotton) + I(stitchDen * cotton)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 44 11.6217
## 2 41 6.9818 3 4.6399 9.0824 9.876e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
predict(part3.mod2, list(thick= 6.56, stitchDen = 4, cotton = 1))
## 1
## 24.46324
predict(part3.mod2, list(thick= 6.56, stitchDen = 4, cotton = 0))
## 1
## 22.75938
{Part3.end} detach(stc) rm(list=ls(all=TRUE))