Part 1 - Revenues and Costs of RKO Movies 1930-1941

rko <- read.fwf("http://www.stat.ufl.edu/~winner/data/rko_film_19301941.dat",
                header=F, widths=c(25,rep(8,7),7,9,7,7),
                col.names = c("film","re.release","prodCost","domRev","forRev","totRev",
                              "profit","distCost","distCost_Rev","distCost_prodCost",
                              "year","proft_totCost"))
head(rko); tail(rko)
##                        film re.release prodCost domRev forRev totRev profit
## 1 STREET GIRL                        0      211    806    198   1004    500
## 2 VAGABOND LOVER                     0      204    671     85    756    335
## 3 SAINT IN NEW YO                    0      128    350    310    460    195
## 4 BACHELOR MOTHER                    0      509   1170    805   1975    827
## 5 TOP HAT                            0      609   1782   1420   3202   1325
## 6 LITTLE WOMEN[*]                    1      424   1337    663   2000    800
##   distCost distCost_Rev distCost_prodCost year proft_totCost
## 1      293         0.29         2   1.389  193       0    99
## 2      217         0.28         7   1.064  193       0    79
## 3      137         0.29         8   1.070  193       8    73
## 4      639         0.32         4   1.255  193       9    72
## 5     1268         0.39         6   2.082  193       6    70
## 6      776         0.38         8   1.830  193       4    66
##                          film re.release prodCost domRev forRev totRev profit
## 150 GAY DIPLOMAT                       0      184     96     35    131   -115
## 151 HITTING A NEW H                    0      727    305    183    488   -431
## 152 TWO ALONE                          0      236    125     39    164   -158
## 153 WOMAN COMANDS                      0      415    186     56    242   -265
## 154 ABE LINCOLN IN                     0     1004    535    131    666   -740
## 155 ENCHANTED APRIL                    0      346    127     38    165   -260
##     distCost distCost_Rev distCost_prodCost year proft_totCost
## 150       62         0.47         3   0.337  193       1   -46
## 151      192         0.39         3   0.264  193       8   -46
## 152       86         0.52         4   0.364  193       4   -49
## 153       92         0.38         0   0.222  193       2   -52
## 154      402         0.60         4   0.400  194       0   -52
## 155       79         0.47         9   0.228  193       5   -61
# Create data frame that removes re-releases
rko1 <- rko[rko$re.release ==  0,]
attach(rko1)

# Create Total Costs variable
totCost <- prodCost + distCost
  
# Plot Total Revenue vs Total Cost (using pch=16, cex=0.7 makes easier viewing than defaults)
plot(totRev ~ totCost, pch=16, cex=0.7, col="purple")

# (b) The graph produced does appear to be approximately linear. 

# Fit simple linear regression
part1.mod1 <- lm (totRev ~ totCost)
summary(part1.mod1)
## 
## Call:
## lm(formula = totRev ~ totCost)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -873.15 -148.97   17.42   97.99 1134.11 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -39.18815   43.39388  -0.903    0.368    
## totCost       1.12258    0.04716  23.806   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 282.5 on 140 degrees of freedom
## Multiple R-squared:  0.8019, Adjusted R-squared:  0.8005 
## F-statistic: 566.7 on 1 and 140 DF,  p-value: < 2.2e-16
# Plot data and regression line 
plot(totRev ~ totCost, pch=16, cex=0.7, col="purple")
abline(part1.mod1, col="green2", lwd=2)

# Give residual plots from the regression fit (2x2 grid)
par(mfrow=c(2,2))
plot(part1.mod1)

par(mfrow=c(1,1))

# For the scale-location graph, it does appear to widen as the fitted values increase. In the Residuals vs. Leverage plot, we also see one particular value far away from the rest. 

# Re-fit the model, with log(Y) as response and log(X) as predictor
part1.mod2 <- lm(log(totRev) ~ log(totCost))
summary(part1.mod2)
## 
## Call:
## lm(formula = log(totRev) ~ log(totCost))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.35896 -0.21836  0.06384  0.22463  0.67495 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.02034    0.25667   0.079    0.937    
## log(totCost)  0.99902    0.03984  25.075   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3705 on 140 degrees of freedom
## Multiple R-squared:  0.8179, Adjusted R-squared:  0.8166 
## F-statistic: 628.8 on 1 and 140 DF,  p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(part1.mod2)

par(mfrow=c(1,1))

# The model does not appear to be a high leverage case.
detach(rko1)
rm(list=ls(all=TRUE))

Part 2 - Fuel Consumption and Speed for Container Ships

ssf <- read.csv("http://www.stat.ufl.edu/~winner/data/ship_speed_fuel.csv")
head(ssf); tail(ssf)
##   ship_leg speed fuel
## 1        1  15.0   37
## 2        1  15.4   38
## 3        1  15.5   38
## 4        1  15.7   39
## 5        1  15.8   39
## 6        1  15.9   40
##     ship_leg speed fuel
## 95         5  18.0   93
## 96         5  18.4   97
## 97         5  18.6  102
## 98         5  20.1  128
## 99         5  20.5  135
## 100        5  21.0  146
# Use only data from first ship leg
ssf1 <- ssf[ssf$ship_leg == 1,]
attach(ssf1)

# Plot fuel vs speed
plot(fuel ~ speed, pch=16, col="red")

# Appears to be positive and linear.

# Fit linear, quadratic, and cubic models
part2.mod1 <- lm(fuel ~ speed)
summary(part2.mod1)
## 
## Call:
## lm(formula = fuel ~ speed)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8991 -0.9786 -0.2604  1.0315  3.5556 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -89.9661     7.2224  -12.46 2.76e-10 ***
## speed         8.2274     0.4366   18.84 2.69e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.613 on 18 degrees of freedom
## Multiple R-squared:  0.9517, Adjusted R-squared:  0.9491 
## F-statistic:   355 on 1 and 18 DF,  p-value: 2.69e-13
part2.mod2 <- lm(fuel ~ speed + I(speed^2))
summary(part2.mod2)
## 
## Call:
## lm(formula = fuel ~ speed + I(speed^2))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9670 -0.3237  0.0663  0.5322  1.0957 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 447.7778    73.2671   6.112 1.15e-05 ***
## speed       -56.9518     8.8724  -6.419 6.35e-06 ***
## I(speed^2)    1.9701     0.2681   7.349 1.14e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8123 on 17 degrees of freedom
## Multiple R-squared:  0.9884, Adjusted R-squared:  0.9871 
## F-statistic: 727.2 on 2 and 17 DF,  p-value: < 2.2e-16
part2.mod3 <- lm(fuel ~ speed + I(speed^2) + I(speed^3))
summary(part2.mod3)
## 
## Call:
## lm(formula = fuel ~ speed + I(speed^2) + I(speed^3))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.01750 -0.29470  0.08005  0.50618  1.13746 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept)  817.8328  1443.3227   0.567    0.579
## speed       -124.4879   263.2095  -0.473    0.643
## I(speed^2)     6.0718    15.9782   0.380    0.709
## I(speed^3)    -0.0829     0.3229  -0.257    0.801
## 
## Residual standard error: 0.8356 on 16 degrees of freedom
## Multiple R-squared:  0.9885, Adjusted R-squared:  0.9863 
## F-statistic: 458.2 on 3 and 16 DF,  p-value: 1.02e-15
# Plot fits on single plot
speed.grid <- seq(floor(min(speed)-1), ceiling(max(speed)+1), 0.01)
plot(fuel ~ speed, pch=16, col="orange")
lines(speed.grid, predict(part2.mod1, list(speed=speed.grid)), col="blue", lwd=2)
lines(speed.grid, predict(part2.mod2, list(speed=speed.grid)), col="red", lwd=2)
lines(speed.grid, predict(part2.mod3, list(speed=speed.grid)), col="green", lwd=2)
legend("topleft", c("Data", "Linear", "Quadratic", "Cubic"), pch=c(16,NA,NA,NA),
       col=c("orange", "blue", "red", "green"), lwd=c(NA,2,2,2))

# The linear model has an R^2 of 95.17%, the Quadratic has one of 98.84% and the cubic model has one of 98.5%. 

summary(part2.mod1)$r.squared
## [1] 0.951746
detach(ssf1)
rm(list=ls(all=TRUE))

Part 3: Sewing Thread Consumption

stc <- read.csv("http://www.stat.ufl.edu/~winner/data/sewingthread.csv")
head(stc); tail(stc)
##   serNum thick stitchDen elong cotton sewThrdCns
## 1      1  1.44         3   3.8      1      12.50
## 2      2  1.44         4   3.8      1      13.15
## 3      3  1.44         5   3.8      1      13.70
## 4      4  2.88         3   3.8      1      15.10
## 5      5  2.88         4   3.8      1      16.30
## 6      6  2.88         5   3.8      1      17.48
##    serNum thick stitchDen elong cotton sewThrdCns
## 43     19  4.92         3  17.7      0      17.90
## 44     20  4.92         4  17.7      0      18.88
## 45     21  4.92         5  17.7      0      20.28
## 46     22  6.56         3  17.7      0      21.86
## 47     23  6.56         4  17.7      0      22.94
## 48     24  6.56         5  17.7      0      24.32
attach(stc)

# Plots of sewThrdCns vs thick, stitchDen, as.factor(cotton)

par(mfrow=c(1,3))
plot(sewThrdCns ~ thick, pch=16, col="purple")
plot(sewThrdCns ~ stitchDen, pch=16, col="blue")
plot(sewThrdCns ~ as.factor(cotton),  col="orange")

par(mfrow=c(1,1))

# Fit additive model
part3.mod1 <- lm(sewThrdCns ~ thick + stitchDen + cotton)
summary(part3.mod1)
## 
## Call:
## lm(formula = sewThrdCns ~ thick + stitchDen + cotton)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.81378 -0.33065 -0.09975  0.27436  1.29700 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.95618    0.41220  12.024 1.69e-15 ***
## thick        2.10088    0.04255  49.376  < 2e-16 ***
## stitchDen    1.10031    0.09085  12.111 1.32e-15 ***
## cotton       0.94417    0.14836   6.364 9.83e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5139 on 44 degrees of freedom
## Multiple R-squared:  0.9835, Adjusted R-squared:  0.9824 
## F-statistic: 875.1 on 3 and 44 DF,  p-value: < 2.2e-16
# Fit model with all 2-Way interactions
part3.mod2 <- lm(sewThrdCns ~ thick + stitchDen + cotton + 
                   I(thick*stitchDen) + I(thick*cotton) + I(stitchDen*cotton))
summary(part3.mod2)
## 
## Call:
## lm(formula = sewThrdCns ~ thick + stitchDen + cotton + I(thick * 
##     stitchDen) + I(thick * cotton) + I(stitchDen * cotton))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.84505 -0.27873  0.01979  0.30960  0.72236 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            7.74071    0.79196   9.774 2.86e-12 ***
## thick                  1.43639    0.17420   8.245 3.06e-10 ***
## stitchDen              0.53909    0.19130   2.818 0.007403 ** 
## cotton                -0.58760    0.65113  -0.902 0.372101    
## I(thick * stitchDen)   0.13108    0.04184   3.133 0.003193 ** 
## I(thick * cotton)      0.28033    0.06833   4.103 0.000189 ***
## I(stitchDen * cotton)  0.11312    0.14590   0.775 0.442569    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4127 on 41 degrees of freedom
## Multiple R-squared:  0.9901, Adjusted R-squared:  0.9886 
## F-statistic: 683.2 on 6 and 41 DF,  p-value: < 2.2e-16
# Test whether interaction effects are 0
anova(part3.mod1 , part3.mod2)
## Analysis of Variance Table
## 
## Model 1: sewThrdCns ~ thick + stitchDen + cotton
## Model 2: sewThrdCns ~ thick + stitchDen + cotton + I(thick * stitchDen) + 
##     I(thick * cotton) + I(stitchDen * cotton)
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1     44 11.6217                                  
## 2     41  6.9818  3    4.6399 9.0824 9.876e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
predict(part3.mod2, list(thick= 6.56, stitchDen = 4, cotton = 1))
##        1 
## 24.46324
predict(part3.mod2, list(thick= 6.56, stitchDen = 4, cotton = 0))
##        1 
## 22.75938

{Part3.end} detach(stc) rm(list=ls(all=TRUE))