Page 121: #1

1. The following table gives the elongation e in inches per inch (in./in.) for a given stress S on a steel wire measured in pounds per square inch (lb/in.^2). Test the model e = c1 S by plotting the data. Estimate c1 graphically.

S (×10^-3): 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100
e (×10^5):  0, 19, 57, 94, 134, 173, 216, 256, 297, 343, 390

library(ggplot2)
library(reshape2)

# stress and elongation data (in the scaled units of the table)
s <- c(5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100)
e <- c(0, 19, 57, 94, 134, 173, 216, 256, 297, 343, 390)
df <- data.frame(s, e)

# scatterplot of elongation against stress
p <- ggplot(data = df, aes(x = s, y = e)) +
  geom_point()
p <- p + labs(title = "Elongation for a given stress", x = "S", y = "e")
p

# graphical estimate of c1: slope of the chord from the first to the last data point
n <- length(e)
c1 <- (e[n] - e[1])/(s[n] - s[1])
c1
## [1] 4.105263

So the graphical estimate is c1 ≈ 4.1 in the scaled units of the table.
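As a check on the graphical estimate, c1 can also be fit by least squares through the origin; a quick sketch using the s and e vectors defined above:

# least-squares estimate of c1 for the no-intercept model e = c1*s
fit_origin <- lm(e ~ s - 1)
coef(fit_origin)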

Page 121: #2.a

2. For each of the following data sets, formulate the mathematical model that minimizes the largest deviation between the data and the line y = ax + b. If a computer is available, solve for the estimates of a and b.

x<- c(1.0, 2.3, 3.7, 4.2, 6.1, 7.0)
y<- c(3.6, 3.0, 3.2, 5.1, 5.3, 6.8)

Using the Chebyshev approximation criterion, the model minimizes the largest residual r subject to the constraints:

r − 3.6 + 1.0a + b ≥ 0

r + 3.6 − 1.0a − b ≥ 0

r − 3.0 + 2.3a + b ≥ 0

r + 3.0 − 2.3a − b ≥ 0

r − 3.2 + 3.7a + b ≥ 0

r + 3.2 − 3.7a − b ≥ 0

r − 5.1 + 4.2a + b ≥ 0

r + 5.1 − 4.2a − b ≥ 0

r − 5.3 + 6.1a + b ≥ 0

r + 5.3 − 6.1a − b ≥ 0

r − 6.8 + 7.0a + b ≥ 0

r + 6.8 − 7.0a − b ≥ 0
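This minimax formulation is a linear program, so the Chebyshev estimates can be computed directly; below is a minimal sketch using the lpSolve package (assumed installed) and the x and y vectors above. Note that lp() treats its decision variables as nonnegative, which is acceptable here since r, a, and b all come out positive for these data.

# Chebyshev (minimax) fit of y = ax + b via linear programming
library(lpSolve)
obj <- c(1, 0, 0)                 # decision variables are (r, a, b); minimize r
con <- rbind(cbind(1,  x,  1),    # r + a*x_i + b >= y_i
             cbind(1, -x, -1))    # r - a*x_i - b >= -y_i
fit_lp <- lp("min", obj, con, rep(">=", 2 * length(x)), c(y, -y))
fit_lp$solution                   # optimal (r, a, b)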

# estimates of a and b via ordinary least squares
lm1 <- lm(y ~ x)
lm1
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##      2.2149       0.5642

Slope a = 0.5642, intercept b = 2.2149.

Plotting

df2 <- data.frame(x, y)
df2$fx <- 0.5642 * df2$x + 2.2149   # fitted line using the estimated coefficients

# plot the data together with the fitted line
p <- ggplot(data = df2, aes(x = x)) +
  geom_point(aes(y = y)) +
  geom_line(aes(y = fx))
p <- p + labs(title = "y = ax + b", x = "x", y = "y")
p

Page 127: #10

Fit the model y = ax^(3/2) to the planetary data below.

For a power curve of the form \(y = ax^n\), the least-squares estimate of a is given by:

\[ a = \frac{\Sigma x_i^n y_i}{\Sigma x_i^{2n}} \]

Where \(n = \frac{3}{2}\)

Planets <- data.frame(Body = c("Mercury", "Venus", "Earth", "Mars", "Jupiter", "Saturn", "Uranus", "Neptune"),
                      Period = c(7.6e6, 1.94e7, 3.16e7, 5.94e7, 3.74e8, 9.35e8, 2.64e9, 5.22e9),
                      Distance = c(5.79e10, 1.08e11, 1.5e11, 2.28e11, 7.79e11, 1.43e12, 2.87e12, 4.5e12))
Planets
##      Body   Period Distance
## 1 Mercury 7.60e+06 5.79e+10
## 2   Venus 1.94e+07 1.08e+11
## 3   Earth 3.16e+07 1.50e+11
## 4    Mars 5.94e+07 2.28e+11
## 5 Jupiter 3.74e+08 7.79e+11
## 6  Saturn 9.35e+08 1.43e+12
## 7  Uranus 2.64e+09 2.87e+12
## 8 Neptune 5.22e+09 4.50e+12
# least-squares estimate of a for the model Distance = a * Period^(3/2)
a <- sum(Planets$Period^(3/2)*Planets$Distance)/sum(Planets$Period^(3))
a
## [1] 0.01320756
Planets$Distance_fit <- a*Planets$Period^(3/2)

p_fit <- ggplot(data = Planets, aes(x = Period)) +
  geom_point(aes(y = Distance)) +
  geom_line(aes(y = Distance_fit))
p_fit <- p_fit + labs(title = "Fitting the power curve with n = 3/2", x = "Period", y = "Distance")

p_fit
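As a rough supplementary check on the assumed exponent, the data can also be fit on a log-log scale, where a power law is a straight line; the slope in the sketch below is the exponent a free fit would choose, to be compared with 3/2.

# log-log fit: the slope estimates the exponent directly
loglog <- lm(log(Distance) ~ log(Period), data = Planets)
coef(loglog)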

Page 136: #7

# fish lengths and corresponding weights
l <- c(14.5,12.5,17.25,14.5,12.625,17.75,14.125,12.625)
w <- c(27,17,41,26,17,49,23,16)

Model 1

\[ w = { kl^3} \]

\[ k = \frac{\Sigma l_i^3 w_i}{\Sigma l_i^{6}} \]

k <- sum(l^3 * w)/sum(l^6)
model1<- k*l^3
resi1 <- w - model1
sse1 <- sum(resi1^2)
sse1
## [1] 12.16834

Model 2

\[ w = { klg^2} \]

\[ k2 = \frac{\Sigma l_ig_i^2 w_i}{\Sigma l_i^{2}g_i^{4}} \]

# fish girths
g <- c(9.75,8.375,11.0,9.75,8.5,12.5,9.0,8.5)
k2 <- sum(l*g^2*w)/sum(l^2*g^4)
model2 <- k2*l*g^2
resi2 <- w - model2
sse2 <- sum(resi2^2)
sse2
## [1] 17.6711
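Besides the sums of squares, the largest absolute deviation of each model can be compared as a supplementary indicator:

# largest absolute deviation for each model
c(model1 = max(abs(resi1)), model2 = max(abs(resi2)))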

I would prefer model 1 over model 2, since its sum of squared errors is lower (12.17 vs. 17.67).

Page 146: #5

population <- c(341948, 1092759, 5491, 49375, 1340000,365, 2500, 78200, 867023, 14000,
                23700, 70700, 304500, 138000, 2602000)
mean_velocity <- c(4.81, 5.88, 3.31, 4.90, 5.62,2.76, 2.27, 3.85, 5.21, 3.70,
                   3.27, 4.31, 4.42, 4.39, 5.05)
o <- order(population)
population <- population[o]
mean_velocity <- mean_velocity[o]   # keep the (population, velocity) pairs aligned after sorting
# fit model: V=C P^a
lm1 <- lm(log(mean_velocity)~log(population))
plot(log(population), log(mean_velocity))
abline(lm1)

c <- exp(lm1$coefficients[1])
a <- lm1$coefficients[2]
#  non-transformed data
plot(population, mean_velocity)
points(population,c*population^a, type="l")

Second model: V = m log(P) + b

lm2 <- lm(mean_velocity~log(population))
m <- lm2$coefficients[2]
b <- lm2$coefficients[1]
plot(log(population), mean_velocity)
abline(lm2)

#  non-transformed data
plot(population, mean_velocity)
points(population, m*log(population) + b, type="l")

Compare the errors with Problem 4

residuals_1 <- abs(mean_velocity - c * population^a)
residuals_1
##  [1] 0.4226323 1.5916764 0.9384696 0.6984559 1.4446208 1.3791700 1.8515738
##  [8] 0.2666454 1.1210082 0.3507694 0.7751977 0.3092196 0.4301904 0.4098357
## [15] 1.1010562
residuals_2 <- abs(mean_velocity - (m * log(population) + b))
residuals_2
plot(population, mean_velocity)
points(population, c * population^a, type="l", col="blue")
points(population, m * log(population) + b, type="l", col="red")

# sum of squares
ss_model1 <- sum(residuals_1^2)
ss_model1
## [1] 15.21068
ss_model2 <- sum(residuals_2^2)
ss_model2

Whichever model gives the smaller sum of squared deviations on the original data is preferred; the fitted power model here is V = 1.397P^0.096.

Page 157: #4

In the following data, X represents the diameter of a Ponderosa Pine measured at breast height, and Y is a measure of volume (number of board feet divided by 10). Make a scatterplot of the data.

ponderosa <- data.frame(x = c(17, 19, 20, 22, 23, 25, 31, 32, 33, 36, 37, 38, 39, 41),
               y = c(19, 25, 32, 51, 57, 71, 141, 123, 187, 192, 205,252,248,294))
summary(lm(y ~ poly(x, 13, raw=TRUE), data=ponderosa))
## 
## Call:
## lm(formula = y ~ poly(x, 13, raw = TRUE), data = ponderosa)
## 
## Residuals:
##        1        2        3        4        5        6        7        8 
##  -0.0777   1.2350  -2.4560   3.2869  -1.7901  -0.6550  13.4945 -29.2786 
##        9       10       11       12       13       14 
##  18.3321   0.6008  -9.5549  10.2363  -3.5314   0.1583 
## 
## Coefficients: (3 not defined because of singularities)
##                             Estimate Std. Error t value Pr(>|t|)
## (Intercept)                1.670e+07  2.823e+07   0.592    0.596
## poly(x, 13, raw = TRUE)1  -6.202e+06  1.031e+07  -0.602    0.590
## poly(x, 13, raw = TRUE)2   1.018e+06  1.666e+06   0.611    0.584
## poly(x, 13, raw = TRUE)3  -9.699e+04  1.563e+05  -0.621    0.579
## poly(x, 13, raw = TRUE)4   5.906e+03  9.383e+03   0.629    0.574
## poly(x, 13, raw = TRUE)5  -2.380e+02  3.732e+02  -0.638    0.569
## poly(x, 13, raw = TRUE)6   6.309e+00  9.774e+00   0.646    0.565
## poly(x, 13, raw = TRUE)7  -1.038e-01  1.590e-01  -0.653    0.561
## poly(x, 13, raw = TRUE)8   8.663e-04  1.315e-03   0.659    0.557
## poly(x, 13, raw = TRUE)9          NA         NA      NA       NA
## poly(x, 13, raw = TRUE)10 -4.741e-08  7.078e-08  -0.670    0.551
## poly(x, 13, raw = TRUE)11         NA         NA      NA       NA
## poly(x, 13, raw = TRUE)12  2.555e-12  3.768e-12   0.678    0.546
## poly(x, 13, raw = TRUE)13         NA         NA      NA       NA
## 
## Residual standard error: 23.14 on 3 degrees of freedom
## Multiple R-squared:  0.9862, Adjusted R-squared:  0.9401 
## F-statistic:  21.4 on 10 and 3 DF,  p-value: 0.01419
coefficients <- as.vector(lm(y ~ poly(x, 13, raw=TRUE), data=ponderosa)$coefficients)

coefficients 
##  [1]  1.670123e+07 -6.202165e+06  1.018156e+06 -9.698892e+04  5.906312e+03
##  [6] -2.380262e+02  6.309290e+00 -1.037739e-01  8.662540e-04            NA
## [11] -4.740889e-08            NA  2.554667e-12            NA
ggplot(ponderosa) + geom_point(aes(x=x, y=y)) + geom_line(aes(x=x, y=
  coefficients[1] + coefficients[2]*x + coefficients[3]*x^2 +
    coefficients[4]*x^3 + coefficients[5]*x^4 + coefficients[6]*x^5 + 
    coefficients[7]*x^6 + coefficients[8]*x^7 + coefficients[9]*x^8 +
    coefficients[11]*x^10 +
    coefficients[13]*x^12))

A 13th-degree polynomial would pass through every one of the 14 data points; it is an example of overfitting.
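For contrast, a low-order polynomial captures the overall trend with far fewer coefficients; the quadratic below is only an illustrative choice, not something the problem asks for.

# low-order alternative: quadratic fit to the same data
lm_quad <- lm(y ~ poly(x, 2, raw = TRUE), data = ponderosa)
summary(lm_quad)$r.squared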

Page 169: #11

Construct a scatterplot of the given data. Is there a trend in the data? Are any of the data points outliers? Construct a divided difference table. Is smoothing with a low-order polynomial appropriate? If so, choose an appropriate polynomial and fit using the least-squares criterion of best fit. Analyze the goodness of fit by examining appropriate indicators and graphing the model, the data points, and the deviations.

bass <- data.frame(length = c(12.5, 12.625, 14.125, 14.5, 17.25, 17.75),
                   weight = c(17, 16.5, 23, 26.5, 41, 49))

The data show a clear increasing trend; with only six observations it is difficult to identify outliers from the scatterplot below.

# Difference Table
bassdelta1 <- (tail(bass$weight, -1) - head(bass$weight, -1))/
  (tail(bass$length, -1) - head(bass$length, -1))

bassdelta2 <- (tail(bassdelta1, -1) - head(bassdelta1, -1))/
  (tail(bass$length, -2) - head(bass$length, -2))

bassdelta3 <- (tail(bassdelta2, -1) - head(bassdelta2, -1))/
  (tail(bass$length, -3) - head(bass$length, -3))

bassdelta4 <- (tail(bassdelta3, -1) - head(bassdelta3, -1))/
  (tail(bass$length, -4) - head(bass$length, -4))

bassdelta5 <- (tail(bassdelta4, -1) - head(bassdelta4, -1))/
  (tail(bass$length, -5) - head(bass$length, -5))

# pad the shorter difference columns so they fit in the data frame
# (the trailing zeros are placeholders, not actual divided differences)
bass$delta1 <- c(bassdelta1, 0)
bass$delta2 <- c(bassdelta2, rep(0,2))
bass$delta3 <- c(bassdelta3, rep(0,3))
bass$delta4 <- c(bassdelta4, rep(0,4))
bass$delta5 <- c(bassdelta5, rep(0,5))

bass
##   length weight    delta1    delta2     delta3     delta4     delta5
## 1 12.500   17.0 -4.000000  5.128205 -1.2307692 0.07857739 0.06406722
## 2 12.625   16.5  4.333333  2.666667 -0.8575266 0.41493031 0.00000000
## 3 14.125   23.0  9.333333 -1.299394  1.2689912 0.00000000 0.00000000
## 4 14.500   26.5  5.272727  3.300699  0.0000000 0.00000000 0.00000000
## 5 17.250   41.0 16.000000  0.000000  0.0000000 0.00000000 0.00000000
## 6 17.750   49.0  0.000000  0.000000  0.0000000 0.00000000 0.00000000
ggplot(bass, aes(x=length, y=weight)) + geom_point()

ggplot(bass, aes(x=length, y=delta1)) + geom_line()

ggplot(bass, aes(x=length, y=delta2)) + geom_line()

ggplot(bass, aes(x=length, y=delta3)) + geom_line()

ggplot(bass, aes(x=length, y=delta4)) + geom_line()

ggplot(bass, aes(x=length, y=delta5)) + geom_line()

Linear Model

bassmodel <- lm(weight ~ length, data=bass)

coefficients <- bassmodel$coefficients

bass$bassest <- coefficients[1] + coefficients[2]*bass$length

ggplot(bass) + geom_point(aes(x=length, y=weight)) + geom_line(aes(x=length, y=bassest))

Quadratic model

bassmodelquad <- lm(weight ~ poly(length, 2, raw=TRUE), data=bass)

coefficients <- bassmodelquad$coefficients

bass$bassestquad <- coefficients[1] + coefficients[2]*bass$length + coefficients[3]*bass$length^2

ggplot(bass) + geom_point(aes(x=length, y=weight)) + geom_line(aes(x=length, y=bassestquad))

Quartic model

bassmodelquart <- lm(weight ~ poly(length, 4, raw=TRUE), data=bass)

coefficients <- bassmodelquart$coefficients

bass$bassestquart <- coefficients[1] + coefficients[2]*bass$length + coefficients[3]*bass$length^2 +
  coefficients[4]*bass$length^3 + coefficients[5]*bass$length^4

ggplot(bass) + geom_point(aes(x=length, y=weight)) + geom_line(aes(x=length, y=bassestquart))

Residuals of the three models

residuals <- data.frame(linear = bassmodel$residuals, quadratic = bassmodelquad$residuals, 
                        quartic = bassmodelquart$residuals, datapoint = seq(1,6))


ggplot(melt(residuals, id="datapoint"), aes(x=datapoint, y=value, color=variable)) + geom_line()
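To put numbers on the comparison below, the sum of squared residuals of each model can be read directly off the fitted objects (a small supplementary check):

# sum of squared residuals for each of the three fits
sapply(list(linear = bassmodel, quadratic = bassmodelquad, quartic = bassmodelquart),
       function(m) sum(residuals(m)^2))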

The quadratic model gives the best fit, though with only six data points it may already be a case of overfitting.

Page 181: #5

# full data set: year and proportion
y <- c(1919, 1932, 1958, 1963, 1968, 1971, 1974, 1976, 1978,
       1981, 1985, 1988, 1991, 1995, 1999, 2001, 2002, 2006, 2007, 2008, 2009, 2012)
p <- c(0.02, 0.03, 0.04, 0.05, 0.06, 0.08, 0.10, 0.13, 0.15, 0.20,
       0.22, 0.25, 0.29, 0.32, 0.33, 0.34, 0.37, 0.39, 0.41, 0.42, 0.44, 0.45)
# keep only the observations from 1971 onward for the fit
y <- c(1971, 1974, 1976, 1978, 1981, 1985, 1988, 1991, 1995, 1999, 2001, 2002, 2006, 2007, 2008, 2009, 2012)
p <- c(0.08, 0.10, 0.13, 0.15, 0.20, 0.22, 0.25, 0.29, 0.32, 0.33, 0.34, 0.37, 0.39, 0.41, 0.42, 0.44, 0.45)
plot(y,p)

Taking log

logp = log(p)
plot(y,logp)

Taking square root

sqrtp = sqrt(p)
plot(y, sqrtp)

Linear regression

lm1 <- lm(p ~ y)
# fitted values from the estimated coefficients of lm1
p1 <- 0.008931 * y - 17.51354
plot(y, p, type = "p", col = "red")
lines(y, p1, col = "blue")

cbind(y,p1)
##          y       p1
##  [1,] 1971 0.089461
##  [2,] 1974 0.116254
##  [3,] 1976 0.134116
##  [4,] 1978 0.151978
##  [5,] 1981 0.178771
##  [6,] 1985 0.214495
##  [7,] 1988 0.241288
##  [8,] 1991 0.268081
##  [9,] 1995 0.303805
## [10,] 1999 0.339529
## [11,] 2001 0.357391
## [12,] 2002 0.366322
## [13,] 2006 0.402046
## [14,] 2007 0.410977
## [15,] 2008 0.419908
## [16,] 2009 0.428839
## [17,] 2012 0.455632

Of the options considered, the straight-line fit to the untransformed data gives the minimum error.