The data give stress S and elongation e:
S (×10^3): 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100
e (×10^5): 0, 19, 57, 94, 134, 173, 216, 256, 297, 343, 390
library(ggplot2)
library(reshape2)
s <- c(5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100)
e <- c(0, 19, 57, 94, 134, 173, 216, 256, 297, 343, 390)
df <- data.frame(s, e)
p <- ggplot(data = df, aes(x = s, y = e)) +
geom_point()
p <- p + labs(title = "Elongation for a given stress", x = "s", y = "e")
p
n <- length(e)
c1 <- (e[n] - e[1])/(s[n] - s[1])
c1
## [1] 4.105263
The slope of the line through the first and last data points gives the estimate c1 ≈ 4.105 for the proportionality model e = c1·S.
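As a cross-check, a least-squares estimate of the same proportionality constant (a sketch; this uses all of the data rather than only the endpoints):
# least-squares slope for the no-intercept model e = c*s
c_ls <- sum(s * e) / sum(s^2)
c_ls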
Page 121: #2.a
For each of the following data sets, formulate the mathematical model that minimizes the largest deviation between the data and the line y = ax + b. If a computer is available, solve for the estimates of a and b.
x<- c(1.0, 2.3, 3.7, 4.2, 6.1, 7.0)
y<- c(3.6, 3.0, 3.2, 5.1, 5.3, 6.8)
Using the Chebyshev approximation criterion, the mathematical model that minimizes the largest deviation r is: minimize r subject to
r − 3.6 + 1.0a + b ≥ 0
r + 3.6 − 1.0a − b ≥ 0
r − 3.0 + 2.3a + b ≥ 0
r + 3.0 − 2.3a − b ≥ 0
r − 3.2 + 3.7a + b ≥ 0
r + 3.2 − 3.7a − b ≥ 0
r − 5.1 + 4.2a + b ≥ 0
r + 5.1 − 4.2a − b ≥ 0
r − 5.3 + 6.1a + b ≥ 0
r + 5.3 − 6.1a − b ≥ 0
r − 6.8 + 7.0a + b ≥ 0
r + 6.8 − 7.0a − b ≥ 0
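This linear program can be solved directly. A minimal sketch, assuming the lpSolve package is available (lpSolve constrains variables to be nonnegative by default, which should be acceptable here since the least-squares estimates of a and b below are both positive):
library(lpSolve)
# minimize r subject to r + a*x_i + b >= y_i and r - a*x_i - b >= -y_i
obj <- c(1, 0, 0)                 # objective coefficients for (r, a, b)
con <- rbind(cbind(1,  x,  1),    # r + a*x_i + b >= y_i
             cbind(1, -x, -1))    # r - a*x_i - b >= -y_i
chb <- lp("min", obj, con, rep(">=", 2 * length(x)), c(y, -y))
chb$solution                      # optimal (r, a, b)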
# solve for the estimates of a and b (here via the least-squares method, for comparison)
lm1 <- lm(y ~ x)
lm1
##
## Call:
## lm(formula = y ~ x)
##
## Coefficients:
## (Intercept) x
## 2.2149 0.5642
The least-squares estimates are slope a = 0.5642 and intercept b = 2.2149.
Plotting
df2 <- data.frame(x, y)
df2$fx <- 0.5642 * df2$x + 2.2149
p <- ggplot(data = df2, aes(x = x)) +
geom_point(aes(y = y)) +
geom_line(aes(y = fx))
p <- p + labs(title = "y = ax + b", x = "x", y = "y")
p
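Since the problem asks for the line minimizing the largest deviation, the largest absolute residual of this least-squares line is worth recording for comparison (a sketch):
max(abs(df2$y - df2$fx))   # largest deviation of the least-squares line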
Page 127: #10
Fit the model y = ax^(3/2).
For a power curve of the form \(y = ax^n\), the least-squares estimate of a is given by:
\[ a = \frac{\Sigma x_i^n y_i}{\Sigma x_i^{2n}} \]
Where \(n = \frac{3}{2}\); the formula follows from setting the derivative of \(\Sigma (y_i - a x_i^n)^2\) with respect to \(a\) equal to zero.
Planets <- data.frame(Body = c("Mercury", "Venus", "Earth", "Mars", "Jupiter","Saturn", "Uranus", "Neptune"),
Period = c(7.6e6, 1.94e7, 3.16e7, 5.94e7, 3.74e8, 9.35e8, 2.64e9, 5.22e9),
Distance = c(5.79e10, 1.08e11, 1.5e11, 2.28e11, 7.79e11, 1.43e12,
2.87e12, 4.5e12))
Planets
## Body Period Distance
## 1 Mercury 7.60e+06 5.79e+10
## 2 Venus 1.94e+07 1.08e+11
## 3 Earth 3.16e+07 1.50e+11
## 4 Mars 5.94e+07 2.28e+11
## 5 Jupiter 3.74e+08 7.79e+11
## 6 Saturn 9.35e+08 1.43e+12
## 7 Uranus 2.64e+09 2.87e+12
## 8 Neptune 5.22e+09 4.50e+12
a <- sum(Planets$Period^(3/2)*Planets$Distance)/sum(Planets$Period^(3))
a
## [1] 0.01320756
Planets$Distance_fit <- a*Planets$Period^(3/2)
p_fit <- ggplot(data = Planets, aes(x = Period)) +
geom_point(aes(y = Distance)) +
geom_line(aes(y = Distance_fit))
p_fit <- p_fit + labs(title = "Fitting power curve, n = 3/2", x = "Period", y = "Distance")
p_fit
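As a quick check of the fit, the relative error at each planet (a sketch):
# relative error of the fitted power curve
with(Planets, (Distance_fit - Distance) / Distance)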
Page 136: #7
l <- c(14.5,12.5,17.25,14.5,12.625,17.75,14.125,12.625)
w <- c(27,17,41,26,17,49,23,16)
Model 1
\[ w = kl^3 \]
\[ k = \frac{\Sigma l_i^3 w_i}{\Sigma l_i^{6}} \]
k <- sum(l^3 * w)/sum(l^6)
model1<- k*l^3
resi1 <- w - model1
sse1 <- sum(resi1^2)
sse1
## [1] 12.16834
Model 2
\[ w = klg^2 \]
\[ k_2 = \frac{\Sigma l_i g_i^2 w_i}{\Sigma l_i^{2} g_i^{4}} \]
g <- c(9.75,8.375,11.0,9.75,8.5,12.5,9.0,8.5)
k2 <- sum(l*g^2*w)/sum(l^2*g^4)
model2 <- k2*l*g^2
resi2 <- w - model2
sse2 <- sum(resi2^2)
sse2
## [1] 17.6711
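As a complementary check, the largest absolute deviation of each model (a sketch):
c(model1 = max(abs(resi1)), model2 = max(abs(resi2)))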
I would prefer Model 1 over Model 2, since its sum of squared errors is lower (12.17 vs. 17.67).
Page 146: #5
population <- c(341948, 1092759, 5491, 49375, 1340000,365, 2500, 78200, 867023, 14000,
23700, 70700, 304500, 138000, 2602000)
mean_velocity <- c(4.81, 5.88, 3.31, 4.90, 5.62,2.76, 2.27, 3.85, 5.21, 3.70,
3.27, 4.31, 4.42, 4.39, 5.05)
# sort the data by population
o <- order(population)
population <- population[o]
mean_velocity <- mean_velocity[o]
# fit model: V=C P^a
lm1 <- lm(log(mean_velocity)~log(population))
plot(log(population), log(mean_velocity))
abline(lm1)
c <- exp(lm1$coefficients[1])
a <- lm1$coefficients[2]
# non-transformed data
plot(population, mean_velocity)
points(population,c*population^a, type="l")
# fit model: V = m*log(P) + b
lm2 <- lm(mean_velocity~log(population))
m <- lm2$coefficients[2]
b <- lm2$coefficients[1]
plot(log(population), mean_velocity)
abline(lm2)
# non-transformed data
plot(population, mean_velocity)
points(population, m*log(population) + b, type="l")
Compare the errors with those of Problem 4:
residuals_1 <- abs(mean_velocity - c * population^a)
residuals_1
## [1] 0.4226323 1.5916764 0.9384696 0.6984559 1.4446208 1.3791700 1.8515738
## [8] 0.2666454 1.1210082 0.3507694 0.7751977 0.3092196 0.4301904 0.4098357
## [15] 1.1010562
residuals_2 <- abs(mean_velocity - (m * log(population) + b))
residuals_2
## [1] 10.176928 11.372191 8.853413 10.504344 11.258614 8.446396 7.979767
## [8] 9.566330 10.963306 9.504828 9.082379 10.182949 10.308013 10.291291
## [15] 10.994493
plot(population, mean_velocity)
points(population, c * population^a, type="l", col="blue")
points(population, m * log(population) + b, type="l", col="red")
# sum of squares
ss_model1 <- sum(residuals_1^2)
ss_model1
## [1] 15.21068
ss_model2 <- sum(residuals_2^2)
ss_model2
## [1] 1504.673
By the sum-of-squares measure computed above, V = 1.397P^0.096 is the better model.
Page 157: #4
In the following data, X represents the diameter of a Ponderosa Pine measured at breast height, and Y is a measure of volume: the number of board feet divided by 10. Make a scatterplot of the data.
ponderosa <- data.frame(x = c(17, 19, 20, 22, 23, 25, 31, 32, 33, 36, 37, 38, 39, 41),
y = c(19, 25, 32, 51, 57, 71, 141, 123, 187, 192, 205,252,248,294))
summary(lm(y ~ poly(x, 13, raw=TRUE), data=ponderosa))
##
## Call:
## lm(formula = y ~ poly(x, 13, raw = TRUE), data = ponderosa)
##
## Residuals:
## 1 2 3 4 5 6 7 8
## -0.0777 1.2350 -2.4560 3.2869 -1.7901 -0.6550 13.4945 -29.2786
## 9 10 11 12 13 14
## 18.3321 0.6008 -9.5549 10.2363 -3.5314 0.1583
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.670e+07 2.823e+07 0.592 0.596
## poly(x, 13, raw = TRUE)1 -6.202e+06 1.031e+07 -0.602 0.590
## poly(x, 13, raw = TRUE)2 1.018e+06 1.666e+06 0.611 0.584
## poly(x, 13, raw = TRUE)3 -9.699e+04 1.563e+05 -0.621 0.579
## poly(x, 13, raw = TRUE)4 5.906e+03 9.383e+03 0.629 0.574
## poly(x, 13, raw = TRUE)5 -2.380e+02 3.732e+02 -0.638 0.569
## poly(x, 13, raw = TRUE)6 6.309e+00 9.774e+00 0.646 0.565
## poly(x, 13, raw = TRUE)7 -1.038e-01 1.590e-01 -0.653 0.561
## poly(x, 13, raw = TRUE)8 8.663e-04 1.315e-03 0.659 0.557
## poly(x, 13, raw = TRUE)9 NA NA NA NA
## poly(x, 13, raw = TRUE)10 -4.741e-08 7.078e-08 -0.670 0.551
## poly(x, 13, raw = TRUE)11 NA NA NA NA
## poly(x, 13, raw = TRUE)12 2.555e-12 3.768e-12 0.678 0.546
## poly(x, 13, raw = TRUE)13 NA NA NA NA
##
## Residual standard error: 23.14 on 3 degrees of freedom
## Multiple R-squared: 0.9862, Adjusted R-squared: 0.9401
## F-statistic: 21.4 on 10 and 3 DF, p-value: 0.01419
coefficients <- as.vector(lm(y ~ poly(x, 13, raw=TRUE), data=ponderosa)$coefficients)
coefficients
## [1] 1.670123e+07 -6.202165e+06 1.018156e+06 -9.698892e+04 5.906312e+03
## [6] -2.380262e+02 6.309290e+00 -1.037739e-01 8.662540e-04 NA
## [11] -4.740889e-08 NA 2.554667e-12 NA
ggplot(ponderosa) + geom_point(aes(x=x, y=y)) + geom_line(aes(x=x, y=
coefficients[1] + coefficients[2]*x + coefficients[3]*x^2 +
coefficients[4]*x^3 + coefficients[5]*x^4 + coefficients[6]*x^5 +
coefficients[7]*x^6 + coefficients[8]*x^7 + coefficients[9]*x^8 +
coefficients[11]*x^10 +
coefficients[13]*x^12))
A 13th-degree polynomial passes through every one of the 14 data points; it is an example of overfitting.
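For contrast, a sketch of a low-order fit (the choice of degree 2 is my own, for illustration; the problem only asks for the scatterplot):
quad <- lm(y ~ poly(x, 2, raw = TRUE), data = ponderosa)
ggplot(ponderosa, aes(x = x, y = y)) + geom_point() +
geom_line(aes(y = fitted(quad)))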
Page 169: #11
Construct a scatterplot of the given data. Is there a trend in the data? Are any of the data points outliers? Construct a divided difference table. Is smoothing with a low-order polynomial appropriate? If so, choose an appropriate polynomial and fit using the least-squares criterion of best fit. Analyze the goodness of fit by examining appropriate indicators and graphing the model, the data points, and the deviations.
bass <- data.frame(length = c(12.5, 12.625, 14.125, 14.5, 17.25, 17.75),
weight = c(17, 16.5, 23, 26.5, 41, 49))
There is an increasing trend in the data, but with only six observations it is difficult to detect outliers in the scatterplot.
# Divided difference table (delta_k holds the k-th divided differences)
bassdelta1 <- (tail(bass$weight, -1) - head(bass$weight, -1))/
(tail(bass$length, -1) - head(bass$length, -1))
bassdelta2 <- (tail(bassdelta1, -1) - head(bassdelta1, -1))/
(tail(bass$length, -2) - head(bass$length, -2))
bassdelta3 <- (tail(bassdelta2, -1) - head(bassdelta2, -1))/
(tail(bass$length, -3) - head(bass$length, -3))
bassdelta4 <- (tail(bassdelta3, -1) - head(bassdelta3, -1))/
(tail(bass$length, -4) - head(bass$length, -4))
bassdelta5 <- (tail(bassdelta4, -1) - head(bassdelta4, -1))/
(tail(bass$length, -5) - head(bass$length, -5))
# pad the shorter columns with zeros so they fit in the data frame
bass$delta1 <- c(bassdelta1, 0)
bass$delta2 <- c(bassdelta2, rep(0,2))
bass$delta3 <- c(bassdelta3, rep(0,3))
bass$delta4 <- c(bassdelta4, rep(0,4))
bass$delta5 <- c(bassdelta5, rep(0,5))
bass
## length weight delta1 delta2 delta3 delta4 delta5
## 1 12.500 17.0 -4.000000 5.128205 -1.2307692 0.07857739 0.06406722
## 2 12.625 16.5 4.333333 2.666667 -0.8575266 0.41493031 0.00000000
## 3 14.125 23.0 9.333333 -1.299394 1.2689912 0.00000000 0.00000000
## 4 14.500 26.5 5.272727 3.300699 0.0000000 0.00000000 0.00000000
## 5 17.250 41.0 16.000000 0.000000 0.0000000 0.00000000 0.00000000
## 6 17.750 49.0 0.000000 0.000000 0.0000000 0.00000000 0.00000000
ggplot(bass, aes(x=length, y=weight)) + geom_point()
ggplot(bass, aes(x=length, y=delta1)) + geom_line()
ggplot(bass, aes(x=length, y=delta2)) + geom_line()
ggplot(bass, aes(x=length, y=delta3)) + geom_line()
ggplot(bass, aes(x=length, y=delta4)) + geom_line()
ggplot(bass, aes(x=length, y=delta5)) + geom_line()
Linear Model
bassmodel <- lm(weight ~ length, data=bass)
coefficients <- bassmodel$coefficients
bass$bassest <- coefficients[1] + coefficients[2]*bass$length
ggplot(bass) + geom_point(aes(x=length, y=weight)) + geom_line(aes(x=length, y=bassest))
Quadratic model
bassmodelquad <- lm(weight ~ poly(length, 2, raw=TRUE), data=bass)
coefficients <- bassmodelquad$coefficients
bass$bassestquad <- coefficients[1] + coefficients[2]*bass$length + coefficients[3]*bass$length^2
ggplot(bass) + geom_point(aes(x=length, y=weight)) + geom_line(aes(x=length, y=bassestquad))
Quartic model
bassmodelquart <- lm(weight ~ poly(length, 4, raw=TRUE), data=bass)
coefficients <- bassmodelquart$coefficients
bass$bassestquart <- coefficients[1] + coefficients[2]*bass$length + coefficients[3]*bass$length^2 +
coefficients[4]*bass$length^3 + coefficients[5]*bass$length^4
ggplot(bass) + geom_point(aes(x=length, y=weight)) + geom_line(aes(x=length, y=bassestquart))
Residuals of the three models
residuals <- data.frame(linear = bassmodel$residuals, quadratic = bassmodelquad$residuals,
quartic = bassmodelquart$residuals, datapoint = seq(1,6))
ggplot(melt(residuals, id="datapoint"), aes(x=datapoint, y=value, color=variable)) + geom_line()
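The same comparison in numbers, summing the squared residuals of each model (a sketch):
sapply(residuals[c("linear", "quadratic", "quartic")], function(r) sum(r^2))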
The quadratic model gives the best fit; the quartic model drives the residuals toward zero, but that is a case of overfitting.
# full data set
y <- c(1919, 1932, 1958, 1963, 1968, 1971, 1974, 1976, 1978,
       1981, 1985, 1988, 1991, 1995, 1999, 2001, 2002, 2006, 2007, 2008, 2009, 2012)
p <- c(0.02, 0.03, 0.04, 0.05, 0.06, 0.08, 0.10, 0.13, 0.15, 0.20,
       0.22, 0.25, 0.29, 0.32, 0.33, 0.34, 0.37, 0.39, 0.41, 0.42, 0.44, 0.45)
# restrict to the years 1971 onward
y <- c(1971, 1974, 1976, 1978, 1981, 1985, 1988, 1991, 1995, 1999, 2001, 2002, 2006, 2007, 2008, 2009, 2012)
p <- c(0.08, 0.10, 0.13, 0.15, 0.20, 0.22, 0.25, 0.29, 0.32, 0.33, 0.34, 0.37, 0.39, 0.41, 0.42, 0.44, 0.45)
plot(y,p)
Taking the log
logp <- log(p)
plot(y, logp)
Taking the square root
sqrtp <- sqrt(p)
plot(y, sqrtp)
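To compare the three candidate models on the original scale, the sums of squared errors can be computed after back-transforming the fits (a sketch; the log and square-root regressions are my assumed counterparts to the plots above):
lm_log  <- lm(logp ~ y)                       # model ln(p) = m*y + b
lm_sqrt <- lm(sqrtp ~ y)                      # model sqrt(p) = m*y + b
sse_lin  <- sum(resid(lm(p ~ y))^2)
sse_log  <- sum((p - exp(fitted(lm_log)))^2)  # back-transform via exp
sse_sqrt <- sum((p - fitted(lm_sqrt)^2)^2)    # back-transform via squaring
c(linear = sse_lin, log = sse_log, sqrt = sse_sqrt)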
Linear regression
lm1 <- lm(p ~ y)
p1 <- 0.008931 * y - 17.51354   # fitted values using the coefficients from lm1
plot(y, p, type = "p", col = "red")
points(y, p1, col = "blue")
cbind(y,p1)
## y p1
## [1,] 1971 0.089461
## [2,] 1974 0.116254
## [3,] 1976 0.134116
## [4,] 1978 0.151978
## [5,] 1981 0.178771
## [6,] 1985 0.214495
## [7,] 1988 0.241288
## [8,] 1991 0.268081
## [9,] 1995 0.303805
## [10,] 1999 0.339529
## [11,] 2001 0.357391
## [12,] 2002 0.366322
## [13,] 2006 0.402046
## [14,] 2007 0.410977
## [15,] 2008 0.419908
## [16,] 2009 0.428839
## [17,] 2012 0.455632
Of the three candidate models, the linear regression has the minimum error.