#6
At what percentage concentration is the predicted time the same for the two lots?
Plot the data using a different plotting symbol according to the lot.
library(curl)
## Warning: package 'curl' was built under R version 4.2.3
## Using libcurl 7.86.0 with LibreSSL/3.3.6
library(openssl)
## Warning: package 'openssl' was built under R version 4.2.3
## Linking to: OpenSSL 3.3.0 9 Apr 2024
library(rsconnect)
library(ggplot2)
data(clot, package="faraway")
head(clot)
## time conc lot
## 1 118 5 one
## 2 58 10 one
## 3 42 15 one
## 4 35 20 one
## 5 27 30 one
## 6 25 40 one
ggplot(clot, aes(time, conc, shape = lot)) + geom_point()
#with facet wrap of concentrations
ggplot(clot, aes(time, conc, shape = lot)) + geom_point() +
facet_wrap( ~ conc)
We can us a Logarithmic Transformation of 2 continuous variables to form Linear relationship.
Our function of f is defined as: f(time) = log(time)
Our function of g is defined as: g(conc) = log(conc)
clot$log_time <- log(clot$time)
clot$log_conc <- log(clot$conc)
For H0: The time to clot in different concentration does NOT vary in different lot size
For H1: The time to clot in different concentration varies in different lot size
lmod <- lm(log_time ~ log_conc * lot, data = clot)
summary(lmod)
##
## Call:
## lm(formula = log_time ~ log_conc * lot, data = clot)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.16452 -0.10730 -0.04348 0.07614 0.25353
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.47806 0.18073 30.312 3.62e-14 ***
## log_conc -0.59704 0.05252 -11.367 1.87e-08 ***
## lottwo -0.58179 0.25558 -2.276 0.0391 *
## log_conc:lottwo 0.03383 0.07428 0.455 0.6558
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1482 on 14 degrees of freedom
## Multiple R-squared: 0.9539, Adjusted R-squared: 0.944
## F-statistic: 96.47 on 3 and 14 DF, p-value: 1.371e-09
anovalmod<-anova(lmod)
anovalmod
## Analysis of Variance Table
##
## Response: log_time
## Df Sum Sq Mean Sq F value Pr(>F)
## log_conc 1 5.3590 5.3590 243.9768 2.975e-10 ***
## lot 1 0.9933 0.9933 45.2214 9.709e-06 ***
## log_conc:lot 1 0.0046 0.0046 0.2074 0.6558
## Residuals 14 0.3075 0.0220
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(lmod)
## Chapter 11:
#4
Take the fat data, and use the percentage of body fat, siri, as the response and
the other variables, except brozek and density as potential predictors. Remove
every tenth observation from the data for use as a test sample. Use the remaining
data as a training sample building the following models:
(c) Principal component regression
```r
data(fat, package="faraway")
head(fat)
## brozek siri density age weight height adipos free neck chest abdom hip
## 1 12.6 12.3 1.0708 23 154.25 67.75 23.7 134.9 36.2 93.1 85.2 94.5
## 2 6.9 6.1 1.0853 22 173.25 72.25 23.4 161.3 38.5 93.6 83.0 98.7
## 3 24.6 25.3 1.0414 22 154.00 66.25 24.7 116.0 34.0 95.8 87.9 99.2
## 4 10.9 10.4 1.0751 26 184.75 72.25 24.9 164.7 37.4 101.8 86.4 101.2
## 5 27.8 28.7 1.0340 24 184.25 71.25 25.6 133.1 34.4 97.3 100.0 101.9
## 6 20.6 20.9 1.0502 24 210.25 74.75 26.5 167.0 39.0 104.5 94.4 107.8
## thigh knee ankle biceps forearm wrist
## 1 59.0 37.3 21.9 32.0 27.4 17.1
## 2 58.7 37.3 23.4 30.5 28.9 18.2
## 3 59.6 38.9 24.0 28.8 25.2 16.6
## 4 60.1 37.3 22.8 32.4 29.4 18.2
## 5 63.2 42.2 24.0 32.2 27.7 17.7
## 6 66.0 42.0 25.6 35.7 30.6 18.8
#training sets
set.seed(123)
require(MASS)
## Loading required package: MASS
testdata <- seq(10,250,by=10)
testdata
## [1] 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190
## [20] 200 210 220 230 240 250
train <- fat[-testdata,]
subset <- fat[testdata,]
library(stats)
library(Metrics)
pca <- prcomp(train[,4:18])
round(pca$sdev,3)
## [1] 37.047 14.540 9.511 3.634 3.462 2.544 2.200 1.854 1.577 1.412
## [11] 1.324 1.249 1.037 0.828 0.491
plot(pca$sdev, type="l", xlab="Principal Component Number", ,ylab="square root of eigenvalues")
lmod <- lm(train$siri ~ pca$x[,1:6])
summary(lmod)
##
## Call:
## lm(formula = train$siri ~ pca$x[, 1:6])
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.3999 -0.5813 0.3739 0.9771 7.9487
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.180176 0.110619 173.389 < 2e-16 ***
## pca$x[, 1:6]PC1 -0.126730 0.002992 -42.350 < 2e-16 ***
## pca$x[, 1:6]PC2 -0.356175 0.007625 -46.714 < 2e-16 ***
## pca$x[, 1:6]PC3 0.464783 0.011656 39.875 < 2e-16 ***
## pca$x[, 1:6]PC4 0.309275 0.030508 10.138 < 2e-16 ***
## pca$x[, 1:6]PC5 -0.091998 0.032023 -2.873 0.00447 **
## pca$x[, 1:6]PC6 -0.222500 0.043574 -5.106 7.11e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.667 on 220 degrees of freedom
## Multiple R-squared: 0.9629, Adjusted R-squared: 0.9618
## F-statistic: 950.5 on 6 and 220 DF, p-value: < 2.2e-16
#predictions
mm <- apply(train[,4:18],2,mean)
tx <- as.matrix(sweep(subset[,4:18],2,mm)) #centering the predicting data
nx <- tx %*% pca$rot[,1:6]
pred3 <- cbind(1,nx) %*% lmod$coef
rmse <- function(x,y) sqrt(mean((x-y)^2))
y10 <- subset$siri
rmse(pred3,y10)
## [1] 1.141839
a <- numeric(15)
for (i in 1:15) {
nx <- tx %*% pca$rot[,1:i]
model4 <- lm(train$siri ~pca$x[,1:i])
pred4 <- cbind(1,nx) %*% model4$coef
a[i] <- rmse(pred4,y10)
}
plot(a, ylab="Test RMSE",xlab="Principal Component Number")
which.min(a)
## [1] 7
From these results, we yield the best RMSE when we use the first 7 principal components.
par(mfrow=c(1,1))
trainx <- as.matrix(sweep(train[,4:18],2,mm))
testx <- as.matrix(sweep(subset[,4:18],2,mm))
yc <- train$siri - mean(train$siri)
g_ridge <- lm.ridge(yc ~trainx, lambda=seq(0,10,1e-4))
matplot(g_ridge$lambda,t(g_ridge$coef), type="l",lty=1,xlab=expression(lambda),ylab=expression(hat(beta)),main="Ridge Trace plot")
select(g_ridge)
## modified HKB estimator is 0.1340921
## modified L-W estimator is 0.4446052
## smallest value of GCV at 0.0467
We select lambda as 0.1340921 from our ordinary ridge type shrinkage estimator. (HKB)
#used data
yfit<-scale(trainx, center=F, scale=g_ridge$scales) %*% g_ridge$coef[,468] + mean(train$siri)
rmse(yfit,train$siri)
## [1] 1.494422
#the used data value is 1.494422
#test data
ypred = scale(testx, center=F, scale=g_ridge$scales) %*% g_ridge$coef[,468] + mean(train$siri)
rmse(ypred,subset$siri )
## [1] 1.128089
#the used data value is 1.128089