Here is the RPubs link to my homework: ShijiaBian's HW3
We perform best subset, forward stepwise, and backward stepwise selection on a single data set. For each approach, we obtain p + 1 models, containing 0, 1, 2, … , p predictors. Explain your answers:
For a given number of predictors k, the best subset selection model is guaranteed to have the smallest training RSS, because it searches over every possible model with k predictors. The training RSS of the forward stepwise model often coincides with that of the best subset model at particular values of k, but it need not. The code below illustrates this.
install.packages("ISLR")
## Error: trying to use CRAN without setting a mirror
library(ISLR)
names(Hitters)
## [1] "AtBat" "Hits" "HmRun" "Runs" "RBI"
## [6] "Walks" "Years" "CAtBat" "CHits" "CHmRun"
## [11] "CRuns" "CRBI" "CWalks" "League" "Division"
## [16] "PutOuts" "Assists" "Errors" "Salary" "NewLeague"
dim(Hitters)
## [1] 322 20
sum(is.na(Hitters$Salary))
## [1] 59
Hitters = na.omit(Hitters)
dim(Hitters)
## [1] 263 20
sum(is.na(Hitters))
## [1] 0
# this library can perform best subset selection
install.packages("leaps")
## Error: trying to use CRAN without setting a mirror
library(leaps)
regfit.full = regsubsets(Salary ~ ., Hitters)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., Hitters)
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " "
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " "
## 7 ( 1 ) " " "*" " " " " " " "*" " " "*" "*" "*" " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " "*" "*"
## CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) "*" " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " "*" " " " " " "
## 4 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 5 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 6 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 7 ( 1 ) " " " " " " "*" "*" " " " " " "
## 8 ( 1 ) " " "*" " " "*" "*" " " " " " "
# Best subset selection method
regfit.full = regsubsets(Salary ~ ., data = Hitters, nvmax = 19)
reg.summary = summary(regfit.full)
names(reg.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
reg.summary$rsq
## [1] 0.3215 0.4252 0.4514 0.4754 0.4908 0.5087 0.5141 0.5286 0.5346 0.5405
## [11] 0.5426 0.5436 0.5445 0.5452 0.5455 0.5458 0.5460 0.5461 0.5461
# Forward subset selection method
regfit.fwd = regsubsets(Salary ~ ., data = Hitters, nvmax = 19, method = "forward")
fwd.summary <- summary(regfit.fwd)
# Backward subset selection method
regfit.bwd = regsubsets(Salary ~ ., data = Hitters, nvmax = 19, method = "backward")
bwd.summary <- summary(regfit.bwd)
List the RSS from all three methods for models with 1 to 19 predictors, so that we can compare the three subset selection methods at each model size:
reg.summary$rss
## [1] 36179679 30646560 29249297 27970852 27149899 26194904 25906548
## [8] 25136930 24814051 24500402 24387345 24333232 24289148 24248660
## [15] 24235177 24219377 24209447 24201837 24200700
fwd.summary$rss
## [1] 36179679 30646560 29249297 27970852 27149899 26194904 25954217
## [8] 25159234 24814051 24500402 24387345 24333232 24289148 24248660
## [15] 24235177 24219377 24209447 24201837 24200700
bwd.summary$rss
## [1] 36437951 31203460 29407297 28450807 27509524 26674092 25933487
## [8] 25159234 24814051 24500402 24387345 24333232 24289148 24248660
## [15] 24235177 24219377 24209447 24201837 24200700
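To make the comparison concrete, we can check programmatically at which model sizes the stepwise methods attain exactly the best-subset training RSS (a quick sketch using the summaries computed above):
# model sizes where forward stepwise matches best subset
which(fwd.summary$rss == reg.summary$rss)
# model sizes where backward stepwise matches best subset
which(bwd.summary$rss == reg.summary$rss)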
Which method gives the smallest test RSS depends on the pattern of the test data. Judging by the adjusted R-squared values, the backward selection method should give the smallest test RSS, because the backward selection model is less flexible than the models produced by the other two methods.
T
T
F
F
T
As \( \lambda\to\infty \), the penalty forces \( \int[\hat{g}(x)]^2dx=0\Rightarrow\hat{g}(x)=0 \)
Therefore, \( \hat{g}(x)=0 \).
As \( \lambda\to\infty \), \( \int[\hat{g}^{(1)}(x)]^2dx=0\Rightarrow\hat{g}^{(1)}(x)=0 \), so \( \hat{g} \) is constant.
Therefore, \( \hat{g}(x)=\text{constant} \). The constant that best minimizes the RSS is the mean of the response, so \( \hat{g}(x)=\bar{y} \).
As \( \lambda\to\infty \), \( \int[\hat{g}^{(2)}(x)]^2dx=0\Rightarrow\hat{g}^{(2)}(x)=0 \), so \( \hat{g} \) is linear in \( x \).
Therefore, to best minimize the RSS, \( \hat{g}(x) \) is the least squares regression line.
As \( \lambda\to\infty \), \( \int[\hat{g}^{(3)}(x)]^2dx=0\Rightarrow\hat{g}^{(3)}(x)=0 \), so \( \hat{g} \) is quadratic in \( x \).
Therefore, to best minimize the RSS, \( \hat{g}(x) \) is the quadratic least squares fit.
With \( \lambda=0 \), the penalty term has no effect, so the fitted function \( \hat{g} \) will be very jumpy and will exactly interpolate the training observations.
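A small numerical check of the \( m=2 \), \( \lambda\to\infty \) case (a sketch using smooth.spline(), whose penalty is on the second derivative; x0 and y0 are simulated here for illustration):
# with a very large smoothing parameter, the smoothing-spline fit should
# collapse toward the least squares line
set.seed(1)
x0 <- runif(50)
y0 <- sin(10 * x0) + rnorm(50, sd = 0.2)
fit.smooth <- smooth.spline(x0, y0, spar = 1.5)  # spar at its upper end ~ very large lambda
plot(x0, y0)
lines(predict(fit.smooth, sort(x0)), col = "red")
abline(lm(y0 ~ x0), col = "blue", lty = 2)  # nearly coincides with the red curve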
rm(list = ls())
s = data.frame(x = c(1, 2, 3, 4, 5), y = c(1, -0.5, 3, 4, 6))
grid = data.frame(x = seq(0, 6, 0.1))
plot(s, ylim = c(-1, 7))
abline(h = 0, col = "black")
abline(h = mean(s$y), col = "blue")
abline(lm(y ~ ., data = s), col = "red")
lines(grid$x, predict(lm(y ~ poly(x, 2), data = s), newdata = grid), col = "green")
lines(s, col = "yellow")
legend("topleft", legend = c("m=0 λ=infit", "m=1 λ=infit", "m=2 λ=infit",
"m=3 λ=infit", "m=3 λ=0"), col = c("black", "blue", "red", "green", "yellow"),
lty = 1)
## Warning: conversion failure in 'mbcsToSbcs': dot substituted for the non-ASCII 'λ'
## (the same warning repeats for each legend label containing 'λ'; see the NOTE at the end)
title("Generalized smoothing splines")
Read the body dataset into R using the load() function. This dataset contains a data frame Y with each person's Gender, Age, Weight, and Height, and a matrix X of 21 body measurements (skeletal diameters and girths).
Let's say we forgot how the gender is coded in this dataset. Using a simple visualization, explain how you can tell which gender is which.
If we forget how gender is coded in the data, we can still tell from the plots that blue is male and red is female. The reason is that males have larger average height and weight than females, and the majority of the blue dots lie toward the upper side of each plot. We can see this very easily from the visualization. Therefore, red is female (0) and blue is male (1).
# Here we load the data:
load("/Users/shijiabian/Downloads/body.RData")
genderFact <- as.factor(Y$Gender)
par(mfrow = c(3, 1))
plot(Y$Age, Y$Weight, col = c("red", "blue")[genderFact])
plot(Y$Age, Y$Height, col = c("red", "blue")[genderFact])
plot(Y$Weight, Y$Height, col = c("red", "blue")[genderFact])
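A quick numerical check of the same conclusion (a sketch, assuming Y is a data frame with Gender, Weight, and Height columns, as used above): the group means should show the larger average weight and height for whichever code corresponds to males.
# compare mean weight and height across the two gender codes
aggregate(cbind(Weight, Height) ~ Gender, data = Y, FUN = mean)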
Reserve 200 observations from your dataset to act as a test set and use the remaining 307 as a training set. On the training set, use both pcr() and plsr() to fit models predicting a person's weight from the variables in X. Use the options scale = TRUE and validation = "CV". Why does it make sense to scale our variables in this case?
The reason we want to scale the variables in this case is that the body measurements in X are recorded in different units and on very different scales. Both PCR and PLSR build components from the variance structure of the predictors, so without scaling, the high-variance measurements would dominate the components regardless of how informative they are.
The reason we cross-validate is that we are cross-validating within our pcr/plsr model fits. In other words, the choice of the number of components is validated using only the 307 training observations. This helps avoid overfitting and should give a model that generalizes to the test set.
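To see the scale differences directly (a quick sketch on the loaded X matrix):
# the raw predictors have very different standard deviations, so without
# scaling the high-variance measurements would dominate the components
round(apply(X, 2, sd), 2)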
install.packages("pls")
## Error: trying to use CRAN without setting a mirror
library(pls)
##
## Attaching package: 'pls'
##
## The following object is masked from 'package:stats':
##
## loadings
set.seed(1234)
# test and train set for X
test.X <- sort(sample(1:nrow(X), 200))
train.X <- (1:nrow(X))[-test.X]
data <- as.data.frame(cbind(Y, X))
data <- data[, -c(1, 3, 4)]
# PCR for fitting the model
train.pcr <- pcr(Weight ~ ., data = data, subset = train.X, scale = TRUE, validation = "CV")
# plsr for fitting the model
train.plsr <- plsr(Weight ~ ., data = data, subset = train.X, scale = TRUE,
validation = "CV")
Run summary() on each of the objects calculated above, and compare the training % variance explained from the pcr output to the plsr output. Do you notice any consistent patterns (in comparing the two)? Is that pattern surprising? Explain why or why not.
No, this is not a surprising result. PLSR and PCR are both methods for modeling a response variable when there are many predictor variables that are highly correlated or even collinear. Both methods construct new predictor variables, known as components, as linear combinations of the original predictors, so the overall trends in the % variance explained are similar for the two methods. However, they construct those components in different ways. PCR creates components to explain the observed variability in the predictor variables, without considering the response variable at all. PLSR, on the other hand, does take the response variable into account, and therefore often fits the response with fewer components. This explains the difference visible in the outputs: the % variance explained for Weight reaches its maximum after far fewer components under PLSR than under PCR, while the % variance explained for X remains comparable.
# summary for PCR
summary(train.pcr)
## Data: X dimension: 307 21
## Y dimension: 307 1
## Fit method: svdpc
## Number of components considered: 21
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 12.81 3.090 2.996 2.810 2.759 2.764 2.771
## adjCV 12.81 3.087 2.993 2.804 2.756 2.760 2.766
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 2.770 2.770 2.788 2.786 2.773 2.764 2.763
## adjCV 2.764 2.765 2.784 2.779 2.767 2.757 2.757
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps
## CV 2.763 2.732 2.702 2.658 2.628 2.626
## adjCV 2.756 2.729 2.697 2.630 2.617 2.614
## 20 comps 21 comps
## CV 2.641 2.645
## adjCV 2.628 2.632
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 62.84 74.60 79.53 84.06 86.40 88.29 89.90
## Weight 94.24 94.69 95.41 95.54 95.56 95.58 95.61
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 91.40 92.64 93.81 94.93 95.85 96.68 97.42
## Weight 95.61 95.62 95.67 95.70 95.76 95.80 95.83
## 15 comps 16 comps 17 comps 18 comps 19 comps 20 comps
## X 97.97 98.48 98.91 99.31 99.58 99.82
## Weight 95.94 96.13 96.42 96.42 96.46 96.47
## 21 comps
## X 100.00
## Weight 96.48
# summary for plsr
summary(train.plsr)
## Data: X dimension: 307 21
## Y dimension: 307 1
## Fit method: kernelpls
## Number of components considered: 21
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 12.81 3.014 2.768 2.683 2.650 2.614 2.599
## adjCV 12.81 3.013 2.766 2.679 2.636 2.603 2.589
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 2.598 2.604 2.608 2.610 2.610 2.610 2.610
## adjCV 2.588 2.593 2.597 2.599 2.599 2.599 2.599
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps
## CV 2.611 2.610 2.610 2.610 2.610 2.610
## adjCV 2.599 2.599 2.599 2.599 2.599 2.599
## 20 comps 21 comps
## CV 2.610 2.610
## adjCV 2.599 2.599
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 62.83 72.47 79.16 80.56 83.26 86.23 87.88
## Weight 94.54 95.53 95.89 96.33 96.42 96.46 96.47
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 89.25 90.25 91.36 92.48 93.16 93.84 94.55
## Weight 96.47 96.47 96.48 96.48 96.48 96.48 96.48
## 15 comps 16 comps 17 comps 18 comps 19 comps 20 comps
## X 95.41 96.48 97.31 98.04 98.84 99.60
## Weight 96.48 96.48 96.48 96.48 96.48 96.48
## 21 comps
## X 100.00
## Weight 96.48
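One way to visualize the pattern discussed above is to plot the training \( R^2 \) for Weight against the number of components for both fits (a sketch using the pls package's R2() function on the fits already computed; drop() flattens the returned array and [-1] removes the intercept-only entry):
# training R^2 of Weight as a function of the number of components
pcr.r2 <- drop(R2(train.pcr, estimate = "train")$val)[-1]
plsr.r2 <- drop(R2(train.plsr, estimate = "train")$val)[-1]
plot(1:21, pcr.r2, type = "b", xlab = "Components", ylab = "Training R^2")
lines(1:21, plsr.r2, type = "b", col = "blue")
legend("bottomright", legend = c("PCR", "PLSR"), col = c("black", "blue"), lty = 1)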
For each of models, pick a number of components that you would use to predict future values of weight from X. Please include any further analysis you use to decide on the number of components.
For the principal component regression model, I would choose the first 5 components. For the partial least squares regression model, I would choose the first 8 components. This choice follows the one-standard-error rule applied to the cross-validated RMSEP.
test = sort(sample(1:nrow(X), 200))
train = (1:nrow(X))[-test]
install.packages("pls")
## Error: trying to use CRAN without setting a mirror
library(pls)
set.seed(1234)
data <- as.data.frame(cbind(Y, X))
data <- data[, -c(1, 3, 4)]
pcr.fit <- pcr(Weight ~ ., data = data, subset = train, scale = T, validation = "CV")
plsr.fit <- plsr(Weight ~ ., data = data, subset = train, scale = T, validation = "CV")
validationplot(pcr.fit, val.type = "MSEP")
plot(pcr.fit$Xvar/pcr.fit$Xtotvar, xlab = "Principal Component", ylab = "Prop. Variance Explained")
summary(plsr.fit)
## Data: X dimension: 307 21
## Y dimension: 307 1
## Fit method: kernelpls
## Number of components considered: 21
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 13.57 3.182 2.922 2.848 2.860 2.852 2.845
## adjCV 13.57 3.180 2.920 2.843 2.849 2.840 2.832
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 2.849 2.842 2.837 2.837 2.839 2.839 2.839
## adjCV 2.836 2.829 2.824 2.824 2.826 2.826 2.826
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps
## CV 2.839 2.839 2.839 2.839 2.839 2.839
## adjCV 2.826 2.826 2.826 2.826 2.826 2.826
## 20 comps 21 comps
## CV 2.839 2.839
## adjCV 2.826 2.826
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 64.47 74.93 80.24 81.88 83.85 86.47 88.31
## Weight 94.59 95.57 95.94 96.17 96.26 96.30 96.32
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 89.65 90.69 91.68 92.95 93.61 94.58 95.26
## Weight 96.32 96.32 96.32 96.32 96.32 96.32 96.32
## 15 comps 16 comps 17 comps 18 comps 19 comps 20 comps
## X 95.95 97.08 97.82 98.38 98.94 99.68
## Weight 96.32 96.32 96.32 96.32 96.32 96.32
## 21 comps
## X 100.00
## Weight 96.32
validationplot(plsr.fit, val.type = "MSEP")
plot(plsr.fit$Xvar/plsr.fit$Xtotvar, xlab = "Principal Component", ylab = "Prop. Variance Explained")
plot(pcr.fit$Xvar/pcr.fit$Xtotvar, xlab = "Principal Component", ylab = "Prop. Variance Explained")
points(plsr.fit$Xvar/plsr.fit$Xtotvar, xlab = "Principal Component", ylab = "Prop. Variance Explained",
col = "blue")
legend("topright", legend = c("PCR", "PLSR"), col = c("black", "blue"), pch = 1)
Practically speaking, it might be nice if we could guess a person's weight without measuring 21 different quantities. Do either of the methods performed above allow us to do that? If not, pick another method that will and fit it on the training data.
Neither of the methods performed above lets us guess a person's weight without measuring all 21 quantities. PCR and PLSR summarize the data set with a smaller number of representative components that collectively explain most of the variability in the original set, but each component is still a linear combination of all of the original predictors, so every measurement must still be taken. With this many predictors the model is also hard to interpret. To decrease the number of predictors in the model and increase interpretability, we use the lasso here. As the coefficients below show, the lasso model eliminated 4 of the original 21 predictors.
install.packages("glmnet")
## Error: trying to use CRAN without setting a mirror
install.packages("ISLR")
## Error: trying to use CRAN without setting a mirror
library(ISLR)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 1.9-5
grid <- 10^seq(10, -2, length = 100)
data.x <- scale(model.matrix(Weight ~ ., data = data)[, -1])
data.y <- data$Weight
lasso.fit <- glmnet(data.x[train, ], data.y[train], alpha = 1, lambda = grid)
plot(lasso.fit)
set.seed(1234)
lasso.cv <- cv.glmnet(data.x[train, ], data.y[train], alpha = 1)
bestlam <- lasso.cv$lambda.min
predict(lasso.fit, type = "coefficients", s = bestlam)
## 22 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 69.18702
## Wrist.Diam 0.31333
## Wrist.Girth .
## Forearm.Girth 1.16174
## Elbow.Diam 0.57417
## Bicep.Girth .
## Shoulder.Girth 1.81321
## Biacromial.Diam 0.80851
## Chest.Depth 1.25674
## Chest.Diam 0.84920
## Chest.Girth 0.23323
## Navel.Girth .
## Waist.Girth 2.80798
## Pelvic.Breadth 0.71955
## Bitrochanteric.Diam .
## Hip.Girth 1.80898
## Thigh.Girth 0.85362
## Knee.Diam 0.07308
## Knee.Girth 1.10209
## Calf.Girth 0.71844
## Ankle.Diam 0.70624
## Ankle.Girth 0.18131
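A quick programmatic count of the variables the lasso dropped (a sketch; the intercept is excluded before counting):
lasso.coef <- predict(lasso.fit, type = "coefficients", s = bestlam)[-1, ]
sum(lasso.coef == 0)  # number of coefficients set exactly to zero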
Compare all 3 methods in terms of performance on the test set. Keep in mind that you should only run one version of each model on the test set. Any necessary selection of parameters should be done only with the training set.
The best performer in terms of test MSE was the PLSR model. The lasso had similar performance, with the added advantage of eliminating unnecessary variables.
# pcr:
pcr.pred <- predict(pcr.fit, data[test, ], ncomp = 5)
mean((pcr.pred - data.y[test])^2)
## [1] 9.295
# plsr:
plsr.pred <- predict(plsr.fit, data[test, ], ncomp = 7)
mean((plsr.pred - data.y[test])^2)
## [1] 7.672
# lasso:
lasso.pred <- predict(lasso.fit, s = bestlam, newx = data.x[test, ])
mean((lasso.pred - data.y[test])^2)
## [1] 7.85
In this problem we will build an R function that can be used to fit a cubic spline to data using a truncated power basis.
Write a function h such that \[ h(x,z)=(x-z)^3_+ \]
h <- function(x, z) {
    # truncated cubic basis function: (x - z)^3 if x > z, else 0 (scalar x)
    if (x > z) {
        return((x - z)^3)
    } else {
        return(0)
    }
}
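An equivalent vectorized form (a sketch; pmax() operates on whole vectors, so no branching is needed):
h.vec <- function(x, z) pmax(x - z, 0)^3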
Using h and sapply, write a function hs such that hs(xs, z) = (h(xs[1],z),h(xs[2],z),…, h(xs[length(xs)],z))
hs <- function(xs, z) {
return(sapply(xs, h, z = z))
}
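A quick elementwise check of hs() (expected values computed by hand from the definition of h):
hs(c(0, 0.5, 1, 2), z = 0.5)
## expected: [1] 0.000 0.000 0.125 3.375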
Using hs, write a function splinebasis such that splinebasis(xs, zs) returns a length(xs) × (length(zs) + 3) matrix with the following columns: xs, xs^2, xs^3, hs(xs, zs[1]), hs(xs, zs[2]), …, hs(xs, zs[length(zs)]).
splinebasis <- function(xs, zs) {
x <- cbind(xs, xs^2, xs^3)
for (i in 1:length(zs)) {
x <- cbind(x, hs(xs, zs[i]))
}
colnames(x) <- as.character(1:(3 + length(zs)))
return(x)
}
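A sanity check on the dimensions: 100 points and 3 knots should give a 100 × (3 + 3) matrix.
dim(splinebasis(runif(100), c(0.25, 0.5, 0.75)))
## expected: [1] 100 6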
Now, let's generate some data:
set.seed(1337)
x = runif(100)
y = sin(10 * x)
When we fit a cubic spline with knots at the points knots, that is the same thing as fitting a linear model predicting y from splinebasis(xs, zs). Using lm() and your splinebasis(), fit a spline to the data we generated using k = 3 evenly spaced knots (knots = c(1/4, 2/4, 3/4)). Make a plot that clearly shows the original points, the curve those points were generated from, and a curve that represents your fitted spline. Note that the curves can be generated either using curve() or by using lines() with many points along the curve. Does the spline do a good job approximating the function?
According to the plot below, the spline approximates the function well over most of the range; only the left tail of the fit is a little off.
knots <- c(1/4, 2/4, 3/4)
data <- splinebasis(x, knots)
data <- as.data.frame(cbind(y, data))
mod1 <- lm(y ~ ., data = data)
summary(mod1)
##
## Call:
## lm(formula = y ~ ., data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0598 -0.0157 -0.0019 0.0226 0.0778
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.2488 0.0292 -8.52 2.8e-13 ***
## `1` 18.6454 0.5908 31.56 < 2e-16 ***
## `2` -80.1222 3.2398 -24.73 < 2e-16 ***
## `3` 74.5450 5.1896 14.36 < 2e-16 ***
## `4` 41.6353 6.5669 6.34 8.2e-09 ***
## `5` -278.9462 2.9644 -94.10 < 2e-16 ***
## `6` 250.9985 4.8937 51.29 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0286 on 93 degrees of freedom
## Multiple R-squared: 0.998, Adjusted R-squared: 0.998
## F-statistic: 8.41e+03 on 6 and 93 DF, p-value: <2e-16
plot(x, y)
curve.orig <- function(x) {
return(sin(10 * x))
}
curve(curve.orig, 0, 1, add = T, col = "red")
coeffs <- coefficients(mod1)
fitted.model <- function(x, coeffs, zs) {
return(cbind(rep(1, length(x)), splinebasis(x, zs)) %*% coeffs)
}
fitted.model.plot <- function(x) {
return(fitted.model(x, coeffs, knots))
}
curve(fitted.model.plot, 0, 1, add = T, col = "blue")
Repeat the above process for k = 1, 2, …, 9. Do NOT copy-paste your code.
get.knots <- function(n) {
    # n evenly spaced interior knots on (0, 1): 1/(n+1), 2/(n+1), ..., n/(n+1)
    (1:n)/(n + 1)
}
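For example, the three evenly spaced knots used earlier are recovered with:
get.knots(3)
## expected: [1] 0.25 0.50 0.75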
op <- par(mfrow = c(5, 2), mar = c(1, 1, 1, 1))
for (i in c(1, 2, 4, 5, 6, 7, 8, 9)) {  # k = 3 was already fitted above
knots <- get.knots(i)
data = splinebasis(x, knots)
data <- as.data.frame(cbind(y, data))
mod1 <- lm(y ~ ., data = data)
plot(x, y, xlab = "", ylab = "")
curve.orig <- function(x) {
return(sin(10 * x))
}
curve(curve.orig, 0, 1, add = T, col = "red")
coeffs <- coefficients(mod1)
fitted.model <- function(x, coeffs, zs) {
return(cbind(rep(1, length(x)), splinebasis(x, zs)) %*% coeffs)
}
fitted.model.plot <- function(x) {
return(fitted.model(x, coeffs, knots))
}
curve(fitted.model.plot, 0, 1, add = T, col = "blue")
}
par(op)
As we let the number of knots increase, do we expect our fitted curve to continually become a better approximation to the true curve? Why or why not?
Yes, in this case we expect our fitted curve to continually become a better approximation to the true curve, because the data were generated without noise. Splines with few knots are generally smoother than splines with many knots, and increasing the number of knots usually improves the fit of the spline to the data: knots give the curve freedom to bend and follow the data more closely. In the presence of noise, however, the model would tend to overfit the data plus noise, and the fitted curve could diverge from the true curve as the number of knots increases.
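A small sketch of the overfitting point, using a hypothetical noisy version of the same data: the training RSS keeps dropping as knots are added, even though the extra flexibility is spent tracking the noise rather than sin(10x).
set.seed(42)
y.noisy <- y + rnorm(length(y), sd = 0.3)
for (k in c(3, 10, 20)) {
    b <- splinebasis(x, get.knots(k))
    fit <- lm(y.noisy ~ b)
    cat(k, "knots: training RSS =", sum(resid(fit)^2), "\n")
}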
**NOTE**
This homework shows the error below when installing packages:
Error: trying to use CRAN without setting a mirror
This is due to internet restrictions in China; I had to install the packages manually in R instead of calling install.packages() from the console.
The warning messages in 2 (e) are due to fonts and encodings when converting the non-ASCII character λ into HTML.