kcTemp <- c(43.8,40.1,49.2,41.8,34.0,49.1,47.8,48.1,37.6,42.0,
43.7,47.1,47.7,46.9,36.5,45.0,48.0,37.6,42.2,38.7,
45.2,42.5,43.1,36.0,47.4,48.5,47.1,43.2,43.8,45.7)
set.seed(1234)
n<-100
x<-rnorm(n)
kern<-function(z,x,Delta=.1){
n<-length(x)
1/(n*Delta)*sum(dnorm((z-x)/Delta))
}
S<-sd(kcTemp)
n<-length(kcTemp)
hist(kcTemp,main="Delta=1.5",freq=FALSE)
dens<-density(kcTemp,bw=1.5)
points(dens$x,dens$y,type="l",col="red",lwd=3)
Plot of Graduation Rate vs Income below
The red line represents the simple linear regression line
data6 <-read.csv("https://www.dropbox.com/s/s4cp6q7xpdli7na/HW6stateData.csv?dl=1")
head(data6)
## X Population Income Illiteracy Life.Exp Murder HS.Grad Frost
## 1 Alabama 3615 3624 2.1 69.05 15.1 41.3 20
## 2 Alaska 365 6315 1.5 69.31 11.3 66.7 152
## 3 Arizona 2212 4530 1.8 70.55 7.8 58.1 15
## 4 Arkansas 2110 3378 1.9 70.66 10.1 39.9 65
## 5 California 21198 5114 1.1 71.71 10.3 62.6 20
## 6 Colorado 2541 4884 0.7 72.06 6.8 63.9 166
## Area
## 1 50708
## 2 566432
## 3 113417
## 4 51945
## 5 156361
## 6 103766
lin <- lm(data6$Income~data6$HS.Grad)
plot(data6$HS.Grad, data6$Income, main = "Graduation Rate vs. Income", xlab = "Graduation Rate", ylab = "Income", pch = 16)
par(new=T)
abline(lin, col='red')
#Part C. Below we see a linear regression model with a quadratic effect for HS graduation rate to this data.
data6$HS.Grad2 <- data6$HS.Grad**2
quad <- lm(Income~ HS.Grad + HS.Grad2, data=data6)
x <- seq(26, 75, 1)
pred <- predict(quad, data.frame(HS.Grad = x, HS.Grad2 = x^2))
plot(data6$HS.Grad,data6$Income, main = "Graduation Rate vs. Income", xlab = "Graduation Rate", ylab = "Income", pch = 16)
par(new=T)
abline(lin, col = "red", lty = 1)
lines(x, pred, col = "purple", lty = 2)
legend("bottomright", lty = c(1, 2), col = c("red", "purple"), legend = c("Linear", "Quadratic"))
For the Loess model, I picked span = 1 because span = 2 smoothed the curve too much and basically resembled the quadratic line.
From the plot below, I would argue the Loess model fits the data the best. Visually, the curve traces the data better. The curve has a slight peak around 50 and then begins to slope down and then a bit up again.
The Quadratic curve does not have this change that the data follows. It captures much of the same changes as the Loess curve but does not have that turn.
The Linear curve just does not fit the data at all.
I would try to a cubic function compared to the Loess.
loessy <-loess(Income~HS.Grad,span=1,degree=2,data=data6)
summary(loessy)
## Call:
## loess(formula = Income ~ HS.Grad, data = data6, span = 1, degree = 2)
##
## Number of Observations: 50
## Equivalent Number of Parameters: 3.39
## Residual Standard Error: 470.8
## Trace of smoother matrix: 3.63 (exact)
##
## Control settings:
## span : 1
## degree : 2
## family : gaussian
## surface : interpolate cell = 0.2
## normalize: TRUE
## parametric: FALSE
## drop.square: FALSE
pred2 = predict(loessy, data.frame(HS.Grad = x))
plot(data6$HS.Grad,data6$Income, main = "Graduation Rate vs. Income", xlab = "Graduation Rate", ylab = "Income", pch = 16)
par(new=T)
abline(lin, col = "red", lwd=3, lty = 1)
lines(x, pred, col = "purple", lwd = 3, lty = 2)
lines(x, pred2, col = "green",lwd = 4, lty = 3)
legend("bottomright", lty = c(1, 2, 3), col = c("red", "purple", "green"), lwd = 3, legend = c("Linear", "Quadratic", "Loess"))
I tried to calculate the SSE below and created a range based on these values. It seemed to work okay and provided the middle of the range and I worked backwards from there.
A span between 0.2 to 0.6 provides a good fit for this data without under/overfitting.
set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,.3)
loessy<-loess(y~x,span=0.45,degree=2)
plot(x,y)
par(new=T)
j = order(x)
points(x[j],loessy$fitted[j],col="red",lwd=3,type="l")
# define function that returns the SSE
calcSSE <- function(z){
loessMod <- try(loess(y ~ x, degree=2, span=z), silent=T)
res <- try(loessMod$residuals, silent=T)
if(class(res)!="try-error"){
if((sum(res, na.rm=T) > 0)){
sse <- sum(res^2)
}
}else{
sse <- 99999
}
return(sse)
}
#Run optim to find span that gives min SSE, starting at 0.2
#Code used from http://r-statistics.co/Loess-Regression-With-R.html
#optim(par=c(0.2), calcSSE, method="SANN")
#[1] 0.44883
#$value
#[1] 0
#$counts
#function gradient
# 10000 NA
#$convergence
#[1] 0
From the simulation below, I would argue that the value of the span increases as the noise increases.
Additionally, the range of span will decrease as the noise increases.
A span of 0.1 to 0.5 would be a good range.
set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,0.1)
loess_a<-loess(y~x,span=.1,degree=2)
plot(x,y, main = "Sigma = 0.1, Span = 0.1")
par(new=T)
j = order(x)
points(x[j],loess_a$fitted[j],col="red",lwd=3,type="l")
set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,0.1)
loess_a<-loess(y~x,span=.3,degree=2)
plot(x,y, main = "Sigma = 0.1, Span = 0.3")
par(new=T)
j = order(x)
points(x[j],loess_a$fitted[j],col="red",lwd=3,type="l")
set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,0.1)
loess_a<-loess(y~x,span=.5,degree=2)
plot(x,y, main = "Sigma = 0.1, Span = 0.5")
par(new=T)
j = order(x)
points(x[j],loess_a$fitted[j],col="red",lwd=3,type="l")
# define function that returns the SSE
calcSSE <- function(z){
loessMod <- try(loess_a(y ~ x, degree=2, span=z), silent=T)
res <- try(loessMod$residuals, silent=T)
if(class(res)!="try-error"){
if((sum(res, na.rm=T) > 0)){
sse <- sum(res^2)
}
}else{
sse <- 99999
}
return(sse)
}
#Run optim to find span that gives min SSE, starting at 0.1
#Code used from http://r-statistics.co/Loess-Regression-With-R.html
#optim(par=c(0.1), calcSSE, method="SANN")
A span of 0.2 to 0.6 would be a good range.
set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,0.5)
loess_b<-loess(y~x,span=.2,degree=2)
plot(x,y, main = "Sigma = 0.5, Span = 0.2")
par(new=T)
j = order(x)
points(x[j],loess_b$fitted[j],col="red",lwd=3,type="l")
set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,0.5)
loess_b<-loess(y~x,span=.48,degree=2)
plot(x,y, main = "Sigma = 0.5, Span = 0.48")
par(new=T)
j = order(x)
points(x[j],loess_b$fitted[j],col="red",lwd=3,type="l")
set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,0.5)
loess_b<-loess(y~x,span=.6,degree=2)
plot(x,y, main = "Sigma = 0.5, Span = 0.6")
par(new=T)
j = order(x)
points(x[j],loess_b$fitted[j],col="red",lwd=3,type="l")
# define function that returns the SSE
calcSSE <- function(z){
loessMod <- try(loess(y ~ x, degree=2, span=z), silent=T)
res <- try(loessMod$residuals, silent=T)
if(class(res)!="try-error"){
if((sum(res, na.rm=T) > 0)){
sse <- sum(res^2)
}
}else{
sse <- 99999
}
return(sse)
}
#Run optim to find span that gives min SSE, starting at 0.2
#Code used from http://r-statistics.co/Loess-Regression-With-R.html
#optim(par=c(0.2), calcSSE, method="SANN")
A span of of 0.4 to 0.7 would be a good range.
set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,1)
loess_c<-loess(y~x,span=.4,degree=2)
plot(x,y, main = "Sigma = 1, Span = 0.4")
par(new=T)
j = order(x)
points(x[j],loess_c$fitted[j],col="red",lwd=3,type="l")
set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,1)
loess_c<-loess(y~x,span=.58,degree=2)
plot(x,y, main = "Sigma = 1, Span = 0.58")
par(new=T)
j = order(x)
points(x[j],loess_c$fitted[j],col="red",lwd=3,type="l")
set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,1)
loess_c<-loess(y~x,span=.7,degree=2)
plot(x,y, main = "Sigma = 1, Span = 0.7")
par(new=T)
j = order(x)
points(x[j],loess_c$fitted[j],col="red",lwd=3,type="l")
# define function that returns the SSE
calcSSE <- function(z){
loessMod <- try(loess(y ~ x, degree=2, span=z), silent=T)
res <- try(loessMod$residuals, silent=T)
if(class(res)!="try-error"){
if((sum(res, na.rm=T) > 0)){
sse <- sum(res^2)
}
}else{
sse <- 99999
}
return(sse)
}
#Run optim to find span that gives min SSE, starting at 0.4
#Code used from http://r-statistics.co/Loess-Regression-With-R.html
#optim(par=c(0.4), calcSSE, method="SANN")
A span of 0.5 to 0.8 would be a good range.
set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,1.5)
loess_d<-loess(y~x,span=0.50,degree=2)
plot(x,y, main = "Sigma = 1.5, Span = 0.5")
par(new=T)
j = order(x)
points(x[j],loess_d$fitted[j],col="red",lwd=3,type="l")
set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,1.5)
loess_d<-loess(y~x,span=0.72,degree=2)
plot(x,y, main = "Sigma = 1.5, Span = 0.72")
par(new=T)
j = order(x)
points(x[j],loess_d$fitted[j],col="red",lwd=3,type="l")
set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,1.5)
loess_d<-loess(y~x,span=0.8,degree=2)
plot(x,y, main = "Sigma = 1.5, Span = 0.8")
par(new=T)
j = order(x)
points(x[j],loess_d$fitted[j],col="red",lwd=3,type="l")
# define function that returns the SSE
calcSSE <- function(z){
loessMod <- try(loess(y ~ x, degree=2, span=z), silent=T)
res <- try(loessMod$residuals, silent=T)
if(class(res)!="try-error"){
if((sum(res, na.rm=T) > 0)){
sse <- sum(res^2)
}
}else{
sse <- 99999
}
return(sse)
}
#Run optim to find span that gives min SSE, starting at 0.5
#Code used from http://r-statistics.co/Loess-Regression-With-R.html
#optim(par=c(0.5), calcSSE, method="SANN")