Problem 1

kcTemp <- c(43.8,40.1,49.2,41.8,34.0,49.1,47.8,48.1,37.6,42.0,
43.7,47.1,47.7,46.9,36.5,45.0,48.0,37.6,42.2,38.7,
45.2,42.5,43.1,36.0,47.4,48.5,47.1,43.2,43.8,45.7)


set.seed(1234)
n<-100
x<-rnorm(n)

kern<-function(z,x,Delta=.1){
n<-length(x)
1/(n*Delta)*sum(dnorm((z-x)/Delta))
}

S<-sd(kcTemp)
n<-length(kcTemp)

hist(kcTemp,main="Delta=1.5",freq=FALSE)  
dens<-density(kcTemp,bw=1.5) 
points(dens$x,dens$y,type="l",col="red",lwd=3)

Problem 2

Part A

Plot of Graduation Rate vs Income below

Part B

The red line represents the simple linear regression line

data6 <-read.csv("https://www.dropbox.com/s/s4cp6q7xpdli7na/HW6stateData.csv?dl=1")

head(data6)

##            X Population Income Illiteracy Life.Exp Murder HS.Grad Frost
## 1    Alabama       3615   3624        2.1    69.05   15.1    41.3    20
## 2     Alaska        365   6315        1.5    69.31   11.3    66.7   152
## 3    Arizona       2212   4530        1.8    70.55    7.8    58.1    15
## 4   Arkansas       2110   3378        1.9    70.66   10.1    39.9    65
## 5 California      21198   5114        1.1    71.71   10.3    62.6    20
## 6   Colorado       2541   4884        0.7    72.06    6.8    63.9   166
##     Area
## 1  50708
## 2 566432
## 3 113417
## 4  51945
## 5 156361
## 6 103766

lin <- lm(data6$Income~data6$HS.Grad)

plot(data6$HS.Grad, data6$Income, main = "Graduation Rate vs. Income", xlab = "Graduation Rate", ylab = "Income", pch = 16)
par(new=T)
abline(lin, col='red')

#Part C. Below we see a linear regression model with a quadratic effect for HS graduation rate to this data.

data6$HS.Grad2 <- data6$HS.Grad**2
quad <- lm(Income~ HS.Grad + HS.Grad2, data=data6)

x <- seq(26, 75, 1)
pred <- predict(quad, data.frame(HS.Grad = x, HS.Grad2 = x^2))

plot(data6$HS.Grad,data6$Income, main = "Graduation Rate vs. Income", xlab = "Graduation Rate", ylab = "Income", pch = 16)
par(new=T)
abline(lin, col = "red", lty = 1)
lines(x, pred, col = "purple", lty = 2)
legend("bottomright", lty = c(1, 2), col = c("red", "purple"), legend = c("Linear", "Quadratic"))

Part D.

For the Loess model, I picked span = 1 because span = 2 smoothed the curve too much and basically resembled the quadratic line.

From the plot below, I would argue the Loess model fits the data the best. Visually, the curve traces the data better. The curve has a slight peak around 50 and then begins to slope down and then a bit up again.

The Quadratic curve does not have this change that the data follows. It captures much of the same changes as the Loess curve but does not have that turn.

The Linear curve just does not fit the data at all.

I would try to a cubic function compared to the Loess.

loessy <-loess(Income~HS.Grad,span=1,degree=2,data=data6)
summary(loessy)

## Call:
## loess(formula = Income ~ HS.Grad, data = data6, span = 1, degree = 2)
## 
## Number of Observations: 50 
## Equivalent Number of Parameters: 3.39 
## Residual Standard Error: 470.8 
## Trace of smoother matrix: 3.63  (exact)
## 
## Control settings:
##   span     :  1 
##   degree   :  2 
##   family   :  gaussian
##   surface  :  interpolate      cell = 0.2
##   normalize:  TRUE
##  parametric:  FALSE
## drop.square:  FALSE

pred2 = predict(loessy, data.frame(HS.Grad = x))

plot(data6$HS.Grad,data6$Income, main = "Graduation Rate vs. Income", xlab = "Graduation Rate", ylab = "Income", pch = 16)
par(new=T)
abline(lin, col = "red", lwd=3, lty = 1)
lines(x, pred, col = "purple", lwd = 3, lty = 2)
lines(x, pred2, col = "green",lwd = 4, lty = 3)
legend("bottomright", lty = c(1, 2, 3), col = c("red", "purple", "green"), lwd = 3, legend = c("Linear", "Quadratic", "Loess"))

Problem 3.

I tried to calculate the SSE below and created a range based on these values. It seemed to work okay and provided the middle of the range and I worked backwards from there.

A span between 0.2 to 0.6 provides a good fit for this data without under/overfitting.

set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,.3)
loessy<-loess(y~x,span=0.45,degree=2)
plot(x,y)
par(new=T)
j = order(x)
points(x[j],loessy$fitted[j],col="red",lwd=3,type="l")

# define function that returns the SSE
calcSSE <- function(z){
  loessMod <- try(loess(y ~ x, degree=2, span=z), silent=T)
  res <- try(loessMod$residuals, silent=T)
  if(class(res)!="try-error"){
    if((sum(res, na.rm=T) > 0)){
      sse <- sum(res^2)  
    }
  }else{
    sse <- 99999
  }
  return(sse)
}

#Run optim to find span that gives min SSE, starting at 0.2
#Code used from http://r-statistics.co/Loess-Regression-With-R.html
#optim(par=c(0.2), calcSSE, method="SANN")
#[1] 0.44883

#$value
#[1] 0

#$counts
#function gradient 
#   10000       NA 

#$convergence
#[1] 0

Problem 4

From the simulation below, I would argue that the value of the span increases as the noise increases.

Additionally, the range of span will decrease as the noise increases.

Part A. Sigma = 0.1

A span of 0.1 to 0.5 would be a good range.

set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,0.1)
loess_a<-loess(y~x,span=.1,degree=2)
plot(x,y, main = "Sigma = 0.1, Span = 0.1")
par(new=T)
j = order(x)
points(x[j],loess_a$fitted[j],col="red",lwd=3,type="l")

set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,0.1)
loess_a<-loess(y~x,span=.3,degree=2)
plot(x,y, main = "Sigma = 0.1, Span = 0.3")
par(new=T)
j = order(x)
points(x[j],loess_a$fitted[j],col="red",lwd=3,type="l")

set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,0.1)
loess_a<-loess(y~x,span=.5,degree=2)
plot(x,y, main = "Sigma = 0.1, Span = 0.5")
par(new=T)
j = order(x)
points(x[j],loess_a$fitted[j],col="red",lwd=3,type="l")

# define function that returns the SSE
calcSSE <- function(z){
  loessMod <- try(loess_a(y ~ x, degree=2, span=z), silent=T)
  res <- try(loessMod$residuals, silent=T)
  if(class(res)!="try-error"){
    if((sum(res, na.rm=T) > 0)){
      sse <- sum(res^2)  
    }
  }else{
    sse <- 99999
  }
  return(sse)
}

#Run optim to find span that gives min SSE, starting at 0.1
#Code used from http://r-statistics.co/Loess-Regression-With-R.html
#optim(par=c(0.1), calcSSE, method="SANN")

Part B. Sigma = 0.5

A span of 0.2 to 0.6 would be a good range.

set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,0.5)
loess_b<-loess(y~x,span=.2,degree=2)
plot(x,y, main = "Sigma = 0.5, Span = 0.2")
par(new=T)
j = order(x)
points(x[j],loess_b$fitted[j],col="red",lwd=3,type="l")

set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,0.5)
loess_b<-loess(y~x,span=.48,degree=2)
plot(x,y, main = "Sigma = 0.5, Span = 0.48")
par(new=T)
j = order(x)
points(x[j],loess_b$fitted[j],col="red",lwd=3,type="l")

set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,0.5)
loess_b<-loess(y~x,span=.6,degree=2)
plot(x,y, main = "Sigma = 0.5, Span = 0.6")
par(new=T)
j = order(x)
points(x[j],loess_b$fitted[j],col="red",lwd=3,type="l")

# define function that returns the SSE
calcSSE <- function(z){
  loessMod <- try(loess(y ~ x, degree=2, span=z), silent=T)
  res <- try(loessMod$residuals, silent=T)
  if(class(res)!="try-error"){
    if((sum(res, na.rm=T) > 0)){
      sse <- sum(res^2)  
    }
  }else{
    sse <- 99999
  }
  return(sse)
}

#Run optim to find span that gives min SSE, starting at 0.2
#Code used from http://r-statistics.co/Loess-Regression-With-R.html
#optim(par=c(0.2), calcSSE, method="SANN")

Part C. Sigma = 1

A span of of 0.4 to 0.7 would be a good range.

set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,1)
loess_c<-loess(y~x,span=.4,degree=2)
plot(x,y, main = "Sigma = 1, Span = 0.4")
par(new=T)
j = order(x)
points(x[j],loess_c$fitted[j],col="red",lwd=3,type="l")

set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,1)
loess_c<-loess(y~x,span=.58,degree=2)
plot(x,y, main = "Sigma = 1, Span = 0.58")
par(new=T)
j = order(x)
points(x[j],loess_c$fitted[j],col="red",lwd=3,type="l")

set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,1)
loess_c<-loess(y~x,span=.7,degree=2)
plot(x,y, main = "Sigma = 1, Span = 0.7")
par(new=T)
j = order(x)
points(x[j],loess_c$fitted[j],col="red",lwd=3,type="l")

# define function that returns the SSE
calcSSE <- function(z){
  loessMod <- try(loess(y ~ x, degree=2, span=z), silent=T)
  res <- try(loessMod$residuals, silent=T)
  if(class(res)!="try-error"){
    if((sum(res, na.rm=T) > 0)){
      sse <- sum(res^2)  
    }
  }else{
    sse <- 99999
  }
  return(sse)
}

#Run optim to find span that gives min SSE, starting at 0.4
#Code used from http://r-statistics.co/Loess-Regression-With-R.html
#optim(par=c(0.4), calcSSE, method="SANN")

Part D. Sigma = 1.5

A span of 0.5 to 0.8 would be a good range.

set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,1.5)
loess_d<-loess(y~x,span=0.50,degree=2)
plot(x,y, main = "Sigma = 1.5, Span = 0.5")
par(new=T)
j = order(x)
points(x[j],loess_d$fitted[j],col="red",lwd=3,type="l")

set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,1.5)
loess_d<-loess(y~x,span=0.72,degree=2)
plot(x,y, main = "Sigma = 1.5, Span = 0.72")
par(new=T)
j = order(x)
points(x[j],loess_d$fitted[j],col="red",lwd=3,type="l")

set.seed(1234)
x <- runif(100,0,10)
y <- sin(x)+rnorm(100,0,1.5)
loess_d<-loess(y~x,span=0.8,degree=2)
plot(x,y, main = "Sigma = 1.5, Span = 0.8")
par(new=T)
j = order(x)
points(x[j],loess_d$fitted[j],col="red",lwd=3,type="l")

# define function that returns the SSE
calcSSE <- function(z){
  loessMod <- try(loess(y ~ x, degree=2, span=z), silent=T)
  res <- try(loessMod$residuals, silent=T)
  if(class(res)!="try-error"){
    if((sum(res, na.rm=T) > 0)){
      sse <- sum(res^2)  
    }
  }else{
    sse <- 99999
  }
  return(sse)
}

#Run optim to find span that gives min SSE, starting at 0.5
#Code used from http://r-statistics.co/Loess-Regression-With-R.html
#optim(par=c(0.5), calcSSE, method="SANN")

STAT 488 HW 6 Non Parametric

Kajal Chokshi

4/13/2019