Overfitting R-code

library(readr)
# ---- Violent-crime series: full data and three eras ----------------------
# Read the crime table and plot ViolentCrimes against Year for every row.
crime <- read_csv("~/Senior Project/crime.csv")
plot(ViolentCrimes ~ Year, data = crime, pch = 19)

# Each era below is plotted on its own with a least-squares line on top.
# subset() is base R, so no pipe or dplyr::filter() is needed; rows whose
# condition evaluates to NA are dropped.
# NOTE(review): 1992 falls in both era 1 and era 2, and (for whole-number
# years) 2014 falls in both era 2 and era 3 -- confirm the overlap is wanted.

# Era 1: Year <= 1992
crime1 <- subset(crime, Year <= 1992)
plot(ViolentCrimes ~ Year, data = crime1, pch = 19)
abline(lm(ViolentCrimes ~ Year, data = crime1))

# Era 2: 1992 <= Year <= 2014
crime2 <- subset(crime, Year >= 1992 & Year <= 2014)
plot(ViolentCrimes ~ Year, data = crime2, pch = 19)
abline(lm(ViolentCrimes ~ Year, data = crime2))

# Era 3: Year > 2013
crime3 <- subset(crime, Year > 2013)
plot(ViolentCrimes ~ Year, data = crime3, pch = 19)
abline(lm(ViolentCrimes ~ Year, data = crime3))

# ---- Crime data: degree-1, degree-3 and degree-10 fits on one plot --------
y <- crime$ViolentCrimes
x <- crime$Year

# Raw powers of Year. Only xsq and xcub enter a model in this section.
xsq <- x^2
xcub <- x^3
xquar <- x^4
x5 <- x^5
x6 <- x^6
x7 <- x^7
x8 <- x^8
x9 <- x^9
x10 <- x^10

plot(x, y, pch = 19, xlab = "Year", ylab = "Violent Crimes per 100k People")

# Degree 1: straight line.
fit1 <- lm(y ~ x)
abline(fit1, col = "red")

# Grid of x values (step 1) on which the fitted curves are drawn.
xv <- seq(min(x), max(x), 1)

# Degree 3, built from explicit power terms; newdata must therefore supply
# every term under the name used in the formula.
fit3 <- lm(y ~ x + xsq + xcub)
yv <- predict(fit3, list(x = xv, xsq = xv^2, xcub = xv^3))
lines(xv, yv, col = "blue")

# Degree 10 via poly(). The only variable in this formula is x, so newdata
# needs x alone.
fit10 <- lm(y ~ poly(x, 10))
yv <- predict(fit10, list(x = xv))
lines(xv, yv, col = "black")

# ---- Auto data: mpg modelled as a polynomial in horsepower ----------------
# NOTE(review): `Auto` is not created anywhere in this script; presumably it
# is the ISLR data set and has to be attached before this point -- confirm.
y <- Auto$mpg
x <- Auto$horsepower

# Raw powers of horsepower, x^2 through x^10.
xsq <- x^2
xcub <- x^3
xquar <- x^4
x5 <- x^5
x6 <- x^6
x7 <- x^7
x8 <- x^8
x9 <- x^9
x10 <- x^10

plot(x, y, pch = 20, xlab = "Horsepower", ylab = "Miles Per Gallon")

# Degree 1: straight-line fit and its summary.
fit1 <- lm(y ~ x)
summary(fit1)

## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.5710  -3.2592  -0.3435   2.7630  16.9240 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 39.935861   0.717499   55.66   <2e-16 ***
## x           -0.157845   0.006446  -24.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared:  0.6059, Adjusted R-squared:  0.6049 
## F-statistic: 599.7 on 1 and 390 DF,  p-value: < 2.2e-16

# Overlay the straight-line fit, then fit and summarise the quadratic model.
abline(fit1, lwd = 4, col = "red")
fit2 <- lm(y ~ x + xsq)
summary(fit2)

## 
## Call:
## lm(formula = y ~ x + xsq)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.7135  -2.5943  -0.0859   2.2868  15.8961 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 56.9000997  1.8004268   31.60   <2e-16 ***
## x           -0.4661896  0.0311246  -14.98   <2e-16 ***
## xsq          0.0012305  0.0001221   10.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared:  0.6876, Adjusted R-squared:  0.686 
## F-statistic:   428 on 2 and 389 DF,  p-value: < 2.2e-16

# Draw the quadratic curve on a grid of x values spaced 1 apart.
xv <- seq(from = min(x), to = max(x), by = 1)
yv <- predict(fit2, newdata = list(x = xv, xsq = xv^2))
lines(xv, yv, lwd = 4, col = "green")

# Cubic model built from explicit power terms, and its summary.
fit3 <- lm(y ~ x + xsq + xcub)
summary(fit3)

## 
## Call:
## lm(formula = y ~ x + xsq + xcub)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.7039  -2.4491  -0.1519   2.2035  15.8159 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.068e+01  4.563e+00  13.298  < 2e-16 ***
## x           -5.689e-01  1.179e-01  -4.824 2.03e-06 ***
## xsq          2.079e-03  9.479e-04   2.193   0.0289 *  
## xcub        -2.147e-06  2.378e-06  -0.903   0.3673    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.375 on 388 degrees of freedom
## Multiple R-squared:  0.6882, Adjusted R-squared:  0.6858 
## F-statistic: 285.5 on 3 and 388 DF,  p-value: < 2.2e-16

# Draw the cubic curve on a grid of x values spaced 1 apart.
xv <- seq(from = min(x), to = max(x), by = 1)
yv <- predict(fit3, newdata = list(x = xv, xsq = xv^2, xcub = xv^3))
lines(xv, yv, lwd = 4, col = "blue")

# Degree-4 model via poly(); report only its R-squared.
fit4 <- lm(y ~ poly(x, 4))
summary(fit4)[["r.squared"]]

## [1] 0.6893436

# Draw the degree-4 curve. fit4 was specified as poly(x, 4), so x is the only
# variable newdata has to supply.
xv <- seq(min(x), max(x), 1)
yv <- predict(fit4, list(x = xv))
lines(xv, yv, col = "purple", lwd = 4)

# Degree-5 model via poly(); report only its R-squared.
fit5 <- lm(y ~ poly(x, 5))
summary(fit5)$r.squared

## [1] 0.696739

# Draw the degree-5 curve. fit5 was specified as poly(x, 5), so x is the only
# variable newdata has to supply.
xv <- seq(min(x), max(x), 1)
yv <- predict(fit5, list(x = xv))
lines(xv, yv, col = "pink", lwd = 4)

# Degree-17 model via poly(), with its full summary.
fit17 <- lm(y ~ poly(x, 17))
summary(fit17)

## 
## Call:
## lm(formula = y ~ poly(x, 17))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.7840  -2.4482  -0.1369   2.3586  15.1625 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     23.4459     0.2113 110.967  < 2e-16 ***
## poly(x, 17)1  -120.1377     4.1833 -28.719  < 2e-16 ***
## poly(x, 17)2    44.0895     4.1833  10.540  < 2e-16 ***
## poly(x, 17)3    -3.9488     4.1833  -0.944 0.345797    
## poly(x, 17)4    -5.1878     4.1833  -1.240 0.215702    
## poly(x, 17)5    13.2722     4.1833   3.173 0.001635 ** 
## poly(x, 17)6    -8.5462     4.1833  -2.043 0.041757 *  
## poly(x, 17)7     7.9806     4.1833   1.908 0.057190 .  
## poly(x, 17)8     2.1727     4.1833   0.519 0.603800    
## poly(x, 17)9    -3.9182     4.1833  -0.937 0.349549    
## poly(x, 17)10   -2.6146     4.1833  -0.625 0.532346    
## poly(x, 17)11    3.5636     4.1833   0.852 0.394834    
## poly(x, 17)12    1.1451     4.1833   0.274 0.784446    
## poly(x, 17)13    0.6041     4.1833   0.144 0.885262    
## poly(x, 17)14   -3.8267     4.1833  -0.915 0.360908    
## poly(x, 17)15   13.4922     4.1833   3.225 0.001369 ** 
## poly(x, 17)16  -14.5099     4.1833  -3.469 0.000584 ***
## poly(x, 17)17    9.6578     4.1833   2.309 0.021506 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.183 on 374 degrees of freedom
## Multiple R-squared:  0.7252, Adjusted R-squared:  0.7127 
## F-statistic: 58.07 on 17 and 374 DF,  p-value: < 2.2e-16

# Draw the degree-17 curve. fit17 was specified as poly(x, 17), so x is the
# only variable newdata has to supply.
xv <- seq(min(x), max(x), 1)
yv <- predict(fit17, list(x = xv))
lines(xv, yv, col = "black", lwd = 4)

library(caTools)

# Reproducible random split of Auto: each row is assigned to the training set
# with probability 0.75 and to the test set otherwise.
# The mask is called in_train (not `sample`) so that no data object shadows
# the base function sample().
set.seed(1000)
in_train <- sample(
  c(TRUE, FALSE),
  nrow(Auto),
  replace = TRUE,
  prob = c(0.75, 0.25)
)
train <- Auto[in_train, ]
test <- Auto[!in_train, ]

# Hold-out comparison: each model is fitted on `train`, used to predict mpg
# for `test`, and scored by the mean squared prediction error on `test`.
# NOTE(review): degree 6 is not evaluated in this sequence -- confirm whether
# that omission is intentional.

# Degree 1
testModel <- lm(mpg ~ horsepower, data = train)
pred.test <- predict(testModel, test)
mean((pred.test - test$mpg)^2)

## [1] 24.66337

# Degree 2
testModel2 <- lm(mpg ~ poly(horsepower, 2), data = train)
pred.test2 <- predict(testModel2, test)
mean((pred.test2 - test$mpg)^2)

## [1] 20.52966

# Degree 3
testModel3 <- lm(mpg ~ poly(horsepower, 3), data = train)
pred.test3 <- predict(testModel3, test)
mean((pred.test3 - test$mpg)^2)

## [1] 20.59906

# Degree 4
testModel4 <- lm(mpg ~ poly(horsepower, 4), data = train)
pred.test4 <- predict(testModel4, test)
mean((pred.test4 - test$mpg)^2)

## [1] 20.45034

# Degree 5
testModel5 <- lm(mpg ~ poly(horsepower, 5), data = train)
pred.test5 <- predict(testModel5, test)
mean((pred.test5 - test$mpg)^2)

## [1] 19.85817

# Degree 7
testModel7 <- lm(mpg ~ poly(horsepower, 7), data = train)
pred.test7 <- predict(testModel7, test)
mean((pred.test7 - test$mpg)^2)

## [1] 20.42431

# Degree 8
testModel8 <- lm(mpg ~ poly(horsepower, 8), data = train)
pred.test8 <- predict(testModel8, test)
mean((pred.test8 - test$mpg)^2)

## [1] 20.74518

# Degree 9
testModel9 <- lm(mpg ~ poly(horsepower, 9), data = train)
pred.test9 <- predict(testModel9, test)
mean((pred.test9 - test$mpg)^2)

## [1] 20.88395

# Degree 10
testModel10 <- lm(mpg ~ poly(horsepower, 10), data = train)
pred.test10 <- predict(testModel10, test)
mean((pred.test10 - test$mpg)^2)

## [1] 20.93814

Overfitting R-code

Sam Fix

2023-02-06