library(MASS)
library(tidyverse)
## ── Attaching packages ────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ───────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x dplyr::select() masks MASS::select()
data(Boston)
summary(Boston)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08204   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          black       
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv      
##  Min.   : 1.73   Min.   : 5.00  
##  1st Qu.: 6.95   1st Qu.:17.02  
##  Median :11.36   Median :21.20  
##  Mean   :12.65   Mean   :22.53  
##  3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :37.97   Max.   :50.00
dim(Boston)
## [1] 506  14

##Sice of sprit

ceiling(506*.3)
## [1] 152
Boston$part<-rep(0, 506)

set.seed(1)
test<-sample(1:506, 152, replace=FALSE)
Boston$part[test]<-1

ggplot(Boston, aes(x=lstat, y=medv, color=as.factor(part)))+
  geom_point()

mod10=lm(medv~poly(lstat, 10), data=Boston)
summary(mod10)
## 
## Call:
## lm(formula = medv ~ poly(lstat, 10), data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.5340  -3.0286  -0.7507   2.0437  26.4738 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         22.5328     0.2311  97.488  < 2e-16 ***
## poly(lstat, 10)1  -152.4595     5.1993 -29.323  < 2e-16 ***
## poly(lstat, 10)2    64.2272     5.1993  12.353  < 2e-16 ***
## poly(lstat, 10)3   -27.0511     5.1993  -5.203 2.88e-07 ***
## poly(lstat, 10)4    25.4517     5.1993   4.895 1.33e-06 ***
## poly(lstat, 10)5   -19.2524     5.1993  -3.703 0.000237 ***
## poly(lstat, 10)6     6.5088     5.1993   1.252 0.211211    
## poly(lstat, 10)7     1.9416     5.1993   0.373 0.708977    
## poly(lstat, 10)8    -6.7299     5.1993  -1.294 0.196133    
## poly(lstat, 10)9     8.4168     5.1993   1.619 0.106116    
## poly(lstat, 10)10   -7.3351     5.1993  -1.411 0.158930    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.199 on 495 degrees of freedom
## Multiple R-squared:  0.6867, Adjusted R-squared:  0.6804 
## F-statistic: 108.5 on 10 and 495 DF,  p-value: < 2.2e-16
ggplot(Boston, aes(x=lstat, y=medv, color=as.factor(part)))+
  geom_point()+
  facet_grid(.~part)

TRAIN DATAFRAME

traindf<-Boston%>%
  filter(part==0)

testdf<-Boston%>%
  filter(part==1)

TRAIN MODEL

trainLM<-lm(medv~lstat, traindf)
anova(trainLM)
## Analysis of Variance Table
## 
## Response: medv
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## lstat       1  16434 16433.6  424.12 < 2.2e-16 ***
## Residuals 352  13639    38.7                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

TEST MSE

testFit<-predict(trainLM, testdf)
testRSS<-sum((testdf$medv-testFit)^2)
testRSS
## [1] 5848.702
testMSE<-mean((testdf$medv-testFit)^2)
testMSE
## [1] 38.4783

TEST DIFFERENT POLY DEGREES

train<-sample(506, 354)

degreePoly<-10
polyMSE<-matrix(nrow=degreePoly, ncol=2)
colnames(polyMSE)<-c("degree", "MSE")
for(i in 1:degreePoly){ 
  polyMSE[i,1]<-i
  this.fit<-lm(medv~poly(lstat,i), data=Boston, subset=train)
  polyMSE[i,2]<-mean((Boston$medv-predict(this.fit, Boston))[-train]^2)
}

polyDF<-as.data.frame(polyMSE)
head(polyDF)
##   degree      MSE
## 1      1 49.75812
## 2      2 37.93550
## 3      3 38.24952
## 4      4 36.80820
## 5      5 36.21784
## 6      6 36.11516
ggplot(data=polyDF, aes(x=degree, y=MSE))+
  geom_point()+
  geom_line()

MULTIPLE RANDOM SPLITS

degreePoly<-10
splits<-5

splitMat<-matrix(nrow=degreePoly*splits, ncol=3)
colnames(splitMat)<-c("run","MSE", "degree")

for(i in 1:splits){
  a=(i-1)*degreePoly+1
  b=i*degreePoly
  
  set.seed(i*10)
  splitMat[a:b,1]<-i
  train<-sample(506, 354)
  
  for(j in 1:degreePoly){ 
    c=a+(j-1)
    
    this.fit<-lm(medv~poly(lstat,j), data=Boston, subset=train)
    splitMat[c,2]<-mean((Boston$medv-predict(this.fit, Boston))[-train]^2)
    splitMat[c,3]<-j
  }
}
splitDF<-as.data.frame(splitMat)
head(splitDF)
##   run      MSE degree
## 1   1 38.56181      1
## 2   1 29.22841      2
## 3   1 26.82882      3
## 4   1 27.18187      4
## 5   1 27.54487      5
## 6   1 28.41189      6
ggplot(data=splitDF, aes(x=degree, y=MSE, color=as.factor(run)))+
  geom_point()+
  geom_line()