Model Diagnostics and Polynominal Regression

library(tidyverse)
## ── Attaching packages ───────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
data(quakes)
mod = lm(stations~mag, data=quakes)
summary(mod)
## 
## Call:
## lm(formula = stations ~ mag, data = quakes)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -48.871  -7.102  -0.474   6.783  50.244 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -180.4243     4.1899  -43.06   <2e-16 ***
## mag           46.2822     0.9034   51.23   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.5 on 998 degrees of freedom
## Multiple R-squared:  0.7245, Adjusted R-squared:  0.7242 
## F-statistic:  2625 on 1 and 998 DF,  p-value: < 2.2e-16
anova(mod)
## Analysis of Variance Table
## 
## Response: stations
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## mag         1 347148  347148  2624.7 < 2.2e-16 ***
## Residuals 998 132000     132                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
names(mod)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"
head(mod$residuals)
##           1           2           3           4           5           6 
##  -0.7302851   1.0390414 -26.4996115   9.6672625   6.2954836   7.2954836
res<-quakes$stations-mod$fitted
head(res)
##           1           2           3           4           5           6 
##  -0.7302851   1.0390414 -26.4996115   9.6672625   6.2954836   7.2954836
mod_df<-data.frame(residuals=mod$residuals,
                   fitted=mod$fitted.values)

#residual plot

ggplot(mod_df, aes(fitted, residuals))+
  geom_point()+
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

QQplot

qqnorm(mod$residuals)

res vs leverage

  plot(mod)

####Mini Quiz 5 SLR and ANOVA

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
data(Boston)

head(Boston)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
ggplot(Boston, aes(x = lstat, y = rm))+
  geom_point()+
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

summary(Boston)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08204   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          black       
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv      
##  Min.   : 1.73   Min.   : 5.00  
##  1st Qu.: 6.95   1st Qu.:17.02  
##  Median :11.36   Median :21.20  
##  Mean   :12.65   Mean   :22.53  
##  3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :37.97   Max.   :50.00
  library(MASS)
data(Boston)

mod10=lm(medv~poly(lstat, 10), data=Boston)
summary(mod10)
## 
## Call:
## lm(formula = medv ~ poly(lstat, 10), data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.5340  -3.0286  -0.7507   2.0437  26.4738 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         22.5328     0.2311  97.488  < 2e-16 ***
## poly(lstat, 10)1  -152.4595     5.1993 -29.323  < 2e-16 ***
## poly(lstat, 10)2    64.2272     5.1993  12.353  < 2e-16 ***
## poly(lstat, 10)3   -27.0511     5.1993  -5.203 2.88e-07 ***
## poly(lstat, 10)4    25.4517     5.1993   4.895 1.33e-06 ***
## poly(lstat, 10)5   -19.2524     5.1993  -3.703 0.000237 ***
## poly(lstat, 10)6     6.5088     5.1993   1.252 0.211211    
## poly(lstat, 10)7     1.9416     5.1993   0.373 0.708977    
## poly(lstat, 10)8    -6.7299     5.1993  -1.294 0.196133    
## poly(lstat, 10)9     8.4168     5.1993   1.619 0.106116    
## poly(lstat, 10)10   -7.3351     5.1993  -1.411 0.158930    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.199 on 495 degrees of freedom
## Multiple R-squared:  0.6867, Adjusted R-squared:  0.6804 
## F-statistic: 108.5 on 10 and 495 DF,  p-value: < 2.2e-16

###testing vs training

#size of the split #I am going to choose 70 30

dim(Boston)#506
## [1] 506  14
ceiling(506*.3) #152 obs in test 
## [1] 152

#create a partition test(1) and train(0)

set.seed(1)
Boston$part<-rep(0, 506)
test<-sample(1:506, 152, replace=FALSE)
Boston$part[test]<-1

library(tidyverse)

#####train df

traindf<-Boston%>%
  filter(part==0)

#####test df

testdf<-Boston%>%
  filter(part==1)

ggplot(data = Boston, aes(x = lstat, y = medv, color=as.factor(part)))+
  geom_point()+
  facet_grid(.~part)##partition= separate into two windows graphs 

names(Boston)
##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"   
## [15] "part"

#train model

trainLM<-lm(medv~lstat, traindf)
anova(trainLM)#create anova table
## Analysis of Variance Table
## 
## Response: medv
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## lstat       1  16434 16433.6  424.12 < 2.2e-16 ***
## Residuals 352  13639    38.7                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

###console #Response: medv # Df Sum Sq Mean Sq F value Pr(>F)
#lstat 1 17634 17633.8 432.54 < 2.2e-16 *** #Residuals 352 14350 40.8 <-this is your training MSE
#— # Signif. codes: 0 ‘’ 0.001 ’’ 0.01 ’’ 0.05 ‘.’ 0.1 ’ ’ 1

#find the test MSE

testFit<-predict(trainLM, testdf)
testRSS<- sum((testdf$medv-testFit)^2)
testRSS
## [1] 5848.702
testMSE<-mean((testdf$medv-testFit)^2)
testMSE
## [1] 38.4783