# Unit 4, Recitation

# VIDEO 2

# Read in data
# The data folder defaults to the original author's directory and can be
# overridden with the BOSTON_DATA_DIR environment variable. The file is read
# through an explicit path, so the session's working directory is left alone.
data_dir <- Sys.getenv("BOSTON_DATA_DIR", unset = NA)
if (is.na(data_dir) || !nzchar(data_dir)) {
  data_dir <- "C:/Users/daria.alekseeva/Documents/Edx/Trees"
}
boston <- read.csv(file.path(data_dir, "boston.csv"))
str(boston)
## 'data.frame':    506 obs. of  16 variables:
##  $ TOWN   : Factor w/ 92 levels "Arlington","Ashland",..: 54 77 77 46 46 46 69 69 69 69 ...
##  $ TRACT  : int  2011 2021 2022 2031 2032 2033 2041 2042 2043 2044 ...
##  $ LON    : num  -71 -71 -70.9 -70.9 -70.9 ...
##  $ LAT    : num  42.3 42.3 42.3 42.3 42.3 ...
##  $ MEDV   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 22.1 16.5 18.9 ...
##  $ CRIM   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ ZN     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ INDUS  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ CHAS   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ NOX    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ RM     : num  6.58 6.42 7.18 7 7.15 ...
##  $ AGE    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ DIS    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ RAD    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ TAX    : int  296 242 242 222 222 222 311 311 311 311 ...
##  $ PTRATIO: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
# Plot observations: one point per census tract, longitude against latitude
plot(x = boston$LON, y = boston$LAT)

# Tracts alongside the Charles River (CHAS == 1), drawn as solid blue dots
with(
  boston,
  points(LON[CHAS == 1], LAT[CHAS == 1], col = "blue", pch = 19)
)

# Plot MIT (census tract 3531) in red
with(
  boston,
  points(LON[TRACT == 3531], LAT[TRACT == 3531], col = "red", pch = 20)
)

# Plot pollution: tracts with NOX of at least 0.55 (close to the mean shown
# in the summary output) are drawn in green
summary(boston$NOX)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3850  0.4490  0.5380  0.5547  0.6240  0.8710
with(
  boston,
  points(LON[NOX >= 0.55], LAT[NOX >= 0.55], col = "green", pch = 20)
)

# Plot prices: redraw the map, then mark in red every tract whose MEDV is at
# or above 21.2 (the median reported by the summary)
plot(x = boston$LON, y = boston$LAT)
summary(boston$MEDV)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00   17.02   21.20   22.53   25.00   50.00
with(
  boston,
  points(LON[MEDV >= 21.2], LAT[MEDV >= 21.2], col = "red", pch = 20)
)

# VIDEO 3

# Linear Regression using LAT and LON
# Scatter MEDV against each coordinate separately before fitting
plot(x = boston$LAT, y = boston$MEDV)

plot(x = boston$LON, y = boston$MEDV)

# Fit MEDV on both coordinates and print the coefficient table
latlonlm <- lm(formula = MEDV ~ LAT + LON, data = boston)
summary(latlonlm)
## 
## Call:
## lm(formula = MEDV ~ LAT + LON, data = boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.460  -5.590  -1.299   3.695  28.129 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3178.472    484.937  -6.554 1.39e-10 ***
## LAT             8.046      6.327   1.272    0.204    
## LON           -40.268      5.184  -7.768 4.50e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.693 on 503 degrees of freedom
## Multiple R-squared:  0.1072, Adjusted R-squared:  0.1036 
## F-statistic: 30.19 on 2 and 503 DF,  p-value: 4.159e-13
# Visualize regression output: map with the above-median tracts in red
plot(x = boston$LON, y = boston$LAT)
with(
  boston,
  points(LON[MEDV >= 21.2], LAT[MEDV >= 21.2], col = "red", pch = 20)
)

# Print the fitted value the linear model assigns to every tract
latlonlm[["fitted.values"]]
##        1        2        3        4        5        6        7        8 
## 18.75633 18.81648 18.21651 17.97483 17.77344 17.60024 18.32916 18.49416 
##        9       10       11       12       13       14       15       16 
## 18.32904 18.20015 18.44176 18.81222 19.00560 19.43658 19.69836 19.93589 
##       17       18       19       20       21       22       23       24 
## 20.39492 19.92388 20.48766 20.26703 20.08099 20.05277 19.85547 19.67426 
##       25       26       27       28       29       30       31       32 
## 19.54619 19.35208 19.20306 19.16685 19.03801 18.78031 19.43265 19.29173 
##       33       34       35       36       37       38       39       40 
## 19.61388 19.91187 19.79915 20.68910 21.04745 20.98293 21.63527 21.55856 
##       41       42       43       44       45       46       47       48 
## 22.33163 21.37316 21.24036 20.21512 19.75853 19.88344 19.58142 19.25924 
##       49       50       51       52       53       54       55       56 
## 19.25115 19.18508 19.23088 19.74790 19.60931 20.38652 22.21046 20.07213 
##       57       58       59       60       61       62       63       64 
## 18.70708 18.66683 18.82408 18.77184 18.40938 18.33295 18.02687 17.51943 
##       65       66       67       68       69       70       71       72 
## 15.65495 23.10462 24.13141 24.84590 25.60131 25.61752 25.93170 25.34631 
##       73       74       75       76       77       78       79       80 
## 26.20561 25.81913 25.29164 24.85675 24.42188 23.90640 24.53454 24.63514 
##       81       82       83       84       85       86       87       88 
## 23.75323 23.82969 23.85779 23.38269 22.72237 22.94390 22.29957 22.40842 
##       89       90       91       92       93       94       95       96 
## 22.45762 22.14675 21.91325 22.41258 22.92793 23.39494 23.27825 23.80985 
##       97       98       99      100      101      102      103      104 
## 24.17226 24.28506 24.63140 23.89449 23.25432 23.74970 23.81005 23.62081 
##      105      106      107      108      109      110      111      112 
## 23.29062 23.13361 22.97737 22.72687 22.74697 22.97647 22.86770 22.52944 
##      113      114      115      116      117      118      119      120 
## 22.52548 22.31604 22.13247 21.99392 22.06474 21.77238 21.74020 21.37777 
##      121      122      123      124      125      126      127      128 
## 21.49303 21.78858 22.02217 21.96582 21.72822 21.52770 21.78464 22.54575 
##      129      130      131      132      133      134      135      136 
## 22.74307 22.99111 23.22626 23.46786 23.45175 23.66916 23.51215 23.43729 
##      137      138      139      140      141      142      143      144 
## 23.12160 22.92429 22.84380 22.65451 22.46283 22.52568 22.23739 22.40088 
##      145      146      147      148      149      150      151      152 
## 22.34452 22.45326 22.67069 22.53941 22.60221 22.64652 22.81242 22.79553 
##      153      154      155      156      157      158      159      160 
## 22.63851 22.84389 22.92200 22.98483 22.97271 23.13372 23.02100 22.92276 
##      161      162      163      164      165      166      167      168 
## 23.11686 23.31252 23.42771 23.60892 24.03575 23.66526 23.47196 23.89072 
##      169      170      171      172      173      174      175      176 
## 23.37207 23.52911 23.68372 23.62894 23.96395 23.93091 23.86243 24.37376 
##      177      178      179      180      181      182      183      184 
## 25.09862 24.80630 24.38995 24.48101 24.41018 24.24272 24.40379 24.65586 
##      185      186      187      188      189      190      191      192 
## 24.87733 25.13904 24.97145 25.49323 26.23821 26.48860 25.58575 26.72127 
##      193      194      195      196      197      198      199      200 
## 26.15347 27.07142 27.53046 28.15904 29.56023 30.37365 29.78157 30.29335 
##      201      202      203      204      205      206      207      208 
## 30.89753 28.88834 28.88817 28.39678 28.18598 26.42353 26.41397 26.34152 
##      209      210      211      212      213      214      215      216 
## 26.35525 26.05888 26.04683 25.85673 25.70288 25.94843 25.52159 25.36218 
##      217      218      219      220      221      222      223      224 
## 25.00625 24.54316 24.05995 24.54320 24.66565 25.13194 25.26726 25.13756 
##      225      226      227      228      229      230      231      232 
## 24.60370 24.18094 25.03862 24.75439 24.43473 24.76255 25.26424 25.24407 
##      233      234      235      236      237      238      239      240 
## 25.72003 25.55405 25.55558 25.79638 26.19426 26.15406 28.30049 28.21593 
##      241      242      243      244      245      246      247      248 
## 27.75302 28.44966 28.80955 29.24691 29.63356 30.18767 30.42681 29.83324 
##      249      250      251      252      253      254      255      256 
## 30.20688 29.95153 29.63737 29.91678 30.88568 31.20960 31.44166 30.54133 
##      257      258      259      260      261      262      263      264 
## 28.66323 22.91885 23.11536 23.26677 23.37955 23.32319 23.48831 23.12993 
##      265      266      267      268      269      270      271      272 
## 23.00909 22.94468 23.01719 23.48836 23.91107 23.70209 23.15454 23.39217 
##      273      274      275      276      277      278      279      280 
## 23.63371 24.44305 25.11943 25.46811 25.48752 25.92879 25.59288 25.98907 
##      281      282      283      284      285      286      287      288 
## 26.70586 27.45326 27.06033 26.66993 26.65411 27.91862 26.91216 25.10000 
##      289      290      291      292      293      294      295      296 
## 24.60860 25.54682 25.08756 25.00528 23.87704 23.79102 24.24758 24.72601 
##      297      298      299      300      301      302      303      304 
## 24.56256 24.21390 23.93226 23.01827 23.27587 22.80456 21.85417 22.96544 
##      305      306      307      308      309      310      311      312 
## 21.55585 22.03901 21.60809 20.90341 20.32355 20.61345 20.57708 20.25089 
##      313      314      315      316      317      318      319      320 
## 20.46440 20.11653 19.74773 18.96246 19.22830 19.82432 20.12628 20.53701 
##      321      322      323      324      325      326      327      328 
## 19.89691 19.49419 19.21390 18.95456 19.11170 19.37440 19.44197 19.89697 
##      329      330      331      332      333      334      335      336 
## 20.53328 21.11313 20.31192 19.67577 19.07574 18.49177 17.91995 18.23393 
##      337      338      339      340      341      342      343      344 
## 18.58014 17.93585 18.14436 18.36659 18.31661 15.16523 16.38123 17.16657 
##      345      346      347      348      349      350      351      352 
## 16.78823 16.96581 17.17110 16.23690 16.11270 13.72375 12.71715 12.29463 
##      353      354      355      356      357      358      359      360 
## 11.34041 12.06130 13.01563 12.81849 23.60656 24.04794 24.26138 23.92718 
##      361      362      363      364      365      366      367      368 
## 23.79674 23.68957 23.45200 23.72980 22.58058 22.58221 22.34222 22.22785 
##      369      370      371      372      373      374      375      376 
## 22.14326 22.22781 21.94108 21.93382 21.87098 21.65754 21.66964 21.79684 
##      377      378      379      380      381      382      383      384 
## 21.70421 21.86931 21.93617 21.95789 22.21961 22.01023 21.28298 21.20890 
##      385      386      387      388      389      390      391      392 
## 21.30154 21.24358 21.22987 21.19120 21.11629 21.02849 20.80619 20.53234 
##      393      394      395      396      397      398      399      400 
## 21.09620 20.80642 20.92563 21.00858 21.16724 21.15191 21.12448 21.48050 
##      401      402      403      404      405      406      407      408 
## 21.44749 21.34601 21.32429 21.44671 21.51596 21.46034 21.85567 21.91777 
##      409      410      411      412      413      414      415      416 
## 21.92585 22.00638 22.04664 22.12558 22.04264 21.95407 21.80510 21.92029 
##      417      418      419      420      421      422      423      424 
## 22.01853 22.13689 22.23113 22.48482 22.60643 22.75782 22.77960 22.57667 
##      425      426      427      428      429      430      431      432 
## 22.43736 22.42283 22.32218 22.27706 22.17560 22.01454 22.24007 22.06290 
##      433      434      435      436      437      438      439      440 
## 22.10238 21.97033 21.91152 21.87123 21.87201 21.85105 21.86795 21.56755 
##      441      442      443      444      445      446      447      448 
## 21.35816 21.39446 21.40651 21.56114 21.69239 21.74879 21.59015 21.48709 
##      449      450      451      452      453      454      455      456 
## 21.59180 21.72467 21.82537 21.61196 21.32605 21.55157 21.77144 22.00257 
##      457      458      459      460      461      462      463      464 
## 21.99455 21.95834 21.78922 21.60801 21.63859 21.33012 21.12881 21.30601 
##      465      466      467      468      469      470      471      472 
## 21.64832 22.05905 22.08721 22.60660 22.68721 22.68315 22.84829 23.20505 
##      473      474      475      476      477      478      479      480 
## 23.27911 22.94481 22.20387 22.08625 22.30451 22.18770 22.30287 22.77962 
##      481      482      483      484      485      486      487      488 
## 23.97576 23.73415 23.48051 23.73667 22.99336 22.70752 22.64302 22.42955 
##      489      490      491      492      493      494      495      496 
## 21.16374 21.31355 21.40855 21.07754 21.68151 21.00096 21.03154 20.98882 
##      497      498      499      500      501      502      503      504 
## 20.58857 20.31154 20.69332 20.41146 20.10948 19.81316 19.98473 20.12569 
##      505      506 
## 19.81563 19.59015
# Overlay a blue "$" on each tract whose fitted value is at least 21.2
with(
  boston,
  points(
    LON[latlonlm$fitted.values >= 21.2],
    LAT[latlonlm$fitted.values >= 21.2],
    col = "blue", pch = "$"
  )
)

# Video 4

# Load CART packages
library(rpart)
library(rpart.plot)

# CART model: regression tree for MEDV using only the two coordinates,
# then draw the fitted tree
latlontree <- rpart(formula = MEDV ~ LAT + LON, data = boston)
prp(latlontree)

# Visualize output: map with the above-median tracts in red
plot(boston$LON, boston$LAT)
above_median <- boston$MEDV >= 21.2
points(boston$LON[above_median], boston$LAT[above_median], col = "red", pch = 20)

# Overlay a blue "$" on each tract the tree predicts at or above 21.2.
# One mask is built and used for both coordinates: comparing separately per
# axis (e.g. `>` for LON and `>=` for LAT) can select different rows and pair
# a longitude with the wrong latitude.
fittedvalues <- predict(latlontree)
predicted_high <- fittedvalues >= 21.2
points(
  boston$LON[predicted_high], boston$LAT[predicted_high],
  col = "blue", pch = "$"
)

# Simplify tree by increasing minbucket to 50, then draw the smaller tree
# with the base plot/text methods
latlontree <- rpart(
  formula = MEDV ~ LAT + LON,
  data = boston,
  minbucket = 50
)
plot(latlontree)
text(latlontree)

# Visualize Output: draw one vertical and two horizontal reference lines on
# the map (presumably split points read off the tree above; verify against
# the plotted tree), then mark the above-median tracts in red
plot(x = boston$LON, y = boston$LAT)
abline(v = -71.07)
abline(h = c(42.21, 42.17))
with(
  boston,
  points(LON[MEDV >= 21.2], LAT[MEDV >= 21.2], col = "red", pch = 20)
)

# VIDEO 5

# Let's use all the variables

# Split the data
library(caTools)
# Fix the seed so the 70/30 split is reproducible
set.seed(123)
split <- sample.split(boston$MEDV, SplitRatio = 0.7)
# Rows flagged TRUE go to training, the remainder to testing
train <- subset(boston, split)
test <- subset(boston, !split)

# Create linear regression on the training set using all the predictors
linreg <- lm(
  formula = MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + RM + AGE +
    DIS + RAD + TAX + PTRATIO,
  data = train
)
summary(linreg)
## 
## Call:
## lm(formula = MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + 
##     RM + AGE + DIS + RAD + TAX + PTRATIO, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.511  -2.712  -0.676   1.793  36.883 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.523e+02  4.367e+02  -0.578   0.5638    
## LAT          1.544e+00  5.192e+00   0.297   0.7664    
## LON         -2.987e+00  4.786e+00  -0.624   0.5329    
## CRIM        -1.808e-01  4.390e-02  -4.118 4.77e-05 ***
## ZN           3.250e-02  1.877e-02   1.731   0.0843 .  
## INDUS       -4.297e-02  8.473e-02  -0.507   0.6124    
## CHAS         2.904e+00  1.220e+00   2.380   0.0178 *  
## NOX         -2.161e+01  5.414e+00  -3.992 7.98e-05 ***
## RM           6.284e+00  4.827e-01  13.019  < 2e-16 ***
## AGE         -4.430e-02  1.785e-02  -2.482   0.0135 *  
## DIS         -1.577e+00  2.842e-01  -5.551 5.63e-08 ***
## RAD          2.451e-01  9.728e-02   2.519   0.0122 *  
## TAX         -1.112e-02  5.452e-03  -2.040   0.0421 *  
## PTRATIO     -9.835e-01  1.939e-01  -5.072 6.38e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.595 on 350 degrees of freedom
## Multiple R-squared:  0.665,  Adjusted R-squared:  0.6525 
## F-statistic: 53.43 on 13 and 350 DF,  p-value: < 2.2e-16
# Make predictions on the test set and total the squared errors (SSE)
linreg.pred <- predict(linreg, newdata = test)
linreg.sse <- sum((test$MEDV - linreg.pred)^2)
linreg.sse
## [1] 3037.088
# Create a CART model on the training set with the same predictors as the
# linear regression, and draw it
tree <- rpart(
  formula = MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + RM + AGE +
    DIS + RAD + TAX + PTRATIO,
  data = train
)
prp(tree)

# Make predictions on the test set and total the squared errors (SSE)
tree.pred <- predict(tree, newdata = test)
tree.sse <- sum((test$MEDV - tree.pred)^2)
tree.sse
## [1] 4328.988
# Video 7

# Load libraries for cross-validation
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2

library(e1071)

# Number of folds: resample with 10-fold cross-validation
tr.control <- trainControl(method = "cv", number = 10)

# cp values: candidate complexity parameters 0, 0.001, ..., 0.010 in a
# one-column data frame named ".cp"
cp.grid <- expand.grid(.cp = 0.001 * (0:10))

# What did we just do? Step through the pieces of the cp grid expression:
# a single scaled value, the largest scaled value, the integer sequence,
# and the whole sequence scaled at once.
1 * 0.001
## [1] 0.001
10 * 0.001
## [1] 0.01
0:10
##  [1]  0  1  2  3  4  5  6  7  8  9 10
(0:10) * 0.001
##  [1] 0.000 0.001 0.002 0.003 0.004 0.005 0.006 0.007 0.008 0.009 0.010
##  [1] 0.000 0.001 0.002 0.003 0.004 0.005 0.006 0.007 0.008 0.009 0.010
# Cross-validation: tune cp for an rpart model over cp.grid using the
# resampling scheme in tr.control.
# Note: `train(...)` calls the function while `data = train` is the training
# data frame; R skips non-function objects when resolving a call, so the two
# same-named objects do not clash.
tr <- train(
  MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + RM + AGE + DIS + RAD +
    TAX + PTRATIO,
  data = train,
  method = "rpart",
  trControl = tr.control,
  tuneGrid = cp.grid
)
tr
## CART 
## 
## 364 samples
##  15 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 328, 327, 329, 328, 327, 328, ... 
## Resampling results across tuning parameters:
## 
##   cp     RMSE      Rsquared   RMSE SD   Rsquared SD
##   0.000  4.795661  0.7251433  1.839513  0.1726678  
##   0.001  4.808653  0.7241589  1.852793  0.1750635  
##   0.002  4.843092  0.7203932  1.834373  0.1745897  
##   0.003  4.855853  0.7187000  1.848917  0.1792498  
##   0.004  4.992709  0.7032541  1.824035  0.1805318  
##   0.005  4.984651  0.7008777  1.850695  0.1867995  
##   0.006  5.018776  0.6955937  1.872769  0.1916284  
##   0.007  4.994024  0.6980549  1.877489  0.1906712  
##   0.008  5.017056  0.6951007  1.959210  0.1966926  
##   0.009  4.989958  0.6965547  1.994557  0.1986186  
##   0.010  4.989958  0.6965547  1.994557  0.1986186  
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final value used for the model was cp = 0.
# Extract tree: pull the final model out of the cross-validation result
# and draw it
best.tree <- tr[["finalModel"]]
prp(best.tree)

# Make predictions on the test set and total the squared errors (SSE)
best.tree.pred <- predict(best.tree, newdata = test)
best.tree.sse <- sum((test$MEDV - best.tree.pred)^2)
best.tree.sse
## [1] 3660.149