library(stats)

Reading in the data and EDA

setwd("C:/Users/Harris/OneDrive - University of Pittsburgh/Senior Year/Statistical Learning/Stat1361")
train <- read.csv("./train.csv")
test <- read.csv("./test.csv")

dim(train)
## [1] 1400   17
dim(test)
## [1] 600  17
summary(train[,-1])
##      price                    desc        numstories      yearbuilt   
##  Min.   :  35847   CONDOMINIUM  :  69   Min.   :1.000   Min.   :1805  
##  1st Qu.: 139575   MOBILE HOME  :   1   1st Qu.:1.000   1st Qu.:1922  
##  Median : 267905   MULTI-FAMILY :  42   Median :2.000   Median :1940  
##  Mean   : 343143   ROWHOUSE     :  16   Mean   :1.691   Mean   :1944  
##  3rd Qu.: 426548   SINGLE FAMILY:1272   3rd Qu.:2.000   3rd Qu.:1960  
##  Max.   :3990701                        Max.   :3.000   Max.   :2017  
##                                                                       
##   exteriorfinish    rooftype      basement        totalrooms    
##  Brick   :858    METAL  :234   Min.   :0.0000   Min.   : 3.000  
##  Concrete:  4    ROLL   : 56   1st Qu.:0.0000   1st Qu.: 6.000  
##  Frame   :388    SHINGLE:727   Median :1.0000   Median : 7.000  
##  Log     :  3    SLATE  :383   Mean   :0.5607   Mean   : 7.515  
##  Stone   : 52                  3rd Qu.:1.0000   3rd Qu.: 9.000  
##  Stucco  : 95                  Max.   :1.0000   Max.   :23.000  
##                                                                 
##     bedrooms       bathrooms        fireplaces          sqft      
##  Min.   : 1.00   Min.   : 1.000   Min.   :0.0000   Min.   :  475  
##  1st Qu.: 3.00   1st Qu.: 1.500   1st Qu.:0.0000   1st Qu.: 1349  
##  Median : 3.00   Median : 2.000   Median :0.0000   Median : 1831  
##  Mean   : 3.36   Mean   : 2.233   Mean   :0.6101   Mean   : 2201  
##  3rd Qu.: 4.00   3rd Qu.: 2.500   3rd Qu.:1.0000   3rd Qu.: 2573  
##  Max.   :12.00   Max.   :10.000   Max.   :5.0000   Max.   :15872  
##                                   NA's   :687                     
##     lotarea        state       zipcode        AvgIncome    
##  Min.   :      0   PA:713   Min.   :15003   Min.   :11306  
##  1st Qu.:   3794   VA:687   1st Qu.:15216   1st Qu.:29570  
##  Median :   7744            Median :15332   Median :39943  
##  Mean   :  32528            Mean   :19120   Mean   :43911  
##  3rd Qu.:  16166            3rd Qu.:23222   3rd Qu.:55239  
##  Max.   :3820212            Max.   :23235   Max.   :95289  
## 
summary(test[,-1])
##   price                    desc       numstories      yearbuilt   
##  Mode:logical   CONDOMINIUM  : 42   Min.   :1.000   Min.   :1720  
##  NA's:600       MOBILE HOME  :  1   1st Qu.:1.000   1st Qu.:1920  
##                 MULTI-FAMILY : 15   Median :2.000   Median :1940  
##                 ROWHOUSE     :  3   Mean   :1.689   Mean   :1941  
##                 SINGLE FAMILY:539   3rd Qu.:2.000   3rd Qu.:1960  
##                                     Max.   :3.000   Max.   :2017  
##                                                                   
##   exteriorfinish    rooftype      basement       totalrooms        bedrooms    
##  Brick   :379    METAL  :117   Min.   :0.000   Min.   : 2.000   Min.   :1.000  
##  Concrete:  2    ROLL   : 29   1st Qu.:0.000   1st Qu.: 6.000   1st Qu.:3.000  
##  Frame   :166    SHINGLE:295   Median :1.000   Median : 7.000   Median :3.000  
##  Log     :  1    SLATE  :159   Mean   :0.525   Mean   : 7.322   Mean   :3.303  
##  Stone   : 13                  3rd Qu.:1.000   3rd Qu.: 9.000   3rd Qu.:4.000  
##  Stucco  : 39                  Max.   :1.000   Max.   :16.000   Max.   :8.000  
##                                                                                
##    bathrooms      fireplaces         sqft          lotarea        state   
##  Min.   :1.00   Min.   :0.000   Min.   :  437   Min.   :      0   PA:287  
##  1st Qu.:1.00   1st Qu.:0.000   1st Qu.: 1308   1st Qu.:   3461   VA:313  
##  Median :2.00   Median :0.000   Median : 1772   Median :   7204           
##  Mean   :2.19   Mean   :0.561   Mean   : 2158   Mean   :  21791           
##  3rd Qu.:2.50   3rd Qu.:1.000   3rd Qu.: 2684   3rd Qu.:  16202           
##  Max.   :7.50   Max.   :3.000   Max.   :12150   Max.   :1242593           
##                 NA's   :313                                               
##     zipcode        AvgIncome    
##  Min.   :15003   Min.   :11306  
##  1st Qu.:15220   1st Qu.:29570  
##  Median :23220   Median :39943  
##  Mean   :19371   Mean   :42943  
##  3rd Qu.:23222   3rd Qu.:51643  
##  Max.   :23235   Max.   :95289  
## 
with(train, plot(desc, price))

with(train, plot(numstories, price))

with(train, plot(yearbuilt, price))

with(train, plot(exteriorfinish, price))

with(train, plot(rooftype, price))

with(train, plot(basement, price))

with(train, plot(totalrooms, price))

with(train, plot(bedrooms, price))

with(train, plot(bathrooms, price))

with(train, plot(fireplaces, price))

with(train, plot(sqft, price))

with(train, plot(lotarea, price))

with(train, plot(state, price))

with(train, plot(AvgIncome, price))

It looks like there are 687 NAs in the fireplace column. Let’s check for more and keep this in mind for later.

sapply(train, anyNA)
##             id          price           desc     numstories      yearbuilt 
##          FALSE          FALSE          FALSE          FALSE          FALSE 
## exteriorfinish       rooftype       basement     totalrooms       bedrooms 
##          FALSE          FALSE          FALSE          FALSE          FALSE 
##      bathrooms     fireplaces           sqft        lotarea          state 
##          FALSE           TRUE          FALSE          FALSE          FALSE 
##        zipcode      AvgIncome 
##          FALSE          FALSE
sapply(test, anyNA)
##             id          price           desc     numstories      yearbuilt 
##          FALSE           TRUE          FALSE          FALSE          FALSE 
## exteriorfinish       rooftype       basement     totalrooms       bedrooms 
##          FALSE          FALSE          FALSE          FALSE          FALSE 
##      bathrooms     fireplaces           sqft        lotarea          state 
##          FALSE           TRUE          FALSE          FALSE          FALSE 
##        zipcode      AvgIncome 
##          FALSE          FALSE

Fireplace is the only column with NA. Let’s look more into this.

table(train$fireplaces, useNA = "always")
## 
##    0    1    2    3    4    5 <NA> 
##  386  251   56   10    8    2  687

I will impute the NA values in the fireplace column with the column mean.

train[is.na(train$fireplaces), "fireplaces"] <- mean(train$fireplaces, na.rm = T)
test[is.na(test$fireplaces), "fireplaces"] <- mean(test$fireplaces, na.rm = T)

Here, I remove the ID column since it has no correlation with the response. I will also remove the mobile home observation since it is interfering with the validation set approach, as well as infrequent exterior finishes.

train <- train[,-1]
train <- train[train$desc != "MOBILE HOME",]

train <- train[train$exteriorfinish != "Log" & train$exteriorfinish != "Concrete",]

The variable lot area has some zeros which might be a mistake. Let’s look into this further.

train[train$lotarea == 0,]
##          price          desc numstories yearbuilt exteriorfinish rooftype
## 38   179632.36   CONDOMINIUM        1.0      1919          Brick    METAL
## 70   225290.18   CONDOMINIUM        1.0      1920          Brick     ROLL
## 86   318260.72   CONDOMINIUM        2.5      2004          Frame  SHINGLE
## 92   208425.30   CONDOMINIUM        1.5      2000          Frame  SHINGLE
## 117  158990.03   CONDOMINIUM        1.0      1925          Brick    METAL
## 159  277231.31   CONDOMINIUM        2.0      2014          Brick  SHINGLE
## 200  136721.87   CONDOMINIUM        1.0      1996          Frame  SHINGLE
## 250   97975.41   CONDOMINIUM        1.0      1972          Brick     ROLL
## 265   85288.95   CONDOMINIUM        2.0      1980          Frame  SHINGLE
## 357  257956.43   CONDOMINIUM        1.0      2012          Brick  SHINGLE
## 437  158112.98   CONDOMINIUM        1.0      1910          Brick     ROLL
## 442  303264.90   CONDOMINIUM        2.0      2011          Frame  SHINGLE
## 463   74985.24   CONDOMINIUM        1.0      1983          Brick  SHINGLE
## 485  198207.53   CONDOMINIUM        1.0      1953          Brick    SLATE
## 514  367951.54 SINGLE FAMILY        1.0      1916          Brick    METAL
## 530  275134.10   CONDOMINIUM        1.0      1972          Brick     ROLL
## 545  187985.94   CONDOMINIUM        1.0      1900          Brick    METAL
## 565  229016.14   CONDOMINIUM        1.0      2000          Frame  SHINGLE
## 585  332407.35   CONDOMINIUM        1.0      1986          Brick     ROLL
## 626  180383.35   CONDOMINIUM        1.0      1915          Brick    METAL
## 739  218334.52   CONDOMINIUM        2.0      2006          Frame  SHINGLE
## 800  108425.37   CONDOMINIUM        1.0      1972          Brick     ROLL
## 805  397786.12   CONDOMINIUM        1.0      1832          Brick    SLATE
## 934  114877.68   CONDOMINIUM        1.0      1900          Brick     ROLL
## 940  156027.06   CONDOMINIUM        1.0      1900          Brick    METAL
## 974   95841.89   CONDOMINIUM        1.0      1982          Brick  SHINGLE
## 1047  65175.30   CONDOMINIUM        1.0      1975          Frame     ROLL
## 1053 185588.18   CONDOMINIUM        1.0      1924          Brick  SHINGLE
## 1054 165415.35   CONDOMINIUM        1.0      1923          Brick    METAL
## 1068 135657.63   CONDOMINIUM        1.0      1900          Brick  SHINGLE
## 1158 142594.49 SINGLE FAMILY        2.5      1996          Brick  SHINGLE
## 1183 156909.49   CONDOMINIUM        1.0      1920          Brick     ROLL
## 1259 468294.63   CONDOMINIUM        2.0      1832          Brick    SLATE
## 1270 114751.29   CONDOMINIUM        2.0      1989          Frame  SHINGLE
## 1287 173631.99   CONDOMINIUM        1.0      1919          Brick    METAL
## 1292 199781.84   CONDOMINIUM        1.0      1910          Brick    METAL
## 1302  86491.14   CONDOMINIUM        1.0      1972          Brick     ROLL
## 1334 454236.28   CONDOMINIUM        1.0      1986          Brick     ROLL
## 1336 233630.16   CONDOMINIUM        1.0      2000          Frame  SHINGLE
## 1350 474607.18   CONDOMINIUM        1.0      1997          Brick     ROLL
## 1353 494934.00   CONDOMINIUM        2.0      1832          Brick    SLATE
## 1375 488669.56   CONDOMINIUM        1.0      1997          Brick     ROLL
## 1377 176856.90   CONDOMINIUM        1.0      1925          Brick    METAL
##      basement totalrooms bedrooms bathrooms fireplaces sqft lotarea state
## 38          0          3        1       1.0  0.6100982  650       0    VA
## 70          0          3        2       2.0  1.0000000 1609       0    PA
## 86          1          6        3       3.5  0.0000000 2327       0    PA
## 92          0          6        2       2.5  0.0000000 1825       0    PA
## 117         0          4        2       1.0  0.6100982  799       0    VA
## 159         1          6        3       2.5  0.0000000 2318       0    PA
## 200         0          6        3       2.5  0.0000000 1110       0    PA
## 250         0          4        2       1.0  0.0000000 1146       0    PA
## 265         1          5        2       1.5  0.0000000 1140       0    PA
## 357         1          5        3       2.5  0.0000000 1894       0    PA
## 437         0          4        2       1.0  0.6100982  700       0    VA
## 442         1          8        4       2.5  0.0000000 2756       0    PA
## 463         0          5        2       1.5  0.0000000  851       0    PA
## 485         0          5        2       1.0  0.6100982 1127       0    VA
## 514         0          6        3       2.0  0.6100982 1599       0    VA
## 530         0          5        2       2.5  0.0000000 1830       0    PA
## 545         0          4        2       2.0  0.6100982 1051       0    VA
## 565         0          6        2       2.5  1.0000000 1596       0    PA
## 585         0          5        2       2.0  0.0000000 2086       0    PA
## 626         0          4        1       1.0  0.6100982  867       0    VA
## 739         1          6        3       2.5  0.0000000 1724       0    PA
## 800         0          4        2       2.0  0.0000000 1234       0    PA
## 805         0          5        2       2.5  0.6100982 1604       0    VA
## 934         0          4        1       1.0  0.0000000 1320       0    PA
## 940         0          4        2       1.0  0.6100982 1201       0    VA
## 974         0          4        2       2.0  0.0000000 1120       0    PA
## 1047        0          3        1       1.0  0.0000000  751       0    PA
## 1053        0          5        3       1.0  0.6100982  795       0    VA
## 1054        0          3        1       1.0  0.6100982  615       0    VA
## 1068        0          3        1       1.0  0.6100982  552       0    VA
## 1158        0          6        3       1.5  0.6100982 1506       0    VA
## 1183        0          3        1       1.0  0.6100982  594       0    VA
## 1259        0          6        2       2.5  0.6100982 1699       0    VA
## 1270        1          4        2       2.0  0.0000000 1408       0    PA
## 1287        0          3        1       1.0  0.6100982  650       0    VA
## 1292        0          3        1       1.5  0.6100982 1069       0    VA
## 1302        0          3        1       1.0  0.0000000  965       0    PA
## 1334        0          6        2       2.0  0.0000000 2609       0    PA
## 1336        0          5        2       2.5  1.0000000 1596       0    PA
## 1350        0          6        3       2.0  0.0000000 2589       0    PA
## 1353        0          6        3       2.5  0.6100982 2231       0    VA
## 1375        0          6        3       2.5  0.0000000 2943       0    PA
## 1377        0          4        2       2.0  0.6100982  780       0    VA
##      zipcode AvgIncome
## 38     23220     26557
## 70     15215     43750
## 86     15238     67370
## 92     15235     41367
## 117    23223     27936
## 159    15237     55016
## 200    15044     95289
## 250    15215     43750
## 265    15218     35983
## 357    15237     55016
## 437    23219     11306
## 442    15227     38439
## 463    15215     43750
## 485    23221     39943
## 514    23220     26557
## 530    15213     18473
## 545    23219     11306
## 565    15025     60669
## 585    15213     18473
## 626    23220     26557
## 739    15227     38439
## 800    15213     18473
## 805    23220     26557
## 934    15212     26712
## 940    23223     27936
## 974    15218     35983
## 1047   15237     55016
## 1053   23221     39943
## 1054   23221     39943
## 1068   23223     27936
## 1158   23220     26557
## 1183   23220     26557
## 1259   23220     26557
## 1270   15235     41367
## 1287   23220     26557
## 1292   23223     27936
## 1302   15215     43750
## 1334   15213     18473
## 1336   15025     60669
## 1350   15213     18473
## 1353   23220     26557
## 1375   15213     18473
## 1377   23223     27936

They are almost all condominiums so a 0 for lot size makes sense. More importantly though is the fact that the mean is skewed right relative to the median. Let’s check for large outliers. There are many outliers, and after looking at the histogram, the entire distribition is definitely skewed right.

summary(train$lotarea)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    3805    7744   29475   16049 3820212
largelots <- train[train$lotarea > 100000,]
largelots
##          price          desc numstories yearbuilt exteriorfinish rooftype
## 2     133555.0 SINGLE FAMILY        1.0      1951          Brick  SHINGLE
## 32   2505883.2 SINGLE FAMILY        1.5      2001          Brick    SLATE
## 47    897047.1 SINGLE FAMILY        2.5      1950          Brick    SLATE
## 62    222437.1 SINGLE FAMILY        1.0      1950          Frame  SHINGLE
## 75   2165585.5 SINGLE FAMILY        2.0      2005          Stone    SLATE
## 89    518323.5 SINGLE FAMILY        2.0      2003          Brick  SHINGLE
## 103   700871.4 SINGLE FAMILY        2.0      1993          Frame  SHINGLE
## 130   558365.8 SINGLE FAMILY        2.0      2008          Brick  SHINGLE
## 134   417817.1 SINGLE FAMILY        1.0      1955          Brick  SHINGLE
## 137  1121120.7 SINGLE FAMILY        2.0      1993          Brick  SHINGLE
## 176   883709.5 SINGLE FAMILY        2.0      1930          Frame    SLATE
## 216  1190970.6 SINGLE FAMILY        2.0      2004          Brick  SHINGLE
## 246   369140.6 SINGLE FAMILY        2.0      1930          Frame  SHINGLE
## 334   298066.5 SINGLE FAMILY        1.0      1998          Frame  SHINGLE
## 388   497169.6 SINGLE FAMILY        2.0      1986          Brick  SHINGLE
## 394   714298.5 SINGLE FAMILY        2.0      2002          Brick  SHINGLE
## 469   180722.5 SINGLE FAMILY        1.0      1950          Frame  SHINGLE
## 471  1278376.6 SINGLE FAMILY        2.0      1917          Stone  SHINGLE
## 478   498788.5 SINGLE FAMILY        1.0      1989          Stone  SHINGLE
## 523  1098461.3 SINGLE FAMILY        2.0      2015         Stucco  SHINGLE
## 569   368854.4 SINGLE FAMILY        2.0      1890          Frame  SHINGLE
## 595   375283.5 SINGLE FAMILY        2.0      1924          Brick    SLATE
## 600   369047.2 SINGLE FAMILY        1.0      2015          Frame  SHINGLE
## 607   517839.3 SINGLE FAMILY        2.0      1939          Stone    SLATE
## 619   643438.5 SINGLE FAMILY        2.0      1973          Brick  SHINGLE
## 639  3990701.1 SINGLE FAMILY        2.5      1935          Brick    SLATE
## 663  1665583.2 SINGLE FAMILY        2.0      2001          Frame  SHINGLE
## 707   824997.7  MULTI-FAMILY        2.0      1860          Frame  SHINGLE
## 746   531899.0 SINGLE FAMILY        2.0      1975          Brick  SHINGLE
## 752  1269568.2 SINGLE FAMILY        2.0      2012          Brick  SHINGLE
## 778  1522371.5 SINGLE FAMILY        2.0      1937          Brick    SLATE
## 803  2526697.5 SINGLE FAMILY        2.5      1920          Stone    SLATE
## 817   475406.7 SINGLE FAMILY        2.0      1935          Frame    METAL
## 818   133728.9 SINGLE FAMILY        1.0      1962          Brick  SHINGLE
## 882  1385186.9  MULTI-FAMILY        2.0      1978          Stone  SHINGLE
## 944  1048091.1  MULTI-FAMILY        2.0      1925          Frame    SLATE
## 963   307965.7 SINGLE FAMILY        1.0      1978          Brick  SHINGLE
## 1010  344285.3 SINGLE FAMILY        1.0      1958          Frame     ROLL
## 1024 1489254.6 SINGLE FAMILY        1.5      1955          Brick  SHINGLE
## 1038 1152514.2 SINGLE FAMILY        1.5      1925         Stucco    SLATE
## 1084 1113707.4 SINGLE FAMILY        2.0      2001          Brick  SHINGLE
## 1100 1406904.7 SINGLE FAMILY        2.0      2003          Brick    SLATE
## 1161  715654.8 SINGLE FAMILY        2.0      1999          Brick  SHINGLE
## 1175  500165.6 SINGLE FAMILY        2.0      1995          Brick  SHINGLE
## 1212  383206.9 SINGLE FAMILY        1.0      1988          Brick  SHINGLE
## 1217  291807.5 SINGLE FAMILY        1.5      1940          Frame  SHINGLE
## 1218 1692333.8 SINGLE FAMILY        2.0      1936          Brick    SLATE
## 1238 1428734.9 SINGLE FAMILY        2.0      1940          Brick  SHINGLE
## 1247 1300535.4 SINGLE FAMILY        2.0      1940          Stone    SLATE
## 1266  171803.5 SINGLE FAMILY        1.0      1971          Frame  SHINGLE
## 1278  348203.5 SINGLE FAMILY        1.0      1995          Frame  SHINGLE
## 1279  529870.6 SINGLE FAMILY        1.5      1991          Frame  SHINGLE
## 1283  433110.7 SINGLE FAMILY        2.0      2007          Brick  SHINGLE
## 1294 1204385.9 SINGLE FAMILY        2.0      1920          Stone     ROLL
## 1310  325354.8 SINGLE FAMILY        1.0      1960          Frame  SHINGLE
## 1333  171421.7 SINGLE FAMILY        1.0      1955          Brick  SHINGLE
## 1335  139541.9 SINGLE FAMILY        1.0      1952          Brick  SHINGLE
## 1364  452455.1 SINGLE FAMILY        1.0      2008          Brick  SHINGLE
## 1374  624915.5 SINGLE FAMILY        2.0      2009          Brick  SHINGLE
##      basement totalrooms bedrooms bathrooms fireplaces  sqft   lotarea state
## 2           1          5        3       1.5  2.0000000  1285  165964.0    PA
## 32          0         10        5       6.0  0.6100982  6145  291416.4    VA
## 47          1         11        5       4.5  0.6100982  3895  296947.0    VA
## 62          1          6        3       1.0  0.0000000  1296  106722.0    PA
## 75          1         10        4       5.5  3.0000000  9444  131290.0    PA
## 89          1          8        4       3.0  0.0000000  3884  384635.0    PA
## 103         1         10        4       3.5  1.0000000  5585  435600.0    PA
## 130         1          9        4       2.5  1.0000000  3302  431941.0    PA
## 134         1         10        4       2.5  2.0000000  3425  108900.0    PA
## 137         1         14        5       6.5  1.0000000  6170  229300.0    PA
## 176         1         10        5       4.5  1.0000000  3523  125845.0    PA
## 216         1         10        4       5.5  2.0000000  7886  609840.0    PA
## 246         1          8        4       3.5  1.0000000  2734  130754.0    PA
## 334         1          3        2       2.0  0.0000000  1680  277042.0    PA
## 388         1          7        3       2.5  1.0000000  3410  267546.0    PA
## 394         1         10        5       4.5  2.0000000  3607  148148.0    PA
## 469         1          7        2       2.0  0.0000000  1293  130680.0    PA
## 471         1         10        7       7.0  4.0000000  8096  365468.0    PA
## 478         1          8        3       2.5  1.0000000  6341  745007.0    PA
## 523         1         13        5       7.0  0.0000000  8919  141012.0    PA
## 569         1          7        4       2.0  1.0000000  2856 3820212.0    PA
## 595         1          7        3       2.0  2.0000000  4103  214315.0    PA
## 600         1         10        4       3.0  0.0000000  2184  100963.0    PA
## 607         1          9        4       2.5  1.0000000  2250  719175.0    PA
## 619         1          8        4       2.5  2.0000000  4550 2087636.0    PA
## 639         0         19        8       8.5  0.6100982 10060  302364.0    VA
## 663         1         16        5       5.5  2.0000000  9008  167654.0    PA
## 707         1          9        5       3.0  1.0000000  6912  263538.0    PA
## 746         1         11        5       3.5  2.0000000  4228  435600.0    PA
## 752         1         11        5       6.0  0.0000000  6496  140433.0    PA
## 778         1         11        4       6.0  4.0000000  5164  446621.0    PA
## 803         1         13        5       7.5  4.0000000 15872 1393075.0    PA
## 817         0          7        3       2.0  0.6100982  1731  949608.0    VA
## 818         1          6        3       1.0  0.0000000   988  111252.0    PA
## 882         1         16        4       5.0  5.0000000  7484  159560.0    PA
## 944         1         10        4       4.0  2.0000000  7744  263974.0    PA
## 963         1          7        3       2.5  1.0000000  1733  249817.0    PA
## 1010        1          7        3       3.0  2.0000000  2149  109292.0    PA
## 1024        1         16        7       7.5  2.0000000  6403  130680.0    PA
## 1038        1         11        5       4.0  4.0000000  8125  144489.0    PA
## 1084        1         10        4       4.5  0.0000000  6434  104152.0    PA
## 1100        1         10        5       5.0  1.0000000  4824  174676.0    PA
## 1161        1         10        5       5.0  3.0000000  7302  261012.0    PA
## 1175        1          9        5       3.5  1.0000000  3842  135123.0    PA
## 1212        1          8        3       2.5  1.0000000  2697  578825.0    PA
## 1217        1          7        3       2.5  0.0000000  1986  438475.0    PA
## 1218        1         12        6       6.5  2.0000000  7622  166573.0    PA
## 1238        1         12        4       6.5  5.0000000 10852  130680.0    PA
## 1247        1         12        6       5.5  2.0000000  5163  144619.0    PA
## 1266        1          5        2       1.0  0.0000000  1092 2115273.0    PA
## 1278        1          3        1       1.0  0.0000000  1011  496671.0    PA
## 1279        1          9        4       3.5  1.0000000  3816  109771.0    PA
## 1283        1         10        4       2.5  1.0000000  3504  202040.0    PA
## 1294        1         12        4       5.5  3.0000000  5878  224500.0    PA
## 1310        1          7        4       2.5  0.0000000  1514  566280.0    PA
## 1333        1          6        3       2.0  0.0000000  1690  120095.0    PA
## 1335        1          5        2       1.0  0.0000000  1432  175634.0    PA
## 1364        1          7        3       2.5  1.0000000  3974  543585.0    PA
## 1374        1          9        4       2.5  0.0000000  4113  131238.0    PA
##      zipcode AvgIncome
## 2      15003     37868
## 32     23226     50519
## 47     23235     56989
## 62     15026     69387
## 75     15238     67370
## 89     15025     60669
## 103    15025     60669
## 130    15005     64427
## 134    15238     67370
## 137    15044     95289
## 176    15238     67370
## 216    15237     55016
## 246    15005     64427
## 334    15025     60669
## 388    15044     95289
## 394    15005     64427
## 469    15238     67370
## 471    15044     95289
## 478    15025     60669
## 523    15215     43750
## 569    15332     62875
## 595    15235     41367
## 600    15332     62875
## 607    15237     55016
## 619    15037     55863
## 639    23226     50519
## 663    15215     43750
## 707    15044     95289
## 746    15025     60669
## 752    15044     95289
## 778    15238     67370
## 803    15238     67370
## 817    23234     40522
## 818    15026     69387
## 882    15238     67370
## 944    15238     67370
## 963    15025     60669
## 1010   15237     55016
## 1024   15238     67370
## 1038   15215     43750
## 1084   15044     95289
## 1100   15005     64427
## 1161   15005     64427
## 1175   15044     95289
## 1212   15037     55863
## 1217   15037     55863
## 1218   15238     67370
## 1238   15238     67370
## 1247   15238     67370
## 1266   15332     62875
## 1278   15235     41367
## 1279   15237     55016
## 1283   15332     62875
## 1294   15215     43750
## 1310   15237     55016
## 1333   15332     62875
## 1335   15037     55863
## 1364   15037     55863
## 1374   15237     55016
nrow(largelots)
## [1] 59
hist(train$lotarea, breaks = 100)

Now let’s make note of any other outliers. First, the square foot column - there are 3 outliers.

hist(train$sqft)

train[train$sqft > 10000 , ]
##        price          desc numstories yearbuilt exteriorfinish rooftype
## 639  3990701 SINGLE FAMILY        2.5      1935          Brick    SLATE
## 803  2526698 SINGLE FAMILY        2.5      1920          Stone    SLATE
## 1238 1428735 SINGLE FAMILY        2.0      1940          Brick  SHINGLE
##      basement totalrooms bedrooms bathrooms fireplaces  sqft lotarea state
## 639         0         19        8       8.5  0.6100982 10060  302364    VA
## 803         1         13        5       7.5  4.0000000 15872 1393075    PA
## 1238        1         12        4       6.5  5.0000000 10852  130680    PA
##      zipcode AvgIncome
## 639    23226     50519
## 803    15238     67370
## 1238   15238     67370

Let’s look at price on its own.

hist(train$price, breaks = 100)

Zip code has too many factors, so I will delete the column. Average Income can be a good proxy for zip code.

train <- subset(train, select = -zipcode)
test <- subset(test, select = -zipcode)

Modeling:

Initially split into a training and test set for all models (train1 and test1). Use cross validation when applicable on train1.

Linear model manually changing predictors

library(stats)
library(boot)

#validation set split
set.seed(2)
trainindex <- sample(1:nrow(train), nrow(train)*.7)
train1 <- train[trainindex,]
test1 <- train[-trainindex,]

#linear model with all predictors
lmfull <- glm(price ~ ., data = train)
summary(lmfull)
## 
## Call:
## glm(formula = price ~ ., data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -594921   -59401    -3126    54346  1864366  
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           2.298e+05  2.870e+05   0.801 0.423442    
## descMULTI-FAMILY     -1.295e+05  2.967e+04  -4.365 1.37e-05 ***
## descROWHOUSE          2.670e+04  3.832e+04   0.697 0.485979    
## descSINGLE FAMILY    -2.347e+04  1.846e+04  -1.272 0.203733    
## numstories           -1.942e+04  8.997e+03  -2.159 0.031019 *  
## yearbuilt            -2.116e+02  1.498e+02  -1.413 0.157964    
## exteriorfinishFrame  -1.579e+04  8.831e+03  -1.788 0.074035 .  
## exteriorfinishStone  -3.901e+04  2.079e+04  -1.876 0.060891 .  
## exteriorfinishStucco -7.715e+04  1.479e+04  -5.218 2.09e-07 ***
## rooftypeROLL          6.743e+03  2.166e+04   0.311 0.755576    
## rooftypeSHINGLE      -4.887e+04  1.453e+04  -3.364 0.000789 ***
## rooftypeSLATE         4.021e+04  1.181e+04   3.404 0.000684 ***
## basement              3.629e+04  1.320e+04   2.750 0.006046 ** 
## totalrooms           -2.797e+03  3.172e+03  -0.882 0.377960    
## bedrooms             -1.477e+04  6.106e+03  -2.419 0.015680 *  
## bathrooms             8.017e+04  5.619e+03  14.268  < 2e-16 ***
## fireplaces           -3.872e+04  7.234e+03  -5.352 1.02e-07 ***
## sqft                  1.560e+02  5.075e+00  30.731  < 2e-16 ***
## lotarea               5.920e-02  2.480e-02   2.387 0.017133 *  
## stateVA               1.917e+05  1.577e+04  12.157  < 2e-16 ***
## AvgIncome             1.442e+00  3.051e-01   4.727 2.51e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 17267122762)
## 
##     Null deviance: 1.4339e+14  on 1391  degrees of freedom
## Residual deviance: 2.3673e+13  on 1371  degrees of freedom
## AIC: 36785
## 
## Number of Fisher Scoring iterations: 2
#getting cross validated MSE for full LM model
lmfullcv <- cv.glm(train, lmfull, K = 10)
mselmfull <- lmfullcv$delta[1]
mselmfull
## [1] 18198966972
#including only significant terms
lmreduced <- glm(price ~ desc + exteriorfinish + numstories + rooftype + basement + bedrooms + bathrooms + fireplaces + sqft + lotarea + state + AvgIncome, data = train)

#getting CV error
lmreducedcv <- cv.glm(train, lmreduced, K = 10)
msereducedlm <- lmreducedcv$delta[1]
msereducedlm
## [1] 17916548309

Ridge

set.seed(1)
library(glmnet)
## Warning: package 'glmnet' was built under R version 3.6.3
## Loading required package: Matrix
## Loaded glmnet 4.0-2
x = model.matrix(price~., train)[, -1]
y = train$price

grid <- 10^seq(10,-2, length = 100)
ridge <- glmnet(x, y, alpha = 0, lambda = grid)

#use cross validation to choose best lambda value
cvridge <- cv.glmnet(x, y, alpha = 0, lambda = grid)
best <- cvridge$lambda.min
best
## [1] 231.013
mseridge <- cvridge$cvm[which(cvridge$lambda == best)]
mseridge
## [1] 18247763194
#obtaining the coefficients of the model chosen by CV
model <- predict(ridge, s = best, type = "coefficients")
model
## 24 x 1 sparse Matrix of class "dgCMatrix"
##                                    1
## (Intercept)             2.271854e+05
## descMOBILE HOME         .           
## descMULTI-FAMILY       -1.295726e+05
## descROWHOUSE            2.697548e+04
## descSINGLE FAMILY      -2.321684e+04
## numstories             -1.921437e+04
## yearbuilt              -2.101641e+02
## exteriorfinishConcrete  .           
## exteriorfinishFrame    -1.581381e+04
## exteriorfinishLog       .           
## exteriorfinishStone    -3.898815e+04
## exteriorfinishStucco   -7.718419e+04
## rooftypeROLL            6.561915e+03
## rooftypeSHINGLE        -4.897133e+04
## rooftypeSLATE           4.027268e+04
## basement                3.536442e+04
## totalrooms             -2.670024e+03
## bedrooms               -1.475754e+04
## bathrooms               8.040069e+04
## fireplaces             -3.839883e+04
## sqft                    1.554213e+02
## lotarea                 5.955892e-02
## stateVA                 1.904573e+05
## AvgIncome               1.438172e+00

Lasso

lasso <- glmnet(x, y, alpha = 1, lambda = grid)

#finding the best lambda value by CV
set.seed(2)
cvlasso <- cv.glmnet(x, y, alpha = 1)
best <- cvlasso$lambda.min

#obtaining the CV error of the lasso model chosen by CV
mselasso <- cvlasso$cvm[which(cvridge$lambda == cvridge$lambda.min)]
mselasso
## [1] 18836506616
predict(lasso, s = best, type = "coefficients")
## 24 x 1 sparse Matrix of class "dgCMatrix"
##                                    1
## (Intercept)             1.099764e+05
## descMOBILE HOME         .           
## descMULTI-FAMILY       -1.219651e+05
## descROWHOUSE            2.732720e+04
## descSINGLE FAMILY      -1.661825e+04
## numstories             -1.698020e+04
## yearbuilt              -1.487798e+02
## exteriorfinishConcrete  .           
## exteriorfinishFrame    -1.446350e+04
## exteriorfinishLog       .           
## exteriorfinishStone    -3.440568e+04
## exteriorfinishStucco   -7.510747e+04
## rooftypeROLL            6.073763e+02
## rooftypeSHINGLE        -5.094604e+04
## rooftypeSLATE           3.809361e+04
## basement                2.749767e+04
## totalrooms             -1.550969e+03
## bedrooms               -1.511292e+04
## bathrooms               7.982234e+04
## fireplaces             -3.603688e+04
## sqft                    1.536619e+02
## lotarea                 5.653835e-02
## stateVA                 1.805653e+05
## AvgIncome               1.322257e+00
## ols after lasso
afterlasso <- glm(price ~ desc + numstories + exteriorfinish + rooftype + bathrooms + bedrooms + fireplaces + sqft + lotarea + state + AvgIncome, data = train)
summary(afterlasso)
## 
## Call:
## glm(formula = price ~ desc + numstories + exteriorfinish + rooftype + 
##     bathrooms + bedrooms + fireplaces + sqft + lotarea + state + 
##     AvgIncome, data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -609411   -60092    -1971    54607  1857374  
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.591e+05  2.819e+04  -5.643 2.03e-08 ***
## descMULTI-FAMILY     -1.164e+05  2.816e+04  -4.133 3.80e-05 ***
## descROWHOUSE          4.688e+04  3.751e+04   1.250 0.211512    
## descSINGLE FAMILY    -1.045e+04  1.780e+04  -0.587 0.557310    
## numstories           -1.559e+04  8.882e+03  -1.755 0.079471 .  
## exteriorfinishFrame  -1.724e+04  8.818e+03  -1.955 0.050799 .  
## exteriorfinishStone  -3.962e+04  2.082e+04  -1.903 0.057272 .  
## exteriorfinishStucco -8.171e+04  1.472e+04  -5.549 3.45e-08 ***
## rooftypeROLL         -4.803e+03  2.118e+04  -0.227 0.820644    
## rooftypeSHINGLE      -5.558e+04  1.352e+04  -4.112 4.16e-05 ***
## rooftypeSLATE         3.786e+04  1.161e+04   3.260 0.001140 ** 
## bathrooms             8.097e+04  5.408e+03  14.974  < 2e-16 ***
## bedrooms             -1.864e+04  5.043e+03  -3.695 0.000228 ***
## fireplaces           -3.764e+04  7.210e+03  -5.220 2.07e-07 ***
## sqft                  1.532e+02  4.959e+00  30.896  < 2e-16 ***
## lotarea               6.277e-02  2.482e-02   2.529 0.011553 *  
## stateVA               1.603e+05  1.130e+04  14.176  < 2e-16 ***
## AvgIncome             1.333e+00  2.864e-01   4.653 3.59e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 17358273582)
## 
##     Null deviance: 1.4339e+14  on 1391  degrees of freedom
## Residual deviance: 2.3850e+13  on 1374  degrees of freedom
## AIC: 36790
## 
## Number of Fisher Scoring iterations: 2
afterlassocv <- cv.glm(train, afterlasso, K = 10)
mseafterlasso <- afterlassocv$delta[1]
mseafterlasso
## [1] 17869313365
plot(afterlasso)

Tree

library(tree)
## Warning: package 'tree' was built under R version 3.6.3
tree <- tree(price ~., data = train1)
summary(tree)
## 
## Regression tree:
## tree(formula = price ~ ., data = train1)
## Variables actually used in tree construction:
## [1] "sqft"      "rooftype"  "state"     "bathrooms"
## Number of terminal nodes:  11 
## Residual mean deviance:  2.151e+10 = 2.071e+13 / 963 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1352000   -78700    -9910        0    76700  1742000
plot(tree)
text(tree, pretty = 0)

pred <- predict(tree, newdata = test1)
msetree <- mean( (pred - test1$price)^2)
msetree
## [1] 20770133160
#choose tuning parameter to prune the tree using 10-fold CV
cvtree <- cv.tree(tree)
plot(cvtree$size, cvtree$dev, type = 'b')

#choose size 8 as the pruned tree
prune = prune.tree(tree, best = 11)
plot(prune)
text(prune, pretty = 0)

pred <-  predict(prune, newdata = test1)
mseprune <- mean( (pred - test1$price)^2)
mseprune
## [1] 20770133160

Bagging, Random Forest

set.seed(2)
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.6.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
#obtain bagging model
bag <- randomForest(price ~., data = train1, mtry = 14, importance = TRUE)
bag
## 
## Call:
##  randomForest(formula = price ~ ., data = train1, mtry = 14, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 14
## 
##           Mean of squared residuals: 16720067293
##                     % Var explained: 84.96
pred <-  predict(bag, newdata = test1)
msebag <- mean( (pred - test1$price)^2)

#random forest tuning over mtry using validation set approach
MSEs <- rep(NA, 12)
for(i in 3:14){
        rf <- randomForest(price ~., data = train1, mtry = i)
        pred <-  predict(rf, newdata = test1)
        MSEs[i-2] <- mean( (pred - test1$price)^2)
}

plot(3:14, MSEs, type = "b")

#obtain the validation set MSE for the best rf model
bestrf <- randomForest(price~., data = train1, mtry = (which.min(MSEs) + 2))
pred <-  predict(bestrf, newdata = test1)
mserf <- mean( (pred - test1$price)^2)

varImpPlot(bestrf)

importance(bestrf)
##                IncNodePurity
## desc            3.145216e+11
## numstories      2.342883e+12
## yearbuilt       2.891023e+12
## exteriorfinish  1.482689e+12
## rooftype        5.762043e+12
## basement        9.314035e+11
## totalrooms      1.084386e+13
## bedrooms        7.662470e+12
## bathrooms       2.245261e+13
## fireplaces      2.855454e+12
## sqft            2.873395e+13
## lotarea         1.070175e+13
## state           2.631609e+12
## AvgIncome       3.901091e+12

Boosting

library(gbm)
## Warning: package 'gbm' was built under R version 3.6.3
## Loaded gbm 2.1.8
# choosing the best lambda tuning parameter and best ntree
set.seed(1)
grid <- c(10^seq(-5, -1), .2, .3)
ntree <- c(50, 100, 250, 500, 1000, 1500, 2000, 3000)
MSEs <- matrix(nrow = length(ntree), ncol = length(grid))
for(i in 1:length(grid)){
        for(j in 1:length(ntree)){
                boost <- gbm(price ~., data = train1, n.trees = ntree[j], distribution = "gaussian", interaction.depth = 4, shrinkage = grid[i])
                pred <-  predict(boost, newdata = test1)
                MSEs[j,i] <- mean( (pred - test1$price)^2)
        }
}
## Using 50 trees...
## Using 100 trees...
## Using 250 trees...
## Using 500 trees...
## Using 1000 trees...
## Using 1500 trees...
## Using 2000 trees...
## Using 3000 trees...
## Using 50 trees...
## Using 100 trees...
## Using 250 trees...
## Using 500 trees...
## Using 1000 trees...
## Using 1500 trees...
## Using 2000 trees...
## Using 3000 trees...
## Using 50 trees...
## Using 100 trees...
## Using 250 trees...
## Using 500 trees...
## Using 1000 trees...
## Using 1500 trees...
## Using 2000 trees...
## Using 3000 trees...
## Using 50 trees...
## Using 100 trees...
## Using 250 trees...
## Using 500 trees...
## Using 1000 trees...
## Using 1500 trees...
## Using 2000 trees...
## Using 3000 trees...
## Using 50 trees...
## Using 100 trees...
## Using 250 trees...
## Using 500 trees...
## Using 1000 trees...
## Using 1500 trees...
## Using 2000 trees...
## Using 3000 trees...
## Using 50 trees...
## Using 100 trees...
## Using 250 trees...
## Using 500 trees...
## Using 1000 trees...
## Using 1500 trees...
## Using 2000 trees...
## Using 3000 trees...
## Using 50 trees...
## Using 100 trees...
## Using 250 trees...
## Using 500 trees...
## Using 1000 trees...
## Using 1500 trees...
## Using 2000 trees...
## Using 3000 trees...
MSEs
##             [,1]        [,2]        [,3]        [,4]       [,5]       [,6]
## [1,] 84035239297 83563785864 79029865198 47565387833 8700060858 8437308910
## [2,] 83983504557 83054313254 74458864423 30225881550 6652570210 7794588428
## [3,] 83823908707 81518183345 62443440136 13043105793 6060579082 6728686801
## [4,] 83560622265 79037184539 47552386952  8492551900 5874305747 6805514721
## [5,] 83048042051 74393040534 30031983471  7042301712 6070065755 6447094684
## [6,] 82530178920 70065724010 20816403904  6495228601 5941262183 6327600116
## [7,] 82023662602 66057339568 15852442026  6352379927 6156373526 6512712554
## [8,] 81013833761 59051139022 11324811011  6078884039 5897293905 6960472764
##            [,7]
## [1,] 7748989375
## [2,] 6926894319
## [3,] 6491387218
## [4,] 7211123389
## [5,] 6998353015
## [6,] 8548368498
## [7,] 7365770518
## [8,] 6673433699
which <- which.min(MSEs)

#obtaining the validation set error of the best boosting model
boost <- gbm(price ~., data = train1, n.trees = ntree[(which %% 8)], distribution = "gaussian", interaction.depth = 4, shrinkage = grid[round(which/8, 0) + 1])
pred <-  predict(boost, newdata = test1)
## Using 500 trees...
mseboost <- mean( (pred - test1$price)^2 )
mseboost
## [1] 6170604767
#MSE for observations below the 3rd quartile
regtest <- test1[test1$price < 1000000,]
pred <-  predict(boost, newdata = regtest)
## Using 500 trees...
mseregular <- mean( (pred - regtest$price)^2 )
mseregular
## [1] 5164666245
#MSE for observations above the 3rd quartile
hightest <- test1[test1$price > 1000000,]
pred <-  predict(boost, newdata = hightest)
## Using 500 trees...
msehigh <- mean( (pred - hightest$price)^2 )
msehigh
## [1] 35199116424
summary(boost)

##                           var    rel.inf
## sqft                     sqft 47.8835374
## bathrooms           bathrooms 23.9323397
## lotarea               lotarea  8.4076568
## rooftype             rooftype  6.0365388
## state                   state  4.4538855
## totalrooms         totalrooms  2.7532791
## AvgIncome           AvgIncome  2.1346561
## yearbuilt           yearbuilt  1.4418337
## bedrooms             bedrooms  0.8749210
## exteriorfinish exteriorfinish  0.6926275
## numstories         numstories  0.6851495
## fireplaces         fireplaces  0.3324516
## desc                     desc  0.2625792
## basement             basement  0.1085441
data.frame(mselmfull, msereducedlm, mselasso, mseafterlasso, mseridge, msetree, mseprune, mserf, msebag, mseboost)
##     mselmfull msereducedlm    mselasso mseafterlasso    mseridge     msetree
## 1 18198966972  17916548309 18836506616   17869313365 18247763194 20770133160
##      mseprune      mserf     msebag   mseboost
## 1 20770133160 6556605797 8684574624 6170604767

Predictions

test$price <- predict(boost, newdata = test)
## Using 500 trees...
test$student_id <- rep(4181080, nrow(test))
testing_predictions <- subset(test, select = c(id, price, student_id))
write.csv(testing_predictions, "./testing_predictions_4181080.csv")