library(stats)
Reading in the data and EDA
setwd("C:/Users/Harris/OneDrive - University of Pittsburgh/Senior Year/Statistical Learning/Stat1361")
train <- read.csv("./train.csv")
test <- read.csv("./test.csv")
dim(train)
## [1] 1400 17
dim(test)
## [1] 600 17
summary(train[,-1])
## price desc numstories yearbuilt
## Min. : 35847 CONDOMINIUM : 69 Min. :1.000 Min. :1805
## 1st Qu.: 139575 MOBILE HOME : 1 1st Qu.:1.000 1st Qu.:1922
## Median : 267905 MULTI-FAMILY : 42 Median :2.000 Median :1940
## Mean : 343143 ROWHOUSE : 16 Mean :1.691 Mean :1944
## 3rd Qu.: 426548 SINGLE FAMILY:1272 3rd Qu.:2.000 3rd Qu.:1960
## Max. :3990701 Max. :3.000 Max. :2017
##
## exteriorfinish rooftype basement totalrooms
## Brick :858 METAL :234 Min. :0.0000 Min. : 3.000
## Concrete: 4 ROLL : 56 1st Qu.:0.0000 1st Qu.: 6.000
## Frame :388 SHINGLE:727 Median :1.0000 Median : 7.000
## Log : 3 SLATE :383 Mean :0.5607 Mean : 7.515
## Stone : 52 3rd Qu.:1.0000 3rd Qu.: 9.000
## Stucco : 95 Max. :1.0000 Max. :23.000
##
## bedrooms bathrooms fireplaces sqft
## Min. : 1.00 Min. : 1.000 Min. :0.0000 Min. : 475
## 1st Qu.: 3.00 1st Qu.: 1.500 1st Qu.:0.0000 1st Qu.: 1349
## Median : 3.00 Median : 2.000 Median :0.0000 Median : 1831
## Mean : 3.36 Mean : 2.233 Mean :0.6101 Mean : 2201
## 3rd Qu.: 4.00 3rd Qu.: 2.500 3rd Qu.:1.0000 3rd Qu.: 2573
## Max. :12.00 Max. :10.000 Max. :5.0000 Max. :15872
## NA's :687
## lotarea state zipcode AvgIncome
## Min. : 0 PA:713 Min. :15003 Min. :11306
## 1st Qu.: 3794 VA:687 1st Qu.:15216 1st Qu.:29570
## Median : 7744 Median :15332 Median :39943
## Mean : 32528 Mean :19120 Mean :43911
## 3rd Qu.: 16166 3rd Qu.:23222 3rd Qu.:55239
## Max. :3820212 Max. :23235 Max. :95289
##
summary(test[,-1])
## price desc numstories yearbuilt
## Mode:logical CONDOMINIUM : 42 Min. :1.000 Min. :1720
## NA's:600 MOBILE HOME : 1 1st Qu.:1.000 1st Qu.:1920
## MULTI-FAMILY : 15 Median :2.000 Median :1940
## ROWHOUSE : 3 Mean :1.689 Mean :1941
## SINGLE FAMILY:539 3rd Qu.:2.000 3rd Qu.:1960
## Max. :3.000 Max. :2017
##
## exteriorfinish rooftype basement totalrooms bedrooms
## Brick :379 METAL :117 Min. :0.000 Min. : 2.000 Min. :1.000
## Concrete: 2 ROLL : 29 1st Qu.:0.000 1st Qu.: 6.000 1st Qu.:3.000
## Frame :166 SHINGLE:295 Median :1.000 Median : 7.000 Median :3.000
## Log : 1 SLATE :159 Mean :0.525 Mean : 7.322 Mean :3.303
## Stone : 13 3rd Qu.:1.000 3rd Qu.: 9.000 3rd Qu.:4.000
## Stucco : 39 Max. :1.000 Max. :16.000 Max. :8.000
##
## bathrooms fireplaces sqft lotarea state
## Min. :1.00 Min. :0.000 Min. : 437 Min. : 0 PA:287
## 1st Qu.:1.00 1st Qu.:0.000 1st Qu.: 1308 1st Qu.: 3461 VA:313
## Median :2.00 Median :0.000 Median : 1772 Median : 7204
## Mean :2.19 Mean :0.561 Mean : 2158 Mean : 21791
## 3rd Qu.:2.50 3rd Qu.:1.000 3rd Qu.: 2684 3rd Qu.: 16202
## Max. :7.50 Max. :3.000 Max. :12150 Max. :1242593
## NA's :313
## zipcode AvgIncome
## Min. :15003 Min. :11306
## 1st Qu.:15220 1st Qu.:29570
## Median :23220 Median :39943
## Mean :19371 Mean :42943
## 3rd Qu.:23222 3rd Qu.:51643
## Max. :23235 Max. :95289
##
with(train, plot(desc, price))
with(train, plot(numstories, price))
with(train, plot(yearbuilt, price))
with(train, plot(exteriorfinish, price))
with(train, plot(rooftype, price))
with(train, plot(basement, price))
with(train, plot(totalrooms, price))
with(train, plot(bedrooms, price))
with(train, plot(bathrooms, price))
with(train, plot(fireplaces, price))
with(train, plot(sqft, price))
with(train, plot(lotarea, price))
with(train, plot(state, price))
with(train, plot(AvgIncome, price))
It looks like there are 687 NAs in the fireplace column. Let’s check for more and keep this in mind for later.
sapply(train, anyNA)
## id price desc numstories yearbuilt
## FALSE FALSE FALSE FALSE FALSE
## exteriorfinish rooftype basement totalrooms bedrooms
## FALSE FALSE FALSE FALSE FALSE
## bathrooms fireplaces sqft lotarea state
## FALSE TRUE FALSE FALSE FALSE
## zipcode AvgIncome
## FALSE FALSE
sapply(test, anyNA)
## id price desc numstories yearbuilt
## FALSE TRUE FALSE FALSE FALSE
## exteriorfinish rooftype basement totalrooms bedrooms
## FALSE FALSE FALSE FALSE FALSE
## bathrooms fireplaces sqft lotarea state
## FALSE TRUE FALSE FALSE FALSE
## zipcode AvgIncome
## FALSE FALSE
Fireplace is the only column with NA. Let’s look more into this.
table(train$fireplaces, useNA = "always")
##
## 0 1 2 3 4 5 <NA>
## 386 251 56 10 8 2 687
I will impute the NA values in the fireplace column with the column mean.
train[is.na(train$fireplaces), "fireplaces"] <- mean(train$fireplaces, na.rm = T)
test[is.na(test$fireplaces), "fireplaces"] <- mean(test$fireplaces, na.rm = T)
Here, I remove the ID column since it has no correlation with the response. I will also remove the mobile home observation since it is interfering with the validation set approach, as well as infrequent exterior finishes.
train <- train[,-1]
train <- train[train$desc != "MOBILE HOME",]
train <- train[train$exteriorfinish != "Log" & train$exteriorfinish != "Concrete",]
The variable lot area has some zeros which might be a mistake. Let’s look into this further.
train[train$lotarea == 0,]
## price desc numstories yearbuilt exteriorfinish rooftype
## 38 179632.36 CONDOMINIUM 1.0 1919 Brick METAL
## 70 225290.18 CONDOMINIUM 1.0 1920 Brick ROLL
## 86 318260.72 CONDOMINIUM 2.5 2004 Frame SHINGLE
## 92 208425.30 CONDOMINIUM 1.5 2000 Frame SHINGLE
## 117 158990.03 CONDOMINIUM 1.0 1925 Brick METAL
## 159 277231.31 CONDOMINIUM 2.0 2014 Brick SHINGLE
## 200 136721.87 CONDOMINIUM 1.0 1996 Frame SHINGLE
## 250 97975.41 CONDOMINIUM 1.0 1972 Brick ROLL
## 265 85288.95 CONDOMINIUM 2.0 1980 Frame SHINGLE
## 357 257956.43 CONDOMINIUM 1.0 2012 Brick SHINGLE
## 437 158112.98 CONDOMINIUM 1.0 1910 Brick ROLL
## 442 303264.90 CONDOMINIUM 2.0 2011 Frame SHINGLE
## 463 74985.24 CONDOMINIUM 1.0 1983 Brick SHINGLE
## 485 198207.53 CONDOMINIUM 1.0 1953 Brick SLATE
## 514 367951.54 SINGLE FAMILY 1.0 1916 Brick METAL
## 530 275134.10 CONDOMINIUM 1.0 1972 Brick ROLL
## 545 187985.94 CONDOMINIUM 1.0 1900 Brick METAL
## 565 229016.14 CONDOMINIUM 1.0 2000 Frame SHINGLE
## 585 332407.35 CONDOMINIUM 1.0 1986 Brick ROLL
## 626 180383.35 CONDOMINIUM 1.0 1915 Brick METAL
## 739 218334.52 CONDOMINIUM 2.0 2006 Frame SHINGLE
## 800 108425.37 CONDOMINIUM 1.0 1972 Brick ROLL
## 805 397786.12 CONDOMINIUM 1.0 1832 Brick SLATE
## 934 114877.68 CONDOMINIUM 1.0 1900 Brick ROLL
## 940 156027.06 CONDOMINIUM 1.0 1900 Brick METAL
## 974 95841.89 CONDOMINIUM 1.0 1982 Brick SHINGLE
## 1047 65175.30 CONDOMINIUM 1.0 1975 Frame ROLL
## 1053 185588.18 CONDOMINIUM 1.0 1924 Brick SHINGLE
## 1054 165415.35 CONDOMINIUM 1.0 1923 Brick METAL
## 1068 135657.63 CONDOMINIUM 1.0 1900 Brick SHINGLE
## 1158 142594.49 SINGLE FAMILY 2.5 1996 Brick SHINGLE
## 1183 156909.49 CONDOMINIUM 1.0 1920 Brick ROLL
## 1259 468294.63 CONDOMINIUM 2.0 1832 Brick SLATE
## 1270 114751.29 CONDOMINIUM 2.0 1989 Frame SHINGLE
## 1287 173631.99 CONDOMINIUM 1.0 1919 Brick METAL
## 1292 199781.84 CONDOMINIUM 1.0 1910 Brick METAL
## 1302 86491.14 CONDOMINIUM 1.0 1972 Brick ROLL
## 1334 454236.28 CONDOMINIUM 1.0 1986 Brick ROLL
## 1336 233630.16 CONDOMINIUM 1.0 2000 Frame SHINGLE
## 1350 474607.18 CONDOMINIUM 1.0 1997 Brick ROLL
## 1353 494934.00 CONDOMINIUM 2.0 1832 Brick SLATE
## 1375 488669.56 CONDOMINIUM 1.0 1997 Brick ROLL
## 1377 176856.90 CONDOMINIUM 1.0 1925 Brick METAL
## basement totalrooms bedrooms bathrooms fireplaces sqft lotarea state
## 38 0 3 1 1.0 0.6100982 650 0 VA
## 70 0 3 2 2.0 1.0000000 1609 0 PA
## 86 1 6 3 3.5 0.0000000 2327 0 PA
## 92 0 6 2 2.5 0.0000000 1825 0 PA
## 117 0 4 2 1.0 0.6100982 799 0 VA
## 159 1 6 3 2.5 0.0000000 2318 0 PA
## 200 0 6 3 2.5 0.0000000 1110 0 PA
## 250 0 4 2 1.0 0.0000000 1146 0 PA
## 265 1 5 2 1.5 0.0000000 1140 0 PA
## 357 1 5 3 2.5 0.0000000 1894 0 PA
## 437 0 4 2 1.0 0.6100982 700 0 VA
## 442 1 8 4 2.5 0.0000000 2756 0 PA
## 463 0 5 2 1.5 0.0000000 851 0 PA
## 485 0 5 2 1.0 0.6100982 1127 0 VA
## 514 0 6 3 2.0 0.6100982 1599 0 VA
## 530 0 5 2 2.5 0.0000000 1830 0 PA
## 545 0 4 2 2.0 0.6100982 1051 0 VA
## 565 0 6 2 2.5 1.0000000 1596 0 PA
## 585 0 5 2 2.0 0.0000000 2086 0 PA
## 626 0 4 1 1.0 0.6100982 867 0 VA
## 739 1 6 3 2.5 0.0000000 1724 0 PA
## 800 0 4 2 2.0 0.0000000 1234 0 PA
## 805 0 5 2 2.5 0.6100982 1604 0 VA
## 934 0 4 1 1.0 0.0000000 1320 0 PA
## 940 0 4 2 1.0 0.6100982 1201 0 VA
## 974 0 4 2 2.0 0.0000000 1120 0 PA
## 1047 0 3 1 1.0 0.0000000 751 0 PA
## 1053 0 5 3 1.0 0.6100982 795 0 VA
## 1054 0 3 1 1.0 0.6100982 615 0 VA
## 1068 0 3 1 1.0 0.6100982 552 0 VA
## 1158 0 6 3 1.5 0.6100982 1506 0 VA
## 1183 0 3 1 1.0 0.6100982 594 0 VA
## 1259 0 6 2 2.5 0.6100982 1699 0 VA
## 1270 1 4 2 2.0 0.0000000 1408 0 PA
## 1287 0 3 1 1.0 0.6100982 650 0 VA
## 1292 0 3 1 1.5 0.6100982 1069 0 VA
## 1302 0 3 1 1.0 0.0000000 965 0 PA
## 1334 0 6 2 2.0 0.0000000 2609 0 PA
## 1336 0 5 2 2.5 1.0000000 1596 0 PA
## 1350 0 6 3 2.0 0.0000000 2589 0 PA
## 1353 0 6 3 2.5 0.6100982 2231 0 VA
## 1375 0 6 3 2.5 0.0000000 2943 0 PA
## 1377 0 4 2 2.0 0.6100982 780 0 VA
## zipcode AvgIncome
## 38 23220 26557
## 70 15215 43750
## 86 15238 67370
## 92 15235 41367
## 117 23223 27936
## 159 15237 55016
## 200 15044 95289
## 250 15215 43750
## 265 15218 35983
## 357 15237 55016
## 437 23219 11306
## 442 15227 38439
## 463 15215 43750
## 485 23221 39943
## 514 23220 26557
## 530 15213 18473
## 545 23219 11306
## 565 15025 60669
## 585 15213 18473
## 626 23220 26557
## 739 15227 38439
## 800 15213 18473
## 805 23220 26557
## 934 15212 26712
## 940 23223 27936
## 974 15218 35983
## 1047 15237 55016
## 1053 23221 39943
## 1054 23221 39943
## 1068 23223 27936
## 1158 23220 26557
## 1183 23220 26557
## 1259 23220 26557
## 1270 15235 41367
## 1287 23220 26557
## 1292 23223 27936
## 1302 15215 43750
## 1334 15213 18473
## 1336 15025 60669
## 1350 15213 18473
## 1353 23220 26557
## 1375 15213 18473
## 1377 23223 27936
They are almost all condominiums so a 0 for lot size makes sense. More importantly though is the fact that the mean is skewed right relative to the median. Let’s check for large outliers. There are many outliers, and after looking at the histogram, the entire distribition is definitely skewed right.
summary(train$lotarea)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 3805 7744 29475 16049 3820212
largelots <- train[train$lotarea > 100000,]
largelots
## price desc numstories yearbuilt exteriorfinish rooftype
## 2 133555.0 SINGLE FAMILY 1.0 1951 Brick SHINGLE
## 32 2505883.2 SINGLE FAMILY 1.5 2001 Brick SLATE
## 47 897047.1 SINGLE FAMILY 2.5 1950 Brick SLATE
## 62 222437.1 SINGLE FAMILY 1.0 1950 Frame SHINGLE
## 75 2165585.5 SINGLE FAMILY 2.0 2005 Stone SLATE
## 89 518323.5 SINGLE FAMILY 2.0 2003 Brick SHINGLE
## 103 700871.4 SINGLE FAMILY 2.0 1993 Frame SHINGLE
## 130 558365.8 SINGLE FAMILY 2.0 2008 Brick SHINGLE
## 134 417817.1 SINGLE FAMILY 1.0 1955 Brick SHINGLE
## 137 1121120.7 SINGLE FAMILY 2.0 1993 Brick SHINGLE
## 176 883709.5 SINGLE FAMILY 2.0 1930 Frame SLATE
## 216 1190970.6 SINGLE FAMILY 2.0 2004 Brick SHINGLE
## 246 369140.6 SINGLE FAMILY 2.0 1930 Frame SHINGLE
## 334 298066.5 SINGLE FAMILY 1.0 1998 Frame SHINGLE
## 388 497169.6 SINGLE FAMILY 2.0 1986 Brick SHINGLE
## 394 714298.5 SINGLE FAMILY 2.0 2002 Brick SHINGLE
## 469 180722.5 SINGLE FAMILY 1.0 1950 Frame SHINGLE
## 471 1278376.6 SINGLE FAMILY 2.0 1917 Stone SHINGLE
## 478 498788.5 SINGLE FAMILY 1.0 1989 Stone SHINGLE
## 523 1098461.3 SINGLE FAMILY 2.0 2015 Stucco SHINGLE
## 569 368854.4 SINGLE FAMILY 2.0 1890 Frame SHINGLE
## 595 375283.5 SINGLE FAMILY 2.0 1924 Brick SLATE
## 600 369047.2 SINGLE FAMILY 1.0 2015 Frame SHINGLE
## 607 517839.3 SINGLE FAMILY 2.0 1939 Stone SLATE
## 619 643438.5 SINGLE FAMILY 2.0 1973 Brick SHINGLE
## 639 3990701.1 SINGLE FAMILY 2.5 1935 Brick SLATE
## 663 1665583.2 SINGLE FAMILY 2.0 2001 Frame SHINGLE
## 707 824997.7 MULTI-FAMILY 2.0 1860 Frame SHINGLE
## 746 531899.0 SINGLE FAMILY 2.0 1975 Brick SHINGLE
## 752 1269568.2 SINGLE FAMILY 2.0 2012 Brick SHINGLE
## 778 1522371.5 SINGLE FAMILY 2.0 1937 Brick SLATE
## 803 2526697.5 SINGLE FAMILY 2.5 1920 Stone SLATE
## 817 475406.7 SINGLE FAMILY 2.0 1935 Frame METAL
## 818 133728.9 SINGLE FAMILY 1.0 1962 Brick SHINGLE
## 882 1385186.9 MULTI-FAMILY 2.0 1978 Stone SHINGLE
## 944 1048091.1 MULTI-FAMILY 2.0 1925 Frame SLATE
## 963 307965.7 SINGLE FAMILY 1.0 1978 Brick SHINGLE
## 1010 344285.3 SINGLE FAMILY 1.0 1958 Frame ROLL
## 1024 1489254.6 SINGLE FAMILY 1.5 1955 Brick SHINGLE
## 1038 1152514.2 SINGLE FAMILY 1.5 1925 Stucco SLATE
## 1084 1113707.4 SINGLE FAMILY 2.0 2001 Brick SHINGLE
## 1100 1406904.7 SINGLE FAMILY 2.0 2003 Brick SLATE
## 1161 715654.8 SINGLE FAMILY 2.0 1999 Brick SHINGLE
## 1175 500165.6 SINGLE FAMILY 2.0 1995 Brick SHINGLE
## 1212 383206.9 SINGLE FAMILY 1.0 1988 Brick SHINGLE
## 1217 291807.5 SINGLE FAMILY 1.5 1940 Frame SHINGLE
## 1218 1692333.8 SINGLE FAMILY 2.0 1936 Brick SLATE
## 1238 1428734.9 SINGLE FAMILY 2.0 1940 Brick SHINGLE
## 1247 1300535.4 SINGLE FAMILY 2.0 1940 Stone SLATE
## 1266 171803.5 SINGLE FAMILY 1.0 1971 Frame SHINGLE
## 1278 348203.5 SINGLE FAMILY 1.0 1995 Frame SHINGLE
## 1279 529870.6 SINGLE FAMILY 1.5 1991 Frame SHINGLE
## 1283 433110.7 SINGLE FAMILY 2.0 2007 Brick SHINGLE
## 1294 1204385.9 SINGLE FAMILY 2.0 1920 Stone ROLL
## 1310 325354.8 SINGLE FAMILY 1.0 1960 Frame SHINGLE
## 1333 171421.7 SINGLE FAMILY 1.0 1955 Brick SHINGLE
## 1335 139541.9 SINGLE FAMILY 1.0 1952 Brick SHINGLE
## 1364 452455.1 SINGLE FAMILY 1.0 2008 Brick SHINGLE
## 1374 624915.5 SINGLE FAMILY 2.0 2009 Brick SHINGLE
## basement totalrooms bedrooms bathrooms fireplaces sqft lotarea state
## 2 1 5 3 1.5 2.0000000 1285 165964.0 PA
## 32 0 10 5 6.0 0.6100982 6145 291416.4 VA
## 47 1 11 5 4.5 0.6100982 3895 296947.0 VA
## 62 1 6 3 1.0 0.0000000 1296 106722.0 PA
## 75 1 10 4 5.5 3.0000000 9444 131290.0 PA
## 89 1 8 4 3.0 0.0000000 3884 384635.0 PA
## 103 1 10 4 3.5 1.0000000 5585 435600.0 PA
## 130 1 9 4 2.5 1.0000000 3302 431941.0 PA
## 134 1 10 4 2.5 2.0000000 3425 108900.0 PA
## 137 1 14 5 6.5 1.0000000 6170 229300.0 PA
## 176 1 10 5 4.5 1.0000000 3523 125845.0 PA
## 216 1 10 4 5.5 2.0000000 7886 609840.0 PA
## 246 1 8 4 3.5 1.0000000 2734 130754.0 PA
## 334 1 3 2 2.0 0.0000000 1680 277042.0 PA
## 388 1 7 3 2.5 1.0000000 3410 267546.0 PA
## 394 1 10 5 4.5 2.0000000 3607 148148.0 PA
## 469 1 7 2 2.0 0.0000000 1293 130680.0 PA
## 471 1 10 7 7.0 4.0000000 8096 365468.0 PA
## 478 1 8 3 2.5 1.0000000 6341 745007.0 PA
## 523 1 13 5 7.0 0.0000000 8919 141012.0 PA
## 569 1 7 4 2.0 1.0000000 2856 3820212.0 PA
## 595 1 7 3 2.0 2.0000000 4103 214315.0 PA
## 600 1 10 4 3.0 0.0000000 2184 100963.0 PA
## 607 1 9 4 2.5 1.0000000 2250 719175.0 PA
## 619 1 8 4 2.5 2.0000000 4550 2087636.0 PA
## 639 0 19 8 8.5 0.6100982 10060 302364.0 VA
## 663 1 16 5 5.5 2.0000000 9008 167654.0 PA
## 707 1 9 5 3.0 1.0000000 6912 263538.0 PA
## 746 1 11 5 3.5 2.0000000 4228 435600.0 PA
## 752 1 11 5 6.0 0.0000000 6496 140433.0 PA
## 778 1 11 4 6.0 4.0000000 5164 446621.0 PA
## 803 1 13 5 7.5 4.0000000 15872 1393075.0 PA
## 817 0 7 3 2.0 0.6100982 1731 949608.0 VA
## 818 1 6 3 1.0 0.0000000 988 111252.0 PA
## 882 1 16 4 5.0 5.0000000 7484 159560.0 PA
## 944 1 10 4 4.0 2.0000000 7744 263974.0 PA
## 963 1 7 3 2.5 1.0000000 1733 249817.0 PA
## 1010 1 7 3 3.0 2.0000000 2149 109292.0 PA
## 1024 1 16 7 7.5 2.0000000 6403 130680.0 PA
## 1038 1 11 5 4.0 4.0000000 8125 144489.0 PA
## 1084 1 10 4 4.5 0.0000000 6434 104152.0 PA
## 1100 1 10 5 5.0 1.0000000 4824 174676.0 PA
## 1161 1 10 5 5.0 3.0000000 7302 261012.0 PA
## 1175 1 9 5 3.5 1.0000000 3842 135123.0 PA
## 1212 1 8 3 2.5 1.0000000 2697 578825.0 PA
## 1217 1 7 3 2.5 0.0000000 1986 438475.0 PA
## 1218 1 12 6 6.5 2.0000000 7622 166573.0 PA
## 1238 1 12 4 6.5 5.0000000 10852 130680.0 PA
## 1247 1 12 6 5.5 2.0000000 5163 144619.0 PA
## 1266 1 5 2 1.0 0.0000000 1092 2115273.0 PA
## 1278 1 3 1 1.0 0.0000000 1011 496671.0 PA
## 1279 1 9 4 3.5 1.0000000 3816 109771.0 PA
## 1283 1 10 4 2.5 1.0000000 3504 202040.0 PA
## 1294 1 12 4 5.5 3.0000000 5878 224500.0 PA
## 1310 1 7 4 2.5 0.0000000 1514 566280.0 PA
## 1333 1 6 3 2.0 0.0000000 1690 120095.0 PA
## 1335 1 5 2 1.0 0.0000000 1432 175634.0 PA
## 1364 1 7 3 2.5 1.0000000 3974 543585.0 PA
## 1374 1 9 4 2.5 0.0000000 4113 131238.0 PA
## zipcode AvgIncome
## 2 15003 37868
## 32 23226 50519
## 47 23235 56989
## 62 15026 69387
## 75 15238 67370
## 89 15025 60669
## 103 15025 60669
## 130 15005 64427
## 134 15238 67370
## 137 15044 95289
## 176 15238 67370
## 216 15237 55016
## 246 15005 64427
## 334 15025 60669
## 388 15044 95289
## 394 15005 64427
## 469 15238 67370
## 471 15044 95289
## 478 15025 60669
## 523 15215 43750
## 569 15332 62875
## 595 15235 41367
## 600 15332 62875
## 607 15237 55016
## 619 15037 55863
## 639 23226 50519
## 663 15215 43750
## 707 15044 95289
## 746 15025 60669
## 752 15044 95289
## 778 15238 67370
## 803 15238 67370
## 817 23234 40522
## 818 15026 69387
## 882 15238 67370
## 944 15238 67370
## 963 15025 60669
## 1010 15237 55016
## 1024 15238 67370
## 1038 15215 43750
## 1084 15044 95289
## 1100 15005 64427
## 1161 15005 64427
## 1175 15044 95289
## 1212 15037 55863
## 1217 15037 55863
## 1218 15238 67370
## 1238 15238 67370
## 1247 15238 67370
## 1266 15332 62875
## 1278 15235 41367
## 1279 15237 55016
## 1283 15332 62875
## 1294 15215 43750
## 1310 15237 55016
## 1333 15332 62875
## 1335 15037 55863
## 1364 15037 55863
## 1374 15237 55016
nrow(largelots)
## [1] 59
hist(train$lotarea, breaks = 100)
Now let’s make note of any other outliers. First, the square foot column - there are 3 outliers.
hist(train$sqft)
train[train$sqft > 10000 , ]
## price desc numstories yearbuilt exteriorfinish rooftype
## 639 3990701 SINGLE FAMILY 2.5 1935 Brick SLATE
## 803 2526698 SINGLE FAMILY 2.5 1920 Stone SLATE
## 1238 1428735 SINGLE FAMILY 2.0 1940 Brick SHINGLE
## basement totalrooms bedrooms bathrooms fireplaces sqft lotarea state
## 639 0 19 8 8.5 0.6100982 10060 302364 VA
## 803 1 13 5 7.5 4.0000000 15872 1393075 PA
## 1238 1 12 4 6.5 5.0000000 10852 130680 PA
## zipcode AvgIncome
## 639 23226 50519
## 803 15238 67370
## 1238 15238 67370
Let’s look at price on its own.
hist(train$price, breaks = 100)
Zip code has too many factors, so I will delete the column. Average Income can be a good proxy for zip code.
train <- subset(train, select = -zipcode)
test <- subset(test, select = -zipcode)
Modeling:
Initially split into a training and test set for all models (train1 and test1). Use cross validation when applicable on train1.
Linear model manually changing predictors
library(stats)
library(boot)
#validation set split
set.seed(2)
trainindex <- sample(1:nrow(train), nrow(train)*.7)
train1 <- train[trainindex,]
test1 <- train[-trainindex,]
#linear model with all predictors
lmfull <- glm(price ~ ., data = train)
summary(lmfull)
##
## Call:
## glm(formula = price ~ ., data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -594921 -59401 -3126 54346 1864366
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.298e+05 2.870e+05 0.801 0.423442
## descMULTI-FAMILY -1.295e+05 2.967e+04 -4.365 1.37e-05 ***
## descROWHOUSE 2.670e+04 3.832e+04 0.697 0.485979
## descSINGLE FAMILY -2.347e+04 1.846e+04 -1.272 0.203733
## numstories -1.942e+04 8.997e+03 -2.159 0.031019 *
## yearbuilt -2.116e+02 1.498e+02 -1.413 0.157964
## exteriorfinishFrame -1.579e+04 8.831e+03 -1.788 0.074035 .
## exteriorfinishStone -3.901e+04 2.079e+04 -1.876 0.060891 .
## exteriorfinishStucco -7.715e+04 1.479e+04 -5.218 2.09e-07 ***
## rooftypeROLL 6.743e+03 2.166e+04 0.311 0.755576
## rooftypeSHINGLE -4.887e+04 1.453e+04 -3.364 0.000789 ***
## rooftypeSLATE 4.021e+04 1.181e+04 3.404 0.000684 ***
## basement 3.629e+04 1.320e+04 2.750 0.006046 **
## totalrooms -2.797e+03 3.172e+03 -0.882 0.377960
## bedrooms -1.477e+04 6.106e+03 -2.419 0.015680 *
## bathrooms 8.017e+04 5.619e+03 14.268 < 2e-16 ***
## fireplaces -3.872e+04 7.234e+03 -5.352 1.02e-07 ***
## sqft 1.560e+02 5.075e+00 30.731 < 2e-16 ***
## lotarea 5.920e-02 2.480e-02 2.387 0.017133 *
## stateVA 1.917e+05 1.577e+04 12.157 < 2e-16 ***
## AvgIncome 1.442e+00 3.051e-01 4.727 2.51e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 17267122762)
##
## Null deviance: 1.4339e+14 on 1391 degrees of freedom
## Residual deviance: 2.3673e+13 on 1371 degrees of freedom
## AIC: 36785
##
## Number of Fisher Scoring iterations: 2
#getting cross validated MSE for full LM model
lmfullcv <- cv.glm(train, lmfull, K = 10)
mselmfull <- lmfullcv$delta[1]
mselmfull
## [1] 18198966972
#including only significant terms
lmreduced <- glm(price ~ desc + exteriorfinish + numstories + rooftype + basement + bedrooms + bathrooms + fireplaces + sqft + lotarea + state + AvgIncome, data = train)
#getting CV error
lmreducedcv <- cv.glm(train, lmreduced, K = 10)
msereducedlm <- lmreducedcv$delta[1]
msereducedlm
## [1] 17916548309
Ridge
set.seed(1)
library(glmnet)
## Warning: package 'glmnet' was built under R version 3.6.3
## Loading required package: Matrix
## Loaded glmnet 4.0-2
x = model.matrix(price~., train)[, -1]
y = train$price
grid <- 10^seq(10,-2, length = 100)
ridge <- glmnet(x, y, alpha = 0, lambda = grid)
#use cross validation to choose best lambda value
cvridge <- cv.glmnet(x, y, alpha = 0, lambda = grid)
best <- cvridge$lambda.min
best
## [1] 231.013
mseridge <- cvridge$cvm[which(cvridge$lambda == best)]
mseridge
## [1] 18247763194
#obtaining the coefficients of the model chosen by CV
model <- predict(ridge, s = best, type = "coefficients")
model
## 24 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 2.271854e+05
## descMOBILE HOME .
## descMULTI-FAMILY -1.295726e+05
## descROWHOUSE 2.697548e+04
## descSINGLE FAMILY -2.321684e+04
## numstories -1.921437e+04
## yearbuilt -2.101641e+02
## exteriorfinishConcrete .
## exteriorfinishFrame -1.581381e+04
## exteriorfinishLog .
## exteriorfinishStone -3.898815e+04
## exteriorfinishStucco -7.718419e+04
## rooftypeROLL 6.561915e+03
## rooftypeSHINGLE -4.897133e+04
## rooftypeSLATE 4.027268e+04
## basement 3.536442e+04
## totalrooms -2.670024e+03
## bedrooms -1.475754e+04
## bathrooms 8.040069e+04
## fireplaces -3.839883e+04
## sqft 1.554213e+02
## lotarea 5.955892e-02
## stateVA 1.904573e+05
## AvgIncome 1.438172e+00
Lasso
lasso <- glmnet(x, y, alpha = 1, lambda = grid)
#finding the best lambda value by CV
set.seed(2)
cvlasso <- cv.glmnet(x, y, alpha = 1)
best <- cvlasso$lambda.min
#obtaining the CV error of the lasso model chosen by CV
mselasso <- cvlasso$cvm[which(cvridge$lambda == cvridge$lambda.min)]
mselasso
## [1] 18836506616
predict(lasso, s = best, type = "coefficients")
## 24 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 1.099764e+05
## descMOBILE HOME .
## descMULTI-FAMILY -1.219651e+05
## descROWHOUSE 2.732720e+04
## descSINGLE FAMILY -1.661825e+04
## numstories -1.698020e+04
## yearbuilt -1.487798e+02
## exteriorfinishConcrete .
## exteriorfinishFrame -1.446350e+04
## exteriorfinishLog .
## exteriorfinishStone -3.440568e+04
## exteriorfinishStucco -7.510747e+04
## rooftypeROLL 6.073763e+02
## rooftypeSHINGLE -5.094604e+04
## rooftypeSLATE 3.809361e+04
## basement 2.749767e+04
## totalrooms -1.550969e+03
## bedrooms -1.511292e+04
## bathrooms 7.982234e+04
## fireplaces -3.603688e+04
## sqft 1.536619e+02
## lotarea 5.653835e-02
## stateVA 1.805653e+05
## AvgIncome 1.322257e+00
## ols after lasso
afterlasso <- glm(price ~ desc + numstories + exteriorfinish + rooftype + bathrooms + bedrooms + fireplaces + sqft + lotarea + state + AvgIncome, data = train)
summary(afterlasso)
##
## Call:
## glm(formula = price ~ desc + numstories + exteriorfinish + rooftype +
## bathrooms + bedrooms + fireplaces + sqft + lotarea + state +
## AvgIncome, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -609411 -60092 -1971 54607 1857374
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.591e+05 2.819e+04 -5.643 2.03e-08 ***
## descMULTI-FAMILY -1.164e+05 2.816e+04 -4.133 3.80e-05 ***
## descROWHOUSE 4.688e+04 3.751e+04 1.250 0.211512
## descSINGLE FAMILY -1.045e+04 1.780e+04 -0.587 0.557310
## numstories -1.559e+04 8.882e+03 -1.755 0.079471 .
## exteriorfinishFrame -1.724e+04 8.818e+03 -1.955 0.050799 .
## exteriorfinishStone -3.962e+04 2.082e+04 -1.903 0.057272 .
## exteriorfinishStucco -8.171e+04 1.472e+04 -5.549 3.45e-08 ***
## rooftypeROLL -4.803e+03 2.118e+04 -0.227 0.820644
## rooftypeSHINGLE -5.558e+04 1.352e+04 -4.112 4.16e-05 ***
## rooftypeSLATE 3.786e+04 1.161e+04 3.260 0.001140 **
## bathrooms 8.097e+04 5.408e+03 14.974 < 2e-16 ***
## bedrooms -1.864e+04 5.043e+03 -3.695 0.000228 ***
## fireplaces -3.764e+04 7.210e+03 -5.220 2.07e-07 ***
## sqft 1.532e+02 4.959e+00 30.896 < 2e-16 ***
## lotarea 6.277e-02 2.482e-02 2.529 0.011553 *
## stateVA 1.603e+05 1.130e+04 14.176 < 2e-16 ***
## AvgIncome 1.333e+00 2.864e-01 4.653 3.59e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 17358273582)
##
## Null deviance: 1.4339e+14 on 1391 degrees of freedom
## Residual deviance: 2.3850e+13 on 1374 degrees of freedom
## AIC: 36790
##
## Number of Fisher Scoring iterations: 2
afterlassocv <- cv.glm(train, afterlasso, K = 10)
mseafterlasso <- afterlassocv$delta[1]
mseafterlasso
## [1] 17869313365
plot(afterlasso)
Tree
library(tree)
## Warning: package 'tree' was built under R version 3.6.3
tree <- tree(price ~., data = train1)
summary(tree)
##
## Regression tree:
## tree(formula = price ~ ., data = train1)
## Variables actually used in tree construction:
## [1] "sqft" "rooftype" "state" "bathrooms"
## Number of terminal nodes: 11
## Residual mean deviance: 2.151e+10 = 2.071e+13 / 963
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1352000 -78700 -9910 0 76700 1742000
plot(tree)
text(tree, pretty = 0)
pred <- predict(tree, newdata = test1)
msetree <- mean( (pred - test1$price)^2)
msetree
## [1] 20770133160
#choose tuning parameter to prune the tree using 10-fold CV
cvtree <- cv.tree(tree)
plot(cvtree$size, cvtree$dev, type = 'b')
#choose size 8 as the pruned tree
prune = prune.tree(tree, best = 11)
plot(prune)
text(prune, pretty = 0)
pred <- predict(prune, newdata = test1)
mseprune <- mean( (pred - test1$price)^2)
mseprune
## [1] 20770133160
Bagging, Random Forest
set.seed(2)
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.6.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
#obtain bagging model
bag <- randomForest(price ~., data = train1, mtry = 14, importance = TRUE)
bag
##
## Call:
## randomForest(formula = price ~ ., data = train1, mtry = 14, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 14
##
## Mean of squared residuals: 16720067293
## % Var explained: 84.96
pred <- predict(bag, newdata = test1)
msebag <- mean( (pred - test1$price)^2)
#random forest tuning over mtry using validation set approach
MSEs <- rep(NA, 12)
for(i in 3:14){
rf <- randomForest(price ~., data = train1, mtry = i)
pred <- predict(rf, newdata = test1)
MSEs[i-2] <- mean( (pred - test1$price)^2)
}
plot(3:14, MSEs, type = "b")
#obtain the validation set MSE for the best rf model
bestrf <- randomForest(price~., data = train1, mtry = (which.min(MSEs) + 2))
pred <- predict(bestrf, newdata = test1)
mserf <- mean( (pred - test1$price)^2)
varImpPlot(bestrf)
importance(bestrf)
## IncNodePurity
## desc 3.145216e+11
## numstories 2.342883e+12
## yearbuilt 2.891023e+12
## exteriorfinish 1.482689e+12
## rooftype 5.762043e+12
## basement 9.314035e+11
## totalrooms 1.084386e+13
## bedrooms 7.662470e+12
## bathrooms 2.245261e+13
## fireplaces 2.855454e+12
## sqft 2.873395e+13
## lotarea 1.070175e+13
## state 2.631609e+12
## AvgIncome 3.901091e+12
Boosting
library(gbm)
## Warning: package 'gbm' was built under R version 3.6.3
## Loaded gbm 2.1.8
# choosing the best lambda tuning parameter and best ntree
set.seed(1)
grid <- c(10^seq(-5, -1), .2, .3)
ntree <- c(50, 100, 250, 500, 1000, 1500, 2000, 3000)
MSEs <- matrix(nrow = length(ntree), ncol = length(grid))
for(i in 1:length(grid)){
for(j in 1:length(ntree)){
boost <- gbm(price ~., data = train1, n.trees = ntree[j], distribution = "gaussian", interaction.depth = 4, shrinkage = grid[i])
pred <- predict(boost, newdata = test1)
MSEs[j,i] <- mean( (pred - test1$price)^2)
}
}
## Using 50 trees...
## Using 100 trees...
## Using 250 trees...
## Using 500 trees...
## Using 1000 trees...
## Using 1500 trees...
## Using 2000 trees...
## Using 3000 trees...
## Using 50 trees...
## Using 100 trees...
## Using 250 trees...
## Using 500 trees...
## Using 1000 trees...
## Using 1500 trees...
## Using 2000 trees...
## Using 3000 trees...
## Using 50 trees...
## Using 100 trees...
## Using 250 trees...
## Using 500 trees...
## Using 1000 trees...
## Using 1500 trees...
## Using 2000 trees...
## Using 3000 trees...
## Using 50 trees...
## Using 100 trees...
## Using 250 trees...
## Using 500 trees...
## Using 1000 trees...
## Using 1500 trees...
## Using 2000 trees...
## Using 3000 trees...
## Using 50 trees...
## Using 100 trees...
## Using 250 trees...
## Using 500 trees...
## Using 1000 trees...
## Using 1500 trees...
## Using 2000 trees...
## Using 3000 trees...
## Using 50 trees...
## Using 100 trees...
## Using 250 trees...
## Using 500 trees...
## Using 1000 trees...
## Using 1500 trees...
## Using 2000 trees...
## Using 3000 trees...
## Using 50 trees...
## Using 100 trees...
## Using 250 trees...
## Using 500 trees...
## Using 1000 trees...
## Using 1500 trees...
## Using 2000 trees...
## Using 3000 trees...
MSEs
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 84035239297 83563785864 79029865198 47565387833 8700060858 8437308910
## [2,] 83983504557 83054313254 74458864423 30225881550 6652570210 7794588428
## [3,] 83823908707 81518183345 62443440136 13043105793 6060579082 6728686801
## [4,] 83560622265 79037184539 47552386952 8492551900 5874305747 6805514721
## [5,] 83048042051 74393040534 30031983471 7042301712 6070065755 6447094684
## [6,] 82530178920 70065724010 20816403904 6495228601 5941262183 6327600116
## [7,] 82023662602 66057339568 15852442026 6352379927 6156373526 6512712554
## [8,] 81013833761 59051139022 11324811011 6078884039 5897293905 6960472764
## [,7]
## [1,] 7748989375
## [2,] 6926894319
## [3,] 6491387218
## [4,] 7211123389
## [5,] 6998353015
## [6,] 8548368498
## [7,] 7365770518
## [8,] 6673433699
which <- which.min(MSEs)
#obtaining the validation set error of the best boosting model
boost <- gbm(price ~., data = train1, n.trees = ntree[(which %% 8)], distribution = "gaussian", interaction.depth = 4, shrinkage = grid[round(which/8, 0) + 1])
pred <- predict(boost, newdata = test1)
## Using 500 trees...
mseboost <- mean( (pred - test1$price)^2 )
mseboost
## [1] 6170604767
#MSE for observations below the 3rd quartile
regtest <- test1[test1$price < 1000000,]
pred <- predict(boost, newdata = regtest)
## Using 500 trees...
mseregular <- mean( (pred - regtest$price)^2 )
mseregular
## [1] 5164666245
#MSE for observations above the 3rd quartile
hightest <- test1[test1$price > 1000000,]
pred <- predict(boost, newdata = hightest)
## Using 500 trees...
msehigh <- mean( (pred - hightest$price)^2 )
msehigh
## [1] 35199116424
summary(boost)
## var rel.inf
## sqft sqft 47.8835374
## bathrooms bathrooms 23.9323397
## lotarea lotarea 8.4076568
## rooftype rooftype 6.0365388
## state state 4.4538855
## totalrooms totalrooms 2.7532791
## AvgIncome AvgIncome 2.1346561
## yearbuilt yearbuilt 1.4418337
## bedrooms bedrooms 0.8749210
## exteriorfinish exteriorfinish 0.6926275
## numstories numstories 0.6851495
## fireplaces fireplaces 0.3324516
## desc desc 0.2625792
## basement basement 0.1085441
data.frame(mselmfull, msereducedlm, mselasso, mseafterlasso, mseridge, msetree, mseprune, mserf, msebag, mseboost)
## mselmfull msereducedlm mselasso mseafterlasso mseridge msetree
## 1 18198966972 17916548309 18836506616 17869313365 18247763194 20770133160
## mseprune mserf msebag mseboost
## 1 20770133160 6556605797 8684574624 6170604767
Predictions
test$price <- predict(boost, newdata = test)
## Using 500 trees...
test$student_id <- rep(4181080, nrow(test))
testing_predictions <- subset(test, select = c(id, price, student_id))
write.csv(testing_predictions, "./testing_predictions_4181080.csv")