Prepare Data
Read the data
airbnb_original <- read_csv("airbnb_eda.csv")
## Rows: 39118 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): name, host_name, neighbourhood_group, neighbourhood, room_type, la...
## dbl (10): id, host_id, latitude, longitude, price, minimum_nights, number_of...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(airbnb_original)
## Rows: 39,118
## Columns: 17
## $ id <dbl> 2539, 2595, 3647, 3831, 5022, 5099, 5121, 5203, 52…
## $ name <chr> "Clean & quiet apt home by the park", "Skylit Midt…
## $ host_id <dbl> 2787, 2845, 4632, 4869, 7192, 7322, 7356, 7490, 75…
## $ host_name <chr> "John", "Jennifer", "Elisabeth", "LisaRoxanne", "L…
## $ neighbourhood_group <chr> "Brooklyn", "Manhattan", "Manhattan", "Brooklyn", …
## $ neighbourhood <chr> "Kensington", "Midtown", "Harlem", "Clinton Hill",…
## $ latitude <dbl> 40.64749, 40.75362, 40.80902, 40.68514, 40.79851, …
## $ longitude <dbl> -73.97237, -73.98377, -73.94190, -73.95976, -73.94…
## $ room_type <chr> "Private room", "Entire home/apt", "Private room",…
## $ price <dbl> 149, 225, 60, 45, 80, 200, 60, 32, 150, 54, 85, 48…
## $ minimum_nights <dbl> 1, 1, 3, 1, 10, 3, 45, 2, 1, 5, 2, 90, 2, 2, 1, 3,…
## $ number_of_reviews <dbl> 9, 45, 0, 270, 9, 74, 49, 118, 160, 53, 188, 27, 1…
## $ last_review <chr> "10/19/2018", "5/21/2019", NA, "7/5/2019", "11/19/…
## $ reviews_per_month <dbl> 0.21, 0.38, NA, 4.64, 0.10, 0.59, 0.40, 0.99, 1.33…
## $ floor <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ `noise(dB)` <dbl> 69.05646, 56.05428, 56.05428, 69.05646, 56.05428, …
## $ Location <chr> "807, Friel Place, Brooklyn, Kings County, City of…
Feature Engineering: Part 1
#Summary of airbnb
summary(airbnb_original)
## id name host_id host_name
## Min. : 2539 Length:39118 Min. : 2438 Length:39118
## 1st Qu.: 9436041 Class :character 1st Qu.: 7789663 Class :character
## Median :19637846 Mode :character Median : 30616863 Mode :character
## Mean :18980697 Mean : 67230185
## 3rd Qu.:29079859 3rd Qu.:107270482
## Max. :36487245 Max. :274321313
##
## neighbourhood_group neighbourhood latitude longitude
## Length:39118 Length:39118 Min. :40.51 Min. :-74.24
## Class :character Class :character 1st Qu.:40.69 1st Qu.:-73.98
## Mode :character Mode :character Median :40.72 Median :-73.96
## Mean :40.73 Mean :-73.95
## 3rd Qu.:40.76 3rd Qu.:-73.94
## Max. :40.91 Max. :-73.71
##
## room_type price minimum_nights number_of_reviews
## Length:39118 Min. : 0.0 Min. : 1.000 Min. : 0.00
## Class :character 1st Qu.: 60.0 1st Qu.: 1.000 1st Qu.: 1.00
## Mode :character Median : 99.0 Median : 2.000 Median : 5.00
## Mean : 140.4 Mean : 7.004 Mean : 23.46
## 3rd Qu.: 165.0 3rd Qu.: 5.000 3rd Qu.: 24.00
## Max. :10000.0 Max. :1250.000 Max. :629.00
##
## last_review reviews_per_month floor noise(dB)
## Length:39118 Min. : 0.010 Min. : 0.000 Min. :22.96
## Class :character 1st Qu.: 0.190 1st Qu.: 1.000 1st Qu.:56.05
## Mode :character Median : 0.720 Median : 1.000 Median :62.48
## Mean : 1.375 Mean : 1.582 Mean :62.70
## 3rd Qu.: 2.030 3rd Qu.: 1.000 3rd Qu.:69.06
## Max. :58.500 Max. :20.000 Max. :98.06
## NA's :7978
## Location
## Length:39118
## Class :character
## Mode :character
##
##
##
##
#Convert neighbourhood_group, neighborhood and room_type to factor.
airbnb_original$neighbourhood_group<- as.factor(airbnb_original$neighbourhood_group)
airbnb_original$neighbourhood<- as.factor(airbnb_original$neighbourhood)
airbnb_original$room_type<- as.factor(airbnb_original$room_type)
airbnb_original %>%
sapply(levels)
## $id
## NULL
##
## $name
## NULL
##
## $host_id
## NULL
##
## $host_name
## NULL
##
## $neighbourhood_group
## [1] "Bronx" "Brooklyn" "Manhattan" "Queens"
## [5] "Staten Island"
##
## $neighbourhood
## [1] "Allerton" "Arden Heights"
## [3] "Arrochar" "Arverne"
## [5] "Astoria" "Bath Beach"
## [7] "Battery Park City" "Bay Ridge"
## [9] "Bay Terrace" "Bay Terrace, Staten Island"
## [11] "Baychester" "Bayside"
## [13] "Bayswater" "Bedford-Stuyvesant"
## [15] "Belle Harbor" "Bellerose"
## [17] "Belmont" "Bensonhurst"
## [19] "Bergen Beach" "Boerum Hill"
## [21] "Borough Park" "Breezy Point"
## [23] "Briarwood" "Brighton Beach"
## [25] "Bronxdale" "Brooklyn Heights"
## [27] "Brownsville" "Bull's Head"
## [29] "Bushwick" "Cambria Heights"
## [31] "Canarsie" "Carroll Gardens"
## [33] "Castle Hill" "Castleton Corners"
## [35] "Chelsea" "Chinatown"
## [37] "City Island" "Civic Center"
## [39] "Claremont Village" "Clason Point"
## [41] "Clifton" "Clinton Hill"
## [43] "Co-op City" "Cobble Hill"
## [45] "College Point" "Columbia St"
## [47] "Concord" "Concourse"
## [49] "Concourse Village" "Coney Island"
## [51] "Corona" "Crown Heights"
## [53] "Cypress Hills" "Ditmars Steinway"
## [55] "Dongan Hills" "Douglaston"
## [57] "Downtown Brooklyn" "DUMBO"
## [59] "Dyker Heights" "East Elmhurst"
## [61] "East Flatbush" "East Harlem"
## [63] "East Morrisania" "East New York"
## [65] "East Village" "Eastchester"
## [67] "Edenwald" "Edgemere"
## [69] "Elmhurst" "Eltingville"
## [71] "Emerson Hill" "Far Rockaway"
## [73] "Fieldston" "Financial District"
## [75] "Flatbush" "Flatiron District"
## [77] "Flatlands" "Flushing"
## [79] "Fordham" "Forest Hills"
## [81] "Fort Greene" "Fort Hamilton"
## [83] "Fort Wadsworth" "Fresh Meadows"
## [85] "Glendale" "Gowanus"
## [87] "Gramercy" "Graniteville"
## [89] "Grant City" "Gravesend"
## [91] "Great Kills" "Greenpoint"
## [93] "Greenwich Village" "Grymes Hill"
## [95] "Harlem" "Hell's Kitchen"
## [97] "Highbridge" "Hollis"
## [99] "Holliswood" "Howard Beach"
## [101] "Howland Hook" "Huguenot"
## [103] "Hunts Point" "Inwood"
## [105] "Jackson Heights" "Jamaica"
## [107] "Jamaica Estates" "Jamaica Hills"
## [109] "Kensington" "Kew Gardens"
## [111] "Kew Gardens Hills" "Kingsbridge"
## [113] "Kips Bay" "Laurelton"
## [115] "Lighthouse Hill" "Little Italy"
## [117] "Little Neck" "Long Island City"
## [119] "Longwood" "Lower East Side"
## [121] "Manhattan Beach" "Marble Hill"
## [123] "Mariners Harbor" "Maspeth"
## [125] "Melrose" "Middle Village"
## [127] "Midland Beach" "Midtown"
## [129] "Midwood" "Mill Basin"
## [131] "Morningside Heights" "Morris Heights"
## [133] "Morris Park" "Morrisania"
## [135] "Mott Haven" "Mount Eden"
## [137] "Mount Hope" "Murray Hill"
## [139] "Navy Yard" "Neponsit"
## [141] "New Brighton" "New Dorp"
## [143] "New Dorp Beach" "New Springville"
## [145] "NoHo" "Nolita"
## [147] "North Riverdale" "Norwood"
## [149] "Oakwood" "Olinville"
## [151] "Ozone Park" "Park Slope"
## [153] "Parkchester" "Pelham Bay"
## [155] "Pelham Gardens" "Port Morris"
## [157] "Port Richmond" "Prince's Bay"
## [159] "Prospect-Lefferts Gardens" "Prospect Heights"
## [161] "Queens Village" "Randall Manor"
## [163] "Red Hook" "Rego Park"
## [165] "Richmond Hill" "Richmondtown"
## [167] "Ridgewood" "Riverdale"
## [169] "Rockaway Beach" "Roosevelt Island"
## [171] "Rosebank" "Rosedale"
## [173] "Rossville" "Schuylerville"
## [175] "Sea Gate" "Sheepshead Bay"
## [177] "Shore Acres" "Silver Lake"
## [179] "SoHo" "Soundview"
## [181] "South Beach" "South Ozone Park"
## [183] "South Slope" "Springfield Gardens"
## [185] "Spuyten Duyvil" "St. Albans"
## [187] "St. George" "Stapleton"
## [189] "Stuyvesant Town" "Sunnyside"
## [191] "Sunset Park" "Theater District"
## [193] "Throgs Neck" "Todt Hill"
## [195] "Tompkinsville" "Tottenville"
## [197] "Tremont" "Tribeca"
## [199] "Two Bridges" "Unionport"
## [201] "University Heights" "Upper East Side"
## [203] "Upper West Side" "Van Nest"
## [205] "Vinegar Hill" "Wakefield"
## [207] "Washington Heights" "West Brighton"
## [209] "West Farms" "West Village"
## [211] "Westchester Square" "Westerleigh"
## [213] "Whitestone" "Williamsbridge"
## [215] "Williamsburg" "Willowbrook"
## [217] "Windsor Terrace" "Woodhaven"
## [219] "Woodlawn" "Woodside"
##
## $latitude
## NULL
##
## $longitude
## NULL
##
## $room_type
## [1] "Entire home/apt" "Private room" "Shared room"
##
## $price
## NULL
##
## $minimum_nights
## NULL
##
## $number_of_reviews
## NULL
##
## $last_review
## NULL
##
## $reviews_per_month
## NULL
##
## $floor
## NULL
##
## $`noise(dB)`
## NULL
##
## $Location
## NULL
#Create new numeric variables from the factor variables neighbourhood_group, neighborhood and room_type to factor.
airbnb_original <- airbnb_original %>%
mutate(neighbourhood_group_numeric=ifelse(neighbourhood_group == "Bronx", 1, ifelse(neighbourhood_group == "Brooklyn",2,ifelse(neighbourhood_group == "Manhattan",3,ifelse(neighbourhood_group == "Queens",4,5)))))
airbnb_original <- airbnb_original %>%
mutate(room_type_numeric=ifelse(room_type == "Entire home/apt",1,ifelse(room_type == "Private room",2,3)))
airbnb_original <- airbnb_original %>%
mutate(neighbourhood_numeric=as.numeric(neighbourhood))
#Convert last_review date to a date format.
airbnb_original <- airbnb_original %>%
mutate(last_review = ifelse(is.na(last_review),"01/01/2000",last_review))%>%
mutate(last_review = mdy(last_review))
#Replacing NA values for reviews_per_month with 0.
airbnb_original <- airbnb_original %>%
mutate(reviews_per_month = ifelse(is.na(reviews_per_month), 0, reviews_per_month))
Feature Engineering: Part 2
#Drop columns that will not be useful in the price prediction
#name: The name of the host cannot have any impact on the price prediction
#host_id: It is just an id assigned to a host. It does not have any impact on the price
#last_review: The last review column is being dropped as it may not have an impact on price.
#Location: It is the address of the place and since we are using latitude and longitude, I feel this is not needed to predict the price.
#View(airbnb_original)
airbnb <- airbnb_original %>%
select(-name, -host_id, -host_name, -last_review, -Location)
Feature Engineering Part 3
#summary of airbnb variables
summary(airbnb)
## id neighbourhood_group neighbourhood
## Min. : 2539 Bronx : 873 Williamsburg : 3146
## 1st Qu.: 9436041 Brooklyn :16084 Bedford-Stuyvesant: 2987
## Median :19637846 Manhattan :17329 Harlem : 2140
## Mean :18980697 Queens : 4533 Bushwick : 1980
## 3rd Qu.:29079859 Staten Island: 299 Hell's Kitchen : 1575
## Max. :36487245 Upper West Side : 1539
## (Other) :25751
## latitude longitude room_type price
## Min. :40.51 Min. :-74.24 Entire home/apt:20276 Min. : 0.0
## 1st Qu.:40.69 1st Qu.:-73.98 Private room :17912 1st Qu.: 60.0
## Median :40.72 Median :-73.96 Shared room : 930 Median : 99.0
## Mean :40.73 Mean :-73.95 Mean : 140.4
## 3rd Qu.:40.76 3rd Qu.:-73.94 3rd Qu.: 165.0
## Max. :40.91 Max. :-73.71 Max. :10000.0
##
## minimum_nights number_of_reviews reviews_per_month floor
## Min. : 1.000 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 1.000 1st Qu.: 1.00 1st Qu.: 0.040 1st Qu.: 1.000
## Median : 2.000 Median : 5.00 Median : 0.380 Median : 1.000
## Mean : 7.004 Mean : 23.46 Mean : 1.095 Mean : 1.582
## 3rd Qu.: 5.000 3rd Qu.: 24.00 3rd Qu.: 1.600 3rd Qu.: 1.000
## Max. :1250.000 Max. :629.00 Max. :58.500 Max. :20.000
##
## noise(dB) neighbourhood_group_numeric room_type_numeric
## Min. :22.96 Min. :1.000 Min. :1.000
## 1st Qu.:56.05 1st Qu.:2.000 1st Qu.:1.000
## Median :62.48 Median :3.000 Median :1.000
## Mean :62.70 Mean :2.675 Mean :1.505
## 3rd Qu.:69.06 3rd Qu.:3.000 3rd Qu.:2.000
## Max. :98.06 Max. :5.000 Max. :3.000
##
## neighbourhood_numeric
## Min. : 1.0
## 1st Qu.: 52.0
## Median : 95.0
## Mean :107.8
## 3rd Qu.:179.0
## Max. :220.0
##
#Checking for and removing outliers
airbnb %>%
select_if(is.numeric) %>%
select(-id) %>%
gather(yval, val, -price) %>%
ggplot(aes(price, val)) +
geom_point() +
facet_grid(yval~.)

airbnb <- subset(airbnb, airbnb$price > 0 & airbnb$price < 7500)
airbnb %>%
ggplot(aes(price,number_of_reviews))+
geom_point()

airbnb %>%
ggplot(aes(price,reviews_per_month))+
geom_point()

airbnb <- subset(airbnb, airbnb$reviews_per_month < 30)
airbnb %>%
ggplot(aes(price,minimum_nights))+
geom_point()

airbnb <- subset(airbnb, airbnb$minimum_nights < 600)
Feature Engineering Part 4
#Create Plots to check the relationship between Price and the other variables and apply transformations where necessary
airbnb %>%
ggplot(aes(x=price)) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

airbnb %>%
ggplot(aes(x=log(price))) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

airbnb%>%
ggplot(aes(x=minimum_nights)) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

model1 <- lm(data=airbnb, price ~ minimum_nights )
get_regression_table(model1)
## # A tibble: 2 × 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 135. 1.03 131. 0 133. 137.
## 2 minimum_nights 0.545 0.054 10.1 0 0.439 0.651
points<-get_regression_points(model1)
ggplot(data=points, aes(x=residual)) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=points, aes(x=price_hat, y=residual)) + geom_point()

#Transforming the price and minimum nights to log form as they are right skewed.
airbnb<-airbnb %>%
mutate(lnprice=log(price), lnmin_nights=log(minimum_nights))
ggplot(data=airbnb, aes(x=lnprice)) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=airbnb, aes(x=lnmin_nights)) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

head(airbnb)
## # A tibble: 6 × 17
## id neighbourhood_group neighbourhood latitude longitude room_type price
## <dbl> <fct> <fct> <dbl> <dbl> <fct> <dbl>
## 1 2539 Brooklyn Kensington 40.6 -74.0 Private room 149
## 2 2595 Manhattan Midtown 40.8 -74.0 Entire home/… 225
## 3 3647 Manhattan Harlem 40.8 -73.9 Private room 60
## 4 3831 Brooklyn Clinton Hill 40.7 -74.0 Entire home/… 45
## 5 5022 Manhattan East Harlem 40.8 -73.9 Entire home/… 80
## 6 5099 Manhattan Murray Hill 40.7 -74.0 Entire home/… 200
## # ℹ 10 more variables: minimum_nights <dbl>, number_of_reviews <dbl>,
## # reviews_per_month <dbl>, floor <dbl>, `noise(dB)` <dbl>,
## # neighbourhood_group_numeric <dbl>, room_type_numeric <dbl>,
## # neighbourhood_numeric <dbl>, lnprice <dbl>, lnmin_nights <dbl>
model1 <- lm(data=airbnb, lnprice ~ lnmin_nights)
get_regression_table(model1)
## # A tibble: 2 × 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 4.56 0.006 826. 0 4.55 4.57
## 2 lnmin_nights 0.044 0.004 12.5 0 0.037 0.051
points<-get_regression_points(model1)
ggplot(data=points, aes(x=residual)) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=points, aes(x=lnprice_hat, y=residual)) + geom_point()

#Create some tables and some boxplots for the factor variables
table(airbnb$neighbourhood_group)
##
## Bronx Brooklyn Manhattan Queens Staten Island
## 872 16074 17318 4532 299
table(airbnb$neighbourhood)
##
## Allerton Arden Heights
## 33 4
## Arrochar Arverne
## 15 68
## Astoria Bath Beach
## 717 16
## Battery Park City Bay Ridge
## 52 114
## Bay Terrace Bay Terrace, Staten Island
## 5 1
## Baychester Bayside
## 7 31
## Bayswater Bedford-Stuyvesant
## 12 2984
## Belle Harbor Bellerose
## 6 12
## Belmont Bensonhurst
## 21 61
## Bergen Beach Boerum Hill
## 6 147
## Borough Park Breezy Point
## 105 2
## Briarwood Brighton Beach
## 50 63
## Bronxdale Brooklyn Heights
## 13 126
## Brownsville Bull's Head
## 53 4
## Bushwick Cambria Heights
## 1978 23
## Canarsie Carroll Gardens
## 114 184
## Castle Hill Castleton Corners
## 8 4
## Chelsea Chinatown
## 884 294
## City Island Civic Center
## 14 40
## Claremont Village Clason Point
## 19 18
## Clifton Clinton Hill
## 13 475
## Co-op City Cobble Hill
## 1 77
## College Point Columbia St
## 16 34
## Concord Concourse
## 20 39
## Concourse Village Coney Island
## 28 13
## Corona Crown Heights
## 47 1253
## Cypress Hills Ditmars Steinway
## 103 245
## Dongan Hills Douglaston
## 6 8
## Downtown Brooklyn DUMBO
## 61 26
## Dyker Heights East Elmhurst
## 10 153
## East Flatbush East Harlem
## 401 912
## East Morrisania East New York
## 8 166
## East Village Eastchester
## 1506 12
## Edenwald Edgemere
## 8 9
## Elmhurst Eltingville
## 192 3
## Emerson Hill Far Rockaway
## 1 25
## Fieldston Financial District
## 10 594
## Flatbush Flatiron District
## 500 63
## Flatlands Flushing
## 63 320
## Fordham Forest Hills
## 49 115
## Fort Greene Fort Hamilton
## 377 40
## Fort Wadsworth Fresh Meadows
## 1 24
## Glendale Gowanus
## 43 194
## Gramercy Graniteville
## 291 3
## Grant City Gravesend
## 5 55
## Great Kills Greenpoint
## 9 870
## Greenwich Village Grymes Hill
## 308 5
## Harlem Hell's Kitchen
## 2140 1575
## Highbridge Hollis
## 23 10
## Holliswood Howard Beach
## 3 16
## Howland Hook Huguenot
## 2 2
## Hunts Point Inwood
## 14 204
## Jackson Heights Jamaica
## 157 185
## Jamaica Estates Jamaica Hills
## 14 7
## Kensington Kew Gardens
## 147 25
## Kew Gardens Hills Kingsbridge
## 22 57
## Kips Bay Laurelton
## 371 12
## Lighthouse Hill Little Italy
## 1 95
## Little Neck Long Island City
## 3 429
## Longwood Lower East Side
## 52 736
## Manhattan Beach Marble Hill
## 8 10
## Mariners Harbor Maspeth
## 7 95
## Melrose Middle Village
## 9 20
## Midland Beach Midtown
## 6 1225
## Midwood Mill Basin
## 90 3
## Morningside Heights Morris Heights
## 273 14
## Morris Park Morrisania
## 10 12
## Mott Haven Mount Eden
## 41 5
## Mount Hope Murray Hill
## 15 394
## Navy Yard Neponsit
## 12 1
## New Brighton New Dorp
## 3 1
## New Dorp Beach New Springville
## 5 7
## NoHo Nolita
## 60 187
## North Riverdale Norwood
## 7 27
## Oakwood Olinville
## 4 3
## Ozone Park Park Slope
## 53 420
## Parkchester Pelham Bay
## 28 13
## Pelham Gardens Port Morris
## 25 42
## Port Richmond Prince's Bay
## 6 3
## Prospect-Lefferts Gardens Prospect Heights
## 432 289
## Queens Village Randall Manor
## 52 17
## Red Hook Rego Park
## 65 90
## Richmond Hill Richmondtown
## 71 1
## Ridgewood Riverdale
## 338 8
## Rockaway Beach Roosevelt Island
## 47 68
## Rosebank Rosedale
## 6 48
## Rossville Schuylerville
## 1 10
## Sea Gate Sheepshead Bay
## 7 124
## Shore Acres Silver Lake
## 5 1
## SoHo Soundview
## 290 13
## South Beach South Ozone Park
## 5 36
## South Slope Springfield Gardens
## 229 68
## Spuyten Duyvil St. Albans
## 4 59
## St. George Stapleton
## 40 21
## Stuyvesant Town Sunnyside
## 34 290
## Sunset Park Theater District
## 288 227
## Throgs Neck Todt Hill
## 21 3
## Tompkinsville Tottenville
## 36 4
## Tremont Tribeca
## 8 139
## Two Bridges Unionport
## 57 6
## University Heights Upper East Side
## 19 1434
## Upper West Side Van Nest
## 1538 8
## Vinegar Hill Wakefield
## 27 39
## Washington Heights West Brighton
## 718 15
## West Farms West Village
## 1 599
## Westchester Square Westerleigh
## 9 2
## Whitestone Williamsbridge
## 11 33
## Williamsburg Willowbrook
## 3144 1
## Windsor Terrace Woodhaven
## 120 72
## Woodlawn Woodside
## 8 175
table(airbnb$room_type)
##
## Entire home/apt Private room Shared room
## 20266 17901 928
ggplot(data=airbnb, aes(y=lnprice,fill= neighbourhood_group)) + geom_boxplot()

ggplot(data=airbnb, aes(y=lnprice,fill=room_type)) + geom_boxplot()

ggplot(data=airbnb, aes(y=lnprice,fill=neighbourhood)) + geom_boxplot()

#Creating scatterplots for the numeric variables
ggplot(data=airbnb, aes(x=lnmin_nights, y=lnprice)) + geom_point()

ggplot(data=airbnb, aes(x=longitude, y=lnprice)) + geom_point()

ggplot(data=airbnb, aes(x=latitude, y=lnprice)) + geom_point()

ggplot(data=airbnb, aes(x=floor, y=lnprice)) + geom_point()

ggplot(data=airbnb, aes(x=`noise(dB)`, y=lnprice)) + geom_point()

ggplot(data=airbnb, aes(x=number_of_reviews, y=lnprice)) + geom_point()

ggplot(data=airbnb, aes(x=reviews_per_month, y=lnprice)) + geom_point()

#Running the lnprice vs number_of_reviews resulted in a cone shaped graph and so plotting against log(number_of_reviews)
ggplot(data=airbnb, aes(x=log(number_of_reviews), y=lnprice)) + geom_point()

#Running the lnprice vs reviews_per_month resulted in a cone shaped graph and so plotting against log(reviews_per_month)
ggplot(data=airbnb, aes(x=log(reviews_per_month), y=lnprice)) + geom_point()

Feature Engineering Part 5
#Create a numeric dataframe for models that require numeric only or perform better with numeric variables
airbnb_numeric <- airbnb %>%
select(-neighbourhood_group,-neighbourhood,-room_type)
Checking to see how a model works with and without the log
transformed variables
model0<-lm(price~neighbourhood_group + neighbourhood+ latitude + longitude + room_type + number_of_reviews + reviews_per_month + floor + `noise(dB)` + minimum_nights,airbnb)
get_regression_summaries(model0)
## # A tibble: 1 × 9
## r_squared adj_r_squared mse rmse sigma statistic p_value df nobs
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.744 0.742 9231. 96.1 96.4 495. 0 228 39095
get_regression_table(model0)
## # A tibble: 233 × 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept -3.82e4 6318. -6.05 0 -50588. -2.58e4
## 2 neighbourhood_group: … 2.24e3 28.7 78.1 0 2188. 2.30e3
## 3 neighbourhood_group: … 3.35e3 34.3 97.8 0 3286. 3.42e3
## 4 neighbourhood_group: … 1.49e3 32.6 45.7 0 1426. 1.55e3
## 5 neighbourhood_group: … 3.95e3 108. 36.7 0 3735. 4.16e3
## 6 neighbourhood: Arden … -2.73e2 108. -2.53 0.011 -485. -6.17e1
## 7 neighbourhood: Arroch… -1.39e2 99.6 -1.39 0.164 -334. 5.66e1
## 8 neighbourhood: Arverne 1.17e2 21.0 5.57 0 75.7 1.58e2
## 9 neighbourhood: Astoria 1.04e0 8.38 0.124 0.901 -15.4 1.75e1
## 10 neighbourhood: Bath B… -4.19e1 26.1 -1.61 0.108 -93.1 9.24e0
## # ℹ 223 more rows
points<-get_regression_points(model0)
ggplot(data=points, aes(x=residual)) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=points, aes(x=price_hat, y=residual)) + geom_point()

points %>%
ggplot(aes(sample = residual)) +
stat_qq() +
stat_qq_line()

model1 <- lm(data=airbnb, lnprice ~ neighbourhood_group +neighbourhood+ latitude + longitude + room_type + number_of_reviews + reviews_per_month + floor + `noise(dB)` + lnmin_nights )
get_regression_table(model1)
## # A tibble: 233 × 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept -205. 30.8 -6.68 0 -266. -145.
## 2 neighbourhood_group: … 3.62 0.14 25.8 0 3.34 3.89
## 3 neighbourhood_group: … 7.87 0.167 47.1 0 7.54 8.20
## 4 neighbourhood_group: … 5.90 0.159 37.2 0 5.59 6.22
## 5 neighbourhood_group: … 10.1 0.524 19.3 0 9.08 11.1
## 6 neighbourhood: Arden … -1.71 0.525 -3.25 0.001 -2.74 -0.677
## 7 neighbourhood: Arroch… -1.01 0.485 -2.08 0.038 -1.96 -0.058
## 8 neighbourhood: Arverne 0.505 0.102 4.93 0 0.304 0.705
## 9 neighbourhood: Astoria 0.075 0.041 1.84 0.066 -0.005 0.155
## 10 neighbourhood: Bath B… -0.413 0.127 -3.25 0.001 -0.662 -0.163
## # ℹ 223 more rows
get_regression_summaries(model1)
## # A tibble: 1 × 9
## r_squared adj_r_squared mse rmse sigma statistic p_value df nobs
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.611 0.609 0.219 0.468 0.469 268. 0 228 39095
points<-get_regression_points(model1)
points
## # A tibble: 39,095 × 14
## ID lnprice neighbourhood_group neighbourhood latitude longitude room_type
## <int> <dbl> <fct> <fct> <dbl> <dbl> <fct>
## 1 1 5.00 Brooklyn Kensington 40.6 -74.0 Private …
## 2 2 5.42 Manhattan Midtown 40.8 -74.0 Entire h…
## 3 3 4.09 Manhattan Harlem 40.8 -73.9 Private …
## 4 4 3.81 Brooklyn Clinton Hill 40.7 -74.0 Entire h…
## 5 5 4.38 Manhattan East Harlem 40.8 -73.9 Entire h…
## 6 6 5.30 Manhattan Murray Hill 40.7 -74.0 Entire h…
## 7 7 4.09 Brooklyn Bedford-Stuyv… 40.7 -74.0 Private …
## 8 8 3.47 Manhattan Upper West Si… 40.8 -74.0 Private …
## 9 9 5.01 Manhattan Chinatown 40.7 -74.0 Entire h…
## 10 10 3.99 Manhattan Upper West Si… 40.8 -74.0 Entire h…
## # ℹ 39,085 more rows
## # ℹ 7 more variables: number_of_reviews <dbl>, reviews_per_month <dbl>,
## # floor <dbl>, `noise(dB)` <dbl>, lnmin_nights <dbl>, lnprice_hat <dbl>,
## # residual <dbl>
ggplot(data=points, aes(x=residual)) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=points, aes(x=lnprice_hat, y=residual)) + geom_point()

points %>%
ggplot(aes(sample = residual)) +
stat_qq() +
stat_qq_line()

Visualize Correlation Matrix
glimpse(airbnb_numeric)
## Rows: 39,095
## Columns: 14
## $ id <dbl> 2539, 2595, 3647, 3831, 5022, 5099, 5121, …
## $ latitude <dbl> 40.64749, 40.75362, 40.80902, 40.68514, 40…
## $ longitude <dbl> -73.97237, -73.98377, -73.94190, -73.95976…
## $ price <dbl> 149, 225, 60, 45, 80, 200, 60, 32, 150, 54…
## $ minimum_nights <dbl> 1, 1, 3, 1, 10, 3, 45, 2, 1, 5, 2, 90, 2, …
## $ number_of_reviews <dbl> 9, 45, 0, 270, 9, 74, 49, 118, 160, 53, 18…
## $ reviews_per_month <dbl> 0.21, 0.38, 0.00, 4.64, 0.10, 0.59, 0.40, …
## $ floor <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `noise(dB)` <dbl> 69.05646, 56.05428, 56.05428, 69.05646, 56…
## $ neighbourhood_group_numeric <dbl> 2, 3, 3, 2, 3, 3, 2, 3, 3, 3, 3, 3, 2, 2, …
## $ room_type_numeric <dbl> 2, 1, 2, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, …
## $ neighbourhood_numeric <dbl> 109, 128, 95, 42, 62, 138, 14, 203, 36, 20…
## $ lnprice <dbl> 5.003946, 5.416100, 4.094345, 3.806662, 4.…
## $ lnmin_nights <dbl> 0.0000000, 0.0000000, 1.0986123, 0.0000000…
corrplot(cor(airbnb_numeric[,-1]))

Modeling
Split the dataset into test and train.
#make this example reproducible
set.seed(1)
#use 70% of dataset as training set and 30% as test set
sample <- sample(c(TRUE, FALSE), nrow(airbnb), replace=TRUE, prob=c(0.7,0.3))
airbnb_train <- airbnb[sample, ]
airbnb_test <- airbnb[!sample, ]
#The following R code sets all observations in our test data set to NA that contain the additional level that didn’t exist in our train data:
airbnb_test_new <- airbnb_test # Duplicate test data set
airbnb_test_new$neighbourhood[which(!(airbnb_test_new$neighbourhood %in% unique(airbnb_train$neighbourhood)))] <- NA # Replace new levels by NA
airbnb_test_new
## # A tibble: 11,708 × 17
## id neighbourhood_group neighbourhood latitude longitude room_type price
## <dbl> <fct> <fct> <dbl> <dbl> <fct> <dbl>
## 1 3831 Brooklyn Clinton Hill 40.7 -74.0 Entire h… 45
## 2 5099 Manhattan Murray Hill 40.7 -74.0 Entire h… 200
## 3 5121 Brooklyn Bedford-Stuyves… 40.7 -74.0 Private … 60
## 4 7322 Manhattan Chelsea 40.7 -74.0 Private … 140
## 5 7750 Manhattan East Harlem 40.8 -73.9 Entire h… 190
## 6 7801 Brooklyn Williamsburg 40.7 -74.0 Entire h… 299
## 7 8025 Brooklyn Park Slope 40.7 -74.0 Private … 80
## 8 8110 Brooklyn Park Slope 40.7 -74.0 Private … 110
## 9 11708 Brooklyn Bushwick 40.7 -73.9 Entire h… 43
## 10 12318 Manhattan Upper West Side 40.8 -74.0 Private … 135
## # ℹ 11,698 more rows
## # ℹ 10 more variables: minimum_nights <dbl>, number_of_reviews <dbl>,
## # reviews_per_month <dbl>, floor <dbl>, `noise(dB)` <dbl>,
## # neighbourhood_group_numeric <dbl>, room_type_numeric <dbl>,
## # neighbourhood_numeric <dbl>, lnprice <dbl>, lnmin_nights <dbl>
#use 70% of dataset as training set and 30% as test set
sample_numeric <- sample(c(TRUE, FALSE), nrow(airbnb_numeric), replace=TRUE, prob=c(0.7,0.3))
airbnb_train_numeric <- airbnb_numeric[sample_numeric, ]
airbnb_test_numeric <- airbnb_numeric[!sample_numeric, ]
#Get the train and test ID
train_id <- data.frame(ID = airbnb_train_numeric$id)
test_id <- data.frame(ID = airbnb_test_numeric$id)
#Dropping the Id columns
airbnb_train_numeric <- airbnb_train_numeric[,-1]
airbnb_test_numeric <- airbnb_test_numeric[,-1]
#Creating X train and test datasets and Y train and test datasets for the random forest model
X_train <- airbnb_train_numeric[, -c(3,12,13)]#Drop the dependent variables which are price and lnprice and create the X_train dataframe.
Y_train <- airbnb_train_numeric[, 3]#Select the price and create the Y_train dataframe.
X_test <- airbnb_test_numeric[, -c(3,12,13)]#Drop the dependent variables which are price and lnprice and create the X_test dataframe.
Y_test <- airbnb_test_numeric[, 3]#Select the price and create the Y_test dataframe.
#The following steps are specifically for the xgboost model
#Change target variable as a numeric vector and the rest of train dataset into matrix form.
train_mat<-as.matrix(X_train)
mode(train_mat)<-'double' #coerce categorical variables to change to numeric.
#Change target variable as a numeric vector and the rest of test dataset into matrix form.
test_mat<-as.matrix(X_test)
mode(test_mat)<-'double' #coerce categorical variables to change to numeric.
Model 1: OLS Linear Regresssion
model.ols<-lm(price ~ neighbourhood_group + neighbourhood+latitude + longitude + room_type + number_of_reviews + floor + `noise(dB)` + minimum_nights + room_type*neighbourhood_group+ floor*neighbourhood_group + floor*room_type+ neighbourhood*floor+ room_type*neighbourhood+ neighbourhood_group*neighbourhood,airbnb_train)
get_regression_summaries(model.ols)
## # A tibble: 1 × 9
## r_squared adj_r_squared mse rmse sigma statistic p_value df nobs
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.861 0.858 5107. 71.5 72.2 297. 0 559 27387
get_regression_table(model.ols)
## # A tibble: 1,739 × 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept -29933. 5768. -5.19 0 -41238. -1.86e4
## 2 neighbourhood_group: … -3654. 512. -7.14 0 -4657. -2.65e3
## 3 neighbourhood_group: … -987. 512. -1.93 0.054 -1991. 1.68e1
## 4 neighbourhood_group: … 15251. 635. 24 0 14005. 1.65e4
## 5 neighbourhood_group: … -2831. 518. -5.47 0 -3846. -1.82e3
## 6 neighbourhood: Arden … -49.0 83.6 -0.586 0.558 -213. 1.15e2
## 7 neighbourhood: Arroch… 357. 83.5 4.27 0 193. 5.20e2
## 8 neighbourhood: Arverne 149. 24.2 6.18 0 102. 1.97e2
## 9 neighbourhood: Astoria -25.3 15.1 -1.67 0.095 -54.9 4.36e0
## 10 neighbourhood: Bath B… -58.8 37.5 -1.57 0.117 -132. 1.47e1
## # ℹ 1,729 more rows
points<-get_regression_points(model.ols)
ggplot(data=points, aes(x=residual)) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=points, aes(x=price_hat, y=residual)) + geom_point()

points %>%
ggplot(aes(sample = residual)) +
stat_qq() +
stat_qq_line()

airbnb.predictions <- predict(model.ols,airbnb_test_new)
## Warning in predict.lm(model.ols, airbnb_test_new): prediction from a
## rank-deficient fit may be misleading
#airbnb.predictions
#result_test <- data.frame(ID = airbnb_test_new$id,
#price = exp(airbnb_test_new$lnprice),
#predictions = exp(airbnb.predictions))
result_test <- data.frame(ID = airbnb_test_new$id,
price = airbnb_test_new$price,
predictions = airbnb.predictions)
write.csv(result_test,file = "C:/Users/anish/Documents/Data Science Capstone/predicitions_airbnb_ols.csv")
Model 2 AIC
null <- lm(price~1, data = airbnb_train)
full <- lm(price ~ neighbourhood_group +neighbourhood+ latitude + longitude + room_type + number_of_reviews + reviews_per_month + floor + `noise(dB)` + minimum_nights, data = airbnb_train)
step(null, scope =list(lower=null, upper= full), direction = "both")
## Start: AIC=287838.9
## price ~ 1
##
## Df Sum of Sq RSS AIC
## + room_type 2 87348345 917202210 285352
## + neighbourhood 214 95950592 908599963 285517
## + neighbourhood_group 4 30276617 974273938 287009
## + longitude 1 26455034 978095521 287110
## + `noise(dB)` 1 7122022 997428533 287646
## + floor 1 5633304 998917251 287687
## + minimum_nights 1 2948164 1001602392 287760
## + number_of_reviews 1 2547252 1002003303 287771
## + reviews_per_month 1 2455346 1002095209 287774
## + latitude 1 662152 1003888403 287823
## <none> 1004550555 287839
##
## Step: AIC=285351.6
## price ~ room_type
##
## Df Sum of Sq RSS AIC
## + neighbourhood 214 60444170 856758040 283913
## + neighbourhood_group 4 15834239 901367971 284883
## + floor 1 11864840 905337370 284997
## + longitude 1 11500745 905701465 285008
## + `noise(dB)` 1 2368942 914833268 285283
## + number_of_reviews 1 2259484 914942726 285286
## + reviews_per_month 1 1625635 915576576 285305
## + minimum_nights 1 937386 916264824 285326
## + latitude 1 785425 916416786 285330
## <none> 917202210 285352
## - room_type 2 87348345 1004550555 287839
##
## Step: AIC=283912.5
## price ~ room_type + neighbourhood
##
## Df Sum of Sq RSS AIC
## + floor 1 589207446 267550594 252040
## + `noise(dB)` 1 495617717 361140323 260255
## + number_of_reviews 1 1152356 855605684 283878
## + reviews_per_month 1 675817 856082223 283893
## + longitude 1 250115 856507925 283907
## + minimum_nights 1 185051 856572989 283909
## <none> 856758040 283913
## + latitude 1 17326 856740714 283914
## - neighbourhood 214 60444170 917202210 285352
## - room_type 2 51841923 908599963 285517
##
## Step: AIC=252040.2
## price ~ room_type + neighbourhood + floor
##
## Df Sum of Sq RSS AIC
## + `noise(dB)` 1 20039722 247510872 249910
## + number_of_reviews 1 324210 267226383 252009
## + longitude 1 239666 267310927 252018
## + reviews_per_month 1 80382 267470212 252034
## <none> 267550594 252040
## + minimum_nights 1 10749 267539845 252041
## + latitude 1 9684 267540909 252041
## - room_type 2 39700752 307251345 255825
## - floor 1 589207446 856758040 283913
## - neighbourhood 214 637786776 905337370 284997
##
## Step: AIC=249910
## price ~ room_type + neighbourhood + floor + `noise(dB)`
##
## Df Sum of Sq RSS AIC
## + number_of_reviews 1 230697 247280175 249887
## + longitude 1 143278 247367593 249896
## + reviews_per_month 1 79698 247431174 249903
## <none> 247510872 249910
## + latitude 1 5773 247505099 249911
## + minimum_nights 1 3322 247507549 249912
## - `noise(dB)` 1 20039722 267550594 252040
## - room_type 2 36315517 283826389 253656
## - floor 1 113629451 361140323 260255
## - neighbourhood 214 653789342 901300214 284877
##
## Step: AIC=249886.5
## price ~ room_type + neighbourhood + floor + `noise(dB)` + number_of_reviews
##
## Df Sum of Sq RSS AIC
## + longitude 1 136744 247143431 249873
## <none> 247280175 249887
## + minimum_nights 1 9008 247271168 249888
## + latitude 1 4170 247276006 249888
## + reviews_per_month 1 29 247280146 249889
## - number_of_reviews 1 230697 247510872 249910
## - `noise(dB)` 1 19946208 267226383 252009
## - room_type 2 36377007 283657183 253641
## - floor 1 113667551 360947727 260243
## - neighbourhood 214 651486509 898766684 284801
##
## Step: AIC=249873.4
## price ~ room_type + neighbourhood + floor + `noise(dB)` + number_of_reviews +
## longitude
##
## Df Sum of Sq RSS AIC
## <none> 247143431 249873
## + minimum_nights 1 9285 247134146 249874
## + latitude 1 7195 247136236 249875
## + reviews_per_month 1 354 247143077 249875
## - longitude 1 136744 247280175 249887
## - number_of_reviews 1 224163 247367593 249896
## - `noise(dB)` 1 19853568 266996999 251988
## - room_type 2 36163617 283307048 253609
## - floor 1 113778990 360922421 260243
## - neighbourhood 214 602502991 849646422 283264
##
## Call:
## lm(formula = price ~ room_type + neighbourhood + floor + `noise(dB)` +
## number_of_reviews + longitude, data = airbnb_train)
##
## Coefficients:
## (Intercept)
## -3.092e+04
## room_typePrivate room
## -7.680e+01
## room_typeShared room
## -8.436e+01
## neighbourhoodArden Heights
## 3.483e+03
## neighbourhoodArrochar
## 3.635e+03
## neighbourhoodArverne
## 1.405e+03
## neighbourhoodAstoria
## 1.292e+03
## neighbourhoodBath Beach
## 2.231e+03
## neighbourhoodBattery Park City
## 3.335e+03
## neighbourhoodBay Ridge
## 2.267e+03
## neighbourhoodBay Terrace
## 1.368e+03
## neighbourhoodBay Terrace, Staten Island
## 3.477e+03
## neighbourhoodBaychester
## -3.046e+01
## neighbourhoodBayside
## 1.338e+03
## neighbourhoodBayswater
## 1.320e+03
## neighbourhoodBedford-Stuyvesant
## 2.264e+03
## neighbourhoodBelle Harbor
## 1.405e+03
## neighbourhoodBellerose
## 1.355e+03
## neighbourhoodBelmont
## 3.372e+01
## neighbourhoodBensonhurst
## 2.229e+03
## neighbourhoodBergen Beach
## 2.241e+03
## neighbourhoodBoerum Hill
## 2.296e+03
## neighbourhoodBorough Park
## 2.228e+03
## neighbourhoodBreezy Point
## 1.427e+03
## neighbourhoodBriarwood
## 1.331e+03
## neighbourhoodBrighton Beach
## 2.285e+03
## neighbourhoodBronxdale
## -1.888e+01
## neighbourhoodBrooklyn Heights
## 2.316e+03
## neighbourhoodBrownsville
## 2.268e+03
## neighbourhoodBull's Head
## 3.525e+03
## neighbourhoodBushwick
## 2.252e+03
## neighbourhoodCambria Heights
## 1.335e+03
## neighbourhoodCanarsie
## 2.284e+03
## neighbourhoodCarroll Gardens
## 2.296e+03
## neighbourhoodCastle Hill
## -3.951e+00
## neighbourhoodCastleton Corners
## 3.626e+03
## neighbourhoodChelsea
## 3.359e+03
## neighbourhoodChinatown
## 3.307e+03
## neighbourhoodCity Island
## 1.923e+02
## neighbourhoodCivic Center
## 3.307e+03
## neighbourhoodClaremont Village
## 3.620e+00
## neighbourhoodClason Point
## 3.523e+01
## neighbourhoodClifton
## 3.584e+03
## neighbourhoodClinton Hill
## 2.252e+03
## neighbourhoodCo-op City
## 5.200e+01
## neighbourhoodCobble Hill
## 2.317e+03
## neighbourhoodCollege Point
## 1.274e+03
## neighbourhoodColumbia St
## 2.257e+03
## neighbourhoodConcord
## 3.557e+03
## neighbourhoodConcourse
## -8.437e+00
## neighbourhoodConcourse Village
## -3.783e+00
## neighbourhoodConey Island
## 2.195e+03
## neighbourhoodCorona
## 1.277e+03
## neighbourhoodCrown Heights
## 2.254e+03
## neighbourhoodCypress Hills
## 2.278e+03
## neighbourhoodDitmars Steinway
## 1.290e+03
## neighbourhoodDongan Hills
## 3.553e+03
## neighbourhoodDouglaston
## 1.325e+03
## neighbourhoodDowntown Brooklyn
## 2.280e+03
## neighbourhoodDUMBO
## 2.317e+03
## neighbourhoodDyker Heights
## 2.242e+03
## neighbourhoodEast Elmhurst
## 1.300e+03
## neighbourhoodEast Flatbush
## 2.245e+03
## neighbourhoodEast Harlem
## 3.318e+03
## neighbourhoodEast Morrisania
## 3.698e+01
## neighbourhoodEast New York
## 2.270e+03
## neighbourhoodEast Village
## 3.336e+03
## neighbourhoodEastchester
## 7.262e+01
## neighbourhoodEdenwald
## 2.920e+01
## neighbourhoodEdgemere
## 1.316e+03
## neighbourhoodElmhurst
## 1.291e+03
## neighbourhoodEltingville
## 3.615e+03
## neighbourhoodEmerson Hill
## 3.480e+03
## neighbourhoodFar Rockaway
## 1.439e+03
## neighbourhoodFieldston
## -3.123e+01
## neighbourhoodFinancial District
## 3.351e+03
## neighbourhoodFlatbush
## 2.215e+03
## neighbourhoodFlatiron District
## 3.383e+03
## neighbourhoodFlatlands
## 2.273e+03
## neighbourhoodFlushing
## 1.333e+03
## neighbourhoodFordham
## 4.195e+00
## neighbourhoodForest Hills
## 1.354e+03
## neighbourhoodFort Greene
## 2.279e+03
## neighbourhoodFort Hamilton
## 2.231e+03
## neighbourhoodFort Wadsworth
## 4.564e+03
## neighbourhoodFresh Meadows
## 1.329e+03
## neighbourhoodGlendale
## 1.285e+03
## neighbourhoodGowanus
## 2.291e+03
## neighbourhoodGramercy
## 3.347e+03
## neighbourhoodGraniteville
## 3.484e+03
## neighbourhoodGrant City
## 3.515e+03
## neighbourhoodGravesend
## 2.244e+03
## neighbourhoodGreat Kills
## 3.585e+03
## neighbourhoodGreenpoint
## 2.287e+03
## neighbourhoodGreenwich Village
## 3.359e+03
## neighbourhoodGrymes Hill
## 3.606e+03
## neighbourhoodHarlem
## 3.280e+03
## neighbourhoodHell's Kitchen
## 3.349e+03
## neighbourhoodHighbridge
## -4.654e+00
## neighbourhoodHollis
## 1.330e+03
## neighbourhoodHolliswood
## 1.417e+03
## neighbourhoodHoward Beach
## 1.312e+03
## neighbourhoodHowland Hook
## 3.512e+03
## neighbourhoodHuguenot
## 3.511e+03
## neighbourhoodHunts Point
## -1.758e+00
## neighbourhoodInwood
## 3.299e+03
## neighbourhoodJackson Heights
## 1.287e+03
## neighbourhoodJamaica
## 1.328e+03
## neighbourhoodJamaica Estates
## 1.283e+03
## neighbourhoodJamaica Hills
## 1.364e+03
## neighbourhoodKensington
## 2.249e+03
## neighbourhoodKew Gardens
## 1.312e+03
## neighbourhoodKew Gardens Hills
## 1.329e+03
## neighbourhoodKingsbridge
## 9.600e+00
## neighbourhoodKips Bay
## 3.337e+03
## neighbourhoodLaurelton
## 1.340e+03
## neighbourhoodLighthouse Hill
## 3.704e+03
## neighbourhoodLittle Italy
## 3.164e+03
## neighbourhoodLittle Neck
## 1.328e+03
## neighbourhoodLong Island City
## 1.304e+03
## neighbourhoodLongwood
## 3.623e+01
## neighbourhoodLower East Side
## 3.329e+03
## neighbourhoodManhattan Beach
## 2.231e+03
## neighbourhoodMarble Hill
## 3.278e+03
## neighbourhoodMariners Harbor
## 3.536e+03
## neighbourhoodMaspeth
## 1.275e+03
## neighbourhoodMelrose
## -1.858e+01
## neighbourhoodMiddle Village
## 1.286e+03
## neighbourhoodMidland Beach
## 3.524e+03
## neighbourhoodMidtown
## 3.370e+03
## neighbourhoodMidwood
## 2.237e+03
## neighbourhoodMorningside Heights
## 3.240e+03
## neighbourhoodMorris Heights
## -1.027e+01
## neighbourhoodMorris Park
## 1.005e+01
## neighbourhoodMorrisania
## 5.067e+00
## neighbourhoodMott Haven
## 1.742e+01
## neighbourhoodMount Eden
## -9.029e+00
## neighbourhoodMount Hope
## -1.284e+01
## neighbourhoodMurray Hill
## 3.355e+03
## neighbourhoodNavy Yard
## 2.266e+03
## neighbourhoodNeponsit
## 1.588e+03
## neighbourhoodNew Dorp
## 3.485e+03
## neighbourhoodNew Dorp Beach
## 3.547e+03
## neighbourhoodNew Springville
## 3.526e+03
## neighbourhoodNoHo
## 3.386e+03
## neighbourhoodNolita
## 3.352e+03
## neighbourhoodNorth Riverdale
## -1.818e+01
## neighbourhoodNorwood
## 6.050e+00
## neighbourhoodOakwood
## 3.545e+03
## neighbourhoodOlinville
## 3.288e+00
## neighbourhoodOzone Park
## 1.285e+03
## neighbourhoodPark Slope
## 2.304e+03
## neighbourhoodParkchester
## 1.312e+01
## neighbourhoodPelham Bay
## 2.234e+01
## neighbourhoodPelham Gardens
## 4.715e+00
## neighbourhoodPort Morris
## 2.031e+00
## neighbourhoodPort Richmond
## 3.627e+03
## neighbourhoodPrince's Bay
## 4.033e+03
## neighbourhoodProspect-Lefferts Gardens
## 2.212e+03
## neighbourhoodProspect Heights
## 2.235e+03
## neighbourhoodQueens Village
## 1.319e+03
## neighbourhoodRandall Manor
## 4.003e+03
## neighbourhoodRed Hook
## 2.269e+03
## neighbourhoodRego Park
## 1.290e+03
## neighbourhoodRichmond Hill
## 1.312e+03
## neighbourhoodRidgewood
## 1.274e+03
## neighbourhoodRiverdale
## 3.306e+02
## neighbourhoodRockaway Beach
## 1.334e+03
## neighbourhoodRoosevelt Island
## 3.305e+03
## neighbourhoodRosebank
## 3.565e+03
## neighbourhoodRosedale
## 1.341e+03
## neighbourhoodRossville
## 3.475e+03
## neighbourhoodSchuylerville
## 2.269e+01
## neighbourhoodSea Gate
## 2.167e+03
## neighbourhoodSheepshead Bay
## 2.256e+03
## neighbourhoodShore Acres
## 3.522e+03
## neighbourhoodSilver Lake
## 3.523e+03
## neighbourhoodSoHo
## 3.303e+03
## neighbourhoodSoundview
## -9.186e+00
## neighbourhoodSouth Beach
## 3.631e+03
## neighbourhoodSouth Ozone Park
## 1.314e+03
## neighbourhoodSouth Slope
## 2.277e+03
## neighbourhoodSpringfield Gardens
## 1.330e+03
## neighbourhoodSpuyten Duyvil
## 9.920e+01
## neighbourhoodSt. Albans
## 1.343e+03
## neighbourhoodSt. George
## 3.629e+03
## neighbourhoodStapleton
## 3.613e+03
## neighbourhoodStuyvesant Town
## 3.329e+03
## neighbourhoodSunnyside
## 1.276e+03
## neighbourhoodSunset Park
## 2.254e+03
## neighbourhoodTheater District
## 3.367e+03
## neighbourhoodThrogs Neck
## 3.617e+01
## neighbourhoodTompkinsville
## 3.562e+03
## neighbourhoodTottenville
## 3.570e+03
## neighbourhoodTremont
## -5.817e+00
## neighbourhoodTribeca
## 3.237e+03
## neighbourhoodTwo Bridges
## 3.316e+03
## neighbourhoodUnionport
## 9.339e+01
## neighbourhoodUniversity Heights
## -7.675e-01
## neighbourhoodUpper East Side
## 3.333e+03
## neighbourhoodUpper West Side
## 3.292e+03
## neighbourhoodVan Nest
## 8.017e+01
## neighbourhoodVinegar Hill
## 2.297e+03
## neighbourhoodWakefield
## 3.239e+01
## neighbourhoodWashington Heights
## 3.296e+03
## neighbourhoodWest Brighton
## 3.585e+03
## neighbourhoodWest Farms
## 4.370e+01
## neighbourhoodWest Village
## 3.216e+03
## neighbourhoodWestchester Square
## 2.238e+02
## neighbourhoodWesterleigh
## 3.533e+03
## neighbourhoodWhitestone
## 1.357e+03
## neighbourhoodWilliamsbridge
## -1.538e+01
## neighbourhoodWilliamsburg
## 2.297e+03
## neighbourhoodWindsor Terrace
## 2.268e+03
## neighbourhoodWoodhaven
## 1.286e+03
## neighbourhoodWoodlawn
## -6.041e+00
## neighbourhoodWoodside
## 1.293e+03
## floor
## 3.781e+02
## `noise(dB)`
## 7.967e+01
## number_of_reviews
## -6.473e-02
## longitude
## -3.100e+02
airbnb_model_aic<-lm(formula = price ~room_type + neighbourhood + floor + `noise(dB)` +
number_of_reviews + longitude, data = airbnb_train)
get_regression_summaries(airbnb_model_aic)
## # A tibble: 1 × 9
## r_squared adj_r_squared mse rmse sigma statistic p_value df nobs
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.754 0.752 9024. 95.0 95.4 378. 0 220 27387
get_regression_table(airbnb_model_aic)
## # A tibble: 221 × 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept -30923. 5904. -5.24 0 -42496. -19351.
## 2 room_type: Private ro… -76.8 1.24 -61.9 0 -79.2 -74.4
## 3 room_type: Shared room -84.4 3.88 -21.8 0 -92.0 -76.8
## 4 neighbourhood: Arden … 3483. 73.3 47.5 0 3340. 3627.
## 5 neighbourhood: Arroch… 3635. 53.8 67.6 0 3529. 3740.
## 6 neighbourhood: Arverne 1405. 36.7 38.3 0 1334. 1477.
## 7 neighbourhood: Astoria 1292. 34.2 37.8 0 1225. 1360.
## 8 neighbourhood: Bath B… 2231. 36.1 61.7 0 2160. 2302.
## 9 neighbourhood: Batter… 3335. 40.0 83.5 0 3257. 3414.
## 10 neighbourhood: Bay Ri… 2267. 27.5 82.4 0 2213. 2321.
## # ℹ 211 more rows
points<-get_regression_points(airbnb_model_aic)
ggplot(data=points, aes(x=residual)) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=points, aes(x=price_hat, y=residual)) + geom_point()

points %>%
ggplot(aes(sample = residual)) +
stat_qq() +
stat_qq_line()

airbnb.predictions <- predict(airbnb_model_aic,airbnb_test_new)
#airbnb.predictions
Model 3 Random Forest
#Run the random forest model
randomforest_model <- randomForest(price~neighbourhood_group_numeric +neighbourhood_numeric+ latitude + longitude + room_type_numeric + minimum_nights + number_of_reviews + floor + minimum_nights,data = airbnb_train_numeric)
# View the forest results.
print(randomforest_model)
##
## Call:
## randomForest(formula = price ~ neighbourhood_group_numeric + neighbourhood_numeric + latitude + longitude + room_type_numeric + minimum_nights + number_of_reviews + floor + minimum_nights, data = airbnb_train_numeric)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 10852.54
## % Var explained: 69.15
# Importance of each predictor.
print(importance(randomforest_model,type = 2))
## IncNodePurity
## neighbourhood_group_numeric 27791329
## neighbourhood_numeric 31036416
## latitude 71539819
## longitude 96216934
## room_type_numeric 65684155
## minimum_nights 24853100
## number_of_reviews 20896469
## floor 456833767
#Predict the price based on X_train.
pred_train <- predict(randomforest_model, X_train)
#Create a dataframe with the train_id, the actual price, and the predicted price.
result_train <- data.frame(ID = train_id,
price = Y_train,
predictions = pred_train)
#Print the head of the dataframe result_train.
head(result_train)
## ID price predictions
## 1 2595 225 226.57832
## 2 3647 60 64.25744
## 3 5121 60 68.69139
## 4 5203 32 38.01279
## 5 5238 150 189.93120
## 6 5295 54 94.45100
#Predict the price based on X_test.
pred_test <- predict(randomforest_model, airbnb_test_numeric)
#Create a dataframe with the test_id, the actual price, and the predicted price.
result_test <- data.frame(ID = test_id,
price = Y_test,
predictions = pred_test)
write.csv(result_test,file = "C:/Users/anish/Documents/Data Science Capstone/predicitions_airbnb_rf.csv")
#Print the head of the dataframe result_train.
head(result_test)
## ID price predictions
## 1 2539 149 63.7401
## 2 3831 45 108.9368
## 3 5022 80 135.0129
## 4 5099 200 216.1896
## 5 6848 140 179.6969
## 6 9657 180 203.1002
print(paste0('Test RMSE: ' , rmse(result_test$price,
result_test$predictions))) #testRMSE
## [1] "Test RMSE: 107.834320503987"
print(paste0('Train RMSE: ' , rmse(result_train$price,
result_train$predictions)))
## [1] "Train RMSE: 70.957961774177"
print(paste0('Test R2: ' ,
caret::postResample(result_test$predictions , result_test$price)['Rsquared']))
## [1] "Test R2: 0.712074194315585"
print(paste0('Train R2: ' ,
caret::postResample(result_train$predictions , result_train$price)['Rsquared']))
## [1] "Train R2: 0.88210725279567"
ggplot(result_test, aes(x = predictions, y = price)) + geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

melt_pred_test <- melt(head(result_test,20), id.vars = "ID")
ggplot(melt_pred_test,
aes(y = value,
x = ID,
colour = variable)) +
geom_point() +
geom_line() +
ggtitle("Actual vs Predicted for Test Data")

Model 4: Extreme Gradient Boosting(XG Boost)
#Preparing two matrices for xgb
dtrain <- xgb.DMatrix(data = train_mat, label = airbnb_train_numeric$price)
dtest <- xgb.DMatrix(data = test_mat, label = airbnb_test_numeric$price)
#Run xgboost
bst <- xgboost(data = dtrain, max.depth = 5, eta = 1, nround = 1000)
## [1] train-rmse:109.103426
## [2] train-rmse:93.567236
## [3] train-rmse:67.858387
## [4] train-rmse:65.699772
## [5] train-rmse:63.175161
## [6] train-rmse:62.016752
## [7] train-rmse:60.400425
## [8] train-rmse:58.063930
## [9] train-rmse:57.428335
## [10] train-rmse:56.690518
## [11] train-rmse:56.226733
## [12] train-rmse:54.540495
## [13] train-rmse:53.888769
## [14] train-rmse:53.458349
## [15] train-rmse:52.728138
## [16] train-rmse:52.584994
## [17] train-rmse:52.178480
## [18] train-rmse:51.855397
## [19] train-rmse:51.703919
## [20] train-rmse:50.191910
## [21] train-rmse:50.097788
## [22] train-rmse:49.412056
## [23] train-rmse:49.301143
## [24] train-rmse:48.787892
## [25] train-rmse:48.469397
## [26] train-rmse:48.410826
## [27] train-rmse:48.269030
## [28] train-rmse:48.025167
## [29] train-rmse:47.275728
## [30] train-rmse:47.000076
## [31] train-rmse:46.705723
## [32] train-rmse:46.561233
## [33] train-rmse:46.354260
## [34] train-rmse:46.295987
## [35] train-rmse:46.162112
## [36] train-rmse:45.878929
## [37] train-rmse:45.370975
## [38] train-rmse:45.326648
## [39] train-rmse:45.178028
## [40] train-rmse:45.126807
## [41] train-rmse:44.883055
## [42] train-rmse:44.766265
## [43] train-rmse:44.596967
## [44] train-rmse:44.474292
## [45] train-rmse:44.372807
## [46] train-rmse:44.178225
## [47] train-rmse:43.978082
## [48] train-rmse:43.939138
## [49] train-rmse:43.866732
## [50] train-rmse:43.676943
## [51] train-rmse:43.570725
## [52] train-rmse:43.483499
## [53] train-rmse:43.446704
## [54] train-rmse:43.328071
## [55] train-rmse:43.231973
## [56] train-rmse:43.081619
## [57] train-rmse:42.976482
## [58] train-rmse:42.872039
## [59] train-rmse:42.715904
## [60] train-rmse:42.615812
## [61] train-rmse:42.586853
## [62] train-rmse:42.448360
## [63] train-rmse:42.383976
## [64] train-rmse:42.349365
## [65] train-rmse:42.309590
## [66] train-rmse:42.238706
## [67] train-rmse:42.071162
## [68] train-rmse:41.972561
## [69] train-rmse:41.887567
## [70] train-rmse:41.703597
## [71] train-rmse:41.597813
## [72] train-rmse:41.571877
## [73] train-rmse:41.400530
## [74] train-rmse:41.274870
## [75] train-rmse:41.189086
## [76] train-rmse:41.133093
## [77] train-rmse:41.060703
## [78] train-rmse:40.992697
## [79] train-rmse:40.947368
## [80] train-rmse:40.859240
## [81] train-rmse:40.783072
## [82] train-rmse:40.708711
## [83] train-rmse:40.583975
## [84] train-rmse:40.507208
## [85] train-rmse:40.169745
## [86] train-rmse:40.100775
## [87] train-rmse:40.035992
## [88] train-rmse:39.926370
## [89] train-rmse:39.858435
## [90] train-rmse:39.849505
## [91] train-rmse:39.694535
## [92] train-rmse:39.580990
## [93] train-rmse:39.432467
## [94] train-rmse:39.367927
## [95] train-rmse:39.252741
## [96] train-rmse:39.150128
## [97] train-rmse:39.038151
## [98] train-rmse:38.977808
## [99] train-rmse:38.942666
## [100] train-rmse:38.867225
## [101] train-rmse:38.792868
## [102] train-rmse:38.729206
## [103] train-rmse:38.594417
## [104] train-rmse:38.508185
## [105] train-rmse:38.392898
## [106] train-rmse:38.336132
## [107] train-rmse:38.271055
## [108] train-rmse:38.172375
## [109] train-rmse:38.071634
## [110] train-rmse:38.002874
## [111] train-rmse:37.919194
## [112] train-rmse:37.895910
## [113] train-rmse:37.798487
## [114] train-rmse:37.780181
## [115] train-rmse:37.640653
## [116] train-rmse:37.606712
## [117] train-rmse:37.539523
## [118] train-rmse:37.441513
## [119] train-rmse:37.417468
## [120] train-rmse:37.312785
## [121] train-rmse:37.223295
## [122] train-rmse:37.157834
## [123] train-rmse:37.080073
## [124] train-rmse:36.970252
## [125] train-rmse:36.911289
## [126] train-rmse:36.885272
## [127] train-rmse:36.798847
## [128] train-rmse:36.725064
## [129] train-rmse:36.663158
## [130] train-rmse:36.624463
## [131] train-rmse:36.599660
## [132] train-rmse:36.508793
## [133] train-rmse:36.375509
## [134] train-rmse:36.349271
## [135] train-rmse:36.262168
## [136] train-rmse:36.192494
## [137] train-rmse:36.107190
## [138] train-rmse:35.987591
## [139] train-rmse:35.955348
## [140] train-rmse:35.876260
## [141] train-rmse:35.819659
## [142] train-rmse:35.739799
## [143] train-rmse:35.626281
## [144] train-rmse:35.558879
## [145] train-rmse:35.506196
## [146] train-rmse:35.463598
## [147] train-rmse:35.401379
## [148] train-rmse:35.327771
## [149] train-rmse:35.284048
## [150] train-rmse:35.219212
## [151] train-rmse:35.132732
## [152] train-rmse:35.102758
## [153] train-rmse:35.040621
## [154] train-rmse:35.005275
## [155] train-rmse:34.961028
## [156] train-rmse:34.878811
## [157] train-rmse:34.780471
## [158] train-rmse:34.701387
## [159] train-rmse:34.642046
## [160] train-rmse:34.562066
## [161] train-rmse:34.458545
## [162] train-rmse:34.419950
## [163] train-rmse:34.349817
## [164] train-rmse:34.247895
## [165] train-rmse:34.169713
## [166] train-rmse:34.124375
## [167] train-rmse:34.108864
## [168] train-rmse:34.060844
## [169] train-rmse:34.028140
## [170] train-rmse:33.966608
## [171] train-rmse:33.908588
## [172] train-rmse:33.835677
## [173] train-rmse:33.784376
## [174] train-rmse:33.713330
## [175] train-rmse:33.654137
## [176] train-rmse:33.587644
## [177] train-rmse:33.520804
## [178] train-rmse:33.471592
## [179] train-rmse:33.444764
## [180] train-rmse:33.345408
## [181] train-rmse:33.276396
## [182] train-rmse:33.232971
## [183] train-rmse:33.204750
## [184] train-rmse:33.131093
## [185] train-rmse:33.094775
## [186] train-rmse:33.079624
## [187] train-rmse:33.039047
## [188] train-rmse:33.015654
## [189] train-rmse:32.985344
## [190] train-rmse:32.933708
## [191] train-rmse:32.866868
## [192] train-rmse:32.810645
## [193] train-rmse:32.782659
## [194] train-rmse:32.723384
## [195] train-rmse:32.683242
## [196] train-rmse:32.632905
## [197] train-rmse:32.559548
## [198] train-rmse:32.483582
## [199] train-rmse:32.421106
## [200] train-rmse:32.392563
## [201] train-rmse:32.335815
## [202] train-rmse:32.297140
## [203] train-rmse:32.251716
## [204] train-rmse:32.191967
## [205] train-rmse:32.144415
## [206] train-rmse:32.057640
## [207] train-rmse:32.029201
## [208] train-rmse:31.932345
## [209] train-rmse:31.845413
## [210] train-rmse:31.780658
## [211] train-rmse:31.744768
## [212] train-rmse:31.670669
## [213] train-rmse:31.646037
## [214] train-rmse:31.632929
## [215] train-rmse:31.591507
## [216] train-rmse:31.494219
## [217] train-rmse:31.420953
## [218] train-rmse:31.331459
## [219] train-rmse:31.306554
## [220] train-rmse:31.269836
## [221] train-rmse:31.217851
## [222] train-rmse:31.142993
## [223] train-rmse:31.135651
## [224] train-rmse:31.118486
## [225] train-rmse:31.082362
## [226] train-rmse:31.048505
## [227] train-rmse:31.033032
## [228] train-rmse:31.017207
## [229] train-rmse:30.964833
## [230] train-rmse:30.912501
## [231] train-rmse:30.874786
## [232] train-rmse:30.810148
## [233] train-rmse:30.763088
## [234] train-rmse:30.698930
## [235] train-rmse:30.635522
## [236] train-rmse:30.578933
## [237] train-rmse:30.556908
## [238] train-rmse:30.497977
## [239] train-rmse:30.419213
## [240] train-rmse:30.337707
## [241] train-rmse:30.281423
## [242] train-rmse:30.215736
## [243] train-rmse:30.167544
## [244] train-rmse:30.051430
## [245] train-rmse:30.037759
## [246] train-rmse:29.976727
## [247] train-rmse:29.969530
## [248] train-rmse:29.910593
## [249] train-rmse:29.889562
## [250] train-rmse:29.808352
## [251] train-rmse:29.774164
## [252] train-rmse:29.670753
## [253] train-rmse:29.599826
## [254] train-rmse:29.536176
## [255] train-rmse:29.487919
## [256] train-rmse:29.456876
## [257] train-rmse:29.423070
## [258] train-rmse:29.363151
## [259] train-rmse:29.270309
## [260] train-rmse:29.218339
## [261] train-rmse:29.154853
## [262] train-rmse:29.121879
## [263] train-rmse:29.099175
## [264] train-rmse:29.026046
## [265] train-rmse:28.947873
## [266] train-rmse:28.874223
## [267] train-rmse:28.822707
## [268] train-rmse:28.755845
## [269] train-rmse:28.710706
## [270] train-rmse:28.636204
## [271] train-rmse:28.591975
## [272] train-rmse:28.546309
## [273] train-rmse:28.488967
## [274] train-rmse:28.460937
## [275] train-rmse:28.389944
## [276] train-rmse:28.356581
## [277] train-rmse:28.320757
## [278] train-rmse:28.270802
## [279] train-rmse:28.225771
## [280] train-rmse:28.171421
## [281] train-rmse:28.115886
## [282] train-rmse:28.065926
## [283] train-rmse:28.050499
## [284] train-rmse:27.981034
## [285] train-rmse:27.932397
## [286] train-rmse:27.923137
## [287] train-rmse:27.852712
## [288] train-rmse:27.801887
## [289] train-rmse:27.757322
## [290] train-rmse:27.725571
## [291] train-rmse:27.706267
## [292] train-rmse:27.660104
## [293] train-rmse:27.617859
## [294] train-rmse:27.574315
## [295] train-rmse:27.536241
## [296] train-rmse:27.509911
## [297] train-rmse:27.460765
## [298] train-rmse:27.420397
## [299] train-rmse:27.404286
## [300] train-rmse:27.392025
## [301] train-rmse:27.384885
## [302] train-rmse:27.374762
## [303] train-rmse:27.335171
## [304] train-rmse:27.318894
## [305] train-rmse:27.276546
## [306] train-rmse:27.225633
## [307] train-rmse:27.161458
## [308] train-rmse:27.130515
## [309] train-rmse:27.085241
## [310] train-rmse:27.060582
## [311] train-rmse:27.037091
## [312] train-rmse:26.990406
## [313] train-rmse:26.957419
## [314] train-rmse:26.938172
## [315] train-rmse:26.928567
## [316] train-rmse:26.914800
## [317] train-rmse:26.879447
## [318] train-rmse:26.817902
## [319] train-rmse:26.793962
## [320] train-rmse:26.725820
## [321] train-rmse:26.689897
## [322] train-rmse:26.650927
## [323] train-rmse:26.578860
## [324] train-rmse:26.533686
## [325] train-rmse:26.479232
## [326] train-rmse:26.403721
## [327] train-rmse:26.383487
## [328] train-rmse:26.366860
## [329] train-rmse:26.350291
## [330] train-rmse:26.341679
## [331] train-rmse:26.319587
## [332] train-rmse:26.272953
## [333] train-rmse:26.210614
## [334] train-rmse:26.161148
## [335] train-rmse:26.084870
## [336] train-rmse:26.025615
## [337] train-rmse:25.974575
## [338] train-rmse:25.918289
## [339] train-rmse:25.876687
## [340] train-rmse:25.805460
## [341] train-rmse:25.775829
## [342] train-rmse:25.766747
## [343] train-rmse:25.700496
## [344] train-rmse:25.640675
## [345] train-rmse:25.587025
## [346] train-rmse:25.558019
## [347] train-rmse:25.544620
## [348] train-rmse:25.492198
## [349] train-rmse:25.429680
## [350] train-rmse:25.387382
## [351] train-rmse:25.334843
## [352] train-rmse:25.292908
## [353] train-rmse:25.242820
## [354] train-rmse:25.209529
## [355] train-rmse:25.172461
## [356] train-rmse:25.163747
## [357] train-rmse:25.126491
## [358] train-rmse:25.087980
## [359] train-rmse:25.040289
## [360] train-rmse:24.989180
## [361] train-rmse:24.948513
## [362] train-rmse:24.889211
## [363] train-rmse:24.838114
## [364] train-rmse:24.832359
## [365] train-rmse:24.780362
## [366] train-rmse:24.693305
## [367] train-rmse:24.662206
## [368] train-rmse:24.635055
## [369] train-rmse:24.571240
## [370] train-rmse:24.523470
## [371] train-rmse:24.511840
## [372] train-rmse:24.503474
## [373] train-rmse:24.471458
## [374] train-rmse:24.427806
## [375] train-rmse:24.413169
## [376] train-rmse:24.381100
## [377] train-rmse:24.336547
## [378] train-rmse:24.269848
## [379] train-rmse:24.203639
## [380] train-rmse:24.159383
## [381] train-rmse:24.091714
## [382] train-rmse:24.033663
## [383] train-rmse:23.990156
## [384] train-rmse:23.964347
## [385] train-rmse:23.915153
## [386] train-rmse:23.900692
## [387] train-rmse:23.873796
## [388] train-rmse:23.821256
## [389] train-rmse:23.760173
## [390] train-rmse:23.714306
## [391] train-rmse:23.665417
## [392] train-rmse:23.640992
## [393] train-rmse:23.602885
## [394] train-rmse:23.556416
## [395] train-rmse:23.527237
## [396] train-rmse:23.478817
## [397] train-rmse:23.465132
## [398] train-rmse:23.449463
## [399] train-rmse:23.383961
## [400] train-rmse:23.320975
## [401] train-rmse:23.287169
## [402] train-rmse:23.261894
## [403] train-rmse:23.191956
## [404] train-rmse:23.149470
## [405] train-rmse:23.095760
## [406] train-rmse:23.055266
## [407] train-rmse:22.996825
## [408] train-rmse:22.987735
## [409] train-rmse:22.964129
## [410] train-rmse:22.942059
## [411] train-rmse:22.930877
## [412] train-rmse:22.909229
## [413] train-rmse:22.867416
## [414] train-rmse:22.820984
## [415] train-rmse:22.785086
## [416] train-rmse:22.744656
## [417] train-rmse:22.700912
## [418] train-rmse:22.675633
## [419] train-rmse:22.655226
## [420] train-rmse:22.623139
## [421] train-rmse:22.614740
## [422] train-rmse:22.578859
## [423] train-rmse:22.556497
## [424] train-rmse:22.515607
## [425] train-rmse:22.477788
## [426] train-rmse:22.456921
## [427] train-rmse:22.433837
## [428] train-rmse:22.400540
## [429] train-rmse:22.360281
## [430] train-rmse:22.335470
## [431] train-rmse:22.328125
## [432] train-rmse:22.324622
## [433] train-rmse:22.298941
## [434] train-rmse:22.245409
## [435] train-rmse:22.225351
## [436] train-rmse:22.195906
## [437] train-rmse:22.159575
## [438] train-rmse:22.113194
## [439] train-rmse:22.091618
## [440] train-rmse:22.065184
## [441] train-rmse:22.030972
## [442] train-rmse:21.989442
## [443] train-rmse:21.974103
## [444] train-rmse:21.945603
## [445] train-rmse:21.892959
## [446] train-rmse:21.851113
## [447] train-rmse:21.826663
## [448] train-rmse:21.759264
## [449] train-rmse:21.704387
## [450] train-rmse:21.659306
## [451] train-rmse:21.600366
## [452] train-rmse:21.556843
## [453] train-rmse:21.543787
## [454] train-rmse:21.504876
## [455] train-rmse:21.478961
## [456] train-rmse:21.446888
## [457] train-rmse:21.408395
## [458] train-rmse:21.394502
## [459] train-rmse:21.357097
## [460] train-rmse:21.323630
## [461] train-rmse:21.295901
## [462] train-rmse:21.273060
## [463] train-rmse:21.245204
## [464] train-rmse:21.203448
## [465] train-rmse:21.165842
## [466] train-rmse:21.154215
## [467] train-rmse:21.110825
## [468] train-rmse:21.077365
## [469] train-rmse:21.039654
## [470] train-rmse:20.998798
## [471] train-rmse:20.948608
## [472] train-rmse:20.905397
## [473] train-rmse:20.870932
## [474] train-rmse:20.856356
## [475] train-rmse:20.836761
## [476] train-rmse:20.801534
## [477] train-rmse:20.733209
## [478] train-rmse:20.704533
## [479] train-rmse:20.694998
## [480] train-rmse:20.672977
## [481] train-rmse:20.634424
## [482] train-rmse:20.596565
## [483] train-rmse:20.561065
## [484] train-rmse:20.531491
## [485] train-rmse:20.493501
## [486] train-rmse:20.475021
## [487] train-rmse:20.456395
## [488] train-rmse:20.424041
## [489] train-rmse:20.412405
## [490] train-rmse:20.384667
## [491] train-rmse:20.372457
## [492] train-rmse:20.359467
## [493] train-rmse:20.350773
## [494] train-rmse:20.321169
## [495] train-rmse:20.281701
## [496] train-rmse:20.239047
## [497] train-rmse:20.210466
## [498] train-rmse:20.192330
## [499] train-rmse:20.141166
## [500] train-rmse:20.113099
## [501] train-rmse:20.079713
## [502] train-rmse:20.074201
## [503] train-rmse:20.038179
## [504] train-rmse:20.002922
## [505] train-rmse:19.968455
## [506] train-rmse:19.937428
## [507] train-rmse:19.909770
## [508] train-rmse:19.889517
## [509] train-rmse:19.843497
## [510] train-rmse:19.824336
## [511] train-rmse:19.805134
## [512] train-rmse:19.763481
## [513] train-rmse:19.734282
## [514] train-rmse:19.712023
## [515] train-rmse:19.700131
## [516] train-rmse:19.692330
## [517] train-rmse:19.646708
## [518] train-rmse:19.631056
## [519] train-rmse:19.592346
## [520] train-rmse:19.565040
## [521] train-rmse:19.550886
## [522] train-rmse:19.541811
## [523] train-rmse:19.510438
## [524] train-rmse:19.489092
## [525] train-rmse:19.484714
## [526] train-rmse:19.451410
## [527] train-rmse:19.413954
## [528] train-rmse:19.389679
## [529] train-rmse:19.371402
## [530] train-rmse:19.357164
## [531] train-rmse:19.349654
## [532] train-rmse:19.338138
## [533] train-rmse:19.305095
## [534] train-rmse:19.265719
## [535] train-rmse:19.237422
## [536] train-rmse:19.204446
## [537] train-rmse:19.158492
## [538] train-rmse:19.134264
## [539] train-rmse:19.114258
## [540] train-rmse:19.095811
## [541] train-rmse:19.062244
## [542] train-rmse:19.040929
## [543] train-rmse:18.975884
## [544] train-rmse:18.957158
## [545] train-rmse:18.932535
## [546] train-rmse:18.907478
## [547] train-rmse:18.884489
## [548] train-rmse:18.853089
## [549] train-rmse:18.823461
## [550] train-rmse:18.816496
## [551] train-rmse:18.786735
## [552] train-rmse:18.766465
## [553] train-rmse:18.742775
## [554] train-rmse:18.734761
## [555] train-rmse:18.728304
## [556] train-rmse:18.713416
## [557] train-rmse:18.692140
## [558] train-rmse:18.680805
## [559] train-rmse:18.666459
## [560] train-rmse:18.643042
## [561] train-rmse:18.623714
## [562] train-rmse:18.605514
## [563] train-rmse:18.573862
## [564] train-rmse:18.569075
## [565] train-rmse:18.539202
## [566] train-rmse:18.506719
## [567] train-rmse:18.469940
## [568] train-rmse:18.436589
## [569] train-rmse:18.420933
## [570] train-rmse:18.388781
## [571] train-rmse:18.363055
## [572] train-rmse:18.346369
## [573] train-rmse:18.325745
## [574] train-rmse:18.272624
## [575] train-rmse:18.222497
## [576] train-rmse:18.201698
## [577] train-rmse:18.173103
## [578] train-rmse:18.137106
## [579] train-rmse:18.086138
## [580] train-rmse:18.063836
## [581] train-rmse:18.055119
## [582] train-rmse:18.026482
## [583] train-rmse:17.992471
## [584] train-rmse:17.982147
## [585] train-rmse:17.964566
## [586] train-rmse:17.949399
## [587] train-rmse:17.919818
## [588] train-rmse:17.886915
## [589] train-rmse:17.869387
## [590] train-rmse:17.824205
## [591] train-rmse:17.794264
## [592] train-rmse:17.749938
## [593] train-rmse:17.722951
## [594] train-rmse:17.695171
## [595] train-rmse:17.660526
## [596] train-rmse:17.641415
## [597] train-rmse:17.621119
## [598] train-rmse:17.593226
## [599] train-rmse:17.560203
## [600] train-rmse:17.547695
## [601] train-rmse:17.531596
## [602] train-rmse:17.499535
## [603] train-rmse:17.483577
## [604] train-rmse:17.447375
## [605] train-rmse:17.414282
## [606] train-rmse:17.388538
## [607] train-rmse:17.358725
## [608] train-rmse:17.332854
## [609] train-rmse:17.290790
## [610] train-rmse:17.251768
## [611] train-rmse:17.242827
## [612] train-rmse:17.224613
## [613] train-rmse:17.206864
## [614] train-rmse:17.196629
## [615] train-rmse:17.178845
## [616] train-rmse:17.157310
## [617] train-rmse:17.144138
## [618] train-rmse:17.133852
## [619] train-rmse:17.102447
## [620] train-rmse:17.084541
## [621] train-rmse:17.059937
## [622] train-rmse:17.003798
## [623] train-rmse:16.989394
## [624] train-rmse:16.980730
## [625] train-rmse:16.948915
## [626] train-rmse:16.940497
## [627] train-rmse:16.905883
## [628] train-rmse:16.867163
## [629] train-rmse:16.827730
## [630] train-rmse:16.807693
## [631] train-rmse:16.797854
## [632] train-rmse:16.792312
## [633] train-rmse:16.767700
## [634] train-rmse:16.759730
## [635] train-rmse:16.744884
## [636] train-rmse:16.719513
## [637] train-rmse:16.715846
## [638] train-rmse:16.692779
## [639] train-rmse:16.672840
## [640] train-rmse:16.637771
## [641] train-rmse:16.608772
## [642] train-rmse:16.590984
## [643] train-rmse:16.552066
## [644] train-rmse:16.541546
## [645] train-rmse:16.518792
## [646] train-rmse:16.505205
## [647] train-rmse:16.488881
## [648] train-rmse:16.461240
## [649] train-rmse:16.427846
## [650] train-rmse:16.416541
## [651] train-rmse:16.394044
## [652] train-rmse:16.386665
## [653] train-rmse:16.373436
## [654] train-rmse:16.344718
## [655] train-rmse:16.322518
## [656] train-rmse:16.294051
## [657] train-rmse:16.279631
## [658] train-rmse:16.262007
## [659] train-rmse:16.222352
## [660] train-rmse:16.188587
## [661] train-rmse:16.167311
## [662] train-rmse:16.143334
## [663] train-rmse:16.101762
## [664] train-rmse:16.086361
## [665] train-rmse:16.073289
## [666] train-rmse:16.044697
## [667] train-rmse:16.012123
## [668] train-rmse:16.000111
## [669] train-rmse:15.978434
## [670] train-rmse:15.957064
## [671] train-rmse:15.919807
## [672] train-rmse:15.893664
## [673] train-rmse:15.857483
## [674] train-rmse:15.846107
## [675] train-rmse:15.834837
## [676] train-rmse:15.804900
## [677] train-rmse:15.786265
## [678] train-rmse:15.762074
## [679] train-rmse:15.739478
## [680] train-rmse:15.716273
## [681] train-rmse:15.693105
## [682] train-rmse:15.667424
## [683] train-rmse:15.652258
## [684] train-rmse:15.641255
## [685] train-rmse:15.626180
## [686] train-rmse:15.612847
## [687] train-rmse:15.580597
## [688] train-rmse:15.566541
## [689] train-rmse:15.548975
## [690] train-rmse:15.544659
## [691] train-rmse:15.514043
## [692] train-rmse:15.501898
## [693] train-rmse:15.495948
## [694] train-rmse:15.487813
## [695] train-rmse:15.483588
## [696] train-rmse:15.460835
## [697] train-rmse:15.452042
## [698] train-rmse:15.418360
## [699] train-rmse:15.388831
## [700] train-rmse:15.371655
## [701] train-rmse:15.355464
## [702] train-rmse:15.333484
## [703] train-rmse:15.317980
## [704] train-rmse:15.293718
## [705] train-rmse:15.264582
## [706] train-rmse:15.243985
## [707] train-rmse:15.221903
## [708] train-rmse:15.211763
## [709] train-rmse:15.182659
## [710] train-rmse:15.176020
## [711] train-rmse:15.164382
## [712] train-rmse:15.145859
## [713] train-rmse:15.122264
## [714] train-rmse:15.106160
## [715] train-rmse:15.091554
## [716] train-rmse:15.052286
## [717] train-rmse:15.029484
## [718] train-rmse:15.005814
## [719] train-rmse:14.982593
## [720] train-rmse:14.966313
## [721] train-rmse:14.937262
## [722] train-rmse:14.914796
## [723] train-rmse:14.887394
## [724] train-rmse:14.879939
## [725] train-rmse:14.857432
## [726] train-rmse:14.851121
## [727] train-rmse:14.820541
## [728] train-rmse:14.785379
## [729] train-rmse:14.769562
## [730] train-rmse:14.747857
## [731] train-rmse:14.740982
## [732] train-rmse:14.723948
## [733] train-rmse:14.711673
## [734] train-rmse:14.693403
## [735] train-rmse:14.684985
## [736] train-rmse:14.681421
## [737] train-rmse:14.664536
## [738] train-rmse:14.650716
## [739] train-rmse:14.621598
## [740] train-rmse:14.590628
## [741] train-rmse:14.557097
## [742] train-rmse:14.524511
## [743] train-rmse:14.519906
## [744] train-rmse:14.511241
## [745] train-rmse:14.482279
## [746] train-rmse:14.477302
## [747] train-rmse:14.462003
## [748] train-rmse:14.454679
## [749] train-rmse:14.437968
## [750] train-rmse:14.428383
## [751] train-rmse:14.409064
## [752] train-rmse:14.407633
## [753] train-rmse:14.385691
## [754] train-rmse:14.369994
## [755] train-rmse:14.351450
## [756] train-rmse:14.325916
## [757] train-rmse:14.307956
## [758] train-rmse:14.297729
## [759] train-rmse:14.291342
## [760] train-rmse:14.260435
## [761] train-rmse:14.248226
## [762] train-rmse:14.230493
## [763] train-rmse:14.205994
## [764] train-rmse:14.189294
## [765] train-rmse:14.166689
## [766] train-rmse:14.137597
## [767] train-rmse:14.131340
## [768] train-rmse:14.115041
## [769] train-rmse:14.108509
## [770] train-rmse:14.080571
## [771] train-rmse:14.057249
## [772] train-rmse:14.042693
## [773] train-rmse:14.035657
## [774] train-rmse:14.000885
## [775] train-rmse:13.980010
## [776] train-rmse:13.954868
## [777] train-rmse:13.940227
## [778] train-rmse:13.918106
## [779] train-rmse:13.883684
## [780] train-rmse:13.859825
## [781] train-rmse:13.841798
## [782] train-rmse:13.816255
## [783] train-rmse:13.799346
## [784] train-rmse:13.785911
## [785] train-rmse:13.757245
## [786] train-rmse:13.730865
## [787] train-rmse:13.709214
## [788] train-rmse:13.692120
## [789] train-rmse:13.662213
## [790] train-rmse:13.631643
## [791] train-rmse:13.623550
## [792] train-rmse:13.580742
## [793] train-rmse:13.566588
## [794] train-rmse:13.552500
## [795] train-rmse:13.542435
## [796] train-rmse:13.527002
## [797] train-rmse:13.505505
## [798] train-rmse:13.497957
## [799] train-rmse:13.475794
## [800] train-rmse:13.433263
## [801] train-rmse:13.414686
## [802] train-rmse:13.411011
## [803] train-rmse:13.392118
## [804] train-rmse:13.367394
## [805] train-rmse:13.336001
## [806] train-rmse:13.329598
## [807] train-rmse:13.321353
## [808] train-rmse:13.301143
## [809] train-rmse:13.270342
## [810] train-rmse:13.254598
## [811] train-rmse:13.238776
## [812] train-rmse:13.236127
## [813] train-rmse:13.220731
## [814] train-rmse:13.186589
## [815] train-rmse:13.178106
## [816] train-rmse:13.153481
## [817] train-rmse:13.145029
## [818] train-rmse:13.126774
## [819] train-rmse:13.123585
## [820] train-rmse:13.112149
## [821] train-rmse:13.094769
## [822] train-rmse:13.076028
## [823] train-rmse:13.052035
## [824] train-rmse:13.028739
## [825] train-rmse:13.009368
## [826] train-rmse:12.989189
## [827] train-rmse:12.961648
## [828] train-rmse:12.934165
## [829] train-rmse:12.930132
## [830] train-rmse:12.900447
## [831] train-rmse:12.877541
## [832] train-rmse:12.861519
## [833] train-rmse:12.844975
## [834] train-rmse:12.828941
## [835] train-rmse:12.810176
## [836] train-rmse:12.795580
## [837] train-rmse:12.784566
## [838] train-rmse:12.777915
## [839] train-rmse:12.759410
## [840] train-rmse:12.740017
## [841] train-rmse:12.724186
## [842] train-rmse:12.698478
## [843] train-rmse:12.685497
## [844] train-rmse:12.670111
## [845] train-rmse:12.648413
## [846] train-rmse:12.632146
## [847] train-rmse:12.626238
## [848] train-rmse:12.612488
## [849] train-rmse:12.596382
## [850] train-rmse:12.589997
## [851] train-rmse:12.567663
## [852] train-rmse:12.557281
## [853] train-rmse:12.549822
## [854] train-rmse:12.538160
## [855] train-rmse:12.520384
## [856] train-rmse:12.503511
## [857] train-rmse:12.486202
## [858] train-rmse:12.470869
## [859] train-rmse:12.456008
## [860] train-rmse:12.439756
## [861] train-rmse:12.431273
## [862] train-rmse:12.411897
## [863] train-rmse:12.398261
## [864] train-rmse:12.383949
## [865] train-rmse:12.373139
## [866] train-rmse:12.359123
## [867] train-rmse:12.333596
## [868] train-rmse:12.325717
## [869] train-rmse:12.303155
## [870] train-rmse:12.294075
## [871] train-rmse:12.270174
## [872] train-rmse:12.249576
## [873] train-rmse:12.229534
## [874] train-rmse:12.210006
## [875] train-rmse:12.197171
## [876] train-rmse:12.181373
## [877] train-rmse:12.146781
## [878] train-rmse:12.127005
## [879] train-rmse:12.100841
## [880] train-rmse:12.095294
## [881] train-rmse:12.091243
## [882] train-rmse:12.076371
## [883] train-rmse:12.044545
## [884] train-rmse:12.028147
## [885] train-rmse:12.019438
## [886] train-rmse:11.975925
## [887] train-rmse:11.951439
## [888] train-rmse:11.936717
## [889] train-rmse:11.912315
## [890] train-rmse:11.900966
## [891] train-rmse:11.884664
## [892] train-rmse:11.881042
## [893] train-rmse:11.862095
## [894] train-rmse:11.836684
## [895] train-rmse:11.816846
## [896] train-rmse:11.794146
## [897] train-rmse:11.779626
## [898] train-rmse:11.758170
## [899] train-rmse:11.746074
## [900] train-rmse:11.737928
## [901] train-rmse:11.730100
## [902] train-rmse:11.713028
## [903] train-rmse:11.687598
## [904] train-rmse:11.682160
## [905] train-rmse:11.679797
## [906] train-rmse:11.674432
## [907] train-rmse:11.661834
## [908] train-rmse:11.652422
## [909] train-rmse:11.640337
## [910] train-rmse:11.613675
## [911] train-rmse:11.594350
## [912] train-rmse:11.581705
## [913] train-rmse:11.577305
## [914] train-rmse:11.573464
## [915] train-rmse:11.558127
## [916] train-rmse:11.534079
## [917] train-rmse:11.532574
## [918] train-rmse:11.524519
## [919] train-rmse:11.522001
## [920] train-rmse:11.505000
## [921] train-rmse:11.481199
## [922] train-rmse:11.470504
## [923] train-rmse:11.463135
## [924] train-rmse:11.435613
## [925] train-rmse:11.409039
## [926] train-rmse:11.398809
## [927] train-rmse:11.393347
## [928] train-rmse:11.389149
## [929] train-rmse:11.361991
## [930] train-rmse:11.339457
## [931] train-rmse:11.335543
## [932] train-rmse:11.325470
## [933] train-rmse:11.312441
## [934] train-rmse:11.292081
## [935] train-rmse:11.279174
## [936] train-rmse:11.272889
## [937] train-rmse:11.245610
## [938] train-rmse:11.224012
## [939] train-rmse:11.208029
## [940] train-rmse:11.185486
## [941] train-rmse:11.180534
## [942] train-rmse:11.154763
## [943] train-rmse:11.128814
## [944] train-rmse:11.106631
## [945] train-rmse:11.095654
## [946] train-rmse:11.089162
## [947] train-rmse:11.081049
## [948] train-rmse:11.061388
## [949] train-rmse:11.038811
## [950] train-rmse:11.030049
## [951] train-rmse:11.008006
## [952] train-rmse:10.990747
## [953] train-rmse:10.974774
## [954] train-rmse:10.961842
## [955] train-rmse:10.939386
## [956] train-rmse:10.920688
## [957] train-rmse:10.911339
## [958] train-rmse:10.884014
## [959] train-rmse:10.874655
## [960] train-rmse:10.867826
## [961] train-rmse:10.854007
## [962] train-rmse:10.846981
## [963] train-rmse:10.831668
## [964] train-rmse:10.821045
## [965] train-rmse:10.803778
## [966] train-rmse:10.798476
## [967] train-rmse:10.787047
## [968] train-rmse:10.781598
## [969] train-rmse:10.776709
## [970] train-rmse:10.774319
## [971] train-rmse:10.757354
## [972] train-rmse:10.741351
## [973] train-rmse:10.713675
## [974] train-rmse:10.709509
## [975] train-rmse:10.697376
## [976] train-rmse:10.685366
## [977] train-rmse:10.678352
## [978] train-rmse:10.643090
## [979] train-rmse:10.624257
## [980] train-rmse:10.618712
## [981] train-rmse:10.589347
## [982] train-rmse:10.572327
## [983] train-rmse:10.554932
## [984] train-rmse:10.537639
## [985] train-rmse:10.525962
## [986] train-rmse:10.522713
## [987] train-rmse:10.504566
## [988] train-rmse:10.492161
## [989] train-rmse:10.481705
## [990] train-rmse:10.474273
## [991] train-rmse:10.464209
## [992] train-rmse:10.437856
## [993] train-rmse:10.425072
## [994] train-rmse:10.419147
## [995] train-rmse:10.405394
## [996] train-rmse:10.392239
## [997] train-rmse:10.380056
## [998] train-rmse:10.360891
## [999] train-rmse:10.346622
## [1000] train-rmse:10.332669
bst
## ##### xgb.Booster
## raw: 2.4 Mb
## call:
## xgb.train(params = params, data = dtrain, nrounds = nrounds,
## watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
## early_stopping_rounds = early_stopping_rounds, maximize = maximize,
## save_period = save_period, save_name = save_name, xgb_model = xgb_model,
## callbacks = callbacks, max.depth = 5, eta = 1)
## params (as set within xgb.train):
## max_depth = "5", eta = "1", validate_parameters = "1"
## xgb.attributes:
## niter
## callbacks:
## cb.print.evaluation(period = print_every_n)
## cb.evaluation.log()
## # of features: 10
## niter: 1000
## nfeatures : 10
## evaluation_log:
## iter train_rmse
## 1 109.10343
## 2 93.56724
## ---
## 999 10.34662
## 1000 10.33267
#Generate prediction on the test dataset with the model.
preds <- predict(bst, dtest)
#preds
#Calculate RMSE
err <- preds-airbnb_test_numeric$price
rmse <- sqrt(sum(err)^2/nrow(airbnb_test_numeric))
#Create a dataframe with the test_id, the actual price, and the predicted price.
result_test <- data.frame(ID = test_id,
price = airbnb_test_numeric$price,
predictions = preds)
#Print the results
print(paste("test-error=", rmse))
## [1] "test-error= 46.8718864320688"
print(paste0('Test R^2: ' ,
caret::postResample(result_test$predictions , result_test$price)['Rsquared']))
## [1] "Test R^2: 0.811956486991979"
#On Test Data Change to Pivot longer.
melt_pred_test <- melt(head(result_test,100), id.vars = "ID")
ggplot(melt_pred_test,
aes(y = value,
x = ID,
colour = variable)) +
geom_point() +
geom_line() +
ggtitle("Actual vs Predicted for Test Data")

Model 5: Leaps and Lars Model
Function to build second order matrix for x variables
#Builds second order terms for x values.
matrix.2ndorder.make<-function(x, only.quad=F){
x0<-x
dimn<-dimnames(x)[[2]] #extract the names of the variables
num.col<-length(x[1,]) # how many columns
for(i in 1:num.col){
# if we are doing all 2nd order
if(!only.quad){
for(j in i:num.col){
x0<-cbind(x0,x[,i]*x[,j])
dimn<-c(dimn,paste(dimn[i],dimn[j],sep=""))
#create interaction dimension names
}
}
else{
#in here only if doing only squared terms
x0<-cbind(x0,x[,i]*x[,i])
dimn<-c(dimn,paste(dimn[i],"2",sep="")) # squared dimension names
}
}
dimnames(x0)[[2]]<-dimn
x0
}
A Leaps automatic model selector using Cp, and PRESS
Best k models using Cp and from those k calculate best PRESS
Leaps then Press
#names function, takes two variables
regpluspress<-function(x,y){
str<-lsfit(x,y) #Saves lsfit output to str.
press<-sum((str$resid/(1-hat(x)))^2)
str$press<-press #Saves PRESS statistic to str
str #Calls str
}
#Takes parameters
leaps.then.press<-function(xmat,yvec,ncheck=10,print.ls=F)
{
leaps.str<-leaps(xmat,yvec) #Runs through leaps and saves output to leaps.str
z1<-leaps.str$Cp #extract Cp
o1<-order(z1) #Order it
matwhich<-(leaps.str$which[o1,])[1:ncheck,] #pullout ncheck best models with respect to Cp.
z2<-z1[o1][1:ncheck] #Saves lowest cp values to z2
pressvec<-NULL
for(i in 1:ncheck){ #A for loop from 1 to number of checks
ls.str0<-regpluspress(xmat[,matwhich[i,]],yvec) #Saves PRESS statistics from lowest Cp list
if(print.ls){#If called, then it will print ls.str0
ls.print(ls.str0)
}
print(i) #Prints iteration
print(paste("Press=",ls.str0$press)) #Prints PRESS stat
parvec<-matwhich[i,] #Saves matrix row to vector
npar<-sum(parvec) #Sums vector values to npar (sums 1 and 0 values)
print(paste("MPSE=",ls.str0$press/(length(yvec)-(npar+1)))) #Prints the mean squared prediction error
print(paste("Cp=",z2[i])) #Prints Cp value
pressvec<-c(pressvec, ls.str0$press)
if(i==1){
Xmat<-(xmat)[,leaps.str$which[o1[1],]]
coef1<-lsfit(Xmat,yvec)$coef
print("coef1=")
print(coef1)
leaps.pred<-Xmat%*%coef1[-1]+coef1[1]
plot(leaps.pred,yvec)
print("Correlation between leaps prediction and actual yvec")
print(cor(leaps.pred,yvec))
}
}
o2<-order(pressvec) #Output results
print("which matrix:")
matwhich[o2,] #model indicators sorted from best press to worst in top ncheck Cp
print("matwhich 1")
matwhich[o2[1]]
}
And a lars automatic model selector using both Cp and cross
validation MSE
lars selection program
#Function to generate sum of the absolute values of a vector.
sumabs<-function(v1)
{sum(abs(v1))}
#lars select function. This is a lars automatic model selector using both Cp and cross validation MSE
lars.select<-
function(xmat,y,ncheck=10,int=F)
{
lasso.str<-lars(xmat,y,intercept=int) #Calls lars, saves output to lasso.str
#plot(lasso.str)
#print(xmat)
cv.str<-cv.lars(xmat,y,plot.it=F,intercept=int) #Calculates cross-validated error curve for lars
o1<-order(cv.str$cv) #Orders cv values from lowest to highest
mindex<-cv.str$index[o1][1] #Index of cv values ordered
beta<-coef(lasso.str) #Saves coefficients from lasso.str to beta
index0<-apply(beta,1,sumabs) #Iterates through rows, sums absolute values of beta (sum of squares)
index0<-index0/max(index0) #Sums of beta divided by max sum of beta, percentage (0 to 1)
o1<-order(abs(index0-mindex)) #Orders values subtracted by the minimum cv value
I1<-(abs(index0-mindex)==min(abs(index0-mindex)))#If absolute value of index-mindex is the min, add to I1
n1<-length(beta[,1]) #Saves int, length of column of coefficients
beta.out<-beta[I1,] #Beta.out is minimum rows of coefficients
if(sum(abs(beta.out))==0){ #If all values add to zero, then sort list by the Cp value
v1<-lasso.str$Cp
o2<-order(v1)
beta.out<-beta[o1[1:ncheck],]
}
Ind.out<-beta.out!=0 #Saves Ind.out as values of beta.out that don't equal 0
outlist<-list(beta.out=beta.out,ind.out=Ind.out)#Saves list of beta.out, ind.out values
if(int){#finds y-intercept values.
Int.out1<-mean(y)-mean(xmat%*%beta.out[i]) #mean of y-value minus mean of x-matrix times beta coefficients
outlist<-list(beta.out=beta.out,ind.out=Ind.out,int.out=Int.out1)
}
outlist #Returns outlist to end function
}
Combine leaps and lars in a single function
#Combine function calling leaps.then.press and lars.select
combined.leaps.lars<-function(both = F,leaps = F, lars = F,xmat,yvec,ncheck=10,int=F)
{
#if(both==TRUE){par(mfrow=c(2,1))}else{par(mfrow=c(1,1))}
if(both){
leaps.output<-leaps.then.press(xmat,yvec,ncheck,int)
lars.output<-lars.select(xmat,yvec)
plot(xmat%*%lars.output$beta.out,yvec) #Actual Price vs predicted Price
print("Correlation between predicted Price and actual Price Lars")
print(cor(xmat%*%lars.output$beta.out,yvec)) #correlation
#Combine leaps and lars output in a list
lars.leaps.output<-list(leaps.output=leaps.output,lars.output=lars.output)
#Return the output
lars.leaps.output
}
else if(leaps){
leaps.output<-leaps.then.press(xmat,yvec,ncheck,int)
leaps.output
}
else if(lars){
lars.output<-lars.select(xmat,yvec)
plot(xmat%*%lars.output$beta.out,yvec) #Actual Price vs predicted Price
print("Correlation between predicted Price and actual Price Lars")
print(cor(xmat%*%lars.output$beta.out,yvec)) #correlation
lars.output
}
}
Build second order matrix
#Creating a matrix from the data
airbnb_numeric_mat<-as.matrix(airbnb_numeric)
#Output the first 5 rows
airbnb_numeric_mat[1:5,]
## id latitude longitude price minimum_nights number_of_reviews
## [1,] 2539 40.64749 -73.97237 149 1 9
## [2,] 2595 40.75362 -73.98377 225 1 45
## [3,] 3647 40.80902 -73.94190 60 3 0
## [4,] 3831 40.68514 -73.95976 45 1 270
## [5,] 5022 40.79851 -73.94399 80 10 9
## reviews_per_month floor noise(dB) neighbourhood_group_numeric
## [1,] 0.21 1 69.05646 2
## [2,] 0.38 1 56.05428 3
## [3,] 0.00 1 56.05428 3
## [4,] 4.64 1 69.05646 2
## [5,] 0.10 1 56.05428 3
## room_type_numeric neighbourhood_numeric lnprice lnmin_nights
## [1,] 2 109 5.003946 0.000000
## [2,] 1 128 5.416100 0.000000
## [3,] 2 95 4.094345 1.098612
## [4,] 1 42 3.806662 0.000000
## [5,] 1 62 4.382027 2.302585
#Linear fit
ls.print(lsfit(airbnb_numeric_mat[,c(-3,-11)],airbnb_numeric_mat[,3]))
## Residual Standard Error=0.0311
## R-Square=0.5473
## F-statistic (df=12, 39082)=3937.735
## p-value=0
##
## Estimate Std.Err t-value Pr(>|t|)
## Intercept -81.8806 0.1933 -423.5303 0.0000
## id 0.0000 0.0000 5.2865 0.0000
## latitude 0.1878 0.0046 40.7247 0.0000
## price 0.0000 0.0000 -23.1563 0.0000
## minimum_nights 0.0000 0.0000 0.3729 0.7092
## number_of_reviews 0.0000 0.0000 -0.7983 0.4247
## reviews_per_month 0.0014 0.0001 9.7270 0.0000
## floor 0.0135 0.0003 45.7510 0.0000
## noise(dB) 0.0038 0.0001 41.6813 0.0000
## neighbourhood_group_numeric 0.0168 0.0008 21.1657 0.0000
## neighbourhood_numeric 0.0000 0.0000 -16.3249 0.0000
## lnprice -0.0044 0.0003 -14.4619 0.0000
## lnmin_nights -0.0013 0.0002 -6.3516 0.0000
# From the linear fit we can see that reviews_per_month is not a good predictor because p-value is greater than 0.05.
#Make second order matrix of the x values but drop id, price,lnprice,latitude, longitude,lnmin_nights, reviews_per_month,neighborhood
airbnb_numeric_mat2nd<-matrix.2ndorder.make(airbnb_numeric_mat[,c(-1,-2,-3,-4,-7,-13,-14,-12)])#Second order matrix for x variables
airbnb_numeric_mat2nd[1:5,]
## minimum_nights number_of_reviews floor noise(dB)
## [1,] 1 9 1 69.05646
## [2,] 1 45 1 56.05428
## [3,] 3 0 1 56.05428
## [4,] 1 270 1 69.05646
## [5,] 10 9 1 56.05428
## neighbourhood_group_numeric room_type_numeric minimum_nightsminimum_nights
## [1,] 2 2 1
## [2,] 3 1 1
## [3,] 3 2 9
## [4,] 2 1 1
## [5,] 3 1 100
## minimum_nightsnumber_of_reviews minimum_nightsfloor
## [1,] 9 1
## [2,] 45 1
## [3,] 0 3
## [4,] 270 1
## [5,] 90 10
## minimum_nightsnoise(dB) minimum_nightsneighbourhood_group_numeric
## [1,] 69.05646 2
## [2,] 56.05428 3
## [3,] 168.16283 9
## [4,] 69.05646 2
## [5,] 560.54278 30
## minimum_nightsroom_type_numeric number_of_reviewsnumber_of_reviews
## [1,] 2 81
## [2,] 1 2025
## [3,] 6 0
## [4,] 1 72900
## [5,] 10 81
## number_of_reviewsfloor number_of_reviewsnoise(dB)
## [1,] 9 621.5082
## [2,] 45 2522.4425
## [3,] 0 0.0000
## [4,] 270 18645.2454
## [5,] 9 504.4885
## number_of_reviewsneighbourhood_group_numeric
## [1,] 18
## [2,] 135
## [3,] 0
## [4,] 540
## [5,] 27
## number_of_reviewsroom_type_numeric floorfloor floornoise(dB)
## [1,] 18 1 69.05646
## [2,] 45 1 56.05428
## [3,] 0 1 56.05428
## [4,] 270 1 69.05646
## [5,] 9 1 56.05428
## floorneighbourhood_group_numeric floorroom_type_numeric noise(dB)noise(dB)
## [1,] 2 2 4768.795
## [2,] 3 1 3142.082
## [3,] 3 2 3142.082
## [4,] 2 1 4768.795
## [5,] 3 1 3142.082
## noise(dB)neighbourhood_group_numeric noise(dB)room_type_numeric
## [1,] 138.1129 138.11293
## [2,] 168.1628 56.05428
## [3,] 168.1628 112.10856
## [4,] 138.1129 69.05646
## [5,] 168.1628 56.05428
## neighbourhood_group_numericneighbourhood_group_numeric
## [1,] 4
## [2,] 9
## [3,] 9
## [4,] 4
## [5,] 9
## neighbourhood_group_numericroom_type_numeric
## [1,] 4
## [2,] 3
## [3,] 6
## [4,] 2
## [5,] 3
## room_type_numericroom_type_numeric
## [1,] 4
## [2,] 1
## [3,] 4
## [4,] 1
## [5,] 1
#dim(airbnb_numeric_mat2nd)
Call leaps and lars
#Run combined.leap.lars function for full
results.leaps.lars<-combined.leaps.lars(both = F,leaps = T,lars = F,airbnb_numeric_mat2nd,airbnb_numeric_mat[,4])
## [1] 1
## [1] "Press= 471881742.366937"
## [1] "MPSE= 12077.5445309037"
## [1] "Cp= 22.3289521682236"
## [1] "coef1="
## Intercept
## -8.238058e+04
## number_of_reviews
## -1.883703e+00
## floor
## 4.023336e+03
## noise(dB)
## 1.178333e+03
## neighbourhood_group_numeric
## 2.470269e+04
## room_type_numeric
## -2.674480e+02
## minimum_nightsminimum_nights
## 8.917956e-04
## minimum_nightsfloor
## 1.506043e-01
## minimum_nightsnoise(dB)
## -4.008688e-03
## minimum_nightsroom_type_numeric
## -1.659697e-01
## number_of_reviewsnumber_of_reviews
## 3.037553e-04
## number_of_reviewsfloor
## -1.163264e-01
## number_of_reviewsnoise(dB)
## 2.168555e-02
## number_of_reviewsneighbourhood_group_numeric
## 2.229159e-01
## floorfloor
## 8.710118e+01
## floornoise(dB)
## -4.818533e+01
## floorneighbourhood_group_numeric
## -4.688911e+02
## floorroom_type_numeric
## -5.300624e+00
## noise(dB)noise(dB)
## -3.027349e+00
## noise(dB)neighbourhood_group_numeric
## -1.928135e+02
## noise(dB)room_type_numeric
## 1.282519e+00
## neighbourhood_group_numericneighbourhood_group_numeric
## -1.726516e+03
## neighbourhood_group_numericroom_type_numeric
## 1.183899e+01
## room_type_numericroom_type_numeric
## 3.001968e+01

## [1] "Correlation between leaps prediction and actual yvec"
## [,1]
## [1,] 0.8754645
## [1] 2
## [1] "Press= 471995683.595577"
## [1] "MPSE= 12080.4607917785"
## [1] "Cp= 23.3641633402876"
## [1] 3
## [1] "Press= 471860530.489849"
## [1] "MPSE= 12077.3107368787"
## [1] "Cp= 23.6018489151684"
## [1] 4
## [1] "Press= 471873177.429209"
## [1] "MPSE= 12077.634436376"
## [1] "Cp= 23.6773073182849"
## [1] 5
## [1] "Press= 472174338.347263"
## [1] "MPSE= 12085.3426758962"
## [1] "Cp= 24.2468334405494"
## [1] 6
## [1] "Press= 472142437.090343"
## [1] "MPSE= 12084.52616049"
## [1] "Cp= 24.3257406269622"
## [1] 7
## [1] "Press= 471968785.095022"
## [1] "MPSE= 12080.0815227802"
## [1] "Cp= 24.5205753269547"
## [1] 8
## [1] "Press= 471990323.009525"
## [1] "MPSE= 12080.6327875486"
## [1] "Cp= 24.7096010655805"
## [1] 9
## [1] "Press= 471857994.35842"
## [1] "MPSE= 12077.5549504318"
## [1] "Cp= 25.0574108303845"
## [1] 10
## [1] "Press= 472233685.335273"
## [1] "MPSE= 12086.8616671429"
## [1] "Cp= 25.1560561789447"
## [1] "which matrix:"
## [1] "matwhich 1"
#results.leaps.lars
#Run combined.leap.lars function for full
results.leaps.lars<-combined.leaps.lars(both = F,leaps = F,lars = T,airbnb_numeric_mat2nd,airbnb_numeric_mat[,4])

## [1] "Correlation between predicted Price and actual Price Lars"
## [,1]
## [1,] 0.8458776
#results.leaps.lars
#airbnb_numeric_mat
Build the linear model on the output of leaps variable selection.
Run predictions on the test data and write to a .csv file.
linear.airbnb<-lm(price ~ minimum_nights +floor + `noise(dB)` + neighbourhood_group_numeric + room_type_numeric + I(minimum_nights^2) + minimum_nights*floor + minimum_nights*`noise(dB)` + minimum_nights*neighbourhood_group_numeric + minimum_nights*room_type_numeric + number_of_reviews*number_of_reviews + number_of_reviews*floor + number_of_reviews*neighbourhood_group_numeric + number_of_reviews*room_type_numeric + I(floor^2) + floor*`noise(dB)` + floor*neighbourhood_group_numeric + floor*room_type_numeric + I(`noise(dB)`^2) + `noise(dB)`*neighbourhood_group_numeric + `noise(dB)`*room_type_numeric + I(neighbourhood_group_numeric^2) + neighbourhood_group_numeric*room_type_numeric + I(room_type_numeric^2), data = airbnb_train_numeric)
get_regression_summaries(linear.airbnb)
## # A tibble: 1 × 9
## r_squared adj_r_squared mse rmse sigma statistic p_value df nobs
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.818 0.818 6393. 80.0 80.0 5102. 0 24 27220
get_regression_table(linear.airbnb)
## # A tibble: 25 × 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept -1.06e+5 1612. -65.8 0 -1.09e+5 -1.03e+5
## 2 minimum_nights 1.28e+0 0.886 1.45 0.148 -4.56e-1 3.02e+0
## 3 floor 3.70e+3 60.7 60.9 0 3.58e+3 3.82e+3
## 4 `noise(dB)` 1.63e+3 35.8 45.6 0 1.56e+3 1.70e+3
## 5 neighbourhood_group_n… 3.08e+4 387. 79.7 0 3.01e+4 3.16e+4
## 6 room_type_numeric -3.01e+2 29.7 -10.1 0 -3.59e+2 -2.42e+2
## 7 I(minimum_nights^2) 1 e-3 0 5.18 0 1 e-3 1 e-3
## 8 number_of_reviews -1.7 e-2 0.051 -0.343 0.731 -1.17e-1 8.2 e-2
## 9 I(floor^2) 8.19e+1 0.672 122. 0 8.05e+1 8.32e+1
## 10 I(`noise(dB)`^2) -5.15e+0 0.196 -26.3 0 -5.53e+0 -4.76e+0
## # ℹ 15 more rows
points<-get_regression_points(linear.airbnb)
ggplot(data=points, aes(x=residual)) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=points, aes(x=price_hat, y=residual)) + geom_point()

points %>%
ggplot(aes(sample = residual)) +
stat_qq() +
stat_qq_line()

airbnb.predictions <- predict(linear.airbnb,airbnb_test)
#write.csv(airbnb.predictions,file = "C:/Users/anish/Documents/predicitions_airbnb.csv")
Build the linear model on the output of lars variable selection. Run
predictions on the test data and write to a .csv file.
linear.airbnb<-lm(price ~ minimum_nights + number_of_reviews + floor + `noise(dB)` + neighbourhood_group_numeric + room_type_numeric + I(minimum_nights^2) + minimum_nights*number_of_reviews + minimum_nights*floor + minimum_nights*`noise(dB)` + minimum_nights*neighbourhood_group_numeric + minimum_nights*room_type_numeric + I(number_of_reviews^2) + number_of_reviews*floor + number_of_reviews*`noise(dB)` + number_of_reviews*neighbourhood_group_numeric + number_of_reviews*room_type_numeric + I(floor^2) + floor*`noise(dB)` + floor*neighbourhood_group_numeric + floor*room_type_numeric + I(`noise(dB)`^2) + `noise(dB)`*room_type_numeric + I(neighbourhood_group_numeric^2) + neighbourhood_group_numeric*room_type_numeric + I(room_type_numeric^2), data = airbnb_train_numeric)
get_regression_summaries(linear.airbnb)
## # A tibble: 1 × 9
## r_squared adj_r_squared mse rmse sigma statistic p_value df nobs
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.798 0.798 7110. 84.3 84.4 4129. 0 26 27220
get_regression_table(linear.airbnb)
## # A tibble: 27 × 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept -1.75e+4 206. -84.9 0 -1.79e+4 -1.71e+4
## 2 minimum_nights 1.75e+0 0.938 1.86 0.063 -9.3 e-2 3.58e+0
## 3 number_of_reviews -8.97e-1 0.385 -2.33 0.02 -1.65e+0 -1.41e-1
## 4 floor 6.68e+3 29.5 226. 0 6.62e+3 6.74e+3
## 5 `noise(dB)` -3.25e+2 6.10 -53.2 0 -3.37e+2 -3.13e+2
## 6 neighbourhood_group_n… 9.54e+3 45.5 210. 0 9.46e+3 9.63e+3
## 7 room_type_numeric -1.27e+2 31.2 -4.06 0 -1.88e+2 -6.55e+1
## 8 I(minimum_nights^2) 1 e-3 0 6.9 0 1 e-3 2 e-3
## 9 I(number_of_reviews^2) 0 0 2.94 0.003 0 0
## 10 I(floor^2) 1.07e+2 0.529 201. 0 1.06e+2 1.08e+2
## # ℹ 17 more rows
points<-get_regression_points(linear.airbnb)
ggplot(data=points, aes(x=residual)) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=points, aes(x=price_hat, y=residual)) + geom_point()

points %>%
ggplot(aes(sample = residual)) +
stat_qq() +
stat_qq_line()
