Prepare Data

Read the data

airbnb_original <- read_csv("airbnb_eda.csv")
## Rows: 39118 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): name, host_name, neighbourhood_group, neighbourhood, room_type, la...
## dbl (10): id, host_id, latitude, longitude, price, minimum_nights, number_of...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(airbnb_original)
## Rows: 39,118
## Columns: 17
## $ id                  <dbl> 2539, 2595, 3647, 3831, 5022, 5099, 5121, 5203, 52…
## $ name                <chr> "Clean & quiet apt home by the park", "Skylit Midt…
## $ host_id             <dbl> 2787, 2845, 4632, 4869, 7192, 7322, 7356, 7490, 75…
## $ host_name           <chr> "John", "Jennifer", "Elisabeth", "LisaRoxanne", "L…
## $ neighbourhood_group <chr> "Brooklyn", "Manhattan", "Manhattan", "Brooklyn", …
## $ neighbourhood       <chr> "Kensington", "Midtown", "Harlem", "Clinton Hill",…
## $ latitude            <dbl> 40.64749, 40.75362, 40.80902, 40.68514, 40.79851, …
## $ longitude           <dbl> -73.97237, -73.98377, -73.94190, -73.95976, -73.94…
## $ room_type           <chr> "Private room", "Entire home/apt", "Private room",…
## $ price               <dbl> 149, 225, 60, 45, 80, 200, 60, 32, 150, 54, 85, 48…
## $ minimum_nights      <dbl> 1, 1, 3, 1, 10, 3, 45, 2, 1, 5, 2, 90, 2, 2, 1, 3,…
## $ number_of_reviews   <dbl> 9, 45, 0, 270, 9, 74, 49, 118, 160, 53, 188, 27, 1…
## $ last_review         <chr> "10/19/2018", "5/21/2019", NA, "7/5/2019", "11/19/…
## $ reviews_per_month   <dbl> 0.21, 0.38, NA, 4.64, 0.10, 0.59, 0.40, 0.99, 1.33…
## $ floor               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ `noise(dB)`         <dbl> 69.05646, 56.05428, 56.05428, 69.05646, 56.05428, …
## $ Location            <chr> "807, Friel Place, Brooklyn, Kings County, City of…

Feature Engineering: Part 1

#Summary of airbnb
summary(airbnb_original)
##        id               name              host_id           host_name        
##  Min.   :    2539   Length:39118       Min.   :     2438   Length:39118      
##  1st Qu.: 9436041   Class :character   1st Qu.:  7789663   Class :character  
##  Median :19637846   Mode  :character   Median : 30616863   Mode  :character  
##  Mean   :18980697                      Mean   : 67230185                     
##  3rd Qu.:29079859                      3rd Qu.:107270482                     
##  Max.   :36487245                      Max.   :274321313                     
##                                                                              
##  neighbourhood_group neighbourhood         latitude       longitude     
##  Length:39118        Length:39118       Min.   :40.51   Min.   :-74.24  
##  Class :character    Class :character   1st Qu.:40.69   1st Qu.:-73.98  
##  Mode  :character    Mode  :character   Median :40.72   Median :-73.96  
##                                         Mean   :40.73   Mean   :-73.95  
##                                         3rd Qu.:40.76   3rd Qu.:-73.94  
##                                         Max.   :40.91   Max.   :-73.71  
##                                                                         
##   room_type             price         minimum_nights     number_of_reviews
##  Length:39118       Min.   :    0.0   Min.   :   1.000   Min.   :  0.00   
##  Class :character   1st Qu.:   60.0   1st Qu.:   1.000   1st Qu.:  1.00   
##  Mode  :character   Median :   99.0   Median :   2.000   Median :  5.00   
##                     Mean   :  140.4   Mean   :   7.004   Mean   : 23.46   
##                     3rd Qu.:  165.0   3rd Qu.:   5.000   3rd Qu.: 24.00   
##                     Max.   :10000.0   Max.   :1250.000   Max.   :629.00   
##                                                                           
##  last_review        reviews_per_month     floor          noise(dB)    
##  Length:39118       Min.   : 0.010    Min.   : 0.000   Min.   :22.96  
##  Class :character   1st Qu.: 0.190    1st Qu.: 1.000   1st Qu.:56.05  
##  Mode  :character   Median : 0.720    Median : 1.000   Median :62.48  
##                     Mean   : 1.375    Mean   : 1.582   Mean   :62.70  
##                     3rd Qu.: 2.030    3rd Qu.: 1.000   3rd Qu.:69.06  
##                     Max.   :58.500    Max.   :20.000   Max.   :98.06  
##                     NA's   :7978                                      
##    Location        
##  Length:39118      
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
#Convert neighbourhood_group, neighborhood and room_type to factor.
airbnb_original$neighbourhood_group<- as.factor(airbnb_original$neighbourhood_group)
airbnb_original$neighbourhood<- as.factor(airbnb_original$neighbourhood)
airbnb_original$room_type<- as.factor(airbnb_original$room_type)

airbnb_original %>% 
     sapply(levels)
## $id
## NULL
## 
## $name
## NULL
## 
## $host_id
## NULL
## 
## $host_name
## NULL
## 
## $neighbourhood_group
## [1] "Bronx"         "Brooklyn"      "Manhattan"     "Queens"       
## [5] "Staten Island"
## 
## $neighbourhood
##   [1] "Allerton"                   "Arden Heights"             
##   [3] "Arrochar"                   "Arverne"                   
##   [5] "Astoria"                    "Bath Beach"                
##   [7] "Battery Park City"          "Bay Ridge"                 
##   [9] "Bay Terrace"                "Bay Terrace, Staten Island"
##  [11] "Baychester"                 "Bayside"                   
##  [13] "Bayswater"                  "Bedford-Stuyvesant"        
##  [15] "Belle Harbor"               "Bellerose"                 
##  [17] "Belmont"                    "Bensonhurst"               
##  [19] "Bergen Beach"               "Boerum Hill"               
##  [21] "Borough Park"               "Breezy Point"              
##  [23] "Briarwood"                  "Brighton Beach"            
##  [25] "Bronxdale"                  "Brooklyn Heights"          
##  [27] "Brownsville"                "Bull's Head"               
##  [29] "Bushwick"                   "Cambria Heights"           
##  [31] "Canarsie"                   "Carroll Gardens"           
##  [33] "Castle Hill"                "Castleton Corners"         
##  [35] "Chelsea"                    "Chinatown"                 
##  [37] "City Island"                "Civic Center"              
##  [39] "Claremont Village"          "Clason Point"              
##  [41] "Clifton"                    "Clinton Hill"              
##  [43] "Co-op City"                 "Cobble Hill"               
##  [45] "College Point"              "Columbia St"               
##  [47] "Concord"                    "Concourse"                 
##  [49] "Concourse Village"          "Coney Island"              
##  [51] "Corona"                     "Crown Heights"             
##  [53] "Cypress Hills"              "Ditmars Steinway"          
##  [55] "Dongan Hills"               "Douglaston"                
##  [57] "Downtown Brooklyn"          "DUMBO"                     
##  [59] "Dyker Heights"              "East Elmhurst"             
##  [61] "East Flatbush"              "East Harlem"               
##  [63] "East Morrisania"            "East New York"             
##  [65] "East Village"               "Eastchester"               
##  [67] "Edenwald"                   "Edgemere"                  
##  [69] "Elmhurst"                   "Eltingville"               
##  [71] "Emerson Hill"               "Far Rockaway"              
##  [73] "Fieldston"                  "Financial District"        
##  [75] "Flatbush"                   "Flatiron District"         
##  [77] "Flatlands"                  "Flushing"                  
##  [79] "Fordham"                    "Forest Hills"              
##  [81] "Fort Greene"                "Fort Hamilton"             
##  [83] "Fort Wadsworth"             "Fresh Meadows"             
##  [85] "Glendale"                   "Gowanus"                   
##  [87] "Gramercy"                   "Graniteville"              
##  [89] "Grant City"                 "Gravesend"                 
##  [91] "Great Kills"                "Greenpoint"                
##  [93] "Greenwich Village"          "Grymes Hill"               
##  [95] "Harlem"                     "Hell's Kitchen"            
##  [97] "Highbridge"                 "Hollis"                    
##  [99] "Holliswood"                 "Howard Beach"              
## [101] "Howland Hook"               "Huguenot"                  
## [103] "Hunts Point"                "Inwood"                    
## [105] "Jackson Heights"            "Jamaica"                   
## [107] "Jamaica Estates"            "Jamaica Hills"             
## [109] "Kensington"                 "Kew Gardens"               
## [111] "Kew Gardens Hills"          "Kingsbridge"               
## [113] "Kips Bay"                   "Laurelton"                 
## [115] "Lighthouse Hill"            "Little Italy"              
## [117] "Little Neck"                "Long Island City"          
## [119] "Longwood"                   "Lower East Side"           
## [121] "Manhattan Beach"            "Marble Hill"               
## [123] "Mariners Harbor"            "Maspeth"                   
## [125] "Melrose"                    "Middle Village"            
## [127] "Midland Beach"              "Midtown"                   
## [129] "Midwood"                    "Mill Basin"                
## [131] "Morningside Heights"        "Morris Heights"            
## [133] "Morris Park"                "Morrisania"                
## [135] "Mott Haven"                 "Mount Eden"                
## [137] "Mount Hope"                 "Murray Hill"               
## [139] "Navy Yard"                  "Neponsit"                  
## [141] "New Brighton"               "New Dorp"                  
## [143] "New Dorp Beach"             "New Springville"           
## [145] "NoHo"                       "Nolita"                    
## [147] "North Riverdale"            "Norwood"                   
## [149] "Oakwood"                    "Olinville"                 
## [151] "Ozone Park"                 "Park Slope"                
## [153] "Parkchester"                "Pelham Bay"                
## [155] "Pelham Gardens"             "Port Morris"               
## [157] "Port Richmond"              "Prince's Bay"              
## [159] "Prospect-Lefferts Gardens"  "Prospect Heights"          
## [161] "Queens Village"             "Randall Manor"             
## [163] "Red Hook"                   "Rego Park"                 
## [165] "Richmond Hill"              "Richmondtown"              
## [167] "Ridgewood"                  "Riverdale"                 
## [169] "Rockaway Beach"             "Roosevelt Island"          
## [171] "Rosebank"                   "Rosedale"                  
## [173] "Rossville"                  "Schuylerville"             
## [175] "Sea Gate"                   "Sheepshead Bay"            
## [177] "Shore Acres"                "Silver Lake"               
## [179] "SoHo"                       "Soundview"                 
## [181] "South Beach"                "South Ozone Park"          
## [183] "South Slope"                "Springfield Gardens"       
## [185] "Spuyten Duyvil"             "St. Albans"                
## [187] "St. George"                 "Stapleton"                 
## [189] "Stuyvesant Town"            "Sunnyside"                 
## [191] "Sunset Park"                "Theater District"          
## [193] "Throgs Neck"                "Todt Hill"                 
## [195] "Tompkinsville"              "Tottenville"               
## [197] "Tremont"                    "Tribeca"                   
## [199] "Two Bridges"                "Unionport"                 
## [201] "University Heights"         "Upper East Side"           
## [203] "Upper West Side"            "Van Nest"                  
## [205] "Vinegar Hill"               "Wakefield"                 
## [207] "Washington Heights"         "West Brighton"             
## [209] "West Farms"                 "West Village"              
## [211] "Westchester Square"         "Westerleigh"               
## [213] "Whitestone"                 "Williamsbridge"            
## [215] "Williamsburg"               "Willowbrook"               
## [217] "Windsor Terrace"            "Woodhaven"                 
## [219] "Woodlawn"                   "Woodside"                  
## 
## $latitude
## NULL
## 
## $longitude
## NULL
## 
## $room_type
## [1] "Entire home/apt" "Private room"    "Shared room"    
## 
## $price
## NULL
## 
## $minimum_nights
## NULL
## 
## $number_of_reviews
## NULL
## 
## $last_review
## NULL
## 
## $reviews_per_month
## NULL
## 
## $floor
## NULL
## 
## $`noise(dB)`
## NULL
## 
## $Location
## NULL
#Create new numeric variables from the factor variables neighbourhood_group, neighborhood and room_type to factor.
airbnb_original <- airbnb_original %>%
  mutate(neighbourhood_group_numeric=ifelse(neighbourhood_group == "Bronx", 1, ifelse(neighbourhood_group == "Brooklyn",2,ifelse(neighbourhood_group == "Manhattan",3,ifelse(neighbourhood_group == "Queens",4,5)))))

airbnb_original <- airbnb_original %>%
  mutate(room_type_numeric=ifelse(room_type == "Entire home/apt",1,ifelse(room_type == "Private room",2,3)))

airbnb_original <- airbnb_original %>%
  mutate(neighbourhood_numeric=as.numeric(neighbourhood))

#Convert last_review date to a date format.
airbnb_original <- airbnb_original %>%
  mutate(last_review = ifelse(is.na(last_review),"01/01/2000",last_review))%>%
  mutate(last_review = mdy(last_review))

#Replacing NA values for reviews_per_month with 0.
airbnb_original <- airbnb_original %>%
  mutate(reviews_per_month = ifelse(is.na(reviews_per_month), 0, reviews_per_month))

Feature Engineering: Part 2

#Drop columns that will not be useful in the price prediction
#name: The name of the host cannot have any impact on the price prediction
#host_id: It is just an id assigned to a host. It does not have any impact on the price
#last_review: The last review column is being dropped as it may not have an impact on price. 
#Location: It is the address of the place and since we are using latitude and longitude, I feel this is not needed to predict the price.
#View(airbnb_original)

airbnb <- airbnb_original %>%
  select(-name, -host_id, -host_name, -last_review, -Location)

Feature Engineering Part 3

#summary of airbnb variables
summary(airbnb)
##        id              neighbourhood_group            neighbourhood  
##  Min.   :    2539   Bronx        :  873    Williamsburg      : 3146  
##  1st Qu.: 9436041   Brooklyn     :16084    Bedford-Stuyvesant: 2987  
##  Median :19637846   Manhattan    :17329    Harlem            : 2140  
##  Mean   :18980697   Queens       : 4533    Bushwick          : 1980  
##  3rd Qu.:29079859   Staten Island:  299    Hell's Kitchen    : 1575  
##  Max.   :36487245                          Upper West Side   : 1539  
##                                            (Other)           :25751  
##     latitude       longitude                room_type         price        
##  Min.   :40.51   Min.   :-74.24   Entire home/apt:20276   Min.   :    0.0  
##  1st Qu.:40.69   1st Qu.:-73.98   Private room   :17912   1st Qu.:   60.0  
##  Median :40.72   Median :-73.96   Shared room    :  930   Median :   99.0  
##  Mean   :40.73   Mean   :-73.95                           Mean   :  140.4  
##  3rd Qu.:40.76   3rd Qu.:-73.94                           3rd Qu.:  165.0  
##  Max.   :40.91   Max.   :-73.71                           Max.   :10000.0  
##                                                                            
##  minimum_nights     number_of_reviews reviews_per_month     floor       
##  Min.   :   1.000   Min.   :  0.00    Min.   : 0.000    Min.   : 0.000  
##  1st Qu.:   1.000   1st Qu.:  1.00    1st Qu.: 0.040    1st Qu.: 1.000  
##  Median :   2.000   Median :  5.00    Median : 0.380    Median : 1.000  
##  Mean   :   7.004   Mean   : 23.46    Mean   : 1.095    Mean   : 1.582  
##  3rd Qu.:   5.000   3rd Qu.: 24.00    3rd Qu.: 1.600    3rd Qu.: 1.000  
##  Max.   :1250.000   Max.   :629.00    Max.   :58.500    Max.   :20.000  
##                                                                         
##    noise(dB)     neighbourhood_group_numeric room_type_numeric
##  Min.   :22.96   Min.   :1.000               Min.   :1.000    
##  1st Qu.:56.05   1st Qu.:2.000               1st Qu.:1.000    
##  Median :62.48   Median :3.000               Median :1.000    
##  Mean   :62.70   Mean   :2.675               Mean   :1.505    
##  3rd Qu.:69.06   3rd Qu.:3.000               3rd Qu.:2.000    
##  Max.   :98.06   Max.   :5.000               Max.   :3.000    
##                                                               
##  neighbourhood_numeric
##  Min.   :  1.0        
##  1st Qu.: 52.0        
##  Median : 95.0        
##  Mean   :107.8        
##  3rd Qu.:179.0        
##  Max.   :220.0        
## 
#Checking for and removing outliers
airbnb %>%
  select_if(is.numeric) %>%
  select(-id) %>%
  gather(yval, val, -price) %>%
  ggplot(aes(price, val)) +
  geom_point() +
  facet_grid(yval~.)

airbnb <- subset(airbnb, airbnb$price > 0 & airbnb$price < 7500)

airbnb %>%
  ggplot(aes(price,number_of_reviews))+
  geom_point()

airbnb %>%
  ggplot(aes(price,reviews_per_month))+
  geom_point()

airbnb <- subset(airbnb, airbnb$reviews_per_month < 30)

airbnb %>%
  ggplot(aes(price,minimum_nights))+
  geom_point()

airbnb <- subset(airbnb, airbnb$minimum_nights < 600)

Feature Engineering Part 4

#Create Plots to check the relationship between Price and the other variables and apply transformations where necessary
airbnb %>%
  ggplot(aes(x=price)) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

airbnb %>%
  ggplot(aes(x=log(price))) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

airbnb%>%
  ggplot(aes(x=minimum_nights)) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

model1 <- lm(data=airbnb, price ~ minimum_nights )
get_regression_table(model1)
## # A tibble: 2 × 7
##   term           estimate std_error statistic p_value lower_ci upper_ci
##   <chr>             <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept       135.        1.03      131.        0  133.     137.   
## 2 minimum_nights    0.545     0.054      10.1       0    0.439    0.651
points<-get_regression_points(model1)
ggplot(data=points, aes(x=residual)) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=points, aes(x=price_hat, y=residual)) + geom_point()

#Transforming the price and minimum nights to log form as they are right skewed.
airbnb<-airbnb %>%
  mutate(lnprice=log(price), lnmin_nights=log(minimum_nights))

ggplot(data=airbnb, aes(x=lnprice)) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=airbnb, aes(x=lnmin_nights)) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

head(airbnb)
## # A tibble: 6 × 17
##      id neighbourhood_group neighbourhood latitude longitude room_type     price
##   <dbl> <fct>               <fct>            <dbl>     <dbl> <fct>         <dbl>
## 1  2539 Brooklyn            Kensington        40.6     -74.0 Private room    149
## 2  2595 Manhattan           Midtown           40.8     -74.0 Entire home/…   225
## 3  3647 Manhattan           Harlem            40.8     -73.9 Private room     60
## 4  3831 Brooklyn            Clinton Hill      40.7     -74.0 Entire home/…    45
## 5  5022 Manhattan           East Harlem       40.8     -73.9 Entire home/…    80
## 6  5099 Manhattan           Murray Hill       40.7     -74.0 Entire home/…   200
## # ℹ 10 more variables: minimum_nights <dbl>, number_of_reviews <dbl>,
## #   reviews_per_month <dbl>, floor <dbl>, `noise(dB)` <dbl>,
## #   neighbourhood_group_numeric <dbl>, room_type_numeric <dbl>,
## #   neighbourhood_numeric <dbl>, lnprice <dbl>, lnmin_nights <dbl>
model1 <- lm(data=airbnb, lnprice ~ lnmin_nights)

get_regression_table(model1)
## # A tibble: 2 × 7
##   term         estimate std_error statistic p_value lower_ci upper_ci
##   <chr>           <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept       4.56      0.006     826.        0    4.55     4.57 
## 2 lnmin_nights    0.044     0.004      12.5       0    0.037    0.051
points<-get_regression_points(model1)
ggplot(data=points, aes(x=residual)) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=points, aes(x=lnprice_hat, y=residual)) + geom_point()

#Create some tables and some boxplots for the factor variables
table(airbnb$neighbourhood_group)
## 
##         Bronx      Brooklyn     Manhattan        Queens Staten Island 
##           872         16074         17318          4532           299
table(airbnb$neighbourhood)
## 
##                   Allerton              Arden Heights 
##                         33                          4 
##                   Arrochar                    Arverne 
##                         15                         68 
##                    Astoria                 Bath Beach 
##                        717                         16 
##          Battery Park City                  Bay Ridge 
##                         52                        114 
##                Bay Terrace Bay Terrace, Staten Island 
##                          5                          1 
##                 Baychester                    Bayside 
##                          7                         31 
##                  Bayswater         Bedford-Stuyvesant 
##                         12                       2984 
##               Belle Harbor                  Bellerose 
##                          6                         12 
##                    Belmont                Bensonhurst 
##                         21                         61 
##               Bergen Beach                Boerum Hill 
##                          6                        147 
##               Borough Park               Breezy Point 
##                        105                          2 
##                  Briarwood             Brighton Beach 
##                         50                         63 
##                  Bronxdale           Brooklyn Heights 
##                         13                        126 
##                Brownsville                Bull's Head 
##                         53                          4 
##                   Bushwick            Cambria Heights 
##                       1978                         23 
##                   Canarsie            Carroll Gardens 
##                        114                        184 
##                Castle Hill          Castleton Corners 
##                          8                          4 
##                    Chelsea                  Chinatown 
##                        884                        294 
##                City Island               Civic Center 
##                         14                         40 
##          Claremont Village               Clason Point 
##                         19                         18 
##                    Clifton               Clinton Hill 
##                         13                        475 
##                 Co-op City                Cobble Hill 
##                          1                         77 
##              College Point                Columbia St 
##                         16                         34 
##                    Concord                  Concourse 
##                         20                         39 
##          Concourse Village               Coney Island 
##                         28                         13 
##                     Corona              Crown Heights 
##                         47                       1253 
##              Cypress Hills           Ditmars Steinway 
##                        103                        245 
##               Dongan Hills                 Douglaston 
##                          6                          8 
##          Downtown Brooklyn                      DUMBO 
##                         61                         26 
##              Dyker Heights              East Elmhurst 
##                         10                        153 
##              East Flatbush                East Harlem 
##                        401                        912 
##            East Morrisania              East New York 
##                          8                        166 
##               East Village                Eastchester 
##                       1506                         12 
##                   Edenwald                   Edgemere 
##                          8                          9 
##                   Elmhurst                Eltingville 
##                        192                          3 
##               Emerson Hill               Far Rockaway 
##                          1                         25 
##                  Fieldston         Financial District 
##                         10                        594 
##                   Flatbush          Flatiron District 
##                        500                         63 
##                  Flatlands                   Flushing 
##                         63                        320 
##                    Fordham               Forest Hills 
##                         49                        115 
##                Fort Greene              Fort Hamilton 
##                        377                         40 
##             Fort Wadsworth              Fresh Meadows 
##                          1                         24 
##                   Glendale                    Gowanus 
##                         43                        194 
##                   Gramercy               Graniteville 
##                        291                          3 
##                 Grant City                  Gravesend 
##                          5                         55 
##                Great Kills                 Greenpoint 
##                          9                        870 
##          Greenwich Village                Grymes Hill 
##                        308                          5 
##                     Harlem             Hell's Kitchen 
##                       2140                       1575 
##                 Highbridge                     Hollis 
##                         23                         10 
##                 Holliswood               Howard Beach 
##                          3                         16 
##               Howland Hook                   Huguenot 
##                          2                          2 
##                Hunts Point                     Inwood 
##                         14                        204 
##            Jackson Heights                    Jamaica 
##                        157                        185 
##            Jamaica Estates              Jamaica Hills 
##                         14                          7 
##                 Kensington                Kew Gardens 
##                        147                         25 
##          Kew Gardens Hills                Kingsbridge 
##                         22                         57 
##                   Kips Bay                  Laurelton 
##                        371                         12 
##            Lighthouse Hill               Little Italy 
##                          1                         95 
##                Little Neck           Long Island City 
##                          3                        429 
##                   Longwood            Lower East Side 
##                         52                        736 
##            Manhattan Beach                Marble Hill 
##                          8                         10 
##            Mariners Harbor                    Maspeth 
##                          7                         95 
##                    Melrose             Middle Village 
##                          9                         20 
##              Midland Beach                    Midtown 
##                          6                       1225 
##                    Midwood                 Mill Basin 
##                         90                          3 
##        Morningside Heights             Morris Heights 
##                        273                         14 
##                Morris Park                 Morrisania 
##                         10                         12 
##                 Mott Haven                 Mount Eden 
##                         41                          5 
##                 Mount Hope                Murray Hill 
##                         15                        394 
##                  Navy Yard                   Neponsit 
##                         12                          1 
##               New Brighton                   New Dorp 
##                          3                          1 
##             New Dorp Beach            New Springville 
##                          5                          7 
##                       NoHo                     Nolita 
##                         60                        187 
##            North Riverdale                    Norwood 
##                          7                         27 
##                    Oakwood                  Olinville 
##                          4                          3 
##                 Ozone Park                 Park Slope 
##                         53                        420 
##                Parkchester                 Pelham Bay 
##                         28                         13 
##             Pelham Gardens                Port Morris 
##                         25                         42 
##              Port Richmond               Prince's Bay 
##                          6                          3 
##  Prospect-Lefferts Gardens           Prospect Heights 
##                        432                        289 
##             Queens Village              Randall Manor 
##                         52                         17 
##                   Red Hook                  Rego Park 
##                         65                         90 
##              Richmond Hill               Richmondtown 
##                         71                          1 
##                  Ridgewood                  Riverdale 
##                        338                          8 
##             Rockaway Beach           Roosevelt Island 
##                         47                         68 
##                   Rosebank                   Rosedale 
##                          6                         48 
##                  Rossville              Schuylerville 
##                          1                         10 
##                   Sea Gate             Sheepshead Bay 
##                          7                        124 
##                Shore Acres                Silver Lake 
##                          5                          1 
##                       SoHo                  Soundview 
##                        290                         13 
##                South Beach           South Ozone Park 
##                          5                         36 
##                South Slope        Springfield Gardens 
##                        229                         68 
##             Spuyten Duyvil                 St. Albans 
##                          4                         59 
##                 St. George                  Stapleton 
##                         40                         21 
##            Stuyvesant Town                  Sunnyside 
##                         34                        290 
##                Sunset Park           Theater District 
##                        288                        227 
##                Throgs Neck                  Todt Hill 
##                         21                          3 
##              Tompkinsville                Tottenville 
##                         36                          4 
##                    Tremont                    Tribeca 
##                          8                        139 
##                Two Bridges                  Unionport 
##                         57                          6 
##         University Heights            Upper East Side 
##                         19                       1434 
##            Upper West Side                   Van Nest 
##                       1538                          8 
##               Vinegar Hill                  Wakefield 
##                         27                         39 
##         Washington Heights              West Brighton 
##                        718                         15 
##                 West Farms               West Village 
##                          1                        599 
##         Westchester Square                Westerleigh 
##                          9                          2 
##                 Whitestone             Williamsbridge 
##                         11                         33 
##               Williamsburg                Willowbrook 
##                       3144                          1 
##            Windsor Terrace                  Woodhaven 
##                        120                         72 
##                   Woodlawn                   Woodside 
##                          8                        175
table(airbnb$room_type)
## 
## Entire home/apt    Private room     Shared room 
##           20266           17901             928
ggplot(data=airbnb, aes(y=lnprice,fill= neighbourhood_group)) + geom_boxplot()

ggplot(data=airbnb, aes(y=lnprice,fill=room_type)) + geom_boxplot()

ggplot(data=airbnb, aes(y=lnprice,fill=neighbourhood)) + geom_boxplot()

#Creating scatterplots for the numeric variables
ggplot(data=airbnb, aes(x=lnmin_nights, y=lnprice)) + geom_point()

ggplot(data=airbnb, aes(x=longitude, y=lnprice)) + geom_point()

ggplot(data=airbnb, aes(x=latitude, y=lnprice)) + geom_point()

ggplot(data=airbnb, aes(x=floor, y=lnprice)) + geom_point()

ggplot(data=airbnb, aes(x=`noise(dB)`, y=lnprice)) + geom_point()

ggplot(data=airbnb, aes(x=number_of_reviews, y=lnprice)) + geom_point()

ggplot(data=airbnb, aes(x=reviews_per_month, y=lnprice)) + geom_point()

#Running the lnprice vs number_of_reviews resulted in a cone shaped graph and so plotting against log(number_of_reviews)
ggplot(data=airbnb, aes(x=log(number_of_reviews), y=lnprice)) + geom_point()

#Running the lnprice vs reviews_per_month resulted in a cone shaped graph and so plotting against log(reviews_per_month)
ggplot(data=airbnb, aes(x=log(reviews_per_month), y=lnprice)) + geom_point()

Feature Engineering Part 5

#Create a numeric dataframe for models that require numeric only or perform better with numeric variables
airbnb_numeric <- airbnb %>%
  select(-neighbourhood_group,-neighbourhood,-room_type)

Checking to see how a model works with and without the log transformed variables

model0<-lm(price~neighbourhood_group + neighbourhood+ latitude + longitude + room_type + number_of_reviews + reviews_per_month + floor + `noise(dB)` + minimum_nights,airbnb)

get_regression_summaries(model0)
## # A tibble: 1 × 9
##   r_squared adj_r_squared   mse  rmse sigma statistic p_value    df  nobs
##       <dbl>         <dbl> <dbl> <dbl> <dbl>     <dbl>   <dbl> <dbl> <dbl>
## 1     0.744         0.742 9231.  96.1  96.4      495.       0   228 39095
get_regression_table(model0)
## # A tibble: 233 × 7
##    term                   estimate std_error statistic p_value lower_ci upper_ci
##    <chr>                     <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
##  1 intercept               -3.82e4   6318.      -6.05    0     -50588.   -2.58e4
##  2 neighbourhood_group: …   2.24e3     28.7     78.1     0       2188.    2.30e3
##  3 neighbourhood_group: …   3.35e3     34.3     97.8     0       3286.    3.42e3
##  4 neighbourhood_group: …   1.49e3     32.6     45.7     0       1426.    1.55e3
##  5 neighbourhood_group: …   3.95e3    108.      36.7     0       3735.    4.16e3
##  6 neighbourhood: Arden …  -2.73e2    108.      -2.53    0.011   -485.   -6.17e1
##  7 neighbourhood: Arroch…  -1.39e2     99.6     -1.39    0.164   -334.    5.66e1
##  8 neighbourhood: Arverne   1.17e2     21.0      5.57    0         75.7   1.58e2
##  9 neighbourhood: Astoria   1.04e0      8.38     0.124   0.901    -15.4   1.75e1
## 10 neighbourhood: Bath B…  -4.19e1     26.1     -1.61    0.108    -93.1   9.24e0
## # ℹ 223 more rows
points<-get_regression_points(model0)
ggplot(data=points, aes(x=residual)) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=points, aes(x=price_hat, y=residual)) + geom_point()

points %>%
  ggplot(aes(sample = residual)) +
  stat_qq() +
  stat_qq_line()

model1 <- lm(data=airbnb, lnprice ~ neighbourhood_group +neighbourhood+ latitude + longitude + room_type + number_of_reviews + reviews_per_month + floor + `noise(dB)` + lnmin_nights )
get_regression_table(model1)
## # A tibble: 233 × 7
##    term                   estimate std_error statistic p_value lower_ci upper_ci
##    <chr>                     <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
##  1 intercept              -205.       30.8       -6.68   0     -266.    -145.   
##  2 neighbourhood_group: …    3.62      0.14      25.8    0        3.34     3.89 
##  3 neighbourhood_group: …    7.87      0.167     47.1    0        7.54     8.20 
##  4 neighbourhood_group: …    5.90      0.159     37.2    0        5.59     6.22 
##  5 neighbourhood_group: …   10.1       0.524     19.3    0        9.08    11.1  
##  6 neighbourhood: Arden …   -1.71      0.525     -3.25   0.001   -2.74    -0.677
##  7 neighbourhood: Arroch…   -1.01      0.485     -2.08   0.038   -1.96    -0.058
##  8 neighbourhood: Arverne    0.505     0.102      4.93   0        0.304    0.705
##  9 neighbourhood: Astoria    0.075     0.041      1.84   0.066   -0.005    0.155
## 10 neighbourhood: Bath B…   -0.413     0.127     -3.25   0.001   -0.662   -0.163
## # ℹ 223 more rows
get_regression_summaries(model1)
## # A tibble: 1 × 9
##   r_squared adj_r_squared   mse  rmse sigma statistic p_value    df  nobs
##       <dbl>         <dbl> <dbl> <dbl> <dbl>     <dbl>   <dbl> <dbl> <dbl>
## 1     0.611         0.609 0.219 0.468 0.469      268.       0   228 39095
points<-get_regression_points(model1)
points
## # A tibble: 39,095 × 14
##       ID lnprice neighbourhood_group neighbourhood  latitude longitude room_type
##    <int>   <dbl> <fct>               <fct>             <dbl>     <dbl> <fct>    
##  1     1    5.00 Brooklyn            Kensington         40.6     -74.0 Private …
##  2     2    5.42 Manhattan           Midtown            40.8     -74.0 Entire h…
##  3     3    4.09 Manhattan           Harlem             40.8     -73.9 Private …
##  4     4    3.81 Brooklyn            Clinton Hill       40.7     -74.0 Entire h…
##  5     5    4.38 Manhattan           East Harlem        40.8     -73.9 Entire h…
##  6     6    5.30 Manhattan           Murray Hill        40.7     -74.0 Entire h…
##  7     7    4.09 Brooklyn            Bedford-Stuyv…     40.7     -74.0 Private …
##  8     8    3.47 Manhattan           Upper West Si…     40.8     -74.0 Private …
##  9     9    5.01 Manhattan           Chinatown          40.7     -74.0 Entire h…
## 10    10    3.99 Manhattan           Upper West Si…     40.8     -74.0 Entire h…
## # ℹ 39,085 more rows
## # ℹ 7 more variables: number_of_reviews <dbl>, reviews_per_month <dbl>,
## #   floor <dbl>, `noise(dB)` <dbl>, lnmin_nights <dbl>, lnprice_hat <dbl>,
## #   residual <dbl>
ggplot(data=points, aes(x=residual)) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=points, aes(x=lnprice_hat, y=residual)) + geom_point()

points %>%
  ggplot(aes(sample = residual)) +
  stat_qq() +
  stat_qq_line()

Visualize Correlation Matrix

glimpse(airbnb_numeric)
## Rows: 39,095
## Columns: 14
## $ id                          <dbl> 2539, 2595, 3647, 3831, 5022, 5099, 5121, …
## $ latitude                    <dbl> 40.64749, 40.75362, 40.80902, 40.68514, 40…
## $ longitude                   <dbl> -73.97237, -73.98377, -73.94190, -73.95976…
## $ price                       <dbl> 149, 225, 60, 45, 80, 200, 60, 32, 150, 54…
## $ minimum_nights              <dbl> 1, 1, 3, 1, 10, 3, 45, 2, 1, 5, 2, 90, 2, …
## $ number_of_reviews           <dbl> 9, 45, 0, 270, 9, 74, 49, 118, 160, 53, 18…
## $ reviews_per_month           <dbl> 0.21, 0.38, 0.00, 4.64, 0.10, 0.59, 0.40, …
## $ floor                       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `noise(dB)`                 <dbl> 69.05646, 56.05428, 56.05428, 69.05646, 56…
## $ neighbourhood_group_numeric <dbl> 2, 3, 3, 2, 3, 3, 2, 3, 3, 3, 3, 3, 2, 2, …
## $ room_type_numeric           <dbl> 2, 1, 2, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, …
## $ neighbourhood_numeric       <dbl> 109, 128, 95, 42, 62, 138, 14, 203, 36, 20…
## $ lnprice                     <dbl> 5.003946, 5.416100, 4.094345, 3.806662, 4.…
## $ lnmin_nights                <dbl> 0.0000000, 0.0000000, 1.0986123, 0.0000000…
corrplot(cor(airbnb_numeric[,-1]))

Modeling

Split the dataset into test and train.

#make this example reproducible
set.seed(1)

#use 70% of dataset as training set and 30% as test set
sample <- sample(c(TRUE, FALSE), nrow(airbnb), replace=TRUE, prob=c(0.7,0.3))
airbnb_train  <- airbnb[sample, ]
airbnb_test   <- airbnb[!sample, ]

#The following R code sets all observations in our test data set to NA that contain the additional level that didn’t exist in our train data:
airbnb_test_new <- airbnb_test                                # Duplicate test data set
airbnb_test_new$neighbourhood[which(!(airbnb_test_new$neighbourhood %in% unique(airbnb_train$neighbourhood)))] <- NA  # Replace new levels by NA
airbnb_test_new 
## # A tibble: 11,708 × 17
##       id neighbourhood_group neighbourhood    latitude longitude room_type price
##    <dbl> <fct>               <fct>               <dbl>     <dbl> <fct>     <dbl>
##  1  3831 Brooklyn            Clinton Hill         40.7     -74.0 Entire h…    45
##  2  5099 Manhattan           Murray Hill          40.7     -74.0 Entire h…   200
##  3  5121 Brooklyn            Bedford-Stuyves…     40.7     -74.0 Private …    60
##  4  7322 Manhattan           Chelsea              40.7     -74.0 Private …   140
##  5  7750 Manhattan           East Harlem          40.8     -73.9 Entire h…   190
##  6  7801 Brooklyn            Williamsburg         40.7     -74.0 Entire h…   299
##  7  8025 Brooklyn            Park Slope           40.7     -74.0 Private …    80
##  8  8110 Brooklyn            Park Slope           40.7     -74.0 Private …   110
##  9 11708 Brooklyn            Bushwick             40.7     -73.9 Entire h…    43
## 10 12318 Manhattan           Upper West Side      40.8     -74.0 Private …   135
## # ℹ 11,698 more rows
## # ℹ 10 more variables: minimum_nights <dbl>, number_of_reviews <dbl>,
## #   reviews_per_month <dbl>, floor <dbl>, `noise(dB)` <dbl>,
## #   neighbourhood_group_numeric <dbl>, room_type_numeric <dbl>,
## #   neighbourhood_numeric <dbl>, lnprice <dbl>, lnmin_nights <dbl>
#use 70% of dataset as training set and 30% as test set
sample_numeric <- sample(c(TRUE, FALSE), nrow(airbnb_numeric), replace=TRUE, prob=c(0.7,0.3))
airbnb_train_numeric  <- airbnb_numeric[sample_numeric, ]
airbnb_test_numeric   <- airbnb_numeric[!sample_numeric, ]

#Get the train and test ID
train_id <- data.frame(ID = airbnb_train_numeric$id)
test_id  <- data.frame(ID = airbnb_test_numeric$id)

#Dropping the Id columns
airbnb_train_numeric <- airbnb_train_numeric[,-1]
airbnb_test_numeric  <- airbnb_test_numeric[,-1]

#Creating X train and test datasets and Y train and test datasets for the random forest model
X_train <- airbnb_train_numeric[, -c(3,12,13)]#Drop the dependent variables which are price and lnprice and create the X_train dataframe.
Y_train <- airbnb_train_numeric[, 3]#Select the price and create the Y_train dataframe.

X_test <- airbnb_test_numeric[, -c(3,12,13)]#Drop the dependent variables which are price and lnprice and create the X_test dataframe.

Y_test <- airbnb_test_numeric[, 3]#Select the price and create the Y_test dataframe.

#The following steps are specifically for the xgboost model
#Change target variable as a numeric vector and the rest of train dataset into matrix form.
train_mat<-as.matrix(X_train)
mode(train_mat)<-'double' #coerce categorical variables to change to numeric.

#Change target variable as a numeric vector and the rest of test dataset into matrix form.
test_mat<-as.matrix(X_test)
mode(test_mat)<-'double' #coerce categorical variables to change to numeric.

Model 1: OLS Linear Regresssion

model.ols<-lm(price ~ neighbourhood_group + neighbourhood+latitude + longitude + room_type + number_of_reviews + floor + `noise(dB)` + minimum_nights +  room_type*neighbourhood_group+ floor*neighbourhood_group + floor*room_type+  neighbourhood*floor+ room_type*neighbourhood+ neighbourhood_group*neighbourhood,airbnb_train)


get_regression_summaries(model.ols)
## # A tibble: 1 × 9
##   r_squared adj_r_squared   mse  rmse sigma statistic p_value    df  nobs
##       <dbl>         <dbl> <dbl> <dbl> <dbl>     <dbl>   <dbl> <dbl> <dbl>
## 1     0.861         0.858 5107.  71.5  72.2      297.       0   559 27387
get_regression_table(model.ols)
## # A tibble: 1,739 × 7
##    term                   estimate std_error statistic p_value lower_ci upper_ci
##    <chr>                     <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
##  1 intercept              -29933.     5768.     -5.19    0     -41238.   -1.86e4
##  2 neighbourhood_group: …  -3654.      512.     -7.14    0      -4657.   -2.65e3
##  3 neighbourhood_group: …   -987.      512.     -1.93    0.054  -1991.    1.68e1
##  4 neighbourhood_group: …  15251.      635.     24       0      14005.    1.65e4
##  5 neighbourhood_group: …  -2831.      518.     -5.47    0      -3846.   -1.82e3
##  6 neighbourhood: Arden …    -49.0      83.6    -0.586   0.558   -213.    1.15e2
##  7 neighbourhood: Arroch…    357.       83.5     4.27    0        193.    5.20e2
##  8 neighbourhood: Arverne    149.       24.2     6.18    0        102.    1.97e2
##  9 neighbourhood: Astoria    -25.3      15.1    -1.67    0.095    -54.9   4.36e0
## 10 neighbourhood: Bath B…    -58.8      37.5    -1.57    0.117   -132.    1.47e1
## # ℹ 1,729 more rows
points<-get_regression_points(model.ols)
ggplot(data=points, aes(x=residual)) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=points, aes(x=price_hat, y=residual)) + geom_point()

points %>%
  ggplot(aes(sample = residual)) +
  stat_qq() +
  stat_qq_line()

airbnb.predictions <- predict(model.ols,airbnb_test_new)
## Warning in predict.lm(model.ols, airbnb_test_new): prediction from a
## rank-deficient fit may be misleading
#airbnb.predictions

#result_test <- data.frame(ID = airbnb_test_new$id,
                           #price = exp(airbnb_test_new$lnprice),
                    #predictions = exp(airbnb.predictions))

result_test <- data.frame(ID = airbnb_test_new$id,
                          price = airbnb_test_new$price,
                          predictions = airbnb.predictions)
write.csv(result_test,file = "C:/Users/anish/Documents/Data Science Capstone/predicitions_airbnb_ols.csv")

Model 2 AIC

null <- lm(price~1, data = airbnb_train)
full <- lm(price ~ neighbourhood_group +neighbourhood+ latitude + longitude + room_type + number_of_reviews + reviews_per_month + floor + `noise(dB)` + minimum_nights, data = airbnb_train)

step(null, scope =list(lower=null, upper= full), direction = "both")
## Start:  AIC=287838.9
## price ~ 1
## 
##                        Df Sum of Sq        RSS    AIC
## + room_type             2  87348345  917202210 285352
## + neighbourhood       214  95950592  908599963 285517
## + neighbourhood_group   4  30276617  974273938 287009
## + longitude             1  26455034  978095521 287110
## + `noise(dB)`           1   7122022  997428533 287646
## + floor                 1   5633304  998917251 287687
## + minimum_nights        1   2948164 1001602392 287760
## + number_of_reviews     1   2547252 1002003303 287771
## + reviews_per_month     1   2455346 1002095209 287774
## + latitude              1    662152 1003888403 287823
## <none>                              1004550555 287839
## 
## Step:  AIC=285351.6
## price ~ room_type
## 
##                        Df Sum of Sq        RSS    AIC
## + neighbourhood       214  60444170  856758040 283913
## + neighbourhood_group   4  15834239  901367971 284883
## + floor                 1  11864840  905337370 284997
## + longitude             1  11500745  905701465 285008
## + `noise(dB)`           1   2368942  914833268 285283
## + number_of_reviews     1   2259484  914942726 285286
## + reviews_per_month     1   1625635  915576576 285305
## + minimum_nights        1    937386  916264824 285326
## + latitude              1    785425  916416786 285330
## <none>                               917202210 285352
## - room_type             2  87348345 1004550555 287839
## 
## Step:  AIC=283912.5
## price ~ room_type + neighbourhood
## 
##                      Df Sum of Sq       RSS    AIC
## + floor               1 589207446 267550594 252040
## + `noise(dB)`         1 495617717 361140323 260255
## + number_of_reviews   1   1152356 855605684 283878
## + reviews_per_month   1    675817 856082223 283893
## + longitude           1    250115 856507925 283907
## + minimum_nights      1    185051 856572989 283909
## <none>                            856758040 283913
## + latitude            1     17326 856740714 283914
## - neighbourhood     214  60444170 917202210 285352
## - room_type           2  51841923 908599963 285517
## 
## Step:  AIC=252040.2
## price ~ room_type + neighbourhood + floor
## 
##                      Df Sum of Sq       RSS    AIC
## + `noise(dB)`         1  20039722 247510872 249910
## + number_of_reviews   1    324210 267226383 252009
## + longitude           1    239666 267310927 252018
## + reviews_per_month   1     80382 267470212 252034
## <none>                            267550594 252040
## + minimum_nights      1     10749 267539845 252041
## + latitude            1      9684 267540909 252041
## - room_type           2  39700752 307251345 255825
## - floor               1 589207446 856758040 283913
## - neighbourhood     214 637786776 905337370 284997
## 
## Step:  AIC=249910
## price ~ room_type + neighbourhood + floor + `noise(dB)`
## 
##                      Df Sum of Sq       RSS    AIC
## + number_of_reviews   1    230697 247280175 249887
## + longitude           1    143278 247367593 249896
## + reviews_per_month   1     79698 247431174 249903
## <none>                            247510872 249910
## + latitude            1      5773 247505099 249911
## + minimum_nights      1      3322 247507549 249912
## - `noise(dB)`         1  20039722 267550594 252040
## - room_type           2  36315517 283826389 253656
## - floor               1 113629451 361140323 260255
## - neighbourhood     214 653789342 901300214 284877
## 
## Step:  AIC=249886.5
## price ~ room_type + neighbourhood + floor + `noise(dB)` + number_of_reviews
## 
##                      Df Sum of Sq       RSS    AIC
## + longitude           1    136744 247143431 249873
## <none>                            247280175 249887
## + minimum_nights      1      9008 247271168 249888
## + latitude            1      4170 247276006 249888
## + reviews_per_month   1        29 247280146 249889
## - number_of_reviews   1    230697 247510872 249910
## - `noise(dB)`         1  19946208 267226383 252009
## - room_type           2  36377007 283657183 253641
## - floor               1 113667551 360947727 260243
## - neighbourhood     214 651486509 898766684 284801
## 
## Step:  AIC=249873.4
## price ~ room_type + neighbourhood + floor + `noise(dB)` + number_of_reviews + 
##     longitude
## 
##                      Df Sum of Sq       RSS    AIC
## <none>                            247143431 249873
## + minimum_nights      1      9285 247134146 249874
## + latitude            1      7195 247136236 249875
## + reviews_per_month   1       354 247143077 249875
## - longitude           1    136744 247280175 249887
## - number_of_reviews   1    224163 247367593 249896
## - `noise(dB)`         1  19853568 266996999 251988
## - room_type           2  36163617 283307048 253609
## - floor               1 113778990 360922421 260243
## - neighbourhood     214 602502991 849646422 283264
## 
## Call:
## lm(formula = price ~ room_type + neighbourhood + floor + `noise(dB)` + 
##     number_of_reviews + longitude, data = airbnb_train)
## 
## Coefficients:
##                             (Intercept)  
##                              -3.092e+04  
##                   room_typePrivate room  
##                              -7.680e+01  
##                    room_typeShared room  
##                              -8.436e+01  
##              neighbourhoodArden Heights  
##                               3.483e+03  
##                   neighbourhoodArrochar  
##                               3.635e+03  
##                    neighbourhoodArverne  
##                               1.405e+03  
##                    neighbourhoodAstoria  
##                               1.292e+03  
##                 neighbourhoodBath Beach  
##                               2.231e+03  
##          neighbourhoodBattery Park City  
##                               3.335e+03  
##                  neighbourhoodBay Ridge  
##                               2.267e+03  
##                neighbourhoodBay Terrace  
##                               1.368e+03  
## neighbourhoodBay Terrace, Staten Island  
##                               3.477e+03  
##                 neighbourhoodBaychester  
##                              -3.046e+01  
##                    neighbourhoodBayside  
##                               1.338e+03  
##                  neighbourhoodBayswater  
##                               1.320e+03  
##         neighbourhoodBedford-Stuyvesant  
##                               2.264e+03  
##               neighbourhoodBelle Harbor  
##                               1.405e+03  
##                  neighbourhoodBellerose  
##                               1.355e+03  
##                    neighbourhoodBelmont  
##                               3.372e+01  
##                neighbourhoodBensonhurst  
##                               2.229e+03  
##               neighbourhoodBergen Beach  
##                               2.241e+03  
##                neighbourhoodBoerum Hill  
##                               2.296e+03  
##               neighbourhoodBorough Park  
##                               2.228e+03  
##               neighbourhoodBreezy Point  
##                               1.427e+03  
##                  neighbourhoodBriarwood  
##                               1.331e+03  
##             neighbourhoodBrighton Beach  
##                               2.285e+03  
##                  neighbourhoodBronxdale  
##                              -1.888e+01  
##           neighbourhoodBrooklyn Heights  
##                               2.316e+03  
##                neighbourhoodBrownsville  
##                               2.268e+03  
##                neighbourhoodBull's Head  
##                               3.525e+03  
##                   neighbourhoodBushwick  
##                               2.252e+03  
##            neighbourhoodCambria Heights  
##                               1.335e+03  
##                   neighbourhoodCanarsie  
##                               2.284e+03  
##            neighbourhoodCarroll Gardens  
##                               2.296e+03  
##                neighbourhoodCastle Hill  
##                              -3.951e+00  
##          neighbourhoodCastleton Corners  
##                               3.626e+03  
##                    neighbourhoodChelsea  
##                               3.359e+03  
##                  neighbourhoodChinatown  
##                               3.307e+03  
##                neighbourhoodCity Island  
##                               1.923e+02  
##               neighbourhoodCivic Center  
##                               3.307e+03  
##          neighbourhoodClaremont Village  
##                               3.620e+00  
##               neighbourhoodClason Point  
##                               3.523e+01  
##                    neighbourhoodClifton  
##                               3.584e+03  
##               neighbourhoodClinton Hill  
##                               2.252e+03  
##                 neighbourhoodCo-op City  
##                               5.200e+01  
##                neighbourhoodCobble Hill  
##                               2.317e+03  
##              neighbourhoodCollege Point  
##                               1.274e+03  
##                neighbourhoodColumbia St  
##                               2.257e+03  
##                    neighbourhoodConcord  
##                               3.557e+03  
##                  neighbourhoodConcourse  
##                              -8.437e+00  
##          neighbourhoodConcourse Village  
##                              -3.783e+00  
##               neighbourhoodConey Island  
##                               2.195e+03  
##                     neighbourhoodCorona  
##                               1.277e+03  
##              neighbourhoodCrown Heights  
##                               2.254e+03  
##              neighbourhoodCypress Hills  
##                               2.278e+03  
##           neighbourhoodDitmars Steinway  
##                               1.290e+03  
##               neighbourhoodDongan Hills  
##                               3.553e+03  
##                 neighbourhoodDouglaston  
##                               1.325e+03  
##          neighbourhoodDowntown Brooklyn  
##                               2.280e+03  
##                      neighbourhoodDUMBO  
##                               2.317e+03  
##              neighbourhoodDyker Heights  
##                               2.242e+03  
##              neighbourhoodEast Elmhurst  
##                               1.300e+03  
##              neighbourhoodEast Flatbush  
##                               2.245e+03  
##                neighbourhoodEast Harlem  
##                               3.318e+03  
##            neighbourhoodEast Morrisania  
##                               3.698e+01  
##              neighbourhoodEast New York  
##                               2.270e+03  
##               neighbourhoodEast Village  
##                               3.336e+03  
##                neighbourhoodEastchester  
##                               7.262e+01  
##                   neighbourhoodEdenwald  
##                               2.920e+01  
##                   neighbourhoodEdgemere  
##                               1.316e+03  
##                   neighbourhoodElmhurst  
##                               1.291e+03  
##                neighbourhoodEltingville  
##                               3.615e+03  
##               neighbourhoodEmerson Hill  
##                               3.480e+03  
##               neighbourhoodFar Rockaway  
##                               1.439e+03  
##                  neighbourhoodFieldston  
##                              -3.123e+01  
##         neighbourhoodFinancial District  
##                               3.351e+03  
##                   neighbourhoodFlatbush  
##                               2.215e+03  
##          neighbourhoodFlatiron District  
##                               3.383e+03  
##                  neighbourhoodFlatlands  
##                               2.273e+03  
##                   neighbourhoodFlushing  
##                               1.333e+03  
##                    neighbourhoodFordham  
##                               4.195e+00  
##               neighbourhoodForest Hills  
##                               1.354e+03  
##                neighbourhoodFort Greene  
##                               2.279e+03  
##              neighbourhoodFort Hamilton  
##                               2.231e+03  
##             neighbourhoodFort Wadsworth  
##                               4.564e+03  
##              neighbourhoodFresh Meadows  
##                               1.329e+03  
##                   neighbourhoodGlendale  
##                               1.285e+03  
##                    neighbourhoodGowanus  
##                               2.291e+03  
##                   neighbourhoodGramercy  
##                               3.347e+03  
##               neighbourhoodGraniteville  
##                               3.484e+03  
##                 neighbourhoodGrant City  
##                               3.515e+03  
##                  neighbourhoodGravesend  
##                               2.244e+03  
##                neighbourhoodGreat Kills  
##                               3.585e+03  
##                 neighbourhoodGreenpoint  
##                               2.287e+03  
##          neighbourhoodGreenwich Village  
##                               3.359e+03  
##                neighbourhoodGrymes Hill  
##                               3.606e+03  
##                     neighbourhoodHarlem  
##                               3.280e+03  
##             neighbourhoodHell's Kitchen  
##                               3.349e+03  
##                 neighbourhoodHighbridge  
##                              -4.654e+00  
##                     neighbourhoodHollis  
##                               1.330e+03  
##                 neighbourhoodHolliswood  
##                               1.417e+03  
##               neighbourhoodHoward Beach  
##                               1.312e+03  
##               neighbourhoodHowland Hook  
##                               3.512e+03  
##                   neighbourhoodHuguenot  
##                               3.511e+03  
##                neighbourhoodHunts Point  
##                              -1.758e+00  
##                     neighbourhoodInwood  
##                               3.299e+03  
##            neighbourhoodJackson Heights  
##                               1.287e+03  
##                    neighbourhoodJamaica  
##                               1.328e+03  
##            neighbourhoodJamaica Estates  
##                               1.283e+03  
##              neighbourhoodJamaica Hills  
##                               1.364e+03  
##                 neighbourhoodKensington  
##                               2.249e+03  
##                neighbourhoodKew Gardens  
##                               1.312e+03  
##          neighbourhoodKew Gardens Hills  
##                               1.329e+03  
##                neighbourhoodKingsbridge  
##                               9.600e+00  
##                   neighbourhoodKips Bay  
##                               3.337e+03  
##                  neighbourhoodLaurelton  
##                               1.340e+03  
##            neighbourhoodLighthouse Hill  
##                               3.704e+03  
##               neighbourhoodLittle Italy  
##                               3.164e+03  
##                neighbourhoodLittle Neck  
##                               1.328e+03  
##           neighbourhoodLong Island City  
##                               1.304e+03  
##                   neighbourhoodLongwood  
##                               3.623e+01  
##            neighbourhoodLower East Side  
##                               3.329e+03  
##            neighbourhoodManhattan Beach  
##                               2.231e+03  
##                neighbourhoodMarble Hill  
##                               3.278e+03  
##            neighbourhoodMariners Harbor  
##                               3.536e+03  
##                    neighbourhoodMaspeth  
##                               1.275e+03  
##                    neighbourhoodMelrose  
##                              -1.858e+01  
##             neighbourhoodMiddle Village  
##                               1.286e+03  
##              neighbourhoodMidland Beach  
##                               3.524e+03  
##                    neighbourhoodMidtown  
##                               3.370e+03  
##                    neighbourhoodMidwood  
##                               2.237e+03  
##        neighbourhoodMorningside Heights  
##                               3.240e+03  
##             neighbourhoodMorris Heights  
##                              -1.027e+01  
##                neighbourhoodMorris Park  
##                               1.005e+01  
##                 neighbourhoodMorrisania  
##                               5.067e+00  
##                 neighbourhoodMott Haven  
##                               1.742e+01  
##                 neighbourhoodMount Eden  
##                              -9.029e+00  
##                 neighbourhoodMount Hope  
##                              -1.284e+01  
##                neighbourhoodMurray Hill  
##                               3.355e+03  
##                  neighbourhoodNavy Yard  
##                               2.266e+03  
##                   neighbourhoodNeponsit  
##                               1.588e+03  
##                   neighbourhoodNew Dorp  
##                               3.485e+03  
##             neighbourhoodNew Dorp Beach  
##                               3.547e+03  
##            neighbourhoodNew Springville  
##                               3.526e+03  
##                       neighbourhoodNoHo  
##                               3.386e+03  
##                     neighbourhoodNolita  
##                               3.352e+03  
##            neighbourhoodNorth Riverdale  
##                              -1.818e+01  
##                    neighbourhoodNorwood  
##                               6.050e+00  
##                    neighbourhoodOakwood  
##                               3.545e+03  
##                  neighbourhoodOlinville  
##                               3.288e+00  
##                 neighbourhoodOzone Park  
##                               1.285e+03  
##                 neighbourhoodPark Slope  
##                               2.304e+03  
##                neighbourhoodParkchester  
##                               1.312e+01  
##                 neighbourhoodPelham Bay  
##                               2.234e+01  
##             neighbourhoodPelham Gardens  
##                               4.715e+00  
##                neighbourhoodPort Morris  
##                               2.031e+00  
##              neighbourhoodPort Richmond  
##                               3.627e+03  
##               neighbourhoodPrince's Bay  
##                               4.033e+03  
##  neighbourhoodProspect-Lefferts Gardens  
##                               2.212e+03  
##           neighbourhoodProspect Heights  
##                               2.235e+03  
##             neighbourhoodQueens Village  
##                               1.319e+03  
##              neighbourhoodRandall Manor  
##                               4.003e+03  
##                   neighbourhoodRed Hook  
##                               2.269e+03  
##                  neighbourhoodRego Park  
##                               1.290e+03  
##              neighbourhoodRichmond Hill  
##                               1.312e+03  
##                  neighbourhoodRidgewood  
##                               1.274e+03  
##                  neighbourhoodRiverdale  
##                               3.306e+02  
##             neighbourhoodRockaway Beach  
##                               1.334e+03  
##           neighbourhoodRoosevelt Island  
##                               3.305e+03  
##                   neighbourhoodRosebank  
##                               3.565e+03  
##                   neighbourhoodRosedale  
##                               1.341e+03  
##                  neighbourhoodRossville  
##                               3.475e+03  
##              neighbourhoodSchuylerville  
##                               2.269e+01  
##                   neighbourhoodSea Gate  
##                               2.167e+03  
##             neighbourhoodSheepshead Bay  
##                               2.256e+03  
##                neighbourhoodShore Acres  
##                               3.522e+03  
##                neighbourhoodSilver Lake  
##                               3.523e+03  
##                       neighbourhoodSoHo  
##                               3.303e+03  
##                  neighbourhoodSoundview  
##                              -9.186e+00  
##                neighbourhoodSouth Beach  
##                               3.631e+03  
##           neighbourhoodSouth Ozone Park  
##                               1.314e+03  
##                neighbourhoodSouth Slope  
##                               2.277e+03  
##        neighbourhoodSpringfield Gardens  
##                               1.330e+03  
##             neighbourhoodSpuyten Duyvil  
##                               9.920e+01  
##                 neighbourhoodSt. Albans  
##                               1.343e+03  
##                 neighbourhoodSt. George  
##                               3.629e+03  
##                  neighbourhoodStapleton  
##                               3.613e+03  
##            neighbourhoodStuyvesant Town  
##                               3.329e+03  
##                  neighbourhoodSunnyside  
##                               1.276e+03  
##                neighbourhoodSunset Park  
##                               2.254e+03  
##           neighbourhoodTheater District  
##                               3.367e+03  
##                neighbourhoodThrogs Neck  
##                               3.617e+01  
##              neighbourhoodTompkinsville  
##                               3.562e+03  
##                neighbourhoodTottenville  
##                               3.570e+03  
##                    neighbourhoodTremont  
##                              -5.817e+00  
##                    neighbourhoodTribeca  
##                               3.237e+03  
##                neighbourhoodTwo Bridges  
##                               3.316e+03  
##                  neighbourhoodUnionport  
##                               9.339e+01  
##         neighbourhoodUniversity Heights  
##                              -7.675e-01  
##            neighbourhoodUpper East Side  
##                               3.333e+03  
##            neighbourhoodUpper West Side  
##                               3.292e+03  
##                   neighbourhoodVan Nest  
##                               8.017e+01  
##               neighbourhoodVinegar Hill  
##                               2.297e+03  
##                  neighbourhoodWakefield  
##                               3.239e+01  
##         neighbourhoodWashington Heights  
##                               3.296e+03  
##              neighbourhoodWest Brighton  
##                               3.585e+03  
##                 neighbourhoodWest Farms  
##                               4.370e+01  
##               neighbourhoodWest Village  
##                               3.216e+03  
##         neighbourhoodWestchester Square  
##                               2.238e+02  
##                neighbourhoodWesterleigh  
##                               3.533e+03  
##                 neighbourhoodWhitestone  
##                               1.357e+03  
##             neighbourhoodWilliamsbridge  
##                              -1.538e+01  
##               neighbourhoodWilliamsburg  
##                               2.297e+03  
##            neighbourhoodWindsor Terrace  
##                               2.268e+03  
##                  neighbourhoodWoodhaven  
##                               1.286e+03  
##                   neighbourhoodWoodlawn  
##                              -6.041e+00  
##                   neighbourhoodWoodside  
##                               1.293e+03  
##                                   floor  
##                               3.781e+02  
##                             `noise(dB)`  
##                               7.967e+01  
##                       number_of_reviews  
##                              -6.473e-02  
##                               longitude  
##                              -3.100e+02
airbnb_model_aic<-lm(formula = price ~room_type + neighbourhood + floor + `noise(dB)` + 
    number_of_reviews + longitude, data = airbnb_train)

get_regression_summaries(airbnb_model_aic)
## # A tibble: 1 × 9
##   r_squared adj_r_squared   mse  rmse sigma statistic p_value    df  nobs
##       <dbl>         <dbl> <dbl> <dbl> <dbl>     <dbl>   <dbl> <dbl> <dbl>
## 1     0.754         0.752 9024.  95.0  95.4      378.       0   220 27387
get_regression_table(airbnb_model_aic)
## # A tibble: 221 × 7
##    term                   estimate std_error statistic p_value lower_ci upper_ci
##    <chr>                     <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
##  1 intercept              -30923.    5904.       -5.24       0 -42496.  -19351. 
##  2 room_type: Private ro…    -76.8      1.24    -61.9        0    -79.2    -74.4
##  3 room_type: Shared room    -84.4      3.88    -21.8        0    -92.0    -76.8
##  4 neighbourhood: Arden …   3483.      73.3      47.5        0   3340.    3627. 
##  5 neighbourhood: Arroch…   3635.      53.8      67.6        0   3529.    3740. 
##  6 neighbourhood: Arverne   1405.      36.7      38.3        0   1334.    1477. 
##  7 neighbourhood: Astoria   1292.      34.2      37.8        0   1225.    1360. 
##  8 neighbourhood: Bath B…   2231.      36.1      61.7        0   2160.    2302. 
##  9 neighbourhood: Batter…   3335.      40.0      83.5        0   3257.    3414. 
## 10 neighbourhood: Bay Ri…   2267.      27.5      82.4        0   2213.    2321. 
## # ℹ 211 more rows
points<-get_regression_points(airbnb_model_aic)
ggplot(data=points, aes(x=residual)) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=points, aes(x=price_hat, y=residual)) + geom_point()

points %>%
  ggplot(aes(sample = residual)) +
  stat_qq() +
  stat_qq_line()

airbnb.predictions <- predict(airbnb_model_aic,airbnb_test_new)
#airbnb.predictions

Model 3 Random Forest

#Run the random forest model
randomforest_model <- randomForest(price~neighbourhood_group_numeric +neighbourhood_numeric+ latitude + longitude + room_type_numeric + minimum_nights  + number_of_reviews + floor  + minimum_nights,data = airbnb_train_numeric)

# View the forest results.
print(randomforest_model) 
## 
## Call:
##  randomForest(formula = price ~ neighbourhood_group_numeric +      neighbourhood_numeric + latitude + longitude + room_type_numeric +      minimum_nights + number_of_reviews + floor + minimum_nights,      data = airbnb_train_numeric) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 10852.54
##                     % Var explained: 69.15
# Importance of each predictor.
print(importance(randomforest_model,type = 2)) 
##                             IncNodePurity
## neighbourhood_group_numeric      27791329
## neighbourhood_numeric            31036416
## latitude                         71539819
## longitude                        96216934
## room_type_numeric                65684155
## minimum_nights                   24853100
## number_of_reviews                20896469
## floor                           456833767
#Predict the price based on X_train.
pred_train <- predict(randomforest_model, X_train)

#Create a dataframe with the train_id, the actual price, and the predicted price. 
result_train <- data.frame(ID = train_id, 
                     price = Y_train,
                    predictions = pred_train)
#Print the head of the dataframe result_train.
head(result_train)
##     ID price predictions
## 1 2595   225   226.57832
## 2 3647    60    64.25744
## 3 5121    60    68.69139
## 4 5203    32    38.01279
## 5 5238   150   189.93120
## 6 5295    54    94.45100
#Predict the price based on X_test.
pred_test <- predict(randomforest_model, airbnb_test_numeric)

#Create a dataframe with the test_id, the actual price, and the predicted price. 
result_test <- data.frame(ID = test_id, 
                          price = Y_test,
                    predictions = pred_test)


write.csv(result_test,file = "C:/Users/anish/Documents/Data Science Capstone/predicitions_airbnb_rf.csv")
#Print the head of the dataframe result_train.
head(result_test)
##     ID price predictions
## 1 2539   149     63.7401
## 2 3831    45    108.9368
## 3 5022    80    135.0129
## 4 5099   200    216.1896
## 5 6848   140    179.6969
## 6 9657   180    203.1002
print(paste0('Test RMSE: ' , rmse(result_test$price,
                                  result_test$predictions))) #testRMSE
## [1] "Test RMSE: 107.834320503987"
print(paste0('Train RMSE: ' , rmse(result_train$price,
                                  result_train$predictions))) 
## [1] "Train RMSE: 70.957961774177"
print(paste0('Test R2: ' ,
             caret::postResample(result_test$predictions , result_test$price)['Rsquared']))
## [1] "Test R2: 0.712074194315585"
print(paste0('Train R2: ' ,
             caret::postResample(result_train$predictions , result_train$price)['Rsquared']))
## [1] "Train R2: 0.88210725279567"
ggplot(result_test, aes(x = predictions, y = price)) + geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

melt_pred_test <- melt(head(result_test,20), id.vars = "ID")

ggplot(melt_pred_test, 
       aes(y = value, 
           x = ID,
           colour = variable)) +
      geom_point() +
      geom_line() +
  ggtitle("Actual vs Predicted for Test Data")

Model 4: Extreme Gradient Boosting(XG Boost)

#Preparing two matrices for xgb
dtrain <- xgb.DMatrix(data = train_mat, label = airbnb_train_numeric$price)
dtest <- xgb.DMatrix(data = test_mat, label = airbnb_test_numeric$price)

#Run xgboost
bst <- xgboost(data = dtrain, max.depth = 5, eta = 1, nround = 1000)
## [1]  train-rmse:109.103426 
## [2]  train-rmse:93.567236 
## [3]  train-rmse:67.858387 
## [4]  train-rmse:65.699772 
## [5]  train-rmse:63.175161 
## [6]  train-rmse:62.016752 
## [7]  train-rmse:60.400425 
## [8]  train-rmse:58.063930 
## [9]  train-rmse:57.428335 
## [10] train-rmse:56.690518 
## [11] train-rmse:56.226733 
## [12] train-rmse:54.540495 
## [13] train-rmse:53.888769 
## [14] train-rmse:53.458349 
## [15] train-rmse:52.728138 
## [16] train-rmse:52.584994 
## [17] train-rmse:52.178480 
## [18] train-rmse:51.855397 
## [19] train-rmse:51.703919 
## [20] train-rmse:50.191910 
## [21] train-rmse:50.097788 
## [22] train-rmse:49.412056 
## [23] train-rmse:49.301143 
## [24] train-rmse:48.787892 
## [25] train-rmse:48.469397 
## [26] train-rmse:48.410826 
## [27] train-rmse:48.269030 
## [28] train-rmse:48.025167 
## [29] train-rmse:47.275728 
## [30] train-rmse:47.000076 
## [31] train-rmse:46.705723 
## [32] train-rmse:46.561233 
## [33] train-rmse:46.354260 
## [34] train-rmse:46.295987 
## [35] train-rmse:46.162112 
## [36] train-rmse:45.878929 
## [37] train-rmse:45.370975 
## [38] train-rmse:45.326648 
## [39] train-rmse:45.178028 
## [40] train-rmse:45.126807 
## [41] train-rmse:44.883055 
## [42] train-rmse:44.766265 
## [43] train-rmse:44.596967 
## [44] train-rmse:44.474292 
## [45] train-rmse:44.372807 
## [46] train-rmse:44.178225 
## [47] train-rmse:43.978082 
## [48] train-rmse:43.939138 
## [49] train-rmse:43.866732 
## [50] train-rmse:43.676943 
## [51] train-rmse:43.570725 
## [52] train-rmse:43.483499 
## [53] train-rmse:43.446704 
## [54] train-rmse:43.328071 
## [55] train-rmse:43.231973 
## [56] train-rmse:43.081619 
## [57] train-rmse:42.976482 
## [58] train-rmse:42.872039 
## [59] train-rmse:42.715904 
## [60] train-rmse:42.615812 
## [61] train-rmse:42.586853 
## [62] train-rmse:42.448360 
## [63] train-rmse:42.383976 
## [64] train-rmse:42.349365 
## [65] train-rmse:42.309590 
## [66] train-rmse:42.238706 
## [67] train-rmse:42.071162 
## [68] train-rmse:41.972561 
## [69] train-rmse:41.887567 
## [70] train-rmse:41.703597 
## [71] train-rmse:41.597813 
## [72] train-rmse:41.571877 
## [73] train-rmse:41.400530 
## [74] train-rmse:41.274870 
## [75] train-rmse:41.189086 
## [76] train-rmse:41.133093 
## [77] train-rmse:41.060703 
## [78] train-rmse:40.992697 
## [79] train-rmse:40.947368 
## [80] train-rmse:40.859240 
## [81] train-rmse:40.783072 
## [82] train-rmse:40.708711 
## [83] train-rmse:40.583975 
## [84] train-rmse:40.507208 
## [85] train-rmse:40.169745 
## [86] train-rmse:40.100775 
## [87] train-rmse:40.035992 
## [88] train-rmse:39.926370 
## [89] train-rmse:39.858435 
## [90] train-rmse:39.849505 
## [91] train-rmse:39.694535 
## [92] train-rmse:39.580990 
## [93] train-rmse:39.432467 
## [94] train-rmse:39.367927 
## [95] train-rmse:39.252741 
## [96] train-rmse:39.150128 
## [97] train-rmse:39.038151 
## [98] train-rmse:38.977808 
## [99] train-rmse:38.942666 
## [100]    train-rmse:38.867225 
## [101]    train-rmse:38.792868 
## [102]    train-rmse:38.729206 
## [103]    train-rmse:38.594417 
## [104]    train-rmse:38.508185 
## [105]    train-rmse:38.392898 
## [106]    train-rmse:38.336132 
## [107]    train-rmse:38.271055 
## [108]    train-rmse:38.172375 
## [109]    train-rmse:38.071634 
## [110]    train-rmse:38.002874 
## [111]    train-rmse:37.919194 
## [112]    train-rmse:37.895910 
## [113]    train-rmse:37.798487 
## [114]    train-rmse:37.780181 
## [115]    train-rmse:37.640653 
## [116]    train-rmse:37.606712 
## [117]    train-rmse:37.539523 
## [118]    train-rmse:37.441513 
## [119]    train-rmse:37.417468 
## [120]    train-rmse:37.312785 
## [121]    train-rmse:37.223295 
## [122]    train-rmse:37.157834 
## [123]    train-rmse:37.080073 
## [124]    train-rmse:36.970252 
## [125]    train-rmse:36.911289 
## [126]    train-rmse:36.885272 
## [127]    train-rmse:36.798847 
## [128]    train-rmse:36.725064 
## [129]    train-rmse:36.663158 
## [130]    train-rmse:36.624463 
## [131]    train-rmse:36.599660 
## [132]    train-rmse:36.508793 
## [133]    train-rmse:36.375509 
## [134]    train-rmse:36.349271 
## [135]    train-rmse:36.262168 
## [136]    train-rmse:36.192494 
## [137]    train-rmse:36.107190 
## [138]    train-rmse:35.987591 
## [139]    train-rmse:35.955348 
## [140]    train-rmse:35.876260 
## [141]    train-rmse:35.819659 
## [142]    train-rmse:35.739799 
## [143]    train-rmse:35.626281 
## [144]    train-rmse:35.558879 
## [145]    train-rmse:35.506196 
## [146]    train-rmse:35.463598 
## [147]    train-rmse:35.401379 
## [148]    train-rmse:35.327771 
## [149]    train-rmse:35.284048 
## [150]    train-rmse:35.219212 
## [151]    train-rmse:35.132732 
## [152]    train-rmse:35.102758 
## [153]    train-rmse:35.040621 
## [154]    train-rmse:35.005275 
## [155]    train-rmse:34.961028 
## [156]    train-rmse:34.878811 
## [157]    train-rmse:34.780471 
## [158]    train-rmse:34.701387 
## [159]    train-rmse:34.642046 
## [160]    train-rmse:34.562066 
## [161]    train-rmse:34.458545 
## [162]    train-rmse:34.419950 
## [163]    train-rmse:34.349817 
## [164]    train-rmse:34.247895 
## [165]    train-rmse:34.169713 
## [166]    train-rmse:34.124375 
## [167]    train-rmse:34.108864 
## [168]    train-rmse:34.060844 
## [169]    train-rmse:34.028140 
## [170]    train-rmse:33.966608 
## [171]    train-rmse:33.908588 
## [172]    train-rmse:33.835677 
## [173]    train-rmse:33.784376 
## [174]    train-rmse:33.713330 
## [175]    train-rmse:33.654137 
## [176]    train-rmse:33.587644 
## [177]    train-rmse:33.520804 
## [178]    train-rmse:33.471592 
## [179]    train-rmse:33.444764 
## [180]    train-rmse:33.345408 
## [181]    train-rmse:33.276396 
## [182]    train-rmse:33.232971 
## [183]    train-rmse:33.204750 
## [184]    train-rmse:33.131093 
## [185]    train-rmse:33.094775 
## [186]    train-rmse:33.079624 
## [187]    train-rmse:33.039047 
## [188]    train-rmse:33.015654 
## [189]    train-rmse:32.985344 
## [190]    train-rmse:32.933708 
## [191]    train-rmse:32.866868 
## [192]    train-rmse:32.810645 
## [193]    train-rmse:32.782659 
## [194]    train-rmse:32.723384 
## [195]    train-rmse:32.683242 
## [196]    train-rmse:32.632905 
## [197]    train-rmse:32.559548 
## [198]    train-rmse:32.483582 
## [199]    train-rmse:32.421106 
## [200]    train-rmse:32.392563 
## [201]    train-rmse:32.335815 
## [202]    train-rmse:32.297140 
## [203]    train-rmse:32.251716 
## [204]    train-rmse:32.191967 
## [205]    train-rmse:32.144415 
## [206]    train-rmse:32.057640 
## [207]    train-rmse:32.029201 
## [208]    train-rmse:31.932345 
## [209]    train-rmse:31.845413 
## [210]    train-rmse:31.780658 
## [211]    train-rmse:31.744768 
## [212]    train-rmse:31.670669 
## [213]    train-rmse:31.646037 
## [214]    train-rmse:31.632929 
## [215]    train-rmse:31.591507 
## [216]    train-rmse:31.494219 
## [217]    train-rmse:31.420953 
## [218]    train-rmse:31.331459 
## [219]    train-rmse:31.306554 
## [220]    train-rmse:31.269836 
## [221]    train-rmse:31.217851 
## [222]    train-rmse:31.142993 
## [223]    train-rmse:31.135651 
## [224]    train-rmse:31.118486 
## [225]    train-rmse:31.082362 
## [226]    train-rmse:31.048505 
## [227]    train-rmse:31.033032 
## [228]    train-rmse:31.017207 
## [229]    train-rmse:30.964833 
## [230]    train-rmse:30.912501 
## [231]    train-rmse:30.874786 
## [232]    train-rmse:30.810148 
## [233]    train-rmse:30.763088 
## [234]    train-rmse:30.698930 
## [235]    train-rmse:30.635522 
## [236]    train-rmse:30.578933 
## [237]    train-rmse:30.556908 
## [238]    train-rmse:30.497977 
## [239]    train-rmse:30.419213 
## [240]    train-rmse:30.337707 
## [241]    train-rmse:30.281423 
## [242]    train-rmse:30.215736 
## [243]    train-rmse:30.167544 
## [244]    train-rmse:30.051430 
## [245]    train-rmse:30.037759 
## [246]    train-rmse:29.976727 
## [247]    train-rmse:29.969530 
## [248]    train-rmse:29.910593 
## [249]    train-rmse:29.889562 
## [250]    train-rmse:29.808352 
## [251]    train-rmse:29.774164 
## [252]    train-rmse:29.670753 
## [253]    train-rmse:29.599826 
## [254]    train-rmse:29.536176 
## [255]    train-rmse:29.487919 
## [256]    train-rmse:29.456876 
## [257]    train-rmse:29.423070 
## [258]    train-rmse:29.363151 
## [259]    train-rmse:29.270309 
## [260]    train-rmse:29.218339 
## [261]    train-rmse:29.154853 
## [262]    train-rmse:29.121879 
## [263]    train-rmse:29.099175 
## [264]    train-rmse:29.026046 
## [265]    train-rmse:28.947873 
## [266]    train-rmse:28.874223 
## [267]    train-rmse:28.822707 
## [268]    train-rmse:28.755845 
## [269]    train-rmse:28.710706 
## [270]    train-rmse:28.636204 
## [271]    train-rmse:28.591975 
## [272]    train-rmse:28.546309 
## [273]    train-rmse:28.488967 
## [274]    train-rmse:28.460937 
## [275]    train-rmse:28.389944 
## [276]    train-rmse:28.356581 
## [277]    train-rmse:28.320757 
## [278]    train-rmse:28.270802 
## [279]    train-rmse:28.225771 
## [280]    train-rmse:28.171421 
## [281]    train-rmse:28.115886 
## [282]    train-rmse:28.065926 
## [283]    train-rmse:28.050499 
## [284]    train-rmse:27.981034 
## [285]    train-rmse:27.932397 
## [286]    train-rmse:27.923137 
## [287]    train-rmse:27.852712 
## [288]    train-rmse:27.801887 
## [289]    train-rmse:27.757322 
## [290]    train-rmse:27.725571 
## [291]    train-rmse:27.706267 
## [292]    train-rmse:27.660104 
## [293]    train-rmse:27.617859 
## [294]    train-rmse:27.574315 
## [295]    train-rmse:27.536241 
## [296]    train-rmse:27.509911 
## [297]    train-rmse:27.460765 
## [298]    train-rmse:27.420397 
## [299]    train-rmse:27.404286 
## [300]    train-rmse:27.392025 
## [301]    train-rmse:27.384885 
## [302]    train-rmse:27.374762 
## [303]    train-rmse:27.335171 
## [304]    train-rmse:27.318894 
## [305]    train-rmse:27.276546 
## [306]    train-rmse:27.225633 
## [307]    train-rmse:27.161458 
## [308]    train-rmse:27.130515 
## [309]    train-rmse:27.085241 
## [310]    train-rmse:27.060582 
## [311]    train-rmse:27.037091 
## [312]    train-rmse:26.990406 
## [313]    train-rmse:26.957419 
## [314]    train-rmse:26.938172 
## [315]    train-rmse:26.928567 
## [316]    train-rmse:26.914800 
## [317]    train-rmse:26.879447 
## [318]    train-rmse:26.817902 
## [319]    train-rmse:26.793962 
## [320]    train-rmse:26.725820 
## [321]    train-rmse:26.689897 
## [322]    train-rmse:26.650927 
## [323]    train-rmse:26.578860 
## [324]    train-rmse:26.533686 
## [325]    train-rmse:26.479232 
## [326]    train-rmse:26.403721 
## [327]    train-rmse:26.383487 
## [328]    train-rmse:26.366860 
## [329]    train-rmse:26.350291 
## [330]    train-rmse:26.341679 
## [331]    train-rmse:26.319587 
## [332]    train-rmse:26.272953 
## [333]    train-rmse:26.210614 
## [334]    train-rmse:26.161148 
## [335]    train-rmse:26.084870 
## [336]    train-rmse:26.025615 
## [337]    train-rmse:25.974575 
## [338]    train-rmse:25.918289 
## [339]    train-rmse:25.876687 
## [340]    train-rmse:25.805460 
## [341]    train-rmse:25.775829 
## [342]    train-rmse:25.766747 
## [343]    train-rmse:25.700496 
## [344]    train-rmse:25.640675 
## [345]    train-rmse:25.587025 
## [346]    train-rmse:25.558019 
## [347]    train-rmse:25.544620 
## [348]    train-rmse:25.492198 
## [349]    train-rmse:25.429680 
## [350]    train-rmse:25.387382 
## [351]    train-rmse:25.334843 
## [352]    train-rmse:25.292908 
## [353]    train-rmse:25.242820 
## [354]    train-rmse:25.209529 
## [355]    train-rmse:25.172461 
## [356]    train-rmse:25.163747 
## [357]    train-rmse:25.126491 
## [358]    train-rmse:25.087980 
## [359]    train-rmse:25.040289 
## [360]    train-rmse:24.989180 
## [361]    train-rmse:24.948513 
## [362]    train-rmse:24.889211 
## [363]    train-rmse:24.838114 
## [364]    train-rmse:24.832359 
## [365]    train-rmse:24.780362 
## [366]    train-rmse:24.693305 
## [367]    train-rmse:24.662206 
## [368]    train-rmse:24.635055 
## [369]    train-rmse:24.571240 
## [370]    train-rmse:24.523470 
## [371]    train-rmse:24.511840 
## [372]    train-rmse:24.503474 
## [373]    train-rmse:24.471458 
## [374]    train-rmse:24.427806 
## [375]    train-rmse:24.413169 
## [376]    train-rmse:24.381100 
## [377]    train-rmse:24.336547 
## [378]    train-rmse:24.269848 
## [379]    train-rmse:24.203639 
## [380]    train-rmse:24.159383 
## [381]    train-rmse:24.091714 
## [382]    train-rmse:24.033663 
## [383]    train-rmse:23.990156 
## [384]    train-rmse:23.964347 
## [385]    train-rmse:23.915153 
## [386]    train-rmse:23.900692 
## [387]    train-rmse:23.873796 
## [388]    train-rmse:23.821256 
## [389]    train-rmse:23.760173 
## [390]    train-rmse:23.714306 
## [391]    train-rmse:23.665417 
## [392]    train-rmse:23.640992 
## [393]    train-rmse:23.602885 
## [394]    train-rmse:23.556416 
## [395]    train-rmse:23.527237 
## [396]    train-rmse:23.478817 
## [397]    train-rmse:23.465132 
## [398]    train-rmse:23.449463 
## [399]    train-rmse:23.383961 
## [400]    train-rmse:23.320975 
## [401]    train-rmse:23.287169 
## [402]    train-rmse:23.261894 
## [403]    train-rmse:23.191956 
## [404]    train-rmse:23.149470 
## [405]    train-rmse:23.095760 
## [406]    train-rmse:23.055266 
## [407]    train-rmse:22.996825 
## [408]    train-rmse:22.987735 
## [409]    train-rmse:22.964129 
## [410]    train-rmse:22.942059 
## [411]    train-rmse:22.930877 
## [412]    train-rmse:22.909229 
## [413]    train-rmse:22.867416 
## [414]    train-rmse:22.820984 
## [415]    train-rmse:22.785086 
## [416]    train-rmse:22.744656 
## [417]    train-rmse:22.700912 
## [418]    train-rmse:22.675633 
## [419]    train-rmse:22.655226 
## [420]    train-rmse:22.623139 
## [421]    train-rmse:22.614740 
## [422]    train-rmse:22.578859 
## [423]    train-rmse:22.556497 
## [424]    train-rmse:22.515607 
## [425]    train-rmse:22.477788 
## [426]    train-rmse:22.456921 
## [427]    train-rmse:22.433837 
## [428]    train-rmse:22.400540 
## [429]    train-rmse:22.360281 
## [430]    train-rmse:22.335470 
## [431]    train-rmse:22.328125 
## [432]    train-rmse:22.324622 
## [433]    train-rmse:22.298941 
## [434]    train-rmse:22.245409 
## [435]    train-rmse:22.225351 
## [436]    train-rmse:22.195906 
## [437]    train-rmse:22.159575 
## [438]    train-rmse:22.113194 
## [439]    train-rmse:22.091618 
## [440]    train-rmse:22.065184 
## [441]    train-rmse:22.030972 
## [442]    train-rmse:21.989442 
## [443]    train-rmse:21.974103 
## [444]    train-rmse:21.945603 
## [445]    train-rmse:21.892959 
## [446]    train-rmse:21.851113 
## [447]    train-rmse:21.826663 
## [448]    train-rmse:21.759264 
## [449]    train-rmse:21.704387 
## [450]    train-rmse:21.659306 
## [451]    train-rmse:21.600366 
## [452]    train-rmse:21.556843 
## [453]    train-rmse:21.543787 
## [454]    train-rmse:21.504876 
## [455]    train-rmse:21.478961 
## [456]    train-rmse:21.446888 
## [457]    train-rmse:21.408395 
## [458]    train-rmse:21.394502 
## [459]    train-rmse:21.357097 
## [460]    train-rmse:21.323630 
## [461]    train-rmse:21.295901 
## [462]    train-rmse:21.273060 
## [463]    train-rmse:21.245204 
## [464]    train-rmse:21.203448 
## [465]    train-rmse:21.165842 
## [466]    train-rmse:21.154215 
## [467]    train-rmse:21.110825 
## [468]    train-rmse:21.077365 
## [469]    train-rmse:21.039654 
## [470]    train-rmse:20.998798 
## [471]    train-rmse:20.948608 
## [472]    train-rmse:20.905397 
## [473]    train-rmse:20.870932 
## [474]    train-rmse:20.856356 
## [475]    train-rmse:20.836761 
## [476]    train-rmse:20.801534 
## [477]    train-rmse:20.733209 
## [478]    train-rmse:20.704533 
## [479]    train-rmse:20.694998 
## [480]    train-rmse:20.672977 
## [481]    train-rmse:20.634424 
## [482]    train-rmse:20.596565 
## [483]    train-rmse:20.561065 
## [484]    train-rmse:20.531491 
## [485]    train-rmse:20.493501 
## [486]    train-rmse:20.475021 
## [487]    train-rmse:20.456395 
## [488]    train-rmse:20.424041 
## [489]    train-rmse:20.412405 
## [490]    train-rmse:20.384667 
## [491]    train-rmse:20.372457 
## [492]    train-rmse:20.359467 
## [493]    train-rmse:20.350773 
## [494]    train-rmse:20.321169 
## [495]    train-rmse:20.281701 
## [496]    train-rmse:20.239047 
## [497]    train-rmse:20.210466 
## [498]    train-rmse:20.192330 
## [499]    train-rmse:20.141166 
## [500]    train-rmse:20.113099 
## [501]    train-rmse:20.079713 
## [502]    train-rmse:20.074201 
## [503]    train-rmse:20.038179 
## [504]    train-rmse:20.002922 
## [505]    train-rmse:19.968455 
## [506]    train-rmse:19.937428 
## [507]    train-rmse:19.909770 
## [508]    train-rmse:19.889517 
## [509]    train-rmse:19.843497 
## [510]    train-rmse:19.824336 
## [511]    train-rmse:19.805134 
## [512]    train-rmse:19.763481 
## [513]    train-rmse:19.734282 
## [514]    train-rmse:19.712023 
## [515]    train-rmse:19.700131 
## [516]    train-rmse:19.692330 
## [517]    train-rmse:19.646708 
## [518]    train-rmse:19.631056 
## [519]    train-rmse:19.592346 
## [520]    train-rmse:19.565040 
## [521]    train-rmse:19.550886 
## [522]    train-rmse:19.541811 
## [523]    train-rmse:19.510438 
## [524]    train-rmse:19.489092 
## [525]    train-rmse:19.484714 
## [526]    train-rmse:19.451410 
## [527]    train-rmse:19.413954 
## [528]    train-rmse:19.389679 
## [529]    train-rmse:19.371402 
## [530]    train-rmse:19.357164 
## [531]    train-rmse:19.349654 
## [532]    train-rmse:19.338138 
## [533]    train-rmse:19.305095 
## [534]    train-rmse:19.265719 
## [535]    train-rmse:19.237422 
## [536]    train-rmse:19.204446 
## [537]    train-rmse:19.158492 
## [538]    train-rmse:19.134264 
## [539]    train-rmse:19.114258 
## [540]    train-rmse:19.095811 
## [541]    train-rmse:19.062244 
## [542]    train-rmse:19.040929 
## [543]    train-rmse:18.975884 
## [544]    train-rmse:18.957158 
## [545]    train-rmse:18.932535 
## [546]    train-rmse:18.907478 
## [547]    train-rmse:18.884489 
## [548]    train-rmse:18.853089 
## [549]    train-rmse:18.823461 
## [550]    train-rmse:18.816496 
## [551]    train-rmse:18.786735 
## [552]    train-rmse:18.766465 
## [553]    train-rmse:18.742775 
## [554]    train-rmse:18.734761 
## [555]    train-rmse:18.728304 
## [556]    train-rmse:18.713416 
## [557]    train-rmse:18.692140 
## [558]    train-rmse:18.680805 
## [559]    train-rmse:18.666459 
## [560]    train-rmse:18.643042 
## [561]    train-rmse:18.623714 
## [562]    train-rmse:18.605514 
## [563]    train-rmse:18.573862 
## [564]    train-rmse:18.569075 
## [565]    train-rmse:18.539202 
## [566]    train-rmse:18.506719 
## [567]    train-rmse:18.469940 
## [568]    train-rmse:18.436589 
## [569]    train-rmse:18.420933 
## [570]    train-rmse:18.388781 
## [571]    train-rmse:18.363055 
## [572]    train-rmse:18.346369 
## [573]    train-rmse:18.325745 
## [574]    train-rmse:18.272624 
## [575]    train-rmse:18.222497 
## [576]    train-rmse:18.201698 
## [577]    train-rmse:18.173103 
## [578]    train-rmse:18.137106 
## [579]    train-rmse:18.086138 
## [580]    train-rmse:18.063836 
## [581]    train-rmse:18.055119 
## [582]    train-rmse:18.026482 
## [583]    train-rmse:17.992471 
## [584]    train-rmse:17.982147 
## [585]    train-rmse:17.964566 
## [586]    train-rmse:17.949399 
## [587]    train-rmse:17.919818 
## [588]    train-rmse:17.886915 
## [589]    train-rmse:17.869387 
## [590]    train-rmse:17.824205 
## [591]    train-rmse:17.794264 
## [592]    train-rmse:17.749938 
## [593]    train-rmse:17.722951 
## [594]    train-rmse:17.695171 
## [595]    train-rmse:17.660526 
## [596]    train-rmse:17.641415 
## [597]    train-rmse:17.621119 
## [598]    train-rmse:17.593226 
## [599]    train-rmse:17.560203 
## [600]    train-rmse:17.547695 
## [601]    train-rmse:17.531596 
## [602]    train-rmse:17.499535 
## [603]    train-rmse:17.483577 
## [604]    train-rmse:17.447375 
## [605]    train-rmse:17.414282 
## [606]    train-rmse:17.388538 
## [607]    train-rmse:17.358725 
## [608]    train-rmse:17.332854 
## [609]    train-rmse:17.290790 
## [610]    train-rmse:17.251768 
## [611]    train-rmse:17.242827 
## [612]    train-rmse:17.224613 
## [613]    train-rmse:17.206864 
## [614]    train-rmse:17.196629 
## [615]    train-rmse:17.178845 
## [616]    train-rmse:17.157310 
## [617]    train-rmse:17.144138 
## [618]    train-rmse:17.133852 
## [619]    train-rmse:17.102447 
## [620]    train-rmse:17.084541 
## [621]    train-rmse:17.059937 
## [622]    train-rmse:17.003798 
## [623]    train-rmse:16.989394 
## [624]    train-rmse:16.980730 
## [625]    train-rmse:16.948915 
## [626]    train-rmse:16.940497 
## [627]    train-rmse:16.905883 
## [628]    train-rmse:16.867163 
## [629]    train-rmse:16.827730 
## [630]    train-rmse:16.807693 
## [631]    train-rmse:16.797854 
## [632]    train-rmse:16.792312 
## [633]    train-rmse:16.767700 
## [634]    train-rmse:16.759730 
## [635]    train-rmse:16.744884 
## [636]    train-rmse:16.719513 
## [637]    train-rmse:16.715846 
## [638]    train-rmse:16.692779 
## [639]    train-rmse:16.672840 
## [640]    train-rmse:16.637771 
## [641]    train-rmse:16.608772 
## [642]    train-rmse:16.590984 
## [643]    train-rmse:16.552066 
## [644]    train-rmse:16.541546 
## [645]    train-rmse:16.518792 
## [646]    train-rmse:16.505205 
## [647]    train-rmse:16.488881 
## [648]    train-rmse:16.461240 
## [649]    train-rmse:16.427846 
## [650]    train-rmse:16.416541 
## [651]    train-rmse:16.394044 
## [652]    train-rmse:16.386665 
## [653]    train-rmse:16.373436 
## [654]    train-rmse:16.344718 
## [655]    train-rmse:16.322518 
## [656]    train-rmse:16.294051 
## [657]    train-rmse:16.279631 
## [658]    train-rmse:16.262007 
## [659]    train-rmse:16.222352 
## [660]    train-rmse:16.188587 
## [661]    train-rmse:16.167311 
## [662]    train-rmse:16.143334 
## [663]    train-rmse:16.101762 
## [664]    train-rmse:16.086361 
## [665]    train-rmse:16.073289 
## [666]    train-rmse:16.044697 
## [667]    train-rmse:16.012123 
## [668]    train-rmse:16.000111 
## [669]    train-rmse:15.978434 
## [670]    train-rmse:15.957064 
## [671]    train-rmse:15.919807 
## [672]    train-rmse:15.893664 
## [673]    train-rmse:15.857483 
## [674]    train-rmse:15.846107 
## [675]    train-rmse:15.834837 
## [676]    train-rmse:15.804900 
## [677]    train-rmse:15.786265 
## [678]    train-rmse:15.762074 
## [679]    train-rmse:15.739478 
## [680]    train-rmse:15.716273 
## [681]    train-rmse:15.693105 
## [682]    train-rmse:15.667424 
## [683]    train-rmse:15.652258 
## [684]    train-rmse:15.641255 
## [685]    train-rmse:15.626180 
## [686]    train-rmse:15.612847 
## [687]    train-rmse:15.580597 
## [688]    train-rmse:15.566541 
## [689]    train-rmse:15.548975 
## [690]    train-rmse:15.544659 
## [691]    train-rmse:15.514043 
## [692]    train-rmse:15.501898 
## [693]    train-rmse:15.495948 
## [694]    train-rmse:15.487813 
## [695]    train-rmse:15.483588 
## [696]    train-rmse:15.460835 
## [697]    train-rmse:15.452042 
## [698]    train-rmse:15.418360 
## [699]    train-rmse:15.388831 
## [700]    train-rmse:15.371655 
## [701]    train-rmse:15.355464 
## [702]    train-rmse:15.333484 
## [703]    train-rmse:15.317980 
## [704]    train-rmse:15.293718 
## [705]    train-rmse:15.264582 
## [706]    train-rmse:15.243985 
## [707]    train-rmse:15.221903 
## [708]    train-rmse:15.211763 
## [709]    train-rmse:15.182659 
## [710]    train-rmse:15.176020 
## [711]    train-rmse:15.164382 
## [712]    train-rmse:15.145859 
## [713]    train-rmse:15.122264 
## [714]    train-rmse:15.106160 
## [715]    train-rmse:15.091554 
## [716]    train-rmse:15.052286 
## [717]    train-rmse:15.029484 
## [718]    train-rmse:15.005814 
## [719]    train-rmse:14.982593 
## [720]    train-rmse:14.966313 
## [721]    train-rmse:14.937262 
## [722]    train-rmse:14.914796 
## [723]    train-rmse:14.887394 
## [724]    train-rmse:14.879939 
## [725]    train-rmse:14.857432 
## [726]    train-rmse:14.851121 
## [727]    train-rmse:14.820541 
## [728]    train-rmse:14.785379 
## [729]    train-rmse:14.769562 
## [730]    train-rmse:14.747857 
## [731]    train-rmse:14.740982 
## [732]    train-rmse:14.723948 
## [733]    train-rmse:14.711673 
## [734]    train-rmse:14.693403 
## [735]    train-rmse:14.684985 
## [736]    train-rmse:14.681421 
## [737]    train-rmse:14.664536 
## [738]    train-rmse:14.650716 
## [739]    train-rmse:14.621598 
## [740]    train-rmse:14.590628 
## [741]    train-rmse:14.557097 
## [742]    train-rmse:14.524511 
## [743]    train-rmse:14.519906 
## [744]    train-rmse:14.511241 
## [745]    train-rmse:14.482279 
## [746]    train-rmse:14.477302 
## [747]    train-rmse:14.462003 
## [748]    train-rmse:14.454679 
## [749]    train-rmse:14.437968 
## [750]    train-rmse:14.428383 
## [751]    train-rmse:14.409064 
## [752]    train-rmse:14.407633 
## [753]    train-rmse:14.385691 
## [754]    train-rmse:14.369994 
## [755]    train-rmse:14.351450 
## [756]    train-rmse:14.325916 
## [757]    train-rmse:14.307956 
## [758]    train-rmse:14.297729 
## [759]    train-rmse:14.291342 
## [760]    train-rmse:14.260435 
## [761]    train-rmse:14.248226 
## [762]    train-rmse:14.230493 
## [763]    train-rmse:14.205994 
## [764]    train-rmse:14.189294 
## [765]    train-rmse:14.166689 
## [766]    train-rmse:14.137597 
## [767]    train-rmse:14.131340 
## [768]    train-rmse:14.115041 
## [769]    train-rmse:14.108509 
## [770]    train-rmse:14.080571 
## [771]    train-rmse:14.057249 
## [772]    train-rmse:14.042693 
## [773]    train-rmse:14.035657 
## [774]    train-rmse:14.000885 
## [775]    train-rmse:13.980010 
## [776]    train-rmse:13.954868 
## [777]    train-rmse:13.940227 
## [778]    train-rmse:13.918106 
## [779]    train-rmse:13.883684 
## [780]    train-rmse:13.859825 
## [781]    train-rmse:13.841798 
## [782]    train-rmse:13.816255 
## [783]    train-rmse:13.799346 
## [784]    train-rmse:13.785911 
## [785]    train-rmse:13.757245 
## [786]    train-rmse:13.730865 
## [787]    train-rmse:13.709214 
## [788]    train-rmse:13.692120 
## [789]    train-rmse:13.662213 
## [790]    train-rmse:13.631643 
## [791]    train-rmse:13.623550 
## [792]    train-rmse:13.580742 
## [793]    train-rmse:13.566588 
## [794]    train-rmse:13.552500 
## [795]    train-rmse:13.542435 
## [796]    train-rmse:13.527002 
## [797]    train-rmse:13.505505 
## [798]    train-rmse:13.497957 
## [799]    train-rmse:13.475794 
## [800]    train-rmse:13.433263 
## [801]    train-rmse:13.414686 
## [802]    train-rmse:13.411011 
## [803]    train-rmse:13.392118 
## [804]    train-rmse:13.367394 
## [805]    train-rmse:13.336001 
## [806]    train-rmse:13.329598 
## [807]    train-rmse:13.321353 
## [808]    train-rmse:13.301143 
## [809]    train-rmse:13.270342 
## [810]    train-rmse:13.254598 
## [811]    train-rmse:13.238776 
## [812]    train-rmse:13.236127 
## [813]    train-rmse:13.220731 
## [814]    train-rmse:13.186589 
## [815]    train-rmse:13.178106 
## [816]    train-rmse:13.153481 
## [817]    train-rmse:13.145029 
## [818]    train-rmse:13.126774 
## [819]    train-rmse:13.123585 
## [820]    train-rmse:13.112149 
## [821]    train-rmse:13.094769 
## [822]    train-rmse:13.076028 
## [823]    train-rmse:13.052035 
## [824]    train-rmse:13.028739 
## [825]    train-rmse:13.009368 
## [826]    train-rmse:12.989189 
## [827]    train-rmse:12.961648 
## [828]    train-rmse:12.934165 
## [829]    train-rmse:12.930132 
## [830]    train-rmse:12.900447 
## [831]    train-rmse:12.877541 
## [832]    train-rmse:12.861519 
## [833]    train-rmse:12.844975 
## [834]    train-rmse:12.828941 
## [835]    train-rmse:12.810176 
## [836]    train-rmse:12.795580 
## [837]    train-rmse:12.784566 
## [838]    train-rmse:12.777915 
## [839]    train-rmse:12.759410 
## [840]    train-rmse:12.740017 
## [841]    train-rmse:12.724186 
## [842]    train-rmse:12.698478 
## [843]    train-rmse:12.685497 
## [844]    train-rmse:12.670111 
## [845]    train-rmse:12.648413 
## [846]    train-rmse:12.632146 
## [847]    train-rmse:12.626238 
## [848]    train-rmse:12.612488 
## [849]    train-rmse:12.596382 
## [850]    train-rmse:12.589997 
## [851]    train-rmse:12.567663 
## [852]    train-rmse:12.557281 
## [853]    train-rmse:12.549822 
## [854]    train-rmse:12.538160 
## [855]    train-rmse:12.520384 
## [856]    train-rmse:12.503511 
## [857]    train-rmse:12.486202 
## [858]    train-rmse:12.470869 
## [859]    train-rmse:12.456008 
## [860]    train-rmse:12.439756 
## [861]    train-rmse:12.431273 
## [862]    train-rmse:12.411897 
## [863]    train-rmse:12.398261 
## [864]    train-rmse:12.383949 
## [865]    train-rmse:12.373139 
## [866]    train-rmse:12.359123 
## [867]    train-rmse:12.333596 
## [868]    train-rmse:12.325717 
## [869]    train-rmse:12.303155 
## [870]    train-rmse:12.294075 
## [871]    train-rmse:12.270174 
## [872]    train-rmse:12.249576 
## [873]    train-rmse:12.229534 
## [874]    train-rmse:12.210006 
## [875]    train-rmse:12.197171 
## [876]    train-rmse:12.181373 
## [877]    train-rmse:12.146781 
## [878]    train-rmse:12.127005 
## [879]    train-rmse:12.100841 
## [880]    train-rmse:12.095294 
## [881]    train-rmse:12.091243 
## [882]    train-rmse:12.076371 
## [883]    train-rmse:12.044545 
## [884]    train-rmse:12.028147 
## [885]    train-rmse:12.019438 
## [886]    train-rmse:11.975925 
## [887]    train-rmse:11.951439 
## [888]    train-rmse:11.936717 
## [889]    train-rmse:11.912315 
## [890]    train-rmse:11.900966 
## [891]    train-rmse:11.884664 
## [892]    train-rmse:11.881042 
## [893]    train-rmse:11.862095 
## [894]    train-rmse:11.836684 
## [895]    train-rmse:11.816846 
## [896]    train-rmse:11.794146 
## [897]    train-rmse:11.779626 
## [898]    train-rmse:11.758170 
## [899]    train-rmse:11.746074 
## [900]    train-rmse:11.737928 
## [901]    train-rmse:11.730100 
## [902]    train-rmse:11.713028 
## [903]    train-rmse:11.687598 
## [904]    train-rmse:11.682160 
## [905]    train-rmse:11.679797 
## [906]    train-rmse:11.674432 
## [907]    train-rmse:11.661834 
## [908]    train-rmse:11.652422 
## [909]    train-rmse:11.640337 
## [910]    train-rmse:11.613675 
## [911]    train-rmse:11.594350 
## [912]    train-rmse:11.581705 
## [913]    train-rmse:11.577305 
## [914]    train-rmse:11.573464 
## [915]    train-rmse:11.558127 
## [916]    train-rmse:11.534079 
## [917]    train-rmse:11.532574 
## [918]    train-rmse:11.524519 
## [919]    train-rmse:11.522001 
## [920]    train-rmse:11.505000 
## [921]    train-rmse:11.481199 
## [922]    train-rmse:11.470504 
## [923]    train-rmse:11.463135 
## [924]    train-rmse:11.435613 
## [925]    train-rmse:11.409039 
## [926]    train-rmse:11.398809 
## [927]    train-rmse:11.393347 
## [928]    train-rmse:11.389149 
## [929]    train-rmse:11.361991 
## [930]    train-rmse:11.339457 
## [931]    train-rmse:11.335543 
## [932]    train-rmse:11.325470 
## [933]    train-rmse:11.312441 
## [934]    train-rmse:11.292081 
## [935]    train-rmse:11.279174 
## [936]    train-rmse:11.272889 
## [937]    train-rmse:11.245610 
## [938]    train-rmse:11.224012 
## [939]    train-rmse:11.208029 
## [940]    train-rmse:11.185486 
## [941]    train-rmse:11.180534 
## [942]    train-rmse:11.154763 
## [943]    train-rmse:11.128814 
## [944]    train-rmse:11.106631 
## [945]    train-rmse:11.095654 
## [946]    train-rmse:11.089162 
## [947]    train-rmse:11.081049 
## [948]    train-rmse:11.061388 
## [949]    train-rmse:11.038811 
## [950]    train-rmse:11.030049 
## [951]    train-rmse:11.008006 
## [952]    train-rmse:10.990747 
## [953]    train-rmse:10.974774 
## [954]    train-rmse:10.961842 
## [955]    train-rmse:10.939386 
## [956]    train-rmse:10.920688 
## [957]    train-rmse:10.911339 
## [958]    train-rmse:10.884014 
## [959]    train-rmse:10.874655 
## [960]    train-rmse:10.867826 
## [961]    train-rmse:10.854007 
## [962]    train-rmse:10.846981 
## [963]    train-rmse:10.831668 
## [964]    train-rmse:10.821045 
## [965]    train-rmse:10.803778 
## [966]    train-rmse:10.798476 
## [967]    train-rmse:10.787047 
## [968]    train-rmse:10.781598 
## [969]    train-rmse:10.776709 
## [970]    train-rmse:10.774319 
## [971]    train-rmse:10.757354 
## [972]    train-rmse:10.741351 
## [973]    train-rmse:10.713675 
## [974]    train-rmse:10.709509 
## [975]    train-rmse:10.697376 
## [976]    train-rmse:10.685366 
## [977]    train-rmse:10.678352 
## [978]    train-rmse:10.643090 
## [979]    train-rmse:10.624257 
## [980]    train-rmse:10.618712 
## [981]    train-rmse:10.589347 
## [982]    train-rmse:10.572327 
## [983]    train-rmse:10.554932 
## [984]    train-rmse:10.537639 
## [985]    train-rmse:10.525962 
## [986]    train-rmse:10.522713 
## [987]    train-rmse:10.504566 
## [988]    train-rmse:10.492161 
## [989]    train-rmse:10.481705 
## [990]    train-rmse:10.474273 
## [991]    train-rmse:10.464209 
## [992]    train-rmse:10.437856 
## [993]    train-rmse:10.425072 
## [994]    train-rmse:10.419147 
## [995]    train-rmse:10.405394 
## [996]    train-rmse:10.392239 
## [997]    train-rmse:10.380056 
## [998]    train-rmse:10.360891 
## [999]    train-rmse:10.346622 
## [1000]   train-rmse:10.332669
bst
## ##### xgb.Booster
## raw: 2.4 Mb 
## call:
##   xgb.train(params = params, data = dtrain, nrounds = nrounds, 
##     watchlist = watchlist, verbose = verbose, print_every_n = print_every_n, 
##     early_stopping_rounds = early_stopping_rounds, maximize = maximize, 
##     save_period = save_period, save_name = save_name, xgb_model = xgb_model, 
##     callbacks = callbacks, max.depth = 5, eta = 1)
## params (as set within xgb.train):
##   max_depth = "5", eta = "1", validate_parameters = "1"
## xgb.attributes:
##   niter
## callbacks:
##   cb.print.evaluation(period = print_every_n)
##   cb.evaluation.log()
## # of features: 10 
## niter: 1000
## nfeatures : 10 
## evaluation_log:
##     iter train_rmse
##        1  109.10343
##        2   93.56724
## ---                
##      999   10.34662
##     1000   10.33267
#Generate prediction on the test dataset with the model.
preds <- predict(bst, dtest)
#preds

#Calculate RMSE
err <- preds-airbnb_test_numeric$price
rmse <- sqrt(sum(err)^2/nrow(airbnb_test_numeric))

#Create a dataframe with the test_id, the actual price, and the predicted price. 
result_test <- data.frame(ID = test_id, 
                     price = airbnb_test_numeric$price,
                    predictions = preds)

#Print the results 
print(paste("test-error=", rmse))
## [1] "test-error= 46.8718864320688"
print(paste0('Test R^2: ' ,
             caret::postResample(result_test$predictions , result_test$price)['Rsquared']))
## [1] "Test R^2: 0.811956486991979"
#On Test Data Change to Pivot longer.
melt_pred_test <- melt(head(result_test,100), id.vars = "ID")

ggplot(melt_pred_test, 
       aes(y = value, 
           x = ID,
           colour = variable)) +
      geom_point() +
      geom_line() +
  ggtitle("Actual vs Predicted for Test Data")

Model 5: Leaps and Lars Model

Function to build second order matrix for x variables

#Builds second order terms for x values.
matrix.2ndorder.make<-function(x, only.quad=F){
  x0<-x
  dimn<-dimnames(x)[[2]] #extract the names of the variables
  num.col<-length(x[1,]) # how many columns
  for(i in 1:num.col){
    # if we are doing all 2nd order
    if(!only.quad){
      for(j in i:num.col){
        x0<-cbind(x0,x[,i]*x[,j])
        dimn<-c(dimn,paste(dimn[i],dimn[j],sep=""))
        #create interaction dimension names

      }
    }
    else{
        #in here only if doing only squared terms
        x0<-cbind(x0,x[,i]*x[,i])
        dimn<-c(dimn,paste(dimn[i],"2",sep="")) # squared dimension names
    }
  }
  dimnames(x0)[[2]]<-dimn
  x0
}

A Leaps automatic model selector using Cp, and PRESS

Best k models using Cp and from those k calculate best PRESS

Leaps then Press

#names function, takes two variables
regpluspress<-function(x,y){
 str<-lsfit(x,y) #Saves lsfit output to str.
 press<-sum((str$resid/(1-hat(x)))^2)
str$press<-press #Saves PRESS statistic to str
str #Calls str
}
#Takes parameters
leaps.then.press<-function(xmat,yvec,ncheck=10,print.ls=F)
{
    leaps.str<-leaps(xmat,yvec) #Runs through leaps and saves output to leaps.str
    z1<-leaps.str$Cp #extract Cp
    o1<-order(z1) #Order it
    matwhich<-(leaps.str$which[o1,])[1:ncheck,] #pullout ncheck best models with respect to Cp.
    z2<-z1[o1][1:ncheck] #Saves lowest cp values to z2
    pressvec<-NULL
    for(i in 1:ncheck){ #A for loop from 1 to number of checks
      ls.str0<-regpluspress(xmat[,matwhich[i,]],yvec) #Saves PRESS statistics from lowest Cp list 
      if(print.ls){#If called, then it will print ls.str0
          ls.print(ls.str0)
      }
      print(i) #Prints iteration
      print(paste("Press=",ls.str0$press)) #Prints PRESS stat
      parvec<-matwhich[i,] #Saves matrix row to vector
      npar<-sum(parvec) #Sums vector values to npar (sums 1 and 0 values) 
      print(paste("MPSE=",ls.str0$press/(length(yvec)-(npar+1)))) #Prints the mean squared prediction error
      print(paste("Cp=",z2[i])) #Prints Cp value
      pressvec<-c(pressvec, ls.str0$press)
      if(i==1){
        Xmat<-(xmat)[,leaps.str$which[o1[1],]]
        coef1<-lsfit(Xmat,yvec)$coef
        print("coef1=")
        print(coef1)
        leaps.pred<-Xmat%*%coef1[-1]+coef1[1]
        plot(leaps.pred,yvec)
        print("Correlation between leaps prediction and actual yvec")
        print(cor(leaps.pred,yvec))
      }
    }
    o2<-order(pressvec) #Output results
    print("which matrix:")
    matwhich[o2,] #model indicators sorted from best press to worst in top ncheck Cp
    print("matwhich 1")
    matwhich[o2[1]]
        
}

And a lars automatic model selector using both Cp and cross validation MSE

lars selection program

#Function to generate sum of the absolute values of a vector.
sumabs<-function(v1)
{sum(abs(v1))}

#lars select function. This is a lars automatic model selector using both Cp and cross validation MSE
lars.select<-
function(xmat,y,ncheck=10,int=F)
{
        lasso.str<-lars(xmat,y,intercept=int) #Calls lars, saves output to lasso.str
        #plot(lasso.str)
        #print(xmat)
        cv.str<-cv.lars(xmat,y,plot.it=F,intercept=int) #Calculates cross-validated error curve for lars
        o1<-order(cv.str$cv) #Orders cv values from lowest to highest
        mindex<-cv.str$index[o1][1] #Index of cv values ordered
        beta<-coef(lasso.str) #Saves coefficients from lasso.str to beta
        index0<-apply(beta,1,sumabs) #Iterates through rows, sums absolute values of beta (sum of squares)
        index0<-index0/max(index0) #Sums of beta divided by max sum of beta, percentage (0 to 1)
        o1<-order(abs(index0-mindex)) #Orders values subtracted by the minimum cv value
        I1<-(abs(index0-mindex)==min(abs(index0-mindex)))#If absolute value of index-mindex is the min, add to I1
        n1<-length(beta[,1]) #Saves int, length of column of coefficients
        beta.out<-beta[I1,] #Beta.out is minimum rows of coefficients
        if(sum(abs(beta.out))==0){ #If all values add to zero, then sort list by the Cp value
                v1<-lasso.str$Cp
                o2<-order(v1)
                beta.out<-beta[o1[1:ncheck],]
        }
        Ind.out<-beta.out!=0 #Saves Ind.out as values of beta.out that don't equal 0
        outlist<-list(beta.out=beta.out,ind.out=Ind.out)#Saves list of beta.out, ind.out values
        if(int){#finds y-intercept values.
                Int.out1<-mean(y)-mean(xmat%*%beta.out[i]) #mean of y-value minus mean of x-matrix times beta coefficients
                outlist<-list(beta.out=beta.out,ind.out=Ind.out,int.out=Int.out1)
        }       
        outlist #Returns outlist to end function
}

Combine leaps and lars in a single function

#Combine function calling leaps.then.press and lars.select
combined.leaps.lars<-function(both = F,leaps = F, lars = F,xmat,yvec,ncheck=10,int=F)
{ 
 #if(both==TRUE){par(mfrow=c(2,1))}else{par(mfrow=c(1,1))}
 if(both){
    leaps.output<-leaps.then.press(xmat,yvec,ncheck,int)
    lars.output<-lars.select(xmat,yvec)
    plot(xmat%*%lars.output$beta.out,yvec) #Actual Price vs predicted Price
    print("Correlation between predicted Price and actual Price Lars")
    print(cor(xmat%*%lars.output$beta.out,yvec)) #correlation
    #Combine leaps and lars output in a list 
    lars.leaps.output<-list(leaps.output=leaps.output,lars.output=lars.output)
    #Return the output
    lars.leaps.output
 }
  else if(leaps){
    leaps.output<-leaps.then.press(xmat,yvec,ncheck,int)
    leaps.output
  }
  else if(lars){
    lars.output<-lars.select(xmat,yvec)
    plot(xmat%*%lars.output$beta.out,yvec) #Actual Price vs predicted Price
    print("Correlation between predicted Price and actual Price Lars")
    print(cor(xmat%*%lars.output$beta.out,yvec)) #correlation
    lars.output
  }
  

}

Build second order matrix

#Creating a matrix from the data
airbnb_numeric_mat<-as.matrix(airbnb_numeric)
#Output the first 5 rows
airbnb_numeric_mat[1:5,]
##        id latitude longitude price minimum_nights number_of_reviews
## [1,] 2539 40.64749 -73.97237   149              1                 9
## [2,] 2595 40.75362 -73.98377   225              1                45
## [3,] 3647 40.80902 -73.94190    60              3                 0
## [4,] 3831 40.68514 -73.95976    45              1               270
## [5,] 5022 40.79851 -73.94399    80             10                 9
##      reviews_per_month floor noise(dB) neighbourhood_group_numeric
## [1,]              0.21     1  69.05646                           2
## [2,]              0.38     1  56.05428                           3
## [3,]              0.00     1  56.05428                           3
## [4,]              4.64     1  69.05646                           2
## [5,]              0.10     1  56.05428                           3
##      room_type_numeric neighbourhood_numeric  lnprice lnmin_nights
## [1,]                 2                   109 5.003946     0.000000
## [2,]                 1                   128 5.416100     0.000000
## [3,]                 2                    95 4.094345     1.098612
## [4,]                 1                    42 3.806662     0.000000
## [5,]                 1                    62 4.382027     2.302585
#Linear fit
ls.print(lsfit(airbnb_numeric_mat[,c(-3,-11)],airbnb_numeric_mat[,3]))
## Residual Standard Error=0.0311
## R-Square=0.5473
## F-statistic (df=12, 39082)=3937.735
## p-value=0
## 
##                             Estimate Std.Err   t-value Pr(>|t|)
## Intercept                   -81.8806  0.1933 -423.5303   0.0000
## id                            0.0000  0.0000    5.2865   0.0000
## latitude                      0.1878  0.0046   40.7247   0.0000
## price                         0.0000  0.0000  -23.1563   0.0000
## minimum_nights                0.0000  0.0000    0.3729   0.7092
## number_of_reviews             0.0000  0.0000   -0.7983   0.4247
## reviews_per_month             0.0014  0.0001    9.7270   0.0000
## floor                         0.0135  0.0003   45.7510   0.0000
## noise(dB)                     0.0038  0.0001   41.6813   0.0000
## neighbourhood_group_numeric   0.0168  0.0008   21.1657   0.0000
## neighbourhood_numeric         0.0000  0.0000  -16.3249   0.0000
## lnprice                      -0.0044  0.0003  -14.4619   0.0000
## lnmin_nights                 -0.0013  0.0002   -6.3516   0.0000
# From the linear fit we can see that  reviews_per_month is not a good predictor because p-value is greater than 0.05.


#Make second order matrix of the x values but drop id, price,lnprice,latitude, longitude,lnmin_nights, reviews_per_month,neighborhood
airbnb_numeric_mat2nd<-matrix.2ndorder.make(airbnb_numeric_mat[,c(-1,-2,-3,-4,-7,-13,-14,-12)])#Second order matrix for x variables
airbnb_numeric_mat2nd[1:5,]
##      minimum_nights number_of_reviews floor noise(dB)
## [1,]              1                 9     1  69.05646
## [2,]              1                45     1  56.05428
## [3,]              3                 0     1  56.05428
## [4,]              1               270     1  69.05646
## [5,]             10                 9     1  56.05428
##      neighbourhood_group_numeric room_type_numeric minimum_nightsminimum_nights
## [1,]                           2                 2                            1
## [2,]                           3                 1                            1
## [3,]                           3                 2                            9
## [4,]                           2                 1                            1
## [5,]                           3                 1                          100
##      minimum_nightsnumber_of_reviews minimum_nightsfloor
## [1,]                               9                   1
## [2,]                              45                   1
## [3,]                               0                   3
## [4,]                             270                   1
## [5,]                              90                  10
##      minimum_nightsnoise(dB) minimum_nightsneighbourhood_group_numeric
## [1,]                69.05646                                         2
## [2,]                56.05428                                         3
## [3,]               168.16283                                         9
## [4,]                69.05646                                         2
## [5,]               560.54278                                        30
##      minimum_nightsroom_type_numeric number_of_reviewsnumber_of_reviews
## [1,]                               2                                 81
## [2,]                               1                               2025
## [3,]                               6                                  0
## [4,]                               1                              72900
## [5,]                              10                                 81
##      number_of_reviewsfloor number_of_reviewsnoise(dB)
## [1,]                      9                   621.5082
## [2,]                     45                  2522.4425
## [3,]                      0                     0.0000
## [4,]                    270                 18645.2454
## [5,]                      9                   504.4885
##      number_of_reviewsneighbourhood_group_numeric
## [1,]                                           18
## [2,]                                          135
## [3,]                                            0
## [4,]                                          540
## [5,]                                           27
##      number_of_reviewsroom_type_numeric floorfloor floornoise(dB)
## [1,]                                 18          1       69.05646
## [2,]                                 45          1       56.05428
## [3,]                                  0          1       56.05428
## [4,]                                270          1       69.05646
## [5,]                                  9          1       56.05428
##      floorneighbourhood_group_numeric floorroom_type_numeric noise(dB)noise(dB)
## [1,]                                2                      2           4768.795
## [2,]                                3                      1           3142.082
## [3,]                                3                      2           3142.082
## [4,]                                2                      1           4768.795
## [5,]                                3                      1           3142.082
##      noise(dB)neighbourhood_group_numeric noise(dB)room_type_numeric
## [1,]                             138.1129                  138.11293
## [2,]                             168.1628                   56.05428
## [3,]                             168.1628                  112.10856
## [4,]                             138.1129                   69.05646
## [5,]                             168.1628                   56.05428
##      neighbourhood_group_numericneighbourhood_group_numeric
## [1,]                                                      4
## [2,]                                                      9
## [3,]                                                      9
## [4,]                                                      4
## [5,]                                                      9
##      neighbourhood_group_numericroom_type_numeric
## [1,]                                            4
## [2,]                                            3
## [3,]                                            6
## [4,]                                            2
## [5,]                                            3
##      room_type_numericroom_type_numeric
## [1,]                                  4
## [2,]                                  1
## [3,]                                  4
## [4,]                                  1
## [5,]                                  1
#dim(airbnb_numeric_mat2nd)

Call leaps and lars

#Run combined.leap.lars function for full
results.leaps.lars<-combined.leaps.lars(both = F,leaps = T,lars = F,airbnb_numeric_mat2nd,airbnb_numeric_mat[,4])
## [1] 1
## [1] "Press= 471881742.366937"
## [1] "MPSE= 12077.5445309037"
## [1] "Cp= 22.3289521682236"
## [1] "coef1="
##                                              Intercept 
##                                          -8.238058e+04 
##                                      number_of_reviews 
##                                          -1.883703e+00 
##                                                  floor 
##                                           4.023336e+03 
##                                              noise(dB) 
##                                           1.178333e+03 
##                            neighbourhood_group_numeric 
##                                           2.470269e+04 
##                                      room_type_numeric 
##                                          -2.674480e+02 
##                           minimum_nightsminimum_nights 
##                                           8.917956e-04 
##                                    minimum_nightsfloor 
##                                           1.506043e-01 
##                                minimum_nightsnoise(dB) 
##                                          -4.008688e-03 
##                        minimum_nightsroom_type_numeric 
##                                          -1.659697e-01 
##                     number_of_reviewsnumber_of_reviews 
##                                           3.037553e-04 
##                                 number_of_reviewsfloor 
##                                          -1.163264e-01 
##                             number_of_reviewsnoise(dB) 
##                                           2.168555e-02 
##           number_of_reviewsneighbourhood_group_numeric 
##                                           2.229159e-01 
##                                             floorfloor 
##                                           8.710118e+01 
##                                         floornoise(dB) 
##                                          -4.818533e+01 
##                       floorneighbourhood_group_numeric 
##                                          -4.688911e+02 
##                                 floorroom_type_numeric 
##                                          -5.300624e+00 
##                                     noise(dB)noise(dB) 
##                                          -3.027349e+00 
##                   noise(dB)neighbourhood_group_numeric 
##                                          -1.928135e+02 
##                             noise(dB)room_type_numeric 
##                                           1.282519e+00 
## neighbourhood_group_numericneighbourhood_group_numeric 
##                                          -1.726516e+03 
##           neighbourhood_group_numericroom_type_numeric 
##                                           1.183899e+01 
##                     room_type_numericroom_type_numeric 
##                                           3.001968e+01

## [1] "Correlation between leaps prediction and actual yvec"
##           [,1]
## [1,] 0.8754645
## [1] 2
## [1] "Press= 471995683.595577"
## [1] "MPSE= 12080.4607917785"
## [1] "Cp= 23.3641633402876"
## [1] 3
## [1] "Press= 471860530.489849"
## [1] "MPSE= 12077.3107368787"
## [1] "Cp= 23.6018489151684"
## [1] 4
## [1] "Press= 471873177.429209"
## [1] "MPSE= 12077.634436376"
## [1] "Cp= 23.6773073182849"
## [1] 5
## [1] "Press= 472174338.347263"
## [1] "MPSE= 12085.3426758962"
## [1] "Cp= 24.2468334405494"
## [1] 6
## [1] "Press= 472142437.090343"
## [1] "MPSE= 12084.52616049"
## [1] "Cp= 24.3257406269622"
## [1] 7
## [1] "Press= 471968785.095022"
## [1] "MPSE= 12080.0815227802"
## [1] "Cp= 24.5205753269547"
## [1] 8
## [1] "Press= 471990323.009525"
## [1] "MPSE= 12080.6327875486"
## [1] "Cp= 24.7096010655805"
## [1] 9
## [1] "Press= 471857994.35842"
## [1] "MPSE= 12077.5549504318"
## [1] "Cp= 25.0574108303845"
## [1] 10
## [1] "Press= 472233685.335273"
## [1] "MPSE= 12086.8616671429"
## [1] "Cp= 25.1560561789447"
## [1] "which matrix:"
## [1] "matwhich 1"
#results.leaps.lars

#Run combined.leap.lars function for full 
results.leaps.lars<-combined.leaps.lars(both = F,leaps = F,lars = T,airbnb_numeric_mat2nd,airbnb_numeric_mat[,4])

## [1] "Correlation between predicted Price and actual Price Lars"
##           [,1]
## [1,] 0.8458776
#results.leaps.lars

#airbnb_numeric_mat

Build the linear model on the output of leaps variable selection. Run predictions on the test data and write to a .csv file.

linear.airbnb<-lm(price ~ minimum_nights +floor + `noise(dB)` + neighbourhood_group_numeric +  room_type_numeric + I(minimum_nights^2) + minimum_nights*floor +  minimum_nights*`noise(dB)` + minimum_nights*neighbourhood_group_numeric + minimum_nights*room_type_numeric + number_of_reviews*number_of_reviews + number_of_reviews*floor + number_of_reviews*neighbourhood_group_numeric + number_of_reviews*room_type_numeric + I(floor^2) + floor*`noise(dB)` + floor*neighbourhood_group_numeric + floor*room_type_numeric + I(`noise(dB)`^2) + `noise(dB)`*neighbourhood_group_numeric + `noise(dB)`*room_type_numeric + I(neighbourhood_group_numeric^2) + neighbourhood_group_numeric*room_type_numeric +  I(room_type_numeric^2), data = airbnb_train_numeric)


get_regression_summaries(linear.airbnb)
## # A tibble: 1 × 9
##   r_squared adj_r_squared   mse  rmse sigma statistic p_value    df  nobs
##       <dbl>         <dbl> <dbl> <dbl> <dbl>     <dbl>   <dbl> <dbl> <dbl>
## 1     0.818         0.818 6393.  80.0  80.0     5102.       0    24 27220
get_regression_table(linear.airbnb)
## # A tibble: 25 × 7
##    term                   estimate std_error statistic p_value lower_ci upper_ci
##    <chr>                     <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
##  1 intercept              -1.06e+5  1612.      -65.8     0     -1.09e+5 -1.03e+5
##  2 minimum_nights          1.28e+0     0.886     1.45    0.148 -4.56e-1  3.02e+0
##  3 floor                   3.70e+3    60.7      60.9     0      3.58e+3  3.82e+3
##  4 `noise(dB)`             1.63e+3    35.8      45.6     0      1.56e+3  1.70e+3
##  5 neighbourhood_group_n…  3.08e+4   387.       79.7     0      3.01e+4  3.16e+4
##  6 room_type_numeric      -3.01e+2    29.7     -10.1     0     -3.59e+2 -2.42e+2
##  7 I(minimum_nights^2)     1   e-3     0         5.18    0      1   e-3  1   e-3
##  8 number_of_reviews      -1.7 e-2     0.051    -0.343   0.731 -1.17e-1  8.2 e-2
##  9 I(floor^2)              8.19e+1     0.672   122.      0      8.05e+1  8.32e+1
## 10 I(`noise(dB)`^2)       -5.15e+0     0.196   -26.3     0     -5.53e+0 -4.76e+0
## # ℹ 15 more rows
points<-get_regression_points(linear.airbnb)
ggplot(data=points, aes(x=residual)) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=points, aes(x=price_hat, y=residual)) + geom_point()

points %>%
  ggplot(aes(sample = residual)) +
  stat_qq() +
  stat_qq_line()

airbnb.predictions <- predict(linear.airbnb,airbnb_test)
#write.csv(airbnb.predictions,file = "C:/Users/anish/Documents/predicitions_airbnb.csv")

Build the linear model on the output of lars variable selection. Run predictions on the test data and write to a .csv file.

linear.airbnb<-lm(price ~ minimum_nights + number_of_reviews + floor + `noise(dB)` + neighbourhood_group_numeric + room_type_numeric + I(minimum_nights^2) + minimum_nights*number_of_reviews + minimum_nights*floor + minimum_nights*`noise(dB)` + minimum_nights*neighbourhood_group_numeric +  minimum_nights*room_type_numeric + I(number_of_reviews^2) + number_of_reviews*floor + number_of_reviews*`noise(dB)` +  number_of_reviews*neighbourhood_group_numeric + number_of_reviews*room_type_numeric + I(floor^2) + floor*`noise(dB)` + floor*neighbourhood_group_numeric + floor*room_type_numeric + I(`noise(dB)`^2) + `noise(dB)`*room_type_numeric + I(neighbourhood_group_numeric^2) + neighbourhood_group_numeric*room_type_numeric + I(room_type_numeric^2), data = airbnb_train_numeric)

get_regression_summaries(linear.airbnb)
## # A tibble: 1 × 9
##   r_squared adj_r_squared   mse  rmse sigma statistic p_value    df  nobs
##       <dbl>         <dbl> <dbl> <dbl> <dbl>     <dbl>   <dbl> <dbl> <dbl>
## 1     0.798         0.798 7110.  84.3  84.4     4129.       0    26 27220
get_regression_table(linear.airbnb)
## # A tibble: 27 × 7
##    term                   estimate std_error statistic p_value lower_ci upper_ci
##    <chr>                     <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
##  1 intercept              -1.75e+4   206.       -84.9    0     -1.79e+4 -1.71e+4
##  2 minimum_nights          1.75e+0     0.938      1.86   0.063 -9.3 e-2  3.58e+0
##  3 number_of_reviews      -8.97e-1     0.385     -2.33   0.02  -1.65e+0 -1.41e-1
##  4 floor                   6.68e+3    29.5      226.     0      6.62e+3  6.74e+3
##  5 `noise(dB)`            -3.25e+2     6.10     -53.2    0     -3.37e+2 -3.13e+2
##  6 neighbourhood_group_n…  9.54e+3    45.5      210.     0      9.46e+3  9.63e+3
##  7 room_type_numeric      -1.27e+2    31.2       -4.06   0     -1.88e+2 -6.55e+1
##  8 I(minimum_nights^2)     1   e-3     0          6.9    0      1   e-3  2   e-3
##  9 I(number_of_reviews^2)  0           0          2.94   0.003  0        0      
## 10 I(floor^2)              1.07e+2     0.529    201.     0      1.06e+2  1.08e+2
## # ℹ 17 more rows
points<-get_regression_points(linear.airbnb)
ggplot(data=points, aes(x=residual)) + geom_histogram(color="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=points, aes(x=price_hat, y=residual)) + geom_point()

points %>%
  ggplot(aes(sample = residual)) +
  stat_qq() +
  stat_qq_line()