Part 1: Exploratory Data Analysis (Beginner + Advanced Tracks)

1. Visualize available apartments

a) Load the calendar data set and get an overview of it

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(readr)
library(leaflet)
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(caTools)
calendar <- read_csv("/cloud/project/01_data/boston/calendar.csv")
## Rows: 1139165 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): price, adjusted_price
## dbl  (3): listing_id, minimum_nights, maximum_nights
## lgl  (1): available
## date (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# get an overview of the data (calendar)
str(calendar)
## spec_tbl_df [1,139,165 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ listing_id    : num [1:1139165] 3781 3781 3781 3781 1374434 ...
##  $ date          : Date[1:1139165], format: "2021-09-19" "2021-09-20" ...
##  $ available     : logi [1:1139165] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ price         : chr [1:1139165] "$125.00" "$125.00" "$125.00" "$125.00" ...
##  $ adjusted_price: chr [1:1139165] "$125.00" "$125.00" "$125.00" "$125.00" ...
##  $ minimum_nights: num [1:1139165] 32 32 32 32 28 28 28 28 28 28 ...
##  $ maximum_nights: num [1:1139165] 1125 1125 1125 1125 270 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   listing_id = col_double(),
##   ..   date = col_date(format = ""),
##   ..   available = col_logical(),
##   ..   price = col_character(),
##   ..   adjusted_price = col_character(),
##   ..   minimum_nights = col_double(),
##   ..   maximum_nights = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
summary(calendar)
##    listing_id            date            available          price          
##  Min.   :    3781   Min.   :2021-09-19   Mode :logical   Length:1139165    
##  1st Qu.:18194673   1st Qu.:2021-12-19   FALSE:578436    Class :character  
##  Median :35425876   Median :2022-03-20   TRUE :560729    Mode  :character  
##  Mean   :31911601   Mean   :2022-03-20                                     
##  3rd Qu.:47487652   3rd Qu.:2022-06-19                                     
##  Max.   :52324087   Max.   :2022-09-18                                     
##  adjusted_price     minimum_nights   maximum_nights     
##  Length:1139165     Min.   :   1.0   Min.   :1.000e+00  
##  Class :character   1st Qu.:   2.0   1st Qu.:5.000e+02  
##  Mode  :character   Median :  29.0   Median :1.125e+03  
##                     Mean   :  74.6   Mean   :4.129e+06  
##                     3rd Qu.:  91.0   3rd Qu.:1.125e+03  
##                     Max.   :1000.0   Max.   :2.147e+09
head(calendar)

b) Transform data in columns price, available, and date

# Transform price as numeric
calendar$price <- str_remove_all(calendar$price, "[$]")
calendar$price <- str_remove_all(calendar$price, "[,]")
calendar$price <- as.numeric(calendar$price)

# Transform available as character
calendar$available <- ifelse(calendar$available == TRUE, "t", "f")

# Transform date as Date
calendar$date <- as.Date(calendar$date)

c) Show the number of available Airbnb apartments in a simple line plot

# pre-set theme for upcoming visualisations
theme_set(theme_bw())

avail_by_date <- calendar %>%
                 group_by(date, available) %>% 
                 summarise(n = n()) %>% 
                 filter(available == "t")
## `summarise()` has grouped output by 'date'. You can override using the `.groups` argument.
avail_by_date$available <- NULL
                
ggplot(data = avail_by_date, aes(x = date, y = n)) +
       geom_line() +
       ggtitle("Airbnb Availability in Boston") +
       labs(x = "Date", y = "# of available Flats", 
            subtitle = "365 days outlook starting September 2021")

d) Create a calendar plot to visualize the availability of Airbnb listings

# create index of available Airbnb
# index in which the max value represents 1 and min value represents 0
# the rest is mapped relatively to max and min
avail_by_date$indexed_n <- (avail_by_date$n - min(avail_by_date$n))/ 
                            (max(avail_by_date$n) - min(avail_by_date$n))
  
days <- avail_by_date$indexed_n
library(calendR)
## ~~ Package calendR
## Visit https://r-coder.com/ for R tutorials ~~
calendR(start_date = "2021-09-19",
        end_date = "2022-09-18",
        title = "Yearly Availability Index Of All Listings",
        special.days = days,
        special.col = "#00AAAE",
        gradient = TRUE,
        legend.title = "Availability Index",
        legend.pos = "right",
        weeknames = c("M", "T", "W", "T", "F", "S", "S"))

2. Correlation between price and availability

a) Calculate the Correlation Coefficient between available and price

# get the average Price per day, since we have the available flats per day
# therefore create new df with avail and price per day
avail_price_by_date <- calendar %>% 
                       group_by(date) %>% 
                       filter(available == "t") %>% 
                       summarise(price = mean(price), available = n())

# calculate correlation between available and price
cor(avail_price_by_date$price, avail_price_by_date$available)
## [1] -0.6206325
# the correlation coefficient between the 2 variables is approx. -0.62
# the correlation is negative!
# if the price is high, the availability related to it is in tendency lower
# however correlation is not causation
# one cannot tell which leads to which

b) Visualize Correlation

ggplot(data = avail_price_by_date, aes(x = price, y = available)) +
  geom_point() +
  ggtitle("Prices against Availability Airbnb") +
  labs(x = "avg price per day", y = "avg available listings per day", 
       subtitle = "Negative Correlation between Price and Available Airbnb Listings in Boston") +
  annotate(geom = "text", x = 250, y = 1700, label = "correlation = -0.62")

3. Distribution of Prices

# load listings and get an overview of the dataset
listings <- read_csv("/cloud/project/01_data/boston/listings.csv")
## Rows: 3123 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (5): price, neighbourhood_cleansed, property_type, room_type, host_acc...
## dbl  (11): id, latitude, longitude, availability_30, beds, bedrooms, review_...
## lgl   (3): neighbourhood_group_cleansed, bathrooms, host_is_superhost
## date  (1): host_since
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(listings)
summary(listings)
##        id              price           neighbourhood_cleansed
##  Min.   :    3781   Length:3123        Length:3123           
##  1st Qu.:18197318   Class :character   Class :character      
##  Median :35467463   Mode  :character   Mode  :character      
##  Mean   :31919334                                            
##  3rd Qu.:47464676                                            
##  Max.   :52324087                                            
##                                                              
##  neighbourhood_group_cleansed    latitude       longitude      bathrooms     
##  Mode:logical                 Min.   :42.24   Min.   :-71.17   Mode:logical  
##  NA's:3123                    1st Qu.:42.32   1st Qu.:-71.10   NA's:3123     
##                               Median :42.34   Median :-71.07                 
##                               Mean   :42.34   Mean   :-71.08                 
##                               3rd Qu.:42.35   3rd Qu.:-71.06                 
##                               Max.   :42.40   Max.   :-70.99                 
##                                                                              
##  availability_30       beds           bedrooms      review_scores_value
##  Min.   : 0.000   Min.   : 0.000   Min.   : 1.000   Min.   :1.000      
##  1st Qu.: 0.000   1st Qu.: 1.000   1st Qu.: 1.000   1st Qu.:4.500      
##  Median : 3.000   Median : 1.000   Median : 1.000   Median :4.725      
##  Mean   : 8.508   Mean   : 1.658   Mean   : 1.464   Mean   :4.624      
##  3rd Qu.:15.000   3rd Qu.: 2.000   3rd Qu.: 2.000   3rd Qu.:4.880      
##  Max.   :30.000   Max.   :22.000   Max.   :13.000   Max.   :5.000      
##                   NA's   :110      NA's   :440      NA's   :861        
##  minimum_nights    maximum_nights   availability_365 property_type     
##  Min.   :   1.00   Min.   :   1.0   Min.   :  0.0    Length:3123       
##  1st Qu.:   2.00   1st Qu.: 365.0   1st Qu.: 50.5    Class :character  
##  Median :  29.00   Median :1125.0   Median :160.0    Mode  :character  
##  Mean   :  39.33   Mean   : 756.6   Mean   :179.1                      
##  3rd Qu.:  91.00   3rd Qu.:1125.0   3rd Qu.:321.0                      
##  Max.   :1000.00   Max.   :9999.0   Max.   :365.0                      
##                                                                        
##   room_type           host_since         host_is_superhost host_acceptance_rate
##  Length:3123        Min.   :2008-12-03   Mode :logical     Length:3123         
##  Class :character   1st Qu.:2014-07-15   FALSE:2320        Class :character    
##  Mode  :character   Median :2016-05-16   TRUE :803         Mode  :character    
##                     Mean   :2016-05-24                                         
##                     3rd Qu.:2018-10-03                                         
##                     Max.   :2021-09-04                                         
##                                                                                
##  calculated_host_listings_count
##  Min.   :  1.00                
##  1st Qu.:  1.00                
##  Median :  5.00                
##  Mean   : 23.43                
##  3rd Qu.: 26.00                
##  Max.   :168.00                
## 
str(listings)
## spec_tbl_df [3,123 × 20] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ id                            : num [1:3123] 3781 5506 6695 8789 10730 ...
##  $ price                         : chr [1:3123] "$125.00" "$124.00" "$169.00" "$110.00" ...
##  $ neighbourhood_cleansed        : chr [1:3123] "East Boston" "Roxbury" "Roxbury" "Downtown" ...
##  $ neighbourhood_group_cleansed  : logi [1:3123] NA NA NA NA NA NA ...
##  $ latitude                      : num [1:3123] 42.4 42.3 42.3 42.4 42.4 ...
##  $ longitude                     : num [1:3123] -71 -71.1 -71.1 -71.1 -71.1 ...
##  $ bathrooms                     : logi [1:3123] NA NA NA NA NA NA ...
##  $ availability_30               : num [1:3123] 3 9 5 0 0 14 0 0 30 0 ...
##  $ beds                          : num [1:3123] 0 1 0 1 1 1 1 0 0 3 ...
##  $ bedrooms                      : num [1:3123] 1 1 NA 1 1 NA NA NA 1 3 ...
##  $ review_scores_value           : num [1:3123] 4.9 4.77 4.7 4.56 4.43 4.75 NA NA NA 4.47 ...
##  $ minimum_nights                : num [1:3123] 32 3 3 91 91 29 33 91 91 30 ...
##  $ maximum_nights                : num [1:3123] 1125 90 730 365 365 ...
##  $ availability_365              : num [1:3123] 153 46 62 310 277 329 323 0 365 274 ...
##  $ property_type                 : chr [1:3123] "Entire rental unit" "Entire guest suite" "Entire condominium (condo)" "Entire rental unit" ...
##  $ room_type                     : chr [1:3123] "Entire home/apt" "Entire home/apt" "Entire home/apt" "Entire home/apt" ...
##  $ host_since                    : Date[1:3123], format: "2008-12-03" "2009-02-19" ...
##  $ host_is_superhost             : logi [1:3123] FALSE TRUE TRUE TRUE TRUE TRUE ...
##  $ host_acceptance_rate          : chr [1:3123] NA "96%" "96%" "60%" ...
##  $ calculated_host_listings_count: num [1:3123] 1 10 10 5 5 11 11 2 1 27 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_double(),
##   ..   price = col_character(),
##   ..   neighbourhood_cleansed = col_character(),
##   ..   neighbourhood_group_cleansed = col_logical(),
##   ..   latitude = col_double(),
##   ..   longitude = col_double(),
##   ..   bathrooms = col_logical(),
##   ..   availability_30 = col_double(),
##   ..   beds = col_double(),
##   ..   bedrooms = col_double(),
##   ..   review_scores_value = col_double(),
##   ..   minimum_nights = col_double(),
##   ..   maximum_nights = col_double(),
##   ..   availability_365 = col_double(),
##   ..   property_type = col_character(),
##   ..   room_type = col_character(),
##   ..   host_since = col_date(format = ""),
##   ..   host_is_superhost = col_logical(),
##   ..   host_acceptance_rate = col_character(),
##   ..   calculated_host_listings_count = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# transform price to numeric!
listings$price <- str_remove_all(listings$price, "[$]")
listings$price <- str_remove_all(listings$price, "[,]")
listings$price <- as.numeric(listings$price)

a) Calculate Mean and Standard Deviation for each neighbourhood

# create df with avg price and sd per neighbourhood
price_by_neighbourhood <- listings %>% 
                          group_by(neighbourhood_cleansed) %>% 
                          summarise(mean_price = round(mean(price), 2), 
                                    sd_price = round(sd(price), 2))

b) Visually compare the price distribution for the, on average, most expensive neighbourhood with the least expensive one

# most expensive on avg is leather district
# least expensive on avg is Hyde Park
# removed the 1 outlier Leather District = 3999 as it is way higher than the rest!
# the rest of the prices for the graph are between 0 and 300
ggplot(data = listings, aes(x = neighbourhood_cleansed, y = price)) +
  geom_boxplot() +
  geom_jitter(width = 0.2) +
  ylim(0, 300) +
  xlim("Hyde Park", "Leather District") +
  ggtitle("Distribution of Prices")
## Warning: Removed 3083 rows containing missing values (stat_boxplot).
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 3084 rows containing missing values (geom_point).

4. Analysis of Listing Reviews

# load reviews and get an overview of the dataset
reviews <- read_csv("/cloud/project/01_data/boston/reviews.csv")
## Rows: 122879 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (1): listing_id
## date (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(reviews)
## spec_tbl_df [122,879 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ listing_id: num [1:122879] 3781 3781 3781 3781 3781 ...
##  $ date      : Date[1:122879], format: "2015-07-10" "2015-08-09" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   listing_id = col_double(),
##   ..   date = col_date(format = "")
##   .. )
##  - attr(*, "problems")=<externalptr>
summary(reviews)
##    listing_id            date           
##  Min.   :    3781   Min.   :2009-03-21  
##  1st Qu.: 6401859   1st Qu.:2017-09-26  
##  Median :15256830   Median :2019-01-10  
##  Mean   :17278311   Mean   :2018-11-07  
##  3rd Qu.:24484501   3rd Qu.:2020-01-04  
##  Max.   :52113003   Max.   :2021-09-19

a) Compute the average number of reviews per apartment

# get number of reviews per id
# change column name of listing_id so that it fits to reviews$id
reviews_by_id <- reviews %>% 
                 group_by(listing_id) %>% 
                 summarise(n_reviews = n()) %>% 
                 rename(id = listing_id)

# avg number of reviews per appartment
mean(reviews_by_id$n_reviews)
## [1] 53.98902

b) Merge with the listings data set to extract the location of listings

# merge reviews_by_id and listings
listings_reviews <- merge(reviews_by_id, listings, by = "id" )

# reduce listing_reviews for a better overview
listings_reviews <- listings_reviews %>% 
                    select("id", "n_reviews", "neighbourhood_cleansed", 
                           "latitude", "longitude")

c) Plot the location of 200 most reviewed apartments (e.g. with barplot)

review_desc <- listings_reviews %>% 
               arrange(desc(n_reviews)) %>% 
               slice_head(n = 200)

ggplot(data = review_desc, aes(x = n_reviews, y = neighbourhood_cleansed)) +
       geom_bar(stat = "identity") +
       ggtitle("Locations of Most Reviewed Apartments") +
       xlab("Number of Reviews") + ylab("") +
       labs(subtitle = "neighbourhoods of 200 most reviewed apartments")

5. Maps

a) Map the 200 most frequently reviewed apartments

width <- max(review_desc$latitude) - min(review_desc$latitude)
height <- max(review_desc$longitude) - min(review_desc$longitude)

boston_border <- c(bottom = min(review_desc$latitude) - 0.1 * height,
                   top = max(review_desc$latitude) + 0.1 * height,
                   left = min(review_desc$longitude) - 0.1 * width,
                   right = max(review_desc$longitude) + 0.1 * width)

boston_map <- get_stamenmap(boston_border)
## Source : http://tile.stamen.com/terrain/10/309/378.png
## Source : http://tile.stamen.com/terrain/10/310/378.png
## Source : http://tile.stamen.com/terrain/10/309/379.png
## Source : http://tile.stamen.com/terrain/10/310/379.png
ggmap(boston_map) +
geom_point(data = review_desc, aes(x = longitude, y = latitude), color="steelblue")+
  ggtitle("Most Reviewed Airbnbs in Boston Top 200")

b) Create a new map extending its functionality by adding a pop up window

popup_1 <- paste(sep = "<br/>",
                 paste0("<b>ID: </b>", review_desc$id),
                 paste0("<b>Type: </b>" ,listings$property_type),
                 paste0("<b>Price: </b>" ,listings$price, " $ / night"),
                 paste0("<b>Number of Reviews: </b>", review_desc$n_reviews))

m <- leaflet(review_desc) %>%
  addTiles() %>% 
  addMarkers(lng = ~ longitude, 
             lat = ~ latitude,
             popup = popup_1)
m

c) Create a heatmap

ggmap(boston_map) +
  geom_density_2d(data = review_desc, aes(x = longitude, y = latitude)) +
  stat_density2d(data = review_desc, aes(x = longitude, y = latitude)) +
  ggtitle("Heatmap of Airbnb Housings in Boston (Top200)")

# Part 2: Price Prediction Using Statistical Methods (motivated Beginner + Advanced Tracks) ## Split Dataset 70/30

set.seed(35)
listings_df <- as.data.frame(listings)
splitted_data <- sample.split(listings_df, SplitRatio = 0.7)
train_set = listings_df[splitted_data == TRUE, ]
test_set = listings_df[splitted_data == FALSE, ]

1. Visualize feature correlations in a correlation matrix/heatmap

# get all numeric columns and create correlation matrix
df_num <- unlist(lapply(listings_df, is.numeric))
dataset_num = listings_df[df_num == TRUE]

cor(dataset_num)
##                                         id        price     latitude
## id                              1.00000000  0.057130136  0.122799989
## price                           0.05713014  1.000000000  0.105217060
## latitude                        0.12279999  0.105217060  1.000000000
## longitude                       0.05592774  0.110615119  0.269210262
## availability_30                 0.16828774  0.092200742 -0.003924321
## beds                                    NA           NA           NA
## bedrooms                                NA           NA           NA
## review_scores_value                     NA           NA           NA
## minimum_nights                 -0.13720303 -0.050997202 -0.003551736
## maximum_nights                 -0.14072180  0.041058353  0.034549541
## availability_365                0.13300036  0.043215733 -0.062785567
## calculated_host_listings_count  0.33076194 -0.004734504  0.140622942
##                                  longitude availability_30 beds bedrooms
## id                              0.05592774     0.168287745   NA       NA
## price                           0.11061512     0.092200742   NA       NA
## latitude                        0.26921026    -0.003924321   NA       NA
## longitude                       1.00000000     0.069121342   NA       NA
## availability_30                 0.06912134     1.000000000   NA       NA
## beds                                    NA              NA    1       NA
## bedrooms                                NA              NA   NA        1
## review_scores_value                     NA              NA   NA       NA
## minimum_nights                 -0.08872897     0.135178319   NA       NA
## maximum_nights                 -0.02314254    -0.048840898   NA       NA
## availability_365                0.04556522     0.509657608   NA       NA
## calculated_host_listings_count  0.03397609     0.047550977   NA       NA
##                                review_scores_value minimum_nights
## id                                              NA   -0.137203030
## price                                           NA   -0.050997202
## latitude                                        NA   -0.003551736
## longitude                                       NA   -0.088728970
## availability_30                                 NA    0.135178319
## beds                                            NA             NA
## bedrooms                                        NA             NA
## review_scores_value                              1             NA
## minimum_nights                                  NA    1.000000000
## maximum_nights                                  NA    0.155430641
## availability_365                                NA    0.048285377
## calculated_host_listings_count                  NA    0.066176347
##                                maximum_nights availability_365
## id                                -0.14072180       0.13300036
## price                              0.04105835       0.04321573
## latitude                           0.03454954      -0.06278557
## longitude                         -0.02314254       0.04556522
## availability_30                   -0.04884090       0.50965761
## beds                                       NA               NA
## bedrooms                                   NA               NA
## review_scores_value                        NA               NA
## minimum_nights                     0.15543064       0.04828538
## maximum_nights                     1.00000000       0.03313297
## availability_365                   0.03313297       1.00000000
## calculated_host_listings_count     0.10458032       0.21432412
##                                calculated_host_listings_count
## id                                                0.330761940
## price                                            -0.004734504
## latitude                                          0.140622942
## longitude                                         0.033976092
## availability_30                                   0.047550977
## beds                                                       NA
## bedrooms                                                   NA
## review_scores_value                                        NA
## minimum_nights                                    0.066176347
## maximum_nights                                    0.104580317
## availability_365                                  0.214324123
## calculated_host_listings_count                    1.000000000

2. Regression

a) Simple regression model using one variable

linreg_1 <- lm(price ~ bedrooms, data = train_set)
summary(linreg_1)
## 
## Call:
## lm(formula = price ~ bedrooms, data = train_set)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -393.2  -87.6  -42.9   53.1 4857.1 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   42.797      9.835   4.351 1.43e-05 ***
## bedrooms     100.100      5.832  17.164  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 216.3 on 1872 degrees of freedom
##   (312 observations deleted due to missingness)
## Multiple R-squared:  0.136,  Adjusted R-squared:  0.1355 
## F-statistic: 294.6 on 1 and 1872 DF,  p-value: < 2.2e-16

b) Improve your model using more features

linreg_2 <- lm(price ~ bedrooms + review_scores_value, data = train_set)
summary(linreg_2)
## 
## Call:
## lm(formula = price ~ bedrooms + review_scores_value, data = train_set)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -416.8  -79.4  -36.9   40.5 4865.1 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          124.767     69.905   1.785   0.0745 .  
## bedrooms             108.600      6.838  15.881   <2e-16 ***
## review_scores_value  -21.180     14.939  -1.418   0.1565    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 227.4 on 1360 degrees of freedom
##   (823 observations deleted due to missingness)
## Multiple R-squared:  0.1569, Adjusted R-squared:  0.1556 
## F-statistic: 126.5 on 2 and 1360 DF,  p-value: < 2.2e-16

The Second Model shows a higher R-Squared, however with a higher number of variables the R value automatically increases. The R-Value for both models is quite low!

predicted_price <- predict(linreg_2, test_set)

rmse <- function(actual, predicted) {
sqrt(mean((predicted - actual)^2, na.rm = TRUE))
}

rmse(actual = test_set$price, predicted = predicted_price)
## [1] 449.7764

3. Improve your model

a) Train new models using more advanced methods

linreg_3 <- lm(price ~ bedrooms + review_scores_value + minimum_nights + property_type, data = train_set)
summary(linreg_3)
## 
## Call:
## lm(formula = price ~ bedrooms + review_scores_value + minimum_nights + 
##     property_type, data = train_set)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -444.6  -62.3  -16.0   30.8 4727.0 
## 
## Coefficients:
##                                                   Estimate Std. Error t value
## (Intercept)                                       229.3042   142.9161   1.604
## bedrooms                                           75.7715     8.1288   9.321
## review_scores_value                                -9.4835    14.5200  -0.653
## minimum_nights                                      0.0678     0.1262   0.537
## property_typeCastle                                88.3570   250.8111   0.352
## property_typeEntire bed and breakfast             -65.3416   250.8427  -0.260
## property_typeEntire condominium (condo)          -111.5095   126.8861  -0.879
## property_typeEntire guest suite                  -168.0115   134.4691  -1.249
## property_typeEntire guesthouse                   -156.1280   198.2802  -0.787
## property_typeEntire loft                          -68.8616   140.2220  -0.491
## property_typeEntire place                         -87.3662   250.8226  -0.348
## property_typeEntire rental unit                   -70.6622   125.8716  -0.561
## property_typeEntire residential home              -75.0696   128.9773  -0.582
## property_typeEntire serviced apartment              5.2642   131.1196   0.040
## property_typeEntire townhouse                     233.4639   134.4102   1.737
## property_typeHouseboat                            -12.2692   251.1705  -0.049
## property_typePrivate room in bed and breakfast    -99.1161   135.4817  -0.732
## property_typePrivate room in bungalow            -243.1266   198.3190  -1.226
## property_typePrivate room in condominium (condo) -193.6089   130.0548  -1.489
## property_typePrivate room in guest suite         -144.9211   144.8165  -1.001
## property_typePrivate room in guesthouse          -213.6075   250.8130  -0.852
## property_typePrivate room in loft                -228.6177   177.3787  -1.289
## property_typePrivate room in rental unit         -196.2057   126.3434  -1.553
## property_typePrivate room in residential home    -186.2897   126.3662  -1.474
## property_typePrivate room in townhouse           -135.6940   132.2235  -1.026
## property_typeRoom in bed and breakfast           -183.7693   250.8042  -0.733
## property_typeRoom in boutique hotel               -42.2039   133.3421  -0.317
## property_typeRoom in hotel                        180.5869   138.2224   1.306
## property_typeShared room in townhouse            -236.8278   251.0786  -0.943
##                                                  Pr(>|t|)    
## (Intercept)                                        0.1088    
## bedrooms                                           <2e-16 ***
## review_scores_value                                0.5138    
## minimum_nights                                     0.5912    
## property_typeCastle                                0.7247    
## property_typeEntire bed and breakfast              0.7945    
## property_typeEntire condominium (condo)            0.3797    
## property_typeEntire guest suite                    0.2117    
## property_typeEntire guesthouse                     0.4312    
## property_typeEntire loft                           0.6234    
## property_typeEntire place                          0.7277    
## property_typeEntire rental unit                    0.5746    
## property_typeEntire residential home               0.5606    
## property_typeEntire serviced apartment             0.9680    
## property_typeEntire townhouse                      0.0826 .  
## property_typeHouseboat                             0.9610    
## property_typePrivate room in bed and breakfast     0.4646    
## property_typePrivate room in bungalow              0.2204    
## property_typePrivate room in condominium (condo)   0.1368    
## property_typePrivate room in guest suite           0.3171    
## property_typePrivate room in guesthouse            0.3946    
## property_typePrivate room in loft                  0.1977    
## property_typePrivate room in rental unit           0.1207    
## property_typePrivate room in residential home      0.1407    
## property_typePrivate room in townhouse             0.3050    
## property_typeRoom in bed and breakfast             0.4639    
## property_typeRoom in boutique hotel                0.7517    
## property_typeRoom in hotel                         0.1916    
## property_typeShared room in townhouse              0.3457    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 217.2 on 1334 degrees of freedom
##   (823 observations deleted due to missingness)
## Multiple R-squared:  0.2455, Adjusted R-squared:  0.2297 
## F-statistic:  15.5 on 28 and 1334 DF,  p-value: < 2.2e-16
# Even if the R squared increases the p Value isnt significant for any
# variable except bedrooms!