*.
dataset<- read.csv(file = "~/Desktop/berlin-airbnb-data/listings_summary.csv", head= TRUE)
print(head(dataset,1))
## id listing_url scrape_id last_scraped
## 1 2015 https://www.airbnb.com/rooms/2015 2.018111e+13 2018-11-07
## name
## 1 Berlin-Mitte Value! Quiet courtyard/very central
## summary
## 1 Great location! 30 of 75 sq meters. This wood floored/high ceiling typical Berlin "Altbau" section of an apartment consists of 1 simple large room, a small kitchen and a bathroom + shower. The apartment is in Mitte, close to Prenzlauer Berg/Mauerpark. Perfect for short visits, singles or couples. Your section is closed from the rest of the bigger flat wich is not noticeable. You will not be sharing your space.
## space
## 1 A+++ location! This „Einliegerwohnung“ is an extention of a larger apartment with a separate entrance, bathroom and kitchen. The door to the rest of the apartment is soundproof, hidden, locked and barely noticable (behind mirror in pictures). Your 30 sq meters are facing a quiet courtyard. This wood floored/high ceiling typical Berlin "Altbau" apartment consists of 1 large room with a large double bed, optionally with an extra matress for a 3rd guest), a small kitchen suitable for preparing simple meals, a bathroom + shower + bathtub and of course your own entrance (all linens and towels are provided.) it is located facing a quiet courtyard, it is clean cozy and an authentic "Berlin" place to live. The place is in Mitte, close to Prenzlauer Berg one block from Kastanienallee, Zionskirchplatz, Rosenthaler Platz and Mauerpark. Generally this area is lively with galleries, museums, resturants, cafes, flea markets, bars, clubs, shops, good take away food, a park, U-Bahn, trams, supermarket
## description
## 1 Great location! 30 of 75 sq meters. This wood floored/high ceiling typical Berlin "Altbau" section of an apartment consists of 1 simple large room, a small kitchen and a bathroom + shower. The apartment is in Mitte, close to Prenzlauer Berg/Mauerpark. Perfect for short visits, singles or couples. Your section is closed from the rest of the bigger flat wich is not noticeable. You will not be sharing your space. A+++ location! This „Einliegerwohnung“ is an extention of a larger apartment with a separate entrance, bathroom and kitchen. The door to the rest of the apartment is soundproof, hidden, locked and barely noticable (behind mirror in pictures). Your 30 sq meters are facing a quiet courtyard. This wood floored/high ceiling typical Berlin "Altbau" apartment consists of 1 large room with a large double bed, optionally with an extra matress for a 3rd guest), a small kitchen suitable for preparing simple meals, a bathroom + shower + bathtub and of course your own entrance (all linens a
## experiences_offered
## 1 none
## neighborhood_overview
## 1 It is located in the former East Berlin area of Kastanienallee and Zionskirchplatz (Berlins only natural elevation), very charming and original turn of the last century architecture; this area was barely been destroyed during the war. Close to galleries, resturants, museums, flea markets, cafes, bars, clubs, shops, good take away food, a park, great public transportation: U-Bahn, S-Bahn, Trams, Busses, supermarkets etc.
## notes
## 1 This is my home, not a hotel. I rent out occasionally when I stay with my friend.
## transit
## 1 Close to U-Bahn U8 and U2 (metro), Trams M12, M10 and M8, Busses. Rosenthaler Platz and Hackescher Markt are close by. S-Bahn. Alexander Platz is 6 minutes away on the U8 line.
## access
## 1 Simple kitchen/cooking, refrigerator, microwave oven, stove, dishwasher, espresso/ coffee mashine, cable TV deskspace, shower, bath tub, little courtyard garden, 1st floor. There is a large double bed and optional additional airbeds for a 3rd or 4th person/kids. (Please note that the apartment is 30 square meters, if you stay a few nights and are more than 2 adults, it will be tight)
## interaction
## 1 Always available
## house_rules
## 1 No parties No events No pets No smoking, not even in the courtyard/garden Do not pull the door shut with the keys in the lock from the inside; should that happen, the guest pays for locksmith
## thumbnail_url medium_url
## 1 NA NA
## picture_url
## 1 https://a0.muscache.com/im/pictures/260fd609-710a-4304-82ab-abd7498efd62.jpg?aki_policy=large
## xl_picture_url host_id host_url host_name
## 1 NA 2217 https://www.airbnb.com/users/show/2217 Ian
## host_since host_location
## 1 2008-08-18 Key Biscayne, Florida, United States
## host_about host_response_time host_response_rate
## 1 Believe in sharing economy. within an hour 96%
## host_acceptance_rate host_is_superhost
## 1 N/A t
## host_thumbnail_url
## 1 https://a0.muscache.com/im/pictures/21428a22-4f38-444d-8c72-5892be9da109.jpg?aki_policy=profile_small
## host_picture_url
## 1 https://a0.muscache.com/im/pictures/21428a22-4f38-444d-8c72-5892be9da109.jpg?aki_policy=profile_x_medium
## host_neighbourhood host_listings_count host_total_listings_count
## 1 Mitte 4 4
## host_verifications
## 1 ['email', 'phone', 'reviews', 'jumio', 'offline_government_id', 'government_id']
## host_has_profile_pic host_identity_verified street
## 1 t t Berlin, Berlin, Germany
## neighbourhood neighbourhood_cleansed neighbourhood_group_cleansed city
## 1 Mitte Brunnenstr. Süd Mitte Berlin
## state zipcode market smart_location country_code country latitude
## 1 Berlin 10119 Berlin Berlin, Germany DE Germany 52.53454
## longitude is_location_exact property_type room_type accommodates
## 1 13.40256 f Guesthouse Entire home/apt 3
## bathrooms bedrooms beds bed_type
## 1 1 1 2 Real Bed
## amenities
## 1 {TV,"Cable TV",Wifi,Kitchen,Gym,Heating,"Family/kid friendly","Smoke detector",Essentials,Shampoo,"Lock on bedroom door",Hangers,"Hair dryer",Iron,"Laptop friendly workspace","Private living room",Bathtub,"Hot water","Bed linens","Extra pillows and blankets",Microwave,"Coffee maker",Refrigerator,Dishwasher,"Dishes and silverware","Cooking basics",Stove,"Luggage dropoff allowed","Long term stays allowed"}
## square_feet price weekly_price monthly_price security_deposit
## 1 NA $60.00 $200.00
## cleaning_fee guests_included extra_people minimum_nights maximum_nights
## 1 $30.00 1 $28.00 4 1125
## calendar_updated has_availability availability_30 availability_60
## 1 3 months ago t 0 21
## availability_90 availability_365 calendar_last_scraped number_of_reviews
## 1 51 141 2018-11-07 118
## first_review last_review review_scores_rating review_scores_accuracy
## 1 2016-04-11 2018-10-28 93 10
## review_scores_cleanliness review_scores_checkin
## 1 9 10
## review_scores_communication review_scores_location review_scores_value
## 1 10 10 9
## requires_license license jurisdiction_names instant_bookable
## 1 t NA f
## is_business_travel_ready cancellation_policy
## 1 f strict_14_with_grace_period
## require_guest_profile_picture require_guest_phone_verification
## 1 f f
## calculated_host_listings_count reviews_per_month
## 1 4 3.76
#install.packages('geosphere')
#Find the distance using latitude and longitude.
library(geosphere)
dataset$distance<-distHaversine(dataset[,49:50], c(52.52437, 13.41053))
#Filling the missing values
dataset$bathrooms[is.na(dataset$bathrooms)] <-1
dataset$bedrooms[is.na(dataset$bedrooms)]<-1
dataset$beds[is.na(dataset$beds)]<-1
dataset$review_scores_rating[is.na(dataset$review_scores_rating)]<-0
dataset$review_scores_accuracy[is.na(dataset$review_scores_accuracy)]<-0
dataset$review_scores_checkin[is.na(dataset$review_scores_checkin)]<-0
dataset$review_scores_cleanliness[is.na(dataset$review_scores_cleanliness)]<-0
dataset$review_scores_location[is.na(dataset$review_scores_location)]<-0
dataset$review_scores_value[is.na(dataset$review_scores_value)]<-0
dataset$review_scores_communication[is.na(dataset$review_scores_communication)]<-0
#Categorical features.
dataset$room_type= factor(dataset$room_type,
levels = c('Entire home/apt', 'Private room', 'Shared room'),
labels = c(1, 2, 3))
dataset$bed_type= factor(dataset$bed_type,
levels= c('Airbed',"Couch","Futon","Pull-out Sofa","Real Bed"),
labels= c(1,2,3,4,5))
#amenities count
dataset$amenities<-sapply(strsplit(as.character(dataset$amenities),","),FUN=function(x){length(x[x!="Null"])})
#Removing $ sign from price.
dataset$price = as.numeric(gsub("\\$", "", dataset$price))
## Warning: NAs introduced by coercion
summary(is.na(dataset$price))
## Mode FALSE TRUE
## logical 22511 41
#Features that I will be considering
X=dataset[,c('property_type', 'room_type','accommodates','bathrooms','bedrooms','beds','bed_type','amenities','guests_included','minimum_nights','availability_30','number_of_reviews','review_scores_rating','review_scores_accuracy','review_scores_cleanliness','review_scores_checkin','review_scores_communication','review_scores_location','review_scores_value','distance',"price")]
#Removing the rows having price= 0 or null.
X <- X[!(is.na(X$price)),]
summary(is.na(X))
## property_type room_type accommodates bathrooms
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:22511 FALSE:22511 FALSE:22511 FALSE:22511
## bedrooms beds bed_type amenities
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:22511 FALSE:22511 FALSE:22511 FALSE:22511
## guests_included minimum_nights availability_30 number_of_reviews
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:22511 FALSE:22511 FALSE:22511 FALSE:22511
## review_scores_rating review_scores_accuracy review_scores_cleanliness
## Mode :logical Mode :logical Mode :logical
## FALSE:22511 FALSE:22511 FALSE:22511
## review_scores_checkin review_scores_communication review_scores_location
## Mode :logical Mode :logical Mode :logical
## FALSE:22511 FALSE:22511 FALSE:22511
## review_scores_value distance price
## Mode :logical Mode :logical Mode :logical
## FALSE:22511 FALSE:22511 FALSE:22511
#Splitting the dataset into the Training set and Test set
library(caTools)
split = sample.split(X$price, SplitRatio = 2/3)
training_set = subset(X, split == TRUE)
test_set = subset(X, split == FALSE)
#summary(regressor)
regressor= lm(price~ ., data=training_set)
y_pred= predict(regressor, newdata = test_set)
## Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels): factor property_type has new levels Chalet, Island, Tipi
summary(regressor)
##
## Call:
## lm(formula = price ~ ., data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -252.25 -15.61 -3.09 9.85 830.27
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.316e+00 2.481e+01 -0.174 0.861892
## property_typeApartment 1.496e+01 2.259e+01 0.662 0.507757
## property_typeBarn 5.646e+01 4.515e+01 1.250 0.211185
## property_typeBed and breakfast 2.252e+01 2.337e+01 0.963 0.335352
## property_typeBoat 1.275e+02 2.526e+01 5.047 4.55e-07
## property_typeBoutique hotel 1.250e+02 2.358e+01 5.301 1.17e-07
## property_typeBungalow 1.075e+02 2.577e+01 4.174 3.01e-05
## property_typeCabin 3.209e+01 2.858e+01 1.123 0.261526
## property_typeCamper/RV 1.299e+01 2.768e+01 0.469 0.638890
## property_typeCasa particular (Cuba) 6.143e+00 3.569e+01 0.172 0.863337
## property_typeCastle 1.500e+01 4.517e+01 0.332 0.739834
## property_typeCave 5.494e+00 4.514e+01 0.122 0.903122
## property_typeCondominium 2.002e+01 2.267e+01 0.883 0.377102
## property_typeCottage 8.737e+00 2.991e+01 0.292 0.770206
## property_typeGuest suite 1.076e+01 2.324e+01 0.463 0.643349
## property_typeGuesthouse 1.707e+01 2.330e+01 0.732 0.463997
## property_typeHostel -1.296e+00 2.305e+01 -0.056 0.955156
## property_typeHotel 8.293e+01 2.414e+01 3.436 0.000593
## property_typeHouse 2.095e+01 2.272e+01 0.922 0.356417
## property_typeHouseboat 1.421e+02 2.698e+01 5.265 1.42e-07
## property_typeIn-law 3.721e+01 4.511e+01 0.825 0.409509
## property_typeLoft 4.267e+01 2.269e+01 1.880 0.060077
## property_typeOther 3.719e+01 2.352e+01 1.581 0.113891
## property_typePension (South Korea) 3.824e+01 2.765e+01 1.383 0.166617
## property_typeResort 7.307e+00 3.193e+01 0.229 0.819002
## property_typeServiced apartment 6.388e+01 2.286e+01 2.794 0.005213
## property_typeTiny house 3.305e+01 2.650e+01 1.247 0.212312
## property_typeTownhouse 1.517e+01 2.305e+01 0.658 0.510490
## property_typeTrain 3.215e+01 4.526e+01 0.711 0.477398
## property_typeVilla 1.969e+01 2.702e+01 0.729 0.466141
## room_type2 -2.229e+01 7.356e-01 -30.295 < 2e-16
## room_type3 -3.276e+01 3.017e+00 -10.860 < 2e-16
## accommodates 8.811e+00 3.986e-01 22.106 < 2e-16
## bathrooms 1.767e+01 1.047e+00 16.868 < 2e-16
## bedrooms 1.510e+01 6.762e-01 22.332 < 2e-16
## beds -2.411e+00 4.845e-01 -4.977 6.52e-07
## bed_type2 1.010e+01 1.161e+01 0.870 0.384115
## bed_type3 -3.306e+00 1.059e+01 -0.312 0.755037
## bed_type4 2.054e+00 1.037e+01 0.198 0.842930
## bed_type5 1.501e+00 1.012e+01 0.148 0.882167
## amenities 3.390e-01 4.082e-02 8.304 < 2e-16
## guests_included 4.569e+00 4.532e-01 10.082 < 2e-16
## minimum_nights -4.069e-03 6.659e-03 -0.611 0.541187
## availability_30 9.668e-01 3.961e-02 24.406 < 2e-16
## number_of_reviews -6.267e-02 9.561e-03 -6.555 5.76e-11
## review_scores_rating 1.112e-01 6.660e-02 1.669 0.095121
## review_scores_accuracy -6.812e-01 6.169e-01 -1.104 0.269518
## review_scores_cleanliness 1.895e+00 4.603e-01 4.116 3.87e-05
## review_scores_checkin -2.406e-01 6.068e-01 -0.396 0.691790
## review_scores_communication -1.158e+00 6.287e-01 -1.842 0.065540
## review_scores_location 1.855e+00 4.959e-01 3.740 0.000185
## review_scores_value -3.519e+00 5.725e-01 -6.147 8.11e-10
## distance -1.335e-03 7.987e-05 -16.712 < 2e-16
##
## (Intercept)
## property_typeApartment
## property_typeBarn
## property_typeBed and breakfast
## property_typeBoat ***
## property_typeBoutique hotel ***
## property_typeBungalow ***
## property_typeCabin
## property_typeCamper/RV
## property_typeCasa particular (Cuba)
## property_typeCastle
## property_typeCave
## property_typeCondominium
## property_typeCottage
## property_typeGuest suite
## property_typeGuesthouse
## property_typeHostel
## property_typeHotel ***
## property_typeHouse
## property_typeHouseboat ***
## property_typeIn-law
## property_typeLoft .
## property_typeOther
## property_typePension (South Korea)
## property_typeResort
## property_typeServiced apartment **
## property_typeTiny house
## property_typeTownhouse
## property_typeTrain
## property_typeVilla
## room_type2 ***
## room_type3 ***
## accommodates ***
## bathrooms ***
## bedrooms ***
## beds ***
## bed_type2
## bed_type3
## bed_type4
## bed_type5
## amenities ***
## guests_included ***
## minimum_nights
## availability_30 ***
## number_of_reviews ***
## review_scores_rating .
## review_scores_accuracy
## review_scores_cleanliness ***
## review_scores_checkin
## review_scores_communication .
## review_scores_location ***
## review_scores_value ***
## distance ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.06 on 14962 degrees of freedom
## Multiple R-squared: 0.4433, Adjusted R-squared: 0.4414
## F-statistic: 229.2 on 52 and 14962 DF, p-value: < 2.2e-16
#Backward Elimination to eliminate the features having p-value >0.5 and making model more efficient.
backwardElimination <- function(x, sl) {
numVars = length(x)
for (i in c(1:numVars)){
regressor = lm(formula = price ~ ., data = x)
maxVar = max(coef(summary(regressor))[c(2:numVars), "Pr(>|t|)"])
if (maxVar > sl){
j = which(coef(summary(regressor))[c(2:numVars), "Pr(>|t|)"] == maxVar)
x = x[, -j]
}
numVars = numVars - 1
}
return(summary(regressor))
}
SL = 0.05
dataset = X
print(backwardElimination(training_set, SL))
##
## Call:
## lm(formula = price ~ ., data = x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -285.36 -16.76 -5.27 9.50 920.98
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.3683 10.9676 0.307 0.758759
## room_type2 -23.3081 0.7671 -30.386 < 2e-16 ***
## room_type3 -33.7114 3.0549 -11.035 < 2e-16 ***
## accommodates 10.9977 0.4116 26.717 < 2e-16 ***
## bathrooms 20.3031 1.1107 18.280 < 2e-16 ***
## bedrooms 14.2200 0.7152 19.883 < 2e-16 ***
## beds -1.8527 0.5096 -3.635 0.000279 ***
## bed_type2 10.4998 12.4973 0.840 0.400829
## bed_type3 -1.3187 11.4014 -0.116 0.907924
## bed_type4 2.7028 11.1584 0.242 0.808609
## bed_type5 3.5445 10.8981 0.325 0.745003
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42.08 on 15004 degrees of freedom
## Multiple R-squared: 0.3521, Adjusted R-squared: 0.3517
## F-statistic: 815.4 on 10 and 15004 DF, p-value: < 2.2e-16
}