# Libraries to ingest CSV file
library(readr)
# Libraries for machine learning
library(tidyr)
library(class)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(ipred)
library(e1071)
library(klaR)
## Loading required package: MASS
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
##
## extract
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Libraries for data cleaning and preprocessing
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded
# Read the original Airbnb listings CSV into a data frame called "groupData".
# Rows whose target variable (host_is_superhost) is missing are dropped right after loading.
groupData <- read_csv(
  "/Users/iattram1/Desktop/MA ECON/Data Mining/Group 3/Airbnb_Data/Airbnb_Data/Airbnb_Data.csv",
  show_col_types = FALSE
)
## Warning: One or more parsing issues, see `problems()` for details
groupData <- groupData[!is.na(groupData$host_is_superhost), ]
View(groupData)
# Get summary statistics for every column of groupData.
# In the output below, first_review, last_review, review_scores_rating, review_scores_accuracy,
# review_scores_cleanliness, review_scores_checkin, review_scores_communication,
# review_scores_location, review_scores_value and reviews_per_month each show roughly
# 9,500 to 10,100 NA's, which is about 25% of the 38,243 rows.
# Other columns are affected too: bedrooms, beds and several *_nights columns have fewer NA's,
# while bathrooms, calendar_updated and license are entirely NA.
# The output also confirms there are no missing values in the target variable, host_is_superhost.
summary(groupData)
## id listing_url scrape_id last_scraped
## Min. : 2595 Length:38243 Min. :2.021e+13 Min. :2021-12-04
## 1st Qu.:13415526 Class :character 1st Qu.:2.021e+13 1st Qu.:2021-12-05
## Median :30814778 Mode :character Median :2.021e+13 Median :2021-12-05
## Mean :29626704 Mean :2.021e+13 Mean :2021-12-04
## 3rd Qu.:46431450 3rd Qu.:2.021e+13 3rd Qu.:2021-12-05
## Max. :53665099 Max. :2.021e+13 Max. :2021-12-05
##
## name description neighborhood_overview picture_url
## Length:38243 Length:38243 Length:38243 Length:38243
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## host_id host_url host_name host_since
## Min. : 2438 Length:38243 Length:38243 Min. :2008-08-22
## 1st Qu.: 11395166 Class :character Class :character 1st Qu.:2014-01-21
## Median : 50045329 Mode :character Mode :character Median :2015-12-01
## Mean :114844816 Mean :2016-03-05
## 3rd Qu.:200239515 3rd Qu.:2018-07-05
## Max. :434408046 Max. :2021-12-02
##
## host_location host_about host_response_time host_response_rate
## Length:38243 Length:38243 Length:38243 Length:38243
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## host_acceptance_rate host_is_superhost host_thumbnail_url host_picture_url
## Length:38243 Mode :logical Length:38243 Length:38243
## Class :character FALSE:30865 Class :character Class :character
## Mode :character TRUE :7378 Mode :character Mode :character
##
##
##
##
## host_neighbourhood host_listings_count host_total_listings_count
## Length:38243 Min. : 0.00 Min. : 0.00
## Class :character 1st Qu.: 1.00 1st Qu.: 1.00
## Mode :character Median : 1.00 Median : 1.00
## Mean : 49.02 Mean : 49.02
## 3rd Qu.: 3.00 3rd Qu.: 3.00
## Max. :3750.00 Max. :3750.00
##
## host_verifications host_has_profile_pic host_identity_verified
## Length:38243 Mode :logical Mode :logical
## Class :character FALSE:249 FALSE:6377
## Mode :character TRUE :37994 TRUE :31866
##
##
##
##
## neighbourhood neighbourhood_cleansed neighbourhood_group_cleansed
## Length:38243 Length:38243 Length:38243
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## latitude longitude property_type room_type
## Min. :40.50 Min. :-74.25 Length:38243 Length:38243
## 1st Qu.:40.69 1st Qu.:-73.98 Class :character Class :character
## Median :40.73 Median :-73.95 Mode :character Mode :character
## Mean :40.73 Mean :-73.95
## 3rd Qu.:40.76 3rd Qu.:-73.93
## Max. :40.91 Max. :-73.71
##
## accommodates bathrooms bathrooms_text bedrooms
## Min. : 0.000 Mode:logical Length:38243 Min. : 1.000
## 1st Qu.: 2.000 NA's:38243 Class :character 1st Qu.: 1.000
## Median : 2.000 Mode :character Median : 1.000
## Mean : 2.792 Mean : 1.324
## 3rd Qu.: 4.000 3rd Qu.: 1.000
## Max. :16.000 Max. :16.000
## NA's :3974
## beds amenities price minimum_nights
## Min. : 1.000 Length:38243 Length:38243 Min. : 1.00
## 1st Qu.: 1.000 Class :character Class :character 1st Qu.: 3.00
## Median : 1.000 Mode :character Mode :character Median : 30.00
## Mean : 1.593 Mean : 21.63
## 3rd Qu.: 2.000 3rd Qu.: 30.00
## Max. :24.000 Max. :1250.00
## NA's :2405
## maximum_nights minimum_minimum_nights maximum_minimum_nights
## Min. :1.000e+00 Min. : 1.00 Min. : 1.00
## 1st Qu.:6.000e+01 1st Qu.: 3.00 1st Qu.: 3.00
## Median :1.125e+03 Median : 30.00 Median : 30.00
## Mean :5.785e+04 Mean : 21.84 Mean : 27.63
## 3rd Qu.:1.125e+03 3rd Qu.: 30.00 3rd Qu.: 30.00
## Max. :2.147e+09 Max. :1250.00 Max. :1250.00
## NA's :18 NA's :18
## minimum_maximum_nights maximum_maximum_nights minimum_nights_avg_ntm
## Min. :1.000e+00 Min. :1.000e+00 Min. : 1.00
## 1st Qu.:3.600e+02 1st Qu.:3.650e+02 1st Qu.: 3.00
## Median :1.125e+03 Median :1.125e+03 Median : 30.00
## Mean :1.463e+06 Mean :3.541e+06 Mean : 27.22
## 3rd Qu.:1.125e+03 3rd Qu.:1.125e+03 3rd Qu.: 30.00
## Max. :2.147e+09 Max. :2.147e+09 Max. :1250.00
## NA's :18 NA's :18 NA's :18
## maximum_nights_avg_ntm calendar_updated has_availability availability_30
## Min. :1.000e+00 Mode:logical Mode :logical Min. : 0.000
## 1st Qu.:3.650e+02 NA's:38243 FALSE:2116 1st Qu.: 0.000
## Median :1.125e+03 TRUE :36127 Median : 1.000
## Mean :2.249e+06 Mean : 6.911
## 3rd Qu.:1.125e+03 3rd Qu.:11.000
## Max. :2.147e+09 Max. :30.000
## NA's :18
## availability_60 availability_90 availability_365 calendar_last_scraped
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. :2021-12-04
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.:2021-12-05
## Median : 8.00 Median :25.00 Median : 73.0 Median :2021-12-05
## Mean :19.79 Mean :33.95 Mean :134.3 Mean :2021-12-04
## 3rd Qu.:38.00 3rd Qu.:67.00 3rd Qu.:302.0 3rd Qu.:2021-12-05
## Max. :60.00 Max. :90.00 Max. :365.0 Max. :2021-12-05
##
## number_of_reviews number_of_reviews_ltm number_of_reviews_l30d
## Min. : 0.0 Min. : 0.000 Min. : 0.0000
## 1st Qu.: 1.0 1st Qu.: 0.000 1st Qu.: 0.0000
## Median : 4.0 Median : 0.000 Median : 0.0000
## Mean : 23.3 Mean : 5.034 Mean : 0.5401
## 3rd Qu.: 20.0 3rd Qu.: 3.000 3rd Qu.: 0.0000
## Max. :1009.0 Max. :669.000 Max. :67.0000
##
## first_review last_review review_scores_rating
## Min. :2009-08-09 Min. :2010-12-21 Min. :0.000
## 1st Qu.:2017-03-24 1st Qu.:2019-05-04 1st Qu.:4.570
## Median :2019-04-06 Median :2021-04-10 Median :4.820
## Mean :2018-11-26 Mean :2020-04-11 Mean :4.586
## 3rd Qu.:2021-02-27 3rd Qu.:2021-11-05 3rd Qu.:5.000
## Max. :2021-12-04 Max. :2021-12-05 Max. :5.000
## NA's :9490 NA's :9490 NA's :9490
## review_scores_accuracy review_scores_cleanliness review_scores_checkin
## Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:4.700 1st Qu.:4.500 1st Qu.:4.800
## Median :4.900 Median :4.800 Median :4.960
## Mean :4.744 Mean :4.612 Mean :4.814
## 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.000
## NA's :10099 NA's :10088 NA's :10106
## review_scores_communication review_scores_location review_scores_value
## Min. :0.000 Min. :0.00 Min. :0.000
## 1st Qu.:4.810 1st Qu.:4.67 1st Qu.:4.550
## Median :4.970 Median :4.88 Median :4.780
## Mean :4.808 Mean :4.75 Mean :4.647
## 3rd Qu.:5.000 3rd Qu.:5.00 3rd Qu.:5.000
## Max. :5.000 Max. :5.00 Max. :5.000
## NA's :10095 NA's :10109 NA's :10110
## license instant_bookable calculated_host_listings_count
## Mode:logical Mode :logical Min. : 1.00
## NA's:38243 FALSE:27832 1st Qu.: 1.00
## TRUE :10411 Median : 1.00
## Mean : 17.76
## 3rd Qu.: 3.00
## Max. :421.00
##
## calculated_host_listings_count_entire_homes
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 1.000
## Mean : 8.043
## 3rd Qu.: 1.000
## Max. :308.000
##
## calculated_host_listings_count_private_rooms
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 9.602
## 3rd Qu.: 1.000
## Max. :359.000
##
## calculated_host_listings_count_shared_rooms reviews_per_month
## Min. :0.00000 Min. : 0.010
## 1st Qu.:0.00000 1st Qu.: 0.120
## Median :0.00000 Median : 0.480
## Mean :0.04801 Mean : 1.722
## 3rd Qu.:0.00000 3rd Qu.: 1.780
## Max. :8.00000 Max. :141.000
## NA's :9490
# Display the overall structure of groupData: the type and the first few values of each column.
str(groupData)
## tibble [38,243 × 74] (S3: tbl_df/tbl/data.frame)
## $ id : num [1:38243] 2595 3831 5121 5136 5178 ...
## $ listing_url : chr [1:38243] "https://www.airbnb.com/rooms/2595" "https://www.airbnb.com/rooms/3831" "https://www.airbnb.com/rooms/5121" "https://www.airbnb.com/rooms/5136" ...
## $ scrape_id : num [1:38243] 2.02e+13 2.02e+13 2.02e+13 2.02e+13 2.02e+13 ...
## $ last_scraped : Date[1:38243], format: "2021-12-05" "2021-12-05" ...
## $ name : chr [1:38243] "Skylit Midtown Castle" "Whole flr w/private bdrm, bath & kitchen(pls read)" "BlissArtsSpace!" "Spacious Brooklyn Duplex, Patio + Garden" ...
## $ description : chr [1:38243] "Beautiful, spacious skylit studio in the heart of Midtown, Manhattan. <br /><br />STUNNING SKYLIT STUDIO / 1 BE"| __truncated__ "Enjoy 500 s.f. top floor in 1899 brownstone, w/ wood & ceramic flooring throughout, roomy bdrm, & upgraded kitc"| __truncated__ "<b>The space</b><br />HELLO EVERYONE AND THANKS FOR VISITING BLISS ART SPACE! <br /><br />Thank you all for you"| __truncated__ "We welcome you to stay in our lovely 2 br duplex in South Slope, Brooklyn. Our home is a truly spacious respit"| __truncated__ ...
## $ neighborhood_overview : chr [1:38243] "Centrally located in the heart of Manhattan just a few blocks from all subway connections in the very desirable"| __truncated__ "Just the right mix of urban center and local neighborhood; close to all but enough quiet for a calming walk. 15"| __truncated__ NA NA ...
## $ picture_url : chr [1:38243] "https://a0.muscache.com/pictures/f0813a11-40b2-489e-8217-89a2e1637830.jpg" "https://a0.muscache.com/pictures/e49999c2-9fd5-4ad5-b7cc-224deac989aa.jpg" "https://a0.muscache.com/pictures/2090980c-b68e-4349-a874-4818402923e7.jpg" "https://a0.muscache.com/pictures/miso/Hosting-5136/original/adf1e231-7c60-4475-86c0-cee0cd16f538.jpeg" ...
## $ host_id : num [1:38243] 2845 4869 7356 7378 8967 ...
## $ host_url : chr [1:38243] "https://www.airbnb.com/users/show/2845" "https://www.airbnb.com/users/show/4869" "https://www.airbnb.com/users/show/7356" "https://www.airbnb.com/users/show/7378" ...
## $ host_name : chr [1:38243] "Jennifer" "LisaRoxanne" "Garon" "Rebecca" ...
## $ host_since : Date[1:38243], format: "2008-09-09" "2008-12-07" ...
## $ host_location : chr [1:38243] "New York, New York, United States" "New York, New York, United States" "New York, New York, United States" "Brooklyn, New York, United States" ...
## $ host_about : chr [1:38243] "A New Yorker since 2000! My passion is creating beautiful, unique spaces where unforgettable memories are made."| __truncated__ "Laid-back Native New Yorker (formerly bi-coastal) and AirBnb host of over 6 years and over 400 stays! Besides "| __truncated__ "I am an artist(painter, filmmaker) and curator who is working in the film industry while I'm building my busine"| __truncated__ "Rebecca is an artist/designer, and Henoch is in marketing/sales/event planning. We have two boys- 5 and 8 yo, "| __truncated__ ...
## $ host_response_time : chr [1:38243] "within a day" "a few days or more" "within an hour" "within a day" ...
## $ host_response_rate : chr [1:38243] "80%" "9%" "100%" "100%" ...
## $ host_acceptance_rate : chr [1:38243] "17%" "69%" "100%" "25%" ...
## $ host_is_superhost : logi [1:38243] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ host_thumbnail_url : chr [1:38243] "https://a0.muscache.com/im/pictures/user/50fc57af-a6a3-4e88-8f16-efd6cac7c9bc.jpg?aki_policy=profile_small" "https://a0.muscache.com/im/users/4869/profile_pic/1371927771/original.jpg?aki_policy=profile_small" "https://a0.muscache.com/im/pictures/user/72a61bea-cfb1-45b6-abbb-85bdbd790b32.jpg?aki_policy=profile_small" "https://a0.muscache.com/im/users/7378/profile_pic/1259098621/original.jpg?aki_policy=profile_small" ...
## $ host_picture_url : chr [1:38243] "https://a0.muscache.com/im/pictures/user/50fc57af-a6a3-4e88-8f16-efd6cac7c9bc.jpg?aki_policy=profile_x_medium" "https://a0.muscache.com/im/users/4869/profile_pic/1371927771/original.jpg?aki_policy=profile_x_medium" "https://a0.muscache.com/im/pictures/user/72a61bea-cfb1-45b6-abbb-85bdbd790b32.jpg?aki_policy=profile_x_medium" "https://a0.muscache.com/im/users/7378/profile_pic/1259098621/original.jpg?aki_policy=profile_x_medium" ...
## $ host_neighbourhood : chr [1:38243] "Midtown" "Clinton Hill" "Bedford-Stuyvesant" "Greenwood Heights" ...
## $ host_listings_count : num [1:38243] 8 1 1 1 1 1 3 1 0 3 ...
## $ host_total_listings_count : num [1:38243] 8 1 1 1 1 1 3 1 0 3 ...
## $ host_verifications : chr [1:38243] "['email', 'phone', 'reviews', 'offline_government_id', 'kba', 'selfie', 'government_id', 'identity_manual', 'work_email']" "['email', 'phone', 'reviews', 'offline_government_id', 'kba', 'government_id']" "['email', 'phone', 'facebook', 'reviews', 'offline_government_id', 'selfie', 'government_id', 'identity_manual']" "['email', 'phone', 'reviews']" ...
## $ host_has_profile_pic : logi [1:38243] TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ host_identity_verified : logi [1:38243] TRUE TRUE TRUE TRUE FALSE TRUE ...
## $ neighbourhood : chr [1:38243] "New York, United States" "Brooklyn, New York, United States" NA NA ...
## $ neighbourhood_cleansed : chr [1:38243] "Midtown" "Bedford-Stuyvesant" "Bedford-Stuyvesant" "Sunset Park" ...
## $ neighbourhood_group_cleansed : chr [1:38243] "Manhattan" "Brooklyn" "Brooklyn" "Brooklyn" ...
## $ latitude : num [1:38243] 40.8 40.7 40.7 40.7 40.8 ...
## $ longitude : num [1:38243] -74 -74 -74 -74 -74 ...
## $ property_type : chr [1:38243] "Entire rental unit" "Entire guest suite" "Private room in rental unit" "Entire rental unit" ...
## $ room_type : chr [1:38243] "Entire home/apt" "Entire home/apt" "Private room" "Entire home/apt" ...
## $ accommodates : num [1:38243] 1 3 2 4 2 1 2 3 1 1 ...
## $ bathrooms : logi [1:38243] NA NA NA NA NA NA ...
## $ bathrooms_text : chr [1:38243] "1 bath" "1 bath" NA "1.5 baths" ...
## $ bedrooms : num [1:38243] NA 1 1 2 1 1 1 NA 1 1 ...
## $ beds : num [1:38243] 1 3 1 2 1 1 NA 1 1 1 ...
## $ amenities : chr [1:38243] "[\"Extra pillows and blankets\", \"Baking sheet\", \"Luggage dropoff allowed\", \"TV\", \"Hangers\", \"Ethernet"| __truncated__ "[\"Extra pillows and blankets\", \"Luggage dropoff allowed\", \"Free parking on premises\", \"Pack \\u2019n pla"| __truncated__ "[\"Kitchen\", \"Long term stays allowed\", \"Wifi\", \"Heating\", \"Air conditioning\"]" "[\"Kitchen\", \"BBQ grill\", \"Cable TV\", \"Carbon monoxide alarm\", \"Outdoor dining area\", \"Washer\", \"Lo"| __truncated__ ...
## $ price : chr [1:38243] "$150.00" "$75.00" "$60.00" "$275.00" ...
## $ minimum_nights : num [1:38243] 30 1 30 5 2 2 4 30 30 30 ...
## $ maximum_nights : num [1:38243] 1125 730 730 1125 14 ...
## $ minimum_minimum_nights : num [1:38243] 30 1 30 5 2 2 4 30 30 30 ...
## $ maximum_minimum_nights : num [1:38243] 30 1 30 5 2 2 4 30 30 30 ...
## $ minimum_maximum_nights : num [1:38243] 1125 730 730 1125 14 ...
## $ maximum_maximum_nights : num [1:38243] 1125 730 730 1125 14 ...
## $ minimum_nights_avg_ntm : num [1:38243] 30 1 30 5 2 2 4 30 30 30 ...
## $ maximum_nights_avg_ntm : num [1:38243] 1125 730 730 1125 14 ...
## $ calendar_updated : logi [1:38243] NA NA NA NA NA NA ...
## $ has_availability : logi [1:38243] TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ availability_30 : num [1:38243] 3 3 30 3 1 0 2 2 30 1 ...
## $ availability_60 : num [1:38243] 33 6 60 3 16 0 17 30 60 4 ...
## $ availability_90 : num [1:38243] 63 18 90 12 34 0 47 30 90 34 ...
## $ availability_365 : num [1:38243] 338 194 365 123 192 0 322 179 365 309 ...
## $ calendar_last_scraped : Date[1:38243], format: "2021-12-05" "2021-12-05" ...
## $ number_of_reviews : num [1:38243] 48 409 50 2 507 118 204 181 0 234 ...
## $ number_of_reviews_ltm : num [1:38243] 0 32 0 1 33 0 23 1 0 1 ...
## $ number_of_reviews_l30d : num [1:38243] 0 0 0 0 2 0 2 0 0 0 ...
## $ first_review : Date[1:38243], format: "2009-11-21" "2015-01-05" ...
## $ last_review : Date[1:38243], format: "2019-11-04" "2021-10-22" ...
## $ review_scores_rating : num [1:38243] 4.7 4.45 4.52 5 4.21 4.91 4.7 4.56 NA 4.88 ...
## $ review_scores_accuracy : num [1:38243] 4.72 4.58 4.22 5 4.21 4.83 4.71 4.59 NA 4.81 ...
## $ review_scores_cleanliness : num [1:38243] 4.62 4.49 4.09 5 3.73 4.82 4.61 4.86 NA 4.96 ...
## $ review_scores_checkin : num [1:38243] 4.76 4.78 4.91 5 4.66 4.97 4.79 4.84 NA 4.96 ...
## $ review_scores_communication : num [1:38243] 4.79 4.8 4.91 5 4.42 4.95 4.82 4.8 NA 4.95 ...
## $ review_scores_location : num [1:38243] 4.86 4.71 4.47 4.5 4.87 4.94 4.87 4.67 NA 4.84 ...
## $ review_scores_value : num [1:38243] 4.41 4.64 4.52 5 4.36 4.92 4.73 4.57 NA 4.84 ...
## $ license : logi [1:38243] NA NA NA NA NA NA ...
## $ instant_bookable : logi [1:38243] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ calculated_host_listings_count : num [1:38243] 3 1 2 1 1 1 3 1 2 1 ...
## $ calculated_host_listings_count_entire_homes : num [1:38243] 3 1 0 1 0 0 1 1 0 0 ...
## $ calculated_host_listings_count_private_rooms: num [1:38243] 0 0 2 0 1 1 2 0 2 1 ...
## $ calculated_host_listings_count_shared_rooms : num [1:38243] 0 0 0 0 0 0 0 0 0 0 ...
## $ reviews_per_month : num [1:38243] 0.33 4.86 0.52 0.02 3.68 0.87 1.48 1.24 NA 1.82 ...
# The str() output above shows that some columns expected to be numbers are stored as
# another type (for example, price is character).
# Strip the formatting characters and convert those columns to numeric so they can be used in the models.
strip_to_numeric <- function(x, pattern) {
  # Remove every match of `pattern`, then coerce what is left to numeric.
  as.numeric(gsub(pattern, "", as.character(x)))
}
# host_response_rate: drop the percent sign
groupData$host_response_rate <- strip_to_numeric(groupData$host_response_rate, "%")
## Warning: NAs introduced by coercion
# host_acceptance_rate: drop the percent sign
groupData$host_acceptance_rate <- strip_to_numeric(groupData$host_acceptance_rate, "%")
## Warning: NAs introduced by coercion
# price: drop the dollar sign and the thousands separators
groupData$price <- strip_to_numeric(groupData$price, "[\\$,]")
# Show only the columns of interest so there is no need to scroll through all 74 columns.
# Some NA values remain in this view: so far only rows missing host_is_superhost have been removed.
groupData[, c("host_response_rate", "host_acceptance_rate", "review_scores_value", "host_is_superhost", "reviews_per_month", "price")]
# REMOVE ALL THESE NA VALUES AND REPLACE THEM WITH MEANINGFUL VALUES
# Every cleaned column gets a companion flag_<column>:
#   1 = the value was missing and has been filled in, 2 = the value was present in the raw data.
# Each column is processed exactly once. Imputing a column a second time would recompute its
# flag after the NAs are already gone and silently reset every flag to 2.

# Fill the missing entries of a numeric column with the column median.
# Returns a list with the imputed `values` and the 1/2 `flag` vector described above.
impute_median_with_flag <- function(x) {
  values <- as.numeric(x)
  was_missing <- is.na(values)
  # median() is NA when every value is missing (bathrooms is entirely NA in this data),
  # so such a column stays all NA and is flagged 1 throughout.
  values[was_missing] <- median(values, na.rm = TRUE)
  list(values = values, flag = ifelse(was_missing, 1, 2))
}

# Fill the missing entries (NA, "" or "N/A") of a TRUE/FALSE column with `default`
# and return it as a factor together with the 1/2 `flag` vector.
impute_label_with_flag <- function(x, default) {
  labels <- as.character(x)
  was_missing <- is.na(labels) | labels %in% c("", "N/A")
  labels[was_missing] <- default
  list(values = as.factor(labels), flag = ifelse(was_missing, 1, 2))
}

# Apply the median imputation to each named column of `data` and add its flag column.
impute_numeric_columns <- function(data, cols) {
  for (col in cols) {
    imputed <- impute_median_with_flag(data[[col]])
    data[[paste0("flag_", col)]] <- imputed$flag
    data[[col]] <- imputed$values
  }
  data
}

# Apply the label imputation to each column named in `defaults` (column name -> fill label).
impute_label_columns <- function(data, defaults) {
  for (col in names(defaults)) {
    imputed <- impute_label_with_flag(data[[col]], defaults[[col]])
    data[[paste0("flag_", col)]] <- imputed$flag
    data[[col]] <- imputed$values
  }
  data
}

groupData <- impute_numeric_columns(
  groupData,
  c(
    "host_response_rate", "host_acceptance_rate", "host_listings_count",
    "bathrooms", "bedrooms", "beds", "reviews_per_month",
    "review_scores_rating", "review_scores_accuracy", "review_scores_cleanliness",
    "review_scores_checkin", "review_scores_communication",
    "review_scores_location", "review_scores_value"
  )
)

# The fill labels match the "FALSE"/"TRUE" levels these logical columns produce,
# so an imputed value can never introduce a stray third factor level.
groupData <- impute_label_columns(
  groupData,
  c(
    host_is_superhost = "FALSE",
    host_has_profile_pic = "TRUE",
    host_identity_verified = "TRUE"
  )
)

# Earlier versions of this script also created these two differently named flag columns;
# keep them as copies so anything that refers to the old names still works.
groupData$flag_host_review_scores_value <- groupData$flag_review_scores_value
groupData$flag_host_reviews_per_month <- groupData$flag_reviews_per_month
# Standardize the numeric predictors (z-scores) with scale(): center = TRUE subtracts the column
# mean from each value and scale = TRUE divides by the column standard deviation.
# Both arguments default to TRUE; they are spelled out here for visual reference.
scaled_predictors <- scale(
  groupData[, c("host_response_rate", "host_acceptance_rate", "price", "review_scores_rating")],
  center = TRUE,
  scale = TRUE
)
# Combine the scaled predictors with the target variable and the two host indicator columns
# into myData1. The target stays a factor; as.integer() turns each indicator factor into its
# level codes.
myData1 <- data.frame(
  scaled_predictors,
  host_is_superhost = as.factor(groupData$host_is_superhost),
  host_identity_verified = as.integer(groupData$host_identity_verified),
  host_has_profile_pic = as.integer(groupData$host_has_profile_pic)
)
myData1
# Fix the random seed so the split is the same on every run.
# 85% of the rows go to the training set and the remaining 15% to the validation set.
set.seed(1)
myIndex <- createDataPartition(y = groupData$host_is_superhost, p = 0.85, list = FALSE)
train_rows <- as.vector(myIndex)
trainSet <- myData1[train_rows, ]
validationSet <- myData1[-train_rows, ]
# Resampling scheme: cross-validation ("cv") with 10 folds.
myCtrl <- trainControl(method = "cv", number = 10)
# Candidate values of k (1 through 10) collected in a tuning grid. The grid is chosen
# independently of the number of folds; train() keeps the k with the highest
# cross-validated accuracy.
myGrid <- expand.grid(.k = seq_len(10))
# Fix the seed for reproducible resampling, then fit KNN on the training set
# (target variable ~ independent variables).
set.seed(1)
KNN_fit <- train(
  host_is_superhost ~ price + host_acceptance_rate + host_response_rate +
    host_identity_verified + review_scores_rating + host_has_profile_pic,
  data = trainSet,
  method = "knn",
  trControl = myCtrl,
  tuneGrid = myGrid
)
KNN_fit
## k-Nearest Neighbors
##
## 32508 samples
## 6 predictor
## 2 classes: 'FALSE', 'TRUE'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 29258, 29257, 29257, 29257, 29257, 29258, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.8188138 0.3912500
## 2 0.8188760 0.3860313
## 3 0.8352406 0.4106870
## 4 0.8325029 0.3938209
## 5 0.8385320 0.4056350
## 6 0.8373629 0.3968487
## 7 0.8398854 0.3979378
## 8 0.8377936 0.3876076
## 9 0.8407160 0.3931024
## 10 0.8395472 0.3860917
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
# The tuning step above gives us the optimal k, the one with the highest cross-validated accuracy.
# Apply that fitted model to the validation set and build the confusion matrix. The positive class is FALSE.
# Out of 5735 observations (15% of the 38243 total observations), the matrix printed below shows:
#
# 4406 True Positives  (actual FALSE, predicted FALSE)
# 661 False Positives  (actual TRUE, predicted FALSE)
# 223 False Negatives  (actual FALSE, predicted TRUE)
# 445 True Negatives   (actual TRUE, predicted TRUE)
KNN_Class <- predict(KNN_fit, newdata = validationSet)
confusionMatrix(
  data = KNN_Class,
  reference = validationSet$host_is_superhost
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 4406 661
## TRUE 223 445
##
## Accuracy : 0.8459
## 95% CI : (0.8363, 0.8551)
## No Information Rate : 0.8071
## P-Value [Acc > NIR] : 1.286e-14
##
## Kappa : 0.417
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9518
## Specificity : 0.4024
## Pos Pred Value : 0.8695
## Neg Pred Value : 0.6662
## Prevalence : 0.8071
## Detection Rate : 0.7683
## Detection Prevalence : 0.8835
## Balanced Accuracy : 0.6771
##
## 'Positive' Class : FALSE
##
# Predict the probability of each class of the target (host_is_superhost is either TRUE or FALSE)
# instead of the class membership itself.
# Column 1 holds the probability of the positive class (FALSE) and column 2 the probability of TRUE.
KNN_Class_prob <- predict(KNN_fit, newdata = validationSet, type = "prob")
KNN_Class_prob
# Establish a NEW cutoff for class membership. The default cutoff is 0.5; here a listing is
# classified as TRUE whenever its predicted probability of TRUE exceeds 0.005.
# The labels were previously the wrong way round (a probability of TRUE above the cutoff was
# labelled 'FALSE'), which produced a below-chance accuracy and a negative kappa.
# factor() reuses the levels of the target so both classes are always present, even if the
# cutoff assigns every row to the same class.
# NOTE: any captured output that follows this call predates the label fix.
cutoff_class <- factor(
  ifelse(KNN_Class_prob[, "TRUE"] > 0.005, "TRUE", "FALSE"),
  levels = levels(validationSet$host_is_superhost)
)
confusionMatrix(cutoff_class, validationSet$host_is_superhost, positive = 'FALSE')
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 2611 1011
## TRUE 2018 95
##
## Accuracy : 0.4718
## 95% CI : (0.4588, 0.4849)
## No Information Rate : 0.8071
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.26
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.56405
## Specificity : 0.08590
## Pos Pred Value : 0.72087
## Neg Pred Value : 0.04496
## Prevalence : 0.80715
## Detection Rate : 0.45527
## Detection Prevalence : 0.63156
## Balanced Accuracy : 0.32497
##
## 'Positive' Class : FALSE
##
# Convert the target variable to a numeric 0/1 indicator (1 = superhost).
# The factor's labels are "TRUE"/"FALSE", so as.numeric(as.character(...)) would turn every
# value into NA; comparing against the "TRUE" label gives the intended 0/1 values.
validationSet$host_is_superhost <- as.numeric(validationSet$host_is_superhost == "TRUE")
# Generate the cumulative gains table.
# gains() requires the actual class membership and the predicted probability of the target class,
# so it is given the 0/1 superhost indicator (not a predictor such as host_response_rate)
# and the predicted probability of TRUE.
# The table divides the cases into groups based on the probability of belonging to the target class.
# The last column shows the average predicted probability of each group.
# NOTE: any captured output that follows this call predates these fixes.
library(gains)
gains_table <- gains(validationSet$host_is_superhost, KNN_Class_prob[, 2])
gains_table
## Depth Cume Cume Pct Mean
## of Cume Mean Mean of Total Lift Cume Model
## File N N Resp Resp Resp Index Lift Score
## -------------------------------------------------------------------------
## 11 628 628 0.26 0.26 -268.1% -2449 -2449 0.74
## 23 664 1292 0.22 0.24 -504.1% -2038 -2238 0.39
## 32 542 1834 0.14 0.21 -629.1% -1323 -1967 0.24
## 48 898 2732 0.08 0.17 -748.6% -764 -1572 0.13
## 51 181 2913 0.28 0.18 -830.0% -2578 -1634 0.10
## 60 536 3449 0.28 0.19 -1073.0% -2600 -1784 0.06
## 100 2286 5735 -0.32 -0.01 100.0% 2943 100 0.00
## NA NA NA NA NA NA% NA NA NA
## NA NA NA NA NA NA% NA NA NA
## NA NA NA NA NA NA% NA NA NA
#Loading required packages
library(gains)
library(rpart)
library(rpart.plot)
library(pROC)
library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Make sure the target and the two host indicator columns are stored as factors.
factor_cols <- c("host_is_superhost", "host_identity_verified", "host_has_profile_pic")
myData1[factor_cols] <- lapply(myData1[factor_cols], as.factor)
# Partition on host_is_superhost again with a fixed seed: 85% training, 15% validation.
set.seed(1)
myIndex <- createDataPartition(y = myData1$host_is_superhost, p = 0.85, list = FALSE)
train_rows <- as.vector(myIndex)
trainSet <- myData1[train_rows, ]
validationSet <- myData1[-train_rows, ]
# Fit the tree ensemble with a fixed seed: 100 trees, 3 candidate variables tried at each split,
# and variable importance recorded so it can be plotted afterwards.
# NOTE(review): the formula uses all 6 remaining columns as predictors, so mtry = 3 makes this a
# random forest rather than pure bagging (which would try all predictors) - confirm the intent.
set.seed(1)
bagging_tree <- randomForest(
  host_is_superhost ~ .,
  data = trainSet,
  ntree = 100,
  mtry = 3,
  importance = TRUE
)
# Plot the type 1 variable importance measure.
varImpPlot(bagging_tree, type = 1)

#It is evident from the plot that the review score (review_scores_rating, the only review score among the model's predictors) is the strongest predictor in the model, with host_acceptance_rate being the second most relevant predictor and price the third.
# Classify the validation set with the fitted ensemble and compare the predictions with the
# actual classes, treating TRUE (superhost) as the positive class.
predicted_class <- predict(bagging_tree, newdata = validationSet)
confusionMatrix(
  data = predicted_class,
  reference = validationSet$host_is_superhost,
  positive = "TRUE"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 4422 586
## TRUE 207 520
##
## Accuracy : 0.8617
## 95% CI : (0.8525, 0.8706)
## No Information Rate : 0.8071
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4892
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.47016
## Specificity : 0.95528
## Pos Pred Value : 0.71527
## Neg Pred Value : 0.88299
## Prevalence : 0.19285
## Detection Rate : 0.09067
## Detection Prevalence : 0.12677
## Balanced Accuracy : 0.71272
##
## 'Positive' Class : TRUE
##
#The model has an 86.17% accuracy rate with a 71.53% positive predictive value.
# Predicted class probabilities for the validation set; column 2 is the probability of TRUE.
predicted_prob <- predict(bagging_tree, validationSet, type = "prob")
# Convert the target to a numeric 0/1 indicator (1 = superhost). Its labels are "TRUE"/"FALSE",
# so as.numeric(as.character(...)) would produce only NAs and an all-NA gains table;
# comparing against the "TRUE" label gives the intended 0/1 values.
validationSet$host_is_superhost <- as.numeric(validationSet$host_is_superhost == "TRUE")
# Cumulative gains table: actual class membership versus predicted probability of TRUE.
# NOTE: any captured output that follows this call predates the conversion fix.
gains_table <- gains(validationSet$host_is_superhost, predicted_prob[, 2])
gains_table
## Depth Cume Cume Pct Mean
## of Cume Mean Mean of Total Lift Cume Model
## File N N Resp Resp Resp Index Lift Score
## -------------------------------------------------------------------------
## 10 581 581 NA NA NA% NA NA 0.83
## 20 593 1174 NA NA NA% NA NA 0.42
## 30 546 1720 NA NA NA% NA NA 0.19
## 41 644 2364 NA NA NA% NA NA 0.08
## 51 534 2898 NA NA NA% NA NA 0.03
## 100 2837 5735 NA NA NA% NA NA 0.00
## NA NA NA NA NA NA% NA NA NA
## NA NA NA NA NA NA% NA NA NA
## NA NA NA NA NA NA% NA NA NA
## NA NA NA NA NA NA% NA NA NA
#Not getting correct response from this