# Using Machine Learning to Predict Airbnb Superhosts

# Libraries to ingest CSV file

library(readr)

# Libraries for machine learning
library(tidyr)
library(class)
library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

library(ipred)
library(e1071)
library(klaR)

## Loading required package: MASS

library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:tidyr':
## 
##     extract

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

# Libraries for data cleaning and preprocessing
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(ggplot2)
library(corrplot)

## corrplot 0.92 loaded

# Ingest the original Airbnb data and create a data frame called "groupData".
groupData <- read_csv("/Users/iattram1/Desktop/MA ECON/Data Mining/Group 3/Airbnb_Data/Airbnb_Data/Airbnb_Data.csv", show_col_types = FALSE)

## Warning: One or more parsing issues, see `problems()` for details

# Keep only the rows where the target variable host_is_superhost is present.
# Predictor columns may still contain missing values after this step.
groupData <- groupData[!is.na(groupData$host_is_superhost), ]

# View() opens the interactive data viewer, so it is only called when the
# session is interactive; non-interactive runs skip it.
if (interactive()) {
  View(groupData)
}

# Get summary statistics of the data.
# The review-related columns (first_review, last_review, review_scores_rating,
# review_scores_accuracy, review_scores_cleanliness, review_scores_checkin,
# review_scores_communication, review_scores_location, review_scores_value and
# reviews_per_month) each have roughly 9,500 to 10,100 NA's, which is about
# 25% of the rows.
# Among the other numeric columns the summary shows fewer NA's: bedrooms
# (3974), beds (2405) and the derived minimum/maximum nights columns
# (18 each). bathrooms, calendar_updated and license are NA in every row.
# The output also confirms there are no missing values in the target
# variable, host_is_superhost.

summary(groupData)

##        id           listing_url          scrape_id          last_scraped       
##  Min.   :    2595   Length:38243       Min.   :2.021e+13   Min.   :2021-12-04  
##  1st Qu.:13415526   Class :character   1st Qu.:2.021e+13   1st Qu.:2021-12-05  
##  Median :30814778   Mode  :character   Median :2.021e+13   Median :2021-12-05  
##  Mean   :29626704                      Mean   :2.021e+13   Mean   :2021-12-04  
##  3rd Qu.:46431450                      3rd Qu.:2.021e+13   3rd Qu.:2021-12-05  
##  Max.   :53665099                      Max.   :2.021e+13   Max.   :2021-12-05  
##                                                                                
##      name           description        neighborhood_overview picture_url       
##  Length:38243       Length:38243       Length:38243          Length:38243      
##  Class :character   Class :character   Class :character      Class :character  
##  Mode  :character   Mode  :character   Mode  :character      Mode  :character  
##                                                                                
##                                                                                
##                                                                                
##                                                                                
##     host_id            host_url          host_name           host_since        
##  Min.   :     2438   Length:38243       Length:38243       Min.   :2008-08-22  
##  1st Qu.: 11395166   Class :character   Class :character   1st Qu.:2014-01-21  
##  Median : 50045329   Mode  :character   Mode  :character   Median :2015-12-01  
##  Mean   :114844816                                         Mean   :2016-03-05  
##  3rd Qu.:200239515                                         3rd Qu.:2018-07-05  
##  Max.   :434408046                                         Max.   :2021-12-02  
##                                                                                
##  host_location       host_about        host_response_time host_response_rate
##  Length:38243       Length:38243       Length:38243       Length:38243      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  host_acceptance_rate host_is_superhost host_thumbnail_url host_picture_url  
##  Length:38243         Mode :logical     Length:38243       Length:38243      
##  Class :character     FALSE:30865       Class :character   Class :character  
##  Mode  :character     TRUE :7378        Mode  :character   Mode  :character  
##                                                                              
##                                                                              
##                                                                              
##                                                                              
##  host_neighbourhood host_listings_count host_total_listings_count
##  Length:38243       Min.   :   0.00     Min.   :   0.00          
##  Class :character   1st Qu.:   1.00     1st Qu.:   1.00          
##  Mode  :character   Median :   1.00     Median :   1.00          
##                     Mean   :  49.02     Mean   :  49.02          
##                     3rd Qu.:   3.00     3rd Qu.:   3.00          
##                     Max.   :3750.00     Max.   :3750.00          
##                                                                  
##  host_verifications host_has_profile_pic host_identity_verified
##  Length:38243       Mode :logical        Mode :logical         
##  Class :character   FALSE:249            FALSE:6377            
##  Mode  :character   TRUE :37994          TRUE :31866           
##                                                                
##                                                                
##                                                                
##                                                                
##  neighbourhood      neighbourhood_cleansed neighbourhood_group_cleansed
##  Length:38243       Length:38243           Length:38243                
##  Class :character   Class :character       Class :character            
##  Mode  :character   Mode  :character       Mode  :character            
##                                                                        
##                                                                        
##                                                                        
##                                                                        
##     latitude       longitude      property_type       room_type        
##  Min.   :40.50   Min.   :-74.25   Length:38243       Length:38243      
##  1st Qu.:40.69   1st Qu.:-73.98   Class :character   Class :character  
##  Median :40.73   Median :-73.95   Mode  :character   Mode  :character  
##  Mean   :40.73   Mean   :-73.95                                        
##  3rd Qu.:40.76   3rd Qu.:-73.93                                        
##  Max.   :40.91   Max.   :-73.71                                        
##                                                                        
##   accommodates    bathrooms      bathrooms_text        bedrooms     
##  Min.   : 0.000   Mode:logical   Length:38243       Min.   : 1.000  
##  1st Qu.: 2.000   NA's:38243     Class :character   1st Qu.: 1.000  
##  Median : 2.000                  Mode  :character   Median : 1.000  
##  Mean   : 2.792                                     Mean   : 1.324  
##  3rd Qu.: 4.000                                     3rd Qu.: 1.000  
##  Max.   :16.000                                     Max.   :16.000  
##                                                     NA's   :3974    
##       beds         amenities            price           minimum_nights   
##  Min.   : 1.000   Length:38243       Length:38243       Min.   :   1.00  
##  1st Qu.: 1.000   Class :character   Class :character   1st Qu.:   3.00  
##  Median : 1.000   Mode  :character   Mode  :character   Median :  30.00  
##  Mean   : 1.593                                         Mean   :  21.63  
##  3rd Qu.: 2.000                                         3rd Qu.:  30.00  
##  Max.   :24.000                                         Max.   :1250.00  
##  NA's   :2405                                                            
##  maximum_nights      minimum_minimum_nights maximum_minimum_nights
##  Min.   :1.000e+00   Min.   :   1.00        Min.   :   1.00       
##  1st Qu.:6.000e+01   1st Qu.:   3.00        1st Qu.:   3.00       
##  Median :1.125e+03   Median :  30.00        Median :  30.00       
##  Mean   :5.785e+04   Mean   :  21.84        Mean   :  27.63       
##  3rd Qu.:1.125e+03   3rd Qu.:  30.00        3rd Qu.:  30.00       
##  Max.   :2.147e+09   Max.   :1250.00        Max.   :1250.00       
##                      NA's   :18             NA's   :18            
##  minimum_maximum_nights maximum_maximum_nights minimum_nights_avg_ntm
##  Min.   :1.000e+00      Min.   :1.000e+00      Min.   :   1.00       
##  1st Qu.:3.600e+02      1st Qu.:3.650e+02      1st Qu.:   3.00       
##  Median :1.125e+03      Median :1.125e+03      Median :  30.00       
##  Mean   :1.463e+06      Mean   :3.541e+06      Mean   :  27.22       
##  3rd Qu.:1.125e+03      3rd Qu.:1.125e+03      3rd Qu.:  30.00       
##  Max.   :2.147e+09      Max.   :2.147e+09      Max.   :1250.00       
##  NA's   :18             NA's   :18             NA's   :18            
##  maximum_nights_avg_ntm calendar_updated has_availability availability_30 
##  Min.   :1.000e+00      Mode:logical     Mode :logical    Min.   : 0.000  
##  1st Qu.:3.650e+02      NA's:38243       FALSE:2116       1st Qu.: 0.000  
##  Median :1.125e+03                       TRUE :36127      Median : 1.000  
##  Mean   :2.249e+06                                        Mean   : 6.911  
##  3rd Qu.:1.125e+03                                        3rd Qu.:11.000  
##  Max.   :2.147e+09                                        Max.   :30.000  
##  NA's   :18                                                               
##  availability_60 availability_90 availability_365 calendar_last_scraped
##  Min.   : 0.00   Min.   : 0.00   Min.   :  0.0    Min.   :2021-12-04   
##  1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.:  0.0    1st Qu.:2021-12-05   
##  Median : 8.00   Median :25.00   Median : 73.0    Median :2021-12-05   
##  Mean   :19.79   Mean   :33.95   Mean   :134.3    Mean   :2021-12-04   
##  3rd Qu.:38.00   3rd Qu.:67.00   3rd Qu.:302.0    3rd Qu.:2021-12-05   
##  Max.   :60.00   Max.   :90.00   Max.   :365.0    Max.   :2021-12-05   
##                                                                        
##  number_of_reviews number_of_reviews_ltm number_of_reviews_l30d
##  Min.   :   0.0    Min.   :  0.000       Min.   : 0.0000       
##  1st Qu.:   1.0    1st Qu.:  0.000       1st Qu.: 0.0000       
##  Median :   4.0    Median :  0.000       Median : 0.0000       
##  Mean   :  23.3    Mean   :  5.034       Mean   : 0.5401       
##  3rd Qu.:  20.0    3rd Qu.:  3.000       3rd Qu.: 0.0000       
##  Max.   :1009.0    Max.   :669.000       Max.   :67.0000       
##                                                                
##   first_review         last_review         review_scores_rating
##  Min.   :2009-08-09   Min.   :2010-12-21   Min.   :0.000       
##  1st Qu.:2017-03-24   1st Qu.:2019-05-04   1st Qu.:4.570       
##  Median :2019-04-06   Median :2021-04-10   Median :4.820       
##  Mean   :2018-11-26   Mean   :2020-04-11   Mean   :4.586       
##  3rd Qu.:2021-02-27   3rd Qu.:2021-11-05   3rd Qu.:5.000       
##  Max.   :2021-12-04   Max.   :2021-12-05   Max.   :5.000       
##  NA's   :9490         NA's   :9490         NA's   :9490        
##  review_scores_accuracy review_scores_cleanliness review_scores_checkin
##  Min.   :0.000          Min.   :0.000             Min.   :0.000        
##  1st Qu.:4.700          1st Qu.:4.500             1st Qu.:4.800        
##  Median :4.900          Median :4.800             Median :4.960        
##  Mean   :4.744          Mean   :4.612             Mean   :4.814        
##  3rd Qu.:5.000          3rd Qu.:5.000             3rd Qu.:5.000        
##  Max.   :5.000          Max.   :5.000             Max.   :5.000        
##  NA's   :10099          NA's   :10088             NA's   :10106        
##  review_scores_communication review_scores_location review_scores_value
##  Min.   :0.000               Min.   :0.00           Min.   :0.000      
##  1st Qu.:4.810               1st Qu.:4.67           1st Qu.:4.550      
##  Median :4.970               Median :4.88           Median :4.780      
##  Mean   :4.808               Mean   :4.75           Mean   :4.647      
##  3rd Qu.:5.000               3rd Qu.:5.00           3rd Qu.:5.000      
##  Max.   :5.000               Max.   :5.00           Max.   :5.000      
##  NA's   :10095               NA's   :10109          NA's   :10110      
##  license        instant_bookable calculated_host_listings_count
##  Mode:logical   Mode :logical    Min.   :  1.00                
##  NA's:38243     FALSE:27832      1st Qu.:  1.00                
##                 TRUE :10411      Median :  1.00                
##                                  Mean   : 17.76                
##                                  3rd Qu.:  3.00                
##                                  Max.   :421.00                
##                                                                
##  calculated_host_listings_count_entire_homes
##  Min.   :  0.000                            
##  1st Qu.:  0.000                            
##  Median :  1.000                            
##  Mean   :  8.043                            
##  3rd Qu.:  1.000                            
##  Max.   :308.000                            
##                                             
##  calculated_host_listings_count_private_rooms
##  Min.   :  0.000                             
##  1st Qu.:  0.000                             
##  Median :  0.000                             
##  Mean   :  9.602                             
##  3rd Qu.:  1.000                             
##  Max.   :359.000                             
##                                              
##  calculated_host_listings_count_shared_rooms reviews_per_month
##  Min.   :0.00000                             Min.   :  0.010  
##  1st Qu.:0.00000                             1st Qu.:  0.120  
##  Median :0.00000                             Median :  0.480  
##  Mean   :0.04801                             Mean   :  1.722  
##  3rd Qu.:0.00000                             3rd Qu.:  1.780  
##  Max.   :8.00000                             Max.   :141.000  
##                                              NA's   :9490

# Display the structure of groupData: the stored type and the first few
# values of every column, used to spot columns held in an unexpected type.

str(groupData)

## tibble [38,243 × 74] (S3: tbl_df/tbl/data.frame)
##  $ id                                          : num [1:38243] 2595 3831 5121 5136 5178 ...
##  $ listing_url                                 : chr [1:38243] "https://www.airbnb.com/rooms/2595" "https://www.airbnb.com/rooms/3831" "https://www.airbnb.com/rooms/5121" "https://www.airbnb.com/rooms/5136" ...
##  $ scrape_id                                   : num [1:38243] 2.02e+13 2.02e+13 2.02e+13 2.02e+13 2.02e+13 ...
##  $ last_scraped                                : Date[1:38243], format: "2021-12-05" "2021-12-05" ...
##  $ name                                        : chr [1:38243] "Skylit Midtown Castle" "Whole flr w/private bdrm, bath & kitchen(pls read)" "BlissArtsSpace!" "Spacious Brooklyn Duplex, Patio + Garden" ...
##  $ description                                 : chr [1:38243] "Beautiful, spacious skylit studio in the heart of Midtown, Manhattan. <br /><br />STUNNING SKYLIT STUDIO / 1 BE"| __truncated__ "Enjoy 500 s.f. top floor in 1899 brownstone, w/ wood & ceramic flooring throughout, roomy bdrm, & upgraded kitc"| __truncated__ "<b>The space</b><br />HELLO EVERYONE AND THANKS FOR VISITING BLISS ART SPACE! <br /><br />Thank you all for you"| __truncated__ "We welcome you to stay in our lovely 2 br duplex in South Slope, Brooklyn.  Our home is a truly spacious respit"| __truncated__ ...
##  $ neighborhood_overview                       : chr [1:38243] "Centrally located in the heart of Manhattan just a few blocks from all subway connections in the very desirable"| __truncated__ "Just the right mix of urban center and local neighborhood; close to all but enough quiet for a calming walk. 15"| __truncated__ NA NA ...
##  $ picture_url                                 : chr [1:38243] "https://a0.muscache.com/pictures/f0813a11-40b2-489e-8217-89a2e1637830.jpg" "https://a0.muscache.com/pictures/e49999c2-9fd5-4ad5-b7cc-224deac989aa.jpg" "https://a0.muscache.com/pictures/2090980c-b68e-4349-a874-4818402923e7.jpg" "https://a0.muscache.com/pictures/miso/Hosting-5136/original/adf1e231-7c60-4475-86c0-cee0cd16f538.jpeg" ...
##  $ host_id                                     : num [1:38243] 2845 4869 7356 7378 8967 ...
##  $ host_url                                    : chr [1:38243] "https://www.airbnb.com/users/show/2845" "https://www.airbnb.com/users/show/4869" "https://www.airbnb.com/users/show/7356" "https://www.airbnb.com/users/show/7378" ...
##  $ host_name                                   : chr [1:38243] "Jennifer" "LisaRoxanne" "Garon" "Rebecca" ...
##  $ host_since                                  : Date[1:38243], format: "2008-09-09" "2008-12-07" ...
##  $ host_location                               : chr [1:38243] "New York, New York, United States" "New York, New York, United States" "New York, New York, United States" "Brooklyn, New York, United States" ...
##  $ host_about                                  : chr [1:38243] "A New Yorker since 2000! My passion is creating beautiful, unique spaces where unforgettable memories are made."| __truncated__ "Laid-back Native New Yorker (formerly bi-coastal) and AirBnb host of over 6 years and over 400 stays!  Besides "| __truncated__ "I am an artist(painter, filmmaker) and curator who is working in the film industry while I'm building my busine"| __truncated__ "Rebecca is an artist/designer, and Henoch is in marketing/sales/event planning.  We have two boys- 5 and 8 yo, "| __truncated__ ...
##  $ host_response_time                          : chr [1:38243] "within a day" "a few days or more" "within an hour" "within a day" ...
##  $ host_response_rate                          : chr [1:38243] "80%" "9%" "100%" "100%" ...
##  $ host_acceptance_rate                        : chr [1:38243] "17%" "69%" "100%" "25%" ...
##  $ host_is_superhost                           : logi [1:38243] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ host_thumbnail_url                          : chr [1:38243] "https://a0.muscache.com/im/pictures/user/50fc57af-a6a3-4e88-8f16-efd6cac7c9bc.jpg?aki_policy=profile_small" "https://a0.muscache.com/im/users/4869/profile_pic/1371927771/original.jpg?aki_policy=profile_small" "https://a0.muscache.com/im/pictures/user/72a61bea-cfb1-45b6-abbb-85bdbd790b32.jpg?aki_policy=profile_small" "https://a0.muscache.com/im/users/7378/profile_pic/1259098621/original.jpg?aki_policy=profile_small" ...
##  $ host_picture_url                            : chr [1:38243] "https://a0.muscache.com/im/pictures/user/50fc57af-a6a3-4e88-8f16-efd6cac7c9bc.jpg?aki_policy=profile_x_medium" "https://a0.muscache.com/im/users/4869/profile_pic/1371927771/original.jpg?aki_policy=profile_x_medium" "https://a0.muscache.com/im/pictures/user/72a61bea-cfb1-45b6-abbb-85bdbd790b32.jpg?aki_policy=profile_x_medium" "https://a0.muscache.com/im/users/7378/profile_pic/1259098621/original.jpg?aki_policy=profile_x_medium" ...
##  $ host_neighbourhood                          : chr [1:38243] "Midtown" "Clinton Hill" "Bedford-Stuyvesant" "Greenwood Heights" ...
##  $ host_listings_count                         : num [1:38243] 8 1 1 1 1 1 3 1 0 3 ...
##  $ host_total_listings_count                   : num [1:38243] 8 1 1 1 1 1 3 1 0 3 ...
##  $ host_verifications                          : chr [1:38243] "['email', 'phone', 'reviews', 'offline_government_id', 'kba', 'selfie', 'government_id', 'identity_manual', 'work_email']" "['email', 'phone', 'reviews', 'offline_government_id', 'kba', 'government_id']" "['email', 'phone', 'facebook', 'reviews', 'offline_government_id', 'selfie', 'government_id', 'identity_manual']" "['email', 'phone', 'reviews']" ...
##  $ host_has_profile_pic                        : logi [1:38243] TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ host_identity_verified                      : logi [1:38243] TRUE TRUE TRUE TRUE FALSE TRUE ...
##  $ neighbourhood                               : chr [1:38243] "New York, United States" "Brooklyn, New York, United States" NA NA ...
##  $ neighbourhood_cleansed                      : chr [1:38243] "Midtown" "Bedford-Stuyvesant" "Bedford-Stuyvesant" "Sunset Park" ...
##  $ neighbourhood_group_cleansed                : chr [1:38243] "Manhattan" "Brooklyn" "Brooklyn" "Brooklyn" ...
##  $ latitude                                    : num [1:38243] 40.8 40.7 40.7 40.7 40.8 ...
##  $ longitude                                   : num [1:38243] -74 -74 -74 -74 -74 ...
##  $ property_type                               : chr [1:38243] "Entire rental unit" "Entire guest suite" "Private room in rental unit" "Entire rental unit" ...
##  $ room_type                                   : chr [1:38243] "Entire home/apt" "Entire home/apt" "Private room" "Entire home/apt" ...
##  $ accommodates                                : num [1:38243] 1 3 2 4 2 1 2 3 1 1 ...
##  $ bathrooms                                   : logi [1:38243] NA NA NA NA NA NA ...
##  $ bathrooms_text                              : chr [1:38243] "1 bath" "1 bath" NA "1.5 baths" ...
##  $ bedrooms                                    : num [1:38243] NA 1 1 2 1 1 1 NA 1 1 ...
##  $ beds                                        : num [1:38243] 1 3 1 2 1 1 NA 1 1 1 ...
##  $ amenities                                   : chr [1:38243] "[\"Extra pillows and blankets\", \"Baking sheet\", \"Luggage dropoff allowed\", \"TV\", \"Hangers\", \"Ethernet"| __truncated__ "[\"Extra pillows and blankets\", \"Luggage dropoff allowed\", \"Free parking on premises\", \"Pack \\u2019n pla"| __truncated__ "[\"Kitchen\", \"Long term stays allowed\", \"Wifi\", \"Heating\", \"Air conditioning\"]" "[\"Kitchen\", \"BBQ grill\", \"Cable TV\", \"Carbon monoxide alarm\", \"Outdoor dining area\", \"Washer\", \"Lo"| __truncated__ ...
##  $ price                                       : chr [1:38243] "$150.00" "$75.00" "$60.00" "$275.00" ...
##  $ minimum_nights                              : num [1:38243] 30 1 30 5 2 2 4 30 30 30 ...
##  $ maximum_nights                              : num [1:38243] 1125 730 730 1125 14 ...
##  $ minimum_minimum_nights                      : num [1:38243] 30 1 30 5 2 2 4 30 30 30 ...
##  $ maximum_minimum_nights                      : num [1:38243] 30 1 30 5 2 2 4 30 30 30 ...
##  $ minimum_maximum_nights                      : num [1:38243] 1125 730 730 1125 14 ...
##  $ maximum_maximum_nights                      : num [1:38243] 1125 730 730 1125 14 ...
##  $ minimum_nights_avg_ntm                      : num [1:38243] 30 1 30 5 2 2 4 30 30 30 ...
##  $ maximum_nights_avg_ntm                      : num [1:38243] 1125 730 730 1125 14 ...
##  $ calendar_updated                            : logi [1:38243] NA NA NA NA NA NA ...
##  $ has_availability                            : logi [1:38243] TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ availability_30                             : num [1:38243] 3 3 30 3 1 0 2 2 30 1 ...
##  $ availability_60                             : num [1:38243] 33 6 60 3 16 0 17 30 60 4 ...
##  $ availability_90                             : num [1:38243] 63 18 90 12 34 0 47 30 90 34 ...
##  $ availability_365                            : num [1:38243] 338 194 365 123 192 0 322 179 365 309 ...
##  $ calendar_last_scraped                       : Date[1:38243], format: "2021-12-05" "2021-12-05" ...
##  $ number_of_reviews                           : num [1:38243] 48 409 50 2 507 118 204 181 0 234 ...
##  $ number_of_reviews_ltm                       : num [1:38243] 0 32 0 1 33 0 23 1 0 1 ...
##  $ number_of_reviews_l30d                      : num [1:38243] 0 0 0 0 2 0 2 0 0 0 ...
##  $ first_review                                : Date[1:38243], format: "2009-11-21" "2015-01-05" ...
##  $ last_review                                 : Date[1:38243], format: "2019-11-04" "2021-10-22" ...
##  $ review_scores_rating                        : num [1:38243] 4.7 4.45 4.52 5 4.21 4.91 4.7 4.56 NA 4.88 ...
##  $ review_scores_accuracy                      : num [1:38243] 4.72 4.58 4.22 5 4.21 4.83 4.71 4.59 NA 4.81 ...
##  $ review_scores_cleanliness                   : num [1:38243] 4.62 4.49 4.09 5 3.73 4.82 4.61 4.86 NA 4.96 ...
##  $ review_scores_checkin                       : num [1:38243] 4.76 4.78 4.91 5 4.66 4.97 4.79 4.84 NA 4.96 ...
##  $ review_scores_communication                 : num [1:38243] 4.79 4.8 4.91 5 4.42 4.95 4.82 4.8 NA 4.95 ...
##  $ review_scores_location                      : num [1:38243] 4.86 4.71 4.47 4.5 4.87 4.94 4.87 4.67 NA 4.84 ...
##  $ review_scores_value                         : num [1:38243] 4.41 4.64 4.52 5 4.36 4.92 4.73 4.57 NA 4.84 ...
##  $ license                                     : logi [1:38243] NA NA NA NA NA NA ...
##  $ instant_bookable                            : logi [1:38243] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ calculated_host_listings_count              : num [1:38243] 3 1 2 1 1 1 3 1 2 1 ...
##  $ calculated_host_listings_count_entire_homes : num [1:38243] 3 1 0 1 0 0 1 1 0 0 ...
##  $ calculated_host_listings_count_private_rooms: num [1:38243] 0 0 2 0 1 1 2 0 2 1 ...
##  $ calculated_host_listings_count_shared_rooms : num [1:38243] 0 0 0 0 0 0 0 0 0 0 ...
##  $ reviews_per_month                           : num [1:38243] 0.33 4.86 0.52 0.02 3.68 0.87 1.48 1.24 NA 1.82 ...

# Examine the str() output above: some columns that should be numeric are stored as another data type, e.g. price is character.
# Convert these columns to numeric so they can be used in regression.

# Converting host_response_rate from text such as "80%" to a number.
# Missing-value sentinels are mapped to NA explicitly instead of relying on
# the "NAs introduced by coercion" warning, so any warning as.numeric() still
# raises points at a genuinely unparseable value.
# NOTE(review): "N/A" is presumed to be the sentinel used in this column -
# confirm with unique(groupData$host_response_rate).
groupData$host_response_rate <- as.character(groupData$host_response_rate)
groupData$host_response_rate[
  groupData$host_response_rate %in% c("N/A", "")
] <- NA_character_
groupData$host_response_rate <- as.numeric(
  gsub("%", "", groupData$host_response_rate, fixed = TRUE))

# Converting host_acceptance_rate from text such as "17%" to a number.
# Missing-value sentinels are mapped to NA explicitly instead of relying on
# the "NAs introduced by coercion" warning, so any warning as.numeric() still
# raises points at a genuinely unparseable value.
# NOTE(review): "N/A" is presumed to be the sentinel used in this column -
# confirm with unique(groupData$host_acceptance_rate).
groupData$host_acceptance_rate <- as.character(groupData$host_acceptance_rate)
groupData$host_acceptance_rate[
  groupData$host_acceptance_rate %in% c("N/A", "")
] <- NA_character_
groupData$host_acceptance_rate <- as.numeric(
  gsub("%", "", groupData$host_acceptance_rate, fixed = TRUE))

# Converting price: strip the dollar sign and any commas from values such as
# "$150.00", then parse what is left as a number.
groupData$price <- groupData$price %>%
  as.character() %>%
  gsub("[\\$,]", "", .) %>%
  as.numeric()

# Print just the columns of interest instead of scrolling through all 74.
# Some of them still contain NA values: only rows with a missing
# host_is_superhost have been removed so far.

groupData[, c(
  "host_response_rate",
  "host_acceptance_rate",
  "review_scores_value",
  "host_is_superhost",
  "reviews_per_month",
  "price"
)]

# REMOVE ALL THESE NA VALUES AND REPLACE THEM WITH MEANINGFUL VALUES

# host_response_rate: record which rows are missing (1 = missing,
# 2 = observed), then fill the missing rates with the median of the observed
# ones. The flag must be created first, because the missing rows cannot be
# identified once the column has been imputed.
# Only the NA positions are overwritten; observed values are left untouched
# rather than being passed through as.character() and back.
groupData$flag_host_response_rate <-
  ifelse(is.na(groupData$host_response_rate), 1, 2)

groupData$host_response_rate[is.na(groupData$host_response_rate)] <-
  median(groupData$host_response_rate, na.rm = TRUE)

# host_acceptance_rate: record which rows are missing (1 = missing,
# 2 = observed), then fill the missing rates with the median of the observed
# ones. The flag must be created before the column is imputed.
# Only the NA positions are overwritten; observed values are left untouched
# rather than being passed through as.character() and back.
groupData$flag_host_acceptance_rate <-
  ifelse(is.na(groupData$host_acceptance_rate), 1, 2)

groupData$host_acceptance_rate[is.na(groupData$host_acceptance_rate)] <-
  median(groupData$host_acceptance_rate, na.rm = TRUE)

# review_scores_rating: record which rows are missing (1 = missing,
# 2 = observed), then fill the missing ratings with the median of the
# observed ones. The flag must be created before the column is imputed.
# Only the NA positions are overwritten; observed values are left untouched
# rather than being passed through as.character() and back.
groupData$flag_review_scores_rating <-
  ifelse(is.na(groupData$review_scores_rating), 1, 2)

groupData$review_scores_rating[is.na(groupData$review_scores_rating)] <-
  median(groupData$review_scores_rating, na.rm = TRUE)

# review_scores_value: record which rows are missing (1 = missing,
# 2 = observed), then fill the missing scores with the median of the observed
# ones. The flag must be created before the column is imputed.
# Only the NA positions are overwritten; observed values are left untouched
# rather than being passed through as.character() and back.
groupData$flag_host_review_scores_value <-
  ifelse(is.na(groupData$review_scores_value), 1, 2)

groupData$review_scores_value[is.na(groupData$review_scores_value)] <-
  median(groupData$review_scores_value, na.rm = TRUE)

# reviews_per_month: record which rows are missing (1 = missing,
# 2 = observed), then fill the missing values with the median of the observed
# ones. The flag must be created before the column is imputed.
# Only the NA positions are overwritten; observed values are left untouched
# rather than being passed through as.character() and back.
groupData$flag_host_reviews_per_month <-
  ifelse(is.na(groupData$reviews_per_month), 1, 2)

groupData$reviews_per_month[is.na(groupData$reviews_per_month)] <-
  median(groupData$reviews_per_month, na.rm = TRUE)

# host_is_superhost: the str() output shows this column is logical, so a
# comparison with 'N/A' or '' alone can never match and a real NA would
# produce an NA flag. %in% treats NA, "N/A" and "" all as missing and never
# returns NA itself. Flag: 1 = missing, 2 = observed.
groupData$flag_host_is_superhost <-
  ifelse(groupData$host_is_superhost %in% c(NA, "N/A", ""), 1, 2)

# Missing entries default to "not a superhost". The fill value is "FALSE" so
# that it matches the "FALSE"/"TRUE" levels produced from the logical column.
groupData$host_is_superhost <- as.factor(
  ifelse(groupData$host_is_superhost %in% c(NA, "N/A", ""),
         "FALSE",
         as.character(groupData$host_is_superhost)))


# host_listings_count: record which rows are missing (1 = missing,
# 2 = observed), then fill the missing counts with the median of the observed
# ones. The flag must be created before the column is imputed.
# Only the NA positions are overwritten; observed values are left untouched
# rather than being passed through as.character() and back.
groupData$flag_host_listings_count <-
  ifelse(is.na(groupData$host_listings_count), 1, 2)

groupData$host_listings_count[is.na(groupData$host_listings_count)] <-
  median(groupData$host_listings_count, na.rm = TRUE)


# host_has_profile_pic: the str() output shows this column is logical, so a
# comparison with 'N/A' or '' alone can never match and a real NA would
# produce an NA flag. %in% treats NA, "N/A" and "" all as missing and never
# returns NA itself. Flag: 1 = missing, 2 = observed.
groupData$flag_host_has_profile_pic <-
  ifelse(groupData$host_has_profile_pic %in% c(NA, "N/A", ""), 1, 2)

# Missing entries default to "has a profile picture". The fill value is
# "TRUE" so that it matches the "FALSE"/"TRUE" levels produced from the
# logical column.
groupData$host_has_profile_pic <- as.factor(
  ifelse(groupData$host_has_profile_pic %in% c(NA, "N/A", ""),
         "TRUE",
         as.character(groupData$host_has_profile_pic)))


# host_identity_verified: the str() output shows this column is logical, so a
# comparison with 'N/A' or '' alone can never match and a real NA would
# produce an NA flag. %in% treats NA, "N/A" and "" all as missing and never
# returns NA itself. Flag: 1 = missing, 2 = observed.
groupData$flag_host_identity_verified <-
  ifelse(groupData$host_identity_verified %in% c(NA, "N/A", ""), 1, 2)

# Missing entries default to "identity verified". The fill value is "TRUE" so
# that it matches the "FALSE"/"TRUE" levels produced from the logical column.
groupData$host_identity_verified <- as.factor(
  ifelse(groupData$host_identity_verified %in% c(NA, "N/A", ""),
         "TRUE",
         as.character(groupData$host_identity_verified)))


# bathrooms: the raw column is NA in every row of this scrape, so its median
# is NA as well and median imputation alone leaves the column empty. When
# that happens, recover the count from the free-text bathrooms_text column
# (e.g. "1 bath", "1.5 baths") before flagging and imputing.
if (all(is.na(groupData$bathrooms)) &&
    "bathrooms_text" %in% names(groupData)) {
  groupData$bathrooms <- local({
    bath_text <- as.character(groupData$bathrooms_text)
    bath_count <- rep(NA_real_, length(bath_text))

    # Take the first number that appears in the text.
    num_match <- regexpr("[0-9]+(\\.[0-9]+)?", bath_text)
    bath_count[which(num_match > 0)] <-
      as.numeric(regmatches(bath_text, num_match))

    # NOTE(review): entries without a digit that mention "half" are presumed
    # to mean half a bathroom - confirm against the distinct values of
    # bathrooms_text.
    bath_count[is.na(bath_count) &
                 grepl("half", bath_text, ignore.case = TRUE)] <- 0.5
    bath_count
  })
}

# Flag rows that are still missing (1 = missing, 2 = observed) before filling
# them with the median of the observed counts.
groupData$flag_bathrooms <- ifelse(is.na(groupData$bathrooms), 1, 2)

groupData$bathrooms <- as.numeric(groupData$bathrooms)
groupData$bathrooms[is.na(groupData$bathrooms)] <-
  median(groupData$bathrooms, na.rm = TRUE)


# bedrooms: record which rows are missing (1 = missing, 2 = observed), then
# fill the missing counts with the median of the observed ones. The flag must
# be created before the column is imputed.
# Only the NA positions are overwritten; observed values are left untouched
# rather than being passed through as.character() and back.
groupData$flag_bedrooms <- ifelse(is.na(groupData$bedrooms), 1, 2)

groupData$bedrooms[is.na(groupData$bedrooms)] <-
  median(groupData$bedrooms, na.rm = TRUE)


# beds: record which rows are missing (1 = missing, 2 = observed), then fill
# the missing counts with the median of the observed ones. The flag must be
# created before the column is imputed.
# Only the NA positions are overwritten; observed values are left untouched
# rather than being passed through as.character() and back.
groupData$flag_beds <- ifelse(is.na(groupData$beds), 1, 2)

groupData$beds[is.na(groupData$beds)] <-
  median(groupData$beds, na.rm = TRUE)


# review_scores_rating: flag missing rows (1 = missing, 2 = present), then fill
# them with the median of the observed ratings.
ratingMissing <- is.na(groupData$review_scores_rating) |
  groupData$review_scores_rating == ''
groupData$flag_review_scores_rating <- ifelse(ratingMissing, 1, 2)
groupData$review_scores_rating <- as.numeric(
  ifelse(
    ratingMissing,
    median(groupData$review_scores_rating, na.rm = TRUE),
    as.character(groupData$review_scores_rating)
  )
)


# review_scores_accuracy: flag missing rows (1 = missing, 2 = present), then
# fill them with the median of the observed scores.
accuracyMissing <- is.na(groupData$review_scores_accuracy) |
  groupData$review_scores_accuracy == ''
groupData$flag_review_scores_accuracy <- ifelse(accuracyMissing, 1, 2)
groupData$review_scores_accuracy <- as.numeric(
  ifelse(
    accuracyMissing,
    median(groupData$review_scores_accuracy, na.rm = TRUE),
    as.character(groupData$review_scores_accuracy)
  )
)


# review_scores_cleanliness: flag missing rows (1 = missing, 2 = present), then
# fill them with the median of the observed scores.
cleanlinessMissing <- is.na(groupData$review_scores_cleanliness) |
  groupData$review_scores_cleanliness == ''
groupData$flag_review_scores_cleanliness <- ifelse(cleanlinessMissing, 1, 2)
groupData$review_scores_cleanliness <- as.numeric(
  ifelse(
    cleanlinessMissing,
    median(groupData$review_scores_cleanliness, na.rm = TRUE),
    as.character(groupData$review_scores_cleanliness)
  )
)


# review_scores_checkin: flag missing rows (1 = missing, 2 = present), then
# fill them with the median of the observed scores.
checkinMissing <- is.na(groupData$review_scores_checkin) |
  groupData$review_scores_checkin == ''
groupData$flag_review_scores_checkin <- ifelse(checkinMissing, 1, 2)
groupData$review_scores_checkin <- as.numeric(
  ifelse(
    checkinMissing,
    median(groupData$review_scores_checkin, na.rm = TRUE),
    as.character(groupData$review_scores_checkin)
  )
)


# review_scores_communication: flag missing rows (1 = missing, 2 = present),
# then fill them with the median of the observed scores.
communicationMissing <- is.na(groupData$review_scores_communication) |
  groupData$review_scores_communication == ''
groupData$flag_review_scores_communication <-
  ifelse(communicationMissing, 1, 2)

groupData$review_scores_communication <- as.numeric(
  ifelse(
    communicationMissing,
    median(groupData$review_scores_communication, na.rm = TRUE),
    as.character(groupData$review_scores_communication)
  )
)


# review_scores_location: flag missing rows (1 = missing, 2 = present), then
# fill them with the median of the observed scores.
locationMissing <- is.na(groupData$review_scores_location) |
  groupData$review_scores_location == ''
groupData$flag_review_scores_location <- ifelse(locationMissing, 1, 2)
groupData$review_scores_location <- as.numeric(
  ifelse(
    locationMissing,
    median(groupData$review_scores_location, na.rm = TRUE),
    as.character(groupData$review_scores_location)
  )
)


# review_scores_value: flag missing rows (1 = missing, 2 = present), then fill
# them with the median of the observed scores.
valueMissing <- is.na(groupData$review_scores_value) |
  groupData$review_scores_value == ''
groupData$flag_review_scores_value <- ifelse(valueMissing, 1, 2)

groupData$review_scores_value <- as.numeric(
  ifelse(
    valueMissing,
    median(groupData$review_scores_value, na.rm = TRUE),
    as.character(groupData$review_scores_value)
  )
)


# reviews_per_month: flag missing rows (1 = missing, 2 = present), then fill
# them with the median of the observed values.
reviewsPerMonthMissing <- is.na(groupData$reviews_per_month) |
  groupData$reviews_per_month == ''
groupData$flag_reviews_per_month <- ifelse(reviewsPerMonthMissing, 1, 2)
groupData$reviews_per_month <- as.numeric(
  ifelse(
    reviewsPerMonthMissing,
    median(groupData$reviews_per_month, na.rm = TRUE),
    as.character(groupData$reviews_per_month)
  )
)

# Use the scale function to standardize the values (z-score). center=TRUE means the column mean is subtracted from each value; scale=TRUE means the result is divided by the column's standard deviation
# By default, center and scale are both TRUE; they are included explicitly here for visual reference.
# Then add the target variable and the two host indicator columns to the scaled subset, now called myData1

# Standardize the four numeric predictors, then attach the target (as a factor)
# and the two host indicators (as the integer codes of their factor levels).
scaledPredictors <- scale(
  groupData[, c('host_response_rate', 'host_acceptance_rate', 'price', 'review_scores_rating')],
  center = TRUE,
  scale = TRUE
)
myData1 <- data.frame(
  scaledPredictors,
  host_is_superhost = as.factor(groupData$host_is_superhost),
  host_identity_verified = as.integer(groupData$host_identity_verified),
  host_has_profile_pic = as.integer(groupData$host_has_profile_pic)
)
myData1

# Fix the random seed so the partition is reproducible
# Split the data into a training set (85%) and a validation set (remaining 15%)

set.seed(1)
myIndex <- createDataPartition(
  groupData$host_is_superhost,
  p = 0.85,
  list = FALSE
)
trainSet <- myData1[myIndex, ]
validationSet <- myData1[-myIndex, ]

# Resampling scheme: cross-validation ("cv") with 10 folds (number = 10).

myCtrl <- trainControl(method = "cv", number = 10)

# Candidate values of k (number of neighbours) to tune over: 1 through 10.
# This grid is separate from the number of folds above; train() selects the k with the highest cross-validated accuracy.

myGrid <- expand.grid(.k = seq_len(10))

# Reset the seed so the resampling is reproducible
# Fit KNN on the training set (target ~ predictors), tuning k over myGrid with the myCtrl resampling scheme

set.seed(1)
KNN_fit <- train(
  host_is_superhost ~ price + host_acceptance_rate + host_response_rate +
    host_identity_verified + review_scores_rating + host_has_profile_pic,
  data = trainSet,
  method = "knn",
  trControl = myCtrl,
  tuneGrid = myGrid
)
KNN_fit

## k-Nearest Neighbors 
## 
## 32508 samples
##     6 predictor
##     2 classes: 'FALSE', 'TRUE' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 29258, 29257, 29257, 29257, 29257, 29258, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    1  0.8188138  0.3912500
##    2  0.8188760  0.3860313
##    3  0.8352406  0.4106870
##    4  0.8325029  0.3938209
##    5  0.8385320  0.4056350
##    6  0.8373629  0.3968487
##    7  0.8398854  0.3979378
##    8  0.8377936  0.3876076
##    9  0.8407160  0.3931024
##   10  0.8395472  0.3860917
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.

# The previous step gives us the optimal k, i.e. the one with the highest cross-validated accuracy
# Apply the fitted model above to create the confusion matrix for the validation set.  The positive class is FALSE

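# Out of 5735 observations in the validation set (about 15% of the 38243 total observations), with FALSE as the positive class we have:
#
# 4406 True Positives  (predicted FALSE, actually FALSE)
#  661 False Positives (predicted FALSE, actually TRUE)
#  223 False Negatives (predicted TRUE, actually FALSE)
#  445 True Negatives  (predicted TRUE, actually TRUE)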

# Predict the class of every validation row with the tuned KNN model and
# cross-tabulate the predictions against the actual labels.
KNN_Class <- predict(KNN_fit, newdata = validationSet)
confusionMatrix(
  data = KNN_Class,
  reference = validationSet$host_is_superhost
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE  4406  661
##      TRUE    223  445
##                                           
##                Accuracy : 0.8459          
##                  95% CI : (0.8363, 0.8551)
##     No Information Rate : 0.8071          
##     P-Value [Acc > NIR] : 1.286e-14       
##                                           
##                   Kappa : 0.417           
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9518          
##             Specificity : 0.4024          
##          Pos Pred Value : 0.8695          
##          Neg Pred Value : 0.6662          
##              Prevalence : 0.8071          
##          Detection Rate : 0.7683          
##    Detection Prevalence : 0.8835          
##       Balanced Accuracy : 0.6771          
##                                           
##        'Positive' Class : FALSE           
##

# Predict the probability of each class of the target variable (host_is_superhost, either TRUE or FALSE) instead of the class membership
# The positive class (FALSE) is in column 1 and the other class (TRUE) is in column 2

KNN_Class_prob <- predict(KNN_fit, newdata = validationSet, type = 'prob')
KNN_Class_prob

# Establish a NEW cutoff point to determine the class membership.  The default cutoff point is .5.
# Column 2 holds P(host_is_superhost = TRUE), so a row is labelled 'TRUE' when that probability
# exceeds the cutoff of .005 and 'FALSE' otherwise (the labels were previously swapped).
# factor() with the target's levels keeps both levels even if every prediction falls in one
# class, which confusionMatrix needs in order to line predictions up with the reference.
# NOTE: the confusion matrix printed below was generated with the swapped labels
# (accuracy 0.47, kappa -0.26); re-run this chunk to refresh it.

KNN_Class_cutoff <- factor(
  ifelse(KNN_Class_prob[, 2] > 0.005, 'TRUE', 'FALSE'),
  levels = levels(validationSet$host_is_superhost)
)
confusionMatrix(KNN_Class_cutoff, validationSet$host_is_superhost, positive = 'FALSE')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE  2611 1011
##      TRUE   2018   95
##                                           
##                Accuracy : 0.4718          
##                  95% CI : (0.4588, 0.4849)
##     No Information Rate : 0.8071          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.26           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.56405         
##             Specificity : 0.08590         
##          Pos Pred Value : 0.72087         
##          Neg Pred Value : 0.04496         
##              Prevalence : 0.80715         
##          Detection Rate : 0.45527         
##    Detection Prevalence : 0.63156         
##       Balanced Accuracy : 0.32497         
##                                           
##        'Positive' Class : FALSE           
##

# Convert the target variable to a numeric 0/1 response (1 = superhost).
# The factor labels are 'FALSE'/'TRUE', so as.numeric(as.character()) turned every value into NA
# ("NAs introduced by coercion"); going through as.logical() gives the intended 0/1 values.

validationSet$host_is_superhost <- as.numeric(
  as.logical(as.character(validationSet$host_is_superhost))
)

# Generate the cumulative gains table
# The gains function requires the actual class membership and the predicted target class probability as arguments,
# so the first argument must be the 0/1 target (host_is_superhost), not a predictor such as host_response_rate
# This table divides the cases into up to 10 groups based on the probability of belonging to the target class
# The last column shows the average predicted probability of each group
# NOTE: the table printed below was generated before this fix; re-run this chunk to refresh it.

library(gains)
gains_table <- gains(validationSet$host_is_superhost, KNN_Class_prob[, 2])
gains_table

## Depth                            Cume   Cume Pct                     Mean
##  of           Cume     Mean      Mean   of Total    Lift   Cume     Model
## File     N      N      Resp      Resp      Resp    Index   Lift     Score
## -------------------------------------------------------------------------
##   11   628    628      0.26      0.26    -268.1%   -2449  -2449      0.74
##   23   664   1292      0.22      0.24    -504.1%   -2038  -2238      0.39
##   32   542   1834      0.14      0.21    -629.1%   -1323  -1967      0.24
##   48   898   2732      0.08      0.17    -748.6%    -764  -1572      0.13
##   51   181   2913      0.28      0.18    -830.0%   -2578  -1634      0.10
##   60   536   3449      0.28      0.19   -1073.0%   -2600  -1784      0.06
##  100  2286   5735     -0.32     -0.01     100.0%    2943    100      0.00
##   NA    NA     NA        NA        NA        NA%      NA     NA        NA
##   NA    NA     NA        NA        NA        NA%      NA     NA        NA
##   NA    NA     NA        NA        NA        NA%      NA     NA        NA

#Loading required packages
library(gains)
library(rpart)
library(rpart.plot)
library(pROC)
library(randomForest)

## randomForest 4.7-1

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

#Storing the target and the two host indicator columns as factors
factorCols <- c('host_is_superhost', 'host_identity_verified', 'host_has_profile_pic')
myData1[factorCols] <- lapply(myData1[factorCols], as.factor)
#Re-creating the 85% training / 15% validation split on host_is_superhost from the updated myData1
set.seed(1)
myIndex <- createDataPartition(
  y = myData1$host_is_superhost,
  p = 0.85,
  list = FALSE
)
trainSet <- myData1[myIndex, ]
validationSet <- myData1[-myIndex, ]
#Fitting the tree ensemble (100 trees, 3 candidate variables per split) on all predictors and plotting variable importance.
#NOTE(review): myData1 has 6 predictors, so mtry = 3 makes this a random forest rather than pure bagging
#(bagging would set mtry to the number of predictors) - confirm which was intended.
set.seed(1)
bagging_tree <- randomForest(
  host_is_superhost ~ .,
  data = trainSet,
  ntree = 100,
  mtry = 3,
  importance = TRUE
)
varImpPlot(bagging_tree, type = 1)

#It is evident from the plot that review_scores_rating (the only review score among the predictors in myData1) is the strongest predictor in the model, with host_acceptance_rate being the second most relevant predictor and price the third.
#Predicting the class of each validation row and comparing with the actual labels, treating TRUE (superhost) as the positive class.
predicted_class <- predict(bagging_tree, newdata = validationSet)
confusionMatrix(
  data = predicted_class,
  reference = validationSet$host_is_superhost,
  positive = "TRUE"
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE  4422  586
##      TRUE    207  520
##                                           
##                Accuracy : 0.8617          
##                  95% CI : (0.8525, 0.8706)
##     No Information Rate : 0.8071          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4892          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.47016         
##             Specificity : 0.95528         
##          Pos Pred Value : 0.71527         
##          Neg Pred Value : 0.88299         
##              Prevalence : 0.19285         
##          Detection Rate : 0.09067         
##    Detection Prevalence : 0.12677         
##       Balanced Accuracy : 0.71272         
##                                           
##        'Positive' Class : TRUE            
##

#The model has an 86.17% accuracy rate and a 71.53% positive predictive value (precision for the TRUE class).
#Predicting class probabilities; column 2 holds the probability of the TRUE (superhost) class.
predicted_prob <- predict(bagging_tree, validationSet, type = "prob")
#Converting the target to a numeric 0/1 response (1 = superhost).
#The factor labels are 'FALSE'/'TRUE', so as.numeric(as.character()) turned every value into NA
#("NAs introduced by coercion") and the gains table had NA responses; as.logical() gives 0/1.
validationSet$host_is_superhost <- as.numeric(
  as.logical(as.character(validationSet$host_is_superhost))
)

#Building the cumulative gains table from the actual 0/1 response and the predicted probability.
#NOTE: the table printed below was generated before this fix; re-run this chunk to refresh it.
gains_table <- gains(validationSet$host_is_superhost, predicted_prob[, 2])
gains_table

## Depth                            Cume   Cume Pct                     Mean
##  of           Cume     Mean      Mean   of Total    Lift   Cume     Model
## File     N      N      Resp      Resp      Resp    Index   Lift     Score
## -------------------------------------------------------------------------
##   10   581    581        NA        NA        NA%      NA     NA      0.83
##   20   593   1174        NA        NA        NA%      NA     NA      0.42
##   30   546   1720        NA        NA        NA%      NA     NA      0.19
##   41   644   2364        NA        NA        NA%      NA     NA      0.08
##   51   534   2898        NA        NA        NA%      NA     NA      0.03
##  100  2837   5735        NA        NA        NA%      NA     NA      0.00
##   NA    NA     NA        NA        NA        NA%      NA     NA        NA
##   NA    NA     NA        NA        NA        NA%      NA     NA        NA
##   NA    NA     NA        NA        NA        NA%      NA     NA        NA
##   NA    NA     NA        NA        NA        NA%      NA     NA        NA

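#Not getting correct response from this: the Resp columns in the table above are all NA because as.numeric(as.character()) turned the 'TRUE'/'FALSE' labels of host_is_superhost into NA (see the coercion warning above).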