Load the data

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(moments)
library(ggplot2)
library(corrplot)
## corrplot 0.84 loaded
# Read the Manhattan Airbnb listings into `master` and list the column names.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned. `stringsAsFactors = TRUE` pins the factor columns that the
# summaries below display (the read.csv default changed in R 4.0).
# NOTE(review): the absolute path only resolves on the author's machine.
master <- read.csv(
  "/Users/rose/Desktop/work/AB_MANHATTAN_DATA.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
names(master)
##  [1] "id"                             "name"                          
##  [3] "host_id"                        "host_name"                     
##  [5] "neighbourhood_group"            "neighbourhood"                 
##  [7] "latitude"                       "longitude"                     
##  [9] "room_type"                      "price"                         
## [11] "minimum_nights"                 "number_of_reviews"             
## [13] "last_review"                    "reviews_per_month"             
## [15] "calculated_host_listings_count" "availability_365"
#DataLink:New York City Airbnb Open Data | Kaggle. Retrieved September 24, 2019, from https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data

Data screening:

Accuracy:

# Force the first column name to 'id', then keep only the listings whose
# price is strictly positive (rows with a missing price are dropped too).
names(master)[1] <- 'id'
master <- master[which(master$price > 0), ]
summary(master)
##        id          
##  Min.   :    2595  
##  1st Qu.: 9160997  
##  Median :19116646  
##  Mean   :18774840  
##  3rd Qu.:29541310  
##  Max.   :36487245  
##                    
##                                                  name      
##                                                    :    9  
##  Harlem Gem                                        :    7  
##  Cozy East Village Apartment                       :    6  
##  IN MINT CONDITION-STUDIOS EAST 44TH/UNITED NATIONS:    6  
##  New York Apartment                                :    6  
##  Private room in Manhattan                         :    6  
##  (Other)                                           :21620  
##     host_id                 host_name     neighbourhood_group
##  Min.   :     2845   Sonder (NYC):  327   Manhattan:21660    
##  1st Qu.:  8525354   Blueground  :  230                      
##  Median : 30656279   Michael     :  212                      
##  Mean   : 67833116   David       :  202                      
##  3rd Qu.:106950224   John        :  151                      
##  Max.   :274321313   Kara        :  135                      
##                      (Other)     :20403                      
##          neighbourhood     latitude       longitude     
##  Harlem         :2658   Min.   :40.70   Min.   :-74.02  
##  Upper West Side:1971   1st Qu.:40.73   1st Qu.:-73.99  
##  Hell's Kitchen :1958   Median :40.76   Median :-73.98  
##  East Village   :1853   Mean   :40.77   Mean   :-73.97  
##  Upper East Side:1798   3rd Qu.:40.80   3rd Qu.:-73.95  
##  Midtown        :1545   Max.   :40.88   Max.   :-73.91  
##  (Other)        :9877                                   
##            room_type         price         minimum_nights    
##  Entire home/apt:13198   Min.   :   10.0   Min.   :   1.000  
##  Private room   : 7982   1st Qu.:   95.0   1st Qu.:   1.000  
##  Shared room    :  480   Median :  150.0   Median :   3.000  
##                          Mean   :  196.9   Mean   :   8.579  
##                          3rd Qu.:  220.0   3rd Qu.:   6.000  
##                          Max.   :10000.0   Max.   :1250.000  
##                                                              
##  number_of_reviews    last_review    reviews_per_month
##  Min.   :  0.00             : 5028   Min.   : 0.010   
##  1st Qu.:  1.00    6/23/2019:  599   1st Qu.: 0.170   
##  Median :  4.00    7/1/2019 :  557   Median : 0.610   
##  Mean   : 20.99    6/30/2019:  508   Mean   : 1.272   
##  3rd Qu.: 19.00    6/22/2019:  296   3rd Qu.: 1.890   
##  Max.   :607.00    6/24/2019:  290   Max.   :58.500   
##                    (Other)  :14382   NA's   :5028     
##  calculated_host_listings_count availability_365
##  Min.   :  1.00                 Min.   :  0     
##  1st Qu.:  1.00                 1st Qu.:  0     
##  Median :  1.00                 Median : 36     
##  Mean   : 12.79                 Mean   :112     
##  3rd Qu.:  2.00                 3rd Qu.:230     
##  Max.   :327.00                 Max.   :365     
## 
# Replace price with its natural logarithm; every later use of `price`
# is therefore on the log scale.
master[["price"]] <- log(master[["price"]])
summary(master)
##        id          
##  Min.   :    2595  
##  1st Qu.: 9160997  
##  Median :19116646  
##  Mean   :18774840  
##  3rd Qu.:29541310  
##  Max.   :36487245  
##                    
##                                                  name      
##                                                    :    9  
##  Harlem Gem                                        :    7  
##  Cozy East Village Apartment                       :    6  
##  IN MINT CONDITION-STUDIOS EAST 44TH/UNITED NATIONS:    6  
##  New York Apartment                                :    6  
##  Private room in Manhattan                         :    6  
##  (Other)                                           :21620  
##     host_id                 host_name     neighbourhood_group
##  Min.   :     2845   Sonder (NYC):  327   Manhattan:21660    
##  1st Qu.:  8525354   Blueground  :  230                      
##  Median : 30656279   Michael     :  212                      
##  Mean   : 67833116   David       :  202                      
##  3rd Qu.:106950224   John        :  151                      
##  Max.   :274321313   Kara        :  135                      
##                      (Other)     :20403                      
##          neighbourhood     latitude       longitude     
##  Harlem         :2658   Min.   :40.70   Min.   :-74.02  
##  Upper West Side:1971   1st Qu.:40.73   1st Qu.:-73.99  
##  Hell's Kitchen :1958   Median :40.76   Median :-73.98  
##  East Village   :1853   Mean   :40.77   Mean   :-73.97  
##  Upper East Side:1798   3rd Qu.:40.80   3rd Qu.:-73.95  
##  Midtown        :1545   Max.   :40.88   Max.   :-73.91  
##  (Other)        :9877                                   
##            room_type         price       minimum_nights    
##  Entire home/apt:13198   Min.   :2.303   Min.   :   1.000  
##  Private room   : 7982   1st Qu.:4.554   1st Qu.:   1.000  
##  Shared room    :  480   Median :5.011   Median :   3.000  
##                          Mean   :4.999   Mean   :   8.579  
##                          3rd Qu.:5.394   3rd Qu.:   6.000  
##                          Max.   :9.210   Max.   :1250.000  
##                                                            
##  number_of_reviews    last_review    reviews_per_month
##  Min.   :  0.00             : 5028   Min.   : 0.010   
##  1st Qu.:  1.00    6/23/2019:  599   1st Qu.: 0.170   
##  Median :  4.00    7/1/2019 :  557   Median : 0.610   
##  Mean   : 20.99    6/30/2019:  508   Mean   : 1.272   
##  3rd Qu.: 19.00    6/22/2019:  296   3rd Qu.: 1.890   
##  Max.   :607.00    6/24/2019:  290   Max.   :58.500   
##                    (Other)  :14382   NA's   :5028     
##  calculated_host_listings_count availability_365
##  Min.   :  1.00                 Min.   :  0     
##  1st Qu.:  1.00                 1st Qu.:  0     
##  Median :  1.00                 Median : 36     
##  Mean   : 12.79                 Mean   :112     
##  3rd Qu.:  2.00                 3rd Qu.:230     
##  Max.   :327.00                 Max.   :365     
## 
# Show the storage type of every column after the log transform of price.
str(master)
## 'data.frame':    21660 obs. of  16 variables:
##  $ id                            : int  2595 3647 5022 5099 5178 5203 5238 5295 5441 6021 ...
##  $ name                          : Factor w/ 21263 levels ""," 2-3 bedroom UWS garden triplex ",..: 16570 19778 8100 10600 10626 6356 7223 2474 4103 21067 ...
##  $ host_id                       : int  2845 4632 7192 7322 8967 7490 7549 7702 7989 11528 ...
##  $ host_name                     : Factor w/ 5973 levels "","(Ari) HENRY LEE",..: 2541 1587 3153 1069 5046 3649 683 3199 2890 1120 ...
##  $ neighbourhood_group           : Factor w/ 1 level "Manhattan": 1 1 1 1 1 1 1 1 1 1 ...
##  $ neighbourhood                 : Factor w/ 32 levels "Battery Park City",..: 18 11 5 20 12 30 3 30 12 30 ...
##  $ latitude                      : num  40.8 40.8 40.8 40.7 40.8 ...
##  $ longitude                     : num  -74 -73.9 -73.9 -74 -74 ...
##  $ room_type                     : Factor w/ 3 levels "Entire home/apt",..: 1 2 1 1 2 2 1 1 2 2 ...
##  $ price                         : num  5.42 5.01 4.38 5.3 4.37 ...
##  $ minimum_nights                : int  1 3 10 3 2 2 1 5 2 2 ...
##  $ number_of_reviews             : int  45 0 9 74 430 118 160 53 188 113 ...
##  $ last_review                   : Factor w/ 1566 levels "","1/1/2015",..: 928 1 304 1088 1096 1222 1166 1088 1092 1276 ...
##  $ reviews_per_month             : num  0.38 NA 0.1 0.59 3.47 0.99 1.33 0.43 1.5 0.91 ...
##  $ calculated_host_listings_count: int  2 1 1 1 1 1 4 1 1 1 ...
##  $ availability_365              : int  355 365 0 129 220 0 188 6 39 333 ...
# The variable reviews_per_month has 5028 NA observations.

Missing data

# Replace NA with 0 to resolve missing data.
# Per the summary above, reviews_per_month is the only column with NAs, so
# the replacement is limited to that column. A blanket
# `master[is.na(master)] = 0` would also try to write 0 into factor columns,
# where 0 is not a valid level.
missing_rpm <- is.na(master$reviews_per_month)
master$reviews_per_month[missing_rpm] <- 0
summary(master)
##        id          
##  Min.   :    2595  
##  1st Qu.: 9160997  
##  Median :19116646  
##  Mean   :18774840  
##  3rd Qu.:29541310  
##  Max.   :36487245  
##                    
##                                                  name      
##                                                    :    9  
##  Harlem Gem                                        :    7  
##  Cozy East Village Apartment                       :    6  
##  IN MINT CONDITION-STUDIOS EAST 44TH/UNITED NATIONS:    6  
##  New York Apartment                                :    6  
##  Private room in Manhattan                         :    6  
##  (Other)                                           :21620  
##     host_id                 host_name     neighbourhood_group
##  Min.   :     2845   Sonder (NYC):  327   Manhattan:21660    
##  1st Qu.:  8525354   Blueground  :  230                      
##  Median : 30656279   Michael     :  212                      
##  Mean   : 67833116   David       :  202                      
##  3rd Qu.:106950224   John        :  151                      
##  Max.   :274321313   Kara        :  135                      
##                      (Other)     :20403                      
##          neighbourhood     latitude       longitude     
##  Harlem         :2658   Min.   :40.70   Min.   :-74.02  
##  Upper West Side:1971   1st Qu.:40.73   1st Qu.:-73.99  
##  Hell's Kitchen :1958   Median :40.76   Median :-73.98  
##  East Village   :1853   Mean   :40.77   Mean   :-73.97  
##  Upper East Side:1798   3rd Qu.:40.80   3rd Qu.:-73.95  
##  Midtown        :1545   Max.   :40.88   Max.   :-73.91  
##  (Other)        :9877                                   
##            room_type         price       minimum_nights    
##  Entire home/apt:13198   Min.   :2.303   Min.   :   1.000  
##  Private room   : 7982   1st Qu.:4.554   1st Qu.:   1.000  
##  Shared room    :  480   Median :5.011   Median :   3.000  
##                          Mean   :4.999   Mean   :   8.579  
##                          3rd Qu.:5.394   3rd Qu.:   6.000  
##                          Max.   :9.210   Max.   :1250.000  
##                                                            
##  number_of_reviews    last_review    reviews_per_month
##  Min.   :  0.00             : 5028   Min.   : 0.0000  
##  1st Qu.:  1.00    6/23/2019:  599   1st Qu.: 0.0200  
##  Median :  4.00    7/1/2019 :  557   Median : 0.2800  
##  Mean   : 20.99    6/30/2019:  508   Mean   : 0.9768  
##  3rd Qu.: 19.00    6/22/2019:  296   3rd Qu.: 1.3000  
##  Max.   :607.00    6/24/2019:  290   Max.   :58.5000  
##                    (Other)  :14382                    
##  calculated_host_listings_count availability_365
##  Min.   :  1.00                 Min.   :  0     
##  1st Qu.:  1.00                 1st Qu.:  0     
##  Median :  1.00                 Median : 36     
##  Mean   : 12.79                 Mean   :112     
##  3rd Qu.:  2.00                 3rd Qu.:230     
##  Max.   :327.00                 Max.   :365     
## 

Outliers

Leverage

# Screening model: log price regressed on the five numeric predictors.
model <- lm(
  price ~ minimum_nights + number_of_reviews + reviews_per_month +
    calculated_host_listings_count + availability_365,
  data = master
)

# k is the number of predictors (IVs) in the screening model.
k <- 5
# Hat values give each row's leverage; the cutoff used is (2k + 2) / N.
leverage <- hatvalues(model)
cutleverage <- (2 * k + 2) / nrow(master)
paste('Leverage cutoff is: ', cutleverage)
## [1] "Leverage cutoff is:  0.000554016620498615"
# 1 = leverage above the cutoff, 0 = otherwise.
badleverage <- as.numeric(leverage > cutleverage)
table(badleverage)
## badleverage
##     0     1 
## 19959  1701

Cook’s

# Cook's distance for every row of the screening model.
# The cutoff used is 4 / (N - k - 1).
cooks <- cooks.distance(model)
cutcooks <- 4 / (nrow(master) - (k + 1))
paste('Cooks cutoff is: ', cutcooks)
## [1] "Cooks cutoff is:  0.000184723376743327"
# 1 = Cook's distance above the cutoff, 0 = otherwise.
badcooks <- as.numeric(cooks > cutcooks)
table(badcooks)
## badcooks
##     0     1 
## 20883   777

Mahalanobis

# Only these columns of the dataset take part in the analysis.

variables <- c(
  'price',
  'minimum_nights',
  'number_of_reviews',
  'reviews_per_month',
  'calculated_host_listings_count',
  'availability_365'
)

# Mahalanobis distance of each row from the column means, using the
# covariance matrix of the same columns.
screening_cols <- master[variables]
mahal <- mahalanobis(screening_cols,
                     center = colMeans(screening_cols),
                     cov = cov(screening_cols))

# Chi-square critical value at p = .001, with one df per screened column.
cutmahal <- qchisq(1 - .001, df = length(variables))
paste('Mahalanobis cutoff is: ', round(cutmahal, digits = 2))
## [1] "Mahalanobis cutoff is:  22.46"
# 1 = distance above the cutoff, 0 = otherwise.
badmahal <- as.numeric(mahal > cutmahal)
table(badmahal)
## badmahal
##     0     1 
## 20807   853
#master_noout = subset(master2, mahal < cutmahal)
#summary(master_noout)

Overall

# Per-row count of how many of the three screens (leverage, Cook's,
# Mahalanobis) flagged the row; ranges from 0 to 3.
overall <- badleverage + badcooks + badmahal
table(overall)
## overall
##     0     1     2     3 
## 19546  1162   687   265
# Keep the rows flagged by fewer than two of the three outlier screens.
master_noout <- master[which(overall < 2), ]
summary(master_noout)
##        id          
##  Min.   :    2595  
##  1st Qu.: 9217903  
##  Median :18823950  
##  Mean   :18600622  
##  3rd Qu.:28890634  
##  Max.   :36487245  
##                    
##                                                  name      
##                                                    :    8  
##  Harlem Gem                                        :    7  
##  IN MINT CONDITION-STUDIOS EAST 44TH/UNITED NATIONS:    6  
##  New York Apartment                                :    6  
##  Private room in Manhattan                         :    6  
##  West Village Apartment                            :    6  
##  (Other)                                           :20669  
##     host_id               host_name     neighbourhood_group
##  Min.   :     2845   David     :  196   Manhattan:20708    
##  1st Qu.:  8520370   Michael   :  193                      
##  Median : 30283594   Blueground:  184                      
##  Mean   : 65249837   John      :  148                      
##  3rd Qu.: 98844306   Kara      :  133                      
##  Max.   :274321313   Mike      :  133                      
##                      (Other)   :19721                      
##          neighbourhood     latitude       longitude     
##  Harlem         :2587   Min.   :40.70   Min.   :-74.02  
##  Upper West Side:1916   1st Qu.:40.73   1st Qu.:-73.99  
##  Hell's Kitchen :1868   Median :40.76   Median :-73.98  
##  East Village   :1797   Mean   :40.77   Mean   :-73.97  
##  Upper East Side:1766   3rd Qu.:40.80   3rd Qu.:-73.95  
##  Midtown        :1498   Max.   :40.88   Max.   :-73.91  
##  (Other)        :9276                                   
##            room_type         price       minimum_nights   
##  Entire home/apt:12541   Min.   :2.303   Min.   :  1.000  
##  Private room   : 7718   1st Qu.:4.554   1st Qu.:  2.000  
##  Shared room    :  449   Median :5.004   Median :  3.000  
##                          Mean   :4.987   Mean   :  7.296  
##                          3rd Qu.:5.389   3rd Qu.:  5.000  
##                          Max.   :8.006   Max.   :120.000  
##                                                           
##  number_of_reviews    last_review    reviews_per_month
##  Min.   :  0.00             : 4781   Min.   :0.0000   
##  1st Qu.:  1.00    6/23/2019:  548   1st Qu.:0.0200   
##  Median :  4.00    7/1/2019 :  521   Median :0.2700   
##  Mean   : 18.49    6/30/2019:  476   Mean   :0.8855   
##  3rd Qu.: 18.00    6/22/2019:  272   3rd Qu.:1.1800   
##  Max.   :221.00    6/24/2019:  266   Max.   :7.7900   
##                    (Other)  :13844                    
##  calculated_host_listings_count availability_365
##  Min.   :  1.000                Min.   :  0     
##  1st Qu.:  1.000                1st Qu.:  0     
##  Median :  1.000                Median : 30     
##  Mean   :  7.615                Mean   :107     
##  3rd Qu.:  2.000                3rd Qu.:215     
##  Max.   :232.000                Max.   :365     
## 
# master2: the outlier-screened rows restricted to the analysis columns
# listed in `variables` (no categorical columns).
master2 <- master_noout[variables]
summary(master2)
##      price       minimum_nights    number_of_reviews reviews_per_month
##  Min.   :2.303   Min.   :  1.000   Min.   :  0.00    Min.   :0.0000   
##  1st Qu.:4.554   1st Qu.:  2.000   1st Qu.:  1.00    1st Qu.:0.0200   
##  Median :5.004   Median :  3.000   Median :  4.00    Median :0.2700   
##  Mean   :4.987   Mean   :  7.296   Mean   : 18.49    Mean   :0.8855   
##  3rd Qu.:5.389   3rd Qu.:  5.000   3rd Qu.: 18.00    3rd Qu.:1.1800   
##  Max.   :8.006   Max.   :120.000   Max.   :221.00    Max.   :7.7900   
##  calculated_host_listings_count availability_365
##  Min.   :  1.000                Min.   :  0     
##  1st Qu.:  1.000                1st Qu.:  0     
##  Median :  1.000                Median : 30     
##  Mean   :  7.615                Mean   :107     
##  3rd Qu.:  2.000                3rd Qu.:215     
##  Max.   :232.000                Max.   :365

Assumptions:

Additivity:

The IVs are not highly correlated with one another. Hence, the multicollinearity/additivity assumption has been met.

# Refit the five-predictor model on the outlier-screened data.
model = lm(price ~ minimum_nights + 
             number_of_reviews + 
             reviews_per_month + 
             calculated_host_listings_count + 
             availability_365,
           data = master_noout)

# To test for multicollinearity: `correlation = TRUE` adds the correlation
# matrix of the coefficient estimates to the printed summary. `TRUE` is
# spelled out because `T` is an ordinary variable that can be reassigned.
summary(model, correlation = TRUE)
## 
## Call:
## lm(formula = price ~ minimum_nights + number_of_reviews + reviews_per_month + 
##     calculated_host_listings_count + availability_365, data = master_noout)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.76773 -0.42415  0.00486  0.37682  3.00553 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     4.939e+00  6.878e-03 718.089   <2e-16 ***
## minimum_nights                 -4.526e-03  4.725e-04  -9.580   <2e-16 ***
## number_of_reviews              -1.713e-03  1.694e-04 -10.112   <2e-16 ***
## reviews_per_month              -5.318e-03  4.457e-03  -1.193    0.233    
## calculated_host_listings_count  2.265e-03  1.866e-04  12.138   <2e-16 ***
## availability_365                9.377e-04  3.801e-05  24.671   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6386 on 20702 degrees of freedom
## Multiple R-squared:  0.04907,    Adjusted R-squared:  0.04884 
## F-statistic: 213.7 on 5 and 20702 DF,  p-value: < 2.2e-16
## 
## Correlation of Coefficients:
##                                (Intercept) minimum_nights
## minimum_nights                 -0.38                     
## number_of_reviews              -0.08        0.04         
## reviews_per_month              -0.33        0.17         
## calculated_host_listings_count  0.05       -0.33         
## availability_365               -0.29       -0.28         
##                                number_of_reviews reviews_per_month
## minimum_nights                                                    
## number_of_reviews                                                 
## reviews_per_month              -0.59                              
## calculated_host_listings_count  0.06              0.03            
## availability_365               -0.11             -0.11            
##                                calculated_host_listings_count
## minimum_nights                                               
## number_of_reviews                                            
## reviews_per_month                                            
## calculated_host_listings_count                               
## availability_365               -0.23
# Pairwise correlation is an indicator of multicollinearity. The correlations among the coefficients show that the variables are not highly correlated, as no value is close to 1.
# Multicollinearity is also suspected when R-squared is very high (i.e., more than 0.90) while the individual coefficients are not significant according to their p-values. In our model R-squared is 0.049, the F statistic (213.7) is significant, and its p-value is less than 0.05. The individual p-values are significant for every variable except reviews_per_month. Hence there is no multicollinearity.

Linearity:

The linearity assumption was tested with the resettest function, which returns a p-value greater than 5%. Hence, the linearity assumption has been met.

# Chi-square noise: one draw per screened row, with df equal to the number of
# analysis columns. No seed is set before this draw, so the values (and the
# fitted numbers below) differ from run to run.
random <- rchisq(n = nrow(master_noout), df = length(variables))

# "Fake" regression of the noise on every column of master2; its residuals
# are used for the assumption checks that follow.
fake <- lm(random ~ ., data = master2)
summary(fake)
## 
## Call:
## lm(formula = random ~ ., data = master2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.8711 -2.5628 -0.6461  1.8516 26.9973 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     6.3272519  0.1908416  33.154   <2e-16 ***
## price                          -0.0751174  0.0378864  -1.983   0.0474 *  
## minimum_nights                  0.0011945  0.0025811   0.463   0.6435    
## number_of_reviews               0.0022259  0.0009257   2.405   0.0162 *  
## reviews_per_month              -0.0151092  0.0242979  -0.622   0.5341    
## calculated_host_listings_count  0.0008084  0.0010210   0.792   0.4285    
## availability_365               -0.0001376  0.0002102  -0.655   0.5128    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.481 on 20701 degrees of freedom
## Multiple R-squared:  0.0005838,  Adjusted R-squared:  0.0002941 
## F-statistic: 2.015 on 6 and 20701 DF,  p-value: 0.06001
# Studentized residuals of the fake regression; reused by the normality and
# homoscedasticity checks further down.
standardized = rstudent(fake)
# Normal Q-Q plot of those residuals with a reference line of intercept 0
# and slope 1. The line must be drawn after qqnorm() opens the plot.
qqnorm(standardized)
abline(0, 1)

# Second built-in lm diagnostic plot for the fake regression.
plot(fake, 2)

# RESET test (lmtest) on the fake regression.
resettest(fake)
## 
##  RESET test
## 
## data:  fake
## RESET = 1.0913, df1 = 2, df2 = 20699, p-value = 0.3358

Normality

The normality assumption has not been met, as the histogram is positively skewed and is not centered around 0.

# Skewness of each analysis column (moments package).
skewness(master2, na.rm = TRUE)
##                          price                 minimum_nights 
##                      0.4146988                      2.7977708 
##              number_of_reviews              reviews_per_month 
##                      2.8090271                      1.9307349 
## calculated_host_listings_count               availability_365 
##                      6.1082194                      0.8314128
# Kurtosis of each analysis column (moments package).
kurtosis(master2, na.rm = TRUE)
##                          price                 minimum_nights 
##                       3.813680                      14.561599 
##              number_of_reviews              reviews_per_month 
##                      11.435476                       6.459823 
## calculated_host_listings_count               availability_365 
##                      45.310079                       2.100742
# Histogram of the studentized residuals from the fake regression.
hist(standardized, breaks = 15)

Homogeneity/Homoscedasticity:

The data are unevenly spread around 0 on the x-axis. Hence, the homogeneity assumption has not been met. Likewise, the data on the y-axis are not centered around 0. Hence, the homoscedasticity assumption has not been met.

# Standardize the fitted values of the fake regression.
fitvalues <- scale(fitted(fake))
# Studentized residuals against standardized fitted values, with reference
# lines through zero on both axes.
plot(fitvalues, standardized)
abline(h = 0)
abline(v = 0)

# First built-in lm diagnostic plot for the fake regression.
plot(fake, which = 1)

Correlation Analysis

# Summary of the screened analysis columns before the correlation analysis.
summary(master2)
##      price       minimum_nights    number_of_reviews reviews_per_month
##  Min.   :2.303   Min.   :  1.000   Min.   :  0.00    Min.   :0.0000   
##  1st Qu.:4.554   1st Qu.:  2.000   1st Qu.:  1.00    1st Qu.:0.0200   
##  Median :5.004   Median :  3.000   Median :  4.00    Median :0.2700   
##  Mean   :4.987   Mean   :  7.296   Mean   : 18.49    Mean   :0.8855   
##  3rd Qu.:5.389   3rd Qu.:  5.000   3rd Qu.: 18.00    3rd Qu.:1.1800   
##  Max.   :8.006   Max.   :120.000   Max.   :221.00    Max.   :7.7900   
##  calculated_host_listings_count availability_365
##  Min.   :  1.000                Min.   :  0     
##  1st Qu.:  1.000                1st Qu.:  0     
##  Median :  1.000                Median : 30     
##  Mean   :  7.615                Mean   :107     
##  3rd Qu.:  2.000                3rd Qu.:215     
##  Max.   :232.000                Max.   :365
# Row count and storage type of each analysis column.
str(master2)
## 'data.frame':    20708 obs. of  6 variables:
##  $ price                         : num  5.42 5.01 4.38 5.3 4.37 ...
##  $ minimum_nights                : int  1 3 10 3 2 1 5 2 2 90 ...
##  $ number_of_reviews             : int  45 0 9 74 118 160 53 188 113 27 ...
##  $ reviews_per_month             : num  0.38 0 0.1 0.59 0.99 1.33 0.43 1.5 0.91 0.22 ...
##  $ calculated_host_listings_count: int  2 1 1 1 1 4 1 1 1 1 ...
##  $ availability_365              : int  355 365 0 129 0 188 6 39 333 0 ...
# Pearson correlation matrix of the analysis columns (pairwise-complete
# observations), printed to two decimals.
cor_table <- cor(master2, use = "pairwise.complete.obs", method = "pearson")
round(cor_table, 2)
##                                price minimum_nights number_of_reviews
## price                           1.00           0.04             -0.07
## minimum_nights                  0.04           1.00             -0.16
## number_of_reviews              -0.07          -0.16              1.00
## reviews_per_month              -0.04          -0.22              0.62
## calculated_host_listings_count  0.13           0.43             -0.11
## availability_365                0.18           0.34              0.13
##                                reviews_per_month
## price                                      -0.04
## minimum_nights                             -0.22
## number_of_reviews                           0.62
## reviews_per_month                           1.00
## calculated_host_listings_count             -0.13
## availability_365                            0.11
##                                calculated_host_listings_count
## price                                                    0.13
## minimum_nights                                           0.43
## number_of_reviews                                       -0.11
## reviews_per_month                                       -0.13
## calculated_host_listings_count                           1.00
## availability_365                                         0.33
##                                availability_365
## price                                      0.18
## minimum_nights                             0.34
## number_of_reviews                          0.13
## reviews_per_month                          0.11
## calculated_host_listings_count             0.33
## availability_365                           1.00
#Price has the highest correlation to availability_365 according to the Pearson table

# Pearson correlation test: log price vs. number_of_reviews.
cor.test(master2$price,master2$number_of_reviews, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  master2$price and master2$number_of_reviews
## t = -9.8171, df = 20706, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08160979 -0.05449556
## sample estimates:
##         cor 
## -0.06806524
# Pearson correlation test: log price vs. minimum_nights.
cor.test(master2$price,master2$minimum_nights, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  master2$price and master2$minimum_nights
## t = 6.4757, df = 20706, p-value = 9.65e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.03135613 0.05854149
## sample estimates:
##        cor 
## 0.04495714
# Pearson correlation test: log price vs. reviews_per_month.
cor.test(master2$price,master2$reviews_per_month, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  master2$price and master2$reviews_per_month
## t = -5.7287, df = 20706, p-value = 1.026e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.05337154 -0.02617423
## sample estimates:
##         cor 
## -0.03978025
# Pearson correlation test: log price vs. calculated_host_listings_count.
cor.test(master2$price,master2$calculated_host_listings_count, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  master2$price and master2$calculated_host_listings_count
## t = 19.305, df = 20706, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1195629 0.1463218
## sample estimates:
##       cor 
## 0.1329666
# Pearson correlation test: log price vs. availability_365.
cor.test(master2$price,master2$availability_365, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  master2$price and master2$availability_365
## t = 26.207, df = 20706, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1659609 0.1923269
## sample estimates:
##       cor 
## 0.1791761
# Correlation plot for the dataset (corrplot package).
# `cordata` is a copy of master2 with short display labels; it exists only
# to feed corrplot.
cordata <- setNames(
  master2,
  c("Price", "Min Nights", "No. of RVWs", "RVWs per month", "Host Listing Ct", "Avail 365")
)
corrplot(cor(cordata), method = "circle")

Scatterplots

#library(ggplot2)
# Shared ggplot2 theme for the scatterplots: no grid lines or panel
# background, black axis lines, white legend keys, 15-point text.
axis_rule <- element_line(color = 'black')
cleanup <- theme(
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  panel.background = element_blank(),
  axis.line.x = axis_rule,
  axis.line.y = axis_rule,
  legend.key = element_rect(fill = 'white'),
  text = element_text(size = 15)
)

# Scatterplot of each independent variable against the dependent variable, with no grouping

# Reviews per month
# Log price against reviews_per_month with a fitted linear trend line.
# Columns are referenced by bare name inside aes(); ggplot2 looks them up in
# `master2`, which avoids the "Use of `master2$...` is discouraged" warnings.
scatterplot.Monthlyreview <- ggplot(master2, aes(reviews_per_month, price))
scatterplot.Monthlyreview +
  geom_point(color = "blue") +
  geom_smooth(method = lm, fill = "grey", color = "red") +
  xlab("Reviews Per Month") +
  ylab("Price") +
  cleanup
## `geom_smooth()` using formula 'y ~ x'

# Minimum Nights
# Log price against minimum_nights with a fitted linear trend line.
# Columns are referenced by bare name inside aes(); ggplot2 looks them up in
# `master2`, which avoids the "Use of `master2$...` is discouraged" warnings.
# The x-axis label typo ("NIghts") is corrected.
scatterplot.Nights <- ggplot(master2, aes(minimum_nights, price))
scatterplot.Nights +
  geom_point(color = "blue") +
  geom_smooth(method = lm, color = "red", fill = "grey") +
  xlab("Minimum Nights") +
  ylab("Price") +
  cleanup
## `geom_smooth()` using formula 'y ~ x'

# No of Reviews
# Log price against number_of_reviews with a fitted linear trend line.
# Columns are referenced by bare name inside aes(); ggplot2 looks them up in
# `master2`, which avoids the "Use of `master2$...` is discouraged" warnings.
scatterplot.Reviews <- ggplot(master2, aes(number_of_reviews, price))
scatterplot.Reviews +
  geom_point(color = "blue") +
  geom_smooth(method = lm, color = "red", fill = "grey") +
  xlab("No of Reviews") +
  ylab("Price") +
  cleanup
## `geom_smooth()` using formula 'y ~ x'

# Host Listing Count
# Log price against calculated_host_listings_count with a fitted linear
# trend line. Columns are referenced by bare name inside aes(); ggplot2 looks
# them up in `master2`, which avoids the "Use of `master2$...` is
# discouraged" warnings.
scatterplot.listingcount <- ggplot(
  master2,
  aes(calculated_host_listings_count, price)
)
scatterplot.listingcount +
  geom_point(color = "blue") +
  geom_smooth(method = lm, color = "red", fill = "grey") +
  xlab("Listing count") +
  ylab("Price") +
  cleanup
## `geom_smooth()` using formula 'y ~ x'

# Availability
# Log price against availability_365 with a fitted linear trend line.
# Columns are referenced by bare name inside aes(); ggplot2 looks them up in
# `master2`, which avoids the "Use of `master2$...` is discouraged" warnings.
scatterplot.availability <- ggplot(master2, aes(availability_365, price))
scatterplot.availability +
  geom_point(color = "blue") +
  geom_smooth(method = lm, color = "red", fill = "black") +
  xlab("Availability") +
  ylab("Price") +
  cleanup
## `geom_smooth()` using formula 'y ~ x'

#Regression Analysis

library(haven)

#we will examine the significance of each variable by looking at the increase in adj Rsquared 

# Model 1: price regressed on availability_365 alone.
model1 <- lm(formula = price ~ availability_365, data = master2)
# Model 1 is significant (p < 0.05) with an adjusted R-squared of about 0.03.
summary(object = model1)
## 
## Call:
## lm(formula = price ~ availability_365, data = master2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.73802 -0.43905 -0.00606  0.40191  3.02796 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      4.891e+00  5.780e-03  846.20   <2e-16 ***
## availability_365 8.957e-04  3.418e-05   26.21   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6442 on 20706 degrees of freedom
## Multiple R-squared:  0.0321, Adjusted R-squared:  0.03206 
## F-statistic: 686.8 on 1 and 20706 DF,  p-value: < 2.2e-16
# Adjusted R-squared of model1, extracted from its summary object.
summary(model1)[["adj.r.squared"]]
## [1] 0.03205733
# Model 2: model 1 plus calculated_host_listings_count.
model2 <- lm(
  formula = price ~ availability_365 + calculated_host_listings_count,
  data = master2
)
# Model 2 is significant and the added predictor is significant (p < 0.05);
# adjusted R-squared is about 0.04.
summary(object = model2)
## 
## Call:
## lm(formula = price ~ availability_365 + calculated_host_listings_count, 
##     data = master2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.70213 -0.43520 -0.01348  0.39055  3.02669 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    4.891e+00  5.762e-03  848.73   <2e-16 ***
## availability_365               7.595e-04  3.605e-05   21.07   <2e-16 ***
## calculated_host_listings_count 2.009e-03  1.740e-04   11.55   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6421 on 20705 degrees of freedom
## Multiple R-squared:  0.0383, Adjusted R-squared:  0.0382 
## F-statistic: 412.2 on 2 and 20705 DF,  p-value: < 2.2e-16
# Adjusted R-squared of model2, extracted from its summary object.
summary(model2)[["adj.r.squared"]]
## [1] 0.03820301
# Model 3: model 2 plus minimum_nights.
model3 <- lm(
  formula = price ~ availability_365 + calculated_host_listings_count +
    minimum_nights,
  data = master2
)
# Model 3 is significant and the added predictor is significant (p < 0.05);
# adjusted R-squared is about 0.04.
summary(object = model3)
## 
## Call:
## lm(formula = price ~ availability_365 + calculated_host_listings_count + 
##     minimum_nights, data = master2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.71052 -0.42899 -0.00368  0.38948  3.02238 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     4.905e+00  6.063e-03  809.00  < 2e-16 ***
## availability_365                8.238e-04  3.706e-05   22.23  < 2e-16 ***
## calculated_host_listings_count  2.505e-03  1.865e-04   13.43  < 2e-16 ***
## minimum_nights                 -3.369e-03  4.609e-04   -7.31 2.76e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6413 on 20704 degrees of freedom
## Multiple R-squared:  0.04077,    Adjusted R-squared:  0.04063 
## F-statistic: 293.3 on 3 and 20704 DF,  p-value: < 2.2e-16
# Adjusted R-squared of model3, extracted from its summary object.
summary(model3)[["adj.r.squared"]]
## [1] 0.04063288
# Model 4: model 3 plus number_of_reviews.
model4 <- lm(
  formula = price ~ availability_365 + calculated_host_listings_count +
    minimum_nights + number_of_reviews,
  data = master2
)
# Model 4 is significant and the added predictor is significant (p < 0.05);
# adjusted R-squared is about 0.05.
summary(object = model4)
## 
## Call:
## lm(formula = price ~ availability_365 + calculated_host_listings_count + 
##     minimum_nights + number_of_reviews, data = master2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.76435 -0.42347  0.00437  0.37703  3.00124 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     4.936e+00  6.485e-03 761.234   <2e-16 ***
## availability_365                9.329e-04  3.779e-05  24.684   <2e-16 ***
## calculated_host_listings_count  2.272e-03  1.866e-04  12.179   <2e-16 ***
## minimum_nights                 -4.431e-03  4.657e-04  -9.515   <2e-16 ***
## number_of_reviews              -1.832e-03  1.368e-04 -13.388   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6386 on 20703 degrees of freedom
## Multiple R-squared:  0.04901,    Adjusted R-squared:  0.04882 
## F-statistic: 266.7 on 4 and 20703 DF,  p-value: < 2.2e-16
# Adjusted R-squared of model4, extracted from its summary object.
summary(model4)[["adj.r.squared"]]
## [1] 0.04882163
# Model 5: model 4 plus reviews_per_month.
model5 <- lm(
  formula = price ~ availability_365 + calculated_host_listings_count +
    minimum_nights + number_of_reviews + reviews_per_month,
  data = master2
)
# Model 5 is significant overall (p < 0.05), but the added reviews_per_month
# term is not (p = 0.233); adjusted R-squared stays at about 0.05.
summary(object = model5)
## 
## Call:
## lm(formula = price ~ availability_365 + calculated_host_listings_count + 
##     minimum_nights + number_of_reviews + reviews_per_month, data = master2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.76773 -0.42415  0.00486  0.37682  3.00553 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     4.939e+00  6.878e-03 718.089   <2e-16 ***
## availability_365                9.377e-04  3.801e-05  24.671   <2e-16 ***
## calculated_host_listings_count  2.265e-03  1.866e-04  12.138   <2e-16 ***
## minimum_nights                 -4.526e-03  4.725e-04  -9.580   <2e-16 ***
## number_of_reviews              -1.713e-03  1.694e-04 -10.112   <2e-16 ***
## reviews_per_month              -5.318e-03  4.457e-03  -1.193    0.233    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6386 on 20702 degrees of freedom
## Multiple R-squared:  0.04907,    Adjusted R-squared:  0.04884 
## F-statistic: 213.7 on 5 and 20702 DF,  p-value: < 2.2e-16
# Adjusted R-squared of model5, extracted from its summary object.
summary(model5)[["adj.r.squared"]]
## [1] 0.0488411
# Compare the five nested models (each adds one predictor to the previous one).
anova(
  model1,
  model2,
  model3,
  model4,
  model5
)
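#Looking at the adjusted R squared values of the models, reviews_per_month provides the least increase (0.04882 to 0.04884) and its coefficient is not significant (p = 0.233), so it is the weakest candidate to keep in the final model. (Note: the commented-out code below drops minimum_nights instead.)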

#master3= master2[,-11]  #removing minimum_nights
#fullmodel= lm(price~ number_of_reviews + reviews_per_month + calculated_host_listings_count + availability_365, data = master3)
#summary(fullmodel)

#adj R squared is 0.04, so the model explains about 4% of the variance in the data. The p value is <0.05, hence the model is significant

#install.packages("leaps")
#library(leaps)
#data.subset = regsubsets(price~.,master2)
#data.subset.summary = summary(data.subset)
#data.subset.summary
#which.max(data.subset.summary$adjr2)

#Model Validation

# Mahalanobis distance of every row of master2 from the column means,
# using the covariance matrix of master2 itself.
mahal1 <- mahalanobis(
  x = master2,
  center = colMeans(master2),
  cov = cov(master2)
)

# Cutoff: chi-square quantile at 1 - .001 with one degree of freedom per column.
cutmahal1 <- qchisq(p = 1 - .001, df = ncol(master2))

# 1 when the distance exceeds the cutoff, 0 otherwise.
badmahal1 <- as.numeric(mahal1 > cutmahal1)

table(badmahal1)
## badmahal1
##     0     1 
## 20063   645
# Leverage screen on model5: flag hat values above (2k + 2) / n.
k <- 5 # number of IV's
leverage1 <- hatvalues(model = model5)
cutleverage1 <- (2 * k + 2) / nrow(master2)
# 1 when the hat value exceeds the cutoff, 0 otherwise.
badleverage1 <- as.numeric(leverage1 > cutleverage1)
table(badleverage1)
## badleverage1
##     0     1 
## 18707  2001
# Influence screen on model5: flag Cook's distances above 4 / (n - k - 1).
cooks1 <- cooks.distance(model = model5)
cutcooks1 <- 4 / (nrow(master2) - k - 1)

# 1 when Cook's distance exceeds the cutoff, 0 otherwise.
badcooks1 <- as.numeric(cooks1 > cutcooks1)
table(badcooks1)
## badcooks1
##     0     1 
## 19772   936
# Number of screens (Mahalanobis, leverage, Cook's) that flagged each row.
totalout1 <- badmahal1 + badleverage1 + badcooks1

table(totalout1)
## totalout1
##     0     1     2     3 
## 18057  1837   697   117
# Keep only the rows flagged by fewer than two of the three screens.
noout <- subset(master2, subset = totalout1 < 2)

#running the model again after getting rid of outliers
# The response is written as the bare column `price`, resolved through
# `data = noout` like every predictor and like model1-model5. Writing
# `noout$price` in the formula bypasses `data` for the response and ties the
# fit to the global `noout` object.
model5.2 <- lm(
  price ~ number_of_reviews + reviews_per_month +
    calculated_host_listings_count + availability_365 + minimum_nights,
  data = noout
)

summary(model5.2)
## 
## Call:
## lm(formula = price ~ number_of_reviews + reviews_per_month + 
##     calculated_host_listings_count + availability_365 + minimum_nights, 
##     data = noout)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.77145 -0.42542  0.00237  0.37713  2.90844 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     4.947e+00  7.079e-03 698.882  < 2e-16 ***
## number_of_reviews              -1.987e-03  2.032e-04  -9.778  < 2e-16 ***
## reviews_per_month              -5.138e-03  5.012e-03  -1.025    0.305    
## calculated_host_listings_count  2.254e-03  3.276e-04   6.882 6.08e-12 ***
## availability_365                9.608e-04  3.937e-05  24.403  < 2e-16 ***
## minimum_nights                 -6.065e-03  6.000e-04 -10.109  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6352 on 19888 degrees of freedom
## Multiple R-squared:  0.04128,    Adjusted R-squared:  0.04104 
## F-statistic: 171.3 on 5 and 19888 DF,  p-value: < 2.2e-16
#Multicollinearity
# correlation = TRUE additionally prints the correlation matrix of the
# estimated coefficients (spelled out instead of the reassignable `T`).
summary(model5.2, correlation = TRUE)
## 
## Call:
## lm(formula = price ~ number_of_reviews + reviews_per_month + 
##     calculated_host_listings_count + availability_365 + minimum_nights, 
##     data = noout)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.77145 -0.42542  0.00237  0.37713  2.90844 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     4.947e+00  7.079e-03 698.882  < 2e-16 ***
## number_of_reviews              -1.987e-03  2.032e-04  -9.778  < 2e-16 ***
## reviews_per_month              -5.138e-03  5.012e-03  -1.025    0.305    
## calculated_host_listings_count  2.254e-03  3.276e-04   6.882 6.08e-12 ***
## availability_365                9.608e-04  3.937e-05  24.403  < 2e-16 ***
## minimum_nights                 -6.065e-03  6.000e-04 -10.109  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6352 on 19888 degrees of freedom
## Multiple R-squared:  0.04128,    Adjusted R-squared:  0.04104 
## F-statistic: 171.3 on 5 and 19888 DF,  p-value: < 2.2e-16
## 
## Correlation of Coefficients:
##                                (Intercept) number_of_reviews
## number_of_reviews              -0.10                        
## reviews_per_month              -0.33       -0.60            
## calculated_host_listings_count  0.09        0.06            
## availability_365               -0.25       -0.10            
## minimum_nights                 -0.42        0.03            
##                                reviews_per_month
## number_of_reviews                               
## reviews_per_month                               
## calculated_host_listings_count  0.01            
## availability_365               -0.12            
## minimum_nights                  0.17            
##                                calculated_host_listings_count
## number_of_reviews                                            
## reviews_per_month                                            
## calculated_host_listings_count                               
## availability_365               -0.23                         
## minimum_nights                 -0.44                         
##                                availability_365
## number_of_reviews                              
## reviews_per_month                              
## calculated_host_listings_count                 
## availability_365                               
## minimum_nights                 -0.26
#Test for Assumptions

# Studentized residuals and z-scored fitted values of model5.2,
# used by the assumption checks below.
standardized2 <- rstudent(model = model5.2)
fitted2 <- scale(x = model5.2[["fitted.values"]])

#Test for Linearity (graphical test):
#normal Q-Q plot of the studentized residuals with a y = x reference line
qqnorm(y = standardized2)
abline(a = 0, b = 1)

#Linearity (formal test): Ramsey RESET test on model5.2
resettest(formula = model5.2)
## 
##  RESET test
## 
## data:  model5.2
## RESET = 2.7582, df1 = 2, df2 = 19886, p-value = 0.06343
#Test for Normality: histogram of the studentized residuals
hist(x = standardized2)

#Test for Homogeneity/Homoscedasticity:
#studentized residuals against z-scored fitted values, with reference lines
#through zero on both axes
plot(x = fitted2, y = standardized2)
abline(a = 0, b = 0)
abline(v = 0)