# Load the Airbnb listings data from the CSV file into a tibble.
airbnb <- read_csv(file = "airbnb_eda.csv")
## Rows: 39118 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): name, host_name, neighbourhood_group, neighbourhood, room_type, la...
## dbl (10): id, host_id, latitude, longitude, price, minimum_nights, number_of...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show every column with its type and first few values.
airbnb %>% glimpse()
## Rows: 39,118
## Columns: 17
## $ id <dbl> 2539, 2595, 3647, 3831, 5022, 5099, 5121, 5203, 52…
## $ name <chr> "Clean & quiet apt home by the park", "Skylit Midt…
## $ host_id <dbl> 2787, 2845, 4632, 4869, 7192, 7322, 7356, 7490, 75…
## $ host_name <chr> "John", "Jennifer", "Elisabeth", "LisaRoxanne", "L…
## $ neighbourhood_group <chr> "Brooklyn", "Manhattan", "Manhattan", "Brooklyn", …
## $ neighbourhood <chr> "Kensington", "Midtown", "Harlem", "Clinton Hill",…
## $ latitude <dbl> 40.64749, 40.75362, 40.80902, 40.68514, 40.79851, …
## $ longitude <dbl> -73.97237, -73.98377, -73.94190, -73.95976, -73.94…
## $ room_type <chr> "Private room", "Entire home/apt", "Private room",…
## $ price <dbl> 149, 225, 60, 45, 80, 200, 60, 32, 150, 54, 85, 48…
## $ minimum_nights <dbl> 1, 1, 3, 1, 10, 3, 45, 2, 1, 5, 2, 90, 2, 2, 1, 3,…
## $ number_of_reviews <dbl> 9, 45, 0, 270, 9, 74, 49, 118, 160, 53, 188, 27, 1…
## $ last_review <chr> "10/19/2018", "5/21/2019", NA, "7/5/2019", "11/19/…
## $ reviews_per_month <dbl> 0.21, 0.38, NA, 4.64, 0.10, 0.59, 0.40, 0.99, 1.33…
## $ floor <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ `noise(dB)` <dbl> 69.05646, 56.05428, 56.05428, 69.05646, 56.05428, …
## $ Location <chr> "807, Friel Place, Brooklyn, Kings County, City of…
# Numeric-only copy of the data, used for the correlation matrix.
# Identifier and text columns are removed first so that rows are discarded
# only for missing values in the columns that are actually kept.
airbnb_numeric <- airbnb %>%
  select(-id, -name, -host_id, -host_name, -neighbourhood_group,
         -neighbourhood, -room_type, -last_review, -Location) %>%
  na.omit()
After reading and exploring the dataset, we see that it has 17 variables and 39,118 observations. There are missing values in the columns reviews_per_month and last_review.
This function returns the mean, median, standard deviation, and length of a variable.
# Summarise a numeric vector.
#
# Args:
#   x: numeric vector; missing values are ignored by every statistic.
#
# Returns:
#   A named numeric vector with elements "Mean", "Median",
#   "Standard Deviation" and "Length". "Length" is the number of non-missing
#   values, i.e. the sample size the other three statistics are based on.
explore <- function(x) {
  c(
    "Mean" = mean(x, na.rm = TRUE),
    "Median" = median(x, na.rm = TRUE),
    "Standard Deviation" = sd(x, na.rm = TRUE),
    # Count only observed values so that sd^2 / Length is a valid squared
    # standard error even when x contains NA.
    "Length" = sum(!is.na(x))
  )
}
# Pairwise correlations of the numeric columns, drawn as a correlation plot.
airbnb_numeric %>%
  cor() %>%
  corrplot()
There does not seem to be a high correlation between price and the other
numeric variables.
# Per-column summary of the full data set.
airbnb %>% summary()
## id name host_id host_name
## Min. : 2539 Length:39118 Min. : 2438 Length:39118
## 1st Qu.: 9436041 Class :character 1st Qu.: 7789663 Class :character
## Median :19637846 Mode :character Median : 30616863 Mode :character
## Mean :18980697 Mean : 67230185
## 3rd Qu.:29079859 3rd Qu.:107270482
## Max. :36487245 Max. :274321313
##
## neighbourhood_group neighbourhood latitude longitude
## Length:39118 Length:39118 Min. :40.51 Min. :-74.24
## Class :character Class :character 1st Qu.:40.69 1st Qu.:-73.98
## Mode :character Mode :character Median :40.72 Median :-73.96
## Mean :40.73 Mean :-73.95
## 3rd Qu.:40.76 3rd Qu.:-73.94
## Max. :40.91 Max. :-73.71
##
## room_type price minimum_nights number_of_reviews
## Length:39118 Min. : 0.0 Min. : 1.000 Min. : 0.00
## Class :character 1st Qu.: 60.0 1st Qu.: 1.000 1st Qu.: 1.00
## Mode :character Median : 99.0 Median : 2.000 Median : 5.00
## Mean : 140.4 Mean : 7.004 Mean : 23.46
## 3rd Qu.: 165.0 3rd Qu.: 5.000 3rd Qu.: 24.00
## Max. :10000.0 Max. :1250.000 Max. :629.00
##
## last_review reviews_per_month floor noise(dB)
## Length:39118 Min. : 0.010 Min. : 0.000 Min. :22.96
## Class :character 1st Qu.: 0.190 1st Qu.: 1.000 1st Qu.:56.05
## Mode :character Median : 0.720 Median : 1.000 Median :62.48
## Mean : 1.375 Mean : 1.582 Mean :62.70
## 3rd Qu.: 2.030 3rd Qu.: 1.000 3rd Qu.:69.06
## Max. :58.500 Max. :20.000 Max. :98.06
## NA's :7978
## Location
## Length:39118
## Class :character
## Mode :character
##
##
##
##
The summary statistics here give us a better understanding of the dataset. The mean price is $140 and the median price is $99. The maximum price is $10,000, so we can expect a large number of outliers. The mean number of reviews is 23, with a mean of 1.375 reviews per month. The mean of the minimum number of nights guests are required to stay is 7, while the median is 2. This also indicates a large number of outliers.
# Histogram of price using 100-unit-wide bins.
# Only `binwidth` is passed: ggplot2 ignores `bins` whenever `binwidth` is
# set, so supplying both is misleading.
airbnb %>%
  ggplot() +
  geom_histogram(aes(x = price), fill = "blue", color = "black", binwidth = 100)
# With no arguments, ggplotly() converts the most recently created ggplot.
ggplotly()
The histogram of price shows that the distribution is right-skewed. Most of the prices are in the range of 0 to 200.
# Histogram of price for listings priced at 200 or less, in 15-unit-wide bins.
# Only `binwidth` is passed: ggplot2 ignores `bins` whenever `binwidth` is
# set, so supplying both is misleading.
airbnb %>%
  filter(price <= 200) %>%
  ggplot() +
  geom_histogram(aes(x = price), fill = "blue", color = "black", binwidth = 15)
# With no arguments, ggplotly() converts the most recently created ggplot.
ggplotly()
The distribution in this histogram is a little closer to normal than in the previous one. Most of the rental prices seem to be around 50 to 100.
# Scatter plot of price against the minimum number of nights.
ggplot(airbnb, aes(x = minimum_nights, y = price)) +
  geom_point(color = "orange") +
  xlab("Minimum Nights") +
  ylab("Price") +
  ggtitle("Price by Minimum Nights") +
  # Centre the title above the panel.
  theme(plot.title = element_text(hjust = 0.5))
Null Hypothesis: Mean Airbnb rental price in Manhattan = Mean Airbnb rental price in Brooklyn. Alternative Hypothesis: Mean Airbnb rental price in Manhattan > Mean Airbnb rental price in Brooklyn.
# Split out the Manhattan and Brooklyn listings.
# subset() has no `na.rm` argument (it would be silently ignored); rows
# where the condition is NA are already excluded by subset().
airbnb_manhattan <- airbnb %>%
  subset(neighbourhood_group == "Manhattan")
airbnb_brooklyn <- airbnb %>%
  subset(neighbourhood_group == "Brooklyn")
# Summarise Manhattan prices with explore() (mean, median, standard
# deviation, length) and display the mean.
airbnb_manhattan_exp <- airbnb_manhattan$price %>% explore()
airbnb_manhattan_exp["Mean"]
## Mean
## 178.7154
# Summarise Brooklyn prices with explore() and display the mean.
airbnb_brooklyn_exp <- airbnb_brooklyn$price %>% explore()
airbnb_brooklyn_exp["Mean"]
## Mean
## 114.0306
# Standard error of the difference between the two group means
# (sqrt(sd1^2 / n1 + sd2^2 / n2)), followed by the z statistic.
sd_manhattan_brooklyn <- sqrt(
  airbnb_manhattan_exp["Standard Deviation"]^2 / airbnb_manhattan_exp["Length"] +
    airbnb_brooklyn_exp["Standard Deviation"]^2 / airbnb_brooklyn_exp["Length"]
)
z_score <- (airbnb_manhattan_exp["Mean"] - airbnb_brooklyn_exp["Mean"]) /
  sd_manhattan_brooklyn
print(paste("z-score", z_score))
## [1] "z-score 25.2597458654809"
# One-sided (upper-tail) p-value. The tail is requested directly because
# `1 - pnorm(z)` rounds to exactly 0 for large z, where pnorm(z) is
# indistinguishable from 1 in floating point.
print(paste("p-value", pnorm(z_score, lower.tail = FALSE)))
## [1] "p-value 0"
Since the p-value is less than the threshold of 0.05, we reject the null hypothesis in favor of the alternative hypothesis.
# Bar chart of the mean price in each neighbourhood group.
airbnb %>%
  group_by(neighbourhood_group) %>%
  summarise(mean_price = mean(price)) %>%
  ggplot(aes(fill = neighbourhood_group, y = mean_price, x = neighbourhood_group)) +
  # stat = "identity": bar heights are the precomputed means, not row counts.
  geom_bar(stat = "identity") +
  xlab("Neighborhood Group") +
  ylab("Price") +
  ggtitle("Airbnb Price by Neighborhood Group") +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45)
  )
# With no arguments, ggplotly() converts the most recently created ggplot.
ggplotly()
Both the p-value and the bar graph of price by neighborhood group support rejecting the null hypothesis in favor of the alternative hypothesis. We can conclude that Manhattan is more expensive than Brooklyn. The bar graph also shows that Manhattan is the most expensive neighborhood group.
# Bar chart of the mean price for every neighbourhood group / room type
# combination, with room types drawn side by side.
# `.groups = "drop"` returns an ungrouped summary; nothing downstream needs
# the grouping, and stating it explicitly avoids the "grouped output"
# message from summarise().
airbnb %>%
  group_by(neighbourhood_group, room_type) %>%
  summarise(mean_price = mean(price), .groups = "drop") %>%
  ggplot(aes(fill = room_type, y = mean_price, x = neighbourhood_group)) +
  geom_bar(position = "dodge", stat = "identity") +
  xlab("Neighborhood Group") +
  ylab("Price") +
  ggtitle("Airbnb Price by Neighborhood Group and Room Type") +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45)
  ) +
  scale_fill_brewer(type = "qual", palette = 3)
# With no arguments, ggplotly() converts the most recently created ggplot.
ggplotly()
From this graph, we can clearly see that entire home/apt is the most expensive of all the room types across all the neighborhoods. Private rooms and Shared rooms appear to be closer in price.
Null Hypothesis: Mean Airbnb rental price for private room = Mean Airbnb rental price for shared room. Alternative Hypothesis: Mean Airbnb rental price for private room > Mean Airbnb rental price for shared room.
# Split out the private-room and shared-room listings.
# subset() has no `na.rm` argument (it would be silently ignored); rows
# where the condition is NA are already excluded by subset().
airbnb_private_room <- airbnb %>%
  subset(room_type == "Private room")
airbnb_shared_room <- airbnb %>%
  subset(room_type == "Shared room")
# Summarise private-room prices with explore() (mean, median, standard
# deviation, length) and display the mean.
airbnb_private_room_exp <- airbnb_private_room$price %>% explore()
airbnb_private_room_exp["Mean"]
## Mean
## 82.3855
# Summarise shared-room prices with explore() and display the mean.
airbnb_shared_room_exp <- airbnb_shared_room$price %>% explore()
airbnb_shared_room_exp["Mean"]
## Mean
## 68.92903
# Standard error of the difference between the two group means
# (sqrt(sd1^2 / n1 + sd2^2 / n2)), followed by the z statistic.
sd_private_shared <- sqrt(
  airbnb_private_room_exp["Standard Deviation"]^2 / airbnb_private_room_exp["Length"] +
    airbnb_shared_room_exp["Standard Deviation"]^2 / airbnb_shared_room_exp["Length"]
)
z_score <- (airbnb_private_room_exp["Mean"] - airbnb_shared_room_exp["Mean"]) /
  sd_private_shared
print(paste("z-score", z_score))
## [1] "z-score 3.52298037786318"
# One-sided (upper-tail) p-value, requested directly from pnorm() rather
# than as `1 - pnorm(z)`, which loses precision as pnorm(z) approaches 1.
print(paste("p-value", pnorm(z_score, lower.tail = FALSE)))
## [1] "p-value 0.000213361475987051"
Since the p-value is less than the threshold of 0.05, we reject the null hypothesis in favor of the alternative hypothesis.
Filtering out prices greater than $500, as most prices are in the range of $0-$200 and anything beyond $500 can be treated as an outlier.
# Box plots of price per room type, restricted to listings priced at 500
# or less.
ggplot(
  filter(airbnb, price <= 500),
  aes(x = room_type, y = price, fill = room_type)
) +
  geom_boxplot() +
  ggtitle("Price by Room Type") +
  xlab("Room Type") +
  ylab("Price") +
  # The fill legend would only repeat the x-axis labels, so it is hidden.
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5))
# With no arguments, ggplotly() converts the most recently created ggplot.
ggplotly()
The above box plots show that the median price of entire home/apt is the greatest, and that the median for private rooms is greater than for shared rooms. However, there are a large number of outliers in each category.