library(pacman)
p_load(dplyr, ggplot2, ggmap, rio, rgdal,tmap,patchwork, geojsonR,broom,geojsonio)
# Read the Airbnb listings and keep only the columns used in this analysis.
df <- import("data/airbnb_listings.csv") %>%
  select(
    id, transit, host_id, host_listings_count,
    latitude, longitude, room_type, accommodates, bathrooms, bedrooms,
    price, availability_365,
    neighbourhood_group_cleansed, neighbourhood_cleansed,
    review_scores_rating, review_scores_accuracy,
    number_of_reviews, last_review
  )

# Read the shape files.
# Borough outlines: load the "nybb" layer, reproject it to WGS84 longitude /
# latitude, and flatten it to a data frame for ggplot2.
nyc_b <- readOGR("data/nyc_boroughs_map/.", "nybb") %>%
  spTransform(CRS("+proj=longlat +datum=WGS84")) %>%
  fortify()
## OGR data source with driver: ESRI Shapefile 
## Source: "/Users/user/Documents/QMSS/Spring2022/Data_viz/5063_assignments/assignment-2-airbnb-shaunahan/data/nyc_boroughs_map", layer: "nybb"
## with 5 features
## It has 4 fields
## Regions defined for each Polygons

# Subway routes (kept as a Spatial object).
nyc_sr <- readOGR(
  "data/nyc_subway_map/routes_nyc_subway/.",
  "routes_nyc_subway_jan2017"
)
## OGR data source with driver: ESRI Shapefile 
## Source: "/Users/user/Documents/QMSS/Spring2022/Data_viz/5063_assignments/assignment-2-airbnb-shaunahan/data/nyc_subway_map/routes_nyc_subway", layer: "routes_nyc_subway_jan2017"
## with 25 features
## It has 7 fields
## Integer64 fields read as strings:  OBJECTID

# Neighbourhood polygons, read twice from the same file: once flattened to a
# data frame for plotting, once as a nested list to reach feature properties.
neighbors <- geojson_read("data/neighbourhoods.geojson", what = "sp") %>%
  tidy()
## Regions defined for each Polygons
nei <- FROM_GeoJson(url_file_string = "data/neighbourhoods.geojson")

1(a)

##Provide a map to show where in New York City AirBnB listings are located.

As illustrated in the last visualization of this question (please scroll down), Airbnb housing is located mostly in Manhattan, and in parts of Brooklyn and Queens, due to their proximity to the central Manhattan area. This result is as expected.

# Creating maps for each boro: swap the feature ids "0".."4" in nyc_b$id for
# the borough names used everywhere else in the script.
borough_by_id <- c(
  "0" = "Staten Island",
  "1" = "Bronx",
  "2" = "Manhattan",
  "3" = "Brooklyn",
  "4" = "Queens"
)
# Only touch rows whose id is one of the five known codes; anything else is
# left exactly as it was.
is_known_id <- nyc_b$id %in% names(borough_by_id)
nyc_b$id[is_known_id] <- unname(
  borough_by_id[as.character(nyc_b$id[is_known_id])]
)
# Preprocessing the GeoJSON data: attach the borough and the neighbourhood
# name of the owning feature to every row of `neighbors`.

# One entry per GeoJSON feature, in file order. unlist() silently drops a
# feature whose property is NULL, the same way appending with c() does.
boros <- unlist(lapply(nei$features, function(feature) {
  feature$properties$neighbourhood_group
}))
neighbor_name <- unlist(lapply(nei$features, function(feature) {
  feature$properties$neighbourhood
}))

# NOTE(review): this assumes neighbors$id is the 1-based position of each
# feature in the GeoJSON file -- confirm against the reader in use; a 0-based
# id would silently shift or drop rows.
feature_index <- as.numeric(neighbors$id)

# Vectorised lookup: one indexing operation instead of growing a vector
# element by element.
boros_column <- boros[feature_index]
nei_column <- neighbor_name[feature_index]

neighbors$boros <- boros_column
neighbors$neighbourhood <- nei_column
# Map themes that remove the background and the axes.
# style2: theme_bw() without panel border, grid lines, axis lines, axis text,
# ticks or axis titles; the legend stays wherever ggplot2 puts it by default.
style2 <- theme_bw() +
  theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black"),
    axis.line.x = element_blank(),
    axis.text.x = element_blank(),
    axis.line.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  )

# style: identical to style2, plus the legend drawn inside the panel at
# relative position (0.2, 0.8).
style <- style2 + theme(legend.position = c(0.2, 0.8))

# City-wide map: every Airbnb listing as a faint dot on top of the
# neighbourhood boundaries (thin) and borough outlines (thick), both coloured
# by borough.
air_nyc <- ggplot(nyc_b, aes(x = long, y = lat)) +
  style +
  geom_polygon(
    data = neighbors,
    mapping = aes(group = group, colour = boros),
    fill = NA,
    size = 0.5
  ) +
  geom_polygon(
    mapping = aes(group = group, colour = id),
    fill = NA,
    size = 1
  ) +
  geom_point(
    data = df,
    mapping = aes(x = longitude, y = latitude),
    alpha = 0.1,
    size = 0.1
  ) +
  ggtitle("Airbnb listings in NYC")

air_nyc

# take a closer look at each boro independently

# Draw the listings of a single borough.
#   borough        - borough name as stored in nyc_b$id, neighbors$boros and
#                    df$neighbourhood_group_cleansed; also used as the title.
#   outline_colour - colour of the thick borough outline.
# Returns a ggplot object with the legend hidden. Reads the globals `nyc_b`,
# `neighbors`, `df` and `style`.
plot_borough_listings <- function(borough, outline_colour) {
  # Plain logical indexing (rather than subset()) so `borough` can never be
  # confused with a column of the data being filtered.
  borough_outline <- nyc_b[nyc_b$id %in% borough, ]
  borough_neighbourhoods <- neighbors[neighbors$boros %in% borough, ]
  borough_listings <- df[df$neighbourhood_group_cleansed %in% borough, ]

  ggplot(data = borough_outline, aes(x = long, y = lat)) +
    geom_polygon(
      data = borough_neighbourhoods,
      aes(group = group, color = boros),
      fill = NA,
      size = 0.5
    ) +
    geom_polygon(
      aes(group = group),
      fill = NA,
      size = 1,
      color = outline_colour
    ) +
    geom_point(
      data = borough_listings,
      aes(x = longitude, y = latitude),
      alpha = 0.1,
      size = 0.1
    ) +
    style +
    ggtitle(borough) +
    theme(legend.position = "none")
}

air_M <- plot_borough_listings("Manhattan", "#D55E00")
air_brx <- plot_borough_listings("Bronx", "#0072B2")
air_brk <- plot_borough_listings("Brooklyn", "#009E73")
air_si <- plot_borough_listings("Staten Island", "gold4")
air_q <- plot_borough_listings("Queens", "#E69F00")

# patchwork composition of the five borough maps
air_M + air_brk + air_q + air_brx + air_si

1(b)

##Provide a map in which you summarize the density of the AirBnB listings and highlight the hot-spots for AirBnB locations. Make sure to annotate a few hot-spots on the map.

From the illustration below, the Airbnb listings are mostly located in Manhattan and Brooklyn. From the individual density estimation maps of the two boroughs, it seems that the hot-spots for Airbnb locations are: Hell's Kitchen, Chelsea, East Village, Gramercy, and Greenpoint/Williamsburg.

# City-wide density map: black neighbourhood boundaries, borough outlines
# coloured by borough, and filled 2D density contours of the listing
# coordinates (orange = low level, blueviolet = high level).
dense_map <- ggplot(nyc_b, aes(x = long, y = lat)) +
  style +
  geom_polygon(
    data = neighbors,
    mapping = aes(group = group, colour = id),
    fill = NA,
    colour = "black",
    size = 0.5
  ) +
  geom_polygon(
    mapping = aes(group = group, colour = id),
    fill = NA,
    size = 1
  ) +
  stat_density_2d(
    data = df,
    mapping = aes(
      x = longitude,
      y = latitude,
      fill = ..level..,
      alpha = ..level..
    ),
    geom = "polygon",
    show.legend = FALSE
  ) +
  scale_fill_gradient(low = "orange", high = "blueviolet") +
  ggtitle("Dense Map of Airbnb Listings")

dense_map

# Density map of Manhattan: neighbourhood boundaries and borough outline in
# "#D55E00", filled 2D density contours of the Manhattan listings, and text
# labels for the hot-spots.
man_dens <- ggplot(
  data = subset(nyc_b, id %in% c("Manhattan")),
  aes(x = long, y = lat)
) +
  geom_polygon(
    data = subset(neighbors, boros %in% c("Manhattan")),
    aes(group = group),
    fill = NA,
    colour = "#D55E00",
    size = 0.5
  ) +
  geom_polygon(
    aes(group = group),
    fill = NA,
    colour = "#D55E00",
    size = 1
  ) +
  stat_density2d(
    data = subset(df, neighbourhood_group_cleansed %in% c("Manhattan")),
    geom = "polygon",
    aes(
      x = longitude,
      y = latitude,
      fill = ..level..,
      alpha = ..level..
    ),
    show.legend = FALSE
  ) +
  scale_fill_gradient(low = "orange", high = "purple") +
  style +
  # annotate() draws each label exactly once. geom_text() with constant
  # label/x/y would inherit the polygon data and redraw every label once per
  # row, stacking thousands of copies on top of each other.
  annotate(
    "text",
    x = c(-73.98, -74, -74.02, -73.95, -74.02),
    y = c(40.71, 40.77, 40.73, 40.78, 40.70),
    label = c(
      "East Village & Gramercy",
      "Hell Kitchen & \n Chelsea",
      "SOHO & West Village",
      "Upper East Side",
      "Financial District"
    )
  ) +
  ggtitle("Density Estimation of Airbnbs in Manhattan")

man_dens

# Density map of Brooklyn: neighbourhood boundaries and borough outline in
# "#009E73", filled 2D density contours of the Brooklyn listings, and text
# labels for the hot-spots.
bk_dens <- ggplot(
  data = subset(nyc_b, id %in% c("Brooklyn")),
  aes(x = long, y = lat)
) +
  geom_polygon(
    data = subset(neighbors, boros %in% c("Brooklyn")),
    aes(group = group),
    fill = NA,
    colour = "#009E73",
    size = 0.5
  ) +
  geom_polygon(
    aes(group = group),
    fill = NA,
    colour = "#009E73",
    size = 1
  ) +
  stat_density2d(
    data = subset(df, neighbourhood_group_cleansed %in% c("Brooklyn")),
    geom = "polygon",
    aes(
      x = longitude,
      y = latitude,
      fill = ..level..,
      alpha = ..level..
    ),
    show.legend = FALSE
  ) +
  scale_fill_gradient(low = "orange", high = "purple") +
  style +
  # annotate() draws each label exactly once. geom_text() with constant
  # label/x/y would inherit the polygon data and redraw every label once per
  # row, stacking thousands of copies on top of each other.
  annotate(
    "text",
    x = c(-73.98, -73.97, -73.925),
    y = c(40.725, 40.68, 40.685),
    label = c(
      "GreenPoint & \n Williamsburg",
      "Bedford \n Stuyvesant",
      "Bushwick"
    )
  ) +
  ggtitle("Density Estimation of Airbnbs in Brooklyn")


bk_dens

2(a)

##Choose a combination of both maps and non-mapping visualizations (graphs or tables) to explore where in NYC listings are available sporadically vs. year-round. Make sure to highlight the neighborhoods where most listings appear to be permanent or semi-permanent rentals.

# Flag every listing as a permanent rental or not: `host` is 1 when the
# listing is available at least 300 days a year and 0 when it is available
# fewer than 300 days (a missing availability gives NA).
df2 <- df %>%
  mutate(host = case_when(
    availability_365 >= 300 ~ 1,
    availability_365 < 300 ~ 0
  ))

# I just want to focus on the neighborhoods where airbnb count is greater
# than 100
host_df <- df2 %>%
  group_by(neighbourhood_cleansed) %>%
  summarize(airbnb_counts = n()) %>%
  filter(airbnb_counts > 100)

df3 <- df2[df2$neighbourhood_cleansed %in% host_df$neighbourhood_cleansed, ]

# Getting the top 10 neighbourhoods where permanent rentals are most
# prevalent. perm_perc is the share of listings flagged as permanent; the
# weight is named explicitly instead of relying on top_n() picking the last
# column.
permanent <- df3 %>%
  group_by(neighbourhood_cleansed) %>%
  summarize(perm_perc = mean(host)) %>%
  top_n(n = 10, wt = perm_perc)

# Listings in those neighbourhoods, with perm_perc attached to every row.
top10_perm <- df3[
  df3$neighbourhood_cleansed %in% permanent$neighbourhood_cleansed,
]
top10_perm2 <- merge(
  top10_perm,
  permanent,
  by.x = "neighbourhood_cleansed",
  by.y = "neighbourhood_cleansed",
  all.x = TRUE,
  all.y = TRUE
)

# Rename the borough column by name rather than by position, so the rename
# keeps working if the set or order of selected columns ever changes.
borough_col <- names(top10_perm2) == "neighbourhood_group_cleansed"
names(top10_perm2)[borough_col] <- "Boros"
# Availability per listing for the top-10 neighbourhoods, ordered by their
# share of permanent rentals (perm_perc): raw points, jittered points and a
# box plot per neighbourhood, coloured by borough. The red line marks the
# 300-day threshold used to call a listing "permanent".
dens_graph <- ggplot(
  data = top10_perm2,
  aes(x = reorder(neighbourhood_cleansed, perm_perc), y = availability_365)
) +
  geom_point(aes(color = Boros)) +
  geom_jitter(
    aes(color = Boros),
    alpha = 0.1,
    position = position_jitter(width = 0.3)
  ) +
  labs(x = "", y = "available days") +
  geom_boxplot(aes(fill = Boros), alpha = 0.5) +
  coord_flip() +
  geom_hline(yintercept = 300, color = "red", size = 1) +
  theme_bw() +
  # annotate() draws each label once; geom_text() with constant label/x/y
  # would redraw it for every row of top10_perm2.
  # NOTE(review): the labels are anchored on "Upper East Side", which assumes
  # that neighbourhood is among the top 10 -- confirm against the data.
  annotate(
    "text",
    x = "Upper East Side",
    y = 245,
    label = "Non-permanent <--",
    colour = "black"
  ) +
  annotate(
    "text",
    x = "Upper East Side",
    y = 340,
    label = "--> Permanent",
    colour = "black"
  ) +
  ggtitle("Top 10 neighbourhoods with permanent renting")


dens_graph

From the graph above, it looks like more housing in Brooklyn and Queens is permanently available, meaning it is available for most of the year. This is convenient for returning guests, who can stay at the same location on their next trip to NYC (if they'd like), as the housing will most likely be available during the time they'd like to travel.

It is also understandable that permanent Airbnb listings are located in Queens/Brooklyn, as many of the hosts in that area live in the same accommodation as the guest: they rent out part of their home (i.e. one or more private rooms) to subsidize their own rent. (It has to be a permanent rental to support their apartment rent, for example!)

# City-wide map of availability: orange neighbourhood boundaries, black
# borough outlines, and one faint dot per listing coloured from blue (few
# available days) to red (many available days).
perm_nyc <- ggplot(nyc_b, aes(x = long, y = lat)) +
  geom_polygon(
    data = neighbors,
    mapping = aes(group = group),
    colour = "orange",
    fill = NA,
    size = 0.5
  ) +
  geom_polygon(
    mapping = aes(group = group),
    colour = "black",
    fill = NA,
    size = 0.8
  ) +
  geom_point(
    data = df,
    mapping = aes(x = longitude, y = latitude, colour = availability_365),
    alpha = 0.1,
    size = 0.1
  ) +
  scale_colour_gradient(low = "blue", high = "red") +
  style +
  ggtitle("NYC permanent renting location") +
  labs(colour = "Available days")

perm_nyc

# Manhattan: black neighbourhood and borough outlines, a light fill on the
# selected neighbourhoods, and the listings from top10_perm2 coloured by
# available days (blue = few, red = many).
man_perm <- ggplot(
  subset(nyc_b, id %in% c("Manhattan")),
  aes(x = long, y = lat)
) +
  geom_polygon(
    data = subset(neighbors, boros %in% c("Manhattan")),
    mapping = aes(group = group),
    colour = "black",
    fill = NA,
    size = 0.5
  ) +
  geom_polygon(
    mapping = aes(group = group),
    colour = "black",
    fill = NA,
    size = 0.8
  ) +
  geom_polygon(
    data = subset(
      neighbors,
      neighbourhood %in% c(
        "Financial District", "Murray Hill", "Theater District", "Midtown",
        "Hell's Kitchen", "Kips Bay", "Tribeca", "Upper East Side"
      )
    ),
    mapping = aes(group = group, fill = neighbourhood),
    size = 0.75,
    alpha = 0.1
  ) +
  geom_point(
    data = subset(top10_perm2, Boros %in% c("Manhattan")),
    mapping = aes(x = longitude, y = latitude, colour = availability_365),
    alpha = 0.2,
    size = 0.2
  ) +
  scale_fill_brewer(palette = "Accent") +
  scale_colour_gradient(low = "blue", high = "red") +
  style2 +
  ggtitle("Manhattan permanent renting location") +
  labs(colour = "Available days")

man_perm

In Manhattan, the neighborhoods where most listings appear to be permanent or semi-permanent rentals are: Midtown East, Times Square, and the Financial District.

# Brooklyn: black neighbourhood and borough outlines, East Flatbush filled
# in green, and the listings from top10_perm2 coloured by available days
# (blue = few, red = many).
b_perm <- ggplot(
  subset(nyc_b, id %in% c("Brooklyn")),
  aes(x = long, y = lat)
) +
  geom_polygon(
    data = subset(neighbors, boros %in% c("Brooklyn")),
    mapping = aes(group = group),
    colour = "black",
    fill = NA,
    size = 0.5
  ) +
  geom_polygon(
    mapping = aes(group = group),
    colour = "black",
    fill = NA,
    size = 0.8
  ) +
  geom_polygon(
    data = subset(neighbors, neighbourhood %in% c("East Flatbush")),
    mapping = aes(group = group, fill = neighbourhood),
    size = 0.75,
    alpha = 0.05
  ) +
  geom_point(
    data = subset(top10_perm2, Boros %in% c("Brooklyn")),
    mapping = aes(x = longitude, y = latitude, colour = availability_365),
    alpha = 0.5,
    size = 0.5
  ) +
  scale_colour_gradient(low = "blue", high = "red") +
  scale_fill_manual(values = "green") +
  style2 +
  ggtitle("Brooklyn permanent renting location") +
  labs(colour = "Available days")

b_perm

In Brooklyn, the neighborhood where most listings appear to be permanent or semi-permanent rentals is East Flatbush.

# Queens: black neighbourhood and borough outlines, Flushing filled in
# green, and the listings from top10_perm2 coloured by available days
# (blue = few, red = many).
q_perm <- ggplot(
  subset(nyc_b, id %in% c("Queens")),
  aes(x = long, y = lat)
) +
  geom_polygon(
    data = subset(neighbors, boros %in% c("Queens")),
    mapping = aes(group = group),
    colour = "black",
    fill = NA,
    size = 0.5
  ) +
  geom_polygon(
    mapping = aes(group = group),
    colour = "black",
    fill = NA,
    size = 0.8
  ) +
  geom_polygon(
    data = subset(neighbors, neighbourhood %in% c("Flushing")),
    mapping = aes(group = group, fill = neighbourhood),
    size = 0.75,
    alpha = 0.05
  ) +
  geom_point(
    data = subset(top10_perm2, Boros %in% c("Queens")),
    mapping = aes(x = longitude, y = latitude, colour = availability_365),
    alpha = 0.5,
    size = 0.5
  ) +
  scale_colour_gradient(low = "blue", high = "red") +
  scale_fill_manual(values = "green") +
  style2 +
  ggtitle("Queens permanent renting location") +
  labs(colour = "Available days")

q_perm

In Queens, the neighborhoods where most listings appear to be permanent or semi-permanent rentals are Flushing, Jamaica, and Jackson Heights.

2(b)

Some hosts (identified by host_id) operate multiple rentals. Provide a data table of the top hosts, the total number of listings they are associated with, the average nightly price, and the estimated average monthly total income from these listings.

data table: - average nightly price, estimated average monthly total income

library(dplyr)
p_load(DT,tidyverse)
p_load(DT, tidyverse)

# Per-host summary table: number of listings, average nightly price and
# average estimated monthly income.

price <- df$price
avail <- df$availability_365

# Prices arrive as text. Strip any "$" sign and thousands separators before
# converting; a bare as.numeric() turns such values into NA.
price2 <- as.numeric(gsub("[$,]", "", as.character(price)))
availability2 <- as.numeric(avail)

# Income estimate: assume a listing is booked on 80% of the days it is
# available per year, then spread that yearly income over 12 months.
# `price` is stored as a number from here on so that averaging and sorting on
# it behave numerically.
df <- df %>%
  mutate(
    price = price2,
    month_income = (0.8 * price2 * availability2) / 12
  )

host_group <- df %>%
  group_by(host_id)

# One row per host, hosts with the most listings first. Listings with a
# missing price or income are left out of that host's averages instead of
# turning the whole average into NA.
host_list <- host_group %>%
  summarise(
    airbnb_counts = n(),
    avg_price = mean(price, na.rm = TRUE),
    avg_month_income = mean(month_income, na.rm = TRUE)
  ) %>%
  arrange(desc(airbnb_counts))

datatable(host_list, filter = list(position = "top"))

3.

##Provide an interactive map which shows the Top 100 most expensive and Top 100 best reviewed rentals in NYC. The map should differentiate these two groups and upon clicking on a point on the map should show some basic information (at least 3 pieces of information) in a tool tip.

p_load(leaflet)

# Interactive map of the 100 most expensive and the 100 best reviewed
# listings, coloured by group, with a popup describing each listing.

# Review score used for ranking: rating multiplied by accuracy.
df$review <- df$review_scores_rating * df$review_scores_accuracy

# Rank by the numeric price. Sorting the raw text would order it
# alphabetically (so "999" would sort above "1,000"). as.character() keeps
# this working whether `price` is still text or already numeric.
t100_exp <- df %>%
  arrange(desc(as.numeric(gsub("[$,]", "", as.character(price))))) %>%
  mutate(rank = row_number()) %>%
  filter(rank <= 100)

t100_exp$cat <- "Top 100 expensive"

# Listings that tie on the review score are ordered by how many reviews back
# that score, so the cut at 100 is not arbitrary.
t100_rev <- df %>%
  arrange(desc(review), desc(number_of_reviews)) %>%
  mutate(rank = row_number()) %>%
  filter(rank <= 100)

t100_rev$cat <- "Top 100 reviewed"

inter_df <- rbind(t100_exp, t100_rev)
p_load(RColorBrewer)
pal <- colorFactor("Accent", domain = inter_df$cat)
color_cat <- pal(inter_df$cat)

# Popup HTML, one string per row of inter_df.
popup_html <- paste("Location:", inter_df$neighbourhood_cleansed, "<br/>",
                    "Bedroom Count:", inter_df$bedrooms, "<br/>",
                    "Bathroom Count:", inter_df$bathrooms, "<br/>",
                    "Room Type:", inter_df$room_type, "<br/>"
                    )

# NOTE(review): confirm the "Stamen.TonerLite" tiles are still served by the
# provider for the leaflet version in use.
leaflet(inter_df) %>%
  addProviderTiles("Stamen.TonerLite") %>%
  # color_cat gives each marker the colour of its group; with the argument
  # left empty every marker was drawn in the same default colour.
  addCircleMarkers(
    lng = ~longitude,
    lat = ~latitude,
    color = color_cat,
    popup = popup_html,
    clusterOptions = markerClusterOptions()
  ) %>%
  addLegend(
    pal = pal,
    values = ~cat,
    title = NA
  )