Dataset 1: Airline On-Time Statistics and Delay Causes

Source: https://www.transtats.bts.gov/OT_Delay/OT_DelayCause1.asp?pn=1
Time Range: 7/2014 - 7/2019
Airports: All major airports

Import Data

path1 <- 'https://raw.githubusercontent.com/dhairavc/DATA607/master/Flight_Delays_Actual_2014.csv'

flights_raw <- read.csv(path1)
str(flights_raw)
## 'data.frame':    18259 obs. of  22 variables:
##  $ year               : int  2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
##  $ X.month            : int  7 7 7 7 7 7 7 7 7 7 ...
##  $ carrier            : Factor w/ 20 levels "9E","AA","AS",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ carrier_name       : Factor w/ 21 levels "AirTran Airways Corporation",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ airport            : Factor w/ 30 levels "ATL","BOS","BWI",..: 14 16 7 11 17 20 28 1 2 22 ...
##  $ airport_name       : Factor w/ 30 levels "Atlanta, GA: Hartsfield-Jackson Atlanta International",..: 17 14 7 11 18 15 26 1 3 6 ...
##  $ arr_flights        : num  1351 2878 13969 186 1234 ...
##  $ arr_del15          : num  349 662 3180 54 293 ...
##  $ carrier_ct         : num  70.5 181.7 602.1 25.1 58.5 ...
##  $ X.weather_ct       : num  12.4 27.6 123.7 0 12.1 ...
##  $ nas_ct             : num  191.7 243.8 971 19.8 138.9 ...
##  $ security_ct        : num  0 0.83 2.44 0 1 2.71 0 0 0 1.37 ...
##  $ late_aircraft_ct   : num  74.4 208.1 1480.8 9.1 82.5 ...
##  $ arr_cancelled      : num  27 11 231 1 33 19 5 5 10 53 ...
##  $ arr_diverted       : num  15 5 77 0 17 41 1 2 3 2 ...
##  $ X.arr_delay        : num  24039 43150 217757 3383 16466 ...
##  $ X.carrier_delay    : num  4506 14251 56851 1833 3063 ...
##  $ weather_delay      : num  1530 2709 13875 0 674 ...
##  $ nas_delay          : num  12854 10455 42845 576 7923 ...
##  $ security_delay     : num  0 20 69 0 24 101 0 0 0 37 ...
##  $ late_aircraft_delay: num  5149 15715 104117 974 4782 ...
##  $ X                  : logi  NA NA NA NA NA NA ...

Data By Month

In the past 5 years, the total amounts of flights by each months have remained relatively close from month to month, except for July which sees a spike in travel. We see that the number of delays in the summer months of June, July and August are the most, with delays peaking in July. September October and November, have the least delays and then there is a sharp increase of delays in December.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)

flights_raw %>% select(X.month, arr_flights, arr_del15) %>% drop_na() %>% group_by(X.month) %>%
  summarise(total_flights = sum(arr_flights), total_delayed = sum(arr_del15)) %>% 
  mutate(del_pct = total_delayed/total_flights) %>% ggplot(aes(x = X.month, y=total_flights)) +
  geom_bar(stat = 'identity')

flights_raw %>% select(X.month, arr_flights, arr_del15) %>% drop_na() %>% group_by(X.month) %>%
  summarise(total_flights = sum(arr_flights), total_delayed = sum(arr_del15)) %>% 
  mutate(del_pct = total_delayed/total_flights) %>% ggplot(aes(x = X.month, y=total_delayed)) +
  geom_bar(stat = 'identity')

Data by Year

If we see, what is attributing to most of the dealys over the years are from the National Avaiation Service.

del_by_year <- flights_raw %>% select(year, carrier_ct, X.weather_ct, nas_ct, security_ct, late_aircraft_ct) %>% drop_na() %>% group_by(year) %>% summarise_all(funs(sum)) %>% gather(key = del_reason, value = del_count, carrier_ct:late_aircraft_ct) 


ggplot(del_by_year, aes(x=del_by_year$year, y=del_by_year$del_count, group=del_by_year$del_reason, color = del_by_year$del_reason)) + geom_line(size=1) + geom_point()

Data by Airline and Airport

If we look at the delay percentages by carrier and airport, there is one combination that stands out, ExpressJet Airlines Inc. is consistently delayed more than any other carrier when arriving at Las Vegas, NV: McCarran International Airport.

flights_raw %>% select(carrier, airport, arr_del15, arr_flights) %>% drop_na() %>% group_by(airport, carrier) %>% summarize_all(funs(sum)) %>% mutate(del_pct = arr_del15/arr_flights) %>%
  ggplot(aes(x=airport, y=carrier, fill=del_pct)) + geom_tile() + theme(axis.text.x=element_text(angle=90, hjust=1)) + scale_fill_gradient(low = "pink", high = "red")

Data Set 2: AirBnb

This data file includes all needed information to find out more about hosts, geographical availability, necessary metrics to make predictions and draw conclusions.

source: https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data

Load Data:

set2 <- 'https://raw.githubusercontent.com/dhairavc/DATA607/master/ABNB.csv'
abnb_ny <- read.csv(set2)
str(abnb_ny)
## 'data.frame':    48895 obs. of  16 variables:
##  $ id                            : int  2539 2595 3647 3831 5022 5099 5121 5178 5203 5238 ...
##  $ name                          : Factor w/ 47897 levels "","'Fan'tastic",..: 12652 38163 45162 15693 19357 24992 8328 25039 15588 17673 ...
##  $ host_id                       : int  2787 2845 4632 4869 7192 7322 7356 8967 7490 7549 ...
##  $ host_name                     : Factor w/ 11453 levels "","'Cil","#NAME?",..: 5051 4846 2962 6264 5982 1970 3601 9699 6935 1264 ...
##  $ neighbourhood_group           : Factor w/ 5 levels "Bronx","Brooklyn",..: 2 3 3 2 3 3 2 3 3 3 ...
##  $ neighbourhood                 : Factor w/ 221 levels "Allerton","Arden Heights",..: 109 128 95 42 62 138 14 96 203 36 ...
##  $ latitude                      : num  40.6 40.8 40.8 40.7 40.8 ...
##  $ longitude                     : num  -74 -74 -73.9 -74 -73.9 ...
##  $ room_type                     : Factor w/ 3 levels "Entire home/apt",..: 2 1 2 1 1 1 2 2 2 1 ...
##  $ price                         : int  149 225 150 89 80 200 60 79 79 150 ...
##  $ minimum_nights                : int  1 1 3 1 10 3 45 2 2 1 ...
##  $ number_of_reviews             : int  9 45 0 270 9 74 49 430 118 160 ...
##  $ last_review                   : Factor w/ 1765 levels "","1/1/2013",..: 203 1059 1 1438 348 1234 277 1244 1383 1317 ...
##  $ reviews_per_month             : num  0.21 0.38 NA 4.64 0.1 0.59 0.4 3.47 0.99 1.33 ...
##  $ calculated_host_listings_count: int  6 2 1 1 1 1 1 1 1 4 ...
##  $ availability_365              : int  365 355 365 194 0 129 0 220 0 188 ...

By Price

If we look at the rent, the distribution is highly skewed to the right, indicating some outrageous prices. A zoomed in dot plot shows that the outliers are spread across all 5 boroughs, so therefore to unserstand the average price in these locations we should look at the median price and interquartile range of each becuase the extreme outliers will skew the mean and standard deviation.

When looking at the Median and IRQ, we see that Manhattan and Brooklyn, command the largest rents, and the Bronx being the lowest.

library(RColorBrewer)
library(stats)


abnb_ny %>% select(neighbourhood_group, price) %>% drop_na() %>% ggplot( aes(x = price, fill=neighbourhood_group)) + geom_dotplot(stat = "identity", binwidth = 5.5) + coord_flip() + scale_fill_brewer(palette="Dark2")
## Warning: Ignoring unknown parameters: stat

abnb_ny %>% select(neighbourhood_group, price) %>% drop_na() %>% ggplot( aes(x = price, fill=neighbourhood_group)) + geom_dotplot(stat = "identity", binwidth = 100) + coord_flip() + scale_fill_brewer(palette="Dark2")
## Warning: Ignoring unknown parameters: stat

abnb_ny %>% select(neighbourhood_group, price) %>% drop_na() %>% ggplot( aes(x = neighbourhood_group, y = price, fill=neighbourhood_group)) + geom_boxplot() +
scale_fill_brewer(palette="Pastel1") +  ylim(0, 400) 
## Warning: Removed 1763 rows containing non-finite values (stat_boxplot).

abnb_ny %>% select(neighbourhood_group, price) %>% drop_na() %>% group_by(neighbourhood_group) %>% summarise(p_mean = mean(price), p_sd = sd(price), p_median = median(price), p_irq = IQR(price))
## # A tibble: 5 x 5
##   neighbourhood_group p_mean  p_sd p_median p_irq
##   <fct>                <dbl> <dbl>    <dbl> <dbl>
## 1 Bronx                 87.5  107.       65    54
## 2 Brooklyn             124.   187.       90    90
## 3 Manhattan            197.   291.      150   125
## 4 Queens                99.5  167.       75    60
## 5 Staten Island        115.   278.       75    60

By Populatiry

Below is a list of the most active rents for each boroughs.

abnb_ny %>% select(neighbourhood_group, neighbourhood, number_of_reviews) %>% group_by(neighbourhood_group) %>% top_n(3) %>% arrange(neighbourhood_group, number_of_reviews)
## Selecting by number_of_reviews
## # A tibble: 15 x 3
## # Groups:   neighbourhood_group [5]
##    neighbourhood_group neighbourhood number_of_reviews
##    <fct>               <fct>                     <int>
##  1 Bronx               Mott Haven                  276
##  2 Bronx               Mount Eden                  291
##  3 Bronx               Mott Haven                  321
##  4 Brooklyn            South Slope                 467
##  5 Brooklyn            Bushwick                    480
##  6 Brooklyn            Park Slope                  488
##  7 Manhattan           Harlem                      594
##  8 Manhattan           Harlem                      597
##  9 Manhattan           Harlem                      607
## 10 Queens              East Elmhurst               543
## 11 Queens              Jamaica                     576
## 12 Queens              Jamaica                     629
## 13 Staten Island       St. George                  234
## 14 Staten Island       Clifton                     242
## 15 Staten Island       Tompkinsville               333

Revenue by group

Since we have the price per rental and the number of reviews per listing, we can assume that each review is a confirmed purchase, and a minimum revenue can be generated per unit. When grouped by boroughs, if we want to consider what is the likely revenue, we will again look at the Median and IRQ due to the extreme outliers. Surprisingly Staten Island has the highest median.

abnb_ny %>% select(neighbourhood_group, number_of_reviews, price) %>% drop_na() %>% mutate(revenue = number_of_reviews*price) %>% group_by(neighbourhood_group) %>% summarise(r_mean = mean(revenue), r_stv = sd(revenue), r_median = median(revenue), r_irq = IQR(revenue))
## # A tibble: 5 x 5
##   neighbourhood_group r_mean r_stv r_median r_irq
##   <fct>                <dbl> <dbl>    <dbl> <dbl>
## 1 Bronx                1999. 3659.     510  2268.
## 2 Brooklyn             2928. 6676.     544  2485.
## 3 Manhattan            3376. 7989.     600  2810 
## 4 Queens               2400. 4936.     574. 2417 
## 5 Staten Island        2542. 4007.     800  2890
abnb_ny %>% select(neighbourhood_group, neighbourhood, number_of_reviews, price) %>% drop_na() %>% mutate(revenue = number_of_reviews*price) %>%
  ggplot( aes(x = neighbourhood_group, y=revenue, fill=neighbourhood_group)) + geom_boxplot() + 
scale_fill_brewer(palette="Pastel1") +  ylim(0, 4000)
## Warning: Removed 9327 rows containing non-finite values (stat_boxplot).

### Revenue by Neighborhood If we look at revenue across the neighborhoods, and look at the top 3 neighborhoods from each boroughs, Staten Island shows again to have higher median prices

abnb_ny %>% select(neighbourhood_group, neighbourhood, price, number_of_reviews) %>% drop_na() %>% mutate(revenue = number_of_reviews*price) %>%  group_by(neighbourhood_group, neighbourhood) %>% summarise(n_mean = mean(revenue), n_stv = sd(revenue), n_median = median(revenue), n_irq = IQR(revenue)) %>% arrange(neighbourhood_group, desc(n_median)) %>% top_n(3, n_median)
## # A tibble: 15 x 6
## # Groups:   neighbourhood_group [5]
##    neighbourhood_group neighbourhood   n_mean  n_stv n_median n_irq
##    <fct>               <fct>            <dbl>  <dbl>    <dbl> <dbl>
##  1 Bronx               Highbridge       3169.  3136.    2604  3894.
##  2 Bronx               City Island      4174.  5659.    2527  4896 
##  3 Bronx               East Morrisania  3954.  5147.    2310  4832.
##  4 Brooklyn            Manhattan Beach  4418   3460.    4532  4078.
##  5 Brooklyn            Dyker Heights    3234.  3254.    2092. 3458.
##  6 Brooklyn            Bergen Beach     2248.  2226.    1935  1620 
##  7 Manhattan           Two Bridges      3552.  5736.    1216  3818.
##  8 Manhattan           Chinatown        4081.  7687.    1200  4192.
##  9 Manhattan           Nolita           5559. 16356.    1100  4241 
## 10 Queens              Arverne          3908.  5003.    2100  4760 
## 11 Queens              Fresh Meadows    2460.  2526.    1900  2764.
## 12 Queens              East Elmhurst    5351.  7010.    1782  8350 
## 13 Staten Island       Silver Lake      8580   4497.    8580  3180 
## 14 Staten Island       Lighthouse Hill  6568.  4571.    6568. 3232.
## 15 Staten Island       Richmondtown     6162    NaN     6162     0

Dataset 3: Bikeshare Scooter Systems

Source: http://osav-usdot.opendata.arcgis.com/datasets/9c1c7f3f171c4d21a010b9082332bbca_0?selectedAttributes%5B%5D=PlaceID&chartType=line

The bikeshare and e-scooter layer is current as of October 2, 2019. It is a list of cities served by a bikeshare and/or e-scooter system in each year from 2015 to 2019. Some systems serve more than one city. The layer lists just the primary city served. Bikeshare includes systems that are open to the general public, IT-automated, and station based (contain hubs to which users can grab and return a bike) as well as dockless systems. The layer includes a count of the number of docking stations, the number of dockless bikeshare

Import Data

set3 <- "https://raw.githubusercontent.com/dhairavc/DATA607/master/Bikeshare_Scooter_Systems.csv"
bss <- read.csv(set3)

str(bss)
## 'data.frame':    1155 obs. of  19 variables:
##  $ ï..X       : num  -118 -118 -118 -83 -83 ...
##  $ Y          : num  34.1 34.1 34.1 42.3 42.3 ...
##  $ OBJECTID   : int  1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 ...
##  $ PlaceID    : Factor w/ 231 levels "AL001","AL002",..: 19 19 19 128 128 128 118 118 118 151 ...
##  $ YearPlaceId: Factor w/ 1155 levels "2015AL001","2015AL002",..: 943 712 250 821 1052 590 1042 811 580 1075 ...
##  $ City       : Factor w/ 223 levels "Alameda","Albany",..: 108 108 108 57 57 57 26 26 26 129 ...
##  $ State      : Factor w/ 44 levels "AL","AR","AZ",..: 4 4 4 19 19 19 18 18 18 26 ...
##  $ CityState  : Factor w/ 231 levels "Alameda, CA",..: 115 115 115 63 63 63 31 31 31 137 ...
##  $ Year       : int  2019 2018 2016 2018 2019 2017 2019 2018 2017 2019 ...
##  $ AsOfDate   : int  201912 201812 201612 201812 201912 201712 201912 201812 201712 201912 ...
##  $ dockCt     : int  127 127 104 44 44 43 2 2 2 3 ...
##  $ dockNm     : Factor w/ 127 levels "-","APSU B-cycle",..: 84 84 84 85 85 85 86 86 86 87 ...
##  $ docklessCt : int  1 1 0 0 0 0 0 0 0 0 ...
##  $ docklessNm : Factor w/ 21 levels "-","Donkey Republic",..: 8 8 1 1 1 1 1 1 1 1 ...
##  $ scooterCt  : int  2 3 0 1 1 0 0 0 0 0 ...
##  $ scooterNm  : Factor w/ 26 levels "-","Bird","Bird, Bolt, Jump, Lime, Lyft",..: 18 4 1 21 2 1 1 1 1 1 ...
##  $ Type       : Factor w/ 9 levels "","All bikes",..: 3 3 6 5 5 6 6 6 6 6 ...
##  $ Lat        : num  34.1 34.1 34.1 42.3 42.3 ...
##  $ Lon        : num  -118 -118 -118 -83 -83 ...

By Country

If we look across the country, the 3 major types of shared systems are, bicycles, scooters, and dockless scooters. Looking at a time series of all 3, scooters are relatively flat for the past 4 years. Bicycles far outnumber scooters and show an expansion for the first 3 years and a contraction in the last.

bss %>% select(Year, dockCt, docklessCt, scooterCt) %>% group_by(Year) %>% summarise_all(funs(sum)) %>% gather( key = share_system, value = system_count, dockCt:scooterCt) %>% ggplot( aes(x=Year, y=system_count, group=share_system, color = share_system)) + geom_line(size=1) + geom_point()

By State

If we look at a state by state heatmap of all 3 systems, most of the country is relatively stable, except for NY, IL, DC, and MA, which show increases in the amount of bike docks in the respective states

bss %>% select(Year, State, dockCt, docklessCt, scooterCt) %>% gather( key = share_system, value = system_count, 3:5) %>% ggplot( aes(x=Year, y=State, fill=system_count)) + geom_tile() + facet_grid(~share_system) + scale_fill_gradient(low = "light blue", high = "plum1")

### By Company If for the shared bikes, if we look at each company in terms of how many states they are in and how many docking stations do they operate. We observe that while the company B-Cycle is in the most states, it is Citi Bike that operates the most amount of docks, and most likely has the

library(stringr)
library(tidyr)
library(dplyr)

bss_bikes <- bss %>% select(dockNm, dockCt, Year, State) %>% filter(dockCt != 0)

bss_bikes$dockNm <- str_replace_all(bss_bikes$dockNm, "(.*)B-cycle$", "B-Cycle")
bss_bikes$dockNm <- str_replace_all(bss_bikes$dockNm, "(^Citi Bike).*", "Citi Bike")
bss_bikes$dockNm <- str_replace_all(bss_bikes$dockNm, "(^Zagstar).*", "Zagstar")

bss_bikes %>% select(dockNm, Year, dockCt, State) %>% group_by(dockNm, Year) %>% summarize( docks = sum(dockCt), State = n()) %>% gather( key = stat, value = val_type, 3:4)
## # A tibble: 704 x 4
## # Groups:   dockNm [99]
##    dockNm     Year stat  val_type
##    <chr>     <int> <chr>    <int>
##  1 arborbike  2015 docks       14
##  2 arborbike  2016 docks       14
##  3 arborbike  2017 docks       13
##  4 arborbike  2018 docks       13
##  5 arborbike  2019 docks       13
##  6 Aurora     2016 docks        3
##  7 Aurora     2017 docks        3
##  8 Aurora     2018 docks        3
##  9 Aurora     2019 docks        3
## 10 Aventura   2016 docks        5
## # ... with 694 more rows
bss_bikes %>% select(dockNm, Year, dockCt, State) %>% group_by(dockNm, Year) %>% summarize( docks = sum(dockCt), State = n()) %>% gather( key = stat, value = val_type, 3:4) %>% filter(stat == "State") %>% ggplot( aes(x=dockNm, y=val_type)) + geom_col() + facet_wrap(~Year) + coord_flip()

bss_bikes %>% select(dockNm, Year, dockCt, State) %>% group_by(dockNm, Year) %>% summarize( docks = sum(dockCt), State = n()) %>% gather( key = stat, value = val_type, 3:4) %>% filter(stat == "docks") %>% ggplot( aes(x=dockNm, y=val_type)) + geom_col() + facet_wrap(~Year) + coord_flip()