This markdown file has been included for students to see how I cleaned the New York City Dataset and obtained the darksky weather data.

library(glmnet)
library(lme4)
library(readr)
library(Hmisc)
library(osmdata)
library(darksky)
library(ggplot2)
library(tidyverse)
library(rgdal)
library(rgeos)
library(tmap)
library(leaflet)
library(RColorBrewer)
library(sp)
library(spatialEco)
library(mapview)
library(sf)
library(dplyr)
library(tidyr)
library(plm)
library(corrplot)

Read Green Taxi Dataset

# Folder holding the raw 2016 Green Taxi export, plus the platform's path
# separator used to glue the folder and the file name together.
wd <- "~/Uni Temp Work/Big Data"
plsep <- .Platform$file.sep
# Load the full trip table from <wd><separator><file name>.
new_york <- read.csv(
  paste0(wd, plsep, "2016_Green_Taxi_Trip_Data.csv")
)

Clean Dataset and convert Variables

Below is the code which cleans the green taxi dataset. Notably it does the following:

  1. Converts the features into the correct data type.
  2. Removes erroneous recordings.
  3. Creates new variables for more nuanced analysis.
  4. Reduces size of dataset to only consider the month March.
# Clean the raw trip table `new_york` and keep the March trips in `march`.

# Categorical columns. The vendor column is located by pattern rather than by
# a literal name, because a UTF-8 byte-order mark at the start of the CSV can
# mangle the first header (e.g. "ï..VendorID") depending on how it was read.
vendor_col <- grep("VendorID$", names(new_york), value = TRUE)
factors <- c("Store_and_fwd_flag", vendor_col)
new_york[factors] <- lapply(new_york[factors], as.factor)

# Parse the 12-hour-clock timestamps. POSIXct (one number per row) is used
# instead of POSIXlt (a list of fields per row), which is the recommended
# date-time class for data frame columns and far lighter on a table this size.
datetime_format <- "%m/%d/%Y %I:%M:%S %p"
new_york$Pickup_Datetime <- as.POSIXct(new_york$lpep_pickup_datetime,
                                       format = datetime_format)
new_york$Dropoff_Datetime <- as.POSIXct(new_york$Lpep_dropoff_datetime,
                                        format = datetime_format)

# as.numeric() on a factor returns the level codes, not the values, so go
# through character first if the column was read in as a factor.
if (is.factor(new_york$Pickup_longitude)) {
  new_york$Pickup_longitude <- as.character(new_york$Pickup_longitude)
}
new_york$Pickup_longitude <- as.numeric(new_york$Pickup_longitude)

# Keep columns by position. Columns 24-25 are presumably the two parsed
# date-time columns added above (assumes the raw file has 23 columns --
# TODO confirm); the raw timestamp strings in columns 2-3 are dropped.
new_york <- new_york[, c(1, 4:16, 18:21, 24:25)]

# Calendar features derived from the pick-up time. `time` (month/day/year hour)
# is the key later used to join the hourly weather data.
new_york$day <- weekdays(new_york$Pickup_Datetime)
new_york$hour <- format(new_york$Pickup_Datetime, "%H")
new_york$time <- format(new_york$Pickup_Datetime, "%m/%d/%Y %H")
new_york$month <- as.numeric(format(new_york$Pickup_Datetime, "%m"))

# Trip duration in minutes and average speed in distance units per hour.
new_york$Duration <- difftime(new_york$Dropoff_Datetime,
                              new_york$Pickup_Datetime, units = "mins")
new_york$Speed <- new_york$Trip_distance / (as.numeric(new_york$Duration) / 60)

# Plausibility filters:
#   * Speed must be finite (drops zero-length durations and unparsed times) and
#     at most 140, a deliberately generous cap (presumably mph -- TODO confirm
#     the unit of Trip_distance).
#   * Duration must be positive (a drop-off before the pick-up would otherwise
#     pass the speed cap with a negative speed) and under 180 minutes.
#   * No negative charges, a total of at least 2.5 and a positive distance.
#   * Pick-up coordinates inside a loose bounding box around New York.
valid <- with(
  new_york,
  is.finite(Speed) & Speed <= 140 &
    Duration > 0 & Duration < 180 &
    Fare_amount >= 0 & Extra >= 0 & MTA_tax >= 0 & Tip_amount >= 0 &
    Tolls_amount >= 0 & improvement_surcharge >= 0 &
    Total_amount >= 2.5 & Trip_distance > 0 &
    Pickup_latitude > 40 & Pickup_longitude > -78 & Pickup_longitude < -72
)
# which() drops rows whose condition is NA; indexing with the raw logical
# vector would instead insert rows made entirely of NA.
new_york <- new_york[which(valid), ]

# Restrict the analysis to March.
march <- new_york[which(new_york$month == 3), ]

summary(march)
##                      ï..VendorID      Store_and_fwd_flag   RateCodeID   
##    error : true            :      0    :      0          Min.   :1.000  
##    message : Internal error:      0   N:1528748          1st Qu.:1.000  
##    status : 500            :      0   Y:   5937          Median :1.000  
##  }                         :      0                      Mean   :1.063  
##  1                         : 326567                      3rd Qu.:1.000  
##  2                         :1208118                      Max.   :6.000  
##  Pickup_longitude Pickup_latitude Dropoff_longitude Dropoff_latitude
##  Min.   :-75.21   Min.   :40.00   Min.   :-75.21    Min.   : 0.00   
##  1st Qu.:-73.96   1st Qu.:40.70   1st Qu.:-73.97    1st Qu.:40.70   
##  Median :-73.95   Median :40.75   Median :-73.95    Median :40.75   
##  Mean   :-73.94   Mean   :40.75   Mean   :-73.90    Mean   :40.73   
##  3rd Qu.:-73.92   3rd Qu.:40.80   3rd Qu.:-73.91    3rd Qu.:40.79   
##  Max.   :-72.94   Max.   :41.31   Max.   :  0.00    Max.   :42.06   
##  Passenger_count Trip_distance      Fare_amount         Extra        
##  Min.   :0.000   Min.   :  0.010   Min.   :  0.00   Min.   : 0.0000  
##  1st Qu.:1.000   1st Qu.:  1.060   1st Qu.:  6.50   1st Qu.: 0.0000  
##  Median :1.000   Median :  1.880   Median :  9.50   Median : 0.5000  
##  Mean   :1.351   Mean   :  2.828   Mean   : 12.01   Mean   : 0.3661  
##  3rd Qu.:1.000   3rd Qu.:  3.550   3rd Qu.: 15.00   3rd Qu.: 0.5000  
##  Max.   :9.000   Max.   :118.500   Max.   :476.00   Max.   :10.5000  
##     MTA_tax         Tip_amount      Tolls_amount     improvement_surcharge
##  Min.   :0.0000   Min.   :  0.00   Min.   : 0.0000   Min.   :0.0000       
##  1st Qu.:0.5000   1st Qu.:  0.00   1st Qu.: 0.0000   1st Qu.:0.3000       
##  Median :0.5000   Median :  0.00   Median : 0.0000   Median :0.3000       
##  Mean   :0.4925   Mean   :  1.23   Mean   : 0.1077   Mean   :0.2955       
##  3rd Qu.:0.5000   3rd Qu.:  2.00   3rd Qu.: 0.0000   3rd Qu.:0.3000       
##  Max.   :0.8800   Max.   :450.00   Max.   :60.8800   Max.   :0.3000       
##   Total_amount    Payment_type     Trip_type     Pickup_Datetime              
##  Min.   :  2.5   Min.   :1.000   Min.   :1.000   Min.   :2016-03-01 00:00:00  
##  1st Qu.:  8.0   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:2016-03-08 17:11:45  
##  Median : 11.3   Median :1.000   Median :1.000   Median :2016-03-16 12:07:22  
##  Mean   : 14.5   Mean   :1.507   Mean   :1.014   Mean   :2016-03-16 09:05:16  
##  3rd Qu.: 17.3   3rd Qu.:2.000   3rd Qu.:1.000   3rd Qu.:2016-03-24 05:08:43  
##  Max.   :499.0   Max.   :5.000   Max.   :2.000   Max.   :2016-03-31 23:59:57  
##  Dropoff_Datetime                  day                hour          
##  Min.   :2016-03-01 00:01:39   Length:1534685     Length:1534685    
##  1st Qu.:2016-03-08 17:26:43   Class :character   Class :character  
##  Median :2016-03-16 12:21:43   Mode  :character   Mode  :character  
##  Mean   :2016-03-16 09:18:07                                        
##  3rd Qu.:2016-03-24 05:20:18                                        
##  Max.   :2016-04-01 01:04:23                                        
##      time               month     Duration            Speed          
##  Length:1534685     Min.   :3   Length:1534685    Min.   :  0.00357  
##  Class :character   1st Qu.:3   Class :difftime   1st Qu.:  9.39130  
##  Mode  :character   Median :3   Mode  :numeric    Median : 11.69343  
##                     Mean   :3                     Mean   : 12.83155  
##                     3rd Qu.:3                     3rd Qu.: 14.75936  
##                     Max.   :3                     Max.   :140.00000

Obtain Weather Data

Here we will obtain additional weather data for New York City in 2016 from the Dark Sky API. To use the API you will need to create an account on the Dark Sky website and obtain a Dark Sky API key.

# The Dark Sky API key is a secret, so it must not be written into this file
# or printed in its output. Set it once outside the script, for example by
# adding the line
#   DARKSKY_API_KEY=<your key>
# to ~/.Renviron and restarting R. The check below only confirms that a key is
# available in the environment; it never echoes the key itself.
if (!nzchar(Sys.getenv("DARKSKY_API_KEY"))) {
  stop("DARKSKY_API_KEY is not set: add it to ~/.Renviron and restart R.",
       call. = FALSE)
}

The following code retrieves all of the hourly weather data for New York for 2016. I will break this process down into four steps. Step 1 - State the necessary parameters for the Dark Sky API.

# Step 1 parameters: the location to query and one UNIX timestamp per day.
latitude_new_york <- 40.730610
longitude_new_york <- -73.935242

# 1451624400 is 2016-01-01 05:00:00 UTC, i.e. midnight on 1 January 2016 at
# UTC-5. It is the first day to request.
x <- 1451624400

# One timestamp per day of 2016 (a leap year, hence 366), 86400 seconds apart,
# running from 1 January 2016 to 31 December 2016 inclusive. The start value
# itself is included, so the first day of the year is not skipped and the
# sequence does not spill over into 1 January 2017.
time2 <- seq(from = x, by = 86400, length.out = 366)

Step 2 - Call the DarkSky API

# Request one day of weather per timestamp in `time2` and keep the hourly data
# frame of each response, so `result` is a list with one data frame per day.
#
# * The hourly block is extracted by name. Assigning the whole response with
#   `result[i] <- ...` keeps only whichever element happens to come first and
#   warns about the rest.
# * `exclude` must be a comma-separated list without spaces, otherwise the
#   blocks after the first are not recognised and are downloaded anyway.
# * Iterating over `time2` itself avoids a hard-coded day count.
result <- lapply(time2, function(day_start) {
  forecast <- get_forecast_for(latitude = latitude_new_york,
                               longitude = longitude_new_york,
                               day_start, language = "en", units = "si",
                               exclude = "minutely,currently,daily")
  forecast$hourly
})
# Drop days for which the API returned no hourly block at all.
result <- Filter(Negate(is.null), result)

Step 3 - Make all the data frames within the list consistent, so that they have the same column names, and then join them together. Step 4 - Format the weather time stamps as month, day, year and hour, consistent with the pick-up `time` column of the original data frame, and merge the resulting data frame with the original data frame by time.

# Step 3: reduce every daily table to the same set of columns and stack them.
weather_variables <- c("time", "summary", "precipProbability",
                       "apparentTemperature", "humidity", "windSpeed")
stopifnot(length(result) > 0)

result <- lapply(seq_along(result), function(i) {
  hourly <- as.data.frame(result[[i]])
  # A field that is absent for a given day is filled with NA so that every
  # table has identical columns, instead of failing on the column selection.
  for (missing_col in setdiff(weather_variables, names(hourly))) {
    hourly[[missing_col]] <- rep(NA, nrow(hourly))
  }
  hourly[, weather_variables, drop = FALSE]
})
# Bind all days in a single call rather than growing `df` with rbind() in a
# loop, which copies the accumulated table on every iteration.
df <- do.call(rbind, result)

# Step 4: build the same "month/day/year hour" key as the trips' `time` column.
# NOTE(review): this formats the weather times in whatever time zone they carry
# -- confirm it matches the local time used for the taxi pick-ups.
df$time <- format(as.POSIXct(df$time, format = "%Y-%m-%d %H:%M:%S"), "%m/%d/%Y %H")
# Keep one weather row per hour key so the join cannot duplicate trips.
df <- df[!duplicated(df$time), ]

# Inner join: trips without a matching weather hour are dropped.
march <- merge(march, df, by = "time")
write.csv(march, "~/Uni Temp Work/Big Data/march.csv", row.names = FALSE)