This R Markdown file is included so that students can see how I cleaned the New York City green taxi dataset and obtained the Dark Sky weather data.
library(glmnet)
library(lme4)
library(readr)
library(Hmisc)
library(osmdata)
library(darksky)
library(ggplot2)
library(tidyverse)
library(rgdal)
library(rgeos)
library(tmap)
library(leaflet)
library(RColorBrewer)
library(sp)
library(spatialEco)
library(mapview)
library(sf)
library(dplyr)
library(tidyr)
library(plm)
library(corrplot)
# Folder that holds the raw data, and the platform's path separator.
wd <- "~/Uni Temp Work/Big Data"
plsep <- .Platform$file.sep

# Read the 2016 green taxi trip records from that folder.
taxi_csv <- paste0(wd, plsep, "2016_Green_Taxi_Trip_Data.csv")
new_york <- read.csv(taxi_csv)
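Below is the code that cleans the green taxi dataset. Notably, it does the following: converts the vendor and store-and-forward columns to factors; parses the pick-up and drop-off timestamps; derives the day of the week, hour, month, trip duration (in minutes) and average speed of each trip; removes trips with implausible speeds, negative charges, a total amount below 2.50, a duration of three hours or more, a zero distance, or a pick-up location far from New York; and finally extracts the trips made in March.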
# ---- Column types ----
# read.csv() can mangle the first header (typically "ï..VendorID" when the
# file starts with a byte-order mark), so find the vendor column by its
# suffix instead of hard-coding the mangled spelling.
vendor_col <- grep("VendorID$", names(new_york), value = TRUE)
stopifnot(length(vendor_col) == 1L)
factors <- c("Store_and_fwd_flag", vendor_col)
new_york[factors] <- lapply(new_york[factors], as.factor)

# Parse the 12-hour clock timestamps. POSIXct (one number per row) is used
# rather than POSIXlt, which stores a list of fields per value and is both
# heavy and awkward inside a data frame.
# NOTE(review): no time zone is given, so the session's local zone is used.
datetime_format <- "%m/%d/%Y %I:%M:%S %p"
new_york$Pickup_Datetime <- as.POSIXct(new_york$lpep_pickup_datetime,
                                       format = datetime_format)
new_york$Dropoff_Datetime <- as.POSIXct(new_york$Lpep_dropoff_datetime,
                                        format = datetime_format)

# as.numeric() on a factor returns the level codes, not the values, so go
# through character first if the column was read as a factor.
pickup_lon <- new_york$Pickup_longitude
if (is.factor(pickup_lon)) {
  pickup_lon <- as.character(pickup_lon)
}
new_york$Pickup_longitude <- as.numeric(pickup_lon)

# Keep columns by position.
# NOTE(review): positions 24:25 are the two datetime columns created above
# only if the raw file has 23 columns - confirm against the CSV header.
new_york <- new_york[, c(1, 4:16, 18:21, 24:25)]

# ---- Derived features ----
# The pick-up time is already parsed, so it is formatted directly.
pickup <- new_york$Pickup_Datetime
new_york$day <- weekdays(pickup)
new_york$hour <- format(pickup, "%H")
new_york$time <- format(pickup, "%m/%d/%Y %H") # hourly key used for merging
new_york$month <- format(pickup, "%m")

# Trip duration in minutes, and average speed as distance per hour.
new_york$Duration <- difftime(new_york$Dropoff_Datetime, pickup,
                              units = "mins")
new_york$Speed <- new_york$Trip_distance / (as.numeric(new_york$Duration) / 60)
# ---- Remove implausible trips ----
# All conditions are combined into one logical mask and applied with which().
# which() drops NA as well as FALSE, so a row with a missing value in any
# tested column is removed instead of becoming an all-NA row, which is what
# `df[cond, ]` produces for an NA condition.
duration_mins <- as.numeric(new_york$Duration, units = "mins")

# Speed must be finite (is.finite() already excludes NA, NaN and Inf) and
# positive: a negative speed means the drop-off was recorded before the
# pick-up. The upper cap is the author's choice; original note:
## Maximum speed limit in NYC is 50 mph and in the USA is 137 mph.
plausible_speed <- is.finite(new_york$Speed) &
  new_york$Speed > 0 &
  new_york$Speed <= 140

# No negative charges, and a total of at least 2.5.
valid_charges <- new_york$Fare_amount >= 0 &
  new_york$Extra >= 0 &
  new_york$MTA_tax >= 0 &
  new_york$Tip_amount >= 0 &
  new_york$Tolls_amount >= 0 &
  new_york$improvement_surcharge >= 0 &
  new_york$Total_amount >= 2.5

# Trips shorter than three hours that covered some distance.
valid_trip <- duration_mins < 180 & new_york$Trip_distance > 0

# Pick-up inside a coarse latitude/longitude box.
in_region <- new_york$Pickup_latitude > 40 &
  new_york$Pickup_longitude > -78 &
  new_york$Pickup_longitude < -72

keep <- plausible_speed & valid_charges & valid_trip & in_region
new_york <- new_york[which(keep), ]

# ---- March subset ----
new_york$month <- as.numeric(new_york$month)
march <- new_york[which(new_york$month == 3), ]
summary(march)
## ï..VendorID Store_and_fwd_flag RateCodeID
## error : true : 0 : 0 Min. :1.000
## message : Internal error: 0 N:1528748 1st Qu.:1.000
## status : 500 : 0 Y: 5937 Median :1.000
## } : 0 Mean :1.063
## 1 : 326567 3rd Qu.:1.000
## 2 :1208118 Max. :6.000
## Pickup_longitude Pickup_latitude Dropoff_longitude Dropoff_latitude
## Min. :-75.21 Min. :40.00 Min. :-75.21 Min. : 0.00
## 1st Qu.:-73.96 1st Qu.:40.70 1st Qu.:-73.97 1st Qu.:40.70
## Median :-73.95 Median :40.75 Median :-73.95 Median :40.75
## Mean :-73.94 Mean :40.75 Mean :-73.90 Mean :40.73
## 3rd Qu.:-73.92 3rd Qu.:40.80 3rd Qu.:-73.91 3rd Qu.:40.79
## Max. :-72.94 Max. :41.31 Max. : 0.00 Max. :42.06
## Passenger_count Trip_distance Fare_amount Extra
## Min. :0.000 Min. : 0.010 Min. : 0.00 Min. : 0.0000
## 1st Qu.:1.000 1st Qu.: 1.060 1st Qu.: 6.50 1st Qu.: 0.0000
## Median :1.000 Median : 1.880 Median : 9.50 Median : 0.5000
## Mean :1.351 Mean : 2.828 Mean : 12.01 Mean : 0.3661
## 3rd Qu.:1.000 3rd Qu.: 3.550 3rd Qu.: 15.00 3rd Qu.: 0.5000
## Max. :9.000 Max. :118.500 Max. :476.00 Max. :10.5000
## MTA_tax Tip_amount Tolls_amount improvement_surcharge
## Min. :0.0000 Min. : 0.00 Min. : 0.0000 Min. :0.0000
## 1st Qu.:0.5000 1st Qu.: 0.00 1st Qu.: 0.0000 1st Qu.:0.3000
## Median :0.5000 Median : 0.00 Median : 0.0000 Median :0.3000
## Mean :0.4925 Mean : 1.23 Mean : 0.1077 Mean :0.2955
## 3rd Qu.:0.5000 3rd Qu.: 2.00 3rd Qu.: 0.0000 3rd Qu.:0.3000
## Max. :0.8800 Max. :450.00 Max. :60.8800 Max. :0.3000
## Total_amount Payment_type Trip_type Pickup_Datetime
## Min. : 2.5 Min. :1.000 Min. :1.000 Min. :2016-03-01 00:00:00
## 1st Qu.: 8.0 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2016-03-08 17:11:45
## Median : 11.3 Median :1.000 Median :1.000 Median :2016-03-16 12:07:22
## Mean : 14.5 Mean :1.507 Mean :1.014 Mean :2016-03-16 09:05:16
## 3rd Qu.: 17.3 3rd Qu.:2.000 3rd Qu.:1.000 3rd Qu.:2016-03-24 05:08:43
## Max. :499.0 Max. :5.000 Max. :2.000 Max. :2016-03-31 23:59:57
## Dropoff_Datetime day hour
## Min. :2016-03-01 00:01:39 Length:1534685 Length:1534685
## 1st Qu.:2016-03-08 17:26:43 Class :character Class :character
## Median :2016-03-16 12:21:43 Mode :character Mode :character
## Mean :2016-03-16 09:18:07
## 3rd Qu.:2016-03-24 05:20:18
## Max. :2016-04-01 01:04:23
## time month Duration Speed
## Length:1534685 Min. :3 Length:1534685 Min. : 0.00357
## Class :character 1st Qu.:3 Class :difftime 1st Qu.: 9.39130
## Mode :character Median :3 Mode :numeric Median : 11.69343
## Mean :3 Mean : 12.83155
## 3rd Qu.:3 3rd Qu.: 14.75936
## Max. :3 Max. :140.00000
Here we obtain additional weather data for New York City in 2016 from the Dark Sky API. To use the API you will need to create an account on the Dark Sky website and obtain a Dark Sky API key.
# Supply your own key outside the script, for example with a line
# `DARKSKY_API_KEY=<your key>` in ~/.Renviron, so that the secret is never
# stored in (or printed by) this document. Stop early if it is missing.
if (!nzchar(Sys.getenv("DARKSKY_API_KEY"))) {
  stop("DARKSKY_API_KEY is not set; add it to ~/.Renviron and restart R.",
       call. = FALSE)
}
nzchar(Sys.getenv("DARKSKY_API_KEY")) # Check the key is set without printing it
## [1] TRUE
Next, the following code retrieves all of the weather data for New York City for 2016. I will break the process down into four steps. Step 1 - State the parameters required by the Dark Sky API.
# Coordinates used for every weather request.
latitude_new_york <- 40.730610
longitude_new_york <- -73.935242

# One Unix timestamp (seconds since 1970-01-01 UTC) per day of 2016.
# 1451624400 is 2016-01-01 05:00:00 UTC, i.e. midnight at UTC-5.
# The sequence starts AT that instant and has 366 entries (2016 is a leap
# year), so it covers 1 January to 31 December 2016. Incrementing before
# storing would instead skip 1 January 2016 and end on 1 January 2017.
seconds_per_day <- 86400
first_day <- 1451624400
days_in_2016 <- 366L
time2 <- seq(from = first_day, by = seconds_per_day, length.out = days_in_2016)
Step 2 - Call the Dark Sky API once for each daily timestamp.
# Request the weather once per timestamp in `time2` and keep the first
# component of each response.
# The list is preallocated and the loop follows the length of `time2`, so the
# number of requests can never drift from the number of timestamps.
# NOTE(review): the first component is presumably the hourly table, since the
# later code selects hourly-looking columns from it - TODO confirm.
# NOTE(review): the `exclude` string contains spaces after the commas; check
# the API documentation for whether that form is accepted.
# NOTE(review): confirm the Dark Sky service is still available before running.
result <- vector("list", length(time2))
for (i in seq_along(time2)) {
  forecast <- get_forecast_for(
    latitude = latitude_new_york,
    longitude = longitude_new_york,
    time2[i],
    language = "en",
    units = "si",
    exclude = "minutely, currently, daily"
  )
  # Assigning the whole response to result[i] would store only its first
  # component and raise a length-mismatch warning on every iteration; take
  # that component explicitly instead.
  result[i] <- unclass(forecast)[1]
}
Step 3 - Keep the same set of columns in every data frame in the list so that they are consistent, then join all of the data frames together. Step 4 - Format the weather time stamp in the same way as the `time` column (month/day/year hour) derived from the pick-up date-time in the original data frame, then merge the weather data with the original data frame by time.
# Weather columns kept from every day's table.
weather_variables <- c("time", "summary", "precipProbability",
                       "apparentTemperature", "humidity", "windSpeed")

# Step 3: reduce each day's table to the same columns. This errors if a day
# lacks one of them, so an inconsistent table is never silently accepted.
result <- lapply(seq_along(result), function(i) {
  data.frame(result[i])[, weather_variables]
})

# Bind all days in one call; rbind() inside a loop re-copies the growing
# data frame on every iteration.
df <- do.call(rbind, result)

# Step 4: build the same "month/day/year hour" key that the taxi data carries
# in its `time` column.
# NOTE(review): both keys are formatted in the session's local time zone;
# confirm it matches the zone of the taxi timestamps.
df$time <- format(as.POSIXct(df$time, format = "%Y-%m-%d %H:%M:%S"),
                  "%m/%d/%Y %H")

# Keep one weather row per hour key, so a repeated hour cannot duplicate taxi
# trips in the join.
hourly_weather <- df[!duplicated(df$time), ]

# Inner join: trips whose hour has no weather record are dropped.
march <- merge(march, hourly_weather, by = "time")

# Write next to the raw data, reusing the data folder defined earlier.
write.csv(march, file.path(wd, "march.csv"), row.names = FALSE)