Load the sparklyr and dplyr libraries
library(sparklyr)
##
## Attaching package: 'sparklyr'
## The following object is masked from 'package:stats':
##
## filter
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Create a Spark connection
# Open a connection to a local Spark instance, labelled "taxi_data".
sc <- spark_connect(
  master = "local",
  app_name = "taxi_data"
)
Check the working directory, then read the data using spark_read_csv
# Print the current working directory (captured output shown on the next line).
getwd()
## [1] "/Users/faiz/Downloads/BDA_POP/KLU/sparklyrdemo"
# The setwd() call below is commented out, so the working directory is not
# changed and the second getwd() reports the same path as the first.
#setwd("/Users/faiz/Downloads/BDA_POP/archive")
getwd()
## [1] "/Users/faiz/Downloads/BDA_POP/KLU/sparklyrdemo"
Read the data into Spark tables
# Directory holding the source CSV files. Defined once so the location can be
# changed in a single place instead of in every path below.
data_dir <- "/Users/faiz/Downloads/BDA_POP/archive"

# Read each CSV through the Spark connection `sc`, registering it under the
# table name given in `name`.
taxi_zone_geo <- spark_read_csv(
  sc,
  name = "taxi_zone_tbl",
  path = file.path(data_dir, "taxi_zone_geo.csv")
)
taxi_zone_2018 <- spark_read_csv(
  sc,
  name = "taxi_zone_2018_tbl",
  path = file.path(data_dir, "original_cleaned_nyc_taxi_data_2018.csv")
)
taxi_trip_data <- spark_read_csv(
  sc,
  name = "taxi_trip_data_tbl",
  path = file.path(data_dir, "taxi_trip_data.csv")
)

# Count the rows of the trip table.
count(taxi_trip_data)
# head(taxi_zone_geo)

# Keep the zone name and borough columns, restricted to rows whose borough
# equals "JFK".
# NOTE(review): the variable is called taxi_bronx but the filter matches the
# borough value "JFK" -- confirm which borough value is actually intended.
taxi_bronx <- taxi_zone_geo %>%
  select(zone_name, borough) %>%
  filter(borough == "JFK")

# Print the (lazy) query result.
taxi_bronx
# List the distinct boroughs present in the zone table.
# The column is passed to distinct() as a bare name. A quoted "borough" is
# treated as a string constant rather than a column reference, which made the
# previous query select "borough" AS `"borough"` and group with FIRST()
# instead of issuing a plain SELECT DISTINCT.
# NOTE(review): the result is still stored in taxi_bronx for compatibility,
# although it holds every borough, not only the Bronx.
taxi_bronx <- taxi_zone_geo %>%
  distinct(borough)
taxi_bronx

# Show the SQL that dplyr generates for the query above.
dplyr::show_query(taxi_bronx)
# Expected output after the fix (approximate -- re-render to refresh):
## <SQL>
## SELECT DISTINCT `borough`
## FROM `taxi_zone_tbl`
Always disconnect the Spark connection when you are finished, to save
resources and money
# Close the Spark connection opened at the start of the session.
spark_disconnect(sc = sc)