Sys.setenv(SPARK_HOME = "C://Users/dannyhuang/Desktop/spark-2.1.0-bin-hadoop2.7") 


library(SparkR, lib.loc = c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib"))) 
## 
## Attaching package: 'SparkR'
## The following objects are masked from 'package:stats':
## 
##     cov, filter, lag, na.omit, predict, sd, var, window
## The following objects are masked from 'package:base':
## 
##     as.data.frame, colnames, colnames<-, drop, endsWith,
##     intersect, rank, rbind, sample, startsWith, subset, summary,
##     transform, union
library(SparkR)

sparkR.session(master = "local[*]",  enableHiveSupport = FALSE,sparkConfig = list(spark.executor.instances = '70',
                   spark.driver.memory = '4g',
                   spark.yarn.executor.memoryOverhead = '8000',spark.sql.warehouse.dir = "d:/winutils/bin",sparkPackages = "com.databricks:spark-csv_2.10:1.3.0"))
## Spark package found in SPARK_HOME: C://Users/dannyhuang/Desktop/spark-2.1.0-bin-hadoop2.7
## Launching java with spark-submit command C://Users/dannyhuang/Desktop/spark-2.1.0-bin-hadoop2.7/bin/spark-submit2.cmd   --driver-memory "4g" sparkr-shell C:\Users\DANNYH~1\AppData\Local\Temp\RtmpwVCsb6\backend_port60687f614be4
## Java ref type org.apache.spark.sql.SparkSession id 1
trip_fare<- read.df(x= "C://Users/dannyhuang/Downloads/trip_fare_12.csv", source = "csv" ,
         header = "true", inferSchema = "true")    #13971118 X 11


createOrReplaceTempView (trip_fare, "trip_fare") #13971118X 11

cacheTable("trip_fare")
sql("select *from trip_fare")
## SparkDataFrame[medallion:string, hack_license:string, vendor_id:string, pickup_datetime:timestamp, payment_type:string, fare_amount:double, surcharge:double, mta_tax:double, tip_amount:double, tolls_amount:double, total_amount:double]

Use SparkSQL to extract data

trip_stats_by_day <- sql("select year(pickup_datetime) as  year, 
month(pickup_datetime) as month, 
day(pickup_datetime) as day, 
hour(pickup_datetime)as hour ,
payment_type,
sum(tip_amount) as Tips,
sum(fare_amount) as total_fare,
sum(total_amount) as total_amount,
count(1) as trips 
from trip_fare 
group by 
year(pickup_datetime), month(pickup_datetime), day(pickup_datetime),hour(pickup_datetime), payment_type order by 3")

Fare Analysis

trip_stats_by_day<-head(trip_stats_by_day,nrow(trip_stats_by_day))
trip_stats_by_day$payment_type <- as.factor(trip_stats_by_day$payment_type)
trip_stats_by_day$payment_type <- factor(trip_stats_by_day$payment_type, levels(trip_stats_by_day$payment_type)[c(5:1)])
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:SparkR':
## 
##     arrange, between, collect, contains, count, cume_dist,
##     dense_rank, desc, distinct, explain, filter, first, group_by,
##     intersect, lag, last, lead, mutate, n, n_distinct, ntile,
##     percent_rank, rename, row_number, sample_frac, select, sql,
##     summarize, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
trip_stats_by_day$date <- paste(trip_stats_by_day$year,trip_stats_by_day$month,trip_stats_by_day$day, sep = "/")
trip_stats_by_day$date <- as.Date(trip_stats_by_day$date)
Sys.setlocale("LC_TIME","US")
## [1] "English_United States.1252"
trip_stats_by_day$weekdays <- weekdays(trip_stats_by_day$date)
trip_stats_by_day$weekdays<- factor(trip_stats_by_day$weekdays, levels =  c("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday" ,"Sunday"))

total_fare_by_hour <- trip_stats_by_day %>% select(hour,total_amount,payment_type,Tips,trips) %>% group_by(hour,payment_type) %>% summarise(total_fare = sum(total_amount),trips= sum(trips),tips = sum(Tips))
#total_fare_by_hour.pct$payment_type <- as.factor(total_fare_by_hour.pct$payment_type)
#total_fare_by_hour.pct$payment_type <- factor(total_fare_by_hour.pct$payment_type, levels(total_fare_by_hour.pct$payment_type)[c(5:1)])
ggplot(total_fare_by_hour, aes(x=hour,y=total_fare_by_hour$total_fare,fill = payment_type))+geom_bar(stat = "identity", position = "stack")+labs(title="Total Fare with shares of Payment Type by hour")+xlab("Hour")+ylab("Total Fare")+guides(fill=guide_legend(title="Payment Type"))+theme_bw()

total_fare_by_hour.pct <-  total_fare_by_hour %>% select(hour,total_fare) %>% group_by(hour) %>% summarise(sum(total_fare))
total_fare_by_hour.pct<-left_join(total_fare_by_hour,total_fare_by_hour.pct, by=c("hour"="hour"))
total_fare_by_hour.pct$pct<- round(total_fare_by_hour.pct$total_fare/total_fare_by_hour.pct$`sum(total_fare)`,3)

ggplot(total_fare_by_hour.pct, aes(x=hour,y=total_fare_by_hour.pct$pct,fill = total_fare_by_hour.pct$payment_type )) + geom_bar(stat = "identity", position = "stack")+labs(title="Total Fare with shares of Payment Type in % by hour")+xlab("Hour")+ylab("Total Fare%")+guides(fill=guide_legend(title="Payment Type"))+theme_light()

total_fare_by_day <- trip_stats_by_day %>%select(day, total_amount,payment_type,trips) %>% group_by(day,payment_type) %>% summarise(total_fare = sum(total_amount),trips= sum(trips))

ggplot(total_fare_by_day, aes(x=day,y=total_fare_by_day$total_fare,fill = payment_type)) + geom_bar(stat = "identity", position = "stack")+guides(fill=guide_legend(title="Payment Type"))+xlab("Day")+ylab("Total Fare")+labs(title ="Total Fare by Day with shares of Payment Type ")+theme_bw()+geom_segment(aes(x = 25, y = 7500000, xend = 25, yend = 3750000),
                  arrow = arrow(length = unit(0.5, "cm")),size = 2,color = "yellow")+annotate("text", x = 25, y = 8000000, label = "X'mas",size = 5.5,color = "red")

total_fare_by_day.pct <-  total_fare_by_day %>% select(day,total_fare) %>% group_by(day) %>% summarise(sum(total_fare))
total_fare_by_day.pct<-left_join(total_fare_by_day,total_fare_by_day.pct, by=c("day"="day"))
total_fare_by_day.pct$pct<- round(total_fare_by_day.pct$total_fare/total_fare_by_day.pct$`sum(total_fare)`,3)

ggplot(total_fare_by_day.pct, aes(x=day,y=total_fare_by_day.pct$pct,fill = total_fare_by_day.pct$payment_type )) + geom_bar(stat = "identity", position = "stack")+labs(title="Total Fare with shares of Payment Type (%) by day")+xlab("Day")+ylab("Total Fare%")+guides(fill=guide_legend(title="Payment Type"))+theme_bw()+geom_hline(yintercept=0.5,color = "red",size = 1,linetype = "solid")+geom_segment(aes(x = 31, y = 0.9, xend = 25, yend = 0.7),
                  arrow = arrow(length = unit(0.5, "cm")),size = 2,color = "yellow")

trips <-trip_stats_by_day %>% select(day,trips,payment_type) %>% group_by(day,payment_type) %>% summarise(trips =sum(trips)) %>% filter(payment_type =="CSH"|payment_type=="CRD")
trips 
## Source: local data frame [62 x 3]
## Groups: day [31]
## 
##      day payment_type  trips
##    <int>       <fctr>  <dbl>
## 1      1          CSH 193188
## 2      1          CRD 202175
## 3      2          CSH 186685
## 4      2          CRD 237293
## 5      3          CSH 194714
## 6      3          CRD 265799
## 7      4          CSH 194440
## 8      4          CRD 274093
## 9      5          CSH 205707
## 10     5          CRD 288836
## # ... with 52 more rows
ggplot(trips,aes(day,trips ,fill= payment_type,color= payment_type))+geom_line(linetype = "solid")+annotate("rect", xmin = 24, xmax = 26, ymin = 0, ymax = Inf,alpha = .2,colors = "red")+labs(title = "Cash or Card In Every Transcations by Day")+theme_bw()
## Warning: Ignoring unknown parameters: colors

Tips Analysis

tips_by_payment <- trip_stats_by_day %>% select(payment_type,Tips,total_amount,trips) %>% group_by(payment_type) %>% summarise(tips = sum(Tips),fare= sum(total_amount),trips = sum(trips))
library(dplyr)
Total_tips_by_hour <- trip_stats_by_day %>%select(hour, total_amount,payment_type,Tips,trips) %>% group_by(hour,payment_type) %>% summarise(total_fare = sum(total_amount),trips= sum(trips),tips = sum(Tips))
ggplot(Total_tips_by_hour, aes(x=hour,y=Total_tips_by_hour$tips,fill = payment_type))+geom_bar(stat = "identity", position = "stack")+xlab("Hour")+ylab("Total Tips")+guides(fill=guide_legend(title="Payment Type"))+labs(title="Total Tips in December by hour  ")+theme_bw()

tips_from_card_hour <-trip_stats_by_day %>% select(hour, total_fare,payment_type,Tips,trips) %>% group_by(hour,payment_type) %>% summarise(total_fare = sum(total_fare),trips= sum(trips),tips = sum(Tips)) %>%filter(payment_type== "CRD")
tips_from_card_hour$ratio <- round(tips_from_card_hour$tips/(tips_from_card_hour$total_fare-tips_from_card_hour$tips),4)
ggplot(tips_from_card_hour, aes(x=hour,y=ratio))+geom_bar(stat = "identity",fill= "white", color= "blue")+theme_bw()+labs(title = "Average tip as a percertage of fare by hour from credit card in December")+guides(fill=guide_legend(title=""))+ylab("ratio(tip/fare)")+geom_hline(yintercept=0.2411,color = "red",size = 1)

tips_from_card_day <-trip_stats_by_day %>% select(day, total_fare,payment_type,Tips,trips) %>% group_by(day,payment_type) %>% summarise(total_fare = sum(total_fare),trips= sum(trips),tips = sum(Tips)) %>%filter(payment_type== "CRD")

tips_from_card_day$ratio <- round(tips_from_card_day$tips/(tips_from_card_day$total_fare-tips_from_card_day$tips),3)
ggplot(tips_from_card_day, aes(x=day,y=ratio))+geom_bar(stat = "identity",fill= "white", color= "blue")+geom_hline(yintercept=0.25,color = "green",size = 1)+theme_bw()+labs(title = "Average tip as a percertage of fare by day from credit card in December")+guides(fill=guide_legend(title=""))+ylab("ratio(tip/fare)")+geom_segment(aes(x = 31, y = 0.15, xend = 25, yend = 0.05),
                  arrow = arrow(length = unit(0.5, "cm")),size = 2,color = "yellow")+annotate("text", x = 31, y = 0.175, label = "X'mas",size = 5.5,color = "red")