Sys.setenv(SPARK_HOME = "C://Users/dannyhuang/Desktop/spark-2.1.0-bin-hadoop2.7")
library(SparkR, lib.loc = c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib")))
##
## Attaching package: 'SparkR'
## The following objects are masked from 'package:stats':
##
## cov, filter, lag, na.omit, predict, sd, var, window
## The following objects are masked from 'package:base':
##
## as.data.frame, colnames, colnames<-, drop, endsWith,
## intersect, rank, rbind, sample, startsWith, subset, summary,
## transform, union
library(SparkR)
sparkR.session(master = "local[*]", enableHiveSupport = FALSE,sparkConfig = list(spark.executor.instances = '70',
spark.driver.memory = '4g',
spark.yarn.executor.memoryOverhead = '8000',spark.sql.warehouse.dir = "d:/winutils/bin",sparkPackages = "com.databricks:spark-csv_2.10:1.3.0"))
## Spark package found in SPARK_HOME: C://Users/dannyhuang/Desktop/spark-2.1.0-bin-hadoop2.7
## Launching java with spark-submit command C://Users/dannyhuang/Desktop/spark-2.1.0-bin-hadoop2.7/bin/spark-submit2.cmd --driver-memory "4g" sparkr-shell C:\Users\DANNYH~1\AppData\Local\Temp\RtmpwVCsb6\backend_port60687f614be4
## Java ref type org.apache.spark.sql.SparkSession id 1
trip_fare<- read.df(x= "C://Users/dannyhuang/Downloads/trip_fare_12.csv", source = "csv" ,
header = "true", inferSchema = "true") #13971118 X 11
createOrReplaceTempView (trip_fare, "trip_fare") #13971118X 11
cacheTable("trip_fare")
sql("select *from trip_fare")
## SparkDataFrame[medallion:string, hack_license:string, vendor_id:string, pickup_datetime:timestamp, payment_type:string, fare_amount:double, surcharge:double, mta_tax:double, tip_amount:double, tolls_amount:double, total_amount:double]
Fare Analysis
trip_stats_by_day<-head(trip_stats_by_day,nrow(trip_stats_by_day))
trip_stats_by_day$payment_type <- as.factor(trip_stats_by_day$payment_type)
trip_stats_by_day$payment_type <- factor(trip_stats_by_day$payment_type, levels(trip_stats_by_day$payment_type)[c(5:1)])
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:SparkR':
##
## arrange, between, collect, contains, count, cume_dist,
## dense_rank, desc, distinct, explain, filter, first, group_by,
## intersect, lag, last, lead, mutate, n, n_distinct, ntile,
## percent_rank, rename, row_number, sample_frac, select, sql,
## summarize, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
trip_stats_by_day$date <- paste(trip_stats_by_day$year,trip_stats_by_day$month,trip_stats_by_day$day, sep = "/")
trip_stats_by_day$date <- as.Date(trip_stats_by_day$date)
Sys.setlocale("LC_TIME","US")
## [1] "English_United States.1252"
trip_stats_by_day$weekdays <- weekdays(trip_stats_by_day$date)
trip_stats_by_day$weekdays<- factor(trip_stats_by_day$weekdays, levels = c("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday" ,"Sunday"))
total_fare_by_hour <- trip_stats_by_day %>% select(hour,total_amount,payment_type,Tips,trips) %>% group_by(hour,payment_type) %>% summarise(total_fare = sum(total_amount),trips= sum(trips),tips = sum(Tips))
#total_fare_by_hour.pct$payment_type <- as.factor(total_fare_by_hour.pct$payment_type)
#total_fare_by_hour.pct$payment_type <- factor(total_fare_by_hour.pct$payment_type, levels(total_fare_by_hour.pct$payment_type)[c(5:1)])
ggplot(total_fare_by_hour, aes(x=hour,y=total_fare_by_hour$total_fare,fill = payment_type))+geom_bar(stat = "identity", position = "stack")+labs(title="Total Fare with shares of Payment Type by hour")+xlab("Hour")+ylab("Total Fare")+guides(fill=guide_legend(title="Payment Type"))+theme_bw()

total_fare_by_hour.pct <- total_fare_by_hour %>% select(hour,total_fare) %>% group_by(hour) %>% summarise(sum(total_fare))
total_fare_by_hour.pct<-left_join(total_fare_by_hour,total_fare_by_hour.pct, by=c("hour"="hour"))
total_fare_by_hour.pct$pct<- round(total_fare_by_hour.pct$total_fare/total_fare_by_hour.pct$`sum(total_fare)`,3)
ggplot(total_fare_by_hour.pct, aes(x=hour,y=total_fare_by_hour.pct$pct,fill = total_fare_by_hour.pct$payment_type )) + geom_bar(stat = "identity", position = "stack")+labs(title="Total Fare with shares of Payment Type in % by hour")+xlab("Hour")+ylab("Total Fare%")+guides(fill=guide_legend(title="Payment Type"))+theme_light()

total_fare_by_day <- trip_stats_by_day %>%select(day, total_amount,payment_type,trips) %>% group_by(day,payment_type) %>% summarise(total_fare = sum(total_amount),trips= sum(trips))
ggplot(total_fare_by_day, aes(x=day,y=total_fare_by_day$total_fare,fill = payment_type)) + geom_bar(stat = "identity", position = "stack")+guides(fill=guide_legend(title="Payment Type"))+xlab("Day")+ylab("Total Fare")+labs(title ="Total Fare by Day with shares of Payment Type ")+theme_bw()+geom_segment(aes(x = 25, y = 7500000, xend = 25, yend = 3750000),
arrow = arrow(length = unit(0.5, "cm")),size = 2,color = "yellow")+annotate("text", x = 25, y = 8000000, label = "X'mas",size = 5.5,color = "red")

total_fare_by_day.pct <- total_fare_by_day %>% select(day,total_fare) %>% group_by(day) %>% summarise(sum(total_fare))
total_fare_by_day.pct<-left_join(total_fare_by_day,total_fare_by_day.pct, by=c("day"="day"))
total_fare_by_day.pct$pct<- round(total_fare_by_day.pct$total_fare/total_fare_by_day.pct$`sum(total_fare)`,3)
ggplot(total_fare_by_day.pct, aes(x=day,y=total_fare_by_day.pct$pct,fill = total_fare_by_day.pct$payment_type )) + geom_bar(stat = "identity", position = "stack")+labs(title="Total Fare with shares of Payment Type (%) by day")+xlab("Day")+ylab("Total Fare%")+guides(fill=guide_legend(title="Payment Type"))+theme_bw()+geom_hline(yintercept=0.5,color = "red",size = 1,linetype = "solid")+geom_segment(aes(x = 31, y = 0.9, xend = 25, yend = 0.7),
arrow = arrow(length = unit(0.5, "cm")),size = 2,color = "yellow")

trips <-trip_stats_by_day %>% select(day,trips,payment_type) %>% group_by(day,payment_type) %>% summarise(trips =sum(trips)) %>% filter(payment_type =="CSH"|payment_type=="CRD")
trips
## Source: local data frame [62 x 3]
## Groups: day [31]
##
## day payment_type trips
## <int> <fctr> <dbl>
## 1 1 CSH 193188
## 2 1 CRD 202175
## 3 2 CSH 186685
## 4 2 CRD 237293
## 5 3 CSH 194714
## 6 3 CRD 265799
## 7 4 CSH 194440
## 8 4 CRD 274093
## 9 5 CSH 205707
## 10 5 CRD 288836
## # ... with 52 more rows
ggplot(trips,aes(day,trips ,fill= payment_type,color= payment_type))+geom_line(linetype = "solid")+annotate("rect", xmin = 24, xmax = 26, ymin = 0, ymax = Inf,alpha = .2,colors = "red")+labs(title = "Cash or Card In Every Transcations by Day")+theme_bw()
## Warning: Ignoring unknown parameters: colors

Tips Analysis
tips_by_payment <- trip_stats_by_day %>% select(payment_type,Tips,total_amount,trips) %>% group_by(payment_type) %>% summarise(tips = sum(Tips),fare= sum(total_amount),trips = sum(trips))
library(dplyr)
Total_tips_by_hour <- trip_stats_by_day %>%select(hour, total_amount,payment_type,Tips,trips) %>% group_by(hour,payment_type) %>% summarise(total_fare = sum(total_amount),trips= sum(trips),tips = sum(Tips))
ggplot(Total_tips_by_hour, aes(x=hour,y=Total_tips_by_hour$tips,fill = payment_type))+geom_bar(stat = "identity", position = "stack")+xlab("Hour")+ylab("Total Tips")+guides(fill=guide_legend(title="Payment Type"))+labs(title="Total Tips in December by hour ")+theme_bw()

tips_from_card_hour <-trip_stats_by_day %>% select(hour, total_fare,payment_type,Tips,trips) %>% group_by(hour,payment_type) %>% summarise(total_fare = sum(total_fare),trips= sum(trips),tips = sum(Tips)) %>%filter(payment_type== "CRD")
tips_from_card_hour$ratio <- round(tips_from_card_hour$tips/(tips_from_card_hour$total_fare-tips_from_card_hour$tips),4)
ggplot(tips_from_card_hour, aes(x=hour,y=ratio))+geom_bar(stat = "identity",fill= "white", color= "blue")+theme_bw()+labs(title = "Average tip as a percertage of fare by hour from credit card in December")+guides(fill=guide_legend(title=""))+ylab("ratio(tip/fare)")+geom_hline(yintercept=0.2411,color = "red",size = 1)

tips_from_card_day <-trip_stats_by_day %>% select(day, total_fare,payment_type,Tips,trips) %>% group_by(day,payment_type) %>% summarise(total_fare = sum(total_fare),trips= sum(trips),tips = sum(Tips)) %>%filter(payment_type== "CRD")
tips_from_card_day$ratio <- round(tips_from_card_day$tips/(tips_from_card_day$total_fare-tips_from_card_day$tips),3)
ggplot(tips_from_card_day, aes(x=day,y=ratio))+geom_bar(stat = "identity",fill= "white", color= "blue")+geom_hline(yintercept=0.25,color = "green",size = 1)+theme_bw()+labs(title = "Average tip as a percertage of fare by day from credit card in December")+guides(fill=guide_legend(title=""))+ylab("ratio(tip/fare)")+geom_segment(aes(x = 31, y = 0.15, xend = 25, yend = 0.05),
arrow = arrow(length = unit(0.5, "cm")),size = 2,color = "yellow")+annotate("text", x = 31, y = 0.175, label = "X'mas",size = 5.5,color = "red")



