LOADING DATA AND SEEING HEAD RECORDS
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
library(RCurl)
## Loading required package: bitops
library(ggplot2)
data<- read.csv("C:\\Users\\jkks9\\Documents\\DATA 607\\v_taxi_daily-1.csv")
head(data)
##WHICH TAXI TYPE (YELLOW TAXI, FOR HIRE VEHICLES, GREEN TAXI) HAS THE MOST NUMBER OF RIDES ANNUALY FOR EACH YEAR FROM 2014-2018?
x<-sqldf('SELECT yr,taxi,sum(rides) FROM data WHERE Yr between 2014 and 2018 GROUP BY yr,taxi')
x2<-sqldf('WITH TAXI AS(SELECT yr,taxi,sum(rides) as rides, DENSE_RANK() OVER(PARTITION BY Yr ORDER BY rides desc) as Rank1 FROM data WHERE Yr between 2014 and 2018 GROUP BY yr,taxi) select * FROM TAXI WHERE Rank1=1')
x2
ggplot(data=x2, aes(x=yr, y=rides, fill=taxi)) +
geom_bar(stat="identity")
###YELLOW TAXI had the most rides from 2014-2015 with FOR HIRE VEHICLES gaining a bigger market share starting in 2016.
###WHAT IS THE RIDE DISTRIBUTION FOR EACH OF THE TAXI TYPES FROM 2014 - 2018?
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
x3<-sqldf('WITH TAXI AS(SELECT yr,taxi,sum(rides) as rides FROM data WHERE Yr between 2014 and 2018 GROUP BY yr,taxi),TAXI2 AS(select *,SUM(rides) OVER(PARTITION BY yr) as TotalRidesYr FROM TAXI) SELECT * FROM TAXI2')
x3
x4<-mutate(x3,x3$rides/x3$TotalRidesYr)
x4
ggplot(x4, aes(fill=yr, y=x3$rides/x3$TotalRidesYr, x=taxi)) +
geom_bar(position="fill", stat="identity")
###YELLOW TAXI had the higest ride distribution early on from 2014 to 2015 but now FOR HIRE VEHICLES has the highest distribution starting in 2016.
##LOADING DATA AND SEEING HEAD RECORDS
data2<- read.csv("C:\\Users\\jkks9\\Documents\\DATA 607\\v_taxi_daily-2.csv")
head(data2)
##WHICH MONTH HAS THE MOST NUMBER OF RIDES FOR EACH YEAR FOR ALL TAXIS FROM 2014-2018?
xx<-sqldf('SELECT yr,mth,AVG(rides) FROM data2 WHERE Yr between 2014 and 2018 GROUP BY yr,mth')
xx2<-sqldf('WITH TAXI AS(SELECT yr,mth,sum(rides) as rides,DENSE_RANK() OVER(PARTITION BY Yr ORDER BY rides desc) as rank1 FROM data2 WHERE Yr between 2014 and 2018 GROUP BY yr,mth) SELECT * FROM TAXI WHERE rank1=1 ORDER BY Yr')
xx2
ggplot(data=xx2, aes(x=yr, y=rides, fill=mth)) +
geom_bar(stat="identity")
###The months that are the most busiest for taxi ridership is either November or December for the time period of 2014-2018.