LOADING DATA AND SEEING HEAD RECORDS

library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
library(RCurl)
## Loading required package: bitops
library(ggplot2)
data<- read.csv("C:\\Users\\jkks9\\Documents\\DATA 607\\v_taxi_daily-1.csv")
head(data)


##WHICH TAXI TYPE (YELLOW TAXI, FOR HIRE VEHICLES, GREEN TAXI) HAS THE MOST NUMBER OF RIDES ANNUALY FOR EACH YEAR FROM 2014-2018?


x<-sqldf('SELECT yr,taxi,sum(rides) FROM data WHERE Yr between 2014 and 2018 GROUP BY yr,taxi')
x2<-sqldf('WITH TAXI AS(SELECT yr,taxi,sum(rides) as rides, DENSE_RANK() OVER(PARTITION BY Yr ORDER BY rides desc) as Rank1 FROM data WHERE Yr between 2014 and 2018 GROUP BY yr,taxi) select * FROM TAXI WHERE Rank1=1')
x2
ggplot(data=x2, aes(x=yr, y=rides, fill=taxi)) +
    geom_bar(stat="identity")


###YELLOW TAXI had the most rides from 2014-2015 with FOR HIRE VEHICLES gaining a bigger market share starting in 2016.


###WHAT IS THE RIDE DISTRIBUTION FOR EACH OF THE TAXI TYPES FROM 2014 - 2018?


library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
x3<-sqldf('WITH TAXI AS(SELECT yr,taxi,sum(rides) as rides FROM data WHERE Yr between 2014 and 2018 GROUP BY yr,taxi),TAXI2 AS(select *,SUM(rides) OVER(PARTITION BY yr) as TotalRidesYr FROM TAXI) SELECT * FROM TAXI2')
x3
x4<-mutate(x3,x3$rides/x3$TotalRidesYr)
x4
ggplot(x4, aes(fill=yr, y=x3$rides/x3$TotalRidesYr, x=taxi)) + 
    geom_bar(position="fill", stat="identity")


###YELLOW TAXI had the higest ride distribution early on from 2014 to 2015 but now FOR HIRE VEHICLES has the highest distribution starting in 2016.


##LOADING DATA AND SEEING HEAD RECORDS


data2<- read.csv("C:\\Users\\jkks9\\Documents\\DATA 607\\v_taxi_daily-2.csv")
head(data2)


##WHICH MONTH HAS THE MOST NUMBER OF RIDES FOR EACH YEAR FOR ALL TAXIS FROM 2014-2018?


xx<-sqldf('SELECT yr,mth,AVG(rides) FROM data2 WHERE Yr between 2014 and 2018 GROUP BY yr,mth')
xx2<-sqldf('WITH TAXI AS(SELECT yr,mth,sum(rides) as rides,DENSE_RANK() OVER(PARTITION BY Yr ORDER BY rides desc) as rank1 FROM data2 WHERE Yr between 2014 and 2018 GROUP BY yr,mth) SELECT * FROM TAXI WHERE rank1=1 ORDER BY Yr')
xx2
ggplot(data=xx2, aes(x=yr, y=rides, fill=mth)) +
    geom_bar(stat="identity")


###The months that are the most busiest for taxi ridership is either November or December for the time period of 2014-2018.