Loading Uber Data
setwd("D:/imarticus/1/Uber-dataset")
apr_data <- read.csv("uber-raw-data-apr14.csv")
may_data <- read.csv("uber-raw-data-may14.csv")
jun_data <- read.csv("uber-raw-data-jun14.csv")
jul_data <- read.csv("uber-raw-data-jul14.csv")
aug_data <- read.csv("uber-raw-data-aug14.csv")
sep_data <- read.csv("uber-raw-data-sep14.csv")
#now binding all the data in one table
apr_data=apr_data[1:100000,]
may_data=may_data[1:100000,]
jun_data=jun_data[1:100000,]
jul_data=jul_data[1:100000,]
aug_data=aug_data[1:100000,]
sep_data=sep_data[1:100000,]
data_uber=rbind(apr_data,may_data,jun_data,jul_data,aug_data,sep_data)
str(data_uber)
'data.frame': 600000 obs. of 4 variables:
$ Date.Time: Factor w/ 260093 levels "4/1/2014 0:00:00",..: 11 17 21 28 33 33 38 44 54 58 ...
$ Lat : num 40.8 40.7 40.7 40.8 40.8 ...
$ Lon : num -74 -74 -74 -74 -74 ...
$ Base : Factor w/ 5 levels "B02512","B02598",..: 1 1 1 1 1 1 1 1 1 1 ...
Now we have the combined Uber data for 6 months, with 600000 observations of 4 variables.
But since the variable Date.Time is stored as a factor, we have to convert it to a date-time format.
#converting time from m/d/y format to y/m/d
data_uber$Date.Time <- as.POSIXct(data_uber$Date.Time, format = "%m/%d/%Y %H:%M:%S")
head(data_uber)
NA
Now that the data is in date-time format, we can create a new variable holding only the time of day (hour:minute:second).
#making time as adifferent coloumn
data_uber$Time <- format(as.POSIXct(data_uber$Date.Time, format = "%m/%d/%Y %H:%M:%S"), format="%H:%M:%S")
head(data_uber)
Now we have made a separate column for the time, for use in the analysis.
For ggplot we need to extract each date-time component individually and convert it to a factor.
data_uber$Date.Time <- ymd_hms(data_uber$Date.Time)
head(data_uber)
Now we can pull out the individual date components and convert them into factors.
data_uber$day <- factor(day(data_uber$Date.Time))
data_uber$month <- factor(month(data_uber$Date.Time, label = TRUE))
head(data_uber)
In the above table we have added day and month as separate factor variables.
#making tine as a factor in seperate coloumn
data_uber$hour <- factor(hour(hms(data_uber$Time)))
data_uber$minute <- factor(minute(hms(data_uber$Time)))
data_uber$second <- factor(second(hms(data_uber$Time)))
head(data_uber)
Similarly, each time component is now a separate variable stored as a factor. Now, for plotting, we create a new data set grouped by hour.
hour_data <- data_uber %>%
group_by(hour) %>%
dplyr::summarize(Total = n())
head(hour_data)
NA
NA
Now, in order to determine the peak hour at which the maximum number of customers use Uber, we will draw a ggplot graph.
ggplot(hour_data, aes(hour, Total)) +
geom_bar( stat = "identity", fill = "steelblue", color = "red") +
ggtitle("Trips Every Hour") +
theme(legend.position = "none")

NA
From the above plot we can clearly state that the rush hour is between 16.5 hrs and 17.5 hrs (4:30 pm - 5:30 pm).
#Grouping data by month and hour to determine peak hour monthwise
month_hour <- data_uber %>%
group_by(month, hour)%>%
dplyr::summarize(Total = n())
head(month_hour)
View(month_hour)
Now, in order to determine the peak hour month by month, we plot another graph using the above data.
ggplot(month_hour, aes(hour, Total, fill = month)) +
geom_bar( stat = "identity") +
ggtitle("Trips by Hour and Month")

Here we can see that, across all the days of every month, the highest ride counts fall between 16 and 17 or between 17 and 18 hours.
ggplot(month_hour, aes(month, Total)) +
geom_bar( stat = "identity",fill="steelblue") +
ggtitle("Trips by Hour and Month")

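It is very clear from the graph that the number of rides in every month is the same. This is expected, because only the first 100000 rows of each month were kept when the data was combined.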
#making a group data on the basis of day
day_group <- data_uber %>%
group_by(day) %>%
dplyr::summarize(Total = n())
head(day_group)
Now, in order to determine the day with the maximum number of rides, we have to make a plot using the above data.
ggplot(day_group, aes(day, Total)) +
geom_bar( stat = "identity", fill = "steelblue",col="green") +
ggtitle("Trips Every Day") +
theme(legend.position = "none")

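Here we can see that the first week of the month has the maximum number of rides. Note that only the first 100000 rows of each monthly file were kept, so if the files are ordered by time this pattern may be an artifact of the truncation rather than of the ride data itself.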
day_month_group <- data_uber %>%
group_by(month, day) %>%
dplyr:: summarize(Total = n())
head(day_month_group)
Plotting the graph to determine the day with the highest number of rides, month by month.
ggplot(day_month_group,aes(day,Total,fill=month))+geom_bar(stat = "identity")+ scale_fill_manual(values = colors)

Now moving on to the rides per day of the week in every month.
data_month_week=data_uber %>%
group_by(month,dayofweek) %>%
dplyr:: summarise(Total=n())
head(data_month_week)
NA
ggplot(data_uber,aes(month,fill=dayofweek))+geom_bar(position = "dodge")+scale_fill_manual(values = colors)

ggplot(data_month_week,aes(month,Total,fill=dayofweek))+geom_bar(stat = "identity",position = "dodge")

From the above graph we can easily determine the day of the week with the maximum number of rides in each month.
Now, in order to determine the hourly rides of each day, we have to build another data set grouped by day and hour.
day_and_hour <- data_uber %>%
group_by(day, hour) %>%
dplyr::summarize(Total = n())
head(day_and_hour)
Now we can determine the number of rides on an hourly basis for each day using a heat map.
colo="red"
ggplot(day_and_hour, aes(day, hour, fill = Total)) +
geom_tile(color = "black") +
ggtitle("Heat Map by Hour and Day")+scale_color_manual(values = colo,aesthetics = colo )

As we can clearly see in the above graph, during the very first week the number of rides is highest between 4 pm and 8 pm.
ggplot(day_month_group, aes(day, month, fill = Total)) +
geom_tile(color = "darkgrey") +
ggtitle("Heat Map by Month and Day")

From the above graph we can determine the days of the month with the most rides, which in most cases are the first 9 days of the month.
We can also plot a heat map to determine the day of the week with the maximum number of rides in each month.
ggplot(data_month_week, aes(dayofweek, month, fill = Total)) +
geom_tile(color = "black") +
ggtitle("Heat Map by Month and Day of Week")

So from the above heat map we can clearly determine the busiest day of the week in each month.
Now we have to determine the base with the maximum number of rides in each month, and for that we create a new data set.
month_base <- data_uber %>%
group_by(Base, month) %>%
dplyr::summarize(Total = n())
head(month_base)
NA
Now we can plot this data to determine the busiest base in each month.
ggplot(month_base, aes(Base, month, fill = Total)) +
geom_tile(color = "white") +
ggtitle("Heat Map by Month and Bases")

So we can clearly see the busiest base of each month is B02598.
min_lat <- 40.5774
max_lat <- 40.9176
min_long <- -74.15
max_long <- -73.7004
ggplot(data_uber, aes(x=Lon, y=Lat, color = Base)) +
geom_point(size=1) +
scale_x_continuous(limits=c(min_long, max_long)) +
scale_y_continuous(limits=c(min_lat, max_lat)) +
theme_map() +
ggtitle("NYC MAP BASED ON UBER RIDES DURING 2014 (APR-SEP) by BASE")

The map above plots the ride locations, coloured by base.
This is how we can transform the Uber data into useful information using ggplot2, which gives insight into the busiest time of the month, week, day and hour, and into the busiest base.
Conclusion: With the help of the above analysis we can get important insights from the data for decision making, such as where and when to make more Uber cabs available.
---
title: "Uber Analysis"
output: html_notebook
---
Uber Analysis by Data visualization
```{r}

# Packages used throughout the notebook.
library(ggplot2)    # plotting
library(dplyr)      # group_by() / summarize() pipelines
library(tidyr)
library(lubridate)  # date-time component helpers (day(), month(), hms(), ...)
# The ggplot2 companion package is called "scales" (lower case, plural);
# `Scale` is not that package, and loading it fails unless it happens to be
# installed separately.
library(scales)
library(DT)
library(ggthemes)   # theme_map()

```


## Making color vectors


```{r}
# Seven-colour palette handed to the scale_fill_manual() calls in the plots.
colors <- c(
  "#CC1011", "#665555", "#05a399", "#cfcaca",
  "#f5e840", "#0683c9", "#e075b0"
)
print(colors)

```

## Loading Uber Data


```{r}
# Directory holding the monthly Uber raw-data CSV files. Paths are built with
# file.path() rather than by calling setwd(), so the working directory of the
# session (and of later chunks) is left untouched.
data_dir <- "D:/imarticus/1/Uber-dataset"

# One data frame per month, April through September 2014.
apr_data <- read.csv(file.path(data_dir, "uber-raw-data-apr14.csv"))
may_data <- read.csv(file.path(data_dir, "uber-raw-data-may14.csv"))
jun_data <- read.csv(file.path(data_dir, "uber-raw-data-jun14.csv"))
jul_data <- read.csv(file.path(data_dir, "uber-raw-data-jul14.csv"))
aug_data <- read.csv(file.path(data_dir, "uber-raw-data-aug14.csv"))
sep_data <- read.csv(file.path(data_dir, "uber-raw-data-sep14.csv"))
```

```{r}
# Keep at most the first 100000 rows of each month, then stack the six months
# into one table. head() is used instead of x[1:100000, ] so that a file with
# fewer rows is not padded with all-NA rows.
# NOTE(review): taking the leading rows is not a random sample; if the files
# are sorted (e.g. by base or by time) the subset may be unrepresentative.
rows_per_month <- 100000
apr_data <- head(apr_data, rows_per_month)
may_data <- head(may_data, rows_per_month)
jun_data <- head(jun_data, rows_per_month)
jul_data <- head(jul_data, rows_per_month)
aug_data <- head(aug_data, rows_per_month)
sep_data <- head(sep_data, rows_per_month)

data_uber <- rbind(apr_data, may_data, jun_data, jul_data, aug_data, sep_data)
str(data_uber)
```
Now we have the combined Uber data for 6 months, with 600000 observations of 4 variables.

But since the variable `Date.Time` is stored as a factor, we have to convert it to a date-time format.
```{r}
# Parse the "m/d/Y H:M:S" text in Date.Time into POSIXct date-times.
# tz = "UTC" is given explicitly so the result does not depend on the local
# time zone of the machine running the notebook (a wall-clock time skipped by
# a local daylight-saving change could otherwise come out as NA or shifted).
data_uber$Date.Time <- as.POSIXct(
  data_uber$Date.Time,
  format = "%m/%d/%Y %H:%M:%S",
  tz = "UTC"
)
head(data_uber)

```
Now that the data is in date-time format, we can create a new variable holding only the time of day (hour:minute:second).
```{r}
# Add a text column holding only the clock time ("HH:MM:SS") of each trip.
data_uber[["Time"]] <- format(
  as.POSIXct(data_uber[["Date.Time"]], format = "%m/%d/%Y %H:%M:%S"),
  format = "%H:%M:%S"
)
head(data_uber)
```
Now we have made a separate column for the time, for use in the analysis.

For ggplot we need to extract each date-time component individually and convert it to a factor.
```{r}
# Date.Time is already POSIXct at this point, so it does not need to be
# parsed again. ymd_hms() is a text parser: handing it a POSIXct forces a
# round trip through character, and values that print without a time part
# (e.g. exact midnight) can then fail to parse and turn into NA.
# force_tz() gives what the re-parse was after: the same clock times,
# labelled as UTC (the default zone of ymd_hms()).
data_uber$Date.Time <- force_tz(data_uber$Date.Time, tzone = "UTC")
head(data_uber)
```
Now we can pull out the individual date components and convert them into factors.
```{r}
# Day of month and month name (labelled), each stored as a factor column.
data_uber[["day"]] <- factor(day(data_uber[["Date.Time"]]))
data_uber[["month"]] <- factor(
  month(data_uber[["Date.Time"]], label = TRUE)
)
head(data_uber)
```
In the above table we have added day and month as separate factor variables.
```{r}
# Year and day-of-week name (labelled), each stored as a factor column.
data_uber[["year"]] <- factor(year(data_uber[["Date.Time"]]))
data_uber[["dayofweek"]] <- factor(
  wday(data_uber[["Date.Time"]], label = TRUE)
)
head(data_uber)
```

```{r}
# Split the clock time into hour, minute and second factor columns.
# The Time strings are parsed once and the resulting period object is reused,
# instead of re-parsing the whole column for every component.
time_of_day <- hms(data_uber$Time)
data_uber$hour <- factor(hour(time_of_day))
data_uber$minute <- factor(minute(time_of_day))
data_uber$second <- factor(second(time_of_day))
rm(time_of_day)  # temporary helper, not needed by later chunks
head(data_uber)
```
Similarly, each time component is now a separate variable stored as a factor.
Now, for plotting, we create a new data set grouped by hour.
```{r}
# Number of trips recorded in each hour of the day.
hour_data <- dplyr::summarize(group_by(data_uber, hour), Total = n())
head(hour_data)


```
Now, in order to determine the peak hour at which the maximum number of customers use Uber, we will draw a ggplot graph.
```{r}
# Bar chart of total trips per hour of day (one bar per hour, no legend).
ggplot(data = hour_data, mapping = aes(x = hour, y = Total)) +
  geom_bar(stat = "identity", colour = "red", fill = "steelblue") +
  labs(title = "Trips Every Hour") +
  theme(legend.position = "none")
          
```
From the above plot we can clearly state that the rush hour is between 16.5 hrs and 17.5 hrs (4:30 pm - 5:30 pm).
```{r}
# Count trips for every month/hour combination so the peak hour can be
# compared month by month.
month_hour <- data_uber %>%
  group_by(month, hour) %>%
  dplyr::summarize(Total = n()) %>%
  ungroup()  # summarize() only drops the last grouping level; clear the rest
head(month_hour)
# View() opens an interactive data viewer, which is not available when the
# document is rendered non-interactively, so only call it in a live session.
if (interactive()) {
  View(month_hour)
}
```
Now, in order to determine the peak hour month by month, we plot another graph using the above data.
```{r}
# One bar per hour of day, with the fill colour showing each month's share.
ggplot(data = month_hour, mapping = aes(x = hour, y = Total, fill = month)) +
  labs(title = "Trips by Hour and Month") +
  geom_bar(stat = "identity")
```
Here we can see that, across all the days of every month, the highest ride counts fall between 16 and 17 or between 17 and 18 hours.
```{r}
# Total trips per month: the hourly counts in month_hour are stacked into one
# bar per month. The title now says what is plotted; the previous
# "Trips by Hour and Month" was carried over from the hour-by-month chart.
ggplot(month_hour, aes(month, Total)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  ggtitle("Trips by Month")
```
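It is very clear from the graph that the number of rides in every month is the same. This is expected, because only the first 100000 rows of each month were kept when the data was combined.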
```{r}
# Total number of trips for each day of the month (all months pooled).
day_group <- dplyr::summarize(group_by(data_uber, day), Total = n())
head(day_group)
```
Now, in order to determine the day with the maximum number of rides, we have to make a plot using the above data.
```{r}
# Bar chart of total trips per day of month (no legend).
ggplot(data = day_group, mapping = aes(x = day, y = Total)) +
  geom_bar(stat = "identity", colour = "green", fill = "steelblue") +
  labs(title = "Trips Every Day") +
  theme(legend.position = "none")
```
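Here we can see that the first week of the month has the maximum number of rides. Note that only the first 100000 rows of each monthly file were kept, so if the files are ordered by time this pattern may be an artifact of the truncation rather than of the ride data itself.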
```{r}
# Count trips for every month/day-of-month combination.
day_month_group <- data_uber %>%
  group_by(month, day) %>%
  dplyr::summarize(Total = n()) %>%
  ungroup()  # summarize() only drops the last grouping level; clear the rest
head(day_month_group)
```
Plotting the graph to determine the day with the highest number of rides, month by month.
```{r}
# Trips per day of month, filled by month using the notebook's colour palette.
ggplot(
  data = day_month_group,
  mapping = aes(x = day, y = Total, fill = month)
) +
  scale_fill_manual(values = colors) +
  geom_bar(stat = "identity")
```
Now moving on to the rides per day of the week in every month.
```{r}
# Count trips for every month/day-of-week combination.
data_month_week <- data_uber %>%
  group_by(month, dayofweek) %>%
  dplyr::summarise(Total = n()) %>%
  ungroup()  # summarise() only drops the last grouping level; clear the rest
head(data_month_week)
  
```

```{r}
# Trip counts per month straight from the raw rows, with one side-by-side bar
# per day of the week.
ggplot(data = data_uber, mapping = aes(x = month, fill = dayofweek)) +
  scale_fill_manual(values = colors) +
  geom_bar(position = "dodge")
```

```{r}
# Same month / day-of-week comparison, drawn from the pre-computed totals.
ggplot(
  data = data_month_week,
  mapping = aes(x = month, y = Total, fill = dayofweek)
) +
  geom_bar(position = "dodge", stat = "identity")
```
From the above graph we can easily determine the day of the week with the maximum number of rides in each month.

Now, in order to determine the hourly rides of each day, we have to build another data set grouped by day and hour.
```{r}
# Count trips for every day-of-month/hour combination.
day_and_hour <- data_uber %>%
  group_by(day, hour) %>%
  dplyr::summarize(Total = n()) %>%
  ungroup()  # summarize() only drops the last grouping level; clear the rest
head(day_and_hour)
```
Now we can determine the number of rides on an hourly basis for each day using a heat map.
```{r}
# Heat map of trip counts: day of month on x, hour of day on y.
# The tiles are coloured through the continuous `fill` aesthetic, so the red
# colouring has to be applied with a fill gradient. The previous
# scale_color_manual(values = colo, aesthetics = colo) named "red" as the
# aesthetic to scale, which matches nothing and left the plot unchanged.
colo <- "red"
ggplot(day_and_hour, aes(day, hour, fill = Total)) +
  geom_tile(color = "black") +
  scale_fill_gradient(low = "white", high = colo) +
  ggtitle("Heat Map by Hour and Day")
```
As we can clearly see in the above graph, during the very first week the number of rides is highest between 4 pm and 8 pm.

```{r}
# Heat map of trip counts: day of month on x, month on y.
ggplot(
  data = day_month_group,
  mapping = aes(x = day, y = month, fill = Total)
) +
  labs(title = "Heat Map by Month and Day") +
  geom_tile(colour = "darkgrey")
```
From the above graph we can determine the days of the month with the most rides, which in most cases are the first 9 days of the month.

We can also plot a heat map to determine the day of the week with the maximum number of rides in each month.
```{r}
# Heat map of trip counts: day of week on x, month on y.
ggplot(
  data = data_month_week,
  mapping = aes(x = dayofweek, y = month, fill = Total)
) +
  labs(title = "Heat Map by Month and Day of Week") +
  geom_tile(colour = "black")
```
So from the above heat map we can clearly determine the busiest day of the week in each month.

Now we have to determine the base with the maximum number of rides in each month, and for that we create a new data set.
```{r}
# Count trips for every base/month combination.
month_base <- data_uber %>%
  group_by(Base, month) %>%
  dplyr::summarize(Total = n()) %>%
  ungroup()  # summarize() only drops the last grouping level; clear the rest
head(month_base)

```
Now we can plot this data to determine the busiest base in each month.
```{r}
# Heat map of trip counts: base on x, month on y.
ggplot(data = month_base, mapping = aes(x = Base, y = month, fill = Total)) +
  labs(title = "Heat Map by Month and Bases") +
  geom_tile(colour = "white")
```
So we can clearly see that the busiest base in each month is B02598.
```{r}
# Latitude/longitude window used as the axis limits of the map plot.
min_lat <- 40.5774
max_lat <- 40.9176
min_long <- -74.15
max_long <- -73.7004

# Scatter of ride coordinates coloured by base, restricted to the window
# above and drawn with the map theme.
ggplot(data = data_uber, mapping = aes(x = Lon, y = Lat, colour = Base)) +
  geom_point(size = 1) +
  scale_y_continuous(limits = c(min_lat, max_lat)) +
  scale_x_continuous(limits = c(min_long, max_long)) +
  labs(title = "NYC MAP BASED ON UBER RIDES DURING 2014 (APR-SEP) by BASE") +
  theme_map()
```
The map above plots the ride locations, coloured by base.

This is how we can transform the Uber data into useful information using ggplot2, which gives insight into the busiest time of the month, week, day and hour, and into the busiest base.

Conclusion:
With the help of the above analysis we can get important insights from the data for decision making, such as where and when to make more Uber cabs
available.

