Firstly read the 5 datasets:

#US holdiay dataset
hol<-read.csv("US Bank holidays.csv")
hol$X2012.01.02<-as.Date(hol$X2012.01.02)
names(hol)[names(hol) == "X2012.01.02"] <- "date"

#cab dataset
cab<-read.csv("Cab_Data.csv")
cab$City<-factor(cab$City)
cab$Company<-factor(cab$Company)
profit<- cab$Price.Charged-cab$Cost.of.Trip #obtaining profit variable
cab<-cbind(cab,profit) #add the profit to the dataset

#city dataset
library(readxl)
city <- read_excel("city.xlsx")
city$City<-factor(city$City)

#customer dataset
customer<-read.csv("Customer_ID.csv")
customer$Gender<-factor(customer$Gender)


#transaction dataset
transaction<-read.csv("Transaction_ID.csv")
transaction$Payment_Mode<-factor(transaction$Payment_Mode)

# long ve latitude of the cities
df <- read.csv('https://raw.githubusercontent.com/plotly/datasets/master/2011_february_us_airport_traffic.csv')

Let’s look at how the datasets looks like:

head(hol)
##   X1       date                          New.Year.Day
## 1  2 2012-01-16            Martin Luther King Jr. Day
## 2  3 2012-02-20 Presidents Day (Washingtons Birthday)
## 3  4 2012-05-28                          Memorial Day
## 4  5 2012-07-04                      Independence Day
## 5  6 2012-09-03                             Labor Day
## 6  7 2012-10-08                          Columbus Day
head(cab)
##   Transaction.ID  Company       City KM.Travelled Price.Charged Cost.of.Trip
## 1       10000011 Pink Cab ATLANTA GA        30.45        370.95      313.635
## 2       10000012 Pink Cab ATLANTA GA        28.62        358.52      334.854
## 3       10000013 Pink Cab ATLANTA GA         9.04        125.20       97.632
## 4       10000014 Pink Cab ATLANTA GA        33.17        377.40      351.602
## 5       10000015 Pink Cab ATLANTA GA         8.73        114.62       97.776
## 6       10000016 Pink Cab ATLANTA GA         6.06         72.43       63.024
##   profit       date
## 1 57.315 2016-01-08
## 2 23.666 2016-01-06
## 3 27.568 2016-01-02
## 4 25.798 2016-01-07
## 5 16.844 2016-01-03
## 6  9.406 2016-01-07
head(city)
## # A tibble: 6 x 3
##   City           Population  Users
##   <fct>               <dbl>  <dbl>
## 1 NEW YORK NY       8405837 302149
## 2 CHICAGO IL        1955130 164468
## 3 LOS ANGELES CA    1595037 144132
## 4 MIAMI FL          1339155  17675
## 5 SILICON VALLEY    1177609  27247
## 6 ORANGE COUNTY     1030185  12994
head(customer)
##   Customer.ID Gender Age Income..USD.Month.
## 1       29290   Male  28              10813
## 2       27703   Male  27               9237
## 3       28712   Male  53              11242
## 4       28020   Male  23              23327
## 5       27182   Male  33               8536
## 6       27318   Male  25              13984
head(transaction)
##   Transaction.ID Customer.ID Payment_Mode
## 1       10000011       29290         Card
## 2       10000012       27703         Card
## 3       10000013       28712         Cash
## 4       10000014       28020         Cash
## 5       10000015       27182         Card
## 6       10000016       27318         Cash
head(df)
##   iata                           airport              city state country
## 1  ORD      Chicago O'Hare International           Chicago    IL     USA
## 2  ATL William B Hartsfield-Atlanta Intl           Atlanta    GA     USA
## 3  DFW   Dallas-Fort Worth International Dallas-Fort Worth    TX     USA
## 4  PHX  Phoenix Sky Harbor International           Phoenix    AZ     USA
## 5  DEN                       Denver Intl            Denver    CO     USA
## 6  IAH      George Bush Intercontinental           Houston    TX     USA
##        lat       long   cnt
## 1 41.97960  -87.90446 25129
## 2 33.64044  -84.42694 21925
## 3 32.89595  -97.03720 20662
## 4 33.43417 -112.00806 17290
## 5 39.85841 -104.66700 13781
## 6 29.98047  -95.33972 13223
dim(cab)
## [1] 359392      8
dim(city)
## [1] 20  3
dim(customer)
## [1] 49171     4
dim(transaction)
## [1] 440098      3
dim(hol)
## [1] 89  3
anyNA(hol) #no NA
## [1] FALSE
anyNA(city) #no NA
## [1] FALSE
anyNA(customer) #no NA
## [1] FALSE
anyNA(cab) #no NA
## [1] FALSE
anyNA(transaction) #no NA
## [1] FALSE

All of the dataset do not have NA value.

Then merge the 5 datasets:

## merging datasets
library(dplyr)
city = inner_join(city, lat_long, by = "City", suffix = c("_city","_lat_long"))
ij = inner_join(transaction, customer, by = "Customer.ID", suffix = c("_transaction","_customer"))
all<-inner_join(ij,cab, by = "Transaction.ID", suffix = c("_ij","_cab"))
all<-inner_join(all,city, by = "City", suffix = c("_all","_city"))
all2<-inner_join(all,hol, by = "date", suffix = c("_all","_hol"))
names(all)[names(all) == "Income..USD.Month."] <- "income"

head(all)
##   Transaction.ID Customer.ID Payment_Mode Gender Age income  Company       City
## 1       10000011       29290         Card   Male  28  10813 Pink Cab ATLANTA GA
## 2       10000012       27703         Card   Male  27   9237 Pink Cab ATLANTA GA
## 3       10000013       28712         Cash   Male  53  11242 Pink Cab ATLANTA GA
## 4       10000014       28020         Cash   Male  23  23327 Pink Cab ATLANTA GA
## 5       10000015       27182         Card   Male  33   8536 Pink Cab ATLANTA GA
## 6       10000016       27318         Cash   Male  25  13984 Pink Cab ATLANTA GA
##   KM.Travelled Price.Charged Cost.of.Trip profit       date Population Users
## 1        30.45        370.95      313.635 57.315 2016-01-08     814885 24701
## 2        28.62        358.52      334.854 23.666 2016-01-06     814885 24701
## 3         9.04        125.20       97.632 27.568 2016-01-02     814885 24701
## 4        33.17        377.40      351.602 25.798 2016-01-07     814885 24701
## 5         8.73        114.62       97.776 16.844 2016-01-03     814885 24701
## 6         6.06         72.43       63.024  9.406 2016-01-07     814885 24701
##        lat      long
## 1 33.64044 -84.42694
## 2 33.64044 -84.42694
## 3 33.64044 -84.42694
## 4 33.64044 -84.42694
## 5 33.64044 -84.42694
## 6 33.64044 -84.42694
tail(all)
##        Transaction.ID Customer.ID Payment_Mode Gender Age income    Company
## 359387       10440100       51852         Card   Male  46   6165 Yellow Cab
## 359388       10440101       52392         Cash   Male  24  15651 Yellow Cab
## 359389       10440104       53286         Cash   Male  32   6528 Yellow Cab
## 359390       10440105       52265         Cash   Male  56   7966 Yellow Cab
## 359391       10440106       52175         Card   Male  32   6423 Yellow Cab
## 359392       10440107       52917         Card   Male  20  11284 Yellow Cab
##                 City KM.Travelled Price.Charged Cost.of.Trip   profit
## 359387 WASHINGTON DC        28.71        452.19     351.4104 100.7796
## 359388 WASHINGTON DC         4.80         69.24      63.3600   5.8800
## 359389 WASHINGTON DC         8.40        113.75     106.8480   6.9020
## 359390 WASHINGTON DC        27.75        437.07     349.6500  87.4200
## 359391 WASHINGTON DC         8.80        146.19     114.0480  32.1420
## 359392 WASHINGTON DC        12.76        191.58     177.6192  13.9608
##              date Population  Users     lat     long
## 359387 2018-01-07     418859 127001 38.8951 -77.0364
## 359388 2018-01-08     418859 127001 38.8951 -77.0364
## 359389 2018-01-04     418859 127001 38.8951 -77.0364
## 359390 2018-01-05     418859 127001 38.8951 -77.0364
## 359391 2018-01-05     418859 127001 38.8951 -77.0364
## 359392 2018-01-02     418859 127001 38.8951 -77.0364
dim(all)
## [1] 359392     17
range(all$date)
## [1] "2016-01-02" "2018-12-31"
summary(all)
##  Transaction.ID      Customer.ID    Payment_Mode     Gender      
##  Min.   :10000011   Min.   :    1   Card:215504   Female:153480  
##  1st Qu.:10110810   1st Qu.: 2705   Cash:143888   Male  :205912  
##  Median :10221036   Median : 7459                                
##  Mean   :10220761   Mean   :19192                                
##  3rd Qu.:10330937   3rd Qu.:36078                                
##  Max.   :10440107   Max.   :60000                                
##       Age            income            Company           City          
##  Min.   :18.00   Min.   : 2000   Pink Cab  : 84711   Length:359392     
##  1st Qu.:25.00   1st Qu.: 8424   Yellow Cab:274681   Class :character  
##  Median :33.00   Median :14685                       Mode  :character  
##  Mean   :35.34   Mean   :15049                                         
##  3rd Qu.:42.00   3rd Qu.:21035                                         
##  Max.   :65.00   Max.   :35000                                         
##   KM.Travelled   Price.Charged     Cost.of.Trip       profit       
##  Min.   : 1.90   Min.   :  15.6   Min.   : 19.0   Min.   :-220.06  
##  1st Qu.:12.00   1st Qu.: 206.4   1st Qu.:151.2   1st Qu.:  28.01  
##  Median :22.44   Median : 386.4   Median :282.5   Median :  81.96  
##  Mean   :22.57   Mean   : 423.4   Mean   :286.2   Mean   : 137.25  
##  3rd Qu.:32.96   3rd Qu.: 583.7   3rd Qu.:413.7   3rd Qu.: 190.03  
##  Max.   :48.00   Max.   :2048.0   Max.   :691.2   Max.   :1463.97  
##       date              Population          Users             lat       
##  Min.   :2016-01-02   Min.   : 248968   Min.   :   927   Min.   :25.79  
##  1st Qu.:2016-11-23   1st Qu.: 671238   1st Qu.: 80021   1st Qu.:33.94  
##  Median :2017-09-10   Median :1595037   Median :144132   Median :40.64  
##  Mean   :2017-08-17   Mean   :3132198   Mean   :158296   Mean   :38.55  
##  3rd Qu.:2018-05-12   3rd Qu.:8405837   3rd Qu.:302149   3rd Qu.:41.79  
##  Max.   :2018-12-31   Max.   :8405837   Max.   :302149   Max.   :47.45  
##       long        
##  Min.   :-122.31  
##  1st Qu.:-117.19  
##  Median : -80.29  
##  Mean   : -89.61  
##  3rd Qu.: -73.78  
##  Max.   : -71.01
head(all2)
##   Transaction.ID Customer.ID Payment_Mode Gender Age Income..USD.Month.
## 1       10001395       29150         Card   Male  18              13969
## 2       10001397       35258         Card   Male  33              19129
## 3       10001410       57152         Card   Male  30               4444
## 4       10001417        3242         Cash   Male  38              23148
## 5       10001422        5299         Card   Male  64              24313
## 6       10001425        3968         Cash   Male  30              21443
##    Company       City KM.Travelled Price.Charged Cost.of.Trip  profit
## 1 Pink Cab ATLANTA GA         8.12         96.30       82.012  14.288
## 2 Pink Cab  AUSTIN TX        31.92        573.32      363.888 209.432
## 3 Pink Cab  BOSTON MA         4.52         42.10       51.076  -8.976
## 4 Pink Cab CHICAGO IL        12.35        165.80      135.850  29.950
## 5 Pink Cab CHICAGO IL        44.80        643.13      497.280 145.850
## 6 Pink Cab CHICAGO IL        31.08        442.04      348.096  93.944
##         date Population  Users      lat      long X1               New.Year.Day
## 1 2016-01-18     814885  24701 33.64044 -84.42694 42 Martin Luther King Jr. Day
## 2 2016-01-18     698371  14978 30.19453 -97.66987 42 Martin Luther King Jr. Day
## 3 2016-01-18     248968  80021 42.36435 -71.00518 42 Martin Luther King Jr. Day
## 4 2016-01-18    1955130 164468 41.78598 -87.75242 42 Martin Luther King Jr. Day
## 5 2016-01-18    1955130 164468 41.78598 -87.75242 42 Martin Luther King Jr. Day
## 6 2016-01-18    1955130 164468 41.78598 -87.75242 42 Martin Luther King Jr. Day
tail(all2)
##      Transaction.ID Customer.ID Payment_Mode Gender Age Income..USD.Month.
## 7572       10439483       19735         Card Female  20              21556
## 7573       10439775       37356         Card   Male  63              24505
## 7574       10439812       13687         Cash Female  34              18921
## 7575       10439817       14876         Card Female  50               7955
## 7576       10439876       53382         Card Female  64              15372
## 7577       10439881       53705         Card   Male  40              20936
##         Company           City KM.Travelled Price.Charged Cost.of.Trip   profit
## 7572 Yellow Cab   SAN DIEGO CA        23.52        377.44     318.9312  58.5088
## 7573 Yellow Cab     SEATTLE WA        15.36        236.13     211.9680  24.1620
## 7574 Yellow Cab SILICON VALLEY         7.42        134.12     105.9576  28.1624
## 7575 Yellow Cab SILICON VALLEY        28.84        584.31     411.8352 172.4748
## 7576 Yellow Cab  WASHINGTON DC        10.56        139.90     131.7888   8.1112
## 7577 Yellow Cab  WASHINGTON DC        23.10        316.22     279.9720  36.2480
##            date Population  Users      lat      long X1 New.Year.Day
## 7572 2018-01-01     959307  69995 32.73356 -117.1897 61 New Year Day
## 7573 2018-01-01     671238  25063 47.44898 -122.3093 61 New Year Day
## 7574 2018-01-01    1177609  27247 37.37000 -122.0400 61 New Year Day
## 7575 2018-01-01    1177609  27247 37.37000 -122.0400 61 New Year Day
## 7576 2018-01-01     418859 127001 38.89510  -77.0364 61 New Year Day
## 7577 2018-01-01     418859 127001 38.89510  -77.0364 61 New Year Day
dim(all2)
## [1] 7577   19
summary(all2)
##  Transaction.ID      Customer.ID    Payment_Mode    Gender          Age       
##  Min.   :10001395   Min.   :    1   Card:4515    Female:3217   Min.   :18.00  
##  1st Qu.:10128166   1st Qu.: 2681   Cash:3062    Male  :4360   1st Qu.:25.00  
##  Median :10253748   Median : 7514                              Median :33.00  
##  Mean   :10230055   Mean   :19159                              Mean   :35.42  
##  3rd Qu.:10302493   3rd Qu.:35724                              3rd Qu.:42.00  
##  Max.   :10439881   Max.   :60000                              Max.   :65.00  
##  Income..USD.Month.       Company         City            KM.Travelled  
##  Min.   : 2007      Pink Cab  :1915   Length:7577        Min.   : 1.90  
##  1st Qu.: 8443      Yellow Cab:5662   Class :character   1st Qu.:12.00  
##  Median :14635                        Mode  :character   Median :22.44  
##  Mean   :15033                                           Mean   :22.59  
##  3rd Qu.:20986                                           3rd Qu.:32.98  
##  Max.   :34968                                           Max.   :48.00  
##  Price.Charged     Cost.of.Trip        profit             date           
##  Min.   :  15.6   Min.   : 19.59   Min.   :-133.21   Min.   :2016-01-18  
##  1st Qu.: 201.1   1st Qu.:150.42   1st Qu.:  24.84   1st Qu.:2016-12-25  
##  Median : 377.4   Median :283.18   Median :  75.68   Median :2017-11-10  
##  Mean   : 412.1   Mean   :285.57   Mean   : 126.53   Mean   :2017-09-03  
##  3rd Qu.: 569.4   3rd Qu.:414.12   3rd Qu.: 174.68   3rd Qu.:2018-01-15  
##  Max.   :1681.9   Max.   :679.68   Max.   :1058.11   Max.   :2018-12-25  
##    Population          Users             lat             long        
##  Min.   : 248968   Min.   :   927   Min.   :25.79   Min.   :-122.31  
##  1st Qu.: 698371   1st Qu.: 80021   1st Qu.:33.94   1st Qu.:-117.19  
##  Median :1595037   Median :144132   Median :40.64   Median : -80.29  
##  Mean   :3148905   Mean   :157696   Mean   :38.43   Mean   : -89.63  
##  3rd Qu.:8405837   3rd Qu.:302149   3rd Qu.:41.79   3rd Qu.: -73.78  
##  Max.   :8405837   Max.   :302149   Max.   :47.45   Max.   : -71.01  
##        X1        New.Year.Day      
##  Min.   :42.00   Length:7577       
##  1st Qu.:50.00   Class :character  
##  Median :58.00   Mode  :character  
##  Mean   :56.74                     
##  3rd Qu.:62.00                     
##  Max.   :70.00

Deduplication

“Duplication” means that if we have repeated data in our dataset. This could be due to things like data entry errors or data collection methods.

# get the row numbers of duplicated rows
duplicated_rows <- data_frame(duplicated = duplicated(all), row = 1:nrow(all)) %>%
    filter(duplicated == T)

count(duplicated_rows)
## # A tibble: 1 x 1
##       n
##   <int>
## 1     0
# Plot duplicated rows as black lines
ggplot(duplicated_rows, aes(xintercept = row)) +
    geom_vline(aes(xintercept = row)) + # plot a black line for each duplicated row
    ggtitle("Indexes of duplicated rows") + # add a title
    coord_flip() + scale_x_reverse() #flip x & y axis and reverse the x axis

Explanatory Data Analysis

library(funModeling)
numeric<-all[,c(5,6,9:12)] #extracting numeric variables
plot_num(numeric)

profiling_num(numeric) 
##        variable        mean    std_dev variation_coef       p_01      p_05
## 1           Age    35.33670   12.59423      0.3564066   18.00000   19.0000
## 2        income 15048.82294 7969.40948      0.5295703 2233.00000 3245.0000
## 3  KM.Travelled    22.56725   12.23353      0.5420919    2.10000    3.5700
## 4 Price.Charged   423.44331  274.37891      0.6479708   33.59000   63.4200
## 5  Cost.of.Trip   286.19011  157.99366      0.5520584   26.37079   46.2240
## 6        profit   137.25320  160.31184      1.1680008  -39.69309   -5.0829
##        p_25      p_50       p_75       p_95       p_99   skewness kurtosis
## 1   25.0000    33.000    42.0000    61.0000    64.0000 0.68533592 2.541593
## 2 8424.0000 14685.000 21035.0000 29659.0000 33968.0000 0.30956095 2.339507
## 3   12.0000    22.440    32.9600    42.0000    45.6300 0.05577867 1.873124
## 4  206.4375   386.360   583.6600   944.8900  1230.1090 0.87375784 3.747608
## 5  151.2000   282.480   413.6832   544.3632   610.5600 0.13795749 1.987765
## 6   28.0120    81.962   190.0300   478.5642   713.5563 1.89989600 7.376835
##          iqr                 range_98                   range_80
## 1    17.0000                 [18, 64]                   [21, 56]
## 2 12611.0000            [2233, 33968]              [4525, 24793]
## 3    20.9600             [2.1, 45.63]                [5.8, 39.2]
## 4   377.2225        [33.59, 1230.109]           [99.231, 792.79]
## 5   262.4832      [26.370792, 610.56] [72.576, 502.500600000003]
## 6   162.0180 [-39.693088, 713.556288]         [5.207, 358.98468]
freq(all2[,-c(1,2)]) 

##   Payment_Mode frequency percentage cumulative_perc
## 1         Card      4515      59.59           59.59
## 2         Cash      3062      40.41          100.00

##   Gender frequency percentage cumulative_perc
## 1   Male      4360      57.54           57.54
## 2 Female      3217      42.46          100.00

##      Company frequency percentage cumulative_perc
## 1 Yellow Cab      5662      74.73           74.73
## 2   Pink Cab      1915      25.27          100.00

##              City frequency percentage cumulative_perc
## 1     NEW YORK NY      2128      28.08           28.08
## 2      CHICAGO IL      1142      15.07           43.15
## 3  LOS ANGELES CA      1010      13.33           56.48
## 4   WASHINGTON DC       868      11.46           67.94
## 5       BOSTON MA       641       8.46           76.40
## 6    SAN DIEGO CA       477       6.30           82.70
## 7  SILICON VALLEY       172       2.27           84.97
## 8      ATLANTA GA       165       2.18           87.15
## 9        MIAMI FL       164       2.16           89.31
## 10     SEATTLE WA       161       2.12           91.43
## 11      DALLAS TX       145       1.91           93.34
## 12      AUSTIN TX       121       1.60           94.94
## 13  ORANGE COUNTY        80       1.06           96.00
## 14   NASHVILLE TN        76       1.00           97.00
## 15      DENVER CO        64       0.84           97.84
## 16  SACRAMENTO CA        54       0.71           98.55
## 17      TUCSON AZ        43       0.57           99.12
## 18     PHOENIX AZ        33       0.44           99.56
## 19  PITTSBURGH PA        33       0.44          100.00

##                             New.Year.Day frequency percentage cumulative_perc
## 1                           Veterans Day      1565      20.65           20.65
## 2                          Christmas Day      1331      17.57           38.22
## 3                       Thanksgiving Day      1166      15.39           53.61
## 4                           Columbus Day       673       8.88           62.49
## 5                              Labor Day       666       8.79           71.28
## 6                           New Year Day       610       8.05           79.33
## 7                           Memorial Day       449       5.93           85.26
## 8                       Independence Day       447       5.90           91.16
## 9  Presidents Day (Washingtons Birthday)       345       4.55           95.71
## 10            Martin Luther King Jr. Day       325       4.29          100.00
## [1] "Variables processed: Payment_Mode, Gender, Company, City, New.Year.Day"
library(plotly)
fig <- plot_ly(all, lat = ~lat, lon = ~long,
    marker = list(color = "fuchsia"),
    type = 'scattermapbox',
    hovertext = all$City) 
fig <- fig %>%
  layout(all,
    mapbox = list(
      style = 'open-street-map',
      zoom =2.5,
      center = list(lon = -88, lat = 34))) 
fig
Box-plot for travelled km, charged price and cost of the trip
g <-
  ggplot(cab, aes(x = Company, y = KM.Travelled,
                   fill = Company)) +
    labs(title = "Boxplot of travelled km by company") +
    scale_fill_brewer(palette = "Set1", guide = "none") + geom_boxplot()


f <-
  ggplot(cab, aes(x = Company, y = Price.Charged,
                   fill = Company)) +
    labs(title = "Boxplot of charged price by company") +
    scale_fill_brewer(palette = "Set2", guide = "none")+ geom_boxplot()


j <-
  ggplot(cab, aes(x = Company, y = Cost.of.Trip,
                   fill = Company)) +
    labs(title = "Boxplot of cost of the trip by company") +
    scale_fill_brewer(palette = "Set3", guide = "none")+ geom_boxplot()

library(ggpubr)
ggarrange(g,f,j,ncol = 2,nrow = 2)

  • It seems; median of travelled km of both company is equal.
  • Median of cost of the trip of yellow cab is bigger than the median of cost of the trip of pink cab.
  • Median of charged price of Yellow cab is bigger than the median of charged price of pink cab. Moreover, in charged price outliers were observed.
Customers
ggplot(customer, aes(x=Gender,y=Income..USD.Month.,fill=Gender))+ 
    geom_boxplot() +
    scale_fill_brewer(palette = "Dark2", guide = "none") +
    coord_flip() +labs(title = "Boxplot of customers' monthly income by gender ")

  • It seems no difference monthly income of male and female users.

If age taken as categorical variable :

for(i in 1:49171){
 if(customer$Age[i]>= 18 & customer$Age[i]<= 33){
  customer$Age[i]="(18-33)"
 }else if(customer$Age[i]>=34 & customer$Age[i]<= 49){
  customer$Age[i]="(34-49)"
 }else{customer$Age[i]="(50-65)"
  
 }
}
freq(customer$Age)

##       var frequency percentage cumulative_perc
## 1 (18-33)     25111      51.07           51.07
## 2 (34-49)     15713      31.96           83.03
## 3 (50-65)      8347      16.98          100.00
customer$Age<-factor(customer$Age)

ggplot(customer, aes(x=Age,y=Income..USD.Month.,fill=Age))+ 
    geom_boxplot() +
    scale_fill_brewer(palette = "Set2", guide = "none") +
    coord_flip() +labs(title = "Boxplot of customers' monthly income by age ")

  • It seems no difference monthly income of each each group.
#populations and cities
ggplot(data=city, aes(City, Population,fill=City)) +
  geom_bar(position = "dodge", stat = "identity") +
  theme_gray()+
  coord_flip()+
  theme(legend.position = "none")+
geom_text( aes(label=Population), vjust=0.5,size=3)+labs(title = "Population of the cities")

library(ggrepel)
ggplot(city, aes(x = Users, y = Population)) + 
  geom_point()  +
  geom_label_repel(aes(label = City,color=City), fontface = "bold") +
  theme(legend.position = "none") + 
  geom_smooth(method = lm, se = FALSE)+ labs(title="Scatter plot of population of the cities and user")

  • Positive linear relationship was observed between population of the cities and number of taxi users. (As population increases, users also increases)
Spearman’s Correlation
# correlations
library(corrplot)
library(PerformanceAnalytics)

M <- cor(numeric[,])
res1 <- cor(M, method="spearman")
corrplot::corrplot(res1, method= "color", order = "hclust", addCoef.col = "black", 
         tl.col="black", tl.srt=45
)

Positive correlations are shown in blue and negative correlations in red color. Color intensity is proportional to the correlation coefficients. When we look at the correlation matrix, it seen that between some variables there is a strong positive relationship such as cost of the trip and travelled km, profit and charged price. Moreover, between income and travelled km, income and cost of the trip are strong negative relationship.

HYPOTHESES

1. Is there any significant difference between mean of the number of taxi users of yellow cub and mean of the pink cub?

\[ H_0:\mu_{pink\ cab} = \mu_{yellow \ cab}\\ H_1:\mu_{pink\ cab} \ne \mu_{yellow \ cab} \]

# mean of number of users for each cab in overall
yellowcab<- all[all$Company=="Yellow Cab",]
pinkcab<- all[all$Company=="Pink Cab",]


y<-yellowcab %>% 
  select(Users, Company) %>%  
  group_by(Company) %>% 
  summarise(n = n(),
            mean = mean(Users))

p<-pinkcab %>% 
  select(Users, Company) %>%  
  group_by(Company) %>% 
  summarise(n = n(),
            mean = mean(Users))
user_total<-rbind(p,y)
user_total
## # A tibble: 2 x 3
##   Company         n    mean
##   <chr>       <int>   <dbl>
## 1 Pink Cab    84711 125409.
## 2 Yellow Cab 274681 168438.
library(scales)
library(ggplot2)
ggplot(user_total, 
       aes(x=Company,y = mean)) +
  geom_bar(stat = "identity", 
           fill = "cornflowerblue") +
  geom_text(aes(label = round(mean,0)), 
            vjust = -0.25) +
  scale_y_continuous(breaks = seq(0,500000,50000)
                     ) +
  labs(title = "Mean number of users by company", x="Company", y="mean number of users")

Let’s test it. For t-test there is two assumptions which are normality and homogeneity of variances. Before applying t-test assumptions should be checked.

  1. Normality of number of users (Anderson-Darling Test For Normality)

\[ H_0:\ Variable\ follow\ normal\ distribution.\\ H_1:\ Variable\ does \ not\ follow\ normal\ distribution.\ \]

library(nortest)
ad.test(all$Users[all$Company=="Pink Cab"])
## 
##  Anderson-Darling normality test
## 
## data:  all$Users[all$Company == "Pink Cab"]
## A = 3654.6, p-value < 2.2e-16
ad.test(all$Users[all$Company=="Yellow Cab"])
## 
##  Anderson-Darling normality test
## 
## data:  all$Users[all$Company == "Yellow Cab"]
## A = 14537, p-value < 2.2e-16

Since p-value of the normality test smaller than the significance level of 0.05, users do not distribute normal. (Box-Cox and bestnormalize methods are tried, but normality was not met). Thus, we cannot apply t-test. Since the assumptions of the t-test which are normality and homogeneity of the variances were not met, non-parametric version of the t-test (Wilcoxon Rank Sum Test) was used.

## testing difference 
wilcox.test(all$Users~all$Company )
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  all$Users by all$Company
## W = 8829742846, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

Since the p-value of wilcoxon test smaller than the significance level of 0.05, we reject the null hypothesis. Thus, there is difference between the number of the users of the companies. As it seen form the barplot, yellow cab has higher number of users.

2. Is there any significant difference between mean of the profit of yellow cub and mean of the pink cub?

\[ H_0:\mu_{pink\ cab} = \mu_{yellow \ cab}\\ H_1:\mu_{pink\ cab} \ne \mu_{yellow \ cab} \]

# mean of the profit for each cab in overall
pink_profit<-mean(all$profit[all$Company=="Pink Cab"])
yellow_profit<-mean(all$profit[all$Company=="Yellow Cab"])

mean<-c(round(pink_profit,3),round(yellow_profit,3))
company<-c("Pink Cab","Yellow Cab")
data<-data.frame(company,mean)

library(scales)
ggplot(data, 
       aes(x = company, y = mean)) +
  geom_bar(stat = "identity", 
           fill = "cornflowerblue") +
  geom_text(aes(label = dollar(mean)), 
            vjust = -0.25) +
  scale_y_continuous(breaks = seq(-500,200,50), 
                     label = dollar) +
  labs(title = "Mean profit by company")

Let’s test it. For t-test there is two assumptions which are normality and homogeneity of variances. Before applying t-test assumptions should be checked.

  1. Normality of profit (Anderson-Darling Test For Normality)

\[ H_0:\ Variable\ follow\ normal\ distribution.\\ H_1:\ Variable\ does \ not\ follow\ normal\ distribution.\ \]

library(nortest)
ad.test(all$profit[all$Company=="Pink Cab"])
## 
##  Anderson-Darling normality test
## 
## data:  all$profit[all$Company == "Pink Cab"]
## A = 2984.4, p-value < 2.2e-16
ad.test(all$profit[all$Company=="Yellow Cab"])
## 
##  Anderson-Darling normality test
## 
## data:  all$profit[all$Company == "Yellow Cab"]
## A = 13455, p-value < 2.2e-16

Since p-value of the normality test smaller than the significance level of 0.05, profit do not distribute normal. (Box-Cox and bestnormalize methods are tried, but normality was not met). Thus, we cannot apply t-test. Since the assumptions of the t-test which are normality and homogeneity of the variances were not met, non-parametric version of the t-test (Wilcoxon Rank Sum Test) was used.

## testing difference 
wilcox.test(all$profit~all$Company )
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  all$profit by all$Company
## W = 7184404010, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

Since the p-value of wilcoxon test smaller than the significance level of 0.05, we reject the null hypothesis. Thus, there is difference between the number of the users of the companies. As it seen form the barplot, yellow cab has higher profit.

3. Which cab is preferred most on holidays?

## usage of cabs in holidays

yellowcab<- all2[all2$Company=="Yellow Cab",]
pinkcab<- all2[all2$Company=="Pink Cab",]


y<-yellowcab %>% 
  select(Users, Company, New.Year.Day) %>%  
  group_by(New.Year.Day) %>% 
  summarise(n = n(),
            mean = mean(Users))

p<-pinkcab %>% 
  select(Users, Company, New.Year.Day) %>%  
  group_by(New.Year.Day) %>% 
  summarise(n = n(),
            mean = mean(Users))

pink<-cbind(p,company=rep("Pink cab",10))
yellow<-cbind(y,company=rep("Yellow cab",10))
user<-rbind(pink,yellow)
ggplot(data=user, aes(x=New.Year.Day, y=n, fill=company)) +
  geom_bar(stat="identity", position=position_dodge())+
  geom_text(aes(label=n), vjust=0.4,hjust=1, color="white",
            position = position_dodge(0.9), size=2.7, fontface="bold")+
  scale_fill_brewer(palette="Set1")+
  theme_grey()+labs(title = "Number of taxi users in holidays for each company",y="number of users",x="Holidays")+ coord_flip()

  • Bar plot shows the big majority of taxi users preferred yellow cab for all holidays. Let’s test it. (Assuming all assumptions are satisfied)
Two-Way Anova

There is two hypothesis:

\[ H_0:\ There\ is\ no\ difference\ in\ the\ means\ of\ company\\ H_0:\ There\ is\ no\ difference\ in\ means\ of\ holidays\ \]

res.aov2 <- aov(n ~ New.Year.Day + company, data = user)
summary(res.aov2)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## New.Year.Day  9 866947   96327   5.114 0.011633 *  
## company       1 702000  702000  37.271 0.000178 ***
## Residuals     9 169515   18835                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

From the ANOVA results, we can conclude the following, based on the p-values and a significance level of 0.05:

  • The p-value of holidays is 0.011633 (significant), which indicates that the levels of holidays are associated with significant different number of users.
  • The p-value of company is 0.000178 (significant), which indicates that the levels of company are associated with significant different number of users.

Thus, we statistically prove that, there is significant difference between number of users in holidays for the yellow cab and pink cab.

4. Number of taxi users in each city for each company

## usage of cabs in holidays

yellowcab<- all[all$Company=="Yellow Cab",]
pinkcab<- all[all$Company=="Pink Cab",]


y<-yellowcab %>% 
  select(Users, Company, City) %>%  
  group_by(City) %>% 
  summarise(n = n(),
            mean = mean(Users))

p<-pinkcab %>% 
  select(Users, Company, City) %>%  
  group_by(City) %>% 
  summarise(n = n(),
            mean = mean(Users))

pink<-cbind(p,company=rep("Pink cab",19))
yellow<-cbind(y,company=rep("Yellow cab",19))
City<-rbind(pink,yellow)
ggplot(data=City, aes(x=City, y=n, fill=company)) +
  geom_bar(stat="identity", position=position_dodge())+
  geom_text(aes(label=n), vjust=0.5,hjust=-0.1, color="black",
            position = position_dodge(0.9), size=2, fontface="bold")+
  scale_fill_brewer(palette="Set2")+
  theme_grey()+labs(title = "Number of taxi users in each city for each company",y="number of users")+ coord_flip()

  • Bar plot shows the number of users for Pink cab is bigger than the Yellow cab only in the cities San Diego, Sacramento, Pittsburgh and Nashville, but in big majority of the cities yellow cab is preferred. Let’s test the difference.(Assuming all assumptions are satisfied)
Two-Way Anova

There is two hypothesis:

\[ H_0:\ There\ is\ no\ difference\ in\ the\ means\ of\ company\\ H_0:\ There\ is\ no\ difference\ in\ means\ of\ city\ \]

res.aov <- aov(n ~ City + company, data = City)
summary(res.aov)
##             Df    Sum Sq   Mean Sq F value Pr(>F)  
## City        18 6.135e+09 340838845   1.879 0.0953 .
## company      1 9.497e+08 949700024   5.235 0.0344 *
## Residuals   18 3.265e+09 181400671                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

From the ANOVA results, we can conclude the following, based on the p-values and a significance level of 0.05:

  • The p-value of company is 0.0344 (significant), which indicates that the levels of company are associated with significant different number of users.

Thus, we statistically prove that, there is significant difference between number of users for the yellow cab and pink cab.

5. How much has the number of users changed in each company over the years?

## profit and number of users for each year
first.period<-cab%>% 
  select(date, profit, Company, Price.Charged,KM.Travelled) %>%  
  filter(date >="2016-01-01 "& date <"2017-01-01") %>% 
  group_by(Company) %>% 
  summarise(n = n(),
           mean_profit= mean(profit), mean_price_charged=mean(Price.Charged),mean_km=mean(KM.Travelled),mean_profit_per_km=mean(profit)/mean(KM.Travelled))

first<-cbind(first.period,year=rep(2016,2))

second.period<-cab%>% 
  select(date, profit, Company, Price.Charged,KM.Travelled) %>%  
  filter(date >="2017-01-01 "& date <"2018-01-01") %>% 
  group_by(Company) %>% 
  summarise(n = n(),
            mean_profit= mean(profit), mean_price_charged=mean(Price.Charged),mean_km=mean(KM.Travelled),mean_profit_per_km=mean(profit)/mean(KM.Travelled))
second<-cbind(second.period,year=rep(2017,2))

third.period<-cab%>% 
  select(date, profit, Company, Price.Charged,KM.Travelled) %>%  
  filter(date >="2018-01-01 "& date <="2018-12-31") %>% 
  group_by(Company) %>% 
  summarise(n = n(),
            mean_profit=mean(profit), mean_price_charged=mean(Price.Charged),mean_km=mean(KM.Travelled), mean_profit_per_km=mean(profit)/mean(KM.Travelled))
third<-cbind(third.period,year=rep(2018,2))
time<-rbind(first,second,third)
ggplot(data=time, aes(x=year, y=n, fill=Company)) +
  geom_bar(stat="identity", position=position_dodge(),color="grey")+
  geom_text(aes(label=n), vjust=1.6,hjust=0.5, color="black",
            position = position_dodge(0.9), size=3, fontface="bold")+
  scale_fill_brewer(palette="Pastel1")+
  theme_grey()+labs(title = "Users for each year",y="Users")

Percentage change of number of users by years:

Years Pink cab Yellow cab
2016-2017 20,89% increasing 19,39% increasing
2017-2018 3,33% decreasing 4% decreasing
2016-2018 16,86% increasing 14,6% increasing
  • From 2016 to 2018, percentage of number of users of of pink cab increased more than yellow cab, but in overall big majority of the users choose the yellow cab.

6. How much has the profit by per km changed in each company over the years?

ggplot(data=time, aes(x=year, y=round(mean_profit_per_km,4), fill=Company)) +
  geom_bar(stat="identity", position=position_dodge(),color="grey")+
  geom_text(aes(label=round(mean_profit_per_km,4)), vjust=1.6,hjust=0.5, color="black",
            position = position_dodge(0.9), size=3, fontface="bold")+
  scale_fill_brewer(palette="Accent")+
  theme_grey()+labs(title = "Mean profit by per km for each year",y="mean profit by per km")

Percentage change of profit by per km by years:

Years Pink cab Yellow cab
2016-2017 2,48% decreasing 0,02% increasing
2017-2018 20,49% decreasing 15,05% decreasing
2016-2018 22,46% decreasing 15,02% decreasing
  • From 2016 to 2017 the profit by per km in yellow cab increased, while pink cab decreased.
  • From 2016 to 2018, profit by per km of pink cab decreased more than yellow cab.
  • In overall, profit by per km of yellow cab was higher in all years compared to pink cab.

7. How much has the profit changed in each company over the years?

ggplot(data=time, aes(x=year, y=mean_profit, fill=Company)) +
  geom_bar(stat="identity", position=position_dodge())+
  geom_text(aes(label=round(mean_profit,4)), vjust=1.6,hjust=0.5, color="white",
            position = position_dodge(0.9), size=3, fontface="bold")+
  scale_fill_brewer(palette="Dark2")+
  theme_grey()+labs(title = "Mean profit for each year")

Percentage change of profit by years:

Years Pink cab Yellow cab
2016-2017 1,83% decreasing 0,31% decreasing
2017-2018 20,63% decreasing 15,04% decreasing
2016-2018 22,08% decreasing 15,31% decreasing
  • From 2016 to 2018, profit of pink cab decreased more than yellow cab.
  • In overall, yellow cab has earned more profit in all years compared to pink cab.

8. Forecasted number of users and profit of pink cab for 2019

library(ggplot2)
library(ggtext)
library(tidyverse)
library(forecast)
pink<-time[time$Company=="Pink Cab", c(1:3,7)]

pink.forecast<-ts(pink,frequency = 1,start = 2016,end = 2018)
forecasted_user<-pink.forecast[,2]
tbats<-tbats(forecasted_user)
forecast<-forecast::forecast(tbats,h=1)
forecast
##      Point Forecast    Lo 80    Hi 80   Lo 95    Hi 95
## 2019       28215.02 25319.96 31110.09 23787.4 32642.65

Using one of the forecasting method “tbats”, we find the number of users in 2019 for pink cab as 28215 with 95% CI (23787.4, 32642.65).

forecasted_profit<-pink.forecast[,3]
tbats<-tbats(forecasted_profit)
forecast2<-forecast::forecast(tbats,h=1)
forecast2
##      Point Forecast    Lo 80    Hi 80    Lo 95   Hi 95
## 2019       47.86364 44.09721 51.63007 42.10339 53.6239

Using one of the forecasting method “tbats”, we find the profit in 2019 for pink cab as 47.86364 with 95% CI (42.10339, 53.6239).

Then, we can visualize it.

pink<-rbind(pink, data.frame(Company="Pink Cab", n=28215, year=2019, mean_profit = 47.86364 ))

days.highlight <- pink$year[3:4]

pink$highlight <- ifelse(pink$year %in% days.highlight, "blue", "black")

pink$lower<-c(25080,30321,29310,23787.4)
pink$upper<-c(25080,30321,29310,32642.65)

g<-ggplot(pink, aes(x = year, y = n, colour = highlight, group = 1)) +
    geom_line() + geom_point() +
    scale_colour_identity(pink$highlight)+ geom_vline(xintercept = 2018,color="red",linetype="dashed")

lab_md <- "Forecasted number of pink cab users is <b style='color:red;'>*28215*"

g +geom_ribbon(aes(ymin=lower, ymax=upper),alpha=0.1,       #transparency
                linetype=1,      #solid, dashed or other line types
                colour="black", #border line color
                size=0.1,          #border line size
                fill="blue") +  geom_richtext(aes(x =2018.3 , y = 25000, label = lab_md),
                stat = "unique") +labs(title = "Forecasted number of Pink cab users for 2019",y="number of users")

pink$lower<-c(68.32182,67.07084,53.22969,42.10339)
pink$upper<-c(68.32182,67.07084,53.22969,53.6239)

g<-ggplot(pink, aes(x = year, y = mean_profit, colour = highlight, group = 1)) +
    geom_line() + geom_point() +
    scale_colour_identity(pink$highlight)+ geom_vline(xintercept = 2018,color="red",linetype="dashed")

lab_md <- "Forecasted mean profit of pink cab  is <b style='color:red;'>*47.86*"

g +geom_ribbon(aes(ymin=lower, ymax=upper),alpha=0.1,       #transparency
                linetype=1,      #solid, dashed or other line types
                colour="black", #border line color
                size=0.1,          #border line size
                fill="green") +  geom_richtext(aes(x =2018.3 , y = 60, label = lab_md),
                stat = "unique") +labs(title = "Forecasted profit for Pink cab in 2019",y="profit")

9. Forecasted number of users and profit of yellow cab for 2019

Yellow<-time[time$Company=="Yellow Cab", c(1:3,7)]

Yellow.forecast<-ts(Yellow,frequency = 1,start = 2016,end = 2018)
forecasted_user<-Yellow.forecast[,2]
tbats<-tbats(forecasted_user)
forecast3<-forecast::forecast(tbats,h=1)
forecast3
##      Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 2019       91496.84 82845.56 100148.1 78265.85 104727.8

Using one of the forecasting method “tbats”, we find the number of users in 2019 for Yellow cab as 91496 with 95% CI (78265.85, 104727.8).

forecasted_profit<-Yellow.forecast[,3]
tbats<-tbats(forecasted_profit)
forecast4<-forecast::forecast(tbats,h=1)
forecast4
##      Point Forecast    Lo 80    Hi 80   Lo 95    Hi 95
## 2019       134.7566 127.3165 142.1967 123.378 146.1352

Using one of the forecasting method “tbats”, we find the profit in 2019 for Yellow cab as 134.7566 with 95% CI (123.378, 146.1352).

Then, we can visualize it.

Yellow<-rbind(Yellow, data.frame(Company="Yellow Cab", n=91496, year=2019, mean_profit = 134.7566 ))

days.highlight <- Yellow$year[3:4]
Yellow$highlight <- ifelse(Yellow$year %in% days.highlight, "blue", "black")

Yellow$lower<-c(82239,98189,94253,78265.85)
Yellow$upper<-c(82239,98189,94253,104727.8)

g<-ggplot(Yellow, aes(x = year, y = n, colour = highlight, group = 1)) +
    geom_line() +geom_point() +
    scale_colour_identity(Yellow$highlight)+ geom_vline(xintercept = 2018,color="red",linetype="dashed")

lab_md <- "Forecasted number of yellow cab users is <b style='color:red;'>*91496*"

g +geom_ribbon(aes(ymin=lower, ymax=upper),alpha=0.1,     
                linetype=1,      
                colour="black", 
                size=0.1,         
                fill="blue") +  geom_richtext(aes(x =2018.2 , y = 80000, label = lab_md),stat = "unique") +labs(title = "Forecasted number of Yellow cab users for 2019",y="number of users")

Yellow$lower<-c(169.34782,168.81706,143.41612,123.378)
Yellow$upper<-c(169.34782,168.81706,143.41612,146.1352)

g<-ggplot(Yellow, aes(x = year, y = mean_profit, colour = highlight, group = 1)) +
    geom_line() + geom_point() +
    scale_colour_identity(Yellow$highlight)+ geom_vline(xintercept = 2018,color="red",linetype="dashed")

lab_md <- "Forecasted mean profit of yellow cab  is <b style='color:red;'>*134.76*"

g +geom_ribbon(aes(ymin=lower, ymax=upper),alpha=0.1,       
                linetype=1,      
                colour="black", 
                size=0.1,          
                fill="green") +  geom_richtext(aes(x =2018.3 , y = 150, label = lab_md), stat = "unique") +labs(title = "Forecasted profit for Yellow cab in 2019",y="profit")

10. Is there any seasonality in number of customers using the cab service.

yellowcab<- all[all$Company=="Yellow Cab",]
pinkcab<- all[all$Company=="Pink Cab",]

seasonality.yellow<-yellowcab%>% 
  select( profit, Company, date,Users) %>%  
  group_by(date) %>% 
  summarise(n = n(),
           mean_profit= mean(profit),mean_users=mean(Users))
head(seasonality.yellow)
## # A tibble: 6 x 4
##   date           n mean_profit mean_users
##   <date>     <int>       <dbl>      <dbl>
## 1 2016-01-02   140        324.    155635.
## 2 2016-01-03   126        305.    151121.
## 3 2016-01-04    21        174.    149093.
## 4 2016-01-05    41        277.    176513.
## 5 2016-01-06    86        217.    148281.
## 6 2016-01-07   117        224.    148852.
range(seasonality.yellow$date)
## [1] "2016-01-02" "2018-12-31"
seasonality.pink<-pinkcab%>% 
  select( profit, Company, date,Users) %>%  
  group_by(date) %>% 
  summarise(n = n(),
           mean_profit= mean(profit),mean_users=mean(Users))
head(seasonality.pink)
## # A tibble: 6 x 4
##   date           n mean_profit mean_users
##   <date>     <int>       <dbl>      <dbl>
## 1 2016-01-02    41       117.      97443.
## 2 2016-01-03    52       204.     107849.
## 3 2016-01-04     4       107.     223140.
## 4 2016-01-05     6       124.      93150.
## 5 2016-01-06    23       102.     112624.
## 6 2016-01-07    24        52.3     95571.
range(seasonality.pink$date)
## [1] "2016-01-02" "2018-12-31"
time<-ts(seasonality.yellow,start =c(2016,01), end =c(2018,12), frequency = 12 )
gt<-time[,2] #taking number of users column
a<-gt %>%
  stl(t.window=13, s.window="periodic", robust=TRUE) 
  autoplot(a, main="Seasonality in number users for yellow cab ")

  • Yellow cab users show a certain fluctuation, so there is seasonality. While the number of users increases in the first months of the year, it decreases regularly in the last months of the year.
time2<-ts(seasonality.pink,start =c(2016,01), end =c(2018,12), frequency = 12 )
gt2<-time2[,2] #taking number of users column
a2<-gt2 %>%
  stl(t.window=13, s.window="periodic", robust=TRUE) 
  autoplot(a2 , main="Seasonality in number users for pink cab ")

  • Pink cab users show a more fluctuation than yellow cab, so there is seasonality. While the number of users increases in certain months of the year, it decreases regularly in certain months of the year.

11. Is there any effect of user features on usage of taxi

#if income taken as categorical variable
for(i in 1:359392){
 if(all$income[i]>= 0 & all$income[i]<= 18000){
  all$income[i]="(2000-10000)"
 }else if(all$income[i]>=18001 & all$income[i]<= 28000){
  all$income[i]="(10000-20000)"
 }else {all$income[i]="(20001-35000)"}
}
all$income<-factor(all$income)
#if age taken as categorical variable
for(i in 1:359392){
 if(all$Age[i]>= 18 & all$Age[i]<= 28){
  all$Age[i]="(18-28)"
 }else if(all$Age[i]>=34 & all$Age[i]<= 44){
  all$Age[i]="(34-44)"
 }else{all$Age[i]="(45-65)"
  
 }
}
all$Age<-factor(all$Age)

Age effect

library(dplyr)
library(plotly)
f<- all[all$Age=="(18-28)",]
s<- all[all$Age=="(34-44)",]
t<- all[all$Age=="(45-65)",]

age1<-f%>% 
  select( profit, Company, Age) %>%  
  group_by(Company) %>% 
  summarise(n = n(),
           mean_profit= mean(profit))

age2<-s%>% 
  select( profit, Company, Age) %>%  
  group_by(Company) %>% 
  summarise(n = n(),
           mean_profit= mean(profit))

age3<-t%>% 
  select( profit, Company, Age) %>%  
  group_by(Company) %>% 
  summarise(n = n(),
           mean_profit= mean(profit))
fig <- plot_ly(data = age1, labels = ~Company, values = ~n,marker = list(colors = c('Pink', 'Gold')))
fig <- fig %>% add_pie(data = age1, labels = ~Company, values = ~n,textinfo='label+percent+value', 
                         name = "Company", domain = list(row = 0, column = 0),title="Percentage of (18-28)")

fig <- fig %>% add_pie(data = age2, labels = ~Company, values = ~n,textinfo='label+percent+value',
                      name = "Company", domain = list(row = 0, column = 1),title="Percentage of (34-44)")


fig <- fig %>% add_pie(data = age3, labels = ~Company, values = ~n,textinfo='label+percent+value',
                     name = "Company", domain = list(row = 0, column = 2),title="Percentage of (45-65)")

fig <- fig %>% layout(title = "Percentage of taxi users by age for each company", showlegend = F,
                      grid=list(rows=1, columns=3),
                      xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
                      yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
fig
  • All age groups mostly prefer the yellow cab.
##       Age      n mean_profit    company
## 1 (18-28)  97897   161.06144 yellow cab
## 2 (34-44)  72071   159.80837 yellow cab
## 3 (45-65) 104713   159.82154 yellow cab
## 4 (18-28)  30295    63.29110   pink cab
## 5 (34-44)  22059    61.99979   pink cab
## 6 (45-65)  32357    62.49872   pink cab
ggplot(data=num.age, aes(x=Age, y=n, fill=company)) +
  geom_bar(stat="identity", position=position_dodge())+
  geom_text(aes(label=(n)), vjust=2.5,hjust=0.5, color="black",
            position = position_dodge(0.9), size=3, fontface="bold")+
  scale_fill_brewer(palette="Set1")+
  theme_grey()+labs(title = "Number of taxi users by age for each company",y="number of users")

  • In both companies, people between the ages of 45-65 generally use taxi and people between the ages of at least 34-44 use a taxi.
ggplot(data=num.age, aes(x=company, y=mean_profit, fill=Age)) +
  geom_bar(stat="identity", position=position_dodge(),color="grey")+
  geom_text(aes(label=round(mean_profit,3)), vjust=2.5,hjust=0.5, color="black",
            position = position_dodge(0.9), size=3, fontface="bold")+
  scale_fill_brewer(palette="RdYlGn")+
  theme_grey()+labs(title = "Profit contribution of age in each company",y="mean profit")

  • People aged 18-28 in both companies contribute more to the profit.

Gender effect

gender<-pinkcab%>% 
  select( profit, Company, Gender) %>%  
  group_by(Gender) %>% 
  summarise(n = n(),
           mean_profit= mean(profit))

gender2<-yellowcab%>% 
  select( profit, Company, Gender) %>%  
  group_by(Gender) %>% 
  summarise(n = n(),
           mean_profit= mean(profit))

num.gender<-rbind(gender,gender2)
company=data.frame(company=c( rep("pink cab",2),rep("yellow cab",2)))
num.gender<-cbind(num.gender,company)
percent<-c(24.42,22.94,75.58,77.06)
num.gender<-cbind(num.gender,percent)

num.gender
##   Gender      n mean_profit    company percent
## 1 Female  37480    62.18070   pink cab   24.42
## 2   Male  47231    63.02631   pink cab   22.94
## 3 Female 116000   156.30532 yellow cab   75.58
## 4   Male 158681   163.15095 yellow cab   77.06
fig <- plot_ly(data = gender, labels = ~Company, values = ~n,marker = list(colors = c('Pink', 'Gold')))
fig <- fig %>% add_pie(data = gender, labels = ~Company, values = ~n,textinfo='label+percent+value', 
                         name = "Company", domain = list(row = 0, column = 0),title="Percentage of Males")

fig <- fig %>% add_pie(data = gender2, labels = ~Company, values = ~n,textinfo='label+percent+value',
                       name = "Company", domain = list(row = 0, column = 1),title="Percentage of Females")

fig <- fig %>% layout(title = "Percentage of taxi users by gender for each company", showlegend = F,
                      grid=list(rows=1, columns=2),
                      xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
                      yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))

fig
  • 77.1% of males use yellow cab while 22.9% use pink cab.
  • 75.6% of females use yellow cab while 24.4% use pink cab.
ggplot(data=num.gender, aes(x=company, y=mean_profit, fill=Gender)) +
  geom_bar(stat="identity", position=position_dodge())+
  geom_text(aes(label=(round(mean_profit,3))), vjust=4,hjust=0.5, color="black",
            position = position_dodge(0.9), size=3, fontface="bold")+
  scale_fill_brewer(palette="Pastel1")+
  theme_grey()+labs(title = "Profit contribution of gender in each company",y="Mean profit")

  • Males in both companies contribute more to profit.

Income effect

inc<-pinkcab%>% 
  select( profit, Company, income) %>%  
  group_by(income) %>% 
  summarise(n = n(),
           mean_profit= mean(profit))

inc2<-yellowcab%>% 
  select( profit, Company, income) %>%  
  group_by(income) %>% 
  summarise(n = n(),
           mean_profit= mean(profit))

num.income<-rbind(inc,inc2)
company=data.frame(company=c( rep("pink cab",3),rep("yellow cab",3)))
num.income<-cbind(num.income,company)
num.income
##          income     n mean_profit    company
## 1 (10000-20000) 28539    63.17429   pink cab
## 2  (2000-10000) 26616    62.43748   pink cab
## 3 (20001-35000) 29556    62.34137   pink cab
## 4 (10000-20000) 91660   161.85423 yellow cab
## 5  (2000-10000) 87990   161.66051 yellow cab
## 6 (20001-35000) 95031   157.42554 yellow cab
ggplot(data=num.income, aes(x=income, y=n, fill=company)) +
  geom_bar(stat="identity", position=position_dodge())+
  geom_text(aes(label=(n)), vjust=4,hjust=0.5, color="black",
            position = position_dodge(0.9), size=3, fontface="bold")+
  scale_fill_brewer(palette="Dark2")+
  theme_grey()+labs(title = "Number of taxi users by income for each company",y="number of users")

  • The highest number of users in both companies is between the 20001-35000 income range while the smallest number of users in both companies is between the 2000-10000 income range.
ggplot(data=num.income, aes(x=company, y=mean_profit, fill=income)) +
  geom_bar(stat="identity", position=position_dodge())+
  geom_text(aes(label=(round(mean_profit,3))), vjust=4,hjust=0.5, color="black",
            position = position_dodge(0.9), size=3.2, fontface="bold")+
  scale_fill_brewer(palette="Reds")+
  theme_grey()+labs(title = "Profit contribution of income of users in each company",y="Mean profit")

  • Users in the income range of 2000-10000 contribute the most to the profit of yellow cab.
  • Users in the income range of 10000-20000 contribute the most to the profit of pink cab.

Let’s test it with chi-square independence test.

\[ H_0:\ Variables\ are\ independent\\ H_1:\ There\ is\ relationship\ between\ variables \]

library(kableExtra)
chis <- lapply(all[,c(4,5,6)], function(x) chisq.test(all[,c(7)],x,simulate.p.value=TRUE))
chis.out <- do.call(rbind, chis)[,c(1,3)]
a <- data.frame(chis.out)
a <- matrix(unlist(a), ncol = 2)
colnames(a) <- colnames(chis.out)
rownames(a) <- rownames(chis.out)

kable(round(a,4), caption = "Chi-Square Test Results") %>%
  kable_styling(font_size = 9)
Chi-Square Test Results
statistic p.value
Gender 107.3029 0.0005
Age 1.3355 0.5162
income 11.2423 0.0040
  • As seen in the p-values of gender and income are smaller than the significance level of 0.05, so we reject Ho. That means there is statistically relationship between choice of company and gender, income of users. However, age and company are independent.

Modeling

# taking company as 1: Yellow cab, 0: Pink cab for modeling part
for(i in 1:359392){
 if(all$Company[i]== "Pink Cab"){
  all$Company[i]=0
 }else{all$Company[i]=1
  
 }
}
# spliting data into a training set and testing set
library(rsample)
set.seed(9)
split <- initial_split(all, prop = .8)
train1 <- training(split)
test  <- testing(split)
nrow(train1)
## [1] 287514
nrow(test)
## [1] 71878

Multiple Logistic Regression

  • Research Question : If gender, age, income of users, charged price, cost of the trip, population of the cities affect the choice of cab type?
  • Response variable: Company
  • Covariates : gender, age, income, charged price, cost of the trip, population
model<-glm(Company~ Gender+ scale(Cost.of.Trip)+ scale(Price.Charged)  +scale(Age)  +scale(income )  +scale(Population) ,data = train1,family  = binomial)
summary(model)
## 
## Call:
## glm(formula = Company ~ Gender + scale(Cost.of.Trip) + scale(Price.Charged) + 
##     scale(Age) + scale(income) + scale(Population), family = binomial, 
##     data = train1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.2428   0.0972   0.5887   0.8396   1.5261  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           1.416305   0.007802 181.521  < 2e-16 ***
## GenderMale            0.065072   0.009253   7.033 2.03e-12 ***
## scale(Cost.of.Trip)  -1.276955   0.013666 -93.439  < 2e-16 ***
## scale(Price.Charged)  2.125129   0.018168 116.973  < 2e-16 ***
## scale(Age)            0.007677   0.004582   1.675   0.0939 .  
## scale(income)        -0.001900   0.004579  -0.415   0.6782    
## scale(Population)    -0.109563   0.006134 -17.861  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 314277  on 287513  degrees of freedom
## Residual deviance: 286146  on 287507  degrees of freedom
## AIC: 286160
## 
## Number of Fisher Scoring iterations: 5
  • Gender, charged price, cost of the trip, population of the cities have significant effect on the choice of cab type at the significance level of 0.05.
  • If significance level taken as 0.1, age also has significant effect on choice of cab type.

\[ Model: \]

\[ \etaᵢ= log\left( \frac{\piᵢ}{1−\piᵢ} \right) = 1.413 + 0.068 (Gender) + 0.0075(Age) -0.007 (Income) + 2.102 (Charged\ price) -0.102 (population) -1.26 (Cost\ of\ the\ trip) \]

\[ where\ \piᵢ = E(Yᵢ)\ \\\textrm{probability of succes} \]

\[ where,\ Gender=\ 1:Male,\ 0:Female \]

glm.probs2 <- predict(model,
                    newdata = test,
                    type = "response")

glm.pred2 <- ifelse(glm.probs2 > 0.5, "1", "0")
caret::confusionMatrix(factor(glm.pred2),factor(test$Company))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0   205   214
##          1 16624 54835
##                                           
##                Accuracy : 0.7657          
##                  95% CI : (0.7626, 0.7688)
##     No Information Rate : 0.7659          
##     P-Value [Acc > NIR] : 0.5337          
##                                           
##                   Kappa : 0.0125          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.012181        
##             Specificity : 0.996113        
##          Pos Pred Value : 0.489260        
##          Neg Pred Value : 0.767363        
##              Prevalence : 0.234133        
##          Detection Rate : 0.002852        
##    Detection Prevalence : 0.005829        
##       Balanced Accuracy : 0.504147        
##                                           
##        'Positive' Class : 0               
## 
  • The “no-information rate” is the largest proportion of the observed classes. A hypothesis test is also computed to evaluate whether the overall accuracy rate is greater than the rate of the largest class.
pred <- predict(model, newdata = test, type = 'response')
RMSE(pred, test$Company)
## [1] 0.405972
Values<-c(0.7657,0.7659,0.01,0.99,0.41)
Metrics<-c("Accuracy", "No Information Rate","Sensitivity","Specificity ","RMSE")
v1=data.frame(Values,Metrics)

ggplot(v1, aes(x=Metrics, y=Values)) + 
  geom_bar(stat = "identity",fill="gold")+coord_flip()+ggtitle("Model Evaluation Metrics for logistic regression")+geom_text(aes(label = Values), vjust = 0, hjust = 0.15,size=4) 

When we look at the model evaluation metrics:

  • While accuracy is 0.7657, no information rate is 0.7659. 95% CI (0.7626, 0.7688) includes no information rate, so accuracy is not satisfactory for this model.

  • Sensitivity is 0.01 which is the ability of a test to correctly classify an users as “choose yellow cab”. It is not good. Model is doing the mistake of 99% when predicting the people who really choose yellow cab.

  • Specificity is 0.99 which is the ability of test to correctly classify an user as “choose pink cab”.

  • Root mean square error is 0.41. This shows how actual values are close to the estimates.

mean(predict(model, test, type="response"))
## [1] 0.7640167
Probability of
yellow cab being chosen
0.764
Probability of
pink cab being chosen
0.236

Checking assumptions for logistic regression

  • Residual davinace is 286146 on 287507 degrees of freedom. Division of two gives the overdispersion coefficient. Since it is smaller than 1, we do not have overdispersion problem in this model.
  • At the EDA part, it is seem that, some variables are highly correlated. This situation may cause the multicollinearity problem in the models. Thus, variance inflation factor values were checked.
car::vif(model)
##               Gender  scale(Cost.of.Trip) scale(Price.Charged) 
##             1.001448             8.443942             8.241572 
##           scale(Age)        scale(income)    scale(Population) 
##             1.000152             1.000135             1.346998
  • As seen in the table some of the VIF values are big and close to 10. that means. Because of the high correlation between variables, conducting different model may be a good solution.

Classification and Regression Tree

library(rpart)
tree = rpart(Company ~ Gender+Payment_Mode+Age+income+KM.Travelled+Price.Charged+Population+Users,
    data= train2, method='class')
rpart.plot(tree,box.palette = "Blues")

  • At the top, it had shown the overall probability of the company chosen yellow cab percentage. 76% of the company is defined Yellow cab.

  • The first node is about the charged price variable. While the percent of charged price is smaller then the 574 with the 74 percent, the probability of a yellow cab 0.71. On the other hand,charged price is bigger then the 574 with 26 percent, while profit is less than 14 out of 100, the probability of a yellow cab 0.93.

  • It had also divided into the third node to number of users. When it is less than 75000 with the 18 percent, the probability of the yellow cab is 0.54. However, it can be said that number of users is greater than 75000 with the 56 percent corresponds to the probability of yellow cab 0.76.

And tree is going like that.

pred <- predict(optimal_tree, newdata = test)
RMSE(pred = pred, obs = test$Company) 
## [1] 0.3786888
rpartpred <- predict(optmal_tree,test,type="class")
caret::confusionMatrix(rpartpred,test$Company)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0  4776  1554
##          1 12053 53495
##                                           
##                Accuracy : 0.8107          
##                  95% CI : (0.8078, 0.8136)
##     No Information Rate : 0.7659          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3262          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.28380         
##             Specificity : 0.97177         
##          Pos Pred Value : 0.75450         
##          Neg Pred Value : 0.81612         
##              Prevalence : 0.23413         
##          Detection Rate : 0.06645         
##    Detection Prevalence : 0.08807         
##       Balanced Accuracy : 0.62778         
##                                           
##        'Positive' Class : 0               
## 
Values<-c(0.81,0.77,0.28,0.97,0.38)
Metrics<-c("Accuracy", "No Information Rate","Sensitivity","Specificity ","RMSE")
v1=data.frame(Values,Metrics)


ggplot(v1, aes(x=Metrics, y=Values)) + 
  geom_bar(stat = "identity",fill="gold")+coord_flip()+ggtitle("Model Evaluation Metrics for decision tree")+geom_text(aes(label = Values), vjust = 0, hjust = 1.2) 

When we look at the model evaluation metrics:

  • While accuracy is 0.81, no information rate is 0.76. 95% CI (0.8078, 0.8136) does not include no information rate, so accuracy is satisfactory for this model.

  • Sensitivity is 0.28 which is the ability of a test to correctly classify an users as “choose yellow cab”. It is not good. Model is doing the mistake of 72% when predicting the people who really choose yellow cab. However, it is good compared with logistic regression model.

  • Specificity is 0.97 which is the ability of test to correctly classify an user as “choose pink cab”.

  • Root mean square error is 0.37. This shows how actual values are close to the estimates.

Which model is better?

  • Accuracy of the classification and regression tree is bigger than the logistic regression model.
  • Sensitivity of the classification and regression tree is bigger than the logistic regression model.
  • There is not so much difference between the specificity of the two models.
  • RMSE of the classification and regression tree is smaller than the logistic regression model.

Thus, all of the metrics indicates that classification and regression tree is better model.

Discussion and Conclusion