Firstly read the 5 datasets:
#US holdiay dataset
hol<-read.csv("US Bank holidays.csv")
hol$X2012.01.02<-as.Date(hol$X2012.01.02)
names(hol)[names(hol) == "X2012.01.02"] <- "date"
#cab dataset
cab<-read.csv("Cab_Data.csv")
cab$City<-factor(cab$City)
cab$Company<-factor(cab$Company)
profit<- cab$Price.Charged-cab$Cost.of.Trip #obtaining profit variable
cab<-cbind(cab,profit) #add the profit to the dataset
#city dataset
library(readxl)
city <- read_excel("city.xlsx")
city$City<-factor(city$City)
#customer dataset
customer<-read.csv("Customer_ID.csv")
customer$Gender<-factor(customer$Gender)
#transaction dataset
transaction<-read.csv("Transaction_ID.csv")
transaction$Payment_Mode<-factor(transaction$Payment_Mode)
# long ve latitude of the cities
df <- read.csv('https://raw.githubusercontent.com/plotly/datasets/master/2011_february_us_airport_traffic.csv')
Let’s look at how the datasets looks like:
head(hol)
## X1 date New.Year.Day
## 1 2 2012-01-16 Martin Luther King Jr. Day
## 2 3 2012-02-20 Presidents Day (Washingtons Birthday)
## 3 4 2012-05-28 Memorial Day
## 4 5 2012-07-04 Independence Day
## 5 6 2012-09-03 Labor Day
## 6 7 2012-10-08 Columbus Day
head(cab)
## Transaction.ID Company City KM.Travelled Price.Charged Cost.of.Trip
## 1 10000011 Pink Cab ATLANTA GA 30.45 370.95 313.635
## 2 10000012 Pink Cab ATLANTA GA 28.62 358.52 334.854
## 3 10000013 Pink Cab ATLANTA GA 9.04 125.20 97.632
## 4 10000014 Pink Cab ATLANTA GA 33.17 377.40 351.602
## 5 10000015 Pink Cab ATLANTA GA 8.73 114.62 97.776
## 6 10000016 Pink Cab ATLANTA GA 6.06 72.43 63.024
## profit date
## 1 57.315 2016-01-08
## 2 23.666 2016-01-06
## 3 27.568 2016-01-02
## 4 25.798 2016-01-07
## 5 16.844 2016-01-03
## 6 9.406 2016-01-07
head(city)
## # A tibble: 6 x 3
## City Population Users
## <fct> <dbl> <dbl>
## 1 NEW YORK NY 8405837 302149
## 2 CHICAGO IL 1955130 164468
## 3 LOS ANGELES CA 1595037 144132
## 4 MIAMI FL 1339155 17675
## 5 SILICON VALLEY 1177609 27247
## 6 ORANGE COUNTY 1030185 12994
head(customer)
## Customer.ID Gender Age Income..USD.Month.
## 1 29290 Male 28 10813
## 2 27703 Male 27 9237
## 3 28712 Male 53 11242
## 4 28020 Male 23 23327
## 5 27182 Male 33 8536
## 6 27318 Male 25 13984
head(transaction)
## Transaction.ID Customer.ID Payment_Mode
## 1 10000011 29290 Card
## 2 10000012 27703 Card
## 3 10000013 28712 Cash
## 4 10000014 28020 Cash
## 5 10000015 27182 Card
## 6 10000016 27318 Cash
head(df)
## iata airport city state country
## 1 ORD Chicago O'Hare International Chicago IL USA
## 2 ATL William B Hartsfield-Atlanta Intl Atlanta GA USA
## 3 DFW Dallas-Fort Worth International Dallas-Fort Worth TX USA
## 4 PHX Phoenix Sky Harbor International Phoenix AZ USA
## 5 DEN Denver Intl Denver CO USA
## 6 IAH George Bush Intercontinental Houston TX USA
## lat long cnt
## 1 41.97960 -87.90446 25129
## 2 33.64044 -84.42694 21925
## 3 32.89595 -97.03720 20662
## 4 33.43417 -112.00806 17290
## 5 39.85841 -104.66700 13781
## 6 29.98047 -95.33972 13223
dim(cab)
## [1] 359392 8
dim(city)
## [1] 20 3
dim(customer)
## [1] 49171 4
dim(transaction)
## [1] 440098 3
dim(hol)
## [1] 89 3
anyNA(hol) #no NA
## [1] FALSE
anyNA(city) #no NA
## [1] FALSE
anyNA(customer) #no NA
## [1] FALSE
anyNA(cab) #no NA
## [1] FALSE
anyNA(transaction) #no NA
## [1] FALSE
All of the dataset do not have NA value.
Then merge the 5 datasets:
## merging datasets
library(dplyr)
city = inner_join(city, lat_long, by = "City", suffix = c("_city","_lat_long"))
ij = inner_join(transaction, customer, by = "Customer.ID", suffix = c("_transaction","_customer"))
all<-inner_join(ij,cab, by = "Transaction.ID", suffix = c("_ij","_cab"))
all<-inner_join(all,city, by = "City", suffix = c("_all","_city"))
all2<-inner_join(all,hol, by = "date", suffix = c("_all","_hol"))
names(all)[names(all) == "Income..USD.Month."] <- "income"
head(all)
## Transaction.ID Customer.ID Payment_Mode Gender Age income Company City
## 1 10000011 29290 Card Male 28 10813 Pink Cab ATLANTA GA
## 2 10000012 27703 Card Male 27 9237 Pink Cab ATLANTA GA
## 3 10000013 28712 Cash Male 53 11242 Pink Cab ATLANTA GA
## 4 10000014 28020 Cash Male 23 23327 Pink Cab ATLANTA GA
## 5 10000015 27182 Card Male 33 8536 Pink Cab ATLANTA GA
## 6 10000016 27318 Cash Male 25 13984 Pink Cab ATLANTA GA
## KM.Travelled Price.Charged Cost.of.Trip profit date Population Users
## 1 30.45 370.95 313.635 57.315 2016-01-08 814885 24701
## 2 28.62 358.52 334.854 23.666 2016-01-06 814885 24701
## 3 9.04 125.20 97.632 27.568 2016-01-02 814885 24701
## 4 33.17 377.40 351.602 25.798 2016-01-07 814885 24701
## 5 8.73 114.62 97.776 16.844 2016-01-03 814885 24701
## 6 6.06 72.43 63.024 9.406 2016-01-07 814885 24701
## lat long
## 1 33.64044 -84.42694
## 2 33.64044 -84.42694
## 3 33.64044 -84.42694
## 4 33.64044 -84.42694
## 5 33.64044 -84.42694
## 6 33.64044 -84.42694
tail(all)
## Transaction.ID Customer.ID Payment_Mode Gender Age income Company
## 359387 10440100 51852 Card Male 46 6165 Yellow Cab
## 359388 10440101 52392 Cash Male 24 15651 Yellow Cab
## 359389 10440104 53286 Cash Male 32 6528 Yellow Cab
## 359390 10440105 52265 Cash Male 56 7966 Yellow Cab
## 359391 10440106 52175 Card Male 32 6423 Yellow Cab
## 359392 10440107 52917 Card Male 20 11284 Yellow Cab
## City KM.Travelled Price.Charged Cost.of.Trip profit
## 359387 WASHINGTON DC 28.71 452.19 351.4104 100.7796
## 359388 WASHINGTON DC 4.80 69.24 63.3600 5.8800
## 359389 WASHINGTON DC 8.40 113.75 106.8480 6.9020
## 359390 WASHINGTON DC 27.75 437.07 349.6500 87.4200
## 359391 WASHINGTON DC 8.80 146.19 114.0480 32.1420
## 359392 WASHINGTON DC 12.76 191.58 177.6192 13.9608
## date Population Users lat long
## 359387 2018-01-07 418859 127001 38.8951 -77.0364
## 359388 2018-01-08 418859 127001 38.8951 -77.0364
## 359389 2018-01-04 418859 127001 38.8951 -77.0364
## 359390 2018-01-05 418859 127001 38.8951 -77.0364
## 359391 2018-01-05 418859 127001 38.8951 -77.0364
## 359392 2018-01-02 418859 127001 38.8951 -77.0364
dim(all)
## [1] 359392 17
range(all$date)
## [1] "2016-01-02" "2018-12-31"
summary(all)
## Transaction.ID Customer.ID Payment_Mode Gender
## Min. :10000011 Min. : 1 Card:215504 Female:153480
## 1st Qu.:10110810 1st Qu.: 2705 Cash:143888 Male :205912
## Median :10221036 Median : 7459
## Mean :10220761 Mean :19192
## 3rd Qu.:10330937 3rd Qu.:36078
## Max. :10440107 Max. :60000
## Age income Company City
## Min. :18.00 Min. : 2000 Pink Cab : 84711 Length:359392
## 1st Qu.:25.00 1st Qu.: 8424 Yellow Cab:274681 Class :character
## Median :33.00 Median :14685 Mode :character
## Mean :35.34 Mean :15049
## 3rd Qu.:42.00 3rd Qu.:21035
## Max. :65.00 Max. :35000
## KM.Travelled Price.Charged Cost.of.Trip profit
## Min. : 1.90 Min. : 15.6 Min. : 19.0 Min. :-220.06
## 1st Qu.:12.00 1st Qu.: 206.4 1st Qu.:151.2 1st Qu.: 28.01
## Median :22.44 Median : 386.4 Median :282.5 Median : 81.96
## Mean :22.57 Mean : 423.4 Mean :286.2 Mean : 137.25
## 3rd Qu.:32.96 3rd Qu.: 583.7 3rd Qu.:413.7 3rd Qu.: 190.03
## Max. :48.00 Max. :2048.0 Max. :691.2 Max. :1463.97
## date Population Users lat
## Min. :2016-01-02 Min. : 248968 Min. : 927 Min. :25.79
## 1st Qu.:2016-11-23 1st Qu.: 671238 1st Qu.: 80021 1st Qu.:33.94
## Median :2017-09-10 Median :1595037 Median :144132 Median :40.64
## Mean :2017-08-17 Mean :3132198 Mean :158296 Mean :38.55
## 3rd Qu.:2018-05-12 3rd Qu.:8405837 3rd Qu.:302149 3rd Qu.:41.79
## Max. :2018-12-31 Max. :8405837 Max. :302149 Max. :47.45
## long
## Min. :-122.31
## 1st Qu.:-117.19
## Median : -80.29
## Mean : -89.61
## 3rd Qu.: -73.78
## Max. : -71.01
head(all2)
## Transaction.ID Customer.ID Payment_Mode Gender Age Income..USD.Month.
## 1 10001395 29150 Card Male 18 13969
## 2 10001397 35258 Card Male 33 19129
## 3 10001410 57152 Card Male 30 4444
## 4 10001417 3242 Cash Male 38 23148
## 5 10001422 5299 Card Male 64 24313
## 6 10001425 3968 Cash Male 30 21443
## Company City KM.Travelled Price.Charged Cost.of.Trip profit
## 1 Pink Cab ATLANTA GA 8.12 96.30 82.012 14.288
## 2 Pink Cab AUSTIN TX 31.92 573.32 363.888 209.432
## 3 Pink Cab BOSTON MA 4.52 42.10 51.076 -8.976
## 4 Pink Cab CHICAGO IL 12.35 165.80 135.850 29.950
## 5 Pink Cab CHICAGO IL 44.80 643.13 497.280 145.850
## 6 Pink Cab CHICAGO IL 31.08 442.04 348.096 93.944
## date Population Users lat long X1 New.Year.Day
## 1 2016-01-18 814885 24701 33.64044 -84.42694 42 Martin Luther King Jr. Day
## 2 2016-01-18 698371 14978 30.19453 -97.66987 42 Martin Luther King Jr. Day
## 3 2016-01-18 248968 80021 42.36435 -71.00518 42 Martin Luther King Jr. Day
## 4 2016-01-18 1955130 164468 41.78598 -87.75242 42 Martin Luther King Jr. Day
## 5 2016-01-18 1955130 164468 41.78598 -87.75242 42 Martin Luther King Jr. Day
## 6 2016-01-18 1955130 164468 41.78598 -87.75242 42 Martin Luther King Jr. Day
tail(all2)
## Transaction.ID Customer.ID Payment_Mode Gender Age Income..USD.Month.
## 7572 10439483 19735 Card Female 20 21556
## 7573 10439775 37356 Card Male 63 24505
## 7574 10439812 13687 Cash Female 34 18921
## 7575 10439817 14876 Card Female 50 7955
## 7576 10439876 53382 Card Female 64 15372
## 7577 10439881 53705 Card Male 40 20936
## Company City KM.Travelled Price.Charged Cost.of.Trip profit
## 7572 Yellow Cab SAN DIEGO CA 23.52 377.44 318.9312 58.5088
## 7573 Yellow Cab SEATTLE WA 15.36 236.13 211.9680 24.1620
## 7574 Yellow Cab SILICON VALLEY 7.42 134.12 105.9576 28.1624
## 7575 Yellow Cab SILICON VALLEY 28.84 584.31 411.8352 172.4748
## 7576 Yellow Cab WASHINGTON DC 10.56 139.90 131.7888 8.1112
## 7577 Yellow Cab WASHINGTON DC 23.10 316.22 279.9720 36.2480
## date Population Users lat long X1 New.Year.Day
## 7572 2018-01-01 959307 69995 32.73356 -117.1897 61 New Year Day
## 7573 2018-01-01 671238 25063 47.44898 -122.3093 61 New Year Day
## 7574 2018-01-01 1177609 27247 37.37000 -122.0400 61 New Year Day
## 7575 2018-01-01 1177609 27247 37.37000 -122.0400 61 New Year Day
## 7576 2018-01-01 418859 127001 38.89510 -77.0364 61 New Year Day
## 7577 2018-01-01 418859 127001 38.89510 -77.0364 61 New Year Day
dim(all2)
## [1] 7577 19
summary(all2)
## Transaction.ID Customer.ID Payment_Mode Gender Age
## Min. :10001395 Min. : 1 Card:4515 Female:3217 Min. :18.00
## 1st Qu.:10128166 1st Qu.: 2681 Cash:3062 Male :4360 1st Qu.:25.00
## Median :10253748 Median : 7514 Median :33.00
## Mean :10230055 Mean :19159 Mean :35.42
## 3rd Qu.:10302493 3rd Qu.:35724 3rd Qu.:42.00
## Max. :10439881 Max. :60000 Max. :65.00
## Income..USD.Month. Company City KM.Travelled
## Min. : 2007 Pink Cab :1915 Length:7577 Min. : 1.90
## 1st Qu.: 8443 Yellow Cab:5662 Class :character 1st Qu.:12.00
## Median :14635 Mode :character Median :22.44
## Mean :15033 Mean :22.59
## 3rd Qu.:20986 3rd Qu.:32.98
## Max. :34968 Max. :48.00
## Price.Charged Cost.of.Trip profit date
## Min. : 15.6 Min. : 19.59 Min. :-133.21 Min. :2016-01-18
## 1st Qu.: 201.1 1st Qu.:150.42 1st Qu.: 24.84 1st Qu.:2016-12-25
## Median : 377.4 Median :283.18 Median : 75.68 Median :2017-11-10
## Mean : 412.1 Mean :285.57 Mean : 126.53 Mean :2017-09-03
## 3rd Qu.: 569.4 3rd Qu.:414.12 3rd Qu.: 174.68 3rd Qu.:2018-01-15
## Max. :1681.9 Max. :679.68 Max. :1058.11 Max. :2018-12-25
## Population Users lat long
## Min. : 248968 Min. : 927 Min. :25.79 Min. :-122.31
## 1st Qu.: 698371 1st Qu.: 80021 1st Qu.:33.94 1st Qu.:-117.19
## Median :1595037 Median :144132 Median :40.64 Median : -80.29
## Mean :3148905 Mean :157696 Mean :38.43 Mean : -89.63
## 3rd Qu.:8405837 3rd Qu.:302149 3rd Qu.:41.79 3rd Qu.: -73.78
## Max. :8405837 Max. :302149 Max. :47.45 Max. : -71.01
## X1 New.Year.Day
## Min. :42.00 Length:7577
## 1st Qu.:50.00 Class :character
## Median :58.00 Mode :character
## Mean :56.74
## 3rd Qu.:62.00
## Max. :70.00
“Duplication” means that if we have repeated data in our dataset. This could be due to things like data entry errors or data collection methods.
# get the row numbers of duplicated rows
duplicated_rows <- data_frame(duplicated = duplicated(all), row = 1:nrow(all)) %>%
filter(duplicated == T)
count(duplicated_rows)
## # A tibble: 1 x 1
## n
## <int>
## 1 0
# Plot duplicated rows as black lines
ggplot(duplicated_rows, aes(xintercept = row)) +
geom_vline(aes(xintercept = row)) + # plot a black line for each duplicated row
ggtitle("Indexes of duplicated rows") + # add a title
coord_flip() + scale_x_reverse() #flip x & y axis and reverse the x axis
library(funModeling)
numeric<-all[,c(5,6,9:12)] #extracting numeric variables
plot_num(numeric)
profiling_num(numeric)
## variable mean std_dev variation_coef p_01 p_05
## 1 Age 35.33670 12.59423 0.3564066 18.00000 19.0000
## 2 income 15048.82294 7969.40948 0.5295703 2233.00000 3245.0000
## 3 KM.Travelled 22.56725 12.23353 0.5420919 2.10000 3.5700
## 4 Price.Charged 423.44331 274.37891 0.6479708 33.59000 63.4200
## 5 Cost.of.Trip 286.19011 157.99366 0.5520584 26.37079 46.2240
## 6 profit 137.25320 160.31184 1.1680008 -39.69309 -5.0829
## p_25 p_50 p_75 p_95 p_99 skewness kurtosis
## 1 25.0000 33.000 42.0000 61.0000 64.0000 0.68533592 2.541593
## 2 8424.0000 14685.000 21035.0000 29659.0000 33968.0000 0.30956095 2.339507
## 3 12.0000 22.440 32.9600 42.0000 45.6300 0.05577867 1.873124
## 4 206.4375 386.360 583.6600 944.8900 1230.1090 0.87375784 3.747608
## 5 151.2000 282.480 413.6832 544.3632 610.5600 0.13795749 1.987765
## 6 28.0120 81.962 190.0300 478.5642 713.5563 1.89989600 7.376835
## iqr range_98 range_80
## 1 17.0000 [18, 64] [21, 56]
## 2 12611.0000 [2233, 33968] [4525, 24793]
## 3 20.9600 [2.1, 45.63] [5.8, 39.2]
## 4 377.2225 [33.59, 1230.109] [99.231, 792.79]
## 5 262.4832 [26.370792, 610.56] [72.576, 502.500600000003]
## 6 162.0180 [-39.693088, 713.556288] [5.207, 358.98468]
freq(all2[,-c(1,2)])
## Payment_Mode frequency percentage cumulative_perc
## 1 Card 4515 59.59 59.59
## 2 Cash 3062 40.41 100.00
## Gender frequency percentage cumulative_perc
## 1 Male 4360 57.54 57.54
## 2 Female 3217 42.46 100.00
## Company frequency percentage cumulative_perc
## 1 Yellow Cab 5662 74.73 74.73
## 2 Pink Cab 1915 25.27 100.00
## City frequency percentage cumulative_perc
## 1 NEW YORK NY 2128 28.08 28.08
## 2 CHICAGO IL 1142 15.07 43.15
## 3 LOS ANGELES CA 1010 13.33 56.48
## 4 WASHINGTON DC 868 11.46 67.94
## 5 BOSTON MA 641 8.46 76.40
## 6 SAN DIEGO CA 477 6.30 82.70
## 7 SILICON VALLEY 172 2.27 84.97
## 8 ATLANTA GA 165 2.18 87.15
## 9 MIAMI FL 164 2.16 89.31
## 10 SEATTLE WA 161 2.12 91.43
## 11 DALLAS TX 145 1.91 93.34
## 12 AUSTIN TX 121 1.60 94.94
## 13 ORANGE COUNTY 80 1.06 96.00
## 14 NASHVILLE TN 76 1.00 97.00
## 15 DENVER CO 64 0.84 97.84
## 16 SACRAMENTO CA 54 0.71 98.55
## 17 TUCSON AZ 43 0.57 99.12
## 18 PHOENIX AZ 33 0.44 99.56
## 19 PITTSBURGH PA 33 0.44 100.00
## New.Year.Day frequency percentage cumulative_perc
## 1 Veterans Day 1565 20.65 20.65
## 2 Christmas Day 1331 17.57 38.22
## 3 Thanksgiving Day 1166 15.39 53.61
## 4 Columbus Day 673 8.88 62.49
## 5 Labor Day 666 8.79 71.28
## 6 New Year Day 610 8.05 79.33
## 7 Memorial Day 449 5.93 85.26
## 8 Independence Day 447 5.90 91.16
## 9 Presidents Day (Washingtons Birthday) 345 4.55 95.71
## 10 Martin Luther King Jr. Day 325 4.29 100.00
## [1] "Variables processed: Payment_Mode, Gender, Company, City, New.Year.Day"
library(plotly)
fig <- plot_ly(all, lat = ~lat, lon = ~long,
marker = list(color = "fuchsia"),
type = 'scattermapbox',
hovertext = all$City)
fig <- fig %>%
layout(all,
mapbox = list(
style = 'open-street-map',
zoom =2.5,
center = list(lon = -88, lat = 34)))
fig