library(ggplot2)
library(tidyverse)
library(knitr)
library(flexdashboard)
library(readr)
# Import the ride-level export from the author's local Documents folder and
# list the column names that were read in.
BOZA_RIDES <- read_csv(
  file = "~/Documents/BOZA (Work Files)/BOZA (Work)/Raw Data/Cleaned Data/BOZA_RIDES.csv"
)
names(BOZA_RIDES)
## [1] "Date" "Request_Id" "trip_start_time"
## [4] "trip_end_time" "User_Name" "Driver_Name"
## [7] "Trip_Status" "Paid_Status" "Payment_Option"
## [10] "Vehicle_Type" "Ride_Type" "Trip_Time_Minutes"
## [13] "Trip_Distance_Km" "Admin_Commision_ZAR" "Total_Amount_ZAR"
## [16] "Driver_Commission_ZAR"
Date: Represents the date of the trip.
Request_Id: Unique identifier for each ride request.
trip_start_time: The time stamp when the trip started.
trip_end_time: The time stamp when the trip ended.
User_Name: The name or identifier of the user who requested the ride.
Driver_Name: The name or identifier of the driver who completed the ride.
Trip_Status: Indicates the status of the trip, such as completed, canceled, or ongoing.
Paid_Status: Represents the payment status of the trip, whether it has been paid or not.
Payment_Option: The chosen payment method for the trip.
Vehicle_Type: Specifies the type of vehicle used for the ride.
Ride_Type: Indicates the type of ride, such as regular-instant, or regular-scheduled.
Trip_Time_Minutes: Represents the duration of the trip in minutes.
Trip_Distance_Km: The distance traveled during the trip in kilometers.
Admin_Commision_ZAR (spelled this way in the dataset): The commission amount charged by the administration for the ride in ZAR (South African Rand).
Total_Amount_ZAR: The total amount paid for the trip in ZAR.
Driver_Commission_ZAR: The commission amount earned by the driver for the ride in ZAR.
library(dplyr)
# Data processing and cleaning: coerce every column of the raw export to an
# analysis-ready type.
#   * Date is parsed from month/day/year text.
#   * Trip times keep only a trailing "hh:mm AM/PM" stamp when one is present.
#   * "-" placeholders and missing entries become real NA values (not the
#     string "NA"), so is.na() and summaries treat them as missing.
#   * Status / option / type columns become factors with a fixed level order;
#     any value outside the listed levels becomes NA.
#   * Unit text (" Mins", " Km", "R ") is stripped before numeric conversion.
BOZA_RIDES <- BOZA_RIDES %>%
  mutate(
    Date = as.Date(Date, format = "%m/%d/%Y"),
    Request_Id = as.character(Request_Id),
    trip_start_time = ifelse(
      trip_start_time == "-" | is.na(trip_start_time),
      NA_character_,
      gsub(".*?(\\d{2}:\\d{2} [AP]M)$", "\\1", trip_start_time)
    ),
    trip_end_time = ifelse(
      trip_end_time == "-" | is.na(trip_end_time),
      NA_character_,
      gsub(".*?(\\d{2}:\\d{2} [AP]M)$", "\\1", trip_end_time)
    ),
    User_Name = as.character(User_Name),
    # Upper-case driver names so differently-cased spellings compare equal.
    Driver_Name = ifelse(
      Driver_Name == "-" | is.na(Driver_Name),
      NA_character_,
      toupper(Driver_Name)
    ),
    Trip_Status = factor(
      Trip_Status,
      levels = c("Cancelled", "Completed", "Not Started")
    ),
    Paid_Status = factor(Paid_Status, levels = c("Not Paid", "Paid")),
    Payment_Option = factor(Payment_Option, levels = c("Cash", "Card")),
    Vehicle_Type = factor(
      Vehicle_Type,
      levels = c("XL", "Comfort", "Pickup", "Regular")
    ),
    Ride_Type = factor(
      Ride_Type,
      levels = c("Regular-Instant", "Regular-Scheduled")
    ),
    Trip_Time_Minutes = as.numeric(gsub(" Mins", "", Trip_Time_Minutes)),
    Trip_Distance_Km = as.numeric(gsub(" Km", "", Trip_Distance_Km)),
    Driver_Commission_ZAR = as.numeric(gsub("R ", "", Driver_Commission_ZAR)),
    Admin_Commision_ZAR = as.numeric(gsub("R ", "", Admin_Commision_ZAR)),
    Total_Amount_ZAR = as.numeric(gsub("R ", "", Total_Amount_ZAR))
  )
The dataset “BOZA_RIDES” has undergone pre-processing, preparation and cleaning.
A cursory examination of the dataset reveals that there are 62 observations and 16 variables, which will be utilized in later analysis.
We now take a look at the dataset more closely, including the characteristics of its variables, to comprehend its nature and derive useful insights.
# Preview the first six rows of the cleaned data.
head(x = BOZA_RIDES, n = 6L)
## # A tibble: 6 × 16
## Date Request_Id trip_start_time trip_end_time User_Name Driver_Name
## <date> <chr> <chr> <chr> <chr> <chr>
## 1 2023-07-28 REQ_000536 17:24:00 17:58:00 Flubert Taga WANDILE
## 2 2023-07-21 REQ_000458 10:10:00 10:23:00 Noluthando WANDILE
## 3 2023-07-27 REQ_000521 18:08:00 18:56:00 Flubert Taga WANDILE
## 4 2023-07-20 REQ_000268 12:23:00 12:43:00 Sharon Monama BETHUEL FRE…
## 5 2023-12-23 REQ_002722 17:28:00 18:05:00 Flubert Taga BRIGHTON
## 6 2024-04-05 REQ_004785 07:26:00 07:28:00 Flubert Taga WANDILE
## # ℹ 10 more variables: Trip_Status <fct>, Paid_Status <fct>,
## # Payment_Option <fct>, Vehicle_Type <fct>, Ride_Type <fct>,
## # Trip_Time_Minutes <dbl>, Trip_Distance_Km <dbl>, Admin_Commision_ZAR <dbl>,
## # Total_Amount_ZAR <dbl>, Driver_Commission_ZAR <dbl>
# Show each column's type together with its first few values.
str(object = BOZA_RIDES)
## tibble [62 × 16] (S3: tbl_df/tbl/data.frame)
## $ Date : Date[1:62], format: "2023-07-28" "2023-07-21" ...
## $ Request_Id : chr [1:62] "REQ_000536" "REQ_000458" "REQ_000521" "REQ_000268" ...
## $ trip_start_time : chr [1:62] "17:24:00" "10:10:00" "18:08:00" "12:23:00" ...
## $ trip_end_time : chr [1:62] "17:58:00" "10:23:00" "18:56:00" "12:43:00" ...
## $ User_Name : chr [1:62] "Flubert Taga" "Noluthando" "Flubert Taga" "Sharon Monama" ...
## $ Driver_Name : chr [1:62] "WANDILE" "WANDILE" "WANDILE" "BETHUEL FREEDOM" ...
## $ Trip_Status : Factor w/ 3 levels "Cancelled","Completed",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Paid_Status : Factor w/ 2 levels "Not Paid","Paid": 2 2 2 2 2 2 2 2 2 2 ...
## $ Payment_Option : Factor w/ 2 levels "Cash","Card": 2 2 2 2 2 2 2 2 2 2 ...
## $ Vehicle_Type : Factor w/ 4 levels "XL","Comfort",..: 2 2 2 3 1 2 3 2 2 2 ...
## $ Ride_Type : Factor w/ 2 levels "Regular-Instant",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Trip_Time_Minutes : num [1:62] 34 12 47 19 36 2 46 32 43 5 ...
## $ Trip_Distance_Km : num [1:62] 23.22 6.27 23.12 9.76 50.61 ...
## $ Admin_Commision_ZAR : num [1:62] 32.1 10.4 35 11.1 79.4 ...
## $ Total_Amount_ZAR : num [1:62] 246 80 269 86 609 38 207 253 257 44 ...
## $ Driver_Commission_ZAR: num [1:62] 213.7 69.2 233.1 74 529.3 ...
After data processing and preparation, we observe that:
5 of the 16 variables are numeric (quantitative) variables. These are Trip_Time_Minutes, Trip_Distance_Km, Admin_Commision_ZAR, Total_Amount_ZAR, and Driver_Commission_ZAR.
5 are factor (categorical) variables, with varying numbers of levels. These are Trip_Status, Paid_Status, Payment_Option, Vehicle_Type, and Ride_Type.
5 are character variables: Request_Id, trip_start_time, trip_end_time, User_Name, and Driver_Name.
1 variable is a Date variable.
# Keep every column except the rider and driver name columns, then preview
# the reduced table.
BOZA_RIDES <- BOZA_RIDES[
  ,
  is.na(match(names(BOZA_RIDES), c("User_Name", "Driver_Name")))
]
head(x = BOZA_RIDES, n = 6L)
## # A tibble: 6 × 14
## Date Request_Id trip_start_time trip_end_time Trip_Status Paid_Status
## <date> <chr> <chr> <chr> <fct> <fct>
## 1 2023-07-28 REQ_000536 17:24:00 17:58:00 Completed Paid
## 2 2023-07-21 REQ_000458 10:10:00 10:23:00 Completed Paid
## 3 2023-07-27 REQ_000521 18:08:00 18:56:00 Completed Paid
## 4 2023-07-20 REQ_000268 12:23:00 12:43:00 Completed Paid
## 5 2023-12-23 REQ_002722 17:28:00 18:05:00 Completed Paid
## 6 2024-04-05 REQ_004785 07:26:00 07:28:00 Completed Paid
## # ℹ 8 more variables: Payment_Option <fct>, Vehicle_Type <fct>,
## # Ride_Type <fct>, Trip_Time_Minutes <dbl>, Trip_Distance_Km <dbl>,
## # Admin_Commision_ZAR <dbl>, Total_Amount_ZAR <dbl>,
## # Driver_Commission_ZAR <dbl>
For ease of analysis, the user name and driver name columns have been dropped from the dataset.
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
Based on the histograms provided, we can see that drivers earn an average commission of around R200, as this is where the graph’s concentration lies. Minimum commission is R31. Maximum commission is R551.70.
The average or mean total amount per trip is R228.10 Minimum total amount is R36.00 Maximum total amount is R635.00
The admin commission is approximately 13% of the total amount made on each trip (equivalently, about 15% of the driver’s commission). The balance goes to the driver.
Trip distance (in kilometers): Based on the visualization generated, it appears that the majority of trips in the dataset fall within the 20-25 km range. This suggests that most customers book rides within this distance range. There are exceptions, of course, with a maximum trip of over 60 kilometers.
POINTS TO NOTE: Pricing Optimization: By analyzing this distribution of trip distances, we can determine if certain distance ranges are more popular among customers. If so, we can use this information to adjust pricing strategies, such as implementing dynamic pricing or surge pricing during peak demand periods or for longer trips. This can help maximize revenue.
Route Planning: Our understanding of the distribution of trip distances can also assist in optimizing route planning. For instance, we can identify areas with high demand for trips within specific distance ranges and allocate more vehicles in those areas. By strategically positioning vehicles closer to areas with frequent trips, we can reduce wait times and increase the number of completed trips, thereby generating more revenue.
Marketing and Promotions: Analyzing the distance distribution can guide marketing and promotional efforts. We may target advertising campaigns towards customers who frequently take trips within specific distance ranges.
TRIP TIME IN MINUTES. Based on the histogram analysis, we can see that the majority of trip times in the dataset are concentrated around the 25-minute mark, with the highest frequency of trips occurring in the 25-30 minute range. This suggests that a significant number of trips have durations within this range. There are fewer trips with durations in the 0-15 minute range and the 100-115 minute range. This indicates that trips with very short or very long durations are less common in our dataset.
library(ggplot2)
# Histograms for quantitative variables
# Builds one histogram per measurable column, each overlaid with a kernel
# density curve drawn on the same count axis, and arranges them on one page.
Measurable_Variables <- c("Driver_Commission_ZAR", "Total_Amount_ZAR", "Admin_Commision_ZAR", "Trip_Distance_Km", "Trip_Time_Minutes")
histograms <- lapply(Measurable_Variables, function(var) {
  # Fix the bin width up front (30 equal-width bins across the observed
  # range) so the density curve can be rescaled to counts per bin.
  bin_width <- diff(range(BOZA_RIDES[[var]], na.rm = TRUE)) / 30
  if (!is.finite(bin_width) || bin_width <= 0) {
    # A constant or all-missing column has no range; fall back to unit bins.
    bin_width <- 1
  }
  ggplot(BOZA_RIDES, aes(x = .data[[var]])) +
    # geom_histogram() bins the continuous values; geom_bar(stat = "count")
    # would instead draw one bar per distinct value.
    geom_histogram(binwidth = bin_width, fill = "skyblue", color = "black") +
    labs(title = paste("Histogram of", var), x = var, y = "Frequency") +
    # stat_density's `count` is density * n; multiplying by the bin width
    # puts the curve on the same scale as the histogram bar heights.
    geom_density(
      aes(y = after_stat(count * bin_width)),
      color = "red", linetype = "dashed"
    )
})
# Combine the histograms on one page. grid.arrange() draws the page itself and
# returns the layout, so printing the result would only dump its text summary.
multiplot <- do.call(gridExtra::grid.arrange, c(histograms, ncol = 2))
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
## 5 5 (3-3,1-1) arrange gtable[layout]
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Compute and print kernel density for each variable
# Each plot is printed on its own and also stored (keyed by column name) so
# that all five can be arranged together on one page afterwards.
Measurable_Variables <- c("Driver_Commission_ZAR", "Total_Amount_ZAR", "Admin_Commision_ZAR", "Trip_Distance_Km", "Trip_Time_Minutes")
density_plots <- list() # Create an empty list to store the density plots
for (var in Measurable_Variables) {
  BOZA_RIDES_VAR <- BOZA_RIDES[[var]]
  # Map x to the column stored inside the plot's own data frame. Aesthetics
  # are evaluated when the plot is drawn, so mapping to the loop variable
  # BOZA_RIDES_VAR would make every stored plot show the last column once the
  # loop has finished.
  density_plot <- ggplot(data.frame(x = BOZA_RIDES_VAR), aes(x = x)) +
    geom_density(color = "blue", fill = "skyblue", alpha = 0.5) +
    labs(title = paste("Kernel Density of", var), x = var, y = "Density")
  density_plots[[var]] <- density_plot # Store the density plot in the list
  print(density_plot)
}
# grid.arrange() draws the combined page itself and returns the layout, so
# printing the result would only dump its text summary.
multiplot <- do.call(grid.arrange, c(density_plots, ncol = 2)) # Pass the list of density plots to grid.arrange
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## Driver_Commission_ZAR 1 (1-1,1-1) arrange gtable[layout]
## Total_Amount_ZAR 2 (1-1,2-2) arrange gtable[layout]
## Admin_Commision_ZAR 3 (2-2,1-1) arrange gtable[layout]
## Trip_Distance_Km 4 (2-2,2-2) arrange gtable[layout]
## Trip_Time_Minutes 5 (3-3,1-1) arrange gtable[layout]
The distributions of our variables are displayed in the kernel density plots above. The peak of each distribution represents the area of (average) concentration of the related activity. For instance, average driver commission is slightly above R200 (in South African rand).
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
## TableGrob (4 x 2) "arrange": 7 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (4-4,1-1) arrange gtable[layout]
## 6 6 (4-4,2-2) arrange gtable[layout]
## 7 7 (1-1,1-2) arrange text[GRID.text.1300]
# Per-column summary of the cleaned dataset.
summary(object = BOZA_RIDES)
## Date Request_Id trip_start_time trip_end_time
## Min. :2023-06-23 Length:62 Length:62 Length:62
## 1st Qu.:2023-07-20 Class :character Class :character Class :character
## Median :2023-07-28 Mode :character Mode :character Mode :character
## Mean :2023-09-08
## 3rd Qu.:2023-09-09
## Max. :2024-04-05
## Trip_Status Paid_Status Payment_Option Vehicle_Type
## Cancelled : 0 Not Paid: 0 Cash: 0 XL : 2
## Completed :62 Paid :62 Card:62 Comfort:40
## Not Started: 0 Pickup : 8
## Regular:12
##
##
## Ride_Type Trip_Time_Minutes Trip_Distance_Km Admin_Commision_ZAR
## Regular-Instant :62 Min. : 2.00 Min. : 0.00 Min. : 4.65
## Regular-Scheduled: 0 1st Qu.:28.25 1st Qu.:22.56 1st Qu.:25.93
## Median :34.00 Median :23.08 Median :30.70
## Mean :35.10 Mean :21.83 Mean :30.00
## 3rd Qu.:45.75 3rd Qu.:24.06 3rd Qu.:34.23
## Max. :99.00 Max. :63.14 Max. :82.76
## Total_Amount_ZAR Driver_Commission_ZAR
## Min. : 36.0 Min. : 31.0
## 1st Qu.:194.8 1st Qu.:172.9
## Median :235.0 Median :203.6
## Mean :228.1 Mean :199.4
## 3rd Qu.:256.5 3rd Qu.:228.2
## Max. :635.0 Max. :551.7
Based on the summary provided, we can make the following observations:
Date: The dataset contains rides that span from June 23, 2023, to April 5, 2024. This indicates that the rides were recorded over a period of several months.
Trip_Status: The Trip_Status variable indicates the status of each trip. All trips in the dataset are marked as “Completed,” indicating that none of them were canceled or in progress.
Paid_Status: The Paid_Status variable indicates the payment status of each trip. All trips in the dataset are marked as “Paid,” indicating that all trips have been paid for.
Payment_Option: All trips have the payment method recorded as “Card,” indicating most riders prefer the card payment option as compared to cash or wallets.
Vehicle_Type: The Vehicle_Type variable indicates the type of vehicle used for the trips. The most common vehicle type is “Comfort,” which was used for 40 out of the 62 trips. Additionally, the extra-large (XL) vehicle category is the least used.
Ride_Type: The Ride_Type variable indicates the type of ride. All 62 trips are classified as “Regular-Instant,” indicating that there were no scheduled or shared rides in the dataset.
Trip_Time_Minutes: The Trip_Time_Minutes variable provides information about the duration of each trip. The minimum trip duration is 2 minutes, while the maximum is 99 minutes. The mean or average trip duration is approximately 35.1 minutes.
Trip_Distance_Km: The Trip_Distance_Km variable provides information about the distance traveled during each trip. The minimum distance is 0 km, while the maximum is 63.14 km. The mean or average distance traveled is approximately 21.83 km.
Admin_Commission_ZAR, Total_Amount_ZAR, and Driver_Commission_ZAR: These variables represent the commission and total amount earned for each trip. The minimum and maximum values indicate the range of commission and earnings in South African Rand (ZAR). For example, the minimum admin commission is 4.65 ZAR, while the maximum is 82.76 ZAR. Similarly, the minimum total amount earned is 36.0 ZAR, while the maximum is 635.0 ZAR.
## Driver_Commission_ZAR Total_Amount_ZAR
## Driver_Commission_ZAR 1.0000000 0.9876642
## Total_Amount_ZAR 0.9876642 1.0000000
## Admin_Commision_ZAR 0.9997283 0.9875567
## Trip_Distance_Km 0.9348813 0.9306511
## Trip_Time_Minutes 0.7548161 0.7245643
## Admin_Commision_ZAR Trip_Distance_Km Trip_Time_Minutes
## Driver_Commission_ZAR 0.9997283 0.9348813 0.7548161
## Total_Amount_ZAR 0.9875567 0.9306511 0.7245643
## Admin_Commision_ZAR 1.0000000 0.9348032 0.7563925
## Trip_Distance_Km 0.9348032 1.0000000 0.7638477
## Trip_Time_Minutes 0.7563925 0.7638477 1.0000000
Based on the correlation matrix provided above, we can interpret the correlations between the variables in the BOZA_RIDES dataset as follows:
Driver Commission and Total Amount: There is a strong positive correlation of approximately 0.99 (or 99%) between the Driver Commission and Total Amount. This suggests that the driver’s commission is directly related to the total amount earned from the rides. As the total amount increases, the driver’s commission also tends to increase. This insight can be useful for evaluating our drivers’ incentives, commission structures, and optimizing revenue-sharing models. For instance, we can adjust driver commission structures (or give higher incentives) based on Total Amount a driver can generate.
Admin Commission and Driver Commission/Total Amount: There is a similarly strong positive correlation of approximately 0.99 (or 99%) between the Admin Commission and both the Driver Commission and Total Amount. We observe that the admin commission is closely related to the driver commission and the overall revenue generated from the ordered rides. This can help BOZA rides monitor and analyze the Admin Commission in relation to the Driver Commission and Total Amount of commissions, to ensure fair and appropriate distribution and effective cost management.
Trip Distance and Trip Time: Both variables show positive relationships with the earnings variables. Trip Distance has a strong correlation of approximately 0.93 with Driver Commission, Total Amount, and Admin Commission. This suggests that longer trips tend to result in higher earnings and commissions. Trip Time has a more moderate correlation of approximately 0.75 with these variables. This can inform decisions on pricing strategies, fare calculations, and optimizing driver allocation during peak periods.
## [1] 0.988
The scatter plot above indicates a strong positive linear relationship between the Total Amount (ZAR) and Driver Commission (ZAR) variables. This means that as the Driver Commission (ZAR) increases, there tends to be a corresponding increase in the Total Amount (ZAR) and vice-versa. The relationship appears to be linear, implying that the two variables are positively associated. The correlation coefficient of 0.988 (approximately 98.8%) further supports the observation of a strong positive linear relationship between the variables. This measures the strength and direction of the linear relationship.
## [1] 0.988
Again, this scatter plot indicates a strong positive linear relationship between the Total Amount (ZAR) and Admin Commission (ZAR), i.e. as the Admin Commission (ZAR) increases, there tends to be a corresponding increase in the Total Amount (ZAR) and vice-versa. This linear relationship implies that the two variables are positively associated. Similarly, a correlation coefficient of r = 0.988 (coefficient of determination r² ≈ 0.976) supports the observation of a strong positive linear relationship between the variables; r measures the strength and direction of the linear relationship.
## [1] 0.931
The scatter plot between the Total Amount (ZAR) and Trip_Distance_Km
also shows a strong positive linear relationship. A correlation
coefficient of r = 0.931 (coefficient of determination r² ≈ 0.867)
supports the observation of a strong positive linear relationship
between these variables, implying that the two variables are strongly
(and positively) associated.
## [1] 0.725
Lastly, the scatter plot between the Total Amount (ZAR) and Trip_Time_Minutes shows a relatively strong positive linear relationship. A correlation coefficient of r = 0.725 (coefficient of determination r² ≈ 0.526) supports the observation of a positive linear relationship between these variables. This implies that the two variables are positively associated.
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
## Driver_Commission_ZAR Admin_Commision_ZAR Trip_Distance_Km
## 1863.475420 1866.230124 8.465926
## Trip_Time_Minutes
## 2.512186
## [1] 1
# Summary of the response variable (total amount per trip).
summary(BOZA_RIDES[["Total_Amount_ZAR"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 36.0 194.8 235.0 228.1 256.5 635.0
The Total_Amount_ZAR will be our response (or dependent variable ) in a quest to optimize “BOZA RIDES” travel activities. The average ride amount sits at R228.10 as of now.
# One-hot encode Vehicle_Type. Removing the intercept from the formula
# (0 + ...) yields one 0/1 indicator column per factor level.
dummy_matrix <- model.matrix(~ 0 + Vehicle_Type, data = BOZA_RIDES)
print(dummy_matrix)
## Vehicle_TypeXL Vehicle_TypeComfort Vehicle_TypePickup Vehicle_TypeRegular
## 1 0 1 0 0
## 2 0 1 0 0
## 3 0 1 0 0
## 4 0 0 1 0
## 5 1 0 0 0
## 6 0 1 0 0
## 7 0 0 1 0
## 8 0 1 0 0
## 9 0 1 0 0
## 10 0 1 0 0
## 11 0 1 0 0
## 12 0 1 0 0
## 13 0 1 0 0
## 14 0 0 0 1
## 15 0 0 1 0
## 16 0 1 0 0
## 17 0 0 0 1
## 18 0 1 0 0
## 19 0 1 0 0
## 20 0 1 0 0
## 21 0 1 0 0
## 22 0 1 0 0
## 23 0 1 0 0
## 24 0 0 1 0
## 25 0 0 1 0
## 26 0 0 0 1
## 27 0 0 0 1
## 28 0 1 0 0
## 29 0 0 0 1
## 30 0 1 0 0
## 31 0 0 0 1
## 32 0 0 0 1
## 33 0 1 0 0
## 34 0 1 0 0
## 35 0 0 1 0
## 36 0 1 0 0
## 37 0 1 0 0
## 38 0 1 0 0
## 39 0 1 0 0
## 40 0 0 0 1
## 41 0 0 1 0
## 42 0 0 1 0
## 43 0 1 0 0
## 44 0 1 0 0
## 45 0 1 0 0
## 46 0 1 0 0
## 47 0 1 0 0
## 48 0 1 0 0
## 49 0 1 0 0
## 50 0 0 0 1
## 51 0 1 0 0
## 52 0 1 0 0
## 53 0 1 0 0
## 54 1 0 0 0
## 55 0 1 0 0
## 56 0 1 0 0
## 57 0 0 0 1
## 58 0 0 0 1
## 59 0 1 0 0
## 60 0 0 0 1
## 61 0 1 0 0
## 62 0 1 0 0
## attr(,"assign")
## [1] 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$Vehicle_Type
## [1] "contr.treatment"
##
## Call:
## lm(formula = Total_Amount_ZAR ~ Driver_Commission_ZAR + Admin_Commision_ZAR +
## Trip_Distance_Km + Trip_Time_Minutes + dummy_matrix, data = BOZA_RIDES)
##
## Coefficients:
## (Intercept) Driver_Commission_ZAR
## 7.9191 0.4435
## Admin_Commision_ZAR Trip_Distance_Km
## 5.3951 0.2148
## Trip_Time_Minutes dummy_matrixVehicle_TypeXL
## -0.7521 -43.9501
## dummy_matrixVehicle_TypeComfort dummy_matrixVehicle_TypePickup
## -10.1233 -3.4424
## dummy_matrixVehicle_TypeRegular
## NA
##
## Call:
## lm(formula = Total_Amount_ZAR ~ Driver_Commission_ZAR + Admin_Commision_ZAR +
## Trip_Distance_Km + Trip_Time_Minutes + dummy_matrix, data = BOZA_RIDES)
##
## Residuals:
## Min 1Q Median 3Q Max
## -130.658 -2.318 1.057 5.506 32.532
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.9191 8.5497 0.926 0.3584
## Driver_Commission_ZAR 0.4435 1.0136 0.438 0.6634
## Admin_Commision_ZAR 5.3951 6.8041 0.793 0.4313
## Trip_Distance_Km 0.2148 0.7424 0.289 0.7734
## Trip_Time_Minutes -0.7521 0.3249 -2.315 0.0244 *
## dummy_matrixVehicle_TypeXL -43.9501 31.2034 -1.409 0.1647
## dummy_matrixVehicle_TypeComfort -10.1233 7.6235 -1.328 0.1898
## dummy_matrixVehicle_TypePickup -3.4424 9.3950 -0.366 0.7155
## dummy_matrixVehicle_TypeRegular NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.67 on 54 degrees of freedom
## Multiple R-squared: 0.9785, Adjusted R-squared: 0.9757
## F-statistic: 351.1 on 7 and 54 DF, p-value: < 2.2e-16
## (Intercept) Driver_Commission_ZAR
## 7.9191087 0.4435342
## Admin_Commision_ZAR Trip_Distance_Km
## 5.3951075 0.2147967
## Trip_Time_Minutes dummy_matrixVehicle_TypeXL
## -0.7521028 -43.9501021
## dummy_matrixVehicle_TypeComfort dummy_matrixVehicle_TypePickup
## -10.1232908 -3.4424219
## dummy_matrixVehicle_TypeRegular
## NA
## [1] 20895.84
##
## Call:
## lm(formula = Total_Amount_ZAR ~ Driver_Commission_ZAR + Trip_Distance_Km +
## Trip_Time_Minutes, data = BOZA_RIDES)
##
## Coefficients:
## (Intercept) Driver_Commission_ZAR Trip_Distance_Km
## 3.2402 1.1051 0.8730
## Trip_Time_Minutes
## -0.4147
##
## Call:
## lm(formula = Total_Amount_ZAR ~ Driver_Commission_ZAR + Trip_Distance_Km +
## Trip_Time_Minutes, data = BOZA_RIDES)
##
## Residuals:
## Min 1Q Median 3Q Max
## -137.122 -1.173 0.209 4.563 35.196
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.24023 5.57480 0.581 0.5633
## Driver_Commission_ZAR 1.10510 0.06558 16.852 <2e-16 ***
## Trip_Distance_Km 0.87296 0.58482 1.493 0.1409
## Trip_Time_Minutes -0.41467 0.21398 -1.938 0.0575 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.47 on 58 degrees of freedom
## Multiple R-squared: 0.9774, Adjusted R-squared: 0.9762
## F-statistic: 835 on 3 and 58 DF, p-value: < 2.2e-16
## (Intercept) Driver_Commission_ZAR Trip_Distance_Km
## 3.2402299 1.1051000 0.8729634
## Trip_Time_Minutes
## -0.4146722
## [1] 21996.26
## (Intercept) Driver_Commission_ZAR Trip_Distance_Km
## 3.2402299 1.1051000 0.8729634
## Trip_Time_Minutes
## -0.4146722
## [1] 21996.26
Deviance refers to the measure of the discrepancy between the observed (collected) values of the “Total_Amount_ZAR” and the values predicted by the model.
The reference point for evaluating the deviance in this case is compared to the deviance of a baseline model.
Our baseline model is the model with only an intercept term and no independent variables. For such a model the intercept equals the mean of Total_Amount_ZAR (approximately R228.10), not the R3.24 intercept of the fitted regression. This baseline serves as a point of comparison for assessing the improvement in fit achieved by adding independent variables to the model.
The difference in deviance between the full MLR model and the baseline model is often used to evaluate the model’s goodness-of-fit.
## 14 19 59
## 14 19 59
##
## Call:
## lm(formula = Total_Amount_ZAR ~ Driver_Commission_ZAR + Trip_Distance_Km +
## Trip_Time_Minutes, data = BOZA_RIDES_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.789 -1.156 -0.487 0.187 35.211
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.467811 1.399638 0.334 0.739
## Driver_Commission_ZAR 1.125321 0.016441 68.445 <2e-16 ***
## Trip_Distance_Km 0.247212 0.148029 1.670 0.100
## Trip_Time_Minutes 0.007835 0.055488 0.141 0.888
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.878 on 57 degrees of freedom
## Multiple R-squared: 0.9986, Adjusted R-squared: 0.9985
## F-statistic: 1.343e+04 on 3 and 57 DF, p-value: < 2.2e-16
## (Intercept) Driver_Commission_ZAR Trip_Distance_Km
## 0.467811438 1.125320685 0.247212397
## Trip_Time_Minutes
## 0.007835193
$$\text{Total\_Amount\_ZAR} = 0.49325053 + 1.15215918 \times \text{Driver\_Commission\_ZAR} - 0.03123941 \times \text{Trip\_Distance\_Km} + 0.00993803 \times \text{Trip\_Time\_Minutes} + \varepsilon$$
## [1] 1356.434
Based on the provided model summary, we now interpret the outputs and comment on the model:
For example, when distance = 1 km and the other predictors are set to 0, the equation $$\text{Total\_Amount\_ZAR} = 0.49325053 + 1.15215918 \times \text{Driver\_Commission\_ZAR} - 0.03123941 \times \text{Trip\_Distance\_Km} + 0.00993803 \times \text{Trip\_Time\_Minutes} + \varepsilon$$ becomes $$\text{Total\_Amount\_ZAR} = 0.49325053 + (1.15215918 \times 0) - (0.03123941 \times 1) + (0.00993803 \times 0) + \varepsilon = 0.49325053 - 0.03123941 \approx \text{R}0.462.$$
Overall, the model appears to have a good fit, as indicated by the high Adjusted R-squared value and the statistically significant F-statistic.
## Total Amount (ZAR)
# Worked example: predicted total amount (ZAR) for one ride.
# Ride characteristics plugged into the regression equation.
driver_commission_input <- 213.74
trip_distance_input <- 23.22
trip_time_input <- 34
# Hard-coded regression coefficients: intercept plus one slope per predictor.
# NOTE(review): these values differ from the coefficients printed by the
# fitted model summary earlier in the report - confirm which fit they are from.
intercept <- 0.49325053
driver_commission <- 1.15215918
trip_distance <- -0.03123941
trip_time <- 0.00993803
# Prediction = intercept + (slope * input) for each predictor.
total_amount <- local({
  commission_part <- driver_commission * driver_commission_input
  distance_part <- trip_distance * trip_distance_input
  time_part <- trip_time * trip_time_input
  intercept + commission_part + distance_part + time_part
})
# Display the predicted total amount.
total_amount
## [1] 246.3683
## Driver Commission (ZAR)
# Worked example: back out the driver commission (ZAR) implied by a ride's
# total amount, distance and duration by rearranging the regression equation.
# Coefficients from the regression model
intercept <- 0.49325053
driver_commission <- 1.15215918
trip_distance <- -0.03123941
trip_time <- 0.00993803
# Input values
# The total amount for this ride is 246; 213.74 is the same ride's driver
# commission, so using it as the total gave a meaningless result.
total_amount_input <- 246
trip_distance_input <- 23.22
trip_time_input <- 34
# Calculate the driver commission:
# (total - intercept - distance term - time term) / commission slope
driver_commission_calculated <- (total_amount_input - intercept -
  (trip_distance * trip_distance_input) -
  (trip_time * trip_time_input)) / driver_commission
# Print the calculated driver commission
driver_commission_calculated
## [1] 185.4208
## Trip time in minutes
# Worked example: solve the regression equation for trip time (minutes) given
# a ride's total amount, driver commission and distance.
# Known ride values.
total_amount_input <- 635.01
driver_commission_input <- 551.73
trip_distance_input <- 51.26
# Hard-coded regression coefficients: intercept plus one slope per predictor.
intercept <- 0.49325053
driver_commission <- 1.15215918
trip_distance <- -0.03123941
trip_time <- 0.00993803
# Remove the intercept and the other two terms from the total, then divide
# what is left by the trip-time slope.
trip_time_calculated <- local({
  commission_part <- driver_commission * driver_commission_input
  distance_part <- trip_distance * trip_distance_input
  (total_amount_input - intercept - commission_part - distance_part) / trip_time
})
# Display the implied trip time in minutes.
trip_time_calculated
## [1] 44.00241
## Trip distance in kilometers
# Worked example: solve the regression equation for trip distance (km) given
# a ride's total amount, driver commission and duration.
# Known ride values.
total_amount_input <- 635.01
driver_commission_input <- 551.73
trip_time_input <- 44
# Hard-coded regression coefficients: intercept plus one slope per predictor.
intercept <- 0.49325053
driver_commission <- 1.15215918
trip_distance <- -0.03123941
trip_time <- 0.00993803
# Remove the intercept and the other two terms from the total, then divide
# what is left by the distance slope.
trip_distance_calculated <- local({
  commission_part <- driver_commission * driver_commission_input
  time_part <- trip_time * trip_time_input
  (total_amount_input - intercept - commission_part - time_part) / trip_distance
})
# Display the implied trip distance in kilometers.
trip_distance_calculated
## [1] 51.25923