Load the necessary packages

library(ggplot2)
library(tidyverse)
library(knitr)
library(flexdashboard)

Load Dataset

library(readr)
# Read the cleaned rides export from disk; the result is stored as BOZA_RIDES.
BOZA_RIDES <- readr::read_csv(
  file = "~/Documents/BOZA (Work Files)/BOZA (Work)/Raw Data/Cleaned Data/BOZA_RIDES.csv"
)

Preview of data at a glance.

# List the column names of the loaded dataset.
colnames(BOZA_RIDES)
##  [1] "Date"                  "Request_Id"            "trip_start_time"      
##  [4] "trip_end_time"         "User_Name"             "Driver_Name"          
##  [7] "Trip_Status"           "Paid_Status"           "Payment_Option"       
## [10] "Vehicle_Type"          "Ride_Type"             "Trip_Time_Minutes"    
## [13] "Trip_Distance_Km"      "Admin_Commision_ZAR"   "Total_Amount_ZAR"     
## [16] "Driver_Commission_ZAR"

Our dataset “BOZA_RIDES” contains the following variables:

  1. Date: Represents the date of the trip.

  2. Request_Id: Unique identifier for each ride request.

  3. trip_start_time: The time stamp when the trip started.

  4. trip_end_time: The time stamp when the trip ended.

  5. User_Name: The name or identifier of the user who requested the ride.

  6. Driver_Name: The name or identifier of the driver who completed the ride.

  7. Trip_Status: Indicates the status of the trip, such as completed, canceled, or ongoing.

  8. Paid_Status: Represents the payment status of the trip, whether it has been paid or not.

  9. Payment_Option: The chosen payment method for the trip.

  10. Vehicle_Type: Specifies the type of vehicle used for the ride.

  11. Ride_Type: Indicates the type of ride, such as regular-instant, or regular-scheduled.

  12. Trip_Time_Minutes: Represents the duration of the trip in minutes.

  13. Trip_Distance_Km: The distance traveled during the trip in kilometers.

  14. Admin_Commision_ZAR: The commission amount charged by the administration for the ride in ZAR (South African Rand). Note that the column name is spelled “Commision” in the dataset.

  15. Total_Amount_ZAR: The total amount paid for the trip in ZAR.

  16. Driver_Commission_ZAR: The commission amount earned by the driver for the ride in ZAR.

Data processing, preparation, and feature engineering.

library(dplyr)
# Data Processing, Cleaning and application of necessary data transformations.
#
# One mutate() pass that types every column:
#   * Date                  -> Date, parsed as month/day/year.
#   * Request_Id, User_Name -> character.
#   * trip_start_time,
#     trip_end_time         -> character; a trailing "hh:mm AM/PM" stamp is kept
#                              when present, otherwise the value is left as is.
#   * Driver_Name           -> upper-cased character.
#   * status/option/type    -> factors with an explicit level order.
#   * minutes, km, ZAR      -> numeric after stripping the unit text.
#
# Missing markers ("-" or NA) become a real NA_character_ rather than the
# literal string "NA", so is.na() and summary() can detect them.
BOZA_RIDES <- BOZA_RIDES %>%
  mutate(
    Date = as.Date(Date, format = "%m/%d/%Y"),
    Request_Id = as.character(Request_Id),

    trip_start_time = ifelse(
      trip_start_time == "-" | is.na(trip_start_time),
      NA_character_,
      gsub(".*?(\\d{2}:\\d{2} [AP]M)$", "\\1", trip_start_time)
    ),
    trip_end_time = ifelse(
      trip_end_time == "-" | is.na(trip_end_time),
      NA_character_,
      gsub(".*?(\\d{2}:\\d{2} [AP]M)$", "\\1", trip_end_time)
    ),

    User_Name = as.character(User_Name),
    Driver_Name = ifelse(
      Driver_Name == "-" | is.na(Driver_Name),
      NA_character_,
      toupper(Driver_Name)
    ),

    # factor() uses the levels as labels by default, so only levels are given.
    Trip_Status = factor(
      Trip_Status,
      levels = c("Cancelled", "Completed", "Not Started")
    ),
    Paid_Status = factor(Paid_Status, levels = c("Not Paid", "Paid")),
    Payment_Option = factor(Payment_Option, levels = c("Cash", "Card")),
    Vehicle_Type = factor(
      Vehicle_Type,
      levels = c("XL", "Comfort", "Pickup", "Regular")
    ),
    Ride_Type = factor(
      Ride_Type,
      levels = c("Regular-Instant", "Regular-Scheduled")
    ),

    # Strip the unit suffix/prefix, then convert to numeric.
    Trip_Time_Minutes = as.numeric(gsub(" Mins", "", Trip_Time_Minutes)),
    Trip_Distance_Km = as.numeric(gsub(" Km", "", Trip_Distance_Km)),
    Driver_Commission_ZAR = as.numeric(gsub("R ", "", Driver_Commission_ZAR)),
    Admin_Commision_ZAR = as.numeric(gsub("R ", "", Admin_Commision_ZAR)),
    Total_Amount_ZAR = as.numeric(gsub("R ", "", Total_Amount_ZAR))
  )

The dataset “BOZA_RIDES” has undergone pre-processing, preparation and cleaning.

A cursory examination of the dataset reveals that there are 62 observations and 16 variables, which will be utilized in later analysis.

We now take a look at the dataset more closely, including the characteristics of its variables, to comprehend its nature and derive useful insights.

# Preview the first rows of the cleaned dataset.
head(BOZA_RIDES)
## # A tibble: 6 × 16
##   Date       Request_Id trip_start_time trip_end_time User_Name     Driver_Name 
##   <date>     <chr>      <chr>           <chr>         <chr>         <chr>       
## 1 2023-07-28 REQ_000536 17:24:00        17:58:00      Flubert Taga  WANDILE     
## 2 2023-07-21 REQ_000458 10:10:00        10:23:00      Noluthando    WANDILE     
## 3 2023-07-27 REQ_000521 18:08:00        18:56:00      Flubert Taga  WANDILE     
## 4 2023-07-20 REQ_000268 12:23:00        12:43:00      Sharon Monama BETHUEL FRE…
## 5 2023-12-23 REQ_002722 17:28:00        18:05:00      Flubert Taga  BRIGHTON    
## 6 2024-04-05 REQ_004785 07:26:00        07:28:00      Flubert Taga  WANDILE     
## # ℹ 10 more variables: Trip_Status <fct>, Paid_Status <fct>,
## #   Payment_Option <fct>, Vehicle_Type <fct>, Ride_Type <fct>,
## #   Trip_Time_Minutes <dbl>, Trip_Distance_Km <dbl>, Admin_Commision_ZAR <dbl>,
## #   Total_Amount_ZAR <dbl>, Driver_Commission_ZAR <dbl>
# Show each column's type together with a sample of its values.
str(BOZA_RIDES)
## tibble [62 × 16] (S3: tbl_df/tbl/data.frame)
##  $ Date                 : Date[1:62], format: "2023-07-28" "2023-07-21" ...
##  $ Request_Id           : chr [1:62] "REQ_000536" "REQ_000458" "REQ_000521" "REQ_000268" ...
##  $ trip_start_time      : chr [1:62] "17:24:00" "10:10:00" "18:08:00" "12:23:00" ...
##  $ trip_end_time        : chr [1:62] "17:58:00" "10:23:00" "18:56:00" "12:43:00" ...
##  $ User_Name            : chr [1:62] "Flubert Taga" "Noluthando" "Flubert Taga" "Sharon Monama" ...
##  $ Driver_Name          : chr [1:62] "WANDILE" "WANDILE" "WANDILE" "BETHUEL FREEDOM" ...
##  $ Trip_Status          : Factor w/ 3 levels "Cancelled","Completed",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Paid_Status          : Factor w/ 2 levels "Not Paid","Paid": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Payment_Option       : Factor w/ 2 levels "Cash","Card": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Vehicle_Type         : Factor w/ 4 levels "XL","Comfort",..: 2 2 2 3 1 2 3 2 2 2 ...
##  $ Ride_Type            : Factor w/ 2 levels "Regular-Instant",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Trip_Time_Minutes    : num [1:62] 34 12 47 19 36 2 46 32 43 5 ...
##  $ Trip_Distance_Km     : num [1:62] 23.22 6.27 23.12 9.76 50.61 ...
##  $ Admin_Commision_ZAR  : num [1:62] 32.1 10.4 35 11.1 79.4 ...
##  $ Total_Amount_ZAR     : num [1:62] 246 80 269 86 609 38 207 253 257 44 ...
##  $ Driver_Commission_ZAR: num [1:62] 213.7 69.2 233.1 74 529.3 ...

After data processing and preparation, we observe that:

5 of the 16 variables are numeric or quantitative variables. These are Trip_Time_Minutes, Trip_Distance_Km, Admin_Commision_ZAR, Total_Amount_ZAR, and Driver_Commission_ZAR.

5 are factor (or categorical) variables, with varying levels. These are Trip_Status, Paid_Status, Payment_Option, Vehicle_Type, and Ride_Type.

5 variables are character or name variables: Request_Id, trip_start_time, trip_end_time, User_Name, and Driver_Name.

1 variable is a Date variable.

Drop irrelevant variables

# Keep every column except the rider and driver names, then preview the result.
BOZA_RIDES <- BOZA_RIDES[
  ,
  setdiff(names(BOZA_RIDES), c("User_Name", "Driver_Name"))
]
head(BOZA_RIDES, n = 6L)
## # A tibble: 6 × 14
##   Date       Request_Id trip_start_time trip_end_time Trip_Status Paid_Status
##   <date>     <chr>      <chr>           <chr>         <fct>       <fct>      
## 1 2023-07-28 REQ_000536 17:24:00        17:58:00      Completed   Paid       
## 2 2023-07-21 REQ_000458 10:10:00        10:23:00      Completed   Paid       
## 3 2023-07-27 REQ_000521 18:08:00        18:56:00      Completed   Paid       
## 4 2023-07-20 REQ_000268 12:23:00        12:43:00      Completed   Paid       
## 5 2023-12-23 REQ_002722 17:28:00        18:05:00      Completed   Paid       
## 6 2024-04-05 REQ_004785 07:26:00        07:28:00      Completed   Paid       
## # ℹ 8 more variables: Payment_Option <fct>, Vehicle_Type <fct>,
## #   Ride_Type <fct>, Trip_Time_Minutes <dbl>, Trip_Distance_Km <dbl>,
## #   Admin_Commision_ZAR <dbl>, Total_Amount_ZAR <dbl>,
## #   Driver_Commission_ZAR <dbl>

For ease of analysis, the User_Name and Driver_Name columns have been dropped from the dataset.

EXPLORATORY DATA ANALYSIS (EDA).

1. Data visualization (Descriptive Statistics for Numeric Variables)

## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

Based on the histograms provided, we can see that drivers earn an average commission of around R200, as this is where the graph’s concentration lies. Minimum commission is R31. Maximum commission is R551.70.

The average or mean total amount per trip is R228.10. The minimum total amount is R36.00 and the maximum total amount is R635.00.

The admin commission is roughly 13% of the total amount made on each trip (a mean of R30.00 against a mean total of R228.10). The balance goes to the driver.

Trip distance (in kilometers): Based on the visualization generated, it appears that the majority of trips in the dataset fall within the 20-25 km range. This suggests that most customers book rides within this distance range. There are exceptions of course, with a maximum trip of over 60 kilometers.

POINTS TO NOTE: Pricing Optimization: By analyzing this distribution of trip distances, we can determine if certain distance ranges are more popular among customers. If so, we can use this information to adjust pricing strategies, such as implementing dynamic pricing or surge pricing during peak demand periods or for longer trips. This can help maximize revenue.

Route Planning: Our understanding of the distribution of trip distances can also assist in optimizing route planning. For instance, we can identify areas with high demand for trips within specific distance ranges and allocate more vehicles in those areas. By strategically positioning vehicles closer to areas with frequent trips, we can reduce wait times and increase the number of completed trips, thereby generating more revenue.

Marketing and Promotions: Analyzing the distance distribution can guide marketing and promotional efforts. We may target advertising campaigns towards customers who frequently take trips within specific distance ranges.

TRIP TIME IN MINUTES. Based on the histogram, we can see that the majority of trip times in the dataset are concentrated around the 25-minute mark, with the highest frequency of trips occurring in the 25-30 minute range. This suggests that a significant number of trips have durations within this range. There are fewer trips with durations in the 0-15 minute range and the 100-115 minute range. This indicates that trips with very short or very long durations are less common in our dataset.

library(ggplot2)

# Histograms for quantitative variables
#
# Builds one binned histogram per numeric column, with a kernel density curve
# overlaid on the same count scale. geom_histogram() is used because the
# columns are continuous: counting identical values with geom_bar() draws one
# bar per distinct value instead of a binned distribution.
Measurable_Variables <- c("Driver_Commission_ZAR", "Total_Amount_ZAR", "Admin_Commision_ZAR", "Trip_Distance_Km", "Trip_Time_Minutes")
histograms <- lapply(Measurable_Variables, function(var) {
  n_bins <- 15
  value_span <- diff(range(BOZA_RIDES[[var]], na.rm = TRUE))
  # A constant (or all-missing) column has no span; use a unit-wide bin so the
  # bin width stays positive.
  bin_width <- if (is.finite(value_span) && value_span > 0) {
    value_span / n_bins
  } else {
    1
  }

  ggplot(BOZA_RIDES, aes(x = .data[[var]])) +
    geom_histogram(binwidth = bin_width, fill = "skyblue", color = "black") +
    # stat_density's `count` is density * n; multiplying by the bin width puts
    # the curve on the same "observations per bin" scale as the bars.
    geom_density(
      aes(y = after_stat(count * bin_width)),
      color = "red",
      linetype = "dashed"
    ) +
    labs(title = paste("Histogram of", var), x = var, y = "Frequency")
})

# Combine and print histograms on one page
multiplot <- do.call(gridExtra::grid.arrange, c(histograms, ncol = 2))

# grid.arrange() has already drawn the page; printing the returned object only
# shows its layout table.
print(multiplot)
## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
## 5 5 (3-3,1-1) arrange gtable[layout]
library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Compute and print kernel density for each variable
#
# Each plot is built inside a function so that it carries its own copy of the
# column values and maps the data-frame column `x`. ggplot2 evaluates aes()
# lazily, when a plot is drawn; mapping a variable that is reassigned on every
# loop iteration would make all stored plots render the last column once they
# are drawn together by grid.arrange().
Measurable_Variables <- c("Driver_Commission_ZAR", "Total_Amount_ZAR", "Admin_Commision_ZAR", "Trip_Distance_Km", "Trip_Time_Minutes")

density_plots <- lapply(Measurable_Variables, function(var) {
  ggplot(data.frame(x = BOZA_RIDES[[var]]), aes(x = x)) +
    geom_density(color = "blue", fill = "skyblue", alpha = 0.5) +
    labs(title = paste("Kernel Density of", var), x = var, y = "Density")
})
names(density_plots) <- Measurable_Variables  # Keep the plots addressable by variable name

# Draw each density plot on its own page first
for (density_plot in density_plots) {
  print(density_plot)
}

multiplot <- do.call(grid.arrange, c(density_plots, ncol = 2))  # Pass the list of density plots to grid.arrange

print(multiplot)
## TableGrob (3 x 2) "arrange": 5 grobs
##                       z     cells    name           grob
## Driver_Commission_ZAR 1 (1-1,1-1) arrange gtable[layout]
## Total_Amount_ZAR      2 (1-1,2-2) arrange gtable[layout]
## Admin_Commision_ZAR   3 (2-2,1-1) arrange gtable[layout]
## Trip_Distance_Km      4 (2-2,2-2) arrange gtable[layout]
## Trip_Time_Minutes     5 (3-3,1-1) arrange gtable[layout]

The distributions of our variables are displayed in the kernel density plots above. The peak of each distribution represents the area of (average) concentration of the related activity. For instance, average driver commission is slightly above R200 (in South African rand).

2. Data visualization (Descriptive Statistics for Categorical Variables)

## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## TableGrob (4 x 2) "arrange": 7 grobs
##   z     cells    name                 grob
## 1 1 (2-2,1-1) arrange       gtable[layout]
## 2 2 (2-2,2-2) arrange       gtable[layout]
## 3 3 (3-3,1-1) arrange       gtable[layout]
## 4 4 (3-3,2-2) arrange       gtable[layout]
## 5 5 (4-4,1-1) arrange       gtable[layout]
## 6 6 (4-4,2-2) arrange       gtable[layout]
## 7 7 (1-1,1-2) arrange text[GRID.text.1300]

# Per-column summary of the dataset (ranges, quartiles and level counts).
summary(BOZA_RIDES)
##       Date             Request_Id        trip_start_time    trip_end_time     
##  Min.   :2023-06-23   Length:62          Length:62          Length:62         
##  1st Qu.:2023-07-20   Class :character   Class :character   Class :character  
##  Median :2023-07-28   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2023-09-08                                                           
##  3rd Qu.:2023-09-09                                                           
##  Max.   :2024-04-05                                                           
##       Trip_Status   Paid_Status Payment_Option  Vehicle_Type
##  Cancelled  : 0   Not Paid: 0   Cash: 0        XL     : 2   
##  Completed  :62   Paid    :62   Card:62        Comfort:40   
##  Not Started: 0                                Pickup : 8   
##                                                Regular:12   
##                                                             
##                                                             
##              Ride_Type  Trip_Time_Minutes Trip_Distance_Km Admin_Commision_ZAR
##  Regular-Instant  :62   Min.   : 2.00     Min.   : 0.00    Min.   : 4.65      
##  Regular-Scheduled: 0   1st Qu.:28.25     1st Qu.:22.56    1st Qu.:25.93      
##                         Median :34.00     Median :23.08    Median :30.70      
##                         Mean   :35.10     Mean   :21.83    Mean   :30.00      
##                         3rd Qu.:45.75     3rd Qu.:24.06    3rd Qu.:34.23      
##                         Max.   :99.00     Max.   :63.14    Max.   :82.76      
##  Total_Amount_ZAR Driver_Commission_ZAR
##  Min.   : 36.0    Min.   : 31.0        
##  1st Qu.:194.8    1st Qu.:172.9        
##  Median :235.0    Median :203.6        
##  Mean   :228.1    Mean   :199.4        
##  3rd Qu.:256.5    3rd Qu.:228.2        
##  Max.   :635.0    Max.   :551.7

Based on the summary provided, we can make the following observations:

  1. Date: The dataset contains rides that span from June 23, 2023, to April 5, 2024. This indicates that the rides were recorded over a period of several months.

  2. Trip_Status: The Trip_Status variable indicates the status of each trip. All trips in the dataset are marked as “Completed,” indicating that none of them were canceled or in progress.

  3. Paid_Status: The Paid_Status variable indicates the payment status of each trip. All trips in the dataset are marked as “Paid,” indicating that all trips have been paid for.

  4. Payment_Option: All trips have the payment method recorded as “Card,” indicating that every rider in this sample paid by card rather than cash.

  5. Vehicle_Type: The Vehicle_Type variable indicates the type of vehicle used for the trips. The most common vehicle type is “Comfort,” which was used for 40 out of the 62 trips. Additionally, the extra-large (XL) vehicle category is the least used.

  6. Ride_Type: The Ride_Type variable indicates the type of ride. All 62 trips are classified as “Regular-Instant,” indicating that there were no scheduled or shared rides in the dataset.

  7. Trip_Time_Minutes: The Trip_Time_Minutes variable provides information about the duration of each trip. The minimum trip duration is 2 minutes, while the maximum is 99 minutes. The mean or average trip duration is approximately 35.1 minutes.

  8. Trip_Distance_Km: The Trip_Distance_Km variable provides information about the distance traveled during each trip. The minimum distance is 0 km, while the maximum is 63.14 km. The mean or average distance traveled is approximately 21.83 km.

  9. Admin_Commission_ZAR, Total_Amount_ZAR, and Driver_Commission_ZAR: These variables represent the commission and total amount earned for each trip. The minimum and maximum values indicate the range of commission and earnings in South African Rand (ZAR). For example, the minimum admin commission is 4.65 ZAR, while the maximum is 82.76 ZAR. Similarly, the minimum total amount earned is 36.0 ZAR, while the maximum is 635.0 ZAR.

Correlations (i.e. Important relationships)

##                       Driver_Commission_ZAR Total_Amount_ZAR
## Driver_Commission_ZAR             1.0000000        0.9876642
## Total_Amount_ZAR                  0.9876642        1.0000000
## Admin_Commision_ZAR               0.9997283        0.9875567
## Trip_Distance_Km                  0.9348813        0.9306511
## Trip_Time_Minutes                 0.7548161        0.7245643
##                       Admin_Commision_ZAR Trip_Distance_Km Trip_Time_Minutes
## Driver_Commission_ZAR           0.9997283        0.9348813         0.7548161
## Total_Amount_ZAR                0.9875567        0.9306511         0.7245643
## Admin_Commision_ZAR             1.0000000        0.9348032         0.7563925
## Trip_Distance_Km                0.9348032        1.0000000         0.7638477
## Trip_Time_Minutes               0.7563925        0.7638477         1.0000000

Based on the correlation matrix provided above, we can interpret the correlations between the variables in the BOZA_RIDES dataset as follows:

  1. Driver Commission and Total Amount: There is a strong positive correlation of approximately 0.99 (or 99%) between the Driver Commission and Total Amount. This suggests that the driver’s commission is directly related to the total amount earned from the rides. As the total amount increases, the driver’s commission also tends to increase. This insight can be useful for evaluating our drivers’ incentives, commission structures, and optimizing revenue-sharing models. For instance, we can adjust driver commission structures (or give higher incentives) based on Total Amount a driver can generate.

  2. Admin Commission and Driver Commission/Total Amount: There is a similarly strong positive correlation of approximately 0.99 (or 99%) between the Admin Commission and both the Driver Commission and Total Amount. We observe that the admin commission is closely related to the driver commission and the overall revenue generated from the ordered rides. This can help BOZA rides monitor and analyze the Admin Commission in relation to the Driver Commission and Total Amount of commissions, to ensure fair and appropriate distribution and effective cost management.

  3. Trip Distance and Trip Time: The Trip Distance and Trip Time variables show positive relationships with the other variables. Trip Distance has a strong correlation of approximately 0.93 with Driver Commission, Total Amount, and Admin Commission. This suggests that longer trips tend to result in higher earnings and commissions. Trip Time has a more moderate correlation of approximately 0.75 with Driver Commission and Admin Commission, and about 0.72 with Total Amount. This can inform decisions on pricing strategies, fare calculations, and optimizing driver allocation during peak periods.

Test for Linearity - Generate scatter plots and correlation co-efficients:

## [1] 0.988

The scatter plot above indicates a strong positive linear relationship between the Total Amount (ZAR) and Driver Commission (ZAR) variables. This means that as the Driver Commission (ZAR) increases, there tends to be a corresponding increase in the Total Amount (ZAR) and vice-versa. The relationship appears to be linear, implying that the two variables are positively associated. The correlation coefficient of 0.988 (approximately 98.8%) further supports the observation of a strong positive linear relationship between the variables. This measures the strength and direction of the linear relationship.

## [1] 0.988

Again, this scatter plot indicates a strong positive linear relationship between the Total Amount (ZAR) and Admin Commission (ZAR), i.e. as the Admin Commission (ZAR) increases, there tends to be a corresponding increase in the Total Amount (ZAR) and vice-versa. This linear relationship implies that the two variables are positively associated. Similarly, a correlation coefficient of r = 0.988 supports the observation of a strong positive linear relationship between the variables; it measures the strength and direction of the linear relationship.

## [1] 0.931

The scatter plot between the Total Amount (ZAR) and Trip_Distance_Km also shows a strong positive linear relationship. A correlation coefficient of r = 0.931 supports the observation of a strong positive linear relationship between these variables, implying that the two variables are strongly (and positively) associated.

## [1] 0.725

Lastly, the scatter plot between the Total Amount (ZAR) and Trip_Time_Minutes shows a relatively strong positive linear relationship. A correlation coefficient of r = 0.725 supports the observation of a fairly strong positive linear relationship between these variables. This implies that the two variables are positively associated.

Test for multicollinearity

## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
## Driver_Commission_ZAR   Admin_Commision_ZAR      Trip_Distance_Km 
##           1863.475420           1866.230124              8.465926 
##     Trip_Time_Minutes 
##              2.512186

Further test that Driver_Commission_ZAR and Admin_Commision_ZAR exhibit multicollinearity:

## [1] 1
# Five-number summary plus mean for the total trip amount.
summary(BOZA_RIDES$Total_Amount_ZAR)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    36.0   194.8   235.0   228.1   256.5   635.0

The Total_Amount_ZAR will be our response (or dependent) variable in a quest to optimize “BOZA RIDES” travel activities. The average ride amount sits at R228.10 as of now.

Model building - first Model.

# We first convert Vehicle_Type into dummy variables
#
# `~ Vehicle_Type - 1` yields one 0/1 indicator column per factor level. The
# full set of indicators sums to 1 in every row, which is perfectly collinear
# with a model intercept (the fitted model then reports the last indicator as
# NA, "not defined because of singularities"). The last level's column is
# therefore dropped so that level acts as the reference category; the
# remaining coefficients are unchanged.
dummy_matrix <- model.matrix(~ Vehicle_Type - 1, data = BOZA_RIDES)
dummy_matrix <- dummy_matrix[, -ncol(dummy_matrix), drop = FALSE]
dummy_matrix
##    Vehicle_TypeXL Vehicle_TypeComfort Vehicle_TypePickup Vehicle_TypeRegular
## 1               0                   1                  0                   0
## 2               0                   1                  0                   0
## 3               0                   1                  0                   0
## 4               0                   0                  1                   0
## 5               1                   0                  0                   0
## 6               0                   1                  0                   0
## 7               0                   0                  1                   0
## 8               0                   1                  0                   0
## 9               0                   1                  0                   0
## 10              0                   1                  0                   0
## 11              0                   1                  0                   0
## 12              0                   1                  0                   0
## 13              0                   1                  0                   0
## 14              0                   0                  0                   1
## 15              0                   0                  1                   0
## 16              0                   1                  0                   0
## 17              0                   0                  0                   1
## 18              0                   1                  0                   0
## 19              0                   1                  0                   0
## 20              0                   1                  0                   0
## 21              0                   1                  0                   0
## 22              0                   1                  0                   0
## 23              0                   1                  0                   0
## 24              0                   0                  1                   0
## 25              0                   0                  1                   0
## 26              0                   0                  0                   1
## 27              0                   0                  0                   1
## 28              0                   1                  0                   0
## 29              0                   0                  0                   1
## 30              0                   1                  0                   0
## 31              0                   0                  0                   1
## 32              0                   0                  0                   1
## 33              0                   1                  0                   0
## 34              0                   1                  0                   0
## 35              0                   0                  1                   0
## 36              0                   1                  0                   0
## 37              0                   1                  0                   0
## 38              0                   1                  0                   0
## 39              0                   1                  0                   0
## 40              0                   0                  0                   1
## 41              0                   0                  1                   0
## 42              0                   0                  1                   0
## 43              0                   1                  0                   0
## 44              0                   1                  0                   0
## 45              0                   1                  0                   0
## 46              0                   1                  0                   0
## 47              0                   1                  0                   0
## 48              0                   1                  0                   0
## 49              0                   1                  0                   0
## 50              0                   0                  0                   1
## 51              0                   1                  0                   0
## 52              0                   1                  0                   0
## 53              0                   1                  0                   0
## 54              1                   0                  0                   0
## 55              0                   1                  0                   0
## 56              0                   1                  0                   0
## 57              0                   0                  0                   1
## 58              0                   0                  0                   1
## 59              0                   1                  0                   0
## 60              0                   0                  0                   1
## 61              0                   1                  0                   0
## 62              0                   1                  0                   0
## attr(,"assign")
## [1] 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$Vehicle_Type
## [1] "contr.treatment"
## 
## Call:
## lm(formula = Total_Amount_ZAR ~ Driver_Commission_ZAR + Admin_Commision_ZAR + 
##     Trip_Distance_Km + Trip_Time_Minutes + dummy_matrix, data = BOZA_RIDES)
## 
## Coefficients:
##                     (Intercept)            Driver_Commission_ZAR  
##                          7.9191                           0.4435  
##             Admin_Commision_ZAR                 Trip_Distance_Km  
##                          5.3951                           0.2148  
##               Trip_Time_Minutes       dummy_matrixVehicle_TypeXL  
##                         -0.7521                         -43.9501  
## dummy_matrixVehicle_TypeComfort   dummy_matrixVehicle_TypePickup  
##                        -10.1233                          -3.4424  
## dummy_matrixVehicle_TypeRegular  
##                              NA
## 
## Call:
## lm(formula = Total_Amount_ZAR ~ Driver_Commission_ZAR + Admin_Commision_ZAR + 
##     Trip_Distance_Km + Trip_Time_Minutes + dummy_matrix, data = BOZA_RIDES)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -130.658   -2.318    1.057    5.506   32.532 
## 
## Coefficients: (1 not defined because of singularities)
##                                 Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                       7.9191     8.5497   0.926   0.3584  
## Driver_Commission_ZAR             0.4435     1.0136   0.438   0.6634  
## Admin_Commision_ZAR               5.3951     6.8041   0.793   0.4313  
## Trip_Distance_Km                  0.2148     0.7424   0.289   0.7734  
## Trip_Time_Minutes                -0.7521     0.3249  -2.315   0.0244 *
## dummy_matrixVehicle_TypeXL      -43.9501    31.2034  -1.409   0.1647  
## dummy_matrixVehicle_TypeComfort -10.1233     7.6235  -1.328   0.1898  
## dummy_matrixVehicle_TypePickup   -3.4424     9.3950  -0.366   0.7155  
## dummy_matrixVehicle_TypeRegular       NA         NA      NA       NA  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.67 on 54 degrees of freedom
## Multiple R-squared:  0.9785, Adjusted R-squared:  0.9757 
## F-statistic: 351.1 on 7 and 54 DF,  p-value: < 2.2e-16
##                     (Intercept)           Driver_Commission_ZAR 
##                       7.9191087                       0.4435342 
##             Admin_Commision_ZAR                Trip_Distance_Km 
##                       5.3951075                       0.2147967 
##               Trip_Time_Minutes      dummy_matrixVehicle_TypeXL 
##                      -0.7521028                     -43.9501021 
## dummy_matrixVehicle_TypeComfort  dummy_matrixVehicle_TypePickup 
##                     -10.1232908                      -3.4424219 
## dummy_matrixVehicle_TypeRegular 
##                              NA
## [1] 20895.84

Second Model (Stepwise Regression)

## 
## Call:
## lm(formula = Total_Amount_ZAR ~ Driver_Commission_ZAR + Trip_Distance_Km + 
##     Trip_Time_Minutes, data = BOZA_RIDES)
## 
## Coefficients:
##           (Intercept)  Driver_Commission_ZAR       Trip_Distance_Km  
##                3.2402                 1.1051                 0.8730  
##     Trip_Time_Minutes  
##               -0.4147
## 
## Call:
## lm(formula = Total_Amount_ZAR ~ Driver_Commission_ZAR + Trip_Distance_Km + 
##     Trip_Time_Minutes, data = BOZA_RIDES)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -137.122   -1.173    0.209    4.563   35.196 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            3.24023    5.57480   0.581   0.5633    
## Driver_Commission_ZAR  1.10510    0.06558  16.852   <2e-16 ***
## Trip_Distance_Km       0.87296    0.58482   1.493   0.1409    
## Trip_Time_Minutes     -0.41467    0.21398  -1.938   0.0575 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.47 on 58 degrees of freedom
## Multiple R-squared:  0.9774, Adjusted R-squared:  0.9762 
## F-statistic:   835 on 3 and 58 DF,  p-value: < 2.2e-16
##           (Intercept) Driver_Commission_ZAR      Trip_Distance_Km 
##             3.2402299             1.1051000             0.8729634 
##     Trip_Time_Minutes 
##            -0.4146722
## [1] 21996.26

Diagnostic plots.

##           (Intercept) Driver_Commission_ZAR      Trip_Distance_Km 
##             3.2402299             1.1051000             0.8729634 
##     Trip_Time_Minutes 
##            -0.4146722
## [1] 21996.26

Deviance refers to the measure of the discrepancy between the observed (collected) values of the “Total_Amount_ZAR” and the values predicted by the model.

The deviance is evaluated relative to a reference point: the deviance of a baseline model.

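Our baseline model is the model with only an intercept term (i.e. no independent variables). Its single fitted coefficient is the mean of Total_Amount_ZAR, not the intercept of approximately R3.2402299 reported above, which belongs to the model that includes the three predictors. The baseline serves as a point of comparison for assessing the improvement in fit achieved by adding independent variables to the model.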

The difference in deviance between the full MLR model and the baseline model is often used to evaluate the model’s goodness-of-fit.

Now we perform a new check for influential observations.

## 14 19 59 
## 14 19 59
## 
## Call:
## lm(formula = Total_Amount_ZAR ~ Driver_Commission_ZAR + Trip_Distance_Km + 
##     Trip_Time_Minutes, data = BOZA_RIDES_new)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.789 -1.156 -0.487  0.187 35.211 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           0.467811   1.399638   0.334    0.739    
## Driver_Commission_ZAR 1.125321   0.016441  68.445   <2e-16 ***
## Trip_Distance_Km      0.247212   0.148029   1.670    0.100    
## Trip_Time_Minutes     0.007835   0.055488   0.141    0.888    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.878 on 57 degrees of freedom
## Multiple R-squared:  0.9986, Adjusted R-squared:  0.9985 
## F-statistic: 1.343e+04 on 3 and 57 DF,  p-value: < 2.2e-16
##           (Intercept) Driver_Commission_ZAR      Trip_Distance_Km 
##           0.467811438           1.125320685           0.247212397 
##     Trip_Time_Minutes 
##           0.007835193

***Total_Amount_ZAR = 0.49325053 + (1.15215918 * Driver_Commission_ZAR) + (-0.03123941 * Trip_Distance_Km) + (0.00993803 * Trip_Time_Minutes) + error.***

## [1] 1356.434

Based on the provided model summary, we now interpret the outputs and comment on the model:

  1. Coefficients: The coefficients represent the estimated effect of each predictor variable on the total amount in ZAR.

For example, when distance = 1 km and both Driver_Commission_ZAR and Trip_Time_Minutes are set to 0, ***Total_Amount_ZAR = 0.49325053 + (1.15215918 * Driver_Commission_ZAR) + (-0.03123941 * Trip_Distance_Km) + (0.00993803 * Trip_Time_Minutes) + error***, i.e. ***Total_Amount_ZAR = 0.49325053 + (1.15215918 * 0) + (-0.03123941 * 1) + (0.00993803 * 0) + error***, so ***Total_Amount_ZAR = 0.49325053 - (0.03123941 * 1)*** = R0.462 (approx).

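  2. Residuals: The residuals represent the differences between the actual total amounts and the total amounts predicted by the model. The residuals should ideally be normally distributed with mean zero and constant variance.
  3. Adjusted R-squared: The adjusted R-squared value indicates the proportion of the variance in the total amount that can be explained by the predictor variables, adjusted for the number of predictors in the model (0.9985 in the summary above).
  4. F-statistic: The F-statistic tests the overall significance of the model, i.e. whether the predictors jointly explain more variation than an intercept-only model (1.343e+04 on 3 and 57 DF, p-value < 2.2e-16 in the summary above).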

Overall, the model appears to have a good fit, as indicated by the high Adjusted R-squared value and the statistically significant F-statistic.

Now, reassess the diagnostic plots

TEST RUNNING MODEL.

## Total Amount (ZAR)

# Forward prediction: estimate the total fare (ZAR) of one trip from the
# hard-coded regression equation
#   total = intercept + b1 * commission + b2 * distance + b3 * time
# NOTE(review): these values differ from the coefficients printed in the
# BOZA_RIDES_new model summary earlier in this report (0.4678, 1.1253,
# 0.2472, 0.0078) - confirm which fit they were copied from.
intercept <- 0.49325053
driver_commission <- 1.15215918  # slope applied to the driver commission
trip_distance <- -0.03123941     # slope applied to the trip distance
trip_time <- 0.00993803          # slope applied to the trip time

# The trip being priced
driver_commission_input <- 213.74
trip_distance_input <- 23.22
trip_time_input <- 34

# Intercept plus each slope multiplied by its input, summed left to right
total_amount <- intercept +
  driver_commission * driver_commission_input +
  trip_distance * trip_distance_input +
  trip_time * trip_time_input

# Auto-print the prediction
total_amount
## [1] 246.3683

## Driver Commission (ZAR)

# Inverse use of the regression equation: given a total fare, a distance and
# a duration, solve
#   total = intercept + b1 * commission + b2 * distance + b3 * time
# for the driver commission.
# NOTE(review): these values differ from the coefficients printed in the
# BOZA_RIDES_new model summary earlier in this report (0.4678, 1.1253,
# 0.2472, 0.0078) - confirm which fit they were copied from.
intercept <- 0.49325053
driver_commission <- 1.15215918  # slope applied to the driver commission
trip_distance <- -0.03123941     # slope applied to the trip distance
trip_time <- 0.00993803          # slope applied to the trip time

# Known quantities for the trip
total_amount_input <- 213.74
trip_distance_input <- 23.22
trip_time_input <- 34

# Strip the intercept, distance and time contributions from the total, then
# divide what is left by the commission slope
driver_commission_calculated <- (
  total_amount_input -
    intercept -
    trip_distance * trip_distance_input -
    trip_time * trip_time_input
) / driver_commission

# Auto-print the implied driver commission
driver_commission_calculated
## [1] 185.4208

## Trip time in minutes

# Inverse use of the regression equation: given a total fare, a driver
# commission and a distance, solve
#   total = intercept + b1 * commission + b2 * distance + b3 * time
# for the trip time.
# NOTE(review): these values differ from the coefficients printed in the
# BOZA_RIDES_new model summary earlier in this report (0.4678, 1.1253,
# 0.2472, 0.0078) - confirm which fit they were copied from.
intercept <- 0.49325053
driver_commission <- 1.15215918  # slope applied to the driver commission
trip_distance <- -0.03123941     # slope applied to the trip distance
trip_time <- 0.00993803          # slope applied to the trip time

# Known quantities for the trip
total_amount_input <- 635.01
driver_commission_input <- 551.73
trip_distance_input <- 51.26

# Strip the intercept, commission and distance contributions from the total,
# then divide what is left by the time slope. The divisor is roughly 0.01, so
# a 0.01 change in the total moves the result by about one unit.
trip_time_calculated <- (
  total_amount_input -
    intercept -
    driver_commission * driver_commission_input -
    trip_distance * trip_distance_input
) / trip_time

# Auto-print the implied trip time
trip_time_calculated
## [1] 44.00241

## Trip distance in kilometers

# Inverse use of the regression equation: given a total fare, a driver
# commission and a duration, solve
#   total = intercept + b1 * commission + b2 * distance + b3 * time
# for the trip distance.
# NOTE(review): these values differ from the coefficients printed in the
# BOZA_RIDES_new model summary earlier in this report (0.4678, 1.1253,
# 0.2472, 0.0078) - confirm which fit they were copied from.
intercept <- 0.49325053
driver_commission <- 1.15215918  # slope applied to the driver commission
trip_distance <- -0.03123941     # slope applied to the trip distance
trip_time <- 0.00993803          # slope applied to the trip time

# Known quantities for the trip
total_amount_input <- 635.01
driver_commission_input <- 551.73
trip_time_input <- 44

# Strip the intercept, commission and time contributions from the total, then
# divide what is left by the distance slope. The divisor is roughly -0.03, so
# small changes in the inputs are magnified in the result.
trip_distance_calculated <- (
  total_amount_input -
    intercept -
    driver_commission * driver_commission_input -
    trip_time * trip_time_input
) / trip_distance

# Auto-print the implied trip distance
trip_distance_calculated
## [1] 51.25923