# Installing packages
# install.packages("dplyr")
# install.packages("ggplot2")
# install.packages("corrplot")
# Loading required libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(datasets)
# Loading the data into new dataframe after removing unnecessary columns
df <- read.csv("C:/Users/aameh/OneDrive - UT Arlington/My Files/SEM-4/dynamic_pricing.csv", header = TRUE)
#*1. Number_of_Riders,
#*2. Number_of_Drivers,
#*3. Location_Category,
#*4. Customer_Loyalty_Status,
#*5. Number_of_Past_Rides,
#*6. Average_Ratings,
#*7. Time_of_Booking,
#*8. Vehicle_Type,
#*9. Expected_Ride_Duration,
#*10. Historical_Cost_of_Ride
# Initial Data inspection
head(df)
## Number_of_Riders Number_of_Drivers Location_Category Customer_Loyalty_Status
## 1 90 45 Urban Silver
## 2 58 39 Suburban Silver
## 3 42 31 Rural Silver
## 4 89 28 Rural Regular
## 5 78 22 Rural Regular
## 6 59 35 Urban Silver
## Number_of_Past_Rides Average_Ratings Time_of_Booking Vehicle_Type
## 1 13 4.47 Night Premium
## 2 72 4.06 Evening Economy
## 3 0 3.99 Afternoon Premium
## 4 67 4.31 Afternoon Premium
## 5 74 3.77 Afternoon Economy
## 6 83 3.51 Night Economy
## Expected_Ride_Duration Historical_Cost_of_Ride
## 1 90 284.2573
## 2 43 173.8748
## 3 76 329.7955
## 4 134 470.2012
## 5 149 579.6814
## 6 128 339.9554
tail(df)
## Number_of_Riders Number_of_Drivers Location_Category
## 995 33 14 Suburban
## 996 33 23 Urban
## 997 84 29 Urban
## 998 44 6 Suburban
## 999 53 27 Suburban
## 1000 78 63 Rural
## Customer_Loyalty_Status Number_of_Past_Rides Average_Ratings
## 995 Regular 87 4.81
## 996 Gold 24 4.21
## 997 Regular 92 4.55
## 998 Gold 80 4.13
## 999 Regular 78 3.63
## 1000 Gold 14 4.21
## Time_of_Booking Vehicle_Type Expected_Ride_Duration
## 995 Evening Premium 17
## 996 Morning Premium 11
## 997 Morning Premium 94
## 998 Night Premium 40
## 999 Night Premium 58
## 1000 Afternoon Economy 147
## Historical_Cost_of_Ride
## 995 118.98653
## 996 91.38953
## 997 424.15599
## 998 157.36483
## 999 279.09505
## 1000 655.06511
str(df)
## 'data.frame': 1000 obs. of 10 variables:
## $ Number_of_Riders : int 90 58 42 89 78 59 93 62 79 42 ...
## $ Number_of_Drivers : int 45 39 31 28 22 35 43 39 14 6 ...
## $ Location_Category : chr "Urban" "Suburban" "Rural" "Rural" ...
## $ Customer_Loyalty_Status: chr "Silver" "Silver" "Silver" "Regular" ...
## $ Number_of_Past_Rides : int 13 72 0 67 74 83 44 83 71 21 ...
## $ Average_Ratings : num 4.47 4.06 3.99 4.31 3.77 3.51 4.41 3.59 3.74 3.85 ...
## $ Time_of_Booking : chr "Night" "Evening" "Afternoon" "Afternoon" ...
## $ Vehicle_Type : chr "Premium" "Economy" "Premium" "Premium" ...
## $ Expected_Ride_Duration : int 90 43 76 134 149 128 16 47 128 128 ...
## $ Historical_Cost_of_Ride: num 284 174 330 470 580 ...
summary(df)
## Number_of_Riders Number_of_Drivers Location_Category Customer_Loyalty_Status
## Min. : 20.00 Min. : 5.00 Length:1000 Length:1000
## 1st Qu.: 40.00 1st Qu.:11.00 Class :character Class :character
## Median : 60.00 Median :22.00 Mode :character Mode :character
## Mean : 60.37 Mean :27.08
## 3rd Qu.: 81.00 3rd Qu.:38.00
## Max. :100.00 Max. :89.00
## Number_of_Past_Rides Average_Ratings Time_of_Booking Vehicle_Type
## Min. : 0.00 Min. :3.500 Length:1000 Length:1000
## 1st Qu.: 25.00 1st Qu.:3.870 Class :character Class :character
## Median : 51.00 Median :4.270 Mode :character Mode :character
## Mean : 50.03 Mean :4.257
## 3rd Qu.: 75.00 3rd Qu.:4.633
## Max. :100.00 Max. :5.000
## Expected_Ride_Duration Historical_Cost_of_Ride
## Min. : 10.00 Min. : 25.99
## 1st Qu.: 59.75 1st Qu.:221.37
## Median :102.00 Median :362.02
## Mean : 99.59 Mean :372.50
## 3rd Qu.:143.00 3rd Qu.:510.50
## Max. :180.00 Max. :836.12
# Data Cleaning and Pre-processing
# Missing values
any(is.na(df))
## [1] FALSE
# Duplicates
duplicates <- df[duplicated(df)]
any(is.na(duplicates))
## [1] FALSE
# Data Visualization
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.3.2
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(GGally)
## Warning: package 'GGally' was built under R version 4.3.2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(dplyr)
# Ordinal Encoding
df <- df %>%
mutate(Customer_Loyalty_Status = as.integer(factor(Customer_Loyalty_Status,
levels = c("Silver", "Regular", "Gold"), ordered = TRUE)),
Vehicle_Type = as.integer(factor(Vehicle_Type,
levels = c("Economy", "Premium", "Luxury"), ordered = TRUE)))
# One-Hot Encoding for Nominal Variables
# Convert categorical columns to factors
df <- df %>%
mutate(Location_Category = factor(Location_Category),
Time_of_Booking = factor(Time_of_Booking))
# Label Encoding for Ordinal Variables
df <- df %>%
mutate(
# Encoding for Time_of_Booking
Time_of_Booking = factor(Time_of_Booking, levels = c("Morning", "Afternoon", "Evening", "Night"), ordered = TRUE) %>%
as.integer(),
# Encoding for Location_Category
Location_Category = factor(Location_Category, levels = c("Rural", "Suburban", "Urban"), ordered = TRUE) %>%
as.integer()
)
# View result
head(df)
## Number_of_Riders Number_of_Drivers Location_Category Customer_Loyalty_Status
## 1 90 45 3 1
## 2 58 39 2 1
## 3 42 31 1 1
## 4 89 28 1 2
## 5 78 22 1 2
## 6 59 35 3 1
## Number_of_Past_Rides Average_Ratings Time_of_Booking Vehicle_Type
## 1 13 4.47 4 2
## 2 72 4.06 3 1
## 3 0 3.99 2 2
## 4 67 4.31 2 2
## 5 74 3.77 2 1
## 6 83 3.51 4 1
## Expected_Ride_Duration Historical_Cost_of_Ride
## 1 90 284.2573
## 2 43 173.8748
## 3 76 329.7955
## 4 134 470.2012
## 5 149 579.6814
## 6 128 339.9554
print(colnames(df))
## [1] "Number_of_Riders" "Number_of_Drivers"
## [3] "Location_Category" "Customer_Loyalty_Status"
## [5] "Number_of_Past_Rides" "Average_Ratings"
## [7] "Time_of_Booking" "Vehicle_Type"
## [9] "Expected_Ride_Duration" "Historical_Cost_of_Ride"
boxplot(df[["Number_of_Riders"]], main="Number_of_Riders", col="lightblue")
boxplot(df[["Number_of_Drivers"]], main="Number_of_Drivers", col="cyan")
boxplot(df[["Location_Category"]], main="Location_Category", col="green")
boxplot(df[["Customer_Loyalty_Status"]], main="Customer_Loyalty_Status", col="violet")
boxplot(df[["Number_of_Past_Rides"]], main="Number_of_Past_Rides", col="yellow")
boxplot(df[["Average_Ratings"]], main="Average_Ratings", col="orange")
boxplot(df[["Time_of_Booking"]], main="Time_of_Booking", col="pink")
boxplot(df[["Vehicle_Type"]], main="Vehicle_Type", col="darkblue")
boxplot(df[["Expected_Ride_Duration"]], main="Expected_Ride_Duration", col="purple")
boxplot(df[["Historical_Cost_of_Ride"]], main="Historical_Cost_of_Ride", col="red")
# Function to remove outliers based on IQR
remove_outliers <- function(df, threshold = 1.5) {
# Identify outliers for each column
outliers <- sapply(df, function(col) {
q <- quantile(col, c(0.25, 0.75), na.rm = TRUE)
iqr <- IQR(col, na.rm = TRUE)
lower_bound <- q[1] - threshold * iqr
upper_bound <- q[2] + threshold * iqr
col < lower_bound | col > upper_bound
})
# Remove rows with outliers
df[!apply(outliers, 1, any), , drop = FALSE]
}
# Remove outliers from your DataFrame
df1 <- remove_outliers(df)
df1 <- as.data.frame(df)
# Check for rows that are in 'df' but not in 'df1'
setdiff(df, df1)
## [1] Number_of_Riders Number_of_Drivers Location_Category
## [4] Customer_Loyalty_Status Number_of_Past_Rides Average_Ratings
## [7] Time_of_Booking Vehicle_Type Expected_Ride_Duration
## [10] Historical_Cost_of_Ride
## <0 rows> (or 0-length row.names)
# Check for rows that are in 'df1' but not in 'df'
setdiff(df1, df)
## [1] Number_of_Riders Number_of_Drivers Location_Category
## [4] Customer_Loyalty_Status Number_of_Past_Rides Average_Ratings
## [7] Time_of_Booking Vehicle_Type Expected_Ride_Duration
## [10] Historical_Cost_of_Ride
## <0 rows> (or 0-length row.names)
Pair Plot
# Create a pairplot using GGally
ggpairs(df1, title="Pair plot of the data")
SCATTER PLOT
# 1. Ride Price vs. Ride Duration
scatter1 <- ggplot(df1, aes(x=Expected_Ride_Duration, y=Historical_Cost_of_Ride)) + geom_point()
scatter1
# 2. Number of Riders vs. Number of Drivers
scatter2 <- ggplot(df1, aes(x=Number_of_Riders, y=Number_of_Drivers)) + geom_point()
scatter2
# 3. Average Ratings vs. Ride Price
scatter3 <- ggplot(df1, aes(x=Average_Ratings, y=Historical_Cost_of_Ride)) + geom_point()
scatter3
# 4. Booking Time vs. Ride Duration
scatter4 <- ggplot(df1, aes(x=Time_of_Booking, y=Expected_Ride_Duration)) + geom_point()
scatter4
# 5. Ride Price vs. Vehicle Type
scatter5 <- ggplot(df1, aes(x=Vehicle_Type, y=Historical_Cost_of_Ride)) + geom_point()
scatter5
# 6. Loyalty Status vs. Ride Price
scatter6 <- ggplot(df1, aes(x=Customer_Loyalty_Status, y=Historical_Cost_of_Ride)) + geom_point()
scatter6
# Load the randomForest package
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
# Split the data into training and testing sets
set.seed(123)
train_index <- sample(1:nrow(df), 0.8 * nrow(df))
train_data <- df[train_index, ]
test_data <- df[-train_index, ]
# Train the random forest model
rf_model <- randomForest(Historical_Cost_of_Ride ~ ., data = train_data, ntree = 100)
# Print the model summary
print(rf_model)
##
## Call:
## randomForest(formula = Historical_Cost_of_Ride ~ ., data = train_data, ntree = 100)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 5167.617
## % Var explained: 84.79
In this random forest model: - Used multiple features such as
‘Number_of_Riders’, ‘Number_of_Drivers’, ‘Location_Category’,
‘Customer_Loyalty_Status’, ‘Number_of_Past_Rides’, ‘Average_Ratings’,
‘Time_of_Booking’, ‘Vehicle_Type’, ‘Expected_Ride_Duration’,
‘Historical_Cost_of_Ride’ to predict
Historical_Cost_of_Ride. - The model trains on 80% of the
data and evaluates performance on the remaining 20% using metrics like
Mean Squared Error (MSE). - The feature importance plot helps identify
the key variables driving the cost predictions.
# Predict on test data
rf_predictions <- predict(rf_model, newdata = test_data)
# Calculate Mean Squared Error (MSE)
mse <- mean((rf_predictions - test_data$Historical_Cost_of_Ride)^2)
cat("Mean Squared Error: ", mse)
## Mean Squared Error: 5191.93
# Extract feature importance
feature_importance <- importance(rf_model)
print(feature_importance)
## IncNodePurity
## Number_of_Riders 1098156.3
## Number_of_Drivers 1038886.3
## Location_Category 332353.3
## Customer_Loyalty_Status 287365.9
## Number_of_Past_Rides 1001525.2
## Average_Ratings 1170091.2
## Time_of_Booking 386449.9
## Vehicle_Type 414929.2
## Expected_Ride_Duration 21035165.3
# Plot feature importance
varImpPlot(rf_model)
# Make predictions (for example, on test data)
predictions <- predict(rf_model, newdata = test_data)
# Compare the predictions to the actual values
comparison <- data.frame(Actual = test_data$Historical_Cost_of_Ride, Predicted = predictions)
print(comparison)
## Actual Predicted
## 1 284.25727 359.03537
## 3 329.79547 300.02290
## 7 104.06154 119.72299
## 9 501.41252 434.54665
## 12 414.99010 531.85371
## 22 64.07117 112.33361
## 25 235.65813 279.11181
## 27 182.47443 155.30807
## 28 652.61730 556.27572
## 32 485.05466 580.16096
## 35 294.73289 277.89300
## 43 530.75999 451.79147
## 47 514.87717 589.94288
## 60 247.17456 310.56281
## 66 449.54563 331.04152
## 70 295.58871 256.66897
## 75 443.25816 482.38038
## 86 454.49212 521.59313
## 97 685.76813 563.96840
## 101 404.04990 566.44642
## 102 156.03833 161.38746
## 103 501.26296 520.59398
## 109 754.23448 567.30896
## 126 778.63336 586.83604
## 133 49.23478 93.44110
## 140 150.53929 148.36160
## 144 132.21476 175.81992
## 145 70.11182 118.74482
## 147 102.04473 150.63219
## 149 183.65831 214.10115
## 150 436.55765 424.93074
## 154 320.85762 339.52310
## 156 758.50153 620.60006
## 157 665.72953 634.58775
## 176 424.61206 335.64287
## 182 232.28345 252.38964
## 183 185.31184 187.99812
## 192 715.75479 626.39440
## 198 453.37695 455.31028
## 202 234.07145 260.80494
## 208 671.71283 526.30027
## 213 694.42471 544.18207
## 214 617.36322 572.37291
## 215 328.63537 282.19462
## 216 376.30006 409.56115
## 233 151.35930 143.69521
## 245 490.50408 441.95079
## 247 424.92751 351.57593
## 249 258.42492 304.36438
## 253 658.58343 599.50692
## 257 434.57105 433.09035
## 269 307.62195 294.73811
## 283 746.33913 556.96844
## 285 417.91314 310.66459
## 288 492.46401 451.90064
## 293 410.34271 339.44823
## 296 74.32838 135.74411
## 300 563.98033 578.37260
## 314 369.85464 396.66735
## 325 431.42166 351.64415
## 329 669.52101 580.53975
## 335 179.93988 152.42485
## 345 322.90007 352.03360
## 353 717.46732 575.19124
## 354 266.97722 286.17260
## 356 266.43362 265.91858
## 359 588.69418 501.30451
## 360 305.24267 246.75641
## 366 192.38502 182.53117
## 367 744.03598 590.14204
## 368 412.25746 402.52309
## 369 189.56056 169.31403
## 375 317.88096 416.18916
## 380 651.73039 519.08822
## 385 201.05754 219.16180
## 387 546.85745 531.35277
## 408 281.96257 260.60897
## 411 124.64550 153.35373
## 425 106.59208 131.81791
## 432 323.55769 266.27231
## 436 146.51277 182.11009
## 449 59.44541 134.71171
## 453 210.63177 195.98507
## 454 152.22919 153.22218
## 467 640.13734 495.97321
## 472 310.78840 257.93257
## 474 373.50471 520.41402
## 482 227.18257 231.34860
## 484 213.82330 231.10243
## 485 652.58789 643.06770
## 486 289.64631 272.02382
## 491 629.20443 600.17450
## 495 561.43852 626.95861
## 497 133.38738 115.24517
## 502 585.80759 569.36312
## 506 198.17047 267.56419
## 514 196.31514 205.01942
## 515 622.49143 617.29780
## 518 635.93305 585.64918
## 520 324.09877 379.63647
## 525 238.31899 302.69086
## 531 527.17920 426.10846
## 542 117.63328 146.40275
## 543 493.01495 520.64581
## 546 305.00889 260.39564
## 556 480.10437 397.72922
## 565 380.28537 414.34621
## 568 565.66915 518.13988
## 576 336.56848 278.84989
## 579 264.67575 249.17687
## 580 185.61792 180.00154
## 583 92.65681 155.12779
## 584 332.22938 401.20885
## 587 60.98182 141.22489
## 592 147.45974 163.34742
## 594 62.06928 109.60630
## 599 680.93307 541.55659
## 607 510.19579 553.36822
## 616 53.53910 94.97788
## 622 347.89597 421.46723
## 628 418.10682 443.25632
## 631 384.20929 291.98960
## 641 575.41027 610.27616
## 642 190.97885 241.21541
## 643 398.15938 419.61283
## 653 181.78094 168.95236
## 669 631.63130 545.32649
## 674 283.41328 329.45463
## 675 148.18649 195.08177
## 683 510.65808 552.08789
## 684 171.62712 191.89434
## 689 128.01676 147.02415
## 693 263.16118 316.56313
## 701 490.34216 433.46563
## 708 184.24294 200.55240
## 713 638.13402 614.76256
## 715 55.63737 109.55072
## 728 306.39046 274.38023
## 730 614.63671 535.25318
## 731 231.16517 254.80864
## 735 135.17727 159.81170
## 736 360.85855 457.65760
## 737 231.32632 254.41609
## 740 721.72127 578.08863
## 743 415.97945 580.29557
## 749 478.76976 409.81884
## 756 465.71906 546.71757
## 759 576.96757 579.82821
## 763 532.50978 549.36986
## 772 482.74221 556.67584
## 773 205.49576 210.64065
## 776 396.95870 438.25873
## 791 828.21313 604.59207
## 793 86.17345 102.94696
## 795 531.06198 473.67443
## 796 108.60236 138.60889
## 799 271.01266 267.08237
## 806 829.53438 647.36346
## 825 383.31780 323.99624
## 826 313.06816 372.68687
## 827 706.68964 589.23155
## 828 142.61531 147.41675
## 829 708.82240 609.24927
## 830 593.06592 573.86238
## 833 685.76125 596.59914
## 839 230.83259 317.55310
## 848 576.28112 634.57396
## 849 381.39121 370.89676
## 850 543.77025 486.38231
## 855 292.92466 265.49146
## 868 762.88228 605.93136
## 874 521.40805 558.37638
## 875 450.38788 466.75201
## 879 333.40117 429.51243
## 884 466.97207 418.50555
## 896 220.49834 237.65631
## 907 339.28707 434.35390
## 909 279.26870 244.69857
## 914 366.72826 383.94868
## 921 233.50662 292.18364
## 924 761.01676 573.05291
## 929 508.67300 549.30812
## 936 516.13304 570.00229
## 939 330.78364 426.21888
## 946 215.74176 271.24417
## 950 455.20979 348.91581
## 952 281.09707 380.79652
## 963 332.09362 392.75987
## 964 457.08532 437.40106
## 967 288.34515 316.22408
## 971 493.23388 593.71237
## 972 291.51108 247.54782
## 973 459.06065 498.45925
## 978 319.94238 316.30109
## 984 561.37488 615.26040
## 985 516.51117 539.71531
## 989 66.78324 90.24536
## 993 632.56014 584.39707
## 995 118.98653 157.53869
## 997 424.15599 369.84609
# Evaluate performance (e.g., RMSE or R-squared)
mse <- mean((predictions - test_data$Historical_Cost_of_Ride)^2)
rmse <- sqrt(mse)
cat("Root Mean Squared Error (RMSE):", rmse, "\n")
## Root Mean Squared Error (RMSE): 72.05505
# Calculate R-squared value
actual_values <- test_data$Historical_Cost_of_Ride
residuals <- actual_values - predictions
ss_residuals <- sum(residuals^2)
ss_total <- sum((actual_values - mean(actual_values))^2)
r_squared <- 1 - (ss_residuals / ss_total)
cat("R-squared:", r_squared, "\n")
## R-squared: 0.8666792
The performance of the Random Forest regression model was evaluated
based on the R-squared value, which measures the proportion of the
variance in the dependent variable
(Historical_Cost_of_Ride) explained by the independent
variables.
The R-squared value for the model is 0.8667. This means that approximately 86.67% of the variability in the historical cost of the ride can be explained by the predictor variables in the model. A high R-squared value indicates that the model is a good fit for the data and that the predictor variables have a strong influence on the target variable.