# Load necessary packages
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
##Problem 1
# Load the rideshare dataset.
# The CSV location can be overridden with the RIDESHARE_CSV environment
# variable so the script is not tied to a single machine; when the variable
# is unset or empty, the original hard-coded path is used.
data_path <- Sys.getenv("RIDESHARE_CSV", unset = "")
if (!nzchar(data_path)) {
  data_path <- "C:/Users/doris/OneDrive/Documents/rideshare_kaggle.csv/rideshare_kaggle.csv"
}
# Fail early with a clear message rather than read.csv()'s generic
# "cannot open file" error.
if (!file.exists(data_path)) {
  stop("Rideshare CSV not found at: ", data_path, call. = FALSE)
}
df <- read.csv(data_path, stringsAsFactors = FALSE)
##Problem 2
# Keep only the UberX and Lyft rides, and only the columns used in the analysis
ride_types <- c("UberX", "Lyft")
analysis_cols <- c(
  "hour", "day", "month", "source", "destination", "name",
  "price", "distance", "surge_multiplier", "temperature"
)
df_filtered <- df[df$name %in% ride_types, analysis_cols]
# Number of rows (data lines) left after filtering
total_rows <- nrow(df_filtered)
total_rows
## [1] 106329
# Count the missing values in each column of the filtered dataset
na_flags <- is.na(df_filtered)
missing_values <- colSums(na_flags)
missing_values
## hour day month source
## 0 0 0 0
## destination name price distance
## 0 0 0 0
## surge_multiplier temperature
## 0 0
# Total number of missing values over every column of the filtered dataset
total_missing <- sum(is.na(df_filtered))
total_missing
## [1] 0
#After filtering, the dataset has a total of 106,329 rows (data lines),
#and there are no missing values in any column.
##Problem 3
# Build the calendar date of each ride once. The filtered data has day and
# month columns but no year, so the year is fixed to 2018.
ride_date <- as.Date(
  paste("2018", df_filtered$month, df_filtered$day, sep = "-"),
  format = "%Y-%m-%d"
)
# Create 'Day' as the day of the week.
# weekdays() returns names in the session locale, so on a non-English system
# the comparison with "Saturday"/"Sunday" below would never match and every
# ride would be labelled "Weekday". Index a fixed English vector with the
# locale-independent weekday number instead (POSIXlt wday: 0 = Sunday, ...,
# 6 = Saturday).
day_names <- c(
  "Sunday", "Monday", "Tuesday", "Wednesday",
  "Thursday", "Friday", "Saturday"
)
df_filtered$Day <- day_names[as.POSIXlt(ride_date)$wday + 1L]
# Create 'DayType' to classify as 'Weekday' or 'Weekend'
df_filtered$DayType <- ifelse(
  df_filtered$Day %in% c("Saturday", "Sunday"), "Weekend", "Weekday"
)
##Problem 4
# Keep rides that start at Beacon Hill and end at one of the two universities
from_beacon_hill <- df_filtered$source == "Beacon Hill"
to_university <- df_filtered$destination %in%
  c("Boston University", "Northeastern University")
df_filtered <- df_filtered[from_beacon_hill & to_university, ]
# Number of rows remaining in the updated dataset
new_rows <- nrow(df_filtered)
print(new_rows)
## [1] 2901
head(df_filtered)
# After filtering, we currently have 2,901 rows remaining in the dataset.
#This means there are 2,901 rides recorded from Beacon Hill to either
#Boston University or Northeastern University in our dataset.
##Problem 5
# Load the ggplot2 package (already attached at the top; harmless if repeated)
library(ggplot2)
# Boxplots of price by destination, drawn in one panel per service
service_colours <- c("UberX" = "#5cc9f5", "Lyft" = "#b131a2")
price_by_destination <- ggplot(
  df_filtered,
  aes(x = destination, y = price, fill = name)
) +
  geom_boxplot() +
  facet_wrap(~ name) + # separate panels for UberX and Lyft
  scale_fill_manual(values = service_colours) + # custom colours per service
  labs(
    title = "Price Distribution by Destination for UberX and Lyft",
    x = "Destination",
    y = "Price"
  ) +
  theme_minimal()
# Evaluate the plot object at top level so it is drawn exactly as before
price_by_destination
#Interpretation of the plots above
#The plots above present a comparison of ride prices for UberX and Lyft
#from Beacon Hill to two destinations: Boston University and Northeastern University.
#For Boston University, the boxplots show that the price for Lyft is lower
#than the price for UberX,
#indicating that Lyft is cheaper for rides to Boston University.
#For Northeastern University, the price for UberX also appears to be higher than the price for Lyft.
#Thus, in conclusion, Lyft is cheaper for both destinations.
##Problem 6
# Multiple linear regression of price on every candidate predictor
model <- lm(
  price ~ hour + day + month + destination + name + distance +
    surge_multiplier + temperature,
  data = df_filtered
)
# Coefficient table and overall fit statistics
summary(model)
##
## Call:
## lm(formula = price ~ hour + day + month + destination + name +
## distance + surge_multiplier + temperature, data = df_filtered)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5223 -0.6822 -0.3281 0.6433 9.6127
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.130465 1.215243 -0.107 0.915
## hour 0.001309 0.003204 0.409 0.683
## day -0.006029 0.004354 -1.385 0.166
## month -0.055185 0.089729 -0.615 0.539
## destinationNortheastern University 0.180884 0.043497 4.159 3.3e-05 ***
## nameUberX 0.620179 0.049582 12.508 < 2e-16 ***
## distance 1.383539 0.116870 11.838 < 2e-16 ***
## surge_multiplier 7.058836 0.207681 33.989 < 2e-16 ***
## temperature -0.001352 0.003513 -0.385 0.700
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.163 on 2892 degrees of freedom
## Multiple R-squared: 0.3165, Adjusted R-squared: 0.3147
## F-statistic: 167.4 on 8 and 2892 DF, p-value: < 2.2e-16
##Conclusions
#Intercept: Not significant (p = 0.915). It doesn't provide a meaningful
#baseline effect on price.
#hour: Not significant (p = 0.683). The hour of the day does not
#significantly affect the price.
#day: Not significant (p = 0.166). The day of the month does not have a
#meaningful effect on price.
#month: Not significant (p = 0.539). The month doesn't significantly affect
#the price.
#destination (Northeastern University): Significant (p = 3.3e-05).
#Rides to Northeastern University are $0.18 more expensive than to Boston University.
#name (UberX): Significant (p < 2e-16). UberX rides cost $0.62 more than Lyft rides.
#distance: Significant (p < 2e-16). Each additional unit of distance increases price by $1.38.
#surge_multiplier: Significant (p < 2e-16). Each unit increase in surge
#multiplier raises the price by $7.06.
#temperature: Not significant (p = 0.700). Temperature does not have a
#significant effect on price.
#Model Performance
#F-statistic = 167.4, p < 2.2e-16: This indicates the overall model is
#statistically significant, meaning at least one of the predictors has a
#significant effect on price.
#R-squared = 0.3165 (Adjusted R² = 0.3147): The model explains only 31.65%
#of the variation in the price.
#This suggests there are other factors influencing the price not captured in the model.
##Problem 7
#Yes, there are predictors that do not contribute to rideshare prices.
#From the model output, the following predictors are not significant (p > 0.05):
#hour (p = 0.683), day (p = 0.166), month (p = 0.539) and temperature (p = 0.700)
#These variables do not appear to have a meaningful impact on the rideshare price and might not be necessary in the model.
# Full model: all candidate predictors
model_full <- lm(
  price ~ hour + day + month + destination + name + distance +
    surge_multiplier + temperature,
  data = df_filtered
)
# Reduced model: drops hour, day, month, and temperature
model_reduced <- lm(
  price ~ destination + name + distance + surge_multiplier,
  data = df_filtered
)
# Partial F-test of the reduced model against the full model
anova(model_reduced, model_full)
#Conclusions
#Partial F-Test Results
#The p-value from the partial F-test is 0.4925.
#Since this value is greater than 0.05, we fail to reject the null
#hypothesis. This suggests that removing the non-significant predictors
#(hour, day, month, and temperature) does not significantly worsen the model.
#Thus, we can conclude that these predictors can be removed from the model
#without a substantial loss in explanatory power.
##Problem 8
# Model using the derived Day (weekday name) and DayType (Weekday/Weekend)
# columns in place of the numeric day and month.
# NOTE: DayType is computed from Day, so it is perfectly collinear with the
# Day dummy variables; lm() reports its coefficient as NA ("not defined
# because of singularities") in the summary.
model_daytype <- lm(
  price ~ hour + Day + DayType + destination + name + distance +
    surge_multiplier + temperature,
  data = df_filtered
)
summary(model_daytype)
##
## Call:
## lm(formula = price ~ hour + Day + DayType + destination + name +
## distance + surge_multiplier + temperature, data = df_filtered)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4839 -0.6651 -0.3068 0.6295 9.5620
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.735e-01 3.821e-01 -2.548 0.0109 *
## hour 8.297e-04 3.303e-03 0.251 0.8017
## DayMonday -7.248e-02 8.671e-02 -0.836 0.4033
## DaySaturday 1.540e-01 8.914e-02 1.727 0.0843 .
## DaySunday 6.135e-02 8.942e-02 0.686 0.4927
## DayThursday 9.697e-03 8.630e-02 0.112 0.9105
## DayTuesday 1.640e-02 8.354e-02 0.196 0.8444
## DayWednesday 5.736e-02 9.229e-02 0.621 0.5343
## DayTypeWeekend NA NA NA NA
## destinationNortheastern University 1.819e-01 4.350e-02 4.180 3e-05 ***
## nameUberX 6.232e-01 4.958e-02 12.569 <2e-16 ***
## distance 1.392e+00 1.170e-01 11.903 <2e-16 ***
## surge_multiplier 7.064e+00 2.076e-01 34.020 <2e-16 ***
## temperature -9.358e-05 3.887e-03 -0.024 0.9808
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.162 on 2888 degrees of freedom
## Multiple R-squared: 0.318, Adjusted R-squared: 0.3152
## F-statistic: 112.2 on 12 and 2888 DF, p-value: < 2.2e-16
##Results
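#Replacing the numeric day and month with the Day and DayType variables gives
#only a minimal improvement in the model's R-squared (from 0.3165 to 0.318),
#suggesting they don't significantly affect the price prediction.
#DayType has no estimate (NA) because it is fully determined by Day.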
#Key findings:
#Significant predictors: destination, nameUberX, distance, and surge_multiplier.
#Non-significant predictors at the 5% level: hour, all Day variables (Saturday is only marginally significant at the 10% level, p = 0.0843), and temperature.
#Model fit: The model explains 31.8% of the variance in price (R-squared), and the F-statistic indicates the model is significant
#Perform an F-test to compare models:
#Note: model_reduced has no hour or temperature term either, so this
#comparison tests hour, Day, DayType and temperature jointly, not only
#the Day and DayType variables.
anova(model_reduced, model_daytype)
#Conclusions
#The F-test compares the reduced model with the model that adds hour, Day,
#DayType, and temperature, so it tests these variables jointly.
#The p-value (0.2925) is greater than the usual significance level of 0.05,
#so we fail to reject the null hypothesis: the extra variables, including
#Day and DayType, don't contribute meaningfully to explaining the
#variability in rideshare prices. Therefore, the simpler reduced
#model without these variables is preferred.