# Load necessary packages
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
##Problem 1
#Loading the dataset
df <- read.csv("C:/Users/doris/OneDrive/Documents/rideshare_kaggle.csv/rideshare_kaggle.csv", stringsAsFactors = FALSE)
##Problem 2
# Filter the data to keep the relevant columns and rows for "UberX" and "Lyft"
df_1 <- df[df$name %in% c("UberX", "Lyft"), c("hour", "day", "month", "source", "destination", "name", "price", "distance", "surge_multiplier", "temperature")]
# Count the number of rows(data lines) after filtering
total_rows <- nrow(df_1)
total_rows
## [1] 106329
# Check for missing values in the dataset
missing_values <- colSums(is.na(df_1))
missing_values
## hour day month source
## 0 0 0 0
## destination name price distance
## 0 0 0 0
## surge_multiplier temperature
## 0 0
# This will show the total number of missing values across all columns
# in the dataset
total_missing = sum(is.na(df_1))
total_missing
## [1] 0
#After filtering the dataset we now have a total of 106,329 data lines and
#There are no missing values
##Problem 3
# Create 'Day' as the day of the week
df_1$Day = weekdays(as.Date(paste("2018", df_1$month, df_1$day, sep = "-"), format = "%Y-%m-%d"))
# Create 'DayType' to classify as 'Weekday' or 'Weekend'
df_1$DayType <- ifelse(df_1$Day %in% c("Saturday", "Sunday"), "Weekend", "Weekday")
##Problem 4
# Filter the data based on source and destination
df_2 <- df_1 %>%
filter(source == "Beacon Hill" & destination %in% c("Boston University", "Northeastern University"))
# Display the first few rows of the filtered data
head(df_2)
# Count the number of rows after filtering
total_rows <- nrow(df_2)
# Print the total number of rows
print(total_rows)
## [1] 2901
# After filtering, we currently have 2,901 rows remaining in the dataset.
#This means there are 2,901 rides recorded from Beacon Hill to either
#Boston University or Northeastern University in our dataset.
##Problem 5
# Creating side by side boxplots of price against destinations
ggplot(df_2, aes(x = destination, y = price, fill = name)) +
geom_boxplot(position = position_dodge(0.75), width = 0.6) +
theme_minimal() +
labs(title = "Price Distribution by Destination for UberX and Lyft",
x = "Destination",
y = "Price") +
scale_fill_manual(values = c("UberX" = "blue", "Lyft" = "purple"))
#The median prices (represented by the horizontal line in each box) are
#fairly similar across both services and destinations, hovering around $10
#In terms of variability UberX displays greater price variability than Lyft
# for both destinations. This is evident from the wider boxes and longer
#whiskers for UberX indicating a large range.
#This implies that UberX prices fluctuate more than Lyft prices,
#with greater spread from the cheapest to the most expensive rides.
#For Boston University:
#Lyft's median appears to be slightly lower than UberX's median
#For Northeastern University
#Again, Lyft's median price appears marginally lower than UberX's median
#The Lyft box is positioned slightly lower than UberX's box
#Therefore, based on this data, Lyft tends to be marginally cheaper than
#UberX for both destinations, though the difference is small.
#This suggests Lyft might offer slightly better typical prices,
#but the difference is probably just a dollar or two.
#However, both services show similar patterns of occasional high prices (outliers),
#so the cheapest option might vary depending on the time and demand.
str(df_2)
## 'data.frame': 2901 obs. of 12 variables:
## $ hour : int 20 4 8 6 10 9 2 3 2 0 ...
## $ day : int 26 13 17 13 14 26 16 18 27 29 ...
## $ month : int 11 12 12 12 12 11 12 12 11 11 ...
## $ source : chr "Beacon Hill" "Beacon Hill" "Beacon Hill" "Beacon Hill" ...
## $ destination : chr "Northeastern University" "Boston University" "Northeastern University" "Northeastern University" ...
## $ name : chr "UberX" "Lyft" "UberX" "UberX" ...
## $ price : num 9 9 8.5 8.5 9 9.5 11 10.5 10.5 9.5 ...
## $ distance : num 1.97 2.35 1.92 1.97 2.31 2.27 1.92 1.92 2.05 2.56 ...
## $ surge_multiplier: num 1 1 1 1 1 1 1 1 1 1 ...
## $ temperature : num 45 23.2 37.8 19.3 27.7 ...
## $ Day : chr "Monday" "Thursday" "Monday" "Thursday" ...
## $ DayType : chr "Weekday" "Weekday" "Weekday" "Weekday" ...
#Converting character variables using as.factor
df_2$source = as.factor(df_2$source)
df_2$destination = as.factor(df_2$destination)
df_2$name = as.factor(df_2$name)
df_2$Day = as.factor(df_2$Day)
df_2$DayType = as.factor(df_2$DayType)
str(df_2)
## 'data.frame': 2901 obs. of 12 variables:
## $ hour : int 20 4 8 6 10 9 2 3 2 0 ...
## $ day : int 26 13 17 13 14 26 16 18 27 29 ...
## $ month : int 11 12 12 12 12 11 12 12 11 11 ...
## $ source : Factor w/ 1 level "Beacon Hill": 1 1 1 1 1 1 1 1 1 1 ...
## $ destination : Factor w/ 2 levels "Boston University",..: 2 1 2 2 1 1 2 2 2 2 ...
## $ name : Factor w/ 2 levels "Lyft","UberX": 2 1 2 2 1 2 2 2 1 2 ...
## $ price : num 9 9 8.5 8.5 9 9.5 11 10.5 10.5 9.5 ...
## $ distance : num 1.97 2.35 1.92 1.97 2.31 2.27 1.92 1.92 2.05 2.56 ...
## $ surge_multiplier: num 1 1 1 1 1 1 1 1 1 1 ...
## $ temperature : num 45 23.2 37.8 19.3 27.7 ...
## $ Day : Factor w/ 7 levels "Friday","Monday",..: 2 5 2 5 1 2 4 6 6 5 ...
## $ DayType : Factor w/ 2 levels "Weekday","Weekend": 1 1 1 1 1 1 2 1 1 1 ...
##Problem 6
# Run the multiple linear regression model
model <- lm(price ~ hour + day + month + destination + name + distance +
surge_multiplier + temperature, data = df_2)
# Display the summary of the model
summary(model)
##
## Call:
## lm(formula = price ~ hour + day + month + destination + name +
## distance + surge_multiplier + temperature, data = df_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5223 -0.6822 -0.3281 0.6433 9.6127
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.130465 1.215243 -0.107 0.915
## hour 0.001309 0.003204 0.409 0.683
## day -0.006029 0.004354 -1.385 0.166
## month -0.055185 0.089729 -0.615 0.539
## destinationNortheastern University 0.180884 0.043497 4.159 3.3e-05 ***
## nameUberX 0.620179 0.049582 12.508 < 2e-16 ***
## distance 1.383539 0.116870 11.838 < 2e-16 ***
## surge_multiplier 7.058836 0.207681 33.989 < 2e-16 ***
## temperature -0.001352 0.003513 -0.385 0.700
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.163 on 2892 degrees of freedom
## Multiple R-squared: 0.3165, Adjusted R-squared: 0.3147
## F-statistic: 167.4 on 8 and 2892 DF, p-value: < 2.2e-16
#Summary of findings
#Significant variables:
#DestinationNortheastern University (p < 0.001) → Rides to Northeastern
#University are slightly more expensive than those to Boston University.
#nameUberX (p < 0.001) → UberX is, on average, more expensive than Lyft.
#distance (p < 0.001) → Strong positive relationship, implying that longer trips cost more.
#surge_multiplier (p < 0.001) → The strongest predictor, surge pricing significantly increases fares.
#Insignificant Variables:
#hour, day, month, and temperature have high p-values (> 0.05),
#suggesting they do not significantly impact price in this dataset.
#Model Fit:
#R² = 0.3165, meaning only 31.65% of the variance in price is explained by the model.
#Residual standard error = 1.163, indicating some unaccounted variability.
#Conclusion:
#The model captures key pricing drivers like distance, ride type, and surge
#pricing but does not explain all variations in price.
#Additional factors may improve the model.
##Problem 7
# From the results above predictors that do not contribute to
#rideshareprices are the predictors with high p-values (> 0.05);
#hour (p = 0.683),day (p = 0.166),month (p = 0.539),temperature (p = 0.700)
#Running a Reduced Model Without Insignificant Predictors
reduced_model <- lm(price ~ destination + name + distance + surge_multiplier, data = df_2)
#Perform a Partial F-Test
#To check if we can remove these predictors simultaneously,
#we compare the full and reduced models using an F-test:
anova(reduced_model, model)
#Based on the Partial F-Test Results
#Null Hypothesis (H0): The removed predictors (hour, day, month,
#temperature) do not significantly contribute to predicting price.
#Alternative Hypothesis (H1): At least one of the removed predictors affects price.
#Key Findings from the ANOVA Table:
#F-statistic = 0.8514
#p-value = 0.4925 (> 0.05)
#Since the p-value is much greater than 0.05, we fail to reject H0,
#meaning the removed predictors do not significantly contribute to the price.
#Conclusion:
#Thus we can safely remove hour, day, month, and temperature from the model
#without losing predictive power.
#The simpler model (price ~ destination +name + distance + surge_multiplier)
#is preferable because it is more efficient and still effectively explains the variance in price.
##Problem 8
#New Model with Day and DayType
# Create the new model with Day and DayType
new_model <- lm(price ~ hour + Day + DayType + destination + name + distance + surge_multiplier + temperature, data = df_2)
# Summary of both models
summary(model)# Original model with day and month
##
## Call:
## lm(formula = price ~ hour + day + month + destination + name +
## distance + surge_multiplier + temperature, data = df_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5223 -0.6822 -0.3281 0.6433 9.6127
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.130465 1.215243 -0.107 0.915
## hour 0.001309 0.003204 0.409 0.683
## day -0.006029 0.004354 -1.385 0.166
## month -0.055185 0.089729 -0.615 0.539
## destinationNortheastern University 0.180884 0.043497 4.159 3.3e-05 ***
## nameUberX 0.620179 0.049582 12.508 < 2e-16 ***
## distance 1.383539 0.116870 11.838 < 2e-16 ***
## surge_multiplier 7.058836 0.207681 33.989 < 2e-16 ***
## temperature -0.001352 0.003513 -0.385 0.700
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.163 on 2892 degrees of freedom
## Multiple R-squared: 0.3165, Adjusted R-squared: 0.3147
## F-statistic: 167.4 on 8 and 2892 DF, p-value: < 2.2e-16
summary(new_model)# New model with Day and DayType
##
## Call:
## lm(formula = price ~ hour + Day + DayType + destination + name +
## distance + surge_multiplier + temperature, data = df_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4839 -0.6651 -0.3068 0.6295 9.5620
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.735e-01 3.821e-01 -2.548 0.0109 *
## hour 8.297e-04 3.303e-03 0.251 0.8017
## DayMonday -7.248e-02 8.671e-02 -0.836 0.4033
## DaySaturday 1.540e-01 8.914e-02 1.727 0.0843 .
## DaySunday 6.135e-02 8.942e-02 0.686 0.4927
## DayThursday 9.697e-03 8.630e-02 0.112 0.9105
## DayTuesday 1.640e-02 8.354e-02 0.196 0.8444
## DayWednesday 5.736e-02 9.229e-02 0.621 0.5343
## DayTypeWeekend NA NA NA NA
## destinationNortheastern University 1.819e-01 4.350e-02 4.180 3e-05 ***
## nameUberX 6.232e-01 4.958e-02 12.569 <2e-16 ***
## distance 1.392e+00 1.170e-01 11.903 <2e-16 ***
## surge_multiplier 7.064e+00 2.076e-01 34.020 <2e-16 ***
## temperature -9.358e-05 3.887e-03 -0.024 0.9808
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.162 on 2888 degrees of freedom
## Multiple R-squared: 0.318, Adjusted R-squared: 0.3152
## F-statistic: 112.2 on 12 and 2888 DF, p-value: < 2.2e-16
# Compare using ANOVA
anova(model, new_model)
#vif(new_model)
#When we run the vif we get an error message
#indicating perfect collinearity or some of the predictors are highly
#correlated with each other
#Model Comparison and Results
# Model Summary (Original vs. New Model):
#Original Model:
#Adjusted R-squared: 0.3147
#F-statistic: 167.4 (p-value: < 2.2e-16)
#Significant predictors: destination(Northeastern University), name (UberX),
#distance, and surge_multiplier.
#The day and month variables, as well as temperature and hour,
#do not appear to be statistically significant.
#New Model (with Day and DayType variables):
#Adjusted R-squared: 0.3152
#F-statistic: 112.2 (p-value: < 2.2e-16)
#Significant predictors: destination(Northeastern University), name (UberX),
#distance, surge_multiplier.
#The Day variables are not statistically significant
#DayType Weekend is not defined due to collinearity with other variables,
#and temperature is not significant.
#2. Interpretation of Key Changes:
#Adjusted R-squared has only marginally improved from 0.3147 to 0.3152,
#indicating that the inclusion of Day and DayType did not substantially
#increase the explanatory power of the model.
#The F-statistic has decreased from 167.4 to 112.2, suggesting that the
#overall fit of the new model is weaker than the original model.
#Most of the Day and DayType variables (such as DayMonday, DaySunday, DayTuesday, etc.)
#are not significant (p-value > 0.05)
#Temperature is still not significant in the new model,
#which aligns with the findings from the original model.
#3. ANOVA Comparison:
#ANOVA results show a p-value of 0.1836, indicating that there is no
#significant difference between the two models.
#This suggests that replacing day and month with Day and DayType does not
#improve the model significantly in terms of explaining rideshare prices.
#Conclusion:
#The addition of Day and DayType variables does not seem to provide a
#substantial improvement in the model.
#The original model, which incorporates day and month, is a bit more
#efficient and better explains the variation in price.
#Adding Day and DayType only slightly improves the explanatory power and
#doesn't provide significant additional value, as shown by the ANOVA
#comparison and the significance of the variables.