Homework 2

# Load necessary packages
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.4.3

##Problem 1

# Read the raw rideshare data, keeping text columns as character (not factor).
# The location below is an absolute, machine-specific path.
rides_csv <- "C:/Users/doris/OneDrive/Documents/rideshare_kaggle.csv/rideshare_kaggle.csv"
df <- read.csv(rides_csv, stringsAsFactors = FALSE)

##Problem 2

# Keep only the UberX and Lyft rides, restricted to the columns used below
ride_types <- c("UberX", "Lyft")
model_columns <- c(
  "hour", "day", "month", "source", "destination",
  "name", "price", "distance", "surge_multiplier", "temperature"
)
df_filtered <- df[df$name %in% ride_types, model_columns]

# Number of rows (data lines) left after filtering
total_rows <- nrow(df_filtered)
print(total_rows)

## [1] 106329

# Count the missing values in each column of the filtered dataset
na_flags <- is.na(df_filtered)
missing_values <- colSums(na_flags)
print(missing_values)

##             hour              day            month           source 
##                0                0                0                0 
##      destination             name            price         distance 
##                0                0                0                0 
## surge_multiplier      temperature 
##                0                0

# Total number of missing values across all columns
total_missing <- sum(is.na(df_filtered))
print(total_missing)

## [1] 0

# After filtering, the dataset has 106,329 rows (data lines),
# and there are no missing values in any column.

##Problem 3

# Build a calendar date for every ride from its month and day-of-month.
# The year is hard-coded to 2018 because df_filtered has no year column --
# NOTE(review): confirm that all rides fall in 2018.
ride_date <- as.Date(
  paste("2018", df_filtered$month, df_filtered$day, sep = "-"),
  format = "%Y-%m-%d"
)

# Fail loudly instead of silently labelling unparseable dates as "Weekday"
stopifnot(!anyNA(ride_date))

# Numeric weekday (0 = Sunday, ..., 6 = Saturday). Unlike weekdays(), this
# does not depend on the session locale, so the weekend test below cannot
# silently fail on a non-English system.
weekday_index <- as.POSIXlt(ride_date)$wday

# Create 'Day' as the (English) name of the day of the week
day_names <- c(
  "Sunday", "Monday", "Tuesday", "Wednesday",
  "Thursday", "Friday", "Saturday"
)
df_filtered$Day <- day_names[weekday_index + 1]

# Create 'DayType' to classify as 'Weekday' or 'Weekend'
df_filtered$DayType <- ifelse(weekday_index %in% c(0, 6), "Weekend", "Weekday")

##Problem 4

# Keep only rides that start at Beacon Hill and end at one of the two
# universities
from_beacon_hill <- df_filtered$source == "Beacon Hill"
to_university <- df_filtered$destination %in%
  c("Boston University", "Northeastern University")
df_filtered <- df_filtered[from_beacon_hill & to_university, ]

# Number of rows remaining in the updated dataset
new_rows <- nrow(df_filtered)
print(new_rows)

## [1] 2901

# Preview the first rows of the filtered data
head(df_filtered)

# After filtering, we currently have 2,901 rows remaining in the dataset. 
#This means there are 2,901 rides recorded from Beacon Hill to either 
#Boston University or Northeastern University in our dataset.

##Problem 5

# Load the ggplot2 package
library(ggplot2)

# Fill colours for the two ride types
ride_colours <- c("UberX" = "#5cc9f5", "Lyft" = "#b131a2")

# Boxplots of price by destination, drawn in one facet per ride type
# (UberX and Lyft)
price_boxplot <- ggplot(
  data = df_filtered,
  mapping = aes(x = destination, y = price, fill = name)
) +
  geom_boxplot() +
  facet_wrap(~ name) +
  scale_fill_manual(values = ride_colours) +
  labs(
    title = "Price Distribution by Destination for UberX and Lyft",
    x = "Destination",
    y = "Price"
  ) +
  theme_minimal()
print(price_boxplot)

# Interpretation of the plots above
# The plots above present a comparison of ride prices for UberX and Lyft
# from Beacon Hill to two destinations: Boston University and Northeastern University.
# For Boston University, the boxplots show that the price for Lyft is lower
# than the price for UberX,
# indicating that Lyft is cheaper for rides to Boston University.

# For Northeastern University, the price for UberX also appears to be higher than the price for Lyft.

# Thus, in conclusion, Lyft is cheaper for both destinations.

##Problem 6

# Regress price on every candidate predictor (multiple linear regression)
model <- lm(
  price ~ hour + day + month + destination + name +
    distance + surge_multiplier + temperature,
  data = df_filtered
)

# Print the coefficient estimates, significance tests and fit statistics
print(summary(model))

## 
## Call:
## lm(formula = price ~ hour + day + month + destination + name + 
##     distance + surge_multiplier + temperature, data = df_filtered)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5223 -0.6822 -0.3281  0.6433  9.6127 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        -0.130465   1.215243  -0.107    0.915    
## hour                                0.001309   0.003204   0.409    0.683    
## day                                -0.006029   0.004354  -1.385    0.166    
## month                              -0.055185   0.089729  -0.615    0.539    
## destinationNortheastern University  0.180884   0.043497   4.159  3.3e-05 ***
## nameUberX                           0.620179   0.049582  12.508  < 2e-16 ***
## distance                            1.383539   0.116870  11.838  < 2e-16 ***
## surge_multiplier                    7.058836   0.207681  33.989  < 2e-16 ***
## temperature                        -0.001352   0.003513  -0.385    0.700    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.163 on 2892 degrees of freedom
## Multiple R-squared:  0.3165, Adjusted R-squared:  0.3147 
## F-statistic: 167.4 on 8 and 2892 DF,  p-value: < 2.2e-16

##Conclusions
#Intercept: Not significant (p = 0.915). It doesn't provide a meaningful
#baseline effect on price.

#hour: Not significant (p = 0.683). The hour of the day does not
#significantly affect the price.

#day: Not significant (p = 0.166). Here `day` is the numeric day of the
#month (not the day of the week); it does not have a meaningful effect on price.

#month: Not significant (p = 0.539). The month doesn't significantly affect
#the price.
#destination (Northeastern University): Significant (p = 3.3e-05). 
#Rides to Northeastern University are $0.18 more expensive than to Boston University.

#name (UberX): Significant (p < 2e-16). UberX rides cost $0.62 more than Lyft rides.

#distance: Significant (p < 2e-16). Each additional unit of distance increases price by $1.38.

#surge_multiplier: Significant (p < 2e-16). Each unit increase in surge
#multiplier raises the price by $7.06.

#temperature: Not significant (p = 0.700). Temperature does not have a
#significant effect on price.


#Model Performance
#F-statistic = 167.4, p < 2.2e-16: This indicates the overall model is
#statistically significant, meaning at least one of the predictors has a
#significant effect on price.

#R-squared = 0.3165 (Adjusted R² = 0.3147): The model explains only 31.65%
#of the variation in the price.
#This suggests there are other factors influencing the price not captured in the model.

##Problem 7

#Yes, there are predictors that do not contribute to rideshare prices.
#From the model output, the following predictors are not significant (p > 0.05):
#hour (p = 0.683), day (p = 0.166), month (p = 0.539) and temperature (p = 0.700).

#These variables do not appear to have a meaningful impact on the rideshare price and might not be necessary in the model.

# Full model: all eight candidate predictors
model_full <- lm(
  price ~ hour + day + month + destination + name +
    distance + surge_multiplier + temperature,
  data = df_filtered
)

# Reduced model: the full model refitted without hour, day, month and
# temperature
model_reduced <- update(model_full, . ~ . - hour - day - month - temperature)

# Partial F-test: do the four dropped predictors jointly improve the fit?
print(anova(model_reduced, model_full))

#Conclusions
#Partial F-Test Results
#The p-value from the partial F-test is 0.4925.
#Since this value is greater than 0.05, we fail to reject the null
#hypothesis. This suggests that removing the non-significant predictors
#(hour, day, month, and temperature) does not significantly worsen the model.

#Thus, we can conclude that these predictors can be removed from the model
#without a substantial loss in explanatory power.

##Problem 8

# Full model with Day and DayType used in place of day and month.
# DayType is completely determined by Day, so lm() cannot estimate its
# coefficient; the summary reports it as NA ("not defined because of
# singularities").
model_daytype <- lm(
  price ~ hour + Day + DayType + destination + name +
    distance + surge_multiplier + temperature,
  data = df_filtered
)

print(summary(model_daytype))

## 
## Call:
## lm(formula = price ~ hour + Day + DayType + destination + name + 
##     distance + surge_multiplier + temperature, data = df_filtered)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4839 -0.6651 -0.3068  0.6295  9.5620 
## 
## Coefficients: (1 not defined because of singularities)
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        -9.735e-01  3.821e-01  -2.548   0.0109 *  
## hour                                8.297e-04  3.303e-03   0.251   0.8017    
## DayMonday                          -7.248e-02  8.671e-02  -0.836   0.4033    
## DaySaturday                         1.540e-01  8.914e-02   1.727   0.0843 .  
## DaySunday                           6.135e-02  8.942e-02   0.686   0.4927    
## DayThursday                         9.697e-03  8.630e-02   0.112   0.9105    
## DayTuesday                          1.640e-02  8.354e-02   0.196   0.8444    
## DayWednesday                        5.736e-02  9.229e-02   0.621   0.5343    
## DayTypeWeekend                             NA         NA      NA       NA    
## destinationNortheastern University  1.819e-01  4.350e-02   4.180    3e-05 ***
## nameUberX                           6.232e-01  4.958e-02  12.569   <2e-16 ***
## distance                            1.392e+00  1.170e-01  11.903   <2e-16 ***
## surge_multiplier                    7.064e+00  2.076e-01  34.020   <2e-16 ***
## temperature                        -9.358e-05  3.887e-03  -0.024   0.9808    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.162 on 2888 degrees of freedom
## Multiple R-squared:  0.318,  Adjusted R-squared:  0.3152 
## F-statistic: 112.2 on 12 and 2888 DF,  p-value: < 2.2e-16

##Results
#The addition of Day and DayType variables shows minimal improvement in the
#model's R-squared (from 0.3165 to 0.318), suggesting they don't
#significantly affect the price prediction.

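#Key findings:
#Significant predictors: destination, nameUberX, distance, and surge_multiplier.
#Non-significant predictors (at the 0.05 level): hour, all Day variables, and temperature.
#Saturday (p = 0.0843) is only marginally significant, at the 0.10 level.
#DayType has no estimate (NA) because it is completely determined by Day.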

#Model fit: The model explains 31.8% of the variance in price (R-squared), and the F-statistic indicates the model is significant

# F-test of the reduced model against the model that adds hour, Day, DayType
# and temperature
daytype_test <- anova(model_reduced, model_daytype)
print(daytype_test)

#Conclusions
#The F-test compares the reduced model with the model that adds hour, Day,
#DayType and temperature. It indicates that these additional variables do
#not significantly improve the model.
#The p-value (0.2925) is greater than the usual significance level of 0.05,
#suggesting that the extra variables don't contribute meaningfully to
#explaining the variability in rideshare prices. Therefore, the simpler
#reduced model without these variables is preferred.

Homework 2

Doris Mbitazi Asongafac

2025-02-19