Homework 2

# Load necessary packages
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.4.3

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

##Problem 1

#Loading the dataset
df <- read.csv("C:/Users/doris/OneDrive/Documents/rideshare_kaggle.csv/rideshare_kaggle.csv", stringsAsFactors = FALSE)

##Problem 2

# Filter the data to keep the relevant columns and rows for "UberX" and "Lyft"
df_1 <- df[df$name %in% c("UberX", "Lyft"), c("hour", "day", "month", "source", "destination", "name", "price", "distance", "surge_multiplier", "temperature")]

# Count the number of rows(data lines) after filtering
total_rows <- nrow(df_1)
total_rows

## [1] 106329

# Check for missing values in the dataset
missing_values <- colSums(is.na(df_1))
missing_values

##             hour              day            month           source 
##                0                0                0                0 
##      destination             name            price         distance 
##                0                0                0                0 
## surge_multiplier      temperature 
##                0                0

# This will show the total number of missing values across all columns
# in the dataset
total_missing = sum(is.na(df_1))
total_missing

## [1] 0

#After filtering the dataset we now have a total of 106,329 data lines and
#There are no missing values

##Problem 3

# Create 'Day' as the day of the week
df_1$Day = weekdays(as.Date(paste("2018", df_1$month, df_1$day, sep = "-"), format = "%Y-%m-%d"))

# Create 'DayType' to classify as 'Weekday' or 'Weekend'
df_1$DayType <- ifelse(df_1$Day %in% c("Saturday", "Sunday"), "Weekend", "Weekday")

##Problem 4

# Filter the data based on source and destination
df_2 <- df_1 %>%
  filter(source == "Beacon Hill" & destination %in% c("Boston University", "Northeastern University"))

# Display the first few rows of the filtered data
head(df_2)

# Count the number of rows after filtering
total_rows <- nrow(df_2)

# Print the total number of rows
print(total_rows)

## [1] 2901

# After filtering, we currently have 2,901 rows remaining in the dataset. 
#This means there are 2,901 rides recorded from Beacon Hill to either 
#Boston University or Northeastern University in our dataset.

##Problem 5

# Creating side by side boxplots of price against destinations
ggplot(df_2, aes(x = destination, y = price, fill = name)) +
  geom_boxplot(position = position_dodge(0.75), width = 0.6) + 
  theme_minimal() +
  labs(title = "Price Distribution by Destination for UberX and Lyft",
       x = "Destination", 
       y = "Price") +
  scale_fill_manual(values = c("UberX" = "blue", "Lyft" = "purple"))

#The median prices (represented by the horizontal line in each box) are
#fairly similar across both services and destinations, hovering around $10

#In terms of variability UberX displays greater price variability than Lyft
# for both destinations. This is evident from the wider boxes and longer
#whiskers for UberX  indicating a large range.
#This implies that UberX prices fluctuate more than Lyft prices, 
#with greater spread from the cheapest to the most expensive rides.

#For Boston University:
#Lyft's median appears to be slightly lower than UberX's median

#For Northeastern University
#Again, Lyft's median price appears marginally lower than UberX's median
#The Lyft box is positioned slightly lower than UberX's box

#Therefore, based on this data, Lyft tends to be marginally cheaper than
#UberX for both destinations, though the difference is small.
#This suggests Lyft might offer slightly better typical prices, 
#but the difference is probably just a dollar or two. 
#However, both services show similar patterns of occasional high prices (outliers), 
#so the cheapest option might vary depending on the time and demand.

str(df_2)

## 'data.frame':    2901 obs. of  12 variables:
##  $ hour            : int  20 4 8 6 10 9 2 3 2 0 ...
##  $ day             : int  26 13 17 13 14 26 16 18 27 29 ...
##  $ month           : int  11 12 12 12 12 11 12 12 11 11 ...
##  $ source          : chr  "Beacon Hill" "Beacon Hill" "Beacon Hill" "Beacon Hill" ...
##  $ destination     : chr  "Northeastern University" "Boston University" "Northeastern University" "Northeastern University" ...
##  $ name            : chr  "UberX" "Lyft" "UberX" "UberX" ...
##  $ price           : num  9 9 8.5 8.5 9 9.5 11 10.5 10.5 9.5 ...
##  $ distance        : num  1.97 2.35 1.92 1.97 2.31 2.27 1.92 1.92 2.05 2.56 ...
##  $ surge_multiplier: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ temperature     : num  45 23.2 37.8 19.3 27.7 ...
##  $ Day             : chr  "Monday" "Thursday" "Monday" "Thursday" ...
##  $ DayType         : chr  "Weekday" "Weekday" "Weekday" "Weekday" ...

#Converting character variables using as.factor
df_2$source = as.factor(df_2$source)
df_2$destination = as.factor(df_2$destination)
df_2$name = as.factor(df_2$name)
df_2$Day = as.factor(df_2$Day)
df_2$DayType = as.factor(df_2$DayType)

str(df_2)

## 'data.frame':    2901 obs. of  12 variables:
##  $ hour            : int  20 4 8 6 10 9 2 3 2 0 ...
##  $ day             : int  26 13 17 13 14 26 16 18 27 29 ...
##  $ month           : int  11 12 12 12 12 11 12 12 11 11 ...
##  $ source          : Factor w/ 1 level "Beacon Hill": 1 1 1 1 1 1 1 1 1 1 ...
##  $ destination     : Factor w/ 2 levels "Boston University",..: 2 1 2 2 1 1 2 2 2 2 ...
##  $ name            : Factor w/ 2 levels "Lyft","UberX": 2 1 2 2 1 2 2 2 1 2 ...
##  $ price           : num  9 9 8.5 8.5 9 9.5 11 10.5 10.5 9.5 ...
##  $ distance        : num  1.97 2.35 1.92 1.97 2.31 2.27 1.92 1.92 2.05 2.56 ...
##  $ surge_multiplier: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ temperature     : num  45 23.2 37.8 19.3 27.7 ...
##  $ Day             : Factor w/ 7 levels "Friday","Monday",..: 2 5 2 5 1 2 4 6 6 5 ...
##  $ DayType         : Factor w/ 2 levels "Weekday","Weekend": 1 1 1 1 1 1 2 1 1 1 ...

##Problem 6

# Run the multiple linear regression model
model <- lm(price ~ hour + day + month + destination + name + distance +
              surge_multiplier + temperature, data = df_2)

# Display the summary of the model
summary(model)

## 
## Call:
## lm(formula = price ~ hour + day + month + destination + name + 
##     distance + surge_multiplier + temperature, data = df_2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5223 -0.6822 -0.3281  0.6433  9.6127 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        -0.130465   1.215243  -0.107    0.915    
## hour                                0.001309   0.003204   0.409    0.683    
## day                                -0.006029   0.004354  -1.385    0.166    
## month                              -0.055185   0.089729  -0.615    0.539    
## destinationNortheastern University  0.180884   0.043497   4.159  3.3e-05 ***
## nameUberX                           0.620179   0.049582  12.508  < 2e-16 ***
## distance                            1.383539   0.116870  11.838  < 2e-16 ***
## surge_multiplier                    7.058836   0.207681  33.989  < 2e-16 ***
## temperature                        -0.001352   0.003513  -0.385    0.700    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.163 on 2892 degrees of freedom
## Multiple R-squared:  0.3165, Adjusted R-squared:  0.3147 
## F-statistic: 167.4 on 8 and 2892 DF,  p-value: < 2.2e-16

#Summary of findings
#Significant variables:
#DestinationNortheastern University (p < 0.001) → Rides to Northeastern
#University are slightly more expensive than those to Boston University.

#nameUberX (p < 0.001) → UberX is, on average, more expensive than Lyft.

#distance (p < 0.001) → Strong positive relationship, implying that longer trips cost more.

#surge_multiplier (p < 0.001) → The strongest predictor, surge pricing significantly increases fares.

#Insignificant Variables:
#hour, day, month, and temperature have high p-values (> 0.05), 
#suggesting they do not significantly impact price in this dataset.

#Model Fit:
#R² = 0.3165, meaning only 31.65% of the variance in price is explained by the model.
#Residual standard error = 1.163, indicating some unaccounted variability.

#Conclusion:
#The model captures key pricing drivers like distance, ride type, and surge
#pricing but does not explain all variations in price. 
#Additional factors  may improve the model.

##Problem 7

# From the results above predictors that do not contribute to
#rideshareprices are the predictors with high p-values (> 0.05);
#hour (p = 0.683),day (p = 0.166),month (p = 0.539),temperature (p = 0.700)

#Running a Reduced Model Without Insignificant Predictors
reduced_model <- lm(price ~ destination + name + distance + surge_multiplier, data = df_2)

#Perform a Partial F-Test
#To check if we can remove these predictors simultaneously,
#we compare the full and reduced models using an F-test:

anova(reduced_model, model)

#Based on the Partial F-Test Results 
#Null Hypothesis (H0): The removed predictors (hour, day, month,
#temperature) do not significantly contribute to predicting price.

#Alternative Hypothesis (H1): At least one of the removed predictors affects price.

#Key Findings from the ANOVA Table:
#F-statistic = 0.8514
#p-value = 0.4925 (> 0.05)

#Since the p-value is much greater than 0.05, we fail to reject H0,
#meaning the removed predictors do not significantly contribute to the price.

#Conclusion:
#Thus we can safely remove hour, day, month, and temperature from the model
#without losing predictive power. 
#The simpler model (price ~ destination +name + distance + surge_multiplier)
#is preferable because it is more efficient and still effectively explains the variance in price.

##Problem 8

#New Model with Day and DayType
# Create the new model with Day and DayType
new_model <- lm(price ~ hour + Day + DayType + destination + name + distance + surge_multiplier + temperature, data = df_2)

# Summary of both models
summary(model)# Original model with day and month

## 
## Call:
## lm(formula = price ~ hour + day + month + destination + name + 
##     distance + surge_multiplier + temperature, data = df_2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5223 -0.6822 -0.3281  0.6433  9.6127 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        -0.130465   1.215243  -0.107    0.915    
## hour                                0.001309   0.003204   0.409    0.683    
## day                                -0.006029   0.004354  -1.385    0.166    
## month                              -0.055185   0.089729  -0.615    0.539    
## destinationNortheastern University  0.180884   0.043497   4.159  3.3e-05 ***
## nameUberX                           0.620179   0.049582  12.508  < 2e-16 ***
## distance                            1.383539   0.116870  11.838  < 2e-16 ***
## surge_multiplier                    7.058836   0.207681  33.989  < 2e-16 ***
## temperature                        -0.001352   0.003513  -0.385    0.700    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.163 on 2892 degrees of freedom
## Multiple R-squared:  0.3165, Adjusted R-squared:  0.3147 
## F-statistic: 167.4 on 8 and 2892 DF,  p-value: < 2.2e-16

summary(new_model)# New model with Day and DayType

## 
## Call:
## lm(formula = price ~ hour + Day + DayType + destination + name + 
##     distance + surge_multiplier + temperature, data = df_2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4839 -0.6651 -0.3068  0.6295  9.5620 
## 
## Coefficients: (1 not defined because of singularities)
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        -9.735e-01  3.821e-01  -2.548   0.0109 *  
## hour                                8.297e-04  3.303e-03   0.251   0.8017    
## DayMonday                          -7.248e-02  8.671e-02  -0.836   0.4033    
## DaySaturday                         1.540e-01  8.914e-02   1.727   0.0843 .  
## DaySunday                           6.135e-02  8.942e-02   0.686   0.4927    
## DayThursday                         9.697e-03  8.630e-02   0.112   0.9105    
## DayTuesday                          1.640e-02  8.354e-02   0.196   0.8444    
## DayWednesday                        5.736e-02  9.229e-02   0.621   0.5343    
## DayTypeWeekend                             NA         NA      NA       NA    
## destinationNortheastern University  1.819e-01  4.350e-02   4.180    3e-05 ***
## nameUberX                           6.232e-01  4.958e-02  12.569   <2e-16 ***
## distance                            1.392e+00  1.170e-01  11.903   <2e-16 ***
## surge_multiplier                    7.064e+00  2.076e-01  34.020   <2e-16 ***
## temperature                        -9.358e-05  3.887e-03  -0.024   0.9808    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.162 on 2888 degrees of freedom
## Multiple R-squared:  0.318,  Adjusted R-squared:  0.3152 
## F-statistic: 112.2 on 12 and 2888 DF,  p-value: < 2.2e-16

# Compare using ANOVA
anova(model, new_model)

#vif(new_model)
#When we run the vif we get an error message 
#indicating perfect collinearity or some of the predictors are highly
#correlated with each other

#Model Comparison and Results
# Model Summary (Original vs. New Model):

#Original Model:
#Adjusted R-squared: 0.3147
#F-statistic: 167.4 (p-value: < 2.2e-16)
#Significant predictors: destination(Northeastern University), name (UberX),
#distance, and surge_multiplier.
#The day and month variables, as well as temperature and hour, 
#do not appear to be statistically significant.

#New Model (with Day and DayType variables):
#Adjusted R-squared: 0.3152
#F-statistic: 112.2 (p-value: < 2.2e-16)
#Significant predictors: destination(Northeastern University), name (UberX),
#distance, surge_multiplier.
#The Day variables are not statistically significant
#DayType Weekend is not defined due to collinearity with other variables, 
#and temperature is not significant.

#2. Interpretation of Key Changes:
#Adjusted R-squared has only marginally improved from 0.3147 to 0.3152,
#indicating that the inclusion of Day and DayType did not substantially
#increase the explanatory power of the model.

#The F-statistic has decreased from 167.4 to 112.2, suggesting that the
#overall fit of the new model is weaker than the original model.
#Most of the Day and DayType variables (such as DayMonday, DaySunday, DayTuesday, etc.) 
#are not significant (p-value > 0.05)
#Temperature is still not significant in the new model, 
#which aligns with the findings from the original model.

#3. ANOVA Comparison:
#ANOVA results show a p-value of 0.1836, indicating that there is no
#significant difference between the two models. 
#This suggests that replacing day and month with Day and DayType does not
#improve the model significantly in terms of explaining rideshare prices.

#Conclusion:
#The addition of Day and DayType variables does not seem to provide a
#substantial improvement in the model. 
#The original model, which incorporates day and month, is a bit more
#efficient and better explains the variation in price. 
#Adding Day and DayType only slightly improves the explanatory power and
#doesn't provide significant additional value, as shown by the ANOVA
#comparison and the significance of the variables.

Homework 2

Doris Mbitazi Asongafac

2025-02-23