HW08

Author

Xiangzhe Li

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   4.0.0     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(moderndive)
theme_set(theme_minimal())
data(bikes, package = "bayesrules")
glimpse(bikes)

Rows: 500
Columns: 13
$ date        <date> 2011-01-01, 2011-01-03, 2011-01-04, 2011-01-05, 2011-01-0…
$ season      <fct> winter, winter, winter, winter, winter, winter, winter, wi…
$ year        <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
$ month       <fct> Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan…
$ day_of_week <fct> Sat, Mon, Tue, Wed, Fri, Sat, Mon, Tue, Wed, Thu, Fri, Sat…
$ weekend     <lgl> TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALS…
$ holiday     <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, yes, n…
$ temp_actual <dbl> 57.39952, 46.49166, 46.76000, 48.74943, 46.50332, 44.17700…
$ temp_feel   <dbl> 64.72625, 49.04645, 51.09098, 52.63430, 50.79551, 46.60286…
$ humidity    <dbl> 80.5833, 43.7273, 59.0435, 43.6957, 49.8696, 53.5833, 48.2…
$ windspeed   <dbl> 10.749882, 16.636703, 10.739832, 12.522300, 11.304642, 17.…
$ weather_cat <fct> categ2, categ1, categ1, categ1, categ2, categ2, categ1, ca…
$ rides       <int> 654, 1229, 1454, 1518, 1362, 891, 1280, 1220, 1137, 1368, …

Question 0

#Number of rows
nrow(bikes)

[1] 500

#Meaning of each row
#A row represents one day of bike rentals, including the date, weather conditions, and number of rides on that day.

#Dates covered by the dataset
range(bikes$date)

[1] "2011-01-01" "2012-12-31"

#Highest observed ridership
max(bikes$rides)

[1] 6946

#Highest recorded windspeed
max(bikes$windspeed)

[1] 34.00002

Question 1

cor(bikes$rides, bikes$temp_feel)

[1] 0.5824898

#The correlation between the number of rides and the feels-like temperature is 0.5824898.

cor(bikes$rides, bikes$windspeed)

[1] -0.1949352

#The correlation between number of rides and the windspeed is -0.1949352.

Question 2

# Add wind speed in km/h, using the rounded conversion factor 1.61
bikes <- mutate(bikes, wind_kph = windspeed * 1.61)

# Correlation between the original and the rescaled wind speed
with(bikes, cor(windspeed, wind_kph))

[1] 1

#The correlation between wind speed in MPH and wind speed in KPH is 1. This is because wind_kph is a linear transformation of windspeed (wind_kph = 1.61 × windspeed): the two variables fall exactly on a straight line with a positive slope, so they are perfectly correlated.

Question 3

# Simple linear regression of rides on the original windspeed column,
# followed by the coefficient table and fit statistics.
model_mph <- lm(rides ~ windspeed, data = bikes)
summary(model_mph)


Call:
lm(formula = rides ~ windspeed, data = bikes)

Residuals:
    Min      1Q  Median      3Q     Max 
-3365.6 -1167.2   -58.7  1087.9  3650.1 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  4205.06     177.04  23.752  < 2e-16 ***
windspeed     -55.52      12.52  -4.435 1.13e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1546 on 498 degrees of freedom
Multiple R-squared:  0.038, Adjusted R-squared:  0.03607 
F-statistic: 19.67 on 1 and 498 DF,  p-value: 1.133e-05

# Same regression as model_mph, but with the rescaled wind_kph predictor
# (wind_kph = windspeed * 1.61), to compare the fitted coefficients.
model_kph <- lm(rides ~ wind_kph, data = bikes)
summary(model_kph)


Call:
lm(formula = rides ~ wind_kph, data = bikes)

Residuals:
    Min      1Q  Median      3Q     Max 
-3365.6 -1167.2   -58.7  1087.9  3650.1 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 4205.065    177.038  23.752  < 2e-16 ***
wind_kph     -34.486      7.775  -4.435 1.13e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1546 on 498 degrees of freedom
Multiple R-squared:  0.038, Adjusted R-squared:  0.03607 
F-statistic: 19.67 on 1 and 498 DF,  p-value: 1.133e-05

#The two models have the same intercept, but their slopes are different. When we change wind speed from miles per hour to kilometers per hour, we are only changing the unit, not the real relationship between wind speed and bike rides. The intercept stays the same because zero miles per hour is the same as zero kilometers per hour. However, the slope becomes smaller in magnitude because one mile per hour equals 1.61 kilometers per hour, so the new slope is the old slope divided by 1.61 (-55.52 / 1.61 ≈ -34.49). This means the numbers look different, but both models tell the same story: as wind speed increases, the number of rides goes down.

Question 4

# Predicted rides when windspeed = 20 (model fitted on the mph column)
predict(model_mph, newdata = tibble(windspeed = 20))

       1 
3094.611

# Predicted rides when wind_kph = 20 (model fitted on the km/h column)
predict(model_kph, newdata = tibble(wind_kph = 20))

       1 
3515.342

Question 5

# Convert the feels-like temperature from Fahrenheit to Celsius
bikes <- mutate(bikes, temp_c = (temp_feel - 32) * 5 / 9)

# Multiple regression of rides on wind_kph and temp_c (both predictors
# entered additively), followed by the model summary.
model_multi <- lm(rides ~ wind_kph + temp_c, data = bikes)
summary(model_multi)


Call:
lm(formula = rides ~ wind_kph + temp_c, data = bikes)

Residuals:
    Min      1Q  Median      3Q     Max 
-3302.6 -1035.0  -137.6   989.8  3481.8 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  942.426    255.833   3.684 0.000255 ***
wind_kph     -19.842      6.459  -3.072 0.002244 ** 
temp_c       143.234      9.238  15.504  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1271 on 497 degrees of freedom
Multiple R-squared:  0.3516,    Adjusted R-squared:  0.349 
F-statistic: 134.8 on 2 and 497 DF,  p-value: < 2.2e-16

#Slope1: 
#Holding temperature constant, for a 1 km/h increase in wind speed, the model expects about 19.8 fewer rides.
#Slope2: 
#Holding wind speed constant, for a 1 °C increase in feels-like temperature, the model expects about 143.2 more rides.
#Intercept: When wind speed is 0 km/h and the temperature feels like 0 °C, the model expects about 942 rides.

Question 6

# Three scenarios to predict: feels-like temperature (temp_c) and wind
# speed (wind_kph), using the same column names the model was fitted on.
pred_df <- tibble(
  situation = 1:3,
  temp_c = c(25, 15, 10),
  wind_kph = c(15, 5, 40)
)

# Predict from the fitted model object instead of retyping rounded
# coefficients, so the predictions use full-precision estimates and stay
# in sync if model_multi is ever refit. predict() ignores the extra
# `situation` column; unname() keeps the column a plain numeric vector.
pred_df <- pred_df |>
  mutate(predicted_rides = unname(predict(model_multi, newdata = pred_df)))

pred_df

# A tibble: 3 × 4
  situation temp_c wind_kph predicted_rides
      <int>  <dbl>    <dbl>           <dbl>
1         1     25       15           4226.
2         2     15        5           2992.
3         3     10       40           1581.

Question 7

# Extend the wind + temperature model with the logical `weekend` indicator;
# its coefficient appears as `weekendTRUE` in the summary.
model_weekend <- lm(rides ~ wind_kph + temp_c + weekend, data = bikes)
summary(model_weekend)


Call:
lm(formula = rides ~ wind_kph + temp_c + weekend, data = bikes)

Residuals:
    Min      1Q  Median      3Q     Max 
-3490.3  -948.8  -107.0   936.4  3258.0 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 1214.803    252.126   4.818 1.93e-06 ***
wind_kph     -20.386      6.256  -3.259   0.0012 ** 
temp_c       140.339      8.960  15.662  < 2e-16 ***
weekendTRUE -713.575    122.478  -5.826 1.02e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1230 on 496 degrees of freedom
Multiple R-squared:  0.3931,    Adjusted R-squared:  0.3895 
F-statistic: 107.1 on 3 and 496 DF,  p-value: < 2.2e-16

#Holding wind speed and temperature constant, the model expects about 714 fewer rides on a weekend day (TRUE) than on a weekday (FALSE).

Question 8

mean(bikes$temp_c)

[1] 20.63539

mean(bikes$wind_kph)

[1] 20.96062

# Weekday vs. weekend scenario at the sample-average temperature and wind
pred_df <- tibble(weekend = c(FALSE, TRUE)) |>
  mutate(
    temp_c = mean(bikes$temp_c),
    wind_kph = mean(bikes$wind_kph)
  )

# Attach the model's predicted rides for each of the two scenarios
pred_df <- mutate(
  pred_df,
  predicted_rides = predict(model_weekend, newdata = pred_df)
)

pred_df

# A tibble: 2 × 4
  weekend temp_c wind_kph predicted_rides
  <lgl>    <dbl>    <dbl>           <dbl>
1 FALSE     20.6     21.0           3683.
2 TRUE      20.6     21.0           2970.

Question 9

library(broom)

# Attach fitted values and residuals to every observation in bikes
aug_df <- augment(model_weekend, data = bikes)

# Keep the single observation with the largest absolute residual
aug_df |>
  mutate(abs_resid = abs(.resid)) |>
  slice_max(abs_resid, n = 1, with_ties = FALSE) |>
  select(date, rides, .fitted, .resid, abs_resid, weekend, temp_c, wind_kph)

# A tibble: 1 × 8
  date       rides .fitted .resid abs_resid weekend temp_c wind_kph
  <date>     <int>   <dbl>  <dbl>     <dbl> <lgl>    <dbl>    <dbl>
1 2012-10-29    20   3510. -3490.     3490. FALSE     22.0     38.6

#The biggest residual was on October 29, 2012, during Hurricane Sandy. On that day, the actual number of rides was only 20, but the model predicted around 3,500. This happened because the model only uses temperature, wind speed, and weekend information, and it does not account for extreme weather events. Since almost no one rode bikes during the hurricane, the model’s prediction is far too high.