library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(ggplot2)
library(dplyr)
library(lubridate)

SeoulBikeData <- read.csv("C:\\Users\\Jakob\\Documents\\1. Classes\\H510 Stats\\SeoulBikeData.csv")

#First, let's examine the summary statistics of hourly bikeshare rentals for a few times in the day. This will give us an intuition for what normal usage looks like. 
SeoulBikeData |>
  filter(hour == 8) |>
  summarise(
    min_rented = min(rented_bikes),
    max_rented = max(rented_bikes),
    mean_rented = mean(rented_bikes),
    median_rented = median(rented_bikes)
  )
##   min_rented max_rented mean_rented median_rented
## 1          0       2495    1015.701           728
SeoulBikeData |>
  filter(hour == 12) |>
  summarise(
    min_rented = min(rented_bikes),
    max_rented = max(rented_bikes),
    mean_rented = mean(rented_bikes),
    median_rented = median(rented_bikes)
  )
##   min_rented max_rented mean_rented median_rented
## 1          0       1798    699.4411           709
SeoulBikeData |>
  filter(hour == 16) |>
  summarise(
    min_rented = min(rented_bikes),
    max_rented = max(rented_bikes),
    mean_rented = mean(rented_bikes),
    median_rented = median(rented_bikes)
  )
##   min_rented max_rented mean_rented median_rented
## 1          0       2479    930.6219           911
SeoulBikeData |>
  filter(hour == 19) |>
  summarise(
    min_rented = min(rented_bikes),
    max_rented = max(rented_bikes),
    mean_rented = mean(rented_bikes),
    median_rented = median(rented_bikes)
  )
##   min_rented max_rented mean_rented median_rented
## 1          0       2984    1195.148          1224
#We can tell from the summary statistics that commuting hours are not peak hours for bikeshare. This is suprising because from an American point of view, I assumed that commuting hours would have the highest demand, however 9pm is far busier than 8am, 12pm, or 4pm. This makes me wonder how demand varies throughout the day, and the cultural and systemic drivers of this.
  
#Next I will take the summary statistics of solar radiation at noon. This field describes the amount of solar energy per square meter. I'd like to what values to expect.
  
SeoulBikeData |>
  filter(hour == 12) |>  
  summarise(
    min_rads = min(solar_radiation_mJ_m2),
    max_rads = max(solar_radiation_mJ_m2),
    mean_rads = mean(solar_radiation_mJ_m2),
    median_rads = median(solar_radiation_mJ_m2)
  )
##   min_rads max_rads mean_rads median_rads
## 1        0     3.44   1.82211        1.91
#A normal amount of solar radiation seems to be around 1.8 MJ/M^2, however occasionally it has reached a high of 3.44. What caused this condition, how damaging is it to human health, and how does this flucuate throughout the year?

#Next I will calculate value counts from seasons, holidays, and functioning_days to see how many observations fall in each category and check their unique values.
table(SeoulBikeData$seasons)
## 
## Autumn Spring Summer Winter 
##   2184   2208   2208   2160
#Interestingly, seasonal demand is flat.
table(SeoulBikeData$holiday)
## 
##    Holiday No Holiday 
##        432       8328
#Not many holidays.
table(SeoulBikeData$functioning_day)
## 
##   No  Yes 
##  295 8465
#Not many non-functional days.

#Question 1: How does temperature and precipitation affect bikeshare usage?
#Question 2: What is the expected decrease in riders per inch of snow?
#Question 3: What times of day have the greatest and least demand for bikeshare?

aggregate(rented_bikes ~ hour, data = SeoulBikeData, FUN = mean)
##    hour rented_bikes
## 1     0     541.4603
## 2     1     426.1836
## 3     2     301.6301
## 4     3     203.3315
## 5     4     132.5918
## 6     5     139.0822
## 7     6     287.5644
## 8     7     606.0055
## 9     8    1015.7014
## 10    9     645.9836
## 11   10     527.8219
## 12   11     600.8521
## 13   12     699.4411
## 14   13     733.2466
## 15   14     758.8247
## 16   15     829.1863
## 17   16     930.6219
## 18   17    1138.5096
## 19   18    1502.9260
## 20   19    1195.1479
## 21   20    1068.9644
## 22   21    1031.4493
## 23   22     922.7973
## 24   23     671.1260
#This information tell us that there is a spike of demand at 8am, a slight decrease in the following two hours, then a steady rise until peak hours between 5-9pm, and finally a steady fall to lows between 4-5 am. The large spike at 6pm may be caused by many people leaving work. The data shows that while commuting is an important driver of bikeshare demand, the evening hours exhibit heavy demand. The data also shows that there is still plenty of bikeshare activity happening during the night. This output would be important for bikeshare business operations and planning.


ggplot(SeoulBikeData, mapping = aes(x = temp_c, y = rented_bikes)) +
  geom_point(mapping = aes(color = seasons)) +
  labs(
    title = "Scatter plot of Temperature and Bike Rentals by Season",
    x = "Temperature(celsius)", 
    y = "Number of Rented Bikes"
  ) +
  theme_minimal()

#The data below shows a clear correlation between temperature and bikeshare usage. As expected, winter sees fewer riders while summer has the most. However, when temperature is very hot, ridership decreases.

ggplot(SeoulBikeData, aes(x = as.factor(hour), y = rented_bikes)) + 
  geom_boxplot() +
  labs(
    title = "Boxplot of Rented Bikes by Hour",
    x = "Hour", 
    y = "Number of Rented Bikes"
  ) +
  theme_minimal()

#This boxplots shows us the distribution of demand by hours. The data is bimodal, which means there are two peaks in the day. Before work and after work are the busiest times. Demand grows steadily during the day until 6pm when decling steadily declines.


SeoulBikeData <- SeoulBikeData |>
  separate(date, into = c("day", "month", "year"), sep = "/") |>
  mutate(
    day = as.numeric(day),
    month = as.numeric(month),
    year = as.numeric(year)
  )
  
SeoulBikeData <- SeoulBikeData |>
  mutate(
    date = as.Date(paste(year,month,day, sep = "/", collapse = NULL))
  )

solar_month <- SeoulBikeData |>
  group_by(month, seasons) |>
  summarise(mean_solar_radiation = mean(solar_radiation_mJ_m2)) |>
  arrange(as.numeric(month))
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
solar_day <- SeoulBikeData |>
  group_by(day, seasons) |>
  summarise(mean_solar_radiation = mean(solar_radiation_mJ_m2)) |>
  arrange(as.numeric(day))
## `summarise()` has grouped output by 'day'. You can override using the `.groups`
## argument.
ggplot(solar_day, aes(x = as.numeric(day), y = mean_solar_radiation, color = seasons)) +
  geom_point(size = 3) +
  geom_line() +
  labs(
    title = "Seasonality of Solar Radiation",
    x = "Day",
    y = "Mean Solar Radiation (mJ/m²)",
    color = "Season"
  ) +
  theme_minimal()