Project#2

Libraries Needed

library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

Importing the 3 different datasets

# Load the three raw CSV exports directly from the project's GitHub repository.
MTA <- read.table(
  file = "https://raw.githubusercontent.com/Jlok17/Data-Science-Projects/main/MTA_Daily_Ridership_Data__Beginning_2020.csv",
  header = TRUE,
  sep = ","
)
Covid_Hospital <- read.table(
  file = "https://raw.githubusercontent.com/Jlok17/Data-Science-Projects/main/COVID-19_Daily_Counts_of_Cases__Hospitalizations__and_Deaths.csv",
  header = TRUE,
  sep = ","
)
Gasoline <- read.table(
  file = "https://raw.githubusercontent.com/Jlok17/Data-Science-Projects/main/Gasoline_Retail_Prices_Weekly_Average_by_Region__Beginning_2007.csv",
  header = TRUE,
  sep = ","
)

MTA Dataset:

Question: How did the days of the week compare to one another in transportation rates?

Cleaning Data:

First we will look at the structure of the MTA data set, which has many different columns. To answer our question we will compare the subway percentages across the different days of the week. So the first thing I will do is remove the columns that are not needed and rename the remaining columns to clearer titles, then place the rows into subset data frames corresponding to the different days. I will use row sequences in steps of 7 to get each day into its respective data frame.

# Inspect the column names and types of the raw MTA data.
str(object = MTA)

## 'data.frame':    1097 obs. of  15 variables:
##  $ Date                                                   : chr  "03/02/2023" "03/01/2023" "02/28/2023" "02/27/2023" ...
##  $ Subways..Total.Estimated.Ridership                     : int  3760245 3773706 3408751 3335204 1623164 2041901 3244250 3498242 3454275 3325483 ...
##  $ Subways....of.Comparable.Pre.Pandemic.Day              : num  0.68 0.68 0.63 0.61 0.74 0.71 0.6 0.64 0.64 0.61 ...
##  $ Buses..Total.Estimated.Ridership                       : int  1390337 1477880 1334393 1415210 677706 802244 1244173 1326674 1304613 1272859 ...
##  $ Buses....of.Comparable.Pre.Pandemic.Day                : num  0.62 0.66 0.62 0.66 0.69 0.63 0.58 0.62 0.61 0.59 ...
##  $ LIRR..Total.Estimated.Ridership                        : int  195243 192142 177532 180277 74226 86403 179793 188438 193753 194967 ...
##  $ LIRR....of.Comparable.Pre.Pandemic.Day                 : num  0.62 0.61 0.59 0.6 0.95 0.92 0.59 0.62 0.64 0.64 ...
##  $ Metro.North..Total.Estimated.Ridership                 : int  176678 176661 139326 160346 67702 81742 153058 167193 171187 174283 ...
##  $ Metro.North....of.Comparable.Pre.Pandemic.Day          : num  0.64 0.64 0.52 0.6 0.74 0.62 0.57 0.62 0.64 0.65 ...
##  $ Access.A.Ride..Total.Scheduled.Trips                   : int  28978 29449 26116 26730 15891 15461 26077 27336 27895 27224 ...
##  $ Access.A.Ride....of.Comparable.Pre.Pandemic.Day        : num  0.97 0.99 0.89 0.91 0.94 0.95 0.89 0.93 0.95 0.93 ...
##  $ Bridges.and.Tunnels..Total.Traffic                     : int  934427 901530 751643 854625 807475 831613 928057 906322 869960 864047 ...
##  $ Bridges.and.Tunnels....of.Comparable.Pre.Pandemic.Day  : num  1.01 0.98 0.85 0.97 1.07 1 1.05 1.03 0.98 0.98 ...
##  $ Staten.Island.Railway..Total.Estimated.Ridership       : int  7428 7401 6689 6751 1095 1396 5550 6695 6538 6525 ...
##  $ Staten.Island.Railway....of.Comparable.Pre.Pandemic.Day: num  0.47 0.46 0.41 0.42 0.39 0.33 0.34 0.41 0.4 0.4 ...

# Keep the Date column plus the Subway, Bus and LIRR columns (first seven).
MTA <- MTA[, seq_len(7)]
print(colnames(MTA))

## [1] "Date"                                     
## [2] "Subways..Total.Estimated.Ridership"       
## [3] "Subways....of.Comparable.Pre.Pandemic.Day"
## [4] "Buses..Total.Estimated.Ridership"         
## [5] "Buses....of.Comparable.Pre.Pandemic.Day"  
## [6] "LIRR..Total.Estimated.Ridership"          
## [7] "LIRR....of.Comparable.Pre.Pandemic.Day"

# For each mode of transportation, *.Total is the total estimated ridership
# for that day and *.Percent is that ridership as a fraction of the comparable
# pre-pandemic day.
MTA <- MTA %>%
  rename(
    Subway.Total = Subways..Total.Estimated.Ridership,
    Subway.Percent = Subways....of.Comparable.Pre.Pandemic.Day,
    Bus.Total = Buses..Total.Estimated.Ridership,
    Bus.Percent = Buses....of.Comparable.Pre.Pandemic.Day,
    LIRR.Total = LIRR..Total.Estimated.Ridership,
    LIRR.Percent = LIRR....of.Comparable.Pre.Pandemic.Day
  )
# Parse the month/day/year strings into Date values.
MTA[["Date"]] <- as.Date(MTA[["Date"]], format = "%m/%d/%Y")
# Flip the row order so the last row of the file comes first.
MTA <- MTA[order(seq_len(nrow(MTA)), decreasing = TRUE), ]

# Split the ridership rows into one data frame per day of the week.
# The weekday is derived from the Date column rather than from row position,
# so the split stays correct even if the data does not start on a Sunday or
# a date is missing or duplicated.
# "%w" gives the weekday as "0" (Sunday) through "6" (Saturday) regardless of
# locale; which() drops rows whose Date is NA.
day_index <- format(MTA$Date, "%w")

Sunday <- MTA[which(day_index == "0"), ]
Monday <- MTA[which(day_index == "1"), ]
Tuesday <- MTA[which(day_index == "2"), ]
Wednesday <- MTA[which(day_index == "3"), ]
Thursday <- MTA[which(day_index == "4"), ]
Friday <- MTA[which(day_index == "5"), ]
Saturday <- MTA[which(day_index == "6"), ]

Graphs Pt.1:

As seen in the graphs below, the weekdays have a lower recovery trend compared to the weekend. The graphs show the subway percentage for each day's data frame.

# Monday subway share over time, with a straight-line (lm) trend.
ggplot(data = Monday, mapping = aes(x = Date, y = Subway.Percent)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Monday, Subway Percentage",
    x = "Time",
    y = "Percentage of Passengers"
  )

## `geom_smooth()` using formula = 'y ~ x'

# Tuesday subway share over time, with a straight-line (lm) trend.
ggplot(data = Tuesday, mapping = aes(x = Date, y = Subway.Percent)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Tuesday, Subway Percentage",
    x = "Time",
    y = "Percentage of Passengers"
  )

## `geom_smooth()` using formula = 'y ~ x'

# Wednesday subway share over time, with a straight-line (lm) trend.
ggplot(data = Wednesday, mapping = aes(x = Date, y = Subway.Percent)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Wednesday, Subway Percentage",
    x = "Time",
    y = "Percentage of Passengers"
  )

## `geom_smooth()` using formula = 'y ~ x'

# Thursday subway share over time, with a straight-line (lm) trend.
ggplot(data = Thursday, mapping = aes(x = Date, y = Subway.Percent)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Thursday, Subway Percentage",
    x = "Time",
    y = "Percentage of Passengers"
  )

## `geom_smooth()` using formula = 'y ~ x'

# Friday subway share over time, with a straight-line (lm) trend.
ggplot(data = Friday, mapping = aes(x = Date, y = Subway.Percent)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Friday, Subway Percentage",
    x = "Time",
    y = "Percentage of Passengers"
  )

## `geom_smooth()` using formula = 'y ~ x'

# Saturday subway share over time, with a straight-line (lm) trend.
ggplot(data = Saturday, mapping = aes(x = Date, y = Subway.Percent)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Saturday, Subway Percentage",
    x = "Time",
    y = "Percentage of Passengers"
  )

## `geom_smooth()` using formula = 'y ~ x'

# Sunday subway share over time, with a straight-line (lm) trend.
ggplot(data = Sunday, mapping = aes(x = Date, y = Subway.Percent)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Sunday, Subway Percentage",
    x = "Time",
    y = "Percentage of Passengers"
  )

## `geom_smooth()` using formula = 'y ~ x'

Graphs pt.2:

Below are summaries of the subway percentages for selected days, followed by graphs of the LIRR, which further support the general trend seen in the subway graphs: the weekdays had a lower recovery trend compared to the weekends.

# Quartiles, mean and range of the Saturday subway share.
summary(Saturday[["Subway.Percent"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0800  0.3600  0.5700  0.5128  0.6800  0.9200

# Quartiles, mean and range of the Sunday subway share.
summary(Sunday[["Subway.Percent"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0800  0.3500  0.5800  0.5193  0.6800  0.9700

# Quartiles, mean and range of the Monday subway share.
summary(Monday[["Subway.Percent"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0700  0.3000  0.4800  0.4452  0.5600  1.0400

# Quartiles, mean and range of the Tuesday subway share.
summary(Tuesday[["Subway.Percent"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0800  0.3100  0.4800  0.4469  0.5900  0.9800

# Monday LIRR share over time, with a straight-line (lm) trend.
ggplot(data = Monday, mapping = aes(x = Date, y = LIRR.Percent)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Monday, LIRR Percentage",
    x = "Time",
    y = "Percentage of Passengers"
  )

## `geom_smooth()` using formula = 'y ~ x'

# Tuesday LIRR share over time, with a straight-line (lm) trend.
ggplot(data = Tuesday, mapping = aes(x = Date, y = LIRR.Percent)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Tuesday, LIRR Percentage",
    x = "Time",
    y = "Percentage of Passengers"
  )

## `geom_smooth()` using formula = 'y ~ x'

# Saturday LIRR share over time, with a straight-line (lm) trend.
ggplot(data = Saturday, mapping = aes(x = Date, y = LIRR.Percent)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Saturday, LIRR Percentage",
    x = "Time",
    y = "Percentage of Passengers"
  )

## `geom_smooth()` using formula = 'y ~ x'

# Sunday LIRR share over time, with a straight-line (lm) trend.
ggplot(data = Sunday, mapping = aes(x = Date, y = LIRR.Percent)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Sunday, LIRR Percentage",
    x = "Time",
    y = "Percentage of Passengers"
  )

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 1 rows containing missing values (`geom_point()`).

#### Conclusion of MTA Data Set: In the subway graphs, Saturday and Sunday show a higher recovery rate in the percentage of passengers compared to the pre-pandemic level. This may be because more people want to go outside and travel around the city for leisure and necessities. By contrast, the weaker weekday trend is likely driven by the increase in workers moving to remote/hybrid arrangements. Overall, this shows an increase in foot traffic on the weekend. The LIRR graphs show that the majority of days of the week are peaking towards 0.75, while the weekend has fully recovered or even increased beyond its prior level. The outliers on the Monday graph fall around holidays such as Juneteenth and MLK Day, which had the highest percentage of passengers.

Covid Hospital Dataset:

The analysis here looks at the percentage of Covid-19 cases in each borough. In the transformation below, I convert the values in the date column from character values into Date values. I also rename the columns that I will be subsetting into different data frames. The data I care about most are the citywide case counts, the case counts for the different boroughs, and the 7-day averages of new cases.

# View() opens a GUI data viewer, which is not available when the document is
# knitted or run with Rscript, so only call it in an interactive session.
if (interactive()) {
  View(Covid_Hospital)
}
# Inspect the column names and types of the raw Covid data.
str(Covid_Hospital)

## 'data.frame':    1096 obs. of  67 variables:
##  $ date_of_interest               : chr  "02/29/2020" "03/01/2020" "03/02/2020" "03/03/2020" ...
##  $ CASE_COUNT                     : int  1 0 0 1 5 3 8 7 21 57 ...
##  $ PROBABLE_CASE_COUNT            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HOSPITALIZED_COUNT             : int  1 1 2 7 2 14 8 8 18 36 ...
##  $ DEATH_COUNT                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PROBABLE_DEATH_COUNT           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CASE_COUNT_7DAY_AVG            : int  0 0 0 0 0 0 3 3 6 15 ...
##  $ ALL_CASE_COUNT_7DAY_AVG        : int  0 0 0 0 0 0 3 3 6 15 ...
##  $ HOSP_COUNT_7DAY_AVG            : int  0 0 0 0 0 0 5 6 8 13 ...
##  $ DEATH_COUNT_7DAY_AVG           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ALL_DEATH_COUNT_7DAY_AVG       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BX_CASE_COUNT                  : int  0 0 0 0 0 0 2 0 3 4 ...
##  $ BX_PROBABLE_CASE_COUNT         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BX_HOSPITALIZED_COUNT          : int  0 1 0 1 0 1 1 1 5 7 ...
##  $ BX_DEATH_COUNT                 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BX_PROBABLE_DEATH_COUNT        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BX_CASE_COUNT_7DAY_AVG         : int  0 0 0 0 0 0 0 0 1 1 ...
##  $ BX_PROBABLE_CASE_COUNT_7DAY_AVG: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BX_ALL_CASE_COUNT_7DAY_AVG     : int  0 0 0 0 0 0 0 0 1 1 ...
##  $ BX_HOSPITALIZED_COUNT_7DAY_AVG : int  0 0 0 0 0 0 1 1 1 2 ...
##  $ BX_DEATH_COUNT_7DAY_AVG        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BX_ALL_DEATH_COUNT_7DAY_AVG    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BK_CASE_COUNT                  : int  0 0 0 0 1 3 1 2 5 16 ...
##  $ BK_PROBABLE_CASE_COUNT         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BK_HOSPITALIZED_COUNT          : int  1 0 2 3 1 3 1 3 8 11 ...
##  $ BK_DEATH_COUNT                 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BK_PROBABLE_DEATH_COUNT        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BK_CASE_COUNT_7DAY_AVG         : int  0 0 0 0 0 0 1 1 2 4 ...
##  $ BK_PROBABLE_CASE_COUNT_7DAY_AVG: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BK_ALL_CASE_COUNT_7DAY_AVG     : int  0 0 0 0 0 0 1 1 2 4 ...
##  $ BK_HOSPITALIZED_COUNT_7DAY_AVG : int  0 0 0 0 0 0 2 2 3 4 ...
##  $ BK_DEATH_COUNT_7DAY_AVG        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BK_ALL_DEATH_COUNT_7DAY_AVG    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MN_CASE_COUNT                  : int  1 0 0 0 2 0 3 1 6 24 ...
##  $ MN_PROBABLE_CASE_COUNT         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MN_HOSPITALIZED_COUNT          : int  0 0 0 1 1 5 3 0 1 9 ...
##  $ MN_DEATH_COUNT                 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MN_PROBABLE_DEATH_COUNT        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MN_CASE_COUNT_7DAY_AVG         : int  0 0 0 0 0 0 1 1 2 5 ...
##  $ MN_PROBABLE_CASE_COUNT_7DAY_AVG: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MN_ALL_CASE_COUNT_7DAY_AVG     : int  0 0 0 0 0 0 1 1 2 5 ...
##  $ MN_HOSPITALIZED_COUNT_7DAY_AVG : int  0 0 0 0 0 0 1 1 2 3 ...
##  $ MN_DEATH_COUNT_7DAY_AVG        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MN_ALL_DEATH_COUNT_7DAY_AVG    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ QN_CASE_COUNT                  : int  0 0 0 1 2 0 1 3 6 10 ...
##  $ QN_PROBABLE_CASE_COUNT         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ QN_HOSPITALIZED_COUNT          : int  0 0 0 2 0 4 2 4 4 8 ...
##  $ QN_DEATH_COUNT                 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ QN_PROBABLE_DEATH_COUNT        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ QN_CASE_COUNT_7DAY_AVG         : int  0 0 0 0 0 0 1 1 2 3 ...
##  $ QN_PROBABLE_CASE_COUNT_7DAY_AVG: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ QN_ALL_CASE_COUNT_7DAY_AVG     : int  0 0 0 0 0 0 1 1 2 3 ...
##  $ QN_HOSPITALIZED_COUNT_7DAY_AVG : int  0 0 0 0 0 0 1 2 2 3 ...
##  $ QN_DEATH_COUNT_7DAY_AVG        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ QN_ALL_DEATH_COUNT_7DAY_AVG    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SI_CASE_COUNT                  : int  0 0 0 0 0 0 1 1 1 3 ...
##  $ SI_PROBABLE_CASE_COUNT         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SI_HOSPITALIZED_COUNT          : int  0 0 0 0 0 1 1 0 0 2 ...
##  $ SI_DEATH_COUNT                 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SI_PROBABLE_DEATH_COUNT        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SI_PROBABLE_CASE_COUNT_7DAY_AVG: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SI_CASE_COUNT_7DAY_AVG         : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ SI_ALL_CASE_COUNT_7DAY_AVG     : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ SI_HOSPITALIZED_COUNT_7DAY_AVG : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ SI_DEATH_COUNT_7DAY_AVG        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SI_ALL_DEATH_COUNT_7DAY_AVG    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ INCOMPLETE                     : int  0 0 0 0 0 0 0 0 0 0 ...

# Parse the month/day/year strings into Date values, then list the columns.
Covid_Hospital[["date_of_interest"]] <- as.Date(
  Covid_Hospital[["date_of_interest"]],
  format = "%m/%d/%Y"
)
colnames(Covid_Hospital)

##  [1] "date_of_interest"                "CASE_COUNT"                     
##  [3] "PROBABLE_CASE_COUNT"             "HOSPITALIZED_COUNT"             
##  [5] "DEATH_COUNT"                     "PROBABLE_DEATH_COUNT"           
##  [7] "CASE_COUNT_7DAY_AVG"             "ALL_CASE_COUNT_7DAY_AVG"        
##  [9] "HOSP_COUNT_7DAY_AVG"             "DEATH_COUNT_7DAY_AVG"           
## [11] "ALL_DEATH_COUNT_7DAY_AVG"        "BX_CASE_COUNT"                  
## [13] "BX_PROBABLE_CASE_COUNT"          "BX_HOSPITALIZED_COUNT"          
## [15] "BX_DEATH_COUNT"                  "BX_PROBABLE_DEATH_COUNT"        
## [17] "BX_CASE_COUNT_7DAY_AVG"          "BX_PROBABLE_CASE_COUNT_7DAY_AVG"
## [19] "BX_ALL_CASE_COUNT_7DAY_AVG"      "BX_HOSPITALIZED_COUNT_7DAY_AVG" 
## [21] "BX_DEATH_COUNT_7DAY_AVG"         "BX_ALL_DEATH_COUNT_7DAY_AVG"    
## [23] "BK_CASE_COUNT"                   "BK_PROBABLE_CASE_COUNT"         
## [25] "BK_HOSPITALIZED_COUNT"           "BK_DEATH_COUNT"                 
## [27] "BK_PROBABLE_DEATH_COUNT"         "BK_CASE_COUNT_7DAY_AVG"         
## [29] "BK_PROBABLE_CASE_COUNT_7DAY_AVG" "BK_ALL_CASE_COUNT_7DAY_AVG"     
## [31] "BK_HOSPITALIZED_COUNT_7DAY_AVG"  "BK_DEATH_COUNT_7DAY_AVG"        
## [33] "BK_ALL_DEATH_COUNT_7DAY_AVG"     "MN_CASE_COUNT"                  
## [35] "MN_PROBABLE_CASE_COUNT"          "MN_HOSPITALIZED_COUNT"          
## [37] "MN_DEATH_COUNT"                  "MN_PROBABLE_DEATH_COUNT"        
## [39] "MN_CASE_COUNT_7DAY_AVG"          "MN_PROBABLE_CASE_COUNT_7DAY_AVG"
## [41] "MN_ALL_CASE_COUNT_7DAY_AVG"      "MN_HOSPITALIZED_COUNT_7DAY_AVG" 
## [43] "MN_DEATH_COUNT_7DAY_AVG"         "MN_ALL_DEATH_COUNT_7DAY_AVG"    
## [45] "QN_CASE_COUNT"                   "QN_PROBABLE_CASE_COUNT"         
## [47] "QN_HOSPITALIZED_COUNT"           "QN_DEATH_COUNT"                 
## [49] "QN_PROBABLE_DEATH_COUNT"         "QN_CASE_COUNT_7DAY_AVG"         
## [51] "QN_PROBABLE_CASE_COUNT_7DAY_AVG" "QN_ALL_CASE_COUNT_7DAY_AVG"     
## [53] "QN_HOSPITALIZED_COUNT_7DAY_AVG"  "QN_DEATH_COUNT_7DAY_AVG"        
## [55] "QN_ALL_DEATH_COUNT_7DAY_AVG"     "SI_CASE_COUNT"                  
## [57] "SI_PROBABLE_CASE_COUNT"          "SI_HOSPITALIZED_COUNT"          
## [59] "SI_DEATH_COUNT"                  "SI_PROBABLE_DEATH_COUNT"        
## [61] "SI_PROBABLE_CASE_COUNT_7DAY_AVG" "SI_CASE_COUNT_7DAY_AVG"         
## [63] "SI_ALL_CASE_COUNT_7DAY_AVG"      "SI_HOSPITALIZED_COUNT_7DAY_AVG" 
## [65] "SI_DEATH_COUNT_7DAY_AVG"         "SI_ALL_DEATH_COUNT_7DAY_AVG"    
## [67] "INCOMPLETE"

# Give the citywide and per-borough case columns shorter names.
# Prefixes: SI, BK, BX, QN, MN are the borough codes used in the raw columns.
Covid_Hospital <- rename(
  Covid_Hospital,
  # Citywide
  Date = date_of_interest,
  Case.Count = CASE_COUNT,
  Aver.7Day = CASE_COUNT_7DAY_AVG,
  All.Aver.7Day = ALL_CASE_COUNT_7DAY_AVG,
  # Daily case counts per borough
  SI.Case.Count = SI_CASE_COUNT,
  BK.Case.Count = BK_CASE_COUNT,
  BX.Case.Count = BX_CASE_COUNT,
  QN.Case.Count = QN_CASE_COUNT,
  MN.Case.Count = MN_CASE_COUNT,
  # 7-day average of cases per borough
  SI.7Day.Aver = SI_CASE_COUNT_7DAY_AVG,
  BK.7Day.Aver = BK_CASE_COUNT_7DAY_AVG,
  BX.7Day.Aver = BX_CASE_COUNT_7DAY_AVG,
  QN.7Day.Aver = QN_CASE_COUNT_7DAY_AVG,
  MN.7Day.Aver = MN_CASE_COUNT_7DAY_AVG,
  # 7-day average of all cases (the ALL_* columns) per borough
  SI.7Day.Aver.All = SI_ALL_CASE_COUNT_7DAY_AVG,
  BK.7Day.Aver.All = BK_ALL_CASE_COUNT_7DAY_AVG,
  BX.7Day.Aver.All = BX_ALL_CASE_COUNT_7DAY_AVG,
  QN.7Day.Aver.All = QN_ALL_CASE_COUNT_7DAY_AVG,
  MN.7Day.Aver.All = MN_ALL_CASE_COUNT_7DAY_AVG
)


# Keep only the renamed columns, in citywide-then-borough order.
Covid_Hospital <- Covid_Hospital %>%
  select(
    Date, Case.Count, All.Aver.7Day, Aver.7Day,
    SI.Case.Count, SI.7Day.Aver.All, SI.7Day.Aver,
    BK.Case.Count, BK.7Day.Aver.All, BK.7Day.Aver,
    BX.Case.Count, BX.7Day.Aver.All, BX.7Day.Aver,
    QN.Case.Count, QN.7Day.Aver.All, QN.7Day.Aver,
    MN.Case.Count, MN.7Day.Aver.All, MN.7Day.Aver
  )

# One data frame per borough: the shared citywide columns plus that borough's
# own three case columns.

SI <- Covid_Hospital %>%
  select(Date, Case.Count, All.Aver.7Day, Aver.7Day,
         SI.Case.Count, SI.7Day.Aver.All, SI.7Day.Aver)
BK <- Covid_Hospital %>%
  select(Date, Case.Count, All.Aver.7Day, Aver.7Day,
         BK.Case.Count, BK.7Day.Aver.All, BK.7Day.Aver)
BX <- Covid_Hospital %>%
  select(Date, Case.Count, All.Aver.7Day, Aver.7Day,
         BX.Case.Count, BX.7Day.Aver.All, BX.7Day.Aver)
MN <- Covid_Hospital %>%
  select(Date, Case.Count, All.Aver.7Day, Aver.7Day,
         MN.Case.Count, MN.7Day.Aver.All, MN.7Day.Aver)
QN <- Covid_Hospital %>%
  select(Date, Case.Count, All.Aver.7Day, Aver.7Day,
         QN.Case.Count, QN.7Day.Aver.All, QN.7Day.Aver)

# Ratio of each borough's 7-day average case count to the citywide 7-day
# average.
SI$Ratio <- SI$SI.7Day.Aver / SI$Aver.7Day
BK$Ratio <- BK$BK.7Day.Aver / BK$Aver.7Day
BX$Ratio <- BX$BX.7Day.Aver / BX$Aver.7Day
MN$Ratio <- MN$MN.7Day.Aver / MN$Aver.7Day
QN$Ratio <- QN$QN.7Day.Aver / QN$Aver.7Day
# A citywide average of 0 makes the division yield NaN (0/0) or Inf (x/0).
# Reset those ratios to 0. Only the Ratio column is touched, so missing values
# in Date or the count columns are never silently overwritten.
SI$Ratio[!is.finite(SI$Ratio)] <- 0
BK$Ratio[!is.finite(BK$Ratio)] <- 0
BX$Ratio[!is.finite(BX$Ratio)] <- 0
MN$Ratio[!is.finite(MN$Ratio)] <- 0
QN$Ratio[!is.finite(QN$Ratio)] <- 0

Graphs:

Here we take each borough's 7-day average case count and plot its ratio to the 7-day average of total cases in NYC.

# Staten Island's share of the citywide 7-day average over time.
ggplot(data = SI, mapping = aes(x = Date, y = Ratio)) +
  geom_point() +
  labs(
    title = "Staten Island Case Percentage",
    x = "Time from 2/2020-2/2023",
    y = "Percentage of Cases in Borough compared to Total"
  )

# Brooklyn's share of the citywide 7-day average over time.
ggplot(data = BK, mapping = aes(x = Date, y = Ratio)) +
  geom_point() +
  labs(
    title = "Brooklyn Case Percentage",
    x = "Time from 2/2020-2/2023",
    y = "Percentage of Cases in Borough compared to Total"
  )

# The Bronx's share of the citywide 7-day average over time.
ggplot(data = BX, mapping = aes(x = Date, y = Ratio)) +
  geom_point() +
  labs(
    title = "Bronx Case Percentage",
    x = "Time from 2/2020-2/2023",
    y = "Percentage of Cases in Borough compared to Total"
  )

# Manhattan's share of the citywide 7-day average over time.
# The title spelling is corrected from "Manhatten" to "Manhattan".
ggplot(MN, aes(x = Date, y = Ratio)) +
  geom_point() +
  labs(
    title = "Manhattan Case Percentage",
    x = "Time from 2/2020-2/2023",
    y = "Percentage of Cases in Borough compared to Total"
  )

# Queens' share of the citywide 7-day average over time.
ggplot(data = QN, mapping = aes(x = Date, y = Ratio)) +
  geom_point() +
  labs(
    title = "Queens Case Percentage",
    x = "Time from 2/2020-2/2023",
    y = "Percentage of Cases in Borough compared to Total"
  )

# Quartiles, mean and range of Staten Island's case ratio.
summary(SI[["Ratio"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.06073 0.06901 0.07352 0.08707 0.15882

# Quartiles, mean and range of Brooklyn's case ratio.
summary(BK[["Ratio"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.2894  0.3010  0.3076  0.3231  0.4593

# Quartiles, mean and range of the Bronx's case ratio.
summary(BX[["Ratio"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.1304  0.1612  0.1578  0.1818  0.2602

# Quartiles, mean and range of Manhattan's case ratio.
summary(MN[["Ratio"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.1506  0.1815  0.1880  0.2051  0.3973

# Quartiles, mean and range of Queens' case ratio.
summary(QN[["Ratio"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.2468  0.2774  0.2673  0.2912  0.3333

Conclusion for Covid-19 Dataset:

An interesting finding in this data set is that the highest percentage ratio on average comes from Brooklyn compared to every other borough, with Staten Island being the lowest on average. This is interesting to me because Queens comes next after Brooklyn, whereas at face value one would expect Manhattan to lead, since it is thought to have the most foot traffic of any borough. So for Manhattan to have only 18.8% on average is interesting.

Gasoline Data Set:

In this data set we will be looking at the gas prices across the different cities in New York. This is interesting because comparing the different prices can help account for cost of living, as gas prices are a major factor in the CPI report and in overall living costs. For the most part, I will only be taking a few of the cities, as some of the cities don't have data that stretches across the entire recorded time period. The main cities that will be looked at are Albany, Syracuse, Rochester, and NYC.

### Gasoline
### Determining gas prices for the different major cities in New York

str(object = Gasoline)

## 'data.frame':    801 obs. of  18 variables:
##  $ Date                          : chr  "02/27/2023" "02/20/2023" "02/13/2023" "02/06/2023" ...
##  $ New.York.State.Average....gal.: num  3.47 3.51 3.54 3.57 3.56 3.48 3.44 3.45 3.41 3.45 ...
##  $ Albany.Average....gal.        : num  3.51 3.54 3.85 3.61 3.59 3.48 3.41 3.44 3.4 3.44 ...
##  $ Batavia.Average....gal.       : num  3.38 3.38 3.72 3.44 3.43 3.37 3.32 3.35 3.32 3.33 ...
##  $ Binghamton.Average....gal.    : num  3.41 3.45 3.77 3.52 3.5 3.45 3.44 3.44 3.44 3.47 ...
##  $ Buffalo.Average....gal.       : num  3.39 3.41 3.75 3.45 3.45 3.41 3.4 3.44 3.46 3.49 ...
##  $ Dutchess.Average....gal.      : num  3.56 3.59 3.9 3.63 3.62 3.54 3.51 3.51 3.48 3.51 ...
##  $ Elmira.Average....gal.        : num  3.4 3.43 3.77 3.51 3.47 3.37 3.29 3.3 3.33 3.38 ...
##  $ Glens.Falls.Average....gal.   : num  3.62 3.63 3.88 3.69 3.67 3.58 3.51 3.53 3.47 3.49 ...
##  $ Ithaca.Average....gal.        : num  3.47 3.5 3.82 3.53 3.51 3.49 3.53 3.57 3.57 3.59 ...
##  $ Kingston.Average....gal.      : num  3.38 3.4 3.74 3.49 3.47 3.38 3.31 3.3 3.27 3.34 ...
##  $ Nassau.Average....gal.        : num  3.36 3.42 3.73 3.58 3.49 3.37 3.3 3.29 3.2 3.25 ...
##  $ New.York.City.Average....gal. : num  3.48 3.54 3.82 3.64 3.62 3.53 3.5 3.53 3.45 3.5 ...
##  $ Rochester.Average....gal.     : num  3.45 3.47 3.78 3.52 3.51 3.46 3.46 3.49 3.49 3.53 ...
##  $ Syracuse.Average....gal.      : num  3.45 3.47 3.81 3.52 3.5 3.44 3.41 3.43 3.4 3.45 ...
##  $ Utica.Average....gal.         : num  3.5 3.53 3.87 3.62 3.62 3.59 3.57 3.59 3.5 3.53 ...
##  $ Watertown.Average....gal.     : num  3.48 3.5 3.86 3.56 3.54 3.47 3.44 3.46 3.44 3.47 ...
##  $ White.Plains.Average....gal.  : num  3.56 3.61 3.94 3.66 3.66 3.58 3.54 3.54 3.49 3.53 ...

# Parse the month/day/year strings into Date values.
Gasoline[["Date"]] <- as.Date(Gasoline[["Date"]], format = "%m/%d/%Y")
# Flip the row order so the last row of the file comes first.
Gasoline <- Gasoline[order(seq_len(nrow(Gasoline)), decreasing = TRUE), ]
# Keep the date, the state average and the four cities of interest, giving
# each price column a shorter name as it is selected.
Gasoline <- Gasoline %>%
  select(
    Date,
    NYS.Price = New.York.State.Average....gal.,
    Albany.Price = Albany.Average....gal.,
    Syracuse.Price = Syracuse.Average....gal.,
    Rochester.Price = Rochester.Average....gal.,
    NYC.Price = New.York.City.Average....gal.
  )

How I would Transpose the Data set into a wide Structure:

To showcase the transformation into a wider structure, the code below shows how one would transpose the data frame so that each date becomes a column.

# Transpose the price columns so each row is a region and each column a week.
# Only the numeric price columns are transposed, so the values stay numeric
# throughout instead of being converted to text and back, and no variable
# named `rownames` is created to mask base::rownames().
wide_format <- t(as.matrix(Gasoline[, names(Gasoline) != "Date", drop = FALSE]))
# Label each column with its date, formatted as text.
colnames(wide_format) <- format(Gasoline$Date)
str(wide_format)

##  num [1:5, 1:801] 2.98 2.97 2.99 3 2.97 3.1 3.09 3.1 3.09 3.07 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:5] "NYS.Price" "Albany.Price" "Syracuse.Price" "Rochester.Price" ...
##   ..$ : chr [1:801] "2007-10-29" "2007-11-05" "2007-11-12" "2007-11-19" ...

Data Analysis:

Below we create another column of data for comparison: the percentage difference between each city's price and the state average price of gas ($/gal).

# One data frame per city: the date, the state average and that city's price.
Syracuse <- select(Gasoline, Date, NYS.Price, Syracuse.Price)
Albany <- select(Gasoline, Date, NYS.Price, Albany.Price)
Rochester <- select(Gasoline, Date, NYS.Price, Rochester.Price)
NYC <- select(Gasoline, Date, NYS.Price, NYC.Price)

# Percent difference of each city's price from the state average:
# (city - state) / state * 100.
Syracuse$Percent.Diff <- with(
  Syracuse,
  ((Syracuse.Price - NYS.Price) / NYS.Price) * 100
)
Albany$Percent.Diff <- with(
  Albany,
  ((Albany.Price - NYS.Price) / NYS.Price) * 100
)
Rochester$Percent.Diff <- with(
  Rochester,
  ((Rochester.Price - NYS.Price) / NYS.Price) * 100
)
NYC$Percent.Diff <- with(
  NYC,
  ((NYC.Price - NYS.Price) / NYS.Price) * 100
)

Summary Analysis:

As seen below, the majority of the cities other than NYC have a lower percentage difference, which for the most part is negative compared to the state average.

# Quartiles, mean and range of Syracuse's percent difference.
summary(Syracuse[["Percent.Diff"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -8.621  -3.704  -2.643  -2.749  -1.460   7.627

# Quartiles, mean and range of Albany's percent difference.
summary(Albany[["Percent.Diff"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -6.087  -3.376  -2.320  -2.332  -1.258   8.757

# Quartiles, mean and range of Rochester's percent difference.
summary(Rochester[["Percent.Diff"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -4.6025 -2.0000 -1.3274 -1.1742 -0.5249  6.7797

# Quartiles, mean and range of NYC's percent difference.
summary(NYC[["Percent.Diff"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -1.225   1.796   2.564   2.723   3.425   7.910

Comparison:

Here we graph a side-by-side comparison of the percentage difference in gas price for Syracuse and NYC.

# Put both cities' percent differences side by side with the Syracuse dates.
# The column names are spelled out; they match the names data.frame() would
# otherwise derive from the expressions.
Comparison <- data.frame(
  Syracuse.Date = Syracuse$Date,
  Syracuse.Percent.Diff = Syracuse$Percent.Diff,
  NYC.Percent.Diff = NYC$Percent.Diff
)

# Each series is keyed by its own city name and the colours are assigned by
# name, so the legend no longer depends on a label swap to come out right.
# `breaks` keeps the legend order as Syracuse (red), then NYC (blue).
ggplot() +
  geom_line(
    data = Comparison,
    aes(x = Syracuse.Date, y = Syracuse.Percent.Diff, color = "Syracuse")
  ) +
  geom_line(
    data = Comparison,
    aes(x = Syracuse.Date, y = NYC.Percent.Diff, color = "NYC")
  ) +
  scale_color_manual(
    values = c(Syracuse = "red", NYC = "blue"),
    breaks = c("Syracuse", "NYC")
  ) +
  xlab('Time from 10/29/2007-2/27/2023') +
  ylab('Percent Difference')

Conclusion for Gas Prices

As seen in the graph, NYC is substantially higher than Syracuse throughout the period from 2007 into 2023. This can indicate a higher cost of living in NYC as reflected in gas prices, since gas is a necessity for getting to work and traveling. Another interesting point is that cities typically have higher volumes of public transportation, so one might expect the cities to subsidize travel costs to assist and accommodate every economic class.