DATA 607 Project 2 Beshkia Kvarnstrom

knitr::opts_chunk$set(echo = TRUE)
pkges <- c("tidyr","readr","kableExtra","dplyr","ggplot2","stringr", "scales")

# Loop through the packages
for (p in pkges) {
  # Check if package is installed
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p) #If the package is not installed, install the package
    
    library(p, character.only = TRUE) #Load the package
  } else {
    library(p, character.only = TRUE) #If the package is already installed, load the package
  }
}

The goal of this assignment is to give you practice in preparing different datasets for downstream analysis work.

Your task is to:

  1. Choose any three of the “wide” datasets identified in the Week 6 Discussion items. (You may use your own dataset; please don’t use my Sample Post dataset, since that was used in your Week 6 assignment!) For each of the three chosen datasets:

 Create a .CSV file (or optionally, a MySQL database!) that includes all of the information included in the dataset. You’re encouraged to use a “wide” structure similar to how the information appears in the discussion item, so that you can practice tidying and transformations as described below.

 Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data. [Most of your grade will be based on this step!]

 Perform the analysis requested in the discussion item.

 Your code should be in an R Markdown file, posted to rpubs.com, and should include narrative descriptions of your data cleanup work, analysis, and conclusions.

  1. Please include in your homework submission, for each of the three chosen datasets:

 The URL to the .Rmd file in your GitHub repository, and  The URL for your rpubs.com web page.

Dataset 1 - Student Test Score Results

Import Data from Github and load in R

Test_Score_Data <- read_csv('https://raw.githubusercontent.com/BeshkiaKvarnstrom/MSDS-DATA607/main/student_results.csv', 
                          show_col_types = FALSE)

Create dataframe to store the data

Test_Score_Data <- data.frame(Test_Score_Data)
Test_Score_Data %>% 
head(20) %>%
  kable() %>% 
  kable_styling(bootstrap_options = "striped", font_size = 12, fixed_thead = T) %>%  scroll_box(height = "300px", width = "100%")
id name phone sex.and.age test.number term.1 term.2 term.3
1 Mike 134 m_12 test 1 76 84 87
2 Linda 270 f_13 test 1 88 90 73
3 Sam 210 m_11 test 1 78 74 80
4 Esther 617 f_12 test 1 68 75 74
5 Mary 114 f_14 test 1 65 67 64
1 Mike 134 m_12 test 2 85 80 90
2 Linda 270 f_13 test 2 87 82 94
3 Sam 210 m_11 test 2 80 87 80
4 Esther 617 f_12 test 2 70 75 78
5 Mary 114 f_14 test 2 68 70 63

Tidying the Student Test Score Results data

colnames(Test_Score_Data) <- c("ID", "NAME", "PHONE", "GENDER", "AGE", "TEST NO.", "TERM", "TEST SCORE")
colnames(Test_Score_Data) <- c("ID", "NAME", "PHONE", "GENDER", "AGE", "TEST NO.", "TERM", "TEST SCORE")
Test_Score_Data%>% 
head(20) %>%
  kable() %>% 
  kable_styling(bootstrap_options = "striped", font_size = 12, fixed_thead = T) %>%  scroll_box(height = "300px", width = "100%")
ID NAME PHONE GENDER AGE TEST NO. TERM TEST SCORE
1 Mike 134 m_12 test 1 76 84 87
2 Linda 270 f_13 test 1 88 90 73
3 Sam 210 m_11 test 1 78 74 80
4 Esther 617 f_12 test 1 68 75 74
5 Mary 114 f_14 test 1 65 67 64
1 Mike 134 m_12 test 2 85 80 90
2 Linda 270 f_13 test 2 87 82 94
3 Sam 210 m_11 test 2 80 87 80
4 Esther 617 f_12 test 2 70 75 78
5 Mary 114 f_14 test 2 68 70 63

Analyze and Visualize the Student Test Score Results data

# Analyzing the test score to determine the minimum, maximum, mean and median test scores for each term by gender. From the analysis you can see that females did better than males overall.
Agg_TestScores <- Test_Score_Data %>% group_by(GENDER) %>% group_by(GENDER, TERM) %>%
            summarise(
                      "MIN TEST SCORE" = min(`TEST SCORE`), 
                      "MAX TEST SCORE" = max(`TEST SCORE`),
"MEAN TEST SCORE" = mean(`TEST SCORE`),              "MEDIAN TEST SCORE" = median(`TEST SCORE`),     
                     .groups = 'drop'
                    )

Agg_TestScores%>% 
head(20) %>%
  kable() %>% 
  kable_styling(bootstrap_options = "striped", font_size = 12, fixed_thead = T) %>%  scroll_box(height = "300px", width = "100%")
GENDER TERM MIN TEST SCORE MAX TEST SCORE MEAN TEST SCORE MEDIAN TEST SCORE
f_12 75 74 78 76 76
f_13 82 94 94 94 94
f_13 90 73 73 73 73
f_14 67 64 64 64 64
f_14 70 63 63 63 63
m_11 74 80 80 80 80
m_11 87 80 80 80 80
m_12 80 90 90 90 90
m_12 84 87 87 87 87
# The following plot compares the test scores for each gender and the terms taken. 
ggplot(data = Agg_TestScores) +
  geom_bar(mapping = aes(x = TERM, y = `MEAN TEST SCORE`, fill = GENDER), stat = 'identity') +
  facet_grid(~ GENDER) +
  theme(axis.text.x = element_text(angle = 70, hjust = 1, size=5)) +
  labs(title = 'AVERAGE TEST SCORES PER TERM', subtitle = "By Gender")

# This visualition displays the test scores per student for each term
ggplot(data = Test_Score_Data) +
  geom_bar(mapping = aes(x=NAME, y=`TEST SCORE`, fill = `TERM`), stat = 'identity') +
  facet_grid(~ `TEST NO.`) +
  theme(axis.text.x = element_text(angle = 70, hjust = 1, size=5)) +
  labs(title = 'STUDENT TEST SCORES' , subtitle = "By Term")

Dataset 2 -

Int_Rate_Data <- read_csv('https://raw.githubusercontent.com/BeshkiaKvarnstrom/MSDS-DATA607/main/interest_rate_of_banks.csv', 
                          show_col_types = FALSE)

Create dataframe to store the data

Int_Rate_Data <- data.frame(Int_Rate_Data)
Int_Rate_Data%>% 
head(20) %>%
  kable() %>% 
  kable_styling(bootstrap_options = "striped", font_size = 12, fixed_thead = T) %>%  scroll_box(height = "300px", width = "100%")
date federal_reserve_system european_central_bank swiss_national_bank bank_england reserve_bank_australia bank_japan bank_brazil
1/1/1970 NA NA NA 8.0 NA NA NA
2/1/1970 NA NA NA 8.0 NA NA NA
3/1/1970 NA NA NA 8.0 NA NA NA
4/1/1970 NA NA NA 7.5 NA NA NA
5/1/1970 NA NA NA 7.0 NA NA NA
6/1/1970 NA NA NA 7.0 NA NA NA
7/1/1970 NA NA NA 7.0 NA NA NA
8/1/1970 NA NA NA 7.0 NA NA NA
9/1/1970 NA NA NA 7.0 NA NA NA
10/1/1970 NA NA NA 7.0 NA NA NA
11/1/1970 NA NA NA 7.0 NA NA NA
12/1/1970 NA NA NA 7.0 NA NA NA
1/1/1971 NA NA NA 7.0 NA NA NA
2/1/1971 NA NA NA 7.0 NA NA NA
3/1/1971 NA NA NA 7.0 NA NA NA
4/1/1971 NA NA NA 7.0 NA NA NA
5/1/1971 NA NA NA 6.0 NA NA NA
6/1/1971 NA NA NA 6.0 NA NA NA
7/1/1971 NA NA NA 6.0 NA NA NA
8/1/1971 NA NA NA 6.0 NA NA NA

Tidying the Student Test Score Results data

# Use the tidyr function gather() to unpivot data from a "wide" to a "long" format
Int_Rate_df <-gather(Int_Rate_Data, bank_name ,'interest_rate',2:8)

#Remove columns with interest rates NA value
Int_Rate_df <- Int_Rate_df[!apply(is.na(Int_Rate_df[3:3]),1,all), ]


Int_Rate_df%>% 
head(20) %>%
  kable() %>% 
  kable_styling(bootstrap_options = "striped", font_size = 12, fixed_thead = T) %>%  scroll_box(height = "300px", width = "100%")
date bank_name interest_rate
154 9/27/1982 federal_reserve_system 10.25
155 10/1/1982 federal_reserve_system 10.00
156 10/7/1982 federal_reserve_system 9.50
158 11/19/1982 federal_reserve_system 9.00
160 12/14/1982 federal_reserve_system 8.50
164 3/31/1983 federal_reserve_system 8.62
167 5/25/1983 federal_reserve_system 8.75
169 6/24/1983 federal_reserve_system 9.00
171 7/14/1983 federal_reserve_system 9.25
172 7/20/1983 federal_reserve_system 9.43
174 8/11/1983 federal_reserve_system 9.56
175 8/17/1983 federal_reserve_system 9.50
177 9/15/1983 federal_reserve_system 9.37
184 3/29/1984 federal_reserve_system 10.50
189 7/5/1984 federal_reserve_system 11.00
190 7/19/1984 federal_reserve_system 11.25
192 8/9/1984 federal_reserve_system 11.50
194 9/20/1984 federal_reserve_system 11.25
195 9/27/1984 federal_reserve_system 11.00
197 10/11/1984 federal_reserve_system 10.50

Calculate Average interest rate to analyze the trend

Avg_Int_Rate_df <- Int_Rate_df %>% group_by(bank_name) %>% summarize(`average interest rate` = mean(`interest_rate`))

head(Avg_Int_Rate_df)  %>% 
head(20) %>%
  kable() %>% 
  kable_styling(bootstrap_options = "striped", font_size = 12, fixed_thead = T) %>%  scroll_box(height = "300px", width = "100%")
bank_name average interest rate
bank_brazil 9.5520833
bank_england 6.6474715
bank_japan 0.0576471
european_central_bank 1.7221790
federal_reserve_system 3.9238148
reserve_bank_australia 6.8332470
ggplot(data=Avg_Int_Rate_df, aes(x=bank_name,y=`average interest rate`, group=1)) +
  geom_line(arrow = arrow(), color = "purple",size = 1.2)+
  geom_point(color = "green", size = 3)+
  theme(axis.text.x=element_text(angle = 45, vjust = 0.5)) +
  scale_fill_brewer(palette="Paired") + 
  ggtitle("Average Interest Rate Per Bank") +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_text(aes(label=paste0(round(`average interest rate`,2))), vjust=-1, color="black", position = position_dodge(0.9), size=3.5) 
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# The following plot compares the interest year across the years

ggplot(data = Avg_Int_Rate_df) +
  geom_bar(mapping = aes(x=bank_name,y=`average interest rate`, fill = bank_name), stat = 'identity') +
  theme(axis.text.x = element_text(angle = 70, hjust = 1, size=5)) +
  labs(title = 'AVERAGE INTEREST RATE')

Dataset 3 -

Import Data from Github and load in R

NYC_Subway_Data <- read_csv('https://raw.githubusercontent.com/BeshkiaKvarnstrom/MSDS-DATA607/main/NYC_Subway_Ridership.csv', 
                          show_col_types = FALSE)
NYC_Subway_Data%>% 
head(20) %>%
  kable() %>% 
  kable_styling(bootstrap_options = "striped", font_size = 12, fixed_thead = T) %>%  scroll_box(height = "300px", width = "100%")
Station (alphabetical by borough) 2013 2014 2015 2016 2017 2018 2017-2018 Change 2017-2018 Change2 2018 Rank
The Bronx The Bronx The Bronx The Bronx The Bronx The Bronx The Bronx The Bronx The Bronx The Bronx
138 St-Grand Concourse 957,984 1,033,559 1,056,380 1,070,024 1,036,746 944,598 -92,148 -8.9% 365
149 St-Grand Concourse 4,427,399 4,536,888 4,424,754 4,381,900 4,255,015 3,972,763 -282,252 -6.6% 121
161 St-Yankee Stadium 8,766,012 8,961,029 8,922,188 8,784,407 8,596,506 8,392,290 -204,216 -2.40% 38
167 St 3,081,534 3,067,345 3,180,274 3,179,087 2,954,228 2,933,140 -21,088 -0.7% 165
167 St 3,091,289 3,245,977 3,295,032 3,365,748 3,293,451 2,022,919 -1,270,532 -38.6% 231
170 St 2,961,575 2,941,958 3,045,205 3,038,777 2,785,331 2,562,443 -222,888 -8.0% 183
170 St 2,174,082 2,297,459 2,286,670 2,317,558 2,270,027 2,454,974 +184,947 +8.1% 192
174 St 2,301,756 2,374,812 2,313,651 2,411,413 2,334,317 2,056,692 -277,625 -11.9% 228
174-175 Sts 1,598,189 1,719,504 1,695,879 1,752,387 1,735,321 1,005,075 -730,246 -42.1% 352
176 St 1,837,995 1,848,854 1,969,221 2,036,529 1,943,854 1,803,691 -140,163 -7.2% 258
182-183 Sts 1,520,856 1,604,218 1,643,266 1,682,092 1,577,144 1,502,814 -74,330 -4.7% 293
183 St 2,002,633 2,007,140 2,057,944 2,071,316 2,001,410 1,831,457 -169,953 -8.5% 255
219 St 1,099,491 1,124,522 1,108,859 1,049,128 1,044,200 1,026,894 -17,306 -1.7% 348
225 St 1,400,377 1,445,591 1,439,495 1,365,544 1,323,910 1,224,369 -99,541 -7.5% 326
231 St 2,757,855 2,912,283 2,966,015 3,013,521 3,013,031 3,133,231 +120,200 +4.0% 158
233 St 1,719,181 1,803,693 1,762,923 1,688,487 1,565,455 1,496,673 -68,782 -4.4% 296
238 St 1,092,492 1,209,623 1,240,616 1,327,862 1,335,877 960,970 -374,907 -28.1% 362
3 Av-138 St 2,443,326 2,534,036 2,531,974 2,509,085 2,515,479 2,451,972 -63,507 -2.5% 193
3 Av-149 St 7,310,115 7,763,151 7,363,663 7,609,968 7,458,222 6,784,053 -674,169 -9.0% 58

Tidying the NYC Subway data

# Each borough is used to separate the data in the file. This separation is identified by a row entry for each borough at the beginning of the respective borough's data set. We need to identify the row numbers for each borough.
row_b <- c('The Bronx', 'Brooklyn', 'Manhattan', 'Queens')

rownum <- c()

for(i in 1:length(row_b)){
  rownum[i] <- rownames(NYC_Subway_Data[which(NYC_Subway_Data$'2013' == row_b[i]),])
}
d = NULL
for(i in 1:length(rownum)){

  dat  <- NYC_Subway_Data[as.integer(rownum[i])+1:ifelse(!is.na(as.integer(rownum[i+1])-1),as.integer(rownum[i+1])-1,dim(NYC_Subway_Data)[1]),]

  k <- as.integer(rownum[i])+1
  l<-ifelse(!is.na(as.integer(rownum[i+1])-1),as.integer(rownum[i+1])-1,dim(NYC_Subway_Data)[1])         
  
 if (i == 1){
    Bronx <- NYC_Subway_Data[k:l,]
    Bronx['Borough'] <- row_b[1]
 } else if (i == 2) {
    Brooklyn <- NYC_Subway_Data[k:l,]
    Brooklyn['Borough'] <- row_b[2]
 }else if (i == 3) {
    Manhattan <- NYC_Subway_Data[k:l,]
    Manhattan['Borough'] <- row_b[3]
 } else if (i == 4) {
    Queens <- NYC_Subway_Data[k:l,]
    Queens['Borough'] <- row_b[4]
 }

}

# combined all sub datasets
NYC_Subway_Data2 <- rbind(Bronx, Brooklyn, Manhattan, Queens)
NYC_Subway_Data2%>% 
head(20) %>%
  kable() %>% 
  kable_styling(bootstrap_options = "striped", font_size = 12, fixed_thead = T) %>%  scroll_box(height = "300px", width = "100%")
Station (alphabetical by borough) 2013 2014 2015 2016 2017 2018 2017-2018 Change 2017-2018 Change2 2018 Rank Borough
138 St-Grand Concourse 957,984 1,033,559 1,056,380 1,070,024 1,036,746 944,598 -92,148 -8.9% 365 The Bronx
The Bronx The Bronx The Bronx The Bronx The Bronx The Bronx The Bronx The Bronx The Bronx The Bronx The Bronx
138 St-Grand Concourse 957,984 1,033,559 1,056,380 1,070,024 1,036,746 944,598 -92,148 -8.9% 365 Brooklyn
The Bronx The Bronx The Bronx The Bronx The Bronx The Bronx The Bronx The Bronx The Bronx The Bronx Brooklyn
138 St-Grand Concourse 957,984 1,033,559 1,056,380 1,070,024 1,036,746 944,598 -92,148 -8.9% 365 Manhattan
The Bronx The Bronx The Bronx The Bronx The Bronx The Bronx The Bronx The Bronx The Bronx The Bronx Manhattan
138 St-Grand Concourse 957,984 1,033,559 1,056,380 1,070,024 1,036,746 944,598 -92,148 -8.9% 365 Queens
149 St-Grand Concourse 4,427,399 4,536,888 4,424,754 4,381,900 4,255,015 3,972,763 -282,252 -6.6% 121 Queens
161 St-Yankee Stadium 8,766,012 8,961,029 8,922,188 8,784,407 8,596,506 8,392,290 -204,216 -2.40% 38 Queens
167 St 3,081,534 3,067,345 3,180,274 3,179,087 2,954,228 2,933,140 -21,088 -0.7% 165 Queens
167 St 3,091,289 3,245,977 3,295,032 3,365,748 3,293,451 2,022,919 -1,270,532 -38.6% 231 Queens
170 St 2,961,575 2,941,958 3,045,205 3,038,777 2,785,331 2,562,443 -222,888 -8.0% 183 Queens
170 St 2,174,082 2,297,459 2,286,670 2,317,558 2,270,027 2,454,974 +184,947 +8.1% 192 Queens
174 St 2,301,756 2,374,812 2,313,651 2,411,413 2,334,317 2,056,692 -277,625 -11.9% 228 Queens
174-175 Sts 1,598,189 1,719,504 1,695,879 1,752,387 1,735,321 1,005,075 -730,246 -42.1% 352 Queens
176 St 1,837,995 1,848,854 1,969,221 2,036,529 1,943,854 1,803,691 -140,163 -7.2% 258 Queens
182-183 Sts 1,520,856 1,604,218 1,643,266 1,682,092 1,577,144 1,502,814 -74,330 -4.7% 293 Queens
183 St 2,002,633 2,007,140 2,057,944 2,071,316 2,001,410 1,831,457 -169,953 -8.5% 255 Queens
219 St 1,099,491 1,124,522 1,108,859 1,049,128 1,044,200 1,026,894 -17,306 -1.7% 348 Queens
225 St 1,400,377 1,445,591 1,439,495 1,365,544 1,323,910 1,224,369 -99,541 -7.5% 326 Queens
NYC_Subway_Data2 <- NYC_Subway_Data2 %>%
  mutate('2013' = as.integer(str_remove_all(NYC_Subway_Data2$'2013', ',')),
         '2014' = as.integer(str_remove_all(NYC_Subway_Data2$'2014', ',')),
         '2015' = as.integer(str_remove_all(NYC_Subway_Data2$'2015', ',')),
         '2016' = as.integer(str_remove_all(NYC_Subway_Data2$'2016', ',')),
         '2017' = as.integer(str_remove_all(NYC_Subway_Data2$'2017', ',')),
         '2018' = as.integer(str_remove_all(NYC_Subway_Data2$'2018', ',')),
         '2017-2018 Change' = as.integer(str_remove_all(NYC_Subway_Data2$'2017-2018 Change', ',')),
         '2017-2018 Change2' = as.numeric(str_remove_all(NYC_Subway_Data2$'2017-2018 Change2', '%')),
         '2018 Rank' = as.integer(NYC_Subway_Data2$'2018 Rank')) %>%
select(Borough, colnames(NYC_Subway_Data2))%>%
na.omit()
## Warning: There were 9 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `2013 = as.integer(str_remove_all(NYC_Subway_Data2$"2013",
##   ","))`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 8 remaining warnings.
NYC_Subway_Data2%>% 
head(20) %>%
  kable() %>% 
  kable_styling(bootstrap_options = "striped", font_size = 12, fixed_thead = T) %>%  scroll_box(height = "300px", width = "100%")
Borough Station (alphabetical by borough) 2013 2014 2015 2016 2017 2018 2017-2018 Change 2017-2018 Change2 2018 Rank
The Bronx 138 St-Grand Concourse 957984 1033559 1056380 1070024 1036746 944598 -92148 -8.9 365
Brooklyn 138 St-Grand Concourse 957984 1033559 1056380 1070024 1036746 944598 -92148 -8.9 365
Manhattan 138 St-Grand Concourse 957984 1033559 1056380 1070024 1036746 944598 -92148 -8.9 365
Queens 138 St-Grand Concourse 957984 1033559 1056380 1070024 1036746 944598 -92148 -8.9 365
Queens 149 St-Grand Concourse 4427399 4536888 4424754 4381900 4255015 3972763 -282252 -6.6 121
Queens 161 St-Yankee Stadium 8766012 8961029 8922188 8784407 8596506 8392290 -204216 -2.4 38
Queens 167 St 3081534 3067345 3180274 3179087 2954228 2933140 -21088 -0.7 165
Queens 167 St 3091289 3245977 3295032 3365748 3293451 2022919 -1270532 -38.6 231
Queens 170 St 2961575 2941958 3045205 3038777 2785331 2562443 -222888 -8.0 183
Queens 170 St 2174082 2297459 2286670 2317558 2270027 2454974 184947 8.1 192
Queens 174 St 2301756 2374812 2313651 2411413 2334317 2056692 -277625 -11.9 228
Queens 174-175 Sts 1598189 1719504 1695879 1752387 1735321 1005075 -730246 -42.1 352
Queens 176 St 1837995 1848854 1969221 2036529 1943854 1803691 -140163 -7.2 258
Queens 182-183 Sts 1520856 1604218 1643266 1682092 1577144 1502814 -74330 -4.7 293
Queens 183 St 2002633 2007140 2057944 2071316 2001410 1831457 -169953 -8.5 255
Queens 219 St 1099491 1124522 1108859 1049128 1044200 1026894 -17306 -1.7 348
Queens 225 St 1400377 1445591 1439495 1365544 1323910 1224369 -99541 -7.5 326
Queens 231 St 2757855 2912283 2966015 3013521 3013031 3133231 120200 4.0 158
Queens 233 St 1719181 1803693 1762923 1688487 1565455 1496673 -68782 -4.4 296
Queens 238 St 1092492 1209623 1240616 1327862 1335877 960970 -374907 -28.1 362
# Use the tidyr function gather() to unpivot data from a "wide" to a "long" format
Subway_df <-gather(NYC_Subway_Data2, Year ,'total riders',3:11)

colnames(Subway_df) <- c('Borough', 'Subway Station', 'Year', 'Total Riders')
Subway_df%>% 
  head(20) %>%
  kable() %>% 
  kable_styling(bootstrap_options = "striped", font_size = 12, fixed_thead = T) %>% 
  scroll_box(height = "300px", width = "100%")
Borough Subway Station Year Total Riders
The Bronx 138 St-Grand Concourse 2013 957984
Brooklyn 138 St-Grand Concourse 2013 957984
Manhattan 138 St-Grand Concourse 2013 957984
Queens 138 St-Grand Concourse 2013 957984
Queens 149 St-Grand Concourse 2013 4427399
Queens 161 St-Yankee Stadium 2013 8766012
Queens 167 St 2013 3081534
Queens 167 St 2013 3091289
Queens 170 St 2013 2961575
Queens 170 St 2013 2174082
Queens 174 St 2013 2301756
Queens 174-175 Sts 2013 1598189
Queens 176 St 2013 1837995
Queens 182-183 Sts 2013 1520856
Queens 183 St 2013 2002633
Queens 219 St 2013 1099491
Queens 225 St 2013 1400377
Queens 231 St 2013 2757855
Queens 233 St 2013 1719181
Queens 238 St 2013 1092492

##The following bar chart displays the average riders for each Borough from 2013-2018

Subway_df <- na.omit(Subway_df)

sum_riders <- Subway_df %>%
  group_by(Year, Borough) %>%
  summarize(
    min_riders = min(`Total Riders`),
    avg_riders = mean(`Total Riders`),
    max_riders = max(`Total Riders`),
    .groups = "drop"
  )

ggplot(data = sum_riders, aes(x = Year, y = avg_riders, fill = Borough)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(title = 'Average NYC Subway Ridership', subtitle = "By Borough (2013 - 2018)") +
 scale_y_continuous(labels = comma)+  # Format y-axis labels with commas
 
  xlab("Ridership Year") +  # Update x-axis label
  theme(axis.text.x = element_text(angle = 70, hjust = 1))

##Conclusion My Analysis shows that of all the Borough Queens had the highest Riders.