pkges <- c("tidyr","readr","kableExtra","dplyr","ggplot2","stringr", "scales")
# Loop through the packages
for (p in pkges) {
# Check if package is installed
if (!requireNamespace(p, quietly = TRUE)) {
install.packages(p) #If the package is not installed, install the package
library(p, character.only = TRUE) #Load the package
} else {
library(p, character.only = TRUE) #If the package is already installed, load the package
}
}
The goal of this assignment is to give you practice in preparing
different datasets for downstream analysis work.
Your task is to:
- Choose any three of the “wide” datasets identified in the Week 6
Discussion items. (You may use your own dataset; please don’t use my
Sample Post dataset, since that was used in your Week 6 assignment!) For
each of the three chosen datasets:
Create a .CSV file (or optionally, a MySQL database!) that includes
all of the information included in the dataset. You’re encouraged to use
a “wide” structure similar to how the information appears in the
discussion item, so that you can practice tidying and transformations as
described below.
Read the information from your .CSV file into R, and use tidyr and
dplyr as needed to tidy and transform your data. [Most of your grade
will be based on this step!]
Perform the analysis requested in the discussion item.
Your code should be in an R Markdown file, posted to rpubs.com, and
should include narrative descriptions of your data cleanup work,
analysis, and conclusions.
- Please include in your homework submission, for each of the three
chosen datasets:
The URL to the .Rmd file in your GitHub repository, and The URL
for your rpubs.com web page.
Tidying the Student Test Score Results data
colnames(Test_Score_Data) <- c("ID", "NAME", "PHONE", "GENDER", "AGE", "TEST NO.", "TERM", "TEST SCORE")
colnames(Test_Score_Data) <- c("ID", "NAME", "PHONE", "GENDER", "AGE", "TEST NO.", "TERM", "TEST SCORE")
Test_Score_Data%>%
head(20) %>%
kable() %>%
kable_styling(bootstrap_options = "striped", font_size = 12, fixed_thead = T) %>% scroll_box(height = "300px", width = "100%")
|
ID
|
NAME
|
PHONE
|
GENDER
|
AGE
|
TEST NO.
|
TERM
|
TEST SCORE
|
|
1
|
Mike
|
134
|
m_12
|
test 1
|
76
|
84
|
87
|
|
2
|
Linda
|
270
|
f_13
|
test 1
|
88
|
90
|
73
|
|
3
|
Sam
|
210
|
m_11
|
test 1
|
78
|
74
|
80
|
|
4
|
Esther
|
617
|
f_12
|
test 1
|
68
|
75
|
74
|
|
5
|
Mary
|
114
|
f_14
|
test 1
|
65
|
67
|
64
|
|
1
|
Mike
|
134
|
m_12
|
test 2
|
85
|
80
|
90
|
|
2
|
Linda
|
270
|
f_13
|
test 2
|
87
|
82
|
94
|
|
3
|
Sam
|
210
|
m_11
|
test 2
|
80
|
87
|
80
|
|
4
|
Esther
|
617
|
f_12
|
test 2
|
70
|
75
|
78
|
|
5
|
Mary
|
114
|
f_14
|
test 2
|
68
|
70
|
63
|
Analyze and Visualize the Student Test Score Results data
# Analyzing the test score to determine the minimum, maximum, mean and median test scores for each term by gender. From the analysis you can see that females did better than males overall.
Agg_TestScores <- Test_Score_Data %>% group_by(GENDER) %>% group_by(GENDER, TERM) %>%
summarise(
"MIN TEST SCORE" = min(`TEST SCORE`),
"MAX TEST SCORE" = max(`TEST SCORE`),
"MEAN TEST SCORE" = mean(`TEST SCORE`), "MEDIAN TEST SCORE" = median(`TEST SCORE`),
.groups = 'drop'
)
Agg_TestScores%>%
head(20) %>%
kable() %>%
kable_styling(bootstrap_options = "striped", font_size = 12, fixed_thead = T) %>% scroll_box(height = "300px", width = "100%")
|
GENDER
|
TERM
|
MIN TEST SCORE
|
MAX TEST SCORE
|
MEAN TEST SCORE
|
MEDIAN TEST SCORE
|
|
f_12
|
75
|
74
|
78
|
76
|
76
|
|
f_13
|
82
|
94
|
94
|
94
|
94
|
|
f_13
|
90
|
73
|
73
|
73
|
73
|
|
f_14
|
67
|
64
|
64
|
64
|
64
|
|
f_14
|
70
|
63
|
63
|
63
|
63
|
|
m_11
|
74
|
80
|
80
|
80
|
80
|
|
m_11
|
87
|
80
|
80
|
80
|
80
|
|
m_12
|
80
|
90
|
90
|
90
|
90
|
|
m_12
|
84
|
87
|
87
|
87
|
87
|
# The following plot compares the test scores for each gender and the terms taken.
ggplot(data = Agg_TestScores) +
geom_bar(mapping = aes(x = TERM, y = `MEAN TEST SCORE`, fill = GENDER), stat = 'identity') +
facet_grid(~ GENDER) +
theme(axis.text.x = element_text(angle = 70, hjust = 1, size=5)) +
labs(title = 'AVERAGE TEST SCORES PER TERM', subtitle = "By Gender")

# This visualition displays the test scores per student for each term
ggplot(data = Test_Score_Data) +
geom_bar(mapping = aes(x=NAME, y=`TEST SCORE`, fill = `TERM`), stat = 'identity') +
facet_grid(~ `TEST NO.`) +
theme(axis.text.x = element_text(angle = 70, hjust = 1, size=5)) +
labs(title = 'STUDENT TEST SCORES' , subtitle = "By Term")

Tidying the Student Test Score Results data
# Use the tidyr function gather() to unpivot data from a "wide" to a "long" format
Int_Rate_df <-gather(Int_Rate_Data, bank_name ,'interest_rate',2:8)
#Remove columns with interest rates NA value
Int_Rate_df <- Int_Rate_df[!apply(is.na(Int_Rate_df[3:3]),1,all), ]
Int_Rate_df%>%
head(20) %>%
kable() %>%
kable_styling(bootstrap_options = "striped", font_size = 12, fixed_thead = T) %>% scroll_box(height = "300px", width = "100%")
|
|
date
|
bank_name
|
interest_rate
|
|
154
|
9/27/1982
|
federal_reserve_system
|
10.25
|
|
155
|
10/1/1982
|
federal_reserve_system
|
10.00
|
|
156
|
10/7/1982
|
federal_reserve_system
|
9.50
|
|
158
|
11/19/1982
|
federal_reserve_system
|
9.00
|
|
160
|
12/14/1982
|
federal_reserve_system
|
8.50
|
|
164
|
3/31/1983
|
federal_reserve_system
|
8.62
|
|
167
|
5/25/1983
|
federal_reserve_system
|
8.75
|
|
169
|
6/24/1983
|
federal_reserve_system
|
9.00
|
|
171
|
7/14/1983
|
federal_reserve_system
|
9.25
|
|
172
|
7/20/1983
|
federal_reserve_system
|
9.43
|
|
174
|
8/11/1983
|
federal_reserve_system
|
9.56
|
|
175
|
8/17/1983
|
federal_reserve_system
|
9.50
|
|
177
|
9/15/1983
|
federal_reserve_system
|
9.37
|
|
184
|
3/29/1984
|
federal_reserve_system
|
10.50
|
|
189
|
7/5/1984
|
federal_reserve_system
|
11.00
|
|
190
|
7/19/1984
|
federal_reserve_system
|
11.25
|
|
192
|
8/9/1984
|
federal_reserve_system
|
11.50
|
|
194
|
9/20/1984
|
federal_reserve_system
|
11.25
|
|
195
|
9/27/1984
|
federal_reserve_system
|
11.00
|
|
197
|
10/11/1984
|
federal_reserve_system
|
10.50
|
Calculate Average interest rate to analyze the trend
Avg_Int_Rate_df <- Int_Rate_df %>% group_by(bank_name) %>% summarize(`average interest rate` = mean(`interest_rate`))
head(Avg_Int_Rate_df) %>%
head(20) %>%
kable() %>%
kable_styling(bootstrap_options = "striped", font_size = 12, fixed_thead = T) %>% scroll_box(height = "300px", width = "100%")
|
bank_name
|
average interest rate
|
|
bank_brazil
|
9.5520833
|
|
bank_england
|
6.6474715
|
|
bank_japan
|
0.0576471
|
|
european_central_bank
|
1.7221790
|
|
federal_reserve_system
|
3.9238148
|
|
reserve_bank_australia
|
6.8332470
|
ggplot(data=Avg_Int_Rate_df, aes(x=bank_name,y=`average interest rate`, group=1)) +
geom_line(arrow = arrow(), color = "purple",size = 1.2)+
geom_point(color = "green", size = 3)+
theme(axis.text.x=element_text(angle = 45, vjust = 0.5)) +
scale_fill_brewer(palette="Paired") +
ggtitle("Average Interest Rate Per Bank") +
theme(plot.title = element_text(hjust = 0.5)) +
geom_text(aes(label=paste0(round(`average interest rate`,2))), vjust=-1, color="black", position = position_dodge(0.9), size=3.5)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# The following plot compares the interest year across the years
ggplot(data = Avg_Int_Rate_df) +
geom_bar(mapping = aes(x=bank_name,y=`average interest rate`, fill = bank_name), stat = 'identity') +
theme(axis.text.x = element_text(angle = 70, hjust = 1, size=5)) +
labs(title = 'AVERAGE INTEREST RATE')

Tidying the NYC Subway data
# Each borough is used to separate the data in the file. This separation is identified by a row entry for each borough at the beginning of the respective borough's data set. We need to identify the row numbers for each borough.
row_b <- c('The Bronx', 'Brooklyn', 'Manhattan', 'Queens')
rownum <- c()
for(i in 1:length(row_b)){
rownum[i] <- rownames(NYC_Subway_Data[which(NYC_Subway_Data$'2013' == row_b[i]),])
}
d = NULL
for(i in 1:length(rownum)){
dat <- NYC_Subway_Data[as.integer(rownum[i])+1:ifelse(!is.na(as.integer(rownum[i+1])-1),as.integer(rownum[i+1])-1,dim(NYC_Subway_Data)[1]),]
k <- as.integer(rownum[i])+1
l<-ifelse(!is.na(as.integer(rownum[i+1])-1),as.integer(rownum[i+1])-1,dim(NYC_Subway_Data)[1])
if (i == 1){
Bronx <- NYC_Subway_Data[k:l,]
Bronx['Borough'] <- row_b[1]
} else if (i == 2) {
Brooklyn <- NYC_Subway_Data[k:l,]
Brooklyn['Borough'] <- row_b[2]
}else if (i == 3) {
Manhattan <- NYC_Subway_Data[k:l,]
Manhattan['Borough'] <- row_b[3]
} else if (i == 4) {
Queens <- NYC_Subway_Data[k:l,]
Queens['Borough'] <- row_b[4]
}
}
# combined all sub datasets
NYC_Subway_Data2 <- rbind(Bronx, Brooklyn, Manhattan, Queens)
NYC_Subway_Data2%>%
head(20) %>%
kable() %>%
kable_styling(bootstrap_options = "striped", font_size = 12, fixed_thead = T) %>% scroll_box(height = "300px", width = "100%")
|
Station (alphabetical by borough)
|
2013
|
2014
|
2015
|
2016
|
2017
|
2018
|
2017-2018 Change
|
2017-2018 Change2
|
2018 Rank
|
Borough
|
|
138 St-Grand Concourse
|
957,984
|
1,033,559
|
1,056,380
|
1,070,024
|
1,036,746
|
944,598
|
-92,148
|
-8.9%
|
365
|
The Bronx
|
|
The Bronx
|
The Bronx
|
The Bronx
|
The Bronx
|
The Bronx
|
The Bronx
|
The Bronx
|
The Bronx
|
The Bronx
|
The Bronx
|
The Bronx
|
|
138 St-Grand Concourse
|
957,984
|
1,033,559
|
1,056,380
|
1,070,024
|
1,036,746
|
944,598
|
-92,148
|
-8.9%
|
365
|
Brooklyn
|
|
The Bronx
|
The Bronx
|
The Bronx
|
The Bronx
|
The Bronx
|
The Bronx
|
The Bronx
|
The Bronx
|
The Bronx
|
The Bronx
|
Brooklyn
|
|
138 St-Grand Concourse
|
957,984
|
1,033,559
|
1,056,380
|
1,070,024
|
1,036,746
|
944,598
|
-92,148
|
-8.9%
|
365
|
Manhattan
|
|
The Bronx
|
The Bronx
|
The Bronx
|
The Bronx
|
The Bronx
|
The Bronx
|
The Bronx
|
The Bronx
|
The Bronx
|
The Bronx
|
Manhattan
|
|
138 St-Grand Concourse
|
957,984
|
1,033,559
|
1,056,380
|
1,070,024
|
1,036,746
|
944,598
|
-92,148
|
-8.9%
|
365
|
Queens
|
|
149 St-Grand Concourse
|
4,427,399
|
4,536,888
|
4,424,754
|
4,381,900
|
4,255,015
|
3,972,763
|
-282,252
|
-6.6%
|
121
|
Queens
|
|
161 St-Yankee Stadium
|
8,766,012
|
8,961,029
|
8,922,188
|
8,784,407
|
8,596,506
|
8,392,290
|
-204,216
|
-2.40%
|
38
|
Queens
|
|
167 St
|
3,081,534
|
3,067,345
|
3,180,274
|
3,179,087
|
2,954,228
|
2,933,140
|
-21,088
|
-0.7%
|
165
|
Queens
|
|
167 St
|
3,091,289
|
3,245,977
|
3,295,032
|
3,365,748
|
3,293,451
|
2,022,919
|
-1,270,532
|
-38.6%
|
231
|
Queens
|
|
170 St
|
2,961,575
|
2,941,958
|
3,045,205
|
3,038,777
|
2,785,331
|
2,562,443
|
-222,888
|
-8.0%
|
183
|
Queens
|
|
170 St
|
2,174,082
|
2,297,459
|
2,286,670
|
2,317,558
|
2,270,027
|
2,454,974
|
+184,947
|
+8.1%
|
192
|
Queens
|
|
174 St
|
2,301,756
|
2,374,812
|
2,313,651
|
2,411,413
|
2,334,317
|
2,056,692
|
-277,625
|
-11.9%
|
228
|
Queens
|
|
174-175 Sts
|
1,598,189
|
1,719,504
|
1,695,879
|
1,752,387
|
1,735,321
|
1,005,075
|
-730,246
|
-42.1%
|
352
|
Queens
|
|
176 St
|
1,837,995
|
1,848,854
|
1,969,221
|
2,036,529
|
1,943,854
|
1,803,691
|
-140,163
|
-7.2%
|
258
|
Queens
|
|
182-183 Sts
|
1,520,856
|
1,604,218
|
1,643,266
|
1,682,092
|
1,577,144
|
1,502,814
|
-74,330
|
-4.7%
|
293
|
Queens
|
|
183 St
|
2,002,633
|
2,007,140
|
2,057,944
|
2,071,316
|
2,001,410
|
1,831,457
|
-169,953
|
-8.5%
|
255
|
Queens
|
|
219 St
|
1,099,491
|
1,124,522
|
1,108,859
|
1,049,128
|
1,044,200
|
1,026,894
|
-17,306
|
-1.7%
|
348
|
Queens
|
|
225 St
|
1,400,377
|
1,445,591
|
1,439,495
|
1,365,544
|
1,323,910
|
1,224,369
|
-99,541
|
-7.5%
|
326
|
Queens
|
NYC_Subway_Data2 <- NYC_Subway_Data2 %>%
mutate('2013' = as.integer(str_remove_all(NYC_Subway_Data2$'2013', ',')),
'2014' = as.integer(str_remove_all(NYC_Subway_Data2$'2014', ',')),
'2015' = as.integer(str_remove_all(NYC_Subway_Data2$'2015', ',')),
'2016' = as.integer(str_remove_all(NYC_Subway_Data2$'2016', ',')),
'2017' = as.integer(str_remove_all(NYC_Subway_Data2$'2017', ',')),
'2018' = as.integer(str_remove_all(NYC_Subway_Data2$'2018', ',')),
'2017-2018 Change' = as.integer(str_remove_all(NYC_Subway_Data2$'2017-2018 Change', ',')),
'2017-2018 Change2' = as.numeric(str_remove_all(NYC_Subway_Data2$'2017-2018 Change2', '%')),
'2018 Rank' = as.integer(NYC_Subway_Data2$'2018 Rank')) %>%
select(Borough, colnames(NYC_Subway_Data2))%>%
na.omit()
## Warning: There were 9 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `2013 = as.integer(str_remove_all(NYC_Subway_Data2$"2013",
## ","))`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 8 remaining warnings.
NYC_Subway_Data2%>%
head(20) %>%
kable() %>%
kable_styling(bootstrap_options = "striped", font_size = 12, fixed_thead = T) %>% scroll_box(height = "300px", width = "100%")
|
Borough
|
Station (alphabetical by borough)
|
2013
|
2014
|
2015
|
2016
|
2017
|
2018
|
2017-2018 Change
|
2017-2018 Change2
|
2018 Rank
|
|
The Bronx
|
138 St-Grand Concourse
|
957984
|
1033559
|
1056380
|
1070024
|
1036746
|
944598
|
-92148
|
-8.9
|
365
|
|
Brooklyn
|
138 St-Grand Concourse
|
957984
|
1033559
|
1056380
|
1070024
|
1036746
|
944598
|
-92148
|
-8.9
|
365
|
|
Manhattan
|
138 St-Grand Concourse
|
957984
|
1033559
|
1056380
|
1070024
|
1036746
|
944598
|
-92148
|
-8.9
|
365
|
|
Queens
|
138 St-Grand Concourse
|
957984
|
1033559
|
1056380
|
1070024
|
1036746
|
944598
|
-92148
|
-8.9
|
365
|
|
Queens
|
149 St-Grand Concourse
|
4427399
|
4536888
|
4424754
|
4381900
|
4255015
|
3972763
|
-282252
|
-6.6
|
121
|
|
Queens
|
161 St-Yankee Stadium
|
8766012
|
8961029
|
8922188
|
8784407
|
8596506
|
8392290
|
-204216
|
-2.4
|
38
|
|
Queens
|
167 St
|
3081534
|
3067345
|
3180274
|
3179087
|
2954228
|
2933140
|
-21088
|
-0.7
|
165
|
|
Queens
|
167 St
|
3091289
|
3245977
|
3295032
|
3365748
|
3293451
|
2022919
|
-1270532
|
-38.6
|
231
|
|
Queens
|
170 St
|
2961575
|
2941958
|
3045205
|
3038777
|
2785331
|
2562443
|
-222888
|
-8.0
|
183
|
|
Queens
|
170 St
|
2174082
|
2297459
|
2286670
|
2317558
|
2270027
|
2454974
|
184947
|
8.1
|
192
|
|
Queens
|
174 St
|
2301756
|
2374812
|
2313651
|
2411413
|
2334317
|
2056692
|
-277625
|
-11.9
|
228
|
|
Queens
|
174-175 Sts
|
1598189
|
1719504
|
1695879
|
1752387
|
1735321
|
1005075
|
-730246
|
-42.1
|
352
|
|
Queens
|
176 St
|
1837995
|
1848854
|
1969221
|
2036529
|
1943854
|
1803691
|
-140163
|
-7.2
|
258
|
|
Queens
|
182-183 Sts
|
1520856
|
1604218
|
1643266
|
1682092
|
1577144
|
1502814
|
-74330
|
-4.7
|
293
|
|
Queens
|
183 St
|
2002633
|
2007140
|
2057944
|
2071316
|
2001410
|
1831457
|
-169953
|
-8.5
|
255
|
|
Queens
|
219 St
|
1099491
|
1124522
|
1108859
|
1049128
|
1044200
|
1026894
|
-17306
|
-1.7
|
348
|
|
Queens
|
225 St
|
1400377
|
1445591
|
1439495
|
1365544
|
1323910
|
1224369
|
-99541
|
-7.5
|
326
|
|
Queens
|
231 St
|
2757855
|
2912283
|
2966015
|
3013521
|
3013031
|
3133231
|
120200
|
4.0
|
158
|
|
Queens
|
233 St
|
1719181
|
1803693
|
1762923
|
1688487
|
1565455
|
1496673
|
-68782
|
-4.4
|
296
|
|
Queens
|
238 St
|
1092492
|
1209623
|
1240616
|
1327862
|
1335877
|
960970
|
-374907
|
-28.1
|
362
|
# Use the tidyr function gather() to unpivot data from a "wide" to a "long" format
Subway_df <-gather(NYC_Subway_Data2, Year ,'total riders',3:11)
colnames(Subway_df) <- c('Borough', 'Subway Station', 'Year', 'Total Riders')
Subway_df%>%
head(20) %>%
kable() %>%
kable_styling(bootstrap_options = "striped", font_size = 12, fixed_thead = T) %>%
scroll_box(height = "300px", width = "100%")
|
Borough
|
Subway Station
|
Year
|
Total Riders
|
|
The Bronx
|
138 St-Grand Concourse
|
2013
|
957984
|
|
Brooklyn
|
138 St-Grand Concourse
|
2013
|
957984
|
|
Manhattan
|
138 St-Grand Concourse
|
2013
|
957984
|
|
Queens
|
138 St-Grand Concourse
|
2013
|
957984
|
|
Queens
|
149 St-Grand Concourse
|
2013
|
4427399
|
|
Queens
|
161 St-Yankee Stadium
|
2013
|
8766012
|
|
Queens
|
167 St
|
2013
|
3081534
|
|
Queens
|
167 St
|
2013
|
3091289
|
|
Queens
|
170 St
|
2013
|
2961575
|
|
Queens
|
170 St
|
2013
|
2174082
|
|
Queens
|
174 St
|
2013
|
2301756
|
|
Queens
|
174-175 Sts
|
2013
|
1598189
|
|
Queens
|
176 St
|
2013
|
1837995
|
|
Queens
|
182-183 Sts
|
2013
|
1520856
|
|
Queens
|
183 St
|
2013
|
2002633
|
|
Queens
|
219 St
|
2013
|
1099491
|
|
Queens
|
225 St
|
2013
|
1400377
|
|
Queens
|
231 St
|
2013
|
2757855
|
|
Queens
|
233 St
|
2013
|
1719181
|
|
Queens
|
238 St
|
2013
|
1092492
|
##The following bar chart displays the average riders for each
Borough from 2013-2018
Subway_df <- na.omit(Subway_df)
sum_riders <- Subway_df %>%
group_by(Year, Borough) %>%
summarize(
min_riders = min(`Total Riders`),
avg_riders = mean(`Total Riders`),
max_riders = max(`Total Riders`),
.groups = "drop"
)
ggplot(data = sum_riders, aes(x = Year, y = avg_riders, fill = Borough)) +
geom_bar(stat = "identity", position = "stack") +
labs(title = 'Average NYC Subway Ridership', subtitle = "By Borough (2013 - 2018)") +
scale_y_continuous(labels = comma)+ # Format y-axis labels with commas
xlab("Ridership Year") + # Update x-axis label
theme(axis.text.x = element_text(angle = 70, hjust = 1))

##Conclusion My Analysis shows that of all the Borough Queens had the
highest Riders.