The purpose of this assignment is to create several plots using R to compare NSF funding for CSU to a competitive set of similar institutions for 2012 and over time.
library(tidyverse)
library(scales)
library(ggrepel)
library(readxl)
NSFdata_all <- read_xlsx("C:/Users/overp/Desktop/School Stuff/CIS 576 Business Data Visualization/HW/NSF_Funding_by_Year.xlsx")
The first step includes understanding the data by running glimpse, head, summary, and coronations to help better understand the data.
glimpse(NSFdata_all)
## Rows: 7,185
## Columns: 6
## $ ID <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15~
## $ Organization <chr> "3DGEO DEVELOPMENT INC", "3DGEO DEVELOPMENT INC",~
## $ AwardYear <dbl> 2005, 2004, 2008, 2010, 2011, 2005, 2010, 2006, 2~
## $ AWARD_SUM <dbl> 2162000, 200000, 365625, 328503, 131267, 145000, ~
## $ AWARD_COUNT <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 4, 2, 5, 2, 2~
## $ AvarageAwardDuration <dbl> 4.000000, 0.000000, 1.000000, 0.500000, 2.500000,~
head(NSFdata_all)
## # A tibble: 6 x 6
## ID Organization AwardYear AWARD_SUM AWARD_COUNT AvarageAwardDur~
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 3DGEO DEVELOPMENT INC 2005 2162000 2 4
## 2 2 3DGEO DEVELOPMENT INC 2004 200000 2 0
## 3 3 3F, LLC 2008 365625 2 1
## 4 4 4WIND SCIENCE AND ENGI~ 2010 328503 2 0.5
## 5 5 Abilene Christian Univ~ 2011 131267 2 2.5
## 6 6 ABIRAM ~ 2005 145000 2 1.5
summary(NSFdata_all)
## ID Organization AwardYear AWARD_SUM
## Min. : 1 Length:7185 Min. :2001 Min. : 0
## 1st Qu.:1797 Class :character 1st Qu.:2003 1st Qu.: 567690
## Median :3593 Mode :character Median :2007 Median : 1465978
## Mean :3593 Mean :2007 Mean : 7636078
## 3rd Qu.:5389 3rd Qu.:2010 3rd Qu.: 5737611
## Max. :7185 Max. :2012 Max. :894931764
## AWARD_COUNT AvarageAwardDuration
## Min. : 2.00 Min. : 0.000
## 1st Qu.: 2.00 1st Qu.: 2.556
## Median : 4.00 Median : 3.141
## Mean : 16.08 Mean : 3.030
## 3rd Qu.: 14.00 3rd Qu.: 3.622
## Max. :224.00 Max. :11.667
cor(NSFdata_all[, sapply(NSFdata_all, is.numeric)], use = "pairwise")
## ID AwardYear AWARD_SUM AWARD_COUNT
## ID 1.00000000 -0.01281994 0.08794076 0.19912857
## AwardYear -0.01281994 1.00000000 -0.02460934 0.03434795
## AWARD_SUM 0.08794076 -0.02460934 1.00000000 0.59188322
## AWARD_COUNT 0.19912857 0.03434795 0.59188322 1.00000000
## AvarageAwardDuration 0.09802567 -0.13656146 0.19828842 0.12576510
## AvarageAwardDuration
## ID 0.09802567
## AwardYear -0.13656146
## AWARD_SUM 0.19828842
## AWARD_COUNT 0.12576510
## AvarageAwardDuration 1.00000000
Question ## Question 1 Question 1 asks for a bar chart, scatter chart, and a bubble plot for 2012 NSF foundation comparing CSU funding to a comparable subset of schools.
NSFdata_subset <- NSFdata_all %>%
filter(AwardYear == "2012") %>%
filter(str_detect(Organization,"Colorado State University") | str_detect(Organization,"Iowa State University") | str_detect(Organization,"Kansas State University") | str_detect(Organization,"Michigan State University") | str_detect(Organization,"North Carolina State University") | str_detect(Organization,"Oklahoma State University") | str_detect(Organization,"Oregon State University") | str_detect(Organization,"Purdue University") | str_detect(Organization,"Texas A & M University") | str_detect(Organization,"University of California, Davis") | str_detect(Organization,"University of Illinois") | str_detect(Organization,"University of Tennessee") | str_detect(Organization,"Virginia Polytechnic") | str_detect(Organization,"Washington State University"))
levels(NSFdata_subset)
## NULL
NSFdata_subset$Organization <- as.factor(NSFdata_subset$Organization)
levels(NSFdata_subset$Organization)
## [1] "Colorado State University"
## [2] "Iowa State University"
## [3] "Kansas State University"
## [4] "Michigan State University"
## [5] "North Carolina State University"
## [6] "Oklahoma State University"
## [7] "Oregon State University"
## [8] "Purdue University"
## [9] "University of Illinois at Chicago"
## [10] "University of Illinois at Urbana-Champaign"
## [11] "University of Tennessee Chattanooga"
## [12] "University of Tennessee Knoxville"
## [13] "Virginia Polytechnic Institute and State University"
## [14] "Washington State University"
NSFdata_subset_rename <- NSFdata_subset %>%
mutate(ORGname = recode(Organization,"Colorado State University" = "CSU", "Iowa State University" = "ISU", "Kansas State University" = "KSU", "Michigan State University" = "MSU", "North Carolina State University" = "NCSU", "Oklahoma State University" = "OSU", "Oregon State University" = "OSU", "Purdue University" = "PSU", "University of Illinois at Chicago" = "UI", "University of Illinois at Urbana-Champaign" = "UI", "University of Tennessee Chattanooga" = "UT", "University of Tennessee Knoxville" = "UT", "Virginia Polytechnic Institute and State University" = "VT", "Washington State University" = "WSU"))
levels(NSFdata_subset_rename$ORGname)
## [1] "CSU" "ISU" "KSU" "MSU" "NCSU" "OSU" "PSU" "UI" "UT" "VT"
## [11] "WSU"
Graph 1 is a bar chart comparing CSU NSF funding to a peer group of schools.
NSFdata_subset_rename_Summary <- NSFdata_subset_rename %>%
group_by(ORGname) %>%
summarize(ts=sum(AWARD_SUM), tq=sum(AWARD_COUNT))
head(NSFdata_subset_rename_Summary)
## # A tibble: 6 x 3
## ORGname ts tq
## <fct> <dbl> <dbl>
## 1 CSU 11592315 41
## 2 ISU 16738011 65
## 3 KSU 8839116 23
## 4 MSU 17351663 58
## 5 NCSU 31440526 91
## 6 OSU 28511623 107
NSFdata_subset_rename_Summary <- NSFdata_subset_rename_Summary %>%
mutate(Highlight = ifelse(ORGname=="CSU","Yes","No"))
ggplot(NSFdata_subset_rename_Summary, aes(y = reorder(ORGname, ts), x = ts/1000000, fill=Highlight)) + geom_col()+
scale_x_continuous(labels = comma)+
labs(title = "2012 NSF Funding by Organization", subtitle = "CSU vs Competitive Set", x = "Total Award Amount (millions)", y = "Organization") + theme(aspect.ratio = 1/2) + scale_fill_manual(values = c("Yes"="black","No"="darkgrey"), guide=NULL)
ggplot(NSFdata_subset_rename_Summary, aes(reorder(ORGname, -ts), ts/1000000, fill=Highlight)) + geom_col() + labs(title = "2012 NSF Funding by Organization", subtitle = "CSU vs Competitive Set", y = "Total Award Amount (millions)", x = "Organization") + theme(aspect.ratio = 1/2) + scale_fill_manual(values = c("Yes"="black","No"="darkgrey"), guide=NULL)
## Question 1 Graph 2 - Scatter plot
ggplot(NSFdata_subset_rename_Summary, aes(y = ts/1000000, x = tq, color = Highlight)) + geom_point() + geom_text_repel(aes(label = ORGname)) + scale_y_continuous(labels = comma) + labs(title = "2012 NSF Funding by Organization", subtitle = "CSU vs Competitive Set", y = "Total Award Amount (millions)", x = "Number of Awards") + expand_limits(x=c(0,200),y=c(0,60)) + scale_color_manual(values = c("Yes"="blue", "No"="darkgrey"), guide=NULL)
## Question 1 Graph 3 - Bubble plot
ggplot(NSFdata_subset_rename_Summary, aes(y = ts/1000000, x= tq, size = tq, color = Highlight)) + geom_point() + geom_text_repel(aes(label = ORGname), size=3) + labs(title = "2012 NSF Funding by Organization", subtitle = "CSU vs Competitive Set", y = "Total Award Amount (millions)", x = "Number of Awards") + theme(legend.position = "top", legend.key.size = unit(0.1,'cm')) + expand_limits(x=c(0,200),y=c(0,60)) + scale_color_manual(values = c("Yes"="blue", "No"="darkgrey"), guide=NULL)
### Question 2 Question 2 asks to create a set of time series plots
comparing NSF Funding for CSU to a competitive set of institutions over
time.
NSFdata_time <- NSFdata_all %>%
filter(str_detect(Organization,"Colorado State University") | str_detect(Organization,"Iowa State University") | str_detect(Organization,"Kansas State University") | str_detect(Organization,"Michigan State University") | str_detect(Organization,"North Carolina State University") | str_detect(Organization,"Oklahoma State University") | str_detect(Organization,"Oregon State University") | str_detect(Organization,"Purdue University") | str_detect(Organization,"Texas A & M University") | str_detect(Organization,"University of California, Davis") | str_detect(Organization,"University of Illinois") | str_detect(Organization,"University of Tennessee") | str_detect(Organization,"Virginia Polytechnic") | str_detect(Organization,"Washington State University"))
NSFdata_time$Organization <- as.factor(NSFdata_time$Organization)
levels(NSFdata_time$Organization)
## [1] "Colorado State University"
## [2] "Colorado State University-Pueblo"
## [3] "Iowa State University"
## [4] "Kansas State University"
## [5] "Michigan State University"
## [6] "North Carolina State University"
## [7] "Oklahoma State University"
## [8] "Oregon State University"
## [9] "Purdue University"
## [10] "The University of Tennessee, Memphis - The Health Science Center"
## [11] "University of Illinois at Chicago"
## [12] "University of Illinois at Springfield"
## [13] "University of Illinois at Urbana-Champaign"
## [14] "University of Tennessee Chattanooga"
## [15] "University of Tennessee Institute of Agriculture"
## [16] "University of Tennessee Knoxville"
## [17] "University of Tennessee Space Institute"
## [18] "Virginia Polytechnic Institute and State University"
## [19] "Washington State University"
NSFdata_time_rename <- NSFdata_time %>%
mutate(ORGname = recode(Organization,"Colorado State University" = "CSU", "Colorado State University-Pueblo" = "CSU", "Iowa State University" = "ISU", "Kansas State University" = "KSU", "Michigan State University" = "MSU", "North Carolina State University" = "NCSU", "Oklahoma State University" = "OSU", "Oregon State University" = "OSU", "Purdue University" = "PSU", "University of Illinois at Chicago" = "UI", "University of Illinois at Urbana-Champaign" = "UI", "University of Illinois at Springfield" ="UI", "University of Tennessee Chattanooga" = "UT", "University of Tennessee Knoxville" = "UT", "The University of Tennessee, Memphis - The Health Science Center" = "UT", "University of Tennessee Institute of Agriculture" ="UT", "University of Tennessee Space Institute" = "UT", "Virginia Polytechnic Institute and State University" = "VT", "Washington State University" = "WSU"))
levels(NSFdata_time_rename$ORGname)
## [1] "CSU" "ISU" "KSU" "MSU" "NCSU" "OSU" "PSU" "UT" "UI" "VT"
## [11] "WSU"
Create a set of time series plots comparing NSF funding for CSU compared to its competitive set for ALL funding years.
Summarize Data
NSFdata_time_rename_summary <- NSFdata_time_rename %>%
group_by(AwardYear, ORGname) %>%
summarise(tts = sum(AWARD_SUM),ttq = sum(AWARD_COUNT))
## `summarise()` has grouped output by 'AwardYear'. You can override using the
## `.groups` argument.
Create Time-Series plot with all three brands in the same graph
ggplot(NSFdata_time_rename_summary, aes(x=as.factor(AwardYear), y=tts/1000000, group=ORGname, color=ORGname)) + geom_line() + xlab("") + ylab("Award Amount (millions)") + ggtitle("Total Award Amount by Year by Orgainization")
NSFdata_time_rename_summary <- NSFdata_time_rename_summary %>%
mutate(focus = ifelse(ORGname=="CSU", "Yes", "No"))
ggplot(NSFdata_time_rename_summary, aes(x=as.factor(AwardYear), y=tts/1000000, group=ORGname, color=focus)) + geom_line() + xlab("Award Year") + ylab("Award Amount (millions)") + labs(title = "NSF Funding by Year / Orgainization", subtitle = "CSU Competitive Set") + scale_color_manual(values = c("Yes"="red", "No"="darkgrey"), guide=NULL)
Create Small-Multiples Chart
ggplot(NSFdata_time_rename_summary, aes(x=as.factor(AwardYear), y=tts/1000000, group=ORGname, color=focus)) + geom_line() + facet_wrap(~ORGname) + xlab("Award Year") + ylab("Award Amount (millions)") + theme(axis.text.x = element_text(size = 5), legend.position = "none") + ggtitle("NSF Funding by Year") + scale_color_manual(values = c("Yes"="red", "No"="darkgrey"), guide=NULL)
## Appendix
library(tidyverse)
library(scales)
library(ggrepel)
library(readxl)
NSFdata_all <- read_xlsx("C:/Users/overp/Desktop/School Stuff/CIS 576 Business Data Visualization/HW/NSF_Funding_by_Year.xlsx")
glimpse(NSFdata_all)
head(NSFdata_all)
summary(NSFdata_all)
cor(NSFdata_all[, sapply(NSFdata_all, is.numeric)], use = "pairwise")
NSFdata_subset <- NSFdata_all %>%
filter(AwardYear == "2012") %>%
filter(str_detect(Organization,"Colorado State University") | str_detect(Organization,"Iowa State University") | str_detect(Organization,"Kansas State University") | str_detect(Organization,"Michigan State University") | str_detect(Organization,"North Carolina State University") | str_detect(Organization,"Oklahoma State University") | str_detect(Organization,"Oregon State University") | str_detect(Organization,"Purdue University") | str_detect(Organization,"Texas A & M University") | str_detect(Organization,"University of California, Davis") | str_detect(Organization,"University of Illinois") | str_detect(Organization,"University of Tennessee") | str_detect(Organization,"Virginia Polytechnic") | str_detect(Organization,"Washington State University"))
levels(NSFdata_subset)
NSFdata_subset$Organization <- as.factor(NSFdata_subset$Organization)
levels(NSFdata_subset$Organization)
NSFdata_subset_rename <- NSFdata_subset %>%
mutate(ORGname = recode(Organization,"Colorado State University" = "CSU", "Iowa State University" = "ISU", "Kansas State University" = "KSU", "Michigan State University" = "MSU", "North Carolina State University" = "NCSU", "Oklahoma State University" = "OSU", "Oregon State University" = "OSU", "Purdue University" = "PSU", "University of Illinois at Chicago" = "UI", "University of Illinois at Urbana-Champaign" = "UI", "University of Tennessee Chattanooga" = "UT", "University of Tennessee Knoxville" = "UT", "Virginia Polytechnic Institute and State University" = "VT", "Washington State University" = "WSU"))
levels(NSFdata_subset_rename$ORGname)
NSFdata_subset_rename_Summary <- NSFdata_subset_rename %>%
group_by(ORGname) %>%
summarize(ts=sum(AWARD_SUM), tq=sum(AWARD_COUNT))
head(NSFdata_subset_rename_Summary)
NSFdata_subset_rename_Summary <- NSFdata_subset_rename_Summary %>%
mutate(Highlight = ifelse(ORGname=="CSU","Yes","No"))
ggplot(NSFdata_subset_rename_Summary, aes(y = reorder(ORGname, ts), x = ts/1000000, fill=Highlight)) + geom_col()+
scale_x_continuous(labels = comma)+
labs(title = "2012 NSF Funding by Organization", subtitle = "CSU vs Competitive Set", x = "Total Award Amount (millions)", y = "Organization") + theme(aspect.ratio = 1/2) + scale_fill_manual(values = c("Yes"="black","No"="darkgrey"), guide=NULL)
ggplot(NSFdata_subset_rename_Summary, aes(reorder(ORGname, -ts), ts/1000000, fill=Highlight)) + geom_col() + labs(title = "2012 NSF Funding by Organization", subtitle = "CSU vs Competitive Set", y = "Total Award Amount (millions)", x = "Organization") + theme(aspect.ratio = 1/2) + scale_fill_manual(values = c("Yes"="black","No"="darkgrey"), guide=NULL)
## Question 1 Graph 2 - Scatter plot
ggplot(NSFdata_subset_rename_Summary, aes(y = ts/1000000, x = tq, color = Highlight)) + geom_point() + geom_text_repel(aes(label = ORGname)) + scale_y_continuous(labels = comma) + labs(title = "2012 NSF Funding by Organization", subtitle = "CSU vs Competitive Set", y = "Total Award Amount (millions)", x = "Number of Awards") + expand_limits(x=c(0,200),y=c(0,60)) + scale_color_manual(values = c("Yes"="blue", "No"="darkgrey"), guide=NULL)
ggplot(NSFdata_subset_rename_Summary, aes(y = ts/1000000, x= tq, size = tq, color = Highlight)) + geom_point() + geom_text_repel(aes(label = ORGname), size=3) + labs(title = "2012 NSF Funding by Organization", subtitle = "CSU vs Competitive Set", y = "Total Award Amount (millions)", x = "Number of Awards") + theme(legend.position = "top", legend.key.size = unit(0.1,'cm')) + expand_limits(x=c(0,200),y=c(0,60)) + scale_color_manual(values = c("Yes"="blue", "No"="darkgrey"), guide=NULL)
NSFdata_time <- NSFdata_all %>%
filter(str_detect(Organization,"Colorado State University") | str_detect(Organization,"Iowa State University") | str_detect(Organization,"Kansas State University") | str_detect(Organization,"Michigan State University") | str_detect(Organization,"North Carolina State University") | str_detect(Organization,"Oklahoma State University") | str_detect(Organization,"Oregon State University") | str_detect(Organization,"Purdue University") | str_detect(Organization,"Texas A & M University") | str_detect(Organization,"University of California, Davis") | str_detect(Organization,"University of Illinois") | str_detect(Organization,"University of Tennessee") | str_detect(Organization,"Virginia Polytechnic") | str_detect(Organization,"Washington State University"))
NSFdata_time$Organization <- as.factor(NSFdata_time$Organization)
levels(NSFdata_time$Organization)
NSFdata_time_rename <- NSFdata_time %>%
mutate(ORGname = recode(Organization,"Colorado State University" = "CSU", "Colorado State University-Pueblo" = "CSU", "Iowa State University" = "ISU", "Kansas State University" = "KSU", "Michigan State University" = "MSU", "North Carolina State University" = "NCSU", "Oklahoma State University" = "OSU", "Oregon State University" = "OSU", "Purdue University" = "PSU", "University of Illinois at Chicago" = "UI", "University of Illinois at Urbana-Champaign" = "UI", "University of Illinois at Springfield" ="UI", "University of Tennessee Chattanooga" = "UT", "University of Tennessee Knoxville" = "UT", "The University of Tennessee, Memphis - The Health Science Center" = "UT", "University of Tennessee Institute of Agriculture" ="UT", "University of Tennessee Space Institute" = "UT", "Virginia Polytechnic Institute and State University" = "VT", "Washington State University" = "WSU"))
levels(NSFdata_time_rename$ORGname)
NSFdata_time_rename_summary <- NSFdata_time_rename %>%
group_by(AwardYear, ORGname) %>%
summarise(tts = sum(AWARD_SUM),ttq = sum(AWARD_COUNT))
ggplot(NSFdata_time_rename_summary, aes(x=as.factor(AwardYear), y=tts/1000000, group=ORGname, color=ORGname)) + geom_line() + xlab("") + ylab("Award Amount (millions)") + ggtitle("Total Award Amount by Year by Orgainization")
NSFdata_time_rename_summary <- NSFdata_time_rename_summary %>%
mutate(focus = ifelse(ORGname=="CSU", "Yes", "No"))
ggplot(NSFdata_time_rename_summary, aes(x=as.factor(AwardYear), y=tts/1000000, group=ORGname, color=focus)) + geom_line() + xlab("Award Year") + ylab("Award Amount (millions)") + labs(title = "NSF Funding by Year / Orgainization", subtitle = "CSU Competitive Set") + scale_color_manual(values = c("Yes"="red", "No"="darkgrey"), guide=NULL)
ggplot(NSFdata_time_rename_summary, aes(x=as.factor(AwardYear), y=tts/1000000, group=ORGname, color=focus)) + geom_line() + facet_wrap(~ORGname) + xlab("Award Year") + ylab("Award Amount (millions)") + theme(axis.text.x = element_text(size = 5), legend.position = "none") + ggtitle("NSF Funding by Year") + scale_color_manual(values = c("Yes"="red", "No"="darkgrey"), guide=NULL)