R-Markdown Document: Homework #2

The purpose of this assignment is to create several plots using R to compare NSF funding for CSU to a competitive set of similar institutions for 2012 and over time.

Step 1: Load R Packages from Library

library(tidyverse)
library(scales)
library(ggrepel)
library(readxl)

Step 2: load the NSF Data

NSFdata_all <- read_xlsx("C:/Users/overp/Desktop/School Stuff/CIS 576 Business Data Visualization/HW/NSF_Funding_by_Year.xlsx")

Step 3: Review Data

The first step includes understanding the data by running glimpse, head, summary, and coronations to help better understand the data.

glimpse(NSFdata_all)
## Rows: 7,185
## Columns: 6
## $ ID                   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15~
## $ Organization         <chr> "3DGEO DEVELOPMENT INC", "3DGEO DEVELOPMENT INC",~
## $ AwardYear            <dbl> 2005, 2004, 2008, 2010, 2011, 2005, 2010, 2006, 2~
## $ AWARD_SUM            <dbl> 2162000, 200000, 365625, 328503, 131267, 145000, ~
## $ AWARD_COUNT          <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 4, 2, 5, 2, 2~
## $ AvarageAwardDuration <dbl> 4.000000, 0.000000, 1.000000, 0.500000, 2.500000,~
head(NSFdata_all)
## # A tibble: 6 x 6
##      ID Organization            AwardYear AWARD_SUM AWARD_COUNT AvarageAwardDur~
##   <dbl> <chr>                       <dbl>     <dbl>       <dbl>            <dbl>
## 1     1 3DGEO DEVELOPMENT INC        2005   2162000           2              4  
## 2     2 3DGEO DEVELOPMENT INC        2004    200000           2              0  
## 3     3 3F, LLC                      2008    365625           2              1  
## 4     4 4WIND SCIENCE AND ENGI~      2010    328503           2              0.5
## 5     5 Abilene Christian Univ~      2011    131267           2              2.5
## 6     6 ABIRAM                ~      2005    145000           2              1.5
summary(NSFdata_all)
##        ID       Organization         AwardYear      AWARD_SUM        
##  Min.   :   1   Length:7185        Min.   :2001   Min.   :        0  
##  1st Qu.:1797   Class :character   1st Qu.:2003   1st Qu.:   567690  
##  Median :3593   Mode  :character   Median :2007   Median :  1465978  
##  Mean   :3593                      Mean   :2007   Mean   :  7636078  
##  3rd Qu.:5389                      3rd Qu.:2010   3rd Qu.:  5737611  
##  Max.   :7185                      Max.   :2012   Max.   :894931764  
##   AWARD_COUNT     AvarageAwardDuration
##  Min.   :  2.00   Min.   : 0.000      
##  1st Qu.:  2.00   1st Qu.: 2.556      
##  Median :  4.00   Median : 3.141      
##  Mean   : 16.08   Mean   : 3.030      
##  3rd Qu.: 14.00   3rd Qu.: 3.622      
##  Max.   :224.00   Max.   :11.667
cor(NSFdata_all[, sapply(NSFdata_all, is.numeric)], use = "pairwise")
##                               ID   AwardYear   AWARD_SUM AWARD_COUNT
## ID                    1.00000000 -0.01281994  0.08794076  0.19912857
## AwardYear            -0.01281994  1.00000000 -0.02460934  0.03434795
## AWARD_SUM             0.08794076 -0.02460934  1.00000000  0.59188322
## AWARD_COUNT           0.19912857  0.03434795  0.59188322  1.00000000
## AvarageAwardDuration  0.09802567 -0.13656146  0.19828842  0.12576510
##                      AvarageAwardDuration
## ID                             0.09802567
## AwardYear                     -0.13656146
## AWARD_SUM                      0.19828842
## AWARD_COUNT                    0.12576510
## AvarageAwardDuration           1.00000000

Question ## Question 1 Question 1 asks for a bar chart, scatter chart, and a bubble plot for 2012 NSF foundation comparing CSU funding to a comparable subset of schools.

Q1 Step 1: filter award year = 2012 and 14 competitive set institutions

NSFdata_subset <- NSFdata_all %>%
 filter(AwardYear == "2012") %>%
  filter(str_detect(Organization,"Colorado State University") | str_detect(Organization,"Iowa State University") | str_detect(Organization,"Kansas State University") | str_detect(Organization,"Michigan State University") | str_detect(Organization,"North Carolina State University") | str_detect(Organization,"Oklahoma State University") | str_detect(Organization,"Oregon State University") | str_detect(Organization,"Purdue University") | str_detect(Organization,"Texas A & M University") | str_detect(Organization,"University of California, Davis") | str_detect(Organization,"University of Illinois") | str_detect(Organization,"University of Tennessee") | str_detect(Organization,"Virginia Polytechnic") | str_detect(Organization,"Washington State University"))
levels(NSFdata_subset)
## NULL
NSFdata_subset$Organization <- as.factor(NSFdata_subset$Organization)
levels(NSFdata_subset$Organization)
##  [1] "Colorado State University"                          
##  [2] "Iowa State University"                              
##  [3] "Kansas State University"                            
##  [4] "Michigan State University"                          
##  [5] "North Carolina State University"                    
##  [6] "Oklahoma State University"                          
##  [7] "Oregon State University"                            
##  [8] "Purdue University"                                  
##  [9] "University of Illinois at Chicago"                  
## [10] "University of Illinois at Urbana-Champaign"         
## [11] "University of Tennessee Chattanooga"                
## [12] "University of Tennessee Knoxville"                  
## [13] "Virginia Polytechnic Institute and State University"
## [14] "Washington State University"

Q1 Step 2: rename competitive set organizations

NSFdata_subset_rename <- NSFdata_subset %>%
  mutate(ORGname = recode(Organization,"Colorado State University" = "CSU", "Iowa State University" = "ISU", "Kansas State University" = "KSU", "Michigan State University" = "MSU", "North Carolina State University" = "NCSU", "Oklahoma State University" = "OSU", "Oregon State University" = "OSU", "Purdue University" = "PSU", "University of Illinois at Chicago" = "UI", "University of Illinois at Urbana-Champaign" = "UI", "University of Tennessee Chattanooga" = "UT", "University of Tennessee Knoxville" = "UT", "Virginia Polytechnic Institute and State University" = "VT", "Washington State University" = "WSU"))
levels(NSFdata_subset_rename$ORGname)
##  [1] "CSU"  "ISU"  "KSU"  "MSU"  "NCSU" "OSU"  "PSU"  "UI"   "UT"   "VT"  
## [11] "WSU"

Q1 - DATA VISUALIZATION

Q1 Graph 1 - Bar Chart

Graph 1 is a bar chart comparing CSU NSF funding to a peer group of schools.

Q1 Graph 1 - Step 1 - Create Summary for Plotting

NSFdata_subset_rename_Summary <- NSFdata_subset_rename %>%
  group_by(ORGname) %>%
  summarize(ts=sum(AWARD_SUM), tq=sum(AWARD_COUNT))
head(NSFdata_subset_rename_Summary)
## # A tibble: 6 x 3
##   ORGname       ts    tq
##   <fct>      <dbl> <dbl>
## 1 CSU     11592315    41
## 2 ISU     16738011    65
## 3 KSU      8839116    23
## 4 MSU     17351663    58
## 5 NCSU    31440526    91
## 6 OSU     28511623   107

Q1 Graph 1 - Step 2 - Add Highlight Column

NSFdata_subset_rename_Summary <- NSFdata_subset_rename_Summary %>%
  mutate(Highlight = ifelse(ORGname=="CSU","Yes","No"))

Q1 Graph 1 - Step 3 - Create Bar Chart

ggplot(NSFdata_subset_rename_Summary, aes(y = reorder(ORGname, ts), x = ts/1000000, fill=Highlight)) + geom_col()+
  scale_x_continuous(labels = comma)+
  labs(title = "2012 NSF Funding by Organization", subtitle = "CSU vs Competitive Set", x = "Total Award Amount (millions)", y = "Organization") + theme(aspect.ratio = 1/2) + scale_fill_manual(values = c("Yes"="black","No"="darkgrey"), guide=NULL)

ggplot(NSFdata_subset_rename_Summary, aes(reorder(ORGname, -ts), ts/1000000, fill=Highlight)) + geom_col() + labs(title = "2012 NSF Funding by Organization", subtitle = "CSU vs Competitive Set", y = "Total Award Amount (millions)", x = "Organization") + theme(aspect.ratio = 1/2) + scale_fill_manual(values = c("Yes"="black","No"="darkgrey"), guide=NULL)

## Question 1 Graph 2 - Scatter plot
ggplot(NSFdata_subset_rename_Summary, aes(y = ts/1000000, x = tq, color = Highlight)) + geom_point() + geom_text_repel(aes(label = ORGname)) + scale_y_continuous(labels = comma) + labs(title = "2012 NSF Funding by Organization", subtitle = "CSU vs Competitive Set", y = "Total Award Amount (millions)", x = "Number of Awards") + expand_limits(x=c(0,200),y=c(0,60)) + scale_color_manual(values = c("Yes"="blue", "No"="darkgrey"), guide=NULL)

## Question 1 Graph 3 - Bubble plot

ggplot(NSFdata_subset_rename_Summary, aes(y = ts/1000000, x= tq, size = tq, color = Highlight)) + geom_point() + geom_text_repel(aes(label = ORGname), size=3) + labs(title = "2012 NSF Funding by Organization", subtitle = "CSU vs Competitive Set", y = "Total Award Amount (millions)", x = "Number of Awards") + theme(legend.position = "top", legend.key.size = unit(0.1,'cm')) + expand_limits(x=c(0,200),y=c(0,60)) + scale_color_manual(values = c("Yes"="blue", "No"="darkgrey"), guide=NULL)

### Question 2 Question 2 asks to create a set of time series plots comparing NSF Funding for CSU to a competitive set of institutions over time.

Question 2 Step 1 - Data Preperation

NSFdata_time <- NSFdata_all %>%
  filter(str_detect(Organization,"Colorado State University") | str_detect(Organization,"Iowa State University") | str_detect(Organization,"Kansas State University") | str_detect(Organization,"Michigan State University") | str_detect(Organization,"North Carolina State University") | str_detect(Organization,"Oklahoma State University") | str_detect(Organization,"Oregon State University") | str_detect(Organization,"Purdue University") | str_detect(Organization,"Texas A & M University") | str_detect(Organization,"University of California, Davis") | str_detect(Organization,"University of Illinois") | str_detect(Organization,"University of Tennessee") | str_detect(Organization,"Virginia Polytechnic") | str_detect(Organization,"Washington State University"))
NSFdata_time$Organization <- as.factor(NSFdata_time$Organization)
levels(NSFdata_time$Organization)
##  [1] "Colorado State University"                                       
##  [2] "Colorado State University-Pueblo"                                
##  [3] "Iowa State University"                                           
##  [4] "Kansas State University"                                         
##  [5] "Michigan State University"                                       
##  [6] "North Carolina State University"                                 
##  [7] "Oklahoma State University"                                       
##  [8] "Oregon State University"                                         
##  [9] "Purdue University"                                               
## [10] "The University of Tennessee, Memphis - The Health Science Center"
## [11] "University of Illinois at Chicago"                               
## [12] "University of Illinois at Springfield"                           
## [13] "University of Illinois at Urbana-Champaign"                      
## [14] "University of Tennessee Chattanooga"                             
## [15] "University of Tennessee Institute of Agriculture"                
## [16] "University of Tennessee Knoxville"                               
## [17] "University of Tennessee Space Institute"                         
## [18] "Virginia Polytechnic Institute and State University"             
## [19] "Washington State University"
NSFdata_time_rename <- NSFdata_time %>%
  mutate(ORGname = recode(Organization,"Colorado State University" = "CSU", "Colorado State University-Pueblo" = "CSU", "Iowa State University" = "ISU", "Kansas State University" = "KSU", "Michigan State University" = "MSU", "North Carolina State University" = "NCSU", "Oklahoma State University" = "OSU", "Oregon State University" = "OSU", "Purdue University" = "PSU", "University of Illinois at Chicago" = "UI", "University of Illinois at Urbana-Champaign" = "UI", "University of Illinois at Springfield" ="UI", "University of Tennessee Chattanooga" = "UT", "University of Tennessee Knoxville" = "UT", "The University of Tennessee, Memphis - The Health Science Center" = "UT", "University of Tennessee Institute of Agriculture" ="UT", "University of Tennessee Space Institute" = "UT", "Virginia Polytechnic Institute and State University" = "VT", "Washington State University" = "WSU"))
levels(NSFdata_time_rename$ORGname)
##  [1] "CSU"  "ISU"  "KSU"  "MSU"  "NCSU" "OSU"  "PSU"  "UT"   "UI"   "VT"  
## [11] "WSU"

Q2 - DATA VISUALIZATION

Create a set of time series plots comparing NSF funding for CSU compared to its competitive set for ALL funding years.

Question 2 Graph 1

Question 2 Graph 1 - Step 1

Summarize Data

NSFdata_time_rename_summary <- NSFdata_time_rename %>%
  group_by(AwardYear, ORGname) %>%
  summarise(tts = sum(AWARD_SUM),ttq = sum(AWARD_COUNT))
## `summarise()` has grouped output by 'AwardYear'. You can override using the
## `.groups` argument.

Question 2 Graph 1 - Step 2

Create Time-Series plot with all three brands in the same graph

ggplot(NSFdata_time_rename_summary, aes(x=as.factor(AwardYear), y=tts/1000000, group=ORGname, color=ORGname)) + geom_line() + xlab("") + ylab("Award Amount (millions)") + ggtitle("Total Award Amount by Year by Orgainization")

NSFdata_time_rename_summary <- NSFdata_time_rename_summary %>%
  mutate(focus = ifelse(ORGname=="CSU", "Yes", "No"))
ggplot(NSFdata_time_rename_summary, aes(x=as.factor(AwardYear), y=tts/1000000, group=ORGname, color=focus)) + geom_line() + xlab("Award Year") + ylab("Award Amount (millions)") + labs(title = "NSF Funding by Year / Orgainization", subtitle = "CSU Competitive Set") + scale_color_manual(values = c("Yes"="red", "No"="darkgrey"), guide=NULL)

Question 2 Graph 2

Create Small-Multiples Chart

ggplot(NSFdata_time_rename_summary, aes(x=as.factor(AwardYear), y=tts/1000000, group=ORGname, color=focus)) + geom_line() + facet_wrap(~ORGname) + xlab("Award Year") + ylab("Award Amount (millions)") + theme(axis.text.x = element_text(size = 5), legend.position = "none") + ggtitle("NSF Funding by Year") + scale_color_manual(values = c("Yes"="red", "No"="darkgrey"), guide=NULL)

## Appendix

library(tidyverse)
library(scales)
library(ggrepel)
library(readxl)
NSFdata_all <- read_xlsx("C:/Users/overp/Desktop/School Stuff/CIS 576 Business Data Visualization/HW/NSF_Funding_by_Year.xlsx")
glimpse(NSFdata_all)
head(NSFdata_all)
summary(NSFdata_all)
cor(NSFdata_all[, sapply(NSFdata_all, is.numeric)], use = "pairwise")
NSFdata_subset <- NSFdata_all %>%
 filter(AwardYear == "2012") %>%
  filter(str_detect(Organization,"Colorado State University") | str_detect(Organization,"Iowa State University") | str_detect(Organization,"Kansas State University") | str_detect(Organization,"Michigan State University") | str_detect(Organization,"North Carolina State University") | str_detect(Organization,"Oklahoma State University") | str_detect(Organization,"Oregon State University") | str_detect(Organization,"Purdue University") | str_detect(Organization,"Texas A & M University") | str_detect(Organization,"University of California, Davis") | str_detect(Organization,"University of Illinois") | str_detect(Organization,"University of Tennessee") | str_detect(Organization,"Virginia Polytechnic") | str_detect(Organization,"Washington State University"))
levels(NSFdata_subset)
NSFdata_subset$Organization <- as.factor(NSFdata_subset$Organization)
levels(NSFdata_subset$Organization)
NSFdata_subset_rename <- NSFdata_subset %>%
  mutate(ORGname = recode(Organization,"Colorado State University" = "CSU", "Iowa State University" = "ISU", "Kansas State University" = "KSU", "Michigan State University" = "MSU", "North Carolina State University" = "NCSU", "Oklahoma State University" = "OSU", "Oregon State University" = "OSU", "Purdue University" = "PSU", "University of Illinois at Chicago" = "UI", "University of Illinois at Urbana-Champaign" = "UI", "University of Tennessee Chattanooga" = "UT", "University of Tennessee Knoxville" = "UT", "Virginia Polytechnic Institute and State University" = "VT", "Washington State University" = "WSU"))
levels(NSFdata_subset_rename$ORGname)
NSFdata_subset_rename_Summary <- NSFdata_subset_rename %>%
  group_by(ORGname) %>%
  summarize(ts=sum(AWARD_SUM), tq=sum(AWARD_COUNT))
head(NSFdata_subset_rename_Summary)
NSFdata_subset_rename_Summary <- NSFdata_subset_rename_Summary %>%
  mutate(Highlight = ifelse(ORGname=="CSU","Yes","No"))
ggplot(NSFdata_subset_rename_Summary, aes(y = reorder(ORGname, ts), x = ts/1000000, fill=Highlight)) + geom_col()+
  scale_x_continuous(labels = comma)+
  labs(title = "2012 NSF Funding by Organization", subtitle = "CSU vs Competitive Set", x = "Total Award Amount (millions)", y = "Organization") + theme(aspect.ratio = 1/2) + scale_fill_manual(values = c("Yes"="black","No"="darkgrey"), guide=NULL)
ggplot(NSFdata_subset_rename_Summary, aes(reorder(ORGname, -ts), ts/1000000, fill=Highlight)) + geom_col() + labs(title = "2012 NSF Funding by Organization", subtitle = "CSU vs Competitive Set", y = "Total Award Amount (millions)", x = "Organization") + theme(aspect.ratio = 1/2) + scale_fill_manual(values = c("Yes"="black","No"="darkgrey"), guide=NULL)
## Question 1 Graph 2 - Scatter plot
ggplot(NSFdata_subset_rename_Summary, aes(y = ts/1000000, x = tq, color = Highlight)) + geom_point() + geom_text_repel(aes(label = ORGname)) + scale_y_continuous(labels = comma) + labs(title = "2012 NSF Funding by Organization", subtitle = "CSU vs Competitive Set", y = "Total Award Amount (millions)", x = "Number of Awards") + expand_limits(x=c(0,200),y=c(0,60)) + scale_color_manual(values = c("Yes"="blue", "No"="darkgrey"), guide=NULL)
ggplot(NSFdata_subset_rename_Summary, aes(y = ts/1000000, x= tq, size = tq, color = Highlight)) + geom_point() + geom_text_repel(aes(label = ORGname), size=3) + labs(title = "2012 NSF Funding by Organization", subtitle = "CSU vs Competitive Set", y = "Total Award Amount (millions)", x = "Number of Awards") + theme(legend.position = "top", legend.key.size = unit(0.1,'cm')) + expand_limits(x=c(0,200),y=c(0,60)) + scale_color_manual(values = c("Yes"="blue", "No"="darkgrey"), guide=NULL)
NSFdata_time <- NSFdata_all %>%
  filter(str_detect(Organization,"Colorado State University") | str_detect(Organization,"Iowa State University") | str_detect(Organization,"Kansas State University") | str_detect(Organization,"Michigan State University") | str_detect(Organization,"North Carolina State University") | str_detect(Organization,"Oklahoma State University") | str_detect(Organization,"Oregon State University") | str_detect(Organization,"Purdue University") | str_detect(Organization,"Texas A & M University") | str_detect(Organization,"University of California, Davis") | str_detect(Organization,"University of Illinois") | str_detect(Organization,"University of Tennessee") | str_detect(Organization,"Virginia Polytechnic") | str_detect(Organization,"Washington State University"))
NSFdata_time$Organization <- as.factor(NSFdata_time$Organization)
levels(NSFdata_time$Organization)
NSFdata_time_rename <- NSFdata_time %>%
  mutate(ORGname = recode(Organization,"Colorado State University" = "CSU", "Colorado State University-Pueblo" = "CSU", "Iowa State University" = "ISU", "Kansas State University" = "KSU", "Michigan State University" = "MSU", "North Carolina State University" = "NCSU", "Oklahoma State University" = "OSU", "Oregon State University" = "OSU", "Purdue University" = "PSU", "University of Illinois at Chicago" = "UI", "University of Illinois at Urbana-Champaign" = "UI", "University of Illinois at Springfield" ="UI", "University of Tennessee Chattanooga" = "UT", "University of Tennessee Knoxville" = "UT", "The University of Tennessee, Memphis - The Health Science Center" = "UT", "University of Tennessee Institute of Agriculture" ="UT", "University of Tennessee Space Institute" = "UT", "Virginia Polytechnic Institute and State University" = "VT", "Washington State University" = "WSU"))
levels(NSFdata_time_rename$ORGname)
NSFdata_time_rename_summary <- NSFdata_time_rename %>%
  group_by(AwardYear, ORGname) %>%
  summarise(tts = sum(AWARD_SUM),ttq = sum(AWARD_COUNT))
ggplot(NSFdata_time_rename_summary, aes(x=as.factor(AwardYear), y=tts/1000000, group=ORGname, color=ORGname)) + geom_line() + xlab("") + ylab("Award Amount (millions)") + ggtitle("Total Award Amount by Year by Orgainization")
NSFdata_time_rename_summary <- NSFdata_time_rename_summary %>%
  mutate(focus = ifelse(ORGname=="CSU", "Yes", "No"))
ggplot(NSFdata_time_rename_summary, aes(x=as.factor(AwardYear), y=tts/1000000, group=ORGname, color=focus)) + geom_line() + xlab("Award Year") + ylab("Award Amount (millions)") + labs(title = "NSF Funding by Year / Orgainization", subtitle = "CSU Competitive Set") + scale_color_manual(values = c("Yes"="red", "No"="darkgrey"), guide=NULL)
ggplot(NSFdata_time_rename_summary, aes(x=as.factor(AwardYear), y=tts/1000000, group=ORGname, color=focus)) + geom_line() + facet_wrap(~ORGname) + xlab("Award Year") + ylab("Award Amount (millions)") + theme(axis.text.x = element_text(size = 5), legend.position = "none") + ggtitle("NSF Funding by Year") + scale_color_manual(values = c("Yes"="red", "No"="darkgrey"), guide=NULL)