Load the required R packages:
library(tidyverse)
library(scales)
library(ggrepel)
library(readxl)
library(readxl)
# Load the .xlsx file
data_all <- read_xlsx("/Users/vishnujordan/Downloads/NSF_Funding_by_Year.xlsx")
# If you have a .xls file instead, use read_xls()
# data_all <- read_xls("/Users/vishnujordan/Downloads/NSF_Funding_by_Year.xls")
# View the first few rows of the dataset
head(data_all)
## # A tibble: 6 × 6
## ID Organization AwardYear AWARD_SUM AWARD_COUNT AvarageAwardDuration
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 3DGEO DEVELOPMENT … 2005 2162000 2 4
## 2 2 3DGEO DEVELOPMENT … 2004 200000 2 0
## 3 3 3F, LLC 2008 365625 2 1
## 4 4 4WIND SCIENCE AND … 2010 328503 2 0.5
## 5 5 Abilene Christian … 2011 131267 2 2.5
## 6 6 ABIRAM … 2005 145000 2 1.5
# Uncomment the next line if running in RStudio (it won't work in R Markdown)
# View(data_all)
Create a subset for visualization using the filter()
and select() functions from the dplyr
package. |
The filter() function is used to subset a data frame,
retaining all rows that satisfy your conditions. The
select() function selects variables in a data frame.
str_detect (from stringr) returns a logical
vector with TRUE for each element of string that matches
pattern and FALSE otherwise.
data_subset <- data_all %>%
dplyr::select(Organization, AwardYear, AWARD_SUM, AWARD_COUNT, AvarageAwardDuration) %>%
filter(AwardYear == "2012") %>%
filter(
str_detect(Organization, "Iowa State University") |
str_detect(Organization, "Kansas State University") |
str_detect(Organization, "Michigan State University") |
str_detect(Organization, "North Carolina State University") |
str_detect(Organization, "Oklahoma State University") |
str_detect(Organization, "Oregon State University") |
str_detect(Organization, "Purdue University") |
str_detect(Organization, "Texas A & M University") |
str_detect(Organization, "University of California, Davis") |
str_detect(Organization, "University of Illinois, Urbana-Champaign") |
str_detect(Organization, "University of Tennessee") |
str_detect(Organization, "Virginia Polytechnic Institute and State University") |
str_detect(Organization, "Washington State University") |
str_detect(Organization, "Colorado State University")
)
view(data_subset)
An error may occur because the select() function from the MASS
package clashes with the select() function from the dplyr package. The
easiest way to fix this error is to explicitly tell R to use the
select() function from the dplyr package by using
dplyr::select().
View a list of Organization: (converting them into factor instead of charactor)
data_subset$Organization <- as.factor(data_subset$Organization)
levels(data_subset$Organization)
## [1] "Colorado State University"
## [2] "Iowa State University"
## [3] "Kansas State University"
## [4] "Michigan State University"
## [5] "North Carolina State University"
## [6] "Oklahoma State University"
## [7] "Oregon State University"
## [8] "Purdue University"
## [9] "University of Tennessee Chattanooga"
## [10] "University of Tennessee Knoxville"
## [11] "Virginia Polytechnic Institute and State University"
## [12] "Washington State University"
#unique(data_subset$Organization)
levels(data_subset$Organization)
## [1] "Colorado State University"
## [2] "Iowa State University"
## [3] "Kansas State University"
## [4] "Michigan State University"
## [5] "North Carolina State University"
## [6] "Oklahoma State University"
## [7] "Oregon State University"
## [8] "Purdue University"
## [9] "University of Tennessee Chattanooga"
## [10] "University of Tennessee Knoxville"
## [11] "Virginia Polytechnic Institute and State University"
## [12] "Washington State University"
#I have used Approaching 3,, becuase it is scalable and more reliable for mutate and filtering (so ignoreing appraching 1 and 2)
Approach 3
Approach 1 and Approach 2 are not quite scalable. Look for a more efficient approach when the list of Organization gets longer:
library(dplyr)
library(stringr)
# Replace university names with simplified names
data_three_alternative <- data_subset %>%
mutate(Names = str_replace(Organization, ".*Iowa.*", "ISU")) %>%
mutate(Names = str_replace(Names, ".*Kansas.*", "KSU")) %>%
mutate(Names = str_replace(Names, ".*Michigan.*", "MSU")) %>%
mutate(Names = str_replace(Names, ".*Carolina.*", "NCSU")) %>%
mutate(Names = str_replace(Names, ".*Oklahoma.*", "OKSU")) %>%
mutate(Names = str_replace(Names, ".*Oregon.*", "OSU")) %>%
mutate(Names = str_replace(Names, ".*Purdue.*", "Purdue")) %>%
mutate(Names = str_replace(Names, ".*Texas.*", "TAMU")) %>%
mutate(Names = str_replace(Names, ".*California.*", "CU-D")) %>%
mutate(Names = str_replace(Names, ".*Illinois.*", "UIUC")) %>%
mutate(Names = str_replace(Names, ".*Tennessee.*", "UTen")) %>%
mutate(Names = str_replace(Names, ".*Virginia.*", "VPISU")) %>%
mutate(Names = str_replace(Names, ".*Washington.*", "WSU")) %>%
mutate(Names = str_replace(Names, ".*Colorado.*", "CSU")) %>%
# Filter only specific values in the "Three_Names" column
filter(Names %in% c("ISU", "KSU", "MSU", "NCSU", "OKSU", "OSU", "Purdue",
"TAMU", "CU-D", "UIUC", "UTen", "VPISU", "WSU", "CSU")) %>%
droplevels()
# View unique names
levels(as.factor(data_three_alternative$Names))
## [1] "CSU" "ISU" "KSU" "MSU" "NCSU" "OKSU" "OSU" "Purdue"
## [9] "UTen" "VPISU" "WSU"
Create a summary for plotting:
data_three_alternative_summary <- data_three_alternative %>%
group_by(Names) %>%
summarise(ts=sum(AWARD_SUM), tc=sum(AWARD_COUNT))
data_three_alternative_summary
## # A tibble: 11 × 3
## Names ts tc
## <chr> <dbl> <dbl>
## 1 CSU 11592315 41
## 2 ISU 16738011 65
## 3 KSU 8839116 23
## 4 MSU 17351663 58
## 5 NCSU 31440526 91
## 6 OKSU 5331516 21
## 7 OSU 23180107 86
## 8 Purdue 33991966 121
## 9 UTen 13646485 51
## 10 VPISU 19609620 78
## 11 WSU 14509261 43
Verifying the data
data_three_alternative_summary <- data_three_alternative %>%
group_by(Names) %>%
summarise(ts = sum(AWARD_SUM, na.rm = TRUE),
tc = sum(AWARD_COUNT, na.rm = TRUE))
library(ggplot2)
ggplot(data_three_alternative_summary, aes(x = Names, y = ts, fill = Names)) +
geom_col() +
theme_minimal() +
labs(title = "Total Award Sum by University",
x = "University",
y = "Total Award Sum")
#### Data Visualization
colnames(data_three_alternative_summary)
## [1] "Names" "ts" "tc"
head(data_three_alternative_summary)
## # A tibble: 6 × 3
## Names ts tc
## <chr> <dbl> <dbl>
## 1 CSU 11592315 41
## 2 ISU 16738011 65
## 3 KSU 8839116 23
## 4 MSU 17351663 58
## 5 NCSU 31440526 91
## 6 OKSU 5331516 21
ggplot(data_three_alternative_summary, aes(y = reorder(Names, ts), x = ts)) +
geom_col()+
scale_x_continuous(labels = comma)+
labs(title = "NSF Funding (Year 2012)",
subtitle = "CSU vs. Peer Orgs",
x = "Total Sum",
y = "Orgs") +
theme(aspect.ratio = 1/2.5)
library(dplyr)
library(ggplot2)
library(scales)
# Use the dataset that already has `Names`
data_filtered <- data_three_alternative %>%
filter(AwardYear == "2012") # Keep only 2012 data
# Summarize total awards & award count
data_summary <- data_filtered %>%
group_by(Names) %>%
summarise(
Total_Awards = sum(AWARD_SUM, na.rm = TRUE),
Award_Count = sum(AWARD_COUNT, na.rm = TRUE)
)
# Add a column to highlight whether its csu or not (to filter later)
data_summary <- data_summary %>%
mutate(Highlight = ifelse(Names == "CSU", "Yes", "No"))
# Bar Chart: Highlighting CSU + Showing Award Count as Labels
ggplot(data_summary, aes(x = reorder(Names, -Total_Awards), y = Total_Awards/1000, fill = Highlight)) +
geom_col() +
geom_text(aes(label = Award_Count), vjust = -0.5, size = 4, color = "black") + # Display award count on top of bars
scale_fill_manual(values = c("Yes" = "yellow", "No" = "gray")) +
scale_y_continuous(labels = comma) +
labs(title = "Total NSF Awards (2012)",
subtitle = "CSU vs. Institutional Peers",
x = "Institution",
y = "Award Amount (in thousands)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels
ggplot(data_three_alternative_summary, aes(reorder(Names, -ts), ts/1000)) +
geom_col() +
ylab("Sales (thousands)") +
xlab("Brand") +
theme(aspect.ratio = 1/1.1)
# Use the dataset that already has `Names`
data_filtered <- data_three_alternative %>%
filter(AwardYear == "2012") # Keep only 2012 data
# Summarize total awards & award count
data_summary <- data_filtered %>%
group_by(Names) %>%
summarise(
Total_Awards = sum(AWARD_SUM, na.rm = TRUE),
Award_Count = sum(AWARD_COUNT, na.rm = TRUE)
)
# Add a column to highlight CSU
data_summary <- data_summary %>%
mutate(Highlight = ifelse(Names == "CSU", "Yes", "No"))
# Scatterplot: Highlighting CSU & Adding Labels
ggplot(data_summary, aes(x = Award_Count, y = Total_Awards, color = Highlight)) +
geom_point(size = 4) + # Scatterplot points
geom_text_repel(aes(label = Names), size = 4, fontface = "bold") + # Institution names
scale_y_continuous(labels = comma) +
scale_x_continuous(labels = comma) +
scale_color_manual(values = c("Yes" = "blue", "No" = "darkgray")) + # Highlight CSU
labs(title = "NSF Awards vs. Award Count (2012)",
subtitle = "CSU vs. Institutional Peers",
x = "Total Award Count",
y = "Total NSF Funding") +
theme_minimal()
data_filtered <- data_three_alternative %>%
filter(AwardYear == "2012") # Keep only 2012 data
# Summarize total awards & award count
data_summary <- data_filtered %>%
group_by(Names) %>%
summarise(
Total_Awards = sum(AWARD_SUM, na.rm = TRUE),
Award_Count = sum(AWARD_COUNT, na.rm = TRUE)
)
# Scale down award values for better readability
data_summary <- data_summary %>%
mutate(Total_Awards_Scaled = Total_Awards / 10000) # Dividing by 10,000
# Add a column to highlight CSU
data_summary <- data_summary %>%
mutate(Highlight = ifelse(Names == "CSU", "Yes", "No"))
# Bubble Plot: Award Count (X), Award Amount (Y), Bubble Size = Award Amount
ggplot(data_summary, aes(x = Award_Count, y = Total_Awards_Scaled, size = Total_Awards_Scaled, color = Highlight)) +
geom_point(alpha = 0.6) + # Transparent bubbles
# geom_text(aes(label = round(Total_Awards_Scaled, 1)), color = "black", fontface = "bold", size = 4, vjust = -1.2) + # Award value on top
geom_text_repel(aes(label = Names), size = 4) + # Institution names
scale_x_continuous(labels = comma) +
scale_y_continuous(labels = comma) +
labs(title = "NSF Funding: Award Amount vs. Award Count (2012)",
subtitle = "Bubble size represents total award amount (scaled by 10K)",
x = "Total Award Count",
y = "Total NSF Funding (in 10K)") +
theme_minimal() +
theme(legend.position = "top") +
scale_color_manual(values = c("Yes" = "darkblue", "No" = "darkgray")) # Highlight CSU
#First part of question 2
Create a different data set for plotting time series:
library(readxl)
# Load the .xlsx file
data_all <- read_xlsx("/Users/vishnujordan/Downloads/NSF_Funding_by_Year.xlsx")
# View the first few rows of the dataset
head(data_all)
## # A tibble: 6 × 6
## ID Organization AwardYear AWARD_SUM AWARD_COUNT AvarageAwardDuration
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 3DGEO DEVELOPMENT … 2005 2162000 2 4
## 2 2 3DGEO DEVELOPMENT … 2004 200000 2 0
## 3 3 3F, LLC 2008 365625 2 1
## 4 4 4WIND SCIENCE AND … 2010 328503 2 0.5
## 5 5 Abilene Christian … 2011 131267 2 2.5
## 6 6 ABIRAM … 2005 145000 2 1.5
library(ggplot2)
library(dplyr)
library(stringr)
library(scales)
library(readxl)
# Load the dataset
data_all <- read_xlsx("/Users/vishnujordan/Downloads/NSF_Funding_by_Year.xlsx")
# Assign short names to universities
data_time <- data_all %>%
filter(str_detect(Organization, paste(
c("Iowa State University", "Kansas State University", "Michigan State University",
"North Carolina State University", "Oklahoma State University", "Oregon State University",
"Purdue University", "Texas A & M University", "University of California, Davis",
"University of Illinois, Urbana-Champaign", "University of Tennessee",
"Virginia Polytechnic Institute and State University", "Washington State University",
"Colorado State University"), collapse = "|"))) %>%
mutate(Names = case_when(
str_detect(Organization, "Iowa State") ~ "ISU",
str_detect(Organization, "Kansas State") ~ "KSU",
str_detect(Organization, "Michigan State") ~ "MSU",
str_detect(Organization, "North Carolina State") ~ "NCSU",
str_detect(Organization, "Oklahoma State") ~ "OKSU",
str_detect(Organization, "Oregon State") ~ "OSU",
str_detect(Organization, "Purdue") ~ "Purdue",
str_detect(Organization, "Texas A & M") ~ "TAMU",
str_detect(Organization, "California, Davis") ~ "CU-D",
str_detect(Organization, "Illinois, Urbana-Champaign") ~ "UIUC",
str_detect(Organization, "Tennessee") ~ "UTen",
str_detect(Organization, "Virginia Polytechnic") ~ "VPISU",
str_detect(Organization, "Washington State") ~ "WSU",
str_detect(Organization, "Colorado State") ~ "CSU",
TRUE ~ "Other"
)) %>%
filter(Names != "Other") # Keep only relevant universities
# Group by Award Year and University
data_time_summary <- data_time %>%
group_by(AwardYear, Names) %>%
summarise(
Total_Awards = sum(AWARD_SUM, na.rm = TRUE),
Award_Count = sum(AWARD_COUNT, na.rm = TRUE)
) %>%
mutate(Total_Awards_Scaled = Total_Awards / 10000) # Scale for readability
# Convert AwardYear to numeric for correct plotting
data_time_summary <- data_time_summary %>%
mutate(AwardYear = as.numeric(AwardYear))
ggplot(data_time_summary, aes(x = AwardYear, y = Total_Awards_Scaled, group = Names, color = Names)) +
geom_line(size = .7) +
geom_point(size = .5) +
scale_y_continuous(labels = comma) +
labs(title = "NSF Funding Over Time",
subtitle = "Comparing Institutional Peers",
x = "Award Year",
y = "Total NSF Funding (in 10K)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "bottom")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Add a column to highlight CSU
data_time_summary <- data_time_summary %>%
mutate(focus = ifelse(Names == "CSU", "Yes", "No"))
ggplot(data_time_summary, aes(x = AwardYear, y = Total_Awards_Scaled, group = Names, color = focus)) +
geom_line(size = 1) +
geom_point(size = 1.5) +
scale_y_continuous(labels = comma) +
labs(title = "NSF Funding Over Time",
subtitle = "Highlighting CSU vs. Institutional Peers",
x = "Award Year",
y = "Total NSF Funding (in 10K)") +
scale_color_manual(values = c("Yes" = "darkgreen", "No" = "gray")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "bottom")
#Second part of question
Create small-multiples:
library(ggplot2)
library(dplyr)
library(scales)
# Small Multiples: Time-Series of NSF Funding for Each Institution
ggplot(data_time_summary, aes(x = as.factor(AwardYear), y = Total_Awards_Scaled, group = Names, color = Names)) +
geom_line(size = 1) +
facet_wrap(~ Names, scales = "free_y") + # Creates separate plots per institution
xlab("Year") +
labs(title = "NSF Funding Over Time (Small Multiples)",
subtitle = "Comparing Institutional Funding Trends",
x = "Year",
y = "Total NSF Funding (in 10K)") +
theme_minimal() +
scale_color_manual(values = scales::hue_pal()(length(unique(data_time_summary$Names)))) + # Assign unique colors
theme(axis.text.x = element_text(size = 8, angle = 45, hjust = 1),
legend.position = "none") # Hide legend since each panel represents one institution