HomeWork Assignment 2

Question 1:

Bar chart, scatterplot, and bubble plot

Data Preparation

Load the required R packages:

library(tidyverse)
library(scales)

library(ggrepel)
library(readxl)

library(readxl)

# Load the .xlsx file
data_all <- read_xlsx("/Users/vishnujordan/Downloads/NSF_Funding_by_Year.xlsx")

# If you have a .xls file instead, use read_xls()
# data_all <- read_xls("/Users/vishnujordan/Downloads/NSF_Funding_by_Year.xls")

# View the first few rows of the dataset
head(data_all)

## # A tibble: 6 × 6
##      ID Organization        AwardYear AWARD_SUM AWARD_COUNT AvarageAwardDuration
##   <dbl> <chr>                   <dbl>     <dbl>       <dbl>                <dbl>
## 1     1 3DGEO DEVELOPMENT …      2005   2162000           2                  4  
## 2     2 3DGEO DEVELOPMENT …      2004    200000           2                  0  
## 3     3 3F, LLC                  2008    365625           2                  1  
## 4     4 4WIND SCIENCE AND …      2010    328503           2                  0.5
## 5     5 Abilene Christian …      2011    131267           2                  2.5
## 6     6 ABIRAM            …      2005    145000           2                  1.5

# Uncomment the next line if running in RStudio (it won't work in R Markdown)
# View(data_all)

Create a subset for visualization using the filter() and select() functions from the dplyr package.

The filter() function is used to subset a data frame, retaining all rows that satisfy your conditions. The select() function selects variables in a data frame.

str_detect (from stringr) returns a logical vector with TRUE for each element of string that matches pattern and FALSE otherwise.

data_subset <- data_all %>%
  dplyr::select(Organization, AwardYear, AWARD_SUM, AWARD_COUNT, AvarageAwardDuration) %>%
  filter(AwardYear == "2012") %>%
  filter(
    str_detect(Organization, "Iowa State University") |
    str_detect(Organization, "Kansas State University") |
    str_detect(Organization, "Michigan State University") |
    str_detect(Organization, "North Carolina State University") |
    str_detect(Organization, "Oklahoma State University") |
    str_detect(Organization, "Oregon State University") |
    str_detect(Organization, "Purdue University") |
    str_detect(Organization, "Texas A & M University") |
    str_detect(Organization, "University of California, Davis") |
    str_detect(Organization, "University of Illinois, Urbana-Champaign") |
    str_detect(Organization, "University of Tennessee") |
    str_detect(Organization, "Virginia Polytechnic Institute and State University") |
    str_detect(Organization, "Washington State University") |
    str_detect(Organization, "Colorado State University")
  )

view(data_subset)

An error may occur because the select() function from the MASS package clashes with the select() function from the dplyr package. The easiest way to fix this error is to explicitly tell R to use the select() function from the dplyr package by using dplyr::select().

View a list of Organization: (converting them into factor instead of charactor)

data_subset$Organization <- as.factor(data_subset$Organization)
levels(data_subset$Organization)

##  [1] "Colorado State University"                          
##  [2] "Iowa State University"                              
##  [3] "Kansas State University"                            
##  [4] "Michigan State University"                          
##  [5] "North Carolina State University"                    
##  [6] "Oklahoma State University"                          
##  [7] "Oregon State University"                            
##  [8] "Purdue University"                                  
##  [9] "University of Tennessee Chattanooga"                
## [10] "University of Tennessee Knoxville"                  
## [11] "Virginia Polytechnic Institute and State University"
## [12] "Washington State University"

#unique(data_subset$Organization)
levels(data_subset$Organization)

##  [1] "Colorado State University"                          
##  [2] "Iowa State University"                              
##  [3] "Kansas State University"                            
##  [4] "Michigan State University"                          
##  [5] "North Carolina State University"                    
##  [6] "Oklahoma State University"                          
##  [7] "Oregon State University"                            
##  [8] "Purdue University"                                  
##  [9] "University of Tennessee Chattanooga"                
## [10] "University of Tennessee Knoxville"                  
## [11] "Virginia Polytechnic Institute and State University"
## [12] "Washington State University"

#I have used Approaching 3,, becuase it is scalable and more reliable for mutate and filtering (so ignoreing appraching 1 and 2)

Approach 3

Approach 1 and Approach 2 are not quite scalable. Look for a more efficient approach when the list of Organization gets longer:

library(dplyr)
library(stringr)

# Replace university names with simplified names
data_three_alternative <- data_subset %>%
  mutate(Names = str_replace(Organization, ".*Iowa.*", "ISU")) %>%
  mutate(Names = str_replace(Names, ".*Kansas.*", "KSU")) %>%
  mutate(Names = str_replace(Names, ".*Michigan.*", "MSU")) %>%
  mutate(Names = str_replace(Names, ".*Carolina.*", "NCSU")) %>%
  mutate(Names = str_replace(Names, ".*Oklahoma.*", "OKSU")) %>%
  mutate(Names = str_replace(Names, ".*Oregon.*", "OSU")) %>%
  mutate(Names = str_replace(Names, ".*Purdue.*", "Purdue")) %>%
  mutate(Names = str_replace(Names, ".*Texas.*", "TAMU")) %>%
  mutate(Names = str_replace(Names, ".*California.*", "CU-D")) %>%
  mutate(Names = str_replace(Names, ".*Illinois.*", "UIUC")) %>%
  mutate(Names = str_replace(Names, ".*Tennessee.*", "UTen")) %>%
  mutate(Names = str_replace(Names, ".*Virginia.*", "VPISU")) %>%
  mutate(Names = str_replace(Names, ".*Washington.*", "WSU")) %>%
  mutate(Names = str_replace(Names, ".*Colorado.*", "CSU")) %>%

  # Filter only specific values in the "Three_Names" column
filter(Names %in% c("ISU", "KSU", "MSU", "NCSU", "OKSU", "OSU", "Purdue", 
                      "TAMU", "CU-D", "UIUC", "UTen", "VPISU", "WSU", "CSU")) %>%

  droplevels()

# View unique names
levels(as.factor(data_three_alternative$Names))

##  [1] "CSU"    "ISU"    "KSU"    "MSU"    "NCSU"   "OKSU"   "OSU"    "Purdue"
##  [9] "UTen"   "VPISU"  "WSU"

Create a summary for plotting:

data_three_alternative_summary <- data_three_alternative %>% 
  group_by(Names) %>%
  summarise(ts=sum(AWARD_SUM), tc=sum(AWARD_COUNT))
data_three_alternative_summary

## # A tibble: 11 × 3
##    Names        ts    tc
##    <chr>     <dbl> <dbl>
##  1 CSU    11592315    41
##  2 ISU    16738011    65
##  3 KSU     8839116    23
##  4 MSU    17351663    58
##  5 NCSU   31440526    91
##  6 OKSU    5331516    21
##  7 OSU    23180107    86
##  8 Purdue 33991966   121
##  9 UTen   13646485    51
## 10 VPISU  19609620    78
## 11 WSU    14509261    43

Verifying the data

data_three_alternative_summary <- data_three_alternative %>%
  group_by(Names) %>%
  summarise(ts = sum(AWARD_SUM, na.rm = TRUE), 
            tc = sum(AWARD_COUNT, na.rm = TRUE))
          
library(ggplot2)

ggplot(data_three_alternative_summary, aes(x = Names, y = ts, fill = Names)) +
  geom_col() +
  theme_minimal() +
  labs(title = "Total Award Sum by University",
       x = "University",
       y = "Total Award Sum")

#### Data Visualization

colnames(data_three_alternative_summary)

## [1] "Names" "ts"    "tc"

Bar chart

head(data_three_alternative_summary)

## # A tibble: 6 × 3
##   Names       ts    tc
##   <chr>    <dbl> <dbl>
## 1 CSU   11592315    41
## 2 ISU   16738011    65
## 3 KSU    8839116    23
## 4 MSU   17351663    58
## 5 NCSU  31440526    91
## 6 OKSU   5331516    21

ggplot(data_three_alternative_summary, aes(y = reorder(Names, ts), x = ts)) +
  geom_col()+
  scale_x_continuous(labels = comma)+
  labs(title = "NSF Funding (Year 2012)",
       subtitle = "CSU vs. Peer Orgs",
       x = "Total Sum",
       y = "Orgs") +
   theme(aspect.ratio = 1/2.5)

library(dplyr)
library(ggplot2)
library(scales)

# Use the dataset that already has `Names`
data_filtered <- data_three_alternative %>%
  filter(AwardYear == "2012")  # Keep only 2012 data

# Summarize total awards & award count
data_summary <- data_filtered %>%
  group_by(Names) %>%
  summarise(
    Total_Awards = sum(AWARD_SUM, na.rm = TRUE),
    Award_Count = sum(AWARD_COUNT, na.rm = TRUE)
  )

# Add a column to highlight whether its csu or not (to filter later)
data_summary <- data_summary %>%
  mutate(Highlight = ifelse(Names == "CSU", "Yes", "No"))

# Bar Chart: Highlighting CSU + Showing Award Count as Labels
ggplot(data_summary, aes(x = reorder(Names, -Total_Awards), y = Total_Awards/1000, fill = Highlight)) +
  geom_col() +
  geom_text(aes(label = Award_Count), vjust = -0.5, size = 4, color = "black") +  # Display award count on top of bars
  scale_fill_manual(values = c("Yes" = "yellow", "No" = "gray")) +
  scale_y_continuous(labels = comma) +
  labs(title = "Total NSF Awards (2012)",
       subtitle = "CSU vs. Institutional Peers",
       x = "Institution",
       y = "Award Amount (in thousands)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels

ggplot(data_three_alternative_summary, aes(reorder(Names, -ts), ts/1000)) +
  geom_col() +
  ylab("Sales (thousands)") +
  xlab("Brand") +
  theme(aspect.ratio = 1/1.1)

Scatterplot

# Use the dataset that already has `Names`
data_filtered <- data_three_alternative %>%
  filter(AwardYear == "2012")  # Keep only 2012 data

# Summarize total awards & award count
data_summary <- data_filtered %>%
  group_by(Names) %>%
  summarise(
    Total_Awards = sum(AWARD_SUM, na.rm = TRUE),
    Award_Count = sum(AWARD_COUNT, na.rm = TRUE)
  )

# Add a column to highlight CSU
data_summary <- data_summary %>%
  mutate(Highlight = ifelse(Names == "CSU", "Yes", "No"))

# Scatterplot: Highlighting CSU & Adding Labels
ggplot(data_summary, aes(x = Award_Count, y = Total_Awards, color = Highlight)) +
  geom_point(size = 4) +  # Scatterplot points
  geom_text_repel(aes(label = Names), size = 4, fontface = "bold") +  # Institution names
  scale_y_continuous(labels = comma) +
  scale_x_continuous(labels = comma) +
  scale_color_manual(values = c("Yes" = "blue", "No" = "darkgray")) +  # Highlight CSU
  labs(title = "NSF Awards vs. Award Count (2012)",
       subtitle = "CSU vs. Institutional Peers",
       x = "Total Award Count",
       y = "Total NSF Funding") +
  theme_minimal()

Bubble plot

data_filtered <- data_three_alternative %>%
  filter(AwardYear == "2012")  # Keep only 2012 data

# Summarize total awards & award count
data_summary <- data_filtered %>%
  group_by(Names) %>%
  summarise(
    Total_Awards = sum(AWARD_SUM, na.rm = TRUE),
    Award_Count = sum(AWARD_COUNT, na.rm = TRUE)
  )

# Scale down award values for better readability
data_summary <- data_summary %>%
  mutate(Total_Awards_Scaled = Total_Awards / 10000)  # Dividing by 10,000

# Add a column to highlight CSU
data_summary <- data_summary %>%
  mutate(Highlight = ifelse(Names == "CSU", "Yes", "No"))

# Bubble Plot: Award Count (X), Award Amount (Y), Bubble Size = Award Amount
ggplot(data_summary, aes(x = Award_Count, y = Total_Awards_Scaled, size = Total_Awards_Scaled, color = Highlight)) +
  geom_point(alpha = 0.6) +  # Transparent bubbles
 # geom_text(aes(label = round(Total_Awards_Scaled, 1)), color = "black", fontface = "bold", size = 4, vjust = -1.2) +  # Award value on top
  geom_text_repel(aes(label = Names), size = 4) +  # Institution names
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma) +
  labs(title = "NSF Funding: Award Amount vs. Award Count (2012)",
       subtitle = "Bubble size represents total award amount (scaled by 10K)",
       x = "Total Award Count",
       y = "Total NSF Funding (in 10K)") +
  theme_minimal() +
  theme(legend.position = "top") +
  scale_color_manual(values = c("Yes" = "darkblue", "No" = "darkgray"))  # Highlight CSU

Question 2: Time series

#First part of question 2

Data Preparation

Create a different data set for plotting time series:

library(readxl)

# Load the .xlsx file
data_all <- read_xlsx("/Users/vishnujordan/Downloads/NSF_Funding_by_Year.xlsx")



# View the first few rows of the dataset
head(data_all)

## # A tibble: 6 × 6
##      ID Organization        AwardYear AWARD_SUM AWARD_COUNT AvarageAwardDuration
##   <dbl> <chr>                   <dbl>     <dbl>       <dbl>                <dbl>
## 1     1 3DGEO DEVELOPMENT …      2005   2162000           2                  4  
## 2     2 3DGEO DEVELOPMENT …      2004    200000           2                  0  
## 3     3 3F, LLC                  2008    365625           2                  1  
## 4     4 4WIND SCIENCE AND …      2010    328503           2                  0.5
## 5     5 Abilene Christian …      2011    131267           2                  2.5
## 6     6 ABIRAM            …      2005    145000           2                  1.5

library(ggplot2)
library(dplyr)
library(stringr)
library(scales)
library(readxl)

# Load the dataset
data_all <- read_xlsx("/Users/vishnujordan/Downloads/NSF_Funding_by_Year.xlsx")

# Assign short names to universities
data_time <- data_all %>%
  filter(str_detect(Organization, paste(
    c("Iowa State University", "Kansas State University", "Michigan State University",
      "North Carolina State University", "Oklahoma State University", "Oregon State University",
      "Purdue University", "Texas A & M University", "University of California, Davis",
      "University of Illinois, Urbana-Champaign", "University of Tennessee",
      "Virginia Polytechnic Institute and State University", "Washington State University",
      "Colorado State University"), collapse = "|"))) %>%
  mutate(Names = case_when(
    str_detect(Organization, "Iowa State") ~ "ISU",
    str_detect(Organization, "Kansas State") ~ "KSU",
    str_detect(Organization, "Michigan State") ~ "MSU",
    str_detect(Organization, "North Carolina State") ~ "NCSU",
    str_detect(Organization, "Oklahoma State") ~ "OKSU",
    str_detect(Organization, "Oregon State") ~ "OSU",
    str_detect(Organization, "Purdue") ~ "Purdue",
    str_detect(Organization, "Texas A & M") ~ "TAMU",
    str_detect(Organization, "California, Davis") ~ "CU-D",
    str_detect(Organization, "Illinois, Urbana-Champaign") ~ "UIUC",
    str_detect(Organization, "Tennessee") ~ "UTen",
    str_detect(Organization, "Virginia Polytechnic") ~ "VPISU",
    str_detect(Organization, "Washington State") ~ "WSU",
    str_detect(Organization, "Colorado State") ~ "CSU",
    TRUE ~ "Other"
  )) %>%
  filter(Names != "Other")  # Keep only relevant universities
# Group by Award Year and University
data_time_summary <- data_time %>%
  group_by(AwardYear, Names) %>%
  summarise(
    Total_Awards = sum(AWARD_SUM, na.rm = TRUE),
    Award_Count = sum(AWARD_COUNT, na.rm = TRUE)
  ) %>%
  mutate(Total_Awards_Scaled = Total_Awards / 10000)  # Scale for readability

# Convert AwardYear to numeric for correct plotting
data_time_summary <- data_time_summary %>%
  mutate(AwardYear = as.numeric(AwardYear))
ggplot(data_time_summary, aes(x = AwardYear, y = Total_Awards_Scaled, group = Names, color = Names)) + 
  geom_line(size = .7) +  
  geom_point(size = .5) +  
  scale_y_continuous(labels = comma) +
  labs(title = "NSF Funding Over Time",
       subtitle = "Comparing Institutional Peers",
       x = "Award Year",
       y = "Total NSF Funding (in 10K)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "bottom")

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Add a column to highlight CSU
data_time_summary <- data_time_summary %>%
  mutate(focus = ifelse(Names == "CSU", "Yes", "No"))

ggplot(data_time_summary, aes(x = AwardYear, y = Total_Awards_Scaled, group = Names, color = focus)) + 
  geom_line(size = 1) +
  geom_point(size = 1.5) +
  scale_y_continuous(labels = comma) +
  labs(title = "NSF Funding Over Time",
       subtitle = "Highlighting CSU vs. Institutional Peers",
       x = "Award Year",
       y = "Total NSF Funding (in 10K)") +
  scale_color_manual(values = c("Yes" = "darkgreen", "No" = "gray")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "bottom")

#Second part of question

Create small-multiples:

library(ggplot2)
library(dplyr)
library(scales)

# Small Multiples: Time-Series of NSF Funding for Each Institution
ggplot(data_time_summary, aes(x = as.factor(AwardYear), y = Total_Awards_Scaled, group = Names, color = Names)) + 
  geom_line(size = 1) +  
  facet_wrap(~ Names, scales = "free_y") +  # Creates separate plots per institution
  xlab("Year") + 
  labs(title = "NSF Funding Over Time (Small Multiples)",
       subtitle = "Comparing Institutional Funding Trends",
       x = "Year",
       y = "Total NSF Funding (in 10K)") +
  theme_minimal() +
  scale_color_manual(values = scales::hue_pal()(length(unique(data_time_summary$Names)))) +  # Assign unique colors
  theme(axis.text.x = element_text(size = 8, angle = 45, hjust = 1),
        legend.position = "none")  # Hide legend since each panel represents one institution

HomeWork Assignment 2 - Jawahar

Question 1:

Bar chart, scatterplot, and bubble plot

Data Preparation

Question 2: Time series

Data Preparation