# Research Questions
# 1. What percentage of the Top 50 national universities are located in the Northeast?
# 2. Do top-ranked Northeast schools have the highest post-graduation employment rates?
# 3. How does the employment rate differ between top-tier public and private universities across the country?
# 4. Does the Northeast region maintain a higher ratio of international students compared to other U.S. regions within the Top 50?
# install.packages("RKaggle")
library(RKaggle)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Fetch the "Top 50 Universities in the U.S. (Forbes 2026)" dataset from Kaggle
# by its dataset slug. The read message below reports 50 rows and 8 columns.
# NOTE(review): presumably needs network access and Kaggle credentials — confirm.
universitydata <- RKaggle::get_dataset("nudratabbas/top-50-universities-in-the-u-s-forbes-2026")
## Rows: 50 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): University_Name, Institution_Type, State
## dbl (5): National_Rank, Founded_Year, Research_Impact_Score, Intl_Student_Ra...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the structure of the data: column names, types, and leading values.
str(universitydata)
## spc_tbl_ [50 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ University_Name      : chr [1:50] "Massachusetts Institute of Technology (MIT)" "Columbia University" "Princeton University" "Stanford University" ...
##  $ National_Rank        : num [1:50] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Founded_Year         : num [1:50] 1861 1754 1746 1891 1868 ...
##  $ Institution_Type     : chr [1:50] "Private" "Private" "Private" "Private" ...
##  $ State                : chr [1:50] "MA" "NY" "NJ" "CA" ...
##  $ Research_Impact_Score: num [1:50] 100 95.9 99 99.5 98.9 98.9 45.2 96.3 97.2 96.1 ...
##  $ Intl_Student_Ratio   : num [1:50] 91.6 83.7 70 73.5 70.6 81.4 8.5 94.2 72.7 76.3 ...
##  $ Employment_Rate      : num [1:50] 96.2 92.1 94.5 97.8 91.4 95 89.1 90.5 93.3 94.8 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   University_Name = col_character(),
##   ..   National_Rank = col_double(),
##   ..   Founded_Year = col_double(),
##   ..   Institution_Type = col_character(),
##   ..   State = col_character(),
##   ..   Research_Impact_Score = col_double(),
##   ..   Intl_Student_Ratio = col_double(),
##   ..   Employment_Rate = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Per-column summary: min/quartiles/mean/max for numeric columns, and
# length/class/mode for character columns.
summary(universitydata)
##  University_Name    National_Rank    Founded_Year  Institution_Type  
##  Length:50          Min.   : 1.00   Min.   :1636   Length:50         
##  Class :character   1st Qu.:13.25   1st Qu.:1818   Class :character  
##  Mode  :character   Median :25.50   Median :1858   Mode  :character  
##                     Mean   :25.50   Mean   :1844                     
##                     3rd Qu.:37.75   3rd Qu.:1884                     
##                     Max.   :50.00   Max.   :1965                     
##     State           Research_Impact_Score Intl_Student_Ratio Employment_Rate
##  Length:50          Min.   : 28.50        Min.   : 4.20      Min.   :81.90  
##  Class :character   1st Qu.: 82.10        1st Qu.:14.90      1st Qu.:86.90  
##  Mode  :character   Median : 89.65        Median :35.80      Median :89.30  
##                     Mean   : 80.66        Mean   :44.63      Mean   :89.42  
##                     3rd Qu.: 96.25        3rd Qu.:73.95      3rd Qu.:92.33  
##                     Max.   :100.00        Max.   :94.20      Max.   :97.80
# Total number of missing cells across the whole dataset.
universitydata %>%
  is.na() %>%
  sum()
## [1] 0
# State abbreviations (plus DC) for the four regions used in the analysis
# below, following the U.S. Census Bureau's four-region split. Together the
# vectors cover all 50 states and DC with no overlaps, so no school should fall
# through to the "Other" bucket when regions are assigned.
ne_states <- c("CT", "ME", "MA", "NH", "RI", "VT", "NY", "NJ", "PA")
west_states <- c(
  "CA", "WA", "OR", "CO", "HI", "AZ", "MT", "NV", "UT", "NM", "ID", "AK", "WY"
)
# Includes "KY", which was previously missing from every region vector.
south_states <- c(
  "TX", "FL", "GA", "NC", "VA", "SC", "LA", "OK", "AR", "MS", "TN", "WV", "AL",
  "MD", "DE", "DC", "KY"
)
midwest_states <- c(
  "ND", "SD", "NE", "KS", "MN", "IA", "MO", "WI", "IL", "MI", "IN", "OH"
)
# 1. What percentage of the Top 50 national universities are located in the Northeast?
# Count and percentage of schools whose state is in `ne_states` versus all
# other schools.
ne_summary <- universitydata %>%
  # `ne_states` also contains NY, NJ and PA, which are not part of New England,
  # so the group is labelled "Northeast" to match the research question.
  mutate(is_ne = if_else(State %in% ne_states, "Northeast", "Other")) %>%
  count(is_ne, name = "count") %>%
  mutate(percentage = (count / sum(count)) * 100)

ne_summary
## # A tibble: 2 × 3
##   is_ne     count percentage
##   <chr>     <int>      <dbl>
## 1 Northeast    18         36
## 2 Other        32         64
# 2. Do top-ranked Northeast schools have the highest post-graduation employment rates?
# Add a `Region` column derived from each school's state abbreviation. States
# that appear in none of the four region vectors are labelled "Other".
university_data <- mutate(
  universitydata,
  Region = case_when(
    State %in% ne_states ~ "Northeast",
    State %in% west_states ~ "West",
    State %in% south_states ~ "South",
    State %in% midwest_states ~ "Midwest",
    TRUE ~ "Other"
  )
)

# For each region: number of schools, mean employment rate, and the region's
# share of all schools in the table (as a percentage).
regional_stats <- local({
  per_region <- summarise(
    group_by(university_data, Region),
    School_Count = n(),
    Employment = mean(Employment_Rate, na.rm = TRUE)
  )
  mutate(
    per_region,
    Percentage_of_Top_50 = (School_Count / sum(School_Count)) * 100
  )
})

print(regional_stats)
## # A tibble: 4 × 4
##   Region    School_Count Employment Percentage_of_Top_50
##   <chr>            <int>      <dbl>                <dbl>
## 1 Midwest              6       90.2                   12
## 2 Northeast           18       90.5                   36
## 3 South               13       88.6                   26
## 4 West                13       88.4                   26
# 3. How does the employment rate differ between top-tier public and private universities across the country?
# Number of schools and mean employment rate for each institution type.
type_comparison <- local({
  by_type <- group_by(university_data, Institution_Type)
  summarise(
    by_type,
    School_Count = n(),
    EmploymentRate = mean(Employment_Rate, na.rm = TRUE)
  )
})

print(type_comparison)
## # A tibble: 2 × 3
##   Institution_Type School_Count EmploymentRate
##   <chr>                   <int>          <dbl>
## 1 Private                    35           90.5
## 2 Public                     15           86.9
# 4. Does the Northeast region maintain a higher ratio of international students compared to other U.S. regions within the Top 50?

# Mean international-student ratio and school count per region, ordered from
# the highest average ratio to the lowest.
international_comparison <- local({
  intl_by_region <- summarise(
    group_by(university_data, Region),
    Avg_International_Ratio = mean(Intl_Student_Ratio, na.rm = TRUE),
    School_Count = n()
  )
  arrange(intl_by_region, desc(Avg_International_Ratio))
})

print(international_comparison)
## # A tibble: 4 × 3
##   Region    Avg_International_Ratio School_Count
##   <chr>                       <dbl>        <int>
## 1 Midwest                      54.3            6
## 2 Northeast                    48.9           18
## 3 West                         45.3           13
## 4 South                        33.6           13