# Research Questions
# 1. What percentage of the Top 50 national universities are located in the Northeast?
# 2. Do top-ranked Northeast schools have the highest post-graduation employment rates?
# 3. How does the employment rate differ between top-tier public and private universities across the country?
# 4. Does the Northeast region maintain a higher ratio of international students compared to other U.S. regions within the Top 50?
# install.packages("RKaggle")
library(RKaggle)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Fetch the Forbes 2026 Top 50 U.S. universities dataset by its Kaggle slug.
# The echoed output below reports a 50-row, 8-column table.
universitydata <- RKaggle::get_dataset("nudratabbas/top-50-universities-in-the-u-s-forbes-2026")
## Rows: 50 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): University_Name, Institution_Type, State
## dbl (5): National_Rank, Founded_Year, Research_Impact_Score, Intl_Student_Ra...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the structure: column names, types, and the first few values.
str(universitydata)
## spc_tbl_ [50 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ University_Name : chr [1:50] "Massachusetts Institute of Technology (MIT)" "Columbia University" "Princeton University" "Stanford University" ...
## $ National_Rank : num [1:50] 1 2 3 4 5 6 7 8 9 10 ...
## $ Founded_Year : num [1:50] 1861 1754 1746 1891 1868 ...
## $ Institution_Type : chr [1:50] "Private" "Private" "Private" "Private" ...
## $ State : chr [1:50] "MA" "NY" "NJ" "CA" ...
## $ Research_Impact_Score: num [1:50] 100 95.9 99 99.5 98.9 98.9 45.2 96.3 97.2 96.1 ...
## $ Intl_Student_Ratio : num [1:50] 91.6 83.7 70 73.5 70.6 81.4 8.5 94.2 72.7 76.3 ...
## $ Employment_Rate : num [1:50] 96.2 92.1 94.5 97.8 91.4 95 89.1 90.5 93.3 94.8 ...
## - attr(*, "spec")=
## .. cols(
## .. University_Name = col_character(),
## .. National_Rank = col_double(),
## .. Founded_Year = col_double(),
## .. Institution_Type = col_character(),
## .. State = col_character(),
## .. Research_Impact_Score = col_double(),
## .. Intl_Student_Ratio = col_double(),
## .. Employment_Rate = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Per-column summary statistics (quartiles and means for the numeric columns).
summary(universitydata)
## University_Name National_Rank Founded_Year Institution_Type
## Length:50 Min. : 1.00 Min. :1636 Length:50
## Class :character 1st Qu.:13.25 1st Qu.:1818 Class :character
## Mode :character Median :25.50 Median :1858 Mode :character
## Mean :25.50 Mean :1844
## 3rd Qu.:37.75 3rd Qu.:1884
## Max. :50.00 Max. :1965
## State Research_Impact_Score Intl_Student_Ratio Employment_Rate
## Length:50 Min. : 28.50 Min. : 4.20 Min. :81.90
## Class :character 1st Qu.: 82.10 1st Qu.:14.90 1st Qu.:86.90
## Mode :character Median : 89.65 Median :35.80 Median :89.30
## Mean : 80.66 Mean :44.63 Mean :89.42
## 3rd Qu.: 96.25 3rd Qu.:73.95 3rd Qu.:92.33
## Max. :100.00 Max. :94.20 Max. :97.80
# Total number of missing cells across the whole table (0 means no NAs).
sum(is.na(universitydata))
## [1] 0
# State abbreviations grouped into the four U.S. Census Bureau regions.
# Together the vectors cover all 50 states plus DC exactly once, so no school
# can fall through to the "Other" bucket because of a missing state.
ne_states <- c("CT", "ME", "MA", "NH", "RI", "VT", "NY", "NJ", "PA")
west_states <- c(
  "CA", "WA", "OR", "CO", "HI", "AZ", "MT", "NV", "UT", "NM", "ID", "AK", "WY"
)
# "KY" was previously missing from the South.
south_states <- c(
  "TX", "FL", "GA", "NC", "VA", "SC", "LA", "OK", "AR", "MS", "TN", "WV", "AL",
  "MD", "DE", "DC", "KY"
)
midwest_states <- c(
  "ND", "SD", "NE", "KS", "MN", "IA", "MO", "WI", "IL", "MI", "IN", "OH"
)
# 1. What percentage of the Top 50 national universities are located in the Northeast?
# Share of the Top 50 located in the Northeast versus everywhere else.
# `ne_states` holds the six New England states plus NY, NJ and PA, so the
# group is labelled "Northeast"; "New England" would misdescribe the NY/NJ/PA
# schools and contradict the research question.
ne_summary <- universitydata %>%
  mutate(is_ne = if_else(State %in% ne_states, "Northeast", "Other")) %>%
  count(is_ne, name = "count") %>%
  mutate(percentage = count / sum(count) * 100)
ne_summary
## # A tibble: 2 × 3
## is_ne count percentage
## <chr> <int> <dbl>
## 1 Northeast 18 36
## 2 Other 32 64
# 2. Do top-ranked Northeast schools have the highest post-graduation employment rates?
# Tag each university with a Region derived from its State abbreviation.
# Conditions are checked top to bottom; any state found in none of the four
# region vectors is labelled "Other".
university_data <- mutate(
  universitydata,
  Region = case_when(
    is.element(State, ne_states) ~ "Northeast",
    is.element(State, west_states) ~ "West",
    is.element(State, south_states) ~ "South",
    is.element(State, midwest_states) ~ "Midwest",
    TRUE ~ "Other"
  )
)
# Per-region school count and mean employment rate, plus each region's share
# of all schools in the table (as a percentage).
regional_stats <- university_data %>%
  group_by(Region) %>%
  summarise(
    School_Count = n(),
    Employment = mean(Employment_Rate, na.rm = TRUE)
  ) %>%
  mutate(Percentage_of_Top_50 = School_Count / sum(School_Count) * 100)
print(regional_stats)
## # A tibble: 4 × 4
## Region School_Count Employment Percentage_of_Top_50
## <chr> <int> <dbl> <dbl>
## 1 Midwest 6 90.2 12
## 2 Northeast 18 90.5 36
## 3 South 13 88.6 26
## 4 West 13 88.4 26
# 3. How does the employment rate differ between top-tier public and private universities across the country?
# School count and mean employment rate for each institution type.
type_comparison <- university_data %>%
  group_by(Institution_Type) %>%
  summarise(
    School_Count = n(),
    EmploymentRate = mean(Employment_Rate, na.rm = TRUE)
  )
print(type_comparison)
## # A tibble: 2 × 3
## Institution_Type School_Count EmploymentRate
## <chr> <int> <dbl>
## 1 Private 35 90.5
## 2 Public 15 86.9
# 4. Does the Northeast region maintain a higher ratio of international students compared to other U.S. regions within the Top 50?
# Mean international-student ratio and school count per region, ordered from
# the highest average ratio to the lowest.
international_comparison <- university_data %>%
  group_by(Region) %>%
  summarise(
    Avg_International_Ratio = mean(Intl_Student_Ratio, na.rm = TRUE),
    School_Count = n()
  ) %>%
  arrange(desc(Avg_International_Ratio))
print(international_comparison)
## # A tibble: 4 × 3
## Region Avg_International_Ratio School_Count
## <chr> <dbl> <int>
## 1 Midwest 54.3 6
## 2 Northeast 48.9 18
## 3 West 45.3 13
## 4 South 33.6 13