# Set global chunk options for the document
knitr::opts_chunk$set(
  echo = TRUE,      # Show code in the output
  warning = FALSE,  # Suppress warnings
  message = FALSE   # Suppress messages
)

Morgan State University

Department of Information Science & Systems

Fall 2024

INSS 615: Data Wrangling for Visualization

Name: AMOLE ADEYEMI

Due: Dec 1, 2024 (Sunday)

Questions

A. Scrape the College Ranked by Acceptance Rate dataset available at this link: https://www.oedb.org/rankings/acceptance-rate/#table-rankings and select the first 9 columns [Rank, School, Student to Faculty Ratio, Graduation Rate, Retention Rate, Acceptance Rate, Enrollment Rate, Institutional Aid Rate, and Default Rate] as the dataset for this assignment. [20 Points]

Hint: There are 6 pages of data, so you may want to use a for loop to automate the scraping process and combine the data from all 6 pages. This is just a suggestion—you are free to create the dataset without automating the web scrapping process. # Load necessary libraries

Solution:

# Load necessary libraries
library(rvest) 
library(tidyverse)
library(rvest)
library(dplyr)
library(stringr)
library(prettydoc) # for rpub publishing 

#Extract data and combine pages 1 to 6 #Define the URL for the first page you want to scrape and scrape

#extract data and combine pages 1 t0 6

# Define the URL for the first page you want to scrape
url_1 <- "https://www.oedb.org/rankings/acceptance-rate/page/1/#table-rankings"

# Read the HTML content from the URL
webpage_1 <- read_html(url_1)

# Extract the data table from the first page
table_1 <- webpage_1 %>%
  html_nodes("table") %>%
  html_table(fill = TRUE)

# Since the table is returned as a list, we extract the first table
page_1 <- table_1[[1]]

# View the scraped data from page 1
print(page_1)
## # A tibble: 100 × 15
##     Rank School        Student to Faculty R…¹ `Graduation Rate` `Retention Rate`
##    <int> <chr>         <chr>                  <chr>             <chr>           
##  1     1 Harvard Univ… 7 to 1                 98%               98%             
##  2     2 Yale Univers… 6 to 1                 97%               99%             
##  3     3 University o… 6 to 1                 95%               98%             
##  4     4 Johns Hopkin… 10 to 1                94%               97%             
##  5     5 Cornell Univ… 9 to 1                 93%               97%             
##  6     6 Tufts Univer… 9 to 1                 93%               97%             
##  7     7 University o… 17 to 1                92%               96%             
##  8     8 University o… 16 to 1                91%               96%             
##  9     9 Georgetown U… 11 to 1                94%               96%             
## 10    10 Washington U… 8 to 1                 93%               96%             
## # ℹ 90 more rows
## # ℹ abbreviated name: ¹​`Student to Faculty Ratio`
## # ℹ 10 more variables: `Acceptance Rate` <chr>, `Enrollment Rate` <chr>,
## #   `Institutional Aid Rate` <chr>, `Default Rate` <chr>, `` <chr>, `` <chr>,
## #   `` <chr>, `` <chr>, `` <chr>, `` <chr>

# Define the URL for the second page you want to scrape and scrape

# Define the URL for the second page you want to scrape
url_2 <- "https://www.oedb.org/rankings/acceptance-rate/page/2/#table-rankings"

# Read the HTML content from the URL
webpage_2 <- read_html(url_2)

# Extract the data table from the second page
table_2 <- webpage_2 %>%
  html_nodes("table") %>%
  html_table(fill = TRUE)

# Since the table is returned as a list, we extract the first table
page_2 <- table_2[[1]]

# View the scraped data from page 2
print(page_2)
## # A tibble: 100 × 15
##     Rank School        Student to Faculty R…¹ `Graduation Rate` `Retention Rate`
##    <int> <chr>         <chr>                  <chr>             <chr>           
##  1   101 Villanova Un… 12 to 1                90%               96%             
##  2   102 Brigham Youn… 20 to 1                80%               86%             
##  3   103 Fashion Inst… 20 to 1                70%               92%             
##  4   104 Howard Unive… 11 to 1                61%               89%             
##  5   105 University o… 30 to 1                70%               89%             
##  6   106 Augustana Co… 12 to 1                76%               86%             
##  7   107 North Park U… 11 to 1                52%               75%             
##  8   108 University o… 8 to 1                 41%               75%             
##  9   109 Emerson Coll… 13 to 1                80%               100%            
## 10   110 Worcester Po… 13 to 1                85%               96%             
## # ℹ 90 more rows
## # ℹ abbreviated name: ¹​`Student to Faculty Ratio`
## # ℹ 10 more variables: `Acceptance Rate` <chr>, `Enrollment Rate` <chr>,
## #   `Institutional Aid Rate` <chr>, `Default Rate` <chr>, `` <chr>, `` <chr>,
## #   `` <chr>, `` <chr>, `` <chr>, `` <chr>

# Define the URL for the third page you want to scrape and scrape

# Define the URL for the third page you want to scrape
url_3 <- "https://www.oedb.org/rankings/acceptance-rate/page/3/#table-rankings"

# Read the HTML content from the URL
webpage_3 <- read_html(url_3)

# Extract the data table from the second page
table_3 <- webpage_3 %>%
  html_nodes("table") %>%
  html_table(fill = TRUE)

# Since the table is returned as a list, we extract the first table
page_3 <- table_3[[1]]

# View the scraped data from page 3
print(page_3)
## # A tibble: 100 × 15
##     Rank School        Student to Faculty R…¹ `Graduation Rate` `Retention Rate`
##    <int> <chr>         <chr>                  <chr>             <chr>           
##  1   201 Sacred Heart… 15 to 1                64%               83%             
##  2   202 Nova Southea… 16 to 1                44%               80%             
##  3   203 Saint Mary-o… 6 to 1                 50%               74%             
##  4   204 University o… 19 to 1                63%               86%             
##  5   205 Augsburg Col… 12 to 1                56%               79%             
##  6   206 Cox College   9 to 1                 99%               100%            
##  7   207 Grace Univer… 12 to 1                46%               93%             
##  8   208 University o… 17 to 1                56%               77%             
##  9   209 Oklahoma Chr… 13 to 1                49%               79%             
## 10   210 Multnomah Un… 13 to 1                55%               70%             
## # ℹ 90 more rows
## # ℹ abbreviated name: ¹​`Student to Faculty Ratio`
## # ℹ 10 more variables: `Acceptance Rate` <chr>, `Enrollment Rate` <chr>,
## #   `Institutional Aid Rate` <chr>, `Default Rate` <chr>, `` <chr>, `` <chr>,
## #   `` <chr>, `` <chr>, `` <chr>, `` <chr>

# Define the URL for the fourth page you want to scrape and scrape

# Define the URL for the fourth page you want to scrape
url_4 <- "https://www.oedb.org/rankings/acceptance-rate/page/4/#table-rankings"

# Read the HTML content from the URL
webpage_4 <- read_html(url_4)

# Extract the data table from the second page
table_4 <- webpage_4 %>%
  html_nodes("table") %>%
  html_table(fill = TRUE)

# Since the table is returned as a list, we extract the first table
page_4 <- table_4[[1]]

# View the scraped data from page 4
print(page_4)
## # A tibble: 100 × 15
##     Rank School        Student to Faculty R…¹ `Graduation Rate` `Retention Rate`
##    <int> <chr>         <chr>                  <chr>             <chr>           
##  1   301 Lesley Unive… 10 to 1                54%               80%             
##  2   302 Wentworth In… 15 to 1                67%               84%             
##  3   303 Minnesota St… 23 to 1                49%               74%             
##  4   304 St Catherine… 10 to 1                57%               86%             
##  5   305 Ithaca Colle… 11 to 1                76%               86%             
##  6   306 Manhattan Co… 13 to 1                72%               84%             
##  7   307 The New Scho… 10 to 1                65%               81%             
##  8   308 Queens Unive… 9 to 1                 53%               74%             
##  9   309 Cedar Crest … 10 to 1                58%               79%             
## 10   310 Texas A & M … 20 to 1                79%               90%             
## # ℹ 90 more rows
## # ℹ abbreviated name: ¹​`Student to Faculty Ratio`
## # ℹ 10 more variables: `Acceptance Rate` <chr>, `Enrollment Rate` <chr>,
## #   `Institutional Aid Rate` <chr>, `Default Rate` <chr>, `` <chr>, `` <chr>,
## #   `` <chr>, `` <chr>, `` <chr>, `` <chr>

# Define the URL for the fifth page you want to scrape and scrape

# Define the URL for the Five  page you want to scrape
url_5 <- "https://www.oedb.org/rankings/acceptance-rate/page/5/#table-rankings"

# Read the HTML content from the URL
webpage_5 <- read_html(url_5)

# Extract the data table from the second page
table_5 <- webpage_5 %>%
  html_nodes("table") %>%
  html_table(fill = TRUE)

# Since the table is returned as a list, we extract the first table
page_5 <- table_5[[1]]

# View the scraped data from page 5
print(page_5)
## # A tibble: 100 × 15
##     Rank School        Student to Faculty R…¹ `Graduation Rate` `Retention Rate`
##    <int> <chr>         <chr>                  <chr>             <chr>           
##  1   401 Drexel Unive… 10 to 1                68%               84%             
##  2   402 La Salle Uni… 12 to 1                63%               78%             
##  3   403 Moravian Col… 12 to 1                69%               80%             
##  4   404 University o… 22 to 1                61%               80%             
##  5   405 Georgia Coll… 17 to 1                60%               86%             
##  6   406 Assumption C… 12 to 1                73%               83%             
##  7   407 University o… 21 to 1                67%               83%             
##  8   408 Seton Hall U… 13 to 1                63%               84%             
##  9   409 Nazareth Col… 9 to 1                 73%               84%             
## 10   410 The Universi… 16 to 1                56%               78%             
## # ℹ 90 more rows
## # ℹ abbreviated name: ¹​`Student to Faculty Ratio`
## # ℹ 10 more variables: `Acceptance Rate` <chr>, `Enrollment Rate` <chr>,
## #   `Institutional Aid Rate` <chr>, `Default Rate` <chr>, `` <chr>, `` <chr>,
## #   `` <chr>, `` <chr>, `` <chr>, `` <chr>
# Define the URL for the Five  page you want to scrape
url_6 <- "https://www.oedb.org/rankings/acceptance-rate/page/6/#table-rankings"

# Read the HTML content from the URL
webpage_6 <- read_html(url_6)

# Extract the data table from the second page
table_6 <- webpage_6 %>%
  html_nodes("table") %>%
  html_table(fill = TRUE)

# Since the table is returned as a list, we extract the first table
page_6 <- table_6[[1]]

# View the scraped data from page 6
print(page_6)
## # A tibble: 71 × 15
##     Rank School        Student to Faculty R…¹ `Graduation Rate` `Retention Rate`
##    <int> <chr>         <chr>                  <chr>             <chr>           
##  1   501 California U… 21 to 1                50%               77%             
##  2   502 University o… 21 to 1                65%               80%             
##  3   503 University o… 23 to 1                46%               75%             
##  4   504 Fort Hays St… 16 to 1                43%               70%             
##  5   505 Bryan Colleg… 9 to 1                 79%               86%             
##  6   506 Valley City … 11 to 1                41%               71%             
##  7   507 University o… 18 to 1                62%               88%             
##  8   508 Texas Woman'… 18 to 1                41%               76%             
##  9   509 West Virgini… 20 to 1                57%               79%             
## 10   510 Iowa State U… 19 to 1                71%               87%             
## # ℹ 61 more rows
## # ℹ abbreviated name: ¹​`Student to Faculty Ratio`
## # ℹ 10 more variables: `Acceptance Rate` <chr>, `Enrollment Rate` <chr>,
## #   `Institutional Aid Rate` <chr>, `Default Rate` <chr>, `` <chr>, `` <chr>,
## #   `` <chr>, `` <chr>, `` <chr>, `` <chr>
# Combine both pages into one dataset by appending (row binding)
final_data <- rbind(page_1, page_2 ,page_3,page_4 ,page_5,page_6)

#select the needed columns
final_data <- final_data[, 1:9]



# View the combined data
print(final_data)
## # A tibble: 571 × 9
##     Rank School        Student to Faculty R…¹ `Graduation Rate` `Retention Rate`
##    <int> <chr>         <chr>                  <chr>             <chr>           
##  1     1 Harvard Univ… 7 to 1                 98%               98%             
##  2     2 Yale Univers… 6 to 1                 97%               99%             
##  3     3 University o… 6 to 1                 95%               98%             
##  4     4 Johns Hopkin… 10 to 1                94%               97%             
##  5     5 Cornell Univ… 9 to 1                 93%               97%             
##  6     6 Tufts Univer… 9 to 1                 93%               97%             
##  7     7 University o… 17 to 1                92%               96%             
##  8     8 University o… 16 to 1                91%               96%             
##  9     9 Georgetown U… 11 to 1                94%               96%             
## 10    10 Washington U… 8 to 1                 93%               96%             
## # ℹ 561 more rows
## # ℹ abbreviated name: ¹​`Student to Faculty Ratio`
## # ℹ 4 more variables: `Acceptance Rate` <chr>, `Enrollment Rate` <chr>,
## #   `Institutional Aid Rate` <chr>, `Default Rate` <chr>
View(final_data)

B. You are going to need the dataset created in Question A to answer the following questions. There are 16 questions each carrying 5 points:

  1. Replace the missing values “N/A” in the dataset with NA.

Solution:

 #Ensure consistent data types across all pages
  final_data <- final_data %>%
    mutate(
      Rank = as.character(Rank),
      School = as.character(School),
      `Student to Faculty Ratio` = as.character(`Student to Faculty Ratio`),
      `Graduation Rate` = as.character(`Graduation Rate`),
      `Retention Rate` = as.character(`Retention Rate`),
      `Acceptance Rate` = as.character(`Acceptance Rate`),
      `Enrollment Rate` = as.character(`Enrollment Rate`),
      `Institutional Aid Rate` = as.character(`Institutional Aid Rate`),
      `Default Rate` = as.character(`Default Rate`)
    ) %>%
                      mutate(across(everything(), ~ na_if(., "N/A"))) # Replace "N/A" with "NA" (empty)

# View the final dataset
print(final_data)
## # A tibble: 571 × 9
##    Rank  School        Student to Faculty R…¹ `Graduation Rate` `Retention Rate`
##    <chr> <chr>         <chr>                  <chr>             <chr>           
##  1 1     Harvard Univ… 7 to 1                 98%               98%             
##  2 2     Yale Univers… 6 to 1                 97%               99%             
##  3 3     University o… 6 to 1                 95%               98%             
##  4 4     Johns Hopkin… 10 to 1                94%               97%             
##  5 5     Cornell Univ… 9 to 1                 93%               97%             
##  6 6     Tufts Univer… 9 to 1                 93%               97%             
##  7 7     University o… 17 to 1                92%               96%             
##  8 8     University o… 16 to 1                91%               96%             
##  9 9     Georgetown U… 11 to 1                94%               96%             
## 10 10    Washington U… 8 to 1                 93%               96%             
## # ℹ 561 more rows
## # ℹ abbreviated name: ¹​`Student to Faculty Ratio`
## # ℹ 4 more variables: `Acceptance Rate` <chr>, `Enrollment Rate` <chr>,
## #   `Institutional Aid Rate` <chr>, `Default Rate` <chr>
  1. Convert percentage columns (e.g., Graduation Rate) to numeric format.

Solution:

  final_data <- final_data %>%
                              mutate(
              `Graduation Rate` = as.numeric(str_remove(`Graduation Rate`, "%")) / 100,
              `Retention Rate` = as.numeric(str_remove(`Retention Rate`, "%")) / 100,
              `Acceptance Rate` = as.numeric(str_remove(`Acceptance Rate`, "%")) / 100,
              `Enrollment Rate` = as.numeric(str_remove(`Enrollment Rate`, "%")) / 100,
              `Institutional Aid Rate` = as.numeric(str_remove(`Institutional Aid Rate`, "%")) / 100,
              `Default Rate` = as.numeric(str_remove(`Default Rate`, "%")) / 100
            )


# View the final dataset
print(final_data)
## # A tibble: 571 × 9
##    Rank  School        Student to Faculty R…¹ `Graduation Rate` `Retention Rate`
##    <chr> <chr>         <chr>                              <dbl>            <dbl>
##  1 1     Harvard Univ… 7 to 1                              0.98             0.98
##  2 2     Yale Univers… 6 to 1                              0.97             0.99
##  3 3     University o… 6 to 1                              0.95             0.98
##  4 4     Johns Hopkin… 10 to 1                             0.94             0.97
##  5 5     Cornell Univ… 9 to 1                              0.93             0.97
##  6 6     Tufts Univer… 9 to 1                              0.93             0.97
##  7 7     University o… 17 to 1                             0.92             0.96
##  8 8     University o… 16 to 1                             0.91             0.96
##  9 9     Georgetown U… 11 to 1                             0.94             0.96
## 10 10    Washington U… 8 to 1                              0.93             0.96
## # ℹ 561 more rows
## # ℹ abbreviated name: ¹​`Student to Faculty Ratio`
## # ℹ 4 more variables: `Acceptance Rate` <dbl>, `Enrollment Rate` <dbl>,
## #   `Institutional Aid Rate` <dbl>, `Default Rate` <dbl>
  1. Transform the “Student to Faculty Ratio” column into two separate numeric columns: Students and Faculty.

Solution:

final_data <- final_data %>%
  separate(
    `Student to Faculty Ratio`,
    into = c("Students", "Faculty"),
    sep = " to ",
    convert = TRUE
                  )%>%
                    mutate(`Students` = as.numeric( Students),
                            `Faculty` = as.numeric( Faculty),
                           )



# View the resulting dataset
print(final_data)
## # A tibble: 571 × 10
##    Rank  School              Students Faculty `Graduation Rate` `Retention Rate`
##    <chr> <chr>                  <dbl>   <dbl>             <dbl>            <dbl>
##  1 1     Harvard University         7       1              0.98             0.98
##  2 2     Yale University            6       1              0.97             0.99
##  3 3     University of Penn…        6       1              0.95             0.98
##  4 4     Johns Hopkins Univ…       10       1              0.94             0.97
##  5 5     Cornell University         9       1              0.93             0.97
##  6 6     Tufts University           9       1              0.93             0.97
##  7 7     University of Cali…       17       1              0.92             0.96
##  8 8     University of Cali…       16       1              0.91             0.96
##  9 9     Georgetown Univers…       11       1              0.94             0.96
## 10 10    Washington Univers…        8       1              0.93             0.96
## # ℹ 561 more rows
## # ℹ 4 more variables: `Acceptance Rate` <dbl>, `Enrollment Rate` <dbl>,
## #   `Institutional Aid Rate` <dbl>, `Default Rate` <dbl>
  1. What is the count of missing values in the “Default Rate” column? Impute the missing values in the “Default Rate” column with the median value.

Solution:

# Count missing values in the "Default Rate" column
missing_count <- sum(is.na(final_data$`Default Rate`))
cat("Number of missing values in 'Default Rate':", missing_count, "\n")
## Number of missing values in 'Default Rate': 291
# Calculate the median of the "Default Rate" column, excluding NAs
default_rate_median <- median(final_data$`Default Rate`, na.rm = TRUE)

# Impute missing values in the "Default Rate" column with the median value
final_data <- final_data %>%
  mutate(`Default Rate` = ifelse(is.na(`Default Rate`), default_rate_median, `Default Rate`))

# Verify that there are no more missing values in the "Default Rate" column
cat("Number of missing values in 'Default Rate' after imputation:", sum(is.na(final_data$`Default Rate`)), "\n")
## Number of missing values in 'Default Rate' after imputation: 0
  1. Find the average graduation rate for universities ranked in the top 50.

Solution:

# Select the top 50 ranked universities
top_50_universities <- final_data %>%
  filter(as.numeric(Rank) <= 50)

# View the resulting dataset
print(top_50_universities)
## # A tibble: 50 × 10
##    Rank  School              Students Faculty `Graduation Rate` `Retention Rate`
##    <chr> <chr>                  <dbl>   <dbl>             <dbl>            <dbl>
##  1 1     Harvard University         7       1              0.98             0.98
##  2 2     Yale University            6       1              0.97             0.99
##  3 3     University of Penn…        6       1              0.95             0.98
##  4 4     Johns Hopkins Univ…       10       1              0.94             0.97
##  5 5     Cornell University         9       1              0.93             0.97
##  6 6     Tufts University           9       1              0.93             0.97
##  7 7     University of Cali…       17       1              0.92             0.96
##  8 8     University of Cali…       16       1              0.91             0.96
##  9 9     Georgetown Univers…       11       1              0.94             0.96
## 10 10    Washington Univers…        8       1              0.93             0.96
## # ℹ 40 more rows
## # ℹ 4 more variables: `Acceptance Rate` <dbl>, `Enrollment Rate` <dbl>,
## #   `Institutional Aid Rate` <dbl>, `Default Rate` <dbl>
# Calculate the average graduation rate
average_graduation_rate <- top_50_universities %>%
  summarise(AverageGraduationRate = mean(`Graduation Rate`, na.rm = TRUE)) %>%
  pull(AverageGraduationRate)

# Print the result
cat("The average graduation rate for universities ranked in the top 50 is:", round(average_graduation_rate * 100, 2), "%\n")
## The average graduation rate for universities ranked in the top 50 is: 79.18 %
  1. Filter universities with a retention rate above 90% and find the count of rows in the subset.

Solution:

# Filter universities with a retention rate above 90%
high_retention_universities <- final_data %>%
  filter(`Retention Rate` > 0.90)

# Count the number of rows in the filtered subset
count_high_retention <- nrow(high_retention_universities)

# Print the result
cat("The number of universities with a retention rate above 90% is:", count_high_retention, "\n")
## The number of universities with a retention rate above 90% is: 98
  1. Rank universities by enrollment rate in descending order and display the last 6 rows.

Solution:

# Rank universities by enrollment rate in descending order
ranked_universities <- final_data %>%
  arrange(desc(`Enrollment Rate`))

# Display the last 6 rows
last_6_universities <- tail(ranked_universities, 6)

# View the result
print(last_6_universities)
## # A tibble: 6 × 10
##   Rank  School               Students Faculty `Graduation Rate` `Retention Rate`
##   <chr> <chr>                   <dbl>   <dbl>             <dbl>            <dbl>
## 1 566   Southeastern Baptis…        6       1              0.8              0.4 
## 2 567   Touro University Wo…       13       1             NA                1   
## 3 568   Unitek College             16       1             NA                1   
## 4 569   University of Weste…       16       1              0.88            NA   
## 5 570   Virginia Baptist Co…        5       1              1                0.25
## 6 571   West Virginia Junio…       25       1              0.53             0.69
## # ℹ 4 more variables: `Acceptance Rate` <dbl>, `Enrollment Rate` <dbl>,
## #   `Institutional Aid Rate` <dbl>, `Default Rate` <dbl>
  1. Create a histogram of graduation rates using ggplot2 library.

Solution:

# Load ggplot2 library
library(ggplot2)

# Create a histogram of graduation rates
ggplot(final_data, aes(x = `Graduation Rate`)) +
  geom_histogram(binwidth = 0.05, fill = "blue", color = "black", alpha = 0.7) +
  labs(
    title = "Histogram of Graduation Rates",
    x = "Graduation Rate (Proportion)",
    y = "Frequency"
  ) +
  theme_minimal()

  1. Plot a scatterplot between acceptance rate and enrollment rate using ggplot2 library.

Solution:

# Load ggplot2 library
library(ggplot2)

# Create a scatterplot of acceptance rate vs. enrollment rate
ggplot(final_data, aes(x = `Acceptance Rate`, y = `Enrollment Rate`)) +
  geom_point(color = "blue", alpha = 0.7) +
  labs(
    title = "Scatterplot of Acceptance Rate vs Enrollment Rate",
    x = "Acceptance Rate (Proportion)",
    y = "Enrollment Rate (Proportion)"
  ) +
  theme_minimal()

  1. Calculate the average default rate by aid rate category (e.g., grouped into ranges like 0-20%, 20-40%). Display the categories.

Solution:

# Load necessary library
library(dplyr)

# Assume 'final_data' contains the necessary data
# Create aid rate categories
final_data <- final_data %>%
  mutate(AidRateCategory = case_when(
    `Institutional Aid Rate` >= 0 & `Institutional Aid Rate` < 0.2 ~ "0-20%",
    `Institutional Aid Rate` >= 0.2 & `Institutional Aid Rate` < 0.4 ~ "20-40%",
    `Institutional Aid Rate` >= 0.4 & `Institutional Aid Rate` < 0.6 ~ "40-60%",
    `Institutional Aid Rate` >= 0.6 & `Institutional Aid Rate` < 0.8 ~ "60-80%",
    `Institutional Aid Rate` >= 0.8 & `Institutional Aid Rate` <= 1 ~ "80-100%",
    TRUE ~ "Other"
  ))

# Calculate the average default rate for each aid rate category
average_default_rate <- final_data %>%
  group_by(AidRateCategory) %>%
  summarise(AverageDefaultRate = round( mean(`Default Rate`, na.rm = TRUE),2)) %>%
  arrange(AidRateCategory)

# Display the results
print(average_default_rate)
## # A tibble: 5 × 2
##   AidRateCategory AverageDefaultRate
##   <chr>                        <dbl>
## 1 0-20%                         0.12
## 2 20-40%                        0.08
## 3 40-60%                        0.06
## 4 60-80%                        0.06
## 5 80-100%                       0.06
  1. Normalize the acceptance rate to a scale of 0-1 and save in a new column “Acceptance Rate Normalized”. Display the first 6 values.

Solution:

library(scales)
# Normalize and rescale the Acceptance Rate to 0-1
final_data <- final_data %>%
  mutate(
    # Normalize: Scale Acceptance Rate to 0-1
    `Acceptance Rate Normalized` = (`Acceptance Rate` - min(`Acceptance Rate`, na.rm = TRUE)) / 
                                    (max(`Acceptance Rate`, na.rm = TRUE) - min(`Acceptance Rate`, na.rm = TRUE)),
    
    # Rescale: Adjust normalized values to fall within a target range of 0-1
    `Acceptance Rate Rescaled` = scales::rescale(`Acceptance Rate`, to = c(0, 1))
  )

# Display the first 6 values of the new columns
head(final_data %>% select(`Acceptance Rate`, `Acceptance Rate Normalized`, `Acceptance Rate Rescaled`))
## # A tibble: 6 × 3
##   `Acceptance Rate` `Acceptance Rate Normalized` `Acceptance Rate Rescaled`
##               <dbl>                        <dbl>                      <dbl>
## 1              0.06                       0                          0     
## 2              0.07                       0.0106                     0.0106
## 3              0.1                        0.0426                     0.0426
## 4              0.14                       0.0851                     0.0851
## 5              0.15                       0.0957                     0.0957
## 6              0.16                       0.106                      0.106
  1. What is the count of the duplicate entries in the “School” column? Remove duplicate university entries.

Solution:

# Count duplicate entries in the "School" column
duplicate_count <- final_data %>%
  filter(duplicated(School)) %>%
  nrow()

# Print the count of duplicates
cat("The number of duplicate entries in the 'School' column is:", duplicate_count, "\n")
## The number of duplicate entries in the 'School' column is: 3
# Remove duplicate entries in the "School" column
final_data <- final_data %>%
  distinct(School, .keep_all = TRUE)

# Verify the removal of duplicates
cat("The number of rows after removing duplicates is:", nrow(final_data), "\n")
## The number of rows after removing duplicates is: 568
  1. Find the correlation between graduation rate and retention rate (exclude the NAs in both columns).

Solution:

# Calculate the correlation between Graduation Rate and Retention Rate
correlation <- cor(
  final_data$`Graduation Rate`,
  final_data$`Retention Rate`,
  use = "complete.obs" # Exclude NAs
)

# Print the correlation
cat("The correlation between Graduation Rate and Retention Rate is (indicates a strong positive relationship between the two variables):", round(correlation, 4), "\n")
## The correlation between Graduation Rate and Retention Rate is (indicates a strong positive relationship between the two variables): 0.616
  1. Extract the values in School column into a new variable without “University” in the string. For example “Rowan University” becomes “Rowan”

Solution:

# Create a new variable with "University" removed from the School column
final_data <- final_data %>%
  mutate(School_Without_University = str_remove(School, " University"))

# Display the first 6 rows of the new variable
head(final_data %>% select(School, School_Without_University))
## # A tibble: 6 × 2
##   School                     School_Without_University 
##   <chr>                      <chr>                     
## 1 Harvard University         Harvard                   
## 2 Yale University            Yale                      
## 3 University of Pennsylvania University of Pennsylvania
## 4 Johns Hopkins University   Johns Hopkins             
## 5 Cornell University         Cornell                   
## 6 Tufts University           Tufts
  1. Count how many universities have “Institute” in their name.

Solution:

# Count the number of universities with "Institute" in their name
institute_count <- final_data %>%
  filter(str_detect(School, "Institute")) %>%
  nrow()

# Print the result
cat("The number of universities with 'Institute' in their name is:", institute_count, "\n")
## The number of universities with 'Institute' in their name is: 17
  1. Export the cleaned and processed dataset to a CSV file.

Solution:

# Specify the file path and name
output_file <- "final_data_processed.csv"

# Save the final_data dataset to a CSV file
write.csv(final_data, file = output_file, row.names = FALSE)

# Print a confirmation message
cat("The processed final_data has been saved to:", output_file, "\n")
## The processed final_data has been saved to: final_data_processed.csv