# Set global chunk options for the document
knitr::opts_chunk$set(
echo = TRUE, # Show code in the output
warning = FALSE, # Suppress warnings
message = FALSE # Suppress messages
)Morgan State University
Department of Information Science & Systems
Fall 2024
INSS 615: Data Wrangling for Visualization
Name: AMOLE ADEYEMI
Due: Dec 1, 2024 (Sunday)
Questions
A. Scrape the College Ranked by Acceptance Rate dataset available at this link: https://www.oedb.org/rankings/acceptance-rate/#table-rankings and select the first 9 columns [Rank, School, Student to Faculty Ratio, Graduation Rate, Retention Rate, Acceptance Rate, Enrollment Rate, Institutional Aid Rate, and Default Rate] as the dataset for this assignment. [20 Points]
Hint: There are 6 pages of data, so you may want to use a for loop to automate the scraping process and combine the data from all 6 pages. This is just a suggestion—you are free to create the dataset without automating the web scrapping process. # Load necessary libraries
Solution:
# Load necessary libraries
library(rvest)
library(tidyverse)
library(rvest)
library(dplyr)
library(stringr)
library(prettydoc) # for rpub publishing #Extract data and combine pages 1 to 6 #Define the URL for the first page you want to scrape and scrape
#extract data and combine pages 1 t0 6
# Define the URL for the first page you want to scrape
url_1 <- "https://www.oedb.org/rankings/acceptance-rate/page/1/#table-rankings"
# Read the HTML content from the URL
webpage_1 <- read_html(url_1)
# Extract the data table from the first page
table_1 <- webpage_1 %>%
html_nodes("table") %>%
html_table(fill = TRUE)
# Since the table is returned as a list, we extract the first table
page_1 <- table_1[[1]]
# View the scraped data from page 1
print(page_1)## # A tibble: 100 × 15
## Rank School Student to Faculty R…¹ `Graduation Rate` `Retention Rate`
## <int> <chr> <chr> <chr> <chr>
## 1 1 Harvard Univ… 7 to 1 98% 98%
## 2 2 Yale Univers… 6 to 1 97% 99%
## 3 3 University o… 6 to 1 95% 98%
## 4 4 Johns Hopkin… 10 to 1 94% 97%
## 5 5 Cornell Univ… 9 to 1 93% 97%
## 6 6 Tufts Univer… 9 to 1 93% 97%
## 7 7 University o… 17 to 1 92% 96%
## 8 8 University o… 16 to 1 91% 96%
## 9 9 Georgetown U… 11 to 1 94% 96%
## 10 10 Washington U… 8 to 1 93% 96%
## # ℹ 90 more rows
## # ℹ abbreviated name: ¹`Student to Faculty Ratio`
## # ℹ 10 more variables: `Acceptance Rate` <chr>, `Enrollment Rate` <chr>,
## # `Institutional Aid Rate` <chr>, `Default Rate` <chr>, `` <chr>, `` <chr>,
## # `` <chr>, `` <chr>, `` <chr>, `` <chr>
# Define the URL for the second page you want to scrape and scrape
# Define the URL for the second page you want to scrape
url_2 <- "https://www.oedb.org/rankings/acceptance-rate/page/2/#table-rankings"
# Read the HTML content from the URL
webpage_2 <- read_html(url_2)
# Extract the data table from the second page
table_2 <- webpage_2 %>%
html_nodes("table") %>%
html_table(fill = TRUE)
# Since the table is returned as a list, we extract the first table
page_2 <- table_2[[1]]
# View the scraped data from page 2
print(page_2)## # A tibble: 100 × 15
## Rank School Student to Faculty R…¹ `Graduation Rate` `Retention Rate`
## <int> <chr> <chr> <chr> <chr>
## 1 101 Villanova Un… 12 to 1 90% 96%
## 2 102 Brigham Youn… 20 to 1 80% 86%
## 3 103 Fashion Inst… 20 to 1 70% 92%
## 4 104 Howard Unive… 11 to 1 61% 89%
## 5 105 University o… 30 to 1 70% 89%
## 6 106 Augustana Co… 12 to 1 76% 86%
## 7 107 North Park U… 11 to 1 52% 75%
## 8 108 University o… 8 to 1 41% 75%
## 9 109 Emerson Coll… 13 to 1 80% 100%
## 10 110 Worcester Po… 13 to 1 85% 96%
## # ℹ 90 more rows
## # ℹ abbreviated name: ¹`Student to Faculty Ratio`
## # ℹ 10 more variables: `Acceptance Rate` <chr>, `Enrollment Rate` <chr>,
## # `Institutional Aid Rate` <chr>, `Default Rate` <chr>, `` <chr>, `` <chr>,
## # `` <chr>, `` <chr>, `` <chr>, `` <chr>
# Define the URL for the third page you want to scrape and scrape
# Define the URL for the third page you want to scrape
url_3 <- "https://www.oedb.org/rankings/acceptance-rate/page/3/#table-rankings"
# Read the HTML content from the URL
webpage_3 <- read_html(url_3)
# Extract the data table from the second page
table_3 <- webpage_3 %>%
html_nodes("table") %>%
html_table(fill = TRUE)
# Since the table is returned as a list, we extract the first table
page_3 <- table_3[[1]]
# View the scraped data from page 3
print(page_3)## # A tibble: 100 × 15
## Rank School Student to Faculty R…¹ `Graduation Rate` `Retention Rate`
## <int> <chr> <chr> <chr> <chr>
## 1 201 Sacred Heart… 15 to 1 64% 83%
## 2 202 Nova Southea… 16 to 1 44% 80%
## 3 203 Saint Mary-o… 6 to 1 50% 74%
## 4 204 University o… 19 to 1 63% 86%
## 5 205 Augsburg Col… 12 to 1 56% 79%
## 6 206 Cox College 9 to 1 99% 100%
## 7 207 Grace Univer… 12 to 1 46% 93%
## 8 208 University o… 17 to 1 56% 77%
## 9 209 Oklahoma Chr… 13 to 1 49% 79%
## 10 210 Multnomah Un… 13 to 1 55% 70%
## # ℹ 90 more rows
## # ℹ abbreviated name: ¹`Student to Faculty Ratio`
## # ℹ 10 more variables: `Acceptance Rate` <chr>, `Enrollment Rate` <chr>,
## # `Institutional Aid Rate` <chr>, `Default Rate` <chr>, `` <chr>, `` <chr>,
## # `` <chr>, `` <chr>, `` <chr>, `` <chr>
# Define the URL for the fourth page you want to scrape and scrape
# Define the URL for the fourth page you want to scrape
url_4 <- "https://www.oedb.org/rankings/acceptance-rate/page/4/#table-rankings"
# Read the HTML content from the URL
webpage_4 <- read_html(url_4)
# Extract the data table from the second page
table_4 <- webpage_4 %>%
html_nodes("table") %>%
html_table(fill = TRUE)
# Since the table is returned as a list, we extract the first table
page_4 <- table_4[[1]]
# View the scraped data from page 4
print(page_4)## # A tibble: 100 × 15
## Rank School Student to Faculty R…¹ `Graduation Rate` `Retention Rate`
## <int> <chr> <chr> <chr> <chr>
## 1 301 Lesley Unive… 10 to 1 54% 80%
## 2 302 Wentworth In… 15 to 1 67% 84%
## 3 303 Minnesota St… 23 to 1 49% 74%
## 4 304 St Catherine… 10 to 1 57% 86%
## 5 305 Ithaca Colle… 11 to 1 76% 86%
## 6 306 Manhattan Co… 13 to 1 72% 84%
## 7 307 The New Scho… 10 to 1 65% 81%
## 8 308 Queens Unive… 9 to 1 53% 74%
## 9 309 Cedar Crest … 10 to 1 58% 79%
## 10 310 Texas A & M … 20 to 1 79% 90%
## # ℹ 90 more rows
## # ℹ abbreviated name: ¹`Student to Faculty Ratio`
## # ℹ 10 more variables: `Acceptance Rate` <chr>, `Enrollment Rate` <chr>,
## # `Institutional Aid Rate` <chr>, `Default Rate` <chr>, `` <chr>, `` <chr>,
## # `` <chr>, `` <chr>, `` <chr>, `` <chr>
# Define the URL for the fifth page you want to scrape and scrape
# Define the URL for the Five page you want to scrape
url_5 <- "https://www.oedb.org/rankings/acceptance-rate/page/5/#table-rankings"
# Read the HTML content from the URL
webpage_5 <- read_html(url_5)
# Extract the data table from the second page
table_5 <- webpage_5 %>%
html_nodes("table") %>%
html_table(fill = TRUE)
# Since the table is returned as a list, we extract the first table
page_5 <- table_5[[1]]
# View the scraped data from page 5
print(page_5)## # A tibble: 100 × 15
## Rank School Student to Faculty R…¹ `Graduation Rate` `Retention Rate`
## <int> <chr> <chr> <chr> <chr>
## 1 401 Drexel Unive… 10 to 1 68% 84%
## 2 402 La Salle Uni… 12 to 1 63% 78%
## 3 403 Moravian Col… 12 to 1 69% 80%
## 4 404 University o… 22 to 1 61% 80%
## 5 405 Georgia Coll… 17 to 1 60% 86%
## 6 406 Assumption C… 12 to 1 73% 83%
## 7 407 University o… 21 to 1 67% 83%
## 8 408 Seton Hall U… 13 to 1 63% 84%
## 9 409 Nazareth Col… 9 to 1 73% 84%
## 10 410 The Universi… 16 to 1 56% 78%
## # ℹ 90 more rows
## # ℹ abbreviated name: ¹`Student to Faculty Ratio`
## # ℹ 10 more variables: `Acceptance Rate` <chr>, `Enrollment Rate` <chr>,
## # `Institutional Aid Rate` <chr>, `Default Rate` <chr>, `` <chr>, `` <chr>,
## # `` <chr>, `` <chr>, `` <chr>, `` <chr>
# Define the URL for the Five page you want to scrape
url_6 <- "https://www.oedb.org/rankings/acceptance-rate/page/6/#table-rankings"
# Read the HTML content from the URL
webpage_6 <- read_html(url_6)
# Extract the data table from the second page
table_6 <- webpage_6 %>%
html_nodes("table") %>%
html_table(fill = TRUE)
# Since the table is returned as a list, we extract the first table
page_6 <- table_6[[1]]
# View the scraped data from page 6
print(page_6)## # A tibble: 71 × 15
## Rank School Student to Faculty R…¹ `Graduation Rate` `Retention Rate`
## <int> <chr> <chr> <chr> <chr>
## 1 501 California U… 21 to 1 50% 77%
## 2 502 University o… 21 to 1 65% 80%
## 3 503 University o… 23 to 1 46% 75%
## 4 504 Fort Hays St… 16 to 1 43% 70%
## 5 505 Bryan Colleg… 9 to 1 79% 86%
## 6 506 Valley City … 11 to 1 41% 71%
## 7 507 University o… 18 to 1 62% 88%
## 8 508 Texas Woman'… 18 to 1 41% 76%
## 9 509 West Virgini… 20 to 1 57% 79%
## 10 510 Iowa State U… 19 to 1 71% 87%
## # ℹ 61 more rows
## # ℹ abbreviated name: ¹`Student to Faculty Ratio`
## # ℹ 10 more variables: `Acceptance Rate` <chr>, `Enrollment Rate` <chr>,
## # `Institutional Aid Rate` <chr>, `Default Rate` <chr>, `` <chr>, `` <chr>,
## # `` <chr>, `` <chr>, `` <chr>, `` <chr>
# Combine both pages into one dataset by appending (row binding)
final_data <- rbind(page_1, page_2 ,page_3,page_4 ,page_5,page_6)
#select the needed columns
final_data <- final_data[, 1:9]
# View the combined data
print(final_data)## # A tibble: 571 × 9
## Rank School Student to Faculty R…¹ `Graduation Rate` `Retention Rate`
## <int> <chr> <chr> <chr> <chr>
## 1 1 Harvard Univ… 7 to 1 98% 98%
## 2 2 Yale Univers… 6 to 1 97% 99%
## 3 3 University o… 6 to 1 95% 98%
## 4 4 Johns Hopkin… 10 to 1 94% 97%
## 5 5 Cornell Univ… 9 to 1 93% 97%
## 6 6 Tufts Univer… 9 to 1 93% 97%
## 7 7 University o… 17 to 1 92% 96%
## 8 8 University o… 16 to 1 91% 96%
## 9 9 Georgetown U… 11 to 1 94% 96%
## 10 10 Washington U… 8 to 1 93% 96%
## # ℹ 561 more rows
## # ℹ abbreviated name: ¹`Student to Faculty Ratio`
## # ℹ 4 more variables: `Acceptance Rate` <chr>, `Enrollment Rate` <chr>,
## # `Institutional Aid Rate` <chr>, `Default Rate` <chr>
B. You are going to need the dataset created in Question A to answer the following questions. There are 16 questions each carrying 5 points:
- Replace the missing values “N/A” in the dataset with NA.
Solution:
#Ensure consistent data types across all pages
final_data <- final_data %>%
mutate(
Rank = as.character(Rank),
School = as.character(School),
`Student to Faculty Ratio` = as.character(`Student to Faculty Ratio`),
`Graduation Rate` = as.character(`Graduation Rate`),
`Retention Rate` = as.character(`Retention Rate`),
`Acceptance Rate` = as.character(`Acceptance Rate`),
`Enrollment Rate` = as.character(`Enrollment Rate`),
`Institutional Aid Rate` = as.character(`Institutional Aid Rate`),
`Default Rate` = as.character(`Default Rate`)
) %>%
mutate(across(everything(), ~ na_if(., "N/A"))) # Replace "N/A" with "NA" (empty)
# View the final dataset
print(final_data)## # A tibble: 571 × 9
## Rank School Student to Faculty R…¹ `Graduation Rate` `Retention Rate`
## <chr> <chr> <chr> <chr> <chr>
## 1 1 Harvard Univ… 7 to 1 98% 98%
## 2 2 Yale Univers… 6 to 1 97% 99%
## 3 3 University o… 6 to 1 95% 98%
## 4 4 Johns Hopkin… 10 to 1 94% 97%
## 5 5 Cornell Univ… 9 to 1 93% 97%
## 6 6 Tufts Univer… 9 to 1 93% 97%
## 7 7 University o… 17 to 1 92% 96%
## 8 8 University o… 16 to 1 91% 96%
## 9 9 Georgetown U… 11 to 1 94% 96%
## 10 10 Washington U… 8 to 1 93% 96%
## # ℹ 561 more rows
## # ℹ abbreviated name: ¹`Student to Faculty Ratio`
## # ℹ 4 more variables: `Acceptance Rate` <chr>, `Enrollment Rate` <chr>,
## # `Institutional Aid Rate` <chr>, `Default Rate` <chr>
- Convert percentage columns (e.g., Graduation Rate) to numeric format.
Solution:
final_data <- final_data %>%
mutate(
`Graduation Rate` = as.numeric(str_remove(`Graduation Rate`, "%")) / 100,
`Retention Rate` = as.numeric(str_remove(`Retention Rate`, "%")) / 100,
`Acceptance Rate` = as.numeric(str_remove(`Acceptance Rate`, "%")) / 100,
`Enrollment Rate` = as.numeric(str_remove(`Enrollment Rate`, "%")) / 100,
`Institutional Aid Rate` = as.numeric(str_remove(`Institutional Aid Rate`, "%")) / 100,
`Default Rate` = as.numeric(str_remove(`Default Rate`, "%")) / 100
)
# View the final dataset
print(final_data)## # A tibble: 571 × 9
## Rank School Student to Faculty R…¹ `Graduation Rate` `Retention Rate`
## <chr> <chr> <chr> <dbl> <dbl>
## 1 1 Harvard Univ… 7 to 1 0.98 0.98
## 2 2 Yale Univers… 6 to 1 0.97 0.99
## 3 3 University o… 6 to 1 0.95 0.98
## 4 4 Johns Hopkin… 10 to 1 0.94 0.97
## 5 5 Cornell Univ… 9 to 1 0.93 0.97
## 6 6 Tufts Univer… 9 to 1 0.93 0.97
## 7 7 University o… 17 to 1 0.92 0.96
## 8 8 University o… 16 to 1 0.91 0.96
## 9 9 Georgetown U… 11 to 1 0.94 0.96
## 10 10 Washington U… 8 to 1 0.93 0.96
## # ℹ 561 more rows
## # ℹ abbreviated name: ¹`Student to Faculty Ratio`
## # ℹ 4 more variables: `Acceptance Rate` <dbl>, `Enrollment Rate` <dbl>,
## # `Institutional Aid Rate` <dbl>, `Default Rate` <dbl>
- Transform the “Student to Faculty Ratio” column into two separate numeric columns: Students and Faculty.
Solution:
final_data <- final_data %>%
separate(
`Student to Faculty Ratio`,
into = c("Students", "Faculty"),
sep = " to ",
convert = TRUE
)%>%
mutate(`Students` = as.numeric( Students),
`Faculty` = as.numeric( Faculty),
)
# View the resulting dataset
print(final_data)## # A tibble: 571 × 10
## Rank School Students Faculty `Graduation Rate` `Retention Rate`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 Harvard University 7 1 0.98 0.98
## 2 2 Yale University 6 1 0.97 0.99
## 3 3 University of Penn… 6 1 0.95 0.98
## 4 4 Johns Hopkins Univ… 10 1 0.94 0.97
## 5 5 Cornell University 9 1 0.93 0.97
## 6 6 Tufts University 9 1 0.93 0.97
## 7 7 University of Cali… 17 1 0.92 0.96
## 8 8 University of Cali… 16 1 0.91 0.96
## 9 9 Georgetown Univers… 11 1 0.94 0.96
## 10 10 Washington Univers… 8 1 0.93 0.96
## # ℹ 561 more rows
## # ℹ 4 more variables: `Acceptance Rate` <dbl>, `Enrollment Rate` <dbl>,
## # `Institutional Aid Rate` <dbl>, `Default Rate` <dbl>
- What is the count of missing values in the “Default Rate” column? Impute the missing values in the “Default Rate” column with the median value.
Solution:
# Count missing values in the "Default Rate" column
missing_count <- sum(is.na(final_data$`Default Rate`))
cat("Number of missing values in 'Default Rate':", missing_count, "\n")## Number of missing values in 'Default Rate': 291
# Calculate the median of the "Default Rate" column, excluding NAs
default_rate_median <- median(final_data$`Default Rate`, na.rm = TRUE)
# Impute missing values in the "Default Rate" column with the median value
final_data <- final_data %>%
mutate(`Default Rate` = ifelse(is.na(`Default Rate`), default_rate_median, `Default Rate`))
# Verify that there are no more missing values in the "Default Rate" column
cat("Number of missing values in 'Default Rate' after imputation:", sum(is.na(final_data$`Default Rate`)), "\n")## Number of missing values in 'Default Rate' after imputation: 0
- Find the average graduation rate for universities ranked in the top 50.
Solution:
# Select the top 50 ranked universities
top_50_universities <- final_data %>%
filter(as.numeric(Rank) <= 50)
# View the resulting dataset
print(top_50_universities)## # A tibble: 50 × 10
## Rank School Students Faculty `Graduation Rate` `Retention Rate`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 Harvard University 7 1 0.98 0.98
## 2 2 Yale University 6 1 0.97 0.99
## 3 3 University of Penn… 6 1 0.95 0.98
## 4 4 Johns Hopkins Univ… 10 1 0.94 0.97
## 5 5 Cornell University 9 1 0.93 0.97
## 6 6 Tufts University 9 1 0.93 0.97
## 7 7 University of Cali… 17 1 0.92 0.96
## 8 8 University of Cali… 16 1 0.91 0.96
## 9 9 Georgetown Univers… 11 1 0.94 0.96
## 10 10 Washington Univers… 8 1 0.93 0.96
## # ℹ 40 more rows
## # ℹ 4 more variables: `Acceptance Rate` <dbl>, `Enrollment Rate` <dbl>,
## # `Institutional Aid Rate` <dbl>, `Default Rate` <dbl>
# Calculate the average graduation rate
average_graduation_rate <- top_50_universities %>%
summarise(AverageGraduationRate = mean(`Graduation Rate`, na.rm = TRUE)) %>%
pull(AverageGraduationRate)
# Print the result
cat("The average graduation rate for universities ranked in the top 50 is:", round(average_graduation_rate * 100, 2), "%\n")## The average graduation rate for universities ranked in the top 50 is: 79.18 %
- Filter universities with a retention rate above 90% and find the count of rows in the subset.
Solution:
# Filter universities with a retention rate above 90%
high_retention_universities <- final_data %>%
filter(`Retention Rate` > 0.90)
# Count the number of rows in the filtered subset
count_high_retention <- nrow(high_retention_universities)
# Print the result
cat("The number of universities with a retention rate above 90% is:", count_high_retention, "\n")## The number of universities with a retention rate above 90% is: 98
- Rank universities by enrollment rate in descending order and display the last 6 rows.
Solution:
# Rank universities by enrollment rate in descending order
ranked_universities <- final_data %>%
arrange(desc(`Enrollment Rate`))
# Display the last 6 rows
last_6_universities <- tail(ranked_universities, 6)
# View the result
print(last_6_universities)## # A tibble: 6 × 10
## Rank School Students Faculty `Graduation Rate` `Retention Rate`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 566 Southeastern Baptis… 6 1 0.8 0.4
## 2 567 Touro University Wo… 13 1 NA 1
## 3 568 Unitek College 16 1 NA 1
## 4 569 University of Weste… 16 1 0.88 NA
## 5 570 Virginia Baptist Co… 5 1 1 0.25
## 6 571 West Virginia Junio… 25 1 0.53 0.69
## # ℹ 4 more variables: `Acceptance Rate` <dbl>, `Enrollment Rate` <dbl>,
## # `Institutional Aid Rate` <dbl>, `Default Rate` <dbl>
- Create a histogram of graduation rates using ggplot2 library.
Solution:
# Load ggplot2 library
library(ggplot2)
# Create a histogram of graduation rates
ggplot(final_data, aes(x = `Graduation Rate`)) +
geom_histogram(binwidth = 0.05, fill = "blue", color = "black", alpha = 0.7) +
labs(
title = "Histogram of Graduation Rates",
x = "Graduation Rate (Proportion)",
y = "Frequency"
) +
theme_minimal()- Plot a scatterplot between acceptance rate and enrollment rate using ggplot2 library.
Solution:
# Load ggplot2 library
library(ggplot2)
# Create a scatterplot of acceptance rate vs. enrollment rate
ggplot(final_data, aes(x = `Acceptance Rate`, y = `Enrollment Rate`)) +
geom_point(color = "blue", alpha = 0.7) +
labs(
title = "Scatterplot of Acceptance Rate vs Enrollment Rate",
x = "Acceptance Rate (Proportion)",
y = "Enrollment Rate (Proportion)"
) +
theme_minimal()- Calculate the average default rate by aid rate category (e.g., grouped into ranges like 0-20%, 20-40%). Display the categories.
Solution:
# Load necessary library
library(dplyr)
# Assume 'final_data' contains the necessary data
# Create aid rate categories
final_data <- final_data %>%
mutate(AidRateCategory = case_when(
`Institutional Aid Rate` >= 0 & `Institutional Aid Rate` < 0.2 ~ "0-20%",
`Institutional Aid Rate` >= 0.2 & `Institutional Aid Rate` < 0.4 ~ "20-40%",
`Institutional Aid Rate` >= 0.4 & `Institutional Aid Rate` < 0.6 ~ "40-60%",
`Institutional Aid Rate` >= 0.6 & `Institutional Aid Rate` < 0.8 ~ "60-80%",
`Institutional Aid Rate` >= 0.8 & `Institutional Aid Rate` <= 1 ~ "80-100%",
TRUE ~ "Other"
))
# Calculate the average default rate for each aid rate category
average_default_rate <- final_data %>%
group_by(AidRateCategory) %>%
summarise(AverageDefaultRate = round( mean(`Default Rate`, na.rm = TRUE),2)) %>%
arrange(AidRateCategory)
# Display the results
print(average_default_rate)## # A tibble: 5 × 2
## AidRateCategory AverageDefaultRate
## <chr> <dbl>
## 1 0-20% 0.12
## 2 20-40% 0.08
## 3 40-60% 0.06
## 4 60-80% 0.06
## 5 80-100% 0.06
- Normalize the acceptance rate to a scale of 0-1 and save in a new column “Acceptance Rate Normalized”. Display the first 6 values.
Solution:
library(scales)
# Normalize and rescale the Acceptance Rate to 0-1
final_data <- final_data %>%
mutate(
# Normalize: Scale Acceptance Rate to 0-1
`Acceptance Rate Normalized` = (`Acceptance Rate` - min(`Acceptance Rate`, na.rm = TRUE)) /
(max(`Acceptance Rate`, na.rm = TRUE) - min(`Acceptance Rate`, na.rm = TRUE)),
# Rescale: Adjust normalized values to fall within a target range of 0-1
`Acceptance Rate Rescaled` = scales::rescale(`Acceptance Rate`, to = c(0, 1))
)
# Display the first 6 values of the new columns
head(final_data %>% select(`Acceptance Rate`, `Acceptance Rate Normalized`, `Acceptance Rate Rescaled`))## # A tibble: 6 × 3
## `Acceptance Rate` `Acceptance Rate Normalized` `Acceptance Rate Rescaled`
## <dbl> <dbl> <dbl>
## 1 0.06 0 0
## 2 0.07 0.0106 0.0106
## 3 0.1 0.0426 0.0426
## 4 0.14 0.0851 0.0851
## 5 0.15 0.0957 0.0957
## 6 0.16 0.106 0.106
- What is the count of the duplicate entries in the “School” column? Remove duplicate university entries.
Solution:
# Count duplicate entries in the "School" column
duplicate_count <- final_data %>%
filter(duplicated(School)) %>%
nrow()
# Print the count of duplicates
cat("The number of duplicate entries in the 'School' column is:", duplicate_count, "\n")## The number of duplicate entries in the 'School' column is: 3
# Remove duplicate entries in the "School" column
final_data <- final_data %>%
distinct(School, .keep_all = TRUE)
# Verify the removal of duplicates
cat("The number of rows after removing duplicates is:", nrow(final_data), "\n")## The number of rows after removing duplicates is: 568
- Find the correlation between graduation rate and retention rate (exclude the NAs in both columns).
Solution:
# Calculate the correlation between Graduation Rate and Retention Rate
correlation <- cor(
final_data$`Graduation Rate`,
final_data$`Retention Rate`,
use = "complete.obs" # Exclude NAs
)
# Print the correlation
cat("The correlation between Graduation Rate and Retention Rate is (indicates a strong positive relationship between the two variables):", round(correlation, 4), "\n")## The correlation between Graduation Rate and Retention Rate is (indicates a strong positive relationship between the two variables): 0.616
- Extract the values in School column into a new variable without “University” in the string. For example “Rowan University” becomes “Rowan”
Solution:
# Create a new variable with "University" removed from the School column
final_data <- final_data %>%
mutate(School_Without_University = str_remove(School, " University"))
# Display the first 6 rows of the new variable
head(final_data %>% select(School, School_Without_University))## # A tibble: 6 × 2
## School School_Without_University
## <chr> <chr>
## 1 Harvard University Harvard
## 2 Yale University Yale
## 3 University of Pennsylvania University of Pennsylvania
## 4 Johns Hopkins University Johns Hopkins
## 5 Cornell University Cornell
## 6 Tufts University Tufts
- Count how many universities have “Institute” in their name.
Solution:
# Count the number of universities with "Institute" in their name
institute_count <- final_data %>%
filter(str_detect(School, "Institute")) %>%
nrow()
# Print the result
cat("The number of universities with 'Institute' in their name is:", institute_count, "\n")## The number of universities with 'Institute' in their name is: 17
- Export the cleaned and processed dataset to a CSV file.
Solution:
# Specify the file path and name
output_file <- "final_data_processed.csv"
# Save the final_data dataset to a CSV file
write.csv(final_data, file = output_file, row.names = FALSE)
# Print a confirmation message
cat("The processed final_data has been saved to:", output_file, "\n")## The processed final_data has been saved to: final_data_processed.csv