Morgan State University
Department of Information Science & Systems
Fall 2024
INSS 615: Data Wrangling for Visualization
Name: FNU Farhana Begum
Due: Dec 1, 2024 (Sunday)
Questions
A. Scrape the College Ranked by Acceptance Rate dataset available at this link: https://www.oedb.org/rankings/acceptance-rate/#table-rankings and select the first 9 columns [Rank, School, Student to Faculty Ratio, Graduation Rate, Retention Rate, Acceptance Rate, Enrollment Rate, Institutional Aid Rate, and Default Rate] as the dataset for this assignment. [20 Points]
Hint: There are 6 pages of data, so you may want to use a for loop to automate the scraping process and combine the data from all 6 pages. This is just a suggestion—you are free to create the dataset without automating the web scrapping process.
Solution:
library(rvest)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
scrape_page <- function(url) {
content <- read_html(url)
table <- content %>% html_node("table")
rows <- table %>% html_nodes("tr")
data_list <- list()
for (i in 2:length(rows)) {
cells <- rows[i] %>% html_nodes("td") %>% html_text(trim = TRUE)
if (length(cells) >= 9) {
data_list[[i - 1]] <- cells[1:9] # Get only the first 9 columns
}
}
data <- do.call(rbind, data_list) %>% as.data.frame(stringsAsFactors = FALSE)
colnames(data) <- c("Rank", "School", "Student to Faculty Ratio",
"Graduation Rate", "Retention Rate", "Acceptance Rate",
"Enrollment Rate", "Institutional Aid Rate", "Default Rate")
return(data)
}
base_url <- "https://www.oedb.org/rankings/acceptance-rate/page/"
all_data <- data.frame()
for (page_num in 1:6) {
url <- paste0(base_url, page_num, "/#table-rankings")
page_data <- scrape_page(url)
all_data <- bind_rows(all_data, page_data)
}
head(all_data)
View(all_data)
B. You are going to need the dataset created in Question A to answer the following questions. There are 16 questions each carrying 5 points:
Solution:
all_data[all_data == "N/A"] <- NA
head(all_data)
write.csv(all_data, "college_ranked_by_acceptance_rate_cleaned.csv", row.names = FALSE)
View(all_data)
Solution:
percentage_columns <- c("Graduation Rate", "Retention Rate", "Acceptance Rate",
"Enrollment Rate", "Institutional Aid Rate", "Default Rate")
all_data <- all_data %>%
mutate(across(all_of(percentage_columns),
~ as.numeric(gsub("%", "", .))))
head(all_data)
View(all_data)
Solution:
# Load necessary libraries
library(dplyr)
library(tidyr)
all_data$`Student to Faculty Ratio` <- gsub(" to ", ":", all_data$`Student to Faculty Ratio`)
all_data <- all_data %>%
separate(`Student to Faculty Ratio`, into = c("Students", "Faculty"), sep = ":", convert = TRUE)
all_data$Students <- as.numeric(all_data$Students)
all_data$Faculty <- as.numeric(all_data$Faculty)
head(all_data)
Solution:
library(dplyr)
missing_count <- sum(is.na(all_data$`Default Rate`))
cat("Count of missing values in 'Default Rate' column:", missing_count, "\n")
## Count of missing values in 'Default Rate' column: 291
median_value <- median(all_data$`Default Rate`, na.rm = TRUE)
all_data$`Default Rate`[is.na(all_data$`Default Rate`)] <- median_value
cat("Count of missing values after imputation:", sum(is.na(all_data$`Default Rate`)), "\n")
## Count of missing values after imputation: 0
Solution:
top_50_universities <- all_data %>% filter(Rank <= 50)
average_graduation_rate <- mean(top_50_universities$`Graduation Rate`, na.rm = TRUE)
cat("Average Graduation Rate for Top 50 Universities:", average_graduation_rate, "\n")
## Average Graduation Rate for Top 50 Universities: 65.9009
6. Filter universities with a retention rate above 90% and find the count of rows in the subset.
Solution:
``` r
high_retention_universities <- all_data %>% filter(`Retention Rate` > 90)
count_high_retention <- nrow(high_retention_universities)
cat("Number of universities with a retention rate above 90%:", count_high_retention, "\n")
## Number of universities with a retention rate above 90%: 98
Solution:
library(dplyr)
all_data$`Enrollment Rate` <- as.numeric(as.character(all_data$`Enrollment Rate`))
sorted_data <- all_data %>% arrange(desc(`Enrollment Rate`))
last_6_rows <- tail(sorted_data, 6)
print(last_6_rows)
## Rank School Students Faculty
## 566 566 Southeastern Baptist College 6 1
## 567 567 Touro University Worldwide 13 1
## 568 568 Unitek College 16 1
## 569 569 University of Western States 16 1
## 570 570 Virginia Baptist College 5 1
## 571 571 West Virginia Junior College-Morgantown 25 1
## Graduation Rate Retention Rate Acceptance Rate Enrollment Rate
## 566 80 40 NA NA
## 567 NA 100 NA NA
## 568 NA 100 NA NA
## 569 88 NA NA NA
## 570 100 25 NA NA
## 571 53 69 NA NA
## Institutional Aid Rate Default Rate
## 566 21 6
## 567 76 4
## 568 20 6
## 569 56 6
## 570 38 6
## 571 88 6
#Excluding NAs
library(dplyr)
sorted_data <- all_data %>%
filter(!is.na(`Enrollment Rate`)) %>%
arrange(desc(`Enrollment Rate`))
last_6_rows <- tail(sorted_data, 6)
print(last_6_rows)
## Rank School Students Faculty Graduation Rate
## 537 64 Spring Hill College 14 1 53
## 538 211 Arcadia University 10 1 58
## 539 2 Yale University 6 1 97
## 540 4 Johns Hopkins University 10 1 94
## 541 98 Fordham University 14 1 81
## 542 1 Harvard University 7 1 98
## Retention Rate Acceptance Rate Enrollment Rate Institutional Aid Rate
## 537 81 41 6 99
## 538 79 59 6 94
## 539 99 7 5 52
## 540 97 14 5 51
## 541 91 48 5 80
## 542 98 6 4 44
## Default Rate
## 537 8
## 538 6
## 539 6
## 540 6
## 541 5
## 542 6
Solution:
library(ggplot2)
ggplot(all_data, aes(x = `Graduation Rate`)) +
geom_histogram(binwidth = 2, fill = "skyblue", color = "black", alpha = 0.7) +
labs(
title = "Histogram of Graduation Rates",
x = "Graduation Rate (%)",
y = "Frequency"
) +
theme_minimal()
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_bin()`).
Solution:
library(ggplot2)
ggplot(all_data, aes(x = `Acceptance Rate`, y = `Enrollment Rate`)) +
geom_point(color = "blue", alpha = 0.7) + # Scatter plot points
labs(
title = "Scatterplot of Acceptance Rate vs Enrollment Rate",
x = "Acceptance Rate (%)",
y = "Enrollment Rate (%)"
) +
theme_minimal()
## Warning: Removed 29 rows containing missing values or values outside the scale range
## (`geom_point()`).
Solution:
library(dplyr)
all_data <- all_data %>%
mutate(
AidRateCategory = case_when(
`Institutional Aid Rate` >= 0 & `Institutional Aid Rate` <= 20 ~ "0-20%",
`Institutional Aid Rate` > 20 & `Institutional Aid Rate` <= 40 ~ "20-40%",
`Institutional Aid Rate` > 40 & `Institutional Aid Rate` <= 60 ~ "40-60%",
`Institutional Aid Rate` > 60 & `Institutional Aid Rate` <= 80 ~ "60-80%",
`Institutional Aid Rate` > 80 & `Institutional Aid Rate` <= 100 ~ "80-100%",
TRUE ~ "Other" # For values outside the expected range, if any
)
)
average_default_rate_by_aid <- all_data %>%
group_by(AidRateCategory) %>%
summarise(AverageDefaultRate = mean(`Default Rate`, na.rm = TRUE))
print(average_default_rate_by_aid)
## # A tibble: 5 × 2
## AidRateCategory AverageDefaultRate
## <chr> <dbl>
## 1 0-20% 10
## 2 20-40% 7.56
## 3 40-60% 6.05
## 4 60-80% 6.51
## 5 80-100% 5.62
Solution:
library(scales)
all_data <- all_data %>%
mutate(
`Acceptance Rate Normalized` = (`Acceptance Rate` - min(`Acceptance Rate`, na.rm = TRUE)) /
(max(`Acceptance Rate`, na.rm = TRUE) - min(`Acceptance Rate`, na.rm = TRUE))
)
head(all_data$`Acceptance Rate Normalized`)
## [1] 0.00000000 0.01063830 0.04255319 0.08510638 0.09574468 0.10638298
Solution:
duplicate_count <- sum(duplicated(all_data$School))
print(paste("Count of duplicate entries in the 'School' column:", duplicate_count))
## [1] "Count of duplicate entries in the 'School' column: 3"
all_data_unique <- all_data %>%
distinct(School, .keep_all = TRUE)
head(all_data_unique)
Solution:
correlation <- cor(all_data$`Graduation Rate`, all_data$`Retention Rate`, use = "complete.obs")
print(paste("The correlation between Graduation Rate and Retention Rate is:", correlation))
## [1] "The correlation between Graduation Rate and Retention Rate is: 0.616971168087149"
Solution:
all_data$`School Name` <- gsub(" University", "", all_data$School)
head(all_data[, c("School", "School Name")])
Solution:
count_institute <- sum(grepl("Institute", all_data$School, ignore.case = TRUE))
print(paste("Number of universities with 'Institute' in their name:", count_institute))
## [1] "Number of universities with 'Institute' in their name: 17"
Solution:
write.csv(all_data, "cleaned_college_data.csv", row.names = FALSE)
print("The cleaned dataset has been saved as 'cleaned_college_data.csv'.")
## [1] "The cleaned dataset has been saved as 'cleaned_college_data.csv'."