Morgan State University
Department of Information Science & Systems
Fall 2024
INSS 615: Data Wrangling for Visualization
Name: Frenandez Lawrence
Due: Dec 1, 2024 (Sunday)
Questions
A. Scrape the College Ranked by Acceptance Rate dataset available at
this link: https://www.oedb.org/rankings/acceptance-rate/#table-rankings
and select the first 9 columns [Rank, School, Student to Faculty Ratio,
Graduation Rate, Retention Rate, Acceptance Rate, Enrollment Rate,
Institutional Aid Rate, and Default Rate] as the dataset for this
assignment. [20 Points]
Hint: There are 6 pages of data, so you may want to use a for loop to
automate the scraping process and combine the data from all 6 pages.
This is just a suggestion—you are free to create the dataset without
automating the web scrapping process.
Solution:
library(rvest)
library(tidyverse)
library(dplyr)
# Define the base URL and an empty list to store data
base_url <- "https://www.oedb.org/rankings/acceptance-rate/#table-rankings"
base_URL_modified <- "https://www.oedb.org/rankings/acceptance-rate/"
base_URL_end <- "/#table-rankings"
all_pages_data <- list()
# Scraping page 1 since base url has different format than others.
page1_html <- read_html(base_url)
# Extract the table from the HTML
table_node <- html_node(page1_html, "table")
# Read the table into a dataframe
page1_data <- html_table(table_node, fill = TRUE)
all_pages_data[[1]] <- page1_data
#view(all_pages_data)
# Loop through the other 5 pages
for (page in 2:6) {
# Construct the URL for each page (adjust if necessary)
page_url <- paste0(base_URL_modified, "page/", page, base_URL_end)
# Scrape the page
page_html <- read_html(page_url)
# Extract the table from the HTML
table_node <- html_node(page_html, "table")
# Read the table into a dataframe
if (!is.null(table_node)) {
page_data <- html_table(table_node, fill = TRUE)
all_pages_data[[page]] <- page_data
} else {
message("Table not found on page: ", page)
}
}
# Combine data from all pages into a single dataframe
dataset <- bind_rows(all_pages_data)
New names:
• `` -> `...10`
• `` -> `...11`
• `` -> `...12`
• `` -> `...13`
• `` -> `...14`
• `` -> `...15`
New names:
• `` -> `...10`
• `` -> `...11`
• `` -> `...12`
• `` -> `...13`
• `` -> `...14`
• `` -> `...15`
New names:
• `` -> `...10`
• `` -> `...11`
• `` -> `...12`
• `` -> `...13`
• `` -> `...14`
• `` -> `...15`
New names:
• `` -> `...10`
• `` -> `...11`
• `` -> `...12`
• `` -> `...13`
• `` -> `...14`
• `` -> `...15`
New names:
• `` -> `...10`
• `` -> `...11`
• `` -> `...12`
• `` -> `...13`
• `` -> `...14`
• `` -> `...15`
New names:
• `` -> `...10`
• `` -> `...11`
• `` -> `...12`
• `` -> `...13`
• `` -> `...14`
• `` -> `...15`
# Select the first 9 required columns
selected_columns <- c(
"Rank", "School", "Student to Faculty Ratio", "Graduation Rate",
"Retention Rate", "Acceptance Rate", "Enrollment Rate",
"Institutional Aid Rate", "Default Rate"
)
dataset <- dataset %>% select(all_of(selected_columns))
# Replace "N/A" with NA
#dataset <- dataset %>% mutate(across(everything(), ~ na_if(.x, "N/A")))
# Display the cleaned dataset
print(dataset)
file_path <- "~/Downloads/college_acceptance_data.csv"
# Save the dataset as a CSV file
write.csv(dataset, file = file_path, row.names = FALSE)
# Message to confirm saving
cat("Dataset saved as:", file_path)
Dataset saved as: ~/Downloads/college_acceptance_data.csv
# multipage scraping using a for loop
B. You are going to need the dataset created in Question A to answer
the following questions. There are 16 questions each carrying 5
points:
- Replace the missing values “N/A” in the dataset with NA.
Solution:
path = "/Users/frenandezlawrence/Downloads/college_acceptance_data.csv"
data <- read.csv(path)
head(data, n=5)
# Replace "N/A" with NA in the entire dataframe
data_cleaned <- data
data_cleaned[data_cleaned == "N/A"] <- NA
View(data_cleaned)
- Convert percentage columns (e.g., Graduation Rate) to numeric
format.
Solution:
# Load necessary library
library(dplyr)
# Convert percentage columns to numeric
data_cleaned <- data_cleaned %>%
mutate(across(
c('Graduation.Rate', 'Retention.Rate', 'Acceptance.Rate', 'Enrollment.Rate', 'Institutional.Aid.Rate', 'Default.Rate'),
~ as.numeric(gsub("%", "", .))
))
- Transform the “Student to Faculty Ratio” column into two separate
numeric columns: Students and Faculty.
Solution:
#necessary libraries are loaded
library(dplyr)
library(stringr)
# Extract "Students" and "Faculty" into new columns
data_cleaned <- data_cleaned %>%
mutate(
Students = as.numeric(str_extract(Student.to.Faculty.Ratio, "^[0-9]+")),
Faculty = as.numeric(str_extract(Student.to.Faculty.Ratio, "[0-9]+$"))
)
view(data_cleaned)
- What is the count of missing values in the “Default Rate” column?
Impute the missing values in the “Default Rate” column with the median
value.
Solution:
# Count the number of missing values in "Default Rate"
missing_count <- sum(is.na(data_cleaned$'Default.Rate'))
print(missing_count)
[1] 291
# Calculate the median of the "Default Rate" column
median_default_rate <- median(data_cleaned$'Default.Rate', na.rm = TRUE)
# Replace missing values with the median
data_cleaned$'Default.Rate'[is.na(data_cleaned$'Default.Rate')] <- median_default_rate
#View(data_cleaned)
- Find the average graduation rate for universities ranked in the top
50.
Solution:
# Filter the dataset for top 50 universities
top_50_universities <- data_cleaned %>% filter(Rank <= 50)
# Calculate the average graduation rate
average_graduation_rate <- mean(top_50_universities$'Graduation.Rate', na.rm = TRUE)
# Display the result
print(average_graduation_rate)
[1] 79.18
- Filter universities with a retention rate above 90% and find the
count of rows in the subset.
Solution:
# Load dplyr
library(dplyr)
# Filter universities with Retention Rate > 90%
high_retention <- data_cleaned %>% filter(Retention.Rate > 90)
# Count the rows in the subset
high_retention_count <- nrow(high_retention)
# Display the count
print(high_retention_count)
[1] 98
- Rank universities by enrollment rate in descending order and display
the last 6 rows.
Solution:
# Load dplyr
library(dplyr)
# Rank universities by Enrollment Rate in descending order
ranked_data <- data_cleaned %>% arrange(desc(Enrollment.Rate))
# Display the last 6 rows
tail(ranked_data, 6)
NA
- Create a histogram of graduation rates using ggplot2 library.
Solution:
# Load ggplot2
library(ggplot2)
# Create a histogram of Graduation Rates
ggplot(data_cleaned, aes(x = Graduation.Rate)) +
geom_histogram(binwidth = 5, fill = "blue", color = "black", alpha = 0.7) +
labs(title = "Histogram of Graduation Rates",
x = "Graduation Rate (%)",
y = "Count") +
theme_minimal()
Warning: Removed 6 rows containing non-finite outside the scale range (`stat_bin()`).

- Plot a scatterplot between acceptance rate and enrollment rate using
ggplot2 library.
Solution:
# Load ggplot2
library(ggplot2)
# Create a scatterplot
ggplot(data_cleaned, aes(x = Acceptance.Rate, y = Enrollment.Rate)) +
geom_point(color = "blue", size = 2, alpha = 0.7) +
labs(title = "Scatterplot of Acceptance Rate vs Enrollment Rate",
x = "Acceptance Rate (%)",
y = "Enrollment Rate (%)") +
theme_minimal()
Warning: Removed 29 rows containing missing values or values outside the scale range (`geom_point()`).

- Calculate the average default rate by aid rate category (e.g.,
grouped into ranges like 0-20%, 20-40%). Display the categories.
Solution:
# Create Aid Rate categories
data_cleaned$AidRateCategory <- cut(
data_cleaned$Institutional.Aid.Rate,
breaks = c(0, 20, 40, 60, 80, 100),
labels = c("0-20%", "20-40%", "40-60%", "60-80%", "80-100%"),
include.lowest = TRUE
)
# Load dplyr
library(dplyr)
# Calculate average default rate by aid rate category
average_default_rate <- data_cleaned %>%
group_by(AidRateCategory) %>%
summarize(AverageDefaultRate = mean(Default.Rate, na.rm = TRUE))
# Display the result
print(average_default_rate)
view(data_cleaned)
- Normalize the acceptance rate to a scale of 0-1 and save in a new
column “Acceptance Rate Normalized”. Display the first 6 values.
Solution:
library(scales)
# Normalize Acceptance Rate to 0-1 scale
data_cleaned$`Acceptance Rate Normalized` <- (data_cleaned$Acceptance.Rate - min(data_cleaned$Acceptance.Rate, na.rm = TRUE)) /
(max(data_cleaned$Acceptance.Rate, na.rm = TRUE) - min(data_cleaned$Acceptance.Rate, na.rm = TRUE))
# Display the first 6 values of the normalized column
head(data_cleaned$'Acceptance Rate Normalized')
[1] 0.00000000 0.01063830 0.04255319 0.08510638 0.09574468 0.10638298
- What is the count of the duplicate entries in the “School” column?
Remove duplicate university entries.
Solution:
# Count duplicate entries in the "School" column
duplicate_count <- sum(duplicated(data_cleaned$School))
print(duplicate_count)
[1] 3
# Load dplyr
library(dplyr)
# Remove duplicate entries based on "School"
data_cleaned <- data_cleaned %>% distinct(School, .keep_all = TRUE)
- Find the correlation between graduation rate and retention rate
(exclude the NAs in both columns).
Solution:
# Calculate the correlation between Graduation Rate and Retention Rate
correlation <- cor(data_cleaned$Graduation.Rate, data_cleaned$Retention.Rate, use = "complete.obs")
# Display the result
print(correlation)
[1] 0.6159709
- Extract the values in School column into a new variable without
“University” in the string. For example “Rowan University” becomes
“Rowan”
Solution:
# Remove "University" from the School column
#The exaple provided was for schools that had university at as the second portion of the string, hence the solution was written like that.
# Load stringr
library(stringr)
# Remove "University" from the School column
school_names <- str_replace(data_cleaned$School, " University", "")
# Display the result
print(head(school_names))
[1] "Harvard" "Yale" "University of Pennsylvania" "Johns Hopkins"
[5] "Cornell" "Tufts"
- Count how many universities have “Institute” in their name.
Solution:
# Load stringr
library(stringr)
# Count universities with "Institute" in their name
institute_count <- sum(str_detect(data_cleaned$School, "Institute"))
# Display the count
print(institute_count)
[1] 17
- Export the cleaned and processed dataset to a CSV file.
Solution:
file_path <- "~/Downloads/cleaned_college_acceptance_data.csv"
# Save the dataset as a CSV file
write.csv(dataset, file = file_path, row.names = FALSE)
---
title: "INSS615 Homework 5"
output:
  # word_document: default
  html_notebook: default
  html_document:
    df_print: paged
---


**Morgan State University**

**Department of Information Science & Systems**

**Fall 2024**

**INSS 615: Data Wrangling for Visualization**

**Name: Frenandez Lawrence **

*Due: Dec 1, 2024 (Sunday)*



Questions


A. Scrape the College Ranked by Acceptance Rate dataset available at this link: https://www.oedb.org/rankings/acceptance-rate/#table-rankings and select the first 9 columns [Rank, School, Student to Faculty Ratio, Graduation Rate, Retention Rate, Acceptance Rate, Enrollment Rate, Institutional Aid Rate, and Default Rate] as the dataset for this assignment. [20 Points]

Hint: There are 6 pages of data, so you may want to use a for loop to automate the scraping process and combine the data from all 6 pages. This is just a suggestion—you are free to create the dataset without automating the web scrapping process.

 
  Solution:
```{r}
library(rvest) 
library(tidyverse)
library(dplyr)

# Define the base URL and an empty list to store data
base_url <- "https://www.oedb.org/rankings/acceptance-rate/#table-rankings"
base_URL_modified <- "https://www.oedb.org/rankings/acceptance-rate/"
base_URL_end <- "/#table-rankings" 
all_pages_data <- list()


  # Scraping page 1 since base url has different format than others.
  page1_html <- read_html(base_url)
  
  # Extract the table from the HTML
  table_node <- html_node(page1_html, "table")
  
  # Read the table into a dataframe
  
    page1_data <- html_table(table_node, fill = TRUE)
    all_pages_data[[1]] <- page1_data
    
    #view(all_pages_data)

# Loop through the other 5 pages
for (page in 2:6) {
  # Construct the URL for each page (adjust if necessary)
 page_url <- paste0(base_URL_modified, "page/", page, base_URL_end)
  
  
  # Scrape the page
  page_html <- read_html(page_url)
  
  # Extract the table from the HTML
  table_node <- html_node(page_html, "table")
  
  # Read the table into a dataframe
  if (!is.null(table_node)) {
    page_data <- html_table(table_node, fill = TRUE)
    all_pages_data[[page]] <- page_data
  } else {
    message("Table not found on page: ", page)
  }
}

# Combine data from all pages into a single dataframe
dataset <- bind_rows(all_pages_data)

# Select the first 9 required columns
selected_columns <- c(
  "Rank", "School", "Student to Faculty Ratio", "Graduation Rate",
  "Retention Rate", "Acceptance Rate", "Enrollment Rate", 
  "Institutional Aid Rate", "Default Rate"
)
dataset <- dataset %>% select(all_of(selected_columns))

# Replace "N/A" with NA
#dataset <- dataset %>% mutate(across(everything(), ~ na_if(.x, "N/A")))

# Display the cleaned dataset
print(dataset)

file_path <- "~/Downloads/college_acceptance_data.csv"

# Save the dataset as a CSV file
write.csv(dataset, file = file_path, row.names = FALSE)

# Message to confirm saving
cat("Dataset saved as:", file_path)


# multipage scraping using a for loop


```

B. You are going to need the dataset created in Question A to answer the following questions. There are 16 questions each carrying 5 points:

1. Replace the missing values "N/A" in the dataset with NA.


  Solution:
```{r}
path = "/Users/frenandezlawrence/Downloads/college_acceptance_data.csv"
data <- read.csv(path)
head(data, n=5)

# Replace "N/A" with NA in the entire dataframe
data_cleaned <- data
data_cleaned[data_cleaned == "N/A"] <- NA


View(data_cleaned)
```

2. Convert percentage columns (e.g., Graduation Rate) to numeric format.

  
  Solution:
```{r}
# Load necessary library
library(dplyr)

# Convert percentage columns to numeric
data_cleaned <- data_cleaned %>%
  mutate(across(
    c('Graduation.Rate', 'Retention.Rate', 'Acceptance.Rate', 'Enrollment.Rate', 'Institutional.Aid.Rate', 'Default.Rate'), 
    ~ as.numeric(gsub("%", "", .))
  ))

```


3. Transform the "Student to Faculty Ratio" column into two separate numeric columns: Students and Faculty.


  Solution:
```{r}
#necessary libraries are loaded

library(dplyr)
library(stringr)

# Extract "Students" and "Faculty" into new columns
data_cleaned <- data_cleaned %>%
  mutate(
    Students = as.numeric(str_extract(Student.to.Faculty.Ratio, "^[0-9]+")),
    Faculty = as.numeric(str_extract(Student.to.Faculty.Ratio, "[0-9]+$"))
  )


view(data_cleaned)
```




4. What is the count of missing values in the "Default Rate" column? Impute the missing values in the "Default Rate" column with the median value.


  Solution:
```{r}
# Count the number of missing values in "Default Rate"
missing_count <- sum(is.na(data_cleaned$'Default.Rate'))
print(missing_count)

# Calculate the median of the "Default Rate" column
median_default_rate <- median(data_cleaned$'Default.Rate', na.rm = TRUE)

# Replace missing values with the median
data_cleaned$'Default.Rate'[is.na(data_cleaned$'Default.Rate')] <- median_default_rate

#View(data_cleaned)
```


5. Find the average graduation rate for universities ranked in the top 50.


  Solution:
```{r}
# Filter the dataset for top 50 universities
top_50_universities <- data_cleaned %>% filter(Rank <= 50)

# Calculate the average graduation rate
average_graduation_rate <- mean(top_50_universities$'Graduation.Rate', na.rm = TRUE)

# Display the result
print(average_graduation_rate)


```


6. Filter universities with a retention rate above 90% and find the count of rows in the subset.


  Solution:
```{r}
# Load dplyr
library(dplyr)

# Filter universities with Retention Rate > 90%
high_retention <- data_cleaned %>% filter(Retention.Rate > 90)

# Count the rows in the subset
high_retention_count <- nrow(high_retention)

# Display the count
print(high_retention_count)

```


7. Rank universities by enrollment rate in descending order and display the last 6 rows.


  Solution:
```{r}
# Load dplyr
library(dplyr)

# Rank universities by Enrollment Rate in descending order
ranked_data <- data_cleaned %>% arrange(desc(Enrollment.Rate))

# Display the last 6 rows
tail(ranked_data, 6)

```


8. Create a histogram of graduation rates using ggplot2 library.


  Solution:
```{r}
# Load ggplot2
library(ggplot2)

# Create a histogram of Graduation Rates
ggplot(data_cleaned, aes(x = Graduation.Rate)) +
  geom_histogram(binwidth = 5, fill = "blue", color = "black", alpha = 0.7) +
  labs(title = "Histogram of Graduation Rates",
       x = "Graduation Rate (%)",
       y = "Count") +
  theme_minimal()


```


9. Plot a scatterplot between acceptance rate and enrollment rate using ggplot2 library.


  Solution:
```{r}
# Load ggplot2
library(ggplot2)

# Create a scatterplot
ggplot(data_cleaned, aes(x = Acceptance.Rate, y = Enrollment.Rate)) +
  geom_point(color = "blue", size = 2, alpha = 0.7) +
  labs(title = "Scatterplot of Acceptance Rate vs Enrollment Rate",
       x = "Acceptance Rate (%)",
       y = "Enrollment Rate (%)") +
  theme_minimal()


```


10. Calculate the average default rate by aid rate category (e.g., grouped into ranges like 0-20%, 20-40%). Display the categories.


  Solution:
```{r}
# Create Aid Rate categories
data_cleaned$AidRateCategory <- cut(
  data_cleaned$Institutional.Aid.Rate,
  breaks = c(0, 20, 40, 60, 80, 100),
  labels = c("0-20%", "20-40%", "40-60%", "60-80%", "80-100%"),
  include.lowest = TRUE
)

# Load dplyr
library(dplyr)

# Calculate average default rate by aid rate category
average_default_rate <- data_cleaned %>%
  group_by(AidRateCategory) %>%
  summarize(AverageDefaultRate = mean(Default.Rate, na.rm = TRUE))

# Display the result
print(average_default_rate)

view(data_cleaned)
```




11. Normalize the acceptance rate to a scale of 0-1 and save in a new column "Acceptance Rate Normalized". Display the first 6 values.


  Solution:
 
```{r}
library(scales)
# Normalize Acceptance Rate to 0-1 scale
data_cleaned$`Acceptance Rate Normalized` <- (data_cleaned$Acceptance.Rate - min(data_cleaned$Acceptance.Rate, na.rm = TRUE)) / 
                                             (max(data_cleaned$Acceptance.Rate, na.rm = TRUE) - min(data_cleaned$Acceptance.Rate, na.rm = TRUE))

# Display the first 6 values of the normalized column
head(data_cleaned$'Acceptance Rate Normalized')


```

12. What is the count of the duplicate entries in the "School" column? Remove duplicate university entries.


 Solution:

```{r}
# Count duplicate entries in the "School" column
duplicate_count <- sum(duplicated(data_cleaned$School))
print(duplicate_count)

# Load dplyr
library(dplyr)

# Remove duplicate entries based on "School"
data_cleaned <- data_cleaned %>% distinct(School, .keep_all = TRUE)


```


13. Find the correlation between graduation rate and retention rate (exclude the NAs in both columns).


 Solution:

```{r}
# Calculate the correlation between Graduation Rate and Retention Rate
correlation <- cor(data_cleaned$Graduation.Rate, data_cleaned$Retention.Rate, use = "complete.obs")

# Display the result
print(correlation)


```



14. Extract the values in School column into a new variable without "University" in the string. For example "Rowan University" becomes "Rowan"


 Solution:

```{r}
# Remove "University" from the School column
#The exaple provided was for schools that had university at as the second portion of the string, hence the solution was written like that. 

# Load stringr
library(stringr)

# Remove "University" from the School column
school_names <- str_replace(data_cleaned$School, " University", "")


# Display the result
print(head(school_names))

```




15. Count how many universities have "Institute" in their name.


 Solution:

```{r}
# Load stringr
library(stringr)

# Count universities with "Institute" in their name
institute_count <- sum(str_detect(data_cleaned$School, "Institute"))

# Display the count
print(institute_count)



```

16. Export the cleaned and processed dataset to a CSV file.


 Solution:

```{r}
file_path <- "~/Downloads/cleaned_college_acceptance_data.csv"

# Save the dataset as a CSV file
write.csv(dataset, file = file_path, row.names = FALSE)


```

