Morgan State University
Department of Information Science & Systems
Fall 2024
INSS 615: Data Wrangling for Visualization
Name: Cristina Sanchez
Due: Dec 1, 2024 (Sunday)
Questions
A. Scrape the College Ranked by Acceptance Rate dataset available at
this link: https://www.oedb.org/rankings/acceptance-rate/#table-rankings
and select the first 9 columns [Rank, School, Student to Faculty Ratio,
Graduation Rate, Retention Rate, Acceptance Rate, Enrollment Rate,
Institutional Aid Rate, and Default Rate] as the dataset for this
assignment. [20 Points]
Hint: There are 6 pages of data, so you may want to use a for loop to
automate the scraping process and combine the data from all 6 pages.
This is just a suggestion—you are free to create the dataset without
automating the web scrapping process.
Solution:
library(rvest)
library(tidyverse)
#Initialize an empty list to store the data from all pages
all_data <- list()
#Loop through the pages (1 to 6)
for (i in 1:6) {
#Construct the URL for the page using the pager number
url <- paste0("https://www.oedb.org/rankings/acceptance-rate/page/"
,i, "/#table-rankings")
#Read the HTML content of the page
page <- read_html(url)
#Extract the table using the XPath
table <- page %>%
html_node(xpath = "/html/body/div[1]/div/div[1]/table") %>%
html_table(fill = TRUE)
#Select the first 9 columns
table_subset <- table %>%
select(1:9)
#Append the cleaned table data to the list
all_data[[i]] <- table_subset
}
#Combine all data into one data frame
final_data <- bind_rows(all_data)
#View the combined data
head(final_data)
B. You are going to need the dataset created in Question A to answer
the following questions. There are 16 questions each carrying 5
points:
- Replace the missing values “N/A” in the dataset with NA.
Solution:
#Replace "N/A" with NA for character columns
final_data_clean <- final_data %>%
mutate(across(where(is.character), ~ na_if(.x, "N/A")))
#View the cleaned data
head(final_data_clean)
- Convert percentage columns (e.g., Graduation Rate) to numeric
format.
Solution:
#Initialize vector with column names of percentage variables
per_col <- c("Graduation Rate", "Retention Rate", "Acceptance Rate",
"Enrollment Rate", "Institutional Aid Rate", "Default Rate")
#Convert percentage columns to numeric
final_data_clean <- final_data_clean %>%
mutate(across(per_col, ~ as.numeric(gsub("%", "", .x)) / 100))
#View the cleaned data
head(final_data_clean)
- Transform the “Student to Faculty Ratio” column into two separate
numeric columns: Students and Faculty.
Solution:
#Transform "Student to Faculty Ratio" into two separate columns: "Students" and "Faculty"
final_data_clean <- final_data_clean %>%
mutate(
`Students` = as.numeric(sub(" to .*", "", `Student to Faculty Ratio`)), #Extract number of Students
`Faculty` = as.numeric(sub(".*to ", "", `Student to Faculty Ratio`)) #Extract number of Faculty
) #%>% select(-`Student to Faculty Ratio`) #Remove the original column
# View the cleaned data
head(final_data_clean)
NA
- What is the count of missing values in the “Default Rate” column?
Impute the missing values in the “Default Rate” column with the median
value.
Solution:
#Count the number of missing values in the "Default Rate" column
na_cnt <- sum(is.na(final_data_clean$`Default Rate`))
print(paste("Number of missing values in 'Default Rate':", na_cnt))
[1] "Number of missing values in 'Default Rate': 291"
#Impute missing values in "Default Rate" with the median of the column
final_data_clean <- final_data_clean %>%
mutate(
`Default Rate` = ifelse(
is.na(`Default Rate`),
median(`Default Rate`, na.rm = TRUE), #Use the median, ignoring NA values
`Default Rate` #Keep the original value if not missing
)
)
#View the cleaned data
head(final_data_clean)
- Find the average graduation rate for universities ranked in the top
50.
Solution:
#Calculate the average graduation rate for universities ranked in the top 50
avg_grad_rate_top_50 <- final_data_clean %>%
filter(Rank <= 50) %>% # Filter for top 50 ranked universities
summarise(avg_grad_rate = mean(`Graduation Rate`, na.rm = TRUE)) #Calculate the average
#View the result
print(avg_grad_rate_top_50)
- Filter universities with a retention rate above 90% and find the
count of rows in the subset.
Solution:
uni_abv_90_retn <- final_data_clean %>%
filter(`Retention Rate` > .90) %>% #Filter for retention rate above 90%
summarise(count = n()) #Count the number of rows in the subset
#View the count of universities
print(uni_abv_90_retn)
- Rank universities by enrollment rate in descending order and display
the last 6 rows.
Solution:
last_6_uni <- final_data_clean %>%
arrange(desc(`Enrollment Rate`)) %>% #Sort by Enrollment Rate in descending order
filter(!is.na(`Enrollment Rate`)) %>%
tail(6) #Get the last 6 rows
#View the last 6 universities
print(last_6_uni)
- Create a histogram of graduation rates using ggplot2 library.
Solution:
library(ggplot2)
final_data_clean %>%
filter(!is.na(`Graduation Rate`), is.finite(`Graduation Rate`)) %>%
ggplot(aes(x = `Graduation Rate`)) +
geom_histogram(binwidth = 0.05, fill = "skyblue", color = "black", alpha = 0.7) +
labs(
title = "Histogram of Graduation Rates",
x = "Graduation Rate (%)",
y = "Count"
) +
theme_minimal()

- Plot a scatterplot between acceptance rate and enrollment rate using
ggplot2 library.
Solution:
ggplot(final_data_clean %>%
filter(!is.na(`Acceptance Rate`), !is.na(`Enrollment Rate`)), #Remove rows with NA in either column
aes(x = `Acceptance Rate`, y = `Enrollment Rate`)) +
geom_point(color = "blue", alpha = 0.7) +
labs(
title = "Scatterplot of Acceptance Rate vs Enrollment Rate",
x = "Acceptance Rate (%)",
y = "Enrollment Rate (%)"
) +
theme_minimal()

- Calculate the average default rate by aid rate category (e.g.,
grouped into ranges like 0-20%, 20-40%). Display the categories.
Solution:
library(dplyr)
avg_dflt_rate_by_aid_ctgry <- final_data_clean %>%
mutate(
# Create categories for Institutional Aid Rate
aid_rate_ctgry = case_when(
`Institutional Aid Rate` <= 0.2 ~ "0-20%",
`Institutional Aid Rate` <= 0.4 ~ "20-40%",
`Institutional Aid Rate` <= 0.6 ~ "40-60%",
`Institutional Aid Rate` <= 0.8 ~ "60-80%",
`Institutional Aid Rate` > 0.8 ~ "80-100%",
)
) %>%
# Group by aid rate category and calculate the average default rate
group_by(aid_rate_ctgry) %>%
summarise(
avg_dflt_rate = mean(`Default Rate`, na.rm = TRUE)
) %>%
#Arrange in the correct order of categories
mutate(aid_rate_ctgry = factor(aid_rate_ctgry,
levels = c("0-20%", "20-40%", "40-60%", "60-80%", "80-100%"))) %>%
arrange(aid_rate_ctgry)
# Display the result
print(avg_dflt_rate_by_aid_ctgry)
- Normalize the acceptance rate to a scale of 0-1 and save in a new
column “Acceptance Rate Normalized”. Display the first 6 values.
Solution:
library(scales)
final_data_clean2 <- final_data_clean %>%
mutate(
`Acceptance Rate Normalized` = rescale(`Acceptance Rate`)
)
#Display the first 6 rows of the updated data
head(final_data_clean)
- What is the count of the duplicate entries in the “School” column?
Remove duplicate university entries.
Solution:
#Count the duplicate entries in the "School" column
duplicate_count <- final_data_clean2 %>%
count(`School`) %>%
filter(n > 1)
#Display the count of duplicates
print(duplicate_count)
# Remove duplicate universities from the "School" column,
final_data_clean2 <- final_data_clean2 %>%
distinct(`School`, .keep_all = TRUE)
- Find the correlation between graduation rate and retention rate
(exclude the NAs in both columns).
Solution:
# Calculate the correlation between Graduation Rate and Retention Rate, excluding NA values
correlation <- cor(final_data_clean2$`Graduation Rate`, final_data_clean2$`Retention Rate`, use = "complete.obs")
# Display the correlation result
print(correlation)
[1] 0.6159709
- Extract the values in School column into a new variable without
“University” in the string. For example “Rowan University” becomes
“Rowan”
Solution:
#Create a new column without the word "University" in the "School" name
final_data_clean3 <- final_data_clean2 %>%
mutate(
`School Name` = gsub(" University$", "", `School`)
)
# Display the first few rows to check the result
head(final_data_clean3)
- Count how many universities have “Institute” in their name.
Solution:
#Count how many universities have "Institute" in their name
institute_count <- sum(grepl("Institute", final_data_clean3$`School`, ignore.case = TRUE))
#Display the count
print(institute_count)
[1] 17
- Export the cleaned and processed dataset to a CSV file.
Solution:
library(readr)
write_csv(final_data_clean3, "University_Ranking_Clean.csv", na = "")
---
title: "INSS615 Homework 5"
output:
  # word_document: default
  html_notebook: default
  html_document:
    df_print: paged
---


**Morgan State University**

**Department of Information Science & Systems**

**Fall 2024**

**INSS 615: Data Wrangling for Visualization**

**Name: Cristina Sanchez**

*Due: Dec 1, 2024 (Sunday)*



Questions


A. Scrape the College Ranked by Acceptance Rate dataset available at this link: https://www.oedb.org/rankings/acceptance-rate/#table-rankings and select the first 9 columns [Rank, School, Student to Faculty Ratio, Graduation Rate, Retention Rate, Acceptance Rate, Enrollment Rate, Institutional Aid Rate, and Default Rate] as the dataset for this assignment. [20 Points]

Hint: There are 6 pages of data, so you may want to use a for loop to automate the scraping process and combine the data from all 6 pages. This is just a suggestion—you are free to create the dataset without automating the web scrapping process.

 
  Solution:
```{r}
library(rvest) 
library(tidyverse)

#Initialize an empty list to store the data from all pages
all_data <- list()

#Loop through the pages (1 to 6)
for (i in 1:6) {
  
  #Construct the URL for the page using the pager number
  url <- paste0("https://www.oedb.org/rankings/acceptance-rate/page/"
                ,i, "/#table-rankings")
  
  #Read the HTML content of the page
  page <- read_html(url)
  
  #Extract the table using the XPath
  table <- page %>%
    html_node(xpath = "/html/body/div[1]/div/div[1]/table") %>%
    html_table(fill = TRUE)
  
    #Select the first 9 columns
  table_subset <- table %>% 
    select(1:9)
  
  #Append the cleaned table data to the list
  all_data[[i]] <- table_subset
}

#Combine all data into one data frame
final_data <- bind_rows(all_data)

#View the combined data
head(final_data)
```


B. You are going to need the dataset created in Question A to answer the following questions. There are 16 questions each carrying 5 points:

1. Replace the missing values "N/A" in the dataset with NA.


  Solution:
```{r}
#Replace "N/A" with NA for character columns
final_data_clean <- final_data %>%
  mutate(across(where(is.character), ~ na_if(.x, "N/A")))

#View the cleaned data
head(final_data_clean)
```

2. Convert percentage columns (e.g., Graduation Rate) to numeric format.

  
  Solution:
```{r}
#Initialize vector with column names of percentage variables
per_col <- c("Graduation Rate", "Retention Rate", "Acceptance Rate", 
             "Enrollment Rate", "Institutional Aid Rate", "Default Rate")

#Convert percentage columns to numeric
final_data_clean <- final_data_clean %>% 
  mutate(across(per_col, ~ as.numeric(gsub("%", "", .x)) / 100))

#View the cleaned data
head(final_data_clean)
```


3. Transform the "Student to Faculty Ratio" column into two separate numeric columns: Students and Faculty.


  Solution:
```{r}
#Transform "Student to Faculty Ratio" into two separate columns: "Students" and "Faculty"
final_data_clean <- final_data_clean %>%
  mutate(
    `Students` = as.numeric(sub(" to .*", "", `Student to Faculty Ratio`)),  #Extract number of Students
    `Faculty` = as.numeric(sub(".*to ", "", `Student to Faculty Ratio`))   #Extract number of Faculty
  )

#View the cleaned data
head(final_data_clean)
```




4. What is the count of missing values in the "Default Rate" column? Impute the missing values in the "Default Rate" column with the median value.


  Solution:
```{r}
#Count the number of missing values in the "Default Rate" column
na_cnt <- sum(is.na(final_data_clean$`Default Rate`))

print(paste("Number of missing values in 'Default Rate':", na_cnt))

#Impute missing values in "Default Rate" with the median of the column
final_data_clean <- final_data_clean %>%
  mutate(
    `Default Rate` = ifelse(
      is.na(`Default Rate`), 
      median(`Default Rate`, na.rm = TRUE),  #Use the median, ignoring NA values
      `Default Rate`  #Keep the original value if not missing
    )
  )

#View the cleaned data
head(final_data_clean)
```


5. Find the average graduation rate for universities ranked in the top 50.


  Solution:
```{r}
#Calculate the average graduation rate for universities ranked in the top 50
avg_grad_rate_top_50 <- final_data_clean %>%
  filter(Rank <= 50) %>%  #Filter for top 50 ranked universities
  summarise(avg_grad_rate = mean(`Graduation Rate`, na.rm = TRUE))  #Calculate the average

#View the result
print(avg_grad_rate_top_50)
```


6. Filter universities with a retention rate above 90% and find the count of rows in the subset.


  Solution:
```{r}
uni_abv_90_retn <- final_data_clean %>%
  filter(`Retention Rate` > .90) %>%  #Filter for retention rate above 90%
  summarise(count = n())  #Count the number of rows in the subset

#View the count of universities
print(uni_abv_90_retn)
```


7. Rank universities by enrollment rate in descending order and display the last 6 rows.


  Solution:
```{r}
last_6_uni <- final_data_clean %>%
  arrange(desc(`Enrollment Rate`)) %>%  #Sort by Enrollment Rate in descending order
  filter(!is.na(`Enrollment Rate`)) %>% #Remove rows with NA in Enrollment Rate
  tail(6)  #Get the last 6 rows

#View the last 6 universities
print(last_6_uni)
```


8. Create a histogram of graduation rates using ggplot2 library.


  Solution:
```{r}
library(ggplot2)

final_data_clean %>%
  filter(!is.na(`Graduation Rate`), is.finite(`Graduation Rate`)) %>%
  ggplot(aes(x = `Graduation Rate`)) +
  geom_histogram(binwidth = 0.05, fill = "skyblue", color = "black", alpha = 0.7) +
  labs(
    title = "Histogram of Graduation Rates",
    x = "Graduation Rate (%)",
    y = "Count"
  ) +
  theme_minimal()
```


9. Plot a scatterplot between acceptance rate and enrollment rate using ggplot2 library.


  Solution:
```{r}
ggplot(final_data_clean %>%
         filter(!is.na(`Acceptance Rate`), !is.na(`Enrollment Rate`)),  #Remove rows with NA in either column
       aes(x = `Acceptance Rate`, y = `Enrollment Rate`)) +
  geom_point(color = "blue", alpha = 0.7) +  
  labs(
    title = "Scatterplot of Acceptance Rate vs Enrollment Rate",
    x = "Acceptance Rate (%)",
    y = "Enrollment Rate (%)"
  ) +
  theme_minimal()
```


10. Calculate the average default rate by aid rate category (e.g., grouped into ranges like 0-20%, 20-40%). Display the categories.


  Solution:
```{r}
avg_dflt_rate_by_aid_ctgry <- final_data_clean %>%
  mutate(
    #Create categories for Institutional Aid Rate
    aid_rate_ctgry = case_when(
      `Institutional Aid Rate` <= 0.2 ~ "0-20%",
      `Institutional Aid Rate` <= 0.4 ~ "20-40%",
      `Institutional Aid Rate` <= 0.6 ~ "40-60%",
      `Institutional Aid Rate` <= 0.8 ~ "60-80%",
      `Institutional Aid Rate` > 0.8 ~ "80-100%",
    )
  ) %>%
  #Group by aid rate category and calculate the average default rate
  group_by(aid_rate_ctgry) %>%
  summarise(
    avg_dflt_rate = mean(`Default Rate`, na.rm = TRUE)
  ) %>%
  #Arrange in the correct order of categories
  mutate(aid_rate_ctgry = factor(aid_rate_ctgry, 
                                    levels = c("0-20%", "20-40%", "40-60%", "60-80%", "80-100%"))) %>%
  arrange(aid_rate_ctgry)

#Display the result
print(avg_dflt_rate_by_aid_ctgry)
```




11. Normalize the acceptance rate to a scale of 0-1 and save in a new column "Acceptance Rate Normalized". Display the first 6 values.


  Solution:
 
```{r}
library(scales)

final_data_clean2 <- final_data_clean %>%
  mutate(
    `Acceptance Rate Normalized` = rescale(`Acceptance Rate`)
  )

#Display the first 6 rows of the updated data
head(final_data_clean)
```

12. What is the count of the duplicate entries in the "School" column? Remove duplicate university entries.


 Solution:

```{r}
#Count the duplicate entries in the "School" column
duplicate_count <- final_data_clean2 %>%
  count(`School`) %>%
  filter(n > 1)

#Display the count of duplicates
print(duplicate_count)

#Remove duplicate universities from the "School" column,
final_data_clean2 <- final_data_clean2 %>%
  distinct(`School`, .keep_all = TRUE)
```


13. Find the correlation between graduation rate and retention rate (exclude the NAs in both columns).


 Solution:

```{r}
#Calculate the correlation between Graduation Rate and Retention Rate, excluding NA values
correlation <- cor(final_data_clean2$`Graduation Rate`, final_data_clean2$`Retention Rate`, use = "complete.obs")

#Display the correlation result
print(correlation)
```



14. Extract the values in School column into a new variable without "University" in the string. For example "Rowan University" becomes "Rowan"


 Solution:

```{r}
#Create a new column without the word "University" in the "School" name
final_data_clean3 <- final_data_clean2 %>%
  mutate(
    `School Name` = gsub(" University$", "", `School`)
  )

#Display the first few rows to check the result
head(final_data_clean3)
```




15. Count how many universities have "Institute" in their name.


 Solution:

```{r}
#Count how many universities have "Institute" in their name
institute_count <- sum(grepl("Institute", final_data_clean3$`School`, ignore.case = TRUE))

#Display the count
print(institute_count)
```

16. Export the cleaned and processed dataset to a CSV file.


 Solution:

```{r}
library(readr)

write_csv(final_data_clean3, "University_Ranking_Clean.csv", na = "")
```

