Morgan State University
Department of Information Science & Systems
Fall 2024
INSS 615: Data Wrangling for Visualization
Name: Teearha Hill
Due: Dec 1, 2024 (Sunday)
Questions
A. Scrape the College Ranked by Acceptance Rate dataset available at this link: https://www.oedb.org/rankings/acceptance-rate/#table-rankings and select the first 9 columns [Rank, School, Student to Faculty Ratio, Graduation Rate, Retention Rate, Acceptance Rate, Enrollment Rate, Institutional Aid Rate, and Default Rate] as the dataset for this assignment. [20 Points]
Hint: There are 6 pages of data, so you may want to use a for loop to automate the scraping process and combine the data from all 6 pages. This is just a suggestion—you are free to create the dataset without automating the web scraping process.
Solution:
library(rvest)
library(tidyverse)
# multipage scraping using a for loop
B. You are going to need the dataset created in Question A to answer the following questions. There are 16 questions each carrying 5 points:
Solution:
# Show only the Default.Rate column of the dataset
select(oedb, Default.Rate)
Solution:
# Turn the Graduation.Rate percentage strings into numeric fractions:
# drop the literal "%" sign, parse as a number, then divide by 100.
# The source column must be read from `oedb` itself so the values line up
# row-for-row with the data frame being updated.
oedb$Graduation.Rate <- as.numeric(
  gsub("%", "", oedb$Graduation.Rate, fixed = TRUE)
) / 100
# Display the cleaned data frame
print(oedb)
Solution:
# Load the packages used here: dplyr supplies %>%, tidyr supplies separate()
library(dplyr)
library(tidyr)
# Split the student-to-faculty ratio column into "Students" and "Faculty".
# The separator accepts either a colon or the word "to", with optional
# surrounding whitespace, because splitting on ":" alone leaves every row
# without a second piece in this dataset.
# NOTE(review): assumes the scraped values look like "7 to 1" or "7:1" --
# confirm against the raw column before relying on the result.
# convert = TRUE turns the resulting pieces into numbers where possible.
oedb <- oedb %>%
  separate(
    X.Student.to.Faculty.Ratio,
    into = c("Students", "Faculty"),
    sep = "[[:space:]]*(:|to)[[:space:]]*",
    convert = TRUE
  )
Warning: Expected 2 pieces. Missing pieces filled with `NA` in 571 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# Show just the two columns produced by the split
oedb[c("Students", "Faculty")]
NA
Solution:
library(dplyr)

# Small illustrative data frame (independent of the `oedb` dataset)
df <- data.frame(DefaultRate = c(10, 15, NA, 20, NA, 25))

# 1. Tally the NA entries in oedb's Default.Rate column and report the total
missing_count <- oedb$Default.Rate %>%
  is.na() %>%
  sum()
print(paste("Missing values in Default Rate column:", missing_count))
[1] "Missing values in Default Rate column: 291"
# 2. Impute missing values with the median of the "Default Rate" column.
# The column is first converted to numeric (dropping any "%" sign), because
# median() cannot be computed on text values; entries that are not numbers
# become NA during the conversion and are imputed along with the rest.
# coalesce() keeps each existing value and fills only the NA entries.
oedb <- oedb %>%
  mutate(
    Default.Rate = as.numeric(
      sub("%", "", as.character(Default.Rate), fixed = TRUE)
    ),
    Default.Rate = coalesce(Default.Rate, median(Default.Rate, na.rm = TRUE))
  )
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `Default.Rate = ifelse(...)`.
Caused by warning in `mean.default()`:
! argument is not numeric or logical: returning NA
# Display the updated data frame (the imputation was applied to `oedb`)
print(oedb)
NA
NA
Solution:
library(dplyr)
# 1. Filter and calculate the average graduation rate for the top 50
# universities, computed from the `oedb` dataset itself (no stand-in data, so
# `oedb` is left intact for the later questions).
# Rank and Graduation.Rate are coerced to numeric inside the pipeline only;
# any "%" sign is dropped first, and NA rates are excluded from the mean.
# NOTE(review): assumes the rank column is named `Rank` -- confirm against
# names(oedb).
average_graduation_rate <- oedb %>%
  mutate(
    Rank = as.numeric(as.character(Rank)),
    Graduation.Rate = as.numeric(
      sub("%", "", as.character(Graduation.Rate), fixed = TRUE)
    )
  ) %>%
  filter(Rank <= 50) %>%
  summarize(AverageGraduationRate = mean(Graduation.Rate, na.rm = TRUE)) %>%
  pull(AverageGraduationRate)
print(paste("Average Graduation Rate for Top 50 Universities:", average_graduation_rate))
[1] "Average Graduation Rate for Top 50 Universities: 88.75"
Solution:
# Convert Retention.Rate to numeric, dropping the "%" sign
oedb$Retention.Rate <- as.numeric(sub("%", "", oedb$Retention.Rate))
# Keep universities with a retention rate above 90%.
# which() returns only the positions where the comparison is TRUE; indexing
# with the raw logical vector would add an all-NA row for every missing
# retention rate and inflate the count below.
filtered_df <- oedb[which(oedb$Retention.Rate > 90), ]
# Count the number of rows in the subset and display it
count_rows <- nrow(filtered_df)
count_rows
Solution:
# Strip the percent sign from Enrollment.Rate and store the column as numeric
oedb[["Enrollment.Rate"]] <- as.numeric(sub("%", "", oedb[["Enrollment.Rate"]]))
Warning: NAs introduced by coercion
# Order rows from highest to lowest enrollment rate (rows with NA sort last)
df_sorted <- oedb[with(oedb, order(-Enrollment.Rate)), ]
# Show the final six rows of the ordered data
tail(df_sorted, n = 6)
Solution:
library(ggplot2)
# Make Graduation.Rate numeric by dropping the percent sign.
# NOTE(review): the histogram that follows labels this axis in percent --
# confirm the column has not already been rescaled to 0-1 at this point.
oedb[["Graduation.Rate"]] <- as.numeric(sub("%", "", oedb[["Graduation.Rate"]]))
Warning: NAs introduced by coercion
# Histogram of graduation rates using bins 5 units wide
ggplot(data = oedb, mapping = aes(x = Graduation.Rate)) +
  geom_histogram(binwidth = 5, fill = "blue", color = "black") +
  labs(
    title = "Histogram of Graduation Rates",
    x = "Graduation Rate (%)",
    y = "Frequency"
  )
Solution:
library(ggplot2)
# Make Acceptance.Rate numeric by dropping the percent sign
oedb[["Acceptance.Rate"]] <- as.numeric(sub("%", "", oedb[["Acceptance.Rate"]]))
Warning: NAs introduced by coercion
# Scatterplot with acceptance rate on the x-axis and enrollment rate on the y-axis
ggplot(data = oedb, mapping = aes(x = Acceptance.Rate, y = Enrollment.Rate)) +
  geom_point(color = "blue") +
  labs(
    title = "Scatterplot of Acceptance Rate vs Enrollment Rate",
    x = "Acceptance Rate (%)",
    y = "Enrollment Rate (%)"
  )
Solution:
# Make the institutional aid rate and default rate columns numeric,
# dropping the percent sign from each
oedb[["Institutional.Aid.Rate"]] <- as.numeric(sub("%", "", oedb[["Institutional.Aid.Rate"]]))
oedb[["Default.Rate"]] <- as.numeric(sub("%", "", oedb[["Default.Rate"]]))
Warning: NAs introduced by coercion
# Bucket the institutional aid rate into 20-point bands from 0 to 100;
# bands are closed on the left, and the top band also includes 100
oedb$aid_rate_category <- cut(
  oedb$Institutional.Aid.Rate,
  breaks = seq(0, 100, by = 20),
  include.lowest = TRUE,
  right = FALSE
)
# Mean default rate within each aid-rate band
average_default.rate <- aggregate(
  Default.Rate ~ aid_rate_category,
  data = oedb,
  FUN = mean,
  na.rm = TRUE
)
# Show each band alongside its mean default rate
average_default.rate
NA
Solution:
# Min-max scale the acceptance rate onto 0-1
oedb$Acceptance.Rate <- as.numeric(sub("%", "", oedb$Acceptance.Rate))
oedb$`Acceptance Rate Normalized` <- local({
  rate <- oedb$Acceptance.Rate
  # bounds[1] is the smallest non-missing rate, bounds[2] the largest
  bounds <- range(rate, na.rm = TRUE)
  (rate - bounds[1]) / (bounds[2] - bounds[1])
})
# Show the first six scaled values
head(oedb$`Acceptance Rate Normalized`, n = 6)
[1] 0.00000000 0.01063830 0.04255319 0.08510638 0.09574468 0.10638298
Solution:
# Count the duplicate entries in the "School" column
# (duplicated() flags every repeat after the first occurrence of a name)
duplicate_count <- sum(duplicated(oedb$School))
# Remove duplicate university entries, keeping the first row for each school.
# The mask must be built from oedb$School so it has one entry per row of
# `oedb`; a mask built from any other object selects the wrong rows.
df_unique <- oedb[!duplicated(oedb$School), ]
duplicate_count
[1] 3
Solution:
# Convert graduation rate and retention rate to numeric. Both columns get the
# same treatment (drop any "%" sign, then parse) so this step also works when
# Retention.Rate still holds percentage text; already-numeric values pass
# through unchanged.
oedb$Graduation.Rate <- as.numeric(sub("%", "", oedb$Graduation.Rate))
oedb$Retention.Rate <- as.numeric(sub("%", "", oedb$Retention.Rate))
# Correlation between graduation rate and retention rate, using only rows
# where both values are present
correlation <- cor(oedb$Graduation.Rate, oedb$Retention.Rate, use = "complete.obs")
correlation
[1] 0.6169712
Solution:
# Build a copy of the School names with every " University" (leading space
# included) removed.
# NOTE(review): names that begin with "University" have no leading space and
# are left unchanged, and names such as "The University of ..." lose the word
# mid-phrase -- confirm whether the intent is to strip the word or to keep
# only schools whose name lacks it.
school_names <- gsub(
  pattern = " University",
  replacement = "",
  x = oedb$School,
  fixed = TRUE
)
# Show the resulting vector
print(school_names)
[1] "Harvard" "Yale"
[3] "University of Pennsylvania" "Johns Hopkins"
[5] "Cornell" "Tufts"
[7] "University of California-Berkeley" "University of California-Los Angeles"
[9] "Georgetown" "Washington in St Louis"
[11] "University of Notre Dame" "Liberty"
[13] "Emory" "New Mexico Institute of Mining and Technology"
[15] "Piedmont International" "Carnegie Mellon"
[17] "California Institute of the Arts" "Maine College of Health Professions"
[19] "Babson College" "University of Michigan-Ann Arbor"
[21] "Northeastern" "Southwestern Assemblies of God"
[23] "Berklee College of Music" "Boston College"
[25] "Wake Forest" "Tulane of Louisiana"
[27] "Lehigh" "University of Virginia-Main Campus"
[29] "California Polytechnic State-San Luis Obispo" "University of North Carolina at Chapel Hill"
[31] "Corban" "University of Richmond"
[33] "Georgia Institute of Technology-Main Campus" "CUNY Bernard M Baruch College"
[35] "New York" "University of California-Santa Barbara"
[37] "Boston" "California State-Long Beach"
[39] "Hope International" "San Diego State"
[41] "Brandeis" "Nebraska Methodist College of Nursing & Allied Health"
[43] "University of Rochester" "Columbia International"
[45] "College of William and Mary" "American"
[47] "Emmaus Bible College" "Skidmore College"
[49] "Laguna College of Art and Design" "Maharishi of Management"
[51] "CUNY Brooklyn College" "University of California-Davis"
[53] "Colorado School of Mines" "University of Miami"
[55] "California State Polytechnic-Pomona" "University of California-Irvine"
[57] "Andrews" "CUNY Hunter College"
[59] "The of Texas at Austin" "Park"
[61] "CUNY Queens College" "Western Carolina"
[63] "University of Memphis" "Spring Hill College"
[65] "Fashion Institute of Technology" "Stony Brook"
[67] "SUNY at Purchase College" "California State-Fullerton"
[69] "The of West Florida" "New York School of Interior Design"
[71] "SUNY at Binghamton" "State of New York at New Paltz"
[73] "Dallas Baptist" "Texas Christian"
[75] "Stevens Institute of Technology" "King"
[77] "Florida Southern College" "University of South Florida-Main Campus"
[79] "Toccoa Falls College" "University of Maryland-College Park"
[81] "University of Minnesota-Twin Cities" "Marist College"
[83] "Monroe College" "Wilson College"
[85] "LeTourneau" "George Washington"
[87] "Southeastern" "Midway"
[89] "St Lawrence" "Bryan College-Dayton"
[91] "Bristol" "University of La Verne"
[93] "Pfeiffer" "Texas A & M-Commerce"
[95] "Chapman" "University of Florida"
[97] "Graceland-Lamoni" "Fordham"
[99] "Syracuse" "Muhlenberg College"
[101] "Villanova" "Brigham Young-Provo"
[103] "Fashion Institute of Design & Merchandising-Los Angeles" "Howard"
[105] "University of Central Florida" "Augustana College"
[107] "North Park" "University of Saint Mary"
[109] "Emerson College" "Worcester Polytechnic Institute"
[111] "William Jewell College" "Ohio State-Main Campus"
[113] "Cumberland" "University of California-Santa Cruz"
[115] "Florida International" "Concordia-Chicago"
[117] "Eastern Illinois" "Mount Holyoke College"
[119] "Saint Elizabeth College of Nursing" "SUNY College at Plattsburgh"
[121] "North Carolina State at Raleigh" "Abilene Christian"
[123] "Loyola Marymount" "Warner"
[125] "University of St Francis" "Belhaven"
[127] "Avila" "SUNY College at Cortland"
[129] "SUNY College at Oswego" "Concordia-Portland"
[131] "Pennsylvania State-Main Campus" "Clemson"
[133] "California State-Fresno" "Simpson"
[135] "The Baptist College of Florida" "Notre Dame of Maryland"
[137] "Clarkson College" "SUNY College of Environmental Science and Forestry"
[139] "Eastern" "University of Charleston"
[141] "University of Connecticut" "University of Georgia"
[143] "Illinois Institute of Technology" "Crown College"
[145] "Ramapo College of New Jersey" "SUNY College at Brockport"
[147] "Gardner-Webb" "University of Washington-Seattle Campus"
[149] "Pennsylvania College of Health Sciences" "Grace College of Divinity"
[151] "The of Alabama" "Jacksonville"
[153] "The Sage Colleges" "SUNY College of Technology at Delhi"
[155] "The Christ College of Nursing and Health Sciences" "University of Pittsburgh-Pittsburgh Campus"
[157] "California State-San Marcos" "San Jose State"
[159] "Berry College" "Elmhurst College"
[161] "Oakland City" "Lawrence Technological"
[163] "Concordia-Saint Paul" "Chatham"
[165] "Anderson" "Averett-Non-Traditional Programs"
[167] "University of California-Riverside" "Florida State"
[169] "Northwest Nazarene" "Greenville College"
[171] "Marian" "Morningside College"
[173] "Webster" "SUNY at Albany"
[175] "Temple" "University of Florida-Online"
[177] "University of North Florida" "University of West Georgia"
[179] "Asbury" "University of Massachusetts-Lowell"
[181] "William Carey" "Rivier"
[183] "Rochester Institute of Technology" "SUNY College of Technology at Alfred"
[185] "Elon" "Minot State"
[187] "Oregon Institute of Technology" "Providence College"
[189] "Norwich" "Rose-Hulman Institute of Technology"
[191] "Friends" "University of Massachusetts-Amherst"
[193] "Simmons College" "Rutgers-New Brunswick"
[195] "University of Dayton" "University of the Sciences"
[197] "North Greenville" "Johnson"
[199] "University of Wisconsin-Madison" "Concordia-Irvine"
[201] "Sacred Heart" "Nova Southeastern"
[203] "Saint Mary-of-the-Woods College" "University of Maryland-Baltimore County"
[205] "Augsburg College" "Cox College"
[207] "Grace" "University of North Carolina at Greensboro"
[209] "Oklahoma Christian" "Multnomah"
[211] "Arcadia" "West Chester of Pennsylvania"
[213] "Purdue-Main Campus" "University of Alabama at Birmingham"
[215] "University of Arkansas" "Georgia Southern"
[217] "Lincoln Christian" "Anderson"
[219] "Stevenson" "Madonna"
[221] "University of Minnesota-Morris" "University at Buffalo"
[223] "SUNY Polytechnic Institute" "Baldwin Wallace"
[225] "Ball State" "Mount Mercy"
[227] "MidAmerica Nazarene" "Loyola Maryland"
[229] "Salisbury" "The College of Saint Scholastica"
[231] "Evangel" "University of North Carolina Wilmington"
[233] "Summit of Pennsylvania" "Lipscomb"
[235] "The of Texas at Dallas" "Wheeling Jesuit"
[237] "University of California-Merced" "California Lutheran"
[239] "Agnes Scott College" "Illinois Wesleyan"
[241] "Lewis" "University of Michigan-Dearborn"
[243] "Winona State" "Daniel Webster College"
[245] "Le Moyne College" "Saint John Fisher College"
[247] "Salem College" "Southern Wesleyan"
[249] "Presbyterian College" "Eastern Mennonite"
[251] "Whitworth" "California State-Chico"
[253] "Whittier College" "Stetson"
[255] "McKendree" "Dominican"
[257] "University of Illinois at Springfield" "Frostburg State"
[259] "Saint Louis" "College of Saint Elizabeth"
[261] "University of North Carolina at Charlotte" "Pittsburgh Institute of Mortuary Science Inc"
[263] "Texas Tech" "University of San Francisco"
[265] "Central College" "Faith Baptist Bible College and Theological Seminary"
[267] "Louisiana Tech" "Caldwell"
[269] "Stockton" "Oral Roberts"
[271] "Philadelphia" "California Baptist"
[273] "Fairfield" "Mississippi State"
[275] "Cleveland Institute of Art" "Miami-Oxford"
[277] "University of South Carolina-Columbia" "Augustana"
[279] "Milwaukee School of Engineering" "Milligan College"
[281] "Regis" "Brenau"
[283] "Bradley" "University of Illinois at Urbana-Champaign"
[285] "Bethel College-Indiana" "Goshen College"
[287] "Michigan State" "Minneapolis College of Art and Design"
[289] "Albany College of Pharmacy and Health Sciences" "Roberts Wesleyan College"
[291] "St Bonaventure" "Appalachian State"
[293] "The of Texas at Arlington" "Champlain College"
[295] "Viterbo" "University of Delaware"
[297] "Mercer" "Drake"
[299] "Luther College" "Mount St Mary's"
[301] "Lesley" "Wentworth Institute of Technology"
[303] "Minnesota State-Mankato" "St Catherine"
[305] "Ithaca College" "Manhattan College"
[307] "The New School" "Queens of Charlotte"
[309] "Cedar Crest College" "Texas A & M-College Station"
[311] "Concordia-Wisconsin" "University of Redlands"
[313] "Vanguard of Southern California" "Moody Bible Institute"
[315] "Buena Vista" "Georgetown College"
[317] "Stephens College" "Clarkson"
[319] "Saint Joseph's College-New York" "SUNY Maritime College"
[321] "Northwest Christian" "Lancaster Bible College"
[323] "Slippery Rock of Pennsylvania" "Widener-Main Campus"
[325] "Chamberlain College of Nursing-Illinois" "Central Michigan"
[327] "Northwood-Michigan" "Spring Arbor"
[329] "Rider" "East Carolina"
[331] "Malone" "Ohio Northern"
[333] "Union" "George Mason"
[335] "Maranatha Baptist" "Biola"
[337] "Savannah College of Art and Design" "University of Evansville"
[339] "Cornell College" "Dordt College"
[341] "Kettering" "Creighton"
[343] "Montclair State" "Saint Vincent College"
[345] "Baptist Missionary Association Theological Seminary" "Point Loma Nazarene"
[347] "Loyola Chicago" "Framingham State"
[349] "Massachusetts College of Art and Design" "Rowan"
[351] "Elizabethtown College" "Marywood"
[353] "Misericordia" "University of Vermont"
[355] "University of Idaho" "DePaul"
[357] "Clarke" "Northwestern College"
[359] "Cleary" "Hope College"
[361] "Hamline" "Maryville of Saint Louis"
[363] "Adelphi" "Xavier"
[365] "Lebanon Valley College" "University of Scranton"
[367] "Shasta Bible College and Graduate School" "University of Denver"
[369] "Saint Leo" "Towson"
[371] "Endicott College" "University of Detroit Mercy"
[373] "William Woods" "SUNY College at Geneseo"
[375] "Geneva College" "Salve Regina"
[377] "Trevecca Nazarene" "Sam Houston State"
[379] "James Madison" "Virginia Polytechnic Institute and State"
[381] "Gonzaga" "John Brown"
[383] "Quinnipiac" "Wartburg College"
[385] "Massachusetts Maritime Academy" "Calvin College"
[387] "Rockhurst" "Plymouth State"
[389] "Monmouth" "Cedarville"
[391] "Muskingum" "Ohio-Main Campus"
[393] "University of Oregon" "Alvernia"
[395] "Saint Francis" "University of South Dakota"
[397] "Marquette" "Michigan Technological"
[399] "Otterbein" "Oklahoma State-Main Campus"
[401] "Drexel" "La Salle"
[403] "Moravian College" "University of Arizona"
[405] "Georgia College and State" "Assumption College"
[407] "University of Nebraska-Lincoln" "Seton Hall"
[409] "Nazareth College" "The of Findlay"
[411] "Duquesne" "Gannon"
[413] "Susquehanna" "The of Tennessee-Knoxville"
[415] "Saint Michael's College" "Pacific Lutheran"
[417] "Northern Arizona" "University of Illinois at Chicago"
[419] "Olivet Nazarene" "Grace College and Theological Seminary"
[421] "Louisiana State and Agricultural & Mechanical College" "Goucher College"
[423] "Bay Path" "Siena Heights"
[425] "Ashland" "George Fox"
[427] "Juniata College" "Citadel Military College of South Carolina"
[429] "John A Gupton College" "Edgewood College"
[431] "Auburn" "Indiana-Bloomington"
[433] "Saint Joseph's College of Maine" "Concordia College at Moorhead"
[435] "Martin Luther College" "Saint Mary's of Minnesota"
[437] "University of Missouri-Columbia" "Concordia-Nebraska"
[439] "University of Oklahoma-Norman Campus" "Oregon State"
[441] "DeSales" "Robert Morris"
[443] "Roger Williams" "University of St Thomas"
[445] "Saint Norbert College" "Catholic of America"
[447] "Merrimack College" "University of Mississippi"
[449] "Truman State" "University of New Hampshire-Main Campus"
[451] "Franciscan of Steubenville" "Pacific"
[453] "Immaculata" "Messiah College"
[455] "Longwood" "Virginia Commonwealth"
[457] "University of Colorado Boulder" "Saint Mary's College"
[459] "University of Northern Iowa" "McDaniel College"
[461] "Westfield State" "Yeshiva"
[463] "Walsh" "Belmont"
[465] "Washington State" "University of Wisconsin-Whitewater"
[467] "University of Wisconsin-La Crosse" "Azusa Pacific"
[469] "Colorado State-Fort Collins" "University of Iowa"
[471] "Grace Bible College" "Grand Valley State"
[473] "Carlow" "Dakota State"
[475] "University of Utah" "Chaminade of Honolulu"
[477] "Valparaiso" "Baker"
[479] "University of North Dakota" "John Carroll"
[481] "Mount Carmel College of Nursing" "Saint Joseph's"
[483] "Wilkes" "Bob Jones"
[485] "Shenandoah" "Central Washington"
[487] "Seattle Pacific" "Western Washington"
[489] "Utica College" "University of Mary Washington"
[491] "Old Dominion" "Huntington"
[493] "University of St Thomas" "Linfield College-McMinnville Campus"
[495] "Regent" "Taylor"
[497] "University of New England" "University of Nebraska at Kearney"
[499] "Niagara" "Kent State at Kent"
[501] "California of Pennsylvania" "University of Wisconsin-Eau Claire"
[503] "University of Wisconsin-Green Bay" "Fort Hays State"
[505] "Bryan College of Health Sciences" "Valley City State"
[507] "University of Cincinnati-Main Campus" "Texas Woman's"
[509] "West Virginia" "Iowa State"
[511] "MCPHS" "University of Northwestern-St Paul"
[513] "Canisius College" "Illinois State"
[515] "Missouri of Science and Technology" "University of Kentucky"
[517] "Columbia College" "Loyola New Orleans"
[519] "Southern New Hampshire" "South Dakota State"
[521] "Wisconsin Lutheran College" "Samford"
[523] "Gordon College" "University of Wisconsin-Stout"
[525] "North Dakota State-Main Campus" "University of Wisconsin-Platteville"
[527] "The Master's College and Seminary" "Indiana Wesleyan-Marion"
[529] "Kansas State" "Bethel"
[531] "Houghton College" "University of Mary"
[533] "Clarion of Pennsylvania" "Westminster College"
[535] "University of Wyoming" "Capitol Technology"
[537] "Fontbonne" "Utah State"
[539] "Harding" "Luther Rice & Seminary"
[541] "Brigham Young-Idaho" "Midwives College of Utah"
[543] "Alexandria Technical & Community College" "American Medical Academy"
[545] "American Sentinel" "Apex School of Theology"
[547] "Baptist Health System School of Health Professions" "Bethesda"
[549] "Clear Creek Baptist Bible College" "Colorado Christian"
[551] "Columbia College" "Faith Evangelical College & Seminary"
[553] "Hobe Sound Bible College" "Holy Apostles College and Seminary"
[555] "International Baptist College and Seminary" "Keiser-Ft Lauderdale"
[557] "Laurus College" "Mountain State College"
[559] "Nazarene Bible College" "North Central Kansas Technical College"
[561] "North Florida Community College" "Northwest Iowa Community College"
[563] "Pamlico Community College" "Provo College"
[565] "Sessions College for Professional Design" "Southeastern Baptist College"
[567] "Touro Worldwide" "Unitek College"
[569] "University of Western States" "Virginia Baptist College"
[571] "West Virginia Junior College-Morgantown"
Solution:
# Number of School entries whose name contains "Institute"
institute_count <- length(grep("Institute", oedb$School, fixed = TRUE))
institute_count
[1] 17
Solution:
# Save the processed dataset as "clean_oedb.csv", omitting row names
write.csv(
  x = oedb,
  file = "clean_oedb.csv",
  row.names = FALSE
)