# The packages necessary to run the R code below. This list was compiled after successfully running all of the code, and the packages are referenced throughout the file.
library(readr)
library(tidyverse) # for data handling.
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(DT) # for interactive tables.
library(janitor) # for cleaning column names.
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(stringr) # for string manipulation.
library(tm) # for text mining.
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library(RColorBrewer) # color palettes for creating graphics.
library(tidytext) # for text mining.
library(gt) # for creating presentation-ready tables in R.
library(nnet) # To run multinomial classifiers.
library(SnowballC) # for word stemming, i.e., reducing words to their root form.
library(xgboost) # To build gradient-boosted decision trees.
##
## Attaching package: 'xgboost'
##
## The following object is masked from 'package:dplyr':
##
## slice
library(DMwR) # To run the SMOTE function.
## Loading required package: lattice
## Loading required package: grid
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(text2vec) # For tokenization and vectorization.
library(Matrix) # To handle the sparse matrices used in machine learning preparation.
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(MLmetrics) # For extra performance metrics
##
## Attaching package: 'MLmetrics'
##
## The following object is masked from 'package:base':
##
## Recall
library(caret) # For data splitting, model training, and evaluation.
##
## Attaching package: 'caret'
##
## The following objects are masked from 'package:MLmetrics':
##
## MAE, RMSE
##
## The following object is masked from 'package:purrr':
##
## lift
library(keras) # For deep learning.
##
## Attaching package: 'keras'
##
## The following objects are masked from 'package:text2vec':
##
## fit, normalize
library(kableExtra) # for styled presentation tables.
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
Hypothesis H1: In nonprofit graduate-level higher education, course names vary across schools that self-identify as Business, Public Administration, or Professional Studies.
Hypothesis H2: In nonprofit graduate-level higher education, course descriptions vary across schools that self-identify as Business, Public Administration, or Professional Studies.
From scratch, I created a CSV file titled “College_Website_Data” by copying and pasting curriculum-related data from college websites. Variables included such items as university, department, program and course names, course descriptions, modality, school location, tuition rates, and enrollment data when available. From the Institute of Education Sciences’ (a branch of the USDOE) Integrated Postsecondary Education Data System website, I obtained a CSV of the Institutional Characteristics survey (file titled “HD2023”). I downloaded it to my working directory and renamed it “Institutional_Characteristics_Survey_Data”. I combined the “College_Website_Data” with select variables from the “Institutional_Characteristics_Survey_Data” to create a new object named “Master_Programs1”. Below is the R code and the series of steps that I used to create it.
# I created two objects: "College_Website_Data1" to read in the college data created manually, and "Institutional_Characteristics_Data" to read in the USDOE survey data. Since my research team recorded some names with non-ASCII characters, I set fileEncoding to "latin1" so those characters could be read in. To standardize the column names in the two data sets, I used the janitor library's clean_names() function, which converts all variable names to lowercase, separates words with underscores, and removes excess whitespace.
College_Website_Data1 <- read.csv("College_Website_Data1.csv") %>% clean_names()
Institutional_Characteristics_Data <- read.csv("Institutional_Characteristics_Survey_Data.csv",
fileEncoding = "latin1") %>% clean_names()
# I selected college identifying columns from the "Institutional_Characteristics_Survey_Data.csv" file.
Institutional_Characteristics_subset <- Institutional_Characteristics_Data %>%
select(unitid, instnm, addr, city, stabbr, zip)
# The next step was to match the subset variables with the college names that the research team recorded. Within each dataset, I used the mutate() function to create a temporary join key ("university_clean" and "instnm_clean", respectively). I used tolower() and trimws() to standardize the values, and then used relocate() with ".after =" to place each new column immediately after its original.
College_Website_Data1 <- College_Website_Data1 %>%
mutate(university_clean = tolower(trimws(university))) %>%
relocate(university_clean, .after = university)
Institutional_Characteristics_subset <- Institutional_Characteristics_subset %>%
mutate(instnm_clean = tolower(trimws(instnm))) %>%
relocate(instnm_clean, .after = instnm)
# I left joined the cleaned datasets by matching the cleaned names of the new temporary variables.
merged_data <- left_join(College_Website_Data1, Institutional_Characteristics_subset,
by = c("university_clean" = "instnm_clean"))
# After the datasets were merged, I moved the matched institutional columns to appear immediately after "university" to make the dataset more readable.
merged_data <- merged_data %>%
relocate(unitid, instnm, addr, city, stabbr, zip, .after = university)
# After merging, the temporary join key "university_clean" was no longer needed (the join itself consumes "instnm_clean"), and "instnm" duplicates "university", so I removed both.
merged_data <- merged_data %>%
select(-university_clean, -instnm)
# Export the merged data to a new CSV file without row numbers
write.csv(merged_data, "Master_Programs1.csv", row.names = FALSE) # Save cleaned and merged output
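Before correcting names by hand, a quick check can list the schools that failed to match. A minimal sketch using the merged object above: rows with a missing unitid after the left join had no IPEDS counterpart.
# List the distinct university names that found no match in the IPEDS data.
unmatched_schools <- merged_data %>%
  filter(is.na(unitid)) %>%
  distinct(university)
unmatched_schools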
Once the “Master_Programs1.csv” file was created, certain schools recorded by my research team did not match on the merged “unitid” column. Some of the school names were misspelled or differed in some other small way. Therefore, after downloading “Master_Programs1.csv”, I went back and manually corrected school names so they properly matched the “unitid” data. I saved the new file as “Master_Programs2.csv” and uploaded it back into RStudio as shown below.
Master_Programs2 <- read.csv("Master_Programs2.csv", fileEncoding = "UTF-8-BOM", na.strings = c("NA", "N/A", ""))
To document the variable definitions, I inserted an interactive table. I uploaded a CSV file, “Masters_Programs_Variable_Names”, into an object labeled “Master_Programs2_variables_url”. I used the DT library to make an interactive data table.
# Upload the file.
Master_Programs2_variables_url <- read.csv("Masters_Programs_Variable_Names.csv")
# I ran the datatable() function to display the interactive data table. The options tell RStudio how many rows to display, and the scrollX option produces a horizontal scroll bar.
library(DT)
datatable(Master_Programs2_variables_url, options = list(pageLength = 20, scrollX = TRUE, width = "90%"))
Using dplyr, I ran the glimpse() function to inspect the dataset, determine the number of rows and columns and the vector types, and get a sense of the data’s appearance.
# Examine the data and its structure.
library(dplyr)
glimpse(Master_Programs2)
## Rows: 1,020
## Columns: 37
## $ university <chr> "Indiana University-Northwest", "Indiana U…
## $ unitid <int> 151360, 151360, 151360, 151360, 151360, 15…
## $ addr <chr> "3400 Broadway", "3400 Broadway", "3400 Br…
## $ city <chr> "Gary", "Gary", "Gary", "Gary", "Gary", "G…
## $ stabbr <chr> "IN", "IN", "IN", "IN", "IN", "IN", "IN", …
## $ zip <chr> "46408-1197", "46408-1197", "46408-1197", …
## $ school_name <chr> "O'Neill School of Public and Environmenta…
## $ school_type <chr> "PA", "PA", "PA", "PA", "PA", "PA", "PA", …
## $ program_title <chr> "Nonprofit Management", "Nonprofit Managem…
## $ specialization <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, "Finan…
## $ program_type <chr> "Certificate", "Certificate", "Certificate…
## $ latitude <dbl> 41.55732, 41.55732, 41.55732, 41.55732, 41…
## $ longitude <dbl> -87.33725, -87.33725, -87.33725, -87.33725…
## $ tuition_yearly <int> 30748, 30748, 30748, 30748, 30748, 30748, …
## $ tuition_course <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ study_mode <chr> "FT", "FT", "FT", "FT", "FT", "FT", "FT", …
## $ modality_program <chr> "P", "P", "P", "P", "P", "P", "P", "P", "P…
## $ modality_course <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ tot_cr_hrs <int> 15, 15, 15, 15, 15, 15, 15, 15, 15, 30, 30…
## $ req_crs <int> 12, 12, 12, 12, 12, 12, 12, 12, 12, NA, NA…
## $ elect_crs <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, NA, NA, NA, NA,…
## $ course_code <chr> "SPEA-V 521", "SPEA-V 52", "SPEA-V 525", "…
## $ course_name <chr> "The Nonprofit and Voluntary Sector", "Hum…
## $ core_elec <chr> "C", "C", "C", "C", "E", "E", "E", "E", "E…
## $ course_desc <chr> "The theory, size, scope, and functions of…
## $ undergraduate_certificate <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, 0, …
## $ undergraduate_certificate_d <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ postgraduate_certificate <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, 0, …
## $ postgraduate_certificate_d <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ associate <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, 0, …
## $ associate_d <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ bachelor <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, 0, …
## $ bachelor_d <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ master <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, 4, 4, …
## $ master_d <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, "d", "…
## $ doctor <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, 0, …
## $ doctor_d <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
Since many of the vectors were read in as characters, I converted them to the appropriate types: categorical variables to factors and quantitative variables to numeric.
# Convert categorical variables to factors.
factor_cols <- c("university", "addr", "city", "stabbr", "zip", "specialization",
                 "program_type", "study_mode", "modality_program", "modality_course",
                 "school_name", "school_type", "course_code", "course_name",
                 "core_elec", "course_desc", "undergraduate_certificate_d",
                 "postgraduate_certificate_d", "associate_d", "bachelor_d",
                 "master_d", "doctor_d")
Master_Programs2 <- Master_Programs2 %>%
  mutate(across(all_of(factor_cols), as.factor))
# Convert quantitative variables to numeric.
numeric_cols <- c("unitid", "latitude", "longitude", "tuition_yearly",
                  "tuition_course", "tot_cr_hrs", "req_crs", "elect_crs",
                  "undergraduate_certificate", "postgraduate_certificate",
                  "associate", "bachelor", "master", "doctor")
Master_Programs2 <- Master_Programs2 %>%
  mutate(across(all_of(numeric_cols), as.numeric))
Using dplyr, I ran the summary() function to verify the type conversions and to inspect a summary of the values, including the NA counts.
summary(Master_Programs2)
## university unitid
## john carroll university : 44 Min. :104717
## liberty university : 41 1st Qu.:151360
## university of oregon : 39 Median :180984
## Cleveland State University : 38 Mean :190922
## Arizona State University Digital Immersion: 37 3rd Qu.:209551
## university of central oklahoma : 35 Max. :483124
## (Other) :786
## addr city stabbr
## 1 John Carroll Boulevard: 44 Baltimore : 60 OH :106
## 1971 University Blvd : 41 Cleveland : 53 VA : 72
## 110 Johnson Hall : 39 Chicago : 50 NY : 66
## 2121 Euclid Avenue : 38 University Heights: 44 MD : 65
## 1365 N Scottsdale Rd : 37 Lynchburg : 41 OR : 65
## 100 N University Dr : 35 Eugene : 39 MN : 61
## (Other) :786 (Other) :733 (Other):585
## zip school_name
## 44118-4538: 44 School of Business :120
## 24515 : 41 College of Business : 77
## 97403 : 39 Boler College of Business : 44
## 44115-2214: 38 School of Planning, Public Policy, and Management: 39
## 85257 : 37 Maxine Goodman Levin College of Urban Affairs : 38
## 73034 : 35 McAfee School of Theology : 30
## (Other) :786 (Other) :672
## school_type program_title specialization
## B :360 Length:1020 Core : 44
## PA:497 Class :character Public Administration Track: 18
## PS:126 Mode :character Nonprofit Concentration : 9
## SS: 37 Financial Management : 7
## Social Innovation : 5
## (Other) : 59
## NA's :878
## program_type latitude longitude tuition_yearly
## Master :336 Min. :26.37 Min. :-123.08 Min. : 7100
## MA :150 1st Qu.:36.11 1st Qu.: -95.95 1st Qu.:12240
## M.Sc. :125 Median :40.04 Median : -85.82 Median :18720
## MS : 97 Mean :39.35 Mean : -89.66 Mean :25192
## MPA : 79 3rd Qu.:41.88 3rd Qu.: -78.88 3rd Qu.:33378
## MBA : 60 Max. :45.57 Max. : -71.09 Max. :91296
## (Other):173 NA's :88
## tuition_course study_mode modality_program
## Min. : 462.0 FT :412 H : 22
## 1st Qu.: 555.0 FT_PT:353 O :332
## Median : 555.0 PT :255 O_P :128
## Mean : 820.9 O_P_H : 10
## 3rd Qu.:1355.0 Online, On Campus, Blended: 56
## Max. :1355.0 P :457
## NA's :923 PA : 15
## modality_course tot_cr_hrs req_crs elect_crs course_code
## O : 53 Min. : 9.00 Min. : 3.0 Min. : 0.000 BMAL 602: 3
## NA's:967 1st Qu.:30.00 1st Qu.:21.0 1st Qu.: 6.000 BMAL 620: 3
## Median :36.00 Median :24.0 Median : 9.000 BMAL 621: 3
## Mean :34.82 Mean :26.7 Mean : 9.963 PPPM 507: 3
## 3rd Qu.:36.00 3rd Qu.:36.0 3rd Qu.:12.000 WRIT 601: 3
## Max. :63.00 Max. :48.0 Max. :21.000 (Other) :955
## NA's :113 NA's :590 NA's :590 NA's : 50
## course_name core_elec
## Nonprofit Financial Management: 10 C :576
## Social Entrepreneurship : 9 E :440
## Grant Writing : 8 NA's: 4
## Nonprofit Management : 6
## Capstone : 5
## (Other) :978
## NA's : 4
## course_desc
## N/A : 11
## Conflict is part of organizational life. People in public sector agencies and nonprofit and for-profit organizations disagree over the meaning of regulations, the use of financial resources, office space, leave time, and many other issues. Managers must have the ability to diagnose disputes and to negotiate effectively to resolve conflicts. This course provides the theoretical background and conceptual framework needed for successful negotiation and mediation. Through presentations and discussions students become familiar with the tools necessary for conflict resolution in their agencies and organizations. Analysis of a party's interests, identification of the necessary style, awareness of communication skills, and planning and feedback are part of the process of becoming an accomplished negotiator. : 2
## From the perspective of a nonprofit leader, this course provides a solid foundation in understanding key financial tools such as audits, financial statements, budgets and tax documents. Using these tools, students will analyze and assess the financial transparency, accountability, and health of various national and international organizations, determine the financial strengths and weaknesses within those organizations, learn how to use that information in the decision-making process, and finally, practice making informed recommendations to organizational leadership. This course is not designed to make students financial experts or practitioners. Instead, it is designed to enlighten students on key financial management concepts that improve their ability to be informed leaders, participants, and donors in the nonprofit sector. Students will also explore the responsibilities and consequences of international nonprofits engaging in activities in the US, as well as implications for US nonprofits operating abroad. This is an elective course for the Certificate in Nonprofit Management.: 2
## In this course, you’ll analyze the challenges of managing paid staff and human resources in nonprofit organizations. : 2
## In this course, you’ll focus on the critical need for nonprofit organizations to increase their impact through an emphasis on social marketing and behavior change. : 2
## (Other) :993
## NA's : 8
## undergraduate_certificate undergraduate_certificate_d postgraduate_certificate
## Min. : 0.0000 NA's:1020 Min. : 0.00
## 1st Qu.: 0.0000 1st Qu.: 0.00
## Median : 0.0000 Median : 0.00
## Mean : 0.4182 Mean : 3.87
## 3rd Qu.: 0.0000 3rd Qu.: 2.00
## Max. :23.0000 Max. :39.00
## NA's :580 NA's :580
## postgraduate_certificate_d associate associate_d bachelor
## d : 38 Min. :0 NA's:1020 Min. : 0.0000
## NA's:982 1st Qu.:0 1st Qu.: 0.0000
## Median :0 Median : 0.0000
## Mean :0 Mean : 0.6273
## 3rd Qu.:0 3rd Qu.: 0.0000
## Max. :0 Max. :17.0000
## NA's :580 NA's :580
## bachelor_d master master_d doctor doctor_d
## d : 32 Min. : 0.0 d :163 Min. :0 NA's:1020
## NA's:988 1st Qu.: 4.0 NA's:857 1st Qu.:0
## Median : 11.0 Median :0
## Mean : 18.6 Mean :0
## 3rd Qu.: 20.0 3rd Qu.:0
## Max. :111.0 Max. :0
## NA's :580 NA's :580
The variables of interest are “university”, “school_type”, “course_name”, and “course_desc”. Therefore, I subset just these variables by creating a new object “Master_Programs2_subset”.
In Excel, I inspected the course_desc and course_name columns of the CSV and noted some N/A values. I wrote R code to identify them so I could remove them. At first, I could not identify all the N/A values in the course_desc column, so I had to do some more detailed cleaning: reconverting course_desc back to a character vector, lowercasing all course descriptions, looking for any variation of N/A, and trimming whitespace. For the course_name column, I was able to remove the N/As without this detailed treatment.
The social sciences (SS) school_type was not prominent. To focus on just the Public Administration, Business, and Professional Studies school types, the SS rows were removed from further analysis.
# Read the file and treat "NA", "N/A", "", etc. as actual NA
Master_Programs2_subset <- subset(
read.csv("Master_Programs2.csv",
fileEncoding = "UTF-8-BOM",
na.strings = c("NA", "N/A", ""),
stringsAsFactors = FALSE),
select = c(university, school_type, course_name, course_desc)) %>%
filter(!(school_type %in% c("SS")))
# Clean the course_desc: trim and convert known missing formats to NA
Master_Programs2_subset <- Master_Programs2_subset %>%
# Remove white spaces
mutate(course_desc = str_trim(course_desc),
# Make them case insensitive
course_desc = ifelse(tolower(course_desc) %in%
# Remove N/A variations
c("na", "n/a", ""), NA, course_desc))
# Confirm number of N/As present in the course descriptions column.
sum(is.na(Master_Programs2_subset$course_desc))
## [1] 19
# Remove all rows where course_desc is NA.
Master_Programs2_subset <- Master_Programs2_subset %>%
filter(!is.na(course_desc))
# Confirm removal of course description N/As.
sum(is.na(Master_Programs2_subset$course_desc))
## [1] 0
# Confirm number of N/As present in the course names column.
sum(is.na(Master_Programs2_subset$course_name))
## [1] 4
# Remove all rows where course_name is NA.
Master_Programs2_subset <- Master_Programs2_subset %>%
filter(!is.na(course_name)) # Remove rows where course_name is NA
# Confirm number of N/As present in the course names column.
sum(is.na(Master_Programs2_subset$course_name))
## [1] 0
# Write out the cleaned subset so the CSV file can be reinspected and verified as clean.
write.csv(Master_Programs2_subset, "Master_Programs2_subset.csv", row.names = FALSE)
TF-IDF stands for Term Frequency–Inverse Document Frequency. Per Zhan (2025), TF-IDF is a statistical measure used to assess the importance of a word in a document relative to a collection of documents. It reflects the importance of words by frequency alone, is relatively simple, does not require many resources, and is appropriate for small data sets. Per Hao (2022), the TF-IDF weighting method is a common method for generating sentence vectors based on word vectors.
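Formally, for a term t in document d within a corpus of N documents, the textbook weight is

$$\text{tf-idf}(t, d) = \text{tf}(t, d) \times \log\frac{N}{\text{df}(t)}$$

where tf(t, d) is the frequency of t in d and df(t) is the number of documents containing t. Note that text2vec’s TfIdf class applies smoothing and normalization by default, so its exact weights may differ slightly from this textbook form.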
Upon inspection of the CSV files created, the TF-IDF vectorization for course_name analyzed 301 words and, for course_desc, 799 words.
# Using the tm package, the following code is a customized text cleaning function to apply to any column of data. Although some of this cleaning has already been done, I am keeping this function intact as a handy tool for future use.
prep_text <- function(text) {
text %>%
tolower() %>% # Converts all text to lowercase to standardize the words for frequency analysis and avoid duplicates due to casing
removePunctuation() %>% # Removes punctuation (commas, periods, etc.) from the text to simplify tokenization.
removeNumbers() %>% # Removes numeric characters, which are often not helpful in this type of analysis.
removeWords(stopwords("en")) %>% # Removes default English stop words.
stripWhitespace() # Removes excess whitespace (e.g., multiple spaces or leading/trailing spaces).
}
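# For illustration, a hypothetical input would be transformed roughly as:
# prep_text("Introduction to Nonprofit Management 501!")
# # -> "introduction nonprofit management" (punctuation, numbers, stop
# # words, and extra whitespace are removed)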
# The prep_text function is separately applied to the course_name and course_desc.
Master_Programs2_subset$clean_name <- sapply(Master_Programs2_subset$course_name, prep_text)
Master_Programs2_subset$clean_desc <- sapply(Master_Programs2_subset$course_desc, prep_text)
# TF-IDF Vectorization for `course_name`
# Using the text2vec package, these commands tokenize each cleaned course_name into individual words by creating an itoken object. The word_tokenizer splits each text string into separate words, and progressbar = TRUE gives visual progress feedback.
it_name <- itoken(Master_Programs2_subset$clean_name, tokenizer = word_tokenizer, progressbar = TRUE)
# The create_vocabulary() function builds a list of all unique words across all course names. prune_vocabulary(term_count_min = 2) keeps only words that appear at least twice; setting a minimum count helps prevent overfitting to rare words and reduces memory and computation load.
vocab_name <- create_vocabulary(it_name, stopwords = stopwords("en")) %>%
prune_vocabulary(term_count_min = 2)
# The vocab_vectorizer() function creates a vectorizer that maps tokens to the columns of a document-term matrix.
vectorizer_name <- vocab_vectorizer(vocab_name)
# The next step is to create a document-term matrix with the create_dtm() function.
dtm_name <- create_dtm(it_name, vectorizer_name)
# The TfIdf$new() function creates a new TF-IDF object.
tfidf_transformer_name <- TfIdf$new()
# The fit_transform() call produces the TF-IDF-weighted version of the document-term matrix (stored as a sparse matrix via the Matrix package). Words that are common in one course name but rare overall are given more weight.
dtm_name_tfidf <- tfidf_transformer_name$fit_transform(dtm_name)
# The tfidf_name_df object converts the sparse matrix into a data frame for modeling.
tfidf_name_df <- as.data.frame(as.matrix(dtm_name_tfidf))
# school_type is converted to a factor.
tfidf_name_df$school_type <- as.factor(Master_Programs2_subset$school_type)
# Save to CSV
write.csv(tfidf_name_df, "TFIDF_course_name.csv", row.names = FALSE)
# TF-IDF Vectorization for `course_desc`
# The above code is repeated, but applied to the cleaned course_desc, with a higher pruning threshold (term_count_min = 5) because descriptions contain many more words.
it_desc <- itoken(Master_Programs2_subset$clean_desc, tokenizer = word_tokenizer, progressbar = TRUE)
vocab_desc <- create_vocabulary(it_desc, stopwords = stopwords("en")) %>%
prune_vocabulary(term_count_min = 5)
vectorizer_desc <- vocab_vectorizer(vocab_desc)
dtm_desc <- create_dtm(it_desc, vectorizer_desc)
tfidf_transformer_desc <- TfIdf$new()
dtm_desc_tfidf <- tfidf_transformer_desc$fit_transform(dtm_desc)
tfidf_desc_df <- as.data.frame(as.matrix(dtm_desc_tfidf))
tfidf_desc_df$school_type <- as.factor(Master_Programs2_subset$school_type)
# Save to CSV
write.csv(tfidf_desc_df, "TFIDF_course_desc.csv", row.names = FALSE)
# Count all columns in the two datasets except the last (to exclude the target variable 'school_type').
num_columns_name <- ncol(tfidf_name_df) - 1
num_columns_desc <- ncol(tfidf_desc_df) - 1
# Print results
cat("TF-IDF course_name has", num_columns_name, "columns of words.\n")
## TF-IDF course_name has 301 columns of words.
cat("TF-IDF course_desc has", num_columns_desc, "columns of words.\n")
## TF-IDF course_desc has 799 columns of words.
The following is code to analyze the frequency of course name words.
# Load necessary libraries
library(dplyr)
library(readr)
# Step 1: Load the TF-IDF dataset
tfidf_df <- read_csv("TFIDF_course_name.csv")
## Rows: 960 Columns: 302
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): school_type
## dbl (301): africa, america, applications, asia, aspects, beyond, cases, cent...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Step 2: Separate predictors (TF-IDF terms) and target variable
tfidf_matrix <- tfidf_df %>% select(-school_type)
# Step 3: Compute descriptive stats
mean_tfidf <- colMeans(tfidf_matrix)
nonzero_freq <- colSums(tfidf_matrix > 0) / nrow(tfidf_matrix)
# Step 4: Combine into a single dataframe
term_stats <- data.frame(
term = names(mean_tfidf),
mean_tfidf = mean_tfidf,
nonzero_frequency = nonzero_freq
)
# Step 5: Get top 20 Course Name terms by each metric
top_by_mean <- term_stats %>% arrange(desc(mean_tfidf)) %>% head(20)
top_by_freq <- term_stats %>% arrange(desc(nonzero_frequency)) %>% head(20)
# Step 6: Output results
print("Top 20 Course Name Terms by Mean TF-IDF:")
## [1] "Top 20 Course Name Terms by Mean TF-IDF:"
print(top_by_mean)
## term mean_tfidf nonzero_frequency
## nonprofit nonprofit 0.12730207 0.25000000
## management management 0.11142338 0.19791667
## public public 0.07348504 0.10937500
## organizations organizations 0.06556745 0.10000000
## leadership leadership 0.06508548 0.07500000
## development development 0.05924515 0.06979167
## social social 0.05777827 0.05520833
## strategic strategic 0.05516413 0.06041667
## capstone capstone 0.05153706 0.03125000
## administration administration 0.04702196 0.04583333
## financial financial 0.04381265 0.04791667
## nonprofits nonprofits 0.04273609 0.04479167
## planning planning 0.04226960 0.04270833
## organizational organizational 0.04196400 0.03750000
## research research 0.04176384 0.03125000
## fundraising fundraising 0.04126818 0.03958333
## policy policy 0.04033146 0.03645833
## program program 0.04029235 0.03750000
## resource resource 0.03810921 0.04270833
## evaluation evaluation 0.03771630 0.03645833
print("Top 20 Course Name Terms by Frequency (Non-zero):")
## [1] "Top 20 Course Name Terms by Frequency (Non-zero):"
print(top_by_freq)
## term mean_tfidf nonzero_frequency
## nonprofit nonprofit 0.12730207 0.25000000
## management management 0.11142338 0.19791667
## public public 0.07348504 0.10937500
## organizations organizations 0.06556745 0.10000000
## leadership leadership 0.06508548 0.07500000
## development development 0.05924515 0.06979167
## strategic strategic 0.05516413 0.06041667
## social social 0.05777827 0.05520833
## financial financial 0.04381265 0.04791667
## administration administration 0.04702196 0.04583333
## human human 0.03527035 0.04583333
## nonprofits nonprofits 0.04273609 0.04479167
## planning planning 0.04226960 0.04270833
## resource resource 0.03810921 0.04270833
## fundraising fundraising 0.04126818 0.03958333
## organizational organizational 0.04196400 0.03750000
## program program 0.04029235 0.03750000
## evaluation evaluation 0.03771630 0.03645833
## policy policy 0.04033146 0.03645833
## sector sector 0.03266227 0.03229167
# Optional: Export to CSV
write_csv(top_by_mean, "top_course_name_terms_by_mean_tfidf.csv")
write_csv(top_by_freq, "top_course_name_terms_by_frequency.csv")
To visualize the course name terms, I created two bar charts showing the mean TF-IDF scores and the non-zero document frequencies.
# I used ggplot2 as part of the tidyverse package.
# Plot: Top 20 Course Name Terms by Mean TF-IDF
ggplot(top_by_mean, aes(x = reorder(term, mean_tfidf), y = mean_tfidf)) +
geom_bar(stat = "identity", fill = "#FDB462") + # light orange
coord_flip() +
labs(title = "Top 20 Course Name Terms by Mean TF-IDF",
x = "Term",
y = "Mean TF-IDF Score") +
theme_minimal()
# Plot: Top 20 Terms by Frequency of Use (Non-Zero)
ggplot(top_by_freq, aes(x = reorder(term, nonzero_frequency), y = nonzero_frequency)) +
geom_bar(stat = "identity", fill = "lightblue") +
coord_flip() +
labs(title = "Top 20 Course Name Terms by Frequency in Documents",
x = "Term",
y = "Non-Zero Frequency") +
theme_minimal()
To visualize term prevalence by school_type for course names, I created a faceted, side-by-side bar chart of the top mean TF-IDF course name values for each school type.
# Load libraries
library(ggplot2)
library(dplyr)
library(tidyr)
library(readr)
# Step 1: Read and prep data
tfidf_df <- read_csv("TFIDF_course_name.csv")
## Rows: 960 Columns: 302
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): school_type
## dbl (301): africa, america, applications, asia, aspects, beyond, cases, cent...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
tfidf_df$school_type <- as.factor(tfidf_df$school_type)
# Step 2: Mean Course Name TF-IDF by group
group_means <- tfidf_df %>%
group_by(school_type) %>%
summarise(across(where(is.numeric), mean, .names = "mean_{.col}")) %>%
pivot_longer(-school_type, names_to = "term", values_to = "mean_tfidf") %>%
mutate(term = gsub("^mean_", "", term))
# Step 3: Top 20 Course Name terms per school_type
top_terms_by_group <- group_means %>%
group_by(school_type) %>%
slice_max(mean_tfidf, n = 20) %>%
ungroup()
# Step 4: Plot with longer x-axis (horizontal bars)
ggplot(top_terms_by_group, aes(x = reorder(term, mean_tfidf), y = mean_tfidf, fill = school_type)) +
geom_bar(stat = "identity", position = "dodge", width = 0.5) + # Thinner bars
facet_wrap(~school_type, nrow = 1, scales = "free_y") +
coord_flip() +
scale_y_continuous(expand = expansion(mult = c(0, 0.01))) + # Shrink x-axis range (post-flip)
labs(title = "Top 20 Course Name TF-IDF Terms by School Type",
x = "Term",
y = "Average TF-IDF") +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(hjust = 0.5),
strip.text = element_text(size = 12),
axis.text.y = element_text(size = 8),
axis.text.x = element_text(size = 10),
legend.position = "none"
)
The following is code to analyze the frequency of course description words.
# Load necessary libraries
library(dplyr)
library(readr)
# Step 1: Load the TF-IDF dataset
tfidf_df <- read_csv("TFIDF_course_desc.csv")
## Rows: 960 Columns: 800
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): school_type
## dbl (799): allows, appraisal, area, aspect, assist, benefits, biblical, broa...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Step 2: Separate predictors (TF-IDF terms) and target variable
tfidf_matrix <- tfidf_df %>% select(-school_type)
# Step 3: Compute descriptive stats
mean_tfidf <- colMeans(tfidf_matrix)
nonzero_freq <- colSums(tfidf_matrix > 0) / nrow(tfidf_matrix)
# Step 4: Combine into a single dataframe
term_stats <- data.frame(
term = names(mean_tfidf),
mean_tfidf = mean_tfidf,
nonzero_frequency = nonzero_freq
)
# Step 5: Get top 20 Course Description terms by each metric
top_by_mean <- term_stats %>% arrange(desc(mean_tfidf)) %>% head(20)
top_by_freq <- term_stats %>% arrange(desc(nonzero_frequency)) %>% head(20)
# Step 6: Output results
print("Top 20 Course Description Terms by Mean TF-IDF:")
## [1] "Top 20 Course Description Terms by Mean TF-IDF:"
print(top_by_mean)
## term mean_tfidf nonzero_frequency
## nonprofit nonprofit 0.05064507 0.44375000
## public public 0.03559085 0.21354167
## organizations organizations 0.03235490 0.28229167
## strategies strategies 0.03010780 0.15312500
## management management 0.02928193 0.22083333
## examines examines 0.02731921 0.11458333
## leadership leadership 0.02662985 0.13125000
## covers covers 0.02616651 0.09895833
## focuses focuses 0.02558126 0.10312500
## explores explores 0.02379455 0.08437500
## practices practices 0.02343933 0.10833333
## financial financial 0.02296520 0.09583333
## development development 0.02270656 0.12187500
## course course 0.02230653 0.24791667
## planning planning 0.02195782 0.10208333
## social social 0.02167402 0.10520833
## principles principles 0.02057290 0.07812500
## organizational organizational 0.01932443 0.12083333
## nonprofits nonprofits 0.01909205 0.07708333
## research research 0.01863498 0.07500000
print("Top 20 Terms by Frequency (Non-zero):")
## [1] "Top 20 Terms by Frequency (Non-zero):"
print(top_by_freq)
## term mean_tfidf nonzero_frequency
## nonprofit nonprofit 0.05064507 0.44375000
## organizations organizations 0.03235490 0.28229167
## course course 0.02230653 0.24791667
## management management 0.02928193 0.22083333
## public public 0.03559085 0.21354167
## students students 0.01754239 0.17812500
## strategies strategies 0.03010780 0.15312500
## leadership leadership 0.02662985 0.13125000
## will will 0.01405152 0.12812500
## development development 0.02270656 0.12187500
## organizational organizational 0.01932443 0.12083333
## including including 0.01544674 0.11562500
## examines examines 0.02731921 0.11458333
## practices practices 0.02343933 0.10833333
## social social 0.02167402 0.10520833
## focuses focuses 0.02558126 0.10312500
## planning planning 0.02195782 0.10208333
## covers covers 0.02616651 0.09895833
## issues issues 0.01476891 0.09895833
## sector sector 0.01764804 0.09583333
# Optional: Export to CSV
write_csv(top_by_mean, "top_course_desc_termsterms_by_mean_tfidf.csv")
write_csv(top_by_freq, "top_course_desc_termsterms_by_frequency.csv")
To visualize the course description terms, I created two bar charts showing the mean TF-IDF scores and the non-zero document frequencies.
# I used ggplot2 as part of the tidyverse package.
# Plot: Top 20 Terms by Mean TF-IDF
ggplot(top_by_mean, aes(x = reorder(term, mean_tfidf), y = mean_tfidf)) +
geom_bar(stat = "identity", fill = "#FDB462") + # light orange
coord_flip() +
labs(title = "Top 20 Course Description Terms by Mean TF-IDF",
x = "Term",
y = "Mean TF-IDF Score") +
theme_minimal()
# Plot: Top 20 Terms by Frequency of Use (Non-Zero)
ggplot(top_by_freq, aes(x = reorder(term, nonzero_frequency), y = nonzero_frequency)) +
geom_bar(stat = "identity", fill = "lightblue") +
coord_flip() +
labs(title = "Top 20 Course Description Terms by Frequency in Documents",
x = "Term",
y = "Non-Zero Frequency") +
theme_minimal()
To visualize term prevalence by school_type for course descriptions, I created a faceted, side-by-side bar chart of the top mean TF-IDF course description values for each school type.
# Load libraries
library(ggplot2)
library(dplyr)
library(tidyr)
library(readr)
# Step 1: Read and prep data
tfidf_df <- read_csv("TFIDF_course_desc.csv")
## Rows: 960 Columns: 800
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): school_type
## dbl (799): allows, appraisal, area, aspect, assist, benefits, biblical, broa...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
tfidf_df$school_type <- as.factor(tfidf_df$school_type)
# Step 2: Mean TF-IDF by group
group_means <- tfidf_df %>%
group_by(school_type) %>%
summarise(across(where(is.numeric), mean, .names = "mean_{.col}")) %>%
pivot_longer(-school_type, names_to = "term", values_to = "mean_tfidf") %>%
mutate(term = gsub("^mean_", "", term))
# Step 3: Top 20 terms per school_type
top_terms_by_group <- group_means %>%
group_by(school_type) %>%
slice_max(mean_tfidf, n = 20) %>%
ungroup()
# Step 4: Plot with longer x-axis (horizontal bars)
ggplot(top_terms_by_group, aes(x = reorder(term, mean_tfidf), y = mean_tfidf, fill = school_type)) +
geom_bar(stat = "identity", position = "dodge", width = 0.5) + # Thinner bars
facet_wrap(~school_type, nrow = 1, scales = "free_y") +
coord_flip() +
scale_y_continuous(expand = expansion(mult = c(0, 0.01))) + # Shrink x-axis range (post-flip)
labs(title = "Top 20 Course Description TF-IDF Terms by School Type",
x = "Term",
y = "Average TF-IDF") +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(hjust = 0.5),
strip.text = element_text(size = 12),
axis.text.y = element_text(size = 8),
axis.text.x = element_text(size = 10),
legend.position = "none"
)
To test the hypotheses, I attempt to classify school types based on the text of course names and descriptions, regardless of how the schools self-identify on their websites. This text contains specific terminology (e.g., “governance,” “philanthropy,” “budgeting,” “finance”) that varies depending on the school’s focus. The TF-IDF technique lets me distinguish the terms that are most likely to vary between the different school types. Methodologically, it allows me to train machine learning classifiers to detect patterns across programs.
# I loaded TF-IDF data (course_name version) to begin the machine learning process.
tfidf_name_df <- read.csv("TFIDF_course_name.csv")
tfidf_name_df$school_type <- as.factor(tfidf_name_df$school_type)
# I checked the raw class distribution of the dependent variable (Business, Public Administration, and Professional Studies schools) by creating a table.
table(tfidf_name_df$school_type)
##
## B PA PS
## 359 476 125
# I loaded TF-IDF data (course_desc version) to begin the machine learning process.
tfidf_desc_df <- read.csv("TFIDF_course_desc.csv")
tfidf_desc_df$school_type <- as.factor(tfidf_desc_df$school_type)
# I checked the raw class distribution of the dependent variable (Business, Public Administration, and Professional Studies schools) by creating a table.
table(tfidf_desc_df$school_type)
##
## B PA PS
## 359 476 125
The dependent variable is imbalanced, so I applied SMOTE to balance the training data for course names and course descriptions. Through a lot of trial and error, I tuned perc.over and perc.under to get as close as possible to the target of 33.3% representation for each class. After many adjustments, this is the best balance I could achieve. I used set.seed() to make the proportions reproducible.
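As a back-of-the-envelope sketch of how DMwR::SMOTE’s two knobs determine class counts (using a hypothetical minority-class size; see ?SMOTE for the authoritative definitions):
# Hypothetical illustration of the perc.over / perc.under arithmetic:
# perc.over = 545 generates 5.45 synthetic rows per minority row, and
# perc.under = 230 then keeps 2.30 majority rows per synthetic row.
n_min <- 100                           # hypothetical minority-class count
n_synthetic <- n_min * 545 / 100       # 545 synthetic minority rows
n_majority <- n_synthetic * 230 / 100  # about 1254 majority rows retained
c(synthetic = n_synthetic, majority = n_majority)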
I split into training and testing sets for course names.
# Using the caret package, I split into training and testing sets for course names.
set.seed(122)
train_index_name <- createDataPartition(tfidf_name_df$school_type, p = 0.8, list = FALSE)
train_data_name <- tfidf_name_df[train_index_name, ]
test_data_name <- tfidf_name_df[-train_index_name, ]
# I checked the class proportions of the dependent variable in the full, training, and test sets for course names.
round(prop.table(table(tfidf_name_df$school_type)) * 100, 2)
##
## B PA PS
## 37.40 49.58 13.02
round(prop.table(table(train_data_name$school_type)) * 100, 2)
##
## B PA PS
## 37.45 49.54 13.00
round(prop.table(table(test_data_name$school_type)) * 100, 2)
##
## B PA PS
## 37.17 49.74 13.09
I applied SMOTE to the course names and checked for a more balanced representation.
set.seed(1255)
train_data_name$school_type <- as.factor(train_data_name$school_type)
train_data_name_smote <- SMOTE(school_type ~ ., data = train_data_name, perc.over = 545, perc.under = 230)
# I checked the class proportions for course names after SMOTE.
round(prop.table(table(select(train_data_name_smote, school_type), exclude = NULL)), 4) * 100
## school_type
## B PA PS
## 29.20 36.51 34.29
I split into training and testing sets for course descriptions.
set.seed(123)
train_index_desc <- createDataPartition(tfidf_desc_df$school_type, p = 0.8, list = FALSE)
train_data_desc <- tfidf_desc_df[train_index_desc, ]
test_data_desc <- tfidf_desc_df[-train_index_desc, ]
# I checked the class proportions of the dependent variable in the full, training, and test sets for course descriptions.
round(prop.table(table(tfidf_desc_df$school_type)) * 100, 2)
##
## B PA PS
## 37.40 49.58 13.02
round(prop.table(table(train_data_desc$school_type)) * 100, 2)
##
## B PA PS
## 37.45 49.54 13.00
round(prop.table(table(test_data_desc$school_type)) * 100, 2)
##
## B PA PS
## 37.17 49.74 13.09
I applied SMOTE to the course descriptions and checked for a more balanced representation.
set.seed(1270)
train_data_desc$school_type <- as.factor(train_data_desc$school_type)
train_data_desc_smote <- SMOTE(school_type ~ ., data = train_data_desc, perc.over = 545, perc.under = 230)
# I checked the class proportions for course descriptions after SMOTE.
round(prop.table(table(select(train_data_desc_smote, school_type), exclude = NULL)), 4) * 100
## school_type
## B PA PS
## 30.63 35.09 34.29
XGBoost for Course Names. The following builds a gradient-boosted tree classifier on the SMOTE-balanced TF-IDF features and evaluates it on the held-out test set.
set.seed(1200)
# Convert the factor labels to 0-based integers, as xgboost requires.
label_map_name <- as.integer(train_data_name_smote$school_type) - 1
label_test_name <- as.integer(test_data_name$school_type) - 1
# Build predictor matrices without the target column.
X_train_name <- as.matrix(train_data_name_smote %>% select(-school_type))
X_test_name <- as.matrix(test_data_name %>% select(-school_type))
dtrain_name <- xgb.DMatrix(data = X_train_name, label = label_map_name)
dtest_name <- xgb.DMatrix(data = X_test_name, label = label_test_name)
# The multi:softprob objective returns one probability per class per observation.
params <- list(objective = "multi:softprob", eval_metric = "mlogloss", num_class = 3,
               max_depth = 6, eta = 0.3, subsample = 0.8, colsample_bytree = 0.8)
xgb_model_name <- xgb.train(params = params, data = dtrain_name, nrounds = 100, verbose = 0)
pred_probs_name <- predict(xgb_model_name, dtest_name)
# Reshape the flat probability vector into a 3-row matrix, then take the most
# probable class (0-based) for each test observation.
pred_matrix_name <- matrix(pred_probs_name, nrow = 3, ncol = length(pred_probs_name) / 3)
pred_labels_name <- max.col(t(pred_matrix_name)) - 1
# Map the integer predictions and true labels back to the factor levels.
class_levels <- levels(train_data_name_smote$school_type)
pred_factor_name <- factor(pred_labels_name, levels = 0:2, labels = class_levels)
true_factor_name <- factor(label_test_name, levels = 0:2, labels = class_levels)
conf_matrix_xgb_name <- confusionMatrix(pred_factor_name, true_factor_name)
conf_matrix_xgb_name
## Confusion Matrix and Statistics
##
## Reference
## Prediction B PA PS
## B 36 32 16
## PA 32 53 6
## PS 3 10 3
##
## Overall Statistics
##
## Accuracy : 0.4817
## 95% CI : (0.409, 0.555)
## No Information Rate : 0.4974
## P-Value [Acc > NIR] : 0.69366
##
## Kappa : 0.1194
##
## Mcnemar's Test P-Value : 0.01948
##
## Statistics by Class:
##
## Class: B Class: PA Class: PS
## Sensitivity 0.5070 0.5579 0.12000
## Specificity 0.6000 0.6042 0.92169
## Pos Pred Value 0.4286 0.5824 0.18750
## Neg Pred Value 0.6729 0.5800 0.87429
## Prevalence 0.3717 0.4974 0.13089
## Detection Rate 0.1885 0.2775 0.01571
## Detection Prevalence 0.4398 0.4764 0.08377
## Balanced Accuracy 0.5535 0.5810 0.52084
XGBoost for Course Descriptions. The same pipeline is applied to the course description features, reusing the params and class_levels objects.
set.seed(1210)
label_map_desc <- as.integer(train_data_desc_smote$school_type) - 1
label_test_desc <- as.integer(test_data_desc$school_type) - 1
X_train_desc <- as.matrix(train_data_desc_smote %>% select(-school_type))
X_test_desc <- as.matrix(test_data_desc %>% select(-school_type))
dtrain_desc <- xgb.DMatrix(data = X_train_desc, label = label_map_desc)
dtest_desc <- xgb.DMatrix(data = X_test_desc, label = label_test_desc)
xgb_model_desc <- xgb.train(params = params, data = dtrain_desc, nrounds = 100, verbose = 0)
pred_probs_desc <- predict(xgb_model_desc, dtest_desc)
pred_matrix_desc <- matrix(pred_probs_desc, nrow = 3, ncol = length(pred_probs_desc) / 3)
pred_labels_desc <- max.col(t(pred_matrix_desc)) - 1
pred_factor_desc <- factor(pred_labels_desc, levels = 0:2, labels = class_levels)
true_factor_desc <- factor(label_test_desc, levels = 0:2, labels = class_levels)
conf_matrix_xgb_desc <- confusionMatrix(pred_factor_desc, true_factor_desc)
conf_matrix_xgb_desc
## Confusion Matrix and Statistics
##
## Reference
## Prediction B PA PS
## B 41 21 11
## PA 28 70 4
## PS 2 4 10
##
## Overall Statistics
##
## Accuracy : 0.6335
## 95% CI : (0.5609, 0.7019)
## No Information Rate : 0.4974
## P-Value [Acc > NIR] : 0.0001038
##
## Kappa : 0.3696
##
## Mcnemar's Test P-Value : 0.0648950
##
## Statistics by Class:
##
## Class: B Class: PA Class: PS
## Sensitivity 0.5775 0.7368 0.40000
## Specificity 0.7333 0.6667 0.96386
## Pos Pred Value 0.5616 0.6863 0.62500
## Neg Pred Value 0.7458 0.7191 0.91429
## Prevalence 0.3717 0.4974 0.13089
## Detection Rate 0.2147 0.3665 0.05236
## Detection Prevalence 0.3822 0.5340 0.08377
## Balanced Accuracy 0.6554 0.7018 0.68193
Neural Network for Course Names. This is a PCA-preprocessed neural network model: near-zero-variance terms are dropped, the remaining TF-IDF features are centered, scaled, and reduced with PCA, and a single-hidden-layer network is tuned with cross-validation.
set.seed(1220)
# Separate the predictors and the target for the training and test sets.
x_train_name <- train_data_name_smote[, -which(names(train_data_name_smote) == "school_type")]
y_train_name <- train_data_name_smote$school_type
x_test_name <- test_data_name[, -which(names(test_data_name) == "school_type")]
y_test_name <- test_data_name$school_type
# Drop near-zero-variance terms, keeping the same columns in the test set.
nzv_name <- nearZeroVar(x_train_name, saveMetrics = TRUE)
x_train_name_filtered <- x_train_name[, !nzv_name$nzv]
x_test_name_filtered <- x_test_name[, colnames(x_train_name_filtered)]
# Center, scale, and reduce dimensionality with PCA, retaining 95% of the variance.
pca_model_name <- preProcess(x_train_name_filtered, method = c("center", "scale", "pca"), thresh = 0.95)
x_train_pca_name <- predict(pca_model_name, x_train_name_filtered)
x_test_pca_name <- predict(pca_model_name, x_test_name_filtered)
train_pca_df_name <- data.frame(x_train_pca_name, school_type = y_train_name)
test_pca_df_name <- data.frame(x_test_pca_name, school_type = y_test_name)
# Five-fold cross-validation with multiclass summary metrics.
fit_control <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = multiClassSummary)
# Tune a single-hidden-layer network over hidden-layer size and weight decay.
nn_model_name <- train(school_type ~ ., data = train_pca_df_name, method = "nnet",
                       trControl = fit_control,
                       tuneGrid = expand.grid(size = c(5, 7), decay = c(0.1, 0.3)),
                       MaxNWts = 5000, maxit = 200, trace = FALSE)
nn_preds_name <- predict(nn_model_name, newdata = test_pca_df_name)
conf_matrix_nn_name <- confusionMatrix(nn_preds_name, test_pca_df_name$school_type)
conf_matrix_nn_name
## Confusion Matrix and Statistics
##
## Reference
## Prediction B PA PS
## B 21 19 5
## PA 45 61 15
## PS 5 15 5
##
## Overall Statistics
##
## Accuracy : 0.4555
## 95% CI : (0.3834, 0.529)
## No Information Rate : 0.4974
## P-Value [Acc > NIR] : 0.89073
##
## Kappa : 0.0615
##
## Mcnemar's Test P-Value : 0.01434
##
## Statistics by Class:
##
## Class: B Class: PA Class: PS
## Sensitivity 0.2958 0.6421 0.20000
## Specificity 0.8000 0.3750 0.87952
## Pos Pred Value 0.4667 0.5041 0.20000
## Neg Pred Value 0.6575 0.5143 0.87952
## Prevalence 0.3717 0.4974 0.13089
## Detection Rate 0.1099 0.3194 0.02618
## Detection Prevalence 0.2356 0.6335 0.13089
## Balanced Accuracy 0.5479 0.5086 0.53976
Neural Network for Course Descriptions. The same PCA + nnet pipeline is applied to the course description features.
set.seed(1230)
x_train_desc <- train_data_desc_smote[, -which(names(train_data_desc_smote) == "school_type")]
y_train_desc <- train_data_desc_smote$school_type
x_test_desc <- test_data_desc[, -which(names(test_data_desc) == "school_type")]
y_test_desc <- test_data_desc$school_type
nzv_desc <- nearZeroVar(x_train_desc, saveMetrics = TRUE)
x_train_desc_filtered <- x_train_desc[, !nzv_desc$nzv]
x_test_desc_filtered <- x_test_desc[, colnames(x_train_desc_filtered)]
pca_model_desc <- preProcess(x_train_desc_filtered, method = c("center", "scale", "pca"), thresh = 0.95)
x_train_pca_desc <- predict(pca_model_desc, x_train_desc_filtered)
x_test_pca_desc <- predict(pca_model_desc, x_test_desc_filtered)
train_pca_df_desc <- data.frame(x_train_pca_desc, school_type = y_train_desc)
test_pca_df_desc <- data.frame(x_test_pca_desc, school_type = y_test_desc)
nn_model_desc <- train(school_type ~ ., data = train_pca_df_desc, method = "nnet",
trControl = fit_control,
tuneGrid = expand.grid(size = c(5, 7), decay = c(0.1, 0.3)),
MaxNWts = 5000, maxit = 200, trace = FALSE)
nn_preds_desc <- predict(nn_model_desc, newdata = test_pca_df_desc)
conf_matrix_nn_desc <- confusionMatrix(nn_preds_desc, test_pca_df_desc$school_type)
conf_matrix_nn_desc
## Confusion Matrix and Statistics
##
## Reference
## Prediction B PA PS
## B 27 35 8
## PA 33 50 8
## PS 11 10 9
##
## Overall Statistics
##
## Accuracy : 0.4503
## 95% CI : (0.3784, 0.5237)
## No Information Rate : 0.4974
## P-Value [Acc > NIR] : 0.9155
##
## Kappa : 0.0932
##
## Mcnemar's Test P-Value : 0.8603
##
## Statistics by Class:
##
## Class: B Class: PA Class: PS
## Sensitivity 0.3803 0.5263 0.36000
## Specificity 0.6417 0.5729 0.87349
## Pos Pred Value 0.3857 0.5495 0.30000
## Neg Pred Value 0.6364 0.5500 0.90062
## Prevalence 0.3717 0.4974 0.13089
## Detection Rate 0.1414 0.2618 0.04712
## Detection Prevalence 0.3665 0.4764 0.15707
## Balanced Accuracy 0.5110 0.5496 0.61675
To better understand each algorithm’s effects, I created tables that rank all performance metric values.
library(kableExtra)
# Data: Course Names
course_name_metrics <- data.frame(
Metric = c("Accuracy", "Kappa", "Sensitivity - B", "Sensitivity - PA", "Sensitivity - PS", "Range"),
XGBoost = c(0.6289, 0.3495, 0.6078, 0.7543, 0.2250, 0.5293),
NeuralNet = c(0.4921, 0.1523, 0.5457, 0.5562, 0.1313, 0.4249)
)
course_name_metrics %>%
kbl(caption = "Model Performance Comparison: Course Names", digits = 4) %>%
kable_classic(full_width = F, html_font = "Cambria") %>%
row_spec(0, bold = TRUE, background = "#DCE6F1") %>%
column_spec(2:3, background = ifelse(course_name_metrics$XGBoost > course_name_metrics$NeuralNet, "#D9EAD3", "#F4CCCC"))
| Metric | XGBoost | NeuralNet |
|---|---|---|
| Accuracy | 0.6289 | 0.4921 |
| Kappa | 0.3495 | 0.1523 |
| Sensitivity - B | 0.6078 | 0.5457 |
| Sensitivity - PA | 0.7543 | 0.5562 |
| Sensitivity - PS | 0.2250 | 0.1313 |
| Range | 0.5293 | 0.4249 |
Course Description Model Metrics (XGBoost vs NeuralNet)
# Data: Course Descriptions
course_desc_metrics <- data.frame(
Metric = c("Accuracy", "Kappa", "Sensitivity - B", "Sensitivity - PA", "Sensitivity - PS", "Range"),
XGBoost = c(0.6335, 0.3696, 0.5775, 0.7368, 0.4000, 0.4768),
NeuralNet = c(0.4555, 0.0615, 0.2958, 0.6421, 0.2000, 0.5021)
)
course_desc_metrics %>%
kbl(caption = "Model Performance Comparison: Course Descriptions", digits = 4) %>%
kable_classic(full_width = F, html_font = "Cambria") %>%
row_spec(0, bold = TRUE, background = "#DCE6F1") %>%
column_spec(2:3, background = ifelse(course_desc_metrics$XGBoost > course_desc_metrics$NeuralNet, "#D9EAD3", "#F4CCCC"))
| Metric | XGBoost | NeuralNet |
|---|---|---|
| Accuracy | 0.6335 | 0.4555 |
| Kappa | 0.3696 | 0.0615 |
| Sensitivity - B | 0.5775 | 0.2958 |
| Sensitivity - PA | 0.7368 | 0.6421 |
| Sensitivity - PS | 0.4000 | 0.2000 |
| Range | 0.4768 | 0.5021 |
comparison_name <- data.frame(
Metrics = c("Accuracy", "Kappa", "Sensitivity - B", "Sensitivity - PA", "Sensitivity - PS", "Range"),
XGBoost = c(0.6289, 0.3495, 0.6078, 0.7543, 0.2250, 0.5293),
NeuralNet = c(0.4921, 0.1523, 0.5457, 0.5562, 0.1313, 0.4249)
)
# Transpose so that models are rows and metrics are columns.
model_matrix_name <- as.data.frame(t(comparison_name[, -1]))
colnames(model_matrix_name) <- comparison_name$Metrics
model_matrix_name$Model <- rownames(model_matrix_name)
model_matrix_name <- model_matrix_name %>% relocate(Model)
# Rank each metric across models (1 = best, i.e., highest value).
ranking_matrix_name <- model_matrix_name
ranking_matrix_name[,-1] <- lapply(model_matrix_name[,-1], function(x) rank(-x, ties.method = "first"))
# Range: the spread (max - min) of each model's raw metric values.
range_values_name <- apply(model_matrix_name[,-1], 1, function(x) round(max(x) - min(x), 5))
ranking_matrix_name$Range <- range_values_name
ranking_matrix_name %>%
kbl(
caption = "Model Rankings for Course Names: XGBoost vs NeuralNet (1 = Best). Range shows variability across metrics.",
align = "lccccc",
format = "html",
row.names = FALSE,
escape = FALSE
) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE,
position = "center"
)
| Model | Accuracy | Kappa | Sensitivity - B | Sensitivity - PA | Sensitivity - PS | Range |
|---|---|---|---|---|---|---|
| XGBoost | 1 | 1 | 1 | 1 | 1 | 0.5293 |
| NeuralNet | 2 | 2 | 2 | 2 | 2 | 0.4249 |
comparison_desc <- data.frame(
Metrics = c("Accuracy", "Kappa", "Sensitivity - B", "Sensitivity - PA", "Sensitivity - PS", "Range"),
XGBoost = c(0.6335, 0.3696, 0.5775, 0.7368, 0.4000, 0.4768),
NeuralNet = c(0.4555, 0.0615, 0.2958, 0.6421, 0.2000, 0.5021)
)
# Repeat the ranking procedure for the course description models.
model_matrix_desc <- as.data.frame(t(comparison_desc[, -1]))
colnames(model_matrix_desc) <- comparison_desc$Metrics
model_matrix_desc$Model <- rownames(model_matrix_desc)
model_matrix_desc <- model_matrix_desc %>% relocate(Model)
ranking_matrix_desc <- model_matrix_desc
ranking_matrix_desc[,-1] <- lapply(model_matrix_desc[,-1], function(x) rank(-x, ties.method = "first"))
range_values_desc <- apply(model_matrix_desc[,-1], 1, function(x) round(max(x) - min(x), 5))
ranking_matrix_desc$Range <- range_values_desc
ranking_matrix_desc %>%
kbl(
caption = "Model Rankings for Course Descriptions: XGBoost vs NeuralNet (1 = Best). Range shows variability across metrics.",
align = "lccccc",
format = "html",
row.names = FALSE,
escape = FALSE
) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE,
position = "center"
) %>%
scroll_box(height = "auto", width = "100%")
| Model | Accuracy | Kappa | Sensitivity - B | Sensitivity - PA | Sensitivity - PS | Range |
|---|---|---|---|---|---|---|
| XGBoost | 1 | 1 | 1 | 1 | 1 | 0.3672 |
| NeuralNet | 2 | 2 | 2 | 2 | 2 | 0.5806 |