Coffy Andrews-Guo, Joseph Foy, Krutika Patel, Peter Phung
Please refer to the project write-up document for further information.
library(RMySQL)
library(tidyr)
library(tidyverse)
library(dplyr)
library(readr)
csv <- data.frame(read.csv("https://raw.githubusercontent.com/Patel-Krutika/Data_607/main/Acct_Curricula2.csv"))
col <- colnames(csv)
col[1] <- "School"
colnames(csv) <- col
csv <- distinct(csv)
# Create School_ID
# School_ID serves as the primary key for the School table, and identifies a unique school
csv <- csv %>% group_by(School, City, State, ZipCode, Total.Tuition.In.State, Tuition.per.Credit.In.State, In.state.fees, Total.Tuition.Out.of.State, Tuition.per.Credit.Out.of.State, Out.of.state.fees) %>% mutate(School_ID=cur_group_id())
# Create Program_ID
# Program_ID serves as the primary key for the Program table, and identifies a unique program
csv <- csv %>% group_by(School_ID, Degree.Program.Name, Modality, Program.Degree.Credits, Programmic.Accrediation) %>% mutate(Program_ID=cur_group_id())
school <- csv %>% select(School_ID, School, City, State, ZipCode, Total.Tuition.In.State, Tuition.per.Credit.In.State, In.state.fees, Total.Tuition.Out.of.State, Tuition.per.Credit.Out.of.State, Out.of.state.fees)
school <- subset(school, select = -c(Modality, Degree.Program.Name, Program.Degree.Credits, Programmic.Accrediation) )
colnames(school) <- c("School_ID", "School", "City", "State", "ZipCode", "Total_Tuition_In_State", "Tuition_per_Credit_In_State", "In_state_fees", "Total_Tuition_Out_of_State", "Tuition_per_Credit_Out_of_State", "Out_of_state_fees")
school <- distinct(school)
program <- csv %>% select(Program_ID, School_ID, Name = Degree.Program.Name, Modality, Credits = Program.Degree.Credits, Program_Accrediation = Programmic.Accrediation)
program <- distinct(program)
#Filter Description Column
rem_dup_word <- function(x){
x <- tolower(x)
x <- gsub("-", " ", x)
x <- gsub("[[:punct:]]", "", x)
x <- gsub("[[:digit:]]", "", x)
return(paste(unique(trimws(unlist(strsplit(x,split=" ",fixed=F,perl=T)))),
collapse = " "))
}
course_data <- csv
course_description_df <- course_data %>%
select(Description) %>%
apply(MARGIN = 2, FUN = enc2utf8) %>%
as.data.frame() %>%
rowwise() %>%
mutate(Filtered_Description = rem_dup_word(Description)) %>%
mutate(Filtered_Description = na_if(Filtered_Description, "NA"))
course_data$Filtered_description <- course_description_df$Filtered_Description
course_data <- course_data %>%
select(-Description)
course <- data.frame(Course_Num = course_data$Grad.Course.Number, Program_ID = course_data$Program_ID, Name = course_data$Name.of.Course, Description = course_data$Filtered_description, Credits = course_data$Course.Credits, Core_Or_Elective = course_data$Core.or.Elective)
course <- distinct(course)
connection <- dbConnect(MySQL(), user = user, password = pass, host = 'localhost', dbname = 'data_science')
dbSendQuery(connection, "SET GLOBAL local_infile = true;")
## <MySQLResult:8,0,0>
rs <- dbSendQuery(connection, 'set character set "utf8"')
rs <- dbSendQuery(connection, 'SET NAMES utf8')
dbWriteTable(connection, "school", school, overwrite = TRUE)
## [1] TRUE
dbWriteTable(connection, "program", program, overwrite = TRUE)
## [1] TRUE
dbWriteTable(connection, "course", course, overwrite = TRUE)
## [1] TRUE