Members

Coffy Andrews-Guo, Joseph Foy, Krutika Patel, Peter Phung

Please refer to the project write-up document for further information.

library(RMySQL)
library(tidyr)
library(tidyverse)
library(dplyr)
library(readr)

Data

csv <- data.frame(read.csv("https://raw.githubusercontent.com/Patel-Krutika/Data_607/main/Acct_Curricula2.csv"))
col <- colnames(csv) 
col[1] <- "School"
colnames(csv) <- col

csv <- distinct(csv)

# Create School_ID
# School_ID serves as the primary key for the School table, and identifies a unique school
csv <- csv %>% group_by(School, City, State, ZipCode, Total.Tuition.In.State, Tuition.per.Credit.In.State, In.state.fees, Total.Tuition.Out.of.State, Tuition.per.Credit.Out.of.State, Out.of.state.fees) %>% mutate(School_ID=cur_group_id())

# Create Program_ID
# Program_ID serves as the primary key for the Program table, and identifies a unique program
csv <- csv %>% group_by(School_ID, Degree.Program.Name, Modality, Program.Degree.Credits, Programmic.Accrediation) %>% mutate(Program_ID=cur_group_id())

School

school <- csv %>% select(School_ID, School, City, State, ZipCode, Total.Tuition.In.State, Tuition.per.Credit.In.State, In.state.fees, Total.Tuition.Out.of.State, Tuition.per.Credit.Out.of.State, Out.of.state.fees)

school <- subset(school, select = -c(Modality, Degree.Program.Name, Program.Degree.Credits, Programmic.Accrediation) )

colnames(school) <- c("School_ID", "School", "City", "State", "ZipCode", "Total_Tuition_In_State", "Tuition_per_Credit_In_State", "In_state_fees", "Total_Tuition_Out_of_State", "Tuition_per_Credit_Out_of_State", "Out_of_state_fees")

school <- distinct(school)

Program

program <- csv %>% select(Program_ID, School_ID, Name = Degree.Program.Name, Modality, Credits = Program.Degree.Credits, Program_Accrediation = Programmic.Accrediation)

program <- distinct(program)

Course

#Filter Description Column
rem_dup_word <- function(x){
  x <- tolower(x)
  x <- gsub("-", " ", x)
  x <- gsub("[[:punct:]]", "", x)
  x <- gsub("[[:digit:]]", "", x)
  return(paste(unique(trimws(unlist(strsplit(x,split=" ",fixed=F,perl=T)))),
        collapse =  " "))
}

course_data <- csv
course_description_df <- course_data %>%
  select(Description) %>%
  apply(MARGIN = 2, FUN = enc2utf8) %>%
  as.data.frame() %>%
  rowwise() %>%
  mutate(Filtered_Description = rem_dup_word(Description)) %>%
  mutate(Filtered_Description = na_if(Filtered_Description, "NA"))

course_data$Filtered_description <- course_description_df$Filtered_Description

course_data <- course_data %>%
  select(-Description)
course <- data.frame(Course_Num = course_data$Grad.Course.Number, Program_ID = course_data$Program_ID, Name = course_data$Name.of.Course, Description = course_data$Filtered_description, Credits = course_data$Course.Credits, Core_Or_Elective = course_data$Core.or.Elective)

course <- distinct(course)

Connection

connection <- dbConnect(MySQL(), user = user, password = pass, host = 'localhost', dbname = 'data_science')

Tables

dbSendQuery(connection, "SET GLOBAL local_infile = true;")
## <MySQLResult:8,0,0>
rs <- dbSendQuery(connection, 'set character set "utf8"')
rs <- dbSendQuery(connection, 'SET NAMES utf8')

dbWriteTable(connection, "school", school, overwrite = TRUE)
## [1] TRUE
dbWriteTable(connection, "program", program, overwrite = TRUE)
## [1] TRUE
dbWriteTable(connection, "course", course, overwrite = TRUE)
## [1] TRUE