Section 1: Clean up the original data

Step 1: Load the essential package and read the data table

# Step 1: Load the essential package and read the data table
library(readxl)
library(dplyr)
library(ggplot2)
# Load the transformation referrence table
trans<-read_xlsx("TransTable.xlsx")
# Load the data set
S201510 <-read_xlsx("uncleaned Data.xlsx",sheet = 1)
S201530 <-read_xlsx("uncleaned Data.xlsx",sheet = 2)
S201540 <-read_xlsx("uncleaned Data.xlsx",sheet = 3)
S201610 <-read_xlsx("uncleaned Data.xlsx",sheet = 4)
S201630 <-read_xlsx("uncleaned Data.xlsx",sheet = 5)
S201640 <-read_xlsx("uncleaned Data.xlsx",sheet = 6)
S201710 <-read_xlsx("uncleaned Data.xlsx",sheet = 7)
S201730 <-read_xlsx("uncleaned Data.xlsx",sheet = 8)
S201740 <-read_xlsx("uncleaned Data.xlsx",sheet = 9)
S201810 <-read_xlsx("uncleaned Data.xlsx",sheet = 10)
S201830 <-read_xlsx("uncleaned Data.xlsx",sheet = 11)
S201840 <-read_xlsx("uncleaned Data.xlsx",sheet = 12)
S201910 <-read_xlsx("uncleaned Data.xlsx",sheet = 13)
S201930 <-read_xlsx("uncleaned Data.xlsx",sheet = 14)
S201940 <-read_xlsx("uncleaned Data.xlsx",sheet = 15)

Step 2: Integrated Dataset

# Put Tags on each dataset
S201510$Semester <- 201510
S201530$Semester <- 201530
S201540$Semester <- 201540
S201610$Semester <- 201610
S201630$Semester <- 201630
S201640$Semester <- 201640
S201710$Semester <- 201710
S201730$Semester <- 201730
S201740$Semester <- 201740
S201810$Semester <- 201810
S201830$Semester <- 201830
S201840$Semester <- 201840
S201910$Semester <- 201910
S201930$Semester <- 201930
S201940$Semester <- 201940
# Put them into one large dataset
StudentData <- rbind(S201510,S201530,S201540,S201610,S201630,S201640,S201710,S201730,S201740,S201810,S201830,S201840,S201910,S201930,S201940)
# keep records with completed GRE_TOTAL or GRE_REVISED_TOTAL
StudentData2 <-subset(StudentData, !is.na(GRE_VERBAL) | !is.na(GRE_QUANT))
# Sort the data by Semester
StudentData3 <- StudentData2[order(-StudentData2$Semester),]
# Drop the dublicates based on CWID
StudentData3 <- distinct(StudentData3,CWID,.keep_all = TRUE)
# Check the descriptive data by semester
Sem_sum <- StudentData3 %>%
  group_by(Semester) %>%
  summarize(Numer= n())
# Check the descriptive data by degree
Degree_sum <- StudentData3 %>%
  group_by(DEGR1) %>%
  summarize(Numer= n())
# Check the descriptive data by major
Major_sum <- StudentData3 %>%
  group_by(MAJR1) %>%
  summarize(Numer= n())

Step 3: Transfer the old GRE score

#round down to nearest 10 StudentData2$GRE_VERBAL<-StudentData2$GRE_VERBAL%/%1010 StudentData2$GRE_QUANT<-StudentData2$GRE_QUANT%/%1010

library(“dplyr”) # Summarize the data StudentData2 %>% group_by(Semester) %>% summarize(n())

Transform the old GRE data to new GRE data

for (i in 1:nrow(StudentData2)) { if (is.na(StudentData2[i,]$GRE_REVISED_TOTAL)==TRUE) {

a<-which(StudentData2[i,]$GRE_VERBAL==trans$PS)
StudentData2[i,]$GRE_RV_VERBAL<-trans[a,]$VRCS

b<-which(StudentData2[i,]$GRE_QUANT==trans$PS)
StudentData2[i,]$GRE_RV_QUANT<-trans[b,]$QRCS

StudentData2[i,]$GRE_RV_VERBAL_DATE<-StudentData2[i,]$GRE_VERBAL_DATE
StudentData2[i,]$GRE_RV_QUANT_DATE<-StudentData2[i,]$GRE_VERBAL_DATE
StudentData2[i,]$GRE_RV_DATE<-StudentData2[i,]$GRE_TOTAL_DATE
StudentData2[i,]$GRE_REVISED_TOTAL<-StudentData2[i,]$GRE_RV_VERBAL+StudentData2[i,]$GRE_RV_QUANT

} }

Whether GRE can predict GPA

Cheng Hua

3/28/2020

Section 1: Clean up the original data

Transform the old GRE data to new GRE data

Emport the Data set into a excel file