Section 1: Clean up the original data

Step 1: Load the essential packages and read the data tables

# Step 1: Load the essential packages and read the data tables
library(readxl)
library(dplyr)
library(ggplot2)
# Load the transformation reference table
trans<-read_xlsx("TransTable.xlsx")
# Load the data set
S201040 <-read_xlsx("uncleaned Data.xlsx",sheet = 1)
S201110 <-read_xlsx("uncleaned Data.xlsx",sheet = 2)
S201130 <-read_xlsx("uncleaned Data.xlsx",sheet = 3)
S201140 <-read_xlsx("uncleaned Data.xlsx",sheet = 4)
S201210 <-read_xlsx("uncleaned Data.xlsx",sheet = 5)
S201230 <-read_xlsx("uncleaned Data.xlsx",sheet = 6)
S201240 <-read_xlsx("uncleaned Data.xlsx",sheet = 7)
S201310 <-read_xlsx("uncleaned Data.xlsx",sheet = 8)
S201330 <-read_xlsx("uncleaned Data.xlsx",sheet = 9)
S201340 <-read_xlsx("uncleaned Data.xlsx",sheet = 10)
S201410 <-read_xlsx("uncleaned Data.xlsx",sheet = 11)
S201430 <-read_xlsx("uncleaned Data.xlsx",sheet = 12)
S201440 <-read_xlsx("uncleaned Data.xlsx",sheet = 13)
S201510 <-read_xlsx("uncleaned Data.xlsx",sheet = 14)
S201530 <-read_xlsx("uncleaned Data.xlsx",sheet = 15)
S201540 <-read_xlsx("uncleaned Data.xlsx",sheet = 16)
S201610 <-read_xlsx("uncleaned Data.xlsx",sheet = 17)
S201630 <-read_xlsx("uncleaned Data.xlsx",sheet = 18)
S201640 <-read_xlsx("uncleaned Data.xlsx",sheet = 19)
S201710 <-read_xlsx("uncleaned Data.xlsx",sheet = 20)
S201730 <-read_xlsx("uncleaned Data.xlsx",sheet = 21)
S201740 <-read_xlsx("uncleaned Data.xlsx",sheet = 22)
S201810 <-read_xlsx("uncleaned Data.xlsx",sheet = 23)
S201830 <-read_xlsx("uncleaned Data.xlsx",sheet = 24)
S201840 <-read_xlsx("uncleaned Data.xlsx",sheet = 25)
S201910 <-read_xlsx("uncleaned Data.xlsx",sheet = 26)
S201930 <-read_xlsx("uncleaned Data.xlsx",sheet = 27)
S201940 <-read_xlsx("uncleaned Data.xlsx",sheet = 28)
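
For reference, the 28 sheets could also be read in one step. This is only a sketch (not part of the original workflow) and assumes the sheets appear in the workbook in the same order as the semester codes above:

# Sketch: read every sheet into a list named by its semester code
semesters <- c(201040, 201110, 201130, 201140, 201210, 201230, 201240,
               201310, 201330, 201340, 201410, 201430, 201440, 201510,
               201530, 201540, 201610, 201630, 201640, 201710, 201730,
               201740, 201810, 201830, 201840, 201910, 201930, 201940)
sheet_list <- lapply(seq_along(semesters),
                     function(i) read_xlsx("uncleaned Data.xlsx", sheet = i))
names(sheet_list) <- semesters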

Step 2: Integrate the datasets

# Tag each dataset with its semester code
S201040$Semester <- 201040
S201110$Semester <- 201110
S201130$Semester <- 201130
S201140$Semester <- 201140
S201210$Semester <- 201210
S201230$Semester <- 201230
S201240$Semester <- 201240
S201310$Semester <- 201310
S201330$Semester <- 201330
S201340$Semester <- 201340
S201410$Semester <- 201410
S201430$Semester <- 201430
S201440$Semester <- 201440
S201510$Semester <- 201510
S201530$Semester <- 201530
S201540$Semester <- 201540
S201610$Semester <- 201610
S201630$Semester <- 201630
S201640$Semester <- 201640
S201710$Semester <- 201710
S201730$Semester <- 201730
S201740$Semester <- 201740
S201810$Semester <- 201810
S201830$Semester <- 201830
S201840$Semester <- 201840
S201910$Semester <- 201910
S201930$Semester <- 201930
S201940$Semester <- 201940
# Put them into one large dataset
StudentData <- rbind(S201040,S201110,S201130,S201140,S201210,S201230,S201240,S201310,S201330,S201340,S201410,S201430,S201440,S201510,S201530,S201540,S201610,S201630,S201640,S201710,S201730,S201740,S201810,S201830,S201840,S201910,S201930,S201940)
# keep records with completed GRE_TOTAL or GRE_REVISED_TOTAL
StudentData2 <-subset(StudentData, !is.na(GRE_TOTAL) | !is.na(GRE_REVISED_TOTAL))
# Drop one extreme case
StudentData2 <- StudentData2[-which(StudentData2$CWID==11458820), ]
# Sort the data by Semester
StudentData2 <- StudentData2[order(StudentData2$Semester),]
# Drop duplicates based on CWID, keeping each student's earliest record
StudentData3 <- distinct(StudentData2,CWID,.keep_all = TRUE)
# Drop students whose overall GPA is zero
StudentData4 <- StudentData3[-which(StudentData3$OVERALL_GPA==0), ]
# Drop students whose degree code is NDS
StudentData4 <- StudentData4[-which(StudentData4$DEGR1=="NDS"), ]

# write.csv(StudentData4,"cleaned_data")
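
The tagging, combining, and cleaning steps above can also be expressed as a single dplyr pipeline. This is only a sketch: it assumes the hypothetical sheet_list from the sketch in Step 1 and the same column names (CWID, GRE_TOTAL, GRE_REVISED_TOTAL, OVERALL_GPA, DEGR1); bind_rows(.id = ...) derives the Semester tag from the list names.

# Sketch: combine and clean in one pipeline (StudentData_alt should mirror StudentData4 above)
StudentData_alt <- bind_rows(sheet_list, .id = "Semester") %>%   # list names become the Semester tag
  mutate(Semester = as.numeric(Semester)) %>%
  filter(!is.na(GRE_TOTAL) | !is.na(GRE_REVISED_TOTAL)) %>%      # keep records with a completed GRE score
  filter(!CWID %in% 11458820) %>%                                # drop the one extreme case
  arrange(Semester) %>%                                          # sort by semester
  distinct(CWID, .keep_all = TRUE) %>%                           # keep each student's earliest record
  filter(!OVERALL_GPA %in% 0) %>%                                # drop zero-GPA records (NA GPAs are kept, as above)
  filter(!DEGR1 %in% "NDS")                                      # drop records whose degree code is NDS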

Step 3: Transform the old GRE scores

# Round the old GRE verbal and quant scores down to the nearest 10
StudentData4$GRE_VERBAL<-StudentData4$GRE_VERBAL%/%10*10
StudentData4$GRE_QUANT<-StudentData4$GRE_QUANT%/%10*10

library("dplyr")

# Convert each old-scale GRE record to the revised scale using the reference table
for (i in 1:nrow(StudentData4)) {
  if (is.na(StudentData4[i, ]$GRE_REVISED_TOTAL)) {
    # Look up the rounded old verbal and quant scores in the reference table
    a <- which(StudentData4[i, ]$GRE_VERBAL == trans$PS)
    StudentData4[i, ]$GRE_RV_VERBAL <- trans[a, ]$VRCS
    b <- which(StudentData4[i, ]$GRE_QUANT == trans$PS)
    StudentData4[i, ]$GRE_RV_QUANT <- trans[b, ]$QRCS
    # Carry the original test dates over to the revised-score fields
    StudentData4[i, ]$GRE_RV_VERBAL_DATE <- StudentData4[i, ]$GRE_VERBAL_DATE
    StudentData4[i, ]$GRE_RV_QUANT_DATE <- StudentData4[i, ]$GRE_VERBAL_DATE
    StudentData4[i, ]$GRE_RV_DATE <- StudentData4[i, ]$GRE_TOTAL_DATE
    # Rebuild the revised total from the converted section scores
    StudentData4[i, ]$GRE_REVISED_TOTAL <- StudentData4[i, ]$GRE_RV_VERBAL + StudentData4[i, ]$GRE_RV_QUANT
  }
}
# Export the data set to an Excel or CSV file
# library("xlsx")
# write.xlsx(StudentData3, "StudentData.xlsx")
# write.csv(StudentData3, file = "StudentData.csv")
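
The row-wise loop above can also be written in vectorized form. The sketch below assumes the same reference-table columns (PS, VRCS, QRCS) and only fills the score fields; the *_DATE fields could be carried over the same way as in the loop.

# Sketch: vectorized old-to-revised GRE conversion using match()
needs_conv <- is.na(StudentData4$GRE_REVISED_TOTAL)              # records still on the old scale
v_idx <- match(StudentData4$GRE_VERBAL[needs_conv], trans$PS)    # rows of trans matching each rounded verbal score
q_idx <- match(StudentData4$GRE_QUANT[needs_conv], trans$PS)     # rows of trans matching each rounded quant score
StudentData4$GRE_RV_VERBAL[needs_conv] <- trans$VRCS[v_idx]
StudentData4$GRE_RV_QUANT[needs_conv]  <- trans$QRCS[q_idx]
StudentData4$GRE_REVISED_TOTAL[needs_conv] <-
  StudentData4$GRE_RV_VERBAL[needs_conv] + StudentData4$GRE_RV_QUANT[needs_conv]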

Section 2: Running the Analysis

Description of the Analysis Sample

library(ggpubr)
## Loading required package: magrittr
# Summarize the number of students by semester
Sem_sum <- StudentData4 %>%
  group_by(Semester) %>%
  summarize(Number= n())
Sem_sum
## # A tibble: 28 x 2
##    Semester Number
##       <dbl>  <int>
##  1   201040    420
##  2   201110     28
##  3   201130     33
##  4   201140    100
##  5   201210     30
##  6   201230     24
##  7   201240     87
##  8   201310     33
##  9   201330     32
## 10   201340     77
## # … with 18 more rows
# Create the bar plot. Use theme_pubclean() [in ggpubr]
ggplot(Sem_sum, aes(x = Semester, y = Number)) +
  geom_bar(fill = "#0073C2FF", stat = "identity") +
  geom_text(aes(label = Number), vjust = -0.3) +
  theme_pubclean()

# Summarize the number of students by degree
Degree_sum <- StudentData4 %>%
  group_by(DEGR1) %>%
  summarize(Number= n())
Degree_sum
## # A tibble: 5 x 2
##   DEGR1 Number
##   <chr>  <int>
## 1 CRT        5
## 2 EDD      193
## 3 EDS      137
## 4 MA      1082
## 5 PHD      326
# Create the bar plot. Use theme_pubclean() 
ggplot(Degree_sum, aes(x = DEGR1, y = Number)) +
  geom_bar(fill = "#0073C2FF", stat = "identity") +
  geom_text(aes(label = Number), vjust = -0.3) +
  theme_pubclean()

# Summarize the number of students by major
Major_sum <- StudentData4 %>%
  group_by(MAJR1) %>%
  summarize(Number= n())
Major_sum
## # A tibble: 27 x 2
##    MAJR1 Number
##    <chr>  <int>
##  1 CECM      51
##  2 CECO      16
##  3 CERG      62
##  4 CESC      42
##  5 CLTC       4
##  6 COED      32
##  7 EAEM       3
##  8 EASE       5
##  9 EDAD      12
## 10 EDLE      63
## # … with 17 more rows

Divide students into the bottom 25%, middle 50% (25-75%), and top 25% according to GRE score

# Create a new variable called Percentile and divide students into three groups based on their percentile
StudentData4 <- StudentData4 %>% 
  mutate(Percentile = percent_rank(GRE_REVISED_TOTAL)) %>% 
  mutate(Rank = case_when(Percentile<=.25 ~ "Low",
        Percentile>.25 & Percentile<=.75 ~ "Middle",
        Percentile > .75 ~ "High"))
# Check the frequency for each rank
table(StudentData4$Rank) # Roughly the intended 25/50/25 split; ties in GRE totals shift the exact counts
## 
##   High    Low Middle 
##    428    479    836
# CrossTable between degree and GRE Rank
table(StudentData4$DEGR1,StudentData4$Rank)
##      
##       High Low Middle
##   CRT    2   0      3
##   EDD   40  48    105
##   EDS   30  36     71
##   MA   273 305    504
##   PHD   83  90    153
# CrossTable between Semester and GRE Rank
table(StudentData4$Semester,StudentData4$Rank)
##         
##          High Low Middle
##   201040   67 120    233
##   201110    5   6     17
##   201130    4   9     20
##   201140   22  30     48
##   201210   10   7     13
##   201230    5   9     10
##   201240   29  21     37
##   201310    5  10     18
##   201330   10  11     11
##   201340   23  19     35
##   201410    2   7     18
##   201430    4   9      9
##   201440   17  32     39
##   201510    4   5      8
##   201530    7  10     16
##   201540   18  35     45
##   201610    6   6     10
##   201630   10   6      8
##   201640   33  23     52
##   201710    4   6     11
##   201730    8   8     12
##   201740   30  14     33
##   201810    6   7      9
##   201830    6   8     18
##   201840   34  31     42
##   201910    7  10     10
##   201930   11   1      8
##   201940   41  19     46
# Most semesters roughly follow the overall Low/Middle/High split; however, 2015 summer and 2017 spring are skewed. Among all the semesters, 2019 spring is the most problematic because a disproportionately large share of the admissions from that semester fall into the low GRE rank.
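
To make the per-semester comparison easier to read, the cross-table can be converted to row proportions (each row sums to 1); a small optional sketch using base R:

# Sketch: share of each GRE rank within every semester
round(prop.table(table(StudentData4$Semester, StudentData4$Rank), margin = 1), 2)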

Can GRE rank predict students' GPA?

# Check the correlation between overall GPA and GRE rank (Spearman)
StudentData4$Rank <- ordered(StudentData4$Rank,levels=c("Low","Middle","High"))
cor.test(StudentData4$OVERALL_GPA,as.numeric(StudentData4$Rank),method="spearman")
## Warning in cor.test.default(StudentData4$OVERALL_GPA,
## as.numeric(StudentData4$Rank), : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  StudentData4$OVERALL_GPA and as.numeric(StudentData4$Rank)
## S = 673047948, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.2334338
# Scatter plot of GRE percentile against overall GPA
plot(StudentData4$Percentile, StudentData4$OVERALL_GPA,
     main = "GRE Percentile vs. Overall GPA",
     xlab = "GRE Percentile", ylab = "Overall GPA", pch = 19)

# Use the boxplot to detect the outliers
boxplot(StudentData4$OVERALL_GPA, main="Overall GPA", boxwex=0.1)

# Run a simple regression of overall GPA on GRE rank
model <- lm(OVERALL_GPA~Rank,data=StudentData4)
summary(model)
## 
## Call:
## lm(formula = OVERALL_GPA ~ Rank, data = StudentData4)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.96329 -0.07226  0.14714  0.24714  0.36971 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.73514    0.01025 364.391  < 2e-16 ***
## Rank.L       0.13574    0.01929   7.035 2.85e-12 ***
## Rank.Q      -0.02171    0.01607  -1.351    0.177    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4098 on 1737 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.02914,    Adjusted R-squared:  0.02803 
## F-statistic: 26.07 on 2 and 1737 DF,  p-value: 6.986e-12
# Check our model prediction
StudentData4$prediction <- predict(model,StudentData4,level=0.95)
StudentData4$prediction_diff <- abs(StudentData4$prediction - StudentData4$OVERALL_GPA)
mean(StudentData4$prediction_diff) # NA because three records are missing OVERALL_GPA; add na.rm = TRUE for the mean absolute error
## [1] NA
t.test(StudentData4$prediction,StudentData4$OVERALL_GPA,paired=TRUE)
## 
##  Paired t-test
## 
## data:  StudentData4$prediction and StudentData4$OVERALL_GPA
## t = -1.1982e-12, df = 1739, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.01925622  0.01925622
## sample estimates:
## mean of the differences 
##           -1.176421e-14
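
A side note on the predict() call above: the level = 0.95 argument has no effect unless an interval type is requested, so the stored predictions are just the fitted group means. If per-student intervals were wanted, predict.lm can return them; a sketch:

# Sketch: 95% prediction intervals for each student's GPA given their GRE rank
pred_int <- predict(model, newdata = StudentData4, interval = "prediction", level = 0.95)
head(pred_int)   # columns: fit, lwr, upr
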
# Check the GPA differences between the rank subgroups (one-way ANOVA)
al <- aov(OVERALL_GPA~Rank,data=StudentData4)
summary(al)
##               Df Sum Sq Mean Sq F value   Pr(>F)    
## Rank           2   8.76   4.378   26.07 6.99e-12 ***
## Residuals   1737 291.67   0.168                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 3 observations deleted due to missingness
TukeyHSD(al)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = OVERALL_GPA ~ Rank, data = StudentData4)
## 
## $Rank
##                   diff        lwr       upr     p adj
## Middle-Low  0.12256939 0.06743842 0.1777004 0.0000006
## High-Low    0.19196238 0.12795694 0.2559678 0.0000000
## High-Middle 0.06939299 0.01220657 0.1265794 0.0124311
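
As an optional check on practical significance, the ANOVA table above also gives an effect size: eta-squared is SS_Rank / SS_total = 8.76 / (8.76 + 291.67), roughly 0.03, which matches the regression's multiple R-squared. A sketch computing it from the fitted object:

# Sketch: eta-squared for the one-way ANOVA (proportion of GPA variance explained by Rank)
ss <- summary(al)[[1]][["Sum Sq"]]
ss[1] / sum(ss)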

Calculating the average and standard deviation differences between groups

# Pull out students with 0 GPAs
# StudentData4 <- StudentData4[which(StudentData4$OVERALL_GPA==0), ]
# write.csv(StudentData4, file = "Promblematic_StudentData.csv")
# Calculating the standard deviation difference
GroupSD <- StudentData4 %>%
  group_by(Rank) %>%
  summarise(sd_GPA=sd(OVERALL_GPA,na.rm=TRUE))
GroupSD
## # A tibble: 3 x 2
##   Rank   sd_GPA
##   <ord>   <dbl>
## 1 Low     0.471
## 2 Middle  0.409
## 3 High    0.330
# Calculate the SD differences between the groups
High_Middle <- GroupSD[3,2] - GroupSD[2,2]
Middle_Low <- GroupSD[2,2] - GroupSD[1,2]
High_Low <- GroupSD[3,2] - GroupSD[1,2]
High_Middle # -0.0787: the High group's GPA SD is smaller than the Middle group's
##        sd_GPA
## 1 -0.07870794
Middle_Low # -0.0626: the Middle group's SD is smaller than the Low group's
##        sd_GPA
## 1 -0.06258332
High_Low # -0.1413: the High group's SD is smaller than the Low group's
##       sd_GPA
## 1 -0.1412913
# Calculate the average GPA for each rank group
GroupMeans <- StudentData4 %>%
  group_by(Rank) %>%
  summarise(Average_GPA=mean(OVERALL_GPA,na.rm = TRUE))
GroupMeans
## # A tibble: 3 x 2
##   Rank   Average_GPA
##   <ord>        <dbl>
## 1 Low           3.63
## 2 Middle        3.75
## 3 High          3.82
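
The mean gap between the High and Low groups (about 0.19 GPA points, per the Tukey comparison above) can also be expressed as a standardized effect size. The sketch below computes Cohen's d with a pooled standard deviation, using the data rather than the rounded summaries above:

# Sketch: Cohen's d for the High vs. Low GPA difference
high_gpa <- StudentData4$OVERALL_GPA[StudentData4$Rank == "High"]
low_gpa  <- StudentData4$OVERALL_GPA[StudentData4$Rank == "Low"]
n_high <- sum(!is.na(high_gpa))
n_low  <- sum(!is.na(low_gpa))
pooled_sd <- sqrt(((n_high - 1) * var(high_gpa, na.rm = TRUE) +
                   (n_low  - 1) * var(low_gpa,  na.rm = TRUE)) / (n_high + n_low - 2))
(mean(high_gpa, na.rm = TRUE) - mean(low_gpa, na.rm = TRUE)) / pooled_sd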

Studies on students with a GPA equal to or higher than 3.8

### How many students with a GPA of 3.8 or higher (and what percentage) fall into each GRE rank?
StudentData5 <- StudentData4[which(StudentData4$OVERALL_GPA>=3.8), ]
1105/1743 # About 63.40% of the 1743 students have a GPA of 3.8 or higher
## [1] 0.6339644
summary(StudentData5$Rank)
##    Low Middle   High 
##    235    550    320
235/1105
## [1] 0.2126697
550/1105
## [1] 0.4977376
320/1105
## [1] 0.2895928
# Among the 1105 students with a GPA of 3.8 or higher, 235 (21.27%) fall into the low GRE rank, 550 (49.77%) into the middle rank, and 320 (28.96%) into the high rank.
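
The hard-coded ratios above can also be produced directly from the subset, which avoids retyping the counts; the same one-liner works for the low-GPA subset (StudentData6) in the next step.

# Sketch: share of each GRE rank among students with GPA >= 3.8
round(prop.table(table(StudentData5$Rank)), 4)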

Studies on students with a GPA equal to or lower than 3.0

### How many students with a GPA of 3.0 or lower (and what percentage) fall into each GRE rank?

StudentData6 <- StudentData4[which(StudentData4$OVERALL_GPA<=3.0), ]
105/1743 # About 6.02% of the 1743 students have a GPA of 3.0 or lower
## [1] 0.06024096
summary(StudentData6$Rank)
##    Low Middle   High 
##     46     45     14
46/105
## [1] 0.4380952
45/105
## [1] 0.4285714
14/105
## [1] 0.1333333
# Among the 105 students with a GPA of 3.0 or lower, 46 (43.81%) fall into the low GRE rank, 45 (42.86%) into the middle rank, and 14 (13.33%) into the high rank.