Step 1: Load the essential package and read the data table
# Step 1: Load the essential package and read the data table
library(readxl)
library(dplyr)
library(ggplot2)
# Load the transformation referrence table
trans<-read_xlsx("TransTable.xlsx")
# Load the data set
S201040 <-read_xlsx("uncleaned Data.xlsx",sheet = 1)
S201110 <-read_xlsx("uncleaned Data.xlsx",sheet = 2)
S201130 <-read_xlsx("uncleaned Data.xlsx",sheet = 3)
S201140 <-read_xlsx("uncleaned Data.xlsx",sheet = 4)
S201210 <-read_xlsx("uncleaned Data.xlsx",sheet = 5)
S201230 <-read_xlsx("uncleaned Data.xlsx",sheet = 6)
S201240 <-read_xlsx("uncleaned Data.xlsx",sheet = 7)
S201310 <-read_xlsx("uncleaned Data.xlsx",sheet = 8)
S201330 <-read_xlsx("uncleaned Data.xlsx",sheet = 9)
S201340 <-read_xlsx("uncleaned Data.xlsx",sheet = 10)
S201410 <-read_xlsx("uncleaned Data.xlsx",sheet = 11)
S201430 <-read_xlsx("uncleaned Data.xlsx",sheet = 12)
S201440 <-read_xlsx("uncleaned Data.xlsx",sheet = 13)
S201510 <-read_xlsx("uncleaned Data.xlsx",sheet = 14)
S201530 <-read_xlsx("uncleaned Data.xlsx",sheet = 15)
S201540 <-read_xlsx("uncleaned Data.xlsx",sheet = 16)
S201610 <-read_xlsx("uncleaned Data.xlsx",sheet = 17)
S201630 <-read_xlsx("uncleaned Data.xlsx",sheet = 18)
S201640 <-read_xlsx("uncleaned Data.xlsx",sheet = 19)
S201710 <-read_xlsx("uncleaned Data.xlsx",sheet = 20)
S201730 <-read_xlsx("uncleaned Data.xlsx",sheet = 21)
S201740 <-read_xlsx("uncleaned Data.xlsx",sheet = 22)
S201810 <-read_xlsx("uncleaned Data.xlsx",sheet = 23)
S201830 <-read_xlsx("uncleaned Data.xlsx",sheet = 24)
S201840 <-read_xlsx("uncleaned Data.xlsx",sheet = 25)
S201910 <-read_xlsx("uncleaned Data.xlsx",sheet = 26)
S201930 <-read_xlsx("uncleaned Data.xlsx",sheet = 27)
S201940 <-read_xlsx("uncleaned Data.xlsx",sheet = 28)
Step 2: Integrated Dataset
# Put Tags on each dataset
S201040$Semester <- 201040
S201110$Semester <- 201110
S201130$Semester <- 201130
S201140$Semester <- 201140
S201210$Semester <- 201210
S201230$Semester <- 201230
S201240$Semester <- 201240
S201310$Semester <- 201310
S201330$Semester <- 201330
S201340$Semester <- 201340
S201410$Semester <- 201410
S201430$Semester <- 201430
S201440$Semester <- 201440
S201510$Semester <- 201510
S201530$Semester <- 201530
S201540$Semester <- 201540
S201610$Semester <- 201610
S201630$Semester <- 201630
S201640$Semester <- 201640
S201710$Semester <- 201710
S201730$Semester <- 201730
S201740$Semester <- 201740
S201810$Semester <- 201810
S201830$Semester <- 201830
S201840$Semester <- 201840
S201910$Semester <- 201910
S201930$Semester <- 201930
S201940$Semester <- 201940
# Put them into one large dataset
StudentData <- rbind(S201040,S201110,S201130,S201140,S201210,S201230,S201240,S201310,S201330,S201340,S201410,S201430,S201440,S201510,S201530,S201540,S201610,S201630,S201640,S201710,S201730,S201740,S201810,S201830,S201840,S201910,S201930,S201940)
# keep records with completed GRE_TOTAL or GRE_REVISED_TOTAL
StudentData2 <-subset(StudentData, !is.na(GRE_TOTAL) | !is.na(GRE_REVISED_TOTAL))
# Drop one extreme case
StudentData2 <- StudentData2[-which(StudentData2$CWID==11458820), ]
# Sort the data by Semester
StudentData2 <- StudentData2[order(StudentData2$Semester),]
# Drop the dublicates based on CWID
StudentData3 <- distinct(StudentData2,CWID,.keep_all = TRUE)
# Drop the Student whose total GPA is zero
StudentData4 <- StudentData3[-which(StudentData3$OVERALL_GPA==0), ]
# Drop the student whose degree is NDS
StudentData4 <- StudentData4[-which(StudentData4$DEGR1=="NDS"), ]
# write.csv(StudentData4,"cleaned_data")
Step 2: Transform the old GRE score
#round down to nearest 10
StudentData4$GRE_VERBAL<-StudentData4$GRE_VERBAL%/%10*10
StudentData4$GRE_QUANT<-StudentData4$GRE_QUANT%/%10*10
library("dplyr")
# Transform the old GRE data to new GRE data
for (i in 1:nrow(StudentData4)) {
if (is.na(StudentData4[i,]$GRE_REVISED_TOTAL)==TRUE) {
a<-which(StudentData4[i,]$GRE_VERBAL==trans$PS)
StudentData4[i,]$GRE_RV_VERBAL<-trans[a,]$VRCS
b<-which(StudentData4[i,]$GRE_QUANT==trans$PS)
StudentData4[i,]$GRE_RV_QUANT<-trans[b,]$QRCS
StudentData4[i,]$GRE_RV_VERBAL_DATE<-StudentData4[i,]$GRE_VERBAL_DATE
StudentData4[i,]$GRE_RV_QUANT_DATE<-StudentData4[i,]$GRE_VERBAL_DATE
StudentData4[i,]$GRE_RV_DATE<-StudentData4[i,]$GRE_TOTAL_DATE
StudentData4[i,]$GRE_REVISED_TOTAL<-StudentData4[i,]$GRE_RV_VERBAL+StudentData4[i,]$GRE_RV_QUANT
}
}
# Emport the Data set into a excel file
# library("xlsx")
# write.xlsx(StudentData3, "StudentData.xlsx")
# write.csv(StudentData3, file = "StudentData.csv")
Description of the Analysis Sample
library(ggpubr)
## Loading required package: magrittr
# Check the descriptive data by semester
Sem_sum <- StudentData4 %>%
group_by(Semester) %>%
summarize(Number= n())
Sem_sum
## # A tibble: 28 x 2
## Semester Number
## <dbl> <int>
## 1 201040 420
## 2 201110 28
## 3 201130 33
## 4 201140 100
## 5 201210 30
## 6 201230 24
## 7 201240 87
## 8 201310 33
## 9 201330 32
## 10 201340 77
## # … with 18 more rows
# Create the bar plot. Use theme_pubclean() [in ggpubr]
ggplot(Sem_sum, aes(x = Semester, y =Number )) +
geom_bar(fill = "#0073C2FF", stat = "identity") +
geom_text(aes(label = Number), vjust = -0.3) +
theme_pubclean()
# Check the descriptive data by degree
Degree_sum <- StudentData4 %>%
group_by(DEGR1) %>%
summarize(Number= n())
Degree_sum
## # A tibble: 5 x 2
## DEGR1 Number
## <chr> <int>
## 1 CRT 5
## 2 EDD 193
## 3 EDS 137
## 4 MA 1082
## 5 PHD 326
# Create the bar plot. Use theme_pubclean()
ggplot(Degree_sum, aes(x = DEGR1, y =Number )) +
geom_bar(fill = "#0073C2FF", stat = "identity") +
geom_text(aes(label = Number), vjust = -0.3) +
theme_pubclean()
# Check the descriptive data by major
Major_sum <- StudentData4 %>%
group_by(MAJR1) %>%
summarize(Number= n())
Major_sum
## # A tibble: 27 x 2
## MAJR1 Number
## <chr> <int>
## 1 CECM 51
## 2 CECO 16
## 3 CERG 62
## 4 CESC 42
## 5 CLTC 4
## 6 COED 32
## 7 EAEM 3
## 8 EASE 5
## 9 EDAD 12
## 10 EDLE 63
## # … with 17 more rows
Divde students into 25% 25-75% and 75% according to GRE score
# Create a new variable called Percentile and divided Student into three groups based on their precentile
StudentData4 <- StudentData4 %>%
mutate(Percentile = percent_rank(GRE_REVISED_TOTAL)) %>%
mutate(Rank = case_when(Percentile<=.25 ~ "Low",
Percentile>.25 & Percentile<=.75 ~ "Middle",
Percentile > .75 ~ "High"))
# Check the frequency for each rank
table(StudentData4$Rank) # This shows a normal distribution
##
## High Low Middle
## 428 479 836
# CrossTable between degree and GRE Rank
table(StudentData4$DEGR1,StudentData4$Rank)
##
## High Low Middle
## CRT 2 0 3
## EDD 40 48 105
## EDS 30 36 71
## MA 273 305 504
## PHD 83 90 153
# CrossTable between Semester and GRE Rank
table(StudentData4$Semester,StudentData4$Rank)
##
## High Low Middle
## 201040 67 120 233
## 201110 5 6 17
## 201130 4 9 20
## 201140 22 30 48
## 201210 10 7 13
## 201230 5 9 10
## 201240 29 21 37
## 201310 5 10 18
## 201330 10 11 11
## 201340 23 19 35
## 201410 2 7 18
## 201430 4 9 9
## 201440 17 32 39
## 201510 4 5 8
## 201530 7 10 16
## 201540 18 35 45
## 201610 6 6 10
## 201630 10 6 8
## 201640 33 23 52
## 201710 4 6 11
## 201730 8 8 12
## 201740 30 14 33
## 201810 6 7 9
## 201830 6 8 18
## 201840 34 31 42
## 201910 7 10 10
## 201930 11 1 8
## 201940 41 19 46
# Most of the semester shows a normal distribution, however, 2015 summer & 2017 spring is skwed. Among all the semesters, the 2019 spring is the most problematic one becuase most of the admission from this semester are in the low rank for their GRE scores.
Can GRE Rank predict Student’s GPA
# Check the correlation between GPA and GRE score
StudentData4$Rank <- ordered(StudentData4$Rank,levels=c("Low","Middle","High"))
cor.test(StudentData4$OVERALL_GPA,as.numeric(StudentData4$Rank),method="spearman")
## Warning in cor.test.default(StudentData4$OVERALL_GPA,
## as.numeric(StudentData4$Rank), : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: StudentData4$OVERALL_GPA and as.numeric(StudentData4$Rank)
## S = 673047948, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.2334338
# Scatter plot between GRE precentile and GPA score
plot(StudentData4$Percentile, StudentData4$OVERALL_GPA, main="Scatterplot Example",
xlab="GRE Percentile", ylab="Overall GPA", pch=19)
# Use the boxplot to detect the outliers
boxplot(StudentData4$OVERALL_GPA, main="Overall GPA", boxwex=0.1)
# Run a simple regression between GPA and GRE score
model <- lm(OVERALL_GPA~Rank,data=StudentData4)
summary(model)
##
## Call:
## lm(formula = OVERALL_GPA ~ Rank, data = StudentData4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.96329 -0.07226 0.14714 0.24714 0.36971
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.73514 0.01025 364.391 < 2e-16 ***
## Rank.L 0.13574 0.01929 7.035 2.85e-12 ***
## Rank.Q -0.02171 0.01607 -1.351 0.177
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4098 on 1737 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.02914, Adjusted R-squared: 0.02803
## F-statistic: 26.07 on 2 and 1737 DF, p-value: 6.986e-12
# Check our model prediction
StudentData4$prediction <- predict(model,StudentData4,level=0.95)
StudentData4$prediction_diff <- abs(StudentData4$prediction - StudentData4$OVERALL_GPA)
mean(StudentData4$prediction_diff)
## [1] NA
t.test(StudentData4$prediction,StudentData4$OVERALL_GPA,paired=TRUE)
##
## Paired t-test
##
## data: StudentData4$prediction and StudentData4$OVERALL_GPA
## t = -1.1982e-12, df = 1739, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.01925622 0.01925622
## sample estimates:
## mean of the differences
## -1.176421e-14
# Check the difference between subgroups
al <- aov(OVERALL_GPA~Rank,data=StudentData4)
summary(al)
## Df Sum Sq Mean Sq F value Pr(>F)
## Rank 2 8.76 4.378 26.07 6.99e-12 ***
## Residuals 1737 291.67 0.168
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 3 observations deleted due to missingness
TukeyHSD(al)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = OVERALL_GPA ~ Rank, data = StudentData4)
##
## $Rank
## diff lwr upr p adj
## Middle-Low 0.12256939 0.06743842 0.1777004 0.0000006
## High-Low 0.19196238 0.12795694 0.2559678 0.0000000
## High-Middle 0.06939299 0.01220657 0.1265794 0.0124311
Calculating the average differences and standard deviation difference
# Pull out students with 0 GPAs
# StudentData4 <- StudentData4[which(StudentData4$OVERALL_GPA==0), ]
# write.csv(StudentData4, file = "Promblematic_StudentData.csv")
# Calculating the standard deviation difference
GroupSD <- StudentData4 %>%
group_by(Rank) %>%
summarise(sd_GPA=sd(OVERALL_GPA,na.rm=TRUE))
GroupSD
## # A tibble: 3 x 2
## Rank sd_GPA
## <ord> <dbl>
## 1 Low 0.471
## 2 Middle 0.409
## 3 High 0.330
# Calculating the SD differences between each groups
High_Middle <- GroupSD[3,2] - GroupSD[2,2]
Middle_Low <- GroupSD[2,2] - GroupSD[1,2]
High_Low <- GroupSD[3,2] - GroupSD[1,2]
High_Middle # 0.0878
## sd_GPA
## 1 -0.07870794
Middle_Low # 0.0596
## sd_GPA
## 1 -0.06258332
High_Low # 0.1474
## sd_GPA
## 1 -0.1412913
# Calculating the average differences
GroupMeans <- StudentData4 %>%
group_by(Rank) %>%
summarise(Average_GPA=mean(OVERALL_GPA,na.rm = TRUE))
GroupMeans
## # A tibble: 3 x 2
## Rank Average_GPA
## <ord> <dbl>
## 1 Low 3.63
## 2 Middle 3.75
## 3 High 3.82
Studuies on students who have a euqal to or higher than 3.8 GPA
### How many people have higher or equal to than 3.8 (what percentage) fall into different categroies of the GRE ranks.
StudentData5 <- StudentData4[which(StudentData4$OVERALL_GPA>=3.8), ]
1105/1743 # About 63.40% students have a GPA higher or equal to 3.8
## [1] 0.6339644
summary(StudentData5$Rank)
## Low Middle High
## 235 550 320
235/1105
## [1] 0.2126697
550/1105
## [1] 0.4977376
320/1105
## [1] 0.2895928
# Among 1105 students who have a GPA higher or equal to 3.8, 235 students (21.27%) fall into the GRE low rank, 550 students (49.77%) fall into the GRE Middle rank, and 320 students (28.95%) fall into the GRE High rank.
Studuies on students who have a euqal to or lower than 3.0 GPA
### How many student have lower or equal to 3.0 GPA (what percentage) fall into different categories of the GRE ranks.
StudentData6 <- StudentData4[which(StudentData4$OVERALL_GPA<=3.0), ]
105/1743 # About 6.02% students have a GPA lower or equal to 3.0
## [1] 0.06024096
summary(StudentData6$Rank)
## Low Middle High
## 46 45 14
46/105
## [1] 0.4380952
45/105
## [1] 0.4285714
14/105
## [1] 0.1333333
# Among 105 students who have a GPA lower or equal to 3.0, 46 students (43.81%) fall into the GRE low rank, 45 students (42.85%) fall into the GRE Middle rank, and 14 students (13.33%) fall into the GRE High rank.