#####################Read and Pre-Clean the Data#######################
require(psych) #to describe
## Loading required package: psych
## Warning: package 'psych' was built under R version 3.5.1
require(reticulate) #to use Python in R as well
## Loading required package: reticulate
## Warning: package 'reticulate' was built under R version 3.5.1
mydata=read.csv("C:/Users/lfult/OneDrive - Texas State University/BHA2/Peregrine Analysis/combinedposttest.csv")
mydata[,1:2]=NULL
str(mydata)
## 'data.frame': 45 obs. of 19 variables:
## $ Gender : Factor w/ 2 levels "F","M": 1 2 1 1 1 1 1 1 1 1 ...
## $ Ethnicity : Factor w/ 4 levels "A","B","C","H": 3 2 4 4 4 4 3 3 3 2 ...
## $ Cohort : num 2018 2018 2018 2018 2018 ...
## $ Fin : int 60 50 60 60 50 80 70 40 70 60 ...
## $ Mgt : int 70 60 50 70 70 80 100 70 100 70 ...
## $ Pers : int 70 80 70 50 60 70 80 80 80 70 ...
## $ SysOrg : int 80 50 90 60 70 50 40 60 80 60 ...
## $ IM : int 40 60 40 60 60 90 50 50 50 50 ...
## $ Ldrship : int 60 80 70 60 70 50 60 90 60 90 ...
## $ ChangeMgt : int 40 40 70 80 60 60 90 60 40 90 ...
## $ ClimCult : int 80 80 80 80 100 70 70 90 90 70 ...
## $ QI : int 60 50 50 60 60 60 60 80 80 50 ...
## $ Quant : int NA NA NA NA NA NA NA NA NA NA ...
## $ Strat_Mark: int 70 70 70 70 80 60 40 60 70 90 ...
## $ Comm_Envir: int 60 80 70 60 60 70 80 80 50 70 ...
## $ Legal : int 70 70 60 80 70 70 90 80 70 70 ...
## $ Score : num 63.3 64.2 65 65.8 67.5 ...
## $ Duration : num 66.7 71.2 57.8 74 53.9 ...
## $ Percent : int 58 61 64 67 72 72 77 79 79 79 ...
#########################################################################
I manually coded the variable “Minority Group” with non-Census Bureau categories of {C = caucasion / non-Hispanic, B = African American or Associated Minority Group / non-Hispanic, A = Asian / Other, H = Hispanic regardless of C or B Primary Classification}. I used this coding scheme to reflect that we are a Hispanic-serving institution. Eventually, I will categorize by race and ethnicity separately using Census coding.
#############################Descriptives 1##############################
describe(mydata[,4:ncol(mydata)])
## vars n mean sd median trimmed mad min max range
## Fin 1 45 69.78 14.85 70.00 70.00 14.83 40.00 100.00 60.00
## Mgt 2 45 77.33 12.86 80.00 77.03 14.83 50.00 100.00 50.00
## Pers 3 45 78.89 11.91 80.00 78.92 14.83 50.00 100.00 50.00
## SysOrg 4 45 70.44 13.81 70.00 71.08 14.83 40.00 100.00 60.00
## IM 5 45 65.56 16.17 60.00 64.86 14.83 40.00 100.00 60.00
## Ldrship 6 45 75.78 13.40 80.00 76.49 14.83 50.00 100.00 50.00
## ChangeMgt 7 45 73.56 17.47 80.00 75.14 14.83 30.00 100.00 70.00
## ClimCult 8 45 78.67 11.98 80.00 78.65 14.83 50.00 100.00 50.00
## QI 9 45 71.11 14.34 70.00 71.35 14.83 40.00 100.00 60.00
## Quant 10 12 52.50 18.15 50.00 53.00 22.24 20.00 80.00 60.00
## Strat_Mark 11 45 73.56 14.95 70.00 73.51 14.83 40.00 100.00 60.00
## Comm_Envir 12 45 69.78 11.96 70.00 69.73 14.83 50.00 90.00 40.00
## Legal 13 45 76.44 11.31 80.00 76.49 14.83 50.00 100.00 50.00
## Score 14 45 73.00 5.27 73.07 73.06 5.69 62.30 85.83 23.53
## Duration 15 45 56.40 12.40 56.37 56.22 14.53 34.37 82.93 48.56
## Percent 16 45 83.47 11.17 86.00 84.73 10.38 54.00 98.00 44.00
## skew kurtosis se
## Fin -0.17 -0.87 2.21
## Mgt -0.01 -0.89 1.92
## Pers -0.19 -0.44 1.78
## SysOrg -0.43 -0.16 2.06
## IM 0.38 -0.57 2.41
## Ldrship -0.33 -0.76 2.00
## ChangeMgt -0.69 -0.46 2.60
## ClimCult -0.14 -0.51 1.79
## QI -0.05 -0.96 2.14
## Quant -0.17 -1.25 5.24
## Strat_Mark -0.05 -0.69 2.23
## Comm_Envir -0.04 -0.89 1.78
## Legal -0.03 -0.51 1.69
## Score -0.04 -0.60 0.79
## Duration 0.12 -0.97 1.85
## Percent -0.90 -0.10 1.67
par(mai=c(.3,.3,.3,.3))
par(mfrow=c(5,4))
table(mydata$Gender)
##
## F M
## 33 12
table(mydata$Ethnicity)
##
## A B C H
## 4 7 26 8
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
gf=ggplot(mydata, aes(x=Score, fill=Ethnicity))+
geom_histogram(position="identity", colour="grey40", alpha=0.2, bins =5)+facet_grid(.~Gender)
gf
for (i in 4:ncol(mydata)){
hist(na.omit(mydata[,i]), ylim=c(0,15), xlim=c(0,100),xlab=NULL, ylab=NULL, main=colnames(mydata)[i])
boxplot(mydata[,i],horizontal=TRUE,xlab=NULL,ylim=c(0,100), ylab=NULL, main=colnames(mydata)[i])}
myagg=aggregate(mydata[,4:ncol(mydata)], by=list(mydata$Gender), mean)
myagg
## Group.1 Fin Mgt Pers SysOrg IM Ldrship ChangeMgt
## 1 F 68.18182 77.27273 78.18182 70.60606 64.24242 73.63636 75.75758
## 2 M 74.16667 77.50000 80.83333 70.00000 69.16667 81.66667 67.50000
## ClimCult QI Quant Strat_Mark Comm_Envir Legal Score Duration
## 1 77.57576 71.51515 NA 72.12121 67.27273 75.45455 72.14909 56.68636
## 2 81.66667 70.00000 NA 77.50000 76.66667 79.16667 75.32250 55.62750
## Percent
## 1 81.81818
## 2 88.00000
myagg2=aggregate(mydata[,4:ncol(mydata)], by=list(mydata$Ethnicity), mean)
myagg2
## Group.1 Fin Mgt Pers SysOrg IM Ldrship ChangeMgt
## 1 A 62.50000 87.50000 72.50000 70.00000 75.00000 72.50000 72.50000
## 2 B 74.28571 71.42857 78.57143 70.00000 68.57143 81.42857 74.28571
## 3 C 70.00000 77.69231 83.07692 69.61538 64.61538 75.76923 75.38462
## 4 H 68.75000 76.25000 68.75000 73.75000 61.25000 72.50000 67.50000
## ClimCult QI Quant Strat_Mark Comm_Envir Legal Score Duration
## 1 70.00000 72.50000 NA 77.50000 67.50000 85.00000 72.93000 51.05500
## 2 80.00000 57.14286 NA 78.57143 74.28571 75.71429 73.10000 64.08571
## 3 79.61538 75.38462 NA 72.69231 70.00000 75.76923 73.74923 53.26000
## 4 78.75000 68.75000 NA 70.00000 66.25000 75.00000 70.48625 62.57500
## Percent
## 1 85.00000
## 2 84.28571
## 3 84.53846
## 4 78.50000
myagg3=aggregate(mydata[,4:ncol(mydata)], by=list(mydata$Cohort), mean)
myagg3
## Group.1 Fin Mgt Pers SysOrg IM Ldrship ChangeMgt
## 1 2018.2 70.00000 77.87879 80.30303 70.90909 65.45455 76.9697 73.93939
## 2 2019.1 69.16667 75.83333 75.00000 69.16667 65.83333 72.5000 72.50000
## ClimCult QI Quant Strat_Mark Comm_Envir Legal Score Duration
## 1 80 70.30303 NA 73.0303 70.00000 76.36364 73.7600 55.52727
## 2 75 73.33333 52.5 75.0000 69.16667 76.66667 70.8925 58.81500
## Percent
## 1 84.84848
## 2 79.66667
#########################################################################
The final score is a linear combination of the other scores, and percentile rank derives from that. Percentile rank is omitted for the correlation analysis, and Spearman’s is used to avoid assumptions of normality necessary for Pearson’s. While final score is retained, it must be correlated with subscores as it is built from them.
A scatterplot illustrates that Duration is not related to overall final grade performance. There are some localized associations between duration and subscores, though.
#############################Descriptives 2##############################
mycor=round(cor(mydata[,-c(1,2,3)], method="spearman"),2)
#by definition, final is a linear combination of other scores
#% rank follows from that.
mycor[upper.tri(mycor)==TRUE]=NA
cor.test(mydata$Pers,mydata$Duration, method="spearman")
## Warning in cor.test.default(mydata$Pers, mydata$Duration, method =
## "spearman"): Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: mydata$Pers and mydata$Duration
## S = 20847, p-value = 0.01155
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.3732965
cor.test(mydata$Legal, mydata$Mgt, method="spearman")
## Warning in cor.test.default(mydata$Legal, mydata$Mgt, method = "spearman"):
## Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: mydata$Legal and mydata$Mgt
## S = 11446, p-value = 0.1034
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.2459801
library(reshape2)
meltcor=melt(mycor)
library(ggplot2)
ggplot(data = meltcor, aes(Var2, Var1, fill = value))+
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 10, hjust = 1))+
coord_fixed()
ggplot(mydata, aes(x=Duration, y=Score)) +
geom_point()+
geom_smooth(method=lm)
# Remove the confidence interval
ggplot(mydata, aes(x=Duration, y=Score)) +
geom_point()+
geom_smooth(method=lm, se=FALSE)
# Loess method
ggplot(mydata, aes(x=Duration, y=Score)) +
geom_point()+
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#########################################################################
######################Basic Inferentials########################
myt=t.test(mydata$Score~mydata$Gender)
myt
##
## Welch Two Sample t-test
##
## data: mydata$Score by mydata$Gender
## t = -1.8751, df = 20.448, p-value = 0.07513
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.6986721 0.3518539
## sample estimates:
## mean in group F mean in group M
## 72.14909 75.32250
myt2=t.test(mydata$Score~mydata$Cohort)
myt2
##
## Welch Two Sample t-test
##
## data: mydata$Score by mydata$Cohort
## t = 1.8218, df = 24.221, p-value = 0.08086
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.3794654 6.1144654
## sample estimates:
## mean in group 2018.2 mean in group 2019.1
## 73.7600 70.8925
myaov=aov(mydata$Score~mydata$Ethnicity)
summary(myaov)
## Df Sum Sq Mean Sq F value Pr(>F)
## mydata$Ethnicity 3 65.2 21.75 0.77 0.518
## Residuals 41 1158.2 28.25
mytukeys=TukeyHSD(myaov)
mytukeys
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = mydata$Score ~ mydata$Ethnicity)
##
## $`mydata$Ethnicity`
## diff lwr upr p adj
## B-A 0.1700000 -8.750140 9.090140 0.9999515
## C-A 0.8192308 -6.824376 8.462837 0.9916370
## H-A -2.4437500 -11.158802 6.271302 0.8757869
## C-B 0.6492308 -5.410811 6.709273 0.9916474
## H-B -2.6137500 -9.979313 4.751813 0.7780422
## H-C -3.2629808 -9.016879 2.490917 0.4360406
plot(mytukeys)
################################################################