#####################Read and Pre-Clean the Data#######################
require(psych) #to describe
## Loading required package: psych
## Warning: package 'psych' was built under R version 3.5.3
require(reticulate) #to use Python in R as well
## Loading required package: reticulate
## Warning: package 'reticulate' was built under R version 3.5.3
mydata=read.csv("C:/Users/lf25/OneDrive - Texas State University/BHA2/Peregrine Analysis/spring2019posttest.csv")
mydata[,1:2]=NULL
str(mydata)
## 'data.frame': 12 obs. of 19 variables:
## $ Gender : Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
## $ Ethnicity : Factor w/ 4 levels "A","B","C","H": 3 1 4 4 3 3 3 2 4 3 ...
## $ Fin : int 70 50 80 50 60 70 90 90 90 70 ...
## $ Mgt : int 80 90 90 80 60 60 60 80 80 70 ...
## $ Pers : int 80 80 80 70 80 60 90 90 70 70 ...
## $ SysOrg : int 50 70 80 90 70 40 80 80 70 80 ...
## $ IM : int 50 70 70 50 70 90 60 80 60 40 ...
## $ Ldrship : int 70 80 80 80 50 90 60 70 80 70 ...
## $ ChangeMgt : int 70 90 50 60 90 70 90 90 70 50 ...
## $ ClimCult : int 100 60 70 80 50 90 70 80 90 80 ...
## $ QI : int 80 60 100 70 60 80 60 70 80 70 ...
## $ Quant : int 20 30 80 70 40 70 40 50 60 50 ...
## $ Strat_Mark: int 70 80 70 90 50 60 100 70 70 60 ...
## $ Comm_Envir: int 70 70 60 50 50 70 70 60 80 70 ...
## $ Legal : int 70 70 90 80 80 80 70 80 80 70 ...
## $ Score : num 67.7 69.2 76.9 70.8 62.3 ...
## $ Duration : num 47.2 46.6 53.9 76.7 60.2 ...
## $ Percent : int 73 77 92 81 54 83 85 91 90 65 ...
## $ Cohort : num 2019 2019 2019 2019 2019 ...
#########################################################################
I manually coded the variable “Minority Group” with non-Census Bureau categories of {C = caucasion / non-Hispanic, B = African American or Associated Minority Group / non-Hispanic, A = Asian / Other, H = Hispanic regardless of C or B Primary Classification}. I used this coding scheme to reflect that we are a Hispanic-serving institution. Eventually, I will categorize by race and ethnicity separately using Census coding.
#############################Descriptives 1##############################
describe(mydata[,3:ncol(mydata)])
## vars n mean sd median trimmed mad min max
## Fin 1 12 69.17 16.76 70.00 70.00 22.24 40.0 90.00
## Mgt 2 12 75.83 13.79 80.00 75.00 14.83 60.0 100.00
## Pers 3 12 75.00 10.00 75.00 75.00 7.41 60.0 90.00
## SysOrg 4 12 69.17 14.43 70.00 70.00 14.83 40.0 90.00
## IM 5 12 65.83 14.43 70.00 66.00 14.83 40.0 90.00
## Ldrship 6 12 72.50 10.55 70.00 73.00 14.83 50.0 90.00
## ChangeMgt 7 12 72.50 14.85 70.00 73.00 22.24 50.0 90.00
## ClimCult 8 12 75.00 14.46 75.00 75.00 14.83 50.0 100.00
## QI 9 12 73.33 13.03 70.00 72.00 14.83 60.0 100.00
## Quant 10 12 52.50 18.15 50.00 53.00 22.24 20.0 80.00
## Strat_Mark 11 12 75.00 15.67 70.00 75.00 14.83 50.0 100.00
## Comm_Envir 12 12 69.17 13.11 70.00 69.00 14.83 50.0 90.00
## Legal 13 12 76.67 12.31 80.00 77.00 14.83 50.0 100.00
## Score 14 12 70.89 4.36 71.15 71.15 3.99 62.3 76.92
## Duration 15 12 58.81 13.51 58.56 59.02 19.39 38.9 76.72
## Percent 16 12 79.67 11.26 82.00 81.00 9.64 54.0 92.00
## Cohort 17 12 2019.10 0.00 2019.10 2019.10 0.00 2019.1 2019.10
## range skew kurtosis se
## Fin 50.00 -0.20 -1.33 4.84
## Mgt 40.00 0.13 -1.44 3.98
## Pers 30.00 0.00 -1.27 2.89
## SysOrg 50.00 -0.53 -0.86 4.17
## IM 50.00 -0.15 -1.09 4.17
## Ldrship 40.00 -0.45 -0.40 3.05
## ChangeMgt 40.00 -0.09 -1.44 4.29
## ClimCult 50.00 0.00 -1.12 4.17
## QI 40.00 0.56 -0.93 3.76
## Quant 60.00 -0.17 -1.25 5.24
## Strat_Mark 50.00 0.26 -1.16 4.52
## Comm_Envir 40.00 0.14 -1.10 3.79
## Legal 50.00 -0.22 0.06 3.55
## Score 14.62 -0.37 -0.93 1.26
## Duration 37.82 -0.03 -1.65 3.90
## Percent 38.00 -0.90 -0.22 3.25
## Cohort 0.00 NaN NaN 0.00
par(mai=c(.3,.3,.3,.3))
par(mfrow=c(5,4))
table(mydata$Gender)
##
## F M
## 11 1
table(mydata$Ethnicity)
##
## A B C H
## 1 2 6 3
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
gf=ggplot(mydata, aes(x=Score, fill=Ethnicity))+
geom_histogram(position="identity", colour="grey40", alpha=0.2, bins =3)+facet_grid(.~Gender)
gf
for (i in 3:ncol(mydata)){
hist(mydata[,i], ylim=c(0,15), xlim=c(0,100),xlab=NULL, ylab=NULL, main=colnames(mydata)[i])
boxplot(mydata[,i],horizontal=TRUE,xlab=NULL,ylim=c(0,100), ylab=NULL, main=colnames(mydata)[i])}
myagg=aggregate(mydata[,3:ncol(mydata)], by=list(mydata$Gender), mean)
myagg
## Group.1 Fin Mgt Pers SysOrg IM Ldrship ChangeMgt
## 1 F 69.09091 77.27273 75.45455 70 64.54545 72.72727 72.72727
## 2 M 70.00000 60.00000 70.00000 60 80.00000 70.00000 70.00000
## ClimCult QI Quant Strat_Mark Comm_Envir Legal Score
## 1 75.45455 74.54545 52.72727 72.72727 67.27273 74.54545 70.69455
## 2 70.00000 60.00000 50.00000 100.00000 90.00000 100.00000 73.07000
## Duration Percent Cohort
## 1 58.43636 79.09091 2019.1
## 2 62.98000 86.00000 2019.1
myagg2=aggregate(mydata[,3:ncol(mydata)], by=list(mydata$Ethnicity), mean)
myagg2
## Group.1 Fin Mgt Pers SysOrg IM Ldrship ChangeMgt
## 1 A 50.00000 90.00000 80.00000 70.00000 70.00000 80.00000 90.00000
## 2 B 80.00000 70.00000 80.00000 70.00000 80.00000 70.00000 80.00000
## 3 C 66.66667 71.66667 73.33333 63.33333 63.33333 68.33333 73.33333
## 4 H 73.33333 83.33333 73.33333 80.00000 60.00000 80.00000 60.00000
## ClimCult QI Quant Strat_Mark Comm_Envir Legal Score
## 1 60 60.00000 30.00000 80.00000 70.00000 70.00000 69.23000
## 2 75 65.00000 50.00000 85.00000 75.00000 90.00000 74.61000
## 3 75 73.33333 48.33333 70.00000 70.00000 70.00000 68.20000
## 4 80 83.33333 70.00000 76.66667 63.33333 83.33333 74.35333
## Duration Percent Cohort
## 1 46.570 77.00000 2019.1
## 2 68.980 88.50000 2019.1
## 3 52.915 73.16667 2019.1
## 4 67.920 87.66667 2019.1
#########################################################################
The final score is a linear combination of the other scores, and percentile rank derives from that. Percentile rank is omitted for the correlation analysis, and Spearman’s is used to avoid assumptions of normality necessary for Pearson’s. While final score is retained, it must be correlated with subscores as it is built from them.
A scatterplot illustrates that Duration is not related to overall final grade performance. There are some localized associations between duration and subscores, though.
#############################Descriptives 2##############################
mycor=round(cor(mydata[,-c(1,2)], method="spearman"),2)
## Warning in cor(mydata[, -c(1, 2)], method = "spearman"): the standard
## deviation is zero
mycor[upper.tri(mycor)==TRUE]=NA
cor.test(mydata$Pers,mydata$Duration, method="spearman")
## Warning in cor.test.default(mydata$Pers, mydata$Duration, method =
## "spearman"): Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: mydata$Pers and mydata$Duration
## S = 356.78, p-value = 0.4381
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.2474708
cor.test(mydata$Legal, mydata$Mgt, method="spearman")
## Warning in cor.test.default(mydata$Legal, mydata$Mgt, method = "spearman"):
## Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: mydata$Legal and mydata$Mgt
## S = 374.39, p-value = 0.3283
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.3090672
library(reshape2)
meltcor=melt(mycor)
library(ggplot2)
ggplot(data = meltcor, aes(Var2, Var1, fill = value))+
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 10, hjust = 1))+
coord_fixed()
ggplot(mydata, aes(x=Duration, y=Score)) +
geom_point()+
geom_smooth(method=lm)
# Remove the confidence interval
ggplot(mydata, aes(x=Duration, y=Score)) +
geom_point()+
geom_smooth(method=lm, se=FALSE)
# Loess method
ggplot(mydata, aes(x=Duration, y=Score)) +
geom_point()+
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#########################################################################
######################Basic Inferentials########################
#myt=t.test(mydata$Score~mydata$Gender) all females
#myt
myaov=aov(mydata$Score~mydata$Ethnicity)
summary(myaov)
## Df Sum Sq Mean Sq F value Pr(>F)
## mydata$Ethnicity 3 109.83 36.61 2.945 0.0986 .
## Residuals 8 99.46 12.43
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mytukeys=TukeyHSD(myaov)
mytukeys
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = mydata$Score ~ mydata$Ethnicity)
##
## $`mydata$Ethnicity`
## diff lwr upr p adj
## B-A 5.3800000 -8.448994 19.208994 0.6179501
## C-A -1.0300000 -13.226026 11.166026 0.9925366
## H-A 5.1233333 -7.914767 18.161434 0.6109402
## C-B -6.4100000 -15.629329 2.809329 0.1956629
## H-B -0.2566667 -10.564190 10.050857 0.9998021
## H-C 6.1533333 -1.830840 14.137507 0.1405245
plot(mytukeys)
################################################################