##### Loading the data #####
studentData <- read.csv('Student-performance.csv', fileEncoding="UTF-8-BOM") #UTF-8-BOM prevents the first column being read in as 'ï..school'
##### Missing data handling (mean imputation) #####
#Where the final maths grade (mG3) is recorded as 0, impute it with the mean of the two period grades (mG1, mG2)
studentData$mG3 <- ifelse(studentData$mG3 == 0, (studentData$mG1 + studentData$mG2)/2, studentData$mG3)
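#A quick sanity check on the imputation (a minimal sketch) - count any remaining
#zero final grades (only rows where both period grades were also zero would remain)
sum(studentData$mG3 == 0)
summary(studentData$mG3)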
### Summary statistics of variables
library(Rcmdr)
## Loading required package: splines
## Loading required package: RcmdrMisc
## Loading required package: car
## Loading required package: carData
## Loading required package: sandwich
## Loading required package: effects
## Registered S3 methods overwritten by 'lme4':
## method from
## cooks.distance.influence.merMod car
## influence.merMod car
## dfbeta.influence.merMod car
## dfbetas.influence.merMod car
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
## The Commander GUI is launched only in interactive sessions
##
## Attaching package: 'Rcmdr'
## The following object is masked from 'package:base':
##
## errorCondition
library(abind, pos=18)
library(e1071, pos=19)
numSummary(studentData[,"failures.m", drop=FALSE],
groups=studentData$ï..school, statistics=c("mean", "sd", "IQR",
"quantiles"), quantiles=c(0,.25,.5,.75,1))
## mean sd IQR 0% 25% 50% 75% 100% failures.m:n
## GP 0.2894737 0.7431843 0.00 0 0 0 0.00 3 342
## MS 0.3000000 0.6076436 0.25 0 0 0 0.25 3 40
with(studentData, Hist(failures.m, groups=school, scale="frequency",
breaks="Sturges", col="darkgray"))
numSummary(studentData[,"mG3", drop=FALSE], groups=studentData$ï..school,
statistics=c("mean", "sd", "IQR", "quantiles"), quantiles=c(0,.25,.5,.75,1))
## mean sd IQR 0% 25% 50% 75% 100% mG3:n
## GP 11.10526 3.613558 5.00 2 9 11 14.00 20 342
## MS 10.20000 3.383216 4.25 5 8 10 12.25 19 40
with(studentData, Hist(mG3, groups=school, scale="frequency",
breaks="Sturges", col="darkgray"))
local({
.Table <- with(studentData, table(paid.m))
cat("\ncounts:\n")
print(.Table)
cat("\npercentages:\n")
print(round(100*.Table/sum(.Table), 2))
})
##
## counts:
## paid.m
## no yes
## 205 177
##
## percentages:
## paid.m
## no yes
## 53.66 46.34
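#The same counts and percentages without the Rcmdr local() idiom - a minimal base-R sketch
paidTable <- table(studentData$paid.m)
round(100*prop.table(paidTable), 2)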
needed_packages <- c("pastecs", "ggplot2", "semTools", "FSA")
# Extract not installed packages
not_installed <- needed_packages[!(needed_packages %in% installed.packages()[ , "Package"])]
# Install not installed packages
if(length(not_installed)) install.packages(not_installed)
library(pastecs) #For creating descriptive statistic summaries
library(ggplot2) #For creating histograms with more detail than plot
## Warning: package 'ggplot2' was built under R version 4.0.3
library(semTools) #For skewness and kurtosis
## Loading required package: lavaan
## This is lavaan 0.6-7
## lavaan is BETA software! Please report any bugs.
##
## ###############################################################################
## This is semTools 0.5-3
## All users of R (or SEM) are invited to submit functions or ideas for functions.
## ###############################################################################
##
## Attaching package: 'semTools'
## The following object is masked from 'package:RcmdrMisc':
##
## reliability
## The following object is masked from 'package:e1071':
##
## kurtosis
### For variable freetime.m
gg <- ggplot(studentData, aes(x=freetime.m))
#Change the label of the x axis
gg <- gg + labs(x="freetime")
#manage binwidth and colours
gg <- gg + geom_histogram(binwidth=5, colour="black", aes(y=..density.., fill=..count..))
gg <- gg + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
#adding a normal curve
#use stat_function to compute a normalised score for each value of freetime.m
#pass the mean and standard deviation
#use the na.rm parameter to say how missing values are handled
gg <- gg + stat_function(fun=dnorm, color="red",args=list(mean=mean(studentData$freetime.m, na.rm=TRUE), sd=sd(studentData$freetime.m,na.rm=TRUE)))
#to display the graph request the contents of the variable be shown
gg
### Generate Q-Q Plot
#Create a qqplot
qqnorm(studentData$freetime.m)
qqline(studentData$freetime.m, col=2) #show a line on the plot
### Generate Summary Statistics
#stat.desc is a function from pastecs - make sure you include the basic=F switch to ensure you don't get scientific notation
pastecs::stat.desc(studentData$freetime.m, basic=F)
## median mean SE.mean CI.mean.0.95 var std.dev
## 3.0000000 3.2225131 0.0505624 0.0994163 0.9766047 0.9882331
## coef.var
## 0.3066654
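#stat.desc can also report normality statistics directly (skewness, kurtosis and
#a Shapiro-Wilk test) via its norm argument - a minimal sketch
pastecs::stat.desc(studentData$freetime.m, basic=F, norm=TRUE)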
#We can make our decision based on the value of the standardised score for skew and kurtosis
#We divide the skew statistic by the standard error to get the standardised score
#This will indicate if we have a problem
tpskew<-semTools::skew(studentData$freetime.m)
tpkurt<-semTools::kurtosis(studentData$freetime.m)
tpskew[1]/tpskew[2]
## skew (g1)
## -1.169023
tpkurt[1]/tpkurt[2]
## Excess Kur (g2)
## -1.052723
#and by calculating the percentage of standardised scores for the variable itself that are outside our acceptable range
#This will tell us how big a problem we have
# Calculate the percentage of standardised scores that are greater than 1.96
# the perc function, part of the FSA package, calculates the percentage within a range - you can look for greater than "gt", greater than or equal "geq", less than or equal "leq", or less than "lt"
# scale is a function that creates z scores, abs gets absolute value
zfreetime<- abs(scale(studentData$freetime.m))
FSA::perc(as.numeric(zfreetime), 1.96, "gt")
## [1] 4.712042
FSA::perc(as.numeric(zfreetime), 3.29, "gt")
## [1] 0
### For variable studytime.m
#We will allocate the histogram to a variable to allow us to manipulate it
gg <- ggplot(studentData, aes(x=studytime.m))
#Change the label of the x axis
gg <- gg + labs(x="study time")
#manage binwidth and colours
gg <- gg + geom_histogram(binwidth=2, colour="black", aes(y=..density.., fill=..count..))
gg <- gg + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
#adding a normal curve
#use stat_function to compute a normalised score for each value of studytime.m
#pass the mean and standard deviation
#use the na.rm parameter to say how missing values are handled
gg <- gg + stat_function(fun=dnorm, color="red",args=list(mean=mean(studentData$studytime.m, na.rm=TRUE), sd=sd(studentData$studytime.m, na.rm=TRUE)))
#to display the graph request the contents of the variable be shown
gg
### Generate Q-Q Plot
#Create a qqplot
qqnorm(studentData$studytime.m)
qqline(studentData$studytime.m, col=2) #show a line on the plot
### Generate Summary Statistics
#stat.desc is a function from pastecs - make sure you include the basic=F switch to ensure you don't get scientific notation
pastecs::stat.desc(studentData$studytime.m, basic=F)
## median mean SE.mean CI.mean.0.95 var std.dev
## 2.00000000 2.03403141 0.04327479 0.08508732 0.71537426 0.84579800
## coef.var
## 0.41582347
#We can make our decision based on the value of the standardised score for skew and kurtosis
#We divide the skew statistic by the standard error to get the standardised score
#This will indicate if we have a problem
tpskew<-semTools::skew(studentData$studytime.m)
tpkurt<-semTools::kurtosis(studentData$studytime.m)
tpskew[1]/tpskew[2]
## skew (g1)
## 5.120306
tpkurt[1]/tpkurt[2]
## Excess Kur (g2)
## -0.1099744
#and by calculating the percentage of standardised scores for the variable itself that are outside our acceptable range
#This will tell us how big a problem we have
# Calculate the percentage of standardised scores that are greater than 1.96
# the perc function, part of the FSA package, calculates the percentage within a range -
#you can look for greater than "gt", greater than or equal "geq", less than or equal "leq", or less than "lt"
# scale is a function that creates z scores, abs gets absolute value
zstudy<- abs(scale(studentData$studytime.m))
FSA::perc(as.numeric(zstudy), 1.96, "gt")
## [1] 7.068063
FSA::perc(as.numeric(zstudy), 3.29, "gt")
## [1] 0
##### Assumption tests #####
###### Checking normality for variable total score in maths (mG3) ######
#We will allocate the histogram to a variable to allow us to manipulate it
gg <- ggplot(studentData, aes(x=mG3))
#Change the label of the x axis
gg <- gg + labs(x="going out with friends")
#manage binwidth and colours
gg <- gg + geom_histogram(binwidth=2, colour="black", aes(y=..density.., fill=..count..))
gg <- gg + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
#adding a normal curve
#use stat_function to compute a normalised score for each value of mG3
#pass the mean and standard deviation
#use the na.rm parameter to say how missing values are handled
gg <- gg + stat_function(fun=dnorm, color="red",args=list(mean=mean(studentData$mG3, na.rm=TRUE), sd=sd(studentData$mG3, na.rm=TRUE)))
#to display the graph request the contents of the variable be shown
gg
### Generate Q-Q Plot
#Create a qqplot
qqnorm(studentData$mG3)
qqline(studentData$mG3, col=2) #show a line on the plot
### Generate Summary Statistics
#stat.desc is a function from pastecs - make sure you include the basic=F switch to ensure you don't get scientific notation
pastecs::stat.desc(studentData$mG3, basic=F)
## median mean SE.mean CI.mean.0.95 var std.dev
## 11.0000000 11.0104712 0.1840184 0.3618189 12.9355856 3.5966075
## coef.var
## 0.3266534
#We can make our decision based on the value of the standardised score for skew and kurtosis
#We divide the skew statistic by the standard error to get the standardised score
#This will indicate if we have a problem
tpskew<-semTools::skew(studentData$mG3)
tpkurt<-semTools::kurtosis(studentData$mG3)
tpskew[1]/tpskew[2]
## skew (g1)
## 0.5602929
tpkurt[1]/tpkurt[2]
## Excess Kur (g2)
## -1.494787
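#Mirroring the outlier checks used for the other variables, calculate the
#percentage of standardised mG3 scores outside the acceptable range - a sketch
zmG3 <- abs(scale(studentData$mG3))
FSA::perc(as.numeric(zmG3), 1.96, "gt")
FSA::perc(as.numeric(zmG3), 3.29, "gt")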
#Simple scatterplot of studytime and total grades in math
#aes(x,y)
scatter <- ggplot(studentData, aes(studytime.m, mG3))
#Add a regression line
scatter + geom_point() + geom_smooth(method = "lm", colour = "Red", se = F) + labs(x = "study time", y = "total grades in maths")
## `geom_smooth()` using formula 'y ~ x'
######## Conducting Correlation Test - study time and total grades in maths ########
#As both variables can be considered approximately normal use Pearson
#Pearson Correlation
stats::cor.test(studentData$studytime.m, studentData$mG3, method='pearson')
##
## Pearson's product-moment correlation
##
## data: studentData$studytime.m and studentData$mG3
## t = 2.7321, df = 380, p-value = 0.006588
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.03900033 0.23584831
## sample estimates:
## cor
## 0.138795
#Weak positive correlation which is statistically significant (r = 0.14, n = 382, p = 0.007)
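#To make the values easier to report we can store the test result and extract r and p - a sketch
corRes <- stats::cor.test(studentData$studytime.m, studentData$mG3, method='pearson')
round(corRes$estimate, 2) #r
round(corRes$p.value, 3)  #p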
needed_packages <- c("pastecs", "ggplot2", "psych", "semTools", "FSA", "car", "coin", "rstatix")
# Extract not installed packages
not_installed <- needed_packages[!(needed_packages %in% installed.packages()[ , "Package"])]
# Install not installed packages
if(length(not_installed)) install.packages(not_installed)
library(FSA) #For percentage
## ## FSA v0.8.30. See citation('FSA') if used in publication.
## ## Run fishR() for related website and fishR('IFAR') for related book.
##
## Attaching package: 'FSA'
## The following object is masked from 'package:car':
##
## bootCase
library(car) # For Levene's test for homogeneity of variance
library(effectsize) #To calculate effect size for t-test
## Warning: package 'effectsize' was built under R version 4.0.3
### the normality test of mG3 has already been done in the correlation test ############
### Independent t-test
#Get descriptive statistics by group - output as a matrix
psych::describeBy(studentData$mG3, studentData$internet, mat=TRUE)
## item group1 vars n mean sd median trimmed mad min max range
## X11 1 no 1 58 10.12069 3.323640 10 9.96875 2.9652 4.5 18 13.5
## X12 2 yes 1 324 11.16975 3.625103 11 11.17500 4.4478 2.0 20 18.0
## skew kurtosis se
## X11 0.29768997 -0.4107896 0.4364153
## X12 0.01919541 -0.3850400 0.2013946
#Conduct Levene's test for homogeneity of variance from library car -
##the null hypothesis is that variances in groups are equal, so to assume homogeneity we would expect the probability to not be statistically significant.
car::leveneTest(mG3 ~ internet, data=studentData)
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 1.3807 0.2407
## 380
#Pr(>F) is your probability - in this case it is not statistically significant so we can assume homogeneity
#Conduct the t-test from package stats
#In this case we can use the var.equal = TRUE option to specify equal variances and a pooled variance estimate
stats::t.test(mG3~ internet,var.equal=TRUE,data=studentData)
##
## Two Sample t-test
##
## data: mG3 by internet
## t = -2.0544, df = 380, p-value = 0.04062
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.05308690 -0.04503996
## sample estimates:
## mean in group no mean in group yes
## 10.12069 11.16975
res <- stats::t.test(mG3~ internet,var.equal=TRUE,data=studentData)
#Calculate Cohen's d
#arithmetically (d = 2t/sqrt(df))
effcd=round((2*res$statistic)/sqrt(res$parameter),2)
#Using function from effectsize package
effectsize::t_to_d(t = res$statistic, res$parameter)
## d | 95% CI
## ----------------------
## -0.21 | [-0.41, -0.01]
#Eta squared calculation
effes=round((res$statistic*res$statistic)/((res$statistic*res$statistic)+(res$parameter)),3)
effes
## t
## 0.011
### ANOVA
needed_packages <- c("pastecs", "ggplot2", "psych", "semTools", "FSA", "sjstats", "userfriendlyscience")
# Extract not installed packages
not_installed <- needed_packages[!(needed_packages %in% installed.packages()[ , "Package"])]
# Install not installed packages
if(length(not_installed)) install.packages(not_installed)
library(pastecs) #For creating descriptive statistic summaries
library(ggplot2) #For creating histograms with more detail than plot
library(psych) # Some useful descriptive functions
## Warning: package 'psych' was built under R version 4.0.3
##
## Attaching package: 'psych'
## The following object is masked from 'package:effectsize':
##
## phi
## The following object is masked from 'package:FSA':
##
## headtail
## The following object is masked from 'package:semTools':
##
## skew
## The following object is masked from 'package:lavaan':
##
## cor2cov
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## The following object is masked from 'package:car':
##
## logit
library(FSA) #For percentage
library(sjstats) #To calculate effect size for t-test
## Warning: package 'sjstats' was built under R version 4.0.3
##
## Attaching package: 'sjstats'
## The following object is masked from 'package:psych':
##
## phi
## The following objects are masked from 'package:effectsize':
##
## cohens_f, phi
## The following object is masked from 'package:FSA':
##
## se
library(userfriendlyscience)
## Warning: package 'userfriendlyscience' was built under R version 4.0.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## Registered S3 methods overwritten by 'ufs':
## method from
## grid.draw.ggProportionPlot userfriendlyscience
## pander.associationMatrix userfriendlyscience
## pander.dataShape userfriendlyscience
## pander.descr userfriendlyscience
## pander.normalityAssessment userfriendlyscience
## print.CramersV userfriendlyscience
## print.associationMatrix userfriendlyscience
## print.confIntOmegaSq userfriendlyscience
## print.confIntV userfriendlyscience
## print.dataShape userfriendlyscience
## print.descr userfriendlyscience
## print.ggProportionPlot userfriendlyscience
## print.meanConfInt userfriendlyscience
## print.multiVarFreq userfriendlyscience
## print.normalityAssessment userfriendlyscience
## print.regrInfluential userfriendlyscience
## print.scaleDiagnosis userfriendlyscience
## print.scaleStructure userfriendlyscience
## print.scatterMatrix userfriendlyscience
##
## Attaching package: 'userfriendlyscience'
## The following objects are masked from 'package:FSA':
##
## is.even, is.odd
## The following object is masked from 'package:semTools':
##
## reliability
## The following object is masked from 'package:RcmdrMisc':
##
## reliability
### the normality test of mG3 has already been done in the correlation test ############
##Describe total marks in maths by health group (groups 1 to 5)
#Get descriptive statistics by group - output as a matrix
psych::describeBy(studentData$mG3, studentData$health.m, mat=TRUE)
## item group1 vars n mean sd median trimmed mad min max range
## X11 1 1 1 46 12.20652 3.733595 13 12.26316 4.4478 4.0 19 15.0
## X12 2 2 1 43 11.44186 3.682664 10 11.24286 4.4478 5.5 20 14.5
## X13 3 3 1 83 10.54217 3.220745 10 10.51493 2.9652 3.0 19 16.0
## X14 4 4 1 64 10.90625 3.683807 10 10.87500 2.9652 2.5 19 16.5
## X15 5 5 1 146 10.81849 3.646387 11 10.83475 4.4478 2.0 18 16.0
## skew kurtosis se
## X11 -0.14553620 -0.7704110 0.5504886
## X12 0.44898024 -0.6116668 0.5616010
## X13 0.11973115 0.1961291 0.3535227
## X14 0.12064824 -0.3611952 0.4604759
## X15 -0.08242553 -0.6599138 0.3017771
#Conduct Bartlett's test for homogeneity of variance from the stats package -
##the null hypothesis is that variances in groups are equal, so to assume homogeneity we would expect the probability to not be statistically significant.
stats::bartlett.test(mG3~ health.m, data=studentData)
##
## Bartlett test of homogeneity of variances
##
## data: mG3 by health.m
## Bartlett's K-squared = 2.1332, df = 4, p-value = 0.7113
#p value is > 0.05 so the result is not statistically significant, meaning we can assume homogeneity
#Conduct ANOVA using the oneway function from userfriendlyscience
#In this case we can use Tukey as the post-hoc test option since variances in the groups are equal
#If variances were not equal we would use Games-Howell
userfriendlyscience::oneway(as.factor(studentData$health.m),y=studentData$mG3,posthoc='Tukey')
## ### Oneway Anova for y=mG3 and x=health.m (groups: 1, 2, 3, 4, 5)
##
## Omega squared: 95% CI = [NA; .05], point estimate = .01
## Eta Squared: 95% CI = [0; .04], point estimate = .02
##
## SS Df MS F p
## Between groups (error + effect) 98.09 4 24.52 1.91 .107
## Within groups (error only) 4830.37 377 12.81
##
##
## ### Post hoc test: Tukey
##
## diff lwr upr p adj
## 2-1 -0.76 -2.85 1.32 .852
## 3-1 -1.66 -3.47 0.14 .086
## 4-1 -1.3 -3.2 0.6 .330
## 5-1 -1.39 -3.05 0.27 .149
## 3-2 -0.9 -2.74 0.94 .668
## 4-2 -0.54 -2.47 1.4 .942
## 5-2 -0.62 -2.33 1.08 .854
## 4-3 0.36 -1.27 2 .973
## 5-3 0.28 -1.07 1.63 .980
## 5-4 -0.09 -1.56 1.38 1.000
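#If the homogeneity of variance assumption had failed we would request Games-Howell
#as the post-hoc test instead of Tukey - a sketch, assuming this package version
#accepts the 'games-howell' option for the posthoc argument
userfriendlyscience::oneway(as.factor(studentData$health.m),y=studentData$mG3,posthoc='games-howell')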
res1<-userfriendlyscience::oneway(as.factor(studentData$health.m),y=studentData$mG3,posthoc='Tukey')
#use the aov function from stats - same result as oneway but makes it easier to access values for reporting
res2<-stats::aov(mG3~ health.m, data = studentData)
#Get the F statistic into a variable to make reporting easier
fstat<-summary(res2)[[1]][["F value"]][[1]]
#Get the p value into a variable to make reporting easier
aovpvalue<-summary(res2)[[1]][["Pr(>F)"]][[1]]
#Calculate effect
aoveta<-sjstats::eta_sq(res2)[2]
aoveta
## etasq
## 1 0.01
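#With the statistics stored in variables a reporting line can be assembled - a minimal sketch
dfs <- summary(res2)[[1]][["Df"]]
sprintf("F(%.0f, %.0f) = %.2f, p = %.3f, eta-squared = %.2f",
        dfs[1], dfs[2], fstat, aovpvalue, as.numeric(unlist(aoveta)))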