##### Loading the data #####
studentData <- read.csv('Student-performance.csv', fileEncoding="UTF-8-BOM") #UTF-8-BOM prevents the first column being read in as 'ï..school'
##### Missing data handling (mean imputation) #####
#Where the final maths grade (mG3) is recorded as 0, impute it with the mean of the two period grades (mG1, mG2)
studentData$mG3 <- ifelse(studentData$mG3 == 0, (studentData$mG1 + studentData$mG2)/2, studentData$mG3)
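#A quick sanity check on the imputation (a minimal sketch) - count any remaining
#zero final grades (only rows where both period grades were also zero would remain)
sum(studentData$mG3 == 0)
summary(studentData$mG3)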
### Summary statistics of variables
library(Rcmdr)
## Loading required package: splines
## Loading required package: RcmdrMisc
## Loading required package: car
## Loading required package: carData
## Loading required package: sandwich
## Loading required package: effects
## Registered S3 methods overwritten by 'lme4':
## method from
## cooks.distance.influence.merMod car
## influence.merMod car
## dfbeta.influence.merMod car
## dfbetas.influence.merMod car
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
## The Commander GUI is launched only in interactive sessions
##
## Attaching package: 'Rcmdr'
## The following object is masked from 'package:base':
##
## errorCondition
library(abind, pos=18)
library(e1071, pos=19)
numSummary(studentData[,"failures.m", drop=FALSE],
groups=studentData$ï..school, statistics=c("mean", "sd", "IQR",
"quantiles"), quantiles=c(0,.25,.5,.75,1))
## mean sd IQR 0% 25% 50% 75% 100% failures.m:n
## GP 0.2894737 0.7431843 0.00 0 0 0 0.00 3 342
## MS 0.3000000 0.6076436 0.25 0 0 0 0.25 3 40
with(studentData, Hist(failures.m, groups=school, scale="frequency",
breaks="Sturges", col="darkgray"))
numSummary(studentData[,"mG3", drop=FALSE], groups=studentData$ï..school,
statistics=c("mean", "sd", "IQR", "quantiles"), quantiles=c(0,.25,.5,.75,1))
## mean sd IQR 0% 25% 50% 75% 100% mG3:n
## GP 11.10526 3.613558 5.00 2 9 11 14.00 20 342
## MS 10.20000 3.383216 4.25 5 8 10 12.25 19 40
with(studentData, Hist(mG3, groups=school, scale="frequency",
breaks="Sturges", col="darkgray"))
local({
.Table <- with(studentData, table(paid.m))
cat("\ncounts:\n")
print(.Table)
cat("\npercentages:\n")
print(round(100*.Table/sum(.Table), 2))
})
##
## counts:
## paid.m
## no yes
## 205 177
##
## percentages:
## paid.m
## no yes
## 53.66 46.34
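#The same counts and percentages without the Rcmdr local() idiom - a minimal base-R sketch
paidTable <- table(studentData$paid.m)
round(100*prop.table(paidTable), 2)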
needed_packages <- c("pastecs", "ggplot2", "semTools", "FSA")
# Extract not installed packages
not_installed <- needed_packages[!(needed_packages %in% installed.packages()[ , "Package"])]
# Install not installed packages
if(length(not_installed)) install.packages(not_installed)
library(pastecs) #For creating descriptive statistic summaries
library(ggplot2) #For creating histograms with more detail than plot
## Warning: package 'ggplot2' was built under R version 4.0.3
library(semTools) #For skewness and kurtosis
## Loading required package: lavaan
## This is lavaan 0.6-7
## lavaan is BETA software! Please report any bugs.
##
## ###############################################################################
## This is semTools 0.5-3
## All users of R (or SEM) are invited to submit functions or ideas for functions.
## ###############################################################################
##
## Attaching package: 'semTools'
## The following object is masked from 'package:RcmdrMisc':
##
## reliability
## The following object is masked from 'package:e1071':
##
## kurtosis
### For variable freetime.m
gg <- ggplot(studentData, aes(x=freetime.m))
#Change the label of the x axis
gg <- gg + labs(x="freetime")
#manage binwidth and colours
gg <- gg + geom_histogram(binwidth=5, colour="black", aes(y=..density.., fill=..count..))
gg <- gg + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
#adding a normal curve
#use stat_function to compute a normalised score for each value of freetime.m
#pass the mean and standard deviation
#use the na.rm parameter to say how missing values are handled
gg <- gg + stat_function(fun=dnorm, color="red",args=list(mean=mean(studentData$freetime.m, na.rm=TRUE), sd=sd(studentData$freetime.m,na.rm=TRUE)))
#to display the graph request the contents of the variable be shown
gg
### Generate Q-Q Plot
#Create a qqplot
qqnorm(studentData$freetime.m)
qqline(studentData$freetime.m, col=2) #show a line on the plot
### Generate Summary Statistics
#stat.desc is a function from pastecs - make sure you include the basic=F switch to ensure you don't get scientific notation
pastecs::stat.desc(studentData$freetime.m, basic=F)
## median mean SE.mean CI.mean.0.95 var std.dev
## 3.0000000 3.2225131 0.0505624 0.0994163 0.9766047 0.9882331
## coef.var
## 0.3066654
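#stat.desc can also report normality statistics directly (skewness, kurtosis and
#a Shapiro-Wilk test) via its norm argument - a minimal sketch
pastecs::stat.desc(studentData$freetime.m, basic=F, norm=TRUE)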
#We can make our decision based on the value of the standardised score for skew and kurtosis
#We divide the skew statistic by the standard error to get the standardised score
#This will indicate if we have a problem
tpskew<-semTools::skew(studentData$freetime.m)
tpkurt<-semTools::kurtosis(studentData$freetime.m)
tpskew[1]/tpskew[2]
## skew (g1)
## -1.169023
tpkurt[1]/tpkurt[2]
## Excess Kur (g2)
## -1.052723
#and by calculating the percentage of standardised scores for the variable itself that are outside our acceptable range
#This will tell us how big a problem we have
# Calculate the percentage of standardised scores that are greater than 1.96
# the perc function, part of the FSA package, calculates the percentage within a range - you can look for greater than "gt", greater than or equal "geq", less than or equal "leq", or less than "lt"
# scale is a function that creates z scores, abs gets absolute value
zfreetime<- abs(scale(studentData$freetime.m))
FSA::perc(as.numeric(zfreetime), 1.96, "gt")
## [1] 4.712042
FSA::perc(as.numeric(zfreetime), 3.29, "gt")
## [1] 0
### For variable studytime.m
#We will allocate the histogram to a variable to allow us to manipulate it
gg <- ggplot(studentData, aes(x=studytime.m))
#Change the label of the x axis
gg <- gg + labs(x="study time")
#manage binwidth and colours
gg <- gg + geom_histogram(binwidth=2, colour="black", aes(y=..density.., fill=..count..))
gg <- gg + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
#adding a normal curve
#use stat_function to compute a normalised score for each value of studytime.m
#pass the mean and standard deviation
#use the na.rm parameter to say how missing values are handled
gg <- gg + stat_function(fun=dnorm, color="red",args=list(mean=mean(studentData$studytime.m, na.rm=TRUE), sd=sd(studentData$studytime.m, na.rm=TRUE)))
#to display the graph request the contents of the variable be shown
gg
### Generate Q-Q Plot
#Create a qqplot
qqnorm(studentData$studytime.m)
qqline(studentData$studytime.m, col=2) #show a line on the plot
### Generate Summary Statistics
#stat.desc is a function from pastecs - make sure you include the basic=F switch to ensure you don't get scientific notation
pastecs::stat.desc(studentData$studytime.m, basic=F)
## median mean SE.mean CI.mean.0.95 var std.dev
## 2.00000000 2.03403141 0.04327479 0.08508732 0.71537426 0.84579800
## coef.var
## 0.41582347
#We can make our decision based on the value of the standardised score for skew and kurtosis
#We divide the skew statistic by the standard error to get the standardised score
#This will indicate if we have a problem
tpskew<-semTools::skew(studentData$studytime.m)
tpkurt<-semTools::kurtosis(studentData$studytime.m)
tpskew[1]/tpskew[2]
## skew (g1)
## 5.120306
tpkurt[1]/tpkurt[2]
## Excess Kur (g2)
## -0.1099744
#and by calculating the percentage of standardised scores for the variable itself that are outside our acceptable range
#This will tell us how big a problem we have
# Calculate the percentage of standardised scores that are greater than 1.96
# the perc function, part of the FSA package, calculates the percentage within a range -
#you can look for greater than "gt", greater than or equal "geq", less than or equal "leq", or less than "lt"
# scale is a function that creates z scores, abs gets absolute value
zstudy<- abs(scale(studentData$studytime.m))
FSA::perc(as.numeric(zstudy), 1.96, "gt")
## [1] 7.068063
FSA::perc(as.numeric(zstudy), 3.29, "gt")
## [1] 0
##### Assumption tests #####
###### Checking normality for variable total score in maths (mG3) ######
#We will allocate the histogram to a variable to allow us to manipulate it
gg <- ggplot(studentData, aes(x=mG3))
#Change the label of the x axis
gg <- gg + labs(x="going out with friends")
#manage binwidth and colours
gg <- gg + geom_histogram(binwidth=2, colour="black", aes(y=..density.., fill=..count..))
gg <- gg + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
#adding a normal curve
#use stat_function to compute a normalised score for each value of mG3
#pass the mean and standard deviation
#use the na.rm parameter to say how missing values are handled
gg <- gg + stat_function(fun=dnorm, color="red",args=list(mean=mean(studentData$mG3, na.rm=TRUE), sd=sd(studentData$mG3, na.rm=TRUE)))
#to display the graph request the contents of the variable be shown
gg
### Generate Q-Q Plot
#Create a qqplot
qqnorm(studentData$mG3)
qqline(studentData$mG3, col=2) #show a line on the plot
### Generate Summary Statistics
#stat.desc is a function from pastecs - make sure you include the basic=F switch to ensure you don't get scientific notation
pastecs::stat.desc(studentData$mG3, basic=F)
## median mean SE.mean CI.mean.0.95 var std.dev
## 11.0000000 11.0104712 0.1840184 0.3618189 12.9355856 3.5966075
## coef.var
## 0.3266534
#We can make our decision based on the value of the standardised score for skew and kurtosis
#We divide the skew statistic by the standard error to get the standardised score
#This will indicate if we have a problem
tpskew<-semTools::skew(studentData$mG3)
tpkurt<-semTools::kurtosis(studentData$mG3)
tpskew[1]/tpskew[2]
## skew (g1)
## 0.5602929
tpkurt[1]/tpkurt[2]
## Excess Kur (g2)
## -1.494787
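#Mirroring the outlier checks used for the other variables, calculate the
#percentage of standardised mG3 scores outside the acceptable range - a sketch
zmG3 <- abs(scale(studentData$mG3))
FSA::perc(as.numeric(zmG3), 1.96, "gt")
FSA::perc(as.numeric(zmG3), 3.29, "gt")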
#Simple scatterplot of studytime and total grades in math
#aes(x,y)
scatter <- ggplot(studentData, aes(studytime.m, mG3))
#Add a regression line
scatter + geom_point() + geom_smooth(method = "lm", colour = "Red", se = F) + labs(x = "study time", y = "total grades in maths")
## `geom_smooth()` using formula 'y ~ x'
######## Conducting Correlation Test - study time and total grades in maths ########
#As both variables can be considered approximately normal use Pearson
#Pearson Correlation
stats::cor.test(studentData$studytime.m, studentData$mG3, method='pearson')
##
## Pearson's product-moment correlation
##
## data: studentData$studytime.m and studentData$mG3
## t = 2.7321, df = 380, p-value = 0.006588
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.03900033 0.23584831
## sample estimates:
## cor
## 0.138795
#Weak positive correlation which is statistically significant (r = 0.14, n = 382, p = 0.007)
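#To make the values easier to report we can store the test result and extract r and p - a sketch
corRes <- stats::cor.test(studentData$studytime.m, studentData$mG3, method='pearson')
round(corRes$estimate, 2) #r
round(corRes$p.value, 3)  #p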
needed_packages <- c("pastecs", "ggplot2", "psych", "semTools", "FSA", "car", "coin", "rstatix")
# Extract not installed packages
not_installed <- needed_packages[!(needed_packages %in% installed.packages()[ , "Package"])]
# Install not installed packages
if(length(not_installed)) install.packages(not_installed)
library(FSA) #For percentage
## ## FSA v0.8.30. See citation('FSA') if used in publication.
## ## Run fishR() for related website and fishR('IFAR') for related book.
##
## Attaching package: 'FSA'
## The following object is masked from 'package:car':
##
## bootCase
library(car) # For Levene's test for homogeneity of variance
library(effectsize) #To calculate effect size for t-test
## Warning: package 'effectsize' was built under R version 4.0.3
### the normality test of mG3 has already been done in the correlation test ############
### Independent t-test
#Get descriptive statistics by group - output as a matrix
psych::describeBy(studentData$mG3, studentData$internet, mat=TRUE)
## item group1 vars n mean sd median trimmed mad min max range
## X11 1 no 1 58 10.12069 3.323640 10 9.96875 2.9652 4.5 18 13.5
## X12 2 yes 1 324 11.16975 3.625103 11 11.17500 4.4478 2.0 20 18.0
## skew kurtosis se
## X11 0.29768997 -0.4107896 0.4364153
## X12 0.01919541 -0.3850400 0.2013946
#Conduct Levene's test for homogeneity of variance from library car -
##the null hypothesis is that variances in groups are equal, so to assume homogeneity we would expect the probability to not be statistically significant.
car::leveneTest(mG3 ~ internet, data=studentData)
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 1.3807 0.2407
## 380
#Pr(>F) is your probability - in this case it is not statistically significant so we can assume homogeneity
#Conduct the t-test from package stats
#In this case we can use the var.equal = TRUE option to specify equal variances and a pooled variance estimate
stats::t.test(mG3~ internet,var.equal=TRUE,data=studentData)
##
## Two Sample t-test
##
## data: mG3 by internet
## t = -2.0544, df = 380, p-value = 0.04062
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.05308690 -0.04503996
## sample estimates:
## mean in group no mean in group yes
## 10.12069 11.16975
res <- stats::t.test(mG3~ internet,var.equal=TRUE,data=studentData)
#Calculate Cohen's d
#arithmetically (d = 2t/sqrt(df))
effcd=round((2*res$statistic)/sqrt(res$parameter),2)
#Using function from effectsize package
effectsize::t_to_d(t = res$statistic, res$parameter)
## d | 95% CI
## ----------------------
## -0.21 | [-0.41, -0.01]
#Eta squared calculation
effes=round((res$statistic*res$statistic)/((res$statistic*res$statistic)+(res$parameter)),3)
effes
## t
## 0.011
### ANOVA
needed_packages <- c("pastecs", "ggplot2", "psych", "semTools", "FSA", "sjstats", "userfriendlyscience")
# Extract not installed packages
not_installed <- needed_packages[!(needed_packages %in% installed.packages()[ , "Package"])]
# Install not installed packages
if(length(not_installed)) install.packages(not_installed)
library(pastecs) #For creating descriptive statistic summaries
library(ggplot2) #For creating histograms with more detail than plot
library(psych) # Some useful descriptive functions
## Warning: package 'psych' was built under R version 4.0.3
##
## Attaching package: 'psych'
## The following object is masked from 'package:effectsize':
##
## phi
## The following object is masked from 'package:FSA':
##
## headtail
## The following object is masked from 'package:semTools':
##
## skew
## The following object is masked from 'package:lavaan':
##
## cor2cov
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## The following object is masked from 'package:car':
##
## logit
library(FSA) #For percentage
library(sjstats) #To calculate effect size for t-test
## Warning: package 'sjstats' was built under R version 4.0.3
##
## Attaching package: 'sjstats'
## The following object is masked from 'package:psych':
##
## phi
## The following objects are masked from 'package:effectsize':
##
## cohens_f, phi
## The following object is masked from 'package:FSA':
##
## se
library(userfriendlyscience)
## Warning: package 'userfriendlyscience' was built under R version 4.0.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## Registered S3 methods overwritten by 'ufs':
## method from
## grid.draw.ggProportionPlot userfriendlyscience
## pander.associationMatrix userfriendlyscience
## pander.dataShape userfriendlyscience
## pander.descr userfriendlyscience
## pander.normalityAssessment userfriendlyscience
## print.CramersV userfriendlyscience
## print.associationMatrix userfriendlyscience
## print.confIntOmegaSq userfriendlyscience
## print.confIntV userfriendlyscience
## print.dataShape userfriendlyscience
## print.descr userfriendlyscience
## print.ggProportionPlot userfriendlyscience
## print.meanConfInt userfriendlyscience
## print.multiVarFreq userfriendlyscience
## print.normalityAssessment userfriendlyscience
## print.regrInfluential userfriendlyscience
## print.scaleDiagnosis userfriendlyscience
## print.scaleStructure userfriendlyscience
## print.scatterMatrix userfriendlyscience
##
## Attaching package: 'userfriendlyscience'
## The following objects are masked from 'package:FSA':
##
## is.even, is.odd
## The following object is masked from 'package:semTools':
##
## reliability
## The following object is masked from 'package:RcmdrMisc':
##
## reliability
### the normality test of mG3 has already been done in the correlation test ############
##Describe total marks in maths by health group (groups 1 to 5)
#Get descriptive statistics by group - output as a matrix
psych::describeBy(studentData$mG3, studentData$health.m, mat=TRUE)
## item group1 vars n mean sd median trimmed mad min max range
## X11 1 1 1 46 12.20652 3.733595 13 12.26316 4.4478 4.0 19 15.0
## X12 2 2 1 43 11.44186 3.682664 10 11.24286 4.4478 5.5 20 14.5
## X13 3 3 1 83 10.54217 3.220745 10 10.51493 2.9652 3.0 19 16.0
## X14 4 4 1 64 10.90625 3.683807 10 10.87500 2.9652 2.5 19 16.5
## X15 5 5 1 146 10.81849 3.646387 11 10.83475 4.4478 2.0 18 16.0
## skew kurtosis se
## X11 -0.14553620 -0.7704110 0.5504886
## X12 0.44898024 -0.6116668 0.5616010
## X13 0.11973115 0.1961291 0.3535227
## X14 0.12064824 -0.3611952 0.4604759
## X15 -0.08242553 -0.6599138 0.3017771
#Conduct Bartlett's test for homogeneity of variance from the stats package -
##the null hypothesis is that variances in groups are equal, so to assume homogeneity we would expect the probability to not be statistically significant.
stats::bartlett.test(mG3~ health.m, data=studentData)
##
## Bartlett test of homogeneity of variances
##
## data: mG3 by health.m
## Bartlett's K-squared = 2.1332, df = 4, p-value = 0.7113
#p value is > 0.05 so the result is not statistically significant, meaning we can assume homogeneity
#Conduct ANOVA using the oneway function from userfriendlyscience
#In this case we can use Tukey as the post-hoc test option since variances in the groups are equal
#If variances were not equal we would use Games-Howell
userfriendlyscience::oneway(as.factor(studentData$health.m),y=studentData$mG3,posthoc='Tukey')
## ### Oneway Anova for y=mG3 and x=health.m (groups: 1, 2, 3, 4, 5)
##
## Omega squared: 95% CI = [NA; .05], point estimate = .01
## Eta Squared: 95% CI = [0; .04], point estimate = .02
##
## SS Df MS F p
## Between groups (error + effect) 98.09 4 24.52 1.91 .107
## Within groups (error only) 4830.37 377 12.81
##
##
## ### Post hoc test: Tukey
##
## diff lwr upr p adj
## 2-1 -0.76 -2.85 1.32 .852
## 3-1 -1.66 -3.47 0.14 .086
## 4-1 -1.3 -3.2 0.6 .330
## 5-1 -1.39 -3.05 0.27 .149
## 3-2 -0.9 -2.74 0.94 .668
## 4-2 -0.54 -2.47 1.4 .942
## 5-2 -0.62 -2.33 1.08 .854
## 4-3 0.36 -1.27 2 .973
## 5-3 0.28 -1.07 1.63 .980
## 5-4 -0.09 -1.56 1.38 1.000
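#If the homogeneity of variance assumption had failed we would request Games-Howell
#as the post-hoc test instead of Tukey - a sketch, assuming this package version
#accepts the 'games-howell' option for the posthoc argument
userfriendlyscience::oneway(as.factor(studentData$health.m),y=studentData$mG3,posthoc='games-howell')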
res1<-userfriendlyscience::oneway(as.factor(studentData$health.m),y=studentData$mG3,posthoc='Tukey')
#use the aov function from stats - same result as oneway but makes it easier to access values for reporting
res2<-stats::aov(mG3~ health.m, data = studentData)
#Get the F statistic into a variable to make reporting easier
fstat<-summary(res2)[[1]][["F value"]][[1]]
#Get the p value into a variable to make reporting easier
aovpvalue<-summary(res2)[[1]][["Pr(>F)"]][[1]]
#Calculate effect
aoveta<-sjstats::eta_sq(res2)[2]
aoveta
## etasq
## 1 0.01
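#With the statistics stored in variables a reporting line can be assembled - a minimal sketch
dfs <- summary(res2)[[1]][["Df"]]
sprintf("F(%.0f, %.0f) = %.2f, p = %.3f, eta-squared = %.2f",
        dfs[1], dfs[2], fstat, aovpvalue, as.numeric(unlist(aoveta)))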