Analysis for student performance

Factors affecting student performance

Amrin Shaikh (S3654729)

Last updated: 23 August, 2017

Introduction

Problem Statement

Data

# Load the semicolon-delimited student data set and report its row count.
Student_d1 <- read_delim(
  file = "D:/Intro to Statistics/Assignment 4/student-mat.csv",
  delim = ";",
  escape_double = FALSE,
  trim_ws = TRUE
)
print(nrow(Student_d1))
## [1] 395

Summary statistics

## # A tibble: 2 × 10
##     sex   Min    Q1 Median    Q3   Max      Mean       SD     n Missing
##   <chr> <int> <dbl>  <dbl> <dbl> <int>     <dbl>    <dbl> <int>   <int>
## 1     F     0     8     10    13    19  9.966346 4.622338   208       0
## 2     M     0     9     11    14    20 10.914439 4.495297   187       0
## # A tibble: 2 × 10
##     sex   Min    Q1 Median    Q3   Max     Mean       SD     n Missing
##   <chr> <int> <dbl>  <dbl> <dbl> <int>    <dbl>    <dbl> <int>   <int>
## 1     F     4     8     10    13    19 10.62019 3.232530   208       0
## 2     M     3     9     11    14    19 11.22995 3.392839   187       0
# Box plot of the G3 score grouped by sex

boxplot(G3 ~ sex, data = Student_d1, ylab = "Grade3 Score", xlab = "Sex")

Data Structuring and processing

Data factorizing and Ordering factors

# Bin the G3 score into four result categories in a single pass.
# Conditions are checked in order and the first match wins, so the bands are:
#   G3 < 5        -> "Very Poor"
#   5 <= G3 <= 10 -> "Poor"
#   10 < G3 <= 15 -> "Good"
#   15 < G3 <= 20 -> "Very Good"
# Scores outside these bands (or missing) stay NA. Building the whole column
# at once avoids subset-assigning into a column that does not exist yet, which
# makes a tibble warn about an uninitialised column.
Student_d1$Result <- dplyr::case_when(
  Student_d1$G3 < 5 ~ "Very Poor",
  Student_d1$G3 <= 10 ~ "Poor",
  Student_d1$G3 <= 15 ~ "Good",
  Student_d1$G3 <= 20 ~ "Very Good"
)
# Cross-tabulate result category by sex (character levels sort alphabetically)
table(Student_d1$Result, Student_d1$sex)
##            
##              F  M
##   Good      87 82
##   Poor      81 66
##   Very Good 16 24
##   Very Poor 24 15
# Put the Result levels in a fixed order, "Very Good" first and "Very Poor"
# last, then re-tabulate so the rows follow that order.
Student_d1$Result <- fct_relevel(
  Student_d1$Result,
  "Very Good", "Good", "Poor", "Very Poor"
)
table(Student_d1$Result, Student_d1$sex)
##            
##              F  M
##   Very Good 16 24
##   Good      87 82
##   Poor      81 66
##   Very Poor 24 15

Descriptive Statistics and Proportion table

# Contingency table of Result by sex with row/column totals, rendered as HTML.
tb <- addmargins(table(Student_d1$Result, Student_d1$sex))
names(dimnames(tb)) <- c("Result", "sex")
kable(tb, format = "html", padding = 0)
F M Sum
Very Good 16 24 40
Good 87 82 169
Poor 81 66 147
Very Poor 24 15 39
Sum 208 187 395
# Row proportions (share of each sex within every Result category),
# rounded to two decimals and rendered as HTML.
tb2 <- round(
  prop.table(table(Student_d1$Result, Student_d1$sex), margin = 1),
  2
)
names(dimnames(tb2)) <- c("Result", "sex")
kable(tb2, format = "html", padding = 0)
F M
Very Good 0.40 0.60
Good 0.51 0.49
Poor 0.55 0.45
Very Poor 0.62 0.38

Visualisation of the distribution

# Grouped bar chart of the proportions in tb2: one group of bars per sex,
# one bar per Result category.
# args.legend is passed as a list so that horiz stays a logical TRUE; building
# it with c() would coerce every element to character (horiz = "TRUE").
barplot(
  tb2,
  beside = TRUE,
  ylim = c(0, 1),
  xlab = "Sex",
  ylab = "Proportion Within Result",
  col = c("Red", "Yellow", "orange", "Green"),
  legend.text = rownames(tb2),
  args.legend = list(x = "top", horiz = TRUE, title = "Result category")
)

Hypothesis Testing (Chi-square test of Association)

Testing assumptions and decision rules

Assumptions:

Decision rules:

Hypothesis Testing Results

# Chi-square test of association between Result and sex
chi2 <- chisq.test(
  table(Student_d1$Result, Student_d1$sex)
)
print(chi2)
## 
##  Pearson's Chi-squared test
## 
## data:  table(Student_d1$Result, Student_d1$sex)
## X-squared = 4.251, df = 3, p-value = 0.2356
# P-value recomputed from the statistic and degrees of freedom stored in chi2.
# The former hard-coded q = 3.8623 did not match the reported X-squared = 4.251
# and therefore gave a p-value that disagreed with the test output.
# signif(., 4) mirrors the four significant digits shown by chisq.test.
signif(
  pchisq(
    q = unname(chi2$statistic),
    df = unname(chi2$parameter),
    lower.tail = FALSE
  ),
  4
)
## [1] 0.2356
# Critical value at the 5% significance level for the test's degrees of freedom
qchisq(p = .95, df = unname(chi2$parameter))
## [1] 7.814728

Observed values

# Observed counts with row and column totals
addmargins(chi2$observed)
##            
##               F   M Sum
##   Very Good  16  24  40
##   Good       87  82 169
##   Poor       81  66 147
##   Very Poor  24  15  39
##   Sum       208 187 395

Expected values

# Expected counts with row and column totals, rounded to two decimals
round(addmargins(chi2$expected), 2)
##            
##                  F      M Sum
##   Very Good  21.06  18.94  40
##   Good       88.99  80.01 169
##   Poor       77.41  69.59 147
##   Very Poor  20.54  18.46  39
##   Sum       208.00 187.00 395

Raw residuals

# Raw residuals (observed minus expected), rounded to two decimals.
# Parentheses are required: %>% binds tighter than `-`, so without them only
# the expected counts would be rounded before the subtraction.
(chi2$observed - chi2$expected) %>% round(2)
##            
##                 F     M
##   Very Good -5.06  5.06
##   Good      -1.99  1.99
##   Poor       3.59 -3.59
##   Very Poor  3.46 -3.46

Standardized residuals

# Standardized residuals stored in the chi-square test result
chi2[["stdres"]]
##            
##                      F          M
##   Very Good -1.6913438  1.6913438
##   Good      -0.4058105  0.4058105
##   Poor       0.7489347 -0.7489347
##   Very Poor  1.1699704 -1.1699704

Hypothesis Test Summary

Correlation between G1 and G3 score

# Scatter plot of the G3 score against the G1 score
plot(
  G3 ~ G1,
  data = Student_d1,
  ylab = "G3 Score",
  xlab = "G1 Score"
)

# Correlation between the G1 and G3 scores, using complete pairs only
with(Student_d1, cor(G1, G3, use = "complete.obs"))
## [1] 0.8014679
# Full correlation analysis

# Matrix holding the two variables to be correlated
bivariate <- Student_d1 %>%
  dplyr::select(G1, G3) %>%
  as.matrix()
rcorr(bivariate, type = "pearson")
##     G1  G3
## G1 1.0 0.8
## G3 0.8 1.0
## 
## n= 395 
## 
## 
## P
##    G1 G3
## G1     0
## G3  0
# 95% confidence interval for the G1-G3 correlation; the interval below does
# not contain 0.
# r uses the same complete-pairs rule as the earlier cor() call, and the sample
# size is counted from the data instead of being hard-coded as 395.
r <- cor(Student_d1$G1, Student_d1$G3, use = "complete.obs")
n_pairs <- sum(complete.cases(Student_d1$G1, Student_d1$G3))
CIr(r = r, n = n_pairs, level = .95)
## [1] 0.7631479 0.8341713

Correlation results

Discussion

References