1. DATA IMPORT AND DESCRIPTION

# Importing a dataset from Kaggle.com, of Student Performance Prediction:

# Read the comma-separated file; the first row holds the column names and "."
# is the decimal mark.
mydata <- read.table(
  file = "~/Library/Mobile Documents/com~apple~CloudDocs/Documents/MA_HW1.csv",
  sep = ",",
  dec = ".",
  header = TRUE
)

# Preview the first ten rows of the imported data.
head(mydata, n = 10)
##    ID Gender Study.Hours Sleep.Hours Attendance Grades.Italian Grades.Spanish
## 1   1      1         3.4         8.2         60             79             89
## 2   2      1         3.2         5.9         76             77             78
## 3   3      2         3.2         9.3         41             32             33
## 4   4      1         3.2         8.2         47             34             45
## 5   5      2         3.8        10.0         75             76             81
## 6   6      2         3.4         9.0         47             50             55
## 7   7      2         7.9         8.1         63             80             74
## 8   8      1         1.4         8.0         47             55             56
## 9   9      2         5.4         8.8         67             73             68
## 10 10      1         1.4         9.6         42             59             56
##    Unit.Pass
## 1          1
## 2          1
## 3          2
## 4          2
## 5          1
## 6          2
## 7          1
## 8          2
## 9          1
## 10         2

EXPLANATION OF THE DATA:

DESCRIPTION OF THE VARIABLES:

DATA SOURCE: The dataset is retrieved from Kaggle. This is a synthetic representation of student performance, designed to mimic real-world scenarios by considering key factors such as study habits, sleep patterns, and class attendance. Each row represents a hypothetical student, and the dataset includes both input features and the calculated target variable (course grades and passing the unit).

# Structure of the data frame: number of observations and variables, and each
# variable's type (num = numeric, int = integer, etc.):

str(object = mydata)
## 'data.frame':    165 obs. of  8 variables:
##  $ ID            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender        : int  1 1 2 1 2 2 2 1 2 1 ...
##  $ Study.Hours   : num  3.4 3.2 3.2 3.2 3.8 3.4 7.9 1.4 5.4 1.4 ...
##  $ Sleep.Hours   : num  8.2 5.9 9.3 8.2 10 9 8.1 8 8.8 9.6 ...
##  $ Attendance    : num  60 76 41 47 75 47 63 47 67 42 ...
##  $ Grades.Italian: num  79 77 32 34 76 50 80 55 73 59 ...
##  $ Grades.Spanish: num  89 78 33 45 81 55 74 56 68 56 ...
##  $ Unit.Pass     : int  1 1 2 2 1 2 1 2 1 2 ...

2. DATA MANIPULATION

# Renaming some column titles in order to make the data frame neater.
# Columns are matched by their current name instead of by position, so a
# reordered or extended CSV cannot end up with mislabeled columns.

rename_map <- c(
  Study.Hours = "Study",
  Sleep.Hours = "Sleep",
  Grades.Italian = "Italian",
  Grades.Spanish = "Spanish",
  Unit.Pass = "Unit"
)
is_old_name <- colnames(mydata) %in% names(rename_map)
old_names <- colnames(mydata)[is_old_name]
colnames(mydata)[is_old_name] <- unname(rename_map[old_names])
# Stop early if any expected column is still missing after the rename.
stopifnot(all(rename_map %in% colnames(mydata)))

# Check the renamed columns on the first five rows.
head(mydata, n = 5)
##   ID Gender Study Sleep Attendance Italian Spanish Unit
## 1  1      1   3.4   8.2         60      79      89    1
## 2  2      1   3.2   5.9         76      77      78    1
## 3  3      2   3.2   9.3         41      32      33    2
## 4  4      1   3.2   8.2         47      34      45    2
## 5  5      2   3.8  10.0         75      76      81    1
# Recode Gender from the codes 1 / 2 to the labels "Male" / "Female":

mydata[["Gender"]] <- factor(
  x = mydata[["Gender"]],
  levels = c("1", "2"),
  labels = c("Male", "Female")
)

# Recode Unit from the codes 1 / 2 to the labels "Pass" / "Fail":

mydata[["Unit"]] <- factor(
  x = mydata[["Unit"]],
  levels = c("1", "2"),
  labels = c("Pass", "Fail")
)

# Check the recoded factors on the first five rows.
head(mydata, n = 5)
##   ID Gender Study Sleep Attendance Italian Spanish Unit
## 1  1   Male   3.4   8.2         60      79      89 Pass
## 2  2   Male   3.2   5.9         76      77      78 Pass
## 3  3 Female   3.2   9.3         41      32      33 Fail
## 4  4   Male   3.2   8.2         47      34      45 Fail
## 5  5 Female   3.8  10.0         75      76      81 Pass

DESCRIPTIVE STATISTICS: Presenting some descriptive statistics and explaining a few estimates of parameters.

# ID (column 1) is left out because it doesn't make sense to analyse it.
summary(mydata[, -1])
##     Gender       Study          Sleep          Attendance        Italian      
##  Male  :87   Min.   :1.10   Min.   : 5.100   Min.   : 40.00   Min.   : 32.00  
##  Female:78   1st Qu.:3.20   1st Qu.: 7.100   1st Qu.: 53.00   1st Qu.: 60.00  
##              Median :3.70   Median : 8.200   Median : 67.00   Median : 76.00  
##              Mean   :4.48   Mean   : 7.978   Mean   : 68.04   Mean   : 73.47  
##              3rd Qu.:5.40   3rd Qu.: 9.000   3rd Qu.: 80.00   3rd Qu.: 89.00  
##              Max.   :9.80   Max.   :10.000   Max.   :100.00   Max.   :100.00  
##     Spanish         Unit    
##  Min.   : 33.00   Pass:125  
##  1st Qu.: 68.00   Fail: 40  
##  Median : 78.00             
##  Mean   : 75.04             
##  3rd Qu.: 88.00             
##  Max.   :100.00
# Two individual estimates: mean attendance and median Spanish grade.
mean(mydata[["Attendance"]])
## [1] 68.03636
median(mydata[["Spanish"]])
## [1] 78
#install.packages("pastecs")
library(pastecs)
# Drop ID (1), Gender (2) and Unit (8) so only numeric columns are described;
# results are rounded to one decimal place.
round(stat.desc(mydata[, -c(1, 2, 8)]), digits = 1)
##              Study  Sleep Attendance Italian Spanish
## nbr.val      165.0  165.0      165.0   165.0   165.0
## nbr.null       0.0    0.0        0.0     0.0     0.0
## nbr.na         0.0    0.0        0.0     0.0     0.0
## min            1.1    5.1       40.0    32.0    33.0
## max            9.8   10.0      100.0   100.0   100.0
## range          8.7    4.9       60.0    68.0    67.0
## sum          739.2 1316.4    11226.0 12123.0 12381.0
## median         3.7    8.2       67.0    76.0    78.0
## mean           4.5    8.0       68.0    73.5    75.0
## SE.mean        0.2    0.1        1.3     1.4     1.4
## CI.mean.0.95   0.3    0.2        2.6     2.8     2.7
## var            4.9    1.8      292.7   337.6   305.4
## std.dev        2.2    1.3       17.1    18.4    17.5
## coef.var       0.5    0.2        0.3     0.3     0.2

3.1. HYPOTHESIS TESTING

One unit of observation (a student) is measured twice — for Italian exam and for Spanish exam. Therefore, the variables are dependent or paired, and we need to analyse two dependent samples. The hypothesis test methods are Paired T-Test for parametric test, and either the Sign test or the Wilcoxon Signed Rank Test for non-parametric test.

RESEARCH QUESTION: Is the average grade obtained for the Italian exam different from the average grade obtained for the Spanish exam, in the observed sample of 165 students?

3.1.1. PARAMETRIC TEST - Paired T-Test:

HYPOTHESIS:

  • H0: Mean value of grades for Italian course is equal to mean value of grades for Spanish course.

  • H0: 𝜇Italian = 𝜇Spanish, or: 𝜇Difference = 0

  • H1: Mean value of grades for Italian course is not equal to mean value of grades for Spanish course.

  • H1: 𝜇Italian ≠ 𝜇Spanish, or: 𝜇Difference ≠ 0

CONDITIONS for Paired T-Test:

  1. Analyzed variable is numeric.
  2. The differences between variables are normal.
  3. Variables are comparable (same scale).

COMPARING DATA WITH STATISTICS:

# Preview only the two grade columns (6 = Italian, 7 = Spanish) used in the test:

head(mydata[, c(6, 7)], n = 5)
##   Italian Spanish
## 1      79      89
## 2      77      78
## 3      32      33
## 4      34      45
## 5      76      81
# Descriptive statistics of the two grade columns, to compare their means:

library(pastecs)
round(stat.desc(mydata[, c(6, 7)]), digits = 1)
##              Italian Spanish
## nbr.val        165.0   165.0
## nbr.null         0.0     0.0
## nbr.na           0.0     0.0
## min             32.0    33.0
## max            100.0   100.0
## range           68.0    67.0
## sum          12123.0 12381.0
## median          76.0    78.0
## mean            73.5    75.0
## SE.mean          1.4     1.4
## CI.mean.0.95     2.8     2.7
## var            337.6   305.4
## std.dev         18.4    17.5
## coef.var         0.3     0.2
  • Mean grade of Italian (73.5) and mean grade of Spanish (75.0) are very close numbers.
# Per-student grade difference (Italian minus Spanish), stored as a new column.

mydata[["Difference"]] <- with(mydata, Italian - Spanish)
# Histogram of the paired differences, with bins 3 grade points wide:

#install.packages("ggplot2")
library(ggplot2)
ggplot(data = mydata, mapping = aes(x = Difference)) +
  geom_histogram(binwidth = 3, colour = "skyblue4", fill = "slategray2") +
  labs(x = "Difference", y = "Frequency")

  • The histogram of differences looks normal.

SHAPIRO-WILK NORMALITY TEST:

  • Checking the condition of normality.

  • H0: Differences between grades of Italian and Spanish are normally distributed.

  • H1: Differences between grades of Italian and Spanish are not normally distributed.

# Shapiro-Wilk test of whether the paired differences are normally distributed:

shapiro.test(x = mydata$Difference)
## 
##  Shapiro-Wilk normality test
## 
## data:  mydata$Difference
## W = 0.98702, p-value = 0.1301
# Q-Q plot of the differences; the points are expected to fall along the line.

#install.packages("ggpubr")
library(ggpubr)
ggqqplot(data = mydata$Difference, color = "steelblue")

  • P-value (0.13) is higher than α (0.05). We cannot reject H0. We assume that the differences between the grades are normally distributed (normality assumption is met), therefore we can perform a parametric test - Paired T-Test.

PAIRED T-TEST:

# Two-sided paired t-test of whether the mean Italian and Spanish grades are equal.

t.test(
  x = mydata$Italian,
  y = mydata$Spanish,
  alternative = "two.sided",
  paired = TRUE
)
## 
##  Paired t-test
## 
## data:  mydata$Italian and mydata$Spanish
## t = -2.3268, df = 164, p-value = 0.0212
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -2.8905626 -0.2367101
## sample estimates:
## mean difference 
##       -1.563636
  • H0: 𝜇Italian = 𝜇Spanish, or: 𝜇Difference = 0

  • H1: 𝜇Italian ≠ 𝜇Spanish, or: 𝜇Difference ≠ 0

  • We reject H0 at p<0.05.

# Checking the effect size, or how large the difference between tested variables is:

#install.packages("effectsize")
library(effectsize)
# Keep the estimate so the interpretation below uses the computed value rather
# than a rounded number retyped from the printed table.
paired_d <- effectsize::cohens_d(mydata$Difference)
paired_d
## Cohen's d |         95% CI
## --------------------------
## -0.18     | [-0.33, -0.03]
# Interpreting the magnitude of the effect size.

interpret_cohens_d(abs(paired_d$Cohens_d), rules = "sawilowsky2009")
## [1] "very small"
## (Rules: sawilowsky2009)
  • The sample is large, so even a small difference is significant.

  • CONCLUSION: Based on the sample data, we reject the null hypothesis about the equality of average grades for Italian and Spanish exams (p<0.05), and conclude that the average grades are different. The effect size is very small (Cohen’s d = -0.18, interpreted by Sawilowsky’s rules). Average grade for Italian exam is smaller than average grade for Spanish exam.

3.1.2. NON-PARAMETRIC TEST - Wilcoxon Signed Rank Test:

The variables are interval, and we use Wilcoxon Signed Rank Test.

HYPOTHESIS:

  • H0: Location distribution of grades for Italian course is equal to location distribution of grades for Spanish course.

  • H1: Location distribution of grades for Italian course is not equal to location distribution of grades for Spanish course.

# Two-sided Wilcoxon signed rank test on the paired grades, using the normal
# approximation (exact = FALSE) without continuity correction (correct = FALSE):

wilcox.test(
  x = mydata$Italian,
  y = mydata$Spanish,
  alternative = "two.sided",
  paired = TRUE,
  exact = FALSE,
  correct = FALSE
)
## 
##  Wilcoxon signed rank test
## 
## data:  mydata$Italian and mydata$Spanish
## V = 4921.5, p-value = 0.03335
## alternative hypothesis: true location shift is not equal to 0
# Store the effect size once so the printed table and the interpretation use
# the same, unrounded estimate.
wilcoxon_rbs <- effectsize(wilcox.test(mydata$Italian, mydata$Spanish,
                                       paired = TRUE,
                                       correct = FALSE,
                                       exact = FALSE,
                                       alternative = "two.sided"))
wilcoxon_rbs
## r (rank biserial) |         95% CI
## ----------------------------------
## -0.20             | [-0.36, -0.02]
# The printed -0.20 is rounded, and 0.20 is exactly the small/medium cut-off of
# the funder2019 rules, so the label must come from the unrounded magnitude.
# NOTE(review): re-render the output below; the label may change.
interpret_rank_biserial(abs(wilcoxon_rbs$r_rank_biserial))
## [1] "medium"
## (Rules: funder2019)
  • Reject H0 at p<0.05.

  • Location distributions for grades for Italian and Spanish exams are not equal. We found a medium effect size (rank-biserial r = -0.20, interpreted by Funder’s rules).

3.1.3. CONCLUSION (the most appropriate test):

The assumptions for parametric test are met, and parametric tests hold more statistical power, so we prefer the results from the Paired T-Test: we reject the null hypothesis about the equality of average grades for Italian and Spanish exams (p<0.05), and conclude that the average grades differ, with a very small effect size of -0.18 (Cohen’s d), meaning that the average grade for Italian exam is smaller than average grade for Spanish exam.

3.2. CORRELATION ANALYSIS

For numeric variables, we use parametric test Pearson correlation, which checks the degree of relationship (interdependence) of variables.

RESEARCH QUESTION: Is there a relationship between the percentage of Attendance and grades for Italian exam?

HYPOTHESIS:

# Showing a scatter plot matrix with a correlation between Attendance and grades for Italian exam.

#install.packages("car")
library(car)
## Loading required package: carData
# Columns 5 and 6 are Attendance and Italian; no smoother is drawn.
scatterplotMatrix(x = mydata[, c(5, 6)], smooth = FALSE)

# Showing scatter plots and corresponding correlation coefficients.

#install.packages("GGally")
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Pairs plot of Attendance (column 5) and Italian (column 6).
ggpairs(data = mydata[, c(5, 6)])

# Showing a correlation matrix:

#install.packages("Hmisc")
library(Hmisc)
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
# Pearson correlation matrix (with p-values) for Attendance and Italian.
rcorr(x = as.matrix(mydata[, c(5, 6)]), type = "pearson")
##            Attendance Italian
## Attendance       1.00    0.88
## Italian          0.88    1.00
## 
## n= 165 
## 
## 
## P
##            Attendance Italian
## Attendance             0     
## Italian     0

3.3. ASSOCIATION OF CATEGORICAL VARIABLES

Association between two categorical variables is checked with the analysis of frequencies performed with a parametric test Pearson Chi-Square Test or non-parametric test Fisher’s Exact Probability Test.

RESEARCH QUESTION: Does the chance of passing the unit vary depending on the gender of a student?

HYPOTHESIS:

ASSUMPTIONS:

# Chi-square test of association between Unit (pass/fail) and Gender.
# correct = TRUE applies Yates' continuity correction to the 2x2 table.

chi_square <- chisq.test(
  x = mydata$Unit,
  y = mydata$Gender,
  correct = TRUE
)

chi_square
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mydata$Unit and mydata$Gender
## X-squared = 0.76838, df = 1, p-value = 0.3807
# Expected (theoretical) frequencies, rounded to two decimals, with totals:

addmargins(round(chi_square$expected, digits = 2))
##            mydata$Gender
## mydata$Unit  Male Female Sum
##        Pass 65.91  59.09 125
##        Fail 21.09  18.91  40
##        Sum  87.00  78.00 165
# Observed (empirical) frequencies with row and column totals:

addmargins(A = chi_square$observed)
##            mydata$Gender
## mydata$Unit Male Female Sum
##        Pass   63     62 125
##        Fail   24     16  40
##        Sum    87     78 165
# Pearson residuals, (observed - expected) / sqrt(expected), to two decimals.

round(chi_square$residuals, digits = 2)
##            mydata$Gender
## mydata$Unit  Male Female
##        Pass -0.36   0.38
##        Fail  0.63  -0.67
# Row proportions: the gender structure within each Unit outcome (rows sum to 1).

addmargins(
  round(prop.table(chi_square$observed, margin = 1), digits = 3),
  margin = 2
)
##            mydata$Gender
## mydata$Unit  Male Female   Sum
##        Pass 0.504  0.496 1.000
##        Fail 0.600  0.400 1.000
# Column proportions: the pass/fail structure within each gender (columns sum to 1).
addmargins(
  round(prop.table(chi_square$observed, margin = 2), digits = 3),
  margin = 1
)
##            mydata$Gender
## mydata$Unit  Male Female
##        Pass 0.724  0.795
##        Fail 0.276  0.205
##        Sum  1.000  1.000
# Checking the effect size by Cramer's V statistics:

library(effectsize)
# Cramer's V for the association between Unit and Gender.
effectsize::cramers_v(x = mydata$Unit, y = mydata$Gender)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.03              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
# 0.03 is the adjusted Cramer's V printed above, entered by hand.
interpret_cramers_v(0.03)
## [1] "tiny"
## (Rules: funder2019)
# Effect size as an odds ratio (defined only for a 2x2 table):

effectsize::oddsratio(x = mydata$Unit, y = mydata$Gender)
## Odds ratio |       95% CI
## -------------------------
## 0.68       | [0.33, 1.40]
# 1.47 is the reciprocal of the odds ratio printed above (1 / 0.68), entered by
# hand so that the interpreted value is greater than 1.
interpret_oddsratio(1.47)
## [1] "small"
## (Rules: cohen1988)