Load necessary libraries

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(corrplot)

## corrplot 0.95 loaded

library(broom)

Load the dataset

# Read the student performance records from the CSV in the working directory.
df <- read.csv(file = "StudentsPerformance.csv")

Display the first few rows of the dataset

# Preview the first six records.
head(df, n = 6L)

##   gender race.ethnicity parental.level.of.education        lunch
## 1 female        group B           bachelor's degree     standard
## 2 female        group C                some college     standard
## 3 female        group B             master's degree     standard
## 4   male        group A          associate's degree free/reduced
## 5   male        group C                some college     standard
## 6 female        group B          associate's degree     standard
##   test.preparation.course math.score reading.score writing.score
## 1                    none         72            72            74
## 2               completed         69            90            88
## 3                    none         90            95            93
## 4                    none         47            57            44
## 5                    none         76            78            75
## 6                    none         71            83            78

Summary statistics

# Column-wise summary: quartiles and mean for the numeric score columns,
# length/class/mode for the character columns.
summary_stats <- df %>% summary()
print(summary_stats)

##     gender          race.ethnicity     parental.level.of.education
##  Length:1000        Length:1000        Length:1000                
##  Class :character   Class :character   Class :character           
##  Mode  :character   Mode  :character   Mode  :character           
##                                                                   
##                                                                   
##                                                                   
##     lunch           test.preparation.course   math.score     reading.score   
##  Length:1000        Length:1000             Min.   :  0.00   Min.   : 17.00  
##  Class :character   Class :character        1st Qu.: 57.00   1st Qu.: 59.00  
##  Mode  :character   Mode  :character        Median : 66.00   Median : 70.00  
##                                             Mean   : 66.09   Mean   : 69.17  
##                                             3rd Qu.: 77.00   3rd Qu.: 79.00  
##                                             Max.   :100.00   Max.   :100.00  
##  writing.score   
##  Min.   : 10.00  
##  1st Qu.: 57.75  
##  Median : 69.00  
##  Mean   : 68.05  
##  3rd Qu.: 79.00  
##  Max.   :100.00

Visualizations of score distributions

# Histogram of math scores in 5-point bins.
ggplot(data = df, mapping = aes(x = math.score)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "blue") +
  ggtitle("Distribution of Math Scores") +
  xlab("Math Score") +
  ylab("Frequency")

# Histogram of reading scores in 5-point bins.
ggplot(data = df, mapping = aes(x = reading.score)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "green") +
  ggtitle("Distribution of Reading Scores") +
  xlab("Reading Score") +
  ylab("Frequency")

# Histogram of writing scores in 5-point bins.
ggplot(data = df, mapping = aes(x = writing.score)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "red") +
  ggtitle("Distribution of Writing Scores") +
  xlab("Writing Score") +
  ylab("Frequency")

# Pairwise correlations between the three exam scores.
score_data <- select(df, math.score, reading.score, writing.score)
cor_matrix <- cor(score_data)
print(cor_matrix)

##               math.score reading.score writing.score
## math.score     1.0000000     0.8175797     0.8026420
## reading.score  0.8175797     1.0000000     0.9545981
## writing.score  0.8026420     0.9545981     1.0000000

# Colour-coded plot of the upper triangle of the correlation matrix.
corrplot(
  corr = cor_matrix,
  method = "color",
  type = "upper",
  tl.col = "BLACK"
)

Analysis of relationships between continuous variables

# Math vs. reading scores with a fitted linear-model line.
ggplot(data = df, mapping = aes(x = math.score, y = reading.score)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "blue") +
  ggtitle("Math Score vs Reading Score") +
  xlab("Math Score") +
  ylab("Reading Score")

## `geom_smooth()` using formula = 'y ~ x'

# Math vs. writing scores with a fitted linear-model line.
ggplot(data = df, mapping = aes(x = math.score, y = writing.score)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "green") +
  ggtitle("Math Score vs Writing Score") +
  xlab("Math Score") +
  ylab("Writing Score")

## `geom_smooth()` using formula = 'y ~ x'

# Reading vs. writing scores with a fitted linear-model line.
ggplot(data = df, mapping = aes(x = reading.score, y = writing.score)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "red") +
  ggtitle("Reading Score vs Writing Score") +
  xlab("Reading Score") +
  ylab("Writing Score")

## `geom_smooth()` using formula = 'y ~ x'

# One-way ANOVA: math.score vs. gender

# Fit a one-way ANOVA with math.score as the response and gender as the
# single grouping factor.
anova_model <- aov(math.score ~ gender, data = df)
# Print the ANOVA table (Df, sums of squares, mean squares, F value, p-value).
summary(anova_model)

##              Df Sum Sq Mean Sq F value   Pr(>F)    
## gender        1   6481    6481   28.98 9.12e-08 ***
## Residuals   998 223208     224                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Math-score histograms drawn separately for each gender (5-point bins)

# Male students only
ggplot(data = filter(df, gender == "male"), mapping = aes(x = math.score)) +
  geom_histogram(binwidth = 5, fill = "blue", colour = "black", alpha = 0.5) +
  ggtitle("Math Scores for Males") +
  xlab("Math Score") +
  ylab("Frequency")

# Female students only
ggplot(data = filter(df, gender == "female"), mapping = aes(x = math.score)) +
  geom_histogram(binwidth = 5, fill = "red", colour = "black", alpha = 0.5) +
  ggtitle("Math Scores for Females") +
  xlab("Math Score") +
  ylab("Frequency")

# Side-by-side boxplots of math.score, one box per gender
ggplot(data = df, mapping = aes(x = gender, y = math.score)) +
  geom_boxplot() +
  xlab("Gender") +
  ylab("Math Score") +
  theme_classic()

The F-statistic is 28.98 and the p-value is 9.12e-08, which is extremely small (less than 0.001). This means that we can reject the null hypothesis that the means of math.score are equal across genders with a high degree of confidence. The sum of squares for the between-group variance (gender) is 6481, which is only about 2.8% of the total sum of squares (6481 + 223208 = 229689), so gender on its own explains a small share of the variation in math.score. The mean square value for the between-group variance (gender) is 6481, which is approximately 28.98 times larger than the mean square value for the within-group variance (residuals, about 224); this ratio is the F-statistic.

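Overall, these results suggest that there is a statistically significant difference in math.score between the two genders. Statistical significance does not by itself imply practical significance, however: with 1000 students even a modest difference yields a large F-statistic and a small p-value, and gender accounts for only about 2.8% of the total variation in math.score (6481 / 229689), which is a small effect.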

# Fit a multiple linear regression of math.score on the other two exam scores
# plus gender, race/ethnicity, parental education and lunch type.
# NOTE(review): reading.score and writing.score are strongly correlated with
# each other (r = 0.95 in the correlation matrix above), so their individual
# coefficients should be interpreted with care. test.preparation.course is
# not included as a predictor.
model <- lm(math.score ~ reading.score + writing.score + gender + race.ethnicity + parental.level.of.education + lunch, data = df)

# Print the coefficient table (estimates, standard errors, t-tests) together
# with the residual standard error, R-squared and the overall F-test.
summary(model)

## 
## Call:
## lm(formula = math.score ~ reading.score + writing.score + gender + 
##     race.ethnicity + parental.level.of.education + lunch, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.1166  -3.6936   0.1427   3.6446  15.5061 
## 
## Coefficients:
##                                              Estimate Std. Error t value
## (Intercept)                                  -6.82506    1.16393  -5.864
## reading.score                                 0.36275    0.04208   8.621
## writing.score                                 0.56215    0.04211  13.349
## gendermale                                   12.64716    0.37983  33.297
## race.ethnicitygroup B                         0.81381    0.71881   1.132
## race.ethnicitygroup C                         0.20357    0.67384   0.302
## race.ethnicitygroup D                         0.59638    0.69334   0.860
## race.ethnicitygroup E                         4.98731    0.76530   6.517
## parental.level.of.educationbachelor's degree -0.84054    0.63883  -1.316
## parental.level.of.educationhigh school        0.46709    0.55555   0.841
## parental.level.of.educationmaster's degree   -1.49424    0.82253  -1.817
## parental.level.of.educationsome college       0.46403    0.52755   0.880
## parental.level.of.educationsome high school   0.01119    0.56740   0.020
## lunchstandard                                 3.69848    0.38432   9.624
##                                              Pr(>|t|)    
## (Intercept)                                  6.17e-09 ***
## reading.score                                 < 2e-16 ***
## writing.score                                 < 2e-16 ***
## gendermale                                    < 2e-16 ***
## race.ethnicitygroup B                          0.2578    
## race.ethnicitygroup C                          0.7626    
## race.ethnicitygroup D                          0.3899    
## race.ethnicitygroup E                        1.14e-10 ***
## parental.level.of.educationbachelor's degree   0.1886    
## parental.level.of.educationhigh school         0.4007    
## parental.level.of.educationmaster's degree     0.0696 .  
## parental.level.of.educationsome college        0.3793    
## parental.level.of.educationsome high school    0.9843    
## lunchstandard                                 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.567 on 986 degrees of freedom
## Multiple R-squared:  0.8669, Adjusted R-squared:  0.8652 
## F-statistic: 494.2 on 13 and 986 DF,  p-value: < 2.2e-16

# Coefficient table as a tibble, one row per model term
model %>% tidy()

## # A tibble: 14 × 5
##    term                                   estimate std.error statistic   p.value
##    <chr>                                     <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)                             -6.83      1.16     -5.86   6.17e-  9
##  2 reading.score                            0.363     0.0421    8.62   2.60e- 17
##  3 writing.score                            0.562     0.0421   13.3    1.74e- 37
##  4 gendermale                              12.6       0.380    33.3    1.62e-163
##  5 race.ethnicitygroup B                    0.814     0.719     1.13   2.58e-  1
##  6 race.ethnicitygroup C                    0.204     0.674     0.302  7.63e-  1
##  7 race.ethnicitygroup D                    0.596     0.693     0.860  3.90e-  1
##  8 race.ethnicitygroup E                    4.99      0.765     6.52   1.14e- 10
##  9 parental.level.of.educationbachelor's…  -0.841     0.639    -1.32   1.89e-  1
## 10 parental.level.of.educationhigh school   0.467     0.556     0.841  4.01e-  1
## 11 parental.level.of.educationmaster's d…  -1.49      0.823    -1.82   6.96e-  2
## 12 parental.level.of.educationsome colle…   0.464     0.528     0.880  3.79e-  1
## 13 parental.level.of.educationsome high …   0.0112    0.567     0.0197 9.84e-  1
## 14 lunchstandard                            3.70      0.384     9.62   5.15e- 21

# Plot residuals against fitted values to check linearity and constant
# variance. broom::augment() returns the rows used in the fit together with
# their .fitted and .resid columns, so the plotted vectors always line up with
# the plot data. Mapping fitted(model) / residuals(model) onto df instead
# breaks as soon as lm() drops rows with missing values.
ggplot(data = augment(model), aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red") +
  labs(
    title = "Residuals vs Fitted Values",
    x = "Fitted Values",
    y = "Residuals"
  )

# Boxplots of math.score by parental education level with the individual
# students overlaid as jittered points.
ggplot(df, aes(x = parental.level.of.education, y = math.score)) +
  # The jitter layer already draws every observation, so suppress the
  # boxplot's own outlier markers to avoid plotting those students twice.
  geom_boxplot(outlier.shape = NA) +
  # Jitter horizontally only; height = 0 keeps each point at its true score.
  geom_jitter(width = 0.2, height = 0, alpha = 0.5) +
  labs(
    x = "Parental Level of Education",
    y = "Math Score",
    title = "Distribution of Math Scores by Parental Education Level"
  ) +
  theme_classic() +
  theme(
    # Rotate the long category labels so they do not overlap.
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Model Summary The model is a multiple linear regression model that predicts math.score based on several predictor variables: reading.score, writing.score, gender, race.ethnicity, parental.level.of.education, and lunch.

The model has a high R-squared value of 0.8669, indicating that it explains about 87% of the variation in math.score. The adjusted R-squared value is 0.8652, which is slightly lower because it penalizes the model for the number of estimated coefficients (13 besides the intercept).

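Coefficients: For the numeric predictors (reading.score and writing.score), each coefficient represents the change in math.score for a one-unit change in that predictor, while holding all other predictor variables constant. For the categorical predictors, each coefficient is the difference in math.score between that level and the reference level of the same variable, again holding the other predictors constant.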

reading.score: For every one-unit increase in reading.score, math.score increases by 0.36275 units. This is a statistically significant relationship (p-value < 2e-16). writing.score: For every one-unit increase in writing.score, math.score increases by 0.56215 units. This is a statistically significant relationship (p-value < 2e-16). gendermale: Being male is associated with a 12.64716-unit increase in math.score compared to being female, for students with the same reading and writing scores and the same values of the other predictors. This is a statistically significant relationship (p-value < 2e-16). race.ethnicity: The coefficients for race.ethnicity are relative to the reference category (group A). Only group E has a statistically significant coefficient (p-value = 1.14e-10), indicating that students in group E have a 4.98731-unit higher math.score compared to students in group A. parental.level.of.education: The coefficients for parental.level.of.education are relative to the reference category (associate's degree, the one level that does not appear in the coefficient table). None of the coefficients are statistically significant at the 0.05 level (master's degree comes closest, with p-value = 0.0696), indicating that, after adjusting for the other predictors, no education level differs significantly from associate's degree in math.score. lunch: Having a standard lunch is associated with a 3.69848-unit increase in math.score compared to having a free/reduced lunch. This is a statistically significant relationship (p-value < 2e-16).

Residuals The residuals are the differences between the observed math.score values and the predicted values based on the model.

The residual standard error is 5.567, meaning that the model's predictions typically miss the observed math.score by about 5.6 points on the 0-100 scale. Interpretation of the Residual Plot: The residuals appear randomly scattered around the red horizontal line at zero, which is a good sign. It suggests that the relationship between the predictors and the response is approximately linear, and that the homoscedasticity assumption (constant variance of residuals) is likely met. There is no obvious pattern, so the plot gives no indication that the model is missing a key non-linear relationship.

F-Statistic The F-statistic is 494.2, which is highly statistically significant (p-value < 2.2e-16).

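This indicates that the predictor variables are jointly significant, i.e. at least one coefficient differs from zero. The F-test alone does not show how well the model fits; that is better judged from the R-squared value and the residual standard error.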

Conclusion The model suggests that reading.score, writing.score, gender, and lunch are significant predictors of math.score.

The model also suggests that race.ethnicity has a significant impact on math.score, but only for group E. Parental level of education does not appear to have a significant impact on math.score. The model has a high R-squared value and a low residual standard error, indicating that it is a good fit to the data.

StudentPerformance

Bochuan Zhang LEAH STRUVE Teva Porat

2025-05-23