library(readxl)
library(tinytex)
# Load the student final-score dataset from the Excel workbook.
# NOTE(review): this absolute path is specific to one machine; a
# project-relative path would make the script portable.
data <- read_excel(
  "C:/Users/Lenovo/Downloads/student_finalscore_dataset.xlsx"
)

# Preview the first five rows.
head(data, n = 5)
## # A tibble: 5 × 12
##   Gender   Age HighSchoolType StudyHoursLast3Months SleepHoursPerNight
##   <chr>  <dbl> <chr>                          <dbl>              <dbl>
## 1 Female  20.6 Homeschool                     119.                9.15
## 2 Female  22.4 Homeschool                     117.                3.92
## 3 Other   NA   Private                        117.                6.70
## 4 Female  20.5 Online                         136.                6.84
## 5 Male    16.7 Online                          86.5               8.45
## # ℹ 7 more variables: ParentalEducationLevel <chr>, InternetAccessAtHome <chr>,
## #   PartTimeJob <chr>, AttendanceRate <dbl>, MentalHealthRating <dbl>,
## #   PhysicalActivityHoursPerWeek <dbl>, FinalScore <dbl>
# Per-column summary; for numeric columns this also reports the NA count.
summary(data)
##     Gender               Age        HighSchoolType     StudyHoursLast3Months
##  Length:300         Min.   :13.04   Length:300         Min.   : 22.64       
##  Class :character   1st Qu.:18.82   Class :character   1st Qu.: 98.36       
##  Mode  :character   Median :20.28   Mode  :character   Median :117.30       
##                     Mean   :20.22                      Mean   :117.30       
##                     3rd Qu.:21.61                      3rd Qu.:138.65       
##                     Max.   :26.35                      Max.   :227.99       
##                     NA's   :32                                              
##  SleepHoursPerNight ParentalEducationLevel InternetAccessAtHome
##  Min.   : 1.420     Length:300             Length:300          
##  1st Qu.: 5.805     Class :character       Class :character    
##  Median : 6.697     Mode  :character       Mode  :character    
##  Mean   : 6.697                                                
##  3rd Qu.: 7.623                                                
##  Max.   :11.744                                                
##                                                                
##  PartTimeJob        AttendanceRate   MentalHealthRating
##  Length:300         Min.   : 59.44   Min.   :-0.7088   
##  Class :character   1st Qu.: 79.81   1st Qu.: 2.3074   
##  Mode  :character   Median : 85.64   Median : 2.8687   
##                     Mean   : 85.64   Mean   : 2.8687   
##                     3rd Qu.: 90.72   3rd Qu.: 3.4749   
##                     Max.   :121.58   Max.   : 5.6924   
##                                                        
##  PhysicalActivityHoursPerWeek   FinalScore   
##  Min.   :-2.729               Min.   :50.00  
##  1st Qu.: 1.500               1st Qu.:59.44  
##  Median : 2.971               Median :64.69  
##  Mean   : 2.945               Mean   :64.51  
##  3rd Qu.: 4.539               3rd Qu.:69.35  
##  Max.   : 8.455               Max.   :88.72  
##  NA's   :34
# Count the missing values in each column.
colSums(is.na(data))
##                       Gender                          Age 
##                            0                           32 
##               HighSchoolType        StudyHoursLast3Months 
##                            0                            0 
##           SleepHoursPerNight       ParentalEducationLevel 
##                            0                            0 
##         InternetAccessAtHome                  PartTimeJob 
##                            0                            0 
##               AttendanceRate           MentalHealthRating 
##                            0                            0 
## PhysicalActivityHoursPerWeek                   FinalScore 
##                           34                            0
# Alternative kept for reference: drop incomplete rows instead of imputing.
# cleaned <- na.omit(data)

# Mean-impute the two numeric columns that contain missing values
# (Age and PhysicalActivityHoursPerWeek).
data$Age <- replace(
  data$Age,
  is.na(data$Age),
  mean(data$Age, na.rm = TRUE)
)
data$PhysicalActivityHoursPerWeek <- replace(
  data$PhysicalActivityHoursPerWeek,
  is.na(data$PhysicalActivityHoursPerWeek),
  mean(data$PhysicalActivityHoursPerWeek, na.rm = TRUE)
)

# Re-count missing values per column to confirm none remain.
colSums(is.na(data))
##                       Gender                          Age 
##                            0                            0 
##               HighSchoolType        StudyHoursLast3Months 
##                            0                            0 
##           SleepHoursPerNight       ParentalEducationLevel 
##                            0                            0 
##         InternetAccessAtHome                  PartTimeJob 
##                            0                            0 
##               AttendanceRate           MentalHealthRating 
##                            0                            0 
## PhysicalActivityHoursPerWeek                   FinalScore 
##                            0                            0
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Rename the Gender column to Sex, then preview the first rows.
data <- rename(data, Sex = Gender)
head(data)
## # A tibble: 6 × 12
##   Sex      Age HighSchoolType StudyHoursLast3Months SleepHoursPerNight
##   <chr>  <dbl> <chr>                          <dbl>              <dbl>
## 1 Female  20.6 Homeschool                     119.                9.15
## 2 Female  22.4 Homeschool                     117.                3.92
## 3 Other   20.2 Private                        117.                6.70
## 4 Female  20.5 Online                         136.                6.84
## 5 Male    16.7 Online                          86.5               8.45
## 6 Male    19.1 Private                         78.1               6.59
## # ℹ 7 more variables: ParentalEducationLevel <chr>, InternetAccessAtHome <chr>,
## #   PartTimeJob <chr>, AttendanceRate <dbl>, MentalHealthRating <dbl>,
## #   PhysicalActivityHoursPerWeek <dbl>, FinalScore <dbl>
library(ggplot2)
# Study hours against final score, one panel per InternetAccessAtHome value,
# with a red linear trend line in each panel.
ggplot(data, aes(x = StudyHoursLast3Months, y = FinalScore)) +
  geom_point(size = 0.5) +
  geom_smooth(method = lm, colour = "Red") +
  facet_wrap(~InternetAccessAtHome) +
  labs(title = "How this affect")
## `geom_smooth()` using formula = 'y ~ x'

#scatter plot with line

library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.2.0     ✔ tidyr     1.3.1
## ✔ readr     2.1.5     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Study hours against final score, one panel per InternetAccessAtHome value,
# with points coloured by parental education level.
#
# The colour mapping is attached to the points only. When it was set in
# ggplot(), ParentalEducationLevel became a grouping variable for every
# layer, so geom_smooth() fitted a separate line per education level and drew
# all of them in the same red, making them indistinguishable. Mapping colour
# on geom_point() alone yields a single red trend line per panel.
ggplot(data, aes(x = StudyHoursLast3Months, y = FinalScore)) +
  geom_point(aes(colour = ParentalEducationLevel), size = 0.5) +
  geom_smooth(method = lm, colour = "Red") +
  facet_wrap(~InternetAccessAtHome) +
  labs(title = "How this affect")
## `geom_smooth()` using formula = 'y ~ x'

  # Final score by home internet access, one panel per high-school type.
  # Points are overlaid on the box plots: size shows study hours and colour
  # shows parental education level.
  #
  # alpha is a fixed transparency, so it is set outside aes(). Inside aes()
  # the constant 0.5 is treated as data: it adds a meaningless "alpha" legend
  # and the drawn transparency is chosen by the alpha scale, not 0.5.
  ggplot(data, aes(x = InternetAccessAtHome, y = FinalScore)) +
    geom_boxplot(colour = "Blue") +
    geom_point(
      aes(size = StudyHoursLast3Months, colour = ParentalEducationLevel),
      alpha = 0.5
    ) +
    facet_wrap(~HighSchoolType) +
    labs(title = "prac", x = "Internet", y = "Score")

#boxplot

 # Box plots of final score by home internet access, one panel per
 # high-school type.
 ggplot(data, aes(x = InternetAccessAtHome, y = FinalScore)) +
   geom_boxplot(colour = "Blue") +
   facet_wrap(~HighSchoolType) +
   labs(title = "prac")

#linear model

# Multiple linear regression of FinalScore on five numeric predictors.
# The formula is written inline so the Call printed by summary() is unchanged.
lm_model <- lm(
  FinalScore ~ StudyHoursLast3Months + Age + SleepHoursPerNight +
    AttendanceRate + MentalHealthRating,
  data = data
)
summary(lm_model)
## 
## Call:
## lm(formula = FinalScore ~ StudyHoursLast3Months + Age + SleepHoursPerNight + 
##     AttendanceRate + MentalHealthRating, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4388 -0.1756 -0.0560  0.0781  6.4416 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           1.3369226  0.4518499   2.959  0.00334 ** 
## StudyHoursLast3Months 0.1722288  0.0009166 187.895  < 2e-16 ***
## Age                   0.0016391  0.0149644   0.110  0.91286    
## SleepHoursPerNight    0.9674467  0.0212886  45.444  < 2e-16 ***
## AttendanceRate        0.2963155  0.0033366  88.807  < 2e-16 ***
## MentalHealthRating    3.8625397  0.0323817 119.282  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5352 on 294 degrees of freedom
## Multiple R-squared:  0.995,  Adjusted R-squared:  0.9949 
## F-statistic: 1.16e+04 on 5 and 294 DF,  p-value: < 2.2e-16

#bar plot

# Bar chart counting the rows in each HighSchoolType category.
ggplot(data, aes(x = HighSchoolType)) +
  geom_bar()

#Regression Model

# Same response and predictors as lm_model, with the terms listed in a
# different order; the fitted coefficients and fit statistics are identical.
lm_modell <- lm(
  FinalScore ~ Age + MentalHealthRating + AttendanceRate +
    SleepHoursPerNight + StudyHoursLast3Months,
  data = data
)
summary(lm_modell)
## 
## Call:
## lm(formula = FinalScore ~ Age + MentalHealthRating + AttendanceRate + 
##     SleepHoursPerNight + StudyHoursLast3Months, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4388 -0.1756 -0.0560  0.0781  6.4416 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           1.3369226  0.4518499   2.959  0.00334 ** 
## Age                   0.0016391  0.0149644   0.110  0.91286    
## MentalHealthRating    3.8625397  0.0323817 119.282  < 2e-16 ***
## AttendanceRate        0.2963155  0.0033366  88.807  < 2e-16 ***
## SleepHoursPerNight    0.9674467  0.0212886  45.444  < 2e-16 ***
## StudyHoursLast3Months 0.1722288  0.0009166 187.895  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5352 on 294 degrees of freedom
## Multiple R-squared:  0.995,  Adjusted R-squared:  0.9949 
## F-statistic: 1.16e+04 on 5 and 294 DF,  p-value: < 2.2e-16

Regression Analysis Interpretation

1. Model Fit

Multiple R-squared = 0.995

Adjusted R-squared = 0.9949

This means that about 99.5% of the variation in the dependent variable (FinalScore) is explained by the predictors:

Study hours

Age

Sleep hours

Attendance rate

Mental health rating

This is an exceptionally strong model, indicating the variables collectively predict performance extremely well.

F-statistic = 1.16e+04 on 5 and 294 degrees of freedom, p < 2.2e-16. This confirms that the overall model is highly significant.

2. Coefficient Interpretation

(Intercept) = 1.3369, p = 0.00334

This is the predicted score when all predictors = 0. Not usually meaningful but shows the baseline is statistically significant.

StudyHoursLast3Months = 0.1722, p < 2e-16 (significant)

For each additional hour of study in the last 3 months, performance increases by 0.1722 units on average, holding the other predictors constant.

Strong, positive, highly significant predictor.

Age = 0.0016, p = 0.9129 (NOT significant)

Age has no meaningful effect on performance in this dataset.

You can say: “Age is not a useful predictor of performance when other variables are accounted for.”

SleepHoursPerNight = 0.9674, p < 2e-16 (significant)

Each additional hour of sleep per night increases performance by about 0.97 units.

Very strong and meaningful positive effect.

AttendanceRate = 0.2963, p < 2e-16 (significant)

For every 1% increase in attendance, performance increases by 0.296 units.

Extremely strong predictor.

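MentalHealthRating = 3.8625, p < 2e-16 (significant)

A one-unit improvement in mental health score increases performance by 3.86 units.

This is the largest per-unit coefficient in the model. Because the predictors are measured on different scales, coefficient size alone does not make it the strongest predictor.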

3. Residuals

Min: -0.4388
1Q: -0.1756
Median: -0.0560
3Q: 0.0781
Max: 6.4416

Most residuals are small and centered around zero, indicating:

The model fits well

Errors are fairly balanced

The maximum residual (6.44) is about 12 times the residual standard error, so it indicates at least one clear outlier that should be inspected.

Residual standard error: 0.5352. On average, predictions deviate from actual values by ~0.54 units, which is very low.

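4. Removed Observations

An earlier run of this model reported:

因为不存在,32个观察量被删除了

Meaning: "32 observations deleted due to missingness."

This is normal when some rows have NA values; the count matches the 32 missing Age values. The message does not appear in the model output above: the missing values were replaced by the column mean before fitting, so all 300 rows are used (294 residual degrees of freedom = 300 rows − 6 coefficients).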

5. Summary of Key Insights (easy to include in your report)

The model is extremely strong, explaining 99.5% of the variation.

Study hours, sleep hours, attendance, and mental health are all highly significant positive predictors.

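Mental health rating has the largest per-unit coefficient, followed by sleep, attendance, then study hours; because these predictors are measured in different units, this ordering does not rank their overall importance.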

Age is not significant, meaning age does not influence performance once other factors are considered.

Residuals are small, indicating good model accuracy.

# Keep only the rows whose Sex is "Male", then preview them.
nuya <- filter(data, Sex == "Male")
head(nuya)
## # A tibble: 6 × 12
##   Sex     Age HighSchoolType StudyHoursLast3Months SleepHoursPerNight
##   <chr> <dbl> <chr>                          <dbl>              <dbl>
## 1 Male   16.7 Online                          86.5               8.45
## 2 Male   19.1 Private                         78.1               6.59
## 3 Male   23.1 Public                         116.                9.42
## 4 Male   24.8 Private                        105.                7.62
## 5 Male   19.3 Online                         117.                6.70
## 6 Male   19.1 Private                         68.7               5.25
## # ℹ 7 more variables: ParentalEducationLevel <chr>, InternetAccessAtHome <chr>,
## #   PartTimeJob <chr>, AttendanceRate <dbl>, MentalHealthRating <dbl>,
## #   PhysicalActivityHoursPerWeek <dbl>, FinalScore <dbl>

# Filter certain rows

# Rows with Age above 19 and AttendanceRate below 80 (both conditions must
# hold), printed for inspection.
# NOTE(review): Age was mean-imputed earlier in the script, so rows with an
# originally missing Age are presumably included via the imputed value.
cakes <- filter(data, Age > 19, AttendanceRate < 80)
cakes
## # A tibble: 58 × 12
##    Sex      Age HighSchoolType StudyHoursLast3Months SleepHoursPerNight
##    <chr>  <dbl> <chr>                          <dbl>              <dbl>
##  1 Female  20.5 Online                         136.                6.84
##  2 Male    19.1 Private                         78.1               6.59
##  3 Other   20.2 Private                        109.                6.00
##  4 Male    20.2 Private                        145.                6.70
##  5 Other   20.2 Public                          51.3               4.95
##  6 Female  19.8 Homeschool                     133.                7.45
##  7 Male    19.9 Private                         81.8               6.42
##  8 Other   21.6 Public                         114.                6.36
##  9 Other   23.7 Private                         81.9               6.14
## 10 Female  20.9 Homeschool                     122.                9.89
## # ℹ 48 more rows
## # ℹ 7 more variables: ParentalEducationLevel <chr>, InternetAccessAtHome <chr>,
## #   PartTimeJob <chr>, AttendanceRate <dbl>, MentalHealthRating <dbl>,
## #   PhysicalActivityHoursPerWeek <dbl>, FinalScore <dbl>

# Select columns

# Keep three columns of the male-only subset and print the result.
pick <- select(nuya, Sex, HighSchoolType, FinalScore)
pick
## # A tibble: 100 × 3
##    Sex   HighSchoolType FinalScore
##    <chr> <chr>               <dbl>
##  1 Male  Online               56.6
##  2 Male  Private              55.6
##  3 Male  Public               78.9
##  4 Male  Private              64.1
##  5 Male  Online               72.8
##  6 Male  Private              53.1
##  7 Male  Private              70.6
##  8 Male  Online               61.2
##  9 Male  Public               71.2
## 10 Male  Private              71.4
## # ℹ 90 more rows

#drop

# Copy of the data with the Sex and Age columns removed, printed for
# inspection.
dropp <- select(data, -c(Sex, Age))
dropp
## # A tibble: 300 × 10
##    HighSchoolType StudyHoursLast3Months SleepHoursPerNight
##    <chr>                          <dbl>              <dbl>
##  1 Homeschool                     119.                9.15
##  2 Homeschool                     117.                3.92
##  3 Private                        117.                6.70
##  4 Online                         136.                6.84
##  5 Online                          86.5               8.45
##  6 Private                         78.1               6.59
##  7 Private                        117.                6.17
##  8 Private                        109.                6.00
##  9 Public                         106.                8.55
## 10 Private                        104.                9.95
## # ℹ 290 more rows
## # ℹ 7 more variables: ParentalEducationLevel <chr>, InternetAccessAtHome <chr>,
## #   PartTimeJob <chr>, AttendanceRate <dbl>, MentalHealthRating <dbl>,
## #   PhysicalActivityHoursPerWeek <dbl>, FinalScore <dbl>