library(readxl)
library(tinytex)
# Read the student final-score workbook into `data`.
# NOTE(review): the path is absolute and machine-specific, so this script
# only runs where the file exists at exactly this location.
data <- read_excel(
  "C:/Users/Lenovo/Downloads/student_finalscore_dataset.xlsx"
)
# Preview the first five rows.
head(data, 5)
## # A tibble: 5 × 12
## Gender Age HighSchoolType StudyHoursLast3Months SleepHoursPerNight
## <chr> <dbl> <chr> <dbl> <dbl>
## 1 Female 20.6 Homeschool 119. 9.15
## 2 Female 22.4 Homeschool 117. 3.92
## 3 Other NA Private 117. 6.70
## 4 Female 20.5 Online 136. 6.84
## 5 Male 16.7 Online 86.5 8.45
## # ℹ 7 more variables: ParentalEducationLevel <chr>, InternetAccessAtHome <chr>,
## # PartTimeJob <chr>, AttendanceRate <dbl>, MentalHealthRating <dbl>,
## # PhysicalActivityHoursPerWeek <dbl>, FinalScore <dbl>
# Per-column summary: quartiles/mean for numeric columns, length/class for
# character columns, plus NA counts where present.
summary(object = data)
## Gender Age HighSchoolType StudyHoursLast3Months
## Length:300 Min. :13.04 Length:300 Min. : 22.64
## Class :character 1st Qu.:18.82 Class :character 1st Qu.: 98.36
## Mode :character Median :20.28 Mode :character Median :117.30
## Mean :20.22 Mean :117.30
## 3rd Qu.:21.61 3rd Qu.:138.65
## Max. :26.35 Max. :227.99
## NA's :32
## SleepHoursPerNight ParentalEducationLevel InternetAccessAtHome
## Min. : 1.420 Length:300 Length:300
## 1st Qu.: 5.805 Class :character Class :character
## Median : 6.697 Mode :character Mode :character
## Mean : 6.697
## 3rd Qu.: 7.623
## Max. :11.744
##
## PartTimeJob AttendanceRate MentalHealthRating
## Length:300 Min. : 59.44 Min. :-0.7088
## Class :character 1st Qu.: 79.81 1st Qu.: 2.3074
## Mode :character Median : 85.64 Median : 2.8687
## Mean : 85.64 Mean : 2.8687
## 3rd Qu.: 90.72 3rd Qu.: 3.4749
## Max. :121.58 Max. : 5.6924
##
## PhysicalActivityHoursPerWeek FinalScore
## Min. :-2.729 Min. :50.00
## 1st Qu.: 1.500 1st Qu.:59.44
## Median : 2.971 Median :64.69
## Mean : 2.945 Mean :64.51
## 3rd Qu.: 4.539 3rd Qu.:69.35
## Max. : 8.455 Max. :88.72
## NA's :34
# Count the missing values in every column.
colSums(x = is.na(data))
## Gender Age
## 0 32
## HighSchoolType StudyHoursLast3Months
## 0 0
## SleepHoursPerNight ParentalEducationLevel
## 0 0
## InternetAccessAtHome PartTimeJob
## 0 0
## AttendanceRate MentalHealthRating
## 0 0
## PhysicalActivityHoursPerWeek FinalScore
## 34 0
# Alternative kept for reference: drop every row that contains an NA.
# cleaned <- na.omit(data)

# Mean-impute the two columns that contain missing values: each NA is
# replaced with the mean of that column's observed values.
data$Age <- replace(
  data$Age,
  is.na(data$Age),
  mean(data$Age, na.rm = TRUE)
)
data$PhysicalActivityHoursPerWeek <- replace(
  data$PhysicalActivityHoursPerWeek,
  is.na(data$PhysicalActivityHoursPerWeek),
  mean(data$PhysicalActivityHoursPerWeek, na.rm = TRUE)
)
# Re-count missing values to confirm that none remain.
colSums(x = is.na(data))
## Gender Age
## 0 0
## HighSchoolType StudyHoursLast3Months
## 0 0
## SleepHoursPerNight ParentalEducationLevel
## 0 0
## InternetAccessAtHome PartTimeJob
## 0 0
## AttendanceRate MentalHealthRating
## 0 0
## PhysicalActivityHoursPerWeek FinalScore
## 0 0
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Rename the Gender column to Sex, then show the first six rows.
data <- rename(data, Sex = Gender)
head(data, n = 6)
## # A tibble: 6 × 12
## Sex Age HighSchoolType StudyHoursLast3Months SleepHoursPerNight
## <chr> <dbl> <chr> <dbl> <dbl>
## 1 Female 20.6 Homeschool 119. 9.15
## 2 Female 22.4 Homeschool 117. 3.92
## 3 Other 20.2 Private 117. 6.70
## 4 Female 20.5 Online 136. 6.84
## 5 Male 16.7 Online 86.5 8.45
## 6 Male 19.1 Private 78.1 6.59
## # ℹ 7 more variables: ParentalEducationLevel <chr>, InternetAccessAtHome <chr>,
## # PartTimeJob <chr>, AttendanceRate <dbl>, MentalHealthRating <dbl>,
## # PhysicalActivityHoursPerWeek <dbl>, FinalScore <dbl>
library(ggplot2)
# Scatter plot of study hours against final score with a red linear fit,
# drawn in one panel per InternetAccessAtHome value.
ggplot(
  data = data,
  mapping = aes(x = StudyHoursLast3Months, y = FinalScore)
) +
  geom_point(size = 0.5) +
  geom_smooth(method = lm, colour = "Red") +
  facet_wrap(~InternetAccessAtHome) +
  labs(title = "How this affect")
## `geom_smooth()` using formula = 'y ~ x'
#scatter plot with line
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.2.0 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Same scatter plot as before, with points coloured by ParentalEducationLevel.
# NOTE(review): colour is mapped in the plot-level aes(), so geom_smooth()
# inherits the grouping and fits one line per education level (all drawn in
# red) - confirm that this is intended rather than a single overall line.
ggplot(
  data = data,
  mapping = aes(
    x = StudyHoursLast3Months,
    y = FinalScore,
    colour = ParentalEducationLevel
  )
) +
  geom_point(size = 0.5) +
  geom_smooth(method = lm, colour = "Red") +
  facet_wrap(~InternetAccessAtHome) +
  labs(title = "How this affect")
## `geom_smooth()` using formula = 'y ~ x'
# Box plot of final score by home internet access, one panel per high-school
# type, overlaid with the individual students: point size encodes study hours
# and point colour encodes parental education level.
ggplot(data, aes(InternetAccessAtHome, FinalScore)) +
  geom_boxplot(colour = "Blue") +
  geom_point(
    aes(size = StudyHoursLast3Months, colour = ParentalEducationLevel),
    # alpha is a fixed setting, not a data mapping, so it belongs outside
    # aes(); inside aes() it produced a meaningless "alpha" legend and the
    # points were not actually drawn at 50% opacity.
    alpha = 0.5
  ) +
  facet_wrap(~HighSchoolType) +
  labs(title = "prac", x = "Internet", y = "Score")
# Box plot of final score by home internet access, one panel per
# HighSchoolType.
ggplot(
  data = data,
  mapping = aes(x = InternetAccessAtHome, y = FinalScore)
) +
  geom_boxplot(colour = "Blue") +
  facet_wrap(~HighSchoolType) +
  labs(title = "prac")
# Linear model: regress FinalScore on study hours, age, sleep hours,
# attendance rate and mental-health rating, then print the fit summary.
lm_model <- lm(
  formula = FinalScore ~ StudyHoursLast3Months + Age + SleepHoursPerNight +
    AttendanceRate + MentalHealthRating,
  data = data
)
summary(object = lm_model)
##
## Call:
## lm(formula = FinalScore ~ StudyHoursLast3Months + Age + SleepHoursPerNight +
## AttendanceRate + MentalHealthRating, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4388 -0.1756 -0.0560 0.0781 6.4416
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3369226 0.4518499 2.959 0.00334 **
## StudyHoursLast3Months 0.1722288 0.0009166 187.895 < 2e-16 ***
## Age 0.0016391 0.0149644 0.110 0.91286
## SleepHoursPerNight 0.9674467 0.0212886 45.444 < 2e-16 ***
## AttendanceRate 0.2963155 0.0033366 88.807 < 2e-16 ***
## MentalHealthRating 3.8625397 0.0323817 119.282 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5352 on 294 degrees of freedom
## Multiple R-squared: 0.995, Adjusted R-squared: 0.9949
## F-statistic: 1.16e+04 on 5 and 294 DF, p-value: < 2.2e-16
# Bar plot: number of rows for each HighSchoolType.
ggplot(data = data, mapping = aes(x = HighSchoolType)) +
  geom_bar()
# Regression model with the same response and the same five predictors as
# lm_model, listed in a different order; the fit is stored separately as
# lm_modell and summarised.
lm_modell <- lm(
  formula = FinalScore ~ Age + MentalHealthRating + AttendanceRate +
    SleepHoursPerNight + StudyHoursLast3Months,
  data = data
)
summary(object = lm_modell)
##
## Call:
## lm(formula = FinalScore ~ Age + MentalHealthRating + AttendanceRate +
## SleepHoursPerNight + StudyHoursLast3Months, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4388 -0.1756 -0.0560 0.0781 6.4416
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3369226 0.4518499 2.959 0.00334 **
## Age 0.0016391 0.0149644 0.110 0.91286
## MentalHealthRating 3.8625397 0.0323817 119.282 < 2e-16 ***
## AttendanceRate 0.2963155 0.0033366 88.807 < 2e-16 ***
## SleepHoursPerNight 0.9674467 0.0212886 45.444 < 2e-16 ***
## StudyHoursLast3Months 0.1722288 0.0009166 187.895 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5352 on 294 degrees of freedom
## Multiple R-squared: 0.995, Adjusted R-squared: 0.9949
## F-statistic: 1.16e+04 on 5 and 294 DF, p-value: < 2.2e-16
Regression Analysis Interpretation

1. Model Fit
Multiple R-squared = 0.995
Adjusted R-squared = 0.9949
This means that about 99.5% of the variation in the dependent variable (FinalScore) is explained by the predictors:
Study hours
Age
Sleep hours
Attendance rate
Mental health rating
This is an exceptionally strong fit, indicating that the variables collectively predict FinalScore extremely well.
F-statistic = 1.16e+04 on 5 and 294 DF, p < 2.2e-16. This confirms that the overall model is highly significant.

2. Coefficient Interpretation
(Intercept) = 1.3369, p = 0.00334
This is the predicted score when all predictors equal 0. It is not usually meaningful on its own, but it shows that the baseline is statistically different from zero.
StudyHoursLast3Months = 0.1722, p < 2e-16 (significant)
For each additional hour of study in the last 3 months, the final score increases by about 0.172 points on average, holding the other predictors constant.
This is a positive and highly significant predictor.
Age = 0.0016, p = 0.9129 (NOT significant)
Age has no detectable effect on the final score in this dataset.
You can say: “Age is not a useful predictor of performance when the other variables are accounted for.”
SleepHoursPerNight = 0.9674, p < 2e-16 (significant)
Each additional hour of sleep per night increases the final score by about 0.97 points, holding the other predictors constant.
This is a strong and meaningful positive effect.
AttendanceRate = 0.2963, p < 2e-16 (significant)
For every one-point increase in the attendance rate, the final score increases by about 0.296 points, holding the other predictors constant.
This is an extremely strong predictor.
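MentalHealthRating = 3.8625, p < 2e-16 (significant)
A one-unit improvement in the mental health rating increases the final score by about 3.86 points, holding the other predictors constant.
This is the largest per-unit coefficient in the model. Note that the predictors are measured on different scales, so coefficient size alone does not rank their importance; by t value, StudyHoursLast3Months (187.9) is the most precisely estimated effect.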

3. Residuals
Min: -0.4388
1Q: -0.1756
Median: -0.0560
3Q: 0.0781
Max: 6.4416
Most residuals are small and centered close to zero, indicating:
The model fits well
Errors are fairly balanced for the bulk of the observations
The maximum residual (6.44) is about twelve times the residual standard error, so at least one observation is a clear outlier and should be inspected.
Residual standard error: 0.5352. On average, predictions deviate from the actual values by about 0.54 points, which is very low.
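
4. Removed Observations
When the model is fitted on data that still contains missing values, R reports:
因为不存在,32个观察量被删除了
Meaning: “Because of missing data, 32 observations were removed.”
This is normal when some rows have NA values (here, the 32 rows with a missing Age).
In the output shown above, the missing values were replaced with column means before fitting, so no rows were dropped: the model uses all 300 observations (294 residual degrees of freedom plus 6 estimated coefficients).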

5. Summary of Key Insights (easy to include in your report)
The model fits extremely well, explaining about 99.5% of the variation in FinalScore.
Study hours, sleep hours, attendance, and mental health are all highly significant positive predictors.
Mental health rating has the largest per-unit coefficient, followed by sleep, attendance, then study hours.
Age is not significant, meaning that age shows no detectable effect on performance once the other factors are considered.
Residuals are small overall, indicating good model accuracy, apart from one large outlier that should be checked.
# Keep only the rows whose Sex is "Male" and preview the first six.
nuya <- filter(data, Sex == "Male")
head(nuya, n = 6)
## # A tibble: 6 × 12
## Sex Age HighSchoolType StudyHoursLast3Months SleepHoursPerNight
## <chr> <dbl> <chr> <dbl> <dbl>
## 1 Male 16.7 Online 86.5 8.45
## 2 Male 19.1 Private 78.1 6.59
## 3 Male 23.1 Public 116. 9.42
## 4 Male 24.8 Private 105. 7.62
## 5 Male 19.3 Online 117. 6.70
## 6 Male 19.1 Private 68.7 5.25
## # ℹ 7 more variables: ParentalEducationLevel <chr>, InternetAccessAtHome <chr>,
## # PartTimeJob <chr>, AttendanceRate <dbl>, MentalHealthRating <dbl>,
## # PhysicalActivityHoursPerWeek <dbl>, FinalScore <dbl>
# Filter on two conditions at once: Age above 19 and AttendanceRate below 80.
cakes <- filter(data, Age > 19 & AttendanceRate < 80)
cakes
## # A tibble: 58 × 12
## Sex Age HighSchoolType StudyHoursLast3Months SleepHoursPerNight
## <chr> <dbl> <chr> <dbl> <dbl>
## 1 Female 20.5 Online 136. 6.84
## 2 Male 19.1 Private 78.1 6.59
## 3 Other 20.2 Private 109. 6.00
## 4 Male 20.2 Private 145. 6.70
## 5 Other 20.2 Public 51.3 4.95
## 6 Female 19.8 Homeschool 133. 7.45
## 7 Male 19.9 Private 81.8 6.42
## 8 Other 21.6 Public 114. 6.36
## 9 Other 23.7 Private 81.9 6.14
## 10 Female 20.9 Homeschool 122. 9.89
## # ℹ 48 more rows
## # ℹ 7 more variables: ParentalEducationLevel <chr>, InternetAccessAtHome <chr>,
## # PartTimeJob <chr>, AttendanceRate <dbl>, MentalHealthRating <dbl>,
## # PhysicalActivityHoursPerWeek <dbl>, FinalScore <dbl>
# Select three columns from the male-only subset.
pick <- select(nuya, c(Sex, HighSchoolType, FinalScore))
pick
## # A tibble: 100 × 3
## Sex HighSchoolType FinalScore
## <chr> <chr> <dbl>
## 1 Male Online 56.6
## 2 Male Private 55.6
## 3 Male Public 78.9
## 4 Male Private 64.1
## 5 Male Online 72.8
## 6 Male Private 53.1
## 7 Male Private 70.6
## 8 Male Online 61.2
## 9 Male Public 71.2
## 10 Male Private 71.4
## # ℹ 90 more rows
# Drop the Sex and Age columns, keeping every other column.
dropp <- select(data, -c(Sex, Age))
dropp
## # A tibble: 300 × 10
## HighSchoolType StudyHoursLast3Months SleepHoursPerNight
## <chr> <dbl> <dbl>
## 1 Homeschool 119. 9.15
## 2 Homeschool 117. 3.92
## 3 Private 117. 6.70
## 4 Online 136. 6.84
## 5 Online 86.5 8.45
## 6 Private 78.1 6.59
## 7 Private 117. 6.17
## 8 Private 109. 6.00
## 9 Public 106. 8.55
## 10 Private 104. 9.95
## # ℹ 290 more rows
## # ℹ 7 more variables: ParentalEducationLevel <chr>, InternetAccessAtHome <chr>,
## # PartTimeJob <chr>, AttendanceRate <dbl>, MentalHealthRating <dbl>,
## # PhysicalActivityHoursPerWeek <dbl>, FinalScore <dbl>