# Read the 2015 mark sheet (path is relative to the working directory) and
# show the first six rows as a quick sanity check.
data <- read.csv(file = "FinalMark_TotalAverage_2015.csv")
head(x = data, n = 6L)
## ID Attendance Lab2 Test1 Lab3 PartA PartB Lab4 Exam
## 1 12 2 74 31 64 32 85 75 58
## 2 13 3 74 0 64 32 85 75 52
## 3 14 3 74 34 64 32 85 75 59
## 4 15 4 40 44 77 85 85 85 47
## 5 16 4 78 36 57 40 90 90 54
## 6 17 4 75 0 64 55 97 85 60
summary(data)
## ID Attendance Lab2 Test1 Lab3
## Min. :12 Min. : 2.000 Min. :40.00 Min. : 0.00 Min. :57.00
## 1st Qu.:20 1st Qu.: 7.000 1st Qu.:70.00 1st Qu.:34.00 1st Qu.:64.00
## Median :28 Median : 9.000 Median :75.00 Median :44.00 Median :77.00
## Mean :28 Mean : 7.788 Mean :73.53 Mean :45.61 Mean :74.44
## 3rd Qu.:36 3rd Qu.:10.000 3rd Qu.:83.00 3rd Qu.:59.00 3rd Qu.:85.00
## Max. :44 Max. :10.000 Max. :99.00 Max. :90.00 Max. :89.00
## NA's :1 NA's :1
## PartA PartB Lab4 Exam
## Min. : 26.00 Min. : 80.00 Min. : 75.00 Min. :46.00
## 1st Qu.: 36.00 1st Qu.: 85.00 1st Qu.: 85.00 1st Qu.:54.00
## Median : 40.00 Median : 94.00 Median : 85.00 Median :60.00
## Mean : 52.73 Mean : 92.76 Mean : 88.22 Mean :64.52
## 3rd Qu.: 70.00 3rd Qu.: 97.00 3rd Qu.: 91.25 3rd Qu.:77.00
## Max. :100.00 Max. :110.00 Max. :100.00 Max. :87.00
## NA's :1
.1 Run a linear regression to predict marks in Exam from Attendance. Based on this model, answer the following questions.
# Simple linear regression with Exam as the response and Attendance as the
# only predictor. The fitted object is stored as `modell` (the name ends in a
# lowercase letter "l") and is reused by the summary/anova calls that follow.
modell <- lm(Exam ~ Attendance, data = data)
Intercept ≈ 49.41, Attendance coefficient ≈ 1.94. So the estimated regression line is: Exam = 49.41 + 1.94 × Attendance
summary(modell)
##
## Call:
## lm(formula = Exam ~ Attendance, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.805 -10.805 2.830 9.195 19.134
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.4139 7.3957 6.681 1.79e-07 ***
## Attendance 1.9391 0.9081 2.135 0.0407 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.43 on 31 degrees of freedom
## Multiple R-squared: 0.1282, Adjusted R-squared: 0.1001
## F-statistic: 4.56 on 1 and 31 DF, p-value: 0.04074
anova(modell)
## Analysis of Variance Table
##
## Response: Exam
## Df Sum Sq Mean Sq F value Pr(>F)
## Attendance 1 705.1 705.06 4.56 0.04074 *
## Residuals 31 4793.2 154.62
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(modell)$r.squared
## [1] 0.1282336
This gives 0.128, meaning:
Approximately 12.8% of the variation in Exam is explained by Attendance.
# Regress Exam on Attendance, Test1 and their interaction. The `*` operator in
# the formula expands to both main effects plus the Attendance:Test1 term.
model2 <- lm(Exam ~ Attendance * Test1, data = data)
# Print the coefficient table, residual standard error and R-squared values.
summary(model2)
##
## Call:
## lm(formula = Exam ~ Attendance * Test1, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.3807 -7.2193 -0.2839 6.4433 17.3291
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.77139 12.54681 4.684 6.1e-05 ***
## Attendance -1.12945 1.71863 -0.657 0.516
## Test1 -0.13463 0.37748 -0.357 0.724
## Attendance:Test1 0.05439 0.04524 1.202 0.239
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.1 on 29 degrees of freedom
## Multiple R-squared: 0.3507, Adjusted R-squared: 0.2835
## F-statistic: 5.221 on 3 and 29 DF, p-value: 0.005263
anova(model2)
## Analysis of Variance Table
##
## Response: Exam
## Df Sum Sq Mean Sq F value Pr(>F)
## Attendance 1 705.1 705.06 5.7272 0.023399 *
## Test1 1 1045.1 1045.14 8.4897 0.006812 **
## Attendance:Test1 1 177.9 177.92 1.4452 0.239020
## Residuals 29 3570.1 123.11
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model2)$adj.r.squared
## [1] 0.2835088
Adjusted R² ≈ 0.284.
summary(model2)$coefficients["Attendance", "Pr(>|t|)"]
## [1] 0.5162496
Conclusion:
Since p = 0.516 > 0.05, we fail to reject H₀. There is not significant evidence that the linear effect of Attendance differs from zero in this model.
Hypotheses: H₀: β_Attendance = 0 and β_Attendance:Test1 = 0. H₁: at least one of these two coefficients is non-zero. F-test results: F = 0.94, df₁ = 2, df₂ = 29, p = 0.404
# Reduced model with Test1 only: every term involving Attendance (the main
# effect and the Attendance:Test1 interaction) is removed.
model2_reduced <- lm(Exam ~ Test1, data = data)
# Nested-model F-test, reduced model first and full model second; it tests the
# two removed Attendance terms jointly.
anova(model2_reduced, model2)
## Analysis of Variance Table
##
## Model 1: Exam ~ Test1
## Model 2: Exam ~ Attendance * Test1
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 31 3800.6
## 2 29 3570.1 2 230.49 0.9361 0.4037
qf(0.95, df1 = 2, df2 = 29)
## [1] 3.327654
Conclusion: F = 0.94 < 3.33 → fail to reject H₀. There is NO significant overall effect of Attendance on Exam at α = 0.05.
# Draw the diagnostic plots for model2 on a 2 x 2 grid. par() returns the
# settings it replaces, so restoring that saved value puts back whatever
# layout was active before instead of forcing a 1 x 1 grid.
old_par <- par(mfrow = c(2, 2))
plot(model2)
par(old_par)

# Point prediction and 95% prediction interval for a new student with
# Attendance = 8 and Test1 = 60.
new_student <- data.frame(Attendance = 8, Test1 = 60)
predict(model2, newdata = new_student, interval = "prediction", level = 0.95)
## fit lwr upr
## 1 67.76308 44.45756 91.0686
Output shows approximately: Predicted Exam mark ≈ 68; 95% prediction interval ≈ (44.5, 91.1).
# Full additive model: Exam regressed on all eight other columns of `data`,
# with no interaction terms.
# NOTE(review): ID is included as a numeric predictor; it looks like a student
# identifier rather than a measurement — confirm this is intended.
model3 <- lm(Exam ~ ID + Attendance + Lab2 + Test1 +
Lab3 + PartA + PartB + Lab4,
data = data)
# Rows with a missing value in any model variable are dropped by lm(), so this
# fit can use fewer observations than the earlier models.
summary(model3)
##
## Call:
## lm(formula = Exam ~ ID + Attendance + Lab2 + Test1 + Lab3 + PartA +
## PartB + Lab4, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.4348 -6.0838 -0.9311 7.9708 19.6534
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 64.26984 40.28372 1.595 0.124
## ID -0.59786 0.62467 -0.957 0.348
## Attendance 3.19481 2.75082 1.161 0.257
## Lab2 0.22344 0.27856 0.802 0.431
## Test1 0.21367 0.12747 1.676 0.107
## Lab3 0.28603 0.28011 1.021 0.318
## PartA -0.09737 0.20591 -0.473 0.641
## PartB -0.08002 0.47319 -0.169 0.867
## Lab4 -0.48366 0.44453 -1.088 0.288
##
## Residual standard error: 11.63 on 23 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.4241, Adjusted R-squared: 0.2238
## F-statistic: 2.117 on 8 and 23 DF, p-value: 0.07626
summary(model3)$adj.r.squared
## [1] 0.2237812
Adjusted R² ≈ 0.224
# Overall F-test for model3: take the F statistic and its two degrees of
# freedom from the model summary (selected by element name) and convert them
# to an upper-tail p-value.
s <- summary(model3)
F_value <- s$fstatistic["value"]
df1 <- s$fstatistic["numdf"]
df2 <- s$fstatistic["dendf"]
p_value <- pf(q = F_value, df1 = df1, df2 = df2, lower.tail = FALSE)
print(p_value)
## value
## 0.07625506
Conclusion:
p = 0.076 > 0.05 → fail to reject H₀. The full 8-predictor model is NOT significant at α = 0.05.
library(leaps)
## Warning: package 'leaps' was built under R version 4.3.3
# Best-subset search over the eight candidate predictors of Exam. The search
# settings the original call left to regsubsets()' defaults are written out
# here: one best model per size (nbest = 1), sizes up to eight (nvmax = 8).
all_sub <- regsubsets(
  x = Exam ~ ID + Attendance + Lab2 + Test1 + Lab3 + PartA + PartB + Lab4,
  data = data,
  nbest = 1,
  nvmax = 8
)
# Adjusted R-squared of the retained model at each subset size.
sub_summary <- summary(all_sub)
print(sub_summary$adjr2)
## [1] 0.2789609 0.3121919 0.2993281 0.2948167 0.2758365 0.2743538 0.2551987
## [8] 0.2237812
best_index <- which.max(sub_summary$adjr2)
best_index
## [1] 2
sub_summary$which[best_index, ]
## (Intercept) ID Attendance Lab2 Test1 Lab3
## TRUE FALSE FALSE TRUE TRUE FALSE
## PartA PartB Lab4
## FALSE FALSE FALSE
Best overall model (highest Adjusted R²): → Lab2 + Test1
sub_summary$which[1, ]
## (Intercept) ID Attendance Lab2 Test1 Lab3
## TRUE FALSE FALSE FALSE TRUE FALSE
## PartA PartB Lab4
## FALSE FALSE FALSE
Best one-predictor model: → Test1
sub_summary$which[3, ]
## (Intercept) ID Attendance Lab2 Test1 Lab3
## TRUE FALSE FALSE TRUE TRUE TRUE
## PartA PartB Lab4
## FALSE FALSE FALSE
Best three-predictor model: → Lab2 + Test1 + Lab3