What we need to do: correlation plot; dummy variables off the top; VIF; explain the hours variables more.

library(faraway)
# Read the student lifestyle data set from the CSV file.
student_data <- read.csv(file = "student_lifestyle_dataset.csv")

# Print each row's total across the five daily-hours columns.
# In the recorded output every total is 24.
with(
  student_data,
  Study_Hours_Per_Day + Extracurricular_Hours_Per_Day + Sleep_Hours_Per_Day +
    Social_Hours_Per_Day + Physical_Activity_Hours_Per_Day
)
##    [1] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##   [25] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##   [49] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##   [73] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##   [97] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [121] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [145] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [169] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [193] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [217] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [241] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [265] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [289] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [313] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [337] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [361] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [385] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [409] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [433] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [457] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [481] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [505] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [529] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [553] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [577] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [601] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [625] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [649] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [673] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [697] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [721] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [745] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [769] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [793] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [817] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [841] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [865] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [889] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [913] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [937] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [961] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##  [985] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1009] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1033] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1057] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1081] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1105] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1129] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1153] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1177] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1201] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1225] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1249] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1273] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1297] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1321] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1345] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1369] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1393] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1417] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1441] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1465] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1489] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1513] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1537] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1561] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1585] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1609] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1633] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1657] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1681] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1705] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1729] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1753] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1777] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1801] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1825] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1849] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1873] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1897] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1921] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1945] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1969] 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## [1993] 24 24 24 24 24 24 24 24
# Regress GPA on every column of student_data except the Student_ID identifier
# and print the coefficient table. The recorded output reports one coefficient
# (Physical_Activity_Hours_Per_Day) as not defined because of singularities.
model <- lm(formula = GPA ~ . - Student_ID, data = student_data)
summary(object = model)
## 
## Call:
## lm(formula = GPA ~ . - Student_ID, data = student_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.59944 -0.13439 -0.00251  0.13369  0.77543 
## 
## Coefficients: (1 not defined because of singularities)
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      2.007257   0.044611  44.995   <2e-16 ***
## Study_Hours_Per_Day              0.154016   0.005128  30.033   <2e-16 ***
## Extracurricular_Hours_Per_Day   -0.007418   0.003959  -1.874   0.0611 .  
## Sleep_Hours_Per_Day             -0.003658   0.003609  -1.014   0.3109    
## Social_Hours_Per_Day             0.001427   0.002789   0.512   0.6088    
## Physical_Activity_Hours_Per_Day        NA         NA      NA       NA    
## Stress_LevelLow                  0.006591   0.021317   0.309   0.7572    
## Stress_LevelModerate            -0.015801   0.013469  -1.173   0.2409    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2025 on 1993 degrees of freedom
## Multiple R-squared:  0.5418, Adjusted R-squared:  0.5404 
## F-statistic: 392.7 on 6 and 1993 DF,  p-value: < 2.2e-16
# Draw the default diagnostic plots for this fit.
plot(x = model)

Dummy Variable

# 0/1 indicator columns for two Stress_Level categories:
# d1 marks "Low" and d2 marks "Moderate"; any other level is 0 in both.
d1 <- as.numeric(student_data$Stress_Level == "Low")
d2 <- as.numeric(student_data$Stress_Level == "Moderate")

# Append the two indicators to the original columns.
new_data <- data.frame(student_data, d1 = d1, d2 = d2, check.names = FALSE)

Correlation

library(corrplot)
## corrplot 0.95 loaded
# Pairwise correlations between the five hours columns, GPA and the two stress
# indicators.
cor_matrix <- cor(
  new_data[c(
    "Study_Hours_Per_Day",
    "Extracurricular_Hours_Per_Day",
    "Sleep_Hours_Per_Day",
    "Social_Hours_Per_Day",
    "Physical_Activity_Hours_Per_Day",
    "GPA",
    "d1",
    "d2"
  )]
)

# Lower-triangle circle plot with the coefficients printed in black.
corrplot(
  cor_matrix,
  method = "circle",
  type = "lower",
  tl.cex = 0.8,
  addCoef.col = "black"
)

VIF

# Four candidate fits on new_data. Each one leaves out Student_ID and
# Stress_Level (d1/d2 stand in for the stress categories) and omits exactly one
# of the hours columns.

# model1: omits Physical_Activity_Hours_Per_Day
model1 <- lm(
  formula = GPA ~ . - Student_ID - Stress_Level - Physical_Activity_Hours_Per_Day,
  data = new_data
)

# model2: omits Social_Hours_Per_Day
model2 <- lm(
  formula = GPA ~ . - Student_ID - Stress_Level - Social_Hours_Per_Day,
  data = new_data
)

# model3: omits Sleep_Hours_Per_Day
model3 <- lm(
  formula = GPA ~ . - Student_ID - Stress_Level - Sleep_Hours_Per_Day,
  data = new_data
)

# model4: omits Extracurricular_Hours_Per_Day
model4 <- lm(
  formula = GPA ~ . - Student_ID - Stress_Level - Extracurricular_Hours_Per_Day,
  data = new_data
)


# Variance inflation factors for each candidate fit.
faraway::vif(model1)
##           Study_Hours_Per_Day Extracurricular_Hours_Per_Day 
##                      2.599471                      1.020729 
##           Sleep_Hours_Per_Day          Social_Hours_Per_Day 
##                      1.355431                      1.081113 
##                            d1                            d2 
##                      2.802699                      1.977307
faraway::vif(model2)
##             Study_Hours_Per_Day   Extracurricular_Hours_Per_Day 
##                        3.180111                        1.323115 
##             Sleep_Hours_Per_Day Physical_Activity_Hours_Per_Day 
##                        1.782787                        2.396792 
##                              d1                              d2 
##                        2.802699                        1.977307
# model3 keeps physical activity: the recorded VIFs are higher than for model1.
faraway::vif(model3)
##             Study_Hours_Per_Day   Extracurricular_Hours_Per_Day 
##                        5.227288                        1.831459 
##            Social_Hours_Per_Day Physical_Activity_Hours_Per_Day 
##                        2.381436                        4.013988 
##                              d1                              d2 
##                        2.802699                        1.977307
# model4 keeps physical activity as well: again higher VIFs than model1.
faraway::vif(model4)
##             Study_Hours_Per_Day             Sleep_Hours_Per_Day 
##                        4.111329                        2.925908 
##            Social_Hours_Per_Day Physical_Activity_Hours_Per_Day 
##                        2.823582                        4.829172 
##                              d1                              d2 
##                        2.802699                        1.977307
# Coefficient table for model1; its recorded R-squared equals the first fit's.
summary(object = model1)
## 
## Call:
## lm(formula = GPA ~ . - Student_ID - Stress_Level - Physical_Activity_Hours_Per_Day, 
##     data = new_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.59944 -0.13439 -0.00251  0.13369  0.77543 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    2.007257   0.044611  44.995   <2e-16 ***
## Study_Hours_Per_Day            0.154016   0.005128  30.033   <2e-16 ***
## Extracurricular_Hours_Per_Day -0.007418   0.003959  -1.874   0.0611 .  
## Sleep_Hours_Per_Day           -0.003658   0.003609  -1.014   0.3109    
## Social_Hours_Per_Day           0.001427   0.002789   0.512   0.6088    
## d1                             0.006591   0.021317   0.309   0.7572    
## d2                            -0.015801   0.013469  -1.173   0.2409    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2025 on 1993 degrees of freedom
## Multiple R-squared:  0.5418, Adjusted R-squared:  0.5404 
## F-statistic: 392.7 on 6 and 1993 DF,  p-value: < 2.2e-16
# Coefficient table for model2 (the fit without Social_Hours_Per_Day).
summary(object = model2)
## 
## Call:
## lm(formula = GPA ~ . - Student_ID - Stress_Level - Social_Hours_Per_Day, 
##     data = new_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.59944 -0.13439 -0.00251  0.13369  0.77543 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      2.041517   0.065850  31.003   <2e-16 ***
## Study_Hours_Per_Day              0.152588   0.005672  26.902   <2e-16 ***
## Extracurricular_Hours_Per_Day   -0.008846   0.004507  -1.963   0.0498 *  
## Sleep_Hours_Per_Day             -0.005085   0.004139  -1.229   0.2194    
## Physical_Activity_Hours_Per_Day -0.001427   0.002789  -0.512   0.6088    
## d1                               0.006591   0.021317   0.309   0.7572    
## d2                              -0.015801   0.013469  -1.173   0.2409    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2025 on 1993 degrees of freedom
## Multiple R-squared:  0.5418, Adjusted R-squared:  0.5404 
## F-statistic: 392.7 on 6 and 1993 DF,  p-value: < 2.2e-16
#choosing first model

Base model

# Refit GPA on all other columns of student_data and plot the residuals against
# the fitted values.
# NOTE(review): unlike the first fit in this file, this formula does not drop
# Student_ID, so the ID enters as a predictor -- confirm that is intended.
model <- lm(formula = GPA ~ ., data = student_data)
res_sat <- resid(model)
fit_sat <- fitted.values(model)
plot(x = fit_sat, y = res_sat, main = "Residuals against Y Hat")

After VIF model

# Same residual-versus-fitted plot, now for model1 (overwrites res_sat/fit_sat).
res_sat <- resid(model1)
fit_sat <- fitted.values(model1)
plot(x = fit_sat, y = res_sat, main = "Residuals against Y Hat")

Some outliers

# Box plot of the residuals of `model`.
boxplot(x = resid(model))

# Leverages (hat values) of `model` on a half-normal plot, labelled with the
# row names of student_data.
lev <- hatvalues(model)
halfnorm(x = lev, labs = rownames(student_data))

Each predictor plotted

# Residuals and fitted values of model1, then one residual plot per variable
# and a normal Q-Q plot.
res <- resid(model1)
fit <- fitted.values(model1)

plot(x = fit, y = res, main = "Residuals against Y Hat")

plot(
  x = student_data$Student_ID, y = res,
  main = "Residuals against Student ID"
)

plot(
  x = student_data$Study_Hours_Per_Day, y = res,
  main = "Residuals against Study Hours", col = "blue"
)

plot(
  x = student_data$Extracurricular_Hours_Per_Day, y = res,
  main = "Residuals against Extracurricular", col = "red"
)

plot(
  x = student_data$Sleep_Hours_Per_Day, y = res,
  main = "Residuals against Sleep", col = "orange"
)

plot(
  x = student_data$Social_Hours_Per_Day, y = res,
  main = "Residuals against Social Hours", col = "darkgreen"
)

# Normal Q-Q plot of the model1 residuals with its reference line.
qqnorm(res)
qqline(res)

# Coefficient table for `model`.
summary(object = model)
## 
## Call:
## lm(formula = GPA ~ ., data = student_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.59820 -0.13423 -0.00278  0.13389  0.77506 
## 
## Coefficients: (1 not defined because of singularities)
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      2.009e+00  4.525e-02  44.389   <2e-16 ***
## Student_ID                      -1.546e-06  7.878e-06  -0.196   0.8444    
## Study_Hours_Per_Day              1.540e-01  5.131e-03  30.024   <2e-16 ***
## Extracurricular_Hours_Per_Day   -7.434e-03  3.960e-03  -1.877   0.0607 .  
## Sleep_Hours_Per_Day             -3.672e-03  3.611e-03  -1.017   0.3093    
## Social_Hours_Per_Day             1.423e-03  2.790e-03   0.510   0.6100    
## Physical_Activity_Hours_Per_Day         NA         NA      NA       NA    
## Stress_LevelLow                  6.579e-03  2.132e-02   0.309   0.7577    
## Stress_LevelModerate            -1.563e-02  1.350e-02  -1.157   0.2473    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2025 on 1992 degrees of freedom
## Multiple R-squared:  0.5418, Adjusted R-squared:  0.5401 
## F-statistic: 336.4 on 7 and 1992 DF,  p-value: < 2.2e-16
# Cook's distances of `model` on a half-normal plot.
# The labels come from the names of the distance vector. row.names() of an lm
# object is NULL, so the previous call passed no labels to halfnorm().
cook <- cooks.distance(model)
halfnorm(cook, labs = names(cook))

predictor selection

# do this for different criteria
library(leaps)
library(faraway)

# Backward elimination from model1 under AIC.
# stats::step() has no `criterion` argument (it was silently ignored); the
# penalty is chosen through `k`, and k = 2 is the AIC penalty.
step(model1, direction = "backward", k = 2)
## Start:  AIC=-6381.3
## GPA ~ (Student_ID + Study_Hours_Per_Day + Extracurricular_Hours_Per_Day + 
##     Sleep_Hours_Per_Day + Social_Hours_Per_Day + Physical_Activity_Hours_Per_Day + 
##     Stress_Level + d1 + d2) - Student_ID - Stress_Level - Physical_Activity_Hours_Per_Day
## 
##                                 Df Sum of Sq     RSS     AIC
## - d1                             1     0.004  81.720 -6383.2
## - Social_Hours_Per_Day           1     0.011  81.727 -6383.0
## - Sleep_Hours_Per_Day            1     0.042  81.758 -6382.3
## - d2                             1     0.056  81.773 -6381.9
## <none>                                        81.716 -6381.3
## - Extracurricular_Hours_Per_Day  1     0.144  81.860 -6379.8
## - Study_Hours_Per_Day            1    36.984 118.700 -5636.6
## 
## Step:  AIC=-6383.2
## GPA ~ Study_Hours_Per_Day + Extracurricular_Hours_Per_Day + Sleep_Hours_Per_Day + 
##     Social_Hours_Per_Day + d2
## 
##                                 Df Sum of Sq     RSS     AIC
## - Social_Hours_Per_Day           1     0.011  81.731 -6384.9
## - Sleep_Hours_Per_Day            1     0.039  81.759 -6384.2
## <none>                                        81.720 -6383.2
## - d2                             1     0.136  81.856 -6381.9
## - Extracurricular_Hours_Per_Day  1     0.144  81.864 -6381.7
## - Study_Hours_Per_Day            1    86.504 168.224 -4941.2
## 
## Step:  AIC=-6384.93
## GPA ~ Study_Hours_Per_Day + Extracurricular_Hours_Per_Day + Sleep_Hours_Per_Day + 
##     d2
## 
##                                 Df Sum of Sq     RSS     AIC
## - Sleep_Hours_Per_Day            1     0.049  81.780 -6385.7
## <none>                                        81.731 -6384.9
## - d2                             1     0.134  81.865 -6383.7
## - Extracurricular_Hours_Per_Day  1     0.159  81.890 -6383.1
## - Study_Hours_Per_Day            1    87.638 169.370 -4929.6
## 
## Step:  AIC=-6385.73
## GPA ~ Study_Hours_Per_Day + Extracurricular_Hours_Per_Day + d2
## 
##                                 Df Sum of Sq     RSS     AIC
## <none>                                        81.780 -6385.7
## - Extracurricular_Hours_Per_Day  1     0.160  81.940 -6383.8
## - d2                             1     0.184  81.965 -6383.2
## - Study_Hours_Per_Day            1    87.940 169.721 -4927.5
## 
## Call:
## lm(formula = GPA ~ Study_Hours_Per_Day + Extracurricular_Hours_Per_Day + 
##     d2, data = new_data)
## 
## Coefficients:
##                   (Intercept)            Study_Hours_Per_Day  
##                      2.000020                       0.152278  
## Extracurricular_Hours_Per_Day                             d2  
##                     -0.007734                      -0.020983
# Stepwise search from model1 that may both drop and re-add terms; uses step()'s
# default penalty, reported as AIC in the trace.
step(object = model1, direction = "both")
## Start:  AIC=-6381.3
## GPA ~ (Student_ID + Study_Hours_Per_Day + Extracurricular_Hours_Per_Day + 
##     Sleep_Hours_Per_Day + Social_Hours_Per_Day + Physical_Activity_Hours_Per_Day + 
##     Stress_Level + d1 + d2) - Student_ID - Stress_Level - Physical_Activity_Hours_Per_Day
## 
##                                 Df Sum of Sq     RSS     AIC
## - d1                             1     0.004  81.720 -6383.2
## - Social_Hours_Per_Day           1     0.011  81.727 -6383.0
## - Sleep_Hours_Per_Day            1     0.042  81.758 -6382.3
## - d2                             1     0.056  81.773 -6381.9
## <none>                                        81.716 -6381.3
## - Extracurricular_Hours_Per_Day  1     0.144  81.860 -6379.8
## - Study_Hours_Per_Day            1    36.984 118.700 -5636.6
## 
## Step:  AIC=-6383.2
## GPA ~ Study_Hours_Per_Day + Extracurricular_Hours_Per_Day + Sleep_Hours_Per_Day + 
##     Social_Hours_Per_Day + d2
## 
##                                 Df Sum of Sq     RSS     AIC
## - Social_Hours_Per_Day           1     0.011  81.731 -6384.9
## - Sleep_Hours_Per_Day            1     0.039  81.759 -6384.2
## <none>                                        81.720 -6383.2
## - d2                             1     0.136  81.856 -6381.9
## - Extracurricular_Hours_Per_Day  1     0.144  81.864 -6381.7
## + d1                             1     0.004  81.716 -6381.3
## - Study_Hours_Per_Day            1    86.504 168.224 -4941.2
## 
## Step:  AIC=-6384.93
## GPA ~ Study_Hours_Per_Day + Extracurricular_Hours_Per_Day + Sleep_Hours_Per_Day + 
##     d2
## 
##                                 Df Sum of Sq     RSS     AIC
## - Sleep_Hours_Per_Day            1     0.049  81.780 -6385.7
## <none>                                        81.731 -6384.9
## - d2                             1     0.134  81.865 -6383.7
## + Social_Hours_Per_Day           1     0.011  81.720 -6383.2
## - Extracurricular_Hours_Per_Day  1     0.159  81.890 -6383.1
## + d1                             1     0.004  81.727 -6383.0
## - Study_Hours_Per_Day            1    87.638 169.370 -4929.6
## 
## Step:  AIC=-6385.73
## GPA ~ Study_Hours_Per_Day + Extracurricular_Hours_Per_Day + d2
## 
##                                 Df Sum of Sq     RSS     AIC
## <none>                                        81.780 -6385.7
## + Sleep_Hours_Per_Day            1     0.049  81.731 -6384.9
## + Social_Hours_Per_Day           1     0.021  81.759 -6384.2
## - Extracurricular_Hours_Per_Day  1     0.160  81.940 -6383.8
## + d1                             1     0.002  81.779 -6383.8
## - d2                             1     0.184  81.965 -6383.2
## - Study_Hours_Per_Day            1    87.940 169.721 -4927.5
## 
## Call:
## lm(formula = GPA ~ Study_Hours_Per_Day + Extracurricular_Hours_Per_Day + 
##     d2, data = new_data)
## 
## Coefficients:
##                   (Intercept)            Study_Hours_Per_Day  
##                      2.000020                       0.152278  
## Extracurricular_Hours_Per_Day                             d2  
##                     -0.007734                      -0.020983
# Forward selection under BIC.
# stats::step() has no `criterion` argument: it was silently ignored, so the
# earlier call still used the AIC penalty. BIC is requested with k = log(n).
# A forward search must also start from a smaller model and be given an upper
# scope; started from model1 with no scope there is nothing to add and the
# starting model is returned unchanged.
# The upper scope lists the predictors of model1. The trace still labels the
# criterion column "AIC" even though the BIC penalty is applied.
# Re-knit to refresh the recorded output that follows.
null_model <- lm(GPA ~ 1, data = new_data)
step(
  null_model,
  scope = list(
    lower = ~1,
    upper = ~ Study_Hours_Per_Day + Extracurricular_Hours_Per_Day +
      Sleep_Hours_Per_Day + Social_Hours_Per_Day + d1 + d2
  ),
  direction = "forward",
  k = log(nobs(null_model))
)
## Start:  AIC=-6381.3
## GPA ~ (Student_ID + Study_Hours_Per_Day + Extracurricular_Hours_Per_Day + 
##     Sleep_Hours_Per_Day + Social_Hours_Per_Day + Physical_Activity_Hours_Per_Day + 
##     Stress_Level + d1 + d2) - Student_ID - Stress_Level - Physical_Activity_Hours_Per_Day
## 
## Call:
## lm(formula = GPA ~ (Student_ID + Study_Hours_Per_Day + Extracurricular_Hours_Per_Day + 
##     Sleep_Hours_Per_Day + Social_Hours_Per_Day + Physical_Activity_Hours_Per_Day + 
##     Stress_Level + d1 + d2) - Student_ID - Stress_Level - Physical_Activity_Hours_Per_Day, 
##     data = new_data)
## 
## Coefficients:
##                   (Intercept)            Study_Hours_Per_Day  
##                      2.007257                       0.154016  
## Extracurricular_Hours_Per_Day            Sleep_Hours_Per_Day  
##                     -0.007418                      -0.003658  
##          Social_Hours_Per_Day                             d1  
##                      0.001427                       0.006591  
##                            d2  
##                     -0.015801
# Best-subsets search (one model per size) on new_data after removing
# Stress_Level and Physical_Activity_Hours_Per_Day from the candidates.
# NOTE(review): Student_ID is not excluded here, unlike model1, and it shows up
# as a candidate column in the recorded table below -- confirm that is intended.
sum3 <- summary(
  regsubsets(
    GPA ~ . - Stress_Level - Physical_Activity_Hours_Per_Day,
    data = new_data,
    nbest = 1
  )
)
sum3$which
##   (Intercept) Student_ID Study_Hours_Per_Day Extracurricular_Hours_Per_Day
## 1        TRUE      FALSE                TRUE                         FALSE
## 2        TRUE      FALSE                TRUE                         FALSE
## 3        TRUE      FALSE                TRUE                          TRUE
## 4        TRUE      FALSE                TRUE                          TRUE
## 5        TRUE      FALSE                TRUE                          TRUE
## 6        TRUE      FALSE                TRUE                          TRUE
## 7        TRUE       TRUE                TRUE                          TRUE
##   Sleep_Hours_Per_Day Social_Hours_Per_Day    d1    d2
## 1               FALSE                FALSE FALSE FALSE
## 2               FALSE                FALSE FALSE  TRUE
## 3               FALSE                FALSE FALSE  TRUE
## 4                TRUE                FALSE FALSE  TRUE
## 5                TRUE                 TRUE FALSE  TRUE
## 6                TRUE                 TRUE  TRUE  TRUE
## 7                TRUE                 TRUE  TRUE  TRUE
sum3$rsq
## [1] 0.5394432 0.5404943 0.5413901 0.5416668 0.5417285 0.5417505 0.5417593
# Combine the selection table with the fit statistics of each model size.
# The last column is labelled "rss", so it must hold sum3$rss; it previously
# repeated sum3$rsq.
outputMat <- cbind(
  sum3$which, sum3$rsq, sum3$adjr2, sum3$cp, sum3$bic, sum3$rss
)
colnames(outputMat) <- c(
  colnames(sum3$which), "rsq", "adjr2", "cp", "bic", "rss"
)
View(outputMat)

# Partial-residual plots for every term of model1.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
termplot(model1, partial.resid = TRUE)

# Best-subsets plot on the adjusted R-squared scale, using all columns of
# student_data. The stray trailing comma after nbest = 1 has been removed.
# NOTE(review): the recorded warning below reports a linear dependency among
# these columns -- consider the same formula and data used for sum3.
plot(
  regsubsets(GPA ~ ., data = student_data, nbest = 1),
  scale = "adjr2"
)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 1 linear dependencies found
## Reordering variables and trying again:

# Plot AIC against the number of predictors for the best-subsets models.
# AIC = n * log(RSS / n) + 2 * p, where n is the number of observations and p
# the number of estimated coefficients (predictors plus the intercept).
# n was previously hard-coded as 50 and the model count as 1:7; both are now
# taken from the data and from sum3.
n_obs <- nrow(new_data)
n_pred <- seq_along(sum3$rss)
AIC <- n_obs * log(sum3$rss / n_obs) + 2 * (n_pred + 1)
plot(
  n_pred, AIC,
  ylab = "AIC", xlab = "Number of Predictors", type = "l", lwd = 2
)

# Line plot of the Cp statistic for the seven best-subsets models in sum3.
plot(
  x = seq_len(7),
  y = sum3$cp,
  xlab = "No.of Predictors",
  ylab = "Cp Statistic",
  type = "l",
  lwd = 2
)

# Refit the three-predictor model returned by the stepwise searches above and
# print its coefficient table.
new_model <- lm(
  formula = GPA ~ Study_Hours_Per_Day + Extracurricular_Hours_Per_Day + d2,
  data = new_data
)

summary(object = new_model)
## 
## Call:
## lm(formula = GPA ~ Study_Hours_Per_Day + Extracurricular_Hours_Per_Day + 
##     d2, data = new_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.60982 -0.13364 -0.00205  0.13395  0.77368 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    2.000020   0.027153  73.658   <2e-16 ***
## Study_Hours_Per_Day            0.152278   0.003287  46.329   <2e-16 ***
## Extracurricular_Hours_Per_Day -0.007734   0.003917  -1.975   0.0485 *  
## d2                            -0.020983   0.009899  -2.120   0.0342 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2024 on 1996 degrees of freedom
## Multiple R-squared:  0.5414, Adjusted R-squared:  0.5407 
## F-statistic: 785.4 on 3 and 1996 DF,  p-value: < 2.2e-16
# Refit GPA on the hours columns of student_data only (Student_ID and
# Stress_Level removed) and draw the partial-residual plot for the fifth term.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
# NOTE(review): this reassigns `model2`, replacing the earlier fit of that name.
# NOTE(review): term 5 appears to be Physical_Activity_Hours_Per_Day, whose
# coefficient came out NA in the earlier fits -- confirm it is the intended term.
model2 <- lm(GPA ~ . - Student_ID - Stress_Level, data = student_data)
termplot(model2, partial.resid = TRUE, terms = 5, col.res = "purple")

#partial residual