library(readxl)
data<-read.csv("C:/Users/HAKIM/OneDrive/Documents/semester 5/psd/archive (5)/Student_Performance.csv")
str(data)
## 'data.frame':    10000 obs. of  6 variables:
##  $ Hours.Studied                   : int  7 4 8 5 7 3 7 8 5 4 ...
##  $ Previous.Scores                 : int  99 82 51 52 75 78 73 45 77 89 ...
##  $ Extracurricular.Activities      : chr  "Yes" "No" "Yes" "Yes" ...
##  $ Sleep.Hours                     : int  9 4 7 5 8 9 5 4 8 4 ...
##  $ Sample.Question.Papers.Practiced: int  1 2 2 2 5 6 6 6 2 0 ...
##  $ Performance.Index               : num  91 65 45 36 66 61 63 42 61 69 ...
head(data.frame(data))
##   Hours.Studied Previous.Scores Extracurricular.Activities Sleep.Hours
## 1             7              99                        Yes           9
## 2             4              82                         No           4
## 3             8              51                        Yes           7
## 4             5              52                        Yes           5
## 5             7              75                         No           8
## 6             3              78                         No           9
##   Sample.Question.Papers.Practiced Performance.Index
## 1                                1                91
## 2                                2                65
## 3                                2                45
## 4                                2                36
## 5                                5                66
## 6                                6                61

The Student Performance Dataset is a dataset designed to examine the factors influencing academic student performance. The dataset consists of 10,000 student records, with each record containing information about various predictors and a performance index.

y<-data$Performance.Index
x1<-data$Hours.Studied
x2<-data$Previous.Scores
x3<-data$Sleep.Hours
x4<-data$Sample.Question.Papers.Practiced

Variables:

x1: Hours Studied The total number of hours spent studying by each student.

x2: Previous Scores the scores obtained by students in previous tests.

x3: Sleep Hours the average number of hours of sleep the student had per day.

x4: Sample Question Papers Practiced the number of sample question papers the student practiced.

lapply(c("car","lmtest"),library,character.only=T)[[1]]
## Loading required package: carData
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
##  [1] "car"       "carData"   "readxl"    "stats"     "graphics"  "grDevices"
##  [7] "utils"     "datasets"  "methods"   "base"

#Model Regresi Klasik

model<-lm(y~x1+x2+x3+x4,data)
summary(model)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.3299 -1.3831 -0.0062  1.3701  8.4864 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -33.763726   0.126841 -266.19   <2e-16 ***
## x1            2.853429   0.007962  358.40   <2e-16 ***
## x2            1.018584   0.001189  857.02   <2e-16 ***
## x3            0.476333   0.012153   39.19   <2e-16 ***
## x4            0.195198   0.007189   27.15   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.061 on 9995 degrees of freedom
## Multiple R-squared:  0.9885, Adjusted R-squared:  0.9885 
## F-statistic: 2.147e+05 on 4 and 9995 DF,  p-value: < 2.2e-16
rsq.awal<-summary(model)$r.squared
rse.awal<-summary(model)$sigma

Diperoleh model \[\hat{Y}= -33.763726+2.853429x_1+1.018584x_2+0.476333x_3+0.195198x_4\] dengan perolehan p-value 2.2e-16 < 0.05. Semua peubah x dalam model berpengaruh signifikan terhadap peubah y dengan R-squared sebesar 0.9885 dan RSE sebesar 2.061 .

Asumsi GAUSS MARKOV

1 Nilai harapan sisaan sama dengan nol

t.test(model$residuals,
       mu = 0,
       conf.level = 0.95)
## 
##  One Sample t-test
## 
## data:  model$residuals
## t = -2.4525e-15, df = 9999, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.04038966  0.04038966
## sample estimates:
##     mean of x 
## -5.053336e-17

H0: Nilai harapan sama dengan nol H1: Nilai harapantidak sama dengan nol

Keputusan: Diperoleh nilai p-value 1>0,05, maka terima H0. Dapat disipulkan bahwa nilai harapan sisaan sama dengan nol

2 Sisaan saling bebas

library(lmtest)
dwtest(model)
## 
##  Durbin-Watson test
## 
## data:  model
## DW = 2.0039, p-value = 0.5764
## alternative hypothesis: true autocorrelation is greater than 0

H0: Sisaan saling bebas H1: Sisaan tidak saling bebas

Keputusan: Diperoleh nilai p-value 0,5764>0,05, maka terima H0. Dapat disipulkan bahwa sisaan saling bebas

3 Ragam sisaan homogen

library(lmtest)
bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 1.5045, df = 4, p-value = 0.8258

H0: Ragam sisaan homogen H1: Ragam sisaan tidak homogen

Keputusan: nilai p-value 0,8258>0,05, maka terima H0. Dapat disipulkan bahwa ragam sisaan homgen

ASUMSI NORMALITAS SISAAN

ks.test(model$residuals, "pnorm", mean=mean(model$residuals), sd=sd(model$residuals))
## Warning in ks.test.default(model$residuals, "pnorm", mean =
## mean(model$residuals), : ties should not be present for the Kolmogorov-Smirnov
## test
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  model$residuals
## D = 0.0063741, p-value = 0.8112
## alternative hypothesis: two-sided

H0: Sisaan menyebar normal H1: Sisaan tidak menyebar normal

Keputusan: nilai p-value 0.8112>0,05, maka terima H0. Dapat disipulkan bahwa sisaan menyebar normal

MULTIKOLINEARITAS

car::vif(model)
##       x1       x2       x3       x4 
## 1.000464 1.000254 1.000052 1.000386

dapat dilihat nilai VIF dari semua peubah < 10. Maka dapat disimpulkan tidak terdapat multikolinearitas

UJI ASUMSI SECARA Eksploratif

par(mfrow = c(2,2))
qqnorm(model$residuals)
qqline(model$residuals, col = "red", lwd = 2)

plot(predict(model), model$residuals, col = "steelblue", pch = 20, xlab = "Sisaan", ylab = "Fitted Values", main = "Sisaan vs Fitted Values")
abline(a = 0, b = 0, lwd = 2)

hist(model$residuals, col = "steelblue")

plot(model$residuals, col = "steelblue", pch = 20, xlab = "Sisaan", ylab = "Order", main = "Sisaan vs Order")
lines(seq(1,10000,1), model$residuals, col = "red")
abline(a = 0, b = 0, lwd = 2)

lapply(c("glmnet","lmridge"),library,character.only=T)[[1]]
## Loading required package: Matrix
## Loaded glmnet 4.1-8
## 
## Attaching package: 'lmridge'
## The following object is masked from 'package:car':
## 
##     vif
##  [1] "glmnet"    "Matrix"    "lmtest"    "zoo"       "car"       "carData"  
##  [7] "readxl"    "stats"     "graphics"  "grDevices" "utils"     "datasets" 
## [13] "methods"   "base"

REGRESI RIDGE

x<-cbind(x1,x2,x3,x4)
head(matrix(x))
##      [,1]
## [1,]    7
## [2,]    4
## [3,]    8
## [4,]    5
## [5,]    7
## [6,]    3
y<-data$Performance.Index
cv.r<-cv.glmnet(x,y,alpha=0);plot(cv.r)

Best Model Ridge

best.lr<-cv.r$lambda.min
bestridge<-glmnet(x,y,alpha=0,lambda=best.lr)
coef(bestridge)
## 5 x 1 sparse Matrix of class "dgCMatrix"
##                      s0
## (Intercept) -26.3120776
## x1            2.6078448
## x2            0.9327929
## x3            0.4416559
## x4            0.1862048

Diperoleh model dari Regresi Ridge sebagai berikut: \[\hat{Y}= -26.3120776+2.6078448x_1+0.9327929x_2+0.4416559x_3+0.1862048x_4\] #Fungsi R-Squared

rsq<-function(bestmodel,bestlambda,x,y){
 #y duga
 y.duga <- predict(bestmodel, s = bestlambda, newx = x)

 #JKG dan JKT
 jkt <- sum((y - mean(y))^2)
 jkg <- sum((y.duga- y)^2)

#find R-Squared
rsq <- 1 - jkg/jkt
return(rsq) 
}

R-Squared Ridge

rsq(bestridge,best.lr,x,y)
## [1] 0.9814513

Diperoleh R-Squared Ridge sebesar 0.9814513

RSE Ridge

train_predictionsridge <- predict(bestridge,newx = x)

# Menghitung residu (selisih antara prediksi dan nilai sebenarnya)
residualsridge <- y - train_predictionsridge

# Menghitung varian residu
dfridge <- length(y) - length(bestridge$beta)
residual_varianceridge <- sum(residualsridge^2) / dfridge

# Menghitung RSE
rse.ridge <- sqrt(residual_varianceridge)
rse.ridge
## [1] 2.617022

Diperoleh RSE Ridge sebesar 2.617022

REGRESI LASSO

cv.l<-cv.glmnet(x,y,alpha=1);plot(cv.l)

Best Model Lasso

best.ll<-cv.l$lambda.min
bestlasso<-glmnet(x,y,alpha=1,lambda=best.ll);coef(bestlasso)
## 5 x 1 sparse Matrix of class "dgCMatrix"
##                      s0
## (Intercept) -33.0172257
## x1            2.8280113
## x2            1.0147712
## x3            0.4377285
## x4            0.1727848

Diperoleh model dari Regresi Lasso sebagai berikut: \[\hat{Y}= -33.0172257+2.8280113x_1+1.0147712x_2+0.4377285x_3+0.1727848x_4\]

R-Squared Lasso

rsq(bestlasso,best.lr,x,y)
## [1] 0.9884512

Diperoleh R-Squared Lasso sebesar 0.9884512 #RSE

RSE Lasso

train_predictionslasso <- predict(bestlasso,newx = x)

# Menghitung residu (selisih antara prediksi dan nilai sebenarnya)
residualslasso <- y - train_predictionslasso

# Menghitung varian residu
dflasso <- length(y) - length(bestlasso$beta)
residual_variancelasso <- sum(residualslasso^2) / dflasso

# Menghitung RSE
rse.lasso <- sqrt(residual_variancelasso)
rse.lasso
## [1] 2.064995

Diperoleh nilai RSE Lasso sebesar 2.064995

PERBANDINGAN R-SQUARED DAN RSE METODE REGRESI KLASIK, REGRESI RIDGE, REGRESI LASSO

R2<- matrix(c(rsq.awal,rsq(bestridge,best.lr,x,y),rsq(bestlasso,best.lr,x,y),rse.awal,rse.ridge,rse.lasso), ncol=2, byrow = FALSE)
row.names(R2)<-c( "Regresi Klasik", "Regresi Ridge", "Regresi Lasso")
colnames(R2)<-c("R-Squared","RSE")
R2
##                R-Squared      RSE
## Regresi Klasik 0.9884981 2.060898
## Regresi Ridge  0.9814513 2.617022
## Regresi Lasso  0.9884512 2.064995

Dapat dilihat dari ketiga model yang sudah diujikan, Model Regresi Klasik memiliki R-Squared dan RSE terkecil. Maka dapat disimpulkan Model Regresi Klasik merupakan pemodelan terbaik bagi data student peformence

KESIMPULAN

Model terbaik diperoleh Model Regresi Klasik sebagai berikut:

model$coefficients
## (Intercept)          x1          x2          x3          x4 
## -33.7637261   2.8534292   1.0185835   0.4763330   0.1951983

\[\hat{Y}= -33.763726+2.853429x_1+1.018584x_2+0.476333x_3+0.195198x_4\]