library(readxl)
data<-read.csv("C:/Users/HAKIM/OneDrive/Documents/semester 5/psd/archive (5)/Student_Performance.csv")
str(data)
## 'data.frame': 10000 obs. of 6 variables:
## $ Hours.Studied : int 7 4 8 5 7 3 7 8 5 4 ...
## $ Previous.Scores : int 99 82 51 52 75 78 73 45 77 89 ...
## $ Extracurricular.Activities : chr "Yes" "No" "Yes" "Yes" ...
## $ Sleep.Hours : int 9 4 7 5 8 9 5 4 8 4 ...
## $ Sample.Question.Papers.Practiced: int 1 2 2 2 5 6 6 6 2 0 ...
## $ Performance.Index : num 91 65 45 36 66 61 63 42 61 69 ...
head(data.frame(data))
## Hours.Studied Previous.Scores Extracurricular.Activities Sleep.Hours
## 1 7 99 Yes 9
## 2 4 82 No 4
## 3 8 51 Yes 7
## 4 5 52 Yes 5
## 5 7 75 No 8
## 6 3 78 No 9
## Sample.Question.Papers.Practiced Performance.Index
## 1 1 91
## 2 2 65
## 3 2 45
## 4 2 36
## 5 5 66
## 6 6 61
The Student Performance Dataset is a dataset designed to examine the factors influencing academic student performance. The dataset consists of 10,000 student records, with each record containing information about various predictors and a performance index.
y<-data$Performance.Index
x1<-data$Hours.Studied
x2<-data$Previous.Scores
x3<-data$Sleep.Hours
x4<-data$Sample.Question.Papers.Practiced
Variables:
x1: Hours Studied The total number of hours spent studying by each student.
x2: Previous Scores the scores obtained by students in previous tests.
x3: Sleep Hours the average number of hours of sleep the student had per day.
x4: Sample Question Papers Practiced the number of sample question papers the student practiced.
lapply(c("car","lmtest"),library,character.only=T)[[1]]
## Loading required package: carData
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## [1] "car" "carData" "readxl" "stats" "graphics" "grDevices"
## [7] "utils" "datasets" "methods" "base"
#Model Regresi Klasik
model<-lm(y~x1+x2+x3+x4,data)
summary(model)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.3299 -1.3831 -0.0062 1.3701 8.4864
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -33.763726 0.126841 -266.19 <2e-16 ***
## x1 2.853429 0.007962 358.40 <2e-16 ***
## x2 1.018584 0.001189 857.02 <2e-16 ***
## x3 0.476333 0.012153 39.19 <2e-16 ***
## x4 0.195198 0.007189 27.15 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.061 on 9995 degrees of freedom
## Multiple R-squared: 0.9885, Adjusted R-squared: 0.9885
## F-statistic: 2.147e+05 on 4 and 9995 DF, p-value: < 2.2e-16
rsq.awal<-summary(model)$r.squared
rse.awal<-summary(model)$sigma
Diperoleh model \[\hat{Y}= -33.763726+2.853429x_1+1.018584x_2+0.476333x_3+0.195198x_4\] dengan perolehan p-value 2.2e-16 < 0.05. Semua peubah x dalam model berpengaruh signifikan terhadap peubah y dengan R-squared sebesar 0.9885 dan RSE sebesar 2.061 .
t.test(model$residuals,
mu = 0,
conf.level = 0.95)
##
## One Sample t-test
##
## data: model$residuals
## t = -2.4525e-15, df = 9999, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -0.04038966 0.04038966
## sample estimates:
## mean of x
## -5.053336e-17
H0: Nilai harapan sama dengan nol H1: Nilai harapantidak sama dengan nol
Keputusan: Diperoleh nilai p-value 1>0,05, maka terima H0. Dapat disipulkan bahwa nilai harapan sisaan sama dengan nol
library(lmtest)
dwtest(model)
##
## Durbin-Watson test
##
## data: model
## DW = 2.0039, p-value = 0.5764
## alternative hypothesis: true autocorrelation is greater than 0
H0: Sisaan saling bebas H1: Sisaan tidak saling bebas
Keputusan: Diperoleh nilai p-value 0,5764>0,05, maka terima H0. Dapat disipulkan bahwa sisaan saling bebas
library(lmtest)
bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 1.5045, df = 4, p-value = 0.8258
H0: Ragam sisaan homogen H1: Ragam sisaan tidak homogen
Keputusan: nilai p-value 0,8258>0,05, maka terima H0. Dapat disipulkan bahwa ragam sisaan homgen
ks.test(model$residuals, "pnorm", mean=mean(model$residuals), sd=sd(model$residuals))
## Warning in ks.test.default(model$residuals, "pnorm", mean =
## mean(model$residuals), : ties should not be present for the Kolmogorov-Smirnov
## test
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: model$residuals
## D = 0.0063741, p-value = 0.8112
## alternative hypothesis: two-sided
H0: Sisaan menyebar normal H1: Sisaan tidak menyebar normal
Keputusan: nilai p-value 0.8112>0,05, maka terima H0. Dapat disipulkan bahwa sisaan menyebar normal
car::vif(model)
## x1 x2 x3 x4
## 1.000464 1.000254 1.000052 1.000386
dapat dilihat nilai VIF dari semua peubah < 10. Maka dapat disimpulkan tidak terdapat multikolinearitas
par(mfrow = c(2,2))
qqnorm(model$residuals)
qqline(model$residuals, col = "red", lwd = 2)
plot(predict(model), model$residuals, col = "steelblue", pch = 20, xlab = "Sisaan", ylab = "Fitted Values", main = "Sisaan vs Fitted Values")
abline(a = 0, b = 0, lwd = 2)
hist(model$residuals, col = "steelblue")
plot(model$residuals, col = "steelblue", pch = 20, xlab = "Sisaan", ylab = "Order", main = "Sisaan vs Order")
lines(seq(1,10000,1), model$residuals, col = "red")
abline(a = 0, b = 0, lwd = 2)
lapply(c("glmnet","lmridge"),library,character.only=T)[[1]]
## Loading required package: Matrix
## Loaded glmnet 4.1-8
##
## Attaching package: 'lmridge'
## The following object is masked from 'package:car':
##
## vif
## [1] "glmnet" "Matrix" "lmtest" "zoo" "car" "carData"
## [7] "readxl" "stats" "graphics" "grDevices" "utils" "datasets"
## [13] "methods" "base"
x<-cbind(x1,x2,x3,x4)
head(matrix(x))
## [,1]
## [1,] 7
## [2,] 4
## [3,] 8
## [4,] 5
## [5,] 7
## [6,] 3
y<-data$Performance.Index
cv.r<-cv.glmnet(x,y,alpha=0);plot(cv.r)
best.lr<-cv.r$lambda.min
bestridge<-glmnet(x,y,alpha=0,lambda=best.lr)
coef(bestridge)
## 5 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -26.3120776
## x1 2.6078448
## x2 0.9327929
## x3 0.4416559
## x4 0.1862048
Diperoleh model dari Regresi Ridge sebagai berikut: \[\hat{Y}= -26.3120776+2.6078448x_1+0.9327929x_2+0.4416559x_3+0.1862048x_4\] #Fungsi R-Squared
rsq<-function(bestmodel,bestlambda,x,y){
#y duga
y.duga <- predict(bestmodel, s = bestlambda, newx = x)
#JKG dan JKT
jkt <- sum((y - mean(y))^2)
jkg <- sum((y.duga- y)^2)
#find R-Squared
rsq <- 1 - jkg/jkt
return(rsq)
}
rsq(bestridge,best.lr,x,y)
## [1] 0.9814513
Diperoleh R-Squared Ridge sebesar 0.9814513
train_predictionsridge <- predict(bestridge,newx = x)
# Menghitung residu (selisih antara prediksi dan nilai sebenarnya)
residualsridge <- y - train_predictionsridge
# Menghitung varian residu
dfridge <- length(y) - length(bestridge$beta)
residual_varianceridge <- sum(residualsridge^2) / dfridge
# Menghitung RSE
rse.ridge <- sqrt(residual_varianceridge)
rse.ridge
## [1] 2.617022
Diperoleh RSE Ridge sebesar 2.617022
cv.l<-cv.glmnet(x,y,alpha=1);plot(cv.l)
best.ll<-cv.l$lambda.min
bestlasso<-glmnet(x,y,alpha=1,lambda=best.ll);coef(bestlasso)
## 5 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -33.0172257
## x1 2.8280113
## x2 1.0147712
## x3 0.4377285
## x4 0.1727848
Diperoleh model dari Regresi Lasso sebagai berikut: \[\hat{Y}= -33.0172257+2.8280113x_1+1.0147712x_2+0.4377285x_3+0.1727848x_4\]
rsq(bestlasso,best.lr,x,y)
## [1] 0.9884512
Diperoleh R-Squared Lasso sebesar 0.9884512 #RSE
train_predictionslasso <- predict(bestlasso,newx = x)
# Menghitung residu (selisih antara prediksi dan nilai sebenarnya)
residualslasso <- y - train_predictionslasso
# Menghitung varian residu
dflasso <- length(y) - length(bestlasso$beta)
residual_variancelasso <- sum(residualslasso^2) / dflasso
# Menghitung RSE
rse.lasso <- sqrt(residual_variancelasso)
rse.lasso
## [1] 2.064995
Diperoleh nilai RSE Lasso sebesar 2.064995
R2<- matrix(c(rsq.awal,rsq(bestridge,best.lr,x,y),rsq(bestlasso,best.lr,x,y),rse.awal,rse.ridge,rse.lasso), ncol=2, byrow = FALSE)
row.names(R2)<-c( "Regresi Klasik", "Regresi Ridge", "Regresi Lasso")
colnames(R2)<-c("R-Squared","RSE")
R2
## R-Squared RSE
## Regresi Klasik 0.9884981 2.060898
## Regresi Ridge 0.9814513 2.617022
## Regresi Lasso 0.9884512 2.064995
Dapat dilihat dari ketiga model yang sudah diujikan, Model Regresi Klasik memiliki R-Squared dan RSE terkecil. Maka dapat disimpulkan Model Regresi Klasik merupakan pemodelan terbaik bagi data student peformence
Model terbaik diperoleh Model Regresi Klasik sebagai berikut:
model$coefficients
## (Intercept) x1 x2 x3 x4
## -33.7637261 2.8534292 1.0185835 0.4763330 0.1951983
\[\hat{Y}= -33.763726+2.853429x_1+1.018584x_2+0.476333x_3+0.195198x_4\]