library(corrplot)
## corrplot 0.92 loaded
library(ggplot2)
library(car)
## Loading required package: carData
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
lapply(c("tidyverse","rvest","kableExtra"),library,character.only=T)[[1]]
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'rvest'
##
##
## The following object is masked from 'package:readr':
##
## guess_encoding
##
##
##
## Attaching package: 'kableExtra'
##
##
## The following object is masked from 'package:dplyr':
##
## group_rows
## [1] "lubridate" "forcats" "stringr" "purrr" "readr" "tidyr"
## [7] "tibble" "tidyverse" "dplyr" "olsrr" "car" "carData"
## [13] "ggplot2" "corrplot" "stats" "graphics" "grDevices" "utils"
## [19] "datasets" "methods" "base"
lapply(c("glmnet","lmridge"),library,character.only=T)[[1]]
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-8
##
## Attaching package: 'lmridge'
##
## The following object is masked from 'package:car':
##
## vif
## [1] "glmnet" "Matrix" "kableExtra" "rvest" "lubridate"
## [6] "forcats" "stringr" "purrr" "readr" "tidyr"
## [11] "tibble" "tidyverse" "dplyr" "olsrr" "car"
## [16] "carData" "ggplot2" "corrplot" "stats" "graphics"
## [21] "grDevices" "utils" "datasets" "methods" "base"
library(readxl)
data<-read_excel("/Users/aisyahnuruzzahra/Downloads/data pokemon.xlsx")
str(data)
## tibble [75 × 4] (S3: tbl_df/tbl/data.frame)
## $ total_strength: num [1:75] 327 429 569 363 472 612 368 473 613 285 ...
## $ hp : num [1:75] 45 60 80 39 58 78 44 59 79 45 ...
## $ height : num [1:75] 7 10 20 6 11 17 5 10 16 3 ...
## $ weight : num [1:75] 69 130 1000 85 190 905 90 225 855 29 ...
y<-data$`total_strength`
x1<-data$`hp`
x2<-data$`height`
x3<-data$`weight`
data1 = data.frame(y,x1,x2,x3)
x <- data.matrix(data1[,c('x1', 'x2', 'x3')])
head(data1)
## y x1 x2 x3
## 1 327 45 7 69
## 2 429 60 10 130
## 3 569 80 20 1000
## 4 363 39 6 85
## 5 472 58 11 190
## 6 612 78 17 905
data1_numeric <- data1[sapply(data1, is.numeric)]
cor_matrix <- cor(data1_numeric, use = "complete.obs")
corrplot(cor_matrix, method = "number")
# Plot antara Y dengan X1
plot(x=data$hp, y=data$total_strength, main="Hubungan HP Pokemonterhadap Total Strength")
plot(x=data$height, y=data$total_strength, main="Hubungan Height Pokemon terhadap Total Strength")
# Plot antara Y dengan X3
plot(x=data$weight, y=data$total_strength, main="Hubungan Weight Pokemon terhadap Total Strength")
## Model Regresi Klasik
modelklasik <- lm(data$total_strength~data$hp+data$height+data$weight,data=data1)
s.modelklasik <- summary(modelklasik)
s.modelklasik
##
## Call:
## lm(formula = data$total_strength ~ data$hp + data$height + data$weight,
## data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -188.410 -56.933 -4.719 54.854 157.892
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 291.8696 27.6919 10.540 3.61e-16 ***
## data$hp 0.8407 0.4865 1.728 0.08834 .
## data$height 5.5378 2.2731 2.436 0.01735 *
## data$weight 0.1304 0.0389 3.352 0.00129 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 76.07 on 71 degrees of freedom
## Multiple R-squared: 0.5025, Adjusted R-squared: 0.4815
## F-statistic: 23.9 on 3 and 71 DF, p-value: 8.421e-11
Model regresi klasik yang diperoleh adalah : \[total strength=291.8696+0.8407hp+ 5.5378 height+0.1304 weight\]
R2.modelklasik <- s.modelklasik$r.squared
R2.modelklasik
## [1] 0.5024762
# RSE
rse.modelklasik <- s.modelklasik$sigma
rse.modelklasik
## [1] 76.07345
Rsq Model klasik didapatkan $ R^2 =0.5025$ dan nilai RSE $RSE = 76.07345 $
ols_plot_resid_lev(modelklasik)
plot(modelklasik,which=5)
ols_plot_diagnostics(modelklasik)
t.test(modelklasik$residuals,
mu = 0,
conf.level = 0.95)
##
## One Sample t-test
##
## data: modelklasik$residuals
## t = 6.6064e-16, df = 74, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -17.14445 17.14445
## sample estimates:
## mean of x
## 5.684342e-15
lmtest::dwtest(modelklasik)
##
## Durbin-Watson test
##
## data: modelklasik
## DW = 1.7261, p-value = 0.1069
## alternative hypothesis: true autocorrelation is greater than 0
lmtest::bptest(modelklasik)
##
## studentized Breusch-Pagan test
##
## data: modelklasik
## BP = 5.7814, df = 3, p-value = 0.1227
shapiro.test(modelklasik$residuals)
##
## Shapiro-Wilk normality test
##
## data: modelklasik$residuals
## W = 0.98549, p-value = 0.5491
library(car)
car::vif(modelklasik)
## data$hp data$height data$weight
## 1.359496 1.930949 1.853587
Tidak ada multikolinieritas antarvariabel karena nilai VIF<10.
x <- data.matrix(data[,c('hp','height' ,'weight')])
y <- data$total_strength
cv.r<-cv.glmnet(x,y,alpha=0)
best.lr<-cv.r$lambda.min
bestridge<-glmnet(x,y,alpha=0,lambda=best.lr)
coef(bestridge)
## 4 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 309.0841489
## hp 0.8105141
## height 4.8718180
## weight 0.1008961
Didapatkan Model regresi Ridge ialah : \[total strength=310.69575579+0.80424856hp+4.80558228height+0.09894507weight\]
rsq<-function(bestmodel,bestlambda,x,y){
#y duga
y.duga <- predict(bestmodel, s = bestlambda, newx = x)
#JKG dan JKT
jkt <- sum((y - mean(y))^2)
jkg <- sum((y.duga- y)^2)
#find R-Squared
rsq <- 1 - jkg/jkt
return(rsq)
}
# R2 ridge
R2.modelridge <- rsq(bestridge,best.lr,x,y)
R2.modelridge
## [1] 0.4892223
#RSE ridge
train_predictionsr <- predict(bestridge,newx = x)
# Hitung residu (selisih antara prediksi dan nilai sebenarnya)
residualsr <- y - train_predictionsr
# Hitung varian residu
dfr <- length(y) - length(bestridge$beta)
residual_variancer <- sum(residualsr^2) / dfr
# Hitung RSE
rse.modelridge <- sqrt(residual_variancer)
rse.modelridge
## [1] 76.54293
Didapatkan nilai R2 pada model ridge $ R^2 = 0.4870686$ dan RSE model ridge didapatkan nilai $RSE =76.70413 $
cv.l<-cv.glmnet(x,y,alpha=1)
best.ll<-cv.l$lambda.min
bestlasso<-glmnet(x,y,alpha=1,lambda=best.ll)
coef(bestlasso)
## 4 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 302.8036074
## hp 0.7334256
## height 5.2528910
## weight 0.1246989
Didapatkan Model regresi lasso ialah : \[total strength=297.5739953+0.7846787hp+5.3893440height+ 0.1274376weight\]
# R2 lasso
R2.modellasso <- rsq(bestlasso,best.ll,x,y)
R2.modellasso
## [1] 0.5005677
#RSElasso
train_predictionsLasso <- predict(bestlasso,newx = x)
# Hitung residu (selisih antara prediksi dan nilai sebenarnya)
residualsLasso <- y - train_predictionsLasso
# Hitung varian residu
dfLasso <- length(y) - length(bestlasso$beta)
residual_varianceLasso <- sum(residualsLasso^2) / dfLasso
# Hitung RSE
rse.modelLasso <- sqrt(residual_varianceLasso)
rse.modelLasso
## [1] 75.68807
Didapatkan nilai R2 pada model lasso $ R^2 = 0.5019571$ dan RSE model ridge didapatkan nilai $RSE =75.58272 $
perbandingan <- matrix(c(R2.modelklasik, R2.modelridge, R2.modellasso, rse.modelklasik, rse.modelridge, rse.modelLasso),ncol=2,byrow = F)
row.names(perbandingan)<- c("Model klasik","model ridge","model lasso")
colnames(perbandingan) <- c("R squared","RSE")
perbandingan
## R squared RSE
## Model klasik 0.5024762 76.07345
## model ridge 0.4892223 76.54293
## model lasso 0.5005677 75.68807
Perbandingan dengan melihat nilai RSE yang memiliki nilai terendah, maka model yang memiliki nilai RSE terendah ialah model lasso dengan nilai $ RSE = 75.58272$ dapat dikatakan bahwa model terbaiknya adalah model lasso
coef(bestlasso)
## 4 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 302.8036074
## hp 0.7334256
## height 5.2528910
## weight 0.1246989
Didapatkan Model regresi lasso ialah : \[total strength=297.5739953+0.7846787hp+5.3893440height+0.1274376weight\] ## Interpretasi Koefesien Model total strength akan meningkat jika height, weight dan hp meningkat. akan tetapi, pengaruh peningkatan tertinggi pada total strength terdapat pada height.