Input Package and Library

library(corrplot)
## corrplot 0.92 loaded
library(ggplot2)
library(car)
## Loading required package: carData
library(olsrr)
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
lapply(c("tidyverse","rvest","kableExtra"),library,character.only=T)[[1]]
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ✔ readr     2.1.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## 
## Attaching package: 'rvest'
## 
## 
## The following object is masked from 'package:readr':
## 
##     guess_encoding
## 
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
##  [1] "lubridate" "forcats"   "stringr"   "purrr"     "readr"     "tidyr"    
##  [7] "tibble"    "tidyverse" "dplyr"     "olsrr"     "car"       "carData"  
## [13] "ggplot2"   "corrplot"  "stats"     "graphics"  "grDevices" "utils"    
## [19] "datasets"  "methods"   "base"
lapply(c("glmnet","lmridge"),library,character.only=T)[[1]]
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-8
## 
## Attaching package: 'lmridge'
## 
## The following object is masked from 'package:car':
## 
##     vif
##  [1] "glmnet"     "Matrix"     "kableExtra" "rvest"      "lubridate" 
##  [6] "forcats"    "stringr"    "purrr"      "readr"      "tidyr"     
## [11] "tibble"     "tidyverse"  "dplyr"      "olsrr"      "car"       
## [16] "carData"    "ggplot2"    "corrplot"   "stats"      "graphics"  
## [21] "grDevices"  "utils"      "datasets"   "methods"    "base"

Input Data

library(readxl)
data<-read_excel("/Users/aisyahnuruzzahra/Downloads/data pokemon.xlsx")
str(data)
## tibble [75 × 4] (S3: tbl_df/tbl/data.frame)
##  $ total_strength: num [1:75] 327 429 569 363 472 612 368 473 613 285 ...
##  $ hp            : num [1:75] 45 60 80 39 58 78 44 59 79 45 ...
##  $ height        : num [1:75] 7 10 20 6 11 17 5 10 16 3 ...
##  $ weight        : num [1:75] 69 130 1000 85 190 905 90 225 855 29 ...
y<-data$`total_strength`
x1<-data$`hp`
x2<-data$`height`
x3<-data$`weight`
data1 = data.frame(y,x1,x2,x3)
x <- data.matrix(data1[,c('x1', 'x2', 'x3')])
head(data1)
##     y x1 x2   x3
## 1 327 45  7   69
## 2 429 60 10  130
## 3 569 80 20 1000
## 4 363 39  6   85
## 5 472 58 11  190
## 6 612 78 17  905

Eksplorasi Data

data1_numeric <- data1[sapply(data1, is.numeric)]
cor_matrix <- cor(data1_numeric, use = "complete.obs")
corrplot(cor_matrix, method = "number")

# Plot antara Y dengan X1

plot(x=data$hp, y=data$total_strength, main="Hubungan HP Pokemonterhadap Total Strength")

Plot antara Y dengan X2

plot(x=data$height, y=data$total_strength, main="Hubungan Height Pokemon terhadap Total Strength")

# Plot antara Y dengan X3

plot(x=data$weight, y=data$total_strength, main="Hubungan Weight Pokemon terhadap Total Strength")

## Model Regresi Klasik

modelklasik <- lm(data$total_strength~data$hp+data$height+data$weight,data=data1)
s.modelklasik <- summary(modelklasik)
s.modelklasik
## 
## Call:
## lm(formula = data$total_strength ~ data$hp + data$height + data$weight, 
##     data = data1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -188.410  -56.933   -4.719   54.854  157.892 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 291.8696    27.6919  10.540 3.61e-16 ***
## data$hp       0.8407     0.4865   1.728  0.08834 .  
## data$height   5.5378     2.2731   2.436  0.01735 *  
## data$weight   0.1304     0.0389   3.352  0.00129 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 76.07 on 71 degrees of freedom
## Multiple R-squared:  0.5025, Adjusted R-squared:  0.4815 
## F-statistic:  23.9 on 3 and 71 DF,  p-value: 8.421e-11

Model regresi klasik yang diperoleh adalah : \[total strength=291.8696+0.8407hp+ 5.5378 height+0.1304 weight\]

R Squared dan RSE Model Klasik

R squared

R2.modelklasik <- s.modelklasik$r.squared
R2.modelklasik
## [1] 0.5024762
# RSE
rse.modelklasik <- s.modelklasik$sigma
rse.modelklasik
## [1] 76.07345

Rsq Model klasik didapatkan $ R^2 =0.5025$ dan nilai RSE $RSE = 76.07345 $

Plot Pencilan dan Leverage

ols_plot_resid_lev(modelklasik)

plot(modelklasik,which=5)
ols_plot_diagnostics(modelklasik)

Uji Asumsi

1 Nilai harapan sisaan sama dengan nol

t.test(modelklasik$residuals,
       mu = 0,
       conf.level = 0.95)
## 
##  One Sample t-test
## 
## data:  modelklasik$residuals
## t = 6.6064e-16, df = 74, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -17.14445  17.14445
## sample estimates:
##    mean of x 
## 5.684342e-15

2 Sisaan saling bebas

lmtest::dwtest(modelklasik)
## 
##  Durbin-Watson test
## 
## data:  modelklasik
## DW = 1.7261, p-value = 0.1069
## alternative hypothesis: true autocorrelation is greater than 0
lmtest::bptest(modelklasik)
## 
##  studentized Breusch-Pagan test
## 
## data:  modelklasik
## BP = 5.7814, df = 3, p-value = 0.1227

Asumsi Normalitas Sisaan

shapiro.test(modelklasik$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  modelklasik$residuals
## W = 0.98549, p-value = 0.5491

Multikolineritas

library(car)
car::vif(modelklasik)
##     data$hp data$height data$weight 
##    1.359496    1.930949    1.853587

Tidak ada multikolinieritas antarvariabel karena nilai VIF<10.

Regresi Ridge

x <- data.matrix(data[,c('hp','height' ,'weight')])
y <- data$total_strength
cv.r<-cv.glmnet(x,y,alpha=0)

best.lr<-cv.r$lambda.min
bestridge<-glmnet(x,y,alpha=0,lambda=best.lr)
coef(bestridge)
## 4 x 1 sparse Matrix of class "dgCMatrix"
##                      s0
## (Intercept) 309.0841489
## hp            0.8105141
## height        4.8718180
## weight        0.1008961

Didapatkan Model regresi Ridge ialah : \[total strength=310.69575579+0.80424856hp+4.80558228height+0.09894507weight\]

R Squared dan RSE Model Ridge

fungsi untuk cari rsquared

rsq<-function(bestmodel,bestlambda,x,y){
 #y duga
 y.duga <- predict(bestmodel, s = bestlambda, newx = x)

 #JKG dan JKT
 jkt <- sum((y - mean(y))^2)
 jkg <- sum((y.duga- y)^2)

#find R-Squared
rsq <- 1 - jkg/jkt
return(rsq) 
}

# R2 ridge
R2.modelridge <- rsq(bestridge,best.lr,x,y)
R2.modelridge
## [1] 0.4892223
#RSE ridge
train_predictionsr <- predict(bestridge,newx = x)
# Hitung residu (selisih antara prediksi dan nilai sebenarnya)
residualsr <- y - train_predictionsr
# Hitung varian residu
dfr <- length(y) - length(bestridge$beta)
residual_variancer <- sum(residualsr^2) / dfr
# Hitung RSE
rse.modelridge <- sqrt(residual_variancer)
rse.modelridge
## [1] 76.54293

Didapatkan nilai R2 pada model ridge $ R^2 = 0.4870686$ dan RSE model ridge didapatkan nilai $RSE =76.70413 $

Regresi Lasso

cv.l<-cv.glmnet(x,y,alpha=1)
best.ll<-cv.l$lambda.min
bestlasso<-glmnet(x,y,alpha=1,lambda=best.ll)
coef(bestlasso)
## 4 x 1 sparse Matrix of class "dgCMatrix"
##                      s0
## (Intercept) 302.8036074
## hp            0.7334256
## height        5.2528910
## weight        0.1246989

Didapatkan Model regresi lasso ialah : \[total strength=297.5739953+0.7846787hp+5.3893440height+ 0.1274376weight\]

R Squared dan RSE Model Lasso

# R2 lasso
R2.modellasso <- rsq(bestlasso,best.ll,x,y)
R2.modellasso
## [1] 0.5005677
#RSElasso
train_predictionsLasso <- predict(bestlasso,newx = x)
# Hitung residu (selisih antara prediksi dan nilai sebenarnya)
residualsLasso <- y - train_predictionsLasso
# Hitung varian residu
dfLasso <- length(y) - length(bestlasso$beta)
residual_varianceLasso <- sum(residualsLasso^2) / dfLasso
# Hitung RSE
rse.modelLasso <- sqrt(residual_varianceLasso)
rse.modelLasso
## [1] 75.68807

Didapatkan nilai R2 pada model lasso $ R^2 = 0.5019571$ dan RSE model ridge didapatkan nilai $RSE =75.58272 $

Perbandingan Model Regresi

perbandingan <- matrix(c(R2.modelklasik, R2.modelridge, R2.modellasso, rse.modelklasik, rse.modelridge, rse.modelLasso),ncol=2,byrow = F)
row.names(perbandingan)<- c("Model klasik","model ridge","model lasso")
colnames(perbandingan) <- c("R squared","RSE")
perbandingan
##              R squared      RSE
## Model klasik 0.5024762 76.07345
## model ridge  0.4892223 76.54293
## model lasso  0.5005677 75.68807

Perbandingan dengan melihat nilai RSE yang memiliki nilai terendah, maka model yang memiliki nilai RSE terendah ialah model lasso dengan nilai $ RSE = 75.58272$ dapat dikatakan bahwa model terbaiknya adalah model lasso

Model Terbaik (Model Lasso)

coef(bestlasso)
## 4 x 1 sparse Matrix of class "dgCMatrix"
##                      s0
## (Intercept) 302.8036074
## hp            0.7334256
## height        5.2528910
## weight        0.1246989

Didapatkan Model regresi lasso ialah : \[total strength=297.5739953+0.7846787hp+5.3893440height+0.1274376weight\] ## Interpretasi Koefesien Model total strength akan meningkat jika height, weight dan hp meningkat. akan tetapi, pengaruh peningkatan tertinggi pada total strength terdapat pada height.